fix bugs and add feature to convert compressed wav, boost vocal
This commit is contained in:
parent
7eb80ddb73
commit
a664c30eb1
6
.gitignore
vendored
6
.gitignore
vendored
@ -12,3 +12,9 @@ speechlib.egg-info
|
||||
# Override this by using 'git add -f'
|
||||
*.wav
|
||||
*.mp3
|
||||
/output
|
||||
/boosted_file
|
||||
/pcm_files
|
||||
/audios
|
||||
/segments
|
||||
/temp
|
||||
BIN
ffmpeg.exe
Normal file
BIN
ffmpeg.exe
Normal file
Binary file not shown.
3
main.py
3
main.py
@ -12,7 +12,7 @@ access_token = config.get('FILE', 'accesstoken')
|
||||
voicefolder = config.get('FILE', 'voicefolder')
|
||||
language = "id"
|
||||
quantization = str2bool(config.get('FILE', 'quantization'))
|
||||
modelSize = "medium"
|
||||
modelSize = "turbo"
|
||||
|
||||
### load the audio file in audio folder ###
|
||||
current_dir = os.getcwd()
|
||||
@ -40,4 +40,5 @@ for filename in os.listdir(audio_dir):
|
||||
### transcribe ###
|
||||
transcriptor = Transcriptor(audiofile, filepath, language, modelSize, access_token, voicefolder, quantization)
|
||||
res = transcriptor.faster_whisper()
|
||||
# res = transcriptor.whisper()
|
||||
|
||||
@ -1,11 +0,0 @@
|
||||
WATI (2.1 : 3.3) : See ya.
|
||||
WATI (6.3 : 9.9) : dari mapan cik mau kunjungan
|
||||
WATI (10.6 : 14.3) : Oh iya silahkan masuk mbak. Tak cek dulu ya cik ya.
|
||||
WATI (15.3 : 17.7) : Kita hitung dulu stoknya.
|
||||
WATI (18.6 : 20.2) : Terima kasih.
|
||||
WATI (21.6 : 38.1) : yang layan nih pesan-pesan tok ya cik ya onan-onan hujan terus ya hujan angin ini tinggal berapa itunya apa kemarin somek udang ya somek udang sama
|
||||
DWI (27.9 : 29.5) : Terima kasih.
|
||||
WATI (38.7 : 40.0) : itunya sampean
|
||||
WATI (44.2 : 45.7) : Minya tinggal satu lho, Ci.
|
||||
WATI (46.7 : 47.7) : Kita hitung ya.
|
||||
WATI (51.1 : 52.8) : Mbak helmnya diambil aja.
|
||||
@ -1 +1 @@
|
||||
C:/Users/admin.suherdy/.cache/huggingface/hub/models--speechbrain--spkrec-ecapa-voxceleb/snapshots/0f99f2d0ebe89ac095bcc5903c4dd8f72b367286/classifier.ckpt
|
||||
C:/Users/suher/.cache/huggingface/hub/models--speechbrain--spkrec-ecapa-voxceleb/snapshots/0f99f2d0ebe89ac095bcc5903c4dd8f72b367286/classifier.ckpt
|
||||
@ -1 +1 @@
|
||||
C:/Users/admin.suherdy/.cache/huggingface/hub/models--speechbrain--spkrec-ecapa-voxceleb/snapshots/0f99f2d0ebe89ac095bcc5903c4dd8f72b367286/embedding_model.ckpt
|
||||
C:/Users/suher/.cache/huggingface/hub/models--speechbrain--spkrec-ecapa-voxceleb/snapshots/0f99f2d0ebe89ac095bcc5903c4dd8f72b367286/embedding_model.ckpt
|
||||
@ -1 +1 @@
|
||||
C:/Users/admin.suherdy/.cache/huggingface/hub/models--speechbrain--spkrec-ecapa-voxceleb/snapshots/0f99f2d0ebe89ac095bcc5903c4dd8f72b367286/hyperparams.yaml
|
||||
C:/Users/suher/.cache/huggingface/hub/models--speechbrain--spkrec-ecapa-voxceleb/snapshots/0f99f2d0ebe89ac095bcc5903c4dd8f72b367286/hyperparams.yaml
|
||||
@ -1 +1 @@
|
||||
C:/Users/admin.suherdy/.cache/huggingface/hub/models--speechbrain--spkrec-ecapa-voxceleb/snapshots/0f99f2d0ebe89ac095bcc5903c4dd8f72b367286/label_encoder.txt
|
||||
C:/Users/suher/.cache/huggingface/hub/models--speechbrain--spkrec-ecapa-voxceleb/snapshots/0f99f2d0ebe89ac095bcc5903c4dd8f72b367286/label_encoder.txt
|
||||
@ -1 +1 @@
|
||||
C:/Users/admin.suherdy/.cache/huggingface/hub/models--speechbrain--spkrec-ecapa-voxceleb/snapshots/0f99f2d0ebe89ac095bcc5903c4dd8f72b367286/mean_var_norm_emb.ckpt
|
||||
C:/Users/suher/.cache/huggingface/hub/models--speechbrain--spkrec-ecapa-voxceleb/snapshots/0f99f2d0ebe89ac095bcc5903c4dd8f72b367286/mean_var_norm_emb.ckpt
|
||||
@ -9,3 +9,6 @@ streamlit
|
||||
torch
|
||||
torchaudio
|
||||
assemblyai
|
||||
soundfile
|
||||
librosa
|
||||
pydub
|
||||
Binary file not shown.
BIN
speechlib/__pycache__/boost_vocal_range.cpython-312.pyc
Normal file
BIN
speechlib/__pycache__/boost_vocal_range.cpython-312.pyc
Normal file
Binary file not shown.
Binary file not shown.
BIN
speechlib/__pycache__/convert_to_pcm.cpython-312.pyc
Normal file
BIN
speechlib/__pycache__/convert_to_pcm.cpython-312.pyc
Normal file
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
23
speechlib/boost_vocal_range.py
Normal file
23
speechlib/boost_vocal_range.py
Normal file
@ -0,0 +1,23 @@
|
||||
from pydub import AudioSegment
|
||||
import os
|
||||
|
||||
def boost_vocal_range(input_path, output_path=None):
    """Amplify a WAV file and band-limit it to the speech frequency range.

    The file at ``input_path`` is loaded with pydub, boosted by 5 dB, passed
    through a 300 Hz high-pass filter followed by a 3400 Hz low-pass filter,
    and exported as WAV.

    Args:
        input_path: Path to the source WAV file.
        output_path: Destination path for the processed audio. When ``None``,
            the file is written to
            ``<cwd>/boosted_file/<input file name>_boosted.wav``.

    Returns:
        The path the processed audio was exported to.
    """
    # The default output folder is created unconditionally, even when an
    # explicit output_path is supplied.
    folder = os.path.join(os.getcwd(), "boosted_file")
    os.makedirs(folder, exist_ok=True)

    if output_path is None:
        base, _ = os.path.splitext(input_path)
        # Accept both "\\" and "/" separators so the bare file name is
        # extracted correctly however the caller built the path.
        name = os.path.basename(base.replace("\\", "/"))
        output_path = os.path.join(folder, name + "_boosted.wav")

    sound = AudioSegment.from_wav(input_path)
    sound = sound.apply_gain(5)  # Boost by 5 dB
    filtered = sound.high_pass_filter(300).low_pass_filter(3400)
    filtered.export(output_path, format="wav")
    return output_path
|
||||
|
||||
|
||||
@ -1,30 +1,30 @@
|
||||
import wave
|
||||
import numpy as np
|
||||
|
||||
import soundfile as sf
|
||||
# import soundfile as sf
|
||||
|
||||
def resave_audio(input_file, output_file):
|
||||
"""Loads an audio file and resaves it.
|
||||
# def resave_audio(input_file, output_file):
|
||||
# """Loads an audio file and resaves it.
|
||||
|
||||
Args:
|
||||
input_file (str): Path to the input audio file.
|
||||
output_file (str): Path to save the resaved audio file.
|
||||
"""
|
||||
try:
|
||||
# Read the audio file
|
||||
data, samplerate = sf.read(input_file)
|
||||
# Args:
|
||||
# input_file (str): Path to the input audio file.
|
||||
# output_file (str): Path to save the resaved audio file.
|
||||
# """
|
||||
# try:
|
||||
# # Read the audio file
|
||||
# data, samplerate = sf.read(input_file)
|
||||
|
||||
# Write the audio data to a new file
|
||||
sf.write(output_file, data, samplerate)
|
||||
print(f"Successfully resaved audio from '{input_file}' to '{output_file}'")
|
||||
# # Write the audio data to a new file
|
||||
# sf.write(output_file, data, samplerate)
|
||||
# print(f"Successfully resaved audio from '{input_file}' to '{output_file}'")
|
||||
|
||||
except Exception as e:
|
||||
print(f"Error processing audio: {e}")
|
||||
# except Exception as e:
|
||||
# print(f"Error processing audio: {e}")
|
||||
|
||||
def convert_to_mono(input_wav):
|
||||
# Resave WAV file
|
||||
resave_audio(input_wav, input_wav.split('.')[0] + "_pcm.wav")
|
||||
input_wav = input_wav.split('.')[0] + "_pcm.wav"
|
||||
# resave_audio(input_wav, input_wav.split('.')[0] + "_pcm.wav")
|
||||
# input_wav = input_wav.split('.')[0] + "_pcm.wav"
|
||||
|
||||
with wave.open(input_wav, 'rb') as input_file:
|
||||
# Get the parameters of the input file
|
||||
|
||||
46
speechlib/convert_to_pcm.py
Normal file
46
speechlib/convert_to_pcm.py
Normal file
@ -0,0 +1,46 @@
|
||||
import soundfile as sf
|
||||
import subprocess
|
||||
import os
|
||||
|
||||
def is_pcm_wav(file_path):
    """Tell whether soundfile reports a PCM subtype for ``file_path``.

    Args:
        file_path: Path of the audio file to inspect.

    Returns:
        True when the subtype reported by ``sf.info`` starts with ``"PCM"``;
        False otherwise, including when the file cannot be read (the error is
        printed rather than raised).
    """
    try:
        subtype = sf.info(file_path).subtype
    except RuntimeError as exc:
        print(f"Error reading {file_path}: {exc}")
        return False
    return subtype.startswith("PCM")  # e.g., 'PCM_16'
|
||||
|
||||
def convert_to_pcm_ffmpeg(input_path, output_path=None):
    """Load an audio file and re-save it with soundfile.

    Despite the name, no ffmpeg process is started here: the samples are read
    with ``sf.read`` and written back with ``sf.write``.

    Args:
        input_path (str): Path to the input audio file.
        output_path (str | None): Path to save the re-saved audio file. When
            ``None``, ``<cwd>/pcm_files/<input file name>_pcm.wav`` is used.

    Returns:
        str: The output path. It is returned even when reading or writing
        failed; in that case the error is only printed and the file may not
        exist.
    """
    # The default output folder is created unconditionally, even when an
    # explicit output_path is supplied.
    folder = os.path.join(os.getcwd(), "pcm_files")
    os.makedirs(folder, exist_ok=True)

    if output_path is None:
        base, _ = os.path.splitext(input_path)
        # Accept both "\\" and "/" separators so the bare file name is
        # extracted correctly however the caller built the path.
        name = os.path.basename(base.replace("\\", "/"))
        output_path = os.path.join(folder, name + "_pcm.wav")

    try:
        # Read the audio file
        data, samplerate = sf.read(input_path)

        # Write the audio data to a new file.
        # NOTE(review): relies on soundfile's default subtype for ".wav"
        # output being PCM -- confirm against the soundfile documentation.
        sf.write(output_path, data, samplerate)
        print(f"Successfully resaved audio from '{input_path}' to '{output_path}'")
    except Exception as e:
        # Best-effort: report the failure and still hand back the path.
        print(f"Error processing audio: {e}")

    return output_path
|
||||
|
||||
def prepare_wav_for_wave_module(input_path):
    """Return a path to a PCM version of ``input_path``.

    Args:
        input_path: Path of the WAV file to check.

    Returns:
        ``input_path`` itself when ``is_pcm_wav`` accepts it; otherwise the
        path returned by ``convert_to_pcm_ffmpeg`` for that file.
    """
    # Guard clause: anything not recognised as PCM goes through conversion.
    if not is_pcm_wav(input_path):
        print(f"{input_path} is compressed. Converting to PCM...")
        return convert_to_pcm_ffmpeg(input_path)

    print(f"{input_path} is already PCM.")
    return input_path
|
||||
@ -10,7 +10,8 @@ from .write_log_file import (write_log_file)
|
||||
from .re_encode import (re_encode)
|
||||
from .convert_to_mono import (convert_to_mono)
|
||||
from .convert_to_wav import (convert_to_wav)
|
||||
|
||||
from .convert_to_pcm import (prepare_wav_for_wave_module)
|
||||
from .boost_vocal_range import (boost_vocal_range)
|
||||
import subprocess
|
||||
|
||||
# by default use google speech-to-text API
|
||||
@ -18,18 +19,21 @@ import subprocess
|
||||
def core_analysis(file_name, voices_folder, log_folder, language, modelSize, ACCESS_TOKEN, model_type, quantization=False, custom_model_path=None, hf_model_id=None, aai_api_key=None):
|
||||
|
||||
# <-------------------PreProcessing file-------------------------->
|
||||
# convert compressed wav
|
||||
# if file_name.lower().endswith(".wav"):
|
||||
# subprocess.call(['ffmpeg','i',file_name,file_name.split(".")[0]+'.wav'])
|
||||
# check if file is in wav format, if not convert to wav
|
||||
file_name = convert_to_wav(file_name)
|
||||
|
||||
file_name = prepare_wav_for_wave_module(file_name)
|
||||
|
||||
# convert file to mono
|
||||
convert_to_mono(file_name)
|
||||
|
||||
# re-encode file to 16-bit PCM encoding
|
||||
re_encode(file_name)
|
||||
|
||||
# Voice Activity Detection (VAD)
|
||||
|
||||
# Boost vocal range
|
||||
boost_vocal_range(file_name)
|
||||
# <--------------------running analysis--------------------------->
|
||||
|
||||
speaker_tags = []
|
||||
|
||||
27
speechlib/speaker_embedding.py
Normal file
27
speechlib/speaker_embedding.py
Normal file
@ -0,0 +1,27 @@
|
||||
from speechbrain.inference import SpeakerRecognition
|
||||
import os
|
||||
from pydub import AudioSegment
|
||||
from collections import defaultdict
|
||||
import torch
|
||||
|
||||
SPEAKER_DATABASE = {}
|
||||
|
||||
# Load the pretrained ECAPA speaker-recognition model once at import time,
# on the GPU when CUDA is available and on the CPU otherwise.
verification = SpeakerRecognition.from_hparams(
    run_opts={"device": "cuda" if torch.cuda.is_available() else "cpu"},
    source="speechbrain/spkrec-ecapa-voxceleb",
    savedir="pretrained_models/spkrec-ecapa-voxceleb",
)
|
||||
|
||||
# recognize speaker name
|
||||
def speaker_embedding(voices_folder, segments):
    """Compute voice embeddings for known speakers and save them to disk.

    ``voices_folder`` is expected to contain one sub-folder per speaker, each
    holding that speaker's voice samples. Every sample is encoded with the
    module-level ``verification`` model, stored in ``SPEAKER_DATABASE`` under
    the speaker's folder name, and the database is finally written to
    ``speaker_db.pt`` in the current working directory.

    Args:
        voices_folder: Folder containing one sub-folder per speaker.
        segments: Currently unused; kept so existing callers keep working.

    Returns:
        None.
    """
    for speaker in os.listdir(voices_folder):
        speaker_dir = voices_folder + "/" + speaker
        if not os.path.isdir(speaker_dir):
            # A stray file beside the speaker folders would make the inner
            # os.listdir raise; skip it instead.
            continue

        for voice in os.listdir(speaker_dir):
            voice_file = speaker_dir + "/" + voice
            try:
                # encode_batch works on waveform tensors, not file paths, so
                # the sample is loaded (and normalised by the model's own
                # loader) before it is encoded.
                waveform = verification.load_audio(voice_file)
                embedding = verification.encode_batch(waveform).squeeze(0).detach()
                # NOTE(review): with several samples per speaker only the
                # last successfully encoded one is kept -- confirm whether
                # the samples should be averaged or collected instead.
                SPEAKER_DATABASE[speaker] = embedding
            except Exception as err:
                # Best-effort: one unreadable sample must not abort the run.
                print("error occured while speaker recognition: ", err)

    torch.save(SPEAKER_DATABASE, "speaker_db.pt")
|
||||
@ -13,14 +13,14 @@ config.read('config.ini')
|
||||
modelpath = config.get('FILE', 'modelpath')
|
||||
batchsize = int(config.get('FILE', 'batchsize'))
|
||||
beamsize = int(config.get('FILE', 'beamsize'))
|
||||
localfile = False
|
||||
localfile = True
|
||||
|
||||
def transcribe(file, language, model_size, model_type, quantization, custom_model_path, hf_model_path, aai_api_key):
|
||||
res = ""
|
||||
if language in ["si", "Si"]:
|
||||
res = whisper_sinhala(file)
|
||||
return res
|
||||
elif model_size in ["base", "tiny", "small", "medium", "large", "large-v1", "large-v2", "large-v3"]:
|
||||
elif model_size in ["base", "tiny", "small", "medium", "large", "large-v1", "large-v2", "large-v3", "turbo"]:
|
||||
if model_type == "faster-whisper":
|
||||
if modelpath != "":
|
||||
model_size = modelpath
|
||||
@ -62,7 +62,7 @@ def transcribe(file, language, model_size, model_type, quantization, custom_mode
|
||||
except Exception as err:
|
||||
print("an error occured while transcribing: ", err)
|
||||
elif model_type == "custom":
|
||||
custom_model_path = modelpath
|
||||
# custom_model_path = modelpath
|
||||
model_folder = os.path.dirname(custom_model_path)
|
||||
model_folder = model_folder + "/"
|
||||
print("model file: ", custom_model_path)
|
||||
|
||||
Loading…
Reference in New Issue
Block a user