Fix bugs and add a feature to convert compressed WAV files and boost the vocal range

This commit is contained in:
Suherdy Yacob 2025-05-30 20:12:22 +07:00
parent 7eb80ddb73
commit a664c30eb1
31 changed files with 141 additions and 42 deletions

6
.gitignore vendored
View File

@ -12,3 +12,9 @@ speechlib.egg-info
# Override this by using 'git add -f' # Override this by using 'git add -f'
*.wav *.wav
*.mp3 *.mp3
/output
/boosted_file
/pcm_files
/audios
/segments
/temp

BIN
ffmpeg.exe Normal file

Binary file not shown.

View File

@ -12,7 +12,7 @@ access_token = config.get('FILE', 'accesstoken')
voicefolder = config.get('FILE', 'voicefolder') voicefolder = config.get('FILE', 'voicefolder')
language = "id" language = "id"
quantization = str2bool(config.get('FILE', 'quantization')) quantization = str2bool(config.get('FILE', 'quantization'))
modelSize = "medium" modelSize = "turbo"
### load the audio file in audio folder ### ### load the audio file in audio folder ###
current_dir = os.getcwd() current_dir = os.getcwd()
@ -40,4 +40,5 @@ for filename in os.listdir(audio_dir):
### transcribe ### ### transcribe ###
transcriptor = Transcriptor(audiofile, filepath, language, modelSize, access_token, voicefolder, quantization) transcriptor = Transcriptor(audiofile, filepath, language, modelSize, access_token, voicefolder, quantization)
res = transcriptor.faster_whisper() res = transcriptor.faster_whisper()
# res = transcriptor.whisper()

View File

@ -1,11 +0,0 @@
WATI (2.1 : 3.3) : See ya.
WATI (6.3 : 9.9) : dari mapan cik mau kunjungan
WATI (10.6 : 14.3) : Oh iya silahkan masuk mbak. Tak cek dulu ya cik ya.
WATI (15.3 : 17.7) : Kita hitung dulu stoknya.
WATI (18.6 : 20.2) : Terima kasih.
WATI (21.6 : 38.1) : yang layan nih pesan-pesan tok ya cik ya onan-onan hujan terus ya hujan angin ini tinggal berapa itunya apa kemarin somek udang ya somek udang sama
DWI (27.9 : 29.5) : Terima kasih.
WATI (38.7 : 40.0) : itunya sampean
WATI (44.2 : 45.7) : Minya tinggal satu lho, Ci.
WATI (46.7 : 47.7) : Kita hitung ya.
WATI (51.1 : 52.8) : Mbak helmnya diambil aja.

View File

@ -1 +1 @@
C:/Users/admin.suherdy/.cache/huggingface/hub/models--speechbrain--spkrec-ecapa-voxceleb/snapshots/0f99f2d0ebe89ac095bcc5903c4dd8f72b367286/classifier.ckpt C:/Users/suher/.cache/huggingface/hub/models--speechbrain--spkrec-ecapa-voxceleb/snapshots/0f99f2d0ebe89ac095bcc5903c4dd8f72b367286/classifier.ckpt

View File

@ -1 +1 @@
C:/Users/admin.suherdy/.cache/huggingface/hub/models--speechbrain--spkrec-ecapa-voxceleb/snapshots/0f99f2d0ebe89ac095bcc5903c4dd8f72b367286/embedding_model.ckpt C:/Users/suher/.cache/huggingface/hub/models--speechbrain--spkrec-ecapa-voxceleb/snapshots/0f99f2d0ebe89ac095bcc5903c4dd8f72b367286/embedding_model.ckpt

View File

@ -1 +1 @@
C:/Users/admin.suherdy/.cache/huggingface/hub/models--speechbrain--spkrec-ecapa-voxceleb/snapshots/0f99f2d0ebe89ac095bcc5903c4dd8f72b367286/hyperparams.yaml C:/Users/suher/.cache/huggingface/hub/models--speechbrain--spkrec-ecapa-voxceleb/snapshots/0f99f2d0ebe89ac095bcc5903c4dd8f72b367286/hyperparams.yaml

View File

@ -1 +1 @@
C:/Users/admin.suherdy/.cache/huggingface/hub/models--speechbrain--spkrec-ecapa-voxceleb/snapshots/0f99f2d0ebe89ac095bcc5903c4dd8f72b367286/label_encoder.txt C:/Users/suher/.cache/huggingface/hub/models--speechbrain--spkrec-ecapa-voxceleb/snapshots/0f99f2d0ebe89ac095bcc5903c4dd8f72b367286/label_encoder.txt

View File

@ -1 +1 @@
C:/Users/admin.suherdy/.cache/huggingface/hub/models--speechbrain--spkrec-ecapa-voxceleb/snapshots/0f99f2d0ebe89ac095bcc5903c4dd8f72b367286/mean_var_norm_emb.ckpt C:/Users/suher/.cache/huggingface/hub/models--speechbrain--spkrec-ecapa-voxceleb/snapshots/0f99f2d0ebe89ac095bcc5903c4dd8f72b367286/mean_var_norm_emb.ckpt

View File

@ -8,4 +8,7 @@ openai-whisper
streamlit streamlit
torch torch
torchaudio torchaudio
assemblyai assemblyai
soundfile
librosa
pydub

Binary file not shown.

View File

@ -0,0 +1,23 @@
from pydub import AudioSegment
import os


def boost_vocal_range(input_path, output_path=None, gain_db=5):
    """Boost the vocal range of an audio file.

    Applies a gain boost, then band-passes the signal to the typical
    speech band (300 Hz - 3400 Hz) with a high-pass and a low-pass
    filter, and exports the result as a WAV file.

    Args:
        input_path (str): Path to the input WAV file.
        output_path (str, optional): Where to write the boosted file.
            Defaults to "<cwd>/boosted_file/<basename>_boosted.wav".
        gain_db (int, optional): Gain applied before filtering, in dB.
            Defaults to 5 (the previous hard-coded value).

    Returns:
        str: Path of the exported boosted WAV file.
    """
    # Ensure the output folder exists; use os.path.join instead of the
    # previous hard-coded Windows backslashes so this works on any OS.
    folder = os.path.join(os.getcwd(), "boosted_file")
    os.makedirs(folder, exist_ok=True)

    if output_path is None:
        base = os.path.splitext(os.path.basename(input_path))[0]
        output_path = os.path.join(folder, base + "_boosted.wav")

    sound = AudioSegment.from_wav(input_path)
    sound = sound.apply_gain(gain_db)
    # 300-3400 Hz roughly covers the human voice (telephony) band.
    filtered = sound.high_pass_filter(300).low_pass_filter(3400)
    filtered.export(output_path, format="wav")
    return output_path

View File

@ -1,30 +1,30 @@
import wave import wave
import numpy as np import numpy as np
import soundfile as sf # import soundfile as sf
def resave_audio(input_file, output_file): # def resave_audio(input_file, output_file):
"""Loads an audio file and resaves it. # """Loads an audio file and resaves it.
Args: # Args:
input_file (str): Path to the input audio file. # input_file (str): Path to the input audio file.
output_file (str): Path to save the resaved audio file. # output_file (str): Path to save the resaved audio file.
""" # """
try: # try:
# Read the audio file # # Read the audio file
data, samplerate = sf.read(input_file) # data, samplerate = sf.read(input_file)
# Write the audio data to a new file # # Write the audio data to a new file
sf.write(output_file, data, samplerate) # sf.write(output_file, data, samplerate)
print(f"Successfully resaved audio from '{input_file}' to '{output_file}'") # print(f"Successfully resaved audio from '{input_file}' to '{output_file}'")
except Exception as e: # except Exception as e:
print(f"Error processing audio: {e}") # print(f"Error processing audio: {e}")
def convert_to_mono(input_wav): def convert_to_mono(input_wav):
# Resave WAV file # Resave WAV file
resave_audio(input_wav, input_wav.split('.')[0] + "_pcm.wav") # resave_audio(input_wav, input_wav.split('.')[0] + "_pcm.wav")
input_wav = input_wav.split('.')[0] + "_pcm.wav" # input_wav = input_wav.split('.')[0] + "_pcm.wav"
with wave.open(input_wav, 'rb') as input_file: with wave.open(input_wav, 'rb') as input_file:
# Get the parameters of the input file # Get the parameters of the input file

View File

@ -0,0 +1,46 @@
import soundfile as sf
import subprocess
import os
def is_pcm_wav(file_path):
    """Return True if *file_path* is an uncompressed PCM WAV file."""
    try:
        subtype = sf.info(file_path).subtype
    except RuntimeError as e:
        print(f"Error reading {file_path}: {e}")
        return False
    # libsndfile reports PCM subtypes as e.g. 'PCM_16', 'PCM_24'.
    return subtype.startswith("PCM")
def convert_to_pcm_ffmpeg(input_path, output_path=None):
    """Re-save an audio file as an uncompressed PCM WAV.

    NOTE(review): despite the historical name, this uses soundfile
    (libsndfile), not ffmpeg — the file is decoded and rewritten, which
    converts compressed WAV variants to plain PCM. The name is kept for
    callers' sake.

    Args:
        input_path (str): Path to the input audio file.
        output_path (str, optional): Path for the converted file.
            Defaults to "<cwd>/pcm_files/<basename>_pcm.wav".

    Returns:
        str: The output path. NOTE(review): returned even when the
        conversion failed, so the file may not exist — confirm callers
        handle that.
    """
    # Portable path handling instead of hard-coded Windows backslashes.
    folder = os.path.join(os.getcwd(), "pcm_files")
    os.makedirs(folder, exist_ok=True)
    if output_path is None:
        base = os.path.splitext(os.path.basename(input_path))[0]
        output_path = os.path.join(folder, base + "_pcm.wav")
    try:
        # Decode, then re-encode as PCM.
        data, samplerate = sf.read(input_path)
        sf.write(output_path, data, samplerate)
        print(f"Successfully resaved audio from '{input_path}' to '{output_path}'")
    except Exception as e:
        print(f"Error processing audio: {e}")
    return output_path
def prepare_wav_for_wave_module(input_path):
    """Return a PCM WAV path usable by the stdlib ``wave`` module.

    If the file is already PCM it is returned unchanged; otherwise it is
    converted and the converted file's path is returned.
    """
    if not is_pcm_wav(input_path):
        print(f"{input_path} is compressed. Converting to PCM...")
        return convert_to_pcm_ffmpeg(input_path)
    print(f"{input_path} is already PCM.")
    return input_path

View File

@ -10,7 +10,8 @@ from .write_log_file import (write_log_file)
from .re_encode import (re_encode) from .re_encode import (re_encode)
from .convert_to_mono import (convert_to_mono) from .convert_to_mono import (convert_to_mono)
from .convert_to_wav import (convert_to_wav) from .convert_to_wav import (convert_to_wav)
from .convert_to_pcm import (prepare_wav_for_wave_module)
from .boost_vocal_range import (boost_vocal_range)
import subprocess import subprocess
# by default use google speech-to-text API # by default use google speech-to-text API
@ -18,18 +19,21 @@ import subprocess
def core_analysis(file_name, voices_folder, log_folder, language, modelSize, ACCESS_TOKEN, model_type, quantization=False, custom_model_path=None, hf_model_id=None, aai_api_key=None): def core_analysis(file_name, voices_folder, log_folder, language, modelSize, ACCESS_TOKEN, model_type, quantization=False, custom_model_path=None, hf_model_id=None, aai_api_key=None):
# <-------------------PreProcessing file--------------------------> # <-------------------PreProcessing file-------------------------->
# convert compressed wav
# if file_name.lower().endswith(".wav"):
# subprocess.call(['ffmpeg','-i',file_name,file_name.split(".")[0]+'.wav'])
# check if file is in wav format, if not convert to wav # check if file is in wav format, if not convert to wav
file_name = convert_to_wav(file_name) file_name = convert_to_wav(file_name)
file_name = prepare_wav_for_wave_module(file_name)
# convert file to mono # convert file to mono
convert_to_mono(file_name) convert_to_mono(file_name)
# re-encode file to 16-bit PCM encoding # re-encode file to 16-bit PCM encoding
re_encode(file_name) re_encode(file_name)
# Voice Activity Detection (VAD)
# Boost vocal range
boost_vocal_range(file_name)
# <--------------------running analysis---------------------------> # <--------------------running analysis--------------------------->
speaker_tags = [] speaker_tags = []

View File

@ -0,0 +1,27 @@
from speechbrain.inference import SpeakerRecognition
import os
from pydub import AudioSegment
from collections import defaultdict
import torch

# Cache of speaker name -> embedding tensor, persisted to speaker_db.pt.
SPEAKER_DATABASE = {}

# Load the ECAPA speaker-verification model once at import time,
# on GPU when available.
_device = "cuda" if torch.cuda.is_available() else "cpu"
verification = SpeakerRecognition.from_hparams(
    run_opts={"device": _device},
    source="speechbrain/spkrec-ecapa-voxceleb",
    savedir="pretrained_models/spkrec-ecapa-voxceleb",
)


# recognize speaker name
def speaker_embedding(voices_folder, segments):
    """Build one embedding per enrolled speaker and persist the database.

    Iterates <voices_folder>/<speaker>/<voice file>, encodes each sample
    with the ECAPA model, stores the last embedding per speaker in
    SPEAKER_DATABASE, then saves the database to "speaker_db.pt".

    Args:
        voices_folder (str): Folder with one sub-folder per speaker.
        segments: Unused here. NOTE(review): kept for interface
            compatibility — confirm whether callers rely on it.
    """
    for speaker in os.listdir(voices_folder):
        for voice in os.listdir(voices_folder + "/" + speaker):
            voice_file = voices_folder + "/" + speaker + "/" + voice
            try:
                # BUG FIX: encode_batch() expects a waveform batch
                # tensor, not a file path string — load the audio first.
                signal = verification.load_audio(voice_file)
                embedding = (
                    verification.encode_batch(signal.unsqueeze(0))
                    .squeeze(0)
                    .detach()
                )
                SPEAKER_DATABASE[speaker] = embedding
            except Exception as err:
                print("error occured while speaker recognition: ", err)
    torch.save(SPEAKER_DATABASE, "speaker_db.pt")

View File

@ -13,14 +13,14 @@ config.read('config.ini')
modelpath = config.get('FILE', 'modelpath') modelpath = config.get('FILE', 'modelpath')
batchsize = int(config.get('FILE', 'batchsize')) batchsize = int(config.get('FILE', 'batchsize'))
beamsize = int(config.get('FILE', 'beamsize')) beamsize = int(config.get('FILE', 'beamsize'))
localfile = False localfile = True
def transcribe(file, language, model_size, model_type, quantization, custom_model_path, hf_model_path, aai_api_key): def transcribe(file, language, model_size, model_type, quantization, custom_model_path, hf_model_path, aai_api_key):
res = "" res = ""
if language in ["si", "Si"]: if language in ["si", "Si"]:
res = whisper_sinhala(file) res = whisper_sinhala(file)
return res return res
elif model_size in ["base", "tiny", "small", "medium", "large", "large-v1", "large-v2", "large-v3"]: elif model_size in ["base", "tiny", "small", "medium", "large", "large-v1", "large-v2", "large-v3", "turbo"]:
if model_type == "faster-whisper": if model_type == "faster-whisper":
if modelpath != "": if modelpath != "":
model_size = modelpath model_size = modelpath
@ -62,7 +62,7 @@ def transcribe(file, language, model_size, model_type, quantization, custom_mode
except Exception as err: except Exception as err:
print("an error occured while transcribing: ", err) print("an error occured while transcribing: ", err)
elif model_type == "custom": elif model_type == "custom":
custom_model_path = modelpath # custom_model_path = modelpath
model_folder = os.path.dirname(custom_model_path) model_folder = os.path.dirname(custom_model_path)
model_folder = model_folder + "/" model_folder = model_folder + "/"
print("model file: ", custom_model_path) print("model file: ", custom_model_path)