Fix bugs and add features: convert compressed WAV files to PCM and boost the vocal range

This commit is contained in:
Suherdy Yacob 2025-05-30 20:12:22 +07:00
parent 7eb80ddb73
commit a664c30eb1
31 changed files with 141 additions and 42 deletions

6
.gitignore vendored
View File

@ -12,3 +12,9 @@ speechlib.egg-info
# Override this by using 'git add -f'
*.wav
*.mp3
/output
/boosted_file
/pcm_files
/audios
/segments
/temp

BIN
ffmpeg.exe Normal file

Binary file not shown.

View File

@ -12,7 +12,7 @@ access_token = config.get('FILE', 'accesstoken')
voicefolder = config.get('FILE', 'voicefolder')
language = "id"
quantization = str2bool(config.get('FILE', 'quantization'))
modelSize = "medium"
modelSize = "turbo"
### load the audio file in audio folder ###
current_dir = os.getcwd()
@ -40,4 +40,5 @@ for filename in os.listdir(audio_dir):
### transcribe ###
transcriptor = Transcriptor(audiofile, filepath, language, modelSize, access_token, voicefolder, quantization)
res = transcriptor.faster_whisper()
# res = transcriptor.whisper()

View File

@ -1,11 +0,0 @@
WATI (2.1 : 3.3) : See ya.
WATI (6.3 : 9.9) : dari mapan cik mau kunjungan
WATI (10.6 : 14.3) : Oh iya silahkan masuk mbak. Tak cek dulu ya cik ya.
WATI (15.3 : 17.7) : Kita hitung dulu stoknya.
WATI (18.6 : 20.2) : Terima kasih.
WATI (21.6 : 38.1) : yang layan nih pesan-pesan tok ya cik ya onan-onan hujan terus ya hujan angin ini tinggal berapa itunya apa kemarin somek udang ya somek udang sama
DWI (27.9 : 29.5) : Terima kasih.
WATI (38.7 : 40.0) : itunya sampean
WATI (44.2 : 45.7) : Minya tinggal satu lho, Ci.
WATI (46.7 : 47.7) : Kita hitung ya.
WATI (51.1 : 52.8) : Mbak helmnya diambil aja.

View File

@ -1 +1 @@
C:/Users/admin.suherdy/.cache/huggingface/hub/models--speechbrain--spkrec-ecapa-voxceleb/snapshots/0f99f2d0ebe89ac095bcc5903c4dd8f72b367286/classifier.ckpt
C:/Users/suher/.cache/huggingface/hub/models--speechbrain--spkrec-ecapa-voxceleb/snapshots/0f99f2d0ebe89ac095bcc5903c4dd8f72b367286/classifier.ckpt

View File

@ -1 +1 @@
C:/Users/admin.suherdy/.cache/huggingface/hub/models--speechbrain--spkrec-ecapa-voxceleb/snapshots/0f99f2d0ebe89ac095bcc5903c4dd8f72b367286/embedding_model.ckpt
C:/Users/suher/.cache/huggingface/hub/models--speechbrain--spkrec-ecapa-voxceleb/snapshots/0f99f2d0ebe89ac095bcc5903c4dd8f72b367286/embedding_model.ckpt

View File

@ -1 +1 @@
C:/Users/admin.suherdy/.cache/huggingface/hub/models--speechbrain--spkrec-ecapa-voxceleb/snapshots/0f99f2d0ebe89ac095bcc5903c4dd8f72b367286/hyperparams.yaml
C:/Users/suher/.cache/huggingface/hub/models--speechbrain--spkrec-ecapa-voxceleb/snapshots/0f99f2d0ebe89ac095bcc5903c4dd8f72b367286/hyperparams.yaml

View File

@ -1 +1 @@
C:/Users/admin.suherdy/.cache/huggingface/hub/models--speechbrain--spkrec-ecapa-voxceleb/snapshots/0f99f2d0ebe89ac095bcc5903c4dd8f72b367286/label_encoder.txt
C:/Users/suher/.cache/huggingface/hub/models--speechbrain--spkrec-ecapa-voxceleb/snapshots/0f99f2d0ebe89ac095bcc5903c4dd8f72b367286/label_encoder.txt

View File

@ -1 +1 @@
C:/Users/admin.suherdy/.cache/huggingface/hub/models--speechbrain--spkrec-ecapa-voxceleb/snapshots/0f99f2d0ebe89ac095bcc5903c4dd8f72b367286/mean_var_norm_emb.ckpt
C:/Users/suher/.cache/huggingface/hub/models--speechbrain--spkrec-ecapa-voxceleb/snapshots/0f99f2d0ebe89ac095bcc5903c4dd8f72b367286/mean_var_norm_emb.ckpt

View File

@ -8,4 +8,7 @@ openai-whisper
streamlit
torch
torchaudio
assemblyai
assemblyai
soundfile
librosa
pydub

Binary file not shown.

View File

@ -0,0 +1,23 @@
from pydub import AudioSegment
import os
def boost_vocal_range(input_path, output_path=None):
    """Boost the vocal band of a WAV file.

    Applies a +5 dB gain, then a 300 Hz high-pass and a 3400 Hz low-pass
    filter (the classic telephone voice band) and exports the result as a
    new WAV file.

    Args:
        input_path (str): Path to the input WAV file.
        output_path (str, optional): Destination path. When omitted the
            file is written to ``<cwd>/boosted_file/<name>_boosted.wav``.

    Returns:
        str: Path of the exported, boosted WAV file.
    """
    # BUG FIX: the original built paths with hard-coded "\\" separators and
    # base.split('\\')[-1], which left the full directory path in the output
    # name on POSIX systems. os.path handles both platforms.
    folder = os.path.join(os.getcwd(), "boosted_file")
    os.makedirs(folder, exist_ok=True)
    if output_path is None:
        base, _ = os.path.splitext(os.path.basename(input_path))
        output_path = os.path.join(folder, base + "_boosted.wav")
    sound = AudioSegment.from_wav(input_path)
    sound = sound.apply_gain(5)  # Boost overall level by 5 dB
    filtered = sound.high_pass_filter(300).low_pass_filter(3400)
    filtered.export(output_path, format="wav")
    return output_path

View File

@ -1,30 +1,30 @@
import wave
import numpy as np
import soundfile as sf
# import soundfile as sf
def resave_audio(input_file, output_file):
"""Loads an audio file and resaves it.
# def resave_audio(input_file, output_file):
# """Loads an audio file and resaves it.
Args:
input_file (str): Path to the input audio file.
output_file (str): Path to save the resaved audio file.
"""
try:
# Read the audio file
data, samplerate = sf.read(input_file)
# Args:
# input_file (str): Path to the input audio file.
# output_file (str): Path to save the resaved audio file.
# """
# try:
# # Read the audio file
# data, samplerate = sf.read(input_file)
# Write the audio data to a new file
sf.write(output_file, data, samplerate)
print(f"Successfully resaved audio from '{input_file}' to '{output_file}'")
# # Write the audio data to a new file
# sf.write(output_file, data, samplerate)
# print(f"Successfully resaved audio from '{input_file}' to '{output_file}'")
except Exception as e:
print(f"Error processing audio: {e}")
# except Exception as e:
# print(f"Error processing audio: {e}")
def convert_to_mono(input_wav):
# Resave WAV file
resave_audio(input_wav, input_wav.split('.')[0] + "_pcm.wav")
input_wav = input_wav.split('.')[0] + "_pcm.wav"
# resave_audio(input_wav, input_wav.split('.')[0] + "_pcm.wav")
# input_wav = input_wav.split('.')[0] + "_pcm.wav"
with wave.open(input_wav, 'rb') as input_file:
# Get the parameters of the input file

View File

@ -0,0 +1,46 @@
import soundfile as sf
import subprocess
import os
def is_pcm_wav(file_path):
    """Return True when *file_path* is an uncompressed PCM WAV file.

    Relies on libsndfile's subtype string (e.g. ``'PCM_16'``); any file
    soundfile cannot open is reported as non-PCM.
    """
    try:
        audio_info = sf.info(file_path)
    except RuntimeError as e:
        print(f"Error reading {file_path}: {e}")
        return False
    return audio_info.subtype.startswith("PCM")
def convert_to_pcm_ffmpeg(input_path, output_path=None):
    """Re-save an audio file as an uncompressed PCM WAV.

    NOTE(review): despite the name, this implementation uses soundfile
    (libsndfile), not ffmpeg, so it only handles formats libsndfile can
    decode — confirm whether an actual ffmpeg fallback is intended.

    Args:
        input_path (str): Path to the input audio file.
        output_path (str, optional): Destination path. When omitted the
            file is written to ``<cwd>/pcm_files/<name>_pcm.wav``.

    Returns:
        str: The output path. It is returned even when the conversion
        failed (the error is only printed), preserving the original
        best-effort behaviour for callers.
    """
    # BUG FIX: the original used hard-coded "\\" separators and
    # base.split('\\')[-1], which breaks output naming on POSIX systems.
    folder = os.path.join(os.getcwd(), "pcm_files")
    os.makedirs(folder, exist_ok=True)
    if output_path is None:
        base, _ = os.path.splitext(os.path.basename(input_path))
        output_path = os.path.join(folder, base + "_pcm.wav")
    try:
        # Decode whatever the source encoding is, then write plain PCM.
        data, samplerate = sf.read(input_path)
        sf.write(output_path, data, samplerate)
        print(f"Successfully resaved audio from '{input_path}' to '{output_path}'")
    except Exception as e:
        print(f"Error processing audio: {e}")
    return output_path
def prepare_wav_for_wave_module(input_path):
    """Return a path to a PCM WAV readable by the stdlib ``wave`` module.

    Already-PCM files are returned untouched; anything else is converted
    to PCM first and the converted file's path is returned.
    """
    if not is_pcm_wav(input_path):
        print(f"{input_path} is compressed. Converting to PCM...")
        return convert_to_pcm_ffmpeg(input_path)
    print(f"{input_path} is already PCM.")
    return input_path

View File

@ -10,7 +10,8 @@ from .write_log_file import (write_log_file)
from .re_encode import (re_encode)
from .convert_to_mono import (convert_to_mono)
from .convert_to_wav import (convert_to_wav)
from .convert_to_pcm import (prepare_wav_for_wave_module)
from .boost_vocal_range import (boost_vocal_range)
import subprocess
# by default use google speech-to-text API
@ -18,18 +19,21 @@ import subprocess
def core_analysis(file_name, voices_folder, log_folder, language, modelSize, ACCESS_TOKEN, model_type, quantization=False, custom_model_path=None, hf_model_id=None, aai_api_key=None):
# <-------------------PreProcessing file-------------------------->
# convert compressed wav
# if file_name.lower().endswith(".wav"):
# subprocess.call(['ffmpeg','i',file_name,file_name.split(".")[0]+'.wav'])
# check if file is in wav format, if not convert to wav
file_name = convert_to_wav(file_name)
file_name = prepare_wav_for_wave_module(file_name)
# convert file to mono
convert_to_mono(file_name)
# re-encode file to 16-bit PCM encoding
re_encode(file_name)
# Voice Activity Detection (VAD)
# Boost vocal range
boost_vocal_range(file_name)
# <--------------------running analysis--------------------------->
speaker_tags = []

View File

@ -0,0 +1,27 @@
from speechbrain.inference import SpeakerRecognition
import os
from pydub import AudioSegment
from collections import defaultdict
import torch
# In-memory map of speaker name -> embedding tensor; filled by
# speaker_embedding() and persisted to "speaker_db.pt".
SPEAKER_DATABASE = {}
# Load the speechbrain ECAPA speaker-verification model once at import
# time, on GPU when CUDA is available, otherwise on CPU.
if torch.cuda.is_available():
    verification = SpeakerRecognition.from_hparams(run_opts={"device":"cuda"}, source="speechbrain/spkrec-ecapa-voxceleb", savedir="pretrained_models/spkrec-ecapa-voxceleb")
else:
    verification = SpeakerRecognition.from_hparams(run_opts={"device":"cpu"}, source="speechbrain/spkrec-ecapa-voxceleb", savedir="pretrained_models/spkrec-ecapa-voxceleb")
# recognize speaker name
def speaker_embedding(voices_folder, segments):
    """Build and persist speaker embeddings from reference voice samples.

    Walks ``<voices_folder>/<speaker>/<voice file>``, encodes each sample
    with the module-level ECAPA model and keeps the last embedding per
    speaker in SPEAKER_DATABASE, which is finally saved to "speaker_db.pt".

    Args:
        voices_folder (str): Folder containing one sub-folder per speaker.
        segments: Unused here; kept for interface compatibility with
            callers — TODO confirm whether it was meant to filter samples.
    """
    for speaker in os.listdir(voices_folder):
        speaker_dir = os.path.join(voices_folder, speaker)
        for voice in os.listdir(speaker_dir):
            voice_file = os.path.join(speaker_dir, voice)
            try:
                # BUG FIX: encode_batch() expects a waveform tensor, not a
                # file-path string — load the audio first with the model's
                # own helper so resampling matches the model's expectations.
                signal = verification.load_audio(voice_file)
                embedding = verification.encode_batch(signal).squeeze(0).detach()
                SPEAKER_DATABASE[speaker] = embedding
            except Exception as err:
                print("error occured while speaker recognition: ", err)
    torch.save(SPEAKER_DATABASE, "speaker_db.pt")

View File

@ -13,14 +13,14 @@ config.read('config.ini')
modelpath = config.get('FILE', 'modelpath')
batchsize = int(config.get('FILE', 'batchsize'))
beamsize = int(config.get('FILE', 'beamsize'))
localfile = False
localfile = True
def transcribe(file, language, model_size, model_type, quantization, custom_model_path, hf_model_path, aai_api_key):
res = ""
if language in ["si", "Si"]:
res = whisper_sinhala(file)
return res
elif model_size in ["base", "tiny", "small", "medium", "large", "large-v1", "large-v2", "large-v3"]:
elif model_size in ["base", "tiny", "small", "medium", "large", "large-v1", "large-v2", "large-v3", "turbo"]:
if model_type == "faster-whisper":
if modelpath != "":
model_size = modelpath
@ -62,7 +62,7 @@ def transcribe(file, language, model_size, model_type, quantization, custom_mode
except Exception as err:
print("an error occured while transcribing: ", err)
elif model_type == "custom":
custom_model_path = modelpath
# custom_model_path = modelpath
model_folder = os.path.dirname(custom_model_path)
model_folder = model_folder + "/"
print("model file: ", custom_model_path)