Fix bugs and add a feature to convert compressed WAV files and boost the vocal range

This commit is contained in:
Suherdy Yacob 2025-05-30 20:12:22 +07:00
parent 7eb80ddb73
commit a664c30eb1
31 changed files with 141 additions and 42 deletions

6
.gitignore vendored
View File

@ -12,3 +12,9 @@ speechlib.egg-info
# Override this by using 'git add -f' # Override this by using 'git add -f'
*.wav *.wav
*.mp3 *.mp3
/output
/boosted_file
/pcm_files
/audios
/segments
/temp

BIN
ffmpeg.exe Normal file

Binary file not shown.

View File

@ -12,7 +12,7 @@ access_token = config.get('FILE', 'accesstoken')
voicefolder = config.get('FILE', 'voicefolder') voicefolder = config.get('FILE', 'voicefolder')
language = "id" language = "id"
quantization = str2bool(config.get('FILE', 'quantization')) quantization = str2bool(config.get('FILE', 'quantization'))
modelSize = "medium" modelSize = "turbo"
### load the audio file in audio folder ### ### load the audio file in audio folder ###
current_dir = os.getcwd() current_dir = os.getcwd()
@ -40,4 +40,5 @@ for filename in os.listdir(audio_dir):
### transcribe ### ### transcribe ###
transcriptor = Transcriptor(audiofile, filepath, language, modelSize, access_token, voicefolder, quantization) transcriptor = Transcriptor(audiofile, filepath, language, modelSize, access_token, voicefolder, quantization)
res = transcriptor.faster_whisper() res = transcriptor.faster_whisper()
# res = transcriptor.whisper()

View File

@ -1,11 +0,0 @@
WATI (2.1 : 3.3) : See ya.
WATI (6.3 : 9.9) : dari mapan cik mau kunjungan
WATI (10.6 : 14.3) : Oh iya silahkan masuk mbak. Tak cek dulu ya cik ya.
WATI (15.3 : 17.7) : Kita hitung dulu stoknya.
WATI (18.6 : 20.2) : Terima kasih.
WATI (21.6 : 38.1) : yang layan nih pesan-pesan tok ya cik ya onan-onan hujan terus ya hujan angin ini tinggal berapa itunya apa kemarin somek udang ya somek udang sama
DWI (27.9 : 29.5) : Terima kasih.
WATI (38.7 : 40.0) : itunya sampean
WATI (44.2 : 45.7) : Minya tinggal satu lho, Ci.
WATI (46.7 : 47.7) : Kita hitung ya.
WATI (51.1 : 52.8) : Mbak helmnya diambil aja.

View File

@ -1 +1 @@
C:/Users/admin.suherdy/.cache/huggingface/hub/models--speechbrain--spkrec-ecapa-voxceleb/snapshots/0f99f2d0ebe89ac095bcc5903c4dd8f72b367286/classifier.ckpt C:/Users/suher/.cache/huggingface/hub/models--speechbrain--spkrec-ecapa-voxceleb/snapshots/0f99f2d0ebe89ac095bcc5903c4dd8f72b367286/classifier.ckpt

View File

@ -1 +1 @@
C:/Users/admin.suherdy/.cache/huggingface/hub/models--speechbrain--spkrec-ecapa-voxceleb/snapshots/0f99f2d0ebe89ac095bcc5903c4dd8f72b367286/embedding_model.ckpt C:/Users/suher/.cache/huggingface/hub/models--speechbrain--spkrec-ecapa-voxceleb/snapshots/0f99f2d0ebe89ac095bcc5903c4dd8f72b367286/embedding_model.ckpt

View File

@ -1 +1 @@
C:/Users/admin.suherdy/.cache/huggingface/hub/models--speechbrain--spkrec-ecapa-voxceleb/snapshots/0f99f2d0ebe89ac095bcc5903c4dd8f72b367286/hyperparams.yaml C:/Users/suher/.cache/huggingface/hub/models--speechbrain--spkrec-ecapa-voxceleb/snapshots/0f99f2d0ebe89ac095bcc5903c4dd8f72b367286/hyperparams.yaml

View File

@ -1 +1 @@
C:/Users/admin.suherdy/.cache/huggingface/hub/models--speechbrain--spkrec-ecapa-voxceleb/snapshots/0f99f2d0ebe89ac095bcc5903c4dd8f72b367286/label_encoder.txt C:/Users/suher/.cache/huggingface/hub/models--speechbrain--spkrec-ecapa-voxceleb/snapshots/0f99f2d0ebe89ac095bcc5903c4dd8f72b367286/label_encoder.txt

View File

@ -1 +1 @@
C:/Users/admin.suherdy/.cache/huggingface/hub/models--speechbrain--spkrec-ecapa-voxceleb/snapshots/0f99f2d0ebe89ac095bcc5903c4dd8f72b367286/mean_var_norm_emb.ckpt C:/Users/suher/.cache/huggingface/hub/models--speechbrain--spkrec-ecapa-voxceleb/snapshots/0f99f2d0ebe89ac095bcc5903c4dd8f72b367286/mean_var_norm_emb.ckpt

View File

@ -8,4 +8,7 @@ openai-whisper
streamlit streamlit
torch torch
torchaudio torchaudio
assemblyai assemblyai
soundfile
librosa
pydub

Binary file not shown.

View File

@ -0,0 +1,23 @@
from pydub import AudioSegment
import os


def boost_vocal_range(input_path, output_path=None, gain_db=5):
    """Boost the vocal range of an audio file.

    Applies a gain boost, then band-passes the signal to the typical
    speech band (300 Hz - 3400 Hz) with a high-pass and a low-pass
    filter, and exports the result as a WAV file.

    Args:
        input_path (str): Path to the input WAV file.
        output_path (str, optional): Where to write the boosted file.
            Defaults to "<cwd>/boosted_file/<basename>_boosted.wav".
        gain_db (int, optional): Gain applied before filtering, in dB.
            Defaults to 5 (the previous hard-coded value).

    Returns:
        str: Path of the exported boosted WAV file.
    """
    # Ensure the output folder exists; use os.path.join instead of the
    # previous hard-coded Windows backslashes so this works on any OS.
    folder = os.path.join(os.getcwd(), "boosted_file")
    os.makedirs(folder, exist_ok=True)

    if output_path is None:
        base = os.path.splitext(os.path.basename(input_path))[0]
        output_path = os.path.join(folder, base + "_boosted.wav")

    sound = AudioSegment.from_wav(input_path)
    sound = sound.apply_gain(gain_db)
    # 300-3400 Hz roughly covers the human voice (telephony) band.
    filtered = sound.high_pass_filter(300).low_pass_filter(3400)
    filtered.export(output_path, format="wav")
    return output_path

View File

@ -1,30 +1,30 @@
import wave import wave
import numpy as np import numpy as np
import soundfile as sf # import soundfile as sf
def resave_audio(input_file, output_file): # def resave_audio(input_file, output_file):
"""Loads an audio file and resaves it. # """Loads an audio file and resaves it.
Args: # Args:
input_file (str): Path to the input audio file. # input_file (str): Path to the input audio file.
output_file (str): Path to save the resaved audio file. # output_file (str): Path to save the resaved audio file.
""" # """
try: # try:
# Read the audio file # # Read the audio file
data, samplerate = sf.read(input_file) # data, samplerate = sf.read(input_file)
# Write the audio data to a new file # # Write the audio data to a new file
sf.write(output_file, data, samplerate) # sf.write(output_file, data, samplerate)
print(f"Successfully resaved audio from '{input_file}' to '{output_file}'") # print(f"Successfully resaved audio from '{input_file}' to '{output_file}'")
except Exception as e: # except Exception as e:
print(f"Error processing audio: {e}") # print(f"Error processing audio: {e}")
def convert_to_mono(input_wav): def convert_to_mono(input_wav):
# Resave WAV file # Resave WAV file
resave_audio(input_wav, input_wav.split('.')[0] + "_pcm.wav") # resave_audio(input_wav, input_wav.split('.')[0] + "_pcm.wav")
input_wav = input_wav.split('.')[0] + "_pcm.wav" # input_wav = input_wav.split('.')[0] + "_pcm.wav"
with wave.open(input_wav, 'rb') as input_file: with wave.open(input_wav, 'rb') as input_file:
# Get the parameters of the input file # Get the parameters of the input file

View File

@ -0,0 +1,46 @@
import soundfile as sf
import subprocess
import os
def is_pcm_wav(file_path):
    """Return True if *file_path* is an uncompressed PCM WAV file."""
    try:
        subtype = sf.info(file_path).subtype
    except RuntimeError as e:
        print(f"Error reading {file_path}: {e}")
        return False
    # libsndfile reports PCM subtypes as e.g. 'PCM_16', 'PCM_24'.
    return subtype.startswith("PCM")
def convert_to_pcm_ffmpeg(input_path, output_path=None):
    """Re-save an audio file as an uncompressed PCM WAV.

    NOTE(review): despite the historical name, this uses soundfile
    (libsndfile), not ffmpeg — the file is decoded and rewritten, which
    converts compressed WAV variants to plain PCM. The name is kept for
    callers' sake.

    Args:
        input_path (str): Path to the input audio file.
        output_path (str, optional): Path for the converted file.
            Defaults to "<cwd>/pcm_files/<basename>_pcm.wav".

    Returns:
        str: The output path. NOTE(review): returned even when the
        conversion failed, so the file may not exist — confirm callers
        handle that.
    """
    # Portable path handling instead of hard-coded Windows backslashes.
    folder = os.path.join(os.getcwd(), "pcm_files")
    os.makedirs(folder, exist_ok=True)
    if output_path is None:
        base = os.path.splitext(os.path.basename(input_path))[0]
        output_path = os.path.join(folder, base + "_pcm.wav")
    try:
        # Decode, then re-encode as PCM.
        data, samplerate = sf.read(input_path)
        sf.write(output_path, data, samplerate)
        print(f"Successfully resaved audio from '{input_path}' to '{output_path}'")
    except Exception as e:
        print(f"Error processing audio: {e}")
    return output_path
def prepare_wav_for_wave_module(input_path):
    """Return a PCM WAV path usable by the stdlib ``wave`` module.

    If the file is already PCM it is returned unchanged; otherwise it is
    converted and the converted file's path is returned.
    """
    if not is_pcm_wav(input_path):
        print(f"{input_path} is compressed. Converting to PCM...")
        return convert_to_pcm_ffmpeg(input_path)
    print(f"{input_path} is already PCM.")
    return input_path

View File

@ -10,7 +10,8 @@ from .write_log_file import (write_log_file)
from .re_encode import (re_encode) from .re_encode import (re_encode)
from .convert_to_mono import (convert_to_mono) from .convert_to_mono import (convert_to_mono)
from .convert_to_wav import (convert_to_wav) from .convert_to_wav import (convert_to_wav)
from .convert_to_pcm import (prepare_wav_for_wave_module)
from .boost_vocal_range import (boost_vocal_range)
import subprocess import subprocess
# by default use google speech-to-text API # by default use google speech-to-text API
@ -18,18 +19,21 @@ import subprocess
def core_analysis(file_name, voices_folder, log_folder, language, modelSize, ACCESS_TOKEN, model_type, quantization=False, custom_model_path=None, hf_model_id=None, aai_api_key=None): def core_analysis(file_name, voices_folder, log_folder, language, modelSize, ACCESS_TOKEN, model_type, quantization=False, custom_model_path=None, hf_model_id=None, aai_api_key=None):
# <-------------------PreProcessing file--------------------------> # <-------------------PreProcessing file-------------------------->
# convert compressed wav
# if file_name.lower().endswith(".wav"):
# subprocess.call(['ffmpeg','-i',file_name,file_name.split(".")[0]+'.wav'])
# check if file is in wav format, if not convert to wav # check if file is in wav format, if not convert to wav
file_name = convert_to_wav(file_name) file_name = convert_to_wav(file_name)
file_name = prepare_wav_for_wave_module(file_name)
# convert file to mono # convert file to mono
convert_to_mono(file_name) convert_to_mono(file_name)
# re-encode file to 16-bit PCM encoding # re-encode file to 16-bit PCM encoding
re_encode(file_name) re_encode(file_name)
# Voice Activity Detection (VAD)
# Boost vocal range
boost_vocal_range(file_name)
# <--------------------running analysis---------------------------> # <--------------------running analysis--------------------------->
speaker_tags = [] speaker_tags = []

View File

@ -0,0 +1,27 @@
from speechbrain.inference import SpeakerRecognition
import os
from pydub import AudioSegment
from collections import defaultdict
import torch

# Cache of speaker name -> embedding tensor, persisted to speaker_db.pt.
SPEAKER_DATABASE = {}

# Load the ECAPA speaker-verification model once at import time,
# on GPU when available.
_device = "cuda" if torch.cuda.is_available() else "cpu"
verification = SpeakerRecognition.from_hparams(
    run_opts={"device": _device},
    source="speechbrain/spkrec-ecapa-voxceleb",
    savedir="pretrained_models/spkrec-ecapa-voxceleb",
)


# recognize speaker name
def speaker_embedding(voices_folder, segments):
    """Build one embedding per enrolled speaker and persist the database.

    Iterates <voices_folder>/<speaker>/<voice file>, encodes each sample
    with the ECAPA model, stores the last embedding per speaker in
    SPEAKER_DATABASE, then saves the database to "speaker_db.pt".

    Args:
        voices_folder (str): Folder with one sub-folder per speaker.
        segments: Unused here. NOTE(review): kept for interface
            compatibility — confirm whether callers rely on it.
    """
    for speaker in os.listdir(voices_folder):
        for voice in os.listdir(voices_folder + "/" + speaker):
            voice_file = voices_folder + "/" + speaker + "/" + voice
            try:
                # BUG FIX: encode_batch() expects a waveform batch
                # tensor, not a file path string — load the audio first.
                signal = verification.load_audio(voice_file)
                embedding = (
                    verification.encode_batch(signal.unsqueeze(0))
                    .squeeze(0)
                    .detach()
                )
                SPEAKER_DATABASE[speaker] = embedding
            except Exception as err:
                print("error occured while speaker recognition: ", err)
    torch.save(SPEAKER_DATABASE, "speaker_db.pt")

View File

@ -13,14 +13,14 @@ config.read('config.ini')
modelpath = config.get('FILE', 'modelpath') modelpath = config.get('FILE', 'modelpath')
batchsize = int(config.get('FILE', 'batchsize')) batchsize = int(config.get('FILE', 'batchsize'))
beamsize = int(config.get('FILE', 'beamsize')) beamsize = int(config.get('FILE', 'beamsize'))
localfile = False localfile = True
def transcribe(file, language, model_size, model_type, quantization, custom_model_path, hf_model_path, aai_api_key): def transcribe(file, language, model_size, model_type, quantization, custom_model_path, hf_model_path, aai_api_key):
res = "" res = ""
if language in ["si", "Si"]: if language in ["si", "Si"]:
res = whisper_sinhala(file) res = whisper_sinhala(file)
return res return res
elif model_size in ["base", "tiny", "small", "medium", "large", "large-v1", "large-v2", "large-v3"]: elif model_size in ["base", "tiny", "small", "medium", "large", "large-v1", "large-v2", "large-v3", "turbo"]:
if model_type == "faster-whisper": if model_type == "faster-whisper":
if modelpath != "": if modelpath != "":
model_size = modelpath model_size = modelpath
@ -62,7 +62,7 @@ def transcribe(file, language, model_size, model_type, quantization, custom_mode
except Exception as err: except Exception as err:
print("an error occured while transcribing: ", err) print("an error occured while transcribing: ", err)
elif model_type == "custom": elif model_type == "custom":
custom_model_path = modelpath # custom_model_path = modelpath
model_folder = os.path.dirname(custom_model_path) model_folder = os.path.dirname(custom_model_path)
model_folder = model_folder + "/" model_folder = model_folder + "/"
print("model file: ", custom_model_path) print("model file: ", custom_model_path)