fix bugs and add feature to convert compressed wav, boost vocal

2025-05-30 20:12:22 +07:00 · 2025-05-30 20:12:22 +07:00 · a664c30eb1
commit a664c30eb1
parent 7eb80ddb73
31 changed files with 141 additions and 42 deletions
--- a/.gitignore
+++ b/.gitignore
@ -12,3 +12,9 @@ speechlib.egg-info
 # Override this by using 'git add -f'
 *.wav
 *.mp3
+/output
+/boosted_file
+/pcm_files
+/audios
+/segments
+/temp
--- a/ffmpeg.exe
+++ b/ffmpeg.exe
--- a/main.py
+++ b/main.py
@ -12,7 +12,7 @@ access_token = config.get('FILE', 'accesstoken')
 voicefolder = config.get('FILE', 'voicefolder')
 language = "id"
 quantization = str2bool(config.get('FILE', 'quantization'))
-modelSize = "medium"
+modelSize = "turbo"

 ### load the audio file in audio folder ###
 current_dir = os.getcwd()
@ -40,4 +40,5 @@ for filename in os.listdir(audio_dir):
        ### transcribe ###
        transcriptor = Transcriptor(audiofile, filepath, language, modelSize, access_token, voicefolder, quantization)
        res = transcriptor.faster_whisper()
+        # res = transcriptor.whisper()
        
--- a/output/Candra
+++ b/output/Candra
--- a/mas_113350_id.txt
+++ b/mas_113350_id.txt
@ -1,11 +0,0 @@
-WATI (2.1 : 3.3) :  See ya. 
-WATI (6.3 : 9.9) :  dari mapan cik mau kunjungan 
-WATI (10.6 : 14.3) :  Oh iya silahkan masuk mbak. Tak cek dulu ya cik ya. 
-WATI (15.3 : 17.7) :  Kita hitung dulu stoknya. 
-WATI (18.6 : 20.2) :  Terima kasih. 
-WATI (21.6 : 38.1) :  yang layan nih pesan-pesan tok ya cik ya onan-onan hujan terus ya hujan angin ini tinggal berapa itunya apa kemarin somek udang ya somek udang sama 
-DWI (27.9 : 29.5) :  Terima kasih. 
-WATI (38.7 : 40.0) :  itunya sampean 
-WATI (44.2 : 45.7) :  Minya tinggal satu lho, Ci. 
-WATI (46.7 : 47.7) :  Kita hitung ya. 
-WATI (51.1 : 52.8) :  Mbak helmnya diambil aja. 
--- a/output/REC20250526103049.WAV.txt
+++ b/output/REC20250526103049.WAV.txt
--- a/pretrained_models/spkrec-ecapa-voxceleb/classifier.ckpt
+++ b/pretrained_models/spkrec-ecapa-voxceleb/classifier.ckpt
@ -1 +1 @@
-C:/Users/admin.suherdy/.cache/huggingface/hub/models--speechbrain--spkrec-ecapa-voxceleb/snapshots/0f99f2d0ebe89ac095bcc5903c4dd8f72b367286/classifier.ckpt
+C:/Users/suher/.cache/huggingface/hub/models--speechbrain--spkrec-ecapa-voxceleb/snapshots/0f99f2d0ebe89ac095bcc5903c4dd8f72b367286/classifier.ckpt
--- a/pretrained_models/spkrec-ecapa-voxceleb/embedding_model.ckpt
+++ b/pretrained_models/spkrec-ecapa-voxceleb/embedding_model.ckpt
@ -1 +1 @@
-C:/Users/admin.suherdy/.cache/huggingface/hub/models--speechbrain--spkrec-ecapa-voxceleb/snapshots/0f99f2d0ebe89ac095bcc5903c4dd8f72b367286/embedding_model.ckpt
+C:/Users/suher/.cache/huggingface/hub/models--speechbrain--spkrec-ecapa-voxceleb/snapshots/0f99f2d0ebe89ac095bcc5903c4dd8f72b367286/embedding_model.ckpt
--- a/pretrained_models/spkrec-ecapa-voxceleb/hyperparams.yaml
+++ b/pretrained_models/spkrec-ecapa-voxceleb/hyperparams.yaml
@ -1 +1 @@
-C:/Users/admin.suherdy/.cache/huggingface/hub/models--speechbrain--spkrec-ecapa-voxceleb/snapshots/0f99f2d0ebe89ac095bcc5903c4dd8f72b367286/hyperparams.yaml
+C:/Users/suher/.cache/huggingface/hub/models--speechbrain--spkrec-ecapa-voxceleb/snapshots/0f99f2d0ebe89ac095bcc5903c4dd8f72b367286/hyperparams.yaml
--- a/pretrained_models/spkrec-ecapa-voxceleb/label_encoder.ckpt
+++ b/pretrained_models/spkrec-ecapa-voxceleb/label_encoder.ckpt
@ -1 +1 @@
-C:/Users/admin.suherdy/.cache/huggingface/hub/models--speechbrain--spkrec-ecapa-voxceleb/snapshots/0f99f2d0ebe89ac095bcc5903c4dd8f72b367286/label_encoder.txt
+C:/Users/suher/.cache/huggingface/hub/models--speechbrain--spkrec-ecapa-voxceleb/snapshots/0f99f2d0ebe89ac095bcc5903c4dd8f72b367286/label_encoder.txt
--- a/pretrained_models/spkrec-ecapa-voxceleb/mean_var_norm_emb.ckpt
+++ b/pretrained_models/spkrec-ecapa-voxceleb/mean_var_norm_emb.ckpt
@ -1 +1 @@
-C:/Users/admin.suherdy/.cache/huggingface/hub/models--speechbrain--spkrec-ecapa-voxceleb/snapshots/0f99f2d0ebe89ac095bcc5903c4dd8f72b367286/mean_var_norm_emb.ckpt
+C:/Users/suher/.cache/huggingface/hub/models--speechbrain--spkrec-ecapa-voxceleb/snapshots/0f99f2d0ebe89ac095bcc5903c4dd8f72b367286/mean_var_norm_emb.ckpt
--- a/requirements.txt
+++ b/requirements.txt
@ -9,3 +9,6 @@ streamlit
 torch
 torchaudio
 assemblyai
+soundfile
+librosa
+pydub
--- a/speechlib/pycache/init.cpython-312.pyc
+++ b/speechlib/pycache/init.cpython-312.pyc
--- a/speechlib/pycache/boost_vocal_range.cpython-312.pyc
+++ b/speechlib/pycache/boost_vocal_range.cpython-312.pyc
--- a/speechlib/pycache/convert_to_mono.cpython-312.pyc
+++ b/speechlib/pycache/convert_to_mono.cpython-312.pyc
--- a/speechlib/pycache/convert_to_pcm.cpython-312.pyc
+++ b/speechlib/pycache/convert_to_pcm.cpython-312.pyc
--- a/speechlib/pycache/convert_to_wav.cpython-312.pyc
+++ b/speechlib/pycache/convert_to_wav.cpython-312.pyc
--- a/speechlib/pycache/core_analysis.cpython-312.pyc
+++ b/speechlib/pycache/core_analysis.cpython-312.pyc
--- a/speechlib/pycache/re_encode.cpython-312.pyc
+++ b/speechlib/pycache/re_encode.cpython-312.pyc
--- a/speechlib/pycache/speaker_recognition.cpython-312.pyc
+++ b/speechlib/pycache/speaker_recognition.cpython-312.pyc
--- a/speechlib/pycache/speechlib.cpython-312.pyc
+++ b/speechlib/pycache/speechlib.cpython-312.pyc
--- a/speechlib/pycache/transcribe.cpython-312.pyc
+++ b/speechlib/pycache/transcribe.cpython-312.pyc
--- a/speechlib/pycache/wav_segmenter.cpython-312.pyc
+++ b/speechlib/pycache/wav_segmenter.cpython-312.pyc
--- a/speechlib/pycache/whisper_sinhala.cpython-312.pyc
+++ b/speechlib/pycache/whisper_sinhala.cpython-312.pyc
--- a/speechlib/pycache/write_log_file.cpython-312.pyc
+++ b/speechlib/pycache/write_log_file.cpython-312.pyc
--- a/speechlib/boost_vocal_range.py
+++ b/speechlib/boost_vocal_range.py
@ -0,0 +1,23 @@
+from pydub import AudioSegment
+import os
+
+def boost_vocal_range(input_path, output_path=None):
+    """Amplify a WAV file by 5 dB and band-limit it to 300-3400 Hz.
+
+    The audio is read with pydub, a 5 dB gain is applied, and the result is
+    passed through a 300 Hz high-pass filter followed by a 3400 Hz low-pass
+    filter before being exported as WAV.
+
+    Args:
+        input_path (str): Path to the source WAV file.
+        output_path (str, optional): Destination path. When omitted, the
+            file is written to ``<cwd>/boosted_file/<input stem>_boosted.wav``.
+
+    Returns:
+        str: Path of the exported file.
+    """
+    # The default output folder is created on every call, even when the
+    # caller supplies an explicit output_path.
+    folder = os.path.join(os.getcwd(), "boosted_file")
+    os.makedirs(folder, exist_ok=True)
+
+    if output_path is None:
+        # os.path.basename follows the platform's separator rules (both "/"
+        # and "\\" on Windows), unlike splitting on a hard-coded backslash.
+        stem = os.path.splitext(os.path.basename(input_path))[0]
+        output_path = os.path.join(folder, stem + "_boosted.wav")
+
+    sound = AudioSegment.from_wav(input_path)
+    sound = sound.apply_gain(5)  # Boost by 5 dB
+    filtered = sound.high_pass_filter(300).low_pass_filter(3400)
+    filtered.export(output_path, format="wav")
+    return output_path
+
+
--- a/speechlib/convert_to_mono.py
+++ b/speechlib/convert_to_mono.py
@ -1,30 +1,30 @@
 import wave
 import numpy as np

-import soundfile as sf
+# import soundfile as sf

-def resave_audio(input_file, output_file):
-    """Loads an audio file and resaves it.
+# def resave_audio(input_file, output_file):
+#     """Loads an audio file and resaves it.

-    Args:
-        input_file (str): Path to the input audio file.
-        output_file (str): Path to save the resaved audio file.
-    """
-    try:
-        # Read the audio file
-        data, samplerate = sf.read(input_file)
+#     Args:
+#         input_file (str): Path to the input audio file.
+#         output_file (str): Path to save the resaved audio file.
+#     """
+#     try:
+#         # Read the audio file
+#         data, samplerate = sf.read(input_file)

-        # Write the audio data to a new file
-        sf.write(output_file, data, samplerate)
-        print(f"Successfully resaved audio from '{input_file}' to '{output_file}'")
+#         # Write the audio data to a new file
+#         sf.write(output_file, data, samplerate)
+#         print(f"Successfully resaved audio from '{input_file}' to '{output_file}'")

-    except Exception as e:
-        print(f"Error processing audio: {e}")
+#     except Exception as e:
+#         print(f"Error processing audio: {e}")

 def convert_to_mono(input_wav):
    # Resave WAV file
-    resave_audio(input_wav, input_wav.split('.')[0] + "_pcm.wav")
-    input_wav = input_wav.split('.')[0] + "_pcm.wav"
+    # resave_audio(input_wav, input_wav.split('.')[0] + "_pcm.wav")
+    # input_wav = input_wav.split('.')[0] + "_pcm.wav"

    with wave.open(input_wav, 'rb') as input_file:
        # Get the parameters of the input file
--- a/speechlib/convert_to_pcm.py
+++ b/speechlib/convert_to_pcm.py
@ -0,0 +1,46 @@
+import soundfile as sf
+import subprocess
+import os
+
+def is_pcm_wav(file_path):
+    """Tell whether soundfile reports a PCM subtype for ``file_path``.
+
+    Returns True when the subtype string starts with "PCM" (e.g. 'PCM_16').
+    If soundfile raises a RuntimeError while inspecting the file, the error
+    is printed and False is returned.
+    """
+    try:
+        subtype = sf.info(file_path).subtype
+    except RuntimeError as e:
+        print(f"Error reading {file_path}: {e}")
+        return False
+    return subtype.startswith("PCM")
+
+def convert_to_pcm_ffmpeg(input_path, output_path=None):
+    """Load an audio file with soundfile and re-save it.
+
+    Despite the name, no ffmpeg process is started: the file is read with
+    ``sf.read`` and written back with ``sf.write`` using soundfile's default
+    subtype for the output format.
+
+    Args:
+        input_path (str): Path to the input audio file.
+        output_path (str, optional): Path to save the re-saved audio file.
+            When omitted, the file is written to
+            ``<cwd>/pcm_files/<input stem>_pcm.wav``.
+
+    Returns:
+        str: The output path. It is returned even when reading or writing
+        fails; in that case the error is printed rather than raised, so the
+        file may not exist.
+    """
+    # The default output folder is created on every call, even when the
+    # caller supplies an explicit output_path.
+    folder = os.path.join(os.getcwd(), "pcm_files")
+    os.makedirs(folder, exist_ok=True)
+
+    if output_path is None:
+        # os.path.basename follows the platform's separator rules (both "/"
+        # and "\\" on Windows), unlike splitting on a hard-coded backslash.
+        stem = os.path.splitext(os.path.basename(input_path))[0]
+        output_path = os.path.join(folder, stem + "_pcm.wav")
+
+    try:
+        # Read the audio file
+        data, samplerate = sf.read(input_path)
+
+        # Write the audio data to a new file
+        sf.write(output_path, data, samplerate)
+        print(f"Successfully resaved audio from '{input_path}' to '{output_path}'")
+    except Exception as e:
+        # Best-effort by design: report the failure and let the caller go on.
+        print(f"Error processing audio: {e}")
+
+    return output_path
+
+def prepare_wav_for_wave_module(input_path):
+    """Return a path to use in place of ``input_path`` for PCM-only readers.
+
+    If ``is_pcm_wav`` accepts the file, the original path is returned
+    unchanged; otherwise the file is handed to ``convert_to_pcm_ffmpeg`` and
+    the path it returns is passed back.
+    """
+    already_pcm = is_pcm_wav(input_path)
+    if not already_pcm:
+        print(f"{input_path} is compressed. Converting to PCM...")
+        return convert_to_pcm_ffmpeg(input_path)
+
+    print(f"{input_path} is already PCM.")
+    return input_path
--- a/speechlib/core_analysis.py
+++ b/speechlib/core_analysis.py
@ -10,7 +10,8 @@ from .write_log_file import (write_log_file)
 from .re_encode import (re_encode)
 from .convert_to_mono import (convert_to_mono)
 from .convert_to_wav import (convert_to_wav)
-
+from .convert_to_pcm import (prepare_wav_for_wave_module)
+from .boost_vocal_range import (boost_vocal_range)
 import subprocess

 # by default use google speech-to-text API
@ -18,18 +19,21 @@ import subprocess
 def core_analysis(file_name, voices_folder, log_folder, language, modelSize, ACCESS_TOKEN, model_type, quantization=False, custom_model_path=None, hf_model_id=None, aai_api_key=None):

    # <-------------------PreProcessing file-------------------------->
-    # convert compressed wav
-    # if file_name.lower().endswith(".wav"):
-    #     subprocess.call(['ffmpeg','i',file_name,file_name.split(".")[0]+'.wav'])
    # check if file is in wav format, if not convert to wav
    file_name = convert_to_wav(file_name)

+    file_name = prepare_wav_for_wave_module(file_name)
+
    # convert file to mono
    convert_to_mono(file_name)

    # re-encode file to 16-bit PCM encoding
    re_encode(file_name)

+    # Voice Activity Detection (VAD)
+
+    # Boost vocal range
+    boost_vocal_range(file_name)
    # <--------------------running analysis--------------------------->

    speaker_tags = []
--- a/speechlib/speaker_embedding.py
+++ b/speechlib/speaker_embedding.py
@ -0,0 +1,27 @@
+from speechbrain.inference import SpeakerRecognition
+import os
+from pydub import AudioSegment
+from collections import defaultdict
+import torch
+
+# Speaker name -> embedding mapping, filled in by speaker_embedding().
+SPEAKER_DATABASE = {}
+
+# Load the pretrained speechbrain/spkrec-ecapa-voxceleb model once at import
+# time, on CUDA when it is available and on the CPU otherwise.
+verification = SpeakerRecognition.from_hparams(
+    run_opts={"device": "cuda" if torch.cuda.is_available() else "cpu"},
+    source="speechbrain/spkrec-ecapa-voxceleb",
+    savedir="pretrained_models/spkrec-ecapa-voxceleb",
+)
+
+# build and save the speaker embedding database
+def speaker_embedding(voices_folder, segments):
+    """Compute an embedding for each reference voice and save the database.
+
+    Walks ``voices_folder/<speaker>/<voice file>``, encodes every voice file
+    with the module-level ``verification`` model and stores the embedding in
+    ``SPEAKER_DATABASE`` under the speaker's folder name. When a speaker has
+    several voice files, each one overwrites the previous entry, so the last
+    successfully encoded file wins. The database is then written with
+    ``torch.save`` to ``speaker_db.pt`` (relative to the working directory).
+
+    Args:
+        voices_folder (str): Folder containing one sub-folder per speaker.
+        segments: Not used by this function.
+    """
+    for speaker in os.listdir(voices_folder):
+        speaker_dir = voices_folder + "/" + speaker
+        for voice in os.listdir(speaker_dir):
+            voice_file = speaker_dir + "/" + voice
+            try:
+                # encode_batch expects a waveform tensor, not a file path, so
+                # load the audio through the model's own loader first and add
+                # the batch dimension.
+                waveform = verification.load_audio(voice_file)
+                embedding = verification.encode_batch(waveform.unsqueeze(0)).squeeze(0).detach()
+                SPEAKER_DATABASE[speaker] = embedding
+            except Exception as err:
+                # A failing file is reported and skipped; the loop continues.
+                print("error occured while speaker recognition: ", err)
+    torch.save(SPEAKER_DATABASE, "speaker_db.pt")
--- a/speechlib/transcribe.py
+++ b/speechlib/transcribe.py
@ -13,14 +13,14 @@ config.read('config.ini')
 modelpath = config.get('FILE', 'modelpath')
 batchsize = int(config.get('FILE', 'batchsize'))
 beamsize = int(config.get('FILE', 'beamsize'))
-localfile = False
+localfile = True

 def transcribe(file, language, model_size, model_type, quantization, custom_model_path, hf_model_path, aai_api_key):
    res = ""
    if language in ["si", "Si"]:
        res = whisper_sinhala(file)
        return res
-    elif model_size in ["base", "tiny", "small", "medium", "large", "large-v1", "large-v2", "large-v3"]:
+    elif model_size in ["base", "tiny", "small", "medium", "large", "large-v1", "large-v2", "large-v3", "turbo"]:
        if model_type == "faster-whisper":
            if modelpath != "":
                model_size = modelpath
@ -62,7 +62,7 @@ def transcribe(file, language, model_size, model_type, quantization, custom_mode
            except Exception as err:
                print("an error occured while transcribing: ", err)
        elif model_type == "custom":
-            custom_model_path = modelpath
+            # custom_model_path = modelpath
            model_folder = os.path.dirname(custom_model_path)
            model_folder = model_folder + "/"
            print("model file: ", custom_model_path)