diff --git a/.gitignore b/.gitignore index 12917b8..c7fb613 100644 --- a/.gitignore +++ b/.gitignore @@ -12,3 +12,9 @@ speechlib.egg-info # Override this by using 'git add -f' *.wav *.mp3 +/output +/boosted_file +/pcm_files +/audios +/segments +/temp \ No newline at end of file diff --git a/ffmpeg.exe b/ffmpeg.exe new file mode 100644 index 0000000..3c550de Binary files /dev/null and b/ffmpeg.exe differ diff --git a/main.py b/main.py index 8600bcd..28db92c 100644 --- a/main.py +++ b/main.py @@ -12,7 +12,7 @@ access_token = config.get('FILE', 'accesstoken') voicefolder = config.get('FILE', 'voicefolder') language = "id" quantization = str2bool(config.get('FILE', 'quantization')) -modelSize = "medium" +modelSize = "turbo" ### load the audio file in audio folder ### current_dir = os.getcwd() @@ -40,4 +40,5 @@ for filename in os.listdir(audio_dir): ### transcribe ### transcriptor = Transcriptor(audiofile, filepath, language, modelSize, access_token, voicefolder, quantization) res = transcriptor.faster_whisper() + # res = transcriptor.whisper() \ No newline at end of file diff --git a/output/Candra mas.MP3.txt b/output/Candra mas.MP3.txt deleted file mode 100644 index e69de29..0000000 diff --git a/output/Candra mas_113350_id.txt b/output/Candra mas_113350_id.txt deleted file mode 100644 index 57726aa..0000000 --- a/output/Candra mas_113350_id.txt +++ /dev/null @@ -1,11 +0,0 @@ -WATI (2.1 : 3.3) : See ya. -WATI (6.3 : 9.9) : dari mapan cik mau kunjungan -WATI (10.6 : 14.3) : Oh iya silahkan masuk mbak. Tak cek dulu ya cik ya. -WATI (15.3 : 17.7) : Kita hitung dulu stoknya. -WATI (18.6 : 20.2) : Terima kasih. -WATI (21.6 : 38.1) : yang layan nih pesan-pesan tok ya cik ya onan-onan hujan terus ya hujan angin ini tinggal berapa itunya apa kemarin somek udang ya somek udang sama -DWI (27.9 : 29.5) : Terima kasih. -WATI (38.7 : 40.0) : itunya sampean -WATI (44.2 : 45.7) : Minya tinggal satu lho, Ci. -WATI (46.7 : 47.7) : Kita hitung ya. -WATI (51.1 : 52.8) : Mbak helmnya diambil aja. diff --git a/output/REC20250526103049.WAV.txt b/output/REC20250526103049.WAV.txt deleted file mode 100644 index e69de29..0000000 diff --git a/pretrained_models/spkrec-ecapa-voxceleb/classifier.ckpt b/pretrained_models/spkrec-ecapa-voxceleb/classifier.ckpt index 45cdcdc..dc7ecac 120000 --- a/pretrained_models/spkrec-ecapa-voxceleb/classifier.ckpt +++ b/pretrained_models/spkrec-ecapa-voxceleb/classifier.ckpt @@ -1 +1 @@ -C:/Users/admin.suherdy/.cache/huggingface/hub/models--speechbrain--spkrec-ecapa-voxceleb/snapshots/0f99f2d0ebe89ac095bcc5903c4dd8f72b367286/classifier.ckpt \ No newline at end of file +C:/Users/suher/.cache/huggingface/hub/models--speechbrain--spkrec-ecapa-voxceleb/snapshots/0f99f2d0ebe89ac095bcc5903c4dd8f72b367286/classifier.ckpt \ No newline at end of file diff --git a/pretrained_models/spkrec-ecapa-voxceleb/embedding_model.ckpt b/pretrained_models/spkrec-ecapa-voxceleb/embedding_model.ckpt index 58374ec..aa8ffd1 120000 --- a/pretrained_models/spkrec-ecapa-voxceleb/embedding_model.ckpt +++ b/pretrained_models/spkrec-ecapa-voxceleb/embedding_model.ckpt @@ -1 +1 @@ -C:/Users/admin.suherdy/.cache/huggingface/hub/models--speechbrain--spkrec-ecapa-voxceleb/snapshots/0f99f2d0ebe89ac095bcc5903c4dd8f72b367286/embedding_model.ckpt \ No newline at end of file +C:/Users/suher/.cache/huggingface/hub/models--speechbrain--spkrec-ecapa-voxceleb/snapshots/0f99f2d0ebe89ac095bcc5903c4dd8f72b367286/embedding_model.ckpt \ No newline at end of file diff --git a/pretrained_models/spkrec-ecapa-voxceleb/hyperparams.yaml b/pretrained_models/spkrec-ecapa-voxceleb/hyperparams.yaml index 2e87164..c7dad24 120000 --- a/pretrained_models/spkrec-ecapa-voxceleb/hyperparams.yaml +++ b/pretrained_models/spkrec-ecapa-voxceleb/hyperparams.yaml @@ -1 +1 @@ -C:/Users/admin.suherdy/.cache/huggingface/hub/models--speechbrain--spkrec-ecapa-voxceleb/snapshots/0f99f2d0ebe89ac095bcc5903c4dd8f72b367286/hyperparams.yaml \ No newline at end of file +C:/Users/suher/.cache/huggingface/hub/models--speechbrain--spkrec-ecapa-voxceleb/snapshots/0f99f2d0ebe89ac095bcc5903c4dd8f72b367286/hyperparams.yaml \ No newline at end of file diff --git a/pretrained_models/spkrec-ecapa-voxceleb/label_encoder.ckpt b/pretrained_models/spkrec-ecapa-voxceleb/label_encoder.ckpt index faa4f12..666e7c5 120000 --- a/pretrained_models/spkrec-ecapa-voxceleb/label_encoder.ckpt +++ b/pretrained_models/spkrec-ecapa-voxceleb/label_encoder.ckpt @@ -1 +1 @@ -C:/Users/admin.suherdy/.cache/huggingface/hub/models--speechbrain--spkrec-ecapa-voxceleb/snapshots/0f99f2d0ebe89ac095bcc5903c4dd8f72b367286/label_encoder.txt \ No newline at end of file +C:/Users/suher/.cache/huggingface/hub/models--speechbrain--spkrec-ecapa-voxceleb/snapshots/0f99f2d0ebe89ac095bcc5903c4dd8f72b367286/label_encoder.txt \ No newline at end of file diff --git a/pretrained_models/spkrec-ecapa-voxceleb/mean_var_norm_emb.ckpt b/pretrained_models/spkrec-ecapa-voxceleb/mean_var_norm_emb.ckpt index 701ebe0..a7a8aec 120000 --- a/pretrained_models/spkrec-ecapa-voxceleb/mean_var_norm_emb.ckpt +++ b/pretrained_models/spkrec-ecapa-voxceleb/mean_var_norm_emb.ckpt @@ -1 +1 @@ -C:/Users/admin.suherdy/.cache/huggingface/hub/models--speechbrain--spkrec-ecapa-voxceleb/snapshots/0f99f2d0ebe89ac095bcc5903c4dd8f72b367286/mean_var_norm_emb.ckpt \ No newline at end of file +C:/Users/suher/.cache/huggingface/hub/models--speechbrain--spkrec-ecapa-voxceleb/snapshots/0f99f2d0ebe89ac095bcc5903c4dd8f72b367286/mean_var_norm_emb.ckpt \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index a00a094..518cc22 100644 --- a/requirements.txt +++ b/requirements.txt @@ -8,4 +8,7 @@ openai-whisper streamlit torch torchaudio -assemblyai \ No newline at end of file +assemblyai +soundfile +librosa +pydub \ No newline at end of file diff --git a/speechlib/__pycache__/__init__.cpython-312.pyc b/speechlib/__pycache__/__init__.cpython-312.pyc index b0c8e35..4aa9996 100644 Binary files a/speechlib/__pycache__/__init__.cpython-312.pyc and b/speechlib/__pycache__/__init__.cpython-312.pyc differ diff --git a/speechlib/__pycache__/boost_vocal_range.cpython-312.pyc b/speechlib/__pycache__/boost_vocal_range.cpython-312.pyc new file mode 100644 index 0000000..682a3e1 Binary files /dev/null and b/speechlib/__pycache__/boost_vocal_range.cpython-312.pyc differ diff --git a/speechlib/__pycache__/convert_to_mono.cpython-312.pyc b/speechlib/__pycache__/convert_to_mono.cpython-312.pyc index d8e2e4d..9937d18 100644 Binary files a/speechlib/__pycache__/convert_to_mono.cpython-312.pyc and b/speechlib/__pycache__/convert_to_mono.cpython-312.pyc differ diff --git a/speechlib/__pycache__/convert_to_pcm.cpython-312.pyc b/speechlib/__pycache__/convert_to_pcm.cpython-312.pyc new file mode 100644 index 0000000..6c6cd53 Binary files /dev/null and b/speechlib/__pycache__/convert_to_pcm.cpython-312.pyc differ diff --git a/speechlib/__pycache__/convert_to_wav.cpython-312.pyc b/speechlib/__pycache__/convert_to_wav.cpython-312.pyc index b76c8cb..4d59d64 100644 Binary files a/speechlib/__pycache__/convert_to_wav.cpython-312.pyc and b/speechlib/__pycache__/convert_to_wav.cpython-312.pyc differ diff --git a/speechlib/__pycache__/core_analysis.cpython-312.pyc b/speechlib/__pycache__/core_analysis.cpython-312.pyc index 47d3870..2e8f374 100644 Binary files a/speechlib/__pycache__/core_analysis.cpython-312.pyc and b/speechlib/__pycache__/core_analysis.cpython-312.pyc differ diff --git a/speechlib/__pycache__/re_encode.cpython-312.pyc b/speechlib/__pycache__/re_encode.cpython-312.pyc index 94f3bb4..ab88631 100644 Binary files a/speechlib/__pycache__/re_encode.cpython-312.pyc and b/speechlib/__pycache__/re_encode.cpython-312.pyc differ diff --git a/speechlib/__pycache__/speaker_recognition.cpython-312.pyc b/speechlib/__pycache__/speaker_recognition.cpython-312.pyc index 0e5b397..9334585 100644 Binary files a/speechlib/__pycache__/speaker_recognition.cpython-312.pyc and b/speechlib/__pycache__/speaker_recognition.cpython-312.pyc differ diff --git a/speechlib/__pycache__/speechlib.cpython-312.pyc b/speechlib/__pycache__/speechlib.cpython-312.pyc index a2da075..127a78a 100644 Binary files a/speechlib/__pycache__/speechlib.cpython-312.pyc and b/speechlib/__pycache__/speechlib.cpython-312.pyc differ diff --git a/speechlib/__pycache__/transcribe.cpython-312.pyc b/speechlib/__pycache__/transcribe.cpython-312.pyc index 1d47d52..de21f8b 100644 Binary files a/speechlib/__pycache__/transcribe.cpython-312.pyc and b/speechlib/__pycache__/transcribe.cpython-312.pyc differ diff --git a/speechlib/__pycache__/wav_segmenter.cpython-312.pyc b/speechlib/__pycache__/wav_segmenter.cpython-312.pyc index fb1e46b..53af939 100644 Binary files a/speechlib/__pycache__/wav_segmenter.cpython-312.pyc and b/speechlib/__pycache__/wav_segmenter.cpython-312.pyc differ diff --git a/speechlib/__pycache__/whisper_sinhala.cpython-312.pyc b/speechlib/__pycache__/whisper_sinhala.cpython-312.pyc index 4279f8b..bb377ea 100644 Binary files a/speechlib/__pycache__/whisper_sinhala.cpython-312.pyc and b/speechlib/__pycache__/whisper_sinhala.cpython-312.pyc differ diff --git a/speechlib/__pycache__/write_log_file.cpython-312.pyc b/speechlib/__pycache__/write_log_file.cpython-312.pyc index 5ae8d00..df95ed8 100644 Binary files a/speechlib/__pycache__/write_log_file.cpython-312.pyc and b/speechlib/__pycache__/write_log_file.cpython-312.pyc differ diff --git a/speechlib/boost_vocal_range.py b/speechlib/boost_vocal_range.py new file mode 100644 index 0000000..11a8da9 --- /dev/null +++ b/speechlib/boost_vocal_range.py @@ -0,0 +1,23 @@ +from pydub import AudioSegment +import os + +def boost_vocal_range(input_path, output_path=None): + """ + Boost the vocal range of an audio file by applying a high-pass and low-pass filter. + The function reads an audio file, applies the filters, and exports the modified audio. + """ + folder = os.getcwd() + "\\boosted_file\\" + if not os.path.exists(folder): + os.makedirs(folder) + + if output_path is None: + base, _ = os.path.splitext(input_path) + output_path = folder + base.split('\\')[-1] + "_boosted.wav" + + sound = AudioSegment.from_wav(input_path) + sound = sound.apply_gain(5) # Boost by 5 dB + filtered = sound.high_pass_filter(300).low_pass_filter(3400) + filtered.export(output_path, format="wav") + return output_path + + diff --git a/speechlib/convert_to_mono.py b/speechlib/convert_to_mono.py index f61a25c..4df637b 100644 --- a/speechlib/convert_to_mono.py +++ b/speechlib/convert_to_mono.py @@ -1,30 +1,30 @@ import wave import numpy as np -import soundfile as sf +# import soundfile as sf -def resave_audio(input_file, output_file): - """Loads an audio file and resaves it. +# def resave_audio(input_file, output_file): +# """Loads an audio file and resaves it. - Args: - input_file (str): Path to the input audio file. - output_file (str): Path to save the resaved audio file. - """ - try: - # Read the audio file - data, samplerate = sf.read(input_file) +# Args: +# input_file (str): Path to the input audio file. +# output_file (str): Path to save the resaved audio file. +# """ +# try: +# # Read the audio file +# data, samplerate = sf.read(input_file) - # Write the audio data to a new file - sf.write(output_file, data, samplerate) - print(f"Successfully resaved audio from '{input_file}' to '{output_file}'") +# # Write the audio data to a new file +# sf.write(output_file, data, samplerate) +# print(f"Successfully resaved audio from '{input_file}' to '{output_file}'") - except Exception as e: - print(f"Error processing audio: {e}") +# except Exception as e: +# print(f"Error processing audio: {e}") def convert_to_mono(input_wav): # Resave WAV file - resave_audio(input_wav, input_wav.split('.')[0] + "_pcm.wav") - input_wav = input_wav.split('.')[0] + "_pcm.wav" + # resave_audio(input_wav, input_wav.split('.')[0] + "_pcm.wav") + # input_wav = input_wav.split('.')[0] + "_pcm.wav" with wave.open(input_wav, 'rb') as input_file: # Get the parameters of the input file diff --git a/speechlib/convert_to_pcm.py b/speechlib/convert_to_pcm.py new file mode 100644 index 0000000..1550385 --- /dev/null +++ b/speechlib/convert_to_pcm.py @@ -0,0 +1,46 @@ +import soundfile as sf +import subprocess +import os + +def is_pcm_wav(file_path): + try: + info = sf.info(file_path) + return info.subtype.startswith("PCM") # e.g., 'PCM_16' + except RuntimeError as e: + print(f"Error reading {file_path}: {e}") + return False + +def convert_to_pcm_ffmpeg(input_path, output_path=None): + folder = os.getcwd() + "\\pcm_files\\" + if not os.path.exists(folder): + os.makedirs(folder) + + if output_path is None: + base, _ = os.path.splitext(input_path) + output_path = folder + base.split('\\')[-1] + "_pcm.wav" + + """Loads an audio file and resaves it. + + Args: + input_file (str): Path to the input audio file. + output_file (str): Path to save the resaved audio file. + """ + try: + # Read the audio file + data, samplerate = sf.read(input_path) + + # Write the audio data to a new file + sf.write(output_path, data, samplerate) + print(f"Successfully resaved audio from '{input_path}' to '{output_path}'") + except Exception as e: + print(f"Error processing audio: {e}") + + return output_path + +def prepare_wav_for_wave_module(input_path): + if is_pcm_wav(input_path): + print(f"{input_path} is already PCM.") + return input_path + else: + print(f"{input_path} is compressed. Converting to PCM...") + return convert_to_pcm_ffmpeg(input_path) diff --git a/speechlib/core_analysis.py b/speechlib/core_analysis.py index fc578fa..e691890 100644 --- a/speechlib/core_analysis.py +++ b/speechlib/core_analysis.py @@ -10,7 +10,8 @@ from .write_log_file import (write_log_file) from .re_encode import (re_encode) from .convert_to_mono import (convert_to_mono) from .convert_to_wav import (convert_to_wav) - +from .convert_to_pcm import (prepare_wav_for_wave_module) +from .boost_vocal_range import (boost_vocal_range) import subprocess # by default use google speech-to-text API @@ -18,18 +19,21 @@ import subprocess def core_analysis(file_name, voices_folder, log_folder, language, modelSize, ACCESS_TOKEN, model_type, quantization=False, custom_model_path=None, hf_model_id=None, aai_api_key=None): # <-------------------PreProcessing file--------------------------> - # convert compressed wav - # if file_name.lower().endswith(".wav"): - # subprocess.call(['ffmpeg','i',file_name,file_name.split(".")[0]+'.wav']) # check if file is in wav format, if not convert to wav file_name = convert_to_wav(file_name) + file_name = prepare_wav_for_wave_module(file_name) + # convert file to mono convert_to_mono(file_name) # re-encode file to 16-bit PCM encoding re_encode(file_name) + # Voice Activity Detection (VAD) + + # Boost vocal range + boost_vocal_range(file_name) # <--------------------running analysis---------------------------> speaker_tags = [] diff --git a/speechlib/speaker_embedding.py b/speechlib/speaker_embedding.py new file mode 100644 index 0000000..63c3127 --- /dev/null +++ b/speechlib/speaker_embedding.py @@ -0,0 +1,27 @@ +from speechbrain.inference import SpeakerRecognition +import os +from pydub import AudioSegment +from collections import defaultdict +import torch + +SPEAKER_DATABASE = {} + +if torch.cuda.is_available(): + verification = SpeakerRecognition.from_hparams(run_opts={"device":"cuda"}, source="speechbrain/spkrec-ecapa-voxceleb", savedir="pretrained_models/spkrec-ecapa-voxceleb") +else: + verification = SpeakerRecognition.from_hparams(run_opts={"device":"cpu"}, source="speechbrain/spkrec-ecapa-voxceleb", savedir="pretrained_models/spkrec-ecapa-voxceleb") + +# recognize speaker name +def speaker_embedding(voices_folder, segments): + speakers = os.listdir(voices_folder) + for speaker in speakers: + voices = os.listdir(voices_folder + "/" + speaker) + for voice in voices: + voice_file = voices_folder + "/" + speaker + "/" + voice + try: + # compare voice file with audio file + embedding = verification.encode_batch(voice_file).squeeze(0).detach() + SPEAKER_DATABASE[speaker] = embedding + except Exception as err: + print("error occured while speaker recognition: ", err) + torch.save(SPEAKER_DATABASE, "speaker_db.pt") \ No newline at end of file diff --git a/speechlib/transcribe.py b/speechlib/transcribe.py index 3dea5ae..28cc744 100644 --- a/speechlib/transcribe.py +++ b/speechlib/transcribe.py @@ -13,14 +13,14 @@ config.read('config.ini') modelpath = config.get('FILE', 'modelpath') batchsize = int(config.get('FILE', 'batchsize')) beamsize = int(config.get('FILE', 'beamsize')) -localfile = False +localfile = True def transcribe(file, language, model_size, model_type, quantization, custom_model_path, hf_model_path, aai_api_key): res = "" if language in ["si", "Si"]: res = whisper_sinhala(file) return res - elif model_size in ["base", "tiny", "small", "medium", "large", "large-v1", "large-v2", "large-v3"]: + elif model_size in ["base", "tiny", "small", "medium", "large", "large-v1", "large-v2", "large-v3", "turbo"]: if model_type == "faster-whisper": if modelpath != "": model_size = modelpath @@ -62,7 +62,7 @@ def transcribe(file, language, model_size, model_type, quantization, custom_mode except Exception as err: print("an error occured while transcribing: ", err) elif model_type == "custom": - custom_model_path = modelpath + # custom_model_path = modelpath model_folder = os.path.dirname(custom_model_path) model_folder = model_folder + "/" print("model file: ", custom_model_path)