second commit with streamlit
This commit is contained in:
parent
1d722cf99b
commit
a7511e15ac
1
.gitignore
vendored
1
.gitignore
vendored
@ -3,6 +3,7 @@ build
|
|||||||
dist
|
dist
|
||||||
speechlib.egg-info
|
speechlib.egg-info
|
||||||
.env
|
.env
|
||||||
|
.venv
|
||||||
|
|
||||||
*.swp
|
*.swp
|
||||||
*.swo
|
*.swo
|
||||||
|
|||||||
@ -132,14 +132,6 @@ end: ending time of speech in seconds
|
|||||||
text: transcribed text for speech during start and end
|
text: transcribed text for speech during start and end
|
||||||
speaker: speaker of the text
|
speaker: speaker of the text
|
||||||
|
|
||||||
#### voices_folder structure:
|
|
||||||
|
|
||||||

|
|
||||||
|
|
||||||
#### Transcription:
|
|
||||||
|
|
||||||

|
|
||||||
|
|
||||||
supported language codes:
|
supported language codes:
|
||||||
|
|
||||||
```
|
```
|
||||||
|
|||||||
16
config.ini
16
config.ini
@ -1,11 +1,11 @@
|
|||||||
[FILE]
|
[FILE]
|
||||||
mode = cpu
|
|
||||||
modelpath = d:\faster-whisper-large-v3-turbo-ct2
|
modelpath = d:\faster-whisper-large-v3-turbo-ct2
|
||||||
audiofolder = \audios
|
audiofolder = \audios
|
||||||
outputfolder = D:\speechlib\output
|
outputfolder = D:\Pythoncode\speechlib\output
|
||||||
outputfolder2 = D:\speechlib\output
|
outputfolder2 = D:\Pythoncode\speechlib\output
|
||||||
outputfolder3 = D:\speechlib\output
|
outputfolder3 = D:\Pythoncode\speechlib\output
|
||||||
voicefolder = D:\speechlib\voices
|
voicefolder = D:\Pythoncode\speechlib\voices
|
||||||
beamsize = 6
|
beamsize = 5
|
||||||
batchsize = 8
|
batchsize = 4
|
||||||
accesstoken = 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
|
accesstoken = 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
|
||||||
|
quantization = False
|
||||||
9
main.py
9
main.py
@ -1,16 +1,17 @@
|
|||||||
from speechlib import Transcriptor
|
from speechlib import Transcriptor
|
||||||
from speechlib import PreProcessor
|
|
||||||
from configparser import ConfigParser
|
from configparser import ConfigParser
|
||||||
import os
|
import os
|
||||||
|
|
||||||
|
def str2bool(v):
|
||||||
|
return v.lower() in ("yes", "true", "t", "1")
|
||||||
|
|
||||||
config = ConfigParser()
|
config = ConfigParser()
|
||||||
config.read('config.ini')
|
config.read('config.ini')
|
||||||
mode = config.get('FILE', 'mode')
|
|
||||||
audiofolder = config.get('FILE', 'audiofolder')
|
audiofolder = config.get('FILE', 'audiofolder')
|
||||||
access_token = config.get('FILE', 'accesstoken')
|
access_token = config.get('FILE', 'accesstoken')
|
||||||
voicefolder = config.get('FILE', 'voicefolder')
|
voicefolder = config.get('FILE', 'voicefolder')
|
||||||
language = "id"
|
language = "id"
|
||||||
quantization = False
|
quantization = str2bool(config.get('FILE', 'quantization'))
|
||||||
modelSize = "medium"
|
modelSize = "medium"
|
||||||
|
|
||||||
### load the audio file in audio folder ###
|
### load the audio file in audio folder ###
|
||||||
@ -27,7 +28,7 @@ if not os.path.exists(output_dir):
|
|||||||
|
|
||||||
### Loop for each audio file in the audio folder ###
|
### Loop for each audio file in the audio folder ###
|
||||||
for filename in os.listdir(audio_dir):
|
for filename in os.listdir(audio_dir):
|
||||||
if filename.endswith(".mp3") or filename.endswith(".wav"):
|
if filename.lower().endswith(".mp3") or filename.lower().endswith(".wav"):
|
||||||
audiofile = os.path.join(audio_dir, filename)
|
audiofile = os.path.join(audio_dir, filename)
|
||||||
print(f"Audio file: {audiofile}")
|
print(f"Audio file: {audiofile}")
|
||||||
audiofilewithoutextension = audiofile.split(".mp3")[0].split(".wav")[0]
|
audiofilewithoutextension = audiofile.split(".mp3")[0].split(".wav")[0]
|
||||||
|
|||||||
57
main2.py
Normal file
57
main2.py
Normal file
@ -0,0 +1,57 @@
|
|||||||
|
from speechlib import Transcriptor
|
||||||
|
from configparser import ConfigParser
|
||||||
|
import os
|
||||||
|
import streamlit as st
|
||||||
|
import tempfile
|
||||||
|
|
||||||
|
os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"
|
||||||
|
|
||||||
|
def str2bool(v):
|
||||||
|
return v.lower() in ("yes", "true", "t", "1")
|
||||||
|
|
||||||
|
config = ConfigParser()
|
||||||
|
config.read('config.ini')
|
||||||
|
audiofolder = config.get('FILE', 'audiofolder')
|
||||||
|
access_token = config.get('FILE', 'accesstoken')
|
||||||
|
voicefolder = config.get('FILE', 'voicefolder')
|
||||||
|
outputfolder = config.get('FILE', 'outputfolder')
|
||||||
|
language = "id"
|
||||||
|
quantization = str2bool(config.get('FILE', 'quantization'))
|
||||||
|
modelSize = "medium"
|
||||||
|
|
||||||
|
st.title("Crew Transcription")
|
||||||
|
|
||||||
|
audio = st.file_uploader("Upload audio file (.mp3/.wav/.m4a)", type=["mp3", "wav", "m4a"])
|
||||||
|
crew_id = st.text_input("Crew ID")
|
||||||
|
crew_name = st.text_input("Crew Name")
|
||||||
|
outlet = st.selectbox('Outlet',('Icon Mall','Kediri','Ponti','Plaza Surabaya Delta','AEON','Rungkut','Darmo','Royal Plaza',
|
||||||
|
'Kusuma Bangsa','Sepanjang','Barata','Pakuwon City Mall','Tropodo','Wiyung','Citraland',
|
||||||
|
'Lotte Avenue','Pakuwon Trade Center','Rest Area Sidoarjo','Gayungsari','Wahidin','Imam Bonjol',
|
||||||
|
'BG Junction','Puri Surya Jaya','Merr','Karawaci','Pepelegi','Plaza Festival'
|
||||||
|
))
|
||||||
|
date = st.date_input("Date")
|
||||||
|
if st.button("Transcribe"):
|
||||||
|
if audio is not None and crew_id:
|
||||||
|
with st.spinner("Transcribing..."):
|
||||||
|
# Save the uploaded audio file to a temporary location
|
||||||
|
file_details = {"filename": audio.name, "filetype": audio.type, "filesize": audio.size}
|
||||||
|
print(file_details)
|
||||||
|
|
||||||
|
temp_dir = tempfile.mkdtemp()
|
||||||
|
path = os.path.join(temp_dir, audio.name)
|
||||||
|
|
||||||
|
with open(path, "wb") as f:
|
||||||
|
f.write(audio.getvalue())
|
||||||
|
print(f"Temporary file saved at: {path}")
|
||||||
|
|
||||||
|
#audiofilewithoutextension = path.split(".mp3")[0].split(".wav")[0].split(".m4a")[0]
|
||||||
|
#filepath = os.path.join(outputfolder, os.path.basename(audiofilewithoutextension).split('/')[-1]+".txt")
|
||||||
|
filepath = os.path.join(outputfolder, f"{outlet}-{crew_id}-{crew_name}-{date}-Transkrip.txt")
|
||||||
|
print(f"Output file: {filepath}")
|
||||||
|
filename = open(filepath, "w")
|
||||||
|
|
||||||
|
### transcribe ###
|
||||||
|
transcriptor = Transcriptor(path, filepath, language, modelSize, access_token, voicefolder, quantization)
|
||||||
|
res = transcriptor.faster_whisper()
|
||||||
|
print(f"Content has been written to {filepath}")
|
||||||
|
st.success(f"Transcribe successful!")
|
||||||
Binary file not shown.
1
pretrained_models/spkrec-ecapa-voxceleb/classifier.ckpt
Symbolic link
1
pretrained_models/spkrec-ecapa-voxceleb/classifier.ckpt
Symbolic link
@ -0,0 +1 @@
|
|||||||
|
C:/Users/admin.suherdy/.cache/huggingface/hub/models--speechbrain--spkrec-ecapa-voxceleb/snapshots/0f99f2d0ebe89ac095bcc5903c4dd8f72b367286/classifier.ckpt
|
||||||
Binary file not shown.
1
pretrained_models/spkrec-ecapa-voxceleb/embedding_model.ckpt
Symbolic link
1
pretrained_models/spkrec-ecapa-voxceleb/embedding_model.ckpt
Symbolic link
@ -0,0 +1 @@
|
|||||||
|
C:/Users/admin.suherdy/.cache/huggingface/hub/models--speechbrain--spkrec-ecapa-voxceleb/snapshots/0f99f2d0ebe89ac095bcc5903c4dd8f72b367286/embedding_model.ckpt
|
||||||
@ -1,58 +0,0 @@
|
|||||||
# ############################################################################
|
|
||||||
# Model: ECAPA big for Speaker verification
|
|
||||||
# ############################################################################
|
|
||||||
|
|
||||||
# Feature parameters
|
|
||||||
n_mels: 80
|
|
||||||
|
|
||||||
# Pretrain folder (HuggingFace)
|
|
||||||
pretrained_path: speechbrain/spkrec-ecapa-voxceleb
|
|
||||||
|
|
||||||
# Output parameters
|
|
||||||
out_n_neurons: 7205
|
|
||||||
|
|
||||||
# Model params
|
|
||||||
compute_features: !new:speechbrain.lobes.features.Fbank
|
|
||||||
n_mels: !ref <n_mels>
|
|
||||||
|
|
||||||
mean_var_norm: !new:speechbrain.processing.features.InputNormalization
|
|
||||||
norm_type: sentence
|
|
||||||
std_norm: False
|
|
||||||
|
|
||||||
embedding_model: !new:speechbrain.lobes.models.ECAPA_TDNN.ECAPA_TDNN
|
|
||||||
input_size: !ref <n_mels>
|
|
||||||
channels: [1024, 1024, 1024, 1024, 3072]
|
|
||||||
kernel_sizes: [5, 3, 3, 3, 1]
|
|
||||||
dilations: [1, 2, 3, 4, 1]
|
|
||||||
attention_channels: 128
|
|
||||||
lin_neurons: 192
|
|
||||||
|
|
||||||
classifier: !new:speechbrain.lobes.models.ECAPA_TDNN.Classifier
|
|
||||||
input_size: 192
|
|
||||||
out_neurons: !ref <out_n_neurons>
|
|
||||||
|
|
||||||
mean_var_norm_emb: !new:speechbrain.processing.features.InputNormalization
|
|
||||||
norm_type: global
|
|
||||||
std_norm: False
|
|
||||||
|
|
||||||
modules:
|
|
||||||
compute_features: !ref <compute_features>
|
|
||||||
mean_var_norm: !ref <mean_var_norm>
|
|
||||||
embedding_model: !ref <embedding_model>
|
|
||||||
mean_var_norm_emb: !ref <mean_var_norm_emb>
|
|
||||||
classifier: !ref <classifier>
|
|
||||||
|
|
||||||
label_encoder: !new:speechbrain.dataio.encoder.CategoricalEncoder
|
|
||||||
|
|
||||||
|
|
||||||
pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
|
|
||||||
loadables:
|
|
||||||
embedding_model: !ref <embedding_model>
|
|
||||||
mean_var_norm_emb: !ref <mean_var_norm_emb>
|
|
||||||
classifier: !ref <classifier>
|
|
||||||
label_encoder: !ref <label_encoder>
|
|
||||||
paths:
|
|
||||||
embedding_model: !ref <pretrained_path>/embedding_model.ckpt
|
|
||||||
mean_var_norm_emb: !ref <pretrained_path>/mean_var_norm_emb.ckpt
|
|
||||||
classifier: !ref <pretrained_path>/classifier.ckpt
|
|
||||||
label_encoder: !ref <pretrained_path>/label_encoder.txt
|
|
||||||
1
pretrained_models/spkrec-ecapa-voxceleb/hyperparams.yaml
Symbolic link
1
pretrained_models/spkrec-ecapa-voxceleb/hyperparams.yaml
Symbolic link
@ -0,0 +1 @@
|
|||||||
|
C:/Users/admin.suherdy/.cache/huggingface/hub/models--speechbrain--spkrec-ecapa-voxceleb/snapshots/0f99f2d0ebe89ac095bcc5903c4dd8f72b367286/hyperparams.yaml
|
||||||
File diff suppressed because it is too large
Load Diff
1
pretrained_models/spkrec-ecapa-voxceleb/label_encoder.ckpt
Symbolic link
1
pretrained_models/spkrec-ecapa-voxceleb/label_encoder.ckpt
Symbolic link
@ -0,0 +1 @@
|
|||||||
|
C:/Users/admin.suherdy/.cache/huggingface/hub/models--speechbrain--spkrec-ecapa-voxceleb/snapshots/0f99f2d0ebe89ac095bcc5903c4dd8f72b367286/label_encoder.txt
|
||||||
Binary file not shown.
1
pretrained_models/spkrec-ecapa-voxceleb/mean_var_norm_emb.ckpt
Symbolic link
1
pretrained_models/spkrec-ecapa-voxceleb/mean_var_norm_emb.ckpt
Symbolic link
@ -0,0 +1 @@
|
|||||||
|
C:/Users/admin.suherdy/.cache/huggingface/hub/models--speechbrain--spkrec-ecapa-voxceleb/snapshots/0f99f2d0ebe89ac095bcc5903c4dd8f72b367286/mean_var_norm_emb.ckpt
|
||||||
@ -5,3 +5,7 @@ speechbrain
|
|||||||
accelerate
|
accelerate
|
||||||
faster-whisper
|
faster-whisper
|
||||||
openai-whisper
|
openai-whisper
|
||||||
|
streamlit
|
||||||
|
torch
|
||||||
|
torchaudio
|
||||||
|
assemblyai
|
||||||
1
run1.bat
Normal file
1
run1.bat
Normal file
@ -0,0 +1 @@
|
|||||||
|
streamlit run main2.py --server.headless true --server.maxUploadSize 1000
|
||||||
1
run2.bat
Normal file
1
run2.bat
Normal file
@ -0,0 +1 @@
|
|||||||
|
streamlit run main3.py --server.headless true --server.maxUploadSize 1000
|
||||||
1
run3.bat
Normal file
1
run3.bat
Normal file
@ -0,0 +1 @@
|
|||||||
|
streamlit run main4.py --server.headless true --server.maxUploadSize 1000
|
||||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@ -11,12 +11,16 @@ from .re_encode import (re_encode)
|
|||||||
from .convert_to_mono import (convert_to_mono)
|
from .convert_to_mono import (convert_to_mono)
|
||||||
from .convert_to_wav import (convert_to_wav)
|
from .convert_to_wav import (convert_to_wav)
|
||||||
|
|
||||||
|
import subprocess
|
||||||
|
|
||||||
# by default use google speech-to-text API
|
# by default use google speech-to-text API
|
||||||
# if False, then use whisper finetuned version for sinhala
|
# if False, then use whisper finetuned version for sinhala
|
||||||
def core_analysis(file_name, voices_folder, log_folder, language, modelSize, ACCESS_TOKEN, model_type, quantization=False, custom_model_path=None, hf_model_id=None, aai_api_key=None):
|
def core_analysis(file_name, voices_folder, log_folder, language, modelSize, ACCESS_TOKEN, model_type, quantization=False, custom_model_path=None, hf_model_id=None, aai_api_key=None):
|
||||||
|
|
||||||
# <-------------------PreProcessing file-------------------------->
|
# <-------------------PreProcessing file-------------------------->
|
||||||
|
# convert compressed wav
|
||||||
|
# if file_name.lower().endswith(".wav"):
|
||||||
|
# subprocess.call(['ffmpeg','i',file_name,file_name.split(".")[0]+'.wav'])
|
||||||
# check if file is in wav format, if not convert to wav
|
# check if file is in wav format, if not convert to wav
|
||||||
file_name = convert_to_wav(file_name)
|
file_name = convert_to_wav(file_name)
|
||||||
|
|
||||||
@ -136,4 +140,6 @@ def core_analysis(file_name, voices_folder, log_folder, language, modelSize, ACC
|
|||||||
# writing log file
|
# writing log file
|
||||||
write_log_file(common_segments, log_folder, file_name, language)
|
write_log_file(common_segments, log_folder, file_name, language)
|
||||||
|
|
||||||
|
## TODO: clean up segments and temp folder
|
||||||
|
|
||||||
return common_segments
|
return common_segments
|
||||||
|
|||||||
@ -38,7 +38,7 @@ def speaker_recognition(file_name, voices_folder, segments, wildcards):
|
|||||||
end = segment[1] * 1000 # end time in milliseconds
|
end = segment[1] * 1000 # end time in milliseconds
|
||||||
clip = audio[start:end]
|
clip = audio[start:end]
|
||||||
i = i + 1
|
i = i + 1
|
||||||
file = folder_name + "/" + file_name.split("/")[-1].split(".")[0] + "_segment"+ str(i) + ".wav"
|
file = folder_name + "/" + file_name.split("\\")[-1].split(".")[0] + "_segment"+ str(i) + ".wav"
|
||||||
clip.export(file, format="wav")
|
clip.export(file, format="wav")
|
||||||
|
|
||||||
max_score = 0
|
max_score = 0
|
||||||
@ -69,7 +69,10 @@ def speaker_recognition(file_name, voices_folder, segments, wildcards):
|
|||||||
Id_count[person] += 1
|
Id_count[person] += 1
|
||||||
|
|
||||||
# Delete the WAV file after processing
|
# Delete the WAV file after processing
|
||||||
os.remove(file)
|
# try:
|
||||||
|
# os.remove(file)
|
||||||
|
# except OSError as e:
|
||||||
|
# print (f'Access-error on file {str(e)}')
|
||||||
|
|
||||||
current_pred = max(Id_count, key=Id_count.get)
|
current_pred = max(Id_count, key=Id_count.get)
|
||||||
|
|
||||||
|
|||||||
@ -23,7 +23,7 @@ def wav_file_segmentation(file_name, segments, language, modelSize, model_type,
|
|||||||
end = segment[1] * 1000 # end time in milliseconds
|
end = segment[1] * 1000 # end time in milliseconds
|
||||||
clip = audio[start:end]
|
clip = audio[start:end]
|
||||||
i = i + 1
|
i = i + 1
|
||||||
file = folder_name + "/" + f"segment-{file_name}"+ str(i) + ".wav"
|
file = folder_name + "/" + f"segment-{file_name.split("\\")[-1].split(".")[0]}"+ str(i) + ".wav"
|
||||||
clip.export(file, format="wav")
|
clip.export(file, format="wav")
|
||||||
|
|
||||||
try:
|
try:
|
||||||
@ -34,10 +34,10 @@ def wav_file_segmentation(file_name, segments, language, modelSize, model_type,
|
|||||||
except Exception as err:
|
except Exception as err:
|
||||||
print("ERROR while transcribing: ", err)
|
print("ERROR while transcribing: ", err)
|
||||||
# Delete the WAV file after processing
|
# Delete the WAV file after processing
|
||||||
try:
|
# try:
|
||||||
os.remove(file)
|
# os.remove(file)
|
||||||
except OSError as e:
|
# except OSError as e:
|
||||||
print (f'Access-error on file {str(e)}')
|
# print (f'Access-error on file {str(e)}')
|
||||||
|
|
||||||
|
|
||||||
return texts
|
return texts
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user