285 lines
6.5 KiB
Python
285 lines
6.5 KiB
Python
from .core_analysis import (core_analysis)
|
|
from .re_encode import (re_encode)
|
|
from .convert_to_mono import (convert_to_mono)
|
|
from .convert_to_wav import (convert_to_wav)
|
|
|
|
class Transcriptor:
|
|
|
|
def __init__(self, file, log_folder, logfile, language, modelSize, ACCESS_TOKEN, voices_folder=None, quantization=False):
|
|
'''
|
|
transcribe a wav file
|
|
|
|
arguments:
|
|
|
|
file: name of wav file with extension ex: file.wav
|
|
|
|
log_folder: name of folder where transcript will be stored
|
|
|
|
language: language of wav file
|
|
|
|
modelSize: tiny, small, medium, large, large-v1, large-v2, large-v3 (bigger model is more accurate but slow!!)
|
|
|
|
ACCESS_TOKEN: huggingface access token
|
|
|
|
voices_folder: folder containing subfolders named after each speaker with speaker voice samples in them. This will be used for speaker recognition
|
|
|
|
quantization: whether to use int8 quantization or not (default=False)
|
|
|
|
see documentation: https://github.com/Navodplayer1/speechlib
|
|
|
|
|
|
supported languages:
|
|
#### Afrikaans
|
|
"af",
|
|
#### Amharic
|
|
"am",
|
|
#### Arabic
|
|
"ar",
|
|
#### Assamese
|
|
"as",
|
|
#### Azerbaijani
|
|
"az",
|
|
#### Bashkir
|
|
"ba",
|
|
#### Belarusian
|
|
"be",
|
|
#### Bulgarian
|
|
"bg",
|
|
#### Bengali
|
|
"bn",
|
|
#### Tibetan
|
|
"bo",
|
|
#### Breton
|
|
"br",
|
|
#### Bosnian
|
|
"bs",
|
|
#### Catalan
|
|
"ca",
|
|
#### Czech
|
|
"cs",
|
|
#### Welsh
|
|
"cy",
|
|
#### Danish
|
|
"da",
|
|
#### German
|
|
"de",
|
|
#### Greek
|
|
"el",
|
|
#### English
|
|
"en",
|
|
#### Spanish
|
|
"es",
|
|
#### Estonian
|
|
"et",
|
|
#### Basque
|
|
"eu",
|
|
#### Persian
|
|
"fa",
|
|
#### Finnish
|
|
"fi",
|
|
#### Faroese
|
|
"fo",
|
|
#### French
|
|
"fr",
|
|
#### Galician
|
|
"gl",
|
|
#### Gujarati
|
|
"gu",
|
|
#### Hausa
|
|
"ha",
|
|
#### Hawaiian
|
|
"haw",
|
|
#### Hebrew
|
|
"he",
|
|
#### Hindi
|
|
"hi",
|
|
#### Croatian
|
|
"hr",
|
|
#### Haitian
|
|
"ht",
|
|
#### Hungarian
|
|
"hu",
|
|
#### Armenian
|
|
"hy",
|
|
#### Indonesian
|
|
"id",
|
|
#### Icelandic
|
|
"is",
|
|
#### Italian
|
|
"it",
|
|
#### Japanese
|
|
"ja",
|
|
#### Javanese
|
|
"jw",
|
|
#### Georgian
|
|
"ka",
|
|
#### Kazakh
|
|
"kk",
|
|
#### Khmer
|
|
"km",
|
|
#### Kannada
|
|
"kn",
|
|
#### Korean
|
|
"ko",
|
|
#### Latin
|
|
"la",
|
|
#### Luxembourgish
|
|
"lb",
|
|
#### Lingala
|
|
"ln",
|
|
#### Lao
|
|
"lo",
|
|
#### Lithuanian
|
|
"lt",
|
|
#### Latvian
|
|
"lv",
|
|
#### Malagasy
|
|
"mg",
|
|
#### Maori
|
|
"mi",
|
|
#### Macedonian
|
|
"mk",
|
|
#### Malayalam
|
|
"ml",
|
|
#### Mongolian
|
|
"mn",
|
|
#### Marathi
|
|
"mr",
|
|
#### Malay
|
|
"ms",
|
|
#### Maltese
|
|
"mt",
|
|
#### Burmese
|
|
"my",
|
|
#### Nepali
|
|
"ne",
|
|
#### Dutch
|
|
"nl",
|
|
#### Norwegian Nynorsk
|
|
"nn",
|
|
#### Norwegian
|
|
"no",
|
|
#### Occitan
|
|
"oc",
|
|
#### Punjabi
|
|
"pa",
|
|
#### Polish
|
|
"pl",
|
|
#### Pashto
|
|
"ps",
|
|
#### Portuguese
|
|
"pt",
|
|
#### Romanian
|
|
"ro",
|
|
#### Russian
|
|
"ru",
|
|
#### Sanskrit
|
|
"sa",
|
|
#### Sindhi
|
|
"sd",
|
|
#### Sinhalese
|
|
"si",
|
|
#### Slovak
|
|
"sk",
|
|
#### Slovenian
|
|
"sl",
|
|
#### Shona
|
|
"sn",
|
|
#### Somali
|
|
"so",
|
|
#### Albanian
|
|
"sq",
|
|
#### Serbian
|
|
"sr",
|
|
#### Sundanese
|
|
"su",
|
|
#### Swedish
|
|
"sv",
|
|
#### Swahili
|
|
"sw",
|
|
#### Tamil
|
|
"ta",
|
|
#### Telugu
|
|
"te",
|
|
#### Tajik
|
|
"tg",
|
|
#### Thai
|
|
"th",
|
|
#### Turkmen
|
|
"tk",
|
|
#### Tagalog
|
|
"tl",
|
|
#### Turkish
|
|
"tr",
|
|
#### Tatar
|
|
"tt",
|
|
#### Ukrainian
|
|
"uk",
|
|
#### Urdu
|
|
"ur",
|
|
#### Uzbek
|
|
"uz",
|
|
#### Vietnamese
|
|
"vi",
|
|
#### Yiddish
|
|
"yi",
|
|
#### Yoruba
|
|
"yo",
|
|
#### Chinese
|
|
"zh",
|
|
#### Cantonese
|
|
"yue",
|
|
'''
|
|
self.file = file
|
|
self.voices_folder = voices_folder
|
|
self.language = language
|
|
self.log_folder = log_folder
|
|
self.logfile = logfile
|
|
self.modelSize = modelSize
|
|
self.quantization = quantization
|
|
self.ACCESS_TOKEN = ACCESS_TOKEN
|
|
|
|
def whisper(self):
|
|
res = core_analysis(self.file, self.voices_folder, self.log_folder, self.logfile, self.language, self.modelSize, self.ACCESS_TOKEN, "whisper", self.quantization)
|
|
return res
|
|
|
|
def faster_whisper(self):
|
|
res = core_analysis(self.file, self.voices_folder, self.log_folder, self.logfile, self.language, self.modelSize, self.ACCESS_TOKEN, "faster-whisper", self.quantization)
|
|
return res
|
|
|
|
def custom_whisper(self, custom_model_path):
|
|
res = core_analysis(self.file, self.voices_folder, self.log_folder, self.logfile, self.language, self.modelSize, self.ACCESS_TOKEN, "custom", self.quantization, custom_model_path)
|
|
return res
|
|
|
|
def huggingface_model(self, hf_model_id):
|
|
res = core_analysis(self.file, self.voices_folder, self.log_folder, self.logfile, self.language, self.modelSize, self.ACCESS_TOKEN, "huggingface", self.quantization, None, hf_model_id)
|
|
return res
|
|
|
|
def assemby_ai_model(self, aai_api_key):
|
|
res = core_analysis(self.file, self.voices_folder, self.log_folder, self.logfile, self.language, self.modelSize, self.ACCESS_TOKEN, "assemblyAI", self.quantization, None, None, aai_api_key)
|
|
return res
|
|
|
|
class PreProcessor:
|
|
'''
|
|
class for preprocessing audio files.
|
|
|
|
methods:
|
|
|
|
re_encode(file) -> re-encode file to 16-bit PCM encoding
|
|
|
|
convert_to_mono(file) -> convert file from stereo to mono
|
|
|
|
mp3_to_wav(file) -> convert mp3 file to wav format
|
|
|
|
'''
|
|
|
|
def re_encode(self, file):
|
|
re_encode(file)
|
|
|
|
def convert_to_mono(self, file):
|
|
convert_to_mono(file)
|
|
|
|
def convert_to_wav(self, file):
|
|
path = convert_to_wav(file)
|
|
return path
|