speechlib/speechlib/speechlib.py

285 lines
6.5 KiB
Python

from .core_analysis import (core_analysis)
from .re_encode import (re_encode)
from .convert_to_mono import (convert_to_mono)
from .convert_to_wav import (convert_to_wav)
class Transcriptor:
    '''
    Transcribe a wav file with one of several speech-to-text backends.

    Each public method forwards the settings stored by __init__ to
    core_analysis together with a backend selector string, and returns
    whatever core_analysis returns.
    '''

    def __init__(self, file, log_folder, logfile, language, modelSize, ACCESS_TOKEN, voices_folder=None, quantization=False):
        '''
        transcribe a wav file

        arguments:

        file: name of wav file with extension ex: file.wav

        log_folder: name of folder where transcript will be stored

        logfile: forwarded unchanged to core_analysis (presumably the name of
        the log/transcript file inside log_folder -- TODO confirm against core_analysis)

        language: language of wav file

        modelSize: tiny, small, medium, large, large-v1, large-v2, large-v3 (bigger model is more accurate but slow!!)

        ACCESS_TOKEN: huggingface access token

        voices_folder: folder containing subfolders named after each speaker with speaker voice samples in them. This will be used for speaker recognition

        quantization: whether to use int8 quantization or not (default=False)

        see documentation: https://github.com/Navodplayer1/speechlib

        supported languages (code -> name):

        "af" (Afrikaans), "am" (Amharic), "ar" (Arabic), "as" (Assamese),
        "az" (Azerbaijani), "ba" (Bashkir), "be" (Belarusian), "bg" (Bulgarian),
        "bn" (Bengali), "bo" (Tibetan), "br" (Breton), "bs" (Bosnian),
        "ca" (Catalan), "cs" (Czech), "cy" (Welsh), "da" (Danish),
        "de" (German), "el" (Greek), "en" (English), "es" (Spanish),
        "et" (Estonian), "eu" (Basque), "fa" (Persian), "fi" (Finnish),
        "fo" (Faroese), "fr" (French), "gl" (Galician), "gu" (Gujarati),
        "ha" (Hausa), "haw" (Hawaiian), "he" (Hebrew), "hi" (Hindi),
        "hr" (Croatian), "ht" (Haitian), "hu" (Hungarian), "hy" (Armenian),
        "id" (Indonesian), "is" (Icelandic), "it" (Italian), "ja" (Japanese),
        "jw" (Javanese), "ka" (Georgian), "kk" (Kazakh), "km" (Khmer),
        "kn" (Kannada), "ko" (Korean), "la" (Latin), "lb" (Luxembourgish),
        "ln" (Lingala), "lo" (Lao), "lt" (Lithuanian), "lv" (Latvian),
        "mg" (Malagasy), "mi" (Maori), "mk" (Macedonian), "ml" (Malayalam),
        "mn" (Mongolian), "mr" (Marathi), "ms" (Malay), "mt" (Maltese),
        "my" (Burmese), "ne" (Nepali), "nl" (Dutch), "nn" (Norwegian Nynorsk),
        "no" (Norwegian), "oc" (Occitan), "pa" (Punjabi), "pl" (Polish),
        "ps" (Pashto), "pt" (Portuguese), "ro" (Romanian), "ru" (Russian),
        "sa" (Sanskrit), "sd" (Sindhi), "si" (Sinhalese), "sk" (Slovak),
        "sl" (Slovenian), "sn" (Shona), "so" (Somali), "sq" (Albanian),
        "sr" (Serbian), "su" (Sundanese), "sv" (Swedish), "sw" (Swahili),
        "ta" (Tamil), "te" (Telugu), "tg" (Tajik), "th" (Thai),
        "tk" (Turkmen), "tl" (Tagalog), "tr" (Turkish), "tt" (Tatar),
        "uk" (Ukrainian), "ur" (Urdu), "uz" (Uzbek), "vi" (Vietnamese),
        "yi" (Yiddish), "yo" (Yoruba), "zh" (Chinese), "yue" (Cantonese)
        '''
        self.file = file
        self.voices_folder = voices_folder
        self.language = language
        self.log_folder = log_folder
        self.logfile = logfile
        self.modelSize = modelSize
        self.quantization = quantization
        self.ACCESS_TOKEN = ACCESS_TOKEN

    def _run(self, model_type, *extra):
        '''
        Call core_analysis with the stored settings and return its result.

        model_type: backend selector string passed to core_analysis
        extra: optional positional arguments appended after quantization,
        in the order core_analysis expects them
        '''
        # Arguments stay positional on purpose: the parameter names of
        # core_analysis are not visible from this module.
        return core_analysis(
            self.file,
            self.voices_folder,
            self.log_folder,
            self.logfile,
            self.language,
            self.modelSize,
            self.ACCESS_TOKEN,
            model_type,
            self.quantization,
            *extra
        )

    def whisper(self):
        '''Run core_analysis with the "whisper" backend selector.'''
        return self._run("whisper")

    def faster_whisper(self):
        '''Run core_analysis with the "faster-whisper" backend selector.'''
        return self._run("faster-whisper")

    def custom_whisper(self, custom_model_path):
        '''
        Run core_analysis with the "custom" backend selector.

        custom_model_path: forwarded to core_analysis as the first argument after quantization
        '''
        return self._run("custom", custom_model_path)

    def huggingface_model(self, hf_model_id):
        '''
        Run core_analysis with the "huggingface" backend selector.

        hf_model_id: forwarded to core_analysis; the custom-model slot before it is passed as None
        '''
        return self._run("huggingface", None, hf_model_id)

    def assembly_ai_model(self, aai_api_key):
        '''
        Run core_analysis with the "assemblyAI" backend selector.

        aai_api_key: forwarded to core_analysis; the two slots before it are passed as None
        '''
        return self._run("assemblyAI", None, None, aai_api_key)

    def assemby_ai_model(self, aai_api_key):
        '''
        Misspelled original name of assembly_ai_model, kept so existing callers keep working.
        '''
        return self.assembly_ai_model(aai_api_key)
class PreProcessor:
    '''
    class for preprocessing audio files.

    methods:

    re_encode(file) -> re-encode file to 16-bit PCM encoding

    convert_to_mono(file) -> convert file from stereo to mono

    convert_to_wav(file) -> convert file (e.g. mp3) to wav format and return the converter's result
    '''

    def re_encode(self, file):
        '''Pass file to the module-level re_encode helper; returns None.'''
        # Inside the method body the bare name resolves to the imported
        # module-level function, not to this method.
        re_encode(file)

    def convert_to_mono(self, file):
        '''Pass file to the module-level convert_to_mono helper; returns None.'''
        convert_to_mono(file)

    def convert_to_wav(self, file):
        '''
        Pass file to the module-level convert_to_wav helper and return its result.

        The result is presumably the path of the produced wav file -- TODO confirm against convert_to_wav.
        '''
        return convert_to_wav(file)