From 3867f6172e2789da5ff3df275a09eedb014a8b27 Mon Sep 17 00:00:00 2001
From: alikia2x
Date: Sun, 20 Oct 2024 03:36:48 +0800
Subject: [PATCH] ref: re-organize project structure

---
 .gitignore                              | 169 ++++++++++++++++++++++++
 README.md                               |  36 +----
 UModel.py                               |   6 +-
 cafe.wav => assets/cafe.wav             | Bin
 perlin.wav => assets/perlin.wav         | Bin
 radionoise.wav => assets/radionoise.wav | Bin
 silence.wav => assets/silence.wav       | Bin
 white.wav => assets/white.wav           | Bin
 data.py                                 |   8 +-
 requirements.txt                        |   6 +
 test1.wav => test/test1.wav             | Bin
 test2.wav => test/test2.wav             | Bin
 test3.wav => test/test3.wav             | Bin
 test4.wav => test/test4.wav             | Bin
 test5.wav => test/test5.wav             | Bin
 test6.wav => test/test6.wav             | Bin
 tonu.py                                 |  10 +-
 train.py                                |   9 +-
 18 files changed, 199 insertions(+), 45 deletions(-)
 create mode 100644 .gitignore
 rename cafe.wav => assets/cafe.wav (100%)
 rename perlin.wav => assets/perlin.wav (100%)
 rename radionoise.wav => assets/radionoise.wav (100%)
 rename silence.wav => assets/silence.wav (100%)
 rename white.wav => assets/white.wav (100%)
 create mode 100644 requirements.txt
 rename test1.wav => test/test1.wav (100%)
 rename test2.wav => test/test2.wav (100%)
 rename test3.wav => test/test3.wav (100%)
 rename test4.wav => test/test4.wav (100%)
 rename test5.wav => test/test5.wav (100%)
 rename test6.wav => test/test6.wav (100%)

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..9dff66e
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,169 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+# For a library or package, you might want to ignore these files since the code is
+# intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+# However, in case of collaboration, if having platform-specific dependencies or dependencies
+# having no cross-platform support, pipenv may install dependencies that don't work, or not
+# install all needed dependencies.
+#Pipfile.lock
+
+# poetry
+# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+# This is especially recommended for binary packages to ensure reproducibility, and is more
+# commonly ignored for libraries.
+# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+
+# pdm
+# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+# in version control.
+# https://pdm.fming.dev/latest/usage/project/#working-with-version-control
+.pdm.toml
+.pdm-python
+.pdm-build/
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+# and can be added to the global gitignore or merged into this file. For a more nuclear
+# option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/
+
+# OS-specific
+.DS_Store
+
+# Project Specific
+*.mp3
+results/
\ No newline at end of file
diff --git a/README.md b/README.md
index 83ea1fd..e7935bb 100644
--- a/README.md
+++ b/README.md
@@ -1,35 +1,9 @@
-# Speech Recognition for Uyghur using deep learning
-Training:
+# Anglash
 
-this model using CTC loss for training.
+An ASR (Automatic Speech Recognition) model for the Uyghur language.
 
-Download [pretrained model](https://github.com/gheyret/uyghur-asr-ctc/releases/download/data/results.7z) and [dataset](https://github.com/gheyret/uyghur-asr-ctc/releases/download/data/thuyg20_data.7z).
+This project is a fork of [uyghur-asr-ctc](https://github.com/gheyret/uyghur-asr-ctc).
 
-unzip results.7z and thuyg20_data.7z to the same folder where python source files located. then run:
-```
-python train.py
-```
-
-Recognition:
-
-for recognition download only pretrained model(results.7z). then run:
-
-```
-python tonu.py test1.wav
-```
-result will be:
-```
- Model loaded: results/UModel_last.pth
- Best CER: 7.21%
- Trained: 473 epochs
-The model has 26,389,282 trainable parameters
-
-======================
-Recognizing file .\test2.wav
-test2.wav -> bu öy eslide xotunining xush tebessumi oghlining omaq külküsi bilen güzel idi
-```
-
-This project using
-
-[**A free Uyghur speech database Released by CSLT@Tsinghua University & Xinjiang University**](http://www.openslr.org/22/)
+Anglash is fine-tuned on the [Common Voice](https://commonvoice.mozilla.org/) dataset, which contains 313 hours of data.
 
+The original project uses [**A free Uyghur speech database Released by CSLT@Tsinghua University & Xinjiang University**](http://www.openslr.org/22/), which contains 22.45 hours of data.
diff --git a/UModel.py b/UModel.py
index bb4f65f..dc98d43 100644
--- a/UModel.py
+++ b/UModel.py
@@ -162,12 +162,12 @@ if __name__ == "__main__":
 
     net = UModel(featurelen).to(device)
     #net.save(0)
-    text = net.predict("test1.wav",device)
+    text = net.predict("./test/test1.wav",device)
     print(text)
-    text = net.predict("test2.wav",device)
+    text = net.predict("./test/test2.wav",device)
     print(text)
 
-    melf = melfuture("test3.wav")
+    melf = melfuture("./test/test3.wav")
     melf.unsqueeze_(0)
 
     conv0 = nn.Conv1d(featurelen,256,11,2, 5, 1)
diff --git a/cafe.wav b/assets/cafe.wav
similarity index 100%
rename from cafe.wav
rename to assets/cafe.wav
diff --git a/perlin.wav b/assets/perlin.wav
similarity index 100%
rename from perlin.wav
rename to assets/perlin.wav
diff --git a/radionoise.wav b/assets/radionoise.wav
similarity index 100%
rename from radionoise.wav
rename to assets/radionoise.wav
diff --git a/silence.wav b/assets/silence.wav
similarity index 100%
rename from silence.wav
rename to assets/silence.wav
diff --git a/white.wav b/assets/white.wav
similarity index 100%
rename from white.wav
rename to assets/white.wav
diff --git a/data.py b/data.py
index 80a653b..d357c84 100644
--- a/data.py
+++ b/data.py
@@ -17,10 +17,10 @@ window_len = fft_len
 window = "hann"
 hop_len = 200
 
-white_noise,_=librosa.load('white.wav',sr=sample_rate, duration=15.0)
-perlin_noise,_=librosa.load('perlin.wav',sr=sample_rate, duration=15.0)
-cafe_noise, _ = librosa.load('cafe.wav',sr=sample_rate, duration=15.0)
-radio_noise, _ = librosa.load('radionoise.wav',sr=sample_rate, duration=15.0)
+white_noise,_=librosa.load('./assets/white.wav',sr=sample_rate, duration=15.0)
+perlin_noise,_=librosa.load('./assets/perlin.wav',sr=sample_rate, duration=15.0)
+cafe_noise, _ = librosa.load('./assets/cafe.wav',sr=sample_rate, duration=15.0)
+radio_noise, _ = librosa.load('./assets/radionoise.wav',sr=sample_rate, duration=15.0)
 
 def addnoise(audio):
     rnd = random.random()
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..e68c8d9
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,6 @@
+librosa==0.9.2
+numpy==1.24.4
+scikit_learn==1.3.2
+torch==2.2.2
+tqdm==4.66.1
+umsc==0.3.0
diff --git a/test1.wav b/test/test1.wav
similarity index 100%
rename from test1.wav
rename to test/test1.wav
diff --git a/test2.wav b/test/test2.wav
similarity index 100%
rename from test2.wav
rename to test/test2.wav
diff --git a/test3.wav b/test/test3.wav
similarity index 100%
rename from test3.wav
rename to test/test3.wav
diff --git a/test4.wav b/test/test4.wav
similarity index 100%
rename from test4.wav
rename to test/test4.wav
diff --git a/test5.wav b/test/test5.wav
similarity index 100%
rename from test5.wav
rename to test/test5.wav
diff --git a/test6.wav b/test/test6.wav
similarity index 100%
rename from test6.wav
rename to test/test6.wav
diff --git a/tonu.py b/tonu.py
index 22712bf..087c71e 100644
--- a/tonu.py
+++ b/tonu.py
@@ -1,7 +1,11 @@
 import sys
-import os
 from data import featurelen
 from UModel import UModel
+from umsc import UgMultiScriptConverter
+source_script = 'UAS'
+target_script = 'ULS'
+converter = UgMultiScriptConverter(source_script, target_script)
+
 
 if __name__ == '__main__':
     model = UModel(featurelen)
@@ -12,6 +16,6 @@ if __name__ == '__main__':
         device = 'cpu'
     model.to(device)
     audiofile = sys.argv[1]
-    print(f"\n======================\nRecognizing file {audiofile}")
     txt = model.predict(audiofile,device)
-    print("%s -> %s" %(os.path.basename(audiofile),txt))
+    script = converter(txt)
+    print(script)
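The tonu.py change above swaps the per-file log line for a script-conversion step: the transcript returned by model.predict() is converted from Uyghur Arabic script (UAS) to Uyghur Latin script (ULS) before printing. Below is a minimal sketch of that step in isolation, assuming the umsc==0.3.0 package pinned in requirements.txt exposes UgMultiScriptConverter as a callable object exactly as the diff uses it; the to_latin helper and the commented usage line are hypothetical illustrations, not part of the patch.

```
from umsc import UgMultiScriptConverter

# 'UAS' = Uyghur Arabic script, 'ULS' = Uyghur Latin script, as in tonu.py above.
converter = UgMultiScriptConverter('UAS', 'ULS')

def to_latin(transcript: str) -> str:
    # The converter object is called directly on a string, mirroring
    # `script = converter(txt)` in the diff above.
    return converter(transcript)

# Hypothetical usage with a transcript produced by UModel.predict():
# print(to_latin(model.predict('./test/test1.wav', device)))
```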
diff --git a/train.py b/train.py
index f033607..801ad65 100644
--- a/train.py
+++ b/train.py
@@ -128,7 +128,7 @@ def train(model, train_loader):
 
 
 if __name__ == "__main__":
-    device = "cuda"
+    device = "mps"
 
     os.makedirs('./results',exist_ok=True)
 
@@ -167,9 +167,10 @@ if __name__ == "__main__":
         torch.cuda.empty_cache()
         model.eval()
         msg = ""
-        for afile in testfile:
-            text = model.predict(afile,device)
-            text = f"{afile}-->{text}\n"
+        for file in testfile:
+            file = "./test/" + file
+            text = model.predict(file,device)
+            text = f"{file}-->{text}\n"
             print(text,end="")
             msg += text
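The train.py hunk above hardcodes device = "mps" (Apple's Metal backend) in place of "cuda", while the unconditional torch.cuda.empty_cache() call remains in the surrounding context lines. The sketch below is a hedged alternative, not part of the patch: it picks whichever backend is available at runtime using standard PyTorch checks (available in the torch==2.2.2 pin from requirements.txt) and only clears the cache of the backend actually in use.

```
import torch

# Suggested sketch (not in the diff): choose the best available backend at
# runtime instead of hardcoding "mps".
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

def empty_cache(device: str) -> None:
    # Guarded replacement for the unconditional torch.cuda.empty_cache() call.
    if device == "cuda":
        torch.cuda.empty_cache()
    elif device == "mps":
        torch.mps.empty_cache()
```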