feature: implement training

alikia2x (寒寒) 2024-10-20 05:01:15 +08:00
parent 3867f6172e
commit 496efa87bc
Signed by: alikia2x
GPG Key ID: 56209E0CCD8420C6
6 changed files with 81 additions and 12085 deletions

.gitignore (vendored, 3 changed lines)

@@ -167,3 +167,6 @@ cython_debug/
 # Project Specific
 *.mp3
 results/
+train/
+data/
+tmp/

commonvoice.py (new file, 21 changed lines)

@@ -0,0 +1,21 @@
+import pandas as pd
+from umsc import UgMultiScriptConverter
+from tqdm import tqdm
+
+source_script = 'UAS'
+target_script = 'ULS'
+converter = UgMultiScriptConverter(source_script, target_script)
+
+df = pd.read_csv('./data/commonvoice/validated.tsv', sep='\t')
+new_df = pd.DataFrame(columns=['path', 'script'])
+qbar = tqdm(total=len(df))
+for index, row in df.iterrows():
+    qbar.update(1)
+    # Skip low-quality samples (more down-votes than up-votes)
+    if row['up_votes'] < row['down_votes']:
+        continue
+    new_df.loc[index] = [row['path'], converter(row['sentence'])]
+
+new_df.to_csv('./data/training/commonvoice_train.csv', index=False)
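Not part of the commit: a minimal sanity check of the index this script writes. The output path and the `path`/`script` column names come directly from the code above.

import pandas as pd

# Load the generated training index and confirm its shape.
df = pd.read_csv('./data/training/commonvoice_train.csv')
print(df.columns.tolist())  # expected: ['path', 'script']
print(len(df), 'usable clips')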

data.py (26 changed lines)

@@ -1,3 +1,4 @@
+import pandas as pd
 import torch
 from torch.utils.data import Dataset
 from torch.utils.data import DataLoader
@@ -38,7 +39,7 @@ def addnoise(audio):
 def randomstretch(audio):
     factor = random.uniform(0.8, 1.2)
-    audio = librosa.core.resample(audio,sample_rate,sample_rate*factor)
+    audio = librosa.core.resample(audio, orig_sr=sample_rate, target_sr=sample_rate*factor)
     return audio

 #def spec_augment(feat, T=70, F=15, time_mask_num=1, freq_mask_num=1):
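The `resample` change tracks newer librosa releases, where the sampling rates must be passed as the keyword arguments `orig_sr` and `target_sr` (they became keyword-only around librosa 0.10). A minimal sketch of the stretch effect, assuming librosa >= 0.10 and the top-level `librosa.resample` entry point:

import librosa
import numpy as np

# One second of noise; resampling to 1.2x the rate lengthens the clip
# by the same factor.
sample_rate = 22050
audio = np.random.randn(sample_rate).astype(np.float32)
stretched = librosa.resample(audio, orig_sr=sample_rate, target_sr=int(sample_rate * 1.2))
print(len(audio), len(stretched))  # 22050 -> ~26460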
@@ -104,18 +105,19 @@ def melfuture(wav_path, augument = False):
 class SpeechDataset(Dataset):
     def __init__(self, index_path, augumentation = False):
         self.Raw = False
-        with open(index_path,encoding='utf_8_sig') as f:
-            lines = f.readlines()
-        self.idx = []
-        for x in lines:
-            item = x.strip().split("\t")
-            if os.path.exists(item[0]):
-                line = []
-                line.append(item[0])
-                char_indx = uyghur_latin.encode(item[1])
-                line.append(char_indx)
-                self.idx.append(line)
+        self.idx = []
+
+        df = pd.read_csv(index_path)
+        for _, row in df.iterrows():
+            path = row['path']
+            sentence = row['script']
+            if not os.path.exists(path):
+                continue
+            line = []
+            line.append(path)
+            char_indx = uyghur_latin.encode(sentence)
+            line.append(char_indx)
+            self.idx.append(line)
+
         self.augument = augumentation
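The rewrite moves the dataset index from a tab-separated text file to a CSV with `path` and `script` columns, matching what commonvoice.py produces. A hypothetical two-row index for illustration (the file names and sentences here are invented, not from the commit):

import pandas as pd

# Rows whose audio file does not exist on disk are skipped by the
# rewritten SpeechDataset, so a stale index entry is tolerated.
pd.DataFrame({
    'path': ['data/clips/sample_0001.mp3', 'data/clips/missing.mp3'],
    'script': ['salam dunya', 'yaxshimusiz'],
}).to_csv('./data/training/example_train.csv', index=False)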

File diff suppressed because it is too large

File diff suppressed because it is too large


@@ -1,5 +1,6 @@
 import numpy as np
 import os
+import pandas as pd
 import torch
 import torch.nn.functional as F
@@ -128,25 +129,59 @@ def train(model, train_loader):
 if __name__ == "__main__":
-    device = "mps"
+    device = "cuda"
+    training_data_path = './data/training/'

     os.makedirs('./results',exist_ok=True)
+    os.makedirs('./tmp',exist_ok=True)

-    train_file = 'thuyg20_train.csv'
-    test_file = 'thuyg20_test.csv'
+    # List every file in the training-data folder
+    files = os.listdir(training_data_path)
+    # Keep only the CSV/TSV index files
+    files = [f for f in files if f.endswith('.csv') or f.endswith('.tsv')]
+    # Split the indexes into `_train` and `_test` groups
+    train_files = [f for f in files if '_train' in f]
+    test_files = [f for f in files if '_test' in f]
+
+    # Read and concatenate all train indexes (each carries `path` and `script` columns)
+    train_df = pd.DataFrame()
+    for f in train_files:
+        if f.endswith('.tsv'):
+            df = pd.read_csv(training_data_path + f, sep='\t')
+        else:
+            df = pd.read_csv(training_data_path + f)
+        train_df = pd.concat([train_df, df])
+
+    # Read and concatenate all test indexes
+    test_df = pd.DataFrame()
+    for f in test_files:
+        if f.endswith('.tsv'):
+            df = pd.read_csv(training_data_path + f, sep='\t')
+        else:
+            df = pd.read_csv(training_data_path + f)
+        test_df = pd.concat([test_df, df])
+
+    # Save the combined indexes and point the loaders at them
+    train_df.to_csv('./tmp/train.csv', index=False)
+    test_df.to_csv('./tmp/test.csv', index=False)
+
+    train_file = './tmp/train.csv'
+    test_file = './tmp/test.csv'

     train_set = SpeechDataset(train_file, augumentation=True)
-    train_loader = SpeechDataLoader(train_set,num_workers=4, pin_memory = True, shuffle=True, batch_size=20)
+    train_loader = SpeechDataLoader(train_set, num_workers=8, pin_memory = True, shuffle=True, batch_size=64)
     validation_set = SpeechDataset(test_file, augumentation=False)
-    validation_loader = SpeechDataLoader(validation_set,num_workers=4, pin_memory = True, shuffle=True, batch_size=20)
+    validation_loader = SpeechDataLoader(validation_set, num_workers=8, pin_memory = True, shuffle=True, batch_size=64)

     print("="*50)
     msg = f" Training Set: {train_file}, {len(train_set)} samples" + "\n"
     msg += f" Validation Set: {test_file}, {len(validation_set)} samples" + "\n"
     msg += f" Vocab Size : {uyghur_latin.vocab_size}"
     print(msg)

     model = UModel(num_features_input = featurelen)
     print("="*50)