feature: implement training

alikia2x (寒寒) 2024-10-20 05:01:15 +08:00
parent 3867f6172e
commit 496efa87bc
Signed by: alikia2x
GPG Key ID: 56209E0CCD8420C6
6 changed files with 81 additions and 12085 deletions

.gitignore (5 lines changed)

@@ -166,4 +166,7 @@ cython_debug/
 # Project Specific
 *.mp3
 results/
+train/
+data/
+tmp/

commonvoice.py (new file, 21 lines)

@@ -0,0 +1,21 @@
+import pandas as pd
+from umsc import UgMultiScriptConverter
+from tqdm import tqdm
+
+# Convert the Common Voice index from Uyghur Arabic script (UAS)
+# to Uyghur Latin script (ULS)
+source_script = 'UAS'
+target_script = 'ULS'
+converter = UgMultiScriptConverter(source_script, target_script)
+
+df = pd.read_csv('./data/commonvoice/validated.tsv', sep='\t')
+new_df = pd.DataFrame(columns=['path', 'script'])
+qbar = tqdm(total=len(df))
+for index, row in df.iterrows():
+    qbar.update(1)
+    # Skip samples with more down-votes than up-votes
+    if row['up_votes'] < row['down_votes']:
+        continue
+    new_df.loc[index] = [row['path'], converter(row['sentence'])]
+new_df.to_csv('./data/training/commonvoice_train.csv', index=False)
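
A minimal usage sketch of the umsc converter used above, to show what the conversion step does; the input sentence here is illustrative, not taken from the dataset:

    from umsc import UgMultiScriptConverter

    # Arabic-script Uyghur in, Latin-script Uyghur out
    converter = UgMultiScriptConverter('UAS', 'ULS')
    print(converter('بۇ بىر سىناق جۈملە'))  # roughly: "this is a test sentence"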

data.py (28 lines changed)

@@ -1,3 +1,4 @@
+import pandas as pd
 import torch
 from torch.utils.data import Dataset
 from torch.utils.data import DataLoader
@@ -38,7 +39,7 @@ def addnoise(audio):
 def randomstretch(audio):
     factor = random.uniform(0.8, 1.2)
-    audio = librosa.core.resample(audio,sample_rate,sample_rate*factor)
+    audio = librosa.core.resample(audio, orig_sr=sample_rate, target_sr=sample_rate*factor)
     return audio

 #def spec_augment(feat, T=70, F=15, time_mask_num=1, freq_mask_num=1):
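
Passing the sample rates as keyword arguments matches newer librosa releases, where resample's rate parameters became keyword-only; a minimal sketch of the call, using a dummy signal:

    import librosa
    import numpy as np

    y = np.zeros(22050, dtype=np.float32)  # one second of silence at 22.05 kHz
    stretched = librosa.resample(y, orig_sr=22050, target_sr=int(22050 * 1.2))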
@@ -104,18 +105,19 @@ def melfuture(wav_path, augument = False):
 class SpeechDataset(Dataset):
     def __init__(self, index_path, augumentation = False):
         self.Raw = False
-        with open(index_path,encoding='utf_8_sig') as f:
-            lines = f.readlines()
-        self.idx = []
-        for x in lines:
-            item = x.strip().split("\t")
-            if os.path.exists(item[0]):
-                line = []
-                line.append(item[0])
-                char_indx = uyghur_latin.encode(item[1])
-                line.append(char_indx)
-                self.idx.append(line)
+        self.idx = []
+        # The index is now a CSV with 'path' and 'script' columns
+        df = pd.read_csv(index_path)
+        for _, row in df.iterrows():
+            path = row['path']
+            sentence = row['script']
+            if not os.path.exists(path):
+                continue
+            line = []
+            line.append(path)
+            char_indx = uyghur_latin.encode(sentence)
+            line.append(char_indx)
+            self.idx.append(line)
         self.augument = augumentation
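
The index files SpeechDataset now consumes are plain CSVs with 'path' and 'script' columns, the same schema commonvoice.py writes above. A minimal sketch of building a compatible index, with hypothetical file names and sentences:

    import pandas as pd

    # Two-row index in the schema SpeechDataset expects
    pd.DataFrame({
        'path': ['./data/clips/a.mp3', './data/clips/b.mp3'],
        'script': ['salam dunya', 'bu bir sinaq'],
    }).to_csv('./data/training/example_train.csv', index=False)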

File diff suppressed because it is too large.

File diff suppressed because it is too large.

(training entry script; file name not shown in this view)

@@ -1,5 +1,6 @@
 import numpy as np
 import os
+import pandas as pd
 import torch
 import torch.nn.functional as F
@@ -128,25 +129,59 @@ def train(model, train_loader):
 if __name__ == "__main__":
-    device = "mps"
+    device = "cuda"
+    training_data_path = './data/training/'
     os.makedirs('./results', exist_ok=True)
+    os.makedirs('./tmp', exist_ok=True)
+
+    # List every file in the training data folder
+    files = os.listdir(training_data_path)
+    # Keep only the CSV/TSV index files
+    files = [f for f in files if f.endswith('.csv') or f.endswith('.tsv')]
+    # Separate the `_train` and `_test` indexes
+    train_files = [f for f in files if '_train' in f]
+    test_files = [f for f in files if '_test' in f]
-    train_file = 'thuyg20_train.csv'
-    test_file = 'thuyg20_test.csv'
+
+    # Read all train files and combine their `path` and `script` columns
+    train_df = pd.DataFrame()
+    for f in train_files:
+        if f.endswith('.tsv'):
+            df = pd.read_csv(training_data_path + f, sep='\t')
+        else:
+            df = pd.read_csv(training_data_path + f)
+        train_df = pd.concat([train_df, df])
+
+    # Read all test files and combine them the same way
+    test_df = pd.DataFrame()
+    for f in test_files:
+        if f.endswith('.tsv'):
+            df = pd.read_csv(training_data_path + f, sep='\t')
+        else:
+            df = pd.read_csv(training_data_path + f)
+        test_df = pd.concat([test_df, df])
+
+    # Save the combined indexes where the data loaders can find them
+    train_df.to_csv('./tmp/train.csv', index=False)
+    test_df.to_csv('./tmp/test.csv', index=False)
+    train_file = './tmp/train.csv'
+    test_file = './tmp/test.csv'

     train_set = SpeechDataset(train_file, augumentation=True)
-    train_loader = SpeechDataLoader(train_set,num_workers=4, pin_memory = True, shuffle=True, batch_size=20)
+    train_loader = SpeechDataLoader(train_set, num_workers=8, pin_memory=True, shuffle=True, batch_size=64)

     validation_set = SpeechDataset(test_file, augumentation=False)
-    validation_loader = SpeechDataLoader(validation_set,num_workers=4, pin_memory = True, shuffle=True, batch_size=20)
+    validation_loader = SpeechDataLoader(validation_set, num_workers=8, pin_memory=True, shuffle=True, batch_size=64)

     print("="*50)
     msg = f" Training Set: {train_file}, {len(train_set)} samples" + "\n"
     msg += f" Validation Set: {test_file}, {len(validation_set)} samples" + "\n"
     msg += f" Vocab Size : {uyghur_latin.vocab_size}"
     print(msg)

     model = UModel(num_features_input = featurelen)
     print("="*50)