feature: implement training
parent 3867f6172e
commit 496efa87bc
.gitignore (vendored) | 5
@@ -166,4 +166,7 @@ cython_debug/
 
 # Project Specific
 *.mp3
-results/
+results/
+train/
+data/
+tmp/
commonvoice.py (new file) | 21
@@ -0,0 +1,21 @@
+import pandas as pd
+from umsc import UgMultiScriptConverter
+from tqdm import tqdm
+
+source_script = 'UAS'
+target_script = 'ULS'
+converter = UgMultiScriptConverter(source_script, target_script)
+
+df = pd.read_csv('./data/commonvoice/validated.tsv', sep='\t')
+
+new_df = pd.DataFrame(columns=['path', 'script'])
+
+qbar = tqdm(total=len(df))
+for index, row in df.iterrows():
+    qbar.update(1)
+    # Skip low-quality samples
+    if row['up_votes'] < row['down_votes']:
+        continue
+    new_df.loc[index] = [row['path'], converter(row['sentence'])]
+
+new_df.to_csv('./data/training/commonvoice_train.csv', index=False)
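
`UgMultiScriptConverter` instances are callable on a string, which is what the loop above relies on. A minimal sketch of that conversion step in isolation (the sample sentence is a placeholder, not a line from the dataset):

    from umsc import UgMultiScriptConverter

    # Convert Uyghur Arabic Script (UAS) to Uyghur Latin Script (ULS),
    # mirroring what commonvoice.py does for each Common Voice transcript.
    converter = UgMultiScriptConverter('UAS', 'ULS')
    latin = converter('ياخشىمۇسىز')  # placeholder UAS input
    print(latin)                      # prints the ULS transliteration
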
data.py | 28
@@ -1,3 +1,4 @@
+import pandas as pd
 import torch
 from torch.utils.data import Dataset
 from torch.utils.data import DataLoader
@@ -38,7 +39,7 @@ def addnoise(audio):
 
 def randomstretch(audio):
     factor = random.uniform(0.8, 1.2)
-    audio = librosa.core.resample(audio,sample_rate,sample_rate*factor)
+    audio = librosa.core.resample(audio, orig_sr=sample_rate, target_sr=sample_rate*factor)
     return audio
 
 #def spec_augment(feat, T=70, F=15, time_mask_num=1, freq_mask_num=1):
@@ -104,18 +105,19 @@ def melfuture(wav_path, augument = False):
 class SpeechDataset(Dataset):
     def __init__(self, index_path, augumentation = False):
         self.Raw = False
-        with open(index_path,encoding='utf_8_sig') as f:
-            lines = f.readlines()
-
-        self.idx = []
-        for x in lines:
-            item = x.strip().split("\t")
-            if os.path.exists(item[0]):
-                line = []
-                line.append(item[0])
-                char_indx = uyghur_latin.encode(item[1])
-                line.append(char_indx)
-                self.idx.append(line)
+        self.idx = []
+
+        df = pd.read_csv(index_path)
+        for _, row in df.iterrows():
+            path = row['path']
+            sentence = row['script']
+            if not os.path.exists(path):
+                continue
+            line = []
+            line.append(path)
+            char_indx = uyghur_latin.encode(sentence)
+            line.append(char_indx)
+            self.idx.append(line)
 
         self.augument = augumentation
 
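
The rewritten `SpeechDataset.__init__` now expects each index file to be a comma-separated table with `path` and `script` columns instead of the old tab-separated path/sentence pairs. An illustrative index file, with placeholder rows rather than real corpus entries:

    path,script
    ./data/thuyg20/wav/clip_0001.wav,placeholder latin transcript
    ./data/thuyg20/wav/clip_0002.wav,another placeholder transcript

The `randomstretch` change keeps the same behaviour; recent librosa releases make `orig_sr` and `target_sr` keyword-only parameters of `resample`, which is presumably what prompted the updated call.
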
thuyg20_test.csv | 2142
(file diff suppressed because it is too large)

thuyg20_train.csv | 9923
(file diff suppressed because it is too large)
train.py | 47
@@ -1,5 +1,6 @@
 import numpy as np
 import os
+import pandas as pd
 import torch
 import torch.nn.functional as F
 
@@ -128,25 +129,59 @@ def train(model, train_loader):
 
 
 if __name__ == "__main__":
-    device = "mps"
+    device = "cuda"
 
+    training_data_path = './data/training/'
+
     os.makedirs('./results',exist_ok=True)
     os.makedirs('./tmp',exist_ok=True)
 
+    # Read the training data folder and list all the files in it
+    files = os.listdir(training_data_path)
+    # Filter out the files that are not csv/tsv files
+    files = [f for f in files if f.endswith('.csv') or f.endswith('.tsv')]
+    # Split them into `_train` and `_test` files
+    train_files = [f for f in files if '_train' in f]
+    test_files = [f for f in files if '_test' in f]
+
-    train_file = 'thuyg20_train.csv'
-    test_file = 'thuyg20_test.csv'
+    # Read all train files, combine their `path` and `script` columns
+    train_df = pd.DataFrame()
+    for f in train_files:
+        if f.endswith('.tsv'):
+            df = pd.read_csv(training_data_path + f, sep='\t')
+        else:
+            df = pd.read_csv(training_data_path + f)
+
+        train_df = pd.concat([train_df, df])
+    # Read all test files, combine their `path` and `script` columns
+    test_df = pd.DataFrame()
+    for f in test_files:
+        if f.endswith('.tsv'):
+            df = pd.read_csv(training_data_path + f, sep='\t')
+        else:
+            df = pd.read_csv(training_data_path + f)
+        test_df = pd.concat([test_df, df])
+
+    # Save the combined dataframes to csv
+    train_df.to_csv('./tmp/train.csv', index=False)
+    test_df.to_csv('./tmp/test.csv', index=False)
+
+    train_file = './tmp/train.csv'
+    test_file = './tmp/test.csv'
 
     train_set = SpeechDataset(train_file, augumentation=True)
-    train_loader = SpeechDataLoader(train_set,num_workers=4, pin_memory = True, shuffle=True, batch_size=20)
+    train_loader = SpeechDataLoader(train_set, num_workers=8, pin_memory = True, shuffle=True, batch_size=64)
 
     validation_set = SpeechDataset(test_file, augumentation=False)
-    validation_loader = SpeechDataLoader(validation_set,num_workers=4, pin_memory = True, shuffle=True, batch_size=20)
+    validation_loader = SpeechDataLoader(validation_set, num_workers=8, pin_memory = True, shuffle=True, batch_size=64)
 
     print("="*50)
 
     msg = f"  Training Set  : {train_file}, {len(train_set)} samples" + "\n"
     msg += f"  Validation Set: {test_file}, {len(validation_set)} samples" + "\n"
     msg += f"  Vocab Size    : {uyghur_latin.vocab_size}"
 
     print(msg)
 
     model = UModel(num_features_input = featurelen)
 
     print("="*50)
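
With this change, training data is assembled from whatever `_train`/`_test` tables are present in `./data/training/`, so a new corpus can be added by dropping in another CSV/TSV with `path` and `script` columns. An illustrative layout under that assumption (file names are placeholders apart from the one commonvoice.py writes):

    data/training/
        commonvoice_train.csv   # written by commonvoice.py above
        thuyg20_train.csv
        thuyg20_test.csv

Note that both combined tables are written with `index=False`, so the pandas row index produced by `pd.concat` never leaks into `./tmp/train.csv` or `./tmp/test.csv`.
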