22 lines
610 B
Python
22 lines
610 B
Python
import pandas as pd
|
|
from umsc import UgMultiScriptConverter
|
|
from tqdm import tqdm
|
|
|
|
# Build the training CSV from the Common Voice "validated" split:
# read the TSV, drop clips whose down-votes exceed their up-votes, convert each
# remaining sentence with UgMultiScriptConverter, and write (path, script) rows.

# Script codes passed to UgMultiScriptConverter.
# NOTE(review): presumably Uyghur Arabic script -> Uyghur Latin script;
# confirm against the umsc documentation.
source_script = 'UAS'
target_script = 'ULS'
converter = UgMultiScriptConverter(source_script, target_script)

df = pd.read_csv('./data/commonvoice/validated.tsv', sep='\t')

# Skip samples that are too bad: more down-votes than up-votes.
# The mask is written as "not (up < down)" rather than "up >= down" so that
# rows with missing vote counts are kept, exactly as a per-row `<` test would.
keep_mask = ~(df['up_votes'] < df['down_votes'])
kept = df.loc[keep_mask, ['path', 'sentence']]

# Convert in a single pass; wrapping the iterable lets tqdm close the bar
# itself once the loop is exhausted.
scripts = [converter(sentence)
           for sentence in tqdm(kept['sentence'], total=len(kept))]

# Build the output frame once instead of enlarging it row by row with
# `.loc[index] = ...`, which reallocates the frame on every insertion.
new_df = pd.DataFrame(
    {'path': kept['path'], 'script': scripts},
    index=kept.index,
    columns=['path', 'script'],
)

new_df.to_csv('./data/training/commonvoice_train.csv', index=False)
|