add: forced alignment example
This commit is contained in:
parent
37d2507f10
commit
aeab34f84b
4
.gitignore
vendored
4
.gitignore
vendored
@ -14,4 +14,6 @@ translate/source*
|
||||
translate/result
|
||||
*.db
|
||||
dataset/raw
|
||||
translate/special-spiders
|
||||
translate/special-spiders
|
||||
ugNMT/BPE/output*
|
||||
ugNMT/BPE/codes
|
BIN
forced-alignment/1.wav
Normal file
BIN
forced-alignment/1.wav
Normal file
Binary file not shown.
318
forced-alignment/test.ipynb
Normal file
318
forced-alignment/test.ipynb
Normal file
File diff suppressed because one or more lines are too long
30
translate/hf-dataset.py
Normal file
30
translate/hf-dataset.py
Normal file
@ -0,0 +1,30 @@
|
||||
import pandas as pd
|
||||
|
||||
# Input files of the parallel corpus. Line i of the concatenated source
# files is paired with line i of the concatenated target files.
source_files = ['./result/source.txt', './result/source-new.txt']
target_files = ['./result/target.txt', './result/target-new.txt']


def read_lines(paths):
    """Return the raw lines of every file in *paths*, concatenated in order.

    Files are decoded as UTF-8. Line terminators are kept; stripping is
    done later, when the DataFrame is built.
    """
    lines = []
    for path in paths:
        with open(path, 'r', encoding='utf-8') as f:
            lines.extend(f.readlines())
    return lines


def build_parallel_frame(source_lines, target_lines):
    """Build the two-column ('zh', 'en') DataFrame from paired lines.

    Leading and trailing whitespace (including the newline) is stripped
    from every line.

    Raises:
        ValueError: if the two inputs have different lengths. pandas would
            reject unequal-length columns anyway; checking first lets the
            error report both counts instead of a generic message.
    """
    if len(source_lines) != len(target_lines):
        raise ValueError(
            "The number of lines in source and target files do not match: "
            f"{len(source_lines)} source vs {len(target_lines)} target."
        )
    return pd.DataFrame({
        'zh': [line.strip() for line in source_lines],
        'en': [line.strip() for line in target_lines],
    })


if __name__ == "__main__":
    # Read the source and target files.
    source_data = read_lines(source_files)
    target_data = read_lines(target_files)

    # Build the DataFrame and write it out as CSV without the index column.
    df = build_parallel_frame(source_data, target_data)
    df.to_csv('./result/data.csv', index=False, encoding='utf-8')
|
16
ugNMT/BPE/filter_non-ug_char.py
Normal file
16
ugNMT/BPE/filter_non-ug_char.py
Normal file
@ -0,0 +1,16 @@
|
||||
import re
|
||||
|
||||
# Load the entire input file into memory as one string.
with open('ug.txt', 'r', encoding='utf-8') as src:
    data = src.read()

# Everything NOT matched by the character class below is deleted. Kept are:
#   - code points U+0600-U+06FF (the range the original note gives for
#     Uyghur letters),
#   - ASCII digits 0-9,
#   - the ASCII punctuation . , ! ? : and the Arabic semicolon,
#   - any whitespace.
# Extend the class if more punctuation should survive the filter.
keep_only = re.compile(r'[^\u0600-\u06FF0-9.,!?؛:\s]')
filtered_data = keep_only.sub('', data)

# Write the filtered text to a new file.
with open('filtered_ug.txt', 'w', encoding='utf-8') as dst:
    dst.write(filtered_data)

print("过滤完成，结果已保存到 filtered_ug.txt")
|
13
ugNMT/BPE/filter_space.py
Normal file
13
ugNMT/BPE/filter_space.py
Normal file
@ -0,0 +1,13 @@
|
||||
import re
|
||||
|
||||
def replace_spaces_in_file(input_file_path, output_file_path):
    """Copy a UTF-8 text file, collapsing every run of spaces into one.

    Only the space character is affected; tabs and newlines are left
    untouched.

    Args:
        input_file_path: Path of the text file to read.
        output_file_path: Path of the file to write (overwritten if present).
    """
    space_run = re.compile(r' +')

    with open(input_file_path, 'r', encoding='utf-8') as src:
        collapsed = space_run.sub(' ', src.read())

    with open(output_file_path, 'w', encoding='utf-8') as dst:
        dst.write(collapsed)
|
||||
|
||||
# Call the function to collapse runs of spaces in the file
|
||||
# Runs at module level: reads ./data/ug_texts1.txt and writes the result to ./data/2.txt.
replace_spaces_in_file('./data/ug_texts1.txt', './data/2.txt')
|
Loading…
Reference in New Issue
Block a user