add: forced alignment example

This commit is contained in:
alikia2x (寒寒) 2024-11-02 15:46:12 +08:00
parent 37d2507f10
commit aeab34f84b
Signed by: alikia2x
GPG Key ID: 56209E0CCD8420C6
6 changed files with 380 additions and 1 deletions

4
.gitignore vendored
View File

@ -14,4 +14,6 @@ translate/source*
translate/result
*.db
dataset/raw
translate/special-spiders
translate/special-spiders
ugNMT/BPE/output*
ugNMT/BPE/codes

BIN
forced-alignment/1.wav Normal file

Binary file not shown.

318
forced-alignment/test.ipynb Normal file

File diff suppressed because one or more lines are too long

30
translate/hf-dataset.py Normal file
View File

@ -0,0 +1,30 @@
import pandas as pd
# Parallel-corpus inputs: the i-th line collected from the source files is
# paired with the i-th line collected from the target files.
source_files = ['./result/source.txt', './result/source-new.txt']
target_files = ['./result/target.txt', './result/target-new.txt']


def _read_lines(paths):
    """Return the whitespace-stripped lines of every file in *paths*, in order."""
    lines = []
    for path in paths:
        with open(path, 'r', encoding='utf-8') as f:
            # Iterating the handle yields the same lines as readlines().
            lines.extend(line.strip() for line in f)
    return lines


def build_dataframe(source_paths, target_paths):
    """Build the zh/en DataFrame from two lists of UTF-8 text files.

    Args:
        source_paths: Files whose lines populate the ``zh`` column.
        target_paths: Files whose lines populate the ``en`` column.

    Returns:
        A ``pandas.DataFrame`` with columns ``zh`` and ``en``, one row per
        line pair.

    Raises:
        ValueError: If the total number of source and target lines differ.
            Previously only a warning was printed and pandas then failed
            with a generic "arrays must be of the same length" ValueError;
            raising here reports both counts so the bad file can be found.
    """
    source_data = _read_lines(source_paths)
    target_data = _read_lines(target_paths)

    if len(source_data) != len(target_data):
        raise ValueError(
            "The number of lines in source and target files do not match: "
            f"{len(source_data)} source vs {len(target_data)} target."
        )

    return pd.DataFrame({'zh': source_data, 'en': target_data})


if __name__ == '__main__':
    df = build_dataframe(source_files, target_files)
    df.to_csv('./result/data.csv', index=False, encoding='utf-8')

View File

@ -0,0 +1,16 @@
import re
# Load the entire input corpus into memory.
with open('ug.txt', 'r', encoding='utf-8') as src:
    data = src.read()

# Matches every character that should be removed, i.e. anything that is NOT:
#   - in U+0600-U+06FF (the range this script treats as Uyghur letters),
#   - an ASCII digit 0-9,
#   - one of the punctuation marks . , ! ? ؛ :
#   - whitespace.
# Adjust the character class if more symbols need to survive.
unwanted_chars = re.compile(r'[^\u0600-\u06FF0-9.,!?؛:\s]')
filtered_data = unwanted_chars.sub('', data)

# Persist the filtered text to a new file.
with open('filtered_ug.txt', 'w', encoding='utf-8') as dst:
    dst.write(filtered_data)

print("过滤完成,结果已保存到 filtered_ug.txt")

13
ugNMT/BPE/filter_space.py Normal file
View File

@ -0,0 +1,13 @@
import re
def replace_spaces_in_file(input_file_path, output_file_path):
    """Copy a UTF-8 text file, collapsing each run of spaces into one space.

    Args:
        input_file_path: Path of the text file to read.
        output_file_path: Path the processed text is written to (overwritten
            if it already exists).
    """
    with open(input_file_path, 'r', encoding='utf-8') as src:
        original = src.read()

    # One or more consecutive space characters become a single space; other
    # whitespace (tabs, newlines) is left untouched.
    collapsed = re.compile(r' +').sub(' ', original)

    with open(output_file_path, 'w', encoding='utf-8') as dst:
        dst.write(collapsed)
# Call the function to replace the spaces in the file: reads
# ./data/ug_texts1.txt and writes the result to ./data/2.txt.
replace_spaces_in_file('./data/ug_texts1.txt', './data/2.txt')