add: forced alignment example

2024-11-02 15:46:12 +08:00 · 2024-11-02 15:46:12 +08:00 · aeab34f84b
commit aeab34f84b
parent 37d2507f10
6 changed files with 380 additions and 1 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -14,4 +14,6 @@ translate/source*
 translate/result
 *.db
 dataset/raw
-translate/special-spiders
+translate/special-spiders
+ugNMT/BPE/output*
+ugNMT/BPE/codes
--- a/forced-alignment/1.wav
+++ b/forced-alignment/1.wav
--- a/forced-alignment/test.ipynb
+++ b/forced-alignment/test.ipynb
--- a/translate/hf-dataset.py
+++ b/translate/hf-dataset.py
@@ -0,0 +1,30 @@
+import pandas as pd
+
+# Paths of the source and target text files; each list is read and concatenated in order
+source_files = ['./result/source.txt', './result/source-new.txt']
+target_files = ['./result/target.txt', './result/target-new.txt']
+
+# Read every line of the source and target files
+source_data = []
+target_data = []
+
+for file in source_files:
+    with open(file, 'r', encoding='utf-8') as f:
+        source_data.extend(f.readlines())
+
+for file in target_files:
+    with open(file, 'r', encoding='utf-8') as f:
+        target_data.extend(f.readlines())
+
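+# Warn when source and target line counts differ; this only prints, and pd.DataFrame below raises ValueError on unequal lengths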
+if len(source_data) != len(target_data):
+    print("Warning: The number of lines in source and target files do not match.")
+
+# Build the DataFrame: whitespace-stripped source lines in column 'zh', target lines in column 'en'
+df = pd.DataFrame({
+    'zh': [line.strip() for line in source_data],  # 去掉每行的换行符
+    'en': [line.strip() for line in target_data]   # 去掉每行的换行符
+})
+
+
+df.to_csv('./result/data.csv', index=False, encoding='utf-8')
--- a/ugNMT/BPE/filter_non-ug_char.py
+++ b/ugNMT/BPE/filter_non-ug_char.py
@@ -0,0 +1,16 @@
+import re
+
+# Read the whole input file into memory
+with open('ug.txt', 'r', encoding='utf-8') as file:
+    data = file.read()
+
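+# Regex filter: delete every character that is NOT one of the following (adjust the kept set as needed):
+# - a code point in the Arabic Unicode block U+0600-U+06FF, which contains the Uyghur Arabic-script letters
+# - an ASCII digit 0-9, one of the punctuation marks . , ! ? ؛ : or a whitespace character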
+filtered_data = re.sub(r'[^\u0600-\u06FF0-9.,!?؛:\s]', '', data)
+
+# Write the filtered text to a new file
+with open('filtered_ug.txt', 'w', encoding='utf-8') as file:
+    file.write(filtered_data)
+
+print("过滤完成，结果已保存到 filtered_ug.txt")
--- a/ugNMT/BPE/filter_space.py
+++ b/ugNMT/BPE/filter_space.py
@@ -0,0 +1,13 @@
+import re
+
+def replace_spaces_in_file(input_file_path, output_file_path):
+    with open(input_file_path, 'r', encoding='utf-8') as file:
+        text = file.read()
+
+    new_text = re.sub(r' +', ' ', text)
+
+    with open(output_file_path, 'w', encoding='utf-8') as file:
+        file.write(new_text)
+
+# Run the function: collapse each run of consecutive spaces in the input file into a single space
+replace_spaces_in_file('./data/ug_texts1.txt', './data/2.txt')