sparkastML/translate/postprocess.py
2024-09-15 23:54:37 +08:00

42 lines
1.8 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import os
import json
def read_converted_files(filename):
    """Load the names of the files that have already been processed.

    Args:
        filename: Path of the tracking file, which holds one processed file
            name per line.

    Returns:
        A set containing one entry per line of the tracking file, or an
        empty set when the tracking file does not exist yet.

    Raises:
        OSError: If the tracking file exists but cannot be opened (for
            example ``PermissionError``). Only a missing file is treated
            as "nothing processed yet".
    """
    try:
        with open(filename, 'r', encoding='utf-8') as file:
            # Iterate line by line instead of using str.splitlines(): entries
            # are written terminated by '\n' only, while splitlines() would
            # also break a name apart on characters such as U+2028 or '\x0c'.
            return {line.rstrip('\n') for line in file}
    except FileNotFoundError:
        # First run: no tracking file has been written so far.
        return set()
def write_converted_file(filename, file_name):
    """Append one processed file name to the tracking file.

    Args:
        filename: Path of the tracking file; it is opened in append mode.
        file_name: Name of the file to record, written on its own line.
    """
    with open(filename, mode='a', encoding='utf-8') as tracker:
        entry = file_name + '\n'
        tracker.write(entry)
def _flatten_line(text):
    """Return *text* collapsed onto one line; ``None`` becomes an empty string."""
    if text is None:
        # Covers both a missing key and an explicit JSON null.
        return ''
    # Replace '\r\n' first so a Windows line break yields a single space, then
    # any lone '\r' as well: readers using universal newlines treat '\r' as a
    # line break, which would misalign the source and target files.
    return text.replace('\r\n', ' ').replace('\r', ' ').replace('\n', ' ')


def process_json_files(directory, converted_filename,
                       source_path='./result/source.txt',
                       target_path='./result/target.txt'):
    """Append the segments of every unprocessed JSON file to the corpus files.

    Each ``.json`` file in *directory* whose name is not yet listed in the
    tracking file is loaded. For every entry of its ``segments`` list, the
    ``chinese`` text is appended as one line to *source_path* and the
    ``english`` text as one line to *target_path*, so both files stay
    line-aligned. The file name is then recorded in the tracking file.

    Args:
        directory: Directory that is scanned for ``.json`` files.
        converted_filename: Path of the tracking file listing the names of
            files that were already processed.
        source_path: File that receives the ``chinese`` lines (appended).
        target_path: File that receives the ``english`` lines (appended).

    Raises:
        OSError: If *directory* cannot be listed or a file cannot be opened.
        json.JSONDecodeError: If a ``.json`` file does not contain valid JSON.
    """
    converted_files = read_converted_files(converted_filename)

    # Append mode creates missing files but not missing parent directories.
    for path in (converted_filename, source_path, target_path):
        parent = os.path.dirname(path)
        if parent:
            os.makedirs(parent, exist_ok=True)

    # os.listdir() returns entries in arbitrary order; sorting keeps the
    # order of the generated corpus reproducible between runs.
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith('.json') or filename in converted_files:
            continue

        with open(os.path.join(directory, filename), 'r', encoding='utf-8') as json_file:
            data = json.load(json_file)

        # Build every row before touching the output files, so a malformed
        # segment cannot leave a half-written file that is appended again on
        # the next run.
        pairs = [
            (_flatten_line(segment.get('chinese')), _flatten_line(segment.get('english')))
            for segment in data.get('segments') or []
        ]

        with open(source_path, 'a', encoding='utf-8') as source_file, \
                open(target_path, 'a', encoding='utf-8') as target_file:
            source_file.writelines(chinese + '\n' for chinese, _ in pairs)
            target_file.writelines(english + '\n' for _, english in pairs)

        # Record the file only after its segments were written successfully.
        write_converted_file(converted_filename, filename)
if __name__ == "__main__":
    # Replace this with the path of the directory holding your JSON files.
    json_directory = './output'
    # Tracking file that lists the JSON files already processed.
    converted_filename = './result/converted.txt'
    process_json_files(directory=json_directory, converted_filename=converted_filename)