sparkastML/translate/postprocess.py
alikia2x a9a7430a58
update: fetching with cooldown
fix: post-process unmatch
improve: LLM-translate now requests with temperature
2024-09-16 04:08:33 +08:00

48 lines
2.2 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import os
import json
from pybloom_live import BloomFilter
def read_converted_files(filename):
    """Return the set of file names already recorded in the tracking file.

    The tracking file holds one processed file name per line.

    Args:
        filename: Path to the tracking file (e.g. ``converted.txt``).

    Returns:
        A set with one entry per line of the file, or an empty set when the
        file does not exist yet.
    """
    # Open directly instead of checking os.path.exists() first: that avoids
    # the window in which the file could vanish between the check and the
    # open. NotADirectoryError is included because os.path.exists() also
    # reported such paths as "missing".
    try:
        with open(filename, 'r', encoding='utf-8') as file:
            return set(file.read().splitlines())
    except (FileNotFoundError, NotADirectoryError):
        return set()
def write_converted_file(filename, file_name):
    """Append one processed file name to the tracking file.

    Args:
        filename: Path to the tracking file (e.g. ``converted.txt``).
        file_name: Name of the file that has just been processed; it is
            written on a line of its own.
    """
    # Append mode keeps the entries recorded by earlier calls.
    with open(filename, mode='a', encoding='utf-8') as tracking_log:
        entry = file_name + '\n'
        tracking_log.write(entry)
def _clean_segment_text(value):
    """Flatten *value* to a single line; return '' if it is not a string."""
    if not isinstance(value, str):
        # Covers a missing key (None) as well as a JSON null or number.
        return ''
    # Each segment must occupy exactly one line so that line N of source.txt
    # stays paired with line N of target.txt. '\r\n' is handled first so a
    # Windows line break becomes one space rather than two.
    return value.replace('\r\n', ' ').replace('\n', ' ').replace('\r', ' ')


def process_json_files(directory, converted_filename):
    """Append deduplicated sentence pairs from new JSON files to the corpus.

    Every ``*.json`` file in *directory* that is not yet listed in the
    tracking file is loaded. For each entry of its ``segments`` list, the
    ``chinese`` text is appended to ``./result/source.txt`` and the
    ``english`` text to ``./result/target.txt``, one line each. The file
    name is then recorded in the tracking file.

    A pair is skipped when either side is missing or blank, or when either
    side has already been seen during this run. Seen texts are tracked with
    Bloom filters, so a small fraction of genuinely new pairs (the configured
    error rate is 0.001) may be dropped as false positives. The filters are
    rebuilt on every call, so deduplication does not span separate runs.

    Args:
        directory: Directory containing the JSON files to process.
        converted_filename: Path to the tracking file of processed names.
    """
    converted_files = read_converted_files(converted_filename)
    # NOTE(review): pybloom_live's fixed-size BloomFilter is believed to raise
    # once more than `capacity` items are added -- confirm the corpus stays
    # below 1,000,000 unique lines per language.
    bloom_filter_chinese = BloomFilter(capacity=1000000, error_rate=0.001)
    bloom_filter_english = BloomFilter(capacity=1000000, error_rate=0.001)

    # Opening a file in append mode does not create missing parent
    # directories, so make sure the output directory exists first.
    os.makedirs('./result', exist_ok=True)

    for filename in os.listdir(directory):
        if not filename.endswith('.json') or filename in converted_files:
            continue

        file_path = os.path.join(directory, filename)
        with open(file_path, 'r', encoding='utf-8') as json_file:
            data = json.load(json_file)
        # `or []` also covers an explicit `"segments": null`.
        segments = data.get('segments') or []

        # The outputs are reopened for each input file on purpose: they are
        # closed (and therefore flushed) before the file is marked converted.
        with open('./result/source.txt', 'a', encoding='utf-8') as source_file, \
                open('./result/target.txt', 'a', encoding='utf-8') as target_file:
            for segment in segments:
                chinese_text = _clean_segment_text(segment.get('chinese'))
                english_text = _clean_segment_text(segment.get('english'))

                # A pair with an empty side is useless as parallel data.
                if not chinese_text.strip() or not english_text.strip():
                    continue
                # Drop the pair if either side was (probably) seen before.
                if (chinese_text in bloom_filter_chinese
                        or english_text in bloom_filter_english):
                    continue

                bloom_filter_chinese.add(chinese_text)
                bloom_filter_english.add(english_text)
                # Always write both sides together to keep the files aligned.
                source_file.write(chinese_text + '\n')
                target_file.write(english_text + '\n')

        write_converted_file(converted_filename, filename)
if __name__ == "__main__":
    # Script entry point: convert every unprocessed JSON file in the input
    # directory, tracking finished files in converted.txt.
    converted_filename = './result/converted.txt'
    json_directory = './output'  # replace with the path to your JSON directory
    process_json_files(
        directory=json_directory,
        converted_filename=converted_filename,
    )