sparkastML/translate/postprocess.py

import os
import json
from pybloom_live import BloomFilter

def read_converted_files(filename):
    """读取converted.txt文件，返回一个包含已处理文件名的集合"""
    if os.path.exists(filename):
        with open(filename, 'r', encoding='utf-8') as file:
            return set(file.read().splitlines())
    return set()

def write_converted_file(filename, file_name):
    """将处理过的文件名写入converted.txt"""
    with open(filename, 'a', encoding='utf-8') as file:
        file.write(file_name + '\n')

def process_json_files(directory, converted_filename):
    """处理指定目录下的所有json文件"""
    converted_files = read_converted_files(converted_filename)
    bloom_filter_chinese = BloomFilter(capacity=1000000, error_rate=0.001)  # 初始化Bloom Filter
    bloom_filter_english = BloomFilter(capacity=1000000, error_rate=0.001)  # 初始化Bloom Filter

    for filename in os.listdir(directory):
        if filename.endswith('.json') and filename not in converted_files:
            file_path = os.path.join(directory, filename)
            with open(file_path, 'r', encoding='utf-8') as json_file:
                data = json.load(json_file)
                segments = data.get('segments', [])

                with open('./result/source.txt', 'a', encoding='utf-8') as source_file, \
                     open('./result/target.txt', 'a', encoding='utf-8') as target_file:
                    for segment in segments:
                        chinese_text = segment.get('chinese', '').replace('\n', ' ')
                        english_text = segment.get('english', '').replace('\n', ' ')

                        if chinese_text not in bloom_filter_chinese and english_text not in bloom_filter_english:
                            bloom_filter_chinese.add(chinese_text)
                            source_file.write(chinese_text + '\n')
                            bloom_filter_english.add(english_text)
                            target_file.write(english_text + '\n')

            write_converted_file(converted_filename, filename)

if __name__ == "__main__":
    json_directory = './output'  # 替换为你的JSON文件目录路径
    converted_filename = './result/converted.txt'

    process_json_files(json_directory, converted_filename)