48 lines
2.2 KiB
Python
48 lines
2.2 KiB
Python
import os
|
||
import json
|
||
from pybloom_live import BloomFilter
|
||
|
||
def read_converted_files(filename):
    """Read the tracking file and return the set of already-processed names.

    Args:
        filename: Path to the tracking file, which holds one processed
            file name per line.

    Returns:
        A set containing every line of the tracking file, or an empty set
        when the file does not exist yet.

    Raises:
        OSError: If the file exists but cannot be opened (for example,
            because of missing permissions). Only a missing file is
            treated as "nothing processed yet".
    """
    # Try the open directly instead of checking os.path.exists() first:
    # that avoids a race between the check and the open, and keeps an
    # unreadable tracking file from being mistaken for an empty one.
    try:
        with open(filename, 'r', encoding='utf-8') as file:
            return set(file.read().splitlines())
    except FileNotFoundError:
        return set()
|
||
|
||
def write_converted_file(filename, file_name):
    """Append the name of a processed file to the tracking file.

    Args:
        filename: Path to the tracking file; it is opened in append mode,
            so existing entries are kept.
        file_name: Name of the file to record; written on its own line.
    """
    with open(filename, mode='a', encoding='utf-8') as tracker:
        record = file_name + '\n'
        tracker.write(record)
|
||
|
||
def _flatten_text(value):
    """Return *value* as a single-line string suitable for one output line."""
    if value is None:
        # A key that is present with a JSON null yields None from .get(),
        # which would otherwise fail on .replace().
        return ''
    text = value if isinstance(value, str) else str(value)
    # Collapse CRLF first so it becomes one space, then any lone CR or LF.
    # A stray '\r' left in the text would split the line for readers that
    # use universal newlines and misalign source.txt against target.txt.
    return text.replace('\r\n', ' ').replace('\r', ' ').replace('\n', ' ')


def process_json_files(directory, converted_filename,
                       source_path='./result/source.txt',
                       target_path='./result/target.txt'):
    """Extract de-duplicated Chinese/English segment pairs from JSON files.

    Every ``*.json`` file in *directory* that is not yet listed in the
    tracking file is loaded, and its ``segments`` entries are appended as
    parallel lines to *source_path* (the ``chinese`` field) and
    *target_path* (the ``english`` field). Once a file has been written out,
    its name is appended to the tracking file.

    A pair is written only when neither its Chinese text nor its English
    text has been seen earlier in this call. Membership is tracked with
    Bloom filters, which can report false positives (configured error rate
    0.001), so a small fraction of new pairs may be skipped. The filters are
    rebuilt on every call, so de-duplication does not extend to lines
    written by earlier runs.

    Args:
        directory: Directory that is scanned (non-recursively) for JSON files.
        converted_filename: Path to the tracking file of processed names.
        source_path: Output file that receives the Chinese lines.
        target_path: Output file that receives the English lines.

    Raises:
        OSError: If the directory or any of the files cannot be accessed.
        json.JSONDecodeError: If a JSON file is malformed.
    """
    converted_files = read_converted_files(converted_filename)
    # NOTE(review): pybloom_live's fixed-size BloomFilter is expected to
    # raise once more than `capacity` items are added -- confirm that
    # 1,000,000 distinct segments per language is sufficient.
    bloom_filter_chinese = BloomFilter(capacity=1000000, error_rate=0.001)
    bloom_filter_english = BloomFilter(capacity=1000000, error_rate=0.001)

    # Opening a file in append mode does not create missing parent
    # directories, so make sure they exist before the first write.
    for output_path in (source_path, target_path, converted_filename):
        parent = os.path.dirname(output_path)
        if parent:
            os.makedirs(parent, exist_ok=True)

    for filename in os.listdir(directory):
        if not filename.endswith('.json') or filename in converted_files:
            continue

        file_path = os.path.join(directory, filename)
        with open(file_path, 'r', encoding='utf-8') as json_file:
            data = json.load(json_file)
        # `or []` also covers an explicit "segments": null.
        segments = data.get('segments') or []

        # The outputs are opened per input file so they are flushed and
        # closed before the file is recorded as processed below.
        with open(source_path, 'a', encoding='utf-8') as source_file, \
                open(target_path, 'a', encoding='utf-8') as target_file:
            for segment in segments:
                chinese_text = _flatten_text(segment.get('chinese'))
                english_text = _flatten_text(segment.get('english'))

                # Skip the pair if either side was already seen.
                if (chinese_text in bloom_filter_chinese
                        or english_text in bloom_filter_english):
                    continue

                bloom_filter_chinese.add(chinese_text)
                source_file.write(chinese_text + '\n')
                bloom_filter_english.add(english_text)
                target_file.write(english_text + '\n')

        write_converted_file(converted_filename, filename)
|
||
|
||
if __name__ == "__main__":
    # Replace with the path to your directory of JSON files.
    json_directory = './output'
    # Tracking file that lists the JSON files already processed.
    converted_filename = './result/converted.txt'

    process_json_files(
        directory=json_directory,
        converted_filename=converted_filename,
    )