# sparkastML/translate/split_source.py
#
# 52 lines
# 1.7 KiB
# Python

import os
import re
def split_content(content, max_sentences=25, max_chars=1200):
    """Split text into size-bounded segments made of whole sentences.

    The text is cut after sentence-ending punctuation and the resulting
    sentences are packed greedily, in order, into segments. A segment is
    closed as soon as it already holds ``max_sentences`` sentences or adding
    the next sentence would push it past ``max_chars`` characters.

    Punctuation stays attached to its sentence, so joining the returned
    segments reproduces ``content`` exactly.

    Args:
        content: The text to split.
        max_sentences: Maximum number of sentences packed into one segment.
        max_chars: Character budget per segment. A single sentence longer
            than this is emitted as its own oversized segment rather than
            being cut mid-sentence.

    Returns:
        A list of non-empty segment strings in their original order. Empty
        input yields an empty list.
    """
    # Each match is either "text + trailing terminators" or a leading run of
    # terminators, so every character of the input lands in exactly one piece
    # and nothing is dropped when the pieces are joined back together.
    # NOTE(review): the class lists `!?;` twice -- possibly the full-width
    # variants were lost to character normalization; confirm with the author.
    sentences = re.findall(
        r'[^。!?;.!?;]+[。!?;.!?;]*|[。!?;.!?;]+',
        content,
    )

    segments = []
    current_segment = []
    current_length = 0
    for sentence in sentences:
        sentence_length = len(sentence)
        is_full = (
            len(current_segment) >= max_sentences
            or current_length + sentence_length > max_chars
        )
        # Only flush a non-empty buffer: an over-long first sentence must not
        # produce an empty segment ahead of it.
        if current_segment and is_full:
            segments.append(''.join(current_segment))
            current_segment = []
            current_length = 0
        current_segment.append(sentence)
        current_length += sentence_length

    if current_segment:
        segments.append(''.join(current_segment))
    return segments
def process_files_in_directory(directory):
    """Split each oversized text file in ``directory`` into numbered parts.

    Every regular file directly inside ``directory`` is read as UTF-8 and
    passed to ``split_content``. When that yields more than one segment, the
    segments are written next to the original as ``<filename>_1``,
    ``<filename>_2``, ... and the original file is then removed. Files that
    do not yield more than one segment are left untouched and a message is
    printed for each of them.

    Args:
        directory: Path of the directory to scan. Subdirectories are skipped,
            not recursed into.

    Raises:
        OSError: If the directory cannot be listed, or a file cannot be
            read, written, or removed.
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    for filename in os.listdir(directory):
        file_path = os.path.join(directory, filename)
        # Only process regular files; skip directories.
        if not os.path.isfile(file_path):
            continue

        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()

        segments = split_content(content)
        if len(segments) <= 1:
            print(f"文件 {filename} 不需要分割")
            continue

        # Save the split parts first and delete the original only once every
        # part is on disk, so a failed write cannot lose the source text.
        for i, segment in enumerate(segments, start=1):
            new_file_path = os.path.join(directory, f"{filename}_{i}")
            with open(new_file_path, 'w', encoding='utf-8') as new_file:
                new_file.write(segment)

        # Remove the original file now that all parts were written.
        os.remove(file_path)
# Directory whose files are split in place.
directory = './source-new'

if __name__ == "__main__":
    # Guarded so that importing this module (e.g. to reuse split_content)
    # does not rewrite and delete files as a side effect.
    process_files_in_directory(directory)