# sparkastML/translate/split_source.py

import os
import re

def split_content(content, max_sentences=25, max_length=1200):
    """Split text into segments made of whole sentences.

    Sentences end at Chinese or ASCII terminators (。！？；.!?;). Every
    terminator stays attached to the sentence it closes, so concatenating
    the returned segments reproduces ``content`` exactly.

    Args:
        content: The text to split.
        max_sentences: Maximum number of sentences per segment.
        max_length: Maximum number of characters per segment. A single
            sentence longer than this is kept whole in a segment of its own.

    Returns:
        A list of segment strings. Empty ``content`` yields ``['']``.
    """
    terminators = '。！？；.!?;'
    # A sentence is a run of non-terminators plus any terminators that follow
    # it; terminators with no preceding text form a sentence of their own.
    sentence_pattern = f'[^{terminators}]+[{terminators}]*|[{terminators}]+'
    sentences = re.findall(sentence_pattern, content)
    if not sentences:
        # Only empty input has no sentences; keep the single-segment result.
        return [content]

    segments = []
    current_segment = []
    current_length = 0

    for sentence in sentences:
        sentence_length = len(sentence)
        # Flush only a non-empty segment, so an over-long sentence never
        # produces an empty segment in front of it.
        if current_segment and (
            len(current_segment) >= max_sentences
            or current_length + sentence_length > max_length
        ):
            segments.append(''.join(current_segment))
            current_segment = []
            current_length = 0

        current_segment.append(sentence)
        current_length += sentence_length

    if current_segment:
        segments.append(''.join(current_segment))

    return segments

def process_files_in_directory(directory):
    """Split each regular file in ``directory`` into numbered segment files.

    Every file is read as UTF-8 and passed to ``split_content``. When that
    yields more than one segment, the segments are written next to the
    original as ``<filename>_1``, ``<filename>_2``, ... and the original file
    is then removed. Otherwise the file is left untouched and a message is
    printed.

    Args:
        directory: Path of the directory whose files are processed.
            Subdirectories are skipped.
    """
    for filename in os.listdir(directory):
        file_path = os.path.join(directory, filename)

        # Only process regular files; skip directories.
        if not os.path.isfile(file_path):
            continue

        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()

        segments = split_content(content)

        if len(segments) <= 1:
            print(f"文件 {filename} 不需要分割")
            continue

        # Save the split files.
        for index, segment in enumerate(segments, start=1):
            new_file_path = os.path.join(directory, f"{filename}_{index}")

            with open(new_file_path, 'w', encoding='utf-8') as new_file:
                new_file.write(segment)

        # Remove the original only after every segment has been written, so
        # a failed write cannot lose the source text.
        os.remove(file_path)

# Directory whose files are split when this file is run as a script.
directory = './source-new'

# Guard the entry point so that importing this module does not touch the
# filesystem.
if __name__ == "__main__":
    process_files_in_directory(directory)