52 lines
1.7 KiB
Python
52 lines
1.7 KiB
Python
import os
|
|
import re
|
|
|
|
def split_content(content, max_sentences=25, max_chars=1200):
    """Split *content* into segments made of whole sentences.

    The text is cut right after each sentence terminator (``。 ! ? ; .``), so
    every sentence keeps its punctuation and joining the returned segments
    reproduces *content* exactly. Sentences are then packed greedily into
    segments.

    Args:
        content: The text to split.
        max_sentences: Maximum number of sentences per segment.
        max_chars: Maximum number of characters per segment. A single
            sentence longer than this is kept whole in its own segment
            rather than being cut in the middle.

    Returns:
        A list of non-empty segment strings; an empty list for empty input.
    """
    # A zero-width split after each terminator keeps the punctuation attached
    # to its sentence; the filter drops the empty tail the split leaves when
    # the text ends with a terminator.
    sentences = [
        piece
        for piece in re.split(r'(?<=[。!?;.!?;])', content)
        if piece
    ]

    segments = []
    current_segment = []
    current_length = 0

    for sentence in sentences:
        sentence_length = len(sentence)
        segment_full = (
            len(current_segment) >= max_sentences
            or current_length + sentence_length > max_chars
        )

        # Flush only when something is pending: an oversized first sentence
        # must not produce an empty leading segment.
        if current_segment and segment_full:
            segments.append(''.join(current_segment))
            current_segment = []
            current_length = 0

        current_segment.append(sentence)
        current_length += sentence_length

    if current_segment:
        segments.append(''.join(current_segment))

    return segments
|
|
|
|
def process_files_in_directory(directory):
    """Split every regular file in *directory* into smaller segment files.

    Each file is read as UTF-8 and passed to ``split_content``. When more
    than one segment comes back, the segments are written next to the
    original as ``<filename>_1``, ``<filename>_2``, ... and the original file
    is then removed. Otherwise the file is left untouched and a message is
    printed.

    Args:
        directory: Path of the directory to scan. Subdirectories are skipped,
            not descended into.
    """
    for filename in os.listdir(directory):
        file_path = os.path.join(directory, filename)

        # Only process regular files; skip directories.
        if not os.path.isfile(file_path):
            continue

        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()

        segments = split_content(content)

        if len(segments) <= 1:
            print(f"文件 {filename} 不需要分割")
            continue

        # Save the segments first: if a write fails part-way through, the
        # original file is still on disk and no content is lost.
        for index, segment in enumerate(segments, start=1):
            new_filename = f"{filename}_{index}"
            new_file_path = os.path.join(directory, new_filename)

            with open(new_file_path, 'w', encoding='utf-8') as new_file:
                new_file.write(segment)

        # Delete the original only after every segment has been written.
        os.remove(file_path)
|
|
|
|
# Directory whose files will be split in place.
directory = './source-new'

# Run the (destructive) processing only when executed as a script, so that
# importing this module does not delete or rewrite any files.
if __name__ == "__main__":
    process_files_in_directory(directory)