From 4c9f411f67c71933d56b9b2ab9a0b651f40eda47 Mon Sep 17 00:00:00 2001
From: alikia2x
Date: Sun, 15 Sep 2024 23:43:01 +0800
Subject: [PATCH] add: content fetcher for translate

---
 translate/fetcher.py       | 125 +++++++++++++++++++++++++++++++++++++++++++
 translate/llm-translate.py |  22 +++++--
 translate/spider.py        |  21 ++++---
 3 files changed, 155 insertions(+), 13 deletions(-)
 create mode 100644 translate/fetcher.py

diff --git a/translate/fetcher.py b/translate/fetcher.py
new file mode 100644
index 0000000..8637ac8
--- /dev/null
+++ b/translate/fetcher.py
@@ -0,0 +1,125 @@
+import sqlite3
+import trafilatura
+import hashlib
+import re
+import os
+from trafilatura.readability_lxml import is_probably_readerable
+from concurrent.futures import ThreadPoolExecutor, as_completed
+
+# Constants
+MAX_FETCH_LIMIT = 300  # Maximum number of URLs to fetch per run
+
+# Open a connection to the SQLite database
+def connect_db(db_path):
+    return sqlite3.connect(db_path)
+
+# Create the fetch_list table if it does not exist
+def create_fetch_list_table(conn):
+    cursor = conn.cursor()
+    cursor.execute("""
+        CREATE TABLE IF NOT EXISTS fetch_list (
+            url TEXT PRIMARY KEY,
+            fetched_time TIMESTAMP DEFAULT CURRENT_TIMESTAMP
+        )
+    """)
+    conn.commit()
+
+# Get URLs that have not been fetched yet
+def get_unfetched_urls(conn, limit):
+    cursor = conn.cursor()
+    cursor.execute("""
+        SELECT url FROM url_list
+        WHERE url NOT IN (SELECT url FROM fetch_list)
+        LIMIT ?
+    """, (limit,))
+    return [row[0] for row in cursor.fetchall()]
+
+# Download a page and extract its main text content
+def fetch_and_extract_content(url):
+    downloaded = trafilatura.fetch_url(url)
+    if not downloaded:
+        return None
+
+    html_string = downloaded
+    if not is_probably_readerable(html_string):
+        return None
+
+    content = trafilatura.extract(html_string, output_format="txt", url=url, favor_precision=True)
+    return content
+
+# Compute the MD5 hash of a URL
+def md5_hash(url):
+    return hashlib.md5(url.encode()).hexdigest()
+
+# Segmentation rules: flush a segment at 10 sentences or ~1500 characters
+def split_content(content):
+    sentences = re.split(r'[。！？；.!?;]', content)
+    segments = []
+    current_segment = []
+    current_length = 0
+
+    for sentence in sentences:
+        sentence_length = len(sentence)
+        if current_segment and (len(current_segment) >= 10 or current_length + sentence_length > 1500):
+            segments.append(''.join(current_segment))
+            current_segment = []
+            current_length = 0
+
+        current_segment.append(sentence)
+        current_length += sentence_length
+
+    if current_segment:
+        segments.append(''.join(current_segment))
+
+    return segments
+
+# Save the segments to individual text files
+def save_segments(url, segments, path):
+    url_hash = md5_hash(url)
+    for idx, segment in enumerate(segments):
+        save_path = os.path.join(path, f"{url_hash}_{idx}.txt")
+        with open(save_path, "w", encoding="utf-8") as f:
+            f.write(segment)
+
+# Record a URL as fetched
+def record_fetched_url(conn, url):
+    cursor = conn.cursor()
+    cursor.execute("""
+        INSERT INTO fetch_list (url, fetched_time)
+        VALUES (?, CURRENT_TIMESTAMP)
+    """, (url,))
+    conn.commit()
+
+# Process a single URL
+def process_url(url, db_path, save_path):
+    conn = connect_db(db_path)
+    content = fetch_and_extract_content(url)
+    if content:
+        segments = split_content(content)
+        save_segments(url, segments, save_path)
+    record_fetched_url(conn, url)
+    conn.close()
+
+# Main entry point
+def main():
+    db_path = "crawler.db"
+    save_path = "./source"
+    conn = connect_db(db_path)
+
+    # Create the fetch_list table
+    create_fetch_list_table(conn)
+
+    unfetched_urls = get_unfetched_urls(conn, MAX_FETCH_LIMIT)
+    conn.close()
+
+    with ThreadPoolExecutor(max_workers=10) as executor:
+        futures = [executor.submit(process_url, url, db_path, save_path) for url in unfetched_urls]
+
+        for future in as_completed(futures):
+            try:
+                future.result()
+            except Exception as e:
+                print(f"An error occurred: {e}")
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/translate/llm-translate.py b/translate/llm-translate.py
index 3873768..99b3897 100644
--- a/translate/llm-translate.py
+++ b/translate/llm-translate.py
@@ -3,6 +3,7 @@ from dotenv import load_dotenv
 import json
 import threading
 from openai import OpenAI
+from pathlib import Path
 
 load_dotenv()
 
@@ -14,6 +15,12 @@ client = OpenAI(
 system_prompt = """
 The user will provide some text. Please parse the text into segments, each segment contains 1 to 5 sentences.
 Translate each sentence into the corresponding language. If the input is in Chinese, return the English translation, and vice versa.
+IMPORTANT:
+1. Segments should not be too long: each segment should be under 100 English words or 180 Chinese characters.
+2. Segments or sentences that appear multiple times in the original text are output only **once** in the returned translation.
+3. **Content with clearly different semantics, such as different components on a web page, should be split into a separate segment no matter how short it is.**
+4. **Fixed text such as web page headers, footers, copyright notices, website or company names, and conventional link text (such as "About Us", "Privacy Policy", etc.) should be ignored and not translated.**
+
 EXAMPLE INPUT:
 法律之前人人平等，并有权享受法律的平等保护，不受任何歧视。人人有权享受平等保护，以免受违反本宣言的任何歧视行为以及煽动这种歧视的任何行为之害。
 
@@ -47,11 +54,11 @@ def process_file(input_file, output_dir):
 
         translation = translate_text(text)
 
-        output_file = os.path.join(output_dir, os.path.basename(input_file) + '.json')
-        with open(output_file, 'w', encoding='utf-8') as f:
+        output_path = os.path.join(output_dir, Path(input_file).stem + ".json")
+        with open(output_path, 'w', encoding='utf-8') as f:
             json.dump(translation, f, ensure_ascii=False, indent=4)
 
-        print(f"Processed {input_file} and saved to {output_file}")
+        print(f"Successfully translated and saved to {output_path}")
     except Exception as e:
         print(f"Error processing {input_file}: {e}")
 
@@ -60,7 +67,12 @@ def batch_process(input_dir, output_dir, num_threads=4):
     if not os.path.exists(output_dir):
         os.makedirs(output_dir)
 
-    files = [os.path.join(input_dir, f) for f in os.listdir(input_dir) if os.path.isfile(os.path.join(input_dir, f))]
+    input_files = [f for f in os.listdir(input_dir) if os.path.isfile(os.path.join(input_dir, f))]
+    output_files = [f for f in os.listdir(output_dir) if os.path.isfile(os.path.join(output_dir, f))]
+
+    output_stems = {Path(f).stem for f in output_files}
+
+    files = [os.path.join(input_dir, f) for f in input_files if Path(f).stem not in output_stems]
 
     threads = []
     for file in files:
@@ -79,4 +91,4 @@
 if __name__ == "__main__":
     input_dir = "./source"
     output_dir = "./output"
-    batch_process(input_dir, output_dir)
\ No newline at end of file
+    batch_process(input_dir, output_dir, num_threads=64)
\ No newline at end of file
diff --git a/translate/spider.py b/translate/spider.py
index c8aa510..7b9a54b 100644
--- a/translate/spider.py
+++ b/translate/spider.py
@@ -1,9 +1,10 @@
+import os
 import requests
 from bs4 import BeautifulSoup
 import sqlite3
 import urllib.robotparser as urobot
 from urllib.parse import urljoin, urlparse
-
+from dotenv import load_dotenv
 
 MAX_RECURSION_DEPTH = 5
 MAX_URLS = 1000
@@ -124,11 +125,15 @@ def main(seed_url, rp, sitemap=None):
             save_url(sitemap_url)
     crawl(seed_url, rp=rp)
 
+def get_config(key):
+    return os.getenv(key)
+
 # Example usage
-# if __name__ == "__main__":
-#     seed_url = "https://www.bbc.co.uk/news"
-#     rp = urobot.RobotFileParser()
-#     rp.set_url("https://www.bbc.co.uk/robots.txt")
-#     rp.read()
-#     main(seed_url, rp, "https://www.bbc.co.uk/sitemap.xml")
-#     conn.close()
+if __name__ == "__main__":
+    load_dotenv()
+    seed_url = get_config("SEED_URL")
+    rp = urobot.RobotFileParser()
+    rp.set_url(get_config("ROBOTS_URL"))
+    rp.read()
+    main(seed_url, rp, get_config("SITEMAP_URL"))
+    conn.close()
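
Usage note: with this change spider.py reads SEED_URL, ROBOTS_URL, and SITEMAP_URL from the environment via python-dotenv instead of the hard-coded example values removed above. A minimal .env sketch, reusing the removed BBC URLs purely as placeholder values (any site whose robots.txt permits crawling would do):

    SEED_URL=https://www.bbc.co.uk/news
    ROBOTS_URL=https://www.bbc.co.uk/robots.txt
    SITEMAP_URL=https://www.bbc.co.uk/sitemap.xml

fetcher.py itself keeps the hard-coded crawler.db and ./source paths shown in the new file, so it needs no entries here.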