add: content fetcher for translate

alikia2x (寒寒) 2024-09-15 23:43:01 +08:00
parent ebd1113a6e
commit 4c9f411f67
Signed by: alikia2x
GPG Key ID: 56209E0CCD8420C6
3 changed files with 155 additions and 13 deletions

translate/fetcher.py (new file, 125 additions)

@@ -0,0 +1,125 @@
import sqlite3
import trafilatura
import hashlib
import re
import os
from trafilatura.readability_lxml import is_probably_readerable
from concurrent.futures import ThreadPoolExecutor, as_completed

# Constants
MAX_FETCH_LIMIT = 300  # Maximum number of tasks to fetch per run


# Database connection
def connect_db(db_path):
    return sqlite3.connect(db_path)


# Create the fetch_list table
def create_fetch_list_table(conn):
    cursor = conn.cursor()
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS fetch_list (
            url TEXT PRIMARY KEY,
            fetched_time TIMESTAMP DEFAULT CURRENT_TIMESTAMP
        )
    """)
    conn.commit()


# Get URLs that have not been fetched yet
def get_unfetched_urls(conn, limit):
    cursor = conn.cursor()
    cursor.execute("""
        SELECT url FROM url_list
        WHERE url NOT IN (SELECT url FROM fetch_list)
        LIMIT ?
    """, (limit,))
    return [row[0] for row in cursor.fetchall()]


# Download a page and extract its main content
def fetch_and_extract_content(url):
    downloaded = trafilatura.fetch_url(url)
    if not downloaded:
        return None
    html_string = downloaded
    if not is_probably_readerable(html_string):
        return None
    content = trafilatura.extract(html_string, output_format="txt", url=url, favor_precision=True)
    return content


# Compute the MD5 hash of a URL
def md5_hash(url):
    return hashlib.md5(url.encode()).hexdigest()


# Segmentation rules
def split_content(content):
    sentences = re.split(r'[。!?;.!?;]', content)
    segments = []
    current_segment = []
    current_length = 0

    for sentence in sentences:
        sentence_length = len(sentence)
        if (len(current_segment) >= 10 or current_length + sentence_length > 1500):
            segments.append(''.join(current_segment))
            current_segment = []
            current_length = 0
        current_segment.append(sentence)
        current_length += sentence_length

    if current_segment:
        segments.append(''.join(current_segment))

    return segments


# Save segments to files
def save_segments(url, segments, path):
    url_hash = md5_hash(url)
    for idx, segment in enumerate(segments):
        save_path = os.path.join(path, f"{url_hash}_{idx}.txt")
        with open(save_path, "w", encoding="utf-8") as f:
            f.write(segment)


# Record a fetched URL
def record_fetched_url(conn, url):
    cursor = conn.cursor()
    cursor.execute("""
        INSERT INTO fetch_list (url, fetched_time)
        VALUES (?, CURRENT_TIMESTAMP)
    """, (url,))
    conn.commit()


# Process a single URL
def process_url(url, db_path, save_path):
    conn = connect_db(db_path)
    content = fetch_and_extract_content(url)
    if content:
        segments = split_content(content)
        save_segments(url, segments, save_path)
        record_fetched_url(conn, url)
    conn.close()


# Main function
def main():
    db_path = "crawler.db"
    save_path = "./source"
    conn = connect_db(db_path)

    # Create the fetch_list table
    create_fetch_list_table(conn)

    unfetched_urls = get_unfetched_urls(conn, MAX_FETCH_LIMIT)
    conn.close()

    with ThreadPoolExecutor(max_workers=10) as executor:
        futures = [executor.submit(process_url, url, db_path, save_path) for url in unfetched_urls]
        for future in as_completed(futures):
            try:
                future.result()
            except Exception as e:
                print(f"An error occurred: {e}")


if __name__ == "__main__":
    main()
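As a quick local check of what fetcher.py produces, the sketch below (not part of the commit) exercises md5_hash and split_content directly; it assumes it is run from the translate/ directory so that fetcher is importable, and the sample URL and text are invented.

# Sanity-check sketch for translate/fetcher.py; sample URL and text are invented.
from fetcher import md5_hash, split_content

sample_url = "https://example.com/article"
sample_text = "这是第一句。这是第二句。This is the third sentence. " * 20

segments = split_content(sample_text)
print(md5_hash(sample_url))              # prefix used by save_segments() for file names
print(len(segments), [len(s) for s in segments])
# Each segment holds at most 10 sentences or roughly 1500 characters and
# would be written to ./source as <md5>_<index>.txt by save_segments().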


@@ -3,6 +3,7 @@ from dotenv import load_dotenv
 import json
 import threading
 from openai import OpenAI
+from pathlib import Path

 load_dotenv()
@@ -14,6 +15,12 @@ client = OpenAI(
 system_prompt = """
 The user will provide some text. Please parse the text into segments, each segment contains 1 to 5 sentences. Translate each sentence into the corresponding language. If the input is in Chinese, return the English translation, and vice versa.
+IMPORTANT:
+1. Segment should not be too long, each segment should be under 100 English words or 180 Chinese characters.
+2. For segments or sentences that appear multiple times in the original text, they are only output **once** in the returned translation.
+3. **For content with obvious semantic differences, such as different components on a web page, no matter how short it is, it should be divided into a separate segment.**
+4. **Information such as web page headers, footers, and other fixed text, such as copyright notices, website or company names, and conventional link text (such as "About Us", "Privacy Policy", etc.) will be **ignored and not translated**
 EXAMPLE INPUT:
 法律之前人人平等并有权享受法律的平等保护不受任何歧视人人有权享受平等保护以免受违反本宣言的任何歧视行为以及煽动这种歧视的任何行为之害
@@ -47,11 +54,11 @@ def process_file(input_file, output_dir):
         translation = translate_text(text)
-        output_file = os.path.join(output_dir, os.path.basename(input_file) + '.json')
-        with open(output_file, 'w', encoding='utf-8') as f:
+        output_path = os.path.join(output_dir, Path(input_file).stem + ".json")
+        with open(output_path, 'w', encoding='utf-8') as f:
             json.dump(translation, f, ensure_ascii=False, indent=4)
-        print(f"Processed {input_file} and saved to {output_file}")
+        print(f"Successfully translated and saved to {output_path}")
     except Exception as e:
         print(f"Error processing {input_file}: {e}")
@@ -60,7 +67,12 @@ def batch_process(input_dir, output_dir, num_threads=4):
     if not os.path.exists(output_dir):
         os.makedirs(output_dir)
-    files = [os.path.join(input_dir, f) for f in os.listdir(input_dir) if os.path.isfile(os.path.join(input_dir, f))]
+    input_files = [f for f in os.listdir(input_dir) if os.path.isfile(os.path.join(input_dir, f))]
+    output_files = [f for f in os.listdir(output_dir) if os.path.isfile(os.path.join(output_dir, f))]
+    output_stems = {Path(f).stem for f in output_files}
+    files = [os.path.join(input_dir, f) for f in input_files if Path(f).stem not in output_stems]
     threads = []
     for file in files:
@@ -79,4 +91,4 @@ def batch_process(input_dir, output_dir, num_threads=4):
 if __name__ == "__main__":
     input_dir = "./source"
     output_dir = "./output"
-    batch_process(input_dir, output_dir)
+    batch_process(input_dir, output_dir, num_threads=64)
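The new skip logic in batch_process relies on Path(...).stem: because the output name is now derived from the stem, a source segment and its translation share a stem, and anything in ./output with a matching stem is not translated again. A tiny illustration with made-up file names:

# Illustration of the stem matching used above; the file names are made up.
from pathlib import Path

source_name = "0d6cae63ac95a4a3d296c12ee8ca7d19_0.txt"   # segment written by fetcher.py
output_name = "0d6cae63ac95a4a3d296c12ee8ca7d19_0.json"  # translation written by this script

print(Path(source_name).stem)                            # 0d6cae63ac95a4a3d296c12ee8ca7d19_0
print(Path(source_name).stem == Path(output_name).stem)  # True, so this segment is skipped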


@@ -1,9 +1,10 @@
+import os
 import requests
 from bs4 import BeautifulSoup
 import sqlite3
 import urllib.robotparser as urobot
 from urllib.parse import urljoin, urlparse
+from dotenv import load_dotenv

 MAX_RECURSION_DEPTH = 5
 MAX_URLS = 1000
@@ -124,11 +125,15 @@ def main(seed_url, rp, sitemap=None):
         save_url(sitemap_url)
     crawl(seed_url, rp=rp)
+
+def get_config(key):
+    return os.getenv(key)

 # Example usage
-# if __name__ == "__main__":
-#     seed_url = "https://www.bbc.co.uk/news"
-#     rp = urobot.RobotFileParser()
-#     rp.set_url("https://www.bbc.co.uk/robots.txt")
-#     rp.read()
-#     main(seed_url, rp, "https://www.bbc.co.uk/sitemap.xml")
-#     conn.close()
+if __name__ == "__main__":
+    load_dotenv()
+    seed_url = get_config("SEED_URL")
+    rp = urobot.RobotFileParser()
+    rp.set_url(get_config("ROBOTS_URL"))
+    rp.read()
+    main(seed_url, rp, get_config("SITEMAP_URL"))
+    conn.close()
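Since the crawler now reads its configuration through get_config(), a .env file along the following lines would reproduce the previously hard-coded example; the keys match the ones read above, and the BBC URLs are simply those from the removed comment block, used here only as illustrative values.

# .env (sketch; values are illustrative)
SEED_URL=https://www.bbc.co.uk/news
ROBOTS_URL=https://www.bbc.co.uk/robots.txt
SITEMAP_URL=https://www.bbc.co.uk/sitemap.xml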