add: dataset

2024-09-16 17:29:12 +08:00 · 2024-09-16 17:29:12 +08:00 · 932cbd4336
commit 932cbd4336
parent a9a7430a58
3 changed files with 21 additions and 4 deletions
--- a/.gitignore
+++ b/.gitignore
@ -12,4 +12,6 @@ __pycache__
 translate/output
 translate/source
 translate/result
-*.db
+*.db
 dataset/raw
 translate/special-spiders
--- a/dataset/public/README.md
+++ b/dataset/public/README.md
@ -0,0 +1,13 @@
 # sparkastML Datasets
 Here are the datasets published by the sparkastML project.
 ## Translation ZH-EN
 A high-quality, fresh synthetic Chinese-English parallel corpus containing over 100,000 sentences.
 Version: 1
 Last Update: 2024/09/16
 [Google Drive](https://drive.google.com/drive/folders/1_ADblZcB5p9BUvawkYDmp1qIUDZgkkoe?usp=sharing)
 [IPFS](https://ipfs.a2x.pub/ipfs/QmYz4ew4nSzPc6TZvoWk6jXpGN82qt3J46nwfb75N2YKc4/)
--- a/translate/fetcher.py
+++ b/translate/fetcher.py
@ -38,13 +38,15 @@ def get_unfetched_urls(conn, limit):
    return [row[0] for row in cursor.fetchall()]
 # 下载并提取网页内容
-def fetch_and_extract_content(url):
+def fetch_and_extract_content(conn, url):
    downloaded = trafilatura.fetch_url(url)
    if not downloaded:
        return None
    html_string = downloaded
-    if not is_probably_readerable(html_string):
+    if not is_probably_readerable(html_string) and os.getenv("FETCH_IGNORE_CHECK").capitalize() == "TRUE":
        print(f"URL {url} is not readable.")
        record_fetched_url(conn, url)
        return None
    content = trafilatura.extract(html_string, output_format="txt", url=url, favor_precision=True)
@ -100,7 +102,7 @@ def process_url(url, db_path, save_path):
    cooldown_base = float(os.getenv("FETCH_COOLDOWN"))
    time.sleep(random.random() * cooldown_base)
    conn = connect_db(db_path)
-    content = fetch_and_extract_content(url)
+    content = fetch_and_extract_content(conn, url)
    if content:
        segments = split_content(content)
        save_segments(url, segments, save_path)