From 932cbd4336bffdd1826d1f5d0ba857495ec44a29 Mon Sep 17 00:00:00 2001
From: alikia2x
Date: Mon, 16 Sep 2024 17:29:12 +0800
Subject: [PATCH] add: dataset

---
 .gitignore               |  4 +++-
 dataset/public/README.md | 13 +++++++++++++
 translate/fetcher.py     |  8 +++++---
 3 files changed, 21 insertions(+), 4 deletions(-)
 create mode 100644 dataset/public/README.md

diff --git a/.gitignore b/.gitignore
index 0d46073..2dcd07e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -12,4 +12,6 @@ __pycache__
 translate/output
 translate/source
 translate/result
-*.db
\ No newline at end of file
+*.db
+dataset/raw
+translate/special-spiders
\ No newline at end of file
diff --git a/dataset/public/README.md b/dataset/public/README.md
new file mode 100644
index 0000000..33be04f
--- /dev/null
+++ b/dataset/public/README.md
@@ -0,0 +1,13 @@
+# sparkastML Datasets
+
+Here are the datasets published by the sparkastML project.
+
+## Translation ZH-EN
+
+High-quality, fresh synthetic data containing over 100,000 sentences of Chinese-English parallel corpora.
+
+Version: 1
+Last Update: 2024/09/16
+
+[Google Drive](https://drive.google.com/drive/folders/1_ADblZcB5p9BUvawkYDmp1qIUDZgkkoe?usp=sharing)
+[IPFS](https://ipfs.a2x.pub/ipfs/QmYz4ew4nSzPc6TZvoWk6jXpGN82qt3J46nwfb75N2YKc4/)
diff --git a/translate/fetcher.py b/translate/fetcher.py
index b1bb1dc..55839f7 100644
--- a/translate/fetcher.py
+++ b/translate/fetcher.py
@@ -38,13 +38,15 @@ def get_unfetched_urls(conn, limit):
     return [row[0] for row in cursor.fetchall()]

 # Download and extract webpage content
-def fetch_and_extract_content(url):
+def fetch_and_extract_content(conn, url):
     downloaded = trafilatura.fetch_url(url)
     if not downloaded:
         return None

     html_string = downloaded
-    if not is_probably_readerable(html_string):
+    if not is_probably_readerable(html_string) and os.getenv("FETCH_IGNORE_CHECK", "").upper() == "TRUE":
+        print(f"URL {url} is not readable.")
+        record_fetched_url(conn, url)
         return None

     content = trafilatura.extract(html_string, output_format="txt", url=url, favor_precision=True)
@@ -100,7 +102,7 @@ def process_url(url, db_path, save_path):
     cooldown_base = float(os.getenv("FETCH_COOLDOWN"))
     time.sleep(random.random() * cooldown_base)
     conn = connect_db(db_path)
-    content = fetch_and_extract_content(url)
+    content = fetch_and_extract_content(conn, url)
     if content:
         segments = split_content(content)
         save_segments(url, segments, save_path)
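A minimal usage sketch of the new FETCH_IGNORE_CHECK gate introduced above, assuming the repository root is on PYTHONPATH and that translate/fetcher.py exposes connect_db and fetch_and_extract_content as in the diff; the database path and URL here are hypothetical:

import os

# When FETCH_IGNORE_CHECK is "true"/"TRUE", pages that fail the readability
# check are logged, recorded as fetched, and skipped (return None).
# When it is unset, unreadable pages are still passed to trafilatura.extract.
os.environ["FETCH_IGNORE_CHECK"] = "TRUE"

from translate.fetcher import connect_db, fetch_and_extract_content

conn = connect_db("crawler.db")  # hypothetical path to the URL database
content = fetch_and_extract_content(conn, "https://example.com/article")
if content is None:
    print("Page skipped: fetch failed or marked unreadable.")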