add: dataset

alikia2x (寒寒) 2024-09-16 17:29:12 +08:00
parent a9a7430a58
commit 932cbd4336
Signed by: alikia2x
GPG Key ID: 56209E0CCD8420C6
3 changed files with 21 additions and 4 deletions

.gitignore (vendored): 2 changes

@@ -13,3 +13,5 @@ translate/output
 translate/source
 translate/result
 *.db
+dataset/raw
+translate/special-spiders

dataset/public/README.md (new file): 13 changes

@@ -0,0 +1,13 @@
+# sparkastML Datasets
+
+Here are the datasets published by the sparkastML project.
+
+## Translation ZH-EN
+
+High-quality, fresh synthetic data containing over 100,000 sentences of Chinese-English parallel corpora.
+
+Version: 1
+Last Update: 2024/09/16
+
+[Google Drive](https://drive.google.com/drive/folders/1_ADblZcB5p9BUvawkYDmp1qIUDZgkkoe?usp=sharing)
+[IPFS](https://ipfs.a2x.pub/ipfs/QmYz4ew4nSzPc6TZvoWk6jXpGN82qt3J46nwfb75N2YKc4/)

(Third changed file; the file path is not shown in this view.)

@@ -38,13 +38,15 @@ def get_unfetched_urls(conn, limit):
     return [row[0] for row in cursor.fetchall()]
 # Download and extract page content
-def fetch_and_extract_content(url):
+def fetch_and_extract_content(conn, url):
     downloaded = trafilatura.fetch_url(url)
     if not downloaded:
         return None
     html_string = downloaded
-    if not is_probably_readerable(html_string):
+    if not is_probably_readerable(html_string) and os.getenv("FETCH_IGNORE_CHECK").capitalize() == "TRUE":
+        print(f"URL {url} is not readable.")
+        record_fetched_url(conn, url)
         return None
     content = trafilatura.extract(html_string, output_format="txt", url=url, favor_precision=True)
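
One caveat about the new guard: `os.getenv("FETCH_IGNORE_CHECK")` returns `None` when the variable is unset, so calling `.capitalize()` on it raises `AttributeError`, and `str.capitalize()` produces `"True"` rather than `"TRUE"`, so the comparison as written can never succeed. A minimal defensive sketch of the same flag check (the helper name `env_flag` is hypothetical and not part of this commit):

```python
import os

def env_flag(name: str, default: bool = False) -> bool:
    """Read a boolean flag from the environment.

    Hypothetical helper, not from the commit: os.getenv() returns None when
    the variable is unset, and str.capitalize() turns "TRUE"/"true" into
    "True", so comparing against "TRUE" never matches. Normalising with
    .lower() avoids both pitfalls.
    """
    value = os.getenv(name)
    if value is None:
        return default
    return value.strip().lower() in ("1", "true", "yes")

# Possible use inside fetch_and_extract_content:
#     if not is_probably_readerable(html_string) and env_flag("FETCH_IGNORE_CHECK"):
#         ...
```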
@@ -100,7 +102,7 @@ def process_url(url, db_path, save_path):
     cooldown_base = float(os.getenv("FETCH_COOLDOWN"))
     time.sleep(random.random() * cooldown_base)
     conn = connect_db(db_path)
-    content = fetch_and_extract_content(url)
+    content = fetch_and_extract_content(conn, url)
     if content:
         segments = split_content(content)
         save_segments(url, segments, save_path)
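
For context, `process_url` throttles itself with a uniformly random sleep of up to `FETCH_COOLDOWN` seconds before fetching, so both environment variables used by the fetcher need to be set before a run. A small sketch of that cooldown behaviour (the values below are illustrative assumptions, not taken from the repository):

```python
import os
import random
import time

# Illustrative values only; the variable names come from the diff above.
os.environ.setdefault("FETCH_COOLDOWN", "2.5")       # upper bound in seconds
os.environ.setdefault("FETCH_IGNORE_CHECK", "true")

cooldown_base = float(os.getenv("FETCH_COOLDOWN"))
# Same pattern as process_url: sleep a random fraction of the cooldown so
# parallel workers do not hit the target hosts in lockstep.
time.sleep(random.random() * cooldown_base)
print(f"waited at most {cooldown_base} seconds before fetching")
```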