add: dataset
This commit is contained in:
parent
a9a7430a58
commit
932cbd4336
4
.gitignore
vendored
4
.gitignore
vendored
@ -12,4 +12,6 @@ __pycache__
|
|||||||
translate/output
|
translate/output
|
||||||
translate/source
|
translate/source
|
||||||
translate/result
|
translate/result
|
||||||
*.db
|
*.db
|
||||||
|
dataset/raw
|
||||||
|
translate/special-spiders
|
13
dataset/public/README.md
Normal file
13
dataset/public/README.md
Normal file
@ -0,0 +1,13 @@
|
|||||||
|
# sparkastML Datasets
|
||||||
|
|
||||||
|
Here are the datasets published by the sparkastML project.
|
||||||
|
|
||||||
|
## Translation ZH-EN
|
||||||
|
|
||||||
|
A high-quality, fresh synthetic Chinese-English parallel corpus containing over 100,000 sentences.
|
||||||
|
|
||||||
|
Version: 1
|
||||||
|
Last Update: 2024/09/16
|
||||||
|
|
||||||
|
[Google Drive](https://drive.google.com/drive/folders/1_ADblZcB5p9BUvawkYDmp1qIUDZgkkoe?usp=sharing)
|
||||||
|
[IPFS](https://ipfs.a2x.pub/ipfs/QmYz4ew4nSzPc6TZvoWk6jXpGN82qt3J46nwfb75N2YKc4/)
|
@ -38,13 +38,15 @@ def get_unfetched_urls(conn, limit):
|
|||||||
return [row[0] for row in cursor.fetchall()]
|
return [row[0] for row in cursor.fetchall()]
|
||||||
|
|
||||||
# 下载并提取网页内容
|
# 下载并提取网页内容
|
||||||
def fetch_and_extract_content(url):
|
def fetch_and_extract_content(conn, url):
|
||||||
downloaded = trafilatura.fetch_url(url)
|
downloaded = trafilatura.fetch_url(url)
|
||||||
if not downloaded:
|
if not downloaded:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
html_string = downloaded
|
html_string = downloaded
|
||||||
if not is_probably_readerable(html_string):
|
if not is_probably_readerable(html_string) and os.getenv("FETCH_IGNORE_CHECK").capitalize() == "TRUE":
|
||||||
|
print(f"URL {url} is not readable.")
|
||||||
|
record_fetched_url(conn, url)
|
||||||
return None
|
return None
|
||||||
|
|
||||||
content = trafilatura.extract(html_string, output_format="txt", url=url, favor_precision=True)
|
content = trafilatura.extract(html_string, output_format="txt", url=url, favor_precision=True)
|
||||||
@ -100,7 +102,7 @@ def process_url(url, db_path, save_path):
|
|||||||
cooldown_base = float(os.getenv("FETCH_COOLDOWN"))
|
cooldown_base = float(os.getenv("FETCH_COOLDOWN"))
|
||||||
time.sleep(random.random() * cooldown_base)
|
time.sleep(random.random() * cooldown_base)
|
||||||
conn = connect_db(db_path)
|
conn = connect_db(db_path)
|
||||||
content = fetch_and_extract_content(url)
|
content = fetch_and_extract_content(conn, url)
|
||||||
if content:
|
if content:
|
||||||
segments = split_content(content)
|
segments = split_content(content)
|
||||||
save_segments(url, segments, save_path)
|
save_segments(url, segments, save_path)
|
||||||
|
Loading…
Reference in New Issue
Block a user