add: spider
parent 1acc1ce703
commit dcf53ca002
.gitignore (vendored) | 3
@@ -10,4 +10,5 @@ __pycache__
 .env
 .env*
 translate/output
 translate/source
+*.db
translate/spider.py (new file) | 134
@@ -0,0 +1,134 @@
import requests
from bs4 import BeautifulSoup
import sqlite3
import urllib.robotparser as urobot
from urllib.parse import urljoin, urlparse


MAX_RECURSION_DEPTH = 5
MAX_URLS = 1000   # currently unused
MAX_THREADS = 10  # currently unused
HEADERS = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36",
}

conn = sqlite3.connect("crawler.db")
cursor = conn.cursor()

# Initialization: one row per discovered URL.
cursor.execute(
    """
    CREATE TABLE IF NOT EXISTS url_list (
        url TEXT PRIMARY KEY,
        fetched_time TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
        visited BOOLEAN,
        parent_url TEXT,
        child_url_count INTEGER
    )
    """
)
conn.commit()


def fetch_url(url, headers=None):
    """Fetch a page and return (html, final_url), or (None, url) on failure."""
    try:
        response = requests.get(url, headers=headers)
        response.raise_for_status()
        return response.text, response.url
    except requests.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return None, url


def extract_links(html, base_url):
    """Collect links that stay on the same domain as base_url."""
    soup = BeautifulSoup(html, "html.parser")
    links = set()
    for a_tag in soup.find_all("a", href=True):
        href = a_tag["href"]
        full_url = urljoin(base_url, href)
        if urlparse(full_url).netloc == urlparse(base_url).netloc:
            links.add(full_url)
    return links


def fetch_sitemap(sitemap_url):
    """Return the set of <loc> URLs in a sitemap (the "xml" parser requires lxml)."""
    html, _ = fetch_url(sitemap_url)
    if html:
        soup = BeautifulSoup(html, "xml")
        urls = {loc.text for loc in soup.find_all("loc")}
        return urls
    return set()


def save_url(url, parent_url=None):
    """Insert a URL as unvisited; ignore it if it is already known."""
    cursor = conn.cursor()
    cursor.execute(
        """
        INSERT OR IGNORE INTO url_list (url, visited, parent_url, child_url_count)
        VALUES (?, ?, ?, ?)
        """,
        (url, False, parent_url, 0),
    )
    conn.commit()


def update_url(url, child_url_count):
    """Record how many same-domain links a page contained."""
    cursor.execute(
        """
        UPDATE url_list SET child_url_count = ? WHERE url = ?
        """,
        (child_url_count, url),
    )
    conn.commit()


def crawl(url, rp=None, depth=0):
    """Recursively crawl same-domain pages up to MAX_RECURSION_DEPTH."""
    if depth > MAX_RECURSION_DEPTH:
        return

    # Skip the URL if robots.txt disallows it for every user agent we check.
    if (
        rp
        and not rp.can_fetch("*", url)
        and not rp.can_fetch("Googlebot", url)
        and not rp.can_fetch("Baiduspider", url)
    ):
        return
    save_url(url)
    html, fetched_url = fetch_url(url, HEADERS)
    if not html:
        return

    cursor.execute(
        """
        UPDATE url_list SET visited = TRUE, fetched_time = CURRENT_TIMESTAMP WHERE url = ?
        """,
        (fetched_url,),
    )
    conn.commit()

    links = extract_links(html, fetched_url)
    for link in links:
        save_url(link, fetched_url)

    update_url(fetched_url, len(links))

    for link in links:
        # Pass rp along so robots.txt rules also apply beyond the seed URL.
        crawl(link, rp=rp, depth=depth + 1)


def main(seed_url, rp, sitemap=None):
    if sitemap:
        sitemap_urls = fetch_sitemap(sitemap)
        for sitemap_url in sitemap_urls:
            save_url(sitemap_url)
    crawl(seed_url, rp=rp)


# Example usage
# if __name__ == "__main__":
#     seed_url = "https://www.bbc.co.uk/news"
#     rp = urobot.RobotFileParser()
#     rp.set_url("https://www.bbc.co.uk/robots.txt")
#     rp.read()
#     main(seed_url, rp, "https://www.bbc.co.uk/sitemap.xml")
#     conn.close()
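As a side note, here is a minimal sketch of how the url_list table could be inspected after a run, assuming the spider has already written crawler.db to the working directory. The table and column names follow the schema in this file; the queries themselves are only illustrative.

import sqlite3

# Illustrative only: inspect the url_list table that translate/spider.py writes.
conn = sqlite3.connect("crawler.db")
cursor = conn.cursor()

# How many URLs have been visited versus merely discovered.
cursor.execute("SELECT visited, COUNT(*) FROM url_list GROUP BY visited")
for visited, count in cursor.fetchall():
    print("visited" if visited else "pending", count)

# The ten crawled pages with the most same-domain links.
cursor.execute(
    "SELECT url, child_url_count FROM url_list "
    "WHERE visited ORDER BY child_url_count DESC LIMIT 10"
)
for url, child_url_count in cursor.fetchall():
    print(child_url_count, url)

conn.close()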