diff --git a/.gitignore b/.gitignore
index d0be852..6ab4ef9 100644
--- a/.gitignore
+++ b/.gitignore
@@ -10,4 +10,5 @@ __pycache__
 .env
 .env*
 translate/output
-translate/source
\ No newline at end of file
+translate/source
+*.db
\ No newline at end of file
diff --git a/translate/spider.py b/translate/spider.py
new file mode 100644
index 0000000..c8aa510
--- /dev/null
+++ b/translate/spider.py
@@ -0,0 +1,134 @@
+import requests
+from bs4 import BeautifulSoup
+import sqlite3
+import urllib.robotparser as urobot
+from urllib.parse import urljoin, urlparse
+
+
+MAX_RECURSION_DEPTH = 5
+MAX_URLS = 1000
+MAX_THREADS = 10
+HEADERS = {
+    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
+    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36",
+}
+
+conn = sqlite3.connect("crawler.db")
+cursor = conn.cursor()
+
+# Initialization: create the URL table if it does not exist
+cursor.execute(
+    """
+CREATE TABLE IF NOT EXISTS url_list (
+    url TEXT PRIMARY KEY,
+    fetched_time TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
+    visited BOOLEAN,
+    parent_url TEXT,
+    child_url_count INTEGER
+)
+"""
+)
+conn.commit()
+
+
+def fetch_url(url, headers=None):
+    try:
+        response = requests.get(url, headers=headers, timeout=10)
+        response.raise_for_status()
+        return response.text, response.url
+    except requests.RequestException as e:
+        print(f"Error fetching {url}: {e}")
+        return None, url
+
+
+def extract_links(html, base_url):
+    soup = BeautifulSoup(html, "html.parser")
+    links = set()
+    for a_tag in soup.find_all("a", href=True):
+        href = a_tag["href"]
+        full_url = urljoin(base_url, href)
+        if urlparse(full_url).netloc == urlparse(base_url).netloc:
+            links.add(full_url)
+    return links
+
+
+def fetch_sitemap(sitemap_url):
+    html, _ = fetch_url(sitemap_url)
+    if html:
+        soup = BeautifulSoup(html, "xml")
+        urls = {loc.text for loc in soup.find_all("loc")}
+        return urls
+    return set()
+
+
+def save_url(url, parent_url=None):
+    cursor = conn.cursor()
+    cursor.execute(
+        """
+        INSERT OR IGNORE INTO url_list (url, visited, parent_url, child_url_count)
+        VALUES (?, ?, ?, ?)
+        """,
+        (url, False, parent_url, 0),
+    )
+    conn.commit()
+
+
+def update_url(url, child_url_count):
+    cursor.execute(
+        """
+        UPDATE url_list SET child_url_count = ? WHERE url = ?
+        """,
+        (child_url_count, url),
+    )
+    conn.commit()
+
+
+def crawl(url, rp=None, depth=0):
+    if depth > MAX_RECURSION_DEPTH:
+        return
+
+    if (
+        rp
+        and not rp.can_fetch("*", url)
+        and not rp.can_fetch("Googlebot", url)
+        and not rp.can_fetch("Baiduspider", url)
+    ):
+        return
+    save_url(url)
+    html, fetched_url = fetch_url(url, HEADERS)
+    if not html:
+        return
+
+    cursor.execute(
+        """
+        UPDATE url_list SET visited = TRUE, fetched_time = CURRENT_TIMESTAMP WHERE url = ?
+        """,
+        (fetched_url,),
+    )
+    conn.commit()
+
+    links = extract_links(html, fetched_url)
+    for link in links:
+        save_url(link, fetched_url)
+
+    update_url(fetched_url, len(links))
+
+    for link in links:
+        crawl(link, rp=rp, depth=depth + 1)
+
+
+def main(seed_url, rp, sitemap=None):
+    if sitemap:
+        sitemap_urls = fetch_sitemap(sitemap)
+        for sitemap_url in sitemap_urls:
+            save_url(sitemap_url)
+    crawl(seed_url, rp=rp)
+
+# Example usage
+# if __name__ == "__main__":
+#     seed_url = "https://www.bbc.co.uk/news"
+#     rp = urobot.RobotFileParser()
+#     rp.set_url("https://www.bbc.co.uk/robots.txt")
+#     rp.read()
+#     main(seed_url, rp, "https://www.bbc.co.uk/sitemap.xml")
+#     conn.close()