#!/usr/bin/env python3
"""A minimal web spider: crawls every page under a starting URL and
reports how many links each page contains."""

from collections import deque
from html.parser import HTMLParser
from urllib.error import URLError
from urllib.parse import urljoin, urlparse, urlunparse
from urllib.request import urlopen
import sys


class LinkFinder(HTMLParser):
    """Collects the href attribute of every <a> tag fed to it."""

    def __init__(self):
        super().__init__()
        self.links = []

    def handle_starttag(self, tag, attrs):
        if tag == "a":
            for name, value in attrs:
                if name == "href":
                    self.links.append(value)


def spider(url):
    todo = deque([url])  # breadth-first queue of URLs to visit
    seen = set()         # URLs already visited
    while todo:
        u = todo.popleft()
        if u in seen:
            continue
        seen.add(u)
        try:
            f = urlopen(u)
        except URLError:
            continue  # skip dead or unreachable URLs
        with f:
            # Only parse HTML; skip images, PDFs, and pages with no
            # Content-Type header (which would otherwise crash).
            if f.headers.get_content_type() != "text/html":
                continue
            # Honor the declared charset, falling back to UTF-8.
            charset = f.headers.get_content_charset() or "utf-8"
            finder = LinkFinder()
            finder.feed(f.read().decode(charset, errors="replace"))
        print("URL", u, "has", len(finder.links), "links")
        for link in finder.links:
            new_url = urljoin(u, link)  # resolve relative links
            # Follow http(s) links only (the original missed https),
            # and stay under the starting URL.
            if urlparse(new_url).scheme in ("http", "https"):
                if new_url.startswith(url):
                    # Drop the #fragment so the same page is not
                    # queued once per anchor.
                    parts = urlparse(new_url)._replace(fragment="")
                    todo.append(urlunparse(parts))


if __name__ == "__main__":
    spider(sys.argv[1])
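
# Usage sketch: assuming this script is saved as spider.py (the filename is
# hypothetical, not given above), crawl a site from the shell with:
#
#     python3 spider.py http://example.com/
#
# Because spider() only queues links whose URL starts with the start URL,
# the crawl stays within that site rather than wandering across the web.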