from html.parser import HTMLParser
from urllib.error import HTTPError
from urllib.parse import urljoin, urlparse, urlunparse
from urllib.request import urlopen
import sys


class Analyzer(HTMLParser):
    """Collects the href targets of all <a> tags in a page."""

    def __init__(self):
        super().__init__()
        self.links = []

    def handle_starttag(self, tag, attrs):
        if tag == "a":
            for k, v in attrs:
                if k == "href":
                    self.links.append(v)


def spider(start_url):
    todo = [start_url]
    seen = set()
    while todo:
        bu = todo.pop(0)
        # Strip the fragment (index 5 of the urlparse 6-tuple) so that
        # page.html and page.html#section count as the same URL.
        bu_split = list(urlparse(bu))
        bu_split[5] = ""
        bu = urlunparse(bu_split)
        if bu in seen:
            continue
        seen.add(bu)
        print(bu, end=" ")
        try:
            f = urlopen(bu)
        except HTTPError:
            print("")
            continue
        # Only parse HTML responses; skip images, PDFs, and the like.
        ct = f.info()["Content-Type"]
        if ct is None or not ct.startswith("text/html"):
            f.close()
            print("")
            continue
        d = f.read()
        f.close()
        # Decode the body using the declared charset, falling back to UTF-8.
        # Feeding str(d) would parse the repr of the bytes, not the HTML.
        charset = f.info().get_content_charset() or "utf-8"
        an = Analyzer()
        an.feed(d.decode(charset, errors="replace"))
        for link in an.links:
            full_url = urljoin(bu, link)
            # Stay on the site: only follow links under the start URL.
            if not full_url.startswith(start_url):
                continue
            todo.append(full_url)
        print("(%d links)" % len(set(an.links)))
    print("Total: %s links" % len(seen))


if __name__ == "__main__":
    spider(sys.argv[1])
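
# Usage sketch (the script name and URL below are illustrative, not part of
# the original listing):
#
#     python3 spider.py https://example.com/
#
# Each visited URL is printed followed by its distinct-link count, and the
# run ends with the total number of pages seen.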