from collections import deque
from html.parser import HTMLParser
from urllib.parse import urljoin, urlparse, urlunparse
from urllib.request import urlopen
import sys


class Analyzer(HTMLParser):
    """HTML parser that collects the href target of every <a> tag fed to it."""

    def __init__(self):
        HTMLParser.__init__(self)
        self.links = []  # href values in document order; may contain duplicates

    def handle_starttag(self, tag, attrs):
        if tag == "a":
            for name, value in attrs:
                # Skip a bare `<a href>` (value is None) — it is not a usable
                # link and would crash urljoin later.
                if name == "href" and value is not None:
                    self.links.append(value)


def spider(start_url):
    """Breadth-first crawl of every page reachable under *start_url*.

    Prints each URL visited followed by the number of distinct links found
    on that page, then a final count of pages seen.  Only URLs that start
    with *start_url* are followed, so the crawl stays within the site.
    """
    todo = deque([start_url])  # deque: O(1) popleft vs O(n) `del list[0]`
    seen = set()
    while todo:
        url = todo.popleft()
        # Strip the fragment so "page#a" and "page#b" count as one page.
        parts = list(urlparse(url))
        parts[5] = ""
        url = urlunparse(parts)
        if url in seen:
            continue
        seen.add(url)
        print(url, end=" ")
        with urlopen(url) as resp:
            # .get() instead of [] — the header may be absent.
            content_type = resp.info().get("Content-Type") or ""
            body = resp.read()
        if not content_type.startswith("text/html"):
            print("")
            continue
        analyzer = Analyzer()
        # Decode the bytes; str(bytes) would feed the parser "b'...'".
        # NOTE(review): assumes UTF-8 — a charset= parameter in Content-Type
        # could be honored instead; errors="replace" keeps the crawl alive.
        analyzer.feed(body.decode("utf-8", errors="replace"))
        for link in analyzer.links:
            full_url = urljoin(url, link)
            if not full_url.startswith(start_url):
                continue  # external link: stay within the starting site
            todo.append(full_url)
        # Original `"" % len(...)` raised TypeError; report the count properly.
        print("(%d links)" % len(set(analyzer.links)))
    print("Total: %s links" % len(seen))


if __name__ == "__main__":
    spider(sys.argv[1])