#! /usr/bin/env python
import sys
from collections import deque
from html.parser import HTMLParser
from urllib.parse import urljoin, urlparse, urlunparse
from urllib.request import urlopen
class Link_Finder(HTMLParser):
    """HTML parser that records the ``href`` value of every anchor tag.

    After ``feed()``-ing it HTML, ``self.links`` holds the href strings
    in document order.
    """

    def __init__(self):
        super().__init__()
        # Collected href attribute values, in the order encountered.
        self.links = []

    def handle_starttag(self, tag, attrs):
        """Record href attributes of ``<a>`` start tags; ignore all others."""
        if tag != "a":
            return
        self.links.extend(value for name, value in attrs if name == "href")
def spider(url):
    """Breadth-first crawl of the site rooted at *url*, printing link counts.

    Only links that resolve to a URL starting with *url* are followed, so
    the crawl stays within that prefix. Each fetched HTML page's URL and
    number of outgoing links is printed. Non-HTML resources and pages that
    fail to fetch are skipped.

    Args:
        url: Starting page; also the prefix confining the crawl.
    """
    todo = deque([url])  # FIFO queue: deque gives O(1) popleft vs O(n) del list[0]
    seen = set()
    while todo:
        current = todo.popleft()
        if current in seen:
            continue
        seen.add(current)
        try:
            response = urlopen(current)
        except OSError as exc:  # URLError subclasses OSError
            # A dead link should not abort the whole crawl.
            print("skipping", current, "-", exc, file=sys.stderr)
            continue
        with response:  # always close the connection, even on early skip
            # get_content_type() is safe when the header is missing
            # (falls back to a default), unlike indexing the header dict.
            if response.headers.get_content_type() != "text/html":
                continue
            # Honor the server-declared charset; don't assume UTF-8.
            charset = response.headers.get_content_charset() or "utf-8"
            body = response.read().decode(charset, errors="replace")
        finder = Link_Finder()
        finder.feed(body)
        print("URL", current, "has", len(finder.links), "links")
        for link in finder.links:
            new_url = urljoin(current, link)
            # Follow https as well as http (the original dropped https).
            if urlparse(new_url).scheme in ("http", "https") and new_url.startswith(url):
                parts = list(urlparse(new_url))
                parts[5] = ""  # strip the #fragment so duplicates collapse in `seen`
                todo.append(urlunparse(parts))
# Entry point: crawl the site rooted at the URL given on the command line.
# The __main__ guard lets this module be imported (e.g. for testing) without
# starting a crawl.
if __name__ == "__main__":
    if len(sys.argv) != 2:
        sys.exit(f"usage: {sys.argv[0]} URL")
    spider(sys.argv[1])