from collections import deque
from html.parser import HTMLParser
from urllib.parse import urljoin, urlparse, urlunparse
from urllib.request import urlopen
import sys


class Analyzer(HTMLParser):
    """HTML parser that collects the href target of every <a> tag fed to it."""

    def __init__(self):
        HTMLParser.__init__(self)
        self.links = []  # href values in document order; may contain duplicates

    def handle_starttag(self, tag, attrs):
        if tag == "a":
            for name, value in attrs:
                # Skip a bare `<a href>` (value is None) — it is not a usable
                # link and would crash urljoin later.
                if name == "href" and value is not None:
                    self.links.append(value)


def spider(start_url):
    """Breadth-first crawl of every page reachable under *start_url*.

    Prints each URL visited followed by the number of distinct links found
    on that page, then a final count of pages seen.  Only URLs that start
    with *start_url* are followed, so the crawl stays within the site.
    """
    todo = deque([start_url])  # deque: O(1) popleft vs O(n) `del list[0]`
    seen = set()
    while todo:
        url = todo.popleft()
        # Strip the fragment so "page#a" and "page#b" count as one page.
        parts = list(urlparse(url))
        parts[5] = ""
        url = urlunparse(parts)
        if url in seen:
            continue
        seen.add(url)
        print(url, end=" ")
        with urlopen(url) as resp:
            # .get() instead of [] — the header may be absent.
            content_type = resp.info().get("Content-Type") or ""
            body = resp.read()
        if not content_type.startswith("text/html"):
            print("")
            continue
        analyzer = Analyzer()
        # Decode the bytes; str(bytes) would feed the parser "b'...'".
        # NOTE(review): assumes UTF-8 — a charset= parameter in Content-Type
        # could be honored instead; errors="replace" keeps the crawl alive.
        analyzer.feed(body.decode("utf-8", errors="replace"))
        for link in analyzer.links:
            full_url = urljoin(url, link)
            if not full_url.startswith(start_url):
                continue  # external link: stay within the starting site
            todo.append(full_url)
        # Original `"" % len(...)` raised TypeError; report the count properly.
        print("(%d links)" % len(set(analyzer.links)))
    print("Total: %s links" % len(seen))


if __name__ == "__main__":
    spider(sys.argv[1])