# Fetch the page at `url` and print every absolute (http/https) link found in
# its <a> tags. `url`, `request` and `BS` are expected to be defined earlier
# in the file (presumably urllib.request and bs4.BeautifulSoup -- confirm).

# Build the request with a desktop Firefox User-Agent header.
req = request.Request(url)
req.add_header(
    'User-Agent',
    'Mozilla/5.0 (X11; Linux x86_64; rv:60.0) Gecko/20100101 Firefox/60.0',
)

# Read the page content and decode it as UTF-8. The context manager makes
# sure the response is closed once the body has been read.
with request.urlopen(req) as response:
    html = response.read().decode('utf-8')

soup = BS(html, 'lxml')

# Find every <a> tag in the document.
a = soup.find_all('a')
for link in a:
    # Value of the href attribute; None when the tag has no href.
    href = link.get('href')
    # Skip missing hrefs and noise links such as "#" or "./": only links
    # starting with "http" are printed. The explicit check replaces a bare
    # `except: pass` that also hid unrelated errors and KeyboardInterrupt.
    if href and href.startswith('http'):
        print(href)