Problem
Write a Python image scraper that downloads the Japanese girl photos from this link :-)
Analysis
Open the link, press F12, and you will see that the image URLs follow a fairly regular pattern. We can fetch the page with Requests, extract the image URLs with BeautifulSoup4 or a regular expression, and then download the images to a target directory.
pip install requests bs4 lxml
Code
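First, a version that fetches the page with Requests and extracts the image URLs with a regular expression: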
```python
import os
import re

import requests


def save_img(urls, img_dir):
    # Create the output directory if it does not exist yet
    if not os.path.exists(img_dir):
        os.mkdir(img_dir)
    for img_url in urls:
        # Use the last path segment of the URL as the file name
        imgname = img_url.split('/')[-1]
        filename = os.path.join(img_dir, imgname)
        try:
            img = requests.get(img_url, timeout=3).content
            print('downloading ..... {}'.format(imgname))
            with open(filename, 'wb') as f:
                f.write(img)
        except Exception:
            print('downloading {} failed!'.format(filename))


if __name__ == '__main__':
    img_dir = 'img'
    url = "http://tieba.baidu.com/p/2166231880"
    html = requests.get(url)
    # In-post images look like <img pic_type="0" class="BDE_Image" src="...">
    img_urls = re.findall(r'pic_type="0" class="BDE_Image" src="(.*?)"', html.text)
    save_img(img_urls, img_dir)
```
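Alternatively, BeautifulSoup4 can collect the same URLs without a handwritten regex: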
```python
import requests
from bs4 import BeautifulSoup as BS

url = "http://tieba.baidu.com/p/2166231880"
html = requests.get(url)
soup = BS(html.text, 'lxml')

# Find all <img> tags whose class is BDE_Image (the in-post images)
imgs = soup.find_all('img', {'class': 'BDE_Image'})
urls = [img['src'] for img in imgs]
```
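The resulting `urls` list can then be passed to the same `save_img` function from the regex version above.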
Extension
When there are many images to fetch, we should consider multithreading, multiprocessing, or coroutines to speed things up. A threaded version and a gevent (coroutine) version follow, with a multiprocessing sketch after them.
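A version that spawns one thread per download: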
```python
import os
import re
import threading

import requests


def save_img(url, filename):
    try:
        img = requests.get(url, timeout=3).content
        print('downloading ..... {}'.format(filename))
        with open(filename, 'wb') as f:
            f.write(img)
    except Exception as e:
        print(e)


def thread_get(urls, img_dir):
    if not os.path.exists(img_dir):
        os.mkdir(img_dir)
    threads = []
    for url in urls:
        imgname = url.split('/')[-1]
        filename = os.path.join(img_dir, imgname)
        # One thread per image; fine for a handful of URLs
        t = threading.Thread(target=save_img, args=(url, filename))
        threads.append(t)
    for t in threads:
        t.start()
    # Wait for every download to finish
    for t in threads:
        t.join()


if __name__ == "__main__":
    img_dir = 'img'
    url = "http://tieba.baidu.com/p/2166231880"
    html = requests.get(url)
    img_urls = re.findall(r'img pic_type="0" class="BDE_Image" src="(.*?)"', html.text)
    thread_get(img_urls, img_dir)
```
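The same downloader using gevent coroutines. Note that `monkey.patch_all()` must run before `requests` is imported, so it sits at the very top of the file: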
```python
# monkey.patch_all() must run before requests is imported so that
# gevent can patch the standard library's socket module
from gevent import monkey
monkey.patch_all()

import os
import re

import gevent
import requests


def save_img(url, filename):
    try:
        img = requests.get(url, timeout=3).content
        print('downloading ..... {}'.format(filename))
        with open(filename, 'wb') as f:
            f.write(img)
    except Exception as e:
        print(e)


def gevent_get(urls, img_dir):
    if not os.path.exists(img_dir):
        os.mkdir(img_dir)
    jobs = []
    for url in urls:
        imgname = url.split('/')[-1]
        filename = os.path.join(img_dir, imgname)
        # One greenlet per image
        jobs.append(gevent.spawn(save_img, url, filename))
    # joinall already waits for every greenlet; no extra join loop needed
    gevent.joinall(jobs)


if __name__ == "__main__":
    img_dir = 'img'
    url = "http://tieba.baidu.com/p/2166231880"
    html = requests.get(url)
    img_urls = re.findall(r'img pic_type="0" class="BDE_Image" src="(.*?)"', html.text)
    gevent_get(img_urls, img_dir)
```
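The extension above also mentions multiprocessing, which none of the listed snippets cover. Here is a minimal sketch using `multiprocessing.Pool`; the `process_get` helper and the pool size of 4 are illustrative choices, not from the original post:

```python
import os
import re
from multiprocessing import Pool

import requests


def save_img(url, filename):
    try:
        img = requests.get(url, timeout=3).content
        with open(filename, 'wb') as f:
            f.write(img)
        print('downloading ..... {}'.format(filename))
    except Exception as e:
        print(e)


def process_get(urls, img_dir, workers=4):
    # Hypothetical helper, not from the original post
    if not os.path.exists(img_dir):
        os.mkdir(img_dir)
    tasks = [(url, os.path.join(img_dir, url.split('/')[-1])) for url in urls]
    # starmap unpacks each (url, filename) tuple into save_img's arguments
    with Pool(workers) as p:
        p.starmap(save_img, tasks)


if __name__ == "__main__":
    img_dir = 'img'
    url = "http://tieba.baidu.com/p/2166231880"
    html = requests.get(url)
    img_urls = re.findall(r'img pic_type="0" class="BDE_Image" src="(.*?)"', html.text)
    process_get(img_urls, img_dir)
```

Since downloading is I/O-bound, threads or gevent are usually the better fit here; a process pool mainly pays off if per-image work (resizing, hashing) becomes CPU-bound.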