0%

Python练习册:0013

题目

    用 Python 写一个爬图片的程序,爬 这个链接里的日本妹子图片 :-)

分析

打开链接,F12,我们看到图片链接还是比较有规律的,可以用 Requests 获取网页,接着用正则表达式或 BeautifulSoup4 提取图片链接地址,然后下载到指定目录。

pip install requests bs4 lxml

代码

  • 使用正则提取图片链接
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
import re
import requests
import os


def save_img(urls, img_dir):
    """Download every image URL in *urls* into directory *img_dir*.

    Each file is named after the last path segment of its URL.
    A failure on one image is reported and skipped so the rest of the
    batch still downloads.
    """
    # Create the target directory on first use.
    if not os.path.exists(img_dir):
        os.mkdir(img_dir)

    # BUG FIX: iterate the *urls* parameter — the original looped over
    # the global img_urls, silently ignoring its argument.
    for img_url in urls:
        imgname = img_url.split('/')[-1]
        # Compute the destination path before the try block so the
        # error message below can never reference an unbound (or stale)
        # filename from a previous iteration.
        filename = os.path.join(img_dir, imgname)
        try:
            # .content is the raw binary payload of the HTTP response.
            img = requests.get(img_url, timeout=3).content
            print('downing ..... {}'.format(imgname))

            # Write the image bytes to disk.
            with open(filename, 'wb') as f:
                f.write(img)
        except (requests.RequestException, OSError):
            # Narrowed from a bare except: catch only network and disk
            # errors; report and move on to the next image.
            print('downloading {} fail !'.format(filename))

if __name__ == '__main__':
    # Destination directory and the Tieba thread to scrape.
    img_dir = 'img'
    page = requests.get("http://tieba.baidu.com/p/2166231880")

    # Pull every image source attribute out of the page markup with a
    # regular expression.
    img_urls = re.findall(r'pic_type="0" class="BDE_Image" src="(.*?)"', page.text)

    save_img(img_urls, img_dir)
  • 使用bs4库提取图片链接片段
1
2
3
4
5
6
7
8
9
10
11
12
13
14
import requests
from bs4 import BeautifulSoup as BS

# Fetch the Tieba thread page.
page_url = "http://tieba.baidu.com/p/2166231880"
response = requests.get(page_url)

# Parse the HTML with the lxml backend.
soup = BS(response.text, 'lxml')

# Every <img class="BDE_Image"> tag holds one picture; its src
# attribute is the download link we want.
urls = [tag['src'] for tag in soup('img', {'class': 'BDE_Image'})]

扩展

当数据很多的时候,我们就需要考虑多线程/多进程/协程以提高运行速度。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
import os
import re
import requests
import threading


def save_img(url, filename):
    """Fetch one image from *url* and store it at *filename*.

    Any failure (network or disk) is printed and swallowed so a single
    bad image never brings down the whole batch.
    """
    try:
        # .content is the raw binary payload of the HTTP response.
        payload = requests.get(url, timeout=3).content
        print('downing ..... {}'.format(filename))

        # Write the image bytes to disk.
        with open(filename, 'wb') as out:
            out.write(payload)
    except Exception as err:
        print(err)


def thread_get(urls, img_dir):
    """Download all *urls* into *img_dir*, one thread per image.

    Blocks until every download thread has finished.
    """
    # Create the target directory on first use.
    if not os.path.exists(img_dir):
        os.mkdir(img_dir)

    workers = []
    for link in urls:
        target = os.path.join(img_dir, link.split('/')[-1])
        # One worker thread per image, bound to save_img.
        workers.append(threading.Thread(target=save_img, args=(link, target)))

    # Kick every download off before waiting on any of them.
    for worker in workers:
        worker.start()

    # Block until the whole batch has completed.
    for worker in workers:
        worker.join()

if __name__ == "__main__":
img_dir = 'img'
url = "http://tieba.baidu.com/p/2166231880"
html = requests.get(url)

#获取所有图片地址
img_urls = re.findall(r'img pic_type="0" class="BDE_Image" src="(.*?)"',html.text)

thread_get(img_urls,img_dir)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
import os
import re
import requests
import gevent
from gevent import pool,monkey

# Patch the standard library so blocking I/O becomes cooperative
# (non-blocking) under gevent.
monkey.patch_all()


def save_img(url, filename):
    """Download the image at *url* and write it to *filename*.

    Errors are printed rather than raised so one failed image does
    not abort the rest of the batch.
    """
    try:
        # .content is the raw binary payload of the HTTP response.
        data = requests.get(url, timeout=3).content
        print('downing ..... {}'.format(filename))

        # Persist the image bytes.
        with open(filename, 'wb') as handle:
            handle.write(data)
    except Exception as exc:
        print(exc)


def gevent_get(urls, img_dir):
    """Download all *urls* into *img_dir* concurrently with gevent.

    One greenlet is spawned per image; the call blocks until every
    download has finished (failures are reported inside save_img).
    """
    # Create the target directory on first use.
    if not os.path.exists(img_dir):
        os.mkdir(img_dir)

    jobs = []
    for url in urls:
        imgname = url.split('/')[-1]
        filename = os.path.join(img_dir, imgname)
        # spawn() both creates and schedules the greenlet.
        jobs.append(gevent.spawn(save_img, url, filename))

    # joinall() blocks until every greenlet has completed, so the
    # original extra per-job .join() loop was redundant and is removed.
    gevent.joinall(jobs)

if __name__ == "__main__":
img_dir = 'img'
url = "http://tieba.baidu.com/p/2166231880"
html = requests.get(url)

#获取所有图片地址
img_urls = re.findall(r'img pic_type="0" class="BDE_Image" src="(.*?)"',html.text)

gevent_get(img_urls,img_dir)

参考

欢迎关注我的其它发布渠道