
Leshetu (乐摄网) Image Scraper


Approach: loop over the list pages of the target category, pull each album's title and link out of the posts wrapper, create a folder named after the album, request the album page and collect the image URLs from its content area, then download every image into that folder.

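Before running the full loop, it helps to confirm that the list-page XPath actually matches something. A minimal sketch, assuming the same URL and selectors used in the scraper below:

# Quick check: print the album titles and links found on one list page.
# URL and class names are taken from the scraper below; adjust them if the site layout changes.
import requests
import parsel

headers = {'User-Agent': 'Mozilla/5.0'}
html = requests.get('https://www.leshetu.top/xz/twyh/page/1', headers=headers).text
selector = parsel.Selector(html)

for div in selector.xpath('//div[@class="row posts-wrapper"]/div'):
    title = div.xpath('.//h2/a/text()').get()
    link = div.xpath('.//h2/a/@href').get()
    print(title, link)

If this prints nothing, the class names or page structure have changed and the full scraper will silently download nothing.
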
Code:


import requests
import parsel
import os

for page in range(1, 6):
    print(f'=========== Scraping page {page} ===========')

    base_url = f'https://www.leshetu.top/xz/twyh/page/{page}'

    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36'}

    response = requests.get(url=base_url, headers=headers)
    html = response.text
    # print(html)

    selector = parsel.Selector(html)

    # Each album on the list page sits in its own <div> under the posts wrapper
    divs = selector.xpath('//div[@class="row posts-wrapper"]/div')

    for div in divs:
        pic_title = div.xpath('.//h2/a/text()').get()
        pic_url = div.xpath('.//h2/a/@href').get()
        # print(pic_title, pic_url)
        print('Downloading album:', pic_title)

        # Create a folder for the album if it does not exist yet
        if not os.path.exists('C:/Users/13089/Desktop/images/' + pic_title):
            os.mkdir('C:/Users/13089/Desktop/images/' + pic_title)

        # Request the album page; skip this album if the request fails
        try:
            html_2 = requests.get(url=pic_url, headers=headers).text
        except requests.RequestException:
            continue
        selector_2 = parsel.Selector(html_2)
        img_url_list = selector_2.xpath('//div[@class="entry-content u-text-format u-clearfix"]//img/@data-srcset').getall()
        # print(img_url_list)

        for img_url in img_url_list:
            try:
                img_data = requests.get(url=img_url, headers=headers).content
            except requests.RequestException:
                continue

            # Use the last segment of the URL as the file name
            file_name = img_url.split('/')[-1]

            with open(f'C:/Users/13089/Desktop/images/{pic_title}/{file_name}', mode='wb') as f:
                f.write(img_data)
            print('Downloaded:', file_name)

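One caveat with the script above: album titles scraped from the page can contain characters that Windows does not allow in folder names (for example ? or :), which makes os.mkdir raise an error. A minimal sketch of a workaround; the helper name safe_name and the sample title are mine, not part of the original script:

import os
import re

def safe_name(title: str) -> str:
    # Replace characters that Windows forbids in folder/file names with underscores
    return re.sub(r'[\\/:*?"<>|]', '_', title).strip()

save_dir = os.path.join('C:/Users/13089/Desktop/images', safe_name('示例相册: 第?辑'))
os.makedirs(save_dir, exist_ok=True)  # exist_ok also removes the need for the separate os.path.exists check

Using os.makedirs(..., exist_ok=True) instead of the exists/mkdir pair also avoids a race when the folder is created between the check and the call.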

Update

Compared with the first version, the script below targets a different category, sends a Referer header, pulls image URLs from the src attribute of images inside <p> tags instead of data-srcset, and narrows the page range to a single page.


import requests
import parsel
import os

for page in range(1, 2):
    print(f'=========== Scraping page {page} ===========')

    base_url = f'https://www.leshetu.top/xz/slct/page/{page}'

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36',
        'Referer': 'https://www.leshetu.me/xz/slct'
    }

    response = requests.get(url=base_url, headers=headers)
    html = response.text
    # print(html)

    selector = parsel.Selector(html)

    divs = selector.xpath('//div[@class="row posts-wrapper"]/div')

    for div in divs:
        pic_title = div.xpath('.//h2/a/text()').get()
        pic_url = div.xpath('.//h2/a/@href').get()
        # print(pic_title, pic_url)
        print('Downloading album:', pic_title)

        # Create a folder for the album if it does not exist yet
        if not os.path.exists('C:/Users/13089/Desktop/images/' + pic_title):
            os.mkdir('C:/Users/13089/Desktop/images/' + pic_title)

        # Request the album page and parse the image URLs; skip the album on failure
        try:
            html_2 = requests.get(url=pic_url, headers=headers).text
        except requests.RequestException:
            continue
        selector_2 = parsel.Selector(html_2)
        img_url_list = selector_2.xpath('//div[@class="entry-content u-text-format u-clearfix"]/p//img/@src').getall()
        # print(img_url_list)

        # Request each image's binary data
        for img_url in img_url_list:
            try:
                img_data = requests.get(url=img_url, headers=headers).content  # .content is the raw bytes of the response
            except requests.RequestException:
                continue
            # Build the file name from the last segment of the URL
            file_name = img_url.split('/')[-1]
            # Save the data
            with open(f'C:/Users/13089/Desktop/images/{pic_title}/{file_name}', mode='wb') as f:
                f.write(img_data)
            print('Downloaded:', file_name)
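If the site starts rejecting rapid-fire requests, reusing one connection and pausing between downloads usually helps. A minimal sketch; the helper download_image, the one-second delay, and the 10-second timeout are assumptions, not part of the original script:

import time
import requests

session = requests.Session()  # reuse one TCP connection and one set of headers
session.headers.update({
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36',
})

def download_image(img_url: str, save_path: str) -> bool:
    """Download one image; return True on success, False if the request failed."""
    try:
        resp = session.get(img_url, timeout=10)
        resp.raise_for_status()  # treat 4xx/5xx as failures instead of saving an error page
    except requests.RequestException:
        return False
    with open(save_path, mode='wb') as f:
        f.write(resp.content)
    time.sleep(1)  # be polite: wait a moment between image downloads
    return True

The inner download loop of the scraper could then call download_image(img_url, save_path) and only print the completion message when it returns True.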