Python 爬取 wallhaven 壁纸实例

import os
import requests
from lxml import etree

定义原始网页地址(搜索结果页,网址末尾的 page 参数为页码):

# Search-results URL the crawl starts from.  The page number is kept as the
# very last query parameter because later code increments whatever follows
# the final '='.
url = (
    'https://wallhaven.cc/search'
    '?categories=111&purity=100&resolutions=3840x2160'
    '&topRange=1w&sorting=toplist&order=desc&ai_art_filter=0'
    '&page=2'
)

循环 5 次,每次处理一个搜索结果页:

# Run five iterations, one search-results page per iteration.  The statements
# that follow in this file form the loop body.
for i in range(5):

提取网址末尾的页码数字并加 1,得到下一页的网址:

# Cut the URL at its last '=': everything after it is the page number.
# Add one to that number and re-attach it to the untouched prefix.
prefix, sep, num_str = url.rpartition('=')
num = int(num_str) + 1
new_url = prefix + sep + str(num)

使用 requests 库获取网页内容,用 lxml 解析出每张壁纸的详情页链接,再逐个下载图片并保存到本地:

# Download every wallpaper listed on the search-results page ``new_url``.
# This fragment is the body of the page loop; ``new_url`` is computed just
# before it and ``url`` is advanced at the end for the next iteration.

# Upper bound (seconds) for each HTTP request, so that a stalled connection
# cannot hang the crawler indefinitely.
REQUEST_TIMEOUT = 30

# Fetch and parse the search-results page.
response = requests.get(new_url, timeout=REQUEST_TIMEOUT)
html = response.content
selector = etree.HTML(html)

# Collect the detail-page link of every wallpaper on the page.
links = selector.xpath('/html/body/main/div[1]/section[1]/ul/li/figure/a[1]/@href')

# Make sure the download directory exists (no error if it already does).
directory = 'zhenbang'
os.makedirs(directory, exist_ok=True)

for link in links:
    try:
        # Open the wallpaper's detail page and look up the full-size image.
        detail_response = requests.get(link, timeout=REQUEST_TIMEOUT)
        detail_response.raise_for_status()
        detail_page = etree.HTML(detail_response.content)
        img_urls = detail_page.xpath('/html/body/main/section/div[1]/img/@src')
        if not img_urls:
            # The page does not have the expected <img>; skip this wallpaper
            # instead of aborting the whole run with an IndexError.
            print('未找到图片链接:{}'.format(link))
            continue
        img_url = img_urls[0]

        # Download the image before touching the disk, so a failed download
        # never leaves an empty or truncated file behind, and an HTTP error
        # body is never saved as if it were an image.
        image_response = requests.get(img_url, timeout=REQUEST_TIMEOUT)
        image_response.raise_for_status()
    except requests.RequestException as e:
        print('保存文件时出错:{}'.format(e))
        continue

    # Save the image under its original file name.
    file_path = os.path.join(directory, img_url.split('/')[-1])
    try:
        with open(file_path, 'wb') as f:
            f.write(image_response.content)
        print('文件已保存到本地:{}'.format(file_path))
    except OSError as e:
        print('保存文件时出错:{}'.format(e))

# Remember the page just processed so the next iteration advances from it.
url = new_url