Python 爬取 wallhaven 壁纸实例

1
2
3

import os
import requests
from lxml import etree

定义原始网页地址

# Search-results URL used as the starting point. The page number must remain
# the final query parameter: the paging code derives the next page by taking
# the text after the last '=' in this string.
url = 'https://wallhaven.cc/search?categories=111&purity=100&resolutions=3840x2160&topRange=1w&sorting=toplist&order=desc&ai_art_filter=0&page=2'

循环5次，每次处理一个网页

# Download wallpapers from five consecutive search-result pages.
#
# Each iteration bumps the page number at the end of `url` by one and then
# fetches that page, so the pages fetched are the five that FOLLOW the page
# number currently stored in `url`.

# Seconds to wait for any single HTTP request before giving up.
REQUEST_TIMEOUT = 30

# Target directory for the downloaded images; created once, up front.
directory = 'zhenbang'
os.makedirs(directory, exist_ok=True)

for _ in range(5):
    # Extract the trailing page number from the URL and add 1 to it.
    num_str = url.split('=')[-1]
    num = int(num_str) + 1
    new_url = url[:-len(num_str)] + str(num)

    # Advance immediately so that a failure below can `continue` without
    # fetching the same page again on the next iteration.
    url = new_url

    # Fetch the search-results page.
    try:
        response = requests.get(new_url, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
    except requests.RequestException as e:
        print('获取页面时出错：{}'.format(e))
        continue

    # Parse the HTML and collect the link to every wallpaper detail page.
    selector = etree.HTML(response.content)
    links = selector.xpath('/html/body/main/div[1]/section[1]/ul/li/figure/a[1]/@href')

    # Visit each detail page and pull out the full-size image URL.
    for link in links:
        try:
            detail_response = requests.get(link, timeout=REQUEST_TIMEOUT)
            detail_response.raise_for_status()
        except requests.RequestException as e:
            print('获取页面时出错：{}'.format(e))
            continue

        detail_selector = etree.HTML(detail_response.content)
        img_urls = detail_selector.xpath('/html/body/main/section/div[1]/img/@src')
        if not img_urls:
            # The page did not contain the expected <img>; skip it instead
            # of crashing with IndexError.
            print('未找到图片链接：{}'.format(link))
            continue
        img_url = img_urls[0]

        # Save the image under its original file name.
        file_path = os.path.join(directory, img_url.split('/')[-1])
        try:
            # Download first, then open the file, so a failed download does
            # not leave an empty file behind.
            img_response = requests.get(img_url, timeout=REQUEST_TIMEOUT)
            img_response.raise_for_status()
            with open(file_path, 'wb') as f:
                f.write(img_response.content)
        except (requests.RequestException, OSError) as e:
            print('保存文件时出错：{}'.format(e))
        else:
            print('文件已保存到本地：{}'.format(file_path))