知識點
- 爬蟲的步驟
- requests
- parsel
- xpath數據解析
1.獲取網頁地址 (目標地址)
2.發送請求
3.數據解析
4.保存到本地
目標網址:https://hdqwalls.com/
網站是靜態數據,那麼只要找到它的規律以及 URL 地址就行。
import os
import re
from urllib.parse import urljoin

import parsel
import requests

BASE_URL = 'https://hdqwalls.com'
SAVE_DIR = '壁紙'
REQUEST_TIMEOUT = 30  # seconds; without it a stalled server blocks the script forever

# Request headers: present the crawler to the server as a regular browser client.
HEADERS = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36'
}


def _fetch(url):
    """Send a GET request to *url* and return the response.

    Raises:
        requests.RequestException: on connection errors, timeouts, or a
            4xx/5xx status code.
    """
    response = requests.get(url=url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    return response


def _safe_filename(title):
    """Return *title* with characters that are illegal in file names replaced by '_'."""
    cleaned = re.sub(r'[\\/:*?"<>|]', '_', title).strip()
    # An empty name would produce the hidden file ".jpg"; use a fixed fallback.
    return cleaned or 'wallpaper'


def _download_wallpaper(detail_url):
    """Save the wallpaper shown on one detail page into SAVE_DIR.

    Returns:
        The stripped page title on success, or None when the page does not
        contain the expected title or download link.

    Raises:
        requests.RequestException: if either HTTP request fails.
    """
    detail = parsel.Selector(_fetch(detail_url).text)
    title = detail.xpath('//body/header/div/div/h1/text()').get()
    href = detail.xpath('//body/div/div[2]/div/div/div/a/@href').get()
    # .get() returns None when the XPath matches nothing, so check before using.
    if title is None or href is None:
        return None
    title = title.strip()

    # Binary request: the image bytes are written to disk unchanged.
    # urljoin leaves an absolute href as is and resolves a relative one.
    image = _fetch(urljoin(detail_url, href)).content
    path = os.path.join(SAVE_DIR, _safe_filename(title) + '.jpg')
    with open(path, mode='wb') as fp:
        fp.write(image)
    return title


def download_page(page=1):
    """Download every wallpaper linked from one "latest wallpapers" listing page.

    Args:
        page: Number of the listing page to crawl.

    Raises:
        requests.RequestException: if the listing page itself cannot be fetched.
            Failures on individual wallpapers are reported and skipped.
    """
    url = f'{BASE_URL}/latest-wallpapers/page/{page}'
    listing = parsel.Selector(_fetch(url).text)
    links = listing.xpath('//body/div/div[3]/div/a[1]/@href').getall()

    # The target folder must exist before open(..., 'wb') is called.
    os.makedirs(SAVE_DIR, exist_ok=True)
    for link in links:
        detail_url = urljoin(BASE_URL, link)
        try:
            title = _download_wallpaper(detail_url)
        except requests.RequestException as exc:
            print(detail_url, '下載失敗:', exc)
            continue
        if title is None:
            print(detail_url, '解析失敗,已跳過')
            continue
        print(title, '下載完成')


if __name__ == '__main__':
    download_page(1)
import os
import re
from urllib.parse import urljoin

import parsel
import requests

BASE_URL = 'https://hdqwalls.com'
SAVE_DIR = '壁紙'
REQUEST_TIMEOUT = 30  # seconds; without it a stalled server blocks the script forever

# Request headers: present the crawler to the server as a regular browser client.
# Defined once here instead of being rebuilt for every page.
HEADERS = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36'
}


def _fetch(url):
    """Send a GET request to *url* and return the response.

    Raises:
        requests.RequestException: on connection errors, timeouts, or a
            4xx/5xx status code.
    """
    response = requests.get(url=url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    return response


def _safe_filename(title):
    """Return *title* with characters that are illegal in file names replaced by '_'."""
    cleaned = re.sub(r'[\\/:*?"<>|]', '_', title).strip()
    # An empty name would produce the hidden file ".jpg"; use a fixed fallback.
    return cleaned or 'wallpaper'


def _download_wallpaper(detail_url):
    """Save the wallpaper shown on one detail page into SAVE_DIR.

    Returns:
        The stripped page title on success, or None when the page does not
        contain the expected title or download link.

    Raises:
        requests.RequestException: if either HTTP request fails.
    """
    detail = parsel.Selector(_fetch(detail_url).text)
    title = detail.xpath('//body/header/div/div/h1/text()').get()
    href = detail.xpath('//body/div/div[2]/div/div/div/a/@href').get()
    # .get() returns None when the XPath matches nothing, so check before using.
    if title is None or href is None:
        return None
    title = title.strip()

    # Binary request: the image bytes are written to disk unchanged.
    # urljoin leaves an absolute href as is and resolves a relative one.
    image = _fetch(urljoin(detail_url, href)).content
    path = os.path.join(SAVE_DIR, _safe_filename(title) + '.jpg')
    with open(path, mode='wb') as fp:
        fp.write(image)
    return title


def download_page(page=1):
    """Download every wallpaper linked from one "latest wallpapers" listing page.

    Args:
        page: Number of the listing page to crawl.

    Raises:
        requests.RequestException: if the listing page itself cannot be fetched.
            Failures on individual wallpapers are reported and skipped.
    """
    url = f'{BASE_URL}/latest-wallpapers/page/{page}'
    listing = parsel.Selector(_fetch(url).text)
    links = listing.xpath('//body/div/div[3]/div/a[1]/@href').getall()

    # The target folder must exist before open(..., 'wb') is called.
    os.makedirs(SAVE_DIR, exist_ok=True)
    for link in links:
        detail_url = urljoin(BASE_URL, link)
        try:
            title = _download_wallpaper(detail_url)
        except requests.RequestException as exc:
            print(detail_url, '下載失敗:', exc)
            continue
        if title is None:
            print(detail_url, '解析失敗,已跳過')
            continue
        print(title, '下載完成')


def download_pages(first_page=1, last_page=5):
    """Download the wallpapers of listing pages first_page..last_page, inclusive.

    Args:
        first_page: Number of the first listing page to crawl.
        last_page: Number of the last listing page to crawl (included).
    """
    # range() excludes its end value, hence the + 1.
    for page in range(first_page, last_page + 1):
        download_page(page)
        print(f'----------------------第{page}頁下載完成----------------------')


if __name__ == '__main__':
    download_pages(1, 5)