```python
# coding=gbk
from bs4 import BeautifulSoup
import requests
import urllib.request

x = 1
y = 1

def crawl(url):
    global x, y
    res = requests.get(url)
    soup = BeautifulSoup(res.text, 'html.parser')
    # Dump the parsed page to a file for inspection
    with open(f'F:/pachong/xnt/{y}.txt', 'w', encoding="utf-8") as f:
        f.write(str(soup))
    y += 1
    yinhuns = soup.select('img')
    print(yinhuns)
    for yh in yinhuns:
        print(yh)
        link = yh.get('src')
        print(link)
        urllib.request.urlretrieve(link, f'F:/pachong/xnt/{x}.jpg')
        print(f'Downloading image {x}')
        x += 1

for i in range(1, 5):
    url = "https://acg.fi/hentai/23643.htm/" + str(i)
    try:
        crawl(url)
    except ValueError:
        continue
    except Exception as e:
        print(e)
```
- Running the program produced the following output:

```
<img alt="A區(ACG.Fi)" class="logo" src="https://acg.fi/logo.png"/>
https://acg.fi/logo.png
HTTP Error 403: Forbidden
```

There are three problems:
- Searching for src values did not find all of the image URLs I was after
- The first URL returned came back with "HTTP Error 403: Forbidden" (a header-based workaround is sketched after the thoughts below)
- soup.select did not return the expected list

Thoughts:
- The target URLs may contain Chinese characters that cannot be handled as-is
- Alternatively, the text of the requested page could be filtered with a regular expression
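A likely cause of the 403 is that the site rejects clients without a browser-like User-Agent: requests.get can pass one through headers (as in the next attempt), but urllib.request.urlretrieve sends urllib's default agent unless an opener is installed. A minimal sketch, assuming the server only checks the User-Agent:

```python
import urllib.request

# Install a global opener so urlretrieve also sends a browser-like
# User-Agent instead of urllib's default "Python-urllib/3.x".
opener = urllib.request.build_opener()
opener.addheaders = [('User-Agent',
                      'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36')]
urllib.request.install_opener(opener)

# urlretrieve now goes through the installed opener:
# urllib.request.urlretrieve(link, f'F:/pachong/xnt/{x}.jpg')
```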
- After adding a request header:

```python
# coding=gbk
from bs4 import BeautifulSoup
import requests
import urllib.request

x = 1

def crawl(url, header):
    global x
    res = requests.get(url, headers=header)
    soup = BeautifulSoup(res.text, 'html.parser')
    # Narrow the search to the article body, then take the first four images
    yinhuns = soup.find('div', attrs={'id': "content-innerText"}).find_all('img', limit=4)
    print(yinhuns)
    for yh in yinhuns:
        link = yh.get('src')
        print(x)
        urllib.request.urlretrieve(link, 'F:/pachong/xnt/{}.jpg'.format(x))
        print('Downloading image {0}'.format(x))
        x += 1

header = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0"
}

for i in range(1, 5):
    url = "https://acg.fi/hentai/23643.htm/" + str(i)
    try:
        crawl(url, header)
    except ValueError:
        continue
    except Exception as e:
        print(e)
```
- This attempt used the find() and find_all() methods, but still did not solve the list problem (see the selector sketch below)
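For what it's worth, find_all() and select() both return a list-like ResultSet, so the "list problem" is really about which tags get matched. A scoped CSS selector might keep only the gallery images; a minimal sketch reusing the content-innerText id from the attempt above (the img[src] filter is my assumption):

```python
import requests
from bs4 import BeautifulSoup

url = "https://acg.fi/hentai/23643.htm/1"
header = {"User-Agent": "Mozilla/5.0"}  # any browser-like agent

res = requests.get(url, headers=header)
soup = BeautifulSoup(res.text, 'html.parser')
# Scope the selector to the article container so site chrome such as
# the logo is excluded; img[src] keeps only tags that have a src.
links = [img['src'] for img in soup.select('div#content-innerText img[src]')]
print(links)
```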
- A later pass re-encoded the Chinese part of the URL with urllib.parse.quote, yet urllib.request.urlretrieve still raised an error (an encoding sketch follows)
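An alternative to splicing the encoded segment back in by regex is to percent-encode the whole URL while whitelisting its structural characters; urllib.parse.quote leaves ASCII in the safe list untouched and UTF-8-encodes the rest. A minimal sketch with a made-up URL:

```python
import urllib.parse

# Hypothetical image URL with a Chinese path segment
raw = 'https://p.apic.example/圖片/001.jpg'
# Keep the URL structure (:, /, ?, &, =) and encode only the rest
safe_url = urllib.parse.quote(raw, safe=':/?&=')
print(safe_url)  # https://p.apic.example/%E5%9C%96%E7%89%87/001.jpg
```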
- After revising again:
```python
# coding=gbk
import requests
import urllib.parse
import re
from PIL import Image
from io import BytesIO

x = 1

# Fetch the page and extract the image source URLs
def crawl(url, header):
    res = requests.get(url, headers=header)
    # Close the connection right away to avoid anti-crawler trouble
    res.close()
    text = res.text
    pattern = re.compile('http.*?apic.*?jpg')
    result = re.findall(pattern, text)
    return result

# Download the images from the re-encoded URLs
def down(outs, folder_path):
    global x
    for out in outs:
        # Fetch the re-encoded URL
        res = requests.get(out)
        # Close the connection right away to avoid anti-crawler trouble
        res.close()
        bf = BytesIO()
        bf.write(res.content)
        img = Image.open(bf)
        print(f'Downloading image {x}')
        img.save(folder_path + f"{x}.jpg")
        x += 1

# Re-encode the extracted image URLs
def bianma(results):
    outs = []
    for s in results:
        # Pick out the Chinese part with a regex
        pattern = re.compile('[\u4e00-\u9fa5]+')
        result = re.search(pattern, s)
        if result is None:
            # No Chinese segment, nothing to re-encode
            continue
        su = result.group(0)
        # Percent-encode the Chinese part
        li = urllib.parse.quote(su)
        # Substitute the encoded text back into the original URL
        out = re.sub(pattern, li, s)
        outs.append(out)
    # Deduplicate while keeping the original order
    outs_cp = sorted(set(outs), key=outs.index)
    return outs_cp

def main():
    try:
        header = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0"
        }
        folder_path = 'F:/pachong/xnt/'
        for i in range(1, 5):
            url = "https://acg.fi/hentai/23643.htm/" + str(i)
            results = crawl(url, header)
            outs = bianma(results)
            down(outs, folder_path)
    except Exception as e:
        print(e)

if __name__ == '__main__':
    main()
```
- For image URLs whose path contains Chinese, downloading through BytesIO and PIL proved to be an effective fix
- Several runs hit "[Errno 10054] An existing connection was forcibly closed by the remote host"; calling close() right after requests.get() helps
- The program now runs correctly, just a bit slowly; multithreading could be tried later (a sketch follows)
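As a starting point for that, a minimal sketch of parallel downloads with concurrent.futures; download_one is a hypothetical helper wrapping the requests + PIL logic above, and the shared global counter is replaced with per-task indexes to stay thread-safe:

```python
from concurrent.futures import ThreadPoolExecutor
from io import BytesIO

import requests
from PIL import Image

def download_one(task):
    # Hypothetical helper: each task carries its own index,
    # so no shared global counter is needed across threads.
    idx, url = task
    res = requests.get(url)
    img = Image.open(BytesIO(res.content))
    img.save(f'F:/pachong/xnt/{idx}.jpg')
    print(f'Downloaded image {idx}')

def down_parallel(outs, max_workers=4):
    # outs: the re-encoded URL list returned by bianma()
    tasks = list(enumerate(outs, start=1))
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        pool.map(download_one, tasks)
```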