```python
# coding=gbk
from bs4 import BeautifulSoup
import requests
import urllib.request

x = 1
y = 1

def crawl(url):
    global x, y
    res = requests.get(url)
    soup = BeautifulSoup(res.text, 'html.parser')
    # Dump the parsed page to a file for inspection
    with open(f'F:/pachong/xnt/{y}.txt', 'w', encoding="utf-8") as f:
        f.write(str(soup))
    y += 1
    yinhuns = soup.select('img')
    print(yinhuns)
    for yh in yinhuns:
        print(yh)
        link = yh.get('src')
        print(link)
        urllib.request.urlretrieve(link, f'F:/pachong/xnt/{x}.jpg')
        print(f'Downloading image {x}')
        x += 1

for i in range(1, 5):
    url = "https://acg.fi/hentai/23643.htm/" + str(i)
    try:
        crawl(url)
    except ValueError:
        continue
    except Exception as e:
        print(e)
```
- Running the program produced the following output:

```
<img alt="A區(ACG.Fi)" class="logo" src="https://acg.fi/logo.png"/>
https://acg.fi/logo.png
HTTP Error 403: Forbidden
```

There are three problems:
- Searching for src values did not find all of the image URLs I was after
- The first URL returned came back with "HTTP Error 403: Forbidden" (a header-based workaround is sketched after the thoughts below)
- soup.select did not return the expected list

Thoughts:
- The target URLs may contain Chinese characters that cannot be handled as-is
- Alternatively, the text of the requested page could be filtered with a regular expression
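A likely cause of the 403 is that the site rejects clients without a browser-like User-Agent: requests.get can pass one through headers (as in the next attempt), but urllib.request.urlretrieve sends urllib's default agent unless an opener is installed. A minimal sketch, assuming the server only checks the User-Agent:

```python
import urllib.request

# Install a global opener so urlretrieve also sends a browser-like
# User-Agent instead of urllib's default "Python-urllib/3.x".
opener = urllib.request.build_opener()
opener.addheaders = [('User-Agent',
                      'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36')]
urllib.request.install_opener(opener)

# urlretrieve now goes through the installed opener:
# urllib.request.urlretrieve(link, f'F:/pachong/xnt/{x}.jpg')
```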
- After adding a request header:

```python
# coding=gbk
from bs4 import BeautifulSoup
import requests
import urllib.request

x = 1

def crawl(url, header):
    global x
    res = requests.get(url, headers=header)
    soup = BeautifulSoup(res.text, 'html.parser')
    # Narrow the search to the article body, then take the first four images
    yinhuns = soup.find('div', attrs={'id': "content-innerText"}).find_all('img', limit=4)
    print(yinhuns)
    for yh in yinhuns:
        link = yh.get('src')
        print(x)
        urllib.request.urlretrieve(link, 'F:/pachong/xnt/{}.jpg'.format(x))
        print('Downloading image {0}'.format(x))
        x += 1

header = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0"
}

for i in range(1, 5):
    url = "https://acg.fi/hentai/23643.htm/" + str(i)
    try:
        crawl(url, header)
    except ValueError:
        continue
    except Exception as e:
        print(e)
```
- This attempt used the find() and find_all() methods, but still did not solve the list problem (see the selector sketch below)
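For what it's worth, find_all() and select() both return a list-like ResultSet, so the "list problem" is really about which tags get matched. A scoped CSS selector might keep only the gallery images; a minimal sketch reusing the content-innerText id from the attempt above (the img[src] filter is my assumption):

```python
import requests
from bs4 import BeautifulSoup

url = "https://acg.fi/hentai/23643.htm/1"
header = {"User-Agent": "Mozilla/5.0"}  # any browser-like agent

res = requests.get(url, headers=header)
soup = BeautifulSoup(res.text, 'html.parser')
# Scope the selector to the article container so site chrome such as
# the logo is excluded; img[src] keeps only tags that have a src.
links = [img['src'] for img in soup.select('div#content-innerText img[src]')]
print(links)
```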
- A later pass re-encoded the Chinese part of the URL with urllib.parse.quote, yet urllib.request.urlretrieve still raised an error (an encoding sketch follows)
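An alternative to splicing the encoded segment back in by regex is to percent-encode the whole URL while whitelisting its structural characters; urllib.parse.quote leaves ASCII in the safe list untouched and UTF-8-encodes the rest. A minimal sketch with a made-up URL:

```python
import urllib.parse

# Hypothetical image URL with a Chinese path segment
raw = 'https://p.apic.example/圖片/001.jpg'
# Keep the URL structure (:, /, ?, &, =) and encode only the rest
safe_url = urllib.parse.quote(raw, safe=':/?&=')
print(safe_url)  # https://p.apic.example/%E5%9C%96%E7%89%87/001.jpg
```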
- After revising again:
```python
# coding=gbk
import requests
import urllib.parse
import re
from PIL import Image
from io import BytesIO

x = 1

# Fetch the page and extract the image source URLs
def crawl(url, header):
    res = requests.get(url, headers=header)
    # Close the connection right away to avoid anti-crawler trouble
    res.close()
    text = res.text
    pattern = re.compile('http.*?apic.*?jpg')
    result = re.findall(pattern, text)
    return result

# Download the images from the re-encoded URLs
def down(outs, folder_path):
    global x
    for out in outs:
        # Fetch the re-encoded URL
        res = requests.get(out)
        # Close the connection right away to avoid anti-crawler trouble
        res.close()
        bf = BytesIO()
        bf.write(res.content)
        img = Image.open(bf)
        print(f'Downloading image {x}')
        img.save(folder_path + f"{x}.jpg")
        x += 1

# Re-encode the extracted image URLs
def bianma(results):
    outs = []
    for s in results:
        # Pick out the Chinese part with a regex
        pattern = re.compile('[\u4e00-\u9fa5]+')
        result = re.search(pattern, s)
        if result is None:
            # No Chinese segment, nothing to re-encode
            continue
        su = result.group(0)
        # Percent-encode the Chinese part
        li = urllib.parse.quote(su)
        # Substitute the encoded text back into the original URL
        out = re.sub(pattern, li, s)
        outs.append(out)
    # Deduplicate while keeping the original order
    outs_cp = sorted(set(outs), key=outs.index)
    return outs_cp

def main():
    try:
        header = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0"
        }
        folder_path = 'F:/pachong/xnt/'
        for i in range(1, 5):
            url = "https://acg.fi/hentai/23643.htm/" + str(i)
            results = crawl(url, header)
            outs = bianma(results)
            down(outs, folder_path)
    except Exception as e:
        print(e)

if __name__ == '__main__':
    main()
```
- For image URLs whose path contains Chinese, downloading through BytesIO and PIL proved to be an effective fix
- Several runs hit "[Errno 10054] An existing connection was forcibly closed by the remote host"; calling close() right after requests.get() helps
- The program now runs correctly, just a bit slowly; multithreading could be tried later (a sketch follows)
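As a starting point for that, a minimal sketch of parallel downloads with concurrent.futures; download_one is a hypothetical helper wrapping the requests + PIL logic above, and the shared global counter is replaced with per-task indexes to stay thread-safe:

```python
from concurrent.futures import ThreadPoolExecutor
from io import BytesIO

import requests
from PIL import Image

def download_one(task):
    # Hypothetical helper: each task carries its own index,
    # so no shared global counter is needed across threads.
    idx, url = task
    res = requests.get(url)
    img = Image.open(BytesIO(res.content))
    img.save(f'F:/pachong/xnt/{idx}.jpg')
    print(f'Downloaded image {idx}')

def down_parallel(outs, max_workers=4):
    # outs: the re-encoded URL list returned by bianma()
    tasks = list(enumerate(outs, start=1))
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        pool.map(download_one, tasks)
```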