Python requests + BeautifulSoup crawler (downloading images)


1. Import libraries

import requests
from bs4 import BeautifulSoup

2. Image download workflow

[Site: https://wall.alphacoders.com/] [If this infringes any rights, please contact 1150646501@qq.com and it will be removed immediately]

The normal manual download workflow:

1) Visit https://wall.alphacoders.com/

2) Click an image to open its page

3) Save the image

The Python download workflow:

1) Get the page source of https://wall.alphacoders.com/

2) Find the link each image points to (call it xxx)

3) Get the page source of xxx (call it yyy)

4) Find the image URL in yyy (call it zzz.jpg)

5) Download the image

The code for each step follows.

1) Get the page source of https://wall.alphacoders.com/

 
         
# -*- encoding=utf-8 -*-

import requests
from bs4 import BeautifulSoup


# Get the page source
def get_html(url='https://wall.alphacoders.com/'):
    ret = requests.get(url)
    # Print the response status; 200 means the request succeeded
    print(ret.status_code)
    html = ret.text
    # Page source
    # print(html)
    return html


if __name__ == '__main__':
    pass
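
A small usage note (not part of the original code): instead of only printing the status code, you can let requests raise an exception on HTTP errors and set a timeout so the script does not hang; a minimal sketch, where the 10-second timeout is an arbitrary choice:

import requests


# Sketch only: raise requests.HTTPError for 4xx/5xx responses instead of printing the code
def get_html_strict(url='https://wall.alphacoders.com/'):
    ret = requests.get(url, timeout=10)
    ret.raise_for_status()
    return ret.text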

2) Find the link each image points to (call it xxx)

Looking at the page source, each image block looks like this:

 <div class="boxgrid">
        <a href="big.php?i=1080652" title="Sci Fi Star Wars Kylo Ren HD Wallpaper | Background Image">
            <img class="" data-src="https://images7.alphacoders.com/108/thumb-350-1080652.jpg"
                alt="HD Wallpaper | Background Image ID:1080652"
                src="https://images7.alphacoders.com/108/thumb-350-1080652.jpg">
        </a>
    </div>

So we can find all the divs with class="boxgrid" and take the href from the a tag inside each one.

The source also contains URLs like https://images7.alphacoders.com/108/thumb-350-1080652.jpg, but those point to small thumbnails; we want the full-size images.

The href still needs the base address prepended, so that big.php?i=1080652 becomes https://wall.alphacoders.com/big.php?i=1080652.

PS: You can press F12, use the element picker to select an image, and inspect the matching HTML to spot the pattern.
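
As an aside, rather than concatenating strings you can build the absolute URL with urllib.parse.urljoin from the standard library; a minimal sketch using the href from the snippet above:

from urllib.parse import urljoin

# urljoin resolves a relative href against the page's base URL
full_url = urljoin('https://wall.alphacoders.com/', 'big.php?i=1080652')
print(full_url)  # https://wall.alphacoders.com/big.php?i=1080652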

# -*- encoding=utf-8 -*-

import requests
from bs4 import BeautifulSoup


# Get the page source
def get_html(url='https://wall.alphacoders.com/'):
    ret = requests.get(url)
    # Print the response status; 200 means the request succeeded
    print(ret.status_code)
    html = ret.text
    # Page source
    # print(html)
    return html


# Get the link behind each image
def get_photo_link(html):
    # List to hold the links
    links = []
    soup = BeautifulSoup(html, features='lxml')
    # Find all divs with class 'boxgrid'
    all_div = soup.find_all('div', class_='boxgrid')
    for div in all_div:
        # Take the href value
        href = div.a.attrs['href']
        # Prepend the base address to build the real URL
        links.append('https://wall.alphacoders.com/' + href)
    return links


if __name__ == '__main__':
    html = get_html()
    links = get_photo_link(html)
    for link in links:
        print(link)
 
 
        

PS: For an introduction to BeautifulSoup, see https://cuiqingcai.com/1319.html

Running it prints:
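
For quick reference, the only two BeautifulSoup features used here are find_all and attribute access; a minimal self-contained sketch on the snippet shown earlier:

from bs4 import BeautifulSoup

html = '''
<div class="boxgrid">
    <a href="big.php?i=1080652" title="example"><img src="thumb.jpg"></a>
</div>
'''
soup = BeautifulSoup(html, features='lxml')
# find_all searches by tag name and CSS class; attrs exposes a tag's attributes
for div in soup.find_all('div', class_='boxgrid'):
    print(div.a.attrs['href'])   # big.php?i=1080652
    print(div.a.attrs['title'])  # example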

https://wall.alphacoders.com/big.php?i=1080652
https://wall.alphacoders.com/big.php?i=1080635
https://wall.alphacoders.com/big.php?i=1080566
https://wall.alphacoders.com/big.php?i=1080405
https://wall.alphacoders.com/big.php?i=1080377
https://wall.alphacoders.com/big.php?i=1080376
https://wall.alphacoders.com/big.php?i=1080204
https://wall.alphacoders.com/big.php?i=1080025
https://wall.alphacoders.com/big.php?i=1079983
https://wall.alphacoders.com/big.php?i=1079969
https://wall.alphacoders.com/big.php?i=1079911
https://wall.alphacoders.com/big.php?i=1079838
https://wall.alphacoders.com/big.php?i=1079806
https://wall.alphacoders.com/big.php?i=1079738
https://wall.alphacoders.com/big.php?i=1079732
https://wall.alphacoders.com/big.php?i=1079731
https://wall.alphacoders.com/big.php?i=1079579
https://wall.alphacoders.com/big.php?i=1079360
https://wall.alphacoders.com/big.php?i=1079302
https://wall.alphacoders.com/big.php?i=1079174
https://wall.alphacoders.com/big.php?i=1079169
https://wall.alphacoders.com/big.php?i=1078982
https://wall.alphacoders.com/big.php?i=1078682
https://wall.alphacoders.com/big.php?i=1078534
https://wall.alphacoders.com/big.php?i=1078068
https://wall.alphacoders.com/big.php?i=1078043
https://wall.alphacoders.com/big.php?i=1077847
https://wall.alphacoders.com/big.php?i=1077792
https://wall.alphacoders.com/big.php?i=1077568
https://wall.alphacoders.com/big.php?i=1077530

3) 4) Get the page source of xxx (call it yyy) and find the image URL in yyy (call it zzz.jpg)

# -*- encoding=utf-8 -*-

import requests
from bs4 import BeautifulSoup


# Get the page source
def get_html(url='https://wall.alphacoders.com/'):
    ret = requests.get(url)
    # Print the response status; 200 means the request succeeded
    print(ret.status_code)
    html = ret.text
    # Page source
    # print(html)
    return html


# Get the link behind each image
def get_photo_link(html):
    # List to hold the links
    links = []
    soup = BeautifulSoup(html, features='lxml')
    # Find all divs with class 'boxgrid'
    all_div = soup.find_all('div', class_='boxgrid')
    for div in all_div:
        # Take the href value
        href = div.a.attrs['href']
        # Prepend the base address to build the real URL
        links.append('https://wall.alphacoders.com/' + href)
    return links


# Fetch the html of each link and find the image URL inside it
def get_img_url(links):
    img_urls = []
    for link in links:
        html = get_html(link)
        img_soup = BeautifulSoup(html, features='lxml')
        all_img = img_soup.find_all('img', class_='main-content')
        for img in all_img:
            img_url = img.attrs['src']
            img_urls.append(img_url)
    return img_urls


if __name__ == '__main__':
    html = get_html()
    links = get_photo_link(html)
    img_urls = get_img_url(links)
    for img_url in img_urls:
        print(img_url)

Running it prints:

https://images7.alphacoders.com/108/thumb-1920-1080652.jpg
https://images7.alphacoders.com/108/thumb-1920-1080635.jpg
https://images8.alphacoders.com/108/thumb-1920-1080566.jpg
https://images6.alphacoders.com/108/thumb-1920-1080507.jpg
https://images.alphacoders.com/108/thumb-1920-1080405.png
https://images8.alphacoders.com/108/thumb-1920-1080377.jpg
https://images4.alphacoders.com/108/thumb-1920-1080376.jpg
https://images.alphacoders.com/108/thumb-1920-1080204.jpg
https://images4.alphacoders.com/108/thumb-1920-1080025.jpg
https://images8.alphacoders.com/107/thumb-1920-1079983.jpg
https://images6.alphacoders.com/107/thumb-1920-1079969.jpg
https://images4.alphacoders.com/107/thumb-1920-1079911.jpg
https://images6.alphacoders.com/107/thumb-1920-1079838.jpg
https://images6.alphacoders.com/107/thumb-1920-1079806.jpg
https://images8.alphacoders.com/107/thumb-1920-1079738.jpg
https://images.alphacoders.com/107/thumb-1920-1079732.jpg
https://images2.alphacoders.com/107/thumb-1920-1079731.jpg
https://images6.alphacoders.com/107/thumb-1920-1079579.jpg
https://images7.alphacoders.com/107/thumb-1920-1079360.jpg
https://images6.alphacoders.com/107/thumb-1920-1079302.jpg
https://images6.alphacoders.com/107/thumb-1920-1079174.jpg
https://images3.alphacoders.com/107/thumb-1920-1079169.jpg
https://images7.alphacoders.com/107/thumb-1920-1078982.png
https://images4.alphacoders.com/107/thumb-1920-1078682.jpg
https://images.alphacoders.com/107/thumb-1920-1078534.jpg
https://images4.alphacoders.com/107/thumb-1920-1078068.jpg
https://images5.alphacoders.com/107/thumb-1920-1078043.jpg
https://images3.alphacoders.com/107/thumb-1920-1077847.png
https://images2.alphacoders.com/107/thumb-1920-1077792.jpg
https://images4.alphacoders.com/107/thumb-1920-1077568.jpg

5) Download the images

# -*- encoding=utf-8 -*-
import os

import requests
from bs4 import BeautifulSoup


# Get the page source
def get_html(url='https://wall.alphacoders.com/'):
    ret = requests.get(url)
    # Print the response status; 200 means the request succeeded
    print(ret.status_code)
    html = ret.text
    # Page source
    # print(html)
    return html


# Get the link behind each image
def get_photo_link(html):
    # List to hold the links
    links = []
    soup = BeautifulSoup(html, features='lxml')
    # Find all divs with class 'boxgrid'
    all_div = soup.find_all('div', class_='boxgrid')
    for div in all_div:
        # Take the href value
        href = div.a.attrs['href']
        # Prepend the base address to build the real URL
        links.append('https://wall.alphacoders.com/' + href)
    return links


# Fetch the html of each link and find the image URL inside it
def get_img_url(links):
    img_urls = []
    for link in links:
        html = get_html(link)
        img_soup = BeautifulSoup(html, features='lxml')
        all_img = img_soup.find_all('img', class_='main-content')
        for img in all_img:
            img_url = img.attrs['src']
            img_urls.append(img_url)
    return img_urls


# Create the folder if it does not exist
def create_folder(file_name):
    path = os.path.split(file_name)[0]
    if path != '' and not os.path.exists(path):
        os.makedirs(path)


# Download the images
def download_img(img_urls):
    for img_url in img_urls:
        # Use the file name from the URL so each image gets its own file
        # (a fixed name would overwrite the previous download every time)
        save_name = 'photos/' + img_url.split('/')[-1]
        ret = requests.get(img_url)
        # Image bytes
        info = ret.content
        # Create the folder if needed
        create_folder(save_name)
        # Write in binary mode
        with open(save_name, 'wb') as f:
            f.write(info)


if __name__ == '__main__':
    html = get_html()
    links = get_photo_link(html)
    img_urls = get_img_url(links)
    download_img(img_urls)
    print('End')
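
Note that ret.content reads the whole image into memory at once. For larger files, requests can also stream the body in chunks; a minimal sketch of an alternative download helper (the chunk size is an arbitrary choice):

import requests


# Sketch only: stream=True defers the download and iter_content yields the body in chunks
def download_img_streamed(img_url, save_name):
    with requests.get(img_url, stream=True) as ret:
        with open(save_name, 'wb') as f:
            for chunk in ret.iter_content(chunk_size=8192):
                f.write(chunk)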

After running, the downloaded images appear in the photos folder.

6. Handling garbled responses and adding request headers

# -*- coding: utf-8 -*-

import requests

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:57.0) Gecko/20100101 Firefox/57.0'}
url = 'http://sports.sina.com.cn/g/premierleague/index.shtml'
ret = requests.get(url, headers=headers)
print(ret.status_code)
ret.encoding = 'utf-8'
print(ret.text)
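
If you do not know the page's charset ahead of time, requests can also guess it from the response body via apparent_encoding instead of hard-coding 'utf-8'; a minimal sketch reusing the same URL:

import requests

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:57.0) Gecko/20100101 Firefox/57.0'}
ret = requests.get('http://sports.sina.com.cn/g/premierleague/index.shtml', headers=headers)
ret.encoding = ret.apparent_encoding  # let requests guess the charset from the body
print(ret.text)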

7. Sending JSON data

# -*- coding: utf-8 -*-
import json

import requests

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:57.0) Gecko/20100101 Firefox/57.0'}
data = {'key1': 'value1', 'key2': 'value2'}
url = 'http://sports.sina.com.cn/g/premierleague/index.shtml'
ret = requests.get(url, headers=headers, data=json.dumps(data))
# ret = requests.post(url, json=data)  # use the json parameter when POSTing JSON
print(ret.status_code)
ret.encoding = 'utf-8'
print(ret.text)
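
When the response itself is JSON, requests can decode it directly with ret.json(); a minimal sketch against httpbin.org, a public echo service used here purely as an example endpoint:

import requests

# httpbin.org/post echoes back the JSON body it receives
ret = requests.post('https://httpbin.org/post', json={'key1': 'value1'})
print(ret.status_code)
body = ret.json()     # parse the JSON response into a dict
print(body['json'])   # {'key1': 'value1'}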

 

