Python網絡爬蟲抓取網站圖片


本文介紹兩種爬取方式:

1.正則表達式

2.bs4解析HTML

 

以下為使用正則表達式的爬蟲,面向對象封裝後的代碼如下:

import urllib.request  # 用於下載圖片
import os
import requests  # 發送http請求
import re   # 正則表達式匹配


class GetJpg(object):
    '''Crawl listing pages and download their images using regular expressions.

    Every start URL is fetched, the HTML is cut into per-post
    ``<div class="j-r-list-c">`` chunks, and the first ``data-original``
    image of each chunk is saved under the post title.
    '''

    # Destination file template. A raw string keeps the backslashes literal
    # instead of relying on unrecognised escape sequences such as "\P".
    SAVE_PATH_TEMPLATE = r'E:\Python\爬圖片\{name}.{index}'

    # Seconds to wait for a page before giving up, so a stalled server
    # cannot hang the crawl forever.
    REQUEST_TIMEOUT = 20

    def __init__(self, start_urls):
        '''
        :param start_urls: iterable of page URLs to crawl
        '''
        self.start_urls = start_urls

    def get_response(self, url):
        '''Return the body of ``url`` as text.

        :param url: page URL
        :return: response text
        '''
        return requests.get(url, timeout=self.REQUEST_TIMEOUT).text

    def get_content(self, html):
        '''Return every post chunk (the div wrapping one image) found in ``html``.

        :param html: page source
        :return: list of HTML fragments, one per post
        '''
        # re.S lets "." span newlines because a post chunk covers many lines.
        reg = re.compile(r'(<div class="j-r-list-c">.*?</div>.*?</div>)', re.S)
        return re.findall(reg, html)

    def get_jpg_url(self, content):
        '''Return the image URLs (``data-original`` attributes) in ``content``.

        :param content: HTML fragment of one post
        :return: list of URL strings
        '''
        reg = r'data-original="(.*?)"'
        return re.findall(reg, content)

    def get_jpg_name(self, content):
        '''Return the post titles (text of the detail links) in ``content``.

        :param content: HTML fragment of one post
        :return: list of title strings
        '''
        reg = re.compile(r'<a href="/detail-.{8}.html">(.*?)</a>')
        return re.findall(reg, content)

    def download_jpg(self, src_url, path, index):
        '''Save ``src_url`` to the local image directory unless it already exists.

        :param src_url: URL of the image
        :param path: image title; all whitespace is removed to build the file name
        :param index: file extension taken from the URL, without the dot
        '''
        name = ''.join(path.split())
        target = self.SAVE_PATH_TEMPLATE.format(name=name, index=index)
        if os.path.exists(target):
            print('文件已存在')
            return
        urllib.request.urlretrieve(src_url, target)  # download the image
        print('OK!!!')

    def get_url_name(self, start_url):
        '''Download the first image of every post found on ``start_url``.

        Posts without an image URL or without a title are skipped. A failed
        download is reported and the crawl moves on to the next post.

        :param start_url: page URL
        '''
        for block in self.get_content(self.get_response(start_url)):
            jpg_urls = self.get_jpg_url(block)
            if not jpg_urls:
                continue
            jpg_names = self.get_jpg_name(block)
            if not jpg_names:
                continue
            src_url = jpg_urls[0]
            extension = src_url.split('.')[-1]
            try:
                self.download_jpg(src_url, jpg_names[0], extension)
            except Exception as exc:
                # Best effort: one broken image must not stop the crawl, but
                # Ctrl-C (KeyboardInterrupt) still propagates because only
                # Exception subclasses are caught here.
                print('下載失敗: {url} ({error})'.format(url=src_url, error=exc))

    def main(self):
        '''Crawl every start URL in order.'''
        for start_url in self.start_urls:
            self.get_url_name(start_url)


if __name__ == '__main__':
    # Build the URLs of listing pages 1 through 9, then crawl them all.
    start_urls = []
    for i in range(1, 10):
        start_urls.append('http://www.budejie.com/{id}'.format(id=i))
    jpg = GetJpg(start_urls)
    jpg.main()

以下為使用bs4爬取的代碼:

from bs4 import BeautifulSoup
import urllib.request
import re


def get_urls(img_girl):
    '''
    Collect the ``src`` attribute of each image tag, in order.

    :param img_girl: iterable of <img> tags
    :return: list with one ``src`` value per tag
    '''
    sources = []
    for tag in img_girl:
        sources.append(tag.get('src'))
    return sources


def get_img_name(img_girl):
    '''
    Collect the ``title`` attribute of each image tag, in order.

    :param img_girl: iterable of <img> tags
    :return: list with one ``title`` value per tag
    '''
    titles = []
    for tag in img_girl:
        titles.append(tag.get('title'))
    return titles


def get_img_resource(url):
    '''
    Fetch ``url`` and return every <img> tag found in the page.

    :param url: page URL
    :return: result of ``find_all('img')`` on the parsed page
    '''
    # Browser-like request headers. The language header has to be spelled
    # "Accept-Language" with no embedded spaces, otherwise the server does
    # not see the standard header at all.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.78 Safari/537.36',
        'Accept-Language': 'zh-CN,zh;q=0.8',
    }
    req = urllib.request.Request(url, headers=headers)
    # The context manager closes the connection once the body has been read.
    with urllib.request.urlopen(req, timeout=20) as res:
        content = res.read()
    soup = BeautifulSoup(content, 'html.parser')  # parse the HTML source
    return soup.find_all('img')


def main(url):
    '''
    Download every image on the page into the local BS4 image folder.

    Files are named ``<title>_<n>.jpg`` where ``n`` counts the images
    downloaded from this page, starting at 1.

    :param url: page URL
    '''
    # Fetch the page once so URLs and titles come from the same set of tags.
    img_tags = get_img_resource(url)
    urls = get_urls(img_tags)
    names = get_img_name(img_tags)
    x = 0
    # zip pairs each URL with the title of the same tag; looking the title up
    # with urls.index() would return the wrong one for repeated URLs.
    for src_url, name in zip(urls, names):
        if not src_url:
            continue  # an <img> without a src has nothing to download
        x += 1
        # Strip non-word characters so the title is safe as a file name;
        # a missing title becomes an empty name instead of crashing.
        safe_name = re.sub(r'\W', '', name or '')
        path = r'E:\Python\爬圖片\BS4\{name}_{index}.jpg'.format(name=safe_name, index=x)
        urllib.request.urlretrieve(src_url, path)
        print('OK')

if __name__ == "__main__":
    # Listing pages 1 through 9. The loop variable is named "page" so the
    # builtin id() is not shadowed.
    urls = ['https://www.dbmeinv.com/dbgroup/show.htm?cid=4&pager_offset={i}'.format(i=page) for page in range(1, 10)]
    # A plain loop: the calls are made for their side effects only.
    for url in urls:
        main(url)

bs4面向對象封裝後的代碼:

from bs4 import BeautifulSoup
import urllib.request
import re


class GetWebImg(object):
    '''Download every image of one listing page, parsed with BeautifulSoup.'''

    # Destination file template. A raw string keeps the backslashes literal
    # instead of relying on unrecognised escape sequences such as "\P".
    SAVE_PATH_TEMPLATE = r'E:\Python\爬圖片\BS4\{name}_{index}_{id}.jpg'

    def __init__(self, url, index):
        '''
        :param url: page URL
        :param index: page number, used in file names and progress messages
        '''
        self.url = url
        self.index = index

    def get_urls(self, img_girl):
        '''
        Collect the ``src`` attribute of each image tag, in order.

        :param img_girl: iterable of <img> tags
        :return: list with one ``src`` value per tag
        '''
        return [girl.get('src') for girl in img_girl]

    def get_img_name(self, img_girl):
        '''
        Collect the ``title`` attribute of each image tag, in order.

        :param img_girl: iterable of <img> tags
        :return: list with one ``title`` value per tag
        '''
        return [girl.get('title') for girl in img_girl]

    def get_img_resource(self, url):
        '''
        Fetch ``url`` and return every <img> tag found in the page.

        :param url: page URL
        :return: result of ``find_all('img')`` on the parsed page
        '''
        # Browser-like request headers. The language header has to be spelled
        # "Accept-Language" with no embedded spaces, otherwise the server
        # does not see the standard header at all.
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.78 Safari/537.36',
            'Accept-Language': 'zh-CN,zh;q=0.8',
        }
        req = urllib.request.Request(url, headers=headers)
        # The context manager closes the connection once the body is read.
        with urllib.request.urlopen(req, timeout=20) as res:
            content = res.read()
        soup = BeautifulSoup(content, 'html.parser')  # parse the HTML source
        return soup.find_all('img')

    def main(self):
        '''
        Download and save every image found on ``self.url``.

        Files are named ``<title>_<page>_<n>.jpg`` where ``n`` counts the
        images downloaded from this page, starting at 1.
        '''
        # Fetch the page once so URLs and titles come from the same tags.
        img_tags = self.get_img_resource(self.url)
        url_list = self.get_urls(img_tags)
        name_list = self.get_img_name(img_tags)
        x = 0
        # zip pairs each URL with the title of the same tag; looking the
        # title up with url_list.index() would return the wrong one for
        # repeated URLs.
        for src_url, name in zip(url_list, name_list):
            if not src_url:
                continue  # an <img> without a src has nothing to download
            x += 1
            # Strip non-word characters so the title is safe as a file name;
            # a missing title becomes an empty name instead of crashing.
            safe_name = re.sub(r'\W', '', name or '')
            path = self.SAVE_PATH_TEMPLATE.format(name=safe_name, index=self.index, id=x)
            urllib.request.urlretrieve(src_url, path)
            print('第{index}頁第{id}張圖片下載OK'.format(index=self.index, id=x))


if __name__ == "__main__":
    # Listing pages 1 through 9. The loop variable is named "page" so the
    # builtin id() is not shadowed.
    urls = ['https://www.dbmeinv.com/dbgroup/show.htm?cid=4&pager_offset={i}'.format(i=page) for page in range(1, 10)]
    # enumerate supplies the 1-based page number used in file names.
    for index, url in enumerate(urls, start=1):
        get_img = GetWebImg(url, index)
        get_img.main()

運行結果:

 


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯繫本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM