本文介紹兩種爬取方式:
1.正則表達式
2.bs4解析HTML
以下為正則表達式爬蟲,面向對象封裝後的代碼如下:
import os
import re  # regular-expression matching
import urllib.request  # used to download the image files


class GetJpg(object):
    """Regex-based image scraper for a list of listing-page URLs.

    Every start URL is fetched, split into per-post ``<div>`` fragments, and
    the image referenced by each fragment is saved into ``save_dir`` under
    the post title.
    """

    # Folder used by the original script; kept as the default. A raw string
    # avoids the invalid escape sequences (``\P``, ``\{``) of the old literal.
    DEFAULT_SAVE_DIR = r'E:\Python\爬圖片'

    # Compiled once instead of on every call.
    _POST_RE = re.compile(
        r'(<div class="j-r-list-c">.*?</div>.*?</div>)', re.S)
    _JPG_URL_RE = re.compile(r'data-original="(.*?)"')
    # The dot before "html" is escaped so it only matches a literal ".".
    _JPG_NAME_RE = re.compile(r'<a href="/detail-.{8}\.html">(.*?)</a>')

    def __init__(self, start_urls, save_dir=DEFAULT_SAVE_DIR):
        """Store the pages to crawl and the folder that receives the images.

        :param start_urls: iterable of page URLs to crawl
        :param save_dir: folder the images are written to
        """
        self.start_urls = start_urls
        self.save_dir = save_dir

    def get_response(self, url):
        """Return the body of ``url`` as text.

        A timeout is set so that a stalled server cannot hang the crawl.
        """
        # Imported here so the parsing helpers stay importable (and testable)
        # on machines where the third-party ``requests`` package is missing.
        import requests

        return requests.get(url, timeout=20).text

    def get_content(self, html):
        """Return every post ``<div class="j-r-list-c">`` fragment in ``html``."""
        return self._POST_RE.findall(html)

    def get_jpg_url(self, content):
        """Return all ``data-original`` image URLs found in ``content``."""
        return self._JPG_URL_RE.findall(content)

    def get_jpg_name(self, content):
        """Return the link texts (image titles) found in ``content``."""
        return self._JPG_NAME_RE.findall(content)

    def download_jpg(self, src_url, path, index):
        """Download ``src_url`` into the save folder unless it already exists.

        :param src_url: URL of the image
        :param path: image title; all whitespace is removed to form the
            file name
        :param index: file extension (without the dot)
        :raises OSError: on network or file-system failure
        :raises ValueError: if ``src_url`` is not a URL urllib can open
        """
        file_name = '{name}.{index}'.format(
            name=''.join(path.split()), index=index)
        target = os.path.join(self.save_dir, file_name)
        if os.path.exists(target):
            print('文件已存在')
            return
        # urlretrieve does not create missing folders.
        os.makedirs(self.save_dir, exist_ok=True)
        urllib.request.urlretrieve(src_url, target)  # download the image
        print('OK!!!')

    def get_url_name(self, start_url):
        """Download every image found on the page at ``start_url``.

        Kept separate from ``main`` to avoid one more level of loop nesting.
        A failed download is reported and skipped so that one bad image does
        not stop the rest of the page.
        """
        html = self.get_response(start_url)
        for fragment in self.get_content(html):
            jpg_urls = self.get_jpg_url(fragment)
            jpg_names = self.get_jpg_name(fragment)
            # Posts without an image or without a title cannot be saved.
            if not jpg_urls or not jpg_names:
                continue
            src_url = jpg_urls[0]
            extension = src_url.split('.')[-1]
            try:
                self.download_jpg(src_url, jpg_names[0], extension)
            except (OSError, ValueError) as exc:
                # URLError/HTTPError are OSError subclasses; ValueError is
                # raised for malformed URLs.
                print('download failed: {url} ({error})'.format(
                    url=src_url, error=exc))

    def main(self):
        """Crawl all start URLs in order."""
        for start_url in self.start_urls:
            self.get_url_name(start_url)


if __name__ == '__main__':
    start_urls = ['http://www.budejie.com/{id}'.format(id=i)
                  for i in range(1, 10)]
    jpg = GetJpg(start_urls)  # create the scraper instance
    jpg.main()
以下為使用bs4爬取的代碼:
import os
import re
import urllib.request

# Folder used by the original script; kept as the default. A raw string avoids
# the invalid escape sequences (``\P``, ``\B``, ``\{``) of the old literal.
DEFAULT_SAVE_DIR = r'E:\Python\爬圖片\BS4'

# Browser-like request headers. The header name and value must not contain
# stray spaces, otherwise the Accept-Language header is malformed.
REQUEST_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.78 Safari/537.36',
    'Accept-Language': 'zh-CN,zh;q=0.8',
}


def get_urls(img_girl):
    """Return the ``src`` value of every tag (``None`` where it is missing).

    :param img_girl: iterable of ``<img>`` tags
    :return: list with one entry per tag
    """
    return [girl.get('src') for girl in img_girl]


def get_img_name(img_girl):
    """Return the ``title`` value of every tag (``None`` where it is missing).

    :param img_girl: iterable of ``<img>`` tags
    :return: list with one entry per tag
    """
    return [girl.get('title') for girl in img_girl]


def get_img_resource(url):
    """Fetch ``url`` and return all ``<img>`` tags found in the page.

    :param url: page URL
    :return: the ``<img>`` tags parsed from the page source
    """
    # Imported here so the pure helpers above stay importable (and testable)
    # on machines where the third-party ``bs4`` package is missing.
    from bs4 import BeautifulSoup

    req = urllib.request.Request(url, headers=REQUEST_HEADERS)
    # The context manager closes the connection even if reading fails.
    with urllib.request.urlopen(req, timeout=20) as res:
        content = res.read()
    soup = BeautifulSoup(content, 'html.parser')
    return soup.find_all('img')


def main(url, save_dir=DEFAULT_SAVE_DIR):
    """Download every image on the page at ``url`` into ``save_dir``.

    Files are named ``<title>_<position>.jpg`` where ``position`` is the
    1-based position of the ``<img>`` tag on the page. A failed download is
    reported and skipped.

    :param url: page URL
    :param save_dir: folder the images are written to
    """
    # Fetch and parse the page once; URLs and titles come from the same tags.
    img_tags = get_img_resource(url)
    urls = get_urls(img_tags)
    names = get_img_name(img_tags)

    # urlretrieve does not create missing folders.
    os.makedirs(save_dir, exist_ok=True)

    # zip keeps each URL paired with its own title even when a URL repeats.
    for position, (src_url, title) in enumerate(zip(urls, names), start=1):
        if not src_url:
            continue  # <img> without a src attribute: nothing to download
        # Strip characters that are not letters, digits or underscores so the
        # title is safe to use as a file name; a missing title becomes ''.
        safe_name = re.sub(r'\W', '', title or '')
        path = os.path.join(
            save_dir,
            '{name}_{index}.jpg'.format(name=safe_name, index=position))
        try:
            urllib.request.urlretrieve(src_url, path)
        except (OSError, ValueError) as exc:
            # URLError/HTTPError are OSError subclasses; ValueError is raised
            # for malformed (e.g. relative) URLs.
            print('download failed: {url} ({error})'.format(
                url=src_url, error=exc))
            continue
        print('OK')


if __name__ == "__main__":
    page_urls = [
        'https://www.dbmeinv.com/dbgroup/show.htm?cid=4&pager_offset={i}'.format(i=page)
        for page in range(1, 10)
    ]
    for page_url in page_urls:
        main(page_url)
bs4面向對象封裝後代碼:
import os
import re
import urllib.request


class GetWebImg(object):
    """Download every ``<img>`` of one web page, using bs4 to parse it."""

    # Folder used by the original script; kept as the default. A raw string
    # avoids the invalid escape sequences (``\P``, ``\B``, ``\{``) of the old
    # literal.
    DEFAULT_SAVE_DIR = r'E:\Python\爬圖片\BS4'

    # Browser-like request headers. The header name and value must not
    # contain stray spaces, otherwise Accept-Language is malformed.
    HEADERS = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.78 Safari/537.36',
        'Accept-Language': 'zh-CN,zh;q=0.8',
    }

    def __init__(self, url, index, save_dir=DEFAULT_SAVE_DIR):
        """Remember which page to scrape and where to store the images.

        :param url: page URL
        :param index: page number, used in file names and progress messages
        :param save_dir: folder the images are written to
        """
        self.url = url
        self.index = index
        self.save_dir = save_dir

    def get_urls(self, img_girl):
        """Return the ``src`` value of every tag (``None`` where missing).

        :param img_girl: iterable of ``<img>`` tags
        :return: list with one entry per tag
        """
        return [girl.get('src') for girl in img_girl]

    def get_img_name(self, img_girl):
        """Return the ``title`` value of every tag (``None`` where missing).

        :param img_girl: iterable of ``<img>`` tags
        :return: list with one entry per tag
        """
        return [girl.get('title') for girl in img_girl]

    def get_img_resource(self, url):
        """Fetch ``url`` and return all ``<img>`` tags found in the page.

        :param url: page URL
        :return: the ``<img>`` tags parsed from the page source
        """
        # Imported here so the pure helpers stay importable (and testable)
        # on machines where the third-party ``bs4`` package is missing.
        from bs4 import BeautifulSoup

        req = urllib.request.Request(url, headers=self.HEADERS)
        # The context manager closes the connection even if reading fails.
        with urllib.request.urlopen(req, timeout=20) as res:
            content = res.read()
        soup = BeautifulSoup(content, 'html.parser')
        return soup.find_all('img')

    def main(self):
        """Download every image of the page into ``self.save_dir``.

        Files are named ``<title>_<page>_<position>.jpg`` where ``position``
        is the 1-based position of the ``<img>`` tag on the page. A failed
        download is reported and skipped.
        """
        # Fetch and parse the page once; URLs and titles come from the same
        # tags.
        img_tags = self.get_img_resource(self.url)
        url_list = self.get_urls(img_tags)
        name_list = self.get_img_name(img_tags)

        # urlretrieve does not create missing folders.
        os.makedirs(self.save_dir, exist_ok=True)

        # zip keeps each URL paired with its own title even when a URL
        # repeats on the page.
        pairs = zip(url_list, name_list)
        for position, (src_url, title) in enumerate(pairs, start=1):
            if not src_url:
                continue  # <img> without a src attribute
            # Strip characters that are not letters, digits or underscores so
            # the title is safe as a file name; a missing title becomes ''.
            safe_name = re.sub(r'\W', '', title or '')
            path = os.path.join(
                self.save_dir,
                '{name}_{index}_{id}.jpg'.format(
                    name=safe_name, index=self.index, id=position))
            try:
                urllib.request.urlretrieve(src_url, path)
            except (OSError, ValueError) as exc:
                # URLError/HTTPError are OSError subclasses; ValueError is
                # raised for malformed (e.g. relative) URLs.
                print('download failed: {url} ({error})'.format(
                    url=src_url, error=exc))
                continue
            print('第{index}頁第{id}張圖片下載OK'.format(
                index=self.index, id=position))


if __name__ == "__main__":
    page_urls = [
        'https://www.dbmeinv.com/dbgroup/show.htm?cid=4&pager_offset={i}'.format(i=page)
        for page in range(1, 10)
    ]
    for page_index, page_url in enumerate(page_urls, start=1):
        get_img = GetWebImg(page_url, page_index)
        get_img.main()
運行結果:


