Python Web Scraping: Three Ways to Parse Data
Workflow for scraping and parsing data with requests:
1. Specify the URL
2. Send the request with the requests module
3. Get the data from the response object
4. Parse the data
5. Persist the data
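As a minimal sketch of this flow (the URL, regex and output file below are placeholders, and step 4 could equally use bs4 or xpath instead of a regex):

import requests
import re

# 1. specify the URL (placeholder)
url = "https://www.example.com"
# 2. send the request with the requests module
headers = {"User-Agent": "Mozilla/5.0"}
response = requests.get(url=url, headers=headers)
# 3. get the data from the response object
page_text = response.text
# 4. parse the data (a trivial regex here; bs4 or xpath work the same way)
titles = re.findall(r"<title>(.*?)</title>", page_text, re.S)
# 5. persist the data
with open("result.txt", "w", encoding="utf-8") as fp:
    fp.write("\n".join(titles))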
1. Regex parsing
1.1 Scraping images

import re
import os
import requests

url = "https://www.meizitu.com/a/4774.html"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36"
}

# get the page source and extract every image URL with a regex
img_data = requests.get(url=url, headers=headers).text
ex = '<img src="(.*?)".*?>'
pa = re.compile(ex)
img_urls = pa.findall(img_data)
print(img_urls)

# make sure the output directory exists
if not os.path.exists('images'):
    os.makedirs('images')

# download each image as binary data and persist it locally
for urls in img_urls:
    img_name = urls.split("/")[-1]
    num = str(img_urls.index(urls))
    local_url = 'images/' + num + img_name
    images = requests.get(url=urls, headers=headers).content
    with open(local_url, 'wb') as fp:
        fp.write(images)
2. bs4 parsing (BeautifulSoup)
Installation
pip3 install bs4
pip3 install lxml
How bs4 works
(1) Instantiate a BeautifulSoup object and load the page source to be parsed into it
(2) Call the object's attributes and methods to locate tags and extract their content

Import:
    from bs4 import BeautifulSoup
Instantiation:
    (1) From a local file:
        soup = BeautifulSoup(open(local_file), 'lxml')
    (2) From a network response (string or bytes):
        soup = BeautifulSoup(string_or_bytes_data, 'lxml')
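A minimal sketch of both instantiation modes; the test.html file is created on the spot just for the demo, and the URL is a placeholder:

from bs4 import BeautifulSoup
import requests

# (1) from a local HTML file (a throwaway file is written here only for the demo)
with open('./test.html', 'w', encoding='utf-8') as fp:
    fp.write('<div class="song"><a href="http://www.example.com">demo</a></div>')
soup_local = BeautifulSoup(open('./test.html', encoding='utf-8'), 'lxml')
print(soup_local.a)

# (2) from a network response (URL is a placeholder)
page_text = requests.get('https://www.example.com').text
soup_net = BeautifulSoup(page_text, 'lxml')
print(soup_net.title)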
BeautifulSoup object methods
(1) Locate by tag name
    - soup.a                      # the first <a> tag
(2) Get tag attributes
    - soup.a.attrs                # all attributes and values of the <a> tag, returned as a dict
    - soup.a.attrs['href']        # the href attribute
    - soup.a['href']              # the href attribute
(3) Get text content
    - soup.string                 # the direct text of the current tag
    - soup.text                   # all text under the tag
    - soup.get_text()             # all text under the tag (method form)
(4) find(): returns the first tag that matches
    - soup.find('a')
    - soup.find('a', title='xxx')
(5) find_all(): returns all matching tags as a list
    - soup.find_all('a')
    - soup.find_all(['a', 'b'])   # all <a> and <b> tags
    - soup.find_all('a', limit=2) # only the first two matches
(6) select(): pick content with CSS selectors, returns a list: soup.select('#id')
    - tag, class, id and hierarchical selectors are all supported
    - hierarchical selectors:
        - direct children: div > p > a > span
        - any descendants: div p
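A minimal sketch exercising these calls on a made-up HTML snippet (the tags, classes and hrefs are invented for illustration):

from bs4 import BeautifulSoup

html = '''
<div class="song">
    <a href="http://www.example.com/1" title="t1">one</a>
    <a href="http://www.example.com/2" class="du">two</a>
    <p id="intro">hello <b>world</b></p>
</div>
'''
soup = BeautifulSoup(html, 'lxml')

print(soup.a)                       # first <a> tag
print(soup.a.attrs['href'])         # http://www.example.com/1
print(soup.p.text)                  # hello world
print(soup.find('a', class_='du'))  # first <a> whose class is "du"
print(soup.find_all('a', limit=2))  # list with the first two <a> tags
print(soup.select('#intro b'))      # descendant selector, returns a list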
2.1 Scraping Romance of the Three Kingdoms

import requests
from bs4 import BeautifulSoup

home_url = "http://www.shicimingju.com/book/sanguoyanyi.html"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36"
}

# get the table of contents and locate every chapter link
page_text = requests.get(url=home_url, headers=headers).text
soup = BeautifulSoup(page_text, 'lxml')
a_list = soup.select(".book-mulu a")

f = open("sanguo.txt", "w", encoding="utf-8")
for a_lst in a_list[0:1]:   # only the first chapter here; drop the slice to fetch them all
    title = a_lst.string
    detail_url = "http://www.shicimingju.com" + a_lst["href"]
    # fetch the chapter page and extract its text
    detail_data = requests.get(url=detail_url, headers=headers).text
    soup2 = BeautifulSoup(detail_data, "lxml")
    desc = soup2.select(".chapter_content p")[0].string
    # persist the chapter title and content
    f.write(title + ":" + desc + "\n")
f.close()
3. xpath parsing (most commonly used)
How xpath works
Install: pip3 install lxml
Import: from lxml import etree

How it works:
(1) Instantiate an etree object and load the page source to be parsed into it
(2) Call the object's xpath() method with xpath expressions to locate tags and extract data

Usage:
(1) Local file:
    tree = etree.parse(local_file)
    tree.xpath('/div[@class=""]')
(2) Network response:
    tree = etree.HTML(page_string)
    tree.xpath('//div[@class=""]')

Common xpath expressions:
    /  : locate from the root (a direct child when used inside an expression)
    // : locate from anywhere in the document
    Attribute filter: tag[@attrName=""]              # e.g. div[@class=""]
    Hierarchy & index:                               # indexes start at 1
        tree.xpath('//div[@class=""]/ul/li[2]')
    Logical operators:
        tree.xpath('//a[@href="" and @class="du"]')  # <a> tags whose href is empty and whose class is "du"
    Fuzzy matching:
        //div[contains(@class,"ng")]
        //div[starts-with(@class,"ta")]
    Text:
        //div[@class="abc"]/p[1]/text()    # direct text of the tag only
        //div[@class="abc"]/p[1]//text()   # all text under the tag
    Attributes:
        //div[@class="abc"]//li[2]/a/@href
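A minimal sketch running a few of these expressions against a made-up HTML string (the tags, classes and hrefs are invented for illustration):

from lxml import etree

html = '''
<div class="abc">
    <ul>
        <li><a href="http://www.example.com/1" class="du">one</a></li>
        <li><a href="http://www.example.com/2">two</a></li>
    </ul>
    <p>hello <b>world</b></p>
</div>
'''
tree = etree.HTML(html)

print(tree.xpath('//div[@class="abc"]/ul/li[2]/a/text()'))  # ['two'] -- index starts at 1
print(tree.xpath('//a[@class="du"]/@href'))                 # ['http://www.example.com/1']
print(tree.xpath('//div[contains(@class,"ab")]/p/text()'))  # ['hello '] -- direct text only
print(tree.xpath('//div[@class="abc"]/p//text()'))          # ['hello ', 'world'] -- all text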
3.1 Scraping second-hand housing listings (1)

import requests
from lxml import etree
import re

home_url = "https://sz.58.com/ershoufang/"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36"
}

# get the listing page and locate every house entry
page_text = requests.get(url=home_url, headers=headers).text
tree = etree.HTML(page_text)
page_lst = tree.xpath('//ul[@class="house-list-wrap"]/li')
print(page_lst)

f = open('./58二手房.txt', "w", encoding="utf-8")
for li in page_lst:
    title = li.xpath('./div[2]/h2/a/text()')[0]
    price1 = li.xpath('./div[3]/p/b/text()')
    price2 = li.xpath('./div[3]/p/text()')
    price = price1[0] + price2[0] + "(" + price2[1] + ")"
    deti_url = li.xpath('./div[2]/h2/a/@href')[0]
    # fetch the detail page and pull the "general situation" block
    detail_text = requests.get(url=deti_url, headers=headers).text
    detail_tree = etree.HTML(detail_text)
    desc = "".join(detail_tree.xpath('//div[@id="generalSituation"]/div//text()'))
    desc = desc.replace("\n", "").replace(" ", "")
    # insert the list-page price between "房屋總價" and "房屋戶型" in the description
    pa = re.compile("房屋總價(.*?)房屋戶型")
    desc2 = re.sub(pa, "房屋總價" + price + "房屋戶型", desc)
    f.write(title + ": " + desc2 + "\n")
    print("{}數據已寫入".format(title))
f.close()
3.2 Scraping second-hand housing listings (2)

import requests
from lxml import etree

home_url = "https://shenzhen.leyoujia.com/esf/"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36"
}

# get the listing page and locate every house entry
page_text = requests.get(url=home_url, headers=headers).text
tree = etree.HTML(page_text)
page_lst = tree.xpath('//div[@class="list-box"]/ul/li')

f = open("./樂有家二手房.txt", "w", encoding="utf-8")
for li in page_lst:
    title = li.xpath('./div[2]/p/a/text()')
    if title:   # skip entries that have no title link
        title = title[0]
        detail_url = "https://shenzhen.leyoujia.com" + li.xpath('./div[2]/p/a/@href')[0]
        # fetch the detail page and join the summary fields
        detail_text = requests.get(url=detail_url, headers=headers).text
        detail_tree = etree.HTML(detail_text)
        desc = " ".join(detail_tree.xpath('//div[@class="xq-box xq-box2"]/div/p/span/text()'))
        f.write(title + ": " + desc + "\n")
        print("{}數據已寫入 ... ".format(title))
f.close()
print("數據寫入完成!")
3.3 Parsing all city names

import requests
from lxml import etree

home_url = "https://www.aqistudy.cn/historydata/"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36"
}

page_text = requests.get(url=home_url, headers=headers).text
tree = etree.HTML(page_text)

# hot-cities block
hot_data = ",".join(tree.xpath('//div[@class="hot"]/div/ul/li/a/text()'))
# one <ul> per initial letter in the "all cities" block
all_ul_data = tree.xpath('//div[@class="all"]/div/ul')

f = open("./城市列表.txt", "w", encoding="utf-8")
f.write("熱門城市:" + hot_data + "\n\n")
for ul in all_ul_data:
    title = ul.xpath('./div[1]/b/text()')[0]
    citys = ",".join(ul.xpath('./div[2]/li/a/text()'))
    f.write(title + " " + citys + "\n")
    print("{}數據已寫入 ... ".format(title))
f.close()
print("已完成!!!")
3.4 Scraping Tieba replies

import requests
from lxml import etree
import re
import os

home_url = "https://tieba.baidu.com/p/6428562248"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36"
}

# main posts on the page
html_text = requests.get(url=home_url, headers=headers).text
tree = etree.HTML(html_text)
contents = tree.xpath('//div[@class="d_post_content j_d_post_content "]')

# replies, fetched from the totalComment interface
ans_url = "https://tieba.baidu.com/p/totalComment"
params = {
    "t": "1578396061786",
    "tid": "6428562248",
    "fid": "280050",
    "pn": "1",
    "see_lz": "0"
}
comment_list = requests.get(url=ans_url, params=params, headers=headers).json()["data"]["comment_list"]

# assemble each post followed by its replies
content_all = str()
for div in contents:
    msg_top = " ".join(div.xpath('./text()')).strip()
    if msg_top:
        content_all += msg_top + "\n"
        detail_id = div.xpath('./@id')[0][13:]   # strip the "post_content_" prefix to get the post id
        if comment_list.get(detail_id):
            comment_data = comment_list[detail_id]
            content_all += "回復:" + "\n"
            for comm in comment_data["comment_info"]:
                username = comm["username"]
                content = comm["content"]
                con_all = " " + username + " : " + content
                content_all += con_all + "\n"
        content_all += "---------------------------------\n"

# strip any remaining html tags from the reply text
pa = re.compile(r"<.*?>")
content_all = pa.sub("", content_all)

title = tree.xpath('//div[@id="j_core_title_wrap"]/h3/text()')[0]
os.makedirs("./貼吧/LOL", exist_ok=True)   # make sure the output directory exists
file_name = "./貼吧/LOL/{}.txt".format(title)
f = open(file_name, "w", encoding="utf-8")
f.write(content_all)
f.close()
print("數據已下載完成!!!")
3.5 Scraping videos

import requests
from lxml import etree
import re
import os

home_url = "https://www.pearvideo.com/category_31"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36"
}

# get the category page and locate every video entry
page_text = requests.get(url=home_url, headers=headers).text
tree = etree.HTML(page_text)
li_list = tree.xpath('//ul[@id="listvideoListUl"]/li')

os.makedirs("./梨視頻", exist_ok=True)   # make sure the output directory exists
for li in li_list:
    title = li.xpath('./div/a/div[2]/text()')[0]
    # the style attribute holds the cover-image URL, which embeds the video id as "cont-<id>-"
    style_attr = li.xpath('./div/a/div[1]/div/div/@style')[0]
    pa = re.compile(r"cont-(.*?)-")
    video_url = "https://www.pearvideo.com/video_" + pa.findall(style_attr)[0]
    # the detail page contains the real .mp4 address
    video_text = requests.get(url=video_url, headers=headers).text
    pa = re.compile(r"https://video\.pearvideo\.com/.*?\.mp4")
    video_url = pa.findall(video_text)[0]
    video_local = "./梨視頻/{}.mp4".format(title)
    # download the video as binary data and persist it
    f = open(video_local, "wb")
    video_data = requests.get(url=video_url, headers=headers).content
    f.write(video_data)
    f.close()
print("數據已下載完成!!!")