Three Ways to Parse Data in Python Web Scraping
The workflow for scraping and parsing data with requests
1. Specify the URL
2. Send the request with the requests module
3. Take the data off the response object
4. Parse the data
5. Persist the data
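A minimal sketch of this five-step flow (the URL and the regex here are placeholders, not a real target):

import re
import requests

url = "https://example.com/page.html"              # 1. specify the URL (placeholder)
headers = {"User-Agent": "Mozilla/5.0"}
response = requests.get(url=url, headers=headers)  # 2. send the request with requests
page_text = response.text                          # 3. take the data off the response object
titles = re.findall(r"<title>(.*?)</title>", page_text, re.S)  # 4. parse the data
with open("result.txt", "w", encoding="utf-8") as fp:          # 5. persist the data
    fp.write("\n".join(titles))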
1. Regex parsing
1.1 Scraping images
import re
import os
import requests

url = "https://www.meizitu.com/a/4774.html"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36"
}
# fetch the page source as text
img_data = requests.get(url=url, headers=headers).text
# capture each img tag's src attribute with a non-greedy group
ex = '<img src="(.*?)".*?>'
pa = re.compile(ex)
img_urls = pa.findall(img_data)
print(img_urls)

os.makedirs("images", exist_ok=True)  # the target directory must exist before writing
for num, img_url in enumerate(img_urls):
    img_name = img_url.split("/")[-1]
    local_url = "images/" + str(num) + img_name
    # .content gives the raw bytes of the image
    images = requests.get(url=img_url, headers=headers).content
    with open(local_url, "wb") as fp:
        fp.write(images)
2. bs4 parsing (BeautifulSoup)
Installation
pip3 install beautifulsoup4   # the "bs4" package on PyPI is just a shim for this
pip3 install lxml
How bs4 works
(1) Instantiate a BeautifulSoup object and load the page source to be parsed into it
(2) Call that object's attributes or methods to locate tags and extract their content
-- Import: from bs4 import BeautifulSoup
(1) From a local file:
soup = BeautifulSoup(open(local_file), 'lxml')
(2) From network data:
soup = BeautifulSoup(str_or_bytes_data, 'lxml')
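A short sketch of both instantiation styles (./page.html and the request target are placeholders):

from bs4 import BeautifulSoup
import requests

# (1) from a local file
soup = BeautifulSoup(open("./page.html", encoding="utf-8"), "lxml")
# (2) from network data (the response body is a str; bytes work too)
page_text = requests.get("https://example.com").text
soup = BeautifulSoup(page_text, "lxml")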
BeautifulSoup object methods
(1) Find by tag name
- soup.a # find the first a tag
(2) Get tag attributes
- soup.a.attrs # all attributes and values of the a tag, as a dict
- soup.a.attrs['href'] # the href attribute
- soup.a['href'] # the href attribute (shorthand)
(3) Get text content
- soup.string # the tag's direct text (only when it has a single child)
- soup.text # all text under the tag
- soup.get_text() # same as .text (note: .text is a property, not a method)
(4) find(): return the first matching tag
- soup.find('a')
- soup.find('a', title='xxx')
(5) find_all(): return all matching tags as a list
- soup.find_all('a')
- soup.find_all(['a', 'b']) # find all a tags and all b tags
- soup.find_all('a', limit=2) # only the first two matches
(6) Select by CSS selector (see the sketch after this list)
select() returns a list: soup.select('#id')
- tag, class, id, and hierarchy selectors are supported
- hierarchy selectors:
- direct child: div > p > a > span
- descendant at any depth: div p
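A minimal, self-contained demo of these lookups on an inline HTML snippet (the markup is made up for illustration):

from bs4 import BeautifulSoup

html = """
<div id="main">
    <p class="story">
        <a href="https://example.com/1" title="xxx">first</a>
        <a href="https://example.com/2" class="du">second</a>
    </p>
</div>
"""
soup = BeautifulSoup(html, "lxml")

print(soup.a)                         # first a tag
print(soup.a.attrs)                   # {'href': 'https://example.com/1', 'title': 'xxx'}
print(soup.a["href"])                 # https://example.com/1
print(soup.a.string)                  # first
print(soup.find("a", title="xxx"))    # first a tag again
print(soup.find_all("a", limit=2))    # both a tags, as a list
print(soup.select("#main .story a"))  # hierarchy selector, returns a list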
2.1 Scraping Romance of the Three Kingdoms
import requests
from bs4 import BeautifulSoup

home_url = "http://www.shicimingju.com/book/sanguoyanyi.html"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36"
}
page_text = requests.get(url=home_url, headers=headers).text
soup = BeautifulSoup(page_text, 'lxml')
# each chapter link sits inside the .book-mulu catalogue
a_list = soup.select(".book-mulu a")
f = open("sanguo.txt", "w", encoding="utf-8")
for a_lst in a_list[0:1]:  # only the first chapter here; drop the slice for the whole book
    title = a_lst.string
    detail_url = "http://www.shicimingju.com" + a_lst["href"]
    detail_data = requests.get(url=detail_url, headers=headers).text
    soup2 = BeautifulSoup(detail_data, "lxml")
    # join all paragraphs; .string on a single <p> would miss the rest of the chapter
    desc = "\n".join(p.text for p in soup2.select(".chapter_content p"))
    f.write(title + "\n" + desc + "\n")  # persist the chapter
f.close()
3. xpath parsing (most commonly used)
How xpath works
Install: pip3 install lxml
Import: from lxml import etree
Principle:
(1) Instantiate an etree object and load the page source to be parsed into it
(2) Call the object's xpath method with an xpath expression to locate tags and extract data
Usage:
(1) Local file: tree = etree.parse(local_file)
tree.xpath('/html/body//div[@class=""]')
(2) Network data: tree = etree.HTML(page_string)
tree.xpath('//div[@class=""]')
Common xpath expressions (see the demo after this list):
/ : locate from the root node, one level per slash
// : locate from any position, across any number of levels
Attribute matching:
tag[@attrName=""] # e.g. div[@class=""]
Hierarchy & index matching:
# indexing starts at 1, not 0
tree.xpath('//div[@class=""]/ul/li[2]')
Logical operators:
tree.xpath('//a[@href="" and @class="du"]') # a tags with an empty href and class "du"
Fuzzy matching:
//div[contains(@class,"ng")]
//div[starts-with(@class,"ta")]
Getting text:
//div[@class="abc"]/p[1]/text() # only the node's direct text
//div[@class="abc"]/p[1]//text() # all text under the node
Getting attributes:
//div[@class="abc"]//li[2]/a/@href
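A self-contained demo of these expressions against a made-up snippet (every tag, class, and href here is invented for illustration):

from lxml import etree

html = """
<div class="abc">
    <p>hello <b>xpath</b></p>
    <ul>
        <li><a href="/first">one</a></li>
        <li><a href="/second" class="du">two</a></li>
    </ul>
</div>
"""
tree = etree.HTML(html)

print(tree.xpath('//div[@class="abc"]/ul/li[2]'))        # index starts at 1: the second li
print(tree.xpath('//a[starts-with(@href,"/f")]'))        # fuzzy match on an attribute
print(tree.xpath('//div[@class="abc"]/p[1]/text()'))     # ['hello '] - direct text only
print(tree.xpath('//div[@class="abc"]/p[1]//text()'))    # ['hello ', 'xpath'] - all text
print(tree.xpath('//div[@class="abc"]//li[2]/a/@href'))  # ['/second']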
3.1 Scraping second-hand housing listings (58.com)
import requests
from lxml import etree
import re

home_url = "https://sz.58.com/ershoufang/"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36"
}
page_text = requests.get(url=home_url, headers=headers).text
tree = etree.HTML(page_text)
# one li per listing
page_lst = tree.xpath('//ul[@class="house-list-wrap"]/li')
print(page_lst)
all_data = list()
f = open('./58二手房.txt', "w", encoding="utf-8")
for li in page_lst:
    title = li.xpath('./div[2]/h2/a/text()')[0]
    price1 = li.xpath('./div[3]/p/b/text()')
    price2 = li.xpath('./div[3]/p/text()')
    price = price1[0] + price2[0] + "(" + price2[1] + ")"
    deti_url = li.xpath('./div[2]/h2/a/@href')[0]
    detail_text = requests.get(url=deti_url, headers=headers).text
    detail_tree = etree.HTML(detail_text)  # a separate tree, so the list-page tree stays intact
    desc = "".join(detail_tree.xpath('//div[@id="generalSituation"]/div//text()'))
    desc = desc.replace("\n", "").replace(" ", "")
    # splice the list-page price into the detail description;
    # the pattern matches the Chinese labels on the page, so it must stay in Chinese
    pa = re.compile("房屋总价(.*?)房屋户型")
    desc2 = re.sub(pa, "房屋总价" + price + "房屋户型", desc)
    dic = {
        "title": title,
        "price": price,
        "desc": desc2
    }
    all_data.append(dic)  # keep a structured copy alongside the text file
    f.write(title + ": " + desc2 + "\n")
    print("{} written".format(title))
f.close()
3.2 Scraping second-hand housing listings (Leyoujia)
import requests
from lxml import etree

home_url = "https://shenzhen.leyoujia.com/esf/"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36"
}
page_text = requests.get(url=home_url, headers=headers).text
tree = etree.HTML(page_text)
page_lst = tree.xpath('//div[@class="list-box"]/ul/li')
f = open("./乐有家二手房.txt", "w", encoding="utf-8")
for li in page_lst:
    title = li.xpath('./div[2]/p/a/text()')
    if title:  # some li elements are ads with no title link
        title = title[0]
        detail_url = "https://shenzhen.leyoujia.com" + li.xpath('./div[2]/p/a/@href')[0]
        detail_text = requests.get(url=detail_url, headers=headers).text
        detail_tree = etree.HTML(detail_text)
        desc = " ".join(detail_tree.xpath('//div[@class="xq-box xq-box2"]/div/p/span/text()'))
        f.write(title + ": " + desc + "\n")
        print("{} written ... ".format(title))
f.close()
print("Done!")
3.3 Parsing all city names
import requests
from lxml import etree

home_url = "https://www.aqistudy.cn/historydata/"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36"
}
page_text = requests.get(url=home_url, headers=headers).text
tree = etree.HTML(page_text)
# hot cities live in div.hot; all cities are grouped by initial letter in div.all
hot_data = ",".join(tree.xpath('//div[@class="hot"]/div/ul/li/a/text()'))
all_ul_data = tree.xpath('//div[@class="all"]/div/ul')
f = open("./城市列表.txt", "w", encoding="utf-8")
f.write("热门城市:" + hot_data + "\n\n")
for ul in all_ul_data:
    title = ul.xpath('./div[1]/b/text()')[0]  # the initial-letter heading
    cities = ",".join(ul.xpath('./div[2]/li/a/text()'))
    f.write(title + " " + cities + "\n")
    print("{} written ... ".format(title))
f.close()
print("Done!")
3.4 Scraping Tieba comments
import requests
from lxml import etree
import re
import os

home_url = "https://tieba.baidu.com/p/6428562248"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36"
}
html_text = requests.get(url=home_url, headers=headers).text
tree = etree.HTML(html_text)
contents = tree.xpath('//div[@class="d_post_content j_d_post_content "]')
# replies come from a separate JSON endpoint; pass the query string via params
ans_url = "https://tieba.baidu.com/p/totalComment"
params = {
    "t": "1578396061786",
    "tid": "6428562248",
    "fid": "280050",
    "pn": "1",
    "see_lz": "0"
}
comment_list = requests.get(url=ans_url, params=params, headers=headers).json()["data"]["comment_list"]
# top-level posts
content_all = str()
for div in contents:
    msg_top = " ".join(div.xpath('./text()')).strip()
    if msg_top:
        content_all += msg_top + "\n"
        detail_id = div.xpath('./@id')[0][13:]  # strip the 13-char "post_content_" prefix to get the post id
        if comment_list.get(detail_id):
            comment_data = comment_list[detail_id]
            content_all += "回复:" + "\n"
            for comm in comment_data["comment_info"]:
                username = comm["username"]
                content = comm["content"]
                con_all = "    " + username + " : " + content
                content_all += con_all + "\n"
        content_all += "---------------------------------\n"
# strip leftover HTML tags from the collected text
pa = re.compile(r"<.*?>")
content_all = pa.sub("", content_all)
title = tree.xpath('//div[@id="j_core_title_wrap"]/h3/text()')[0]
os.makedirs("./贴吧/LOL", exist_ok=True)  # the target directory must exist before writing
file_name = "./贴吧/LOL/{}.txt".format(title)
f = open(file_name, "w", encoding="utf-8")
f.write(content_all)
f.close()
print("Download finished!")
3.5 Scraping videos
import requests
from lxml import etree
import re
import os

home_url = "https://www.pearvideo.com/category_31"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36"
}
page_text = requests.get(url=home_url, headers=headers).text
tree = etree.HTML(page_text)
li_list = tree.xpath('//ul[@id="listvideoListUl"]/li')
os.makedirs("./梨视频", exist_ok=True)  # the target directory must exist before writing
for li in li_list:
    title = li.xpath('./div/a/div[2]/text()')[0]
    # the video id is embedded in the thumbnail's style attribute, e.g. "cont-12345-"
    detail_url = li.xpath('./div/a/div[1]/div/div/@style')[0]
    pa = re.compile(r"cont-(.*?)-")
    video_url = "https://www.pearvideo.com/video_" + pa.findall(detail_url)[0]
    video_text = requests.get(url=video_url, headers=headers).text
    # pull the real mp4 address out of the detail page's inline script
    pa = re.compile(r"https://video\.pearvideo\.com/.*?\.mp4")
    video_url = pa.findall(video_text)[0]
    video_local = "./梨视频/{}.mp4".format(title)
    f = open(video_local, "wb")
    video_data = requests.get(url=video_url, headers=headers).content
    f.write(video_data)
    f.close()
    print("{} downloaded!".format(title))