前言
本篇繼續學習 lxml.etree:在線訪問接口,從接口返回的 HTML 中解析出想要的文本內容。
環境準備:
python3.7
lxml
requests
定位目標
爬取我的博客首頁 https://www.cnblogs.com/canglongdao/ 側邊欄的個人基本信息。
打開fiddler抓包,刷新我的博客首頁。抓取到的接口地址如下圖。
找到該接口地址:https://www.cnblogs.com/canglongdao/ajax/news.aspx

# coding:utf-8
# Fetch the blog sidebar fragment and print the raw markup of its profile block.
import requests
from lxml import etree
import urllib3

# Certificate verification is turned off for the request below, so silence
# the warnings urllib3 would otherwise emit.
urllib3.disable_warnings()

url = "https://www.cnblogs.com/canglongdao/ajax/news.aspx"
# NOTE(review): verify=False skips TLS certificate checks - presumably needed
# because the traffic goes through the Fiddler capture proxy; confirm before
# reusing this outside a debugging setup.
# The timeout keeps the script from blocking forever if the server never answers.
r = requests.get(url, verify=False, timeout=10)
r.raise_for_status()  # fail loudly on 4xx/5xx instead of parsing an error page
# print(r.text)  # uncomment to inspect the raw response

a = etree.HTML(r.content.decode("utf-8"))
b = a.xpath("//*[@id='profile_block']")
if not b:
    # xpath() returns an empty list when nothing matches; b[0] would raise IndexError.
    raise SystemExit("no element with id='profile_block' in the response")

# Serialize the matched element back to markup and print it.
# A separate name is used so the response object `r` is not overwritten.
html_bytes = etree.tostring(b[0], encoding="utf-8", pretty_print=True)
print(html_bytes.decode("utf-8"))
運行結果

提取內容
# coding:utf-8
# Fetch the blog sidebar fragment and print each profile label next to its value.
import requests
from lxml import etree
import urllib3

# Certificate verification is turned off for the request below, so silence
# the warnings urllib3 would otherwise emit.
urllib3.disable_warnings()

url = "https://www.cnblogs.com/canglongdao/ajax/news.aspx"
# NOTE(review): verify=False skips TLS certificate checks - confirm this is
# only used in a debugging setup.
# The timeout keeps the script from blocking forever if the server never answers.
r = requests.get(url, verify=False, timeout=10)
r.raise_for_status()  # fail loudly on 4xx/5xx instead of parsing an error page
# print(r.text)  # uncomment to inspect the raw response

a = etree.HTML(r.content.decode("utf-8"))
b = a.xpath("//*[@id='profile_block']")
if not b:
    # xpath() returns an empty list when nothing matches; b[0] would raise IndexError.
    raise SystemExit("no element with id='profile_block' in the response")

t0 = b[0].xpath("text()")  # text nodes that are direct children of the block
print(t0)
# In the sample response the direct text nodes alternate between a label and
# whitespace-only filler, so taking every second entry keeps the labels.
t00 = t0[::2]
print(t00)
t1 = b[0].xpath('a')  # the <a> children, which hold the values
print(t1)

# Pair each label with its <a> element; zip() stops at the shorter list,
# which drops the trailing filler entry left in t00.
for i, j in zip(t00, t1):
    ii = i.replace('\n', '').replace(' ', '')  # strip newlines and spaces from the label
    # j.text is None when the <a> has no leading text; fall back to '' so
    # .replace() cannot raise AttributeError.
    jj = (j.text or '').replace('\n', '').replace(' ', '')
    print(ii, jj)
運行結果
['\n 昵稱:\n ', '\n ', '\n 園齡:\n ', '\n ', '\n 粉絲:\n ', '\n ', '\n 關注:\n ', '\n ', '\n ', '\n ']
['\n 昵稱:\n ', '\n 園齡:\n ', '\n 粉絲:\n ', '\n 關注:\n ', '\n ']
[<Element a at 0x163596090c8>, <Element a at 0x16359609048>, <Element a at 0x1635961ba08>, <Element a at 0x1635961b248>]
昵稱: 星空6
園齡: 1年7個月
粉絲: 8
關注: 3
總結
1.獲取當前節點標籤名稱:.tag
print(b[0].tag)
div
2.獲取當前節點文本(第一個子節點之前的文本):.text
print(b[0].text)
昵稱:
3.獲取當前節點元素全部屬性dict
print(b[0].attrib)
{'id': 'profile_block'}
4.獲取當前節點某個屬性
print(b[0].get("id"))
profile_block
5.遍歷當前節點及其所有後代節點(含當前節點自身)
# Walk the subtree rooted at b[0] and print the leading text of every element
# visited (None for elements that have no leading text).
for i in b[0].iter():
    print(i.text)
昵稱:
星空6
None
1年7個月
None
8
None
3
None
getFollowStatus('fe2d40f4-c531-49cf-1c8d-08d666411c36');
6.獲取當前節點的直接文本節點(不包含子節點內的文本)
print(b[0].xpath('text()'))
['\n 昵稱:\n ', '\n ', '\n 園齡:\n ', '\n ', '\n 粉絲:\n ', '\n ', '\n 關注:\n ', '\n ', '\n ', '\n ']
7.獲取本節點和子節點所有文本信息
print(b[0].xpath('.//text()'))
['\n 昵稱:\n ', '\n 星空6\n ', '\n ', '\n 園齡:\n ', '\n 1年7個月\n ', '\n ', '\n 粉絲:\n ', '\n 8\n ', '\n ', '\n 關注:\n ', '\n 3\n ', '\n ', '\n ', "getFollowStatus('fe2d40f4-c531-49cf-1c8d-08d666411c36');", '\n ']
8.獲取父節點
print(b[0].getparent().tag)
div
# coding:utf-8
# Demonstrate the element accessors from the summary on the profile block.
import requests
from lxml import etree
import urllib3

# Certificate verification is turned off for the request below, so silence
# the warnings urllib3 would otherwise emit.
urllib3.disable_warnings()

url = "https://www.cnblogs.com/canglongdao/ajax/news.aspx"
# NOTE(review): verify=False skips TLS certificate checks - confirm this is
# only used in a debugging setup.
# The timeout keeps the script from blocking forever if the server never answers.
r = requests.get(url, verify=False, timeout=10)
r.raise_for_status()  # fail loudly on 4xx/5xx instead of parsing an error page
# print(r.text)  # uncomment to inspect the raw response

a = etree.HTML(r.content.decode("utf-8"))
b = a.xpath("//*[@id='profile_block']")
if not b:
    # xpath() returns an empty list when nothing matches; b[0] would raise IndexError.
    raise SystemExit("no element with id='profile_block' in the response")

print(b[0].tag)  # element name; div for the sample response
print(b[0].text)  # text that precedes the first child element
print(b[0].attrib)  # all attributes; {'id': 'profile_block'} for the sample response
print(b[0].get("id"))  # a single attribute value; profile_block here

# Visit b[0] itself and every descendant element, printing each leading text.
for i in b[0].iter():
    print(i.text)

print(b[0].xpath('text()'))  # text nodes that are direct children only
print(b[0].xpath('.//text()'))  # text nodes of the element and all descendants
print(b[0].getparent().tag)  # tag name of the parent element
