Python爬蟲lxml解析實戰


XPath常用規則
/                            從當前節點選取直接子節點
//                           從當前節點選取子孫節點
.                            選取當前節點
..                           選取當前節點的父節點
@                          選取屬性
*                           通配符,選取所有元素節點
@*                        選取所有屬性
[@attrib]               選取具有給定屬性的所有元素
[@attrib='value']    選取給定屬性具有給定值的所有元素
[tag]                     選取所有具有指定元素的直接子節點
[tag='text']            選取所有具有指定元素並且其文本內容為text的節點
"""爬取豆瓣網站的信息"""
import requests
from lxml import etree

# 請求頭設置
# Request headers.
# BUG FIX: the key was misspelled "User-Agentv", so no real User-Agent
# header was ever sent and the site could serve a bot-detection page.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3554.0 Safari/537.36",
    "Referer": "https://movie.douban.com/",
}

url = "https://movie.douban.com/cinema/nowplaying/chongqing/"
# Fetch the "now playing in Chongqing" page.
rep = requests.get(url, headers=headers)
text = rep.text
# Parse the raw HTML into an lxml element tree.
html = etree.HTML(text)
# The first <ul class="lists"> holds the now-playing movie entries.
ul = html.xpath("//ul[@class='lists']")[0]
# Each direct <li> child of that <ul> is one movie.
lis = ul.xpath("./li")
# Every movie's metadata is exposed as data-* attributes on its <li>;
# read them straight into one dict per movie.
movies = [
    {
        "title": li.xpath("@data-title")[0],
        "score": li.xpath("@data-score")[0],
        "region": li.xpath("@data-region")[0],
        "actors": li.xpath("@data-actors")[0],
        "director": li.xpath("@data-director")[0],
        # All <img src> values inside the entry (poster thumbnails).
        "liimg": li.xpath(".//img/@src"),
    }
    for li in lis
]
print(movies)
View Code

電影天堂

import requests
from lxml import etree

# Site root; listing pages only carry relative detail-page links,
# so this is prepended to build absolute URLs.
BASE_DOMAIN = "http://www.ygdy8.net"
# Desktop browser User-Agent so the site serves normal pages.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3554.0 Safari/537.36",
}


def get_detail_urls(url):
    """Fetch one listing page and lazily yield absolute detail-page URLs."""
    response = requests.get(url=url, headers=HEADERS)
    # The site is GBK-encoded and occasionally contains illegal bytes,
    # so decode with errors ignored to avoid a UnicodeDecodeError.
    page = response.content.decode("gbk", "ignore")
    tree = etree.HTML(page)
    # Every movie row lives inside a <table class="tbspan">; the <a href>
    # values there are relative paths.
    hrefs = tree.xpath("//table[@class='tbspan']//a/@href")
    # Prefix the domain to make each link absolute (lazy, like the
    # original map() call).
    return (BASE_DOMAIN + href for href in hrefs)


def parse_detail_page(url):
    """Fetch one movie detail page and extract its metadata.

    Returns a dict with the title, cover/poster image URLs, the
    "◎"-labelled fields (year, country, category, rating, duration,
    director), the actor list, the synopsis and the download links.
    """
    movie = {}
    res = requests.get(url, headers=HEADERS)
    # BUG FIX: decode with errors ignored — the site's GBK pages contain
    # illegal bytes (same issue handled in get_detail_urls), and the bare
    # decode("gbk") used here crashed on them.
    text = res.content.decode("gbk", "ignore")
    html = etree.HTML(text)
    title = html.xpath("//div[@class='title_all']//font[@color='#07519a']/text()")[0]
    movie["title"] = title
    zoomE = html.xpath("//div[@id='Zoom']")[0]
    # All images inside the Zoom block: first is the cover, second the poster.
    imgs = zoomE.xpath(".//img/@src")
    # Slicing (instead of indexing) avoids an IndexError when fewer than
    # two images exist; each value is a 0- or 1-element list.
    movie["cover"] = imgs[0:1]
    movie["poster"] = imgs[1:2]
    infos = zoomE.xpath(".//text()")

    def parse_info(info, rule):
        # Strip the "◎..." label prefix and surrounding whitespace.
        return info.replace(rule, "").strip()

    # One-line labels mapped to the dict key they fill.
    simple_fields = {
        "◎年  代": "year",
        "◎產  地": "country",
        "◎類  別": "category",
        "◎豆瓣評分": "douban_rating",
        "◎片  長": "duration",
        "◎導  演": "director",
    }
    for index, info in enumerate(infos):
        matched = False
        for label, key in simple_fields.items():
            if info.startswith(label):
                movie[key] = parse_info(info, label)
                matched = True
                break
        if matched:
            continue
        if info.startswith("◎主  演"):
            # The first actor shares the label line; the rest follow, one
            # per text node, until the next "◎標..." label.
            actors = [parse_info(info, "◎主  演")]
            for x in range(index + 1, len(infos)):
                actor = infos[x].strip()
                if actor.startswith("◎標"):
                    break
                actors.append(actor)
            # BUG FIX: assign after the loop — the assignment previously
            # sat inside the loop body, so a cast list with no follow-up
            # lines was silently dropped.
            movie["actors"] = actors
        elif info.startswith("◎簡  介"):
            # BUG FIX: collect every synopsis line — each line previously
            # overwrote movie["profile"], keeping only the final one.
            profile_lines = []
            for x in range(index + 1, len(infos)):
                line = infos[x].strip()
                if line.startswith("◎獲獎情況"):
                    break
                if line:
                    profile_lines.append(line)
            movie["profile"] = "\n".join(profile_lines)
    # Download links live in the yellow-background table cell.
    download_url = html.xpath("//td[@bgcolor='#fdfddf']/a/@href")
    movie["download_url"] = download_url
    return movie


def spider():
    """Crawl every listing page and print all collected movie dicts."""
    base_url = "http://www.ygdy8.net/html/gndy/dyzz/list_23_{}.html"
    movies = []
    # Pages 1..179 of the "latest movies" category.
    for page in range(1, 180):
        listing_url = base_url.format(page)
        # Each listing page yields the detail-page links to scrape.
        for detail_url in get_detail_urls(listing_url):
            movies.append(parse_detail_page(detail_url))
    print(movies)


if __name__ == '__main__':
    spider()
View Code

貓眼電影

"""貓眼電影爬取"""
import requests
from lxml import etree

# Site root used to turn relative film links into absolute URLs.
BASE_URL = "http://maoyan.com"
# Desktop browser User-Agent so the site serves normal pages.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3554.0 Safari/537.36"
}


def get_detail_urls(url):
    """Fetch one film-listing page and lazily yield absolute detail URLs."""
    response = requests.get(url=url, headers=HEADERS)
    tree = etree.HTML(response.text)
    # Each movie card's link is a relative path, e.g. "/films/1234".
    hrefs = tree.xpath("//dl//div[@class='movie-item']/a/@href")
    # Prefix the site root; stays lazy like the original map() call.
    return (BASE_URL + href for href in hrefs)


def parse_detail_page(url):
    """Fetch one Maoyan movie page and extract its details.

    Returns a dict with name, plot/genre, country, duration, release
    time, avatar image URLs, synopsis and a {user: comment} mapping.
    """
    movie = {}
    res = requests.get(url=url, headers=HEADERS)
    text = res.content.decode("utf-8")
    html = etree.HTML(text)
    name = html.xpath("//div[@class='movie-brief-container']/h3/text()")[0]
    movie["name"] = name
    lis = html.xpath("//div[@class='movie-brief-container']//li")
    # The first three <li> entries are: genre, "country / duration",
    # release date.  Use enumerate instead of range(len(lis)).
    for index, li in enumerate(lis):
        if index == 0:
            movie["plot"] = li.xpath("./text()")[0]
        elif index == 1:
            # Query and split once instead of four separate xpath calls.
            parts = li.xpath("./text()")[0].split()
            movie["country"] = parts[0]
            movie["duration"] = parts[1]
        elif index == 2:
            # Release date is sometimes absent; skip quietly as before,
            # but catch only the specific empty-result error.
            try:
                movie["release_time"] = li.xpath("./text()")[0]
            except IndexError:
                continue

    avatar = html.xpath("//div[@class='avatar-shadow']/img/@src")
    movie["avatar"] = avatar
    content = html.xpath("//div[@class='mod-content']/span/text()")[0]
    movie["content"] = content
    # BUG FIX: merge user comments across all comment <ul> lists — the old
    # code overwrote movie["user"] on every iteration, keeping only the
    # last list's comments.
    users = {}
    for ul in html.xpath("//div[@class='comment-list-container']/ul"):
        commenter_names = ul.xpath(".//span[@class='name']/text()")
        comment_texts = ul.xpath(".//div[@class='comment-content']/text()")
        users.update(zip(commenter_names, comment_texts))
    movie["user"] = users
    return movie


def spider():
    """Crawl the first two listing pages and print every movie found."""
    base_url = "http://maoyan.com/films?showType=1&offset={}"
    movies = []
    # offset=0 and offset=30 — two listing pages of 30 films each.
    for offset in range(0, 31, 30):
        listing_url = base_url.format(offset)
        # Resolve the listing into detail-page URLs, then scrape each one.
        for detail_url in get_detail_urls(listing_url):
            movie = parse_detail_page(detail_url)
            movies.append(movie)
            print(movie)
    print(movies)


if __name__ == '__main__':
    spider()
View Code

騰訊招聘網

"""爬取騰訊招聘網找工作"""
import requests
from lxml import etree

# Desktop User-Agent plus a Referer mimicking arrival from the search
# listing — some sites reject requests without a plausible Referer.
HEADERS = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3554.0 Safari/537.36",
           "Referer": "https://hr.tencent.com/position.php?keywords=python&lid=2218&tid=87&start=0"
           }
# Site root for resolving the relative posting links.
BASE_URL = "https://hr.tencent.com/"


def get_detail_urls(url):
    """Fetch one listing page and lazily yield absolute job-posting URLs."""
    response = requests.get(url=url, headers=HEADERS)
    tree = etree.HTML(response.text)
    # Posting links sit in the "l square" cell of each result row.
    hrefs = tree.xpath("//table//td[@class='l square']/a/@href")
    # Prefix the site root; stays lazy like the original map() call.
    return (BASE_URL + href for href in hrefs)


def get_parse_detail(url):
    """Fetch one job posting page and extract the position details.

    Returns a dict with position, location, category, recruit count,
    duties and requirements.
    """
    job_offers = {}
    res = requests.get(url=url, headers=HEADERS)
    html = etree.HTML(res.text)
    position = html.xpath("//table//td[@class='l2 bold size16']/text()")[0]
    job_offers["position"] = position
    tds = html.xpath("//table//tr[@class='c bottomline']/td/text()")
    # BUG FIX: the old loop ran range(len(tds)) times but assigned the same
    # three fixed indices on every pass — pure repetition, and it raised
    # IndexError whenever fewer than three cells were present.  zip()
    # pairs each key with its cell and stops at the shorter sequence.
    for key, value in zip(("location", "category", "recruits"), tds):
        job_offers[key] = value
    # The third <tr class="c..."> holds the duties, the fourth the
    # candidate requirements.
    duties = html.xpath("//tr[3][contains(@class, 'c')]//li/text()")
    job_offers["duties"] = duties
    claim = html.xpath("//tr[4][contains(@class, 'c')]//li/text()")
    job_offers["claim"] = claim
    return job_offers


def spider():
    """Walk every listing page (10 results each) and print each posting."""
    base_url = "https://hr.tencent.com/position.php?keywords=python&lid=2218&tid=87&start={}#a"
    squres = []
    # start=0,10,...,330 — one listing page per offset.
    for offset in range(0, 340, 10):
        listing_url = base_url.format(offset)
        # Resolve each listing into posting URLs, then scrape them.
        for detail_url in get_detail_urls(listing_url):
            posting = get_parse_detail(detail_url)
            squres.append(posting)
            print(posting)


if __name__ == '__main__':
    spider()
View Code

可參考博客鏈接(我就懶得寫了):http://www.cnblogs.com/zhangxinqi/p/9210211.html#_label11


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯系本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM