python爬取某站磁力鏈

本文轉載自查看原文 2019-04-13 19:17 1436 前端和爬蟲

不同磁力鏈網站網頁內容都不同，需要定制

1，並發爬取

並發爬取後，好像一會就被封了

import requests
from lxml import etree
import re
from concurrent.futures import ThreadPoolExecutor


def get_mlink(url, headers):
    """Fetch a detail page and return the magnet link found on it.

    Args:
        url: Address of the page that holds the magnet link of one result.
        headers: HTTP headers sent with the request.

    Returns:
        The first text node inside ``<textarea id="magnetLink">``, or
        ``None`` when no such text can be extracted from the page.
    """
    r = requests.get(url, headers=headers)
    select = etree.HTML(r.text)
    try:
        magnetlink = select.xpath('//textarea[@id="magnetLink"]//text()')
        return magnetlink[0]
    except (AttributeError, IndexError):
        # AttributeError: `select` has no xpath() (presumably a None tree).
        # IndexError: the query matched nothing, so the result list is empty.
        return None


def get_page_mlinks(url, headers):
    """Collect the result rows of one search-result page.

    Downloads ``url``, selects every ``<div class="row">`` and maps each row,
    through a thread pool, to a ``(detail url, size, date, magnet link)``
    tuple.

    Args:
        url: Address of one page of search results.
        headers: HTTP headers sent with every request.

    Returns:
        The iterator produced by ``executor.map``. A row for which an
        IndexError was raised yields ``None`` instead of a tuple.
    """
    size_query = './/div[@class="col-sm-2 col-lg-1 hidden-xs text-right size"]//text()'
    date_query = './/div[@class="col-sm-2 col-lg-2 hidden-xs text-right date"]//text()'

    def row_to_record(row):
        sizes = row.xpath(size_query)
        dates = row.xpath(date_query)
        links = row.xpath('.//a/@href')
        try:
            detail_url = links[0]
            return detail_url, sizes[0], dates[0], get_mlink(detail_url, headers)
        except IndexError:
            return None

    response = requests.get(url, headers=headers)
    rows = etree.HTML(response.text).xpath('//div[@class="row"]')

    # Resolve the magnet links of all rows on this page concurrently.
    with ThreadPoolExecutor() as pool:
        return pool.map(row_to_record, rows)


def get_urls(baseurl, headers, suffix=None):
    """Recursively collect the page suffixes of all search-result pages.

    Args:
        baseurl: URL of the search (its first results page).
        headers: HTTP headers sent with every request.
        suffix: Optional path appended to ``baseurl`` to choose the page
            whose pagination bar is read; passed by the recursive calls.

    Returns:
        A list of ``/search/<keyword>/page/<n>`` suffixes in discovery order
        (duplicates are possible). The list is empty when the page offers no
        usable numbered pagination links.
    """
    url = baseurl + suffix if suffix else baseurl

    r = requests.get(url, headers=headers)
    select = etree.HTML(r.text)
    hrefs = select.xpath('//ul[@class="pagination pagination-lg"]'
                         '//li//a[@name="numbar"]/@href')

    # The site sometimes returns /search/.../search/...search/.../page, so
    # keep only the well-formed "/search/<keyword>/page/<n>" part of a link.
    # Links without such a part are skipped instead of crashing on None.
    pattern = re.compile(r'/search/[^/]+/page/\d+(?=\D|$)')
    matches = (pattern.search(href) for href in hrefs)
    page_suffixes = [m.group() for m in matches if m]

    if not page_suffixes:
        # No pagination bar (e.g. a single page of results): nothing to follow.
        return page_suffixes

    # Open the last numbered page; if it links to a next page, recurse from
    # there to pick up the following batch of page numbers.
    r = requests.get(url + page_suffixes[-1], headers=headers)
    select = etree.HTML(r.text)
    next_page = select.xpath('//ul[@class="pagination pagination-lg"]'
                             '//li//a[@name="nextpage"]/@href')
    if next_page:
        page_suffixes = page_suffixes + get_urls(baseurl, headers, next_page[0])

    return page_suffixes


if __name__ == '__main__':
    # Crawl every result page of a fixed keyword and print each row found.
    keyword = "金剛狼3"
    baseurl = 'https://btsow.club/search/{}'.format(keyword)  # the site takes the keyword via GET
    headers = {"Accept-Language": "en-US,en;q=0.8,zh-TW;q=0.6,zh;q=0.4"}

    urls = get_urls(baseurl, headers)
    # Drop duplicate suffixes while keeping their first-seen order.
    new_urls = [baseurl + i for i in dict.fromkeys(urls)]

    with ThreadPoolExecutor() as executor:
        # map() stops at the shortest iterable, so supply one headers entry
        # per URL; a fixed-length list would silently cap the pages crawled.
        res = executor.map(get_page_mlinks, new_urls, [headers] * len(new_urls))

    for r in res:
        for i in r:
            print(i)

2，逐頁爬取

手工輸入關鍵詞和頁數

超過網站已有頁數時，返回None

爬取單個搜索頁中所有磁力鏈時，仍然用的是並發

import requests
from lxml import etree
from concurrent.futures import ThreadPoolExecutor


def get_mlink(url, headers):
    """Fetch a detail page and return the magnet link found on it.

    Args:
        url: Address of the page that holds the magnet link of one result.
        headers: HTTP headers sent with the request.

    Returns:
        The first text node inside ``<textarea id="magnetLink">``, or
        ``None`` when no such text can be extracted from the page.
    """
    r = requests.get(url, headers=headers)
    select = etree.HTML(r.text)
    try:
        magnetlink = select.xpath('//textarea[@id="magnetLink"]//text()')
        return magnetlink[0]
    except (AttributeError, IndexError):
        # AttributeError: `select` has no xpath() (presumably a None tree).
        # IndexError: the query matched nothing, so the result list is empty.
        return None


def get_page_mlinks(url, headers):
    """Collect the result rows of one search-result page.

    Downloads ``url``, selects every ``<div class="row">`` and maps each row,
    through a thread pool, to a ``(detail url, size, date, magnet link)``
    tuple.

    Args:
        url: Address of one page of search results.
        headers: HTTP headers sent with every request.

    Returns:
        The iterator produced by ``executor.map``. A row for which an
        IndexError was raised yields ``None`` instead of a tuple.
    """
    size_query = './/div[@class="col-sm-2 col-lg-1 hidden-xs text-right size"]//text()'
    date_query = './/div[@class="col-sm-2 col-lg-2 hidden-xs text-right date"]//text()'

    def row_to_record(row):
        sizes = row.xpath(size_query)
        dates = row.xpath(date_query)
        links = row.xpath('.//a/@href')
        try:
            detail_url = links[0]
            return detail_url, sizes[0], dates[0], get_mlink(detail_url, headers)
        except IndexError:
            return None

    response = requests.get(url, headers=headers)
    rows = etree.HTML(response.text).xpath('//div[@class="row"]')

    # Resolve the magnet links of all rows on this page concurrently.
    with ThreadPoolExecutor() as pool:
        return pool.map(row_to_record, rows)


if __name__ == '__main__':
    # Ask for the search keyword and for the results page to crawl.
    keyword = input('請輸入查找關鍵詞>> ')
    page = input('請輸入查找頁>> ')

    headers = {"Accept-Language": "en-US,en;q=0.8,zh-TW;q=0.6,zh;q=0.4"}
    url = 'https://btsow.club/search/{}/page/{}'.format(keyword, page)

    # Print every row found on that page, one per line.
    for record in get_page_mlinks(url, headers):
        print(record)

3，先輸入影片，再選擇下載哪個磁力鏈

import requests
from lxml import etree


def get_mlink(url, headers):
    """Fetch a detail page and return the magnet link found on it.

    Args:
        url: Address of the page that holds the magnet link of one result.
        headers: HTTP headers sent with the request.

    Returns:
        The first text node inside ``<textarea id="magnetLink">``, or
        ``None`` when no such text can be extracted from the page.
    """
    r = requests.get(url, headers=headers)
    select = etree.HTML(r.text)
    try:
        magnetlink = select.xpath('//textarea[@id="magnetLink"]//text()')
        return magnetlink[0]
    except (AttributeError, IndexError):
        # AttributeError: `select` has no xpath() (presumably a None tree).
        # IndexError: the query matched nothing, so the result list is empty.
        return None


def get_row(row):
    """Extract ``(href, size, date, title)`` from one search-result row.

    Args:
        row: Object exposing ``xpath(expression)``; the first item of each
            query result is used.

    Returns:
        A 4-tuple of the first href, size, date and title values, or
        ``None`` when any of the four queries has no first item.
    """
    queries = (
        './/a/@href',
        './/div[@class="col-sm-2 col-lg-1 hidden-xs text-right size"]//text()',
        './/div[@class="col-sm-2 col-lg-2 hidden-xs text-right date"]//text()',
        './/a/@title',
    )
    try:
        return tuple(row.xpath(query)[0] for query in queries)
    except IndexError:
        return None


if __name__ == '__main__':
    headers = {"Accept-Language": "en-US,en;q=0.8,zh-TW;q=0.6,zh;q=0.4"}

    # Interactive loop: search a keyword, list the results, then print the
    # magnet link of the entry the user picks. Typing 'quit' ends the loop.
    while True:
        keyword = input('請輸入查找關鍵詞>> ')
        if keyword == 'quit':
            break
        url = 'https://btsow.club/search/{}'.format(keyword)
        r = requests.get(url, headers=headers)
        print(r.status_code)

        select = etree.HTML(r.text)
        # Parse every row once and keep only those that gave a full record.
        parsed_rows = (get_row(row) for row in select.xpath('//div[@class="row"]'))
        div_rows = [row for row in parsed_rows if row]
        if not div_rows:
            continue
        for index, row in enumerate(div_rows):
            print(index, row[2], row[1], row[3])

        # Choose which entry to download.
        choice = input('請選擇下載項>> ')
        try:  # anything that is not a number goes back to the keyword prompt
            choice = int(choice)
        except ValueError:
            continue
        if not 0 <= choice < len(div_rows):
            # A number outside the listed indices is treated like invalid
            # input instead of raising IndexError (or silently wrapping
            # around for negative values).
            continue
        download_url = div_rows[choice][0]
        mlink = get_mlink(download_url, headers)
        print(r.status_code)  # NOTE: this is still the status of the search request
        print(mlink)
        print('\n\n')

執行效果：

4，補充下lxml的使用

            <div class="item" data-houseid="*****"> 
           *************************************************************
           </div>


           <div class="item" data-houseid="107102426781">
            <a class="img" href="https://sh.lianjia.com/ershoufang/107102426781.html" target="_blank" data-bl="list" data-log_index="5" data-housecode="107102426781" data-is_focus=""  data-el="ershoufang">
                <img class="lj-lazy" src="https://s1.ljcdn.com/feroot/pc/asset/img/blank.gif?_v=20200428212347" data-original="https://image1.ljcdn.com/110000-inspection/pc1_JZKtMEOU3_1.jpg.296x216.jpg.437x300.jpg">
                <div class="btn-follow follow" data-hid="107102426781"><span class="star"></span><span class="follow-text">關注</span></div>
                <div class="leftArrow"><span></span></div>
                <div class="rightArrow"><span></span></div><div class="price"><span>375</span>萬</div>
            </a>
            <a class="title" href="https://sh.lianjia.com/ershoufang/107102426781.html" target="_blank" data-bl="list" data-log_index="5" data-housecode="107102426781" data-is_focus=""  data-el="ershoufang">臨河位置，全明戶型帶邊窗，滿五年唯一，拎包入住</a>
            <div class="info">
                御橋
                <span>/</span>
                2室1廳
                <span>/</span>
                50.11平米
                <span>/</span>
                南
                <span>/</span>
                精裝
            </div>
            <div class="tag"><span class="subway">近地鐵</span><span class="vr">VR房源</span></div>
        </div>

       <div class="tag"><span class="subway">近地鐵</span><span class="vr">VR房源</span></div> </div> 

       <div class="item" data-houseid="*****">
        ************************************************************* 
       </div>

要獲取所有房源title，價格，朝向，裝修情況等，可以：

# All listings on the page: every <div> whose class attribute is "item".
elements = select.xpath('//div[@class="item"]')

for element in elements:
    # Paths are relative to the current listing <div>; [0] takes the first
    # match. Text nodes keep the whitespace of the markup, so strip them.
    title = element.xpath('a[@class="title"]/text()')[0].strip()
    price = element.xpath('a[@class="img"]/div[@class="price"]/span/text()')[0].strip()

    # The info <div> holds text nodes separated by <span>/</span>. Strip
    # each one and drop whitespace-only nodes before unpacking.
    info = [text.strip() for text in element.xpath('div[@class="info"]/text()')]
    info = [text for text in info if text]
    # Five fields are expected; the first one is not printed.
    _, scale, size, orient, deco = info

    print(title, price, scale, size, orient, deco)

輸入某小區的結果：

中間樓層+精裝保養好+滿兩年+雙軌交匯+誠意出售 385 2室1廳 62.7平米 南 精裝
南北通風，戶型方正，樓層佳位置佳，11/18號線雙軌 368 2室1廳 52.49平米 南 精裝
一手動遷 業主置換 急售 雙南采光佳 看房方便 370 2室1廳 62.7平米 南 簡裝
臨河位置，全明戶型帶邊窗，滿五年唯一，拎包入住 375 2室1廳 50.11平米 南 精裝
一手動遷，稅費少，樓層采光好，精裝修。 388 2室1廳 62.7平米 南 精裝
南北通兩房 近地鐵  拎包入住  業主誠意出售 508 2室2廳 90.88平米 南 其他

免責聲明！

本站轉載的文章為個人學習借鑒使用，本站對版權不負任何法律責任。如果侵犯了您的隱私權益，請聯系本站郵箱yoyou2525@163.com刪除。

猜您在找 開始第一個自己的python爬蟲程序爬磁力鏈 Python 爬取b站專欄圖片 Python爬取b站視頻簡單python爬蟲練習 E站本爬取用Python爬E站本 scrapy框架下爬取老司機網站獲取磁力鏈接 Java爬蟲——B站彈幕爬取 python3爬蟲-爬取B站排行榜信息 Python爬取b站任意up主所有視頻彈幕 Python爬蟲入門教程05：B站視頻彈幕的爬取