Every magnet-link site lays out its pages differently, so the scraper has to be tailored to each one.
1. Concurrent crawling
After switching to concurrent crawling, the site seemed to ban me within moments (a throttling sketch follows the script below).
import requests
from lxml import etree
import re
from concurrent.futures import ThreadPoolExecutor


def get_mlink(url, headers):
    """Given a torrent's detail page, return the magnet link on that page."""
    r = requests.get(url, headers=headers)
    select = etree.HTML(r.text)
    try:
        magnetlink = select.xpath('//textarea[@id="magnetLink"]//text()')
        return magnetlink[0]
    except (AttributeError, IndexError):  # empty response or no magnet link on the page
        return None


def get_page_mlinks(url, headers):
    """Given one page of search results, return a tuple (url, size, date, magnet link) per entry."""
    r = requests.get(url, headers=headers)
    select = etree.HTML(r.text)
    div_rows = select.xpath('//div[@class="row"]')

    def get_each(se):
        size = se.xpath('.//div[@class="col-sm-2 col-lg-1 hidden-xs text-right size"]//text()')
        date = se.xpath('.//div[@class="col-sm-2 col-lg-2 hidden-xs text-right date"]//text()')
        href = se.xpath('.//a/@href')
        try:
            return href[0], size[0], date[0], get_mlink(href[0], headers)
        except IndexError:  # rows that are not result entries
            pass

    with ThreadPoolExecutor() as executor:  # fetch all magnet links on this page concurrently
        res = executor.map(get_each, div_rows)
    return res


def get_urls(baseurl, headers, suffix=None):
    """Given the search page, recursively collect the URL suffixes of every result page."""
    if suffix:
        url = baseurl + suffix
    else:
        url = baseurl
    r = requests.get(url, headers=headers)
    select = etree.HTML(r.text)
    page_suffixes = select.xpath('//ul[@class="pagination pagination-lg"]'
                                 '//li//a[@name="numbar"]/@href')
    # the site sometimes returns /search/.../search/.../page/..., so pull out the clean suffix
    p = r'/search/[^/]+/page/\d+(?=\D|$)'
    page_suffixes = [re.search(p, i).group() for i in page_suffixes]
    # if there is a next page, recurse to collect the remaining pages as well
    r = requests.get(url + page_suffixes[-1], headers=headers)
    select = etree.HTML(r.text)
    next_page = select.xpath('//ul[@class="pagination pagination-lg"]'
                             '//li//a[@name="nextpage"]/@href')
    if next_page:
        page_suffixes = page_suffixes + get_urls(baseurl, headers, next_page[0])
    return page_suffixes


if __name__ == '__main__':
    keyword = "金剛狼3"
    baseurl = 'https://btsow.club/search/{}'.format(keyword)  # the site submits the search keyword via GET
    headers = {"Accept-Language": "en-US,en;q=0.8,zh-TW;q=0.6,zh;q=0.4"}
    urls = get_urls(baseurl, headers)
    new_urls = list(set(urls))      # deduplicate...
    new_urls.sort(key=urls.index)   # ...while preserving the original order
    new_urls = [baseurl + i for i in new_urls]
    with ThreadPoolExecutor() as executor:
        res = executor.map(get_page_mlinks, new_urls, [headers] * len(new_urls))
    for r in res:
        for i in r:
            print(i)
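Since the fully concurrent version above seems to get banned almost immediately, an obvious mitigation is to throttle it: use far fewer workers and pause before each page request. The sketch below is only an illustration, not part of the original script; get_page_mlinks_politely and the delay value are made up here, and it reuses get_page_mlinks, new_urls and headers from the script above. It only spaces out the result-page requests; the per-page magnet-link fetches inside get_page_mlinks remain concurrent.

import time
from concurrent.futures import ThreadPoolExecutor

def get_page_mlinks_politely(url, headers, delay=2):
    time.sleep(delay)                       # crude rate limiting before each page request
    return get_page_mlinks(url, headers)    # then crawl the page as before

with ThreadPoolExecutor(max_workers=2) as executor:  # cap the number of parallel page requests
    res = executor.map(get_page_mlinks_politely, new_urls, [headers] * len(new_urls))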
2. Crawling page by page
The keyword and the page number are entered by hand.
If the page number is beyond what the site actually has, only None comes back (a small check for this is sketched after the script).
Fetching all the magnet links within a single results page is still done concurrently.
import requests
from lxml import etree
from concurrent.futures import ThreadPoolExecutor


def get_mlink(url, headers):
    """Given a torrent's detail page, return the magnet link on that page."""
    r = requests.get(url, headers=headers)
    select = etree.HTML(r.text)
    try:
        magnetlink = select.xpath('//textarea[@id="magnetLink"]//text()')
        return magnetlink[0]
    except (AttributeError, IndexError):  # empty response or no magnet link on the page
        return None


def get_page_mlinks(url, headers):
    """Given one page of search results, return a tuple (url, size, date, magnet link) per entry."""
    r = requests.get(url, headers=headers)
    select = etree.HTML(r.text)
    div_rows = select.xpath('//div[@class="row"]')

    def get_each(se):
        size = se.xpath('.//div[@class="col-sm-2 col-lg-1 hidden-xs text-right size"]//text()')
        date = se.xpath('.//div[@class="col-sm-2 col-lg-2 hidden-xs text-right date"]//text()')
        href = se.xpath('.//a/@href')
        try:
            return href[0], size[0], date[0], get_mlink(href[0], headers)
        except IndexError:  # rows that are not result entries
            pass

    with ThreadPoolExecutor() as executor:  # fetch all magnet links on this page concurrently
        res = executor.map(get_each, div_rows)
    return res


if __name__ == '__main__':
    keyword = input('Enter a search keyword >> ')
    page = input('Enter a page number >> ')
    url = 'https://btsow.club/search/{}/page/{}'.format(keyword, page)
    headers = {"Accept-Language": "en-US,en;q=0.8,zh-TW;q=0.6,zh;q=0.4"}
    r = get_page_mlinks(url, headers)
    for i in r:
        print(i)
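As noted above, asking for a page past the last one the site has just yields None entries. A minimal check, reusing url and headers from the script above, might look like this:

results = [x for x in get_page_mlinks(url, headers) if x]  # keep only real entries; a missing page yields None
if not results:
    print('No results: the page number is probably past the last page.')
for i in results:
    print(i)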
3. Enter a title first, then choose which magnet link to download
import requests
from lxml import etree


def get_mlink(url, headers):
    """Given a torrent's detail page, return the magnet link on that page."""
    r = requests.get(url, headers=headers)
    select = etree.HTML(r.text)
    try:
        magnetlink = select.xpath('//textarea[@id="magnetLink"]//text()')
        return magnetlink[0]
    except (AttributeError, IndexError):  # empty response or no magnet link on the page
        return None


def get_row(row):
    size = row.xpath('.//div[@class="col-sm-2 col-lg-1 hidden-xs text-right size"]//text()')
    date = row.xpath('.//div[@class="col-sm-2 col-lg-2 hidden-xs text-right date"]//text()')
    href = row.xpath('.//a/@href')
    title = row.xpath('.//a/@title')
    try:
        return href[0], size[0], date[0], title[0]
    except IndexError:  # rows that are not result entries
        pass


if __name__ == '__main__':
    headers = {"Accept-Language": "en-US,en;q=0.8,zh-TW;q=0.6,zh;q=0.4"}
    while True:
        keyword = input('Enter a search keyword >> ')
        if keyword == 'quit':
            break
        url = 'https://btsow.club/search/{}'.format(keyword)
        r = requests.get(url, headers=headers)
        print(r.status_code)
        select = etree.HTML(r.text)
        div_rows = select.xpath('//div[@class="row"]')
        div_rows = [get_row(row) for row in div_rows]
        div_rows = [row for row in div_rows if row]  # drop rows that are not result entries
        if not div_rows:
            continue
        for index, row in enumerate(div_rows):
            print(index, row[2], row[1], row[3])
        # choose which entry to download
        choice = input('Pick an entry to download >> ')
        try:  # anything that is not a number, or out of range, goes back to keyword input
            choice = int(choice)
            download_url = div_rows[choice][0]
        except (ValueError, IndexError):
            continue
        mlink = get_mlink(download_url, headers)
        print(r.status_code)
        print(mlink)
        print('\n\n')
Sample run: (screenshot omitted)
4. A quick supplement on using lxml
Take this listing markup (from a sh.lianjia.com second-hand housing search page) as an example:
<div class="item" data-houseid="*****">
*************************************************************
</div>
<div class="item" data-houseid="107102426781">
<a class="img" href="https://sh.lianjia.com/ershoufang/107102426781.html" target="_blank" data-bl="list" data-log_index="5" data-housecode="107102426781" data-is_focus="" data-el="ershoufang">
<img class="lj-lazy" src="https://s1.ljcdn.com/feroot/pc/asset/img/blank.gif?_v=20200428212347" data-original="https://image1.ljcdn.com/110000-inspection/pc1_JZKtMEOU3_1.jpg.296x216.jpg.437x300.jpg">
<div class="btn-follow follow" data-hid="107102426781"><span class="star"></span><span class="follow-text">關注</span></div>
<div class="leftArrow"><span></span></div>
<div class="rightArrow"><span></span></div><div class="price"><span>375</span>萬</div>
</a>
<a class="title" href="https://sh.lianjia.com/ershoufang/107102426781.html" target="_blank" data-bl="list" data-log_index="5" data-housecode="107102426781" data-is_focus="" data-el="ershoufang">臨河位置,全明戶型帶邊窗,滿五年唯一,拎包入住</a>
<div class="info">
御橋
<span>/</span>
2室1廳
<span>/</span>
50.11平米
<span>/</span>
南
<span>/</span>
精裝
</div>
<div class="tag"><span class="subway">近地鐵</span><span class="vr">VR房源</span></div>
</div>
<div class="tag"><span class="subway">近地鐵</span><span class="vr">VR房源</span></div> </div>
<div class="item" data-houseid="*****">
*************************************************************
</div>
To pull out every listing's title, price, orientation, renovation status and so on:
elements = select.xpath('//div[@class="item"]')  # one element per listing: every div with class="item"; select = etree.HTML(page_source), as in the scripts above
for element in elements:
    title = element.xpath('a[@class="title"]/text()')[0]  # the a tag with class="title" under this item div
    price = element.xpath('a[@class="img"]/div[@class="price"]/span/text()')[0]
    _, scale, size, orient, deco = element.xpath('div[@class="info"]/text()')  # text nodes of the info div; the first is the neighbourhood
    print(title, price, scale, size, orient, deco)
Output for one residential complex:
中間樓層+精裝保養好+滿兩年+雙軌交匯+誠意出售 385 2室1廳 62.7平米 南 精裝
南北通風,戶型方正,樓層佳位置佳,11/18號線雙軌 368 2室1廳 52.49平米 南 精裝
一手動遷 業主置換 急售 雙南采光佳 看房方便 370 2室1廳 62.7平米 南 簡裝
臨河位置,全明戶型帶邊窗,滿五年唯一,拎包入住 375 2室1廳 50.11平米 南 精裝
一手動遷,稅費少,樓層采光好,精裝修。 388 2室1廳 62.7平米 南 精裝
南北通兩房 近地鐵 拎包入住 業主誠意出售 508 2室2廳 90.88平米 南 其他
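One point worth stressing: the expressions inside the loop are relative, so 'a[@class="title"]' or './/a' only search inside the current item div, whereas an expression starting with '//' is evaluated from the document root even when called on an element, and therefore matches the whole page. A tiny illustration, reusing the select object assumed above:

first_item = select.xpath('//div[@class="item"]')[0]
print(len(first_item.xpath('.//a/@href')))  # links inside this one listing only
print(len(first_item.xpath('//a/@href')))   # starts at the document root: links from the whole page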
