爬蟲（2）-電影天堂2022精品電影

本文轉載自查看原文 2022-03-19 12:20 29766 python/ python爬蟲筆記

1.爬取2022精品電影名和下載鏈接（種子）

# -*- coding: utf-8 -*-
"""
@Time    :  2022/3/18 17:21
@Author  : Andrew
@File    : 電影天堂.py
"""
import csv
import requests
import re
"""
    1.第一步還是先定位目標2022新片所在位置
    2.從中提取到子頁面的鏈接地址
    3.請求該鏈接地址，拿到我們的下載鏈接
    4.查看返回的網頁源代碼時，發現亂碼，
        由於request.get()后會自動進行一次utf-8解碼，
        但這只對用utf-8編碼的網頁有效，因此需要查看源
        代碼中meta標簽的charset檢查源代碼的編碼格式
    5.在設計正則表達式時，要注意：
        不能有多余的空格；
        注意看原來的代碼里的引號結構，定位的源代碼屬性值是用'',那么在就r""，反之r''
    6.提取頁面的子鏈接，就找它的跳轉url，a標簽就找href
    7.關於finditer的返回值迭代器：
        it里面：
            match為匹配的字符串，通過it.group("name")獲取，（?P<name>.*?）
            span為該字符串從始末位置，是個元組，通過it.span()獲取
        it.groupdict()將會把it.group("name")的值和“name”組成字典
    8.面對爬取到的頁面，當作字符串處理就行了，不用考慮html結構上的問題，
        截取哪里就從哪里開始，哪里結束就到哪里，哪怕取到問號呢，不要受到干擾
        比如下面代碼中取href時，取出從href="到"的內容: r'href="(?P<url>.*?)"'
"""


# Scrape the "2022 new releases" section of the site's index page, follow each
# listed detail page, and write (movie name, download link) rows to
# data_movie.csv.

# Patterns are compiled once up front. re.S lets "." span newlines, because
# the HTML fragments being matched cover several lines.
# obj1: captures the table that follows the "2022 new releases" banner row.
obj1 = re.compile(r'2022新片精品.*?手機瀏覽,推薦下載本站app,綠色小巧,簡單實用！詳情請點擊！.*?</tr>(?P<table>.*?)</table>', re.S)
# obj2: captures the detail-page href that follows the category link in a row.
obj2 = re.compile(r"最新電影下載</a>]<a href='(?P<href>.*?)'>", re.S)
# obj3: captures the translated title and the download link on a detail page.
obj3 = re.compile(r'◎譯　　名(?P<movieName>.*?)<br />.*?<a target="_blank" href="(?P<downLoad>.*?)"', re.S)

# CSV header cells, in column order.
title = {'title': "電影名字", 'download': "下載鏈接"}
# Index page of the site.
domain = "https://m.dytt8.net/index2.htm"

# The with-statement guarantees the CSV is flushed and closed even if a request
# or a regex step raises part-way through. newline="" stops the csv module's
# own line endings from being translated again (which shows up as blank rows
# in Excel on Windows).
with open("data_movie.csv", mode="w", encoding="utf-8", newline="") as f:
    csvWriter = csv.writer(f)
    csvWriter.writerow(title.values())  # header row

    # NOTE(review): verify=False turns off TLS certificate verification for
    # this request; kept as in the original script, but confirm it is really
    # needed for this host.
    with requests.get(domain, verify=False) as resp:
        # The page is not UTF-8, so the encoding is set explicitly before
        # reading .text; otherwise the decoded text is garbled.
        resp.encoding = 'gb2312'
        content1 = resp.text

    # Hrefs on the index page are site-relative; strip the page name once to
    # get the prefix they are appended to.
    base_url = domain.replace("/index2.htm", "")

    # Collect the absolute URL of every detail page in the matched section.
    childDomainList = [
        base_url + itt.group('href')
        for it in obj1.finditer(content1)
        for itt in obj2.finditer(it.group('table'))
    ]

    # Visit each detail page and record every (name, link) pair found there.
    for href in childDomainList:
        # Each child response is closed as soon as its text has been read so
        # connections are not left open for the rest of the crawl.
        with requests.get(href, verify=False) as respChild:
            respChild.encoding = 'gb2312'
            content3 = respChild.text
        for i in obj3.finditer(content3):
            print(i.group('movieName'))
            print(i.group('downLoad'))
            # groupdict() preserves group order: movieName, then downLoad.
            dic = i.groupdict()
            csvWriter.writerow(dic.values())

2.用BeautifulSoup爬取2022精品電影名和子鏈接

# -*- coding: utf-8 -*-
"""
@Time    :  2022/3/19 14:15
@Author  : Andrew
@File    : bs4解析.py
"""
import csv
import re

import requests
from bs4 import BeautifulSoup

"""
    bs4基於html（HyperText Markup Language）的標簽和屬性來進行解析的
    <標簽 屬性=屬性值>內容</標簽>
"""
# Parse the site's index page with BeautifulSoup and write one
# (movie name, detail-page URL) row per listed movie to movieName.csv.
# BeautifulSoup comes from the bs4 package (pip install bs4).

# CSV header cells, in column order.
title = {'movieName': "電影名字", 'childurl': "子鏈接"}
url = "https://m.dytt8.net/index2.htm"
# Pulls the href value out of a serialized <a> tag.
obj = re.compile(r'href="(?P<href>.*?)"', re.S)

# The with-statement guarantees the CSV is flushed and closed even if the
# request or the parsing raises. newline="" prevents the blank rows Excel
# otherwise shows between records.
with open("movieName.csv", mode="w", encoding="utf-8", newline="") as f:
    csvWriter = csv.writer(f)
    csvWriter.writerow(title.values())  # header row

    with requests.get(url) as resp:
        # The page is not UTF-8, so set the encoding before reading .text.
        resp.encoding = "gb2312"
        # Hand the page source to BeautifulSoup, using the built-in HTML parser.
        page = BeautifulSoup(resp.text, "html.parser")

    # find() returns the first tag matching the name and attributes (or None);
    # find_all() returns every match. The table could alternatively be
    # selected by its class with class_="co_content8".
    table = page.find("table", attrs={"border": "0", "cellpadding": "0", "cellspacing": "0", "width": "100%"})
    if table is None:
        # Fail with a clear message instead of an AttributeError on None.
        raise RuntimeError("movie table not found on index page: " + url)

    # Skip the first row, which is an app advertisement rather than a movie.
    trs = table.find_all('tr')[1:]
    # Hrefs on the page are site-relative; compute the prefix once.
    base_url = url.replace("/index2.htm", "")

    for tr in trs:
        td = tr.find_all("td", attrs={"width": "85%", "height": "22", "class": "inddline"})
        if not td:
            # Row without the expected title cell: nothing to record.
            continue
        # The first <a> in the cell is the category link, not the movie, so
        # only the anchors after it are considered.
        a = td[0].find_all("a")[1:]
        if not a:
            # Cell has no movie link after the category link.
            continue
        # Detail-page URL; stays empty if the anchor carries no href.
        childurl = ""
        for it in obj.finditer(str(a[0])):
            childurl = base_url + it.group("href")
        movies = {"movieName": a[0].text, "childurl": childurl}
        csvWriter.writerow(movies.values())

免責聲明！

本站轉載的文章為個人學習借鑒使用，本站對版權不負任何法律責任。如果侵犯了您的隱私權益，請聯系本站郵箱yoyou2525@163.com刪除。

猜您在找 電影天堂爬蟲實戰電影天堂爬蟲爬蟲爬取電影天堂電影鏈接 python爬蟲——抓取電影天堂電影信息 Python爬蟲 -- 抓取電影天堂8分以上電影 Python多線程爬蟲爬取電影天堂資源電影天堂電影鏈接爬取 21天打造分布式爬蟲-豆瓣電影和電影天堂實戰（三） scrapy--dytt(電影天堂) LOL電影天堂下載攻略