Web Scraping in Practice 01: Scraping the Maoyan Movies Top 100 Chart


# Requirement: scrape the name, release date, rating, poster image and other details of the Maoyan Top 100 movies, and save the extracted results to a file.
import requests
import time
from lxml import etree
import csv

class MaoYanTop100Spider:
    def __init__(self):
        # URLs of the movie detail pages
        self.film_page_url_list = []
        # one info dict per movie
        self.film_info_list = []

    # 1. Fetch the list-page data
    def Top100_list(self, session, headers):
        # 1.1 send a request to each list page
        # e.g. https://maoyan.com/board/4?offset=20
        # (1) fixed part of the URL
        base_url = "https://maoyan.com/board/4"
        # (2) varying part: the offset runs 0, 10, ..., 90 (10 films per page)
        for i in range(0, 91, 10):
            # (3) build the final URL for this page
            final_url = base_url + "?offset=" + str(i)
            # (4) throttle, then send the request
            time.sleep(5)
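            # note: maoyan.com throttles frequent requests; without the pause above it may answer with a verification page instead of the board HTML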
            response = session.get(url=final_url, headers=headers)
            # 1.2 parse the list page
            film_list_page_data = response.content.decode("utf-8")
            # 1.2.1 extract the data with XPath
            # (1) parse the HTML string into an lxml element tree so XPath queries can be run on it
            xpath_data = etree.HTML(film_list_page_data)
            # (2) the ten films on a page sit in dd[1] ... dd[10] under the board list;
            #     the loop below builds each field's XPath from the index
            for xpath_num in range(1, 11):
                # film title
                film_name = xpath_data.xpath(f'//dl[@class="board-wrapper"]/dd[{xpath_num}]/a/@title')[0]

                # release date (the raw text carries a "上映時間:" prefix, sliced off below)
                film_time = xpath_data.xpath(f'//dl[@class="board-wrapper"]/dd[{xpath_num}]/div/div/div[1]/p[3]/text()')[0][5:].strip()

                # leading actors (the raw text carries a "主演:" prefix, sliced off below)
                film_actors = xpath_data.xpath(f'//dl[@class="board-wrapper"]/dd[{xpath_num}]/div/div/div[1]/p[2]/text()')[0].strip()[3:]

                # rating: the integer part and the fractional part live in two separate <i> tags
                score_int = xpath_data.xpath(f'//dl[@class="board-wrapper"]/dd[{xpath_num}]/div/div/div[2]/p/i[1]/text()')[0]
                score_fraction = xpath_data.xpath(f'//dl[@class="board-wrapper"]/dd[{xpath_num}]/div/div/div[2]/p/i[2]/text()')[0]
                film_score = score_int + score_fraction
                # poster image (the real URL is lazy-loaded, so it sits in data-src on the second <img>)
                film_img = xpath_data.xpath(f'//dl[@class="board-wrapper"]/dd[{xpath_num}]/a/img[2]/@data-src')[0]

                # relative URL of the film's detail page
                film_url = xpath_data.xpath(f'//dl[@class="board-wrapper"]/dd[{xpath_num}]/div/div/div[1]/p[1]/a/@href')[0]

                # assemble the record for this film
                film_info = {
                    "name": film_name,
                    "time": film_time,
                    "actors": film_actors,
                    "score": film_score,
                    "img": film_img,
                    "url": film_url,
                }
                self.film_info_list.append(film_info)
                # remember the detail-page URL for step 2
                self.film_page_url_list.append(film_url)





    # 2. Fetch the detail-page data
    def film_page(self, url, session, headers, num):
        # 2.1 send a request to the detail page (the list page only gives a relative path, e.g. /films/<id>)
        base_url = "https://maoyan.com"
        final_url = base_url + url
        print(final_url)
        time.sleep(3)
        response = session.get(url=final_url, headers=headers)
        data = response.content.decode("utf-8")
        # 2.2 parse the detail page: the synopsis sits in <span class="dra">
        xpath_data = etree.HTML(data)
        film_summary = xpath_data.xpath('//span[@class="dra"]/text()')[0].strip()
        self.film_info_list[num]["summary"] = film_summary

    # 3. Save the collected data to a CSV file
    def save_data(self):
        # 1. create the CSV file (newline="" is required by the csv module to avoid blank rows on Windows)
        csv_fp = open("maoyan.csv", "w", encoding="utf-8", newline="")
        # 2. build the header row and the data rows
        # 2.1 header: the keys of the first record
        title_list = self.film_info_list[0].keys()
        # 2.2 rows: the values of every record
        excel_data = []
        for data in self.film_info_list:
            excel_data.append(data.values())
        # 3. write them out with a csv writer
        # 3.1 create the writer
        csv_writer = csv.writer(csv_fp)
        # 3.2 write the header, then the rows
        csv_writer.writerow(title_list)
        csv_writer.writerows(excel_data)
        # 4. close the file
        csv_fp.close()

    # Run the whole pipeline
    def run(self):
        # 0. create a session so cookies persist across requests
        session = requests.Session()
        # 0.1 request headers
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36",
        }
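        # a browser-like User-Agent helps the requests pass as ordinary browser traffic rather than a script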
        # 1. fetch the list pages
        self.Top100_list(session=session, headers=headers)
        # 2. fetch each detail page and attach the film's summary
        for i, film_page_url in enumerate(self.film_page_url_list):
            self.film_page(url=film_page_url, session=session, headers=headers, num=i)
            print(self.film_info_list[i])

        # 3. save everything to a CSV file
        self.save_data()



if __name__ == "__main__":
    MaoYanTop100Spider().run()
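A note on save_data: the header row comes from the keys of the first record and each data row from dict values, so every record must keep the same keys in the same order. csv.DictWriter makes that pairing explicit by matching values to columns by key. Below is a minimal alternative sketch, assuming the same maoyan.csv file name and the film_info_list structure built above; the function name save_with_dictwriter is just for illustration.

import csv

def save_with_dictwriter(film_info_list, path="maoyan.csv"):
    # columns mirror the keys assembled in Top100_list plus the summary added in film_page
    fieldnames = ["name", "time", "actors", "score", "img", "url", "summary"]
    # newline="" is what the csv module expects; otherwise extra blank rows appear on Windows
    with open(path, "w", encoding="utf-8", newline="") as fp:
        writer = csv.DictWriter(fp, fieldnames=fieldnames, restval="")
        writer.writeheader()
        # DictWriter looks each value up by key, so a record missing "summary"
        # gets an empty cell (restval) instead of shifting the other columns
        writer.writerows(film_info_list)

Called as save_with_dictwriter(self.film_info_list) in place of self.save_data(), it writes the same columns in a fixed order.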

 

