requests+BeautifulSoup | 爬取電影天堂全站電影資源

本文轉載自查看原文 2019-12-28 23:43 681 python

import requests
import urllib.request as ur
from bs4 import BeautifulSoup
import csv
import threading
class MovieHeven():
    """Crawl the dytt8.net movie listing and save (title, link) rows to a CSV file.

    Starting from the listing index page, every page is fetched, each movie
    entry is written to ``movies.csv`` and echoed to stdout, and the pager's
    "next page" link is followed until no such link is found.

    Attributes:
        url: URL of the listing page that will be fetched next.
        page: 1-based number of the listing page being crawled.
        No: 1-based running number of the next movie entry.
        fobj: Open text handle of the output CSV file.
    """

    # Prefix prepended to the site-relative detail hrefs found in the listing.
    _DETAIL_BASE = "http://www.dytt8.net"
    # Directory that the pager's relative hrefs are appended to.
    _LIST_BASE = "https://www.dytt8.net/html/gndy/dyzz/"
    # Accepted labels of the "next page" link. The Simplified form is what a
    # GBK-encoded page is expected to contain (TODO confirm against the live
    # site); the Traditional form is the one the original code compared with.
    _NEXT_PAGE_LABELS = ("下一页", "下一頁")
    # Seconds to wait for the server before giving up on a page.
    _TIMEOUT = 10

    def __init__(self):
        """Set the crawl start state and open ``movies.csv`` for writing."""
        self.url = "https://www.dytt8.net/html/gndy/dyzz/index.html"
        self.page = 1
        self.No = 1
        # errors="replace": requests substitutes U+FFFD for undecodable bytes,
        # and GBK cannot encode that character; without this a single bad
        # title would raise UnicodeEncodeError and end the crawl.
        self.fobj = open("movies.csv", "wt", encoding="gbk",
                         errors="replace", newline='')

    @staticmethod
    def _is_next_page_label(text):
        """Return True if *text* is the label of the pager's "next page" link."""
        return text.strip() in MovieHeven._NEXT_PAGE_LABELS

    @staticmethod
    def _format_elapsed(seconds):
        """Return the elapsed-time message for *seconds*, rounded to whole seconds."""
        minutes, secs = divmod(round(seconds), 60)
        return "用時：{}min{}s".format(minutes, secs)

    def _crawl_page(self, writer):
        """Fetch ``self.url``, write its movie rows, and return the next page URL.

        Args:
            writer: ``csv.writer`` that receives one ``[name, href]`` row per
                movie entry.

        Returns:
            The URL of the following listing page, or ``None`` when the page
            has no "next page" link.

        Raises:
            requests.RequestException: On network failure, timeout or an HTTP
                error status.
            ValueError: If the expected listing container is missing.
        """
        response = requests.get(self.url, timeout=self._TIMEOUT)
        response.raise_for_status()
        # Force the decoding instead of relying on the response headers.
        response.encoding = "gbk"
        root = BeautifulSoup(response.text, "lxml")

        content = root.find("div", attrs={"class": "co_content8"})
        listing = content.find("ul") if content is not None else None
        if listing is None:
            raise ValueError("movie listing not found in {}".format(self.url))

        for table in listing.find_all("table"):
            link = table.find("a")
            if link is None or not link.get("href"):
                # An entry without a usable link has nothing to record.
                continue
            name = link.text
            href = self._DETAIL_BASE + link["href"]
            writer.writerow([name, href])
            print("No:", self.No, name, href)
            self.No += 1

        pager = content.find("div", attrs={"class": "x"})
        if pager is None:
            return None
        for anchor in pager.find_all("a"):
            if self._is_next_page_label(anchor.text) and anchor.get("href"):
                return self._LIST_BASE + anchor["href"]
        return None

    def spider(self):
        """Crawl from ``self.url`` through every following listing page.

        ``self.url``, ``self.page`` and ``self.No`` are advanced as the crawl
        progresses. Any error while processing a page is printed and ends the
        crawl; rows already written are kept.
        """
        # One writer for the whole crawl instead of one per row.
        writer = csv.writer(self.fobj)
        # Iterate rather than recurse so stack depth does not grow with the
        # number of listing pages.
        while True:
            print("正在爬取第{}頁...".format(self.page))
            try:
                next_url = self._crawl_page(writer)
            except Exception as err:
                # Best-effort boundary: report the problem and stop crawling.
                print(err)
                return
            if next_url is None:
                return
            self.url = next_url
            print(self.url)
            self.page += 1

    def main(self):
        """Run the crawl, close the output file, and print the elapsed time."""
        import time
        started = time.monotonic()
        try:
            self.spider()
        finally:
            # Close the CSV even if the crawl is interrupted (e.g. Ctrl-C).
            self.fobj.close()
        print(self._format_elapsed(time.monotonic() - started))

# Script entry point: build the crawler and run it only when this file is
# executed directly, not when it is imported.
if __name__ == "__main__":
    crawler = MovieHeven()
    crawler.main()

免責聲明！

本站轉載的文章僅供個人學習借鑒使用，本站對版權不負任何法律責任。如果侵犯了您的隱私權益，請聯繫本站郵箱 yoyou2525@163.com 刪除。

猜您在找：爬蟲爬取電影天堂電影鏈接 | 電影天堂電影鏈接爬取 | Python多線程爬蟲爬取電影天堂資源 | python利用requests和threading模塊，實現多線程爬取電影天堂最新電影信息。 | 「Videos」- 爬取電影天堂 @20210218 | 電影天堂爬取詳情頁 | python3 爬取電影天堂最新電影 | Python爬取電影天堂指定電視劇或者電影 | python3爬蟲-6.使用requests和BeautifulSoup爬取豆瓣Top250電影