requests+BeautifulSoup | 爬取電影天堂全站電影資源


import requests
import urllib.request as ur
from bs4 import BeautifulSoup
import csv
import threading
class MovieHeven():
    """Crawl a paginated movie listing and save every title/link pair as CSV.

    Starting from ``url``, each listing page is downloaded, every movie entry
    (one ``<table>`` inside ``div.co_content8 > ul``) is written as a
    ``[name, href]`` row, and the "next page" link is followed until no such
    link is found or a page fails to load or parse.

    Attributes:
        url: URL of the page that will be fetched next.
        page: 1-based number of the page currently being crawled.
        No: 1-based running number of the next movie entry.
        fobj: Open text file the CSV rows are written to.
    """

    # Seconds to wait on the server; without a timeout a stalled connection
    # would block the crawl forever.
    REQUEST_TIMEOUT = 30

    # Anchor texts treated as the "next page" link.
    # NOTE(review): the original code only matched the Traditional Chinese
    # label; the live site presumably serves the Simplified form, so both are
    # accepted -- confirm against the real page.
    NEXT_PAGE_LABELS = ("下一頁", "下一页")

    def __init__(self, url="https://www.dytt8.net/html/gndy/dyzz/index.html",
                 output_path="movies.csv"):
        """Prepare the crawler state and open the output file.

        Args:
            url: First listing page to crawl. Detail links are always
                prefixed with ``http://www.dytt8.net``, so this should be a
                listing page of that site.
            output_path: Path of the CSV file to create (truncated if it
                already exists).
        """
        self.url = url
        self.page = 1
        self.No = 1
        # errors="replace": a single title that GBK cannot encode must not
        # abort the whole crawl.
        self.fobj = open(output_path, "wt", encoding="gbk",
                         errors="replace", newline='')

    def spider(self):
        """Crawl listing pages one after another, starting at ``self.url``.

        Pages are processed in a loop rather than by recursion, so the number
        of pages is not bounded by the interpreter's recursion limit. Any
        error while fetching or parsing a page is printed and ends the crawl;
        rows already written are kept.
        """
        writer = csv.writer(self.fobj)
        while True:
            try:
                print("正在爬取第{}頁...".format(self.page))
                # Download the page and decode it as GBK.
                response = requests.get(self.url, timeout=self.REQUEST_TIMEOUT)
                response.encoding = "gbk"
                root = BeautifulSoup(response.text, "lxml")
                # Every movie entry is a <table> inside the content list.
                content = root.find("div", attrs={"class": "co_content8"})
                for table in content.find("ul").find_all("table"):
                    link = table.find("a")
                    if link is None or not link.get("href"):
                        # Entry without a usable link: skip it instead of
                        # aborting the whole crawl.
                        continue
                    name = link.text
                    href = "http://www.dytt8.net" + link["href"]
                    writer.writerow([name, href])
                    print("No:", self.No, name, href)
                    self.No += 1
                next_href = self._find_next_href(content)
            except Exception as err:
                # Best-effort crawl: report the problem and stop.
                print(err)
                return
            if next_href is None:
                # No "next page" link: the last page has been processed.
                return
            # Pager links are relative to the directory of the current page.
            self.url = self.url.rsplit("/", 1)[0] + "/" + next_href
            print(self.url)
            self.page += 1

    def _find_next_href(self, content):
        """Return the href of the "next page" link in *content*, or None."""
        pager = content.find("div", attrs={"class": "x"})
        if pager is None:
            return None
        for anchor in pager.find_all("a"):
            if anchor.text in self.NEXT_PAGE_LABELS and anchor.get("href"):
                return anchor["href"]
        return None

    def main(self):
        """Run the crawl, close the output file and print the elapsed time."""
        import time
        begin_time = time.time()
        try:
            self.spider()
        finally:
            # Close the file even if the crawl is interrupted (e.g. Ctrl-C).
            self.fobj.close()
        # Use a distinct name so the ``time`` module is not shadowed.
        elapsed = time.time() - begin_time
        m, s = divmod(round(elapsed), 60)
        print("用時:{}min{}s".format(m, s))

if __name__ == "__main__":
    # Script entry point: build the crawler and run a full crawl.
    MovieHeven().main()

  


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯繫本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM