import requests
import urllib.request as ur
from bs4 import BeautifulSoup
import csv
import threading
class MovieHeven():
    """Crawl the dytt8.net movie listing and save each title/link pair to a CSV file.

    Attributes:
        url: Address of the listing page that will be fetched next.
        page: 1-based number of the listing page currently being crawled.
        No: 1-based running counter of the rows written so far.
        fobj: Text handle of ``movies.csv`` (GBK encoded), opened on construction
            and closed by :meth:`main`.
    """

    # Directory that the pager's relative links are appended to.
    _LIST_BASE = "https://www.dytt8.net/html/gndy/dyzz/"
    # Prefix prepended to the href of every movie link.
    _DETAIL_BASE = "http://www.dytt8.net"
    # Link texts treated as "next page". The traditional spelling is what the
    # original comparison used.
    # NOTE(review): the page is decoded as GBK, so the live pager presumably uses
    # the simplified spelling, which is accepted as well -- confirm against the
    # real markup.
    _NEXT_LABELS = ("下一頁", "下一页")
    # Seconds to wait for the server before giving up on a request.
    _TIMEOUT = 30

    def __init__(self) -> None:
        """Set the start page and counters and open ``movies.csv`` for writing."""
        self.url = "https://www.dytt8.net/html/gndy/dyzz/index.html"
        self.page = 1
        self.No = 1
        # newline='' lets the csv module control the line endings itself.
        self.fobj = open("movies.csv", "wt", encoding="gbk", newline='')

    def spider(self) -> None:
        """Crawl listing pages, starting at ``self.url``, until no next page is found.

        Each movie link on a page is written to ``self.fobj`` as a
        ``[name, href]`` row and echoed to stdout. ``self.url``, ``self.page``
        and ``self.No`` are advanced as the crawl progresses.

        Any ``Exception`` (network failure, unexpected markup, encoding error)
        is printed and ends the crawl; it is not propagated to the caller.
        """
        try:
            # One writer serves the whole crawl; it keeps no per-row state.
            writer = csv.writer(self.fobj)
            # Pages are followed in a loop rather than by recursion so the call
            # stack does not grow with the number of pages.
            while True:
                print("正在爬取第{}頁...".format(self.page))
                # The timeout prevents a stalled connection from blocking forever.
                response = requests.get(self.url, timeout=self._TIMEOUT)
                response.encoding = "gbk"
                root = BeautifulSoup(response.text, "lxml")

                # Container that holds both the movie tables and the pager.
                content = root.find("div", attrs={"class": "co_content8"})
                for table in content.find("ul").find_all("table"):
                    link = table.find("a")
                    name = link.text
                    href = self._DETAIL_BASE + link["href"]
                    writer.writerow([name, href])
                    print("No:", self.No, name, href)
                    self.No += 1

                # Look for the "next page" link among the pager anchors.
                pager_links = content.find("div", attrs={"class": "x"}).find_all("a")
                next_href = next(
                    (a["href"] for a in pager_links if a.text in self._NEXT_LABELS),
                    None,
                )
                if next_href is None:
                    # No next-page link: the last page has been processed.
                    break
                self.url = self._LIST_BASE + next_href
                print(self.url)
                self.page += 1
        except Exception as err:
            # Best effort: report the problem and keep whatever was written so far.
            print(err)

    def main(self) -> None:
        """Run the crawl, close the output file and print the elapsed time."""
        import time

        begin_time = time.perf_counter()
        try:
            self.spider()
        finally:
            # Close the file even if the crawl is interrupted (e.g. Ctrl-C).
            self.fobj.close()
        elapsed = time.perf_counter() - begin_time
        m, s = divmod(round(elapsed), 60)
        print("用時:{}min{}s".format(m, s))
if __name__ == '__main__':
    # Script entry point: build the crawler and run it once.
    MovieHeven().main()