Using the scraped data, I also built a website of the latest movie information on Django.
Today I want to put what I've learned to work and scrape the latest movie listings from 電影天堂 (ygdy8.net). The modules used:
requests: fetches the pages
re: extracts the specific pieces of information from each page
BeautifulSoup: makes searching by tag easy, for pulling out the fields I want
threading: multiple threads cut the crawl time dramatically
queue: a thread-safe queue feeding the file writer (in practice I stored everything in a database instead)
That's about all the modules; a minimal sketch of how the fetching and parsing pieces fit together follows.
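Before the full script, here is a minimal sketch of the requests + BeautifulSoup + re flow on a single listing page. The URL pattern and the ulink class are taken from the full script below; error handling is left out, and this is only an illustration, not the final code:

import re
import requests
from bs4 import BeautifulSoup

header = {'User-Agent': 'Mozilla/5.0'}
page_url = 'http://www.ygdy8.net/html/gndy/dyzz/list_23_1.html'  # first listing page

res = requests.get(page_url, headers=header)
res.encoding = 'GBK'  # the site serves GBK-encoded pages
soup = BeautifulSoup(res.text, 'html.parser')
for a in soup.find_all('a', attrs={'class': 'ulink'}):  # each movie link
    m = re.search('《(.*?)》', a.string or '')  # titles are wrapped in 《》
    if m:
        print(m.group(1), 'http://www.ygdy8.net' + a['href'])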
Pointers from more experienced folks are very welcome.
# Author: 'n1celll'
import requests
import re
from bs4 import BeautifulSoup
import threading
import queue

header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'}

url = 'http://www.ygdy8.net/html/gndy/dyzz/index.html'

def get_page(url):
    index = requests.get(url, headers=header)
    index.encoding = 'GBK'  # match the page's own encoding
    index_soup = BeautifulSoup(index.text, 'html.parser')  # turn the HTML into a soup object
    # the last <option> of the page selector holds the total page count
    all_pages = index_soup.find('select', attrs={'name': 'sldd'}).find_all('option')[-1]
    return int(all_pages.string)

def get_data(page):
    page_url = 'http://www.ygdy8.net/html/gndy/dyzz/list_23_%s.html' % page  # one listing page
    print(page)
    res = requests.get(page_url, headers=header)
    res.encoding = 'GBK'  # 'gb2312'
    soup = BeautifulSoup(res.text, 'html.parser')
    names = soup.find_all('a', attrs={'class': 'ulink'})
    for i in names:
        try:
            movie_name = re.search('《(.*?)(》|】)', i.string).group()
            # two pitfalls here: one title does not use 《》 quotes, and one movie has two <a> tags
        except (AttributeError, TypeError):
            continue
        html = 'http://www.ygdy8.net' + i['href']
        da = requests.get(html, headers=header)
        da.encoding = 'GBK'  # da.apparent_encoding
        dr = BeautifulSoup(da.text, 'html.parser')
        span = dr.find('span', attrs={'style': 'FONT-SIZE: 12px'})
        data = ''
        if span:
            data = ''.join(span.text.split())  # squeeze out all whitespace
            print(data)
        msg = {}
        if data:
            msg['mname'] = movie_name
            try:
                show_t = re.search(r'(?<=(◎年代|◎時間|品年代|年代】|播時間|播】:))(.*?)(?=◎|年|【)', data).group()
            except AttributeError:
                show_t = re.search(r'(?<=日期|份:)(.*?)(?=(-|劇))', data).group()
            msg['myear'] = show_t
            try:
                country = re.search(r'(?<=(◎國家|◎產地|◎地區|◎國別|國家】))(.*?)(?=◎|【類)', data).group()
            except AttributeError:
                try:
                    country = re.search(r'(?<=地區)(.*?)(?=語言)', data).group()
                except AttributeError:
                    country = '未知'
            msg['mcountry'] = country
            try:
                length = re.search(r'(?<=◎片長|長度】)(.*?)(?=◎|【)', data).group()
            except AttributeError:
                length = '未知'
            msg['mtime'] = length
            try:
                mtype = re.search(
                    r'(?<=(◎類別|別類型|影類型|◎類型|集類型|◎分類|類型:|類別】|片類型|型】:))(.*?)(?=(◎|級別|【出品|【主演))',
                    data).group()
            except AttributeError:
                try:
                    mtype = re.search(r'(?<=類型:)(.*?)(?=國)', data).group()
                except AttributeError:
                    mtype = re.search(r'動作|愛情|戰爭', data).group()
            # the regexes above feel clumsy; suggestions from more experienced folks welcome
            q.put('%s: %s,%s,%s,%s,%s\n' % (movie_name, country, mtype, length, show_t, html))

q = queue.Queue(maxsize=10000)
t_obj = []
lock = threading.Lock()  # serialize writes to the file

def writing(f):
    data = q.get()
    lock.acquire()
    f.write(data)
    lock.release()

all_page = get_page(url)
f = open('test4.txt', 'w', encoding='utf8')
print(all_page + 1)
for i in range(1, all_page + 1):
    t = threading.Thread(target=get_data, args=(i,))
    t.start()
    t_obj.append(t)
for t in t_obj:
    t.join()  # make sure every scraper thread finishes before writing starts
    print('%s over' % t)
while q.qsize():  # anything left in the queue?
    w = threading.Thread(target=writing, args=(f,))
    w.start()
    w.join()
else:
    print('done writing')
f.close()
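If I were to revisit the write-out stage, one possible cleanup (a sketch, not something I have run against the site): use a single consumer thread that drains the queue until it sees a sentinel, instead of spawning one short-lived thread per queued line. The writer name and the None sentinel are my own additions here; q, get_data, get_page and url are the objects defined above.

# Sketch: one consumer thread drains the queue until a None sentinel arrives.
def writer(f):
    while True:
        line = q.get()
        if line is None:  # sentinel: all producers are done
            break
        f.write(line)  # only this thread writes, so no lock is needed

with open('test4.txt', 'w', encoding='utf8') as f:
    w = threading.Thread(target=writer, args=(f,))
    w.start()
    threads = [threading.Thread(target=get_data, args=(i,))
               for i in range(1, get_page(url) + 1)]
    for t in threads:
        t.start()
    for t in threads:
        t.join()
    q.put(None)  # tell the writer to stop
    w.join()
print('done writing')

With a single consumer the lock becomes unnecessary, and because the writer runs while the scrapers are still producing, the producers can no longer block on a full queue before any writing has started.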