爬取的顯示結果如下:
話不多說,下面是 Python 代碼:
import requests
import re
from bs4 import BeautifulSoup
import json

# Listing-page URL; ``{page}`` is replaced by the page number being crawled.
LIST_URL_TEMPLATE = 'http://list.iqiyi.com/www/2/-------------11-{page}-1-iqiyi--.html'

# Seconds to wait for a server response before giving up, so that a stalled
# connection cannot hang the crawl indefinitely.
REQUEST_TIMEOUT = 10

# Headers sent with every listing-page request.
# NOTE(review): several of these (Access-Control-Allow-Credentials,
# Cache-Control max-age, Content-Encoding, Content-Language, Content-Type)
# look like response headers copied from a browser capture; they are kept
# unchanged here -- confirm whether the site actually needs them.
LIST_HEADERS = {
    'Access-Control-Allow-Credentials': 'true',
    'Cache-Control': 'max-age=900',
    'Content-Encoding': 'gzip',
    'Content-Language': 'zh-CN',
    'Content-Type': 'text/html; charset=UTF-8',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36',
    'Referer': 'http://list.iqiyi.com/www/2/-------------11-4-1-iqiyi--.html',
    'Upgrade-Insecure-Requests': '1'
}

# Matches (href, img alt text) pairs in the whitespace-stripped listing markup.
SERIES_PATTERN = re.compile(
    '"data-widget-qidanadd="qidanadd"href="(.*?)"target="_blank"title=".*?"><imgalt="(.*?)"height="236"rseat="dsjp7"src='
)
# Episode links: everything between href=" and the next double quote.
EPISODE_LINK_PATTERN = re.compile(r'(?<=href=").*?(?=")')
# Episode titles: everything between title=" and the closing ">.
EPISODE_TITLE_PATTERN = re.compile(r'(?<=title=").*?(?=">)')


def _fetch_html(url, headers=None):
    """Return the response text for ``url``.

    Raises whatever ``requests.get`` raises (including a timeout after
    ``REQUEST_TIMEOUT`` seconds).
    """
    return requests.get(url=url, headers=headers, timeout=REQUEST_TIMEOUT).text


def _extract_series(list_html):
    """Return the ``(href, title)`` tuples found on one listing page.

    Returns an empty list when the page has no ``wrapper-piclist`` div
    instead of failing on the missing first element.
    """
    soup = BeautifulSoup(list_html, 'html.parser')
    containers = soup.find_all("div", attrs={"class": "wrapper-piclist"})
    if not containers:
        return []
    # Strip line breaks and spaces so the pattern can match the markup as
    # one contiguous string.
    flattened = (
        str(containers[0])
        .replace('\r\n', '')
        .replace('\n', '')
        .replace(' ', '')
        .replace('"rseat="bigTitle', '')
    )
    return SERIES_PATTERN.findall(flattened)


def _extract_episodes(series_html):
    """Return ``(links, titles)`` matched inside the ``site-piclist_pic`` divs."""
    soup = BeautifulSoup(series_html, 'html.parser')
    pic_markup = str(soup.find_all('div', attrs={'class': 'site-piclist_pic'}))
    links = EPISODE_LINK_PATTERN.findall(pic_markup)
    titles = EPISODE_TITLE_PATTERN.findall(pic_markup)
    return links, titles


def _print_episodes(links, titles):
    """Print each episode link followed by its numbered title.

    Index 0 and the last two title matches are skipped, exactly as before
    (presumably non-episode entries -- TODO confirm against the page).
    The upper bound is also capped by ``len(links)`` so that a mismatch
    between the two match lists cannot raise ``IndexError``.
    """
    stop = min(len(titles) - 2, len(links))
    for number in range(1, stop):
        label = '第' + str(number) + '集'
        print(links[number])
        print(label, titles[number])


def main(first_page=1, last_page=9):
    """Crawl listing pages ``first_page``..``last_page`` and print episodes.

    For every series found on a listing page, the series page is fetched
    and its episode links and titles are printed to stdout. The defaults
    cover pages 1 through 9; adjust them to crawl more or fewer pages.
    """
    for page in range(first_page, last_page + 1):
        list_url = LIST_URL_TEMPLATE.format(page=page)
        list_html = _fetch_html(list_url, headers=LIST_HEADERS)

        for href, _series_title in _extract_series(list_html):
            # Drop the URL fragment before requesting the series page.
            series_url = str(href).split('#')[0]
            links, titles = _extract_episodes(_fetch_html(series_url))
            _print_episodes(links, titles)


if __name__ == '__main__':
    main()