Environment:
Windows, Python 3.4
Reference link:
https://blog.csdn.net/weixin_36604953/article/details/78156605
Code (tested, it runs):
import requests
from bs4 import BeautifulSoup
import random
import time


# Main crawler
def mm(url):
    # Request the landing page with a browser-like User-Agent
    header = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36"}
    req0 = requests.get(url=url, headers=header)
    req0.encoding = "gb18030"  # fix mojibake: the site serves GBK-family pages
    html0 = req0.text

    # Parse the html into a BeautifulSoup instance
    soup0 = BeautifulSoup(html0, "html.parser")

    # Last page number: the second-to-last <a> in the pager
    # (see the previous subsection on locating the last page)
    total_page = int(soup0.find("div", class_="pagers").findAll("a")[-2].get_text())
    myfile = open("aika_qc_gn_1_1_1.txt", "a", encoding="gb18030", errors="ignore")  # same encoding as the pages
    print("user", "source", "useful_count", "type", "comment")
    myfile.write("user source useful_count type comment\n")
    for i in range(1, total_page + 1):
        # random pause so requests are not fired at machine speed
        stop = random.uniform(1, 3)

        url = "http://newcar.xcar.com.cn/257/review/0/0_" + str(i) + ".htm"
        req = requests.get(url=url, headers=header)
        req.encoding = "gb18030"
        html = req.text

        soup = BeautifulSoup(html, "html.parser")
        contents = soup.find("div", class_="review_comments").findAll("dl")  # one <dl> per review
        for tiaoshu, content in enumerate(contents):
            try:
                print("crawling comment %d of page %d, url %s" % (tiaoshu + 1, i, url))

                # review angle ("sunny" is the placeholder for any missing field)
                try:
                    comment_jiaodu = content.find("dt").find("em").find("a").get_text().strip() \
                        .replace("\n", "").replace("\t", "").replace("\r", "")
                except Exception:
                    comment_jiaodu = "sunny"

                # review type: the 【...】 part of the <dt> text (好評/中評/差評)
                try:
                    comment_type0 = content.find("dt").get_text().strip() \
                        .replace("\n", "").replace("\t", "").replace("\r", "")
                    comment_type = comment_type0.split("【")[1].split("】")[0]
                except Exception:
                    comment_type = "sunny"

                # number of people who found this review useful
                try:
                    useful = int(content.find("dd").find("div", class_="useful")
                                 .find("i").find("span").get_text().strip()
                                 .replace("\n", "").replace("\t", "").replace("\r", ""))
                except Exception:
                    useful = "sunny"

                # review source (region)
                try:
                    comment_region = content.find("dd").find("p").find("a").get_text().strip() \
                        .replace("\n", "").replace("\t", "").replace("\r", "")
                except Exception:
                    comment_region = "sunny"

                # reviewer name: the text after the full-width colon in <dd><p>
                try:
                    user = content.find("dd").find("p").get_text().strip() \
                        .replace("\n", "").replace("\t", "").replace("\r", "").split(":")[-1]
                except Exception:
                    user = "sunny"

                # review body: follow the last link in <dt> to the full-comment page
                try:
                    urlc = content.find("dt").findAll("a")[-1]["href"]
                    reqc = requests.get(urlc, headers=header)
                    soupc = BeautifulSoup(reqc.text, "html.parser")

                    comment0 = soupc.find("div", id="mainNew").find("div", class_="maintable") \
                        .findAll("form")[1].find("table", class_="t_msg").findAll("tr")[1]
                    try:
                        comment = comment0.find("font").get_text().strip() \
                            .replace("\n", "").replace("\t", "")
                    except Exception:
                        comment = "sunny"
                    try:
                        comment_time = soupc.find("div", id="mainNew").find("div", class_="maintable") \
                            .findAll("form")[1].find("table", class_="t_msg") \
                            .find("div", style="padding-top: 4px;float:left").get_text().strip() \
                            .replace("\n", "").replace("\t", "")[4:]
                    except Exception:
                        comment_time = "sunny"
                except Exception:
                    # no detail page: fall back to the truncated text on the list page
                    try:
                        comment = content.find("dd").get_text().split("\n")[-1].split("\r")[-1] \
                            .strip().replace("\n", "").replace("\t", "").replace("\r", "") \
                            .split(":")[-1]
                    except Exception:
                        comment = "sunny"

                time.sleep(stop)
                print(user, comment_region, useful, comment_type, comment)
                tt = user + " " + comment_region + " " + str(useful) + " " + comment_type + " " + comment
                myfile.write(tt + "\n")
            except Exception as e:
                print(e)
                print("failed on comment %d of page %d, url %s" % (tiaoshu + 1, i, url))
    myfile.close()


# Tally the distribution of review types
def fenxi():
    myfile = open("aika_qc_gn_1_1_1.txt", "r", encoding="gb18030", errors="ignore")
    good = 0
    middle = 0
    bad = 0
    nn = 0
    for line in myfile:
        commit = line.split(" ")[3]
        if commit == "好評":      # positive
            good = good + 1
        elif commit == "中評":    # neutral
            middle = middle + 1
        elif commit == "差評":    # negative
            bad = bad + 1
        else:
            nn = nn + 1
    myfile.close()
    count = good + middle + bad + nn
    g = round(good / (count - nn) * 100, 2)
    m = round(middle / (count - nn) * 100, 2)
    b = round(bad / (count - nn) * 100, 2)
    n = round(nn / (count - nn) * 100, 2)
    print("positive %:", g)
    print("neutral %:", m)
    print("negative %:", b)
    print("unrated %:", n)


url = "http://newcar.xcar.com.cn/257/review/0.htm"
mm(url)
fenxi()
BeautifulSoup: a power tool
BeautifulSoup, found in the third-party Python package bs4, is a library for parsing HTML; in other words, it helps you locate the information you need by tag far more conveniently. Only two key methods are introduced here:
1. The find and findAll methods:
First, BeautifulSoup turns the whole HTML document (or whatever HTML snippet you specify) into an instance of the BeautifulSoup class (if "class" and "instance" mean nothing to you, just treat it as the tree of HTML you see when you press F12). This instance offers many methods, the most commonly used being find and findAll. Both search the same way: the arguments inside the parentheses, that is, the tag name, attribute name, and attribute value, select the matching tags. The difference is that find returns only the first match, while findAll returns every matching tag as an iterable (in fact, a list). findAll is the usual choice for locating sibling tags: in the review scraping above, each review lives in a dl tag, so the ten reviews on one page correspond to ten dl tags. find would grab only the first, whereas findAll collects all ten into a list; to process each tag's content you simply walk that list with a for loop, as the sketch below shows.
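A toy illustration of the difference between the two methods, using made-up HTML rather than the real xcar markup:

from bs4 import BeautifulSoup

html = ("<div class='review_comments'>"
        "<dl><dt>first review</dt></dl>"
        "<dl><dt>second review</dt></dl>"
        "</div>")
soup = BeautifulSoup(html, "html.parser")

print(soup.find("dl"))         # only the first <dl>...</dl>
for dl in soup.findAll("dl"):  # a list containing both <dl> tags
    print(dl.find("dt").get_text())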
2. The get_text() method:
What find returns is more than the content we are after: it includes the tag name, attribute names, and attribute values as well. For example, applying find to "<Y yy='aaa'>xxxx</Y>" hands back the entire "<Y yy='aaa'>xxxx</Y>", which is quite verbose when all we actually want is the content xxxx. Calling get_text() on the object find returned strips away the markup and leaves just the tag's content; here, get_text() yields exactly the xxxx we want.
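A quick demonstration on that same toy tag (note that html.parser lowercases tag names, so Y is looked up as "y"):

from bs4 import BeautifulSoup

soup = BeautifulSoup("<Y yy='aaa'>xxxx</Y>", "html.parser")
tag = soup.find("y")
print(tag)             # <y yy="aaa">xxxx</y>  -- the whole tag, markup included
print(tag.get_text())  # xxxx                  -- just the content we want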
