python爬取酒店信息練習


  爬取酒店信息,首先要知道會用到哪些庫。本次使用requests庫去獲取網頁,使用bs4來解析網頁,使用selenium來進行模擬瀏覽。

  本次要爬取的是美團網的蚌埠酒店信息及其評價。爬取的網址為“http://hotel.meituan.com/bengbu/”。首先獲取導航頁的相關信息,具體代碼如下:

url = 'http://hotel.meituan.com/bengbu/'

# Fetch the hotel listing page and read the highest page number from its
# pagination bar. The timeout keeps the script from hanging forever on a
# stalled connection.
html = requests.get(url, timeout=10).text
soup = BeautifulSoup(html, 'html.parser')
page_info = soup.find_all('li', class_='page-link')  # entries of the pagination bar
# The link text of the last pagination entry is used as the total page count.
# If no pagination entries were found, fall back to a single page instead of
# crashing with IndexError.
# NOTE(review): assumes a listing without a pagination bar has one page - confirm.
get_page_num = page_info[-1].find('a').get_text() if page_info else '1'
print(get_page_num)

  獲取了上面的信息,就可以選擇一個具體網頁,利用Google瀏覽器的F12查看具體的元素,利用元素的class名稱定位相關元素,把獲取的信息保存到文本文件中,具體方法代碼如下:

# 獲取所有酒店詳細信息
def _save_hotel_comments(hotel_id, comment_file):
    """Append the text of a hotel's first 10 comment pages to *comment_file*.

    Each page is requested from the comment endpoint with ``limit=10`` and an
    offset of ``page_index * 10``. The raw response is printed, parsed as JSON,
    and every ``comment`` under ``data.feedback`` is written on its own line.
    """
    # The original author noted that sending only one User-Agent gave
    # incomplete results, so one is picked at random for every request.
    user_agents = [
        "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:30.0) Gecko/20100101 Firefox/30.0",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.75.14 (KHTML, like Gecko) Version/7.0.3 Safari/537.75.14",
        "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; Win64; x64; Trident/6.0)",
    ]
    for page_index in range(10):
        offset = page_index * 10
        print("正在加載第" + str(page_index + 1) + "頁評論")
        comment_url = (
            "http://ihotel.meituan.com/group/v1/poi/comment/" + hotel_id
            + "?sortType=default&noempty=1&withpic=0&filter=all&limit=10&offset="
            + str(offset) + "&X-FOR-WITH="
        )
        headers = {
            "User-Agent": random.choice(user_agents),
            "Host": "ihotel.meituan.com",
        }
        # The timeout stops one stalled request from blocking the whole crawl.
        r = requests.get(comment_url, headers=headers, timeout=10)
        print(r.text)
        data = json.loads(r.text, strict=False)
        for entry in data['data']['feedback']:
            comment_file.write(entry['comment'] + "\n")


def get_hotel_info(url):
    """Crawl the hotel listing at *url* and save hotel details and comments.

    Drives a PhantomJS browser through the listing pages; the number of pages
    is read from the module-level ``get_page_num``. For every hotel entry the
    name, rating, sales figure and link are printed and appended to
    ``酒店信息.txt``, and the hotel's comments are appended to ``jieguo-1.txt``.

    Args:
        url: Address of the first hotel listing page.
    """
    from urllib.parse import urlparse  # local import: only needed here

    last_page = int(get_page_num)
    dcap = dict(DesiredCapabilities.PHANTOMJS)
    dcap['phantomjs.page.settings.userAgent'] = ('Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36')
    # First argument is the path to the phantomjs executable.
    browser = webdriver.PhantomJS("D:/PhantomJS/phantomjs-2.1.1-windows/bin/phantomjs", desired_capabilities=dcap)
    try:
        browser.get(url)
        # Open each output file once and let ``with`` close (and flush) it,
        # instead of reopening it for every hotel / comment and never closing.
        with open("酒店信息.txt", 'a', encoding="utf8") as hotel_file, \
                open("jieguo-1.txt", 'a', encoding="utf8") as comment_file:
            for page_num in range(1, last_page + 1):
                # Collect every hotel entry on the current listing page.
                for item in browser.find_elements_by_class_name('info-wrapper'):
                    title = item.find_element_by_class_name('poi-title')
                    hotel_info = {
                        'name': title.text,
                        'star': item.find_element_by_class_name('poi-grade').text,
                        'consumers': item.find_element_by_class_name('poi-buy-num').text,
                        'link': title.get_attribute('href'),
                    }
                    print("酒店名稱:{}".format(hotel_info['name']))
                    print("酒店評分:{}".format(hotel_info['star']))
                    print("酒店銷量:{}".format(hotel_info['consumers']))
                    print("酒店鏈接:{}".format(hotel_info['link']))
                    hotel_file.write(hotel_info['name']+"\n"+hotel_info['star']+"\n"+hotel_info['consumers']+"\n"+hotel_info['link']+"\n")
                    # Take the hotel id from the URL path rather than a fixed
                    # character slice, so it no longer depends on the scheme
                    # length or on a trailing slash being present.
                    hotel_id = urlparse(hotel_info['link']).path.strip('/')
                    _save_hotel_comments(hotel_id, comment_file)

                # Move to the next listing page via the paginator's "next"
                # link; nothing to click once the last page has been handled.
                if page_num < last_page:
                    browser.find_element_by_class_name('paginator').find_element_by_class_name('next').find_element_by_tag_name('a').click()
                    time.sleep(1)
    finally:
        # Always shut the PhantomJS process down, even when scraping fails.
        browser.quit()

  實現了上述的方法,就可以把完整的酒店信息抓取下來,所有代碼如下:

 1 # encoding="utf8"
 2 # 愛學習的兔兔
 3 import requests
 4 from bs4 import BeautifulSoup
 5 from selenium import webdriver
 6 from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
 7 import time
 8 import json
 9 import random
10 
11 url = 'http://hotel.meituan.com/bengbu/'
12 
13 # 獲取酒店分頁信息,返回最大頁碼
14 html = requests.get(url).text
15 soup = BeautifulSoup(html,'html.parser')
16 page_info = soup.find_all('li',class_='page-link')  # 獲取酒店首頁的頁面導航條信息
17 get_page_num = page_info[-1].find('a').get_text()       # 獲取酒店頁面的總頁數
18 print(get_page_num)                                     # 返回酒店頁面的
19 
20 # 獲取所有酒店詳細信息
21 def get_hotel_info(url):
22     dcap = dict(DesiredCapabilities.PHANTOMJS)
23     dcap['phantomjs.page.settings.userAgent'] = ('Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36')
24     browser = webdriver.PhantomJS("D:/PhantomJS/phantomjs-2.1.1-windows/bin/phantomjs", desired_capabilities=dcap)         #指定phantomjs程序路徑
25     browser.get(url)
26     hotel_info = {}
27     page_num = 1
28 
29 
30     while(page_num < int(get_page_num)+1):
31         # 獲取一個頁面的所有酒店信息
32         for item in browser.find_elements_by_class_name('info-wrapper'):
33             hotel_info['name'] = item.find_element_by_class_name('poi-title').text
34             hotel_info['star'] = item.find_element_by_class_name('poi-grade').text
35             hotel_info['consumers'] = item.find_element_by_class_name('poi-buy-num').text
36             hotel_info['link'] = item.find_element_by_class_name('poi-title').get_attribute('href')
37             print("酒店名稱:{}".format(hotel_info['name']))
38             print("酒店評分:{}".format(hotel_info['star']))
39             print("酒店銷量:{}".format(hotel_info['consumers']))
40             print("酒店鏈接:{}".format(hotel_info['link']))
41             f = open("酒店信息.txt", 'a', encoding="utf8")
42             f.write(hotel_info['name']+"\n"+hotel_info['star']+"\n"+hotel_info['consumers']+"\n"+hotel_info['link']+"\n")
43             u = hotel_info['link'][25:-1]
44             # print(u)
45             # 獲取酒店前10頁評論內容(動態加載的靜態爬取)
46             for i in range(10):
47                 page = i + 1
48                 s = i * 10
49                 print("正在加載第" + str(page) + "頁評論")
50                 html = "http://ihotel.meituan.com/group/v1/poi/comment/" + u + "?sortType=default&noempty=1&withpic=0&filter=all&limit=10&offset=" + str(
51                       s)+"&X-FOR-WITH="
52                 # print(html)
53                 # 第一次只使用一個header導致爬取信息不全,添加多個可以正常爬取
54                 my_headers = [
55                     "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36",
56                     "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36",
57                     "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:30.0) Gecko/20100101 Firefox/30.0",
58                     "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.75.14 (KHTML, like Gecko) Version/7.0.3 Safari/537.75.14",
59                     "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; Win64; x64; Trident/6.0)"
60                 ]
61                 randdom_header = random.choice(my_headers)
62                 headers = {
63                     "User-Agent":randdom_header,
64                     "Host":"ihotel.meituan.com"
65                     }
66                 r = requests.get(html,headers=headers)
67                 print(r.text)
68                 data = json.loads(r.text,strict=False)
69                 # print(data)
70                 comments = data['data']['feedback']
71                 for n in comments:
72                     replytime = n['feedbacktime']
73                     content = n['comment']
74                     # print("評論時間:", replytime)
75                     # print("評論內容:", content)
76                     f = open("jieguo-1.txt", 'a',encoding="utf8")
77                     f.write(content+"\n")
78 
79 
80 
81 
82         browser.find_element_by_class_name('paginator').find_element_by_class_name('next').find_element_by_tag_name('a').click()  # 一個頁面寫完后,通過點擊"下一頁"圖標至下一頁,繼續獲取
83         time.sleep(1)
84         page_num += 1
85 
86 def main():
87     get_hotel_info(url)
88 
89 if '__main__' == __name__:
90     main()

  這樣就順利地拿到了酒店信息和評價。為了簡單分析一下拿到的數據,先使用了SnowNLP分詞來看數據,發現效果一般,又使用了jieba分詞和詞雲來分析,得出一張圖片,如下圖:

這里沒有對符號進行過濾,只能給出一個大體的評價關系圖。具體代碼如下:

# -*- coding: utf-8 -*-
# SnowNLP sentence splitting, then jieba segmentation and a word cloud
# Author: 愛學習的兔兔
from collections import Counter

import jieba.posseg as posseg
from snownlp import SnowNLP
from wordcloud import WordCloud

# Read the scraped comments as one plain string. Calling str() on the list
# returned by readlines() would hand the list's repr (brackets, quotes and
# literal "\n" escapes) to the tokenizers instead of the comment text.
with open("jieguo-1.txt", "r", encoding="utf8") as comment_file:
    comments_text = comment_file.read()

# Print the text sentence by sentence as split by SnowNLP.
s = SnowNLP(comments_text)
for sentence in s.sentences:
    print(sentence)

# Segment with jieba; each item from posseg.cut unpacks into (word, tag).
# Whitespace-only tokens (the line breaks between comments) are dropped so
# they do not show up in the counts; punctuation is not filtered.
words = [word for word, _tag in posseg.cut(comments_text) if word.strip()]
print(words)
c = Counter(words)
print(c.most_common(20))

# Render the word frequencies as a word cloud, show it, and save it to ex2.png.
wc = WordCloud(font_path='c:\\Windows\\Fonts\\simkai.ttf', height=1080, width=1920).generate_from_frequencies(c)
image = wc.to_image()
image.show()
wc.to_file("ex2.png")

為了得到更好一點的效果,自行百度了一下,得到新的圖片,如下:

 

具體的實現代碼如下:

import jieba
import matplotlib.pyplot as plt
from wordcloud import WordCloud

# Read all scraped comments; the context manager closes the file afterwards
# (the bare open(...).read() left the handle open).
with open('jieguo-1.txt', encoding="utf-8") as comment_file:
    text_from_file_with_apath = comment_file.read()

# Segment with jieba (cut_all=True) and join the tokens with spaces, which is
# the input form WordCloud.generate() receives here.
wordlist_after_jieba = jieba.cut(text_from_file_with_apath, cut_all=True)
wl_space_split = " ".join(wordlist_after_jieba)

my_wordcloud = WordCloud(font_path='c:\\Windows\\Fonts\\simkai.ttf', height=1080, width=1920).generate(wl_space_split)

# Display the word cloud without axes.
plt.imshow(my_wordcloud)
plt.axis("off")
plt.show()

整體走下來,感覺寫個簡單的爬蟲能學到不少有用的信息。

 

 
       


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯系本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM