Python高級應用程序設計任務

本文轉載自查看原文 2019-11-12 11:06 316

一，主題式網絡爬蟲設計方案

1，主題式網絡爬蟲的名稱

1.1 去哪兒網攻略的爬取

2，主題式網絡爬蟲的內容與數據特征分析

2.1爬蟲的內容

文章鏈接，標題，簡要描述信息，發布者，發布者的個人標簽，出發日期

天數，拍照數量，出行的類型，旅行的標簽，途徑，行程路線

人均消費，觀看數，點贊數，評論數

2.2 數據特征分析

2.2.1對trip,days和price做一個透視表並可視化

2.2.2對label,people和price做一個透視表並可視化

3，主題式網絡爬蟲設計方案概述（包括實現思路和技術難點）

3.1實現思路

創建一個QuNaRSpider的類，定義start_requests()方法用來處理每一次的請求，process_number()方法用來對整數數據做進一步加工，parse_detail()方法處理具體內容字段的提取，save()方法保存數據到csv文件中，run()用來啟動爬蟲，具體如下圖解。

3.2技術難點

爬取過程中並未遇到阻攔，既不需要設置header, 也沒遇到在爬取過程中被重定向到登錄頁面（整個爬取5-6分鍾）。

二，主題頁面的結構特征分析

1，主題頁面的特征結構

每頁10項數據，共計200頁，數據項2000，不存在因拖動滾動條而動態加載的數據項（即li）。通過右鍵查看網頁源代碼分析需要提取的數據是否存在動態生成的數據，任意查看一個數據項並與原網頁中的數據對比后，發現所需要爬取的數據都是靜態的。

2， HTML頁面解析

框框中的數據都是需要爬取的字段。

3，節點（標簽）查找方法與遍歷方法（必要時畫出節點樹結構）

查找節點的方法采用scrapy的Selector選擇器，用xpath來提取所需要的數據。從整體(ul)到部分(li)的查找方式，即先確定爬取的數據所在哪個html的節點中，找到這個節點的所有直接子節點，也就是每一個攻略項，再用for循環依次遍歷，然后再具體解析遍歷的每一項攻略的數據，圖解如下。

三，網絡爬蟲程序設計

1，爬蟲程序主體要包括以下部分，要附源代碼及較詳細注釋，並在每部分程序后面提供輸出結果的截圖。

import re
import os

import requests
from fake_useragent import UserAgent
from scrapy import Selector
from datetime import datetime
from urllib import parse
from w3lib.html import remove_tags
from pandas import DataFrame
from threading import Thread, Lock


class QuNaRSpider:
    """Crawl the travel-guide listing of travel.qunar.com into a CSV file.

    The crawl starts at ``start_url`` and follows the "next" link of every
    listing page until a page has no such link.  Each guide found on a page
    is written as one row to ``file_path``.
    """

    start_url = 'https://travel.qunar.com/travelbook/list.htm?order=hot_heat'
    # Base used to resolve the guide links found in the listing.
    domain = 'https://travel.qunar.com'
    # headers = {
    #     "User-Agent": UserAgent().random
    # }
    # Number of the listing page currently being parsed (starts at 1).
    pages = 1
    # Seconds to wait for the server before a request is abandoned.
    timeout = 30

    # Serialises the CSV writes issued from the per-row writer threads.
    lock = Lock()
    # The data is stored in the directory of this source file.
    file_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'origin_data.csv')

    # Text markers identifying the two kinds of "places" paragraphs.  Both the
    # Traditional and the Simplified spelling of the "via" marker are accepted.
    _via_markers = ("途經：", "途经：")
    _route_markers = ("行程：",)

    def start_requests(self, url):
        """Download ``url`` and every following listing page, parsing each one.

        Pages are processed in a loop rather than by mutual recursion with
        the parser, so the call stack does not grow with the page count.

        :param url: address of the first listing page to fetch.
        :raises requests.RequestException: on a network error, timeout or
            non-success HTTP status.
        """
        while url:
            res = requests.get(url, timeout=self.timeout)
            res.raise_for_status()
            # res.apparent_encoding is deliberately not used: the charset
            # guessed from the body may not be UTF-8, which garbles Chinese text.
            res.encoding = 'UTF-8'
            url = self._parse_page(res.text)

    @staticmethod
    def process_number(value):
        """Convert a counter such as ``"3000"`` or ``"2.1萬"`` to an ``int``.

        :param value: counter text; ``None`` or blank text counts as 0.
        :returns: the integer value, with a "萬"/"万" suffix meaning x10000.
        :raises ValueError: if the text holds no parsable number.
        """
        if value is None:
            return 0
        text = str(value).strip()
        if not text:
            return 0
        if "萬" in text or "万" in text:
            match = re.search(r'\d+(?:\.\d+)?', text)
            if match is None:
                raise ValueError("no number found in {!r}".format(value))
            # round() instead of plain truncation: the float product can land
            # just below the intended integer.
            return int(round(float(match.group()) * 10000))
        return int(text)

    @classmethod
    def save(cls, df):
        """Append ``df`` to the CSV file, writing the header only on creation.

        :param df: DataFrame holding the row(s) to store.
        """
        # ``with`` guarantees the lock is released even when to_csv raises.
        with cls.lock:
            if os.path.exists(cls.file_path):
                # utf-8-sig is used because the text contains emoji.
                df.to_csv(cls.file_path, header=False, index=False, mode="a+", encoding="utf_8_sig")
            else:
                df.to_csv(cls.file_path, index=False, mode="w+", encoding="utf_8_sig")

    @staticmethod
    def _first_int(node, query, default=0):
        """Return the first run of digits selected by ``query`` as an int, or ``default``."""
        digits = node.xpath(query).re_first(r'\d+')
        return int(digits) if digits else default

    @classmethod
    def _parse_places(cls, fragments):
        """Split the "places" paragraphs of a guide into ``(via_places, distance)``.

        A guide may carry both paragraphs, only one of them, or none.  Each
        paragraph is classified by the marker it contains, so a lone
        itinerary paragraph ends up in ``distance``.

        :param fragments: raw HTML of each ``<p class="places">`` element.
        """
        via_places = distance = ""
        for position, fragment in enumerate(fragments[:2]):
            text = remove_tags(fragment).replace("&gt;", " > ")
            if any(marker in text for marker in cls._route_markers):
                is_route = True
            elif any(marker in text for marker in cls._via_markers):
                is_route = False
            else:
                # Unlabelled paragraph: fall back to the page order
                # (via-places first, itinerary second).
                is_route = position == 1
            for marker in (cls._route_markers if is_route else cls._via_markers):
                text = text.replace(marker, "")
            if is_route:
                distance = text
            else:
                via_places = text
        return via_places, distance

    def _parse_item(self, li):
        """Extract the fields of one guide ``<li>`` and return them as a dict."""
        url = parse.urljoin(self.domain, li.xpath('.//h2/a/@href').get())
        print("解析: " + url)
        title = li.xpath('.//h2/a/text()').get()

        # The description is only available as part of a CSS class name.
        describle = li.xpath('.//p[contains(@class, "icon_r")]/@class').re_first(r'icon_r\s*(\w+)') or ""
        username = li.xpath('.//span[@class="user_name"]/a[1]/text()').get()

        # Some guides have no label.
        label = li.xpath('.//span[@class="user_name"]/a[2]/span/@title').get(default="")

        # Converted to a datetime; "" when the guide shows no date.
        date_text = li.xpath('.//span[@class="date"]/text()').re_first(r'(\d+-\d+-\d+)')
        date = datetime.strptime(date_text, "%Y-%m-%d") if date_text else ""

        days = self._first_int(li, './/span[@class="days"]/text()')
        photo_nums = self._first_int(li, './/span[@class="photo_nums"]/text()')

        people = li.xpath('.//span[@class="people"]/text()').get(default="")
        trip = li.xpath('.//span[@class="trip"]/text()').get(default="")

        via_places, distance = self._parse_places(li.xpath('.//p[@class="places"]').getall())

        # Some guides have no price; they are stored as 0.
        price = self._first_int(li, './/span[@class="fee"]/text()')

        # Counters come either as plain digits or with a x10000 suffix.
        counter = './/span[@class="nums"]/span[@class="{}"]/span/text()'
        view_nums = self.process_number(li.xpath(counter.format("icon_view")).get())
        praise_nums = self.process_number(li.xpath(counter.format("icon_love")).get())
        comment_nums = self.process_number(li.xpath(counter.format("icon_comment")).get())

        return {
            "url": url,
            "title": title,
            # Spelled "describle" to match the documented field list and the
            # column name the analysis code reads.
            "describle": describle,
            "username": username,
            "label": label,
            "date": date,
            "days": days,
            "photo_nums": photo_nums,
            "people": people,
            "trip": trip,
            "via_places": via_places,
            "distance": distance,
            "price": price,
            "view_nums": view_nums,
            "praise_nums": praise_nums,
            "comment_nums": comment_nums,
        }

    def _parse_page(self, html):
        """Store every guide of one listing page; return the next page URL or ``None``."""
        sel = Selector(text=html)
        for li in sel.xpath('//ul[contains(@class, "b_strategy_list")]/li'):
            row = self._parse_item(li)
            df = DataFrame({column: [value] for column, value in row.items()})
            # Storage runs in its own thread so parsing is not held up by file I/O.
            Thread(target=self.save, args=(df, )).start()

        next_href = sel.xpath('//a[@class="next"]/@href').get()
        if not next_href:
            return None
        self.pages += 1
        print('解析第{}頁'.format(self.pages))
        # Joining against start_url resolves protocol-relative and
        # path-relative links alike.
        return parse.urljoin(self.start_url, next_href)

    def parse_detail(self, html):
        """Parse one listing page, store its guides and continue with the next page.

        Guides differ in what their publishers filled in.  Missing text
        fields are stored as "" and missing numeric fields as 0.

        Fields written per guide:
            url: link to the article
            title: title
            describle: short description tag
            username: publisher
            label: publisher's personal label
            date: departure date
            days: number of days
            photo_nums: number of photos
            people: who travelled
            trip: travel tags
            via_places: places passed through
            distance: itinerary
            price: cost per person
            view_nums: number of views
            praise_nums: number of likes
            comment_nums: number of comments

        :param html: HTML text of a listing page.
        """
        next_url = self._parse_page(html)
        if next_url:
            self.start_requests(next_url)

    def run(self):
        """Entry point of the spider: crawl from ``start_url``."""
        self.start_requests(self.start_url)

if __name__ == '__main__':
    # Script entry point: build the spider and start crawling.
    QuNaRSpider().run()

具體截圖如下：

2，對數據進行清洗和處理

　2.1導包操作

# 導包操作
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from pandas import DataFrame, Series
from pylab import mpl

# Plot settings applied in one go.
mpl.rcParams.update({
    # Default sans-serif font: lets plots display Chinese text.
    'font.sans-serif': ['Microsoft YaHei'],
    # Keeps the minus sign '-' from showing up as a box in saved figures.
    'axes.unicode_minus': False,
})

%matplotlib inline

2.2 讀取爬蟲爬取到的數據

# Load the CSV written by the crawler (hard-coded absolute Windows path).
df = pd.read_csv(r"D:\python作業\homework\origin_data.csv")

2.3 查看數據大小並顯示前5條數據

# (rows, columns) of the loaded data.
df.shape
# First five rows.
df.head()

2.4 查看是否存在重復值

# Boolean mask marking rows that repeat an earlier row in full.
df.duplicated()

2.5 去重復值,以title和username為去重的依據, 並查看其大小

# Shape after dropping rows that repeat the same (title, username) pair.
# NOTE(review): the result is not assigned, so df itself keeps the duplicates — confirm this is intended.
df.drop_duplicates(['title', 'username']).shape

2.6 查看在爬取數據過程中無法通過直接方式提取到中文標識

# How often each describle value occurs.
df['describle'].value_counts()

2.7 數據的替換將拼音替換成漢字

# Map each pinyin tag taken from the page's CSS class to its Chinese wording.
pinyin_to_chinese = {
    "meitu": "美圖",
    "duantupai": "短途派",
    "jinghua": "臻品",
    "wenyifan": "文藝范",
    "ganhuo": "干貨",
}
df['describle'] = df['describle'].replace(pinyin_to_chinese)
# Show the counts again, now under the Chinese names.
df['describle'].value_counts()

2.8 查看數據的整體分布

# Summary statistics produced by pandas' describe().
df.describe()

3.文本分析（可選）：jieba分詞、wordcloud可視化
4.數據分析與可視化
（例如：數據柱形圖、直方圖、散點圖、盒圖、分布圖、數據回歸分析等）

4.1 需求1：describle與價格、people的透視表，查看好的體驗的平均消費

# Price aggregated per describle value (rows) and people value (columns).
df_pivot = df.pivot_table(
    values="price",
    index="describle",
    columns="people",
)
df_pivot.shape
df_pivot

# Visualise the pivot table as grouped bars and label the axes.
pivot_axes = df_pivot.plot(kind='bar', title='describe and price')
pivot_axes.set_xlabel('describe')
pivot_axes.set_ylabel('price')

4.2 需求2：對出行的方式, 天數和價格做一個透視表

# Price aggregated per days value (rows) and people value (columns).
df_pivot2 = df.pivot_table(index="days", columns="people", values="price")
# Inspect df_pivot2, the table built in this step.
df_pivot2.shape
df_pivot2.head()

# Visualise the second pivot table as a histogram and label the axes.
hist_axes = df_pivot2.plot(kind='hist')
hist_axes.set_ylabel('days')
hist_axes.set_xlabel('price')

4.3 需求3：想知道被去哪兒網確定為聰明旅行家所占的比例

# Share of each label value: its occurrence count divided by the total number of rows.
df['label'].value_counts() / df.shape[0]

# Tally the rows by whether their label is exactly '聰明旅行家':
# one 1 per row carrying the label, one 0 per row without it.
has_smart_traveler = [1 for tag in df['label'] if tag == '聰明旅行家']
no_smart_traveler = [0 for tag in df['label'] if not tag == '聰明旅行家']

# Width of each bar.
width = 0.34

fig, ax = plt.subplots(figsize=(7, 6))

# Two bars side by side around x = 1: rows with and without the label.
rects1 = ax.bar(1 - width/2, len(has_smart_traveler), width, label='has_smart_traveler')
rects2 = ax.bar(1 + width/2, len(no_smart_traveler),  width, label='no_smart_traveler')


def autolabel(rects):
    """Write each bar's height just above the bar on the module-level ``ax``.

    :param rects: the bar container returned by ``ax.bar``.
    """
    for rect in rects:
        height = rect.get_height()
        ax.annotate('{}'.format(height),
                    xy=(rect.get_x() + rect.get_width() / 2, height),
                    # Shift the text 3 points upwards, away from the bar top.
                    xytext=(0, 3),
                    textcoords="offset points",
                    ha='center', va='bottom')


autolabel(rects1)
autolabel(rects2)

ax.set_ylabel('Count')
ax.set_title('Count by group and smart_travler')
ax.set_xticks([1])
ax.set_xticklabels(['聰明旅行家'])
ax.legend()
# Called last so the layout accounts for the labels, title and legend set above.
fig.tight_layout()

4.4 需求4：想知道攻略排名消費前10的信息

# The ten rows with the highest price.
df.sort_values('price',ascending=False).head(10)

4.5 需求5：想知道攻略中最多人觀看的出行方式,作為出行參考的信息

# The ten rows with the highest view_nums.
df.sort_values('view_nums',ascending=False).head(10)

4.6 需求6：想知道人們具體出行的方式都有哪些

# Flatten the trip column into individual tags, then drop repeated tags.
# NOTE(review): '?' is presumably a garbled form of the site's real tag separator — confirm against the CSV.
trip_list = []
for raw_trip in df['trip'].dropna():
    # Only values containing the separator are stripped and split; the rest are kept verbatim.
    pieces = raw_trip.strip().split('?') if '?' in raw_trip else [raw_trip]
    trip_list.extend(pieces)
trip_series = Series(trip_list).drop_duplicates()
trip_series
trip_series