爬取汽車之家車型配置信息


一、需求

獲取指定品牌的所有車型配置信息,並保存到 Excel 文件(.xls)中。

流程大致思路:

1.獲取品牌id:brand_id

2.通過品牌id獲取車型id:series_id

3.獲取車型配置頁面

4.解析配置頁面內容(這步最複雜,使用了之前一些大神的代碼)

二、代碼

測試完美運行

 

 

 

import requests
import json
import xlwt
from bs4 import BeautifulSoup
import re
from urllib import parse
from selenium import webdriver


class Car_home_config(object):
    """Scrape vehicle configuration tables from car.autohome.com.cn into .xls files.

    Workflow (driven by :meth:`check`):

    1. :meth:`get_brand_id` reads the left-hand menu and maps brand id -> name.
    2. :meth:`get_series_id` reads the menu of the selected brand(s) and maps
       series id -> name.
    3. :meth:`get_config_content` downloads the configuration page of a series.
    4. :meth:`car_info` / :meth:`write_html` extract the embedded ``config``,
       ``option`` and ``bag`` JavaScript variables and replace the obfuscated
       ``<span class='...'></span>`` placeholders with their real text.
    5. :meth:`save` writes the result to an Excel workbook.
    """

    # Endpoint returning the brand / series navigation menu.
    _LEFT_MENU_URL = r"https://car.autohome.com.cn/AsLeftMenu/As_LeftListNew.ashx"

    _USER_AGENT = "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36"

    # Minimal fake DOM: the page's obfuscation scripts call
    # document.createElement(...).sheet.insertRule(rule); this stub collects every
    # inserted CSS rule into the global string `rules`, joined with '#'.
    _DOM_STUB = ("var rules = '2';"
                 "var document = {};"
                 "function getRules(){return rules}"
                 "document.createElement = function() {"
                 "      return {"
                 "              sheet: {"
                 "                      insertRule: function(rule, i) {"
                 "                              if (rules.length == 0) {"
                 "                                      rules = rule;"
                 "                              } else {"
                 "                                      rules = rules + '#' + rule;"
                 "                              }"
                 "                      }"
                 "              }"
                 "      }"
                 "};"
                 "document.querySelectorAll = function() {"
                 "      return {};"           "};"
                 "document.head = {};"
                 "document.head.appendChild = function() {};"

                 "var window = {};"
                 "window.decodeURIComponent = decodeURIComponent;")

    _HTML_PREFIX = "<html><meta http-equiv='Content-Type' content='text/html; charset=utf-8' /><head></head><body>    <script type='text/javascript'>"

    def __init__(self):
        self.session = requests.Session()
        self.params = None       # query parameters of the next menu request
        self.headers = None      # request headers, built by get_header()
        self.brand_dict = {}     # brand id -> brand name
        self.series_dict = {}    # series id -> series name
        self.brand_name = None   # brand filter used by get_series_id()

    def get_header(self):
        """Build ``self.headers`` for a menu request using the current ``self.params``.

        The ``path`` entry embeds the URL-encoded query string; a missing
        ``self.params`` is treated as an empty query instead of raising.
        """
        query = parse.urlencode(self.params or {})
        self.headers = {
            "authority": "car.autohome.com.cn",
            "method": "GET",
            "path": "/AsLeftMenu/As_LeftListNew.ashx?%s" % query,
            "scheme": "https",
            "accept": "*/*",
            "accept-encoding": "gzip, deflate, br",
            "accept-language": "zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7",
            "cache-control": "no-cache",
            "pragma": "no-cache",
            "sec-ch-ua": "Google Chrome;v=87,Not;A Brand;v=99,Chromium;v=87",
            "sec-ch-ua-mobile": "?0",
            "sec-fetch-dest": "document",
            "sec-fetch-mode": "navigate",
            "sec-fetch-site": "none",
            "sec-fetch-user": "?1",
            "upgrade-insecure-requests": "1",
            "user-agent": self._USER_AGENT,
        }

    @staticmethod
    def _menu_params(brand_id):
        """Return the menu query parameters for *brand_id* ("0" lists all brands)."""
        return {
            "typeId": "1",
            "brandId": brand_id,
            "fctId": "0",
            "seriesId": "0"
        }

    @staticmethod
    def _first_number(href):
        """Return the first run of digits in *href*, or None if there is none."""
        if not href:
            return None
        found = re.search(r"[0-9]\d*", href)
        return found.group(0) if found else None

    def _fetch_left_menu(self):
        """Request the menu with the current headers/params and return it parsed."""
        res = self.session.get(url=self._LEFT_MENU_URL, headers=self.headers, params=self.params)
        res.encoding = res.apparent_encoding
        return BeautifulSoup(res.text, 'lxml')

    # Collect the id of every brand.
    def get_brand_id(self):
        """Fill and return ``self.brand_dict`` (brand id -> brand name).

        Menu entries without a link, or whose link contains no number, are
        skipped rather than aborting the whole run.
        """
        self.params = self._menu_params("0")
        self.get_header()
        soup = self._fetch_left_menu()
        for ul in soup.find_all("ul"):
            for li in ul.find_all("li"):
                link = li.find("a")
                if link is None:
                    continue
                brand_id = self._first_number(link.attrs.get('href'))
                if brand_id is None:
                    continue
                self.brand_dict[brand_id] = link.text
        return self.brand_dict

    def get_AsLeftMenu(self):
        """Add the series found in the menu for the current ``self.params`` to ``self.series_dict``.

        The caller is expected to have set ``self.params`` and called
        :meth:`get_header` first. Each link's href and text are printed.
        """
        soup = self._fetch_left_menu()
        for dd in soup.find_all("dd"):
            for link in dd.find_all("a"):
                a_href = link.attrs.get('href')
                a_text = link.text
                print(a_href)
                print(a_text)
                series_id = self._first_number(a_href)
                if series_id is None:
                    continue
                self.series_dict[series_id] = a_text

    # Collect the series ids of one brand (or of every brand).
    def get_series_id(self):
        """Return ``self.series_dict`` (series id -> name) for the selected brand(s).

        When ``self.brand_name`` is set, only the first brand whose name
        contains it is queried; otherwise every brand is queried. The mapping
        is rebuilt on each call so results of an earlier lookup do not leak
        into this one, and it is returned (possibly empty) even when no brand
        matches.
        """
        self.series_dict = {}
        self.get_brand_id()
        for brand_id, name in self.brand_dict.items():
            if self.brand_name and self.brand_name not in name:
                continue
            self.params = self._menu_params(brand_id)
            self.get_header()
            self.get_AsLeftMenu()
            if self.brand_name:
                # Only the first matching brand is used.
                break
        return self.series_dict

    # Download the configuration page of a series.
    def get_config_content(self, series_id):
        """Return the HTML of the configuration page for *series_id*, decoded as UTF-8."""
        headers = {
            "authority": "car.autohome.com.cn",
            "method": "GET",
            "path": "/config/series/{}.html".format(series_id),
            "scheme": "https",
            "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
            "accept-encoding": "gzip, deflate, br",
            "accept-language": "zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7",
            "cache-control": "no-cache",
            "referer": "https://www.autohome.com.cn/",
            "sec-ch-ua": "Google Chrome;v=87,Not;A Brand;v=99,Chromium;v=87",
            "sec-ch-ua-mobile": "?0",
            "sec-fetch-dest": "document",
            "sec-fetch-mode": "navigate",
            "sec-fetch-site": "same-site",
            "sec-fetch-user": "?1",
            "upgrade-insecure-requests": "1",
            "user-agent": self._USER_AGENT,
        }
        # Certificate verification stays enabled, as for the menu requests to
        # the same host.
        res = self.session.get(r"https://car.autohome.com.cn/config/series/{}.html".format(series_id),
                               headers=headers)
        return res.content.decode("utf-8")

    def car_info(self, html):
        """Return the concatenated ``var config/option/bag = {...};`` statements found in *html*.

        Returns an empty string unless all three statements are present.
        """
        config = re.search("var config = (.*?)};", html)  # vehicle parameters
        option = re.search("var option = (.*?)};", html)  # safety / equipment options
        bag = re.search("var bag = (.*?)};", html)  # optional packages
        if config and option and bag:
            return config.group(0) + option.group(0) + bag.group(0)
        return ""

    @staticmethod
    def _substitute_spans(car_info, rules_text):
        """Replace each ``<span class='X'></span>`` in *car_info* with the quoted
        ``content`` of the ``X::before { content:"..." }`` rule in *rules_text*.

        Spans without a matching rule are left untouched.
        """
        for span in re.findall("<span(.*?)></span>", car_info):
            info = re.search("'(.*?)'", span)
            if not info:
                continue
            class_name = info.group(1)
            rule = re.search(re.escape(class_name) + "::before { content:(.*?)}", rules_text)
            if rule is None:
                continue
            quoted = re.search("\"(.*?)\"", rule.group(1))
            if quoted is None:
                continue
            car_info = car_info.replace("<span class='" + class_name + "'></span>", quoted.group(1))
        return car_info

    def write_html(self, js_list, car_info):
        """Run the page's obfuscation scripts and de-obfuscate *car_info*.

        The scripts in *js_list* are appended to a stub DOM, written to
        ``original.html`` (overwritten on every call) and executed with
        PhantomJS; the CSS rules they insert are then used to fill in the
        placeholder spans of *car_info*.

        Returns the de-obfuscated string, or None when the browser fails or
        produces no rules. The decoding approach is adapted from the articles
        credited by the original author.
        """
        page = (self._HTML_PREFIX + self._DOM_STUB + "".join(js_list)
                + " document.write(rules)</script></body></html>")
        with open("original.html", "w", encoding="utf-8") as f:
            f.write(page)
        # NOTE(review): webdriver.PhantomJS and find_element_by_tag_name belong to
        # the Selenium 3 API - confirm the installed Selenium still provides them.
        driver = None
        try:
            driver = webdriver.PhantomJS(
                executable_path=r"phantomjs.exe")
            driver.get("original.html")
            # Read the rendered body, i.e. the collected CSS rules.
            text = driver.find_element_by_tag_name('body').text
        except Exception as e:
            print(e)
            return None
        finally:
            # quit() also terminates the PhantomJS process; the driver may never
            # have been created if start-up failed.
            if driver is not None:
                driver.quit()
        if not text:
            return None
        return self._substitute_spans(car_info, text)

    @staticmethod
    def _parse_js_var(name, script):
        """Return the JSON value assigned by ``var <name> = ...`` in *script*.

        The value is decoded in place with ``raw_decode`` so that ``;``
        characters inside string values do not truncate it.

        Raises ValueError if the variable is missing or its value is not JSON.
        """
        marker = re.search(r"var {} =\s*".format(re.escape(name)), script)
        if marker is None:
            raise ValueError("variable {!r} not found".format(name))
        value, _ = json.JSONDecoder().raw_decode(script, marker.end())
        return value

    @staticmethod
    def _build_columns(car_info):
        """Return ``{item name: [value per model, ...]}`` from the ``config`` and
        ``option`` variables of *car_info* (insertion-ordered; later duplicates
        of a name replace earlier ones)."""
        config = Car_home_config._parse_js_var("config", car_info)
        option = Car_home_config._parse_js_var("option", car_info)
        items = []
        for group in config['result']['paramtypeitems']:
            items += group['paramitems']
        for group in option['result']['configtypeitems']:
            items += group['configitems']
        columns = {}
        for item in items:
            columns[item['name']] = [entry['value'] for entry in item['valueitems']]
        return columns

    def save(self, car_info, car_name, save_path):
        """Write the configuration in *car_info* to ``<save_path>/<car_name>.xls``.

        Row 0 holds the item names; each following row holds one model.
        The optional-package (``bag``) data is not exported. Characters that
        are invalid in file names are replaced with ``_`` in *car_name*.

        Raises ValueError if the ``config`` or ``option`` data cannot be parsed.
        """
        columns = self._build_columns(car_info)

        workbook = xlwt.Workbook(encoding='ascii')
        worksheet = workbook.add_sheet('汽車之家')
        for col_index, title in enumerate(columns):
            worksheet.write(0, col_index, title)

        # One row per model: normally the length of the model-name column; fall
        # back to the longest column when that key is not present in the data.
        model_names = columns.get('車型名稱')
        if model_names is not None:
            row_count = len(model_names)
        else:
            row_count = max((len(values) for values in columns.values()), default=0)

        for row in range(row_count):
            for col_index, values in enumerate(columns.values()):
                # Columns shorter than the row count are padded with blanks.
                cell = str(values[row]) if row < len(values) else ""
                worksheet.write(row + 1, col_index, cell)

        safe_name = re.sub(r'[\\/:*?"<>|]', "_", car_name)
        workbook.save('{}/{}.xls'.format(save_path, safe_name))

    # Export the configuration; an empty brand_name exports every brand.
    def check(self, brand_name, save_path="./"):
        """Export one workbook per series of *brand_name* into *save_path*.

        A falsy *brand_name* processes all brands. Series whose page carries no
        configuration data, or whose data cannot be de-obfuscated, are skipped.
        """
        self.brand_name = brand_name
        self.get_series_id()
        for series_id, car_name in self.series_dict.items():
            print(series_id, car_name)
            html = self.get_config_content(series_id)
            car_info = self.car_info(html)
            if not car_info:
                # Nothing to decode: avoid launching the browser at all.
                continue
            js_list = re.findall(r'(\(function\([a-zA-Z]{2}.*?_\).*?\(document\);)', html)
            car_info = self.write_html(js_list, car_info)
            if car_info:
                self.save(car_info, car_name, save_path)


# Script entry: export one .xls per series of the brand whose name contains the
# given text; workbooks go to check()'s default save_path ("./").
# NOTE(review): the brand name is substring-matched against the names returned
# by the site - confirm it is written in the same script (Traditional vs.
# Simplified Chinese) as the site's brand list.
car = Car_home_config()
car.check(brand_name="奧迪")

phantomjs.exe下載地址:https://phantomjs.org/download.html

感謝以下作者:
https://www.cnblogs.com/kangz/p/10011348.html
https://www.cnblogs.com/pontoon/p/10459471.html


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯繫本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM