民政局中行政區域數據爬取


'''
    中華人民共和國民政局官網中的行政區域代碼爬取:
        技術點:
            1>進入二級頁面(數據展示頁)時,url發生跳轉(js作用的),需要在二級頁面源碼中找到真實url
            2>數據入庫實時更新:保存url,下次爬取時,先對比url,若相同,不更新,否則更新
'''

import requests
from lxml import etree
import re
import pymysql


class GovementSpider:
    """Scrape administrative-division codes from the Ministry of Civil
    Affairs (MCA) website.

    The listing page links to the data page through a fake URL; the data
    page's HTML carries a JS redirect containing the real URL.  Scraping
    is incremental: the real URL is stored in the ``version`` table and
    data is re-fetched only when that URL changes.
    """

    def __init__(self):
        # Listing page of 2019 administrative-division articles.
        self.one_url = 'http://www.mca.gov.cn/article/sj/xzqh/2019/'
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36'
        }
        # pymysql 1.0+ removed positional connect() arguments; use keywords.
        self.db = pymysql.connect(host='localhost', user='root',
                                  password='123456', database='govermentdb',
                                  charset='utf8')
        self.cursor = self.db.cursor()

    # Extract the second-level (fake) link; it must be the newest one.
    def get_false_link(self):
        """Return the fake second-level link of the newest article whose
        title matches '…以上行政區划代碼', or None when nothing matches."""
        html = requests.get(url=self.one_url, headers=self.headers).content.decode('utf-8', 'ignore')
        parse_html = etree.HTML(html)
        a_list = parse_html.xpath('//a[@class="artitlelist"]')
        for a in a_list:
            title = a.xpath('./@title')[0].strip()
            # Articles are listed newest-first, so the first match is the latest.
            if re.findall(r'.*以上行政區划代碼', title, re.S):
                return 'http://www.mca.gov.cn' + a.get('href')
        return None  # explicit: no matching article found

    # Resolve the real second-level link and scrape it (incrementally).
    def get_true_link(self):
        """Resolve the real data-page URL from the JS redirect, then
        scrape only when it differs from the URL stored in ``version``."""
        false_link = self.get_false_link()
        html = requests.get(url=false_link, headers=self.headers).content.decode('utf-8', 'ignore')
        # The fake page redirects via JS: window.location.href="<real url>".
        # Dots escaped so the pattern matches the literal property path.
        pattern = re.compile(r'window\.location\.href="(.*?)"', re.S)
        real_link = pattern.findall(html)[0]
        print(real_link)

        # Incremental scrape: skip when real_link is already in `version`.
        # Parameterized query avoids SQL injection via the scraped URL.
        sel = 'select * from version where link=%s'
        self.cursor.execute(sel, [real_link])
        # Non-empty result set means the link is already recorded.
        if self.cursor.fetchall():
            print('數據已是最新')
        else:
            # Scrape first, then remember the URL so the next run can skip.
            self.get_data(real_link)
            ins = 'insert into version values(%s)'
            self.cursor.execute(ins, [real_link])
            self.db.commit()

    # Actual data-extraction routine.
    def get_data(self, real_link):
        """Parse the data page and print (name, code) for each region row."""
        html = requests.get(url=real_link, headers=self.headers).text
        parse_html = etree.HTML(html)
        # Data rows are the <tr height="19"> elements of the spreadsheet dump.
        tr_list = parse_html.xpath('//tr[@height="19"]')
        for tr in tr_list:
            code = tr.xpath('./td[2]/text()')[0]
            name = tr.xpath('./td[3]/text()')[0]
            print(name, code)

    # Entry point.
    def main(self):
        # Placeholder: the script drives get_true_link() directly.
        pass


# Driver: main() is currently a no-op placeholder (its body is `pass`);
# the actual incremental scrape happens in get_true_link().
if __name__ == '__main__':
    spider = GovementSpider()
    spider.main()
    spider.get_true_link()

 

'''
    使用selenium+chrome進行爬取,可以避免js對二級頁面鏈接的渲染,爬取更簡單
'''

import time

import pymysql
from selenium import webdriver
from selenium.webdriver.common.by import By


class GovementSpider:
    """Selenium-based scraper for administrative-division codes.

    Driving a real browser avoids having to resolve the JS redirect to
    the data page by hand.  Province/city/county rows are buffered in
    lists so they can be bulk-inserted with executemany().
    """

    def __init__(self):
        self.browser = webdriver.Chrome()
        self.one_url = 'http://www.mca.gov.cn/article/sj/xzqh/2019/'
        # pymysql 1.0+ removed positional connect() arguments; use keywords.
        self.db = pymysql.connect(host='localhost', user='root',
                                  password='123456', db='govdb',
                                  charset='utf8')
        self.cursor = self.db.cursor()
        # Row buffers so all inserts can go through executemany() at once.
        self.province_list = []
        self.city_list = []
        self.county_list = []

    # Open the listing page and extract the second-level link
    # (the fake link suffices; the real one is not needed here).
    def get_false_url(self):
        """Compare the newest data link with the stored version and
        scrape only when it has changed (incremental update)."""
        self.browser.get(self.one_url)
        # Selenium 4 removed find_elements_by_xpath(); use find_elements(By.XPATH, ...).
        td_list = self.browser.find_elements(By.XPATH, '//td[@class="arlisttd"]/a[contains(@title,"代碼")]')
        if td_list:
            # Keep the element object itself because we may need to click() it.
            two_url_element = td_list[0]
            # Incremental check: compare the link against the `version` table.
            two_url = two_url_element.get_attribute('href')
            sel = 'select * from version where link=%s'
            self.cursor.execute(sel, [two_url])
            result = self.cursor.fetchall()
            if len(result) != 0:
                print('數據已最新,無需爬取')
            else:
                two_url_element.click()
                time.sleep(3)
                # The click opens a new tab; switch the driver to it.
                # switch_to_window() was removed; use switch_to.window().
                all_handles = self.browser.window_handles
                self.browser.switch_to.window(all_handles[1])
                # Scrape, then record two_url so the next run can skip.
                self.get_data()
                ins = 'insert into version values(%s)'
                self.cursor.execute(ins, [two_url])
                self.db.commit()

    # Extract administrative-division codes from the data page.
    def get_data(self):
        """Read (code, name) from every data row and bucket each record
        into province/city/county by the structure of its 6-digit code."""
        tr_list = self.browser.find_elements(By.XPATH, '//tr[@height="19"]')
        for tr in tr_list:
            code = tr.find_element(By.XPATH, './td[2]').text.strip()
            name = tr.find_element(By.XPATH, './td[3]').text.strip()
            print(name, code)
            if code[-4:] == '0000':
                # XX0000: province; the four municipalities also act as
                # their own city (parented by themselves).
                self.province_list.append([name, code])
                if name in ['北京市', '天津市', '上海市', '重慶市']:
                    self.city_list.append([name, code, code[:2] + '0000'])
            elif code[-2:] == '00':
                # XXXX00: city, parented by province XX0000.
                self.city_list.append([name, code, code[:2] + '0000'])
            else:
                # Anything else: county, parented by city XXXX00.
                self.county_list.append([name, code, code[:4] + '00'])
        # All rows buffered; flush them in one executemany() pass.
        self.insert_mysql()

    def insert_mysql(self):
        """Replace the province/city/county tables with the freshly
        scraped rows, committing once at the end."""
        # Full refresh: clear old rows before inserting the new snapshot.
        self.cursor.execute('delete from province')
        self.cursor.execute('delete from city')
        self.cursor.execute('delete from county')
        self.cursor.executemany('insert into province values(%s,%s)', self.province_list)
        self.cursor.executemany('insert into city values(%s,%s,%s)', self.city_list)
        self.cursor.executemany('insert into county values(%s,%s,%s)', self.county_list)
        self.db.commit()
        print('數據抓取完成,成功存入數據庫')

    def main(self):
        """Run the scrape, then release DB and browser resources."""
        try:
            self.get_false_url()
        finally:
            # Always clean up, even when scraping raised — otherwise the
            # MySQL connection and the Chrome process leak.
            self.cursor.close()
            self.db.close()
            self.browser.quit()


# Driver: main() runs the incremental scrape and then closes the DB
# connection and the browser (see main() at the end of the class).
if __name__ == "__main__":
    spider = GovementSpider()
    spider.main()

 


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯系本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM