Target site: https://www.shujukuji.cn/guominjingjihangyefenlei/xiaolei-liebiao
1. Data: (the file contains flagged terms, so the direct upload failed)
Baidu Netdisk:
Link: https://pan.baidu.com/s/1vCUuCOEEQb786afZVhS8SQ
Extraction code: nsi4
Data screenshot: (screenshot not reproduced here)

2. Code analysis (scraping the small categories, xiaolei)
Inspecting the listing pages reveals the relationship between a category code and its detail link: https://www.shujukuji.cn/guominjingjihangyefenlei/xiaolei/... (the code is appended to the path). The scrape therefore runs in two steps:
① First scrape the codes from the <a> tags on the listing pages.
② Build each detail URL from its code and scrape the industry name, as sketched below.
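A minimal sketch of these two steps (the XPath expressions are borrowed from the full script in section 4; treat this as an outline, not the full scraper):

import requests
from lxml import etree

headers = {'User-Agent': 'Mozilla/5.0'}

# Step 1: pull the small-category codes out of the <a> tags on one listing page
listing = 'https://www.shujukuji.cn/guominjingjihangyefenlei/xiaolei-liebiao?page=0'
page = etree.HTML(requests.get(listing, headers=headers).text)
codes = page.xpath('//*[@id="block-system-main"]/div/div/div[2]/table/tbody/tr/td/div/span/a/text()')

# Step 2: build each detail URL from its code and read the industry name
for code in codes[:3]:  # first three codes, just to demonstrate
    url = 'https://www.shujukuji.cn/guominjingjihangyefenlei/xiaolei/{}'.format(code)
    detail = etree.HTML(requests.get(url, headers=headers).text)
    name = detail.xpath('//*[@id="block-system-main"]/div/div/div[2]/div/div[2]/span[2]/text()')[0]
    print(code, name)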

3. Note: Python's rules for reading and writing txt files, and the file-pointer issue

When reading and writing a txt file in Python, a common surprise is this: open the file, first read its contents with read(), then write with write(). Even though the file was opened in "r+" mode, which should overwrite from the start, the new text is appended instead.
The reason is that after read() the file pointer sits at the end of the text, and write() starts writing at the pointer, which produces the append effect.
To truly overwrite, first call seek(0) to move the pointer back to the start, then truncate() to clear the old content; writing then starts fresh from the beginning.
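A minimal, self-contained sketch of this behavior (using a throwaway file demo.txt):

# Set up a file with some existing content
with open('demo.txt', 'w', encoding='utf-8') as f:
    f.write('old content')

# "r+" plus read() first: write() lands at the end -> append, not overwrite
with open('demo.txt', 'r+', encoding='utf-8') as f:
    f.read()                 # the pointer is now at the end of the file
    f.write(' + appended')   # writes at the pointer, so the text is appended

# seek(0) + truncate() resets the pointer and clears the file -> true overwrite
with open('demo.txt', 'r+', encoding='utf-8') as f:
    f.read()
    f.seek(0)                # move the pointer back to the start
    f.truncate()             # discard everything after the pointer
    f.write('new content')

with open('demo.txt', encoding='utf-8') as f:
    print(f.read())          # -> new content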
4. Source code (scraping the small categories)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @File  : 國民經濟行業分類及其代碼.py
# @Author: 田智凱
# @Date  : 2020/3/19
# @Desc  : Scrape China's National Economic Industry Classification
from multiprocessing.pool import Pool

import requests
from lxml import etree
import time

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36'
}
txtpath = r'D:\hy.txt'

# Scrape the listing table to get the small-category codes
def get_codes(url):
    res = requests.get(url, headers=headers)
    selector = etree.HTML(res.text)
    tb = selector.xpath('//*[@id="block-system-main"]/div/div/div[2]/table/tbody/tr')
    # Append mode: the existing file content is kept
    with open(txtpath, "a", encoding='utf-8') as txt:
        for tr in tb:
            for td in tr.xpath('td'):
                code = td.xpath('div/span/a/text()')[0]
                hy = get_hangye(code)
                txt.write(hy)
                txt.write('\t')
                txt.write(code)
                txt.write('\n')

# Look up the industry name for a given code
def get_hangye(code):
    url = 'https://www.shujukuji.cn/guominjingjihangyefenlei/xiaolei/{}'.format(code)
    res = requests.get(url, headers=headers)
    selector = etree.HTML(res.text)
    hy = selector.xpath('//*[@id="block-system-main"]/div/div/div[2]/div/div[2]/span[2]/text()')[0]
    return hy

if __name__ == '__main__':
    urls = ['https://www.shujukuji.cn/guominjingjihangyefenlei/xiaolei-liebiao?page={}'.format(p)
            for p in range(0, 15)]
    start = time.time()
    print('Running...')
    pool = Pool(processes=4)  # four worker processes
    pool.map(get_codes, urls)
    end = time.time()
    print('Elapsed:', end - start)
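Two practical notes on this script: get_codes opens the output file in append mode ("a"), so rerunning it adds duplicate rows; following the note in section 3, clear D:\hy.txt (or write it once in "w" mode) before a fresh run. And since four processes append to the same file concurrently, the order of rows across pages is not deterministic.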
Also attached:
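The first script walks the large-category (dalei) pages 01 through 97 and writes each category's name and code: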
import requests
from lxml import etree

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36'
}
txtpath = r'D:\hy2.txt'

def get_data(url):
    res = requests.get(url, headers=headers)
    selector = etree.HTML(res.text)
    # The category name and code sit in two labelled <span> blocks on the page
    hy = selector.xpath('//*[@id="block-system-main"]/div/div/div[2]/div/div[1]/span[2]/a/text()')[0]
    code = selector.xpath('//*[@id="block-system-main"]/div/div/div[2]/div/div[2]/span[2]/text()')[0]
    # Append mode: the existing file content is kept
    with open(txtpath, "a", encoding='utf-8') as txt:
        txt.write(hy)
        txt.write('\t')
        txt.write(code)
        txt.write('\n')

if __name__ == '__main__':
    for i in range(1, 98):
        # dalei codes are zero-padded two-digit strings: 01, 02, ..., 97
        url = 'https://www.shujukuji.cn/guominjingjihangyefenlei/dalei/{:02d}'.format(i)
        get_data(url)
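The second script visits the same dalei pages, but instead reads the table of sub-categories listed on each page (presumably the medium categories under each large category, judging from the XPath):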
import requests
from lxml import etree

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36'
}
txtpath = r'D:\hy3.txt'

def get_data(url):
    res = requests.get(url, headers=headers)
    selector = etree.HTML(res.text)
    # Each table row lists one sub-category: code in td[1], name in td[2]
    infos = selector.xpath('//*[@id="block-system-main"]/div/div/div[3]/div[1]/div/table/tbody/tr')
    for info in infos:
        code = info.xpath('td[1]/a/text()')[0].strip()
        hy = info.xpath('td[2]/text()')[0].strip()
        # Append mode: the existing file content is kept
        with open(txtpath, "a", encoding='utf-8') as txt:
            txt.write(hy)
            txt.write('\t')
            txt.write(code)
            txt.write('\n')

if __name__ == '__main__':
    for i in range(1, 98):
        # dalei codes are zero-padded two-digit strings: 01, 02, ..., 97
        url = 'https://www.shujukuji.cn/guominjingjihangyefenlei/dalei/{:02d}'.format(i)
        get_data(url)
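The third script scrapes the top-level classification page and records, for each section (menlei), its code, its name, and the range of large categories it contains, parsed from text such as 本門類包括01~05大類: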
import requests
from lxml import etree
import re

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36'
}
txtpath = r'D:\hy.txt'

def get_data(url):
    res = requests.get(url, headers=headers)
    selector = etree.HTML(res.text)
    infos = selector.xpath('//*[@id="node-3"]/div/div/div/div/fieldset/div/div/table/tbody/tr')
    for info in infos:
        try:
            code = info.xpath('td[1]/a/text()')[0].strip()
            if code:
                hy = info.xpath('td[3]/text()')[0].strip()
                content = info.xpath('td[4]/text()')[0].strip()
                # Pick the numbers out of strings like 本門類包括01~05大類
                nums = re.findall(r'\d+', content)
                first, last = int(nums[0]), int(nums[1])
                # Build the child list as zero-padded codes: 01;02;...;05
                # (the original childs.lstrip(';0') also stripped the first
                # code's leading zero, e.g. ';01;02' became '1;02')
                childs = ';'.join('{:02d}'.format(i) for i in range(first, last + 1))
                # Append mode: the existing file content is kept
                with open(txtpath, "a", encoding='utf-8') as txt:
                    txt.write(code)
                    txt.write('\t')
                    txt.write(hy)
                    txt.write('\t')
                    txt.write(childs)
                    txt.write('\n')
        except IndexError:
            # Rows without a code link (e.g. header rows) are skipped
            pass

if __name__ == '__main__':
    get_data('https://www.shujukuji.cn/guominjingjihangyefenlei')
