Target site: https://www.shujukuji.cn/guominjingjihangyefenlei/xiaolei-liebiao
1. Data: (the file contains flagged terms, so the direct upload failed)
Baidu Netdisk:
Link: https://pan.baidu.com/s/1vCUuCOEEQb786afZVhS8SQ
Extraction code: nsi4
Data screenshot: (screenshot not reproduced here)

2. Code analysis (scraping the small categories, xiaolei)
Inspecting the listing pages reveals the relationship between a category code and its detail link: https://www.shujukuji.cn/guominjingjihangyefenlei/xiaolei/... (the code is appended to the path). The scrape therefore runs in two steps:
① First scrape the codes from the <a> tags on the listing pages.
② Build each detail URL from its code and scrape the industry name, as sketched below.
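A minimal sketch of these two steps (the XPath expressions are borrowed from the full script in section 4; treat this as an outline, not the full scraper):

import requests
from lxml import etree

headers = {'User-Agent': 'Mozilla/5.0'}

# Step 1: pull the small-category codes out of the <a> tags on one listing page
listing = 'https://www.shujukuji.cn/guominjingjihangyefenlei/xiaolei-liebiao?page=0'
page = etree.HTML(requests.get(listing, headers=headers).text)
codes = page.xpath('//*[@id="block-system-main"]/div/div/div[2]/table/tbody/tr/td/div/span/a/text()')

# Step 2: build each detail URL from its code and read the industry name
for code in codes[:3]:  # first three codes, just to demonstrate
    url = 'https://www.shujukuji.cn/guominjingjihangyefenlei/xiaolei/{}'.format(code)
    detail = etree.HTML(requests.get(url, headers=headers).text)
    name = detail.xpath('//*[@id="block-system-main"]/div/div/div[2]/div/div[2]/span[2]/text()')[0]
    print(code, name)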

3. Note: Python's rules for reading and writing txt files, and the file-pointer issue

When reading and writing a txt file in Python, a common surprise is this: open the file, first read its contents with read(), then write with write(). Even though the file was opened in "r+" mode, which should overwrite from the start, the new text is appended instead.
The reason is that after read() the file pointer sits at the end of the text, and write() starts writing at the pointer, which produces the append effect.
To truly overwrite, first call seek(0) to move the pointer back to the start, then truncate() to clear the old content; writing then starts fresh from the beginning.
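A minimal, self-contained sketch of this behavior (using a throwaway file demo.txt):

# Set up a file with some existing content
with open('demo.txt', 'w', encoding='utf-8') as f:
    f.write('old content')

# "r+" plus read() first: write() lands at the end -> append, not overwrite
with open('demo.txt', 'r+', encoding='utf-8') as f:
    f.read()                 # the pointer is now at the end of the file
    f.write(' + appended')   # writes at the pointer, so the text is appended

# seek(0) + truncate() resets the pointer and clears the file -> true overwrite
with open('demo.txt', 'r+', encoding='utf-8') as f:
    f.read()
    f.seek(0)                # move the pointer back to the start
    f.truncate()             # discard everything after the pointer
    f.write('new content')

with open('demo.txt', encoding='utf-8') as f:
    print(f.read())          # -> new content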
4. Source code (scraping the small categories)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @File  : 國民經濟行業分類及其代碼.py
# @Author: 田智凱
# @Date  : 2020/3/19
# @Desc  : Scrape China's National Economic Industry Classification
from multiprocessing.pool import Pool

import requests
from lxml import etree
import time

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36'
}
txtpath = r'D:\hy.txt'

# Scrape the listing table to get the small-category codes
def get_codes(url):
    res = requests.get(url, headers=headers)
    selector = etree.HTML(res.text)
    tb = selector.xpath('//*[@id="block-system-main"]/div/div/div[2]/table/tbody/tr')
    # Append mode: the existing file content is kept
    with open(txtpath, "a", encoding='utf-8') as txt:
        for tr in tb:
            for td in tr.xpath('td'):
                code = td.xpath('div/span/a/text()')[0]
                hy = get_hangye(code)
                txt.write(hy)
                txt.write('\t')
                txt.write(code)
                txt.write('\n')

# Look up the industry name for a given code
def get_hangye(code):
    url = 'https://www.shujukuji.cn/guominjingjihangyefenlei/xiaolei/{}'.format(code)
    res = requests.get(url, headers=headers)
    selector = etree.HTML(res.text)
    hy = selector.xpath('//*[@id="block-system-main"]/div/div/div[2]/div/div[2]/span[2]/text()')[0]
    return hy

if __name__ == '__main__':
    urls = ['https://www.shujukuji.cn/guominjingjihangyefenlei/xiaolei-liebiao?page={}'.format(p)
            for p in range(0, 15)]
    start = time.time()
    print('Running...')
    pool = Pool(processes=4)  # four worker processes
    pool.map(get_codes, urls)
    end = time.time()
    print('Elapsed:', end - start)
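Two practical notes on this script: get_codes opens the output file in append mode ("a"), so rerunning it adds duplicate rows; following the note in section 3, clear D:\hy.txt (or write it once in "w" mode) before a fresh run. And since four processes append to the same file concurrently, the order of rows across pages is not deterministic.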
Also attached:
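The first script walks the large-category (dalei) pages 01 through 97 and writes each category's name and code: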
import requests
from lxml import etree

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36'
}
txtpath = r'D:\hy2.txt'

def get_data(url):
    res = requests.get(url, headers=headers)
    selector = etree.HTML(res.text)
    # The category name and code sit in two labelled <span> blocks on the page
    hy = selector.xpath('//*[@id="block-system-main"]/div/div/div[2]/div/div[1]/span[2]/a/text()')[0]
    code = selector.xpath('//*[@id="block-system-main"]/div/div/div[2]/div/div[2]/span[2]/text()')[0]
    # Append mode: the existing file content is kept
    with open(txtpath, "a", encoding='utf-8') as txt:
        txt.write(hy)
        txt.write('\t')
        txt.write(code)
        txt.write('\n')

if __name__ == '__main__':
    for i in range(1, 98):
        # dalei codes are zero-padded two-digit strings: 01, 02, ..., 97
        url = 'https://www.shujukuji.cn/guominjingjihangyefenlei/dalei/{:02d}'.format(i)
        get_data(url)
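The second script visits the same dalei pages, but instead reads the table of sub-categories listed on each page (presumably the medium categories under each large category, judging from the XPath):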
import requests
from lxml import etree

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36'
}
txtpath = r'D:\hy3.txt'

def get_data(url):
    res = requests.get(url, headers=headers)
    selector = etree.HTML(res.text)
    # Each table row lists one sub-category: code in td[1], name in td[2]
    infos = selector.xpath('//*[@id="block-system-main"]/div/div/div[3]/div[1]/div/table/tbody/tr')
    for info in infos:
        code = info.xpath('td[1]/a/text()')[0].strip()
        hy = info.xpath('td[2]/text()')[0].strip()
        # Append mode: the existing file content is kept
        with open(txtpath, "a", encoding='utf-8') as txt:
            txt.write(hy)
            txt.write('\t')
            txt.write(code)
            txt.write('\n')

if __name__ == '__main__':
    for i in range(1, 98):
        # dalei codes are zero-padded two-digit strings: 01, 02, ..., 97
        url = 'https://www.shujukuji.cn/guominjingjihangyefenlei/dalei/{:02d}'.format(i)
        get_data(url)
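The third script scrapes the top-level classification page and records, for each section (menlei), its code, its name, and the range of large categories it contains, parsed from text such as 本門類包括01~05大類: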
import requests
from lxml import etree
import re

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36'
}
txtpath = r'D:\hy.txt'

def get_data(url):
    res = requests.get(url, headers=headers)
    selector = etree.HTML(res.text)
    infos = selector.xpath('//*[@id="node-3"]/div/div/div/div/fieldset/div/div/table/tbody/tr')
    for info in infos:
        try:
            code = info.xpath('td[1]/a/text()')[0].strip()
            if code:
                hy = info.xpath('td[3]/text()')[0].strip()
                content = info.xpath('td[4]/text()')[0].strip()
                # Pick the numbers out of strings like 本門類包括01~05大類
                nums = re.findall(r'\d+', content)
                first, last = int(nums[0]), int(nums[1])
                # Build the child list as zero-padded codes: 01;02;...;05
                # (the original childs.lstrip(';0') also stripped the first
                # code's leading zero, e.g. ';01;02' became '1;02')
                childs = ';'.join('{:02d}'.format(i) for i in range(first, last + 1))
                # Append mode: the existing file content is kept
                with open(txtpath, "a", encoding='utf-8') as txt:
                    txt.write(code)
                    txt.write('\t')
                    txt.write(hy)
                    txt.write('\t')
                    txt.write(childs)
                    txt.write('\n')
        except IndexError:
            # Rows without a code link (e.g. header rows) are skipped
            pass

if __name__ == '__main__':
    get_data('https://www.shujukuji.cn/guominjingjihangyefenlei')
