丁香園頁面中的數據是由 JS 動態加載出來的，因此需要用正則表達式去截取數據，最後轉成 JSON 或其他數據形式再進行處理。
爬取代碼:
"""Scrape the DXY COVID-19 page and persist the extracted figures.

The page embeds its data as JavaScript assignments of the form
``window.<name> = <JSON>``.  This script pulls out two of them:

* ``getAreaStat`` is dumped to ``yiqing.json``.
* ``getStatisticsService`` supplies six counters that are written to row
  ``id = 1`` of the ``yiqingcount`` MySQL table.
"""
import json
import pprint  # not used by the script itself; kept for ad-hoc debugging
import re

import pymysql
import requests

DXY_URL = (
    'https://ncov.dxy.cn/ncovh5/view/pneumonia'
    '?scene=2&clicktime=1579583352&enterid=1579583352'
    '&from=timeline&isappinstalled=0'
)

# Counters read from ``getStatisticsService``; the order matches the
# placeholders in UPDATE_SQL.
COUNT_FIELDS = (
    'currentConfirmedCount',
    'suspectedCount',
    'seriousCount',
    'confirmedCount',
    'deadCount',
    'curedCount',
)

UPDATE_SQL = "update yiqingcount set currentConfirmedCount=%s,suspectedCount=%s,seriousCount=%s,confirmedCount=%s,deadCount=%s,curedCount=%s where id = 1"


def write_to_file(item):
    """Serialize *item* as pretty-printed UTF-8 JSON into ``yiqing.json``.

    The file is overwritten on every call.
    """
    # The ``with`` block closes the file; no explicit close() is needed.
    with open('yiqing.json', 'w', encoding='utf-8') as f:
        json.dump(item, f, indent=4, ensure_ascii=False)


def mysql_():
    """Open a MySQL connection and return ``(connection, cursor)``.

    The caller is responsible for closing both objects.
    """
    # NOTE(review): credentials are hard-coded; consider loading them from
    # the environment or a config file that is not committed.
    conn = pymysql.connect(
        host='127.0.0.1',
        user='root',
        passwd='yuanpeng0',
        db='test',
        port=3306,
        charset='utf8',
        cursorclass=pymysql.cursors.DictCursor,
    )
    return conn, conn.cursor()


def _extract_js_value(page, name):
    """Return the JSON value assigned to ``window.<name>`` inside *page*.

    Raises:
        ValueError: if the assignment is not present in *page*, or the text
            that follows it is not valid JSON (``json.JSONDecodeError`` is a
            ``ValueError`` subclass).
    """
    match = re.search(r'window\.' + re.escape(name) + r'\s*=\s*', page)
    if match is None:
        raise ValueError(
            'window.%s was not found in the page; the page layout may have changed' % name
        )
    # raw_decode parses exactly one JSON value starting at the given index
    # and ignores whatever follows it, so the parse does not depend on the
    # trailing "}catch" wrapper or on marker text inside the payload.
    value, _end = json.JSONDecoder().raw_decode(page, match.end())
    return value


def main():
    """Fetch the page, save the area data, and update the counters row."""
    # A timeout keeps the script from hanging forever on a stalled server.
    response = requests.get(DXY_URL, timeout=30)
    response.raise_for_status()
    page = response.content.decode('utf-8')

    write_to_file(_extract_js_value(page, 'getAreaStat'))

    stats = _extract_js_value(page, 'getStatisticsService')
    row = tuple(stats[field] for field in COUNT_FIELDS)
    print([row])

    conn, cur = mysql_()
    try:
        cur.execute(UPDATE_SQL, row)
        conn.commit()
    finally:
        # Always release the cursor and connection, even if the update fails.
        cur.close()
        conn.close()


if __name__ == '__main__':
    main()
爬取數據形式如下: