python爬取疫情數據詳解


 

 

 

 

 

 

首先逐步分析每行代碼的意思:

這是要引入的東西:

from os import path
import requests
from bs4 import BeautifulSoup
import json
import pymysql
import numpy as np
import time

輸入請求地址:

# Target page: Dingxiangyuan's COVID-19 dashboard (stats embedded in the HTML).
url = 'https://ncov.dxy.cn/ncovh5/view/pneumonia?from=timeline&isappinstalled=0'

為了防止被反爬蟲(偽裝成瀏覽器):

# Present a browser-like User-Agent so the site does not reject the
# request as coming from a bot.
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'}
# Issue the HTTP GET for the dashboard page.
response = requests.get(url, headers=headers)

頁面輸出顯示信息:

# Decode the raw response bytes into a UTF-8 string of page source.
content = response.content.decode('utf-8')

進行解析:

soup = BeautifulSoup(content, 'html.parser')#指定Beautiful的解析器為“html.parser”

之后就是對於數組的處理:

# find() returns only the first matching tag, while find_all() returns a
# list of every match; the payloads here sit inside <script> tags.
listA = soup.find_all('script', id='getAreaStat')
# World-wide confirmed figures.
listB = soup.find_all('script', id='getListByCountryTypeService2')
account = str(listA)  # stringify the (one-element) result list
# Slice off the fixed JS wrapper: 52 leading chars and 21 trailing chars,
# leaving just the JSON array text.
messages = account[52:-21]

轉換類型:

messages_json = json.loads(messages)#json.loads 用於解碼 JSON 數據。該函數返回 Python 字段的數據類型。

之后就是將相關的數據傳入list,然后對數據庫進行操作了。

具體的代碼如下:

from os import path
import requests
from bs4 import BeautifulSoup
import json
import pymysql
import numpy as np
import time

# Target page: Dingxiangyuan's COVID-19 dashboard; the statistics are
# embedded in the HTML as JSON inside <script> tags.
url = 'https://ncov.dxy.cn/ncovh5/view/pneumonia?from=timeline&isappinstalled=0'
# Browser-like User-Agent so the site does not reject us as a bot.
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'}


def fetch_area_json():
    """Download the dashboard page and return the per-province stats.

    Returns the decoded JSON payload: a list of province dicts, each with
    counts plus a nested 'cities' list.
    """
    response = requests.get(url, headers=headers)
    # Fail fast on HTTP errors instead of slicing/parsing an error page.
    response.raise_for_status()
    content = response.content.decode('utf-8')
    soup = BeautifulSoup(content, 'html.parser')
    # The payload lives in:
    #   <script id="getAreaStat">try { window.getAreaStat = [...] }catch(e){}</script>
    script_tags = soup.find_all(name='script', attrs={"id": "getAreaStat"})
    account = str(script_tags)
    # Strip the fixed JS wrapper: 52 leading chars, 21 trailing chars.
    # NOTE(review): brittle — breaks if the site changes the wrapper text.
    messages = account[52:-21]
    return json.loads(messages)


def build_rows(messages_json):
    """Flatten the JSON into (province_rows, city_rows) for table info3.

    Row layout: (id, timestamp, province_short_name, city_name,
    confirmed, suspected, cured, dead, location_id).
    Province rows carry city_name=None; city ids continue after the last
    province id so the id sequence stays unique across both lists.
    """
    # One timestamp per run so all rows of a scrape share the same snapshot
    # time (the original recomputed it per row, which could straddle a second).
    now = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
    province_rows = []
    city_rows = []
    next_id = len(messages_json)  # city ids start after the province block
    for idx, province in enumerate(messages_json, start=1):
        short_name = province.get('provinceShortName')
        province_rows.append((
            idx, now, short_name, None,
            province.get('confirmedCount'), province.get('suspectedCount'),
            province.get('curedCount'), province.get('deadCount'),
            province.get('locationId'),
        ))
        # 'cities' may be absent or None for some provinces; treat as empty.
        for city in province.get('cities') or []:
            next_id += 1
            city_rows.append((
                next_id, now, short_name, city.get('cityName'),
                city.get('confirmedCount'), city.get('suspectedCount'),
                city.get('curedCount'), city.get('deadCount'),
                city.get('locationId'),
            ))
    return province_rows, city_rows


def _run_step(db, cursor, label, statement, rows=None):
    """Execute one SQL step and commit; on DB error, report and roll back.

    Each step commits independently so a failing insert does not undo the
    earlier truncate (same recovery behavior as the original three
    try/except blocks, but without a bare ``except:`` that hides the cause).
    """
    try:
        if rows is None:
            cursor.execute(statement)
        else:
            cursor.executemany(statement, rows)
        db.commit()
    except pymysql.Error as exc:  # narrow catch: DB errors only, and log them
        print('step "%s" failed, rolling back: %s' % (label, exc))
        db.rollback()


def save_rows(province_rows, city_rows):
    """TRUNCATE info3 and insert the fresh province and city rows."""
    # NOTE(review): hard-coded placeholder credentials — move to config or
    # environment variables before real use.
    db = pymysql.connect(host='localhost', user='root', password='密碼',
                         database='ceshi1', charset='utf8')
    try:
        cursor = db.cursor()
        sql_clean_province = "TRUNCATE TABLE info3"
        sql = "INSERT INTO info3 values (%s,%s,%s,%s,%s,%s,%s,%s,%s) "
        _run_step(db, cursor, 'truncate', sql_clean_province)
        _run_step(db, cursor, 'province insert', sql, province_rows)
        _run_step(db, cursor, 'city insert', sql, city_rows)
    finally:
        db.close()  # always release the connection, even if a step raises


if __name__ == '__main__':
    data = fetch_area_json()
    provinces, cities = build_rows(data)
    save_rows(provinces, cities)

 


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯系本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM