距上次爬取已經過去1年多了,舊代碼已不適用於新網站;另外,上次爬取的詳情頁沒有多大作用,這次只要取得“藥品經營企業名稱”就可以了。
上次是按ID的流水號逐條爬取,這次改為按頁碼的流水號逐頁爬取。
核心的目錄頁URL如下(自己找了2個小時沒有找到,最後是參考網上其他頁面組合出來的):
http://app1.nmpa.gov.cn/datasearchcnda/face3/search.jsp?tableId=41&curstart=1
# -*- coding: UTF-8 -*-
"""Crawl one text cell per listing row from the NMPA search pages into a DB.

The script prompts for a first and a last page number, loads each listing
page in Firefox through Selenium, reads the cell of every listing row and
inserts its text into the ``nmpa2020`` table (column ``qymc``) through the
project's ``dbConnect.ms`` connection object.
"""
import os
import random
import sys
import time

from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait

# Database connection helper and project logger.
import dbConnect
from lrlog import logger

# Listing URL; the page number is appended as the ``curstart`` value.
LIST_URL = 'http://app1.nmpa.gov.cn/datasearchcnda/face3/search.jsp?tableId=41&curstart='
# Maximum number of rows read from a single listing page.
ROWS_PER_PAGE = 15
# Seconds to wait for a page load before giving up on ``driver.get``.
PAGE_LOAD_TIMEOUT = 10


def _row_xpath(row):
    """Return the XPath of the cell holding 1-based listing row *row*.

    Rows are read from every other ``<tr>`` (1, 3, 5, ...) of the second
    table on the page.
    """
    return "/html/body/table[2]/tbody/tr[" + str(row * 2 - 1) + "]/td"


def _insert_sql(qymc):
    """Build the INSERT statement that stores one scraped value.

    Single quotes are doubled so that scraped text containing ``'`` cannot
    end the SQL string literal early (broken statement / SQL injection).
    NOTE(review): if ``ExecNonQuery`` supports bound parameters, prefer
    those over string building - its signature is not visible here.
    """
    return "insert nmpa2020 (qymc) values('" + qymc.replace("'", "''") + "')"


def _scrape_page(driver, conn, page):
    """Read up to ROWS_PER_PAGE rows from the loaded page and insert them.

    Stops at the first row that cannot be read. When even the first row is
    missing, the page is assumed not to have opened and a warning is logged.
    The page is not retried, so a permanently empty page cannot cause an
    endless loop.
    """
    for row in range(1, ROWS_PER_PAGE + 1):
        try:
            qymc = driver.find_element(By.XPATH, _row_xpath(row)).text
        except WebDriverException:
            # Covers NoSuchElementException (short or empty page) as well
            # as other driver failures.
            if row == 1:
                logger.warning('打開頁面出錯!' + str(page))
            break

        try:
            conn.ExecNonQuery(_insert_sql(qymc))
        except Exception as err:
            # Best effort, as before: give up on the rest of this page, but
            # leave a trace instead of swallowing the error silently.
            logger.warning('insert failed on page ' + str(page) + ': ' + str(err))
            break

        print('第', page, '行', qymc)


def _crawl(driver, conn, first_page, last_page):
    """Visit every page from *first_page* to *last_page* inclusive."""
    driver.set_page_load_timeout(PAGE_LOAD_TIMEOUT)

    # Starts at 1 and is bumped after each page, so the first long pause
    # comes after the 9th page and then after every 10th page.
    counter = 1
    for page in range(first_page, last_page + 1):
        try:
            driver.get(LIST_URL + str(page))
        except TimeoutException:
            # Whatever has loaded so far is still scraped below.
            print(str(page) + "driver.get(url) Time out !")

        # Random pause between requests.
        time.sleep(random.randint(5, 20))

        _scrape_page(driver, conn, page)

        counter += 1
        if counter % 10 == 0:
            time.sleep(20)


def main():
    """Prompt for the page range, run the crawl, and release resources."""
    # Ask for the first and last page number.
    first_page = int(input("please input begin id:"))
    last_page = int(input("Please input the end id:"))
    if first_page > last_page:
        print('輸入錯誤!')
        sys.exit(1)

    # Database connection object provided by the project helper module.
    conn = dbConnect.ms

    # The text file is never written to; it is still opened in append mode
    # (which creates it when missing), but is now closed on exit.
    with open('nmpa20201.txt', 'a'):
        driver = webdriver.Firefox()
        try:
            _crawl(driver, conn, first_page, last_page)
        finally:
            # Always shut the browser down, even when the crawl fails.
            driver.quit()

    print('end!')


if __name__ == "__main__":
    main()
