距上次爬取已经过去一年多了，旧代码不再适用于新网站；另外，上次爬取的详情页没有多大作用，这次只要取得“药品经营企业名称”就可以了。
上次是按 ID 的流水号逐条爬取，这次改为按页码的流水号逐页爬取。
核心的目录 URL 的获取（自己找了 2 个小时没有找到，最后是参考网上其它页面组合出来的）：
http://app1.nmpa.gov.cn/datasearchcnda/face3/search.jsp?tableId=41&curstart=1
# -*- coding: UTF-8 -*-
"""Crawl the NMPA listing (tableId=41) page by page and store each entry name.

Every listing page is opened in Firefox through Selenium, the text of each
entry row is read, and one row per entry is inserted into the ``nmpa2020``
table (column ``qymc``) through ``dbConnect.ms``.  The first and last page
numbers are read from standard input.
"""
import os
import random
import sys
import time

from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait

# database connection
import dbConnect
from lrlog import logger

# Listing URL; the page number is appended as the ``curstart`` value.
LIST_URL = 'http://app1.nmpa.gov.cn/datasearchcnda/face3/search.jsp?tableId=41&curstart='
# Seconds to wait for a page before Selenium raises TimeoutException.
PAGE_LOAD_TIMEOUT = 10
# Maximum number of entries read from one listing page.
ROWS_PER_PAGE = 15


def _insert_statement(name):
    """Return the INSERT statement that stores *name* in ``nmpa2020.qymc``."""
    # Double embedded single quotes so a name containing ' cannot end the
    # SQL string literal early (which would make the statement fail).
    return "insert nmpa2020 (qymc) values('" + name.replace("'", "''") + "')"


def _load_page(driver, page):
    """Open listing page *page*; a load timeout is reported, not fatal."""
    try:
        driver.get(LIST_URL + str(page))
    except TimeoutException:
        # Whatever part of the page did load is still scraped by the caller.
        print(str(page) + "driver.get(url) Time out !")


def _store_page_rows(driver, conn, page):
    """Insert the entries of the currently loaded page.

    Returns the number of entry rows that could be read from the page.
    """
    rows_read = 0
    # Only the odd <tr> indices (1, 3, ..., 29) are read.
    # NOTE(review): presumably the even rows are separators -- confirm
    # against the live page markup.
    for tr_index in range(1, 2 * ROWS_PER_PAGE, 2):
        xpath = "/html/body/table[2]/tbody/tr[" + str(tr_index) + "]/td"
        try:
            qymc = driver.find_element(By.XPATH, xpath).text
        except WebDriverException:
            # No such row (short last page, or the page never loaded):
            # nothing more to read here.
            break
        rows_read += 1
        try:
            conn.ExecNonQuery(_insert_statement(qymc))
        except Exception as exc:
            # One failed insert must not drop the remaining rows of the page.
            logger.warning('insert failed on page ' + str(page) + ': ' + str(exc))
            continue
        print('第', page, '行', qymc)
    return rows_read


def main():
    """Prompt for a page range, crawl it and store every entry name."""
    # Legacy output file: nothing is written to it, it is only opened in
    # append mode so that it exists, then closed immediately.
    with open('nmpa20201.txt', 'a'):
        pass

    # Read the first and last page number.
    iBegin = int(input("please input begin id:"))
    iEnd = int(input("Please input the end id:"))
    if iBegin > iEnd:
        # Stop before a browser is launched; the message goes to stderr.
        sys.exit('输入错误!')

    conn = dbConnect.ms
    driver = webdriver.Firefox()
    try:
        driver.set_page_load_timeout(PAGE_LOAD_TIMEOUT)
        # The counter starts at 2 so the long pause happens after the 9th
        # page and then every 10 pages.
        for counter, page in enumerate(range(iBegin, iEnd + 1), start=2):
            _load_page(driver, page)
            # Random delay between requests.
            time.sleep(random.randint(5, 20))
            if _store_page_rows(driver, conn, page) == 0:
                logger.warning('打开页面出错!' + str(page))
            if counter % 10 == 0:
                time.sleep(20)
    finally:
        # Always close the browser, even when the crawl is interrupted.
        driver.quit()
    print('end!')


if __name__ == "__main__":
    main()
