'''
Scraper for administrative division codes from the official website of the
Ministry of Civil Affairs of the People's Republic of China (www.mca.gov.cn).

Key points:
1> When entering the second-level page (the data page), the URL is redirected
   by JavaScript, so the real URL has to be extracted from the second-level
   page source.
2> Incremental database updates: store the URL; on the next crawl, compare
   URLs first — if they match, skip the update, otherwise re-crawl.
'''
import re

import requests
from lxml import etree
import pymysql


class GovernmentSpider:
    def __init__(self):
        self.one_url = 'http://www.mca.gov.cn/article/sj/xzqh/2019/'
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
                          '(KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36'
        }
        self.db = pymysql.connect(host='localhost', user='root',
                                  password='123456', database='govermentdb',
                                  charset='utf8')
        self.cursor = self.db.cursor()

    # Extract the second-level page link (the fake, pre-redirect link);
    # it must be the most recent one on the list page.
    def get_false_link(self):
        html = requests.get(url=self.one_url,
                            headers=self.headers).content.decode('utf-8', 'ignore')
        parse_html = etree.HTML(html)
        a_list = parse_html.xpath('//a[@class="artitlelist"]')
        for a in a_list:
            title = a.xpath('./@title')[0].strip()
            if re.findall(r'.*以上行政區划代碼', title, re.S):
                two_false_link = 'http://www.mca.gov.cn' + a.get('href')
                return two_false_link

    # Extract the real second-level page link, then fetch its data.
    def get_true_link(self):
        # Fetch the fake page; the JS redirect target is embedded in its source.
        false_link = self.get_false_link()
        html = requests.get(url=false_link,
                            headers=self.headers).content.decode('utf-8', 'ignore')
        pattern = re.compile(r'window\.location\.href="(.*?)"', re.S)
        real_link = pattern.findall(html)[0]
        print(real_link)

        # Incremental crawl: check whether real_link is already in the version
        # table. If it is, the data is already current; otherwise crawl it.
        sel = 'select * from version where link=%s'
        self.cursor.execute(sel, [real_link])
        # A non-empty result means the link already exists — nothing to crawl.
        if self.cursor.fetchall():
            print('Data is already up to date')
        else:
            # Crawl the data first ...
            self.get_data(real_link)
            # ... then record real_link in the version table.
            ins = 'insert into version values(%s)'
            self.cursor.execute(ins, [real_link])
            self.db.commit()

    # The function that actually extracts the data.
    def get_data(self, real_link):
        html = requests.get(url=real_link, headers=self.headers).text
        parse_html = etree.HTML(html)
        tr_list = parse_html.xpath('//tr[@height="19"]')
        for tr in tr_list:
            code = tr.xpath('./td[2]/text()')[0]
            name = tr.xpath('./td[3]/text()')[0]
            print(name, code)

    # Entry point.
    def main(self):
        self.get_true_link()
        self.cursor.close()
        self.db.close()


if __name__ == '__main__':
    spider = GovernmentSpider()
    spider.main()
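'''
A minimal setup sketch for the database the script above expects. The database
name (govermentdb), the table name (version), and its single link column are
inferred from the connect/SELECT/INSERT statements in the code; the column
type below is an assumption for illustration, not an official schema.
'''
import pymysql

setup_db = pymysql.connect(host='localhost', user='root', password='123456',
                           charset='utf8')
setup_cursor = setup_db.cursor()
setup_cursor.execute('create database if not exists govermentdb character set utf8')
setup_cursor.execute('use govermentdb')
# One row per crawled real link; get_true_link() checks this table to decide
# whether a fresh crawl is needed.
setup_cursor.execute('create table if not exists version (link varchar(255) not null)')
setup_db.commit()
setup_cursor.close()
setup_db.close()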
'''
Selenium + Chrome version: letting the browser execute the JavaScript avoids
having to recover the redirected second-level URL by hand, so the crawl is
simpler.
'''
import time

import pymysql
from selenium import webdriver
from selenium.webdriver.common.by import By


class GovernmentSpider:
    def __init__(self):
        self.browser = webdriver.Chrome()
        self.one_url = 'http://www.mca.gov.cn/article/sj/xzqh/2019/'
        self.db = pymysql.connect(host='localhost', user='root',
                                  password='123456', database='govdb',
                                  charset='utf8')
        self.cursor = self.db.cursor()
        # Three buffers so all rows can be inserted at once with executemany().
        self.province_list = []
        self.city_list = []
        self.county_list = []

    # Open the index page and extract the second-level page link (the fake,
    # pre-redirect link is enough — the browser follows the JS redirect).
    def get_false_url(self):
        self.browser.get(self.one_url)
        td_list = self.browser.find_elements(
            By.XPATH, '//td[@class="arlisttd"]/a[contains(@title,"代碼")]')
        if td_list:
            # Keep the element object itself, because it has to be click()ed.
            two_url_element = td_list[0]
            # Incremental crawl: compare the link against the version table.
            two_url = two_url_element.get_attribute('href')
            sel = 'select * from version where link=%s'
            self.cursor.execute(sel, [two_url])
            result = self.cursor.fetchall()
            if len(result) != 0:
                print('Data is already up to date, no crawl needed')
            else:
                # Click through to the data page.
                two_url_element.click()
                time.sleep(3)
                # Switch the driver to the newly opened window.
                all_handles = self.browser.window_handles
                self.browser.switch_to.window(all_handles[1])
                # Scrape the data.
                self.get_data()
                # When done, record two_url in the version table.
                ins = 'insert into version values(%s)'
                self.cursor.execute(ins, [two_url])
                self.db.commit()

    # Extract the administrative division codes from the second-level page.
    def get_data(self):
        tr_list = self.browser.find_elements(By.XPATH, '//tr[@height="19"]')
        for tr in tr_list:
            code = tr.find_element(By.XPATH, './td[2]').text.strip()
            name = tr.find_element(By.XPATH, './td[3]').text.strip()
            print(name, code)
            # Decide the administrative level and buffer the row for the
            # matching table (fields must match the table definition).
            if code[-4:] == '0000':
                self.province_list.append([name, code])
                # The four municipalities count as cities as well.
                if name in ['北京市', '天津市', '上海市', '重慶市']:
                    self.city_list.append([name, code, code[:2] + '0000'])
            elif code[-2:] == '00':
                self.city_list.append([name, code, code[:2] + '0000'])
            else:
                self.county_list.append([name, code, code[:4] + '00'])
        # After all rows are collected, insert them in one batch.
        self.insert_mysql()

    def insert_mysql(self):
        # The old records must be deleted before an update.
        self.cursor.execute('delete from province')
        self.cursor.execute('delete from city')
        self.cursor.execute('delete from county')
        # Insert the fresh data.
        ins_province = 'insert into province values(%s,%s)'
        ins_city = 'insert into city values(%s,%s,%s)'
        ins_county = 'insert into county values(%s,%s,%s)'
        self.cursor.executemany(ins_province, self.province_list)
        self.cursor.executemany(ins_city, self.city_list)
        self.cursor.executemany(ins_county, self.county_list)
        self.db.commit()
        print('Crawl finished, data saved to the database')

    def main(self):
        self.get_false_url()
        # Release the connections.
        self.cursor.close()
        self.db.close()
        self.browser.quit()


if __name__ == '__main__':
    spider = GovernmentSpider()
    spider.main()
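'''
A companion setup sketch for the selenium version above. The database name
(govdb), the table names (version, province, city, county), and the column
counts follow the connect/INSERT statements in the code; the column names and
types are assumptions chosen for illustration.
'''
import pymysql

setup_db = pymysql.connect(host='localhost', user='root', password='123456',
                           charset='utf8')
setup_cursor = setup_db.cursor()
setup_cursor.execute('create database if not exists govdb character set utf8')
setup_cursor.execute('use govdb')
# Incremental-crawl bookkeeping: the last crawled second-level URL.
setup_cursor.execute('create table if not exists version (link varchar(255) not null)')
# province rows are [name, code].
setup_cursor.execute('create table if not exists province (p_name varchar(30), p_code varchar(20))')
# city rows are [name, code, parent province code].
setup_cursor.execute('create table if not exists city (c_name varchar(30), c_code varchar(20), c_father_code varchar(20))')
# county rows are [name, code, parent city code].
setup_cursor.execute('create table if not exists county (t_name varchar(30), t_code varchar(20), t_father_code varchar(20))')
setup_db.commit()
setup_cursor.close()
setup_db.close()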