I need CNNVD vulnerability data for work. I used to rely on tools like GooSeeker (集搜客) and Octoparse (八爪魚), but I was unhappy with both their results and their speed, so I recently started learning to write crawlers myself. As a beginner I still have plenty to polish; this post records my first crawler. I also want to explore multiprocessing and IP proxies as the next steps.
This is what a run currently looks like, and the speed is still hard to bear. Multiprocessing should make the data fetching much faster, and an IP proxy should sidestep the blocking that kicks in after many requests from one address in a short window, which in turn would let me raise the crawl rate. A rough sketch of both ideas follows the full listing below.
[Figure: sample of the generated Excel output]
The full code follows:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# by 默不知然

import urllib.request
from urllib import parse
from bs4 import BeautifulSoup
import http.cookiejar
import zlib
import re
import time
import xlsxwriter
import sys
import datetime
import pymysql

'''
Usage:
    python vulnerabilities_crawler.py 2017-10-01 2017-10-31 178
The first argument is the start date, the second the end date,
and the third the total number of result pages.
'''

# Build the list of vulnerability detail links from one result page.
def vulnerabilities_url_list(url, start_time, end_time):
    header = {
        'User-Agent': 'Mozilla/5.0 (Linux; Android 4.1.2; Nexus 7 Build/JZ054K) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.166 Safari/535.19',
        'Accept-Encoding': 'gzip, deflate',
        'Referer': 'http://cnnvd.org.cn/web/vulnerability/queryLds.tag'
    }
    data = {
        'qstartdate': start_time,  # query start date
        'qenddate': end_time       # query end date
    }
    data = parse.urlencode(data).encode('utf-8')
    vulnerabilities_url_html = urllib.request.Request(url, headers=header, data=data)
    vulnerabilities_url_cookie = http.cookiejar.CookieJar()
    vulnerabilities_url_opener = urllib.request.build_opener(
        urllib.request.HTTPCookieProcessor(vulnerabilities_url_cookie))
    vulnerabilities_url_html = vulnerabilities_url_opener.open(vulnerabilities_url_html)
    # The response is gzip-compressed; 16 + zlib.MAX_WBITS makes zlib accept the gzip header.
    vulnerabilities_url_html = zlib.decompress(vulnerabilities_url_html.read(), 16 + zlib.MAX_WBITS)
    vulnerabilities_url_html = vulnerabilities_url_html.decode()

    # Extract the detail-page links.
    response = r'href="(.+?)" target="_blank" class="a_title2"'
    vulnerabilities_link_list = re.compile(response).findall(vulnerabilities_url_html)

    # Prepend the host to each relative link.
    i = 0
    for link in vulnerabilities_link_list:
        vulnerabilities_lists.append('http://cnnvd.org.cn' + link)
        i += 1
        print('Crawled vulnerability link %d' % i)
        time.sleep(0.2)


# Crawl the detail page of a single vulnerability.
def vulnerabilities_data(url):
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0',
        'Accept-Encoding': 'gzip, deflate, sdch',
    }
    vulnerabilities_data_html = urllib.request.Request(url, headers=header)
    vulnerabilities_data_cookie = http.cookiejar.CookieJar()
    vulnerabilities_data_opener = urllib.request.build_opener(
        urllib.request.HTTPCookieProcessor(vulnerabilities_data_cookie))
    vulnerabilities_data_html = vulnerabilities_data_opener.open(vulnerabilities_data_html)
    vulnerabilities_data_html = zlib.decompress(vulnerabilities_data_html.read(), 16 + zlib.MAX_WBITS)
    vulnerabilities_data_html = vulnerabilities_data_html.decode()

    global vulnerabilities_result_list
    vulnerabilities_result_list = []  # fields collected for this vulnerability

    # Vulnerability details block.
    vulnerabilities_detainled_soup1 = BeautifulSoup(vulnerabilities_data_html, 'html.parser')
    vulnerabilities_detainled_data = vulnerabilities_detainled_soup1.find(
        'div', attrs={'class': 'detail_xq w770'})  # soup for the details block
    vulnerabilities_detainled_data = vulnerabilities_detainled_data.decode()
    vulnerabilities_detainled_soup = BeautifulSoup(vulnerabilities_detainled_data, 'html.parser')  # second pass
    vulnerabilities_detainled_data_list = vulnerabilities_detainled_soup.find_all('li')  # the <li> entries holding the fields

    try:
        vulnerabilities_name = vulnerabilities_detainled_soup.h2.string  # vulnerability name
    except:
        vulnerabilities_name = ''
    vulnerabilities_result_list.append(vulnerabilities_name)

    try:
        vulnerabilities_cnnvd_num = vulnerabilities_detainled_soup.span.string  # CNNVD ID
        vulnerabilities_cnnvd_num = re.findall(r"\:([\s\S]*)", vulnerabilities_cnnvd_num)[0]
    except:
        vulnerabilities_cnnvd_num = ''
    vulnerabilities_result_list.append(vulnerabilities_cnnvd_num)

    try:  # severity
        vulnerabilities_rank = vulnerabilities_detainled_soup.a.decode()
        vulnerabilities_rank = re.search(u'([\u4e00-\u9fa5]+)', vulnerabilities_rank).group(0)
    except:
        vulnerabilities_rank = ''
    vulnerabilities_result_list.append(vulnerabilities_rank)

    vulnerabilities_cve_html = vulnerabilities_detainled_data_list[2].decode()  # CVE ID
    vulnerabilities_cve_soup = BeautifulSoup(vulnerabilities_cve_html, 'html.parser')
    try:
        vulnerabilities_cve = vulnerabilities_cve_soup.a.string
        vulnerabilities_cve = vulnerabilities_cve.replace("\r", "").replace("\t", "").replace("\n", "").replace(" ", "")
    except:
        vulnerabilities_cve = ''
    vulnerabilities_result_list.append(vulnerabilities_cve)

    vulnerabilities_type_html = vulnerabilities_detainled_data_list[3].decode()  # vulnerability type
    vulnerabilities_type_soup = BeautifulSoup(vulnerabilities_type_html, 'html.parser')
    try:
        vulnerabilities_type = vulnerabilities_type_soup.a.string
        vulnerabilities_type = vulnerabilities_type.replace("\r", "").replace("\t", "").replace("\n", "").replace(" ", "")
    except:
        vulnerabilities_type = ''
    vulnerabilities_result_list.append(vulnerabilities_type)

    vulnerabilities_time_html = vulnerabilities_detainled_data_list[4].decode()  # release date
    vulnerabilities_time_soup = BeautifulSoup(vulnerabilities_time_html, 'html.parser')
    try:
        vulnerabilities_time = vulnerabilities_time_soup.a.string
        vulnerabilities_time = vulnerabilities_time.replace("\r", "").replace("\t", "").replace("\n", "")
    except:
        vulnerabilities_time = ''
    vulnerabilities_result_list.append(vulnerabilities_time)

    vulnerabilities_attack_html = vulnerabilities_detainled_data_list[5].decode()  # threat type
    vulnerabilities_attack_soup = BeautifulSoup(vulnerabilities_attack_html, 'html.parser')
    try:
        vulnerabilities_attack = vulnerabilities_attack_soup.a.string
        vulnerabilities_attack = vulnerabilities_attack.replace("\r", "").replace("\t", "").replace("\n", "")
    except:
        vulnerabilities_attack = ''
    vulnerabilities_result_list.append(vulnerabilities_attack)

    vulnerabilities_update_html = vulnerabilities_detainled_data_list[6].decode()  # update date
    vulnerabilities_update_soup = BeautifulSoup(vulnerabilities_update_html, 'html.parser')
    try:
        vulnerabilities_update = vulnerabilities_update_soup.a.string
        vulnerabilities_update = vulnerabilities_update.replace("\r", "").replace("\t", "").replace("\n", "")
    except:
        vulnerabilities_update = ''
    vulnerabilities_result_list.append(vulnerabilities_update)

    vulnerabilities_firm_html = vulnerabilities_detainled_data_list[7].decode()  # vendor
    vulnerabilities_firm_soup = BeautifulSoup(vulnerabilities_firm_html, 'html.parser')
    try:
        vulnerabilities_firm = vulnerabilities_firm_soup.a.string
        vulnerabilities_firm = vulnerabilities_firm.replace("\r", "").replace("\t", "").replace("\n", "")
    except:
        vulnerabilities_firm = ''
    vulnerabilities_result_list.append(vulnerabilities_firm)

    vulnerabilities_source_html = vulnerabilities_detainled_data_list[8].decode()  # source
    vulnerabilities_source_soup = BeautifulSoup(vulnerabilities_source_html, 'html.parser')
    try:
        vulnerabilities_source = vulnerabilities_source_soup.a.string
        vulnerabilities_source = vulnerabilities_source.replace("\r", "").replace("\t", "").replace("\n", "")
    except:
        vulnerabilities_source = ''
    vulnerabilities_result_list.append(vulnerabilities_source)

    # Description block.
    vulnerabilities_title_html = vulnerabilities_detainled_soup1.find(
        'div', attrs={'class': 'd_ldjj'})  # soup for the description block
    vulnerabilities_title_html = vulnerabilities_title_html.decode()
    vulnerabilities_title_soup2 = BeautifulSoup(vulnerabilities_title_html, 'html.parser')
    try:
        vulnerabilities_titles1 = vulnerabilities_title_soup2.find_all(name='p')[0].string
        vulnerabilities_titles2 = vulnerabilities_title_soup2.find_all(name='p')[1].string
        vulnerabilities_titles = vulnerabilities_titles1 + vulnerabilities_titles2
        vulnerabilities_titles = vulnerabilities_titles.replace(' ', '').replace('\t', '').replace('\r', '').replace('\n', '')
    except:
        vulnerabilities_titles = ''
    vulnerabilities_result_list.append(vulnerabilities_titles)

    # Announcement (solution) block.
    vulnerabilities_notice_html = vulnerabilities_detainled_soup1.find(
        'div', attrs={'class': 'd_ldjj m_t_20'})  # soup for the announcement block
    vulnerabilities_notice_html = vulnerabilities_notice_html.decode()
    vulnerabilities_notice_soup2 = BeautifulSoup(vulnerabilities_notice_html, 'html.parser')
    try:
        vulnerabilities_notice1 = vulnerabilities_notice_soup2.find_all(name='p')[0].string
        vulnerabilities_notice2 = vulnerabilities_notice_soup2.find_all(name='p')[1].string
        vulnerabilities_notice = vulnerabilities_notice1 + vulnerabilities_notice2
        vulnerabilities_notice = vulnerabilities_notice.replace('\n', '').replace('\r', '').replace('\t', '')
    except:
        vulnerabilities_notice = ''
    vulnerabilities_result_list.append(vulnerabilities_notice)

    # Reference links block.
    vulnerabilities_reference_html = vulnerabilities_detainled_soup1.find_all(
        'div', attrs={'class': 'd_ldjj m_t_20'})[1]  # soup for the references block
    vulnerabilities_reference_html = vulnerabilities_reference_html.decode()
    vulnerabilities_reference_soup2 = BeautifulSoup(vulnerabilities_reference_html, 'html.parser')
    try:
        vulnerabilities_reference = vulnerabilities_reference_soup2.find_all(name='p')[1].string
        # Also strip the page's "鏈接:" label in front of the URL.
        vulnerabilities_reference = vulnerabilities_reference.replace('\n', '').replace('\r', '').replace('\t', '').replace('鏈接:', '')
    except:
        vulnerabilities_reference = ''
    vulnerabilities_result_list.append(vulnerabilities_reference)

    # Affected entities block.
    vulnerabilities_effect_html = vulnerabilities_detainled_soup1.find_all(
        'div', attrs={'class': 'd_ldjj m_t_20'})[2]  # soup for the affected-entities block
    vulnerabilities_effect_html = vulnerabilities_effect_html.decode()
    vulnerabilities_effect_soup2 = BeautifulSoup(vulnerabilities_effect_html, 'html.parser')
    try:
        vulnerabilities_effect = vulnerabilities_effect_soup2.find_all(name='p')[0].string
        vulnerabilities_effect = vulnerabilities_effect.replace('\n', '').replace('\r', '').replace('\t', '').replace(' ', '')
    except:
        try:
            vulnerabilities_effect = vulnerabilities_effect_soup2.find_all(name='a')[0].string
            vulnerabilities_effect = vulnerabilities_effect.replace('\n', '').replace('\r', '').replace('\t', '').replace(' ', '')
        except:
            vulnerabilities_effect = ''
    vulnerabilities_result_list.append(vulnerabilities_effect)

    # Patch block.
    vulnerabilities_patch_html = vulnerabilities_detainled_soup1.find_all(
        'div', attrs={'class': 'd_ldjj m_t_20'})[3]  # soup for the patch block
    vulnerabilities_patch_html = vulnerabilities_patch_html.decode()
    vulnerabilities_patch_soup2 = BeautifulSoup(vulnerabilities_patch_html, 'html.parser')
    try:
        vulnerabilities_patch = vulnerabilities_patch_soup2.find_all(name='p')[0].string
        vulnerabilities_patch = vulnerabilities_patch.replace('\n', '').replace('\r', '').replace('\t', '').replace(' ', '')
    except:
        vulnerabilities_patch = ''
    vulnerabilities_result_list.append(vulnerabilities_patch)


# Write the vulnerability data to an Excel file.
def vulnerabilities_excel(excel):
    workbook = xlsxwriter.Workbook('vulnerabilities_data.xlsx')
    worksheet = workbook.add_worksheet()
    headers = ['Vulnerability Name', 'CNNVD ID', 'Severity', 'CVE ID',
               'Vulnerability Type', 'Release Date', 'Attack Vector', 'Update Date',
               'Vendor', 'Source', 'Description', 'Solution',
               'Reference Links', 'Affected Entities', 'Patch']
    for col, title in enumerate(headers):
        worksheet.write(0, col, title)
    # One row per vulnerability, one column per field.
    for row, record in enumerate(excel, start=1):
        for col, value in enumerate(record):
            worksheet.write(row, col, value)
    workbook.close()


# Write the vulnerability data to MySQL.
def vulnerabilities_mysql(excel):
    db = pymysql.connect(host='127.0.0.1', user='root', password='xxxx',
                         database='spider', charset='utf8')
    cursor = db.cursor()
    sql = ("INSERT INTO cnnvd(vulnerabilities_name, cnnvd_num, vulnerabilities_rank, cve_num, "
           "vulnerabilities_type, release_time, attack_path, update_time, company, "
           "vulnerabilities_source, vulnerabilities_data, solution, reference, object, path) "
           "VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s);")
    for record in excel:
        try:
            cursor.execute(sql, tuple(record))
        except:
            print('Failed to write a record to the database')
    print('Finished writing to the database!')
    db.commit()
    db.close()


def main():
    # Build the list of vulnerability detail links.
    begin = datetime.datetime.now()
    global vulnerabilities_lists
    vulnerabilities_lists = []
    j = 1
    start_time = sys.argv[1]
    end_time = sys.argv[2]
    page_count = int(sys.argv[3])
    while j <= page_count:
        try:
            vulnerabilities_url = 'http://cnnvd.org.cn/web/vulnerability/queryLds.tag?pageno=%d&repairLd=' % j
            vulnerabilities_url_list(vulnerabilities_url, start_time, end_time)
            print('Finished crawling result page %d' % j)
            print('\n')
            time.sleep(2)
            j += 1
        except:
            print('Fetch failed; retrying in 5 seconds.')
            time.sleep(5)

    # Crawl the details of every vulnerability.
    vulnerabilities_result_lists = []
    a = 0
    while a < len(vulnerabilities_lists):
        try:
            vulnerabilities_data(vulnerabilities_lists[a])
            vulnerabilities_result_lists.append(vulnerabilities_result_list)
            a += 1
            print('Finished crawling vulnerability %d' % a)
            time.sleep(1)
        except:
            print('Fetch failed; retrying in 5 seconds.')
            time.sleep(5)

    # Write the results to Excel.
    vulnerabilities_excel(vulnerabilities_result_lists)

    # Write the results to MySQL.
    #vulnerabilities_mysql(vulnerabilities_result_lists)

    # Done crawling.
    end = datetime.datetime.now()
    total_time = end - begin
    print('Finished crawling vulnerability data')
    print('Expected number of vulnerabilities: ', len(vulnerabilities_lists))
    print('Elapsed time: ', total_time)


if __name__ == '__main__':
    main()
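As for the two improvement directions mentioned at the top, here is a minimal sketch of what they could look like on top of this code, not a finished implementation. It assumes vulnerabilities_data() is refactored to return its row list instead of filling the global vulnerabilities_result_list (a shared global is unsafe under concurrency); build_proxy_opener, crawl_all, proxy_addr, and workers are hypothetical names, and the proxy address must be supplied. Since the crawl is I/O-bound, this uses a thread pool from multiprocessing.dummy, which has the same API as multiprocessing.Pool, so real processes would be a drop-in swap.

import urllib.request
import http.cookiejar
from multiprocessing.dummy import Pool  # thread pool with the multiprocessing API

def build_proxy_opener(proxy_addr):
    # Hypothetical helper: the same cookie-aware opener as above, but routed
    # through an HTTP proxy so the site's per-IP rate limit hits the proxy's
    # address instead of ours. proxy_addr is a placeholder,
    # e.g. 'http://host:port' of whatever proxy is available.
    cookie = http.cookiejar.CookieJar()
    return urllib.request.build_opener(
        urllib.request.HTTPCookieProcessor(cookie),
        urllib.request.ProxyHandler({'http': proxy_addr}))

def crawl_all(vulnerabilities_lists, workers=8):
    # Hypothetical helper: fetch up to `workers` detail pages concurrently,
    # assuming vulnerabilities_data() now returns its row list.
    with Pool(workers) as pool:
        return pool.map(vulnerabilities_data, vulnerabilities_lists)

With something like this in place, the second while loop in main() would collapse into a single crawl_all(vulnerabilities_lists) call, and the per-request sleeps could be shortened or dropped since the rate limit no longer binds a single address.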