一、創建新項目,新建Flight_Info.py頁面
1.寫一個主程序方法:
# Program entry point: run the scraping loop, reporting any uncaught error.
if __name__ == '__main__':
    try:
        py_info()  # batch-scraping loop
    # scraping failed somewhere py_info() did not handle
    except Exception as e:
        # BUG FIX: the original did `'爬取錯誤:' + e`, which raises
        # TypeError (cannot concatenate str and Exception); convert first.
        print('爬取錯誤:' + str(e))
2.查看爬取頁面HTML,定位要爬取信息位置

3.根據URL參數爬取航班信息:
ok_ip = []   # proxy IPs verified to still work
all_ip = []  # candidate proxy IPs read from the proxy file
ok = []      # last scraped flight-info result ([] when nothing found)


# Fetch the flight-detail page for the given query and return the parsed info.
def get_content(fnum, dep, arr, date, type):
    """Request the variflight detail page, falling back to proxy IPs when
    the daily query limit is hit, and return get_info()'s result.

    Args:
        fnum: flight number.
        dep / arr: departure / arrival airport three-letter codes.
        date: query date string, 'YYYY-MM-DD'.
        type: flight-type flag, passed straight into the query string.

    Returns:
        The tuple produced by get_info(), or [] when no flight was found
        (also returned unchanged when scraping raised).
    """
    global ok_ip
    global all_ip
    global ok

    # Build the query URL once instead of repeating the concatenation.
    url = ('http://happiness.variflight.com/info/detail?fnum=' + fnum +
           '&dep=' + dep + '&arr=' + arr + '&date=' + date + '&type=' + type)

    # First attempt straight from the local IP.
    content = requests.get(url).text
    soup = BeautifulSoup(content, 'html.parser')

    # Query limit reached (marker text absent) -> retry through proxies.
    if content.find("查無航班判斷") < 0:
        # BUG FIX: the original never closed this file handle.
        with open('代理IP(2017-12-25).txt') as ipinfo:
            all_ip = ipinfo.read().splitlines()

        if len(ok_ip) > 0:  # a previously verified proxy is available
            iptext = ok_ip[0]
            proxies = {'http': '//' + iptext, 'https': '//' + iptext}
            try:
                # proxies = proxy routing; timeout keeps dead proxies cheap
                content = requests.get(url, proxies=proxies, timeout=30).text
                soup = BeautifulSoup(content, 'html.parser')
                # The cached proxy hit the limit too -> forget it.
                if content.find("查無航班判斷") < 0:
                    ok_ip.remove(iptext)
            except Exception:
                # best-effort: proxy/network failure, keep whatever we have
                pass

        else:  # no verified proxy yet: probe the candidate list
            for iptext in all_ip:
                proxies = {'http': '//' + iptext, 'https': '//' + iptext}
                try:
                    content = requests.get(url, proxies=proxies,
                                           timeout=30).text
                    soup = BeautifulSoup(content, 'html.parser')
                    if content.find("查無航班判斷") < 0:
                        # BUG FIX: the original read ok_ip[0] here, which
                        # raises IndexError on the (always empty) ok_ip,
                        # and tried to remove an IP never added to it.
                        continue  # this proxy is limited too: try the next
                    # Working proxy found: remember it and stop probing.
                    ok_ip.append(iptext)
                    print('目前可用IP:' + iptext)
                    break
                except Exception:
                    continue  # dead proxy: try the next one

    # Page explicitly says no flight matched the query.
    if content.find("沒有找到您輸入的航班信息") > 0:
        ok = []
    # Query succeeded: parse the flight details.
    else:
        try:
            ok = get_info(fnum, soup, dep, arr)
        except Exception:
            print('爬取' + fnum + '航班失敗')
            return ok
    # Return the flight info (or [] when nothing was found).
    return ok
4.自動循環爬取
# Batch scraping loop.
def py_info():
    """Batch-scrape every flight listed in 航班列表.txt.

    Each line of the file is '<dep3><arr3>\\t<flight-number>'.  When the
    database already records a most recently saved flight (ReadPGSQL),
    scraping resumes from that flight's position in the list.
    """
    szm_cf = ''
    szm_md = ''
    hbb = ''
    try:
        # BUG FIX: the original never closed the file handle.
        with open('航班列表.txt') as hb_txt:
            all_text = hb_txt.read().splitlines()

        # Resume position: 0 when the DB has no latest flight, otherwise
        # the index of that flight.  The original duplicated the whole
        # loop body in an else-branch and re-ran all_text.index(hb) on
        # every iteration (O(n^2)); both are collapsed here.
        newhb = ReadPGSQL()  # latest flight recorded in the database
        start = all_text.index(newhb) if newhb != '' else 0

        for hb in all_text[start:]:
            szm_list = hb.split("\t", 1)[0]
            szm_cf = szm_list[0:3]              # departure three-letter code
            szm_md = szm_list[3:6]              # arrival three-letter code
            hbb = hb.split("\t", 1)[1]          # flight number
            hblx = '1'                          # flight type
            hbrq = time.strftime("%Y-%m-%d")    # query date: today
            save(hbb, szm_cf, szm_md, hbrq, hblx)  # scrape + persist
            print(hbb + '航班爬取完成!')
        print('爬取完成!')

    # A save failed: log it and record which flight broke the run.
    except Exception:
        print('保存航班出錯')
        Error(szm_cf, szm_md, hbb)  # record the failing flight
5.處理HTML
# Extract flight details from the parsed detail-page HTML.
def get_info(fnum, soup, dep, arr):
    """Pull the flight fields out of the detail page and persist them.

    Args:
        fnum: flight number.
        soup: BeautifulSoup of the detail page.
        dep / arr: departure / arrival three-letter codes.

    Returns:
        Tuple of (flight no, scrape date, dep, arr, departure time,
        arrival time, stopover, distance km, duration min, aircraft model,
        aircraft size, meal service, comfort score, update date).
    """
    # Initialise every field up front so the finally-return below can
    # never reference an unbound name, even if parsing fails half-way
    # (the original could raise NameError in that case).
    hbh = fnum
    phdate = time.strftime("%Y-%m-%d")      # scrape date
    szm_str = dep
    szm_end = arr
    str_time = ''
    end_time = ''
    jt = '無經停'
    km = ''
    km_time = ''
    jx = ' '
    jxdx = ''
    can = ''
    pf = ''
    updatetime = time.strftime("%Y-%m-%d")  # record-update date
    try:
        # div with class "fl three-lef" holds the departure time text.
        for li in soup.select('div[class="fl three-lef"]'):
            str_time = li.get_text()

        for li in soup.select('div[class="fr three-rig"]'):   # arrival time
            end_time = li.get_text()

        for li in soup.select('div[class="fl three-mid"]'):   # stopover
            jt = li.get_text()
        if jt != '無經停':
            jt = jt[4:]  # strip the 4-character label prefix

        for li in soup.select('p[class="one"]'):              # distance (km)
            km = li.get_text()
            km = km[4:]

        for li in soup.select('p[class="two"]'):              # duration (min)
            km_time = li.get_text()
            km_time = km_time[4:]

        # Aircraft model label.
        for li in soup.select('span[style="max-width:75px!important"]'):
            jx = li.get_text()

        # Aircraft size category.
        if soup.select('span[title="大型客機"]'):
            jxdx = '大型客機'
        elif soup.select('span[title="中型客機"]'):
            jxdx = '中型客機'
        elif soup.select('span[title="小型客機"]'):
            # BUG FIX: the original assigned '中型客機' in this branch
            # (copy-paste error).
            jxdx = '小型客機'

        # Meal service marker present on the page.
        if soup.select('span[class="totalCont"]'):
            can = '提供'

        for li in soup.select('span[class="score cur"]'):     # comfort score
            pf = li.get_text()

        try:
            FLPGSQL(hbh, phdate, szm_str, szm_end, str_time, end_time, jt,
                    km, km_time, jx, jxdx, can, pf, updatetime)  # persist
        except Exception:
            print('入庫出錯')
            Error(szm_str, szm_end, hbh)  # record the failing flight
    finally:
        # NOTE(review): return-in-finally swallows any exception raised
        # above; kept for compatibility with the original control flow,
        # which callers rely on to always receive the tuple.
        return (hbh, phdate, szm_str, szm_end, str_time, end_time, jt, km,
                km_time, jx, jxdx, can, pf, updatetime)
全部代碼:(單個爬取航班)
# Flight_Info.py -- interactively scrape a single flight's details from
# variflight and append the result row to Flight_Info.csv.
#
# BUG FIX: removed the original `import exception` and `import int`
# lines -- neither module exists, so the script crashed at startup.
import urllib.request
import urllib.parse
import re
import datetime
import time
import html
import csv
import socket
import sys
import os

from bs4 import BeautifulSoup
import requests
from lxml import etree
import psycopg2


# Extract flight details from the parsed detail-page HTML.
def get_info(fnum, soup, dep, arr):
    """Pull the flight fields out of the detail page.

    Returns a tuple of (flight no, scrape date, dep, arr, departure time,
    arrival time, stopover, distance km, duration min, aircraft model,
    aircraft size, meal service, comfort score, update date).
    """
    # Initialise every field up front so the finally-return below can
    # never reference an unbound name even if parsing fails half-way.
    hbh = fnum
    phdate = time.strftime("%Y-%m-%d")      # scrape date
    szm_str = dep
    szm_end = arr
    str_time = ' '
    end_time = ' '
    jt = ' '
    km = ''
    km_time = ' '
    jx = ' '
    jxdx = ' '
    can = ' '
    pf = ' '
    updatetime = time.strftime("%Y-%m-%d")  # record-update date
    try:
        for li in soup.select('div[class="fl three-lef"]'):   # departure time
            str_time = li.get_text()

        for li in soup.select('div[class="fr three-rig"]'):   # arrival time
            end_time = li.get_text()

        for li in soup.select('div[class="fl three-mid"]'):   # stopover
            jt = li.get_text()
        if jt != ' ':
            jt = jt[4:]  # strip the 4-character label prefix

        for li in soup.select('p[class="one"]'):              # distance (km)
            km = li.get_text()
            km = km[4:]

        for li in soup.select('p[class="two"]'):              # duration (min)
            km_time = li.get_text()
            km_time = km_time[4:]

        # Aircraft model label.
        for li in soup.select('span[style="max-width:75px!important"]'):
            jx = li.get_text()

        # Aircraft size category.
        if soup.select('span[title="大型客機"]'):
            jxdx = '大型客機'
        elif soup.select('span[title="中型客機"]'):
            jxdx = '中型客機'
        elif soup.select('span[title="小型客機"]'):
            # BUG FIX: the original assigned '中型客機' here (copy-paste).
            jxdx = '小型客機'

        # Meal service marker present on the page.
        if soup.select('span[class="totalCont"]'):
            can = '提供'

        for li in soup.select('span[class="score cur"]'):     # comfort score
            pf = li.get_text()
    finally:
        # NOTE(review): return-in-finally swallows any exception raised
        # above; kept for compatibility with the original control flow.
        return (hbh, phdate, szm_str, szm_end, str_time, end_time, jt, km,
                km_time, jx, jxdx, can, pf, updatetime)


ok_ip = []   # proxy IPs verified to still work
all_ip = []  # candidate proxy IPs read from the proxy file

# Page-body markers that mean the request was blocked or rate-limited and
# must be retried through a proxy.  NOTE: the original checked
# "502 Bad Gateway" only in one of its three marker chains; unified here.
_BLOCK_MARKERS = (
    "502 Bad Gateway",
    "Notifica: timeout del gateway",
    "The requested URL could not be retrieved",
    "main notFound",
    "此類查詢已達當日上限",
)


def _is_blocked(content):
    """True when the page text contains any rate-limit/error marker."""
    return any(content.find(marker) > 0 for marker in _BLOCK_MARKERS)


# Request the detail page for the given flight query.
def get_content(fnum, dep, arr, date, type):
    """Fetch the variflight detail page, falling back to proxy IPs when
    rate-limited, and return the parsed flight info ([] when none)."""
    global ok_ip
    global all_ip
    # BUG FIX: initialise `ok` -- the original could raise
    # UnboundLocalError in its except path (`return ok` before assignment).
    ok = []

    # Build the query URL once instead of repeating the concatenation.
    url = ('http://happiness.variflight.com/info/detail?fnum=' + fnum +
           '&dep=' + dep + '&arr=' + arr + '&date=' + date + '&type=' + type)

    # First attempt straight from the local IP.
    content = requests.get(url).text
    soup = BeautifulSoup(content, 'html.parser')

    # Blocked or limited -> retry through proxies.
    if _is_blocked(content):
        # BUG FIX: the original never closed this file handle.
        with open('代理IP(2017-12-25).txt') as ipinfo:
            all_ip = ipinfo.read().splitlines()

        if len(ok_ip) > 0:  # a previously verified proxy is available
            iptext = ok_ip[0]
            proxies = {'http': '//' + iptext, 'https': '//' + iptext}
            try:
                content = requests.get(url, proxies=proxies).text
                soup = BeautifulSoup(content, 'html.parser')
                # The cached proxy hit the limit too -> forget it.
                if _is_blocked(content):
                    ok_ip.remove(iptext)
            except Exception:
                # best-effort: proxy/network failure, keep whatever we have
                pass

        else:  # no verified proxy yet: probe the candidate list
            for iptext in all_ip:
                proxies = {'http': '//' + iptext, 'https': '//' + iptext}
                try:
                    content = requests.get(url, proxies=proxies).text
                    soup = BeautifulSoup(content, 'html.parser')
                    if _is_blocked(content):
                        # BUG FIX: the original called ok_ip.remove(iptext)
                        # here, but iptext was never added to ok_ip, so it
                        # always raised ValueError (swallowed by except).
                        continue  # this proxy is limited too: try the next
                    # Working proxy found: remember it and stop probing.
                    ok_ip.append(iptext)
                    print('目前可用IP:' + iptext)
                    break
                except Exception:
                    continue  # dead proxy: try the next one

    # Page explicitly says no flight matched the query.
    if content.find("沒有找到您輸入的航班信息") > 0:
        ok = []
    # Query succeeded: parse the flight details.
    else:
        try:
            ok = get_info(fnum, soup, dep, arr)
        except Exception:
            return ok
    # Return the flight info (or [] when nothing was found).
    return ok


# Append one scraped flight to the CSV file.
def save(fnum, dep, arr, date, type):
    """Scrape one flight and append the result row to Flight_Info.csv."""
    try:
        content = get_content(fnum, dep, arr, date, type)
        # newline='' is the csv-module convention for output files; the
        # original also called f.close() inside the with-block (redundant).
        with open('Flight_Info.csv', 'a', newline='') as f:
            csv.writer(f).writerows([content])
    except Exception:
        pass  # best-effort: one failed flight must not stop the program


hbb = ''     # flight number entered by the user
szm_cf = ''  # departure three-letter code entered by the user
szm_md = ''  # arrival three-letter code entered by the user


# Interactive single-flight scrape.
def py_info():
    """Prompt for one flight's details, scrape it, and save to CSV."""
    global hbb
    global szm_cf
    global szm_md
    try:
        print('請輸入航班號:')
        hbb = input()        # flight number
        print('請輸入出發地三字碼:')
        szm_cf = input()     # departure three-letter code
        print('請輸入目的地三字碼:')
        szm_md = input()     # arrival three-letter code
        hblx = '1'                         # flight type defaults to 1
        hbrq = time.strftime("%Y-%m-%d")   # date defaults to today
        save(hbb, szm_cf, szm_md, hbrq, hblx)  # scrape + write CSV row
        print(hbb + '航班爬取完成!')

    # Scrape failed: report which flight and carry on.
    except Exception:
        # BUG FIX: the original printed `hbh`, an undefined name
        # (NameError inside the handler); it meant `hbb`.
        print(hbb + '航班爬取出錯' + szm_cf + szm_md)


# Program entry point.
if __name__ == '__main__':
    py_info()
