一、創建新項目,新建Flight_Info.py頁面
1.寫一個主程序方法:
# Program entry point: run the scraping loop, reporting any uncaught error.
if __name__ == '__main__':
    try:
        py_info()  # batch-scraping loop
    # scraping failed somewhere py_info() did not handle
    except Exception as e:
        # BUG FIX: the original did `'爬取錯誤:' + e`, which raises
        # TypeError (cannot concatenate str and Exception); convert first.
        print('爬取錯誤:' + str(e))
2.查看爬取頁面HTML,定位要爬取信息位置

3.根據URL參數爬取航班信息:
ok_ip = []   # proxy IPs verified to still work
all_ip = []  # candidate proxy IPs read from the proxy file
ok = []      # last scraped flight-info result ([] when nothing found)


# Fetch the flight-detail page for the given query and return the parsed info.
def get_content(fnum, dep, arr, date, type):
    """Request the variflight detail page, falling back to proxy IPs when
    the daily query limit is hit, and return get_info()'s result.

    Args:
        fnum: flight number.
        dep / arr: departure / arrival airport three-letter codes.
        date: query date string, 'YYYY-MM-DD'.
        type: flight-type flag, passed straight into the query string.

    Returns:
        The tuple produced by get_info(), or [] when no flight was found
        (also returned unchanged when scraping raised).
    """
    global ok_ip
    global all_ip
    global ok

    # Build the query URL once instead of repeating the concatenation.
    url = ('http://happiness.variflight.com/info/detail?fnum=' + fnum +
           '&dep=' + dep + '&arr=' + arr + '&date=' + date + '&type=' + type)

    # First attempt straight from the local IP.
    content = requests.get(url).text
    soup = BeautifulSoup(content, 'html.parser')

    # Query limit reached (marker text absent) -> retry through proxies.
    if content.find("查無航班判斷") < 0:
        # BUG FIX: the original never closed this file handle.
        with open('代理IP(2017-12-25).txt') as ipinfo:
            all_ip = ipinfo.read().splitlines()

        if len(ok_ip) > 0:  # a previously verified proxy is available
            iptext = ok_ip[0]
            proxies = {'http': '//' + iptext, 'https': '//' + iptext}
            try:
                # proxies = proxy routing; timeout keeps dead proxies cheap
                content = requests.get(url, proxies=proxies, timeout=30).text
                soup = BeautifulSoup(content, 'html.parser')
                # The cached proxy hit the limit too -> forget it.
                if content.find("查無航班判斷") < 0:
                    ok_ip.remove(iptext)
            except Exception:
                # best-effort: proxy/network failure, keep whatever we have
                pass

        else:  # no verified proxy yet: probe the candidate list
            for iptext in all_ip:
                proxies = {'http': '//' + iptext, 'https': '//' + iptext}
                try:
                    content = requests.get(url, proxies=proxies,
                                           timeout=30).text
                    soup = BeautifulSoup(content, 'html.parser')
                    if content.find("查無航班判斷") < 0:
                        # BUG FIX: the original read ok_ip[0] here, which
                        # raises IndexError on the (always empty) ok_ip,
                        # and tried to remove an IP never added to it.
                        continue  # this proxy is limited too: try the next
                    # Working proxy found: remember it and stop probing.
                    ok_ip.append(iptext)
                    print('目前可用IP:' + iptext)
                    break
                except Exception:
                    continue  # dead proxy: try the next one

    # Page explicitly says no flight matched the query.
    if content.find("沒有找到您輸入的航班信息") > 0:
        ok = []
    # Query succeeded: parse the flight details.
    else:
        try:
            ok = get_info(fnum, soup, dep, arr)
        except Exception:
            print('爬取' + fnum + '航班失敗')
            return ok
    # Return the flight info (or [] when nothing was found).
    return ok
4.自動循環爬取
# Batch scraping loop.
def py_info():
    """Batch-scrape every flight listed in 航班列表.txt.

    Each line of the file is '<dep3><arr3>\\t<flight-number>'.  When the
    database already records a most recently saved flight (ReadPGSQL),
    scraping resumes from that flight's position in the list.
    """
    szm_cf = ''
    szm_md = ''
    hbb = ''
    try:
        # BUG FIX: the original never closed the file handle.
        with open('航班列表.txt') as hb_txt:
            all_text = hb_txt.read().splitlines()

        # Resume position: 0 when the DB has no latest flight, otherwise
        # the index of that flight.  The original duplicated the whole
        # loop body in an else-branch and re-ran all_text.index(hb) on
        # every iteration (O(n^2)); both are collapsed here.
        newhb = ReadPGSQL()  # latest flight recorded in the database
        start = all_text.index(newhb) if newhb != '' else 0

        for hb in all_text[start:]:
            szm_list = hb.split("\t", 1)[0]
            szm_cf = szm_list[0:3]              # departure three-letter code
            szm_md = szm_list[3:6]              # arrival three-letter code
            hbb = hb.split("\t", 1)[1]          # flight number
            hblx = '1'                          # flight type
            hbrq = time.strftime("%Y-%m-%d")    # query date: today
            save(hbb, szm_cf, szm_md, hbrq, hblx)  # scrape + persist
            print(hbb + '航班爬取完成!')
        print('爬取完成!')

    # A save failed: log it and record which flight broke the run.
    except Exception:
        print('保存航班出錯')
        Error(szm_cf, szm_md, hbb)  # record the failing flight
5.處理HTML
# Extract flight details from the parsed detail-page HTML.
def get_info(fnum, soup, dep, arr):
    """Pull the flight fields out of the detail page and persist them.

    Args:
        fnum: flight number.
        soup: BeautifulSoup of the detail page.
        dep / arr: departure / arrival three-letter codes.

    Returns:
        Tuple of (flight no, scrape date, dep, arr, departure time,
        arrival time, stopover, distance km, duration min, aircraft model,
        aircraft size, meal service, comfort score, update date).
    """
    # Initialise every field up front so the finally-return below can
    # never reference an unbound name, even if parsing fails half-way
    # (the original could raise NameError in that case).
    hbh = fnum
    phdate = time.strftime("%Y-%m-%d")      # scrape date
    szm_str = dep
    szm_end = arr
    str_time = ''
    end_time = ''
    jt = '無經停'
    km = ''
    km_time = ''
    jx = ' '
    jxdx = ''
    can = ''
    pf = ''
    updatetime = time.strftime("%Y-%m-%d")  # record-update date
    try:
        # div with class "fl three-lef" holds the departure time text.
        for li in soup.select('div[class="fl three-lef"]'):
            str_time = li.get_text()

        for li in soup.select('div[class="fr three-rig"]'):   # arrival time
            end_time = li.get_text()

        for li in soup.select('div[class="fl three-mid"]'):   # stopover
            jt = li.get_text()
        if jt != '無經停':
            jt = jt[4:]  # strip the 4-character label prefix

        for li in soup.select('p[class="one"]'):              # distance (km)
            km = li.get_text()
            km = km[4:]

        for li in soup.select('p[class="two"]'):              # duration (min)
            km_time = li.get_text()
            km_time = km_time[4:]

        # Aircraft model label.
        for li in soup.select('span[style="max-width:75px!important"]'):
            jx = li.get_text()

        # Aircraft size category.
        if soup.select('span[title="大型客機"]'):
            jxdx = '大型客機'
        elif soup.select('span[title="中型客機"]'):
            jxdx = '中型客機'
        elif soup.select('span[title="小型客機"]'):
            # BUG FIX: the original assigned '中型客機' in this branch
            # (copy-paste error).
            jxdx = '小型客機'

        # Meal service marker present on the page.
        if soup.select('span[class="totalCont"]'):
            can = '提供'

        for li in soup.select('span[class="score cur"]'):     # comfort score
            pf = li.get_text()

        try:
            FLPGSQL(hbh, phdate, szm_str, szm_end, str_time, end_time, jt,
                    km, km_time, jx, jxdx, can, pf, updatetime)  # persist
        except Exception:
            print('入庫出錯')
            Error(szm_str, szm_end, hbh)  # record the failing flight
    finally:
        # NOTE(review): return-in-finally swallows any exception raised
        # above; kept for compatibility with the original control flow,
        # which callers rely on to always receive the tuple.
        return (hbh, phdate, szm_str, szm_end, str_time, end_time, jt, km,
                km_time, jx, jxdx, can, pf, updatetime)
全部代碼:(單個爬取航班)
# Flight_Info.py -- interactively scrape a single flight's details from
# variflight and append the result row to Flight_Info.csv.
#
# BUG FIX: removed the original `import exception` and `import int`
# lines -- neither module exists, so the script crashed at startup.
import urllib.request
import urllib.parse
import re
import datetime
import time
import html
import csv
import socket
import sys
import os

from bs4 import BeautifulSoup
import requests
from lxml import etree
import psycopg2


# Extract flight details from the parsed detail-page HTML.
def get_info(fnum, soup, dep, arr):
    """Pull the flight fields out of the detail page.

    Returns a tuple of (flight no, scrape date, dep, arr, departure time,
    arrival time, stopover, distance km, duration min, aircraft model,
    aircraft size, meal service, comfort score, update date).
    """
    # Initialise every field up front so the finally-return below can
    # never reference an unbound name even if parsing fails half-way.
    hbh = fnum
    phdate = time.strftime("%Y-%m-%d")      # scrape date
    szm_str = dep
    szm_end = arr
    str_time = ' '
    end_time = ' '
    jt = ' '
    km = ''
    km_time = ' '
    jx = ' '
    jxdx = ' '
    can = ' '
    pf = ' '
    updatetime = time.strftime("%Y-%m-%d")  # record-update date
    try:
        for li in soup.select('div[class="fl three-lef"]'):   # departure time
            str_time = li.get_text()

        for li in soup.select('div[class="fr three-rig"]'):   # arrival time
            end_time = li.get_text()

        for li in soup.select('div[class="fl three-mid"]'):   # stopover
            jt = li.get_text()
        if jt != ' ':
            jt = jt[4:]  # strip the 4-character label prefix

        for li in soup.select('p[class="one"]'):              # distance (km)
            km = li.get_text()
            km = km[4:]

        for li in soup.select('p[class="two"]'):              # duration (min)
            km_time = li.get_text()
            km_time = km_time[4:]

        # Aircraft model label.
        for li in soup.select('span[style="max-width:75px!important"]'):
            jx = li.get_text()

        # Aircraft size category.
        if soup.select('span[title="大型客機"]'):
            jxdx = '大型客機'
        elif soup.select('span[title="中型客機"]'):
            jxdx = '中型客機'
        elif soup.select('span[title="小型客機"]'):
            # BUG FIX: the original assigned '中型客機' here (copy-paste).
            jxdx = '小型客機'

        # Meal service marker present on the page.
        if soup.select('span[class="totalCont"]'):
            can = '提供'

        for li in soup.select('span[class="score cur"]'):     # comfort score
            pf = li.get_text()
    finally:
        # NOTE(review): return-in-finally swallows any exception raised
        # above; kept for compatibility with the original control flow.
        return (hbh, phdate, szm_str, szm_end, str_time, end_time, jt, km,
                km_time, jx, jxdx, can, pf, updatetime)


ok_ip = []   # proxy IPs verified to still work
all_ip = []  # candidate proxy IPs read from the proxy file

# Page-body markers that mean the request was blocked or rate-limited and
# must be retried through a proxy.  NOTE: the original checked
# "502 Bad Gateway" only in one of its three marker chains; unified here.
_BLOCK_MARKERS = (
    "502 Bad Gateway",
    "Notifica: timeout del gateway",
    "The requested URL could not be retrieved",
    "main notFound",
    "此類查詢已達當日上限",
)


def _is_blocked(content):
    """True when the page text contains any rate-limit/error marker."""
    return any(content.find(marker) > 0 for marker in _BLOCK_MARKERS)


# Request the detail page for the given flight query.
def get_content(fnum, dep, arr, date, type):
    """Fetch the variflight detail page, falling back to proxy IPs when
    rate-limited, and return the parsed flight info ([] when none)."""
    global ok_ip
    global all_ip
    # BUG FIX: initialise `ok` -- the original could raise
    # UnboundLocalError in its except path (`return ok` before assignment).
    ok = []

    # Build the query URL once instead of repeating the concatenation.
    url = ('http://happiness.variflight.com/info/detail?fnum=' + fnum +
           '&dep=' + dep + '&arr=' + arr + '&date=' + date + '&type=' + type)

    # First attempt straight from the local IP.
    content = requests.get(url).text
    soup = BeautifulSoup(content, 'html.parser')

    # Blocked or limited -> retry through proxies.
    if _is_blocked(content):
        # BUG FIX: the original never closed this file handle.
        with open('代理IP(2017-12-25).txt') as ipinfo:
            all_ip = ipinfo.read().splitlines()

        if len(ok_ip) > 0:  # a previously verified proxy is available
            iptext = ok_ip[0]
            proxies = {'http': '//' + iptext, 'https': '//' + iptext}
            try:
                content = requests.get(url, proxies=proxies).text
                soup = BeautifulSoup(content, 'html.parser')
                # The cached proxy hit the limit too -> forget it.
                if _is_blocked(content):
                    ok_ip.remove(iptext)
            except Exception:
                # best-effort: proxy/network failure, keep whatever we have
                pass

        else:  # no verified proxy yet: probe the candidate list
            for iptext in all_ip:
                proxies = {'http': '//' + iptext, 'https': '//' + iptext}
                try:
                    content = requests.get(url, proxies=proxies).text
                    soup = BeautifulSoup(content, 'html.parser')
                    if _is_blocked(content):
                        # BUG FIX: the original called ok_ip.remove(iptext)
                        # here, but iptext was never added to ok_ip, so it
                        # always raised ValueError (swallowed by except).
                        continue  # this proxy is limited too: try the next
                    # Working proxy found: remember it and stop probing.
                    ok_ip.append(iptext)
                    print('目前可用IP:' + iptext)
                    break
                except Exception:
                    continue  # dead proxy: try the next one

    # Page explicitly says no flight matched the query.
    if content.find("沒有找到您輸入的航班信息") > 0:
        ok = []
    # Query succeeded: parse the flight details.
    else:
        try:
            ok = get_info(fnum, soup, dep, arr)
        except Exception:
            return ok
    # Return the flight info (or [] when nothing was found).
    return ok


# Append one scraped flight to the CSV file.
def save(fnum, dep, arr, date, type):
    """Scrape one flight and append the result row to Flight_Info.csv."""
    try:
        content = get_content(fnum, dep, arr, date, type)
        # newline='' is the csv-module convention for output files; the
        # original also called f.close() inside the with-block (redundant).
        with open('Flight_Info.csv', 'a', newline='') as f:
            csv.writer(f).writerows([content])
    except Exception:
        pass  # best-effort: one failed flight must not stop the program


hbb = ''     # flight number entered by the user
szm_cf = ''  # departure three-letter code entered by the user
szm_md = ''  # arrival three-letter code entered by the user


# Interactive single-flight scrape.
def py_info():
    """Prompt for one flight's details, scrape it, and save to CSV."""
    global hbb
    global szm_cf
    global szm_md
    try:
        print('請輸入航班號:')
        hbb = input()        # flight number
        print('請輸入出發地三字碼:')
        szm_cf = input()     # departure three-letter code
        print('請輸入目的地三字碼:')
        szm_md = input()     # arrival three-letter code
        hblx = '1'                         # flight type defaults to 1
        hbrq = time.strftime("%Y-%m-%d")   # date defaults to today
        save(hbb, szm_cf, szm_md, hbrq, hblx)  # scrape + write CSV row
        print(hbb + '航班爬取完成!')

    # Scrape failed: report which flight and carry on.
    except Exception:
        # BUG FIX: the original printed `hbh`, an undefined name
        # (NameError inside the handler); it meant `hbb`.
        print(hbb + '航班爬取出錯' + szm_cf + szm_md)


# Program entry point.
if __name__ == '__main__':
    py_info()
