整體思路是從8684公交網上抓取各城市的公交線路名稱列表,然後在高德地圖中利用api交互來抓包獲取站點的數據,字段包括{ UID,線路,站點名,站點x坐標,站點y坐標 }。
這里爬8684用了requests和BeautifulSoup,比較簡單就不闡述了。最后的存儲同時存了csv和xlsx兩種格式。
# -*- coding: utf-8 -*-
"""Scrape bus-line name lists from 8684.cn for a set of Anhui cities.

For each city: fetch the page's index letters/digits, collect every bus-line
name listed under each index, then persist the collected names as both
<city>.csv and <city>.xlsx.
"""
import requests
from bs4 import BeautifulSoup
import pandas as pd
import xlrd  # NOTE(review): unused in this script; kept in case other code relies on it

# 8684.cn subdomain slugs for the target cities.
citys = ["hefei", "wuhu", "bengbu", "huainan", "maanshan", "huaibei",
         "tongling", "anqing", "huangshan", "chuzhou", "fuyang", "suzhou2",
         "liuan", "bozhou", "chizhou", "xuancheng"]

# Shared request headers (was duplicated in two functions).
# Replace the placeholder with a real browser User-Agent string.
_HEADERS = {'User-Agent': "自己的UA"}

# Network timeout in seconds — without it a stalled server hangs forever.
_TIMEOUT = 30


def getInitial(cityName):
    """Return the index letters/digits (e.g. ['1', ..., 'A', ..., 'Z']) for a city.

    Scrapes https://<cityName>.8684.cn/list1 and reads the 4th
    'tooltip-inner' div, which holds the index navigation links.
    """
    url = 'https://{}.8684.cn/list1'.format(cityName)
    resp = requests.get(url, headers=_HEADERS, timeout=_TIMEOUT)
    soup = BeautifulSoup(resp.text, 'lxml')
    # NOTE(review): index [3] depends on the page's current layout;
    # re-verify if 8684.cn changes its markup.
    nav = soup.find_all('div', {'class': 'tooltip-inner'})[3]
    return [a.get_text() for a in nav.find_all('a')]


def getLine(cityName, n):
    """Return the bus-line names listed under index *n* for *cityName*.

    Fixed: previously this appended into a module-level global ``lines``;
    it now returns the names so the caller owns the accumulation.
    """
    url = 'https://{}.8684.cn/list{}'.format(cityName, n)
    resp = requests.get(url, headers=_HEADERS, timeout=_TIMEOUT)
    soup = BeautifulSoup(resp.text, 'lxml')
    busline = soup.find('div', {'class': 'list clearfix'})
    if busline is None:
        # Page has no line list under this index — nothing to collect
        # (the original would have crashed with AttributeError here).
        return []
    return [a.get_text() for a in busline.find_all('a')]


def storageCsv(listBus, cityName):
    """Write the collected line names to <cityName>.csv (no index column)."""
    result = pd.DataFrame(listBus)
    result.to_csv("{}.csv".format(cityName), index=False, na_rep="NULL")


def csv_to_xlsx_pd(cityName):
    """Convert <cityName>.csv to <cityName>.xlsx (header row dropped, as before)."""
    csv = pd.read_csv('{}.csv'.format(cityName), encoding='utf-8')
    csv.to_excel('{}.xlsx'.format(cityName), header=False, index=False)


def main():
    """Scrape every city in ``citys`` and persist its full line list."""
    for city in citys:
        lines = []
        # One request per index letter; extend with whatever that page lists.
        for initial in getInitial(city):
            lines.extend(getLine(city, initial))
        storageCsv(lines, city)
        csv_to_xlsx_pd(city)
        print(city, "中了")


# Guard so importing this module no longer triggers the full scrape.
if __name__ == "__main__":
    main()
得到的csv如圖: