Workflow: find the page to scrape (the page used here: http://www.gaosan.com/gaokao/239012.html) → open Spyder → enter the scraper code → check the scraped results → write them to a CSV file.
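Before running the full scraper, it can help to confirm the page is reachable and decodes correctly. A minimal sketch of that first step (the timeout and the apparent_encoding handling mirror the full code below; the print line is just an illustrative check):

import requests

url = 'http://www.gaosan.com/gaokao/239012.html'
r = requests.get(url, timeout=30)
r.raise_for_status()                 # raise if the server returned an error status
r.encoding = r.apparent_encoding     # let requests guess the real encoding of the page
print(r.status_code, len(r.text))    # expect 200 and a non-trivial amount of HTML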
Core code for writing the CSV output:
# (Relies on os, csv, allUniv, and the helper functions from the full listing below.)
def writercsv(save_road, num, title):
    # Append to the CSV if it already exists; otherwise create it and write the header first.
    # utf-8-sig is used so Excel displays the Chinese text correctly.
    if os.path.isfile(save_road):
        with open(save_road, 'a', newline='', encoding='utf-8-sig') as f:
            csv_write = csv.writer(f, dialect='excel')
            for i in range(num):
                u = allUniv[i]
                csv_write.writerow(u)
    else:
        with open(save_road, 'w', newline='', encoding='utf-8-sig') as f:
            csv_write = csv.writer(f, dialect='excel')
            csv_write.writerow(title)
            for i in range(num):
                u = allUniv[i]
                csv_write.writerow(u)

title = ["排名", "學校名稱", "綜合得分", "省份"]

def main():
    url = 'http://www.gaosan.com/gaokao/239012.html'
    html = getHTMLText(url)
    soup = BeautifulSoup(html, "html.parser")
    fillUnivList(soup)
    printUnivList(250)
    # Define the output path and row count, plus the header
    writercsv('E:\\python\\爬蟲數據.csv', 250, title)
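To see the create-vs-append behavior in isolation, here is a hypothetical demonstration with made-up sample rows and a local demo.csv path (neither appears in the original code):

# Hypothetical demo: two made-up rows, written to a local demo.csv
allUniv = [["1", "某大學A", "100", "北京"], ["2", "某大學B", "98.5", "上海"]]
title = ["排名", "學校名稱", "綜合得分", "省份"]
writercsv('demo.csv', 2, title)  # file absent: header plus 2 rows are written
writercsv('demo.csv', 2, title)  # file exists: 2 more rows appended, no second header

Note that repeated runs keep appending rows, so deleting the old file (or writing to a fresh path) avoids duplicate data.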
Complete scraper code:
import os
import csv
import requests
from bs4 import BeautifulSoup

allUniv = []  # accumulates one [rank, name, score, province] list per university

def getHTMLText(url):
    # Fetch the page; fall back to an empty string on any network error.
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except Exception:
        return ""

def fillUnivList(soup):
    # Every table row that contains <td> cells is one university record.
    for tr in soup.find_all('tr'):
        ltd = tr.find_all('td')
        if len(ltd) == 0:
            continue
        singleUniv = [td.string for td in ltd]
        allUniv.append(singleUniv)

def printUnivList(num):
    print("{:^10}{:^10}{:^5}{:^8}".format("排名", "學校名稱", "綜合得分", "省份"))
    for i in range(num):
        u = allUniv[i]
        print("{:^10}{:^10}{:^5}{:^8}".format(u[0], u[1], u[2], u[3]))

def writercsv(save_road, num, title):
    # Append if the file exists; otherwise create it and write the header row first.
    # utf-8-sig is used so Excel displays the Chinese text correctly.
    if os.path.isfile(save_road):
        with open(save_road, 'a', newline='', encoding='utf-8-sig') as f:
            csv_write = csv.writer(f, dialect='excel')
            for i in range(num):
                u = allUniv[i]
                csv_write.writerow(u)
    else:
        with open(save_road, 'w', newline='', encoding='utf-8-sig') as f:
            csv_write = csv.writer(f, dialect='excel')
            csv_write.writerow(title)
            for i in range(num):
                u = allUniv[i]
                csv_write.writerow(u)

title = ["排名", "學校名稱", "綜合得分", "省份"]

def main():
    url = 'http://www.gaosan.com/gaokao/239012.html'
    html = getHTMLText(url)
    soup = BeautifulSoup(html, "html.parser")
    fillUnivList(soup)
    printUnivList(250)
    # Define the output path and row count, plus the header
    writercsv('E:\\python\\爬蟲數據.csv', 250, title)

main()
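After the script finishes, a quick read-back is a simple way to confirm the file was written as expected. A sketch, assuming the same path and utf-8-sig encoding used in writercsv above (the expected row count is illustrative):

import csv

# Sanity check: read the CSV back and count rows.
with open('E:\\python\\爬蟲數據.csv', newline='', encoding='utf-8-sig') as f:
    rows = list(csv.reader(f))
print(len(rows))          # expect 251: one header row plus 250 data rows
print(rows[0], rows[1])   # the header, then the top-ranked entry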
The scraping results are shown in the figure below:
A CSV file is generated at the corresponding path.
Opening the file looks like this: