scrapy爬蟲還是很簡單的,主要是三部分:spider,item,pipeline
其中后面兩個也是通用套路,需要詳細解析的也就是spider。
具體如下:
在網上找了幾個汽車網站,后來敲定,以易車網作為爬取站點
原因在於,其數據源實在是太方便了。
看這個頁面,左邊按照品牌排序,搜索子品牌,再挨個查看信息即可
按照通常的思路,是需要手動解析左邊這列表
找出每個品牌的鏈接頁面
結果分析源碼發現,網站的導航欄是直接通過js生成的,請求下面這個鏈接返回的json即可獲得所有品牌的信息
http://api.car.bitauto.com/CarInfo/getlefttreejson.ashx?tagtype=baojia&pagetype=masterbrand&objid=2
直接解析其中需要的數據即可
如下圖
理論上可以按json解析,我沒嘗試(注意返回的是JSONP,且鍵名沒有引號,不是嚴格的JSON,需先處理才能交給json.loads),這裡采用最簡單的正則匹配提取
代碼很簡單
import re

# Abridged sample of the JSONP text served by the getlefttreejson.ashx link;
# the real payload is much longer and was cut short for this example.
json_str = """ 上面全部的數據 JsonpCallBack({char:{A:1,B:1,C:1,D:1,E:0,F:1,G:1,H:1,I:0,J:1,K:1,L:1,M:1,
N:1,O:1,P:1,Q:1,R:1,S:1,T:1,U:0,V:0,W:1,X:1,Y:1,Z:1}
,brand:{A:[{type:"mb",id:9,name:"奧迪",url:"/mb9/",cur:0,num:95546}
~~~~~~~~~~太長 剩余的代碼中我省略了 """

# Collect every master-brand path of the form /mb<digits>/ found in the text.
result = re.findall(r'\/mb\d+\/', json_str)

# Parenthesised call: prints the same list on Python 2 and, unlike the bare
# ``print result`` statement, is also valid syntax on Python 3.
print(result)
#mb_pages = ['/mb9/', '/mb97/', ] #192條
所以,品牌頁的代碼:
def parse(self, response):
    """Schedule one request per hard-coded master-brand page.

    The incoming ``response`` is not inspected; the brand list is fixed.

    Yields:
        scrapy.Request objects handled by ``parse_brand_page``, each carrying
        its brand path in ``meta['treeurl']``.
    """
    host = 'price.bitauto.com'
    # Brand paths to crawl: /mb2/ Benz, /mb3/ BMW, /mb9/ Audi.
    brand_paths = ('/mb2/', '/mb3/', '/mb9/')
    for brand_path in brand_paths:
        yield scrapy.Request(
            url='http://' + host + brand_path,
            meta={'treeurl': brand_path},
            callback=self.parse_brand_page,
        )
品牌頁面下面的子品牌
元素定位,爬取,代碼如下
def parse_brand_page(self, response):
    """Follow every car-series link listed on a master-brand page.

    Shell test: scrapy shell http://price.bitauto.com/mb196/

    Reads ``meta['treeurl']`` from the incoming request and the brand name
    from the page's navigation heading.

    Yields:
        scrapy.Request objects handled by ``parse_car_page`` with
        ``treeurl`` and ``brand`` forwarded in ``meta``.

    Raises:
        IndexError: if the brand heading is not present on the page.
    """
    page_xpath = '//div[@id="c_result"]/div[@class="carpic_list"]/ul/li'
    page_root = 'price.bitauto.com'
    treeurl = response.meta['treeurl']
    brand = response.xpath('//div[@class="tree_navigate"]/div/strong/text()').extract()[0]
    for info in response.xpath(page_xpath):
        hrefs = info.xpath('a/attribute::href').extract()
        if not hrefs:
            # A list entry without a link used to raise IndexError here and
            # abort every remaining series on the page; skip just that entry.
            continue
        page_url = 'http://' + page_root + hrefs[0]
        yield scrapy.Request(
            url=page_url,
            meta={'treeurl': treeurl, 'brand': brand},
            callback=self.parse_car_page,
        )
到了款型詳情頁,然而我們需要的是參數頁,所以繼續request跳轉
def parse_car_page(self, response):
    """Hop from a car-series page to the page linked by ``a#linkOutCs``.

    Yields:
        A single scrapy.Request for that link's href, handled by
        ``parse_detail_page``, forwarding ``treeurl`` and ``brand`` in meta.
    """
    config_links = response.xpath('//a[@id="linkOutCs"]/@href').extract()
    target = config_links[0]
    inherited = response.meta
    carried = {'treeurl': inherited['treeurl'], 'brand': inherited['brand']}
    yield scrapy.Request(url=target, meta=carried, callback=self.parse_detail_page)
最后到參數頁,這才是我們需要的數據啊
分析源代碼發現,所有的數據依舊是在js代碼以字符串形式存在的,這種代碼,正是正則的用武之地啊
數據字符串是三個[[[]]]嵌套的字符串,處理方式為
import ast

# Grab the whole [[[ ... ]]] literal embedded in the page's JavaScript.
ff = re.search(r'\[\[\[.*\]\]\]', response.body).group()  # str
# The text was downloaded from a remote server, so it is parsed with
# ast.literal_eval (literals only) instead of eval(), which would execute
# whatever code the page happened to contain.
infos = ast.literal_eval(ff)
ff獲得的是整個[[[]]]的內容,然后用eval轉化成python的值(注意:eval會執行任意代碼,對抓取來的數據更穩妥的做法是用ast.literal_eval),然后再用循環取對應位置的數據即可。代碼如下
def parse_detail_page(self, response):
    """Extract one item per car version from a parameter page.

    Shell test: scrapy shell http://car.bitauto.com/changchengh5/peizhi/

    The page embeds its data as a triple-nested ``[[[...]]]`` literal inside
    JavaScript. Each top-level entry describes one version and is a list of
    groups; fields are picked out of those groups by fixed position.

    Yields:
        BitautoCarItem, one per entry of the embedded array. Nothing is
        yielded when the page contains no such array.

    Raises:
        ValueError, SyntaxError: if the embedded text is not a plain literal.
        IndexError: if an entry is shorter than the positions read below.
    """
    # Local import keeps this method self-contained.
    import ast

    # (item field, group index, position inside the group)
    field_positions = (
        ('carid', 0, 0),               # e.g. "117388"
        ('version', 0, 1),             # full version/trim name
        ('image', 0, 2),
        ('brandmodel4', 0, 4),         # model name, e.g. Haval H5, Benz E-Class
        ('brandmodel5', 0, 5),
        ('brandurl', 0, 6),            # URL slug, e.g. changchengh5, benchieji
        ('cyear', 0, 7),
        ('ctype', 0, 12),              # e.g. "SUV"
        ('color', 0, 13),
        ('price1', 1, 0),              # manufacturer's suggested price
        ('price2', 1, 1),              # dealer quote
        ('displacement', 1, 5),        # engine displacement (L), e.g. "2.0"
        ('shiftgears', 1, 6),          # number of gears, e.g. "6"
        ('shifttype', 1, 7),           # gearbox type, e.g. manual
        ('clength', 2, 0),             # length ("c" prefix added for clarity)
        ('cwidth', 2, 1),              # width
        ('cheight', 2, 2),             # height
        ('wheelbase', 2, 3),
        ('mingrounddistance', 2, 8),   # minimum ground clearance
        ('motor', 3, 1),               # engine model
        ('intaketype', 3, 5),          # air-intake type
        ('maxhorsepower', 3, 13),      # maximum horsepower (Ps)
        ('maxpower', 3, 14),           # maximum power (kW)
        ('maxrpm', 3, 15),             # rpm at maximum power
        ('oiltype', 3, 19),            # fuel type
        ('oilsupply', 3, 21),          # fuel-supply method
        ('tankvolume', 3, 22),         # fuel-tank volume (L)
        ('braketype', 5, 5),           # parking-brake type
        ('drivetype', 5, 6),           # drive type
        ('frontwheel', 7, 0),          # front wheel
        ('backwheel', 7, 1),           # rear wheel
    )

    match = re.search(r'\[\[\[.*\]\]\]', response.body)
    if match is None:
        # No embedded data array: previously this crashed with
        # AttributeError on ``None.group()``.
        return

    # The text comes from a remote server, so parse it with
    # ast.literal_eval (literals only) rather than eval(), which would
    # execute arbitrary code found in the page.
    infos = ast.literal_eval(match.group())

    for s_second in infos:
        item = BitautoCarItem()
        item['url'] = response.url
        item['brand'] = response.meta['brand']
        item['treeurl'] = response.meta['treeurl']
        for field, group, position in field_positions:
            item[field] = s_second[group][position]
        yield item
以上,整個爬取代碼,為:
#!/usr/bin/env python
# coding=utf-8
import ast
import re

import scrapy

from Car_spider.items import BitautoCarItem


class BitautoSpider(scrapy.Spider):
    """Crawl bitauto.com from brand pages down to per-version parameters.

    Callback chain:
    parse -> parse_brand_page -> parse_car_page -> parse_detail_page.
    """

    name = 'bitauto'
    allowed_domains = ['bitauto.com']
    start_urls = ['http://price.bitauto.com/mb2/',]

    # (item field, group index, position inside the group) for the nested
    # array embedded in a parameter page.
    _FIELD_POSITIONS = (
        ('carid', 0, 0),               # e.g. "117388"
        ('version', 0, 1),             # full version/trim name
        ('image', 0, 2),
        ('brandmodel4', 0, 4),         # model name, e.g. Haval H5, Benz E-Class
        ('brandmodel5', 0, 5),
        ('brandurl', 0, 6),            # URL slug, e.g. changchengh5, benchieji
        ('cyear', 0, 7),
        ('ctype', 0, 12),              # e.g. "SUV"
        ('color', 0, 13),
        ('price1', 1, 0),              # manufacturer's suggested price
        ('price2', 1, 1),              # dealer quote
        ('displacement', 1, 5),        # engine displacement (L), e.g. "2.0"
        ('shiftgears', 1, 6),          # number of gears, e.g. "6"
        ('shifttype', 1, 7),           # gearbox type, e.g. manual
        ('clength', 2, 0),             # length ("c" prefix added for clarity)
        ('cwidth', 2, 1),              # width
        ('cheight', 2, 2),             # height
        ('wheelbase', 2, 3),
        ('mingrounddistance', 2, 8),   # minimum ground clearance
        ('motor', 3, 1),               # engine model
        ('intaketype', 3, 5),          # air-intake type
        ('maxhorsepower', 3, 13),      # maximum horsepower (Ps)
        ('maxpower', 3, 14),           # maximum power (kW)
        ('maxrpm', 3, 15),             # rpm at maximum power
        ('oiltype', 3, 19),            # fuel type
        ('oilsupply', 3, 21),          # fuel-supply method
        ('tankvolume', 3, 22),         # fuel-tank volume (L)
        ('braketype', 5, 5),           # parking-brake type
        ('drivetype', 5, 6),           # drive type
        ('frontwheel', 7, 0),          # front wheel
        ('backwheel', 7, 1),           # rear wheel
    )

    def parse(self, response):
        """Schedule one request per hard-coded master-brand page.

        The incoming ``response`` is not inspected; the brand list is fixed.
        Each request carries its brand path in ``meta['treeurl']``.
        """
        page_root = 'price.bitauto.com'
        # Brand paths to crawl: /mb2/ Benz, /mb3/ BMW, /mb9/ Audi.
        mb_pages = ['/mb2/', '/mb3/', '/mb9/']
        for info in mb_pages:
            yield scrapy.Request(
                url='http://' + page_root + info,
                meta={'treeurl': info},
                callback=self.parse_brand_page,
            )

    def parse_brand_page(self, response):
        """Follow every car-series link listed on a master-brand page.

        Shell test: scrapy shell http://price.bitauto.com/mb196/

        Raises:
            IndexError: if the brand heading is not present on the page.
        """
        page_xpath = '//div[@id="c_result"]/div[@class="carpic_list"]/ul/li'
        page_root = 'price.bitauto.com'
        treeurl = response.meta['treeurl']
        brand = response.xpath('//div[@class="tree_navigate"]/div/strong/text()').extract()[0]
        for info in response.xpath(page_xpath):
            hrefs = info.xpath('a/attribute::href').extract()
            if not hrefs:
                # A list entry without a link used to raise IndexError and
                # abort the remaining series; skip just that entry.
                continue
            yield scrapy.Request(
                url='http://' + page_root + hrefs[0],
                meta={'treeurl': treeurl, 'brand': brand},
                callback=self.parse_car_page,
            )

    def parse_car_page(self, response):
        """Hop from a car-series page to the page linked by ``a#linkOutCs``.

        Forwards ``treeurl`` and ``brand`` to ``parse_detail_page``.
        """
        peizhi_xpath = '//a[@id="linkOutCs"]/@href'
        page_url = response.xpath(peizhi_xpath).extract()[0]
        treeurl = response.meta['treeurl']
        brand = response.meta['brand']
        yield scrapy.Request(
            url=page_url,
            meta={'treeurl': treeurl, 'brand': brand},
            callback=self.parse_detail_page,
        )

    def parse_detail_page(self, response):
        """Extract one BitautoCarItem per car version from a parameter page.

        Shell test: scrapy shell http://car.bitauto.com/changchengh5/peizhi/

        The page embeds its data as a triple-nested ``[[[...]]]`` literal in
        JavaScript; each top-level entry is one version. Nothing is yielded
        when the page contains no such array.

        Raises:
            ValueError, SyntaxError: if the embedded text is not a literal.
            IndexError: if an entry is shorter than the positions read.
        """
        match = re.search(r'\[\[\[.*\]\]\]', response.body)
        if match is None:
            # Previously crashed with AttributeError on ``None.group()``.
            return

        # Remote text: parse with ast.literal_eval (literals only) rather
        # than eval(), which would execute arbitrary code from the page.
        infos = ast.literal_eval(match.group())

        for s_second in infos:
            item = BitautoCarItem()
            item['url'] = response.url
            item['brand'] = response.meta['brand']
            item['treeurl'] = response.meta['treeurl']
            for field, group, position in self._FIELD_POSITIONS:
                item[field] = s_second[group][position]
            yield item
前面只是從頁面層次去parse,沒敘述item,因為這個也簡單,沒啥需要敘述的。其定義代碼為
class BitautoCarItem(scrapy.Item):
    """Container for one car version scraped from a parameter page."""

    # --- identity and provenance ---
    carid = scrapy.Field()
    url = scrapy.Field()
    treeurl = scrapy.Field()
    brand = scrapy.Field()
    brandurl = scrapy.Field()
    brandmodel4 = scrapy.Field()        # model name, e.g. Haval H5
    brandmodel5 = scrapy.Field()        # model name, e.g. Haval H5
    version = scrapy.Field()            # full version/trim name
    image = scrapy.Field()
    cyear = scrapy.Field()
    ctype = scrapy.Field()              # e.g. "SUV"
    color = scrapy.Field()

    # --- pricing and gearbox ---
    price1 = scrapy.Field()             # manufacturer's suggested price
    price2 = scrapy.Field()             # dealer quote
    displacement = scrapy.Field()       # engine displacement (L), e.g. "2.0"
    shiftgears = scrapy.Field()         # e.g. "6"
    shifttype = scrapy.Field()          # e.g. manual

    # --- body dimensions ("c" prefix added for clarity) ---
    clength = scrapy.Field()            # length
    cwidth = scrapy.Field()             # width
    cheight = scrapy.Field()            # height
    wheelbase = scrapy.Field()
    mingrounddistance = scrapy.Field()  # minimum ground clearance

    # --- engine and fuel ---
    motor = scrapy.Field()              # engine model
    intaketype = scrapy.Field()         # air-intake type
    maxhorsepower = scrapy.Field()      # maximum horsepower (Ps)
    maxpower = scrapy.Field()           # maximum power (kW)
    maxrpm = scrapy.Field()             # rpm at maximum power
    oiltype = scrapy.Field()            # fuel type
    oilsupply = scrapy.Field()          # fuel-supply method
    tankvolume = scrapy.Field()         # fuel-tank volume (L)

    # --- chassis and wheels ---
    drivetype = scrapy.Field()          # drive type
    braketype = scrapy.Field()          # parking-brake type
    frontwheel = scrapy.Field()         # front tyre specification
    backwheel = scrapy.Field()          # rear tyre specification
至於Pipeline,隨意寫即可,也是套路而已,mongodb的pipeline如下
class MongoDBPipeline(object):
    """Item pipeline that stores every scraped item in a MongoDB collection.

    Connection parameters are read from the ``MONGODB_*`` settings.
    """

    def __init__(self):
        connection = MongoClient(
            settings['MONGODB_SERVER'],
            settings['MONGODB_PORT']
        )
        db = connection[settings['MONGODB_DB']]
        self.collection = db[settings['MONGODB_COLLECTION']]

    def process_item(self, item, spider):
        """Insert ``item`` into the collection and pass it on.

        Args:
            item: the scraped item; converted with ``dict(item)`` for storage.
            spider: the spider that produced the item (unused).

        Returns:
            The same ``item``. Scrapy feeds the return value to the next
            pipeline, so returning nothing would hand later stages ``None``.
        """
        self.collection.insert(dict(item))
        return item
也可以是關系型數據庫,如postgresql
class CarsPgPipeline(object):
    """Item pipeline that inserts BitautoCarItem rows into PostgreSQL.

    Connection parameters are read from the ``POSTGRES_*`` settings.
    """

    # Columns of the BitautoCar table; the item uses the same names as keys.
    _COLUMNS = (
        'carid', 'url', 'treeurl', 'brand', 'brandurl', 'brandmodel4',
        'brandmodel5', 'version', 'image', 'cyear', 'ctype', 'color',
        'price1', 'price2', 'displacement', 'shiftgears', 'shifttype',
        'clength', 'cwidth', 'cheight', 'wheelbase', 'mingrounddistance',
        'motor', 'intaketype', 'maxhorsepower', 'maxpower', 'maxrpm',
        'oiltype', 'oilsupply', 'tankvolume', 'drivetype', 'braketype',
        'frontwheel', 'backwheel',
    )

    # One ``%s`` placeholder per column. The values are bound by the driver at
    # execute() time instead of being pasted into the SQL text, so quotes in
    # scraped strings can neither break nor inject into the statement.
    _INSERT_SQL = "INSERT INTO BitautoCar(%s) VALUES (%s);" % (
        ','.join(_COLUMNS),
        ','.join(['%s'] * len(_COLUMNS)),
    )

    def __init__(self):
        self.connection = psycopg2.connect(
            database=settings['POSTGRES_DB'],
            user=settings['POSTGRES_USER'],
            password=settings['POSTGRES_PW'],
            host=settings['POSTGRES_SERVER'],
            port=settings['POSTGRES_PORT'],
        )
        self.cursor = self.connection.cursor()

    def process_item(self, item, spider):
        """Insert a BitautoCarItem as one row; pass every item through.

        Args:
            item: the scraped item. Only BitautoCarItem instances are stored.
            spider: the spider that produced the item (unused).

        Returns:
            The same ``item``, whether or not it was stored.

        Raises:
            KeyError: if a BitautoCarItem lacks one of the expected fields.
        """
        # Was ``instance(...)``, a NameError on the first item processed.
        if isinstance(item, BitautoCarItem):
            params = [item[column] for column in self._COLUMNS]
            try:
                self.cursor.execute(self._INSERT_SQL, params)
                self.connection.commit()
            except Exception as e:
                # Best effort: undo the failed statement, report, carry on.
                self.connection.rollback()
                print("Error: %s" % e)
        return item
項目setting部分
# Scrapy settings for the Car_spider project.

BOT_NAME = 'Car_spider'

SPIDER_MODULES = ['Car_spider.spiders']
NEWSPIDER_MODULE = 'Car_spider.spiders'

# Only the PostgreSQL pipeline is enabled.
ITEM_PIPELINES = {
    'Car_spider.pipelines.CarsPgPipeline': 1000,
}

# MongoDB connection, read by MongoDBPipeline.
MONGODB_SERVER = 'localhost'
MONGODB_PORT = 27017
MONGODB_DB = 'car'
MONGODB_COLLECTION = 'kache360'  # alternative used earlier: 'bitantotest'

# PostgreSQL connection, read by CarsPgPipeline (placeholder credentials).
POSTGRES_SERVER = 'localhost'
POSTGRES_PORT = 5432
POSTGRES_DB = 'yourdb'
POSTGRES_USER = 'yourname'
POSTGRES_PW = '123456'

# Crawl politeness.
ROBOTSTXT_OBEY = True
DOWNLOAD_DELAY = 3
RANDOMIZE_DOWNLOAD_DELAY = True