1、新建文件config.json,內容如下,文件放在pyspider文件夾下,路徑為D:\Python\Python36\Lib\site-packages\pyspider
{
"taskdb": "mongodb+taskdb://127.0.0.1:27017/pyspider_taskdb",
"projectdb": "mongodb+projectdb://127.0.0.1:27017/pyspider_projectdb",
"resultdb": "mongodb+resultdb://127.0.0.1:27017/pyspider_resultdb",
"message_queue": "redis://127.0.0.1:6379/0",
"webui": {
"port": 5000
}
}
2、安裝redis,在redis文件夾下啟動終端,運行命令啟動redis服務端
E:\redis>redis-server.exe redis.windows.conf
redis默認15個數據庫,db0,db1...上述文件選擇index為0的db數據庫
若想啟動客戶端,運行命令如下,set,get為測試
E:\redis>redis-cli.exe -h 127.0.0.1 -p 6379
127.0.0.1:6379> set myKey abc
OK
127.0.0.1:6379> get myKey
"abc"
127.0.0.1:6379>
3、安裝mongoDB,先建好數據文件夾\data\db,並通過--dbpath參數配置到mongoDB里去(該文件夾必須事先存在,否則mongod會啟動失敗)
在bin文件夾下運行命令
E:\MongoDB\Server\4.0\bin>mongod.exe --dbpath \data\db
在客戶端運行一些查詢命令
show dbs
查看有哪些數據庫
db
查看當前數據庫
use dbname
使用dbname數據庫作為當前數據庫
show tables / show collections
查看當前數據庫下的表或集合,都指一個意思
db.website.find()
查看當前數據庫下website集合的數據內容
db.website.find().count()
查看website表里數據總數
4、啟動redis,啟動mongoDB后,啟動pyspider,並把新加的配置文件配置進去
D:\Python\Python36\Lib\site-packages\pyspider>pyspider --config config.json
5、發現需要安裝第三方模塊
pip install redis
pip install pymongo
6、在項目里重寫(覆寫)函數on_result
import pymongo


def on_result(self, result):
    """pyspider result hook: persist each detail_page result into MongoDB.

    This hook fires once per crawled link; only detail_page returns a
    non-empty result, so empty results are skipped.
    """
    if not result:
        # Called for every link; only detail_page produces data.
        return
    client = pymongo.MongoClient(host='127.0.0.1', port=27017)
    db = client['pyspider_projectdb']  # database (may also be the one set in config.json)
    coll = db['website']               # collection, i.e. the "table"
    data = {
        'originalLink': result['originalLink'],
        'productName': result['productName'],
        'price': result['price'],
        'productDescription': result['productDescription'],
        'category1': result['category1'],
        'category2': result['category2'],
        'category3': result['category3'],
        'images': result['images'],
    }
    # Collection.insert() was removed in PyMongo 4; insert_one() is the
    # supported API and returns an InsertOneResult with the new _id.
    data_id = coll.insert_one(data).inserted_id
    print(data_id)
7、完整代碼
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2018-11-08 09:56:40
# Project: product

from pyspider.libs.base_handler import *
import re
import base64
import os
import urllib
import urllib.request
import requests
import json
import pymongo
import uuid


class Handler(BaseHandler):
    """Crawl zhe800.com product pages and store each product in MongoDB."""

    def default(self, obj):
        # json helper: make bytes values serializable as UTF-8 text.
        if isinstance(obj, bytes):
            return str(obj, encoding='utf-8')
        return json.JSONEncoder.default(self, obj)

    crawl_config = {
        "headers": {
            "User-Agent": "BaiDuSpider",
        }
    }

    @every(minutes=24 * 60)
    def on_start(self):
        self.crawl('https://www.zhe800.com/', callback=self.index_page,
                   validate_cert=False)

    @config(age=10 * 24 * 60 * 60)
    def index_page(self, response):
        # Product links go to detail_page; brand pages are crawled as
        # further index pages.
        for each in response.doc('a[href^="http"]').items():
            if re.match(r'https://shop.zhe800.com/products/.+', each.attr.href):
                self.crawl(each.attr.href, callback=self.detail_page,
                           validate_cert=False, connect_timeout=50, timeout=200)
            elif re.match(r'https://brand.zhe800.com/.+', each.attr.href):
                self.crawl(each.attr.href, callback=self.index_page,
                           validate_cert=False, connect_timeout=50, timeout=200)

    @config(priority=2)
    def detail_page(self, response):
        """Extract one product: gallery images (base64), category path,
        price and description; append it to a local file and return it so
        on_result can store it in MongoDB."""
        if not response.doc('h1').text():
            # No product title -> not a real detail page.
            return
        x = 1
        imageresult = []   # image dicts: {'id': n, 'base64': ...}
        results = []       # list wrapper for the JSON payload below
        description = ''
        result = dict()
        headers = {"Content-Type": "application/json"}
        path = 'D:\\pythonlianxi\\testimg'
        if not os.path.isdir(path):
            os.makedirs(path)
        paths = path + '\\'
        # Download every gallery image to disk, then re-encode as base64.
        for img in response.doc('div[class="deteilpic l"]>UL>LI>A>IMG').items():
            if re.match(r'.+?\.jpg', img.attr.src):
                urllib.request.urlretrieve(img.attr.src,
                                           '{0}{1}.jpg'.format(paths, x))
                with open(paths + str(x) + ".jpg", "rb") as f:
                    base64_data = base64.b64encode(f.read()).decode()
                imageresult.append({'id': x, 'base64': base64_data})
                x = x + 1
        # Breadcrumb "home > cat1 > cat2 > cat3": missing levels fall back
        # to placeholder names (targeted IndexError instead of bare except).
        catagoary1, catagoary2, catagoary3 = "category1", "category2", "category3"
        for each in response.doc('aside[class="pos area"]').items():
            parts = each.text().split(' > ')
            try:
                catagoary1 = parts[1]
                catagoary2 = parts[2]
                catagoary3 = parts[3]
            except IndexError:
                pass
        # Price: try a direct float(); otherwise pull the first number out
        # of the text, defaulting to 0 when none is present.
        pricebefore = response.doc('strong[class="red js_price_st"]>I').text()
        try:
            price = float(pricebefore)
        except (TypeError, ValueError):
            pricearray = re.findall(r'[0-9]*\.?[0-9]+', pricebefore)
            if not pricearray:
                pricearray = [0]
            price = pricearray[0]
        # Concatenate the title attributes of the spec list as description.
        for des in response.doc('ul[class="list12 clear"]>LI').items():
            if des.attr.title:
                description = description + des.attr.title
        result['id'] = ''.join(str(uuid.uuid4()).split('-'))
        result['originalLink'] = response.url
        result['productName'] = response.doc('h1').text()
        result['price'] = price
        result['productDescription'] = description
        result['category1'] = catagoary1
        result['category2'] = catagoary2
        result['category3'] = catagoary3
        result['images'] = imageresult
        # Keep a local text log of every scraped product.
        filename = "D:\\pythonlianxi\\zhe800.txt"
        with open(filename, 'a') as f:
            f.write(str(result) + '\n')
        results.append(result)
        payload = json.dumps(results)
        # Optional push to a downstream indexing service:
        # r = requests.post('http://192.168.1.160:8764/index/products', data=payload, headers=headers)
        return {
            'id': result['id'],
            'price': price,
            "originalLink": response.url,
            "productName": response.doc('h1').text(),
            'productDescription': description,
            'category1': catagoary1,
            'category2': catagoary2,
            'category3': catagoary3,
            'images': imageresult,
        }

    def on_result(self, result):
        """pyspider result hook: store each non-empty result in MongoDB."""
        if not result:
            # Fires for every task; only detail_page produces data.
            return
        print(result)
        client = pymongo.MongoClient(host='127.0.0.1', port=27017)
        db = client['pyspider_projectdb']
        coll = db['productzhe']
        data = {
            'id': result['id'],
            'originalLink': result['originalLink'],
            'productName': result['productName'],
            'price': result['price'],
            'productDescription': result['productDescription'],
            'category1': result['category1'],
            'category2': result['category2'],
            'category3': result['category3'],
            'images': result['images'],
        }
        # Collection.insert() was removed in PyMongo 4; insert_one() is the
        # supported replacement.
        data_id = coll.insert_one(data).inserted_id