Xici Proxy (xicidaili.com) Spider
1. Create the project and spider
scrapy startproject daili_ips
......
cd daili_ips/
# spider name and allowed domain
scrapy genspider xici xicidaili.com
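After these two commands the project should look roughly like this (a sketch of the standard layout Scrapy generates; depending on the Scrapy version you may also see a middlewares.py):
daili_ips/
├── scrapy.cfg
└── daili_ips/
    ├── __init__.py
    ├── items.py
    ├── pipelines.py
    ├── settings.py
    └── spiders/
        ├── __init__.py
        └── xici.py        # created by `scrapy genspider xici xicidaili.com`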
2. Test the target site
In [1]: import requests
In [2]: r = requests.get('http://www.xicidaili.com/nn/1')
In [3]: r.status_code
Out[3]: 500
The response is 500. My guess was that this is caused by the missing User-Agent header:
In [4]: headers = {'User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36'}
In [5]: r = requests.get('http://www.xicidaili.com/nn/1', headers=headers)
In [6]: r.status_code
Out[6]: 200
In [7]:
With the header set, the request succeeds.
3. Uncomment USER_AGENT in the project's settings.py
# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36'
4. Write items.py
The item defines which fields will be stored:
import scrapy


class DailiIpsItem(scrapy.Item):
    ip = scrapy.Field()
    port = scrapy.Field()
    position = scrapy.Field()
    type = scrapy.Field()
    speed = scrapy.Field()
    last_check_time = scrapy.Field()
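For reference, a DailiIpsItem behaves much like a dict: you instantiate it and read or write the declared fields by key. A minimal sketch (the values are made up):
from daili_ips.items import DailiIpsItem

item = DailiIpsItem()
item['ip'] = '1.2.3.4'          # hypothetical value, just for illustration
item['port'] = '8080'
print item['ip'], item['port']  # fields read back like dict entries
Assigning a key that was not declared as a Field raises a KeyError, which helps catch typos early.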
5. Write the spider
# -*- coding: utf-8 -*-
import scrapy

from daili_ips.items import DailiIpsItem


class XiciSpider(scrapy.Spider):
    name = "xici"
    allowed_domains = ["xicidaili.com"]
    start_urls = (
        'http://www.xicidaili.com/',
    )

    def start_requests(self):
        res = []
        for i in range(1, 2):
            url = 'http://www.xicidaili.com/nn/%d' % i
            req = scrapy.Request(url)
            # collect one request per listing page
            res.append(req)
        return res

    def parse(self, response):
        table = response.xpath('//table[@id="ip_list"]')[0]
        trs = table.xpath('.//tr')[1:]  # relative path, skip the header row
        items = []
        for tr in trs:
            pre_item = DailiIpsItem()
            pre_item['ip'] = tr.xpath('td[2]/text()').extract()[0]
            pre_item['port'] = tr.xpath('td[3]/text()').extract()[0]
            pre_item['position'] = tr.xpath('string(td[4])').extract()[0].strip()
            pre_item['type'] = tr.xpath('td[6]/text()').extract()[0]
            pre_item['speed'] = tr.xpath('td[7]/div/@title').re(r'\d+\.\d*')[0]
            pre_item['last_check_time'] = tr.xpath('td[10]/text()').extract()[0]
            items.append(pre_item)
        return items
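parse() returns the full list here; it can just as well be written as a generator that yields each item as it is built, which Scrapy handles the same way. A sketch of the equivalent form:
    def parse(self, response):
        table = response.xpath('//table[@id="ip_list"]')[0]
        for tr in table.xpath('.//tr')[1:]:   # skip the header row
            pre_item = DailiIpsItem()
            pre_item['ip'] = tr.xpath('td[2]/text()').extract()[0]
            # ... fill the remaining fields exactly as above ...
            yield pre_item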
While writing the spider, you can use the command-line tool scrapy shell <url> to test the XPath expressions before putting them in code; this is much more efficient.
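For example, a shell session along these lines lets you try the table XPath against the live page (output abbreviated; the actual rows depend on the site):
$ scrapy shell 'http://www.xicidaili.com/nn/1'
...
In [1]: table = response.xpath('//table[@id="ip_list"]')[0]

In [2]: table.xpath('.//tr')[1].xpath('td[2]/text()').extract()
Out[2]: [u'...']   # the first row's IP address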
6. Write the pipeline
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import MySQLdb


class DailiIpsPipeline(object):
    # this method must return a dict or an Item carrying the data
    def process_item(self, item, spider):
        DBS = spider.settings.get('DBS')
        con = MySQLdb.connect(**DBS)
        # make the connection use the utf8 character set
        con.set_character_set('utf8')
        cur = con.cursor()
        insert_sql = (
            "insert into proxy (ip, port, position, type, speed, last_check_time) "
            "values (%s,%s,%s,%s,%s,%s);"
        )
        values = (item['ip'], item['port'], item['position'], item['type'],
                  item['speed'], item['last_check_time'])
        # insert the row into the database
        try:
            cur.execute(insert_sql, values)
        except Exception as e:
            print "Insert failed: ", e
            con.rollback()
        else:
            con.commit()
        cur.close()
        con.close()
        return item
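Two things worth noting about this pipeline: DBS is assumed to be a dict of MySQLdb.connect() keyword arguments (host, user, passwd, db, ...) that you add to settings.py yourself, and a new connection is opened and closed for every item. If that becomes a bottleneck, Scrapy pipelines also provide open_spider/close_spider hooks so the connection can be created once per crawl. A rough sketch of that variant:
class DailiIpsPipeline(object):
    def open_spider(self, spider):
        # create the connection once when the spider starts
        self.con = MySQLdb.connect(**spider.settings.get('DBS'))
        self.con.set_character_set('utf8')
        self.cur = self.con.cursor()

    def close_spider(self, spider):
        # tear the connection down once when the spider finishes
        self.cur.close()
        self.con.close()

    def process_item(self, item, spider):
        insert_sql = (
            "insert into proxy (ip, port, position, type, speed, last_check_time) "
            "values (%s,%s,%s,%s,%s,%s);"
        )
        values = (item['ip'], item['port'], item['position'], item['type'],
                  item['speed'], item['last_check_time'])
        try:
            self.cur.execute(insert_sql, values)
        except Exception as e:
            print "Insert failed: ", e
            self.con.rollback()
        else:
            self.con.commit()
        return item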
Note:
When I first wrote this I left out the con.set_character_set('utf8') line and got the following error:
UnicodeEncodeError: 'latin-1' codec can't encode character
Even though the table had been created with the utf8 character set, it turns out that by default MySQLdb tries to convert everything to latin-1. The fix is to set the charset of the connection and the cursor to the encoding you want:
con = MySQLdb.connect(...)
# set the connection charset
con.set_character_set('utf8')
cur = con.cursor()
# set the cursor charset
cur.execute('SET NAMES utf8;')
cur.execute('SET CHARACTER SET utf8;')
cur.execute('SET CHARACTER_SET_CONNECTION=utf8;')
In my tests, setting only the connection (con) charset was enough to avoid the error, so the pipeline above does not set the cursor charset.
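As an alternative (not used in this post), MySQLdb.connect() also accepts charset and use_unicode keyword arguments, so the character set can be fixed when the connection is created; the connection arguments below are hypothetical:
# hypothetical connection arguments; only charset/use_unicode matter here
con = MySQLdb.connect(host='localhost', user='root', passwd='secret',
                      db='daili_ips', charset='utf8', use_unicode=True)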
7. Create the MySQL table
mysql> create table proxy(
-> id int primary key auto_increment,
-> ip varchar(20),
-> port varchar(20),
-> position varchar(20),
-> type varchar(20),
-> speed varchar(20),
-> last_check_time varchar(20)
-> )charset=utf8;
Query OK, 0 rows affected (0.01 sec)
mysql>
8. Enable the pipeline
Edit the settings.py file and uncomment this block:
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
# 'daili_ips.pipelines.SomePipeline': 300,
#}
and change it to:
ITEM_PIPELINES = {
'daili_ips.pipelines.DailiIpsPipeline': 300,
}
The number assigned to each pipeline is normally between 0 and 1000; when several pipelines are enabled it determines the execution order, and the one with the smaller number runs first.
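For example, if a second pipeline is added later (the ValidationPipeline name below is hypothetical), the one registered with the smaller number processes each item first:
ITEM_PIPELINES = {
    'daili_ips.pipelines.ValidationPipeline': 100,   # hypothetical: runs first
    'daili_ips.pipelines.DailiIpsPipeline': 300,     # runs second
}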
Run the spider:
scrapy crawl xici
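After the crawl finishes, a quick way to confirm that the pipeline actually wrote rows is to query the table (the results will of course depend on your run):
mysql> select count(*) from proxy;
mysql> select ip, port, type, speed from proxy limit 5;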