python爬蟲爬取大眾點評並導入redis


直接上代碼,導入redis的中文編碼沒有解決,日后解決了會第一時間上代碼!新手上路,多多包涵!

# -*- coding: utf-8 -*-
import re
import requests
from time import sleep, ctime
from urllib.request import urlopen
from urllib.request import Request
from lxml import etree
import redis
import MySQLdb


r = redis.Redis(host='192.168.60.112', port=6379,db=0)#host自己的ip地址

# 添加模擬瀏覽器協議頭
headers = {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}
for page in range(1,3):#爬取第1頁到第3頁。
  #大眾點評鏈接,用了字符串拼串 url
= "http://www.dianping.com/search/category/2/10/g112p%i"%(page)+"?aid=90308842%2C21171398%2C22974252%2C77259356%2C79709316%2C69011566%2C93070619%2C75101541%2C5724122%2C21559834&cpt=90308842%2C21171398%2C22974252%2C77259356%2C79709316%2C69011566%2C93070619%2C75101541%2C5724122%2C21559834&tc=1"#字符串拼接 # print(url) req_timeout = 5#延時 req = Request(url=url, headers=headers) f = urlopen(req, None, req_timeout) s = f.read() s = s.decode('utf-8') ss = str(s) # lxml提取 selector = etree.HTML(ss)
#爬的內容 links
= selector.xpath( '//div[@class="txt"]/div[@class="tit"]/a/@href|//div[@class="txt"]/div[@class="tit"]/a/h4/text()') for link in links: print(link)
#寫入redis,用的list類型(棧結構) r.lpush(
'mylist',link)

 


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯系本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM