Python大數據:外部數據獲取(網頁抓取)


import urllib2 as url
import cookielib,StringIO,gzip,json
import pandas as pd
import numpy as np

# Generic helper: download one page and return only its listing markup.
def GetPage(link, page):
    """Fetch ``link`` with browser-like headers and return the listing HTML.

    The response body is gunzipped only when the server reports a gzip
    ``Content-Encoding``; otherwise it is used as received. The returned
    text starts at the ``<ul class="Sec_lul01">`` marker and stops just
    before the following ``<div class="Sec_lright01">`` marker, with CRLF
    line breaks removed and double spaces collapsed in one pass.

    Args:
        link: URL of the page to download.
        page: Page number. Currently unused: the request is built from
            ``link`` alone. Kept so existing callers keep working.

    Returns:
        The listing fragment as a byte string (no decoding is performed).

    Raises:
        ValueError: if either marker is missing from the page.
        Errors raised by ``urlopen`` or by gzip decoding propagate unchanged.
    """
    list_start = "<ul class=\"Sec_lul01\">"
    list_end = "<div class=\"Sec_lright01\">"

    # Forge browser-like request headers.
    # NOTE(review): the Cookie value is hard-coded; presumably captured from
    # a browser session and may go stale - confirm it is still required.
    request_headers = (
        ("Cookie", "ykjjdc=jjcc=e94cc85e72c94e55a098c78e19d979e4&jjcs=1&jjst=0; UM_distinctid=1609c238cf0111-0e3a4ab84d1fdf-6b1b1279-13c680-1609c238cf164f; CNZZDATA4396285=cnzz_eid%3D1644510205-1514443813-%26ntime%3D1514443813; Hm_lvt_f38eafa6ecbff460f93b98423ef80584=1514448064; Hm_lpvt_f38eafa6ecbff460f93b98423ef80584=1514448087; Hm_lvt_06b2a1ee40cb8f7fbd2546dfc4bfaa8c=1514448064; Hm_lpvt_06b2a1ee40cb8f7fbd2546dfc4bfaa8c=1514448087"),
        ("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.101 Safari/537.36"),
        ("Upgrade-Insecure-Requests", "1"),
        ("Accept", "*/*"),
        # Only advertise gzip: it is the only encoding decoded below.
        ("Accept-Encoding", "gzip"),
        ("Accept-Language", "zh-CN,zh;q=0.8,en-US;q=0.6,en;q=0.4"),
        ("Cache-Control", "no-cache"),
        ("Connection", "keep-alive"),
        ("Pragma", "no-cache"),
    )
    req = url.Request(link)
    for name, value in request_headers:
        req.add_header(name, value)

    # Send the request; always release the connection, even if read() fails.
    f = url.urlopen(req)
    try:
        s = f.read()
        encoding = f.info().get("Content-Encoding", "") or ""
    finally:
        f.close()

    # Decompress only when the server actually compressed the body.
    if "gzip" in encoding.lower():
        compressedstream = StringIO.StringIO(s)
        content = gzip.GzipFile(fileobj=compressedstream).read()
    else:
        content = s

    # Keep only the listing part. The end marker is searched after the start
    # marker so an earlier occurrence cannot produce an empty slice.
    startPos = content.index(list_start)
    endPos = content.index(list_end, startPos)
    content = content[startPos:endPos]
    content = content.replace("\r\n", "").replace("  ", " ")

    return content

# Demo: fetch the listing section of the page below and print it to stdout.
print (GetPage("http://www.jjw.com/ershoufang", 1))

 


免責聲明!

本站轉載的文章為個人學習借鑒使用，本站對版權不負任何法律責任。如果侵犯了您的隱私權益，請聯繫本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM