nodejs抓取數據一(列表抓取)


純屬初學...有很多需要改進的地方,請多多指點...

目標是抓取58同城 這個大分類下的列表數據: http://cd.58.com/caishui/?PGTID=14397169455980.9244072034489363&ClickID=1

簡單分析:

  1. 按照以下二級分類來獲取每個列表的數據,

      

  2. 主要分頁: 可以看出,其分頁是pn5 這里設置的,那么這個5就是頁碼了.

      http://cd.58.com/dailijizh/pn5/?PGTID=117742907188706554997826849&ClickID=1

  3. 電話號碼: 是在隱藏的div里面,點擊 聯系商家即可看到.但是對於程序來說,是可以直接取得的.

     

代碼如下:

//抓取58數據
var http = require("http"),
    cheerio = require("cheerio"),
    mongoose = require('mongoose');
// Dedicated connection to the local MongoDB database that stores crawled pages.
// Declared with `var` so it is a proper module-level variable rather than an
// implicit global (the original assignment had no declaration).
var db = mongoose.createConnection('mongodb://127.0.0.1:27017/crawl58');

// Report connection-level errors instead of letting them go unnoticed.
db.on('error', function (error) {
    console.log('mongodb連接錯誤: ' + error);
});

// Storage layout for one crawled listing page.
var mongooseSchema = new mongoose.Schema({
    // Address the page was fetched from.
    url: String,
    // Category label of the listing.
    type: String,
    // Fetched page content.
    content: String,
    // When the page was crawled; filled in automatically at creation time.
    updateTime: {type: Date, default: Date.now},
    // Marks whether the detail pages have been crawled; 0 means not yet.
    flag: {type: String, default: 0}
});

// Model bound to the crawl58 connection; documents are saved under 'pageList'.
// (The original note on this line read "bookkeeping agency", one of the categories.)
var mongooseModel = db.model('pageList', mongooseSchema);

// Crawler state.

// Candidate HTTP proxies, written as "ip:port" and expanded into
// {ip, port} records (both fields are strings).
var proxy = [
    '120.203.159.14:8118',
    '111.161.246.233:8118',
    '58.30.233.196:8118',
    '113.215.0.130:80',
    '183.218.63.179:8181',
    '120.198.245.36:8080',
    '120.203.158.149:8118',
    '124.240.187.89:80',
    '218.204.140.105:8118',
    '175.1.79.63:80'
].map(function (address) {
    var parts = address.split(':');
    return {ip: parts[0], port: parts[1]};
});

// Position in `proxy` of the entry to use for the next request.
var proxyIndex = 5;

// Becomes true once the last page of the listing has been seen.
var flag = false;

// Page number of the next listing page to fetch (pages start at 1).
var pageNo = 1;
/**
 * Fetch one listing page, store its content in MongoDB and schedule the
 * next page.
 *
 * Reads and updates the module-level state `pageNo`, `flag` and
 * `proxyIndex`. After a page is saved successfully the function re-arms
 * itself through setTimeout; it stops when the fetched page has no "next"
 * link, or when the request or the save fails (the error is logged and no
 * further page is scheduled).
 *
 * @returns {boolean|undefined} false when the crawl has already finished,
 *     otherwise undefined (the work continues asynchronously).
 */
function crawl() {
    // Check for completion before announcing another fetch. pageNo was
    // already advanced past the last saved page, so the total is pageNo - 1.
    if (flag) {
        console.log('抓取完畢.總頁數為:' + (pageNo - 1));
        return false;
    }

    console.log('正在抓取 頁碼: ' + pageNo);
    // The category is switched by hand: once one category is finished, edit
    // both `url` and `type` for the next one, e.g.
    // 'http://cd.58.com/dailijizh/pn' + pageNo + '/?PGTID=1007041601886955933022299' + pageNo + '&ClickID=1'
    var url = 'http://cd.58.com/nashuishenbao/pn' + pageNo + '/?PGTID=1007041601886955933022299' + pageNo + '&ClickID=1';
    var type = '納稅申報'; // category label stored with the page; must match the url above

    // Request options for going through the currently selected proxy.
    // Not used yet: the request below still goes out directly because the
    // proxy path is unfinished. The key is `headers` (the original `header`
    // is not an option http.request reads).
    // NOTE(review): advertising gzip in Accept-Encoding means the body may
    // arrive compressed and would need decompressing before parsing.
    var option = {
        host: proxy[proxyIndex].ip,
        port: proxy[proxyIndex].port,
        method: 'GET',
        path: url,
        headers: {
            'Host': 'cd.58.com',
            'Connection': 'keep-alive',
            'Cache-Control': 'max-age=0',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36',
            'Referer': url,
            'Accept-Encoding': 'gzip, deflate, sdch',
            'Accept-Language': 'zh-CN,zh;q=0.8',
            'Cookie': 'userid360_xml=6B337B22E8098342C5F725D4F58495C6; time_create=1442050990222; id58=05dzXVSaeAcJgzn9Crp9Ag==; bdshare_firstime=1419409592050; bangbangguoqi=true; ppqp=1; tj_ershoubiz=true; tj_ershounobiz=true; CNZZDATA30017898=cnzz_eid%3D443859762-1419406677-%26ntime%3D1431055823; ag_fid=WeYSRnDPQwUjsUJF; myfeet_tooltip=end; quanmyy=forfirst; __ag_cm_=1439442804516; bangbigtip2=1; nearby=NOTSHOW; ipcity=cd%7C%u6210%u90FD; sessionid=4019a46c-3b78-45f9-8af1-d5d576171b60; 58home=cd; bangbangid=1080863912864997567; cookieuid1=05dvUVXOs3ZTEwlzHrnMAg==; __autma=253535702.1952421463.1439442813.1439598477.1439610035.5; __autmc=253535702; __autmz=253535702.1439610035.5.2.autmcsr=cd.58.com|autmccn=(referral)|autmcmd=referral|autmcct=/caishui/; final_history=19947936375429%2C20303113064713%2C16884696076038%2C18742095746434%2C22669284355361; ag_fid=WeYSRnDPQwUjsUJF; __utmt_pageTracker=1; city=cd; Hm_lvt_3bb04d7a4ca3846dcc66a99c3e861511=1439452109,1439458833,1439516942,1439598477; Hm_lpvt_3bb04d7a4ca3846dcc66a99c3e861511=1439627751; __utma=253535702.1249887847.1419409519.1439618478.1439625451.38; __utmb=253535702.20.10.1439625451; __utmc=253535702; __utmz=253535702.1439625451.38.15.utmcsr=cd.58.com|utmccn=(referral)|utmcmd=referral|utmcct=/dailijizh/pn2/; new_session=0; init_refer=http%253A%252F%252Fcd.58.com%252Fdailijizh%252Fpn2%252F%253FPGTID%253D198304873188692623092226919; new_uv=41'
        }
    };

    // To route through the proxy, replace the call below with
    // http.request(option, ...) and end() the request.
    http.get(url, function (res) {
        // Decode as UTF-8 at the stream level so a multi-byte character split
        // across two chunks is not corrupted by per-chunk string conversion.
        res.setEncoding('utf8');

        var data = "";
        res.on('data', function (chunk) {
            data += chunk;
        });
        res.on("end", function () {
            // A page without a "next" link inside the pager is the last one.
            var $ = cheerio.load(data);
            if ($('a.next', 'div.pager').length < 1) {
                flag = true;
            }

            var item = {
                url: url,
                type: type,
                content: data
            };

            // Store the page content, then move on to the next page.
            mongooseModel.create(item, function (error) {
                if (error) {
                    // A failed save stops the crawl: nothing further is scheduled.
                    console.log(error);
                    return;
                }
                console.log('保存成功  頁碼: ' + pageNo + '   ' + url);

                // Rotate through the proxy list, wrapping at its end. The
                // original `proxyIndex = 10` was an assignment, so the index
                // was reset to 0 after every page.
                proxyIndex = (proxyIndex + 1) % proxy.length;
                pageNo = pageNo + 1;

                // Waiting a little over 5 seconds between pages avoids being
                // redirected to the site's verification page.
                setTimeout(crawl, 5020);
            });
        });
    }).on("error", function (error) {
        console.log('抓取錯誤: ' + error.message);
    });
}

//開始抓取數據
crawl();

 

 

 

 

  

 


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯系本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM