相比爬取頁面數據,我還是覺得爬取接口數據更加簡單一點,這裡主要爬取一些分頁的數據。
爬取步驟:
1.明確目標接口地址。舉個例子:https://www.vcg.com/api/common/searchImage?phrase=%E6%98%A5%E5%A4%A9&graphicalStyle%5B0%5D=1&page=1 ,這是在網上隨便找到的視覺中國的一個接口地址。
這個網址上的圖片非常好看
2.接口返回的數據都是json數據。很統一,處理起來也很便捷。撇開奇葩接口不說。
3.只需要偽造好請求頭就可訪問,這是重點。
接口參數是
原始數據
下面是我爬取代碼

// Project-local database connection wrapper. It is loaded here but not used
// by this snippet; kept because the rest of the project may rely on it.
const mysql = require('./connectdatabase.js');
// SuperAgent is a lightweight, flexible, readable HTTP client for Node.js
// with a low learning curve.
const superagent = require('superagent');
const fs = require('fs');

// File that every successful response body is appended to.
const OUTPUT_FILE = './接口數據.txt';

// Request headers copied from a real browser session so that the request
// looks like it came from the site itself. The Cookie value is a single
// string and must stay on one line.
const REQUEST_HEADERS = {
  "Accept": "application/json",
  "Accept-Encoding": "gzip, deflate, br",
  "Host": "www.vcg.com",
  "Referer": 'https://www.vcg.com/api/common/search?phrase=%E6%98%A5%E5%A4%A9&graphicalStyle%5B0%5D=1&page=1',
  "X-Forwarded-For": "122.235.188.119",
  "Cookie": "acw_tc=276aedea15548801849954091e3094245e35937d42d912140fa37630751eeb; clientIp=122.235.188.119; source=baidusem; sajssdk_2015_cross_new_user=1; _ga=GA1.2.109353021.1554880168; _gid=GA1.2.1994909913.1554880168; Hm_lvt_0af14db9b5993b4879812c54f6cf147d=1554880168; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%2216a06147ef76c7-04ba8d1655b9ab-191f7059-2073600-16a06147ef8279%22%2C%22%24device_id%22%3A%2216a06147ef76c7-04ba8d1655b9ab-191f7059-2073600-16a06147ef8279%22%2C%22props%22%3A%7B%22%24latest_referrer%22%3A%22%22%2C%22%24latest_referrer_host%22%3A%22%22%2C%22%24latest_traffic_source_type%22%3A%22%E7%9B%B4%E6%8E%A5%E6%B5%81%E9%87%8F%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC_%E7%9B%B4%E6%8E%A5%E6%89%93%E5%BC%80%22%2C%22%24latest_utm_source%22%3A%22baidusem%22%2C%22%24latest_utm_medium%22%3A%22cpc%22%2C%22%24latest_utm_campaign%22%3A%22%E7%AB%99%E5%86%85%E6%90%9C%E7%B4%A2%E8%AF%8D-%E6%A0%B8%E5%BF%83%22%2C%22%24latest_utm_content%22%3A%22%E7%AB%99%E5%86%85%E6%90%9C%E7%B4%A2%E8%AF%8D-%E5%9B%BE%E7%89%87%22%2C%22%24latest_utm_term%22%3A%22%E5%9B%BE%E7%89%87%22%7D%7D; Hm_lpvt_0af14db9b5993b4879812c54f6cf147d=1554880467",
};

/**
 * Fetch one API URL with the forged browser headers and append the response
 * body to OUTPUT_FILE.
 *
 * The request is asynchronous and nothing is returned. Failures are reported
 * on stderr; a failed request also sets a non-zero process exit code.
 *
 * @param {string} url - Full URL of the JSON endpoint to request.
 * @returns {void}
 */
function get(url) {
  superagent
    .get(url)
    .set(REQUEST_HEADERS)
    .end((err, res) => {
      if (err) {
        // Throwing here would happen inside an async callback where no caller
        // can catch it, so report the failure and flag the exit code instead.
        console.error(`Request to ${url} failed:`, err);
        process.exitCode = 1;
        return;
      }

      // res.text is the raw response body string (already JSON text for this
      // endpoint), so it is written as-is; running it through JSON.stringify
      // would wrap it in quotes and escape every inner quote. The trailing
      // newline keeps successive appends on separate lines.
      fs.appendFile(OUTPUT_FILE, `${res.text}\n`, 'utf-8', (writeErr) => {
        if (writeErr) {
          console.error(`Could not append to ${OUTPUT_FILE}:`, writeErr);
        }
      });
    });
}

// Fetch the first page of results.
get('https://www.vcg.com/api/common/searchImage?phrase=%E6%98%A5%E5%A4%A9&graphicalStyle%5B0%5D=1&page=1');
這樣就爬取到了第一頁的數據,並且在本地創建了一個名為「接口數據.txt」的文件。
這樣一個簡單的接口數據就已經爬下來了,對於數據的處理就不多廢話,各取所需。
本人qq : 981900309 加備注博客園