由於一些巨大的困難,一些后端爬蟲改成了前端爬蟲。
前端爬蟲只能使用js語言,而後端爬蟲可以使用python、java、nodejs、php等語言。
前端爬蟲可以直接使用window.document對象;在瀏覽器端的爬蟲即使需要二次發送ajax,也不需要像後端爬蟲那樣構造一堆請求頭,特別是難以處理的cookie和token。最為重要的是,它天然地突破了ip限制。
提取個人信息,直接使用頁面渲染后的結構:
/**
 * Created by xy49476 on 2018/1/9.
 */
/*
 * http://iservice.10010.com/e4/index_server.html
 * Extract personal information.
 */

/**
 * Reads the subscriber's personal information out of the rendered page.
 *
 * Each output field is taken from the `innerText` of the first element that
 * matches a fixed CSS selector. A field whose element is not present in the
 * page is reported as `null` instead of aborting the whole extraction.
 *
 * Relies on the global `document` of the page the script is injected into.
 *
 * @returns {string} JSON text of the form `{"personalInfo": {...}}`, with the
 *     fields in the order they are listed in the selector table below.
 */
function extractPersonalInfomation() {
    // Output field name -> CSS selector of the element that holds its text.
    // Insertion order is the order of the keys in the resulting JSON.
    const fieldSelectors = {
        loginName: '#personalInfo td',
        userLevel: '#font',
        userName: 'div.data_basic_c.ly_gr_zl > dl:nth-child(1) > dd',
        sex: 'div.data_basic_c.ly_gr_zl > dl:nth-child(3) > dd',
        idNo: 'div.data_basic_c.ly_gr_zl > dl:nth-child(5) > dd',
        contactPhone: 'div.data_basic_c.ly_gr_zl > dl:nth-child(9) > dd',
        contactAddress: 'div.data_basic_c.ly_gr_zl > dl:nth-child(11) > dd',
        // `package` is only safe as a property name: as a variable name it is
        // a reserved word in strict-mode and module code.
        package: '#packageInfocontext > div.add_main > div > dl:nth-child(1) > dd',
        chargeType: '#packageInfocontext > div.add_main > div > dl:nth-child(3) > dd',
        currentStatus: '#numberContext > div.data_basic_c2 > div.data_basic_c2_l > dl:nth-child(1) > dd',
        netInDate: '#numberContext > div.data_basic_c2 > div.data_basic_c2_r > dl:nth-child(1) > dd',
        billingAccount: '#numberContext > div.data_basic_c2 > div.data_basic_c2_l > dl:nth-child(3) > dd',
        brand: '#numberContext > div.data_basic_c2 > div.data_basic_c2_r > dl:nth-child(2) > dd',
        communicatelevel: '#numberContext > div.data_basic_c2 > div.data_basic_c2_l > dl:nth-child(4) > dd',
        pukCode: '#numberContext > div.data_basic_c2 > div.data_basic_c2_r > dl:nth-child(3) > dd',
        contractName: '#contractPeriod > div.ly_gr_l2 > dl:nth-child(1) > dd',
        effectiveTime: '#contractPeriod > div.ly_gr_l2 > dl:nth-child(3) > dd',
        uneffectiveTime: '#contractPeriod > div.ly_gr_l2 > dl:nth-child(5) > dd'
    };

    // Kept local so that the function does not leak an implicit global.
    const personalObj = {};
    Object.keys(fieldSelectors).forEach(function (field) {
        const node = document.querySelector(fieldSelectors[field]);
        // querySelector returns null when nothing matches the selector.
        personalObj[field] = node ? node.innerText : null;
    });

    return '{"personalInfo":' + JSON.stringify(personalObj) + '}';
}
//extractPersonalInfomation();
提取通話記錄,這里和上面不同,發送了ajax來獲取json,按月份按分頁獲取通話記錄:
/**
 * Created by xy49476 on 2018/1/10.
 */
/*
 * http://iservice.10010.com/e4/query/bill/call_dan-iframe.html?menuCode=000100030001
 * Extract call records.
 */

/**
 * Collects the call records of the six most recent calendar months (the
 * current month first) by POSTing to the callDetail endpoint, one month at a
 * time and one page at a time.
 *
 * Relies on a global jQuery `$` being available in the page. Requests are
 * sent synchronously (`async: false`) so that the result can be returned
 * directly as the value of the call.
 *
 * @returns {string} JSON text of the form
 *     `{"callRecordsInfo": [{"pagelist": [...], "yearMonth": "YYYYMM"}, ...]}`.
 * @throws {SyntaxError} If a response body is not valid JSON.
 */
function extractCallRecordsOuter() {
    const callDetailUrl = 'http://iservice.10010.com/e3/static/query/callDetail?_=1515561593328&accessURL=http://iservice.10010.com/e4/query/bill/call_dan-iframe.html?menuCode=000100030001&menuid=000100030001';

    /**
     * Lists the six most recent months as [year, month] pairs, newest first.
     * `month` is 1-based (1 = January).
     */
    function getNear6Month() {
        const today = new Date();
        const months = [];
        for (let offset = 0; offset < 6; offset++) {
            // Anchor on day 1 before stepping back. Subtracting months from
            // e.g. the 31st overflows into the following month whenever the
            // target month is shorter, which duplicates and skips months.
            const firstOfMonth = new Date(today.getFullYear(), today.getMonth() - offset, 1);
            const year = firstOfMonth.getFullYear();
            // getMonth() is 0-based (0 = January), so shift it to 1-12.
            const month = firstOfMonth.getMonth() + 1;
            console.info(year + '-' + month);
            months.push([year, month]);
        }
        return months;
    }

    /**
     * Number of days in a month, e.g. mGetDate(2004, 2) === 29.
     * `month` is 1-based; day 0 of the following month is the last day of it.
     */
    function mGetDate(year, month) {
        return new Date(year, month, 0).getDate();
    }

    /**
     * Requests one page of call records for one month and returns the parsed
     * JSON response.
     */
    function requestCallRecordPage(yearMonth, lastDay, page) {
        const data = {
            pageNo: page,
            pageSize: 200,
            beginDate: yearMonth + '01',
            endDate: yearMonth + lastDay
        };
        console.debug(data);
        const htmlObj = $.ajax({
            type: 'POST',
            url: callDetailUrl,
            data: data,
            // Must stay synchronous: the response text is read right below
            // and the overall result is returned to the caller directly.
            async: false
        });
        const htmlStr = htmlObj.responseText;
        console.debug('htmlStr:', htmlStr);
        return JSON.parse(htmlStr);
    }

    const callRecordsArray = [];

    function extractCallRecords() {
        // Historical bills, one entry per month.
        for (const [year, month] of getNear6Month()) {
            const lastDay = mGetDate(year, month);
            const yearMonth = year + '' + (month < 10 ? '0' + month : month);
            const callRecordsObj = {
                pagelist: [],
                yearMonth: yearMonth
            };

            for (let page = 1; ; page++) {
                const callRecordsRawObj = requestCallRecordPage(yearMonth, lastDay, page);
                // A response without `pagelist` contributes no records
                // rather than an `undefined` entry.
                callRecordsObj.pagelist = callRecordsObj.pagelist.concat(callRecordsRawObj.pagelist || []);
                // Keep paging only while the response reports further pages.
                if (!(page < callRecordsRawObj.totalpage)) {
                    break;
                }
            }

            callRecordsArray.push(callRecordsObj);
        }

        const callRecordsInfoObj = {
            callRecordsInfo: callRecordsArray
        };
        return JSON.stringify(callRecordsInfoObj);
    }

    return extractCallRecords();
}
//extractCallRecordsOuter();
為了一個腳本一個函數,在最外層套了一個函數。
有沒有很像js閉包,但這不是閉包。
最后三行是
return extractCallRecords(); } extractCallRecordsOuter();
如果改成下面,
return extractCallRecords; } extractCallRecordsOuter()();
這樣就是js之閉包了。
前端爬蟲的時候,使用jquery選擇器或者document的querySelector方法,就能使用css3選擇器的語法了,比getElementByXxx系列方法能更精確地提取某些元素。
使用jquery的時候,如果被爬頁面沒有引入jquery庫,需要先在document中注入jquery的地址,或者完整地執行一遍jquery代碼。
