前提
jsdom:接口与浏览器 DOM 一致 https://github.com/jsdom/jsdom
cheerio:接口与 jQuery 类似 https://github.com/cheeriojs/cheerio
两者都可以在 Node.js 中进行 DOM 操作。
JSDOM
文档:https://airbnb.io/enzyme/docs/guides/jsdom.html
一段抓取网页数据的代码:
const fs = require('fs');
const { JSDOM } = require('jsdom');
const jquery = require('jquery');
const { log, table, } = console;
/**
 * Turn the serialized HTML of a thread (title + first post body) into plain
 * text, keeping images as `[img]url[/img]` markers.
 *
 * @param {string} html - Title and post-body `innerHTML`, already concatenated.
 * @returns {string} The cleaned text.
 */
function cleanThreadHtml(html) {
  return html
    // `innerHTML` serializes non-breaking spaces as the `&nbsp;` entity. Strip
    // those rather than ordinary spaces: the <img> pattern below only matches
    // while the spaces inside the tag are still present.
    .replace(/(?:<br\s*>|&nbsp;)/g, '')
    .replace(/<img src="([^"]*)" .*>/g, '[img]$1[/img]')
    .replace(/<font.*>.*<\/font>\n*/g, '\n')
    .replace(/<a.*>\n*/g, '')
    .replace(/(?:<\/a>\n|\n\n)/g, '\n');
}

/**
 * Read the thread title, the first post body and the optional attachment link
 * out of a page through a jQuery-style selector function.
 *
 * @param {Function} $ - Selector function; `$(selector)[0]` must yield a DOM element or undefined.
 * @returns {string} Cleaned title and body, plus a download line when a link exists.
 * @throws {Error} When the title, the post container or the post body is missing.
 */
function extractThreadText($) {
  const title = $('a#thread_subject')[0];
  if (!title) {
    throw new Error('thread title (a#thread_subject) not found');
  }

  const post = $('div#postlist > div > table .t_fsz')[0];
  if (!post) {
    throw new Error('post container (div#postlist > div > table .t_fsz) not found');
  }

  const body = post.querySelector('td.t_f');
  if (!body) {
    throw new Error('post body (td.t_f) not found');
  }

  let result = cleanThreadHtml(`${title.innerHTML}\n\n\n${body.innerHTML}`);

  // The attachment link is optional; its absence is not an error.
  const link = post.querySelector('p.attnm > a');
  if (link) {
    result += `\n\n下载地址:${link.href}`;
  }
  return result;
}

/**
 * Load `url` with JSDOM, extract the thread text and hand it to `callback`.
 *
 * Any failure (loading the page, missing elements, or an error thrown by
 * `callback`) is logged with `console.log`; in that case `callback` is not
 * invoked with a result.
 *
 * @param {string} url - Address of the thread page to fetch.
 * @param {(result: string) => void} callback - Receives the extracted text.
 * @returns {void}
 */
function get(url, callback) {
  // Start inside a promise chain so that synchronous throws and asynchronous
  // rejections both end up in the single `.catch` below.
  Promise.resolve()
    .then(() => JSDOM.fromURL(url))
    .then((dom) => callback(extractThreadText(jquery(dom.window))))
    .catch((err) => {
      console.log(err);
    });
}