Puppeteer是什麼?
Puppeteer在GitHub上對自己的介紹是:
Headless Chrome Node API
Puppeteer是一個Node.js的庫,支持調用Chrome的API來操縱Web。相比較Selenium或是PhantomJS,它最大的特點就是對DOM的操作可以完全在內存中進行模擬,即在V8引擎中處理而不打開瀏覽器界面(headless,無界面)。但要注意的是,它雖然很好用,但一般卻不建議用來做測試使用,因為它是專門針對Chrome處理的,當然你也可以根據業務需要來選擇。
Puppeteer能做什麼?
Puppeteer官網給了幾個例子,分別是:
(1)網頁截圖。
(2)生成頁面的PDF。
(3)分析當前頁的腳本。
(4) 寫爬蟲
(5) ....
安裝
Puppeteer 至少需要 Node v6.4.0,如要使用 async / await,只有 Node v7.6.0 或更高版本才支持。
如果項目路徑下沒有package.json就先執行“npm init”,然后按照提示填寫完畢后,生成一個package.json文件,然后執行:
npm i puppeteer
我在安裝過程中遇到了錯誤:
是在執行install.js 下載Chromium時出錯,你也可以通過設置環境變量set PUPPETEER_SKIP_CHROMIUM_DOWNLOAD=1阻止下載 Chromium,稍后再手動下載,但手動下載后還要配置路徑,太麻煩啦,所以解決方案是打開翻牆軟件再重新執行下“npm i puppeteer”。
使用
(1)網頁截圖
//screenshot.js
const puppeteer = require('puppeteer');
const config = require('./config/config');
// Capture a PNG screenshot of the Baidu home page into config.screenshot.
(async () => {
  const browser = await puppeteer.launch();
  try {
    const page = await browser.newPage();
    await page.goto('https://www.baidu.com');
    // Timestamped file name so repeated runs do not overwrite each other.
    // NOTE(review): the target directory is not created here — confirm it exists.
    await page.screenshot({
      path: `${config.screenshot}/${Date.now()}.png`,
    });
  } finally {
    // Always shut Chromium down, even when navigation or the capture failed.
    await browser.close();
  }
})().catch((err) => {
  // Report the failure instead of leaving an unhandled rejection.
  console.error(err);
  process.exitCode = 1;
});
//config.js
const path = require('path')
// Absolute directory that receives the screenshots, resolved relative to this file.
const screenshot = path.resolve(__dirname, '../../screenshot');

module.exports = { screenshot };
(2) 將網頁生成pdf
const puppeteer = require('puppeteer');
const config = require('./config/config');
// Render the Baidu home page to an A4 PDF under config.pdfroot.
(async () => {
  // NOTE(review): none of the config.js snippets in this file defines `pdfroot`;
  // fail loudly instead of writing to the literal path "undefined/<timestamp>.pdf".
  if (!config.pdfroot) {
    throw new Error('config.pdfroot is not set');
  }
  const browser = await puppeteer.launch();
  try {
    const page = await browser.newPage();
    // 'networkidle2' delays the PDF until the page has (almost) stopped loading resources.
    await page.goto('https://www.baidu.com', { waitUntil: 'networkidle2' });
    await page.pdf({ path: `${config.pdfroot}/${Date.now()}.pdf`, format: 'A4' });
  } finally {
    // Always shut Chromium down, even when navigation or rendering failed.
    await browser.close();
  }
})().catch((err) => {
  // Report the failure instead of leaving an unhandled rejection.
  console.error(err);
  process.exitCode = 1;
});
(3)分析網頁
const puppeteer = require('puppeteer');
// Print the viewport size and device pixel ratio reported by the Baidu home page.
(async () => {
  const browser = await puppeteer.launch();
  try {
    const page = await browser.newPage();
    await page.goto('https://www.baidu.com');
    // Get the "viewport" of the page, as reported by the page.
    // The callback runs inside the page, so `document` and `window` are the page's own.
    const dimensions = await page.evaluate(() => {
      return {
        width: document.documentElement.clientWidth,
        height: document.documentElement.clientHeight,
        deviceScaleFactor: window.devicePixelRatio
      };
    });
    console.log('Dimensions:', dimensions);
  } finally {
    // Always shut Chromium down, even when navigation or evaluation failed.
    await browser.close();
  }
})().catch((err) => {
  // Report the failure instead of leaving an unhandled rejection.
  console.error(err);
  process.exitCode = 1;
});
(4) 寫爬蟲
//screenshot.js
const puppeteer = require('puppeteer');
const config = require('./config/config');
const srcToImg = require('./helper/srcToImg');
const chalk = require('chalk');
// Search Baidu Images for a keyword and save every result image into config.imgUrl.
(async () => {
  const browser = await puppeteer.launch();
  let srcs;
  try {
    const page = await browser.newPage();
    await page.goto('https://image.baidu.com/');
    console.log('go to https://image.baidu.com/')
    await page.setViewport({
      width: 1920,
      height: 1080
    })
    console.log("reset viewpoint");
    await page.focus('#kw');
    await page.keyboard.sendCharacter('單身狗');
    // Start waiting for the navigation BEFORE clicking: a 'load' listener attached
    // after the click can miss the event, and it would stay registered for every
    // later load of the page.
    await Promise.all([
      page.waitForNavigation({ waitUntil: 'load' }),
      page.click('.s_search'),
    ]);
    console.log(chalk.red(("reset viewpoint")));
    console.log('go to searchlist');
    console.log('page loading done,start fetch.........')
    // Runs inside the page: collect the src of every result image.
    srcs = await page.evaluate(() => {
      const images = document.querySelectorAll('img.main_img');
      return Array.prototype.map.call(images, img => img.src);
    })
  } finally {
    // The sources are plain strings, so Chromium is no longer needed;
    // close it even when one of the steps above failed.
    await browser.close();
  }
  // Wait for every save so that failures surface here instead of as floating promises.
  await Promise.all(srcs.map(src => srcToImg(src, config.imgUrl)));
})().catch((err) => {
  // Report the failure instead of leaving an unhandled rejection.
  console.error(err);
  process.exitCode = 1;
});
//srcToImg.js
const http = require('http');
const https = require('https');
const path = require('path');
const fs = require('fs');
const { promisify } = require('util');
const writeFile = promisify(fs.writeFile)
module.exports = async (src,dir) =>{
if(/\.(jpg|png|gif)$/.test(src)){
await urlToImg(src,dir);
}else{
await base64ToImg(src,dir);
}
}
//url => img
const urlToImg = async (url,dir) =>{
const mod = /^https:/.test(url)?https:http;
const ext = path.extname(url);
const file = path.join(dir,`${Date.now()}${ext}`)
mod.get(url, res => {
res.pipe(fs.createWriteStream(file))
.on('finish',() =>{
console.log(file);
})
})
}
//base64 => img
/**
 * Decode a base64 `data:` URL and write it to a new file in `dir`.
 *
 * The file is named `<timestamp>.<ext>`, where the extension is the MIME
 * subtype ("jpeg" becomes "jpg"). Best effort: invalid input and write
 * failures are logged and the promise still resolves.
 *
 * @param {string} base64Str - String of the form `data:<mime>;base64,<payload>`.
 * @param {string} dir - Directory to write into.
 * @returns {Promise<void>}
 */
const base64ToImg = async function (base64Str, dir) {
  const matches = typeof base64Str === 'string'
    ? base64Str.match(/^data:(.+?);base64,(.+)$/)
    : null;
  // "image/jpeg" -> "jpeg"; a MIME type without a subtype cannot yield an extension.
  const subtype = matches ? matches[1].split('/')[1] : undefined;
  if (!subtype) {
    console.log("非法的base64 字符串")
    return;
  }
  const ext = subtype.replace('jpeg', 'jpg');
  const stamp = Date.now();
  // The 'wx' flag fails with EEXIST instead of overwriting, so images saved in
  // the same millisecond get a numeric suffix rather than clobbering each other.
  for (let attempt = 0; ; attempt += 1) {
    const suffix = attempt === 0 ? '' : `-${attempt}`;
    const file = path.join(dir, `${stamp}${suffix}.${ext}`);
    try {
      await writeFile(file, matches[2], { encoding: 'base64', flag: 'wx' });
      console.log(file);
      return;
    } catch (err) {
      if (err.code !== 'EEXIST') {
        // A write failure is not a parse failure: report it as what it is.
        console.error(`failed to write ${file}: ${err.message}`);
        return;
      }
    }
  }
};
//config.js
const path = require('path')
// Absolute directory that receives the downloaded images, resolved relative to this file.
const imgUrl = path.resolve(__dirname, '../../images');

module.exports = { imgUrl };