[Nodejs] 用node寫個爬蟲

本文轉載自查看原文 2019-03-22 18:03 8010 node/ 爬蟲/ cheerio/ Nodejs

尋找爬取的目標

首先我們需要一個堅定的目標,於是找了一個比較好看的網站,將一些信息統計一下,比如 url/tag/title/number 等信息

init(1, 2); // set the page range; currently pages 1-2

/**
 * Crawl the listing pages from `startPage` to `endPage` (both inclusive).
 *
 * @param {number} startPage - first page number to crawl.
 * @param {number} endPage - last page number to crawl.
 */
async function init(startPage, endPage) {
  for (let i = startPage; i <= endPage; i++) {
    // Awaiting inside the loop processes the pages one after another, not in parallel.
    await getAndSaveImg(i);
  }
  // The remaining steps are elided in the article (placeholder below).
    .....
}

一般網站都會進行一些反爬蟲處理,這時候就需要一個 ip 代理池進行 ip 偽裝了.

網絡請求

使用一個 nodejs 的模塊 request,這個模塊可以讓 node 的 http 請求變的更加簡單,同時支持 http/https 請求還可以將任何請求輸出到文件流.

// Example: POST `formData` (defined elsewhere) as a form upload and report the outcome
// from the callback.
request.post({url:'http://service.com/upload', formData: formData}, function optionalCallback(err, httpResponse, body) {
  if (err) {
    // Log the failure and stop; the success message below is skipped.
    return console.error('upload failed:', err);
  }
  console.log('Upload successful!  Server responded with:', body);
});

使用 request 封裝個方法進行請求

新建 utils/ajax.js

let request = require("request");

module.exports = {
  handleRequestByPromise
};

/**
 * Issue an HTTP request through the `request` module and expose the result as a Promise.
 *
 * Caller options are shallow-merged over the defaults (GET, `encoding: null`).
 * Caller-supplied `headers` are merged over the default browser-like headers, so a
 * caller can override or add individual headers without losing the others.
 *
 * @param {Object} [options] - options forwarded to `request`; `url` is required.
 * @returns {Promise<*>} resolves with the response body when the status code is 200;
 *   rejects with the transport error, or with an Error naming the URL otherwise.
 * @throws {Error} synchronously when no `url` is supplied.
 */
function handleRequestByPromise(options) {
  const defaultHeaders = {
    "User-Agent":
      "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36",
    Referer: "https://www.meituri.com"
  };

  const op = Object.assign(
    {
      url: "",
      method: "GET",
      // With `encoding: null` the `request` module hands back the raw body (a Buffer),
      // which lets callers decode non-UTF-8 pages themselves.
      encoding: null
    },
    options
  );
  // The option `request` reads is `headers` (plural); the previous `header` key was
  // ignored, so the default User-Agent/Referer were never actually sent.
  op.headers = Object.assign({}, defaultHeaders, options && options.headers);

  if (op.url === "") {
    throw new Error("請求的url地址不正確");
  }

  return new Promise((resolve, reject) => {
    request(op, (err, response, body) => {
      if (err) {
        // Return here so a transport error does not fall through to the status check.
        reject(err);
        return;
      }

      if (response && response.statusCode === 200) {
        resolve(body);
        return;
      }

      // Reject with an Error and reference `op.url`; a bare `url` is not defined here.
      reject(new Error(`請求✿✿✿${op.url}✿✿✿失敗`));
    });
  });
}

cheerio

官網

爬蟲需要抓取頁面上特定的信息,需要依據一些標識符去拿到想要的信息,比如 id、class.cheerio 就是這樣一個工具:它將網站信息轉化成可以直接用 jquery 的方式操作的 dom,方便進行提取.cheerio 的出現就是用於服務端需要對 dom 進行操作的地方.

基本使用

let cheerio = require('cheerio');
let $ = cheerio.load("<div id='helloworld'>hello world</div>", {ignoreWhitespace: true...})

options 用來進行一些特別的定制更多

選擇器

基本和 jquery 一樣

$( selector, [context], [root] )

$(".helloworld").text();

屬性操作

.attr(name, value)
.removeAttr(name)
.hasClass(className)
.addClass(className)
.removeClass([className])

遍歷

.find(selector)
.parent()
.next()
.prev()
.siblings()
.children( selector )
.each( function(index, element) )
.map( function(index, element) )
.filter( selector )
.filter( function(index) )
.first()
.last()
.eq(i)

操作 DOM

.append( content, [content, ...] )
.prepend( content, [content, ...] )
.after( content, [content, ...] )
.before( content, [content, ...] )
.remove( [selector] )
.replaceWith( content )
.empty()
.html( [htmlString] )
.text( [textString] )

其他

$.html()
$('ul').text()
.toArray()
.clone()
$.root()
$.contains( container, contained )

在項目中使用

  let homeBody = await handleRequestByPromise({ url: pageImgSetUrl });
  let $ = cheerio.load(homeBody);
  let lis = $(".hezi li");

上面就是將獲取的 html 數據通過 cheerio 轉化后,可以直接使用$符號進行類似 dom 的使用方法.特別適合前端使用

iconv-lite

有些時候,獲取到的數據是一些亂碼,尤其是中文的情況.所以我們需要解決亂碼的問題,iconv-lite 模塊就可以解決這一問題.

homeBody = iconv.decode(homeBody,"GBK"); //進行gbk解碼

如果亂碼就在 cheerio.load()之前進行解碼.(這次用的網站並沒有亂碼).原因是

<meta http-equiv="Content-Type" content="text/html; charset=utf-8" /> //這里是utf-8

如果是 gbk 或者 gb2312 等就需要解碼了

爬取流程

找尋目標
控制台查看 dom 的信息存放或標識符(id,class,element)
爬取 title,url,tag,num 等信息進行存放
進行下載(如果只需要鏈接其實可以不下載,不過許多網站對圖片外部引入有限制)
入庫(mysql)
出個 html 進行圖片查看(簡易寫真集網站)

初始化

還是創建一個本地服務器,異步沒有使用 async 模塊,而是直接使用 es6 的 async/await 語法.

// Minimal HTTP front end: each incoming request is dispatched by its path via router().
let http = require("http");
let url = require("url");
let Extend = require("./Extend");
// Extend is a project module not shown here; the arguments are presumably the first and
// last page to crawl — TODO confirm against ./Extend.
let xz = new Extend(1, 2);

http
  .createServer((request, response) => {
    let pathname = url.parse(request.url).pathname;
    // Requests for /favicon.ico are not routed (note: no response is sent for them here).
    if (pathname !== "/favicon.ico") {
      // NOTE(review): the "/xz" handler is async and its promise is neither awaited nor
      // caught here, so a rejection from xz.init() would be unhandled.
      router(pathname)(request, response);
    }
  })
  .listen(9527);
console.log("server running at http://127.0.0.1:9527/");

/**
 * Look up the request handler registered for a path.
 *
 * @param {string} p - request pathname, e.g. "/" or "/xz".
 * @returns {Function} the handler `(request, response)` for `p`, or the "/404"
 *   handler when `p` is not one of the registered paths.
 */
function router(p) {
  // Each handler gets its own fresh header object.
  const htmlHeaders = () => ({ "Content-type": "text/html;charset=utf-8" });

  const routes = {
    // Home page: empty HTML response.
    "/"(request, response) {
      response.writeHead(200, htmlHeaders());
      response.end();
    },
    // Run the crawl, then close the response once it has finished.
    async "/xz"(request, response) {
      response.writeHead(200, htmlHeaders());
      await xz.init(response);
      response.end();
    },
    // Fallback for every unregistered path.
    "/404"(request, response) {
      response.writeHead(404, { "Content-Type": "text/plain;charset=utf-8" });
      response.end("404找不到相關文件");
    }
  };

  const isRegistered = Object.keys(routes).includes(p);
  return routes[isRegistered ? p : "/404"];
}

分析頁面

直接右鍵在控制台中查看就好了,看看 class,id 什么,cheerio 實現的 jquery 的 dom 相關的 api 十分強大,直接$("")就行

進行網站的分析和抓取

開始進行網站數據的分析和爬取,如果亂碼就在 cheerio 操作之前進行解碼就行了,這樣通過一個變量將爬取的數據全部保存起來.也可以創建相應的文件夾和 txt 文件進行保存(writeFile),還可以直接在這里就將數據保存到數據庫.(看心情)

async getAndSaveImg(page) {
    let pageImgSetUrl = ``;

    if (page === 1) {
      pageImgSetUrl = `${this.siteUrl}`;
    } else {
      pageImgSetUrl = `${this.siteUrl}${page}.html`;
    }

    let homeBody = await handleRequestByPromise({ url: pageImgSetUrl });
    let $ = cheerio.load(homeBody);
    let lis = $(".hezi li");

    for (let i = 0; i < lis.length; i++) {
      let config = {
        href: lis
          .eq(i)
          .find("a")
          .eq(0)
          .attr("href"),
        num: lis
          .eq(i)
          .find(".shuliang")
          .text(),
        title: lis
          .eq(i)
          .find(".biaoti a")
          .text()
          .replace(/\//, "")
      };

      config.childs = [];

      let num = Number(config.num.substr(0, 2));
      for (let j = 1; j <= num; j++) {
        let link = config.href.replace(
          this.collectUrl,
          "https://ii.hywly.com/a/1/"
        );
        let a_link = `${link}${j}.jpg`;
        config.childs.push(a_link);
      }
      this.all.push(config);
    }
  }

進行圖片的下載

開始進行圖片的下載,並且創建相應的文件夾進行保存

async downloadAllImg() {
    let length = this.all.length;

    for (let index = 0; index < length; index++) {
      let childs = this.all[index].childs;
      let title = this.all[index].title;

      if (childs) {
        let c_length = childs.length;
        for (let c = 0; c < c_length; c++) {
          if (!fs.existsSync(`mrw`)) {
            fs.mkdirSync(`mrw`);
          }

          if (!fs.existsSync(`mrw/${title}`)) {
            fs.mkdirSync(`mrw/${title}`);
          }

          await super.downloadImg(
            childs[c],
            `mrw/${title}/${title}_image${c}.jpg`
          );

          console.log(
            "DownloadThumbsImg:",
            title,
            "SavePath:",
            `mrw/${title}/${title} image${c}.jpg`
          );
        }
      }
    }
  }

下載完之后存入數據庫

下載 mysql 模塊進行 mysql 數據庫操作

const fs = require("fs");
const mysql = require("mysql");
const path_dir = "D:\\data\\wwwroot\\xiezhenji.web\\static\\mrw\\";
// Single MySQL connection shared by the functions in this module.
// NOTE(review): host and port appear to be placeholders, and the password is a literal
// committed in source — consider loading credentials from configuration or the
// environment instead.
const connection = mysql.createConnection({
  host: "xxxx",
  port: "xxxx",
  user: "xiezhenji",
  password: "iJAuzTbdrDJDswjPN6!*M*6%Ne",
  database: "xiezhenji"
});

module.exports = {
  insertImg
};

/**
 * Insert one `photo_album_collect` row for every entry found in `path_dir`.
 *
 * Each entry's name is stored as both `name` and `intro`; the cover image path,
 * directory and new name are derived from the entry's 1-based position in the
 * directory listing (`mrw_<n>`).
 *
 * Opens the shared connection, queues all inserts, then closes the connection.
 */
function insertImg() {
  connection.connect();

  const files = fs.readdirSync(path_dir, {
    encoding: "utf-8"
  });

  files.forEach((file, index) => {
    const newName = `mrw_${index + 1}`;
    const dir = `mrw/${newName}`;

    insert([
      "美女",
      file,
      // NOTE(review): `num` is the total number of entries in path_dir for every
      // row — confirm this is intended rather than a per-album image count.
      files.length,
      file,
      `/${dir}/image_1`,
      dir,
      newName
    ]);
  });

  // Close the connection once the queued inserts have run; otherwise it stays open
  // and keeps the process alive. `end()` lets already-queued queries finish first.
  connection.end();
}

/**
 * Run one parameterized INSERT into `photo_album_collect` on the shared connection
 * and log the outcome.
 *
 * @param {Array} arr - the seven column values, in the order
 *   tags, name, num, intro, cover_img, dir, new_name.
 */
function insert(arr) {
  const sql = `INSERT INTO photo_album_collect(tags,name,num,intro,cover_img,dir,new_name) VALUES(?,?,?,?,?,?,?)`;

  // Values are bound through `?` placeholders rather than concatenated into the SQL.
  connection.query(sql, arr, (err, result) => {
    if (err) {
      // This statement is an INSERT; the previous "[SELECT ERROR]" label was misleading.
      console.error("[INSERT ERROR] - ", err.message);
      return;
    }

    console.log("--------------------------INSERT----------------------------");
    console.log(result);
    console.log(
      "------------------------------------------------------------\n\n"
    );
  });
}

Docs

superagent
cheerio
async
request
iconv-lite
download
Node.js Request+Cheerio 實現一個小爬蟲-番外篇：代理設置
nodejs 爬取網頁出現亂碼的解決方案
Nodejs 爬取 10G 妹子套圖 cheerio

聲明:僅供學習,不可用於商業用途

免責聲明！

本站轉載的文章為個人學習借鑒使用，本站對版權不負任何法律責任。如果侵犯了您的隱私權益，請聯系本站郵箱yoyou2525@163.com刪除。

猜您在找 [Nodejs] node寫個helloworld NodeJS寫個爬蟲，把文章放到kindle中閱讀 nodejs爬蟲 [NodeJS]Node模塊原理 NodeJs編寫小爬蟲基於nodejs 的多頁面爬蟲 NodeJS網絡爬蟲 nodejs實現新聞爬蟲 nodejs爬蟲selenium 用 Java 拿下 HTML 分分鍾寫個小爬蟲