使用Nodejs實現的小說爬蟲

本文轉載自：查看原文 ｜ 2017-04-23 23:26 ｜ 1260 ｜ 爬蟲 / node.js / javascript

 1 //引入模塊
 2 const http = require('http')
 3 const fs = require('fs')
 4 const cheerio = require('cheerio')
 5 const iconv = require('iconv-lite')
 6 //第一章url
 7 const url = 'http://www.81zw.com/book/8634/745331.html'
 8 //開始章節數
 9 let i = 1
10 //最大獲取章節數
11 let num = 100
12 
/**
 * Start crawling at the given chapter page.
 *
 * Thin wrapper that forwards the URL to startRequest.
 *
 * @param {string} url - Address of the chapter page to fetch.
 */
function main(url) {
    startRequest(url)
}

/**
 * Fetch one chapter page, hand it to saveContent, and schedule the next chapter.
 *
 * The response body is collected, decoded as GBK and parsed with cheerio.
 * After saving, the "next chapter" link is followed through main() after a
 * 100 ms delay, for as long as the global counter `i` does not exceed `num`.
 *
 * Crawling stops with a message on stderr when the request fails, the status
 * code is not 200, the page has no #content element, or no next-chapter link
 * is present.
 *
 * @param {string} url - Absolute URL of the chapter page to fetch.
 */
function startRequest(url) {
    const request = http.get(url, res => {
        if (res.statusCode !== 200) {
            console.error(`Request for ${url} failed with status ${res.statusCode}`)
            // Discard the body so the connection is released
            res.resume()
            return
        }
        // Buffer chunks of the response body, in arrival order
        const chunks = []
        res.on('data', chunk => {
            chunks.push(chunk)
        })
        res.on('error', err => {
            console.error(`Reading ${url} failed: ${err.message}`)
        })
        res.on('end', () => {
            // Decode only after all chunks are joined, so multi-byte characters
            // split across chunk boundaries are not corrupted
            const page = iconv.decode(Buffer.concat(chunks), 'gbk')
            // cheerio exposes a jQuery-like API over the parsed HTML
            const $ = cheerio.load(page, {decodeEntities: false})
            const content = $('#content').html()
            if (content == null) {
                console.error(`No #content element found at ${url}; stopping`)
                return
            }
            const title = $('.bookname h1').text()
            const bookName = $('.con_top a').eq(2).text()
            // Paragraphs are separated by a double <br>; strip whitespace and
            // &nbsp; entities from each one
            const paragraphs = content.split('<br><br>').map(part => trim(part))
            // Record that gets persisted for this chapter
            const obj = {
                id: i,
                err: 0,
                bookName: bookName,
                title: title,
                content: paragraphs
            }

            // The third link inside .bottem2 is taken as "next chapter"; resolve it
            // against the current page so relative and absolute hrefs both work
            const href = $('.bottem2 a').eq(2).attr('href')
            const nextLink = href ? new URL(href, url).href : null
            saveContent(obj, nextLink)
            if (nextLink === null) {
                console.error(`No next-chapter link found at ${url}; stopping`)
                return
            }
            console.log(`第${i + 1}章：${nextLink}`)
            i++
            if (i <= num) {
                // Small pause between requests before fetching the next chapter
                setTimeout(() => {
                    main(nextLink)
                }, 100)
            }
        })
    })
    // Without this handler a connection failure would be an unhandled 'error' event
    request.on('error', err => {
        console.error(`Request for ${url} failed: ${err.message}`)
    })
}

/**
 * Write one chapter to ./data/<bookName>/chapter<id>.json.
 *
 * The book directory (and the ./data parent) is created synchronously when
 * missing; the JSON file itself is written asynchronously.
 *
 * @param {{id: number, bookName: string, title: string}} obj - Chapter record;
 *     the whole object is serialised with JSON.stringify.
 * @param {string} nextLink - URL of the following chapter. Accepted for the
 *     caller's benefit but not used here.
 * @throws Rethrows any error reported by fs.writeFile from its callback.
 */
function saveContent(obj, nextLink) {
    console.log(`${obj.id}--${obj.title}`)
    const dir = `./data/${obj.bookName}`
    // recursive: true also creates ./data and is a no-op when the directory exists
    fs.mkdirSync(dir, {recursive: true})
    // Write the chapter as a JSON file
    fs.writeFile(`${dir}/chapter${obj.id}.json`, JSON.stringify(obj), 'utf-8', err => {
        if (err) throw err
    })
}

/**
 * Remove every &nbsp; entity from a string and trim surrounding whitespace.
 *
 * Entities are removed first so that whitespace sitting next to a leading or
 * trailing &nbsp; is trimmed as well.
 *
 * @param {string} str - HTML fragment to clean.
 * @returns {string} The fragment without &nbsp; entities or outer whitespace.
 */
function trim(str) {
    return str.replace(/&nbsp;/g, '').trim()
}

// Start crawling from the first chapter
main(url)

生成文件

免責聲明！

本站轉載的文章為個人學習借鑒使用，本站對版權不負任何法律責任。如果侵犯了您的隱私權益，請聯繫本站郵箱 yoyou2525@163.com 刪除。

猜您在找：【nodejs爬蟲】使用async控制並發寫一個小說爬蟲；Node.js 實現簡單小說爬蟲；nodejs實現簡單爬蟲；Nodejs實現爬蟲抓取數據；使用Python3爬蟲抓取網頁來下載小說；python爬蟲之小說爬取；爬蟲入門（四）——Scrapy框架入門：使用Scrapy框架爬取全書網小說數據；初次嘗試python爬蟲，爬取小說網站的小說。；批量下載小說網站上的小說（python爬蟲）；一個逐頁抓取網站小說的爬蟲