自從學習了爬蟲,就想在B站爬取點什么數據,最近看到一些個up主漲粉很快,於是對up主的粉絲數量產生了好奇,所以就有了標題~
首先,我天真的以為通過up主個人空間的地址就能爬到
https://space.bilibili.com/137952
但事與願違,給這個地址發送請求返回來的並不是我們想要的頁面數據,而是一個類似於要求用戶更換瀏覽器的錯誤頁面
我們可以使用postman來模擬發送這個請求
響應的頁面就是這個
<!DOCTYPE html> <html> <head> <meta name=spm_prefix content=333.999> <meta charset=UTF-8> <meta http-equiv=X-UA-Compatible content="IE=edge,chrome=1"> <meta name=renderer content=webkit|ie-comp|ie-stand> <link rel=stylesheet type=text/css href=//at.alicdn.com/t/font_438759_ivzgkauwm755qaor.css> <script type=text/javascript>var ua = window.navigator.userAgent var agents = ["Android","iPhone","SymbianOS","Windows Phone","iPod"] var pathname = /\d+/.exec(window.location.pathname) var getCookie = function(sKey) { return decodeURIComponent( document.cookie.replace( new RegExp('(?:(?:^|.*;)\\s*' + encodeURIComponent(sKey).replace(/[\-\.\+\*]/g, '\\$&') + '\\s*\\=\\s*([^;]*).*$)|^.*$'), '$1' ) ) || null } var DedeUserID = getCookie('DedeUserID') var mid = pathname ? +pathname[0] : DedeUserID === null ? 0 : +DedeUserID if (mid < 1) { window.location.href = 'https://passport.bilibili.com/login?gourl=https://space.bilibili.com' } else { window._bili_space_mid = mid window._bili_space_mymid = DedeUserID === null ? 0 : +DedeUserID var prefix = /^\/v/.test(pathname) ? '/v' : '' window.history.replaceState({}, '', prefix + '/' + mid + '/' + (pathname ? 
window.location.hash : '#/')) for (var i = 0; i < agents.length; i++) { if (ua.indexOf(agents[i]) > -1) { window.location.href = 'https://m.bilibili.com/space/' + mid break } } } </script> <link href=//s1.hdslb.com/bfs/static/jinkela/space/css/space.25.bbaa2f1b5482f89caf23662936077cf2ae130dd9.css rel=stylesheet> <link href=//s1.hdslb.com/bfs/static/jinkela/space/css/space.26.bbaa2f1b5482f89caf23662936077cf2ae130dd9.css rel=stylesheet> </head> <body> <div class="z-top-container has-top-search"></div> <!--[if lt IE 9]> <div id="browser-version-tip"> <div class="wrapper"> 抱歉,您正在使用不支持的瀏覽器訪問個人空間。推薦您 <a href="//www.google.cn/chrome/browser/desktop/index.html">安裝 Chrome 瀏覽器</a>以獲得更好的體驗 ヾ(o◕∀◕)ノ </div> </div> <![endif]--> <div id=space-app></div> <script type=text/javascript>//日志上報 window.spaceReport = {} window.reportConfig = { sample: 1, scrollTracker: true, msgObjects: 'spaceReport' } var reportScript = document.createElement('script') reportScript.src = '//s1.hdslb.com/bfs/seed/log/report/log-reporter.js' document.getElementsByTagName('body')[0].appendChild(reportScript) reportScript.onerror = function() { console.warn('log-reporter.js加載失敗,放棄上報') var noop = function() {} window.reportObserver = { sendPV: noop, forceCommit: noop } }</script> <script src=//static.hdslb.com/js/jquery.min.js></script> <script src=//s1.hdslb.com/bfs/seed/jinkela/header/header.js></script> <script type=text/javascript src=//s1.hdslb.com/bfs/static/jinkela/space/manifest.bbaa2f1b5482f89caf23662936077cf2ae130dd9.js></script> <script type=text/javascript src=//s1.hdslb.com/bfs/static/jinkela/space/vendor.bbaa2f1b5482f89caf23662936077cf2ae130dd9.js></script> <script type=text/javascript src=//s1.hdslb.com/bfs/static/jinkela/space/space.bbaa2f1b5482f89caf23662936077cf2ae130dd9.js></script> </body> </html>
這是怎么回事呢?
讓我們來換個思路吧,隨便打開一個up主的專欄,按下F12,可以看到發送了這么些請求
其中的這個請求便是我們所需要的
https://api.bilibili.com/x/web-interface/card?mid=1393013&jsonp=jsonp&article=true
讓我們再用postman來測試一下
由圖可見,返回的json串就是我們想要的數據
接下來,我們就用Java的爬蟲框架WebMagic來編寫爬蟲程序,爬取1~1000的用戶信息(粉絲數 >= 10000)
package com.tangzhe.spider.webmagic;

import com.alibaba.fastjson.JSONObject;
import com.mongodb.*;
import com.mongodb.client.MongoCollection;
import com.mongodb.client.MongoDatabase;
import com.tangzhe.spider.webmagic.entity.UpMaster;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Json;

import java.util.ArrayList;
import java.util.List;

/**
 * Created by 唐哲
 * 2018-06-21 17:36
 *
 * WebMagic page processor that calls the Bilibili "card" API for a range of
 * user ids and stores every user with at least {@link #MIN_FANS} followers in
 * the MongoDB collection {@code bilibili.up_master}.
 */
public class BilibiliPageProcess implements PageProcessor {

    /** Users with fewer followers than this are not stored (the bound itself is included). */
    private static final long MIN_FANS = 10000L;

    /** Card API endpoint; the numeric user id is appended to it. */
    private static final String CARD_API = "https://api.bilibili.com/x/web-interface/card?mid=";

    /** First and last user id requested by {@link #main(String[])}. */
    private static final int FIRST_MID = 1;
    private static final int LAST_MID = 1000;

    /** Target collection, opened once for the whole crawl. */
    private static final MongoCollection<DBObject> collection;

    static {
        MongoClientOptions options = MongoClientOptions.builder().connectTimeout(60000).build();
        MongoClient client = new MongoClient(new ServerAddress("localhost", 27017), options);
        MongoDatabase db = client.getDatabase("bilibili");
        collection = db.getCollection("up_master", DBObject.class);
    }

    // Extra request headers (for example a User-Agent) can be chained here with
    // addHeader(name, value). Never commit a personal session cookie to source.
    private Site site = Site.me()
            .setRetrySleepTime(3);

    @Override
    public Site getSite() {
        return this.site;
    }

    /**
     * Parses one card API response and inserts the user into MongoDB when the
     * follower count is at least {@link #MIN_FANS}.
     *
     * Responses without a {@code data.card} object are skipped instead of
     * failing with a NullPointerException.
     *
     * @param page the downloaded API response
     */
    @Override
    public void process(Page page) {
        Json result = page.getJson();
        JSONObject jsonObject = JSONObject.parseObject(result.toString());
        JSONObject data = jsonObject == null ? null : jsonObject.getJSONObject("data");
        JSONObject card = data == null ? null : data.getJSONObject("card");
        if (card == null) {
            return;
        }

        long fans = card.getLongValue("fans"); // follower count
        // Strictly "less than": a user with exactly MIN_FANS followers is kept.
        if (fans < MIN_FANS) {
            return;
        }

        JSONObject levelInfo = card.getJSONObject("level_info");
        // A missing level_info block is stored as level 0 rather than aborting the insert.
        int level = levelInfo == null ? 0 : levelInfo.getIntValue("current_level");

        BasicDBObject document = new BasicDBObject();
        document.append("mid", card.getLongValue("mid"))
                .append("name", card.getString("name"))
                .append("face", card.getString("face"))           // avatar URL
                .append("fans", fans)
                .append("attention", card.getLongValue("attention")) // following count
                .append("sign", card.getString("sign"))           // profile signature
                .append("level", level);                          // membership level
        collection.insertOne(document);
    }

    /**
     * Queues the card API URL for every user id in [FIRST_MID, LAST_MID] and
     * runs the spider with 10 threads.
     */
    public static void main(String[] args) {
        List<String> urls = new ArrayList<>(LAST_MID - FIRST_MID + 1);
        for (int mid = FIRST_MID; mid <= LAST_MID; mid++) {
            urls.add(CARD_API + mid);
        }
        Spider.create(new BilibiliPageProcess())
                .addUrl(urls.toArray(new String[0]))
                .thread(10)
                .run();
    }
}
爬取的數據存儲到mongodb中,打開mongodb查看存下來的數據:
uid在1~1000以內粉絲數在10000以上包括10000的up主就全部存儲到數據庫中了
我們可以用springboot寫一個web應用,通過頁面和接口更好地查看這些數據
pom文件:
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <!-- Project coordinates -->
    <groupId>com.tangzhe</groupId>
    <artifactId>spider-demo</artifactId>
    <version>1.0</version>
    <packaging>jar</packaging>

    <name>spider-demo</name>
    <description>this is my spider demo</description>

    <!-- Spring Boot parent: supplies versions for the spring-boot-starter-* artifacts below -->
    <parent>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-starter-parent</artifactId>
        <version>1.5.10.RELEASE</version>
    </parent>

    <properties>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
        <java.version>1.8</java.version>
    </properties>

    <dependencies>
        <!-- Spring Boot web and test starters -->
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-web</artifactId>
        </dependency>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-test</artifactId>
            <scope>test</scope>
        </dependency>

        <!-- webmagic crawler framework -->
        <dependency>
            <groupId>us.codecraft</groupId>
            <artifactId>webmagic-core</artifactId>
            <version>0.6.1</version>
        </dependency>
        <dependency>
            <groupId>us.codecraft</groupId>
            <artifactId>webmagic-extension</artifactId>
            <version>0.6.1</version>
        </dependency>

        <!-- General-purpose utilities -->
        <dependency>
            <groupId>org.projectlombok</groupId>
            <artifactId>lombok</artifactId>
            <version>1.16.12</version>
        </dependency>
        <dependency>
            <groupId>commons-logging</groupId>
            <artifactId>commons-logging</artifactId>
            <version>1.2</version>
        </dependency>
        <dependency>
            <groupId>net.sourceforge.htmlunit</groupId>
            <artifactId>htmlunit</artifactId>
            <version>2.23</version>
        </dependency>
        <dependency>
            <groupId>org.apache.commons</groupId>
            <artifactId>commons-lang3</artifactId>
            <version>3.7</version>
        </dependency>
        <dependency>
            <groupId>commons-io</groupId>
            <artifactId>commons-io</artifactId>
            <version>2.6</version>
        </dependency>
        <dependency>
            <groupId>org.seleniumhq.selenium</groupId>
            <artifactId>selenium-java</artifactId>
            <version>2.3.0</version>
        </dependency>

        <!-- Thymeleaf view layer -->
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-thymeleaf</artifactId>
        </dependency>

        <!-- HTML and JSON parsing -->
        <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.11.3</version>
        </dependency>
        <dependency>
            <groupId>com.alibaba</groupId>
            <artifactId>fastjson</artifactId>
            <version>1.2.47</version>
        </dependency>

        <!-- mongodb driver and Spring Data integration -->
        <dependency>
            <groupId>org.mongodb</groupId>
            <artifactId>mongo-java-driver</artifactId>
            <version>3.3.0</version>
        </dependency>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-data-mongodb</artifactId>
        </dependency>

        <!-- FreeMarker view layer -->
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-freemarker</artifactId>
        </dependency>
    </dependencies>

    <build>
        <plugins>
            <!-- Spring Boot build plugin -->
            <plugin>
                <groupId>org.springframework.boot</groupId>
                <artifactId>spring-boot-maven-plugin</artifactId>
            </plugin>
        </plugins>
    </build>
</project>
配置文件:
# HTTP port of the web application.
server:
  port: 8888

# All Spring settings live under ONE top-level "spring" key. The original file
# declared "spring:" twice; YAML does not allow duplicate mapping keys, so a
# parser either rejects the file or keeps only one of the two blocks.
spring:
  application:
    name: spider-demo
  profiles:
    active: dev
  http:
    encoding:
      force: true
      charset: UTF-8
      enabled: true
  thymeleaf:
    encoding: UTF-8
    cache: false
    mode: HTML5
  freemarker:
    allow-request-override: false
    cache: true
    check-template-location: true
    charset: UTF-8
    content-type: text/html
    expose-request-attributes: false
    expose-session-attributes: false
    expose-spring-macro-helpers: false

# Custom MongoDB settings, bound by MongoDBConfiguration (prefix "mongo").
# NOTE: the Go crawler writes to database "test", while the Java crawler
# writes to database "bilibili"; "db" must name the database that was filled.
mongo:
  host: localhost
  port: 27017
  timeout: 60000
  db: test
mongodb配置類:
package com.tangzhe.spider.webmagic.config; import com.mongodb.MongoClient; import com.mongodb.MongoClientOptions; import com.mongodb.ServerAddress; import lombok.Data; import org.springframework.boot.context.properties.ConfigurationProperties; import org.springframework.context.annotation.Bean; import org.springframework.context.annotation.Configuration; import org.springframework.data.mongodb.core.MongoTemplate; import org.springframework.data.mongodb.core.SimpleMongoDbFactory; import org.springframework.data.mongodb.core.convert.DefaultMongoTypeMapper; import org.springframework.data.mongodb.core.convert.MappingMongoConverter; import org.springframework.data.mongodb.core.mapping.MongoMappingContext; import org.springframework.data.mongodb.gridfs.GridFsTemplate; /** * MongoDB配置 */ @Configuration @ConfigurationProperties(prefix = "mongo") @Data public class MongoDBConfiguration { //mongodb服務地址 private String host; //mongodb服務端口號 private Integer port; //連接超時 private Integer timeout; //mongodb數據庫名 private String db; /** * 配置MongoDB模板 */ @Bean public MongoTemplate mongoTemplate(SimpleMongoDbFactory mongoDbFactory, MappingMongoConverter mappingMongoConverter) { return new MongoTemplate(mongoDbFactory, mappingMongoConverter); } /** * 配置自增ID監聽器 */ // @Bean // public SaveMongoEventListener saveMongoEventListener() { // return new SaveMongoEventListener(); // } /** * 配置GridFs模板,實現文件上傳下載 */ @Bean public GridFsTemplate gridFsTemplate(SimpleMongoDbFactory mongoDbFactory, MappingMongoConverter mappingMongoConverter) { return new GridFsTemplate(mongoDbFactory, mappingMongoConverter); } /** * 配置mongoDbFactory */ @Bean public SimpleMongoDbFactory mongoDbFactory() { MongoClientOptions options = MongoClientOptions.builder().connectTimeout(timeout).build(); MongoClient client = new MongoClient(new ServerAddress(host, port), options); return new SimpleMongoDbFactory(client, db); } /** * 配置mongoMappingContext */ @Bean public MongoMappingContext mongoMappingContext() { return new 
MongoMappingContext(); } /** * 配置defaultMongoTypeMapper */ @Bean public DefaultMongoTypeMapper defaultMongoTypeMapper() { //去掉_class字段 return new DefaultMongoTypeMapper(null); } /** * 配置mappingMongoConverter */ @Bean public MappingMongoConverter mappingMongoConverter(SimpleMongoDbFactory mongoDbFactory, MongoMappingContext mongoMappingContext, DefaultMongoTypeMapper defaultMongoTypeMapper) { MappingMongoConverter mappingMongoConverter = new MappingMongoConverter(mongoDbFactory, mongoMappingContext); mappingMongoConverter.setTypeMapper(defaultMongoTypeMapper); return mappingMongoConverter; } }
首先編寫一個跟mongodb交互的entity類:
package com.tangzhe.spider.webmagic.entity; import lombok.Data; import org.springframework.data.mongodb.core.mapping.Document; @Document(collection = "up_master") @Data public class UpMaster { private String mid; private Long uid; private String name; // 頭像 private String face; // 粉絲 private Long fans; // 關注 private Long attention; // 簽名 private String sign; // 會員等級 private Integer level; }
持久層:
這里有三個方法
第一個是查詢所有用戶並通過mid排序
第二個是查詢所有用戶並通過粉絲數排序
第三個是查詢粉絲數排行前十名
package com.tangzhe.spider.webmagic.repository;

import com.tangzhe.spider.webmagic.entity.UpMaster;
import org.springframework.data.repository.CrudRepository;

import java.util.List;

/**
 * Spring Data repository for {@link UpMaster} documents. The query methods
 * are derived by Spring Data from their names.
 */
public interface UpMasterRepository extends CrudRepository<UpMaster, Long> {

    /** Returns every uploader, ordered by {@code mid}. */
    List<UpMaster> findAllByOrderByMid();

    /** Returns every uploader, ordered by follower count, highest first. */
    List<UpMaster> findAllByOrderByFansDesc();

    /** Returns the ten uploaders with the most followers, highest first. */
    List<UpMaster> findTop10ByOrderByFansDesc();
}
業務層:
查詢前10up主
package com.tangzhe.spider.webmagic.service;

import com.tangzhe.spider.webmagic.entity.UpMaster;
import com.tangzhe.spider.webmagic.repository.UpMasterRepository;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.data.domain.Sort;
import org.springframework.stereotype.Service;

import java.util.List;

/**
 * Default {@link UpMasterService} backed by {@link UpMasterRepository}.
 */
@Service
public class UpMasterServiceImpl implements UpMasterService {

    @Autowired
    private UpMasterRepository upMasterRepository;

    /**
     * Returns the ten uploaders with the most followers, highest first.
     *
     * Despite the method name, the result is NOT ordered by {@code mid}: the
     * method has always delegated to
     * {@link UpMasterRepository#findTop10ByOrderByFansDesc()}, and the ranking
     * page depends on that. The name is fixed by the {@link UpMasterService}
     * interface; the {@code Sort} object the method used to build was never
     * passed to any query and has been removed.
     *
     * @return at most ten uploaders, sorted by follower count descending
     */
    @Override
    public List<UpMaster> findAllOrderByMid() {
        return upMasterRepository.findTop10ByOrderByFansDesc();
    }

    /**
     * Persists the given uploader.
     *
     * @param upMaster the uploader to save
     */
    @Override
    public void add(UpMaster upMaster) {
        upMasterRepository.save(upMaster);
    }
}
controller層:
這個接口可以查詢當前mongodb中排名前10的up主信息
package com.tangzhe.spider.webmagic.controller;

import com.tangzhe.spider.webmagic.entity.UpMaster;
import com.tangzhe.spider.webmagic.service.UpMasterService;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.web.bind.annotation.GetMapping;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RestController;

import java.util.List;

/**
 * REST endpoints under {@code /up} that expose the stored uploader data.
 */
@RestController
@RequestMapping("/up")
public class UpMasterController {

    @Autowired
    private UpMasterService upMasterService;

    /**
     * Handles {@code GET /up/list} by returning whatever
     * {@link UpMasterService#findAllOrderByMid()} yields.
     */
    @GetMapping("/list")
    public List<UpMaster> list() {
        return upMasterService.findAllOrderByMid();
    }
}
最后寫一個前端頁面展示up主粉絲排行:
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8" />
    <title>B站up主粉絲數排行榜</title>
    <style type="text/css">
        .face {
            width: 128px;
            height: 128px;
        }
        .content {
            position: absolute;
        }
        .info {
            float: left;
            padding-right: 250px;
        }
    </style>
    <script type="text/javascript" src="/jquery-1.8.3.js"></script>
    <script type="text/javascript">
        // Wait for the DOM so that the .content container exists before it is filled.
        $(function () {
            $.ajax({
                url: "/up/list",
                type: "GET",
                success: function (data) {
                    var $content = $(".content");
                    $.each(data, function (i, up) {
                        // Names and avatar URLs come from crawled, user-controlled data.
                        // They are inserted with .text()/.attr() so they can never be
                        // interpreted as markup.
                        // Documents written by the Java crawler have no "uid" field,
                        // so fall back to "mid".
                        var uid = (up.uid !== null && up.uid !== undefined) ? up.uid : up.mid;
                        var $info = $("<div>").addClass("info")
                            .append($("<div>").text("NO." + (i + 1) + "\t" + up.name))
                            .append($("<div>").text("UID: " + uid))
                            .append($("<img>").addClass("face").attr("src", up.face))
                            .append($("<div>").text("粉絲:" + up.fans))
                            .append("<br/>");
                        $content.append($info);
                    });
                }
            });
        });
    </script>
</head>
<body>
    <div class="content"></div>
</body>
</html>
現在就可以執行springboot主類運行項目
訪問 http://localhost:8888/
圖中展示出來的就是B站用戶Uid從1~1000的粉絲數在1W以上的up主的前十名排行榜了
看起來很不錯,一個爬蟲程序加上web端頁面展示就完成了~
但是,當爬取上萬甚至上百萬B站用戶信息的時候,爬蟲效率並不高,可以說是很慢,這個問題讓我很糾結。。。
於是我便想用go語言來寫一段同樣的爬蟲程序,試試看會不會快一點,說寫就寫~
這里為了方便起見,就不分層了,直接將爬蟲程序放到一個文件中
package main

import (
	"encoding/json"
	"fmt"
	"log"
	"net/http"
	"strconv"
	"time"

	"gopkg.in/mgo.v2"
)

// Result is the envelope returned by the Bilibili card API.
type Result struct {
	Code    int    `json:"code"`
	Message string `json:"message"`
	Ttl     int    `json:"ttl"`
	Data    Data   `json:"data"`
}

// Data is the "data" object of the card API response.
type Data struct {
	Card UpMaster `json:"card"`
}

// UpMaster is one uploader, both as decoded from the API and as stored in MongoDB.
type UpMaster struct {
	Mid       string `json:"mid"`
	Uid       int64  `json:"uid"`
	Name      string `json:"name"`
	Face      string `json:"face"`
	Fans      int64  `json:"fans"`
	Attention int64  `json:"attention"`
	Sign      string `json:"sign"`
}

const (
	// urlPrefix is the card API endpoint; the user id is appended to it.
	urlPrefix = "https://api.bilibili.com/x/web-interface/card?mid="

	// minFans is the smallest follower count that gets stored.
	minFans = 10000

	// maxConcurrency caps the number of user ids being crawled at the same time,
	// so that a large id range cannot exhaust sockets or file descriptors.
	maxConcurrency = 100
)

// httpClient has a timeout so a stalled connection cannot block the crawl forever.
var httpClient = &http.Client{Timeout: 10 * time.Second}

// conn is the MongoDB collection that receives the crawled uploaders.
var conn *mgo.Collection

// init opens the MongoDB session and selects test.up_master.
func init() {
	session, err := mgo.Dial("localhost:27017")
	if err != nil {
		panic(err)
	}
	session.SetMode(mgo.Monotonic, true)
	conn = session.DB("test").C("up_master")
}

// Work reads a start and an end user id from standard input, runs f once for
// every id in [start, end] with at most maxConcurrency calls in flight, and
// prints each id as it is reported on the page channel.
//
// f must send exactly one value on page per call, including when it fails;
// otherwise Work blocks forever waiting for the missing report.
func Work(f func(i int, page chan<- int)) {
	var start, end int
	fmt.Printf("請輸入起始值:")
	if _, err := fmt.Scan(&start); err != nil {
		log.Fatalf("invalid start value: %v", err)
	}
	fmt.Printf("請輸入結束值:")
	if _, err := fmt.Scan(&end); err != nil {
		log.Fatalf("invalid end value: %v", err)
	}
	fmt.Printf("現在開始爬取UID從%d~%d的B站用戶粉絲數\n", start, end)

	page := make(chan int)
	slots := make(chan struct{}, maxConcurrency)

	// Launch from a separate goroutine: the launcher blocks while all slots are
	// taken, and the receive loop below must already be running to free them.
	go func() {
		for i := start; i <= end; i++ {
			slots <- struct{}{}
			go func(id int) {
				defer func() { <-slots }()
				f(id, page)
			}(i)
		}
	}()

	for i := start; i <= end; i++ {
		fmt.Printf("%d爬取完成\n", <-page)
	}
}

// fetchCard downloads and decodes the card of one user and fills in Uid from Mid.
func fetchCard(uid int) (UpMaster, error) {
	var up UpMaster

	resp, err := httpClient.Get(urlPrefix + strconv.Itoa(uid))
	if err != nil {
		return up, err
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		return up, fmt.Errorf("unexpected HTTP status %s", resp.Status)
	}

	var res Result
	if err := json.NewDecoder(resp.Body).Decode(&res); err != nil {
		return up, fmt.Errorf("decode response: %v", err)
	}

	up = res.Data.Card
	// An empty Mid means the response carried no card; Uid then stays 0.
	if up.Mid != "" {
		id, err := strconv.ParseInt(up.Mid, 10, 64)
		if err != nil {
			return up, fmt.Errorf("parse mid %q: %v", up.Mid, err)
		}
		up.Uid = id
	}
	return up, nil
}

// SpideBilibiliFansByUid crawls the user with id i and stores the user in
// MongoDB when the follower count is at least minFans.
//
// i is always sent on page when the function returns, on success and on
// failure, so the caller's receive loop can never deadlock. Failures are
// logged and only affect the one user.
func SpideBilibiliFansByUid(i int, page chan<- int) {
	defer func() { page <- i }()

	up, err := fetchCard(i)
	if err != nil {
		log.Printf("uid %d: %v", i, err)
		return
	}

	if up.Fans >= minFans {
		if err := conn.Insert(up); err != nil {
			log.Printf("uid %d: mongodb insert failed: %v", i, err)
		}
	}

	time.Sleep(10 * time.Millisecond)
}

func main() {
	Work(SpideBilibiliFansByUid)
}
go語言在語言層面天生支持並發,只要在函數調用前面加上go關鍵字,就能啟動一個協程(goroutine)了 go func(){}()
運行程序(運行之前需要先開mongodb):
這里輸入的兩個數字就是B站用戶的uid,圖中是1~10000
經測試,速度比Java的WebMagic快了好幾個層級,所以爬蟲程序就選用go語言的了,web項目還是采用springboot的。
最后奉上Uid從1~10000的up主粉絲大於1W的用戶數據:
web頁面展示前十名:
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
DuangDuangDuangDuang,由於篇幅有限,未能展示所有UP主,畢竟有好幾億的用戶啊(這得爬到什么時候呢。。。)
不過后續還會推出更多的排名,慢慢地接近爬取所有UP主粉絲數
請大家拭目以待哦~