前情提要:最近使用PHP實現了簡單的網盤搜索程序,並且關聯了微信公眾平台,名字是網盤小說。用戶可以通過公眾號輸入關鍵字,公眾號會返回相應的網盤下載地址。就是這麼一個簡單的功能,類似很多的網盤搜索類網站,我這個採集和搜索程序都是PHP實現的,全文和分詞搜索部分使用到了開源軟件xunsearch。
真實上線案例:搜盤子-網盤電影資源站
上一篇([PHP] 網盤搜索引擎-採集爬取百度網盤分享文件實現網盤搜索)中我重點介紹了怎樣去獲取一大批的百度網盤用戶,這一篇介紹怎樣獲得指定網盤用戶的分享列表。同樣的原理,也是找到百度獲取分享列表的接口,然後循環請求各個分頁就可以了。
查找分享接口
隨便找一個網盤用戶的分享頁面,點擊最下面的分頁鏈接,可以看到發起的請求接口,這個就是獲取分享列表的接口。
整個的請求url是這個 https://pan.baidu.com/pcloud/feed/getsharelist?t=1493892795526&category=0&auth_type=1&request_location=share_home&start=60&limit=60&query_uk=4162539356&channel=chunlei&clienttype=0&web=1&logid=MTQ5Mzg5Mjc5NTUyNzAuOTEwNDc2NTU1NTgyMTM1OQ==&bdstoken=bc329b0677cad94231e973953a09b46f
調用接口獲取數據
使用PHP的CURL去請求這個接口,看看是否能夠獲取到數據。測試後發現,返回的是{"errno":2,"request_id":1775381927},並沒有獲取到數據。這是因為百度對header頭信息裡面的Referer進行了限制,我把Referer改成 http://www.baidu.com ,就可以獲取到數據了。接口的參數也可以簡化成 https://pan.baidu.com/pcloud/feed/getsharelist?&auth_type=1&request_location=share_home&start=60&limit=60&query_uk=4162539356
測試代碼如下:
<?php

/**
 * Fetches the share list of a Baidu Pan user.
 */
class TextsSpider
{
    /**
     * Send an HTTP request with cURL and return the raw response body.
     *
     * A GET request is sent unless $data is non-empty, in which case $data is
     * sent as a POST body.
     *
     * @param string            $url    Absolute URL to request.
     * @param array|string|null $data   POST payload; null/empty means GET.
     * @param array|null        $header Raw header lines, e.g. ['Referer:http://www.baidu.com'].
     *
     * @return string|false Response body, or false when the transfer fails.
     */
    public function sendRequest($url, $data = null, $header = null)
    {
        $curl = curl_init();
        if ($curl === false) {
            return false;
        }

        curl_setopt($curl, CURLOPT_URL, $url);
        // Keep TLS verification on: disabling it lets any man-in-the-middle
        // tamper with the crawled data.
        curl_setopt($curl, CURLOPT_SSL_VERIFYPEER, true);
        curl_setopt($curl, CURLOPT_SSL_VERIFYHOST, 2);
        // Bound the request so one stalled connection cannot hang the crawler.
        curl_setopt($curl, CURLOPT_CONNECTTIMEOUT, 10);
        curl_setopt($curl, CURLOPT_TIMEOUT, 30);

        if (!empty($data)) {
            curl_setopt($curl, CURLOPT_POST, 1);
            curl_setopt($curl, CURLOPT_POSTFIELDS, $data);
        }
        if (!empty($header)) {
            curl_setopt($curl, CURLOPT_HTTPHEADER, $header);
        }

        // Return the body instead of printing it.
        curl_setopt($curl, CURLOPT_RETURNTRANSFER, 1);
        $output = curl_exec($curl);
        curl_close($curl);

        return $output;
    }
}

$textsSpider = new TextsSpider();
// The endpoint answers {"errno":2,...} unless a Referer header is supplied.
$header = [
    'Referer:http://www.baidu.com',
];
$str = $textsSpider->sendRequest(
    "https://pan.baidu.com/pcloud/feed/getsharelist?&auth_type=1&request_location=share_home&start=60&limit=60&query_uk=4162539356",
    null,
    $header
);
echo $str;
分享列表的json結果如下:
{ "errno": 0, "request_id": 1985680203, "total_count": 1025, "records": [ { "feed_type": "share", "category": 6, "public": "1", "shareid": "98963537", "data_id": "1799945104803474515", "title": "《通靈少女》2017.同步台視(完結待刪)", "third": 0, "clienttype": 0, "filecount": 1, "uk": 4162539356, "username": "a20****3762", "feed_time": 1493626027308, "desc": "", "avatar_url": "https://ss0.bdstatic.com/7Ls0a8Sm1A5BphGlnYG/sys/portrait/item/01f8831f.jpg", "dir_cnt": 1, "filelist": [ { "server_filename": "《通靈少女》2017.同步台視(完結待刪)", "category": 6, "isdir": 1, "size": 1024, "fs_id": 98994643773159, "path": "%2F%E3%80%8A%E9%80%9A%E7%81%B5%E5%B0%91%E5%A5%B3%E3%80%8B2017.%E5%90%8C%E6%AD%A5%E5%8F%B0%E8%A7%86%EF%BC%88%E5%AE%8C%E7%BB%93%E5%BE%85%E5%88%A0%EF%BC%89", "md5": "0", "sign": "86de8a14f72e6e3798d525c689c0e4575b1a7728", "time_stamp": 1493895381 } ], "source_uid": "528742401", "source_id": "98963537", "shorturl": "1pKPCF0J", "vCnt": 356, "dCnt": 29, "tCnt": 184 }, { "source_uid": "528742401", "source_id": "152434783", "shorturl": "1qYdhFkC", "vCnt": 1022, "dCnt": 29, "tCnt": 345 } ] }
還是和上次一樣,綜合性的搜索站,可以把有用的數據都留下存住,我只是做個最簡單的,就只要了標題title和shareid
每個分享文件的下載頁面url是這樣的:http://pan.baidu.com/share/link?shareid={$shareId}&uk={$uk} ,只需要用戶編號和分享id就可以拼出下載url。
生成分頁接口URL
假設用戶最多分享了30000個,每頁60個,可以分500頁,這樣url可以這樣生成
<?php

/**
 * Builds the paginated share-list API URLs for a Baidu Pan user.
 */
class TextsSpider
{
    /** @var int Number of pages to generate URLs for. */
    private $pages = 500;

    /** @var int Items per page; used as the `limit` value and as the step between `start` offsets. */
    private $start = 60;

    /**
     * Generate one share-list API URL per page for the given user.
     *
     * @param int|string $rootUk User id (uk) whose shares are listed.
     *
     * @return array List of exactly $pages URLs with offsets 0, 60, 120, ...
     */
    public function makeUrl($rootUk)
    {
        $urls = [];
        // Pages are zero-based, so the last offset is ($pages - 1) * page size.
        // Using `<` (not `<=`) yields exactly $pages URLs.
        for ($i = 0; $i < $this->pages; $i++) {
            $offset = $this->start * $i;
            $urls[] = "https://pan.baidu.com/pcloud/feed/getsharelist?&auth_type=1&request_location=share_home&start={$offset}&limit={$this->start}&query_uk={$rootUk}";
        }

        return $urls;
    }
}

$textsSpider = new TextsSpider();
$urls = $textsSpider->makeUrl(4162539356);
print_r($urls);
分頁的url結果是這樣的
Array ( [0] => https://pan.baidu.com/pcloud/feed/getsharelist?&auth_type=1&request_location=share_home&start=0&limit=60&query_uk=4162539356 [1] => https://pan.baidu.com/pcloud/feed/getsharelist?&auth_type=1&request_location=share_home&start=60&limit=60&query_uk=4162539356 [2] => https://pan.baidu.com/pcloud/feed/getsharelist?&auth_type=1&request_location=share_home&start=120&limit=60&query_uk=4162539356 [3] => https://pan.baidu.com/pcloud/feed/getsharelist?&auth_type=1&request_location=share_home&start=180&limit=60&query_uk=4162539356 [4] => https://pan.baidu.com/pcloud/feed/getsharelist?&auth_type=1&request_location=share_home&start=240&limit=60&query_uk=4162539356 [5] => https://pan.baidu.com/pcloud/feed/getsharelist?&auth_type=1&request_location=share_home&start=300&limit=60&query_uk=4162539356 [6] => https://pan.baidu.com/pcloud/feed/getsharelist?&auth_type=1&request_location=share_home&start=360&limit=60&query_uk=4162539356 [7] => https://pan.baidu.com/pcloud/feed/getsharelist?&auth_type=1&request_location=share_home&start=420&limit=60&query_uk=4162539356 [8] => https://pan.baidu.com/pcloud/feed/getsharelist?&auth_type=1&request_location=share_home&start=480&limit=60&query_uk=4162539356 [9] => https://pan.baidu.com/pcloud/feed/getsharelist?&auth_type=1&request_location=share_home&start=540&limit=60&query_uk=4162539356
數據表存儲結構
-- Storage table for the crawled share records.
--   id    : auto-increment primary key.
--   title : share title (up to 255 chars); the `title` index covers only its first 250 characters.
--   url   : presumably the share download URL built from shareid and uk -- confirm against the insert code.
--   time  : unsigned int, presumably a Unix timestamp -- confirm against the insert code.
CREATE TABLE `texts` ( `id` int(10) unsigned NOT NULL AUTO_INCREMENT, `title` varchar(255) NOT NULL DEFAULT '', `url` varchar(255) NOT NULL DEFAULT '', `time` int(10) unsigned NOT NULL DEFAULT '0', PRIMARY KEY (`id`), KEY `title` (`title`(250)) ) ENGINE=MyISAM
循環讀取的時候,應該注意,每次間隔一定的時間,防止被封。
下一篇主要介紹xunsearch分詞和全文搜索和這次的完整代碼
演示地址,關注微信公眾號:網盤小說,或者掃描下面的二維碼
上一篇循環獲取uk並存入數據庫的完整代碼如下:
<?php

/**
 * Crawls the follow lists of Baidu Pan users and stores the discovered
 * user ids (uk) in the `uks` table.
 */
class UkSpider
{
    /** @var int Number of follow-list pages to request per user. */
    private $pages;

    /** @var int Items per page; used as the `limit` value and as the step between `start` offsets. */
    private $start = 24;

    /** @var PDO|null Database connection. */
    private $db = null;

    /**
     * @param int $pages Number of follow-list pages to generate per user.
     */
    public function __construct($pages = 100)
    {
        $this->pages = $pages;
        $this->db = new PDO("mysql:host=localhost;dbname=pan", "root", "root");
        $this->db->query('set names utf8');
    }

    /**
     * Generate one follow-list API URL per page for the given user.
     *
     * @param int|string $rootUk User id (uk) whose follow list is requested.
     *
     * @return array List of exactly $pages URLs.
     */
    public function makeUrl($rootUk)
    {
        $urls = [];
        // Pages are zero-based; `<` (not `<=`) yields exactly $pages URLs.
        for ($i = 0; $i < $this->pages; $i++) {
            $offset = $this->start * $i;
            $urls[] = "https://pan.baidu.com/pcloud/friend/getfollowlist?query_uk={$rootUk}&limit={$this->start}&start={$offset}";
        }

        return $urls;
    }

    /**
     * Fetch one follow-list page and extract the followed users' ids.
     *
     * @param string $url Follow-list API URL.
     *
     * @return array List of `follow_uk` values; empty when the request fails,
     *               the response is not valid JSON, or it has no `follow_list`.
     */
    public function getFollowsByUrl($url)
    {
        $result = $this->sendRequest($url);
        if (!is_string($result)) {
            return [];
        }

        $arr = json_decode($result, true);
        if (!is_array($arr) || !isset($arr['follow_list']) || !is_array($arr['follow_list'])) {
            return [];
        }

        $ret = [];
        foreach ($arr['follow_list'] as $fan) {
            // Skip malformed entries instead of emitting an undefined-index notice.
            if (isset($fan['follow_uk'])) {
                $ret[] = $fan['follow_uk'];
            }
        }

        return $ret;
    }

    /**
     * Send an HTTP request with cURL and return the raw response body.
     *
     * A GET request is sent unless $data is non-empty, in which case $data is
     * sent as a POST body.
     *
     * @param string            $url    Absolute URL to request.
     * @param array|string|null $data   POST payload; null/empty means GET.
     * @param array|null        $header Raw header lines.
     *
     * @return string|false Response body, or false when the transfer fails.
     */
    public function sendRequest($url, $data = null, $header = null)
    {
        $curl = curl_init();
        if ($curl === false) {
            return false;
        }

        curl_setopt($curl, CURLOPT_URL, $url);
        // Keep TLS verification on: disabling it lets any man-in-the-middle
        // tamper with the crawled data.
        curl_setopt($curl, CURLOPT_SSL_VERIFYPEER, true);
        curl_setopt($curl, CURLOPT_SSL_VERIFYHOST, 2);
        // Bound the request so one stalled connection cannot hang the crawler.
        curl_setopt($curl, CURLOPT_CONNECTTIMEOUT, 10);
        curl_setopt($curl, CURLOPT_TIMEOUT, 30);

        if (!empty($data)) {
            curl_setopt($curl, CURLOPT_POST, 1);
            curl_setopt($curl, CURLOPT_POSTFIELDS, $data);
        }
        if (!empty($header)) {
            curl_setopt($curl, CURLOPT_HTTPHEADER, $header);
        }

        // Return the body instead of printing it.
        curl_setopt($curl, CURLOPT_RETURNTRANSFER, 1);
        $output = curl_exec($curl);
        curl_close($curl);

        return $output;
    }

    /**
     * Insert the given user ids into the `uks` table.
     *
     * The ids come from a remote API response, so they are bound as
     * parameters rather than interpolated into the SQL text.
     *
     * @param array $uks User ids to store.
     */
    public function addUks($uks)
    {
        if (empty($uks)) {
            return;
        }

        // Prepare once, execute per row.
        $stmt = $this->db->prepare('insert into uks (uk) values (?)');
        foreach ($uks as $uk) {
            $stmt->execute([$uk]);
        }
    }

    /**
     * Crawl every follow-list page of one user and store the ids found.
     *
     * Sleeps a random 7-11 seconds before each request to avoid being
     * blocked, and stops at the first page that yields no ids.
     *
     * @param int|string $uk User id whose follow list is crawled.
     */
    public function sleepGetByUk($uk)
    {
        $urls = $this->makeUrl($uk);
        // NOTE(review): the original left the updateUkFollow($uk) call disabled
        // here, so crawled uks keep get_follow=0 and are returned again by
        // getUksFromDb() on the next run. Confirm the intent before enabling it.

        foreach ($urls as $url) {
            echo "loading:" . $url . "\r\n";

            $second = rand(7, 11);
            echo "sleep...{$second}s\r\n";
            sleep($second);

            $followList = $this->getFollowsByUrl($url);
            // An empty page means there is no more data for this user.
            if (empty($followList)) {
                break;
            }
            $this->addUks($followList);
        }
    }

    /**
     * Load the ids of all users whose follow list has not been fetched yet
     * (rows with get_follow=0).
     *
     * @return array List of uk values.
     */
    public function getUksFromDb()
    {
        $sth = $this->db->prepare('select uk from uks where get_follow=0');
        $sth->execute();
        $uks = $sth->fetchAll(PDO::FETCH_COLUMN, 0);

        return is_array($uks) ? $uks : [];
    }

    /**
     * Mark a user as already crawled (get_follow=1).
     *
     * @param int|string $uk User id to mark.
     */
    public function updateUkFollow($uk)
    {
        $stmt = $this->db->prepare('UPDATE uks SET get_follow=1 where uk=?');
        $stmt->execute([$uk]);
    }
}

$ukSpider = new UkSpider();
$uks = $ukSpider->getUksFromDb();
foreach ($uks as $uk) {
    $ukSpider->sleepGetByUk($uk);
}