項目下載地址:http://code.google.com/p/phpquery/
獲取內容的方法:
第一種:newDocumentFile
// Approach 1: hand $url straight to phpQuery::newDocumentFile().
// NOTE(review): presumably this makes the page the document that later pq() calls query — confirm against phpQuery docs.
phpQuery::newDocumentFile($url);
第二種:file_get_contents + newDocumentHTML
// Approach 2: download the raw markup ourselves first...
$content = file_get_contents($url);
// ...then pass that markup to phpQuery::newDocumentHTML() and keep the returned object.
$htmlObj = phpQuery::newDocumentHTML($content);
獲取網頁內容:
第一種:獲取html節點
// Select with the 'title' selector and read the match's HTML via html().
pq('title')->html()
第二種:獲取script內容(結果會以數組形式返回)
// Select with the "script" selector and read the contents via getString();
// according to the accompanying note the result comes back as an array.
pq("script")->getString();
突破防爬蟲
/**
 * Fetch a URL with cURL while presenting the request as Baidu's spider.
 *
 * The request carries forged X-FORWARDED-FOR / CLIENT-IP headers, a
 * Baiduspider User-Agent and a baidu.com Referer. The function name
 * (including the "apider" spelling) is kept as-is so existing callers
 * continue to work.
 *
 * @param string $url Address of the page to download.
 *
 * @return string|false The response body, or false when the handle cannot
 *                      be created or the transfer fails.
 */
function _get_fake_apider($url)
{
    $ch = curl_init();
    if ($ch === false) {
        // No handle means nothing can be configured or executed; report
        // failure the same way curl_exec() does.
        return false;
    }

    $ip = '115.239.211.112'; // IP used to pose as the Baidu spider
    $timeout = 15;           // seconds; applied to both connect and total time

    curl_setopt_array($ch, array(
        CURLOPT_URL            => $url,
        CURLOPT_TIMEOUT        => $timeout,
        // Forge the spider's IP in the client-address headers.
        CURLOPT_HTTPHEADER     => array(
            'X-FORWARDED-FOR:' . $ip,
            'CLIENT-IP:' . $ip,
        ),
        // Forge the Baidu spider User-Agent.
        CURLOPT_USERAGENT      => "Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)",
        // Return the body from curl_exec() instead of printing it.
        CURLOPT_RETURNTRANSFER => 1,
        // Leave the response headers out of the returned content.
        CURLOPT_HEADER         => 0,
        // Fake the referring page.
        CURLOPT_REFERER        => "http://www.baidu.com/ ",
        CURLOPT_CONNECTTIMEOUT => $timeout,
    ));

    return curl_exec($ch);
}