用php寫爬蟲去爬數據

本文轉載自查看原文 2019-11-29 18:09 861 php

參考文檔1
參考文檔2
這里是我自己寫的一個小需求

<?php
/**
 * Requirement: collect the newest articles from http://www.959.cn/school as they
 * are published, running once per minute, and store each collected article in
 * the database right away so it can be displayed.
 *
 * Flow: fetch the list page, walk its <h3> entries, stop at the first article
 * whose id is not greater than the id saved in kecheng_max.log, and for every
 * newer article download its thumbnail and inline images, strip its links and
 * insert it into zs_message. The highest id seen is saved for the next run.
 */
header("Content-Type: text/html;charset=utf-8");
date_default_timezone_set('PRC');

// Report mysqli failures through return values so the explicit checks below
// behave the same way regardless of the PHP version's default error mode.
mysqli_report(MYSQLI_REPORT_OFF);
$con = mysqli_connect('127.0.0.1', 'xxxxx', 'xxxxx', 'xxxxx');
if (!$con) {
	exit('mysqli connect failed: '.mysqli_connect_error());
}
mysqli_set_charset($con,'UTF8');

// Log location.
// kecheng_max.log holds the numeric id (the "<id>.shtml" part of the URL) of the
// newest article already processed; it is read at start-up and rewritten at the end.
$log_path = '/home/logs';
if(!file_exists($log_path)){
	mkdir($log_path,0755,true);
}
$max_log = $log_path.'/kecheng_max.log';

// Directory the downloaded images are written to.
$path = './img/';
if(!file_exists($path)){
	mkdir($path,0755,true);
}

// Read the saved id once; fall back to the default when the file is missing,
// empty or does not contain a positive integer.
$saved_max = is_file($max_log) ? trim((string) file_get_contents($max_log)) : '';
$max = (preg_match('/^\d+$/', $saved_max) === 1 && (int) $saved_max > 0) ? (int) $saved_max : 4613925;

// Per-article result log.
$message_log = $log_path.'/kecheng_message.log';

$now_max = 0; // highest article id seen on the live list page during this run
$url = "http://www.959.cn/school/kecheng/"; // list page to scrape
$res = file_get_contents("compress.zlib://".$url); // compress.zlib:// transparently inflates gzip responses
if ($res === false) {
	file_put_contents($message_log, "當前抓取的頁面url為:{$url},結果:抓取失敗".PHP_EOL, FILE_APPEND | LOCK_EX);
	exit(1);
}

// Values are bound as parameters instead of being interpolated into the SQL:
// scraped titles/bodies routinely contain quotes, which would otherwise break
// the statement or allow SQL injection.
$sql = "insert into zs_message (cid, title, create_time, update_time, source, cover_img, text) values (?, ?, ?, ?, ?, ?, ?)";
$stmt = mysqli_prepare($con, $sql);
if (!$stmt) {
	file_put_contents($message_log, "入庫sql為:{$sql},結果:預處理失敗,".mysqli_error($con).PHP_EOL, FILE_APPEND | LOCK_EX);
	exit(1);
}

preg_match_all('/<h3>.*<\/h3>/',$res,$arr_all); // article list entries
foreach($arr_all[0] as $v){
	// Only entries that contain a link are articles; skip anything else.
	preg_match_all('/<a href="(.*)">.*<\/a>/',$v,$v_all);
	if (empty($v_all[1])){
		continue;
	}

	$title = strip_tags($v_all[0][0]); // article title
	$href = $v_all[1][0]; // article URL

	// Extract the id that precedes ".shtml" in the last path segment.
	$href_one = substr($href, strrpos($href, '/')+1);
	$href_id = substr($href_one, 0, strpos($href_one, '.'));
	if (preg_match('/^\d+$/', $href_id) !== 1) {
		continue; // not an "<id>.shtml" link, so it cannot be compared with the saved id
	}
	$href_id = (int) $href_id;

	// Everything from here on was handled by an earlier run.
	// NOTE(review): this relies on the list page being ordered newest first - confirm.
	if ($href_id <= $max){
		break;
	}

	// Track the newest id from the first *valid* entry onward, so a skipped
	// leading entry cannot leave it at 0.
	if ($href_id > $now_max) {
		$now_max = $href_id;
	}

	// Thumbnail: the list page <img> whose alt text equals the title.
	// preg_quote() keeps regex metacharacters in the title from corrupting the pattern.
	// NOTE(review): images are saved under $path but referenced under
	// /public/uploads/img/<Y/md>/ - presumably mapped by the web server; verify.
	$cover_img = '';
	if (preg_match('/<img src="(.*)" alt="'.preg_quote($title, '/').'" \/>/',$res,$title_img) === 1) {
		// The "http:" prefix assumes a protocol-relative src ("//host/...").
		$title_img_data = file_get_contents("http:{$title_img[1]}");
		if ($title_img_data !== false) {
			$cover_img_name = microtime_float().'.jpg';
			file_put_contents($path.$cover_img_name, $title_img_data);
			$cover_img = '/public/uploads/img/'.date('Y/md',time()).'/'.$cover_img_name; // thumbnail location
		}
	}

	// Fetch the article page.
	$article_res = file_get_contents("compress.zlib://".$href);
	if ($article_res === false) {
		file_put_contents($message_log, "當前抓取的頁面url為:{$href},結果:抓取失敗".PHP_EOL, FILE_APPEND | LOCK_EX);
		continue;
	}
	// Line breaks are removed first because "." in the pattern below does not match them.
	$article_res = trimall($article_res);
	if (preg_match('/<div class="detail">(.*?)<\/div>/',$article_res,$article_all) !== 1) {
		file_put_contents($message_log, "當前抓取的頁面url為:{$href},結果:未匹配到文章內容".PHP_EOL, FILE_APPEND | LOCK_EX);
		continue;
	}
	$text = $article_all[1]; // article body

	// Download every inline image and point the body at the local copy.
	preg_match_all('/<img .*? src="(.*?)" .*? \/>/',$text,$img_all);
	foreach($img_all[1] as $value){
		$message_img = file_get_contents("http:{$value}");
		if ($message_img === false) {
			continue; // keep the remote URL rather than linking to an empty local file
		}
		$message_img_name = microtime_float().'.jpg';
		file_put_contents($path.$message_img_name, $message_img);

		$text = str_replace($value,'/public/uploads/img/'.date('Y/md',time()).'/'.$message_img_name,$text);
	}
	// Drop every <a> element (the "leave a message" / "more info" promo links and any other link).
	$text = preg_replace("/<a[^>]*>(.*?)<\/a>/is", "", $text);

	// Row values.
	$cid = 2; // 1: industry news, 2: startup stories
	$source = '88加盟網';
	$create_time = time();
	$update_time = $create_time;

	// Convert to UTF-8 before inserting.
	$title = mb_convert_encoding($title,'UTF-8');
	$text = mb_convert_encoding($text,'UTF-8');

	// Insert the row.
	mysqli_stmt_bind_param($stmt, 'isiisss', $cid, $title, $create_time, $update_time, $source, $cover_img, $text);
	$result = mysqli_stmt_execute($stmt);

	if ($result) {
		$msg = "當前抓取的頁面url為:{$href},入庫sql為:{$sql},結果:入庫成功";
	} else {
		$msg = "當前抓取的頁面url為:{$href},入庫sql為:{$sql},結果:入庫失敗,".mysqli_stmt_error($stmt);
	}
	file_put_contents($message_log, $msg.PHP_EOL, FILE_APPEND | LOCK_EX);
}

mysqli_stmt_close($stmt);
mysqli_close($con);

// Persist the newest id after the loop, so it is saved even when every listed
// article was new and the loop never reached an already-processed one.
if ($now_max > $max) {
	file_put_contents($max_log, (string) $now_max, LOCK_EX);
}

/**
 * Builds a digits-only timestamp string from microtime(): the Unix seconds
 * followed by the first six digits of the fractional part.
 *
 * Despite the name, the result is a string, not a float.
 *
 * @return string
 */
function microtime_float()
{
    // microtime() yields "0.xxxxxxxx seconds"; index 0 is the fraction, index 1 the seconds.
    $parts = explode(" ", microtime());
    $fraction = substr($parts[0], 2, 6); // skip the leading "0."
    return $parts[1].$fraction;
}


/**
 * Removes every line feed and carriage return from a string.
 *
 * @param string $str Input text.
 * @return string The text with all "\n" and "\r" characters deleted.
 */
function trimall($str){
    $withoutLf = str_replace("\n", '', $str);
    return str_replace("\r", '', $withoutLf);
}

/**
 * Fetches a URL with cURL, following redirects, and returns the response body.
 *
 * @param string $url     Address to request.
 * @param string $cookie  Value for the Cookie header; not sent when empty.
 * @param string $referer Value for the Referer header; not sent when empty.
 * @param int    $timeout Maximum time in seconds the transfer may take.
 * @param int    $ishead  Not used by this function; kept so existing call sites stay valid.
 * @return string|false The response body, or false when curl_exec() fails.
 */
function curl_get_contents($url,$cookie='',$referer='',$timeout=300,$ishead=0) {
  $curl = curl_init();
  curl_setopt($curl, CURLOPT_RETURNTRANSFER,1);
  curl_setopt($curl, CURLOPT_FOLLOWLOCATION,1);
  curl_setopt($curl, CURLOPT_URL,$url);
  curl_setopt($curl, CURLOPT_TIMEOUT,$timeout);
  // Present a desktop browser user agent.
  curl_setopt($curl, CURLOPT_USERAGENT,'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36');
  if($cookie)
  {
    curl_setopt( $curl, CURLOPT_COOKIE,$cookie);
  }
  if($referer)
  {
    curl_setopt ($curl,CURLOPT_REFERER,$referer);
  }
  $ssl = substr($url, 0, 8) == "https://" ? TRUE : FALSE;
  if ($ssl)
  {
    // NOTE(review): certificate and host verification are switched off for
    // https URLs, which leaves the transfer open to man-in-the-middle
    // tampering. Kept as-is for compatibility; enable verification if possible.
    curl_setopt($curl, CURLOPT_SSL_VERIFYHOST, false);
    curl_setopt($curl, CURLOPT_SSL_VERIFYPEER, false);
  }
  $res = curl_exec($curl);
  // Close the handle before returning so it is released on every call
  // (a statement placed after the return would never execute).
  curl_close($curl);
  return $res;
}

爬蟲主要的思路是:先抓取列表頁,用正則從中篩選出每篇文章的url,再一個個去爬取詳情頁的內容
注意事項:

1.如果用file_get_contents去獲取內容的話,頁面經過gzip壓縮時會出現亂碼的情況,這時要在url前面加上compress.zlib://

// Read the URL through the compress.zlib:// stream wrapper so a gzip-compressed response is inflated while reading.
file_get_contents("compress.zlib://".$url);

如果用的是curl的話,可以這樣寫:

/**
 * Performs a GET request with cURL and returns the response body.
 *
 * @param string $url  Address to request.
 * @param bool   $gzip When true, CURLOPT_ENCODING is set to "gzip" so a
 *                     gzip-compressed response is decoded by cURL.
 * @return string|false The response body, or false when the request fails.
 */
function curl_get($url, $gzip=false){
        $options = array(
                CURLOPT_RETURNTRANSFER => 1,
                CURLOPT_CONNECTTIMEOUT => 10,
        );
        if ($gzip) {
                // This option is what makes compressed pages readable.
                $options[CURLOPT_ENCODING] = "gzip";
        }

        $handle = curl_init($url);
        curl_setopt_array($handle, $options);
        $body = curl_exec($handle);
        curl_close($handle);

        return $body;
}

調用時傳入$gzip=true,這樣無論頁面是否經過gzip壓縮，上述代碼都可以正常工作！
參考出處
2.在獲取到頁面后,在匹配之前,一定要先把字符串中的\r、\n(回車和換行)都去掉,再進行匹配,否則會出現匹配為空的情況

/**
 * Deletes all "\n" and "\r" characters from the given string.
 *
 * @param string $str Input text.
 * @return string The input with line feeds and carriage returns removed.
 */
function trimall($str){
    $lineBreaks = array("\n" => '', "\r" => '');
    return strtr($str, $lineBreaks);
}
 // Collapse the page source onto a single line by deleting carriage returns, line feeds and tabs.
   $htmlOneLine = preg_replace("/\r|\n|\t/","",$html);

3.去掉文章中的超鏈接;或者只保留錨文本的文字,去掉鏈接,去掉加粗格式

$text = preg_replace("/<a[^>]*>(.*?)<\/a>/is", "", $text);// remove each <a> element entirely, anchor text included
$str = preg_replace("/<a[^>]*>(.*?)<\/a>/is", "$1", $str);// keep only what was inside each <a> element, dropping the tag itself

4.生成圖片的路徑和文件名(參考)

// File name for the image: the microtime_float() result with a ".jpg" suffix.
$cover_img_name = microtime_float().'.jpg';
// Path string made of a fixed prefix, a date-based directory (Y/md) and the file name.
$cover_img = '/public/uploads/img/'.date('Y/md',time()).'/'.$cover_img_name;
/**
 * Returns the current time as a digits-only string: Unix seconds followed by
 * the first six fractional digits reported by microtime().
 *
 * Note that the result is a string even though the name says "float".
 *
 * @return string
 */
function microtime_float()
{
    // microtime() has the form "0.xxxxxxxx seconds".
    $stamp = microtime();
    $seconds = substr($stamp, strpos($stamp, " ") + 1);
    $fraction = substr($stamp, 2, 6); // six digits right after the leading "0."
    return $seconds.$fraction;
}

5.入庫的時候,有可能會出現存入的數據讀出來全是???或者亂碼的情況,這時要把頁面設置為utf8,數據庫連接也設置為utf8

// Declare the response as UTF-8 HTML.
header("Content-Type: text/html;charset=utf-8");
date_default_timezone_set('PRC');
$con = mysqli_connect('127.0.0.1', 'xx', 'xx', 'xx');
// Set the character set of the database connection.
mysqli_set_charset($con,'UTF8');

// Convert to UTF-8 before inserting into the database.
// No source encoding is passed, so mb_convert_encoding() converts from the mbstring internal encoding.
$title = mb_convert_encoding($title,'UTF-8');
$text = mb_convert_encoding($text,'UTF-8');

免責聲明！

本站轉載的文章為個人學習借鑒使用，本站對版權不負任何法律責任。如果侵犯了您的隱私權益，請聯系本站郵箱yoyou2525@163.com刪除。

猜您在找 Python寫爬蟲爬妹子 php寫爬蟲之寫法總結（轉）基於 PHP 的數據爬取（QueryList） python爬蟲Scrapy(一)-我爬了boss數據爬蟲-移動端數據爬取爬蟲爬取股票數據數據爬蟲：爬取一張圖片爬蟲之移動端數據爬取 python爬蟲——數據爬取和具體解析 Python爬蟲爬取疫情數據