火車頭采集偽原創插件PHP版實現

本文轉載自查看原文 2020-02-20 13:22 4530 PHP

很多碼農都有自己的個人博客，平時寫寫筆記什麼的。今天我不是談如何做博客，而是談如何做搜索引擎優化，結合自己多年的PHP經驗來實現網站自動優化。

做一個原始網站或偽原創不僅僅是整合文本。SEO優化的原始文章與添加或刪除內容無關。我喜歡用火車頭來實現內容爬取，然後自己寫一個偽原創插件，有興趣的話大家可以自己實踐一下。

准備撰寫文章時，請務必在撰寫之前考慮本文的標題和布局。本文的標題可能基於哪個標題，並且由優化程序撰寫的文章也考慮添加“錨文本”或“軟鏈接”，因為在寫文章之前，這些是要注意的事情。

偽原創或原始文章必須引起用戶的注意，而不是假冒作品來欺騙搜索蜘蛛。因此，高質量的文章應具有清晰的結構，清晰的主題選擇，清晰的布局並根據主題撰寫文章。還可以為優化人員提供時間，並為網站帶來良好的優化結果。

下面是火車頭偽原創插件PHP代碼：

<?php
set_time_limit(270);
error_reporting(E_ERROR | E_WARNING | E_PARSE);

// Marker inserted between title and body so both can be sent to the rewrite
// API in one request and split apart again afterwards.
define('TITLE_SEPAR', 'xxx**xxx');
// The key in this URL is a placeholder; register with xiaofamao to get a real one.
$url = 'http://api.xiaofamao.com/api.php?json=0&v=1&key=xxxxxxx';

// NOTE(review): $LabelArray is not defined anywhere in this file; presumably the
// host application (the scraper's PHP plugin runner) supplies it — confirm.
switch ($LabelArray['PageType']) {
    case 'List': // list page: only the HTML can be processed
        break;
    case 'Pages': // paginated page: only the HTML can be processed
        break;
    case 'Content': // default page: only the HTML can be processed
        break;
    case 'Save': // label values can only be modified at save time
        try {
            // Send title and body together, with <br> tags turned into line breaks.
            $article_src = $LabelArray['標題'] . TITLE_SEPAR . $LabelArray['內容'];
            $article_src = br2newline($article_src);

            $article_new = curl_request($url, array('wenzhang' => $article_src));
            $article_new = fix_newline($article_new);

            // Limit 2: everything after the first separator belongs to the body.
            $temp = explode(TITLE_SEPAR, $article_new, 2);

            // curl_request() returns the cURL error text instead of throwing, and the
            // API may drop the separator. In either case there is no second part, so
            // keep the scraped content rather than overwriting it with nothing.
            if (isset($temp[1])) {
                $new_title = fix_title($temp[0]);
                $new_title = str_replace(array('[', ']', '％'), array('【', '】', '%'), $new_title);
                // Writing the rewritten title back is left disabled, as in the original.
                #$LabelArray['標題'] = $new_title;

                $LabelArray['內容'] = delete_newline($temp[1]);
            }
        } catch (Exception $e) {
            // Surface the failure in the saved record so it is visible to the operator.
            $LabelArray['標題'] .= $e->getMessage();
            $LabelArray['內容'] .= $e->getMessage();
        }
        break;
    default:
        break;
}

// Emit the (possibly modified) label array in PHP serialize() format.
echo serialize($LabelArray);


/**
 * Remove every alt="..." attribute from an HTML fragment.
 *
 * @param string $contents HTML to process.
 * @return string The HTML with the alt attributes removed.
 */
function remove_alt($contents) {
    // [^"]* stops at the attribute's own closing quote. A greedy .* would run to
    // the last quote on the line and delete src and any other attributes after alt.
    return preg_replace('/alt="[^"]*"/', '', $contents);
}


/**
 * Strip common Chinese and ASCII punctuation marks from a title.
 *
 * @param string $contents Title text.
 * @return string The title with the listed punctuation removed.
 */
function fix_title($contents) {
    $marks = array(
        '。', '？', '，', '：', '；', '、', '！',
        '.',  '?',  ',',  ':',  ';', '!',
    );

    // Remove the marks one at a time, in list order.
    foreach ($marks as $mark) {
        $contents = str_replace($mark, '', $contents);
    }

    return $contents;
}

/**
 * Convert HTML line-break tags to the platform line ending.
 *
 * Recognises <br>, <br/> and <br /> in any letter case.
 *
 * @param string $contents HTML text.
 * @return string Text with the break tags replaced by PHP_EOL.
 */
function br2newline($contents) {
    // Case-insensitive so mixed-case tags such as <Br> are converted too.
    return str_ireplace(array('<br>', '<br/>', '<br />'), PHP_EOL, $contents);
}

/**
 * Turn platform line endings into <br> tags.
 *
 * A <br> that lands directly after an opening <p> is dropped again.
 *
 * @param string $contnets Text containing PHP_EOL line endings.
 * @return string Text with <br> tags in place of the line endings.
 */
function newline2br($contnets) {
    $withBreaks = str_replace(PHP_EOL, '<br>', $contnets);

    return str_replace('<p><br>', '<p>', $withBreaks);
}


/**
 * Normalise line endings by delegating to fix_newline().
 *
 * @param string $contents Text to normalise.
 * @return string Whatever fix_newline() returns for the input.
 */
function delete_newline($contents) {
    return fix_newline($contents);
}

/**
 * Convert every line-ending style (CRLF, CR, LF) to PHP_EOL.
 *
 * @param string $contents Text with mixed line endings.
 * @return string Text using PHP_EOL for every line break.
 */
function reset_newline_win($contents) {
    // First reduce CRLF and lone CR to LF (in that order), then map LF to PHP_EOL.
    $unixStyle = str_replace(array("\r\n", "\r"), "\n", $contents);

    return str_replace("\n", PHP_EOL, $unixStyle);
}

/**
 * Collapse runs of line breaks into a single PHP_EOL.
 *
 * Every CR is first treated as an LF, so CRLF pairs and blank lines both
 * reduce to one line break.
 *
 * @param string $data Text to normalise.
 * @return string Text with single PHP_EOL line breaks.
 */
function fix_newline($data) {
    $data = str_replace("\r", "\n", $data);

    // Keep halving runs of LF until a pass makes no replacement.
    do {
        $data = str_replace("\n\n", "\n", $data, $replaced);
    } while ($replaced > 0);

    return str_replace("\n", PHP_EOL, $data);
}

/**
 * Strip HTML attributes from the content using cleanHtml.
 *
 * Keeps src on every element, src and alt on img, and src and frameborder
 * on iframe; all other attributes are passed to cleanHtml for removal.
 *
 * @param string $contents HTML to clean.
 * @return string The result of cleanHtml::strip().
 */
function clean_contents($contents) {
    $cleaner = new cleanHtml;

    // Per-element whitelist, on top of the global one below.
    $cleaner->exceptions = array(
        'img'    => array('src', 'alt'),
        'iframe' => array('src', 'frameborder'),
    );
    // Attributes kept on every element.
    $cleaner->allow = array('src');

    return $cleaner->strip($contents);
}


/**
 * Replace only the first occurrence of $search in $subject.
 *
 * @param string $search  Text to look for.
 * @param string $replace Text to put in its place.
 * @param string $subject Text to search in.
 * @return string $subject with the first match replaced, or unchanged if there is none.
 */
function xfm_strong_str_replace_once($search, $replace, $subject) {
    $offset = strpos($subject, $search);

    // Nothing to replace.
    if ($offset === false) {
        return $subject;
    }

    return substr_replace($subject, $replace, $offset, strlen($search));
}

/**
 * Perform an HTTP request with cURL.
 *
 * On a transport error the cURL error message is returned as a string in place
 * of the response, so callers cannot tell an error from a body by type alone.
 *
 * @param string       $url          URL to request.
 * @param array|string $post         POST data; an array/object is URL-encoded, a string is
 *                                   sent as-is. When empty, a GET request is made.
 * @param string       $cookie       Cookie header value to send, if any.
 * @param int          $returnCookie When truthy, response headers are requested and the
 *                                   first Set-Cookie value is returned with the body.
 * @return string|array The response body (or the cURL error message); when $returnCookie
 *                      is truthy and the request succeeds, array('cookie' => ..., 'content' => ...).
 */
function curl_request($url, $post = '', $cookie = '', $returnCookie = 0) {
    if (!extension_loaded('curl')) {
        file_exists('./ext/php_curl.dll') && dl('php_curl.dll'); // try to load the extension
    }

    $curl = curl_init();
    curl_setopt($curl, CURLOPT_URL, $url);
    curl_setopt($curl, CURLOPT_USERAGENT, 'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)');
    // Redirect following is only enabled when open_basedir/safe_mode are not active.
    if (ini_get('open_basedir') == '' && strtolower((string) ini_get('safe_mode')) != 'on') {
        curl_setopt($curl, CURLOPT_FOLLOWLOCATION, 1);
    }
    curl_setopt($curl, CURLOPT_AUTOREFERER, 1);
    curl_setopt($curl, CURLOPT_REFERER, "http://XXX");
    if ($post) {
        curl_setopt($curl, CURLOPT_POST, 1);
        // http_build_query() only accepts arrays/objects; a pre-encoded string is sent unchanged.
        curl_setopt($curl, CURLOPT_POSTFIELDS, is_string($post) ? $post : http_build_query($post));
    }
    if ($cookie) {
        curl_setopt($curl, CURLOPT_COOKIE, $cookie);
    }
    curl_setopt($curl, CURLOPT_HEADER, $returnCookie);
    curl_setopt($curl, CURLOPT_TIMEOUT, 150);
    curl_setopt($curl, CURLOPT_RETURNTRANSFER, 1);

    $data = curl_exec($curl);
    if (curl_errno($curl)) {
        $error = curl_error($curl);
        curl_close($curl); // release the handle on the error path as well
        return $error;
    }
    curl_close($curl);

    if (!$returnCookie) {
        return $data;
    }

    // Split headers from body; tolerate a response without the blank-line separator.
    $parts  = explode("\r\n\r\n", (string) $data, 2);
    $header = $parts[0];
    $body   = isset($parts[1]) ? $parts[1] : '';

    // Header names are case-insensitive, hence the i flag.
    preg_match_all('/Set\-Cookie:([^;]*);/i', $header, $matches);

    return array(
        // substr(..., 1) drops the character right after the colon (normally a space).
        'cookie'  => isset($matches[1][0]) ? substr($matches[1][0], 1) : '',
        'content' => $body,
    );
}

//echo $tag;
/**
 * Count the characters (not bytes) in a UTF-8 string.
 *
 * @param string|null $string Text to measure; null counts as an empty string.
 * @return int Number of characters, or 0 when the input is empty or not valid UTF-8.
 */
function utf8_strlen($string = null) {
    // Split into single characters: /u reads the subject as UTF-8, /s lets "." match newlines.
    $count = preg_match_all('/./us', (string) $string, $match);

    // preg_match_all() returns false when the subject is not valid UTF-8.
    return $count === false ? 0 : $count;
}


/**
 * Escape a string so it can be used literally inside a "/"-delimited regex.
 *
 * @param string $str Text to escape.
 * @return string The text with all regex metacharacters and "/" backslash-escaped.
 */
function reg_escape( $str )
{
    // preg_quote() covers every PCRE metacharacter. The earlier hand-written table
    // mapped "$" to itself ("\$" in double quotes is just "$") and had no entries
    // for "]", "}", "-" or "#".
    return preg_quote( $str, '/' );
}
  
/**
 * Strip attribute Class
 * Remove attributes from XML elements
 * @author David (semlabs.co.uk)
 * @version 0.2.1
 *
 * Usage: set $allow (attribute names kept on every element), $exceptions
 * (element name => attribute names kept on that element) and optionally
 * $ignore (element names left untouched), then call strip().
 */
class cleanHtml
{
    /** @var string Markup currently being processed. */
    public $str         = '';
    /** @var array Attribute names kept on every element. */
    public $allow       = array();
    /** @var array Map of element name => list of attribute names kept on that element. */
    public $exceptions  = array();
    /** @var array Element names that are skipped entirely. */
    public $ignore      = array();

    /**
     * Remove all attributes that are not whitelisted.
     *
     * @param string $str Markup to clean.
     * @return mixed The cleaned markup; a non-string or empty input is returned unchanged.
     */
    public function strip( $str )
    {
        $this->str = $str;

        if( is_string( $str ) && strlen( $str ) > 0 )
        {
            $res = $this->findElements();
            // A string result means no element carried attributes.
            if( is_string( $res ) )
                return $res;
            $nodes = $this->findAttributes( $res );
            $this->removeAttributes( $nodes );
        }

        return $this->str;
    }

    /**
     * Collect the opening tags that carry attribute text.
     *
     * @return array|string List of nodes (literal, name, attributes), or the
     *                      unchanged markup when there is nothing to process.
     */
    private function findElements()
    {
        # Create an array of elements with attributes
        $nodes = array();
        preg_match_all( "/<([^ !\/\>\n]+)([^>]*)>/i", $this->str, $elements );
        foreach( $elements[1] as $el_key => $element_name )
        {
            $attributes = $elements[2][$el_key];
            if( !$attributes )
                continue;

            if( is_array( $this->ignore ) && !in_array( $element_name, $this->ignore ) )
                $nodes[] = array( 'literal' => $elements[0][$el_key], 'name' => $element_name, 'attributes' => $attributes );
        }

        # Return the XML if there were no attributes to remove
        # (empty() avoids reading an undefined offset 0)
        if( empty( $nodes ) )
            return $this->str;

        return $nodes;
    }

    /**
     * Parse each node's raw attribute text into name/value entries.
     *
     * @param array $nodes Nodes from findElements().
     * @return array The nodes with 'attributes' replaced by a list of
     *               (literal, name, value) entries, or null when none parsed.
     */
    private function findAttributes( $nodes )
    {
        # Extract attributes
        foreach( $nodes as &$node )
        {
            # Start every node with a fresh list so nothing is read undefined
            $atts = array();
            preg_match_all( "/([^ =]+)\s*=\s*[\"|']{0,1}([^\"']*)[\"|']{0,1}/i", $node['attributes'], $attributes );
            foreach( $attributes[1] as $att_key => $attribute_name )
            {
                $atts[] = array(
                    'literal' => $attributes[0][$att_key],
                    'name'    => $attribute_name,
                    'value'   => $attributes[2][$att_key],
                );
            }

            $node['attributes'] = $atts ? $atts : null;
        }
        # Break the reference left behind by the by-reference loop
        unset( $node );

        return $nodes;
    }

    /**
     * Rewrite each tag in $this->str so that only whitelisted attributes remain.
     *
     * @param array $nodes Nodes from findAttributes().
     * @return void
     */
    private function removeAttributes( $nodes )
    {
        # Remove unwanted attributes
        foreach( $nodes as $node )
        {
            # Check if node has any attributes to be kept
            $node_name = $node['name'];
            $new_attributes = '';
            if( is_array( $node['attributes'] ) )
            {
                foreach( $node['attributes'] as $attribute )
                {
                    if( ( is_array( $this->allow ) && in_array( $attribute['name'], $this->allow ) ) || $this->isException( $node_name, $attribute['name'], $this->exceptions ) )
                        $new_attributes = $this->createAttributes( $new_attributes, $attribute['name'], $attribute['value'] );
                }
            }
            $replacement = ( $new_attributes ) ? "<$node_name $new_attributes>" : "<$node_name>";

            # Literal replacement: a regex is not needed here, and using one would
            # mis-handle tags containing "$" and treat "$1" / "\1" in kept
            # attribute values as backreferences
            $this->str = str_replace( $node['literal'], $replacement, $this->str );
        }
    }

    /**
     * Tell whether an attribute is whitelisted for a specific element.
     *
     * @param string $element_name   Element being processed.
     * @param string $attribute_name Attribute being considered.
     * @param array  $exceptions     Unused; $this->exceptions is consulted instead.
     * @return bool
     */
    private function isException( $element_name, $attribute_name, $exceptions )
    {
        if( is_array( $this->exceptions ) && array_key_exists( $element_name, $this->exceptions ) )
        {
            if( in_array( $attribute_name, $this->exceptions[$element_name] ) )
                return true;
        }

        return false;
    }

    /**
     * Append one name="value" pair to an attribute string.
     *
     * @param string $new_attributes Attribute string built so far.
     * @param string $name           Attribute name.
     * @param string $value          Attribute value.
     * @return string The extended attribute string.
     */
    private function createAttributes( $new_attributes, $name, $value )
    {
        if( $new_attributes )
            $new_attributes .= " ";
        $new_attributes .= "$name=\"$value\"";

        return $new_attributes;
    }

}

?>

免責聲明！

本站轉載的文章為個人學習借鑒使用，本站對版權不負任何法律責任。如果侵犯了您的隱私權益，請聯系本站郵箱yoyou2525@163.com刪除。

猜您在找 火車頭采集器偽原創（附PHP實現代碼）火車頭采集 web發布配置要點（bdyxel原創）火車頭如何采集圖片_圖文教程火車頭采集器使用教程 (仿)火車頭采集器源碼開源淘寶數據采集-火車頭采集之多頁采集火車頭采集器采集文章使用教程實例超快讀快寫 & 火車頭 WordPress4.9火車頭免登陸發布接口+模塊（增強版）如何在洛谷上使用火車頭（詳細揭秘）