因為最近需要一批數據來做機器學習,所以用火車頭采集器來抓取數據,偽原創部分調用的是小發貓的API。以下是PHP實現代碼:
<?php
/**
 * Collector plugin: rewrites ("pseudo-originalises") a scraped article through
 * an HTTP rewriting API and echoes the label array back in serialized form.
 *
 * The code deliberately sticks to PHP 5-compatible syntax (array(), no type
 * declarations, no null-coalescing) because it relies on legacy facilities
 * such as dl() and the safe_mode ini flag.
 *
 * NOTE(review): $LabelArray is not defined anywhere in this file; presumably
 * the host application injects it before this script runs — confirm.
 */
set_time_limit(270);
error_reporting(E_ERROR | E_WARNING | E_PARSE);

define('TITLE_SEPAR', 'xxx**xxx');
define('TITLE_SEPAR2', '262661');

// Rewriting API endpoint. The "key=" parameter is empty here and has to be filled in.
$url = 'http://api-6.xiaofamao.com/api.php?json=0&v=1&key=';
// Name of the label that carries the article body.
$content_tag_name = '內容';
// Gallery wrapper, only referenced by the disabled write-back below.
// NOTE(review): $headdd opens a <ul> that $taill never closes.
$headdd = '<figure class="wp-block-gallery columns-3 is-cropped"><ul class="blocks-gallery-grid">';
$taill = '</figure>';

// Avoid "undefined variable/index" warnings: with E_WARNING enabled they would
// be printed in front of the serialized payload and corrupt it.
if (!isset($LabelArray)) {
    $LabelArray = null;
}
$page_type = isset($LabelArray['PageType']) ? $LabelArray['PageType'] : '';

switch ($page_type) {
    case 'List':    // list page: only raw HTML can be processed here
        break;
    case 'Pages':   // paginated content: only raw HTML can be processed here
        break;
    case 'Content': // default page: only raw HTML can be processed here
        break;
    case 'Save':    // label values can only be modified at save time
        try {
            // Send "title + separator + content" through the rewriting API in one request.
            $title = $LabelArray['標題'];
            $content = $LabelArray[$content_tag_name];
            $article_src = compose_article($title, $content);
            $article_new = get_wyc_article($article_src);

            // The separator may not survive the round trip (e.g. when the API
            // returns an error string), so the second segment can be missing.
            $title_wyc = isset($article_new[0]) ? trim($article_new[0]) : '';
            $content_wyc = isset($article_new[1]) ? trim($article_new[1]) : '';

            $content_wyc = fix_newline($content_wyc);
            // Drop a dangling closing tag left over from the split. The original
            // used ltrim($content_wyc, '</p>'), which treats its argument as a
            // character mask and therefore also ate a leading "<p>" or letter "p".
            $content_wyc = xfm_strip_leading($content_wyc, '</p>');

            // NOTE(review): writing the rewritten body back was commented out in
            // the original, so only the title is replaced. Re-enable if intended:
            // $LabelArray[$content_tag_name] = $headdd . $content_wyc . $taill;

            $LabelArray['標題'] = trim(strip_tags($title_wyc));
        } catch (Exception $e) {
            // Surface the failure inside the saved record instead of aborting the job.
            $LabelArray['標題'] .= $e->getMessage();
            $LabelArray[$content_tag_name] .= $e->getMessage();
        }
        break;
    default:
        // Nothing to do for other page types.
}

echo serialize($LabelArray);

/**
 * Joins title and content with the numeric separator line.
 *
 * @param string $title
 * @param string $content
 * @return string
 */
function compose_article($title, $content)
{
    return $title . compose_separator() . $content;
}

/**
 * Builds the separator placed between title and content: "(262661)" on its own line.
 *
 * @return string
 */
function compose_separator()
{
    return PHP_EOL . '(' . TITLE_SEPAR2 . ')' . PHP_EOL;
}

/**
 * Hook for repairing a separator mangled by the API; currently returns its input unchanged.
 *
 * @param string $article
 * @return string
 */
function fix_separator($article)
{
    return $article;
}

/**
 * Removes every leading occurrence of the literal $prefix from $text.
 *
 * Unlike ltrim() with a second argument, $prefix is matched as a whole string
 * and not as a set of characters.
 *
 * @param string $text
 * @param string $prefix
 * @return string
 */
function xfm_strip_leading($text, $prefix)
{
    $text = (string) $text;
    $len = strlen($prefix);
    while ($len > 0 && strncmp($text, $prefix, $len) === 0) {
        // substr() returns false on old PHP versions when nothing is left.
        $text = (string) substr($text, $len);
    }
    return $text;
}

/**
 * Posts $str to the rewriting API and splits the answer at the separator.
 *
 * @param string $str Text in the form produced by compose_article().
 * @return array Trimmed segments; index 0 is the part before the separator and
 *               index 1 (if the separator survived) the part after it.
 */
function get_wyc_article($str)
{
    global $url;

    // Match the separator without its surrounding line breaks.
    $separator = str_replace(PHP_EOL, '', compose_separator());

    $wyc = curl_request($url, array('wenzhang' => $str));
    $wyc = fix_separator($wyc);
    $wyc = explode($separator, $wyc);

    if (isset($wyc[0])) {
        $wyc[0] = trim($wyc[0]);
    }
    if (isset($wyc[1])) {
        $wyc[1] = trim($wyc[1]);
    }
    return $wyc;
}

/**
 * Rewrites a title by sending it three times (blank-line separated) and
 * returning the first line of the answer.
 *
 * get_wyc_article() returns an array; the original passed that array straight
 * into fix_newline(), which calls strpos() on it. The first segment is used now.
 *
 * @param string $str
 * @return string
 */
function get_wyc_title($str)
{
    $gap = PHP_EOL . PHP_EOL . PHP_EOL;
    $parts = get_wyc_article($str . $gap . $str . $gap . $str);
    $text = isset($parts[0]) ? $parts[0] : '';
    $lines = explode(PHP_EOL, fix_newline($text));
    return $lines[0];
}

/**
 * Requests keywords for an article from the NLP endpoint.
 *
 * @param string $title
 * @param string $contents
 * @return mixed Raw return value of curl_request().
 */
function get_keywords($title, $contents)
{
    $url_kw = 'http://api-2.78tp.com/nlp/kws.php?appid=';
    return curl_request($url_kw, array(
        'title' => $title,
        'len'   => 100,
        'text'  => $contents,
    ));
}

/**
 * Removes alt="..." attributes.
 *
 * The value is matched up to the next double quote; the original greedy ".*"
 * ran to the last quote on the line and deleted the attributes in between.
 *
 * @param string $contents
 * @return string
 */
function remove_alt($contents)
{
    return preg_replace('/alt="[^"]*"/', '', $contents);
}

/**
 * Strips punctuation from a title.
 *
 * NOTE(review): the list contains duplicates; possibly the first group was
 * meant to hold full-width variants that were lost — confirm against the
 * original source. The literals are kept exactly as found.
 *
 * @param string $contents
 * @return string
 */
function fix_title($contents)
{
    $punctuation_symbol = array('。', '?', ',', ':', ';', '、', '!', '.', '?', ',', ':', ';', '!');
    return str_replace($punctuation_symbol, '', $contents);
}

/**
 * Replaces the listed <br> spellings with PHP_EOL.
 *
 * @param string $contents
 * @return string
 */
function br2newline($contents)
{
    $variants = array('<br>', '<br/>', '<br />', '<BR/>', '<BR>', '<BR />');
    return str_replace($variants, PHP_EOL, $contents);
}

/**
 * Replaces PHP_EOL with <br> and removes a <br> directly following <p>.
 *
 * @param string $contnets
 * @return string
 */
function newline2br($contnets)
{
    $contnets = str_replace(PHP_EOL, "<br>", $contnets);
    return str_replace('<p><br>', '<p>', $contnets);
}

/**
 * Collapses blank lines; currently just delegates to fix_newline().
 *
 * @param string $contents
 * @return string
 */
function delete_newline($contents)
{
    return fix_newline($contents);
}

/**
 * Converts every line ending (CRLF, CR, LF) to PHP_EOL without collapsing blank lines.
 *
 * @param string $contents
 * @return string
 */
function reset_newline_win($contents)
{
    $contents = str_replace("\r\n", "\n", $contents);
    $contents = str_replace("\r", "\n", $contents);
    return str_replace("\n", PHP_EOL, $contents);
}

/**
 * Turns CR into LF, collapses runs of line breaks into one, then converts to PHP_EOL.
 *
 * @param string $data
 * @return string
 */
function fix_newline($data)
{
    $data = str_replace("\r", "\n", $data);
    while (strpos($data, "\n\n") !== false) {
        $data = str_replace("\n\n", "\n", $data);
    }
    return str_replace("\n", PHP_EOL, $data);
}

/**
 * Strips HTML attributes, keeping "src" everywhere, "src"/"alt" on img and
 * "src"/"frameborder" on iframe.
 *
 * @param string $contents
 * @return string
 */
function clean_contents($contents)
{
    $sa = new cleanHtml;
    $sa->allow = array('src');
    $sa->exceptions = array(
        'img'    => array('src', 'alt'),
        'iframe' => array('src', 'frameborder'),
    );
    return $sa->strip($contents);
}

/**
 * Replaces only the first occurrence of $search in $subject.
 *
 * @param string $search
 * @param string $replace
 * @param string $subject
 * @return string $subject unchanged when $search is empty or not found.
 */
function xfm_strong_str_replace_once($search, $replace, $subject)
{
    // An empty needle makes strpos() warn on older PHP versions.
    if ($search === '') {
        return $subject;
    }
    $firstChar = strpos($subject, $search);
    if ($firstChar === false) {
        return $subject;
    }
    $beforeStr = substr($subject, 0, $firstChar);
    $afterStr = substr($subject, $firstChar + strlen($search));
    return $beforeStr . $replace . $afterStr;
}

/**
 * Performs an HTTP request with cURL.
 *
 * @param string       $url          Target URL.
 * @param array|string $post         POST data; empty means GET. Arrays are URL-encoded.
 * @param string       $cookie       Cookie header value to send.
 * @param int          $returnCookie When truthy, response headers are requested and an
 *                                   array('cookie' => ..., 'content' => ...) is returned.
 * @return mixed Response body, the array described above, or the cURL error
 *               message (as a string) when the transfer failed.
 */
function curl_request($url, $post = '', $cookie = '', $returnCookie = 0)
{
    if (!extension_loaded('curl')) {
        // Try to load the bundled extension; dl() is not available in every SAPI.
        if (function_exists('dl') && file_exists('./ext/php_curl.dll')) {
            dl('php_curl.dll');
        }
    }

    $curl = curl_init();
    curl_setopt($curl, CURLOPT_URL, $url);
    curl_setopt($curl, CURLOPT_USERAGENT, 'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)');
    // Redirect following is only enabled when neither open_basedir nor safe_mode is set.
    if (ini_get('open_basedir') == '' && strtolower(ini_get('safe_mode')) != 'on') {
        curl_setopt($curl, CURLOPT_FOLLOWLOCATION, 1);
    }
    curl_setopt($curl, CURLOPT_AUTOREFERER, 1);
    curl_setopt($curl, CURLOPT_REFERER, "http://XXX");
    if ($post) {
        curl_setopt($curl, CURLOPT_POST, 1);
        curl_setopt($curl, CURLOPT_POSTFIELDS, is_array($post) ? http_build_query($post) : $post);
    }
    if ($cookie) {
        curl_setopt($curl, CURLOPT_COOKIE, $cookie);
    }
    curl_setopt($curl, CURLOPT_HEADER, $returnCookie);
    curl_setopt($curl, CURLOPT_TIMEOUT, 150);
    curl_setopt($curl, CURLOPT_RETURNTRANSFER, 1);

    $data = curl_exec($curl);
    if (curl_errno($curl)) {
        // Read the message before closing; the original returned without closing the handle.
        $error = curl_error($curl);
        curl_close($curl);
        return $error;
    }
    curl_close($curl);

    if (!$returnCookie) {
        return $data;
    }

    // Headers and body are separated by the first blank line.
    $parts = explode("\r\n\r\n", $data, 2);
    $header = $parts[0];
    $body = isset($parts[1]) ? $parts[1] : '';
    preg_match_all("/Set\-Cookie:([^;]*);/", $header, $matches);

    $info = array();
    // substr(..., 1) drops the space following "Set-Cookie:".
    $info['cookie'] = isset($matches[1][0]) ? (string) substr($matches[1][0], 1) : '';
    $info['content'] = $body;
    return $info;
}

/**
 * Counts the characters (not bytes) of a UTF-8 string.
 *
 * @param string|null $string
 * @return int 0 when the input is empty or cannot be matched as UTF-8.
 */
function utf8_strlen($string = null)
{
    $found = preg_match_all("/./us", (string) $string, $match);
    return $found === false ? 0 : $found;
}

/**
 * Backslash-escapes regex metacharacters so $str can be embedded in a "/"-delimited pattern.
 *
 * Single-quoted literals are used on purpose: in the original "\$" inside
 * double quotes evaluated to a bare "$", so "$" was never escaped.
 *
 * @param string $str
 * @return string
 */
function reg_escape($str)
{
    $conversions = array(
        '^'  => '\^',
        '['  => '\[',
        '.'  => '\.',
        '$'  => '\$',
        '{'  => '\{',
        '*'  => '\*',
        '('  => '\(',
        '\\' => '\\\\',
        '/'  => '\/',
        '+'  => '\+',
        ')'  => '\)',
        '|'  => '\|',
        '?'  => '\?',
        '<'  => '\<',
        '>'  => '\>',
    );
    return strtr($str, $conversions);
}

/**
 * Strip attribute Class
 * Remove attributes from XML elements
 * @author David (semlabs.co.uk)
 * @version 0.2.1
 *
 * Usage: set $allow (attribute names kept on every element), $exceptions
 * (element name => attribute names kept on that element) and optionally
 * $ignore (element names left untouched), then call strip().
 */
class cleanHtml
{
    public $str = '';
    public $allow = array();
    public $exceptions = array();
    public $ignore = array();

    /**
     * Removes all attributes that are not allowed from $str.
     *
     * @param string $str
     * @return string The cleaned markup; non-string or empty input is returned as is.
     */
    public function strip($str)
    {
        $this->str = $str;

        if (is_string($str) && strlen($str) > 0) {
            $res = $this->findElements();
            if (is_string($res)) {
                return $res;
            }
            $nodes = $this->findAttributes($res);
            $this->removeAttributes($nodes);
        }

        return $this->str;
    }

    /**
     * Collects every opening tag that has something after its name.
     *
     * @return array|string List of nodes, or the unchanged string when there is nothing to process.
     */
    private function findElements()
    {
        $nodes = array();
        preg_match_all("/<([^ !\/\>\n]+)([^>]*)>/i", $this->str, $elements);

        foreach ($elements[1] as $el_key => $element_name) {
            if (!$elements[2][$el_key]) {
                continue;
            }
            if (is_array($this->ignore) && !in_array($element_name, $this->ignore)) {
                $nodes[] = array(
                    'literal'    => $elements[0][$el_key],
                    'name'       => $element_name,
                    'attributes' => $elements[2][$el_key],
                );
            }
        }

        // The original tested $nodes[0], which is an undefined offset when the list is empty.
        if (empty($nodes)) {
            return $this->str;
        }
        return $nodes;
    }

    /**
     * Parses each node's raw attribute string into a list of name/value pairs.
     *
     * @param array $nodes
     * @return array Same nodes with 'attributes' replaced by a list of
     *               array('literal', 'name', 'value'), or null when none were found.
     */
    private function findAttributes($nodes)
    {
        foreach ($nodes as $index => $node) {
            preg_match_all(
                "/([^ =]+)\s*=\s*[\"|']{0,1}([^\"']*)[\"|']{0,1}/i",
                $node['attributes'],
                $attributes
            );

            $atts = array();
            foreach ($attributes[1] as $att_key => $attribute_name) {
                $atts[] = array(
                    'literal' => $attributes[0][$att_key],
                    'name'    => $attribute_name,
                    'value'   => $attributes[2][$att_key],
                );
            }

            // Index-based write avoids the dangling by-reference loop variable of the original.
            $nodes[$index]['attributes'] = $atts ? $atts : null;
        }

        return $nodes;
    }

    /**
     * Rewrites every collected tag so that only permitted attributes remain.
     *
     * @param array $nodes
     * @return void
     */
    private function removeAttributes($nodes)
    {
        foreach ($nodes as $node) {
            $node_name = $node['name'];
            $new_attributes = '';

            if (is_array($node['attributes'])) {
                foreach ($node['attributes'] as $attribute) {
                    $allowed = is_array($this->allow) && in_array($attribute['name'], $this->allow);
                    if ($allowed || $this->isException($node_name, $attribute['name'], $this->exceptions)) {
                        $new_attributes = $this->createAttributes(
                            $new_attributes,
                            $attribute['name'],
                            $attribute['value']
                        );
                    }
                }
            }

            $replacement = $new_attributes ? "<$node_name $new_attributes>" : "<$node_name>";

            // Literal replacement: the tag text is not a pattern and the replacement
            // must not be scanned for "$1"/"\1" back-references, as preg_replace() did.
            $this->str = str_replace($node['literal'], $replacement, $this->str);
        }
    }

    /**
     * Tells whether $attribute_name is explicitly permitted on $element_name.
     *
     * @param string $element_name
     * @param string $attribute_name
     * @param array  $exceptions Unused; the instance property is consulted instead.
     * @return bool
     */
    private function isException($element_name, $attribute_name, $exceptions)
    {
        if (array_key_exists($element_name, $this->exceptions)) {
            if (in_array($attribute_name, $this->exceptions[$element_name])) {
                return true;
            }
        }
        return false;
    }

    /**
     * Appends name="value" to an attribute string, space separated.
     *
     * @param string $new_attributes
     * @param string $name
     * @param string $value
     * @return string
     */
    private function createAttributes($new_attributes, $name, $value)
    {
        if ($new_attributes) {
            $new_attributes .= " ";
        }
        $new_attributes .= "$name=\"$value\"";
        return $new_attributes;
    }
}
?>
我們選擇方法1:“保存到軟件數據庫”,同時,選擇模式3“網上發布到網站”的“使用自定義發布方式”,選擇3“自定義分類標識”,將任務命名為“房地產”,將收藏任務命名為“保存並更新”。由於我們的教程剛剛開始,我們不會做深入的研究。
返回火車頭采集器主界面,在“房地產”任務上點擊鼠標右鍵,選擇“開始”即可完成采集。采集到的數據將自動發布到模式3所指向網站的指定欄目(標識=3),同時保存到數據庫文件“火車頭安裝目錄/數據/序列號-任務名稱/蜘蛛結果.mdb”中。
哦,昨天有網友指出了我的一處錯誤。寫文案、錄像、把信息整理到我的網站上,我一共花了3個小時,中間暈了好幾次。倉促之作很粗糙,完全是憑感覺寫的,讓大家如墜霧中、十分困惑。對不起,請原諒我!現在更正如下:
這里,方法1和方法3是並行關系,可以同時選擇,也可以選擇其中一個,如果不發布模塊,可以直接收集本地軟件數據庫。“本地軟件數據庫”來自微軟Access。我們可以打開數據庫來瀏覽和檢查數據。
至於模式3,“火車頭采集器偽原創”,我將在下面的教程中解釋。我希望每個人都能耐心等待。
好了,本教程到此結束!下一課,再見!