因为最近需要一批数据来做机器学习,所以用火车头采集器来抓数据,数据伪原创用的小发猫的API。以下是PHP实现代码:

<?php 
// Raise the script time limit to 270 seconds (curl_request() in this file uses a 150-second timeout).
set_time_limit(270);
// Report fatal errors, warnings and parse errors only.
error_reporting(E_ERROR | E_WARNING | E_PARSE);

// Separator tokens. TITLE_SEPAR2 is the one embedded by compose_separator();
// TITLE_SEPAR is only referenced from commented-out code in this file.
define('TITLE_SEPAR', 'xxx**xxx');
define('TITLE_SEPAR2', '262661');


// Endpoint of the rewriting API, read by get_wyc_article() through `global $url`.
// NOTE(review): the `key=` query value is empty here — presumably the API key must be appended; confirm.
$url = 'http://api-6.xiaofamao.com/api.php?json=0&v=1&key=';
// Key in $LabelArray that holds the article body.
$content_tag_name = '内容';


// HTML wrapper fragments, only referenced from a commented-out line in the save handler.
// NOTE(review): $headdd opens a <ul> that $taill does not close.
$headdd = '<figure class="wp-block-gallery columns-3 is-cropped"><ul class="blocks-gallery-grid">';
$taill = '</figure>';
 
 
 
// Dispatch on the page type found in $LabelArray['PageType'], then echo the
// (possibly modified) label array back in serialized form.
switch ($LabelArray['PageType'])
{
    case 'List':    // list page: only HTML can be processed here
        break;
    case 'Pages':   // paginated page: only HTML can be processed here
        break;
    case 'Content': // default page: only HTML can be processed here
        break;
    case 'Save':    // label values can only be processed at save time
        try {
            // Send "title + separator + body" to the rewriting API and split the answer.
            $title = $LabelArray['标题'];
            $content = $LabelArray[$content_tag_name];

            $article_src = compose_article($title, $content);
            $article_new = get_wyc_article($article_src);

            // The response may not contain the separator (for instance when it is
            // an error message), so index 1 is not guaranteed to exist.
            $title_wyc = isset($article_new[0]) ? trim($article_new[0]) : '';
            $content_wyc = isset($article_new[1]) ? trim($article_new[1]) : '';

            $content_wyc = fix_newline($content_wyc);
            // Drop one leading literal "</p>". ltrim($s, '</p>') would instead strip
            // any run of the characters '<', '/', 'p', '>' and eat real content.
            if (strpos($content_wyc, '</p>') === 0) {
                $content_wyc = substr($content_wyc, strlen('</p>'));
            }
            // NOTE(review): the rewritten body is computed but not written back; the
            // original had that assignment commented out, so it stays disabled here:
            // $LabelArray[$content_tag_name] = $headdd . $content_wyc . $taill;

            // Only replace the title when the API actually returned one.
            $clean_title = trim(strip_tags($title_wyc));
            if ($clean_title !== '') {
                $LabelArray['标题'] = $clean_title;
            }
        } catch (Exception $e) {
            // Surface the failure in the saved record instead of dropping it.
            $LabelArray['标题'] .= $e->getMessage();
            $LabelArray[$content_tag_name] .= $e->getMessage();
        }
        break;
    default:
        // Other page types are passed through unchanged.
        break;
}

echo serialize($LabelArray);
 
 
 
/**
 * Join a title and a body into the single text that is sent for rewriting.
 *
 * @param string $title   Article title.
 * @param string $content Article body.
 * @return string The title, the separator from compose_separator(), then the body.
 */
function compose_article($title, $content) {
    return implode(compose_separator(), array($title, $content));
}
 
/**
 * Build the separator placed between title and body.
 *
 * @return string TITLE_SEPAR2 wrapped in parentheses, with PHP_EOL before and after.
 */
function compose_separator() {
    $marker = sprintf('(%s)', TITLE_SEPAR2);
    return PHP_EOL . $marker . PHP_EOL;
}
 
 
/**
 * Hook for repairing the separator in an API response.
 *
 * Currently a pass-through: the input is returned unchanged.
 *
 * @param mixed $article Response text.
 * @return mixed The same value that was passed in.
 */
function fix_separator($article) {
    $unchanged = $article;
    return $unchanged;
}
 
 
/**
 * Send text to the rewriting API and split the response on the separator marker.
 *
 * The text is posted as the `wenzhang` field to the URL in the global $url.
 *
 * @param string $str Text to rewrite.
 * @return array Response pieces split on the marker; elements 0 and 1 are trimmed when present.
 */
function get_wyc_article($str) {
    global $url;

    // The composed separator is wrapped in newlines; split on the bare marker only.
    $marker = str_replace(PHP_EOL, '', compose_separator());

    $response = curl_request($url, array('wenzhang' => $str));
    $parts = explode($marker, fix_separator($response));

    foreach (array(0, 1) as $index) {
        if (isset($parts[$index])) {
            $parts[$index] = trim($parts[$index]);
        }
    }

    return $parts;
}
 
 
/**
 * Rewrite a title on its own.
 *
 * The title is sent three times, separated by blank lines, and the first line
 * of the rewritten text is returned.
 *
 * @param string $str Title to rewrite.
 * @return string First line of the rewritten text ('' when nothing came back).
 */
function get_wyc_title($str) {
    $gap = PHP_EOL.PHP_EOL.PHP_EOL;
    $parts = get_wyc_article($str.$gap.$str.$gap.$str);

    // get_wyc_article() returns an array of pieces, not a string; the payload
    // here contains no separator marker, so the whole text is in element 0.
    $text = (is_array($parts) && isset($parts[0])) ? $parts[0] : '';

    $lines = explode(PHP_EOL, fix_newline($text));
    return $lines[0];
}
 
/**
 * Post a title and body to the keyword API and return its raw response.
 *
 * NOTE(review): the `appid=` query value is empty in the URL — presumably an
 * application id must be appended; confirm.
 *
 * @param string $title    Article title, sent as `title`.
 * @param string $contents Article body, sent as `text`.
 * @return mixed Whatever curl_request() returns for the call.
 */
function get_keywords($title, $contents) {
    $url_kw = 'http://api-2.78tp.com/nlp/kws.php?appid=';
    $payload = array(
        'title' => $title,
        'len'   => 100,
        'text'  => $contents,
    );

    return curl_request($url_kw, $payload);
}
 
 
/**
 * Remove double-quoted alt="..." attributes from markup.
 *
 * @param string $contents Markup to process.
 * @return string Markup with every alt="..." occurrence removed.
 */
function remove_alt($contents) {
    // [^"]* stops at the attribute's own closing quote. The previous greedy
    // (.*) ran to the last quote on the line and deleted the attributes after alt.
    $contents = preg_replace('/alt="[^"]*"/', '', $contents);
    return $contents;
}
 
 
/**
 * Strip sentence punctuation from a title.
 *
 * @param string $contents Title text (UTF-8).
 * @return string The title without the listed CJK/full-width and ASCII punctuation marks.
 */
function fix_title($contents) {
    // First row: CJK / full-width forms; second row: their ASCII counterparts.
    // The first row previously repeated the ASCII characters, so full-width
    // punctuation was never removed.
    $punctuation_symbol = array('。', '?', ',', ':', ';', '、', '!',
                                '.',  '?',  ',',  ':',  ';', '!');

    $contents = str_replace($punctuation_symbol, '', $contents);
    return $contents;
}
 
/**
 * Convert <br> tags to PHP_EOL.
 *
 * Only the six exact spellings listed below are recognised.
 *
 * @param string $contents Markup to process.
 * @return string Markup with those tags replaced by PHP_EOL.
 */
function br2newline($contents) {
    $break_tags = array('<br>', '<br/>', '<br />', '<BR/>', '<BR>', '<BR />');

    return str_replace($break_tags, PHP_EOL, $contents);
}
 
/**
 * Convert PHP_EOL line breaks to <br>, dropping a <br> that directly follows <p>.
 *
 * @param string $contnets Text to process.
 * @return string Text with PHP_EOL replaced by <br> and "<p><br>" collapsed to "<p>".
 */
function newline2br($contnets) {
    // Pairs are applied in order: first PHP_EOL -> <br>, then the <p><br> clean-up.
    $search  = array(PHP_EOL, '<p><br>');
    $replace = array('<br>', '<p>');

    return str_replace($search, $replace, $contnets);
}
 
 
/**
 * Normalise line breaks by delegating to fix_newline().
 *
 * @param string $contents Text to process.
 * @return string Result of fix_newline() for the given text.
 */
function delete_newline($contents) {
    return fix_newline($contents);
}
 
/**
 * Convert every line-break style (\r\n, \r, \n) to PHP_EOL.
 *
 * Blank lines are kept; only the line-ending characters change.
 *
 * @param string $contents Text to process.
 * @return string Text using PHP_EOL for every line break.
 */
function reset_newline_win($contents) {
    // Reduce everything to "\n" first ("\r\n" before the lone "\r").
    $unix = str_replace(array("\r\n", "\r"), "\n", $contents);

    return str_replace("\n", PHP_EOL, $unix);
}
 
/**
 * Collapse runs of line breaks into a single PHP_EOL.
 *
 * Every "\r" is first turned into "\n", so "\r\n" counts as two breaks and
 * is collapsed to one along with any blank lines.
 *
 * @param string $data Text to process.
 * @return string Text with each run of line breaks reduced to one PHP_EOL.
 */
function fix_newline($data) {
    $data = str_replace("\r", "\n", $data);

    // Keep halving runs of "\n" until a pass makes no replacement.
    do {
        $data = str_replace("\n\n", "\n", $data, $replaced);
    } while ($replaced > 0);

    return str_replace("\n", PHP_EOL, $data);
}
 
/**
 * Strip HTML attributes from markup using cleanHtml.
 *
 * `src` is kept on every element; <img> also keeps `alt`, and <iframe> also
 * keeps `frameborder`. An `a => href, title` exception existed in the
 * original only as a commented-out entry and remains disabled.
 *
 * @param string $contents Markup to clean.
 * @return string Result of cleanHtml::strip() for the given markup.
 */
function clean_contents($contents) {
    $cleaner = new cleanHtml;
    $cleaner->allow = array('src');
    $cleaner->exceptions = array(
        'img'    => array('src', 'alt'),
        'iframe' => array('src', 'frameborder'),
    );

    return $cleaner->strip($contents);
}
 
 
/**
 * Replace only the first occurrence of a substring.
 *
 * @param string $search  Substring to look for.
 * @param string $replace Replacement text.
 * @param string $subject Text to search in.
 * @return string $subject with the first $search replaced, or $subject unchanged when not found.
 */
function xfm_strong_str_replace_once($search, $replace, $subject) {
    $position = strpos($subject, $search);
    if ($position === false) {
        return $subject;
    }

    $head = substr($subject, 0, $position);
    $tail = substr($subject, $position + strlen($search));

    return $head . $replace . $tail;
}
 
/**
 * Perform an HTTP request with cURL.
 *
 * @param string       $url          URL to request.
 * @param array|string $post         POST data; when empty, a GET request is made.
 *                                   Arrays are encoded with http_build_query();
 *                                   strings are sent as given.
 * @param string       $cookie       Cookie string to send, if any.
 * @param int          $returnCookie When truthy, response headers are captured and
 *                                   the first Set-Cookie value is returned with the body.
 * @return string|array The response body; the cURL error message when the transfer
 *                      fails; or array('cookie' => ..., 'content' => ...) when
 *                      $returnCookie is truthy.
 */
function curl_request($url,$post='',$cookie='', $returnCookie=0){
    if (! extension_loaded('curl')) {
        file_exists('./ext/php_curl.dll') && dl('php_curl.dll'); // load the extension
    }

    $curl = curl_init();
    curl_setopt($curl, CURLOPT_URL, $url);
    curl_setopt($curl, CURLOPT_USERAGENT, 'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)');
    // Redirects are only followed when open_basedir is unset and safe_mode is not on.
    if (ini_get('open_basedir') == '' && strtolower(ini_get('safe_mode')) != 'on'){
        curl_setopt($curl, CURLOPT_FOLLOWLOCATION, 1);
    }
    curl_setopt($curl, CURLOPT_AUTOREFERER, 1);
    curl_setopt($curl, CURLOPT_REFERER, "http://XXX");
    if($post) {
        curl_setopt($curl, CURLOPT_POST, 1);
        // http_build_query() only accepts arrays/objects; an already-encoded
        // string body is passed through untouched.
        curl_setopt($curl, CURLOPT_POSTFIELDS, is_string($post) ? $post : http_build_query($post));
    }
    if($cookie) {
        curl_setopt($curl, CURLOPT_COOKIE, $cookie);
    }
    curl_setopt($curl, CURLOPT_HEADER, $returnCookie);
    curl_setopt($curl, CURLOPT_TIMEOUT, 150);
    curl_setopt($curl, CURLOPT_RETURNTRANSFER, 1);

    $data = curl_exec($curl);
    if (curl_errno($curl)) {
        // Read the message before closing; the handle used to leak on this path.
        $error = curl_error($curl);
        curl_close($curl);
        return $error;
    }
    curl_close($curl);

    if (!$returnCookie) {
        return $data;
    }

    // Split headers from body; tolerate a response without a blank-line separator.
    $parts = explode("\r\n\r\n", $data, 2);
    $header = $parts[0];
    $body = isset($parts[1]) ? $parts[1] : '';

    // Header names are case-insensitive, hence the `i` flag.
    preg_match_all("/Set\-Cookie:([^;]*);/i", $header, $matches);

    $info = array();
    // substr(..., 1) drops the space that follows "Set-Cookie:".
    $info['cookie']  = isset($matches[1][0]) ? substr($matches[1][0], 1) : '';
    $info['content'] = $body;
    return $info;
}
 
//echo $tag; 
/**
 * Count the characters (not bytes) of a UTF-8 string.
 *
 * @param string|null $string Text to measure.
 * @return int Number of matches of the pattern /./us, i.e. one per UTF-8 character.
 */
function utf8_strlen($string = null) {
    // Split the string into one match per character.
    preg_match_all("/./us", $string, $characters);

    // The number of matches is the character count.
    $units = $characters[0];
    return count($units);
}
 
 
/**
 * Escape a string so it can be embedded literally in a '/'-delimited regex.
 *
 * The previous hand-written table mapped "$" to "\$" inside double quotes,
 * which PHP reads as a plain "$", so dollar signs were never escaped; it also
 * omitted "]", "}", "-" and "#". preg_quote() covers all PCRE metacharacters
 * plus the given delimiter.
 *
 * @param string $str Literal text.
 * @return string The text with regex metacharacters and "/" backslash-escaped.
 */
function reg_escape( $str )
{
    return preg_quote( $str, '/' );
}
   
/**
* Strip attribute Class
* Remove attributes from XML elements
* @author David (semlabs.co.uk)
* @version 0.2.1
*
* Usage: set $allow / $exceptions / $ignore as needed, then call strip().
*/

class cleanHtml{

    /** @var string Markup being processed; rewritten in place by strip(). */
    public $str         = '';
    /** @var array Attribute names that are kept on every element. */
    public $allow       = array();
    /** @var array Map of element name => attribute names additionally kept on that element. */
    public $exceptions  = array();
    /** @var array Element names whose attributes are left untouched. */
    public $ignore      = array();

    /**
     * Remove every attribute that is neither allowed nor an exception.
     *
     * @param string $str Markup to clean.
     * @return string The cleaned markup (the input unchanged when it has no attributes).
     */
    public function strip( $str )
    {
        $this->str = $str;

        if( is_string( $str ) && strlen( $str ) > 0 )
        {
            $res = $this->findElements();
            if( is_string( $res ) )
                return $res;
            $nodes = $this->findAttributes( $res );
            $this->removeAttributes( $nodes );
        }

        return $this->str;
    }

    /**
     * Collect the opening tags that carry attributes.
     *
     * @return array|string List of nodes (literal, name, attributes), or the
     *                      original markup when there is nothing to process.
     */
    private function findElements()
    {
        # Create an array of elements with attributes
        $nodes = array();
        preg_match_all( "/<([^ !\/\>\n]+)([^>]*)>/i", $this->str, $elements );
        foreach( $elements[1] as $el_key => $element_name )
        {
            $attributes = $elements[2][$el_key];
            if( !$attributes )
                continue;

            if( is_array( $this->ignore ) && !in_array( $element_name, $this->ignore, true ) )
                $nodes[] = array( 'literal' => $elements[0][$el_key], 'name' => $element_name, 'attributes' => $attributes );
        }

        # Return the XML if there were no attributes to remove.
        # empty() avoids reading index 0 of an empty array.
        if( empty( $nodes ) )
            return $this->str;

        return $nodes;
    }

    /**
     * Parse each node's raw attribute string into a list of attributes.
     *
     * @param array $nodes Nodes as produced by findElements().
     * @return array The nodes with 'attributes' replaced by a list of
     *               (literal, name, value) arrays, or null when none parsed.
     */
    private function findAttributes( $nodes )
    {
        # Extract attributes. Nodes are updated by key rather than by reference,
        # so no reference is left dangling after the loop.
        foreach( $nodes as $node_key => $node )
        {
            preg_match_all( "/([^ =]+)\s*=\s*[\"|']{0,1}([^\"']*)[\"|']{0,1}/i", $node['attributes'], $attributes );

            # Start from a defined value so a node without parsable attributes
            # gets null instead of reading an undefined variable.
            $atts = null;
            foreach( $attributes[1] as $att_key => $attribute_name )
            {
                $atts[] = array(
                    'literal' => $attributes[0][$att_key],
                    'name'    => $attribute_name,
                    'value'   => $attributes[2][$att_key],
                );
            }

            $nodes[$node_key]['attributes'] = $atts;
        }

        return $nodes;
    }

    /**
     * Rewrite each node's opening tag in $this->str, keeping only permitted attributes.
     *
     * @param array $nodes Nodes as produced by findAttributes().
     * @return void
     */
    private function removeAttributes( $nodes )
    {
        # Remove unwanted attributes
        foreach( $nodes as $node )
        {
            # Check if node has any attributes to be kept
            $node_name = $node['name'];
            $new_attributes = '';
            if( is_array( $node['attributes'] ) )
            {
                foreach( $node['attributes'] as $attribute )
                {
                    $allowed = is_array( $this->allow ) && in_array( $attribute['name'], $this->allow, true );
                    if( $allowed || $this->isException( $node_name, $attribute['name'], $this->exceptions ) )
                        $new_attributes = $this->createAttributes( $new_attributes, $attribute['name'], $attribute['value'] );
                }
            }
            $replacement = ( $new_attributes ) ? "<$node_name $new_attributes>" : "<$node_name>";

            # Literal replacement: a regex would treat "$" or "\" in attribute
            # values as anchors/backreferences and corrupt or skip the tag.
            $this->str = str_replace( $node['literal'], $replacement, $this->str );
        }
    }

    /**
     * Tell whether an attribute is explicitly permitted for an element.
     *
     * @param string $element_name   Element name.
     * @param string $attribute_name Attribute name.
     * @param array  $exceptions     Unused; $this->exceptions is consulted (kept for signature compatibility).
     * @return bool True when $this->exceptions lists the attribute for the element.
     */
    private function isException( $element_name, $attribute_name, $exceptions )
    {
        if( is_array( $this->exceptions ) && array_key_exists( $element_name, $this->exceptions ) )
        {
            if( in_array( $attribute_name, $this->exceptions[$element_name], true ) )
                return true;
        }

        return false;
    }

    /**
     * Append name="value" to an attribute string, separated by a space.
     *
     * @param string $new_attributes Attribute string built so far.
     * @param string $name           Attribute name.
     * @param string $value          Attribute value.
     * @return string The extended attribute string.
     */
    private function createAttributes( $new_attributes, $name, $value )
    {
        if( $new_attributes )
            $new_attributes .= " ";
        $new_attributes .= "$name=\"$value\"";

        return $new_attributes;
    }

}
 
?> 

我们选择方法1:“保存到软件数据库”,同时,选择模式3“网上发布到网站”的“使用自定义发布方式”,选择3“自定义分类标识”,将任务命名为“房地产”,将收藏任务命名为“保存并更新”。由于我们的教程刚刚开始,我们不会做深入的研究。
返回火车头主界面,在“房地产”任务上点击鼠标右键,选择“开始”完成采集。采集到的数据将自动发布到模式3中指向的网站的指定列(标识=3),并保存到“火车头安装目录/数据/序列号-任务名称/蜘蛛结果.mdb”数据库中。
哦,昨天有网友指出了我的一处错误。当时我花了3个小时写文案、录像,并为网站采集信息,中途好几次犯晕,内容写得太仓促、很粗糙,完全是凭感觉写的,让大家看得云里雾里。对不起,请原谅我!现更正如下:
这里,方法1和方法3是并列关系,可以同时选择,也可以只选择其中一个;如果没有发布模块,可以直接采集到本地软件数据库。“本地软件数据库”是微软 Access 数据库,我们可以打开该数据库来浏览和检查数据。
至于模式3,“火车头采集器伪原创”,我将在下面的教程中解释。我希望每个人都能耐心等待。
好了,本教程到此结束!下一课,再见!


评论关闭
IT干货网

微信公众号:IT虾米 (左侧二维码扫一扫)欢迎添加!