zl程序教程

您现在的位置是:首页 >  后端

当前栏目

php获取网页标题和内容函数(不包含html标签)

PHP网页HTML 函数 获取 内容 标签 包含
2023-06-13 09:15:17 时间
代码如下:

/**
 * Download a web page with cURL and extract its metadata and tag-free text.
 *
 * On a transport error the function prints "Curl error: ..." and returns the
 * default record with httpcode set to 505. When the response is not HTTP 200
 * or its Content-Type is not text/html, only httpcode and content_type are
 * filled in and the remaining fields keep their empty defaults.
 *
 * @param string $url Address of the page to download.
 *
 * @return array Record with the keys content_type, charset, title,
 *               description, keywords, body, httpcode and all. "body" and
 *               "all" hold the markup-free text of the <body> element and of
 *               the whole document, each passed through addslashes().
 */
function getPageContent($url)
{
    $pageinfo = array(
        'content_type' => '',
        'charset'      => '',
        'title'        => '',
        'description'  => '',
        'keywords'     => '',
        'body'         => '',
        'httpcode'     => 200,
        'all'          => '',
    );

    $ch = curl_init();
    curl_setopt($ch, CURLOPT_USERAGENT, 'Mozilla/4.0 (compatible; MSIE 5.01; Windows NT 5.0)');
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
    // NOTE(review): certificate verification is switched off, so HTTPS
    // responses are not authenticated. Kept as in the original so that pages
    // with broken certificates can still be fetched - confirm this is wanted.
    curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, 0);
    curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, 0);
    curl_setopt($ch, CURLOPT_TIMEOUT, 8);
    curl_setopt($ch, CURLOPT_FILETIME, 1);
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
    curl_setopt($ch, CURLOPT_URL, $url);

    // Read everything we need from the handle first so it can be closed once,
    // before any of the early returns below.
    $store       = curl_exec($ch);
    $error       = curl_error($ch);
    $httpCode    = curl_getinfo($ch, CURLINFO_HTTP_CODE);
    $contentType = (string) curl_getinfo($ch, CURLINFO_CONTENT_TYPE);
    curl_close($ch);

    if ($store === false || $error !== '') {
        $pageinfo['httpcode'] = 505; // gateway error
        echo 'Curl error: ' . $error . "\n";
        return $pageinfo;
    }

    $pageinfo['httpcode']     = $httpCode;
    $pageinfo['content_type'] = $contentType;
    if (intval($httpCode) !== 200 || !preg_match('@text/html@i', $contentType)) {
        return $pageinfo;
    }

    // Prefer the charset announced in the HTTP Content-Type header.
    if (preg_match('/charset=["\']?([^\s"\';]+)/i', $contentType, $matches)) {
        $pageinfo['charset'] = $matches[1];
    }

    // Drop markup that carries no readable text. Order matters: scripts go
    // first so that comment markers inside them do not confuse later steps.
    $noise = array(
        '@<script\b[^>]*>.*?</script>@si', // JavaScript blocks
        '@<link\s+[^>]+>@si',              // <link> tags
        '@<!--.*?-->@s',                   // HTML comments
        '@<style\b[^>]*>.*?</style>@si',   // style sheets
    );
    foreach ($noise as $pattern) {
        $cleaned = preg_replace($pattern, '', $store);
        if ($cleaned !== null) { // null signals a PCRE failure; keep the text
            $store = $cleaned;
        }
    }

    // Remove full-width (ideographic) spaces, U+3000 encoded as UTF-8.
    // ASCII spaces must stay: the <meta> patterns below depend on them.
    $store = str_replace("\xE3\x80\x80", '', $store);

    // Fall back to the charset declared in a <meta> tag.
    if ($pageinfo['charset'] === ''
        && preg_match('@<meta[^>]+charset=["\']?([\w\-]+)@i', $store, $matches)
    ) {
        $pageinfo['charset'] = $matches[1];
    }

    $pageinfo['description'] = _getPageMetaContent($store, 'description');
    $pageinfo['keywords']    = _getPageMetaContent($store, 'keywords');

    if (preg_match('@<title[^>]*>(.*?)</title>@si', $store, $matches)) {
        $pageinfo['title'] = trim($matches[1]);
    }

    $body = '';
    if (preg_match('@<body\b[^>]*>(.*?)</body>@si', $store, $matches)) {
        $body = $matches[1];
    }
    $pageinfo['body'] = addslashes((string) replaceHtmlAndJs($body));
    $pageinfo['all']  = addslashes((string) replaceHtmlAndJs($store));

    return $pageinfo;
}

/**
 * Return the content attribute of the first <meta name="$name" content="...">
 * tag in $html, trimmed and with double quotes removed, or "" when no such
 * tag is found. Only the name-before-content attribute order is recognised.
 *
 * @param string $html Markup to search.
 * @param string $name Value of the name attribute, matched case-insensitively.
 *
 * @return string
 */
function _getPageMetaContent($html, $name)
{
    $pattern = '@<meta\s+name\s*=\s*["\']?' . preg_quote($name, '@')
        . '["\']?\s+content\s*=\s*(?:"([^"]*)"|\'([^\']*)\'|([^\s"\'>]+))@i';

    if (!preg_match($pattern, $html, $found)) {
        return '';
    }

    // Exactly one of the three alternatives captured text; the others are
    // empty or absent, so joining the groups yields the attribute value.
    $value = implode('', array_slice($found, 1));

    return trim(str_replace('"', '', $value));
}



/**
 * Strip all HTML tags and JavaScript from a piece of markup.
 *
 * Besides tags and <script> blocks, every whitespace character and every
 * HTML entity (named or numeric) is removed, so the result is one unbroken
 * run of text.
 *
 * @param string $document Markup to clean; null is treated as "".
 *
 * @return string The remaining text, or "" for empty input.
 */
function replaceHtmlAndJs($document)
{
    $document = trim((string) $document);
    if ($document === '') {
        return $document;
    }

    // No "u" modifier: the input may not be UTF-8 (the caller detects the
    // charset but does not convert), and "u" would reject such bytes.
    $search = array(
        '@<script[^>]*?>.*?</script>@si',  // script elements with their code
        '@<[^<>]*>@',                      // any remaining tag
        '@\s+@',                           // whitespace
        '@&(?:#\d+|#x[0-9a-f]+|\w+);@i',   // named and numeric entities
    );

    $text = preg_replace($search, '', $document);
    if ($text === null) {
        // PCRE failed (e.g. backtrack limit); fall back to the built-in
        // stripper instead of silently handing null to the caller.
        return strip_tags($document);
    }

    return $text;
}




使用例子

代码如下:


// Fetch a page and dump the extracted record.
$a = getPageContent("http://www.ttphp.com");
print_r($a);