zl程序教程

您现在的位置是:首页 >  后端

当前栏目

PHP 实现的一个很好用的 HTML 解析器类,可用于采集数据

PHP HTML 数据 实现 一个 用于 采集 解析器
2023-06-13 09:15:05 时间
复制代码 代码如下:

<?php  
// Switch libxml to internal error handling so that parse problems in
// real-world (often malformed) HTML are buffered instead of being raised as
// PHP warnings. The previous setting is kept in $oldSetting; nothing in the
// visible code restores it afterwards.
 $oldSetting=libxml_use_internal_errors(true);   
// Drop any libxml errors that are already sitting in the buffer.
libxml_clear_errors();  
/**
 *
 * -+-----------------------------------
 *  | PHP5 Framework - 2011
 *  | WebSite: www.iblue.cc
 *  | E-mail: mejinke@gmail.com
 *  | Date: 2012-10-12
 * -+-----------------------------------
 *
 * @desc   HTML parser. Wraps a DOMXPath together with the absolute XPath of
 *         one node, so that a document can be queried with chained find()
 *         calls and node content read through text(), getAttribute() or
 *         magic properties.
 * @author jingke
 */
class XF_HtmlDom
{
    /** @var DOMXPath|null XPath evaluator of the loaded document; null when no document is available. */
    private $_xpath = null;

    /** @var string Absolute XPath of the node this object stands for; "" means "the whole document". */
    private $_nodePath = "";

    /**
     * @param DOMXPath|null $xpath    XPath evaluator to query against.
     * @param string        $nodePath Absolute XPath of the wrapped node ("" for the document itself).
     */
    public function __construct($xpath = null, $nodePath = "")
    {
        $this->_xpath = $xpath;
        $this->_nodePath = $nodePath;
    }

    /**
     * Loads an HTML document from a local path / stream or from an http(s) URL.
     *
     * http:// and https:// URLs are fetched with cURL; anything else is read
     * with file_get_contents(). Only the scheme prefix decides which branch is
     * taken, so a local path that merely contains "http" is still read as a file.
     *
     * @param string $url Local path, stream URL, or http(s) URL.
     *
     * @return bool True when a document was loaded, false when the content
     *              could not be fetched or parsed (the object is then empty
     *              and find() yields no nodes).
     */
    public function loadHtml($url)
    {
        ini_set("user_agent", "Mozilla/5.0(Linux;U;Android2.1;en-us;NexusOneBuild/ERD62)AppleWebKit/530.17(KHTML,likeGecko)Version/4.0MobileSafari/530.17?Nexus");

        $isRemote = stripos($url, "http://") === 0 || stripos($url, "https://") === 0;
        $content = $isRemote ? $this->_fetchRemote($url) : file_get_contents($url);

        // Both fetchers return false on failure; an empty body has nothing to parse.
        if ($content === false || $content === "") {
            $this->_xpath = null;
            return false;
        }

        // Buffer libxml's complaints about sloppy markup locally, then put the
        // caller's error-handling mode back the way it was.
        $previous = libxml_use_internal_errors(true);
        $html = new DOMDocument();
        $loaded = $html->loadHTML($content);
        libxml_clear_errors();
        libxml_use_internal_errors($previous);

        if ($loaded === false) {
            $this->_xpath = null;
            return false;
        }

        $this->_xpath = new DOMXPath($html);
        return true;
    }

    /**
     * Finds nodes below the wrapped node.
     *
     * On a document-level object (empty node path) the query is prefixed with
     * "//" and therefore matches anywhere in the document; on a node-level
     * object it is prefixed with "<node path>/" and matches children of that node.
     * The stored node path is not modified, so find() can be called repeatedly
     * on the same object.
     *
     * @param string   $query XPath fragment, e.g. 'td[@id="baidu"]/a'.
     * @param int|null $index Omit (or pass null) to get every match; pass a
     *                        zero-based index to get a single match.
     *
     * @return XF_HtmlDom[]|XF_HtmlDom Array of wrappers when no index is given;
     *                                 otherwise one wrapper. When the indexed
     *                                 node does not exist an empty wrapper is
     *                                 returned whose text()/getAttribute()
     *                                 return false.
     */
    public function find($query, $index = null)
    {
        // Loose "== null" alone would also swallow index 0; the is_numeric()
        // guard keeps 0 meaning "first match".
        $wantsAll = ($index == null && !is_numeric($index));

        $nodes = false;
        if ($this->_xpath !== null) {
            $prefix = ((string) $this->_nodePath === "") ? "//" : $this->_nodePath . "/";
            $nodes = $this->_xpath->query($prefix . $query);
        }

        if ($wantsAll) {
            $found = array();
            if ($nodes !== false) {
                foreach ($nodes as $node) {
                    $found[] = new XF_HtmlDom($this->_xpath, $node->getNodePath());
                }
            }
            return $found;
        }

        $node = ($nodes === false) ? null : $nodes->item((int) $index);
        if ($node === null) {
            // No such node: hand back a wrapper with no document so that
            // chained calls degrade to false / empty results instead of a fatal error.
            return new XF_HtmlDom(null, "");
        }

        return new XF_HtmlDom($this->_xpath, $node->getNodePath());
    }

    /**
     * Gets the text content of the wrapped node.
     *
     * @return string|false Text content, or false when this object does not
     *                      refer to an existing node.
     */
    public function text()
    {
        $node = $this->_currentNode();

        return ($node === null) ? false : $node->textContent;
    }

    /**
     * Gets an attribute value of the wrapped node.
     *
     * @param string $name Attribute name.
     *
     * @return string|false Attribute value, or false when this object does not
     *                      refer to an existing element node.
     */
    public function getAttribute($name)
    {
        $node = $this->_currentNode();

        // Only element nodes carry attributes (a query may resolve to a text node).
        return ($node instanceof DOMElement) ? $node->getAttribute($name) : false;
    }

    /**
     * Magic property access: "innertext" maps to text(), every other name is
     * read as an attribute of the wrapped node.
     *
     * @param string $name Property name.
     *
     * @return string|false
     */
    public function __get($name)
    {
        if ($name == "innertext") {
            return $this->text();
        }

        return $this->getAttribute($name);
    }

    /**
     * Downloads a URL with cURL, following redirects and sending the URL
     * itself as referer.
     *
     * @param string $url http(s) URL.
     *
     * @return string|false Response body, or false on failure.
     */
    private function _fetchRemote($url)
    {
        $ch = curl_init();
        if ($ch === false) {
            return false;
        }

        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_HEADER, false);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
        curl_setopt($ch, CURLOPT_REFERER, $url);
        curl_setopt($ch, CURLOPT_USERAGENT, "Mozilla/5.0(WindowsNT5.1;rv:6.0)Gecko/20100101Firefox/6.0");
        curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);

        $body = curl_exec($ch);
        curl_close($ch);

        return $body;
    }

    /**
     * Resolves the stored node path to its first matching DOM node.
     *
     * @return DOMNode|null Null when no document or node path is set, or when
     *                      the path matches nothing.
     */
    private function _currentNode()
    {
        if ($this->_xpath === null || (string) $this->_nodePath === "") {
            return null;
        }

        $nodes = $this->_xpath->query($this->_nodePath);
        if ($nodes === false) {
            return null;
        }

        return $nodes->item(0);
    }
}
// Usage example: download the page, take the first <a> inside the cell with
// id="baidu", and print its text content.
$xp = new XF_HtmlDom();
$xp->loadHtml("http://www.aizhan.com/siteall/www.opendir.cn/");
// The XPath literal is single-quoted so the double quotes around the
// attribute value no longer terminate the PHP string.
$rows = $xp->find('td[@id="baidu"]/a', 0)->innertext;
print_r($rows);