PHP 获取网页标题和内容的函数(不包含 HTML 标签)
2023-06-13 09:15:17 时间
/**
 * Extract the `content` attribute of a `<meta name="...">` tag.
 *
 * Quoted values (single or double quotes) are preferred; an unquoted value is
 * accepted as a fallback. Double quotes are stripped from the result, matching
 * what the caller has always returned for description/keywords.
 *
 * @param string $html Raw HTML to search.
 * @param string $name Value of the meta tag's `name` attribute (case-insensitive).
 * @return string Trimmed attribute value, or '' when the tag is not found.
 */
function _getPageMetaContent($html, $name)
{
    $prefix = '@<meta\s+name\s*=\s*["\']?' . preg_quote($name, '@') . '["\']?\s+content\s*=\s*';

    if (preg_match($prefix . '(["\'])(.*?)\1@is', $html, $m)) {
        // Quoted value: capture everything up to the matching closing quote,
        // so values containing "/" or ">" are no longer rejected.
        $value = $m[2];
    } elseif (preg_match($prefix . '([^\s"\'>]+?)(?=\s|/?>)@i', $html, $m)) {
        // Unquoted value: runs until whitespace or the end of the tag.
        $value = $m[1];
    } else {
        return '';
    }

    return trim(str_replace('"', '', $value));
}

/**
 * Fetch a web page and extract its title, meta information and plain text.
 *
 * The page is downloaded with cURL (8 second timeout, redirects followed).
 * Only responses with HTTP status 200 and a `text/html` content type are
 * parsed; for anything else the text fields of the result stay empty.
 *
 * Returned array keys:
 *  - content_type : Content-Type reported by cURL ('' if none)
 *  - charset      : charset from the Content-Type header, else from a <meta> tag
 *  - title        : trimmed contents of <title>
 *  - description  : content of <meta name="description">
 *  - keywords     : content of <meta name="keywords">
 *  - body         : <body> text with tags removed, passed through addslashes()
 *  - httpcode     : HTTP status code; 505 is used when cURL itself fails
 *  - all          : whole document with tags removed, passed through addslashes()
 *
 * No character-set conversion is performed; text is returned in the page's
 * own encoding.
 *
 * @param string $url Absolute URL of the page to fetch.
 * @return array Page information as described above.
 */
function getPageContent($url)
{
    $pageinfo = array(
        'content_type' => '',
        'charset'      => '',
        'title'        => '',
        'description'  => '',
        'keywords'     => '',
        'body'         => '',
        'httpcode'     => 200,
        'all'          => '',
    );

    $ch = curl_init();
    curl_setopt($ch, CURLOPT_USERAGENT, "Mozilla/4.0(compatible;MSIE5.01;WindowsNT5.0)");
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
    // NOTE(review): TLS verification is disabled, as in the original code, so
    // pages with broken certificates can still be fetched. This allows
    // man-in-the-middle tampering; enable verification if that matters.
    curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, 0);
    curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, 0);
    curl_setopt($ch, CURLOPT_TIMEOUT, 8);
    curl_setopt($ch, CURLOPT_FILETIME, 1);
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
    curl_setopt($ch, CURLOPT_URL, $url);

    $store = curl_exec($ch);
    $error = curl_error($ch);
    if ($store === false || $error !== '') {
        $pageinfo['httpcode'] = 505; // gateway error
        echo "Curlerror:" . $error . "\n";
        curl_close($ch);
        return $pageinfo;
    }

    // Read everything needed from the handle, then release it before any
    // return path so the handle is never leaked.
    $pageinfo['httpcode'] = curl_getinfo($ch, CURLINFO_HTTP_CODE);
    $contentType = (string) curl_getinfo($ch, CURLINFO_CONTENT_TYPE);
    $pageinfo['content_type'] = $contentType;
    curl_close($ch);

    if ((int) $pageinfo['httpcode'] !== 200 || stripos($contentType, 'text/html') === false) {
        return $pageinfo;
    }

    // Prefer the charset announced in the HTTP header.
    if (preg_match('/charset=["\']?([^\s;"\']+)/i', $contentType, $matches)) {
        $pageinfo['charset'] = trim($matches[1]);
    }

    // Strip markup that carries no readable text. Patterns run in order.
    $stripPatterns = array(
        '~<script\b[^>]*>.*?</script>~si', // scripts
        '~<link\s+[^>]+>~i',               // <link> tags
        '~<!--.*?-->~s',                   // HTML comments
        '~<style\b[^>]*>.*?</style>~si',   // style sheets
    );
    $cleaned = preg_replace($stripPatterns, '', $store);
    if ($cleaned !== null) {
        // preg_replace() yields null on a PCRE failure (e.g. backtrack limit);
        // in that case keep the unstripped document rather than losing it.
        $store = $cleaned;
    }

    // Remove full-width (CJK) spaces. Assumes a UTF-8 page, where U+3000 is
    // encoded as E3 80 80 — TODO confirm for pages in other encodings.
    $store = str_replace("\xE3\x80\x80", '', $store);

    // Fall back to the charset declared in a <meta> tag; covers both
    // <meta charset="utf-8"> and the http-equiv Content-Type form.
    if ($pageinfo['charset'] === '') {
        if (preg_match('@<meta[^>]+charset=["\']?([\w\-]+)@i', $store, $matches)) {
            $pageinfo['charset'] = trim($matches[1]);
        }
    }

    $pageinfo['description'] = _getPageMetaContent($store, 'description');
    $pageinfo['keywords']    = _getPageMetaContent($store, 'keywords');

    if (preg_match('~<title\b[^>]*>(.*?)</title>~si', $store, $matches)) {
        $pageinfo['title'] = trim($matches[1]);
    }

    $bodyHtml = '';
    if (preg_match('~<body\b[^>]*>(.*?)</body>~si', $store, $matches)) {
        $bodyHtml = $matches[1];
    }

    // addslashes() is kept for backward compatibility with existing callers.
    $pageinfo['body'] = addslashes((string) replaceHtmlAndJs($bodyHtml));
    $pageinfo['all']  = addslashes((string) replaceHtmlAndJs($store));

    return $pageinfo;
}
/**
 * Remove all JavaScript blocks and HTML tags from a document.
 *
 * Processing order:
 *  1. <script>...</script> blocks are dropped together with their contents.
 *  2. Every remaining HTML tag is removed (the text between tags is kept).
 *  3. All whitespace is removed, so words separated only by whitespace are
 *     joined together.
 *  4. Named HTML entities such as &nbsp; are removed (numeric entities such
 *     as &#160; are left untouched).
 *
 * @param string|null $document HTML fragment or document; null is treated as ''.
 * @return string The stripped text, or '' if the input is empty or the
 *                regular-expression engine fails.
 */
function replaceHtmlAndJs($document)
{
    $document = trim((string) $document);
    if ($document === '') {
        return $document;
    }

    $search = array(
        '~<script[^>]*?>.*?</script>~si', // JavaScript blocks
        '~<[/!]*?[^<>]*?>~si',            // HTML tags
        '~\s+~',                          // whitespace only; a literal "+" is kept
        '~&(\w+);~i',                     // named HTML entities
    );

    $result = preg_replace($search, '', $document);

    // preg_replace() returns null on a PCRE error; report that as an empty
    // string instead of hiding it behind the @ operator.
    return $result === null ? '' : $result;
}
使用例子
// Usage example: fetch a page and dump the extracted information.
// The URL must be a quoted string and should include the scheme.
$a = getPageContent("http://www.ttphp.com");
print_r($a);
相关文章
- php 动静分离原理,nginx动静分离的好处
- php curl header设置参数[通俗易懂]
- HTML添加背景图片_html背景图片铺满网页
- qq登录钓鱼php网页,PHP+JS模仿登录钓鱼「建议收藏」
- php 伪静态-服务器伪静态和WordPress固定连接设置
- MySQL求和算法在PHP中的应用(mysql求和php)
- 测试Linux下PHP应用的实现(linux测试php)
- PHP Fatal error: Uncaught Error: Call to undefined function pcntl_fork().. 开启php pcntl扩展实现多进程详解编程语言
- php webman和tp并发能力对比详解编程语言
- PHP中实现Redis队列的挑战与实践(php队列redis)
- 技巧Linux新手必知的PHP换行技巧(linux换行php)
- Linux上的PHP扩展开发(php扩展linux)
- PHP文件包含漏洞总结
- PHP快速获取MySQL数据库信息(php获取mysql信息)
- PHP操作MySQL:删除字段(php删除mysql字段)
- php绘制图片验证码
- PHP调用MSSQL数据库:开启新的数据库探索(用php调用mssql)
- PHP读取MSSQL数据库:实现简单快捷的网页查询(php读取mssql网页)
- PHP环境搭建:从编译MSSQL说起(php 编译mssql)
- 用PHP连接MSSQL数据库:轻松搞定!(mssql连接类php)
- PHP生成便于打印的网页
- PHP正则表达式的效率回溯与固化分组
- php删除页面记录同时刷新页面删除条件用GET方式获得
- 配置php.ini实现PHP文件上传功能
- PHP 提示 Warning: phpinfo() has been disabled 函数禁用的解决方法
- php中file_get_content和curl以及fopen效率分析