基于Snoopy的PHP近似完美获取网站编码的代码_php教程-查字典教程网
基于Snoopy的PHP近似完美获取网站编码的代码
基于Snoopy的PHP近似完美获取网站编码的代码
发布时间:2016-12-29 来源:查字典编辑
摘要:先要到网上下载 Snoopy.class.php;调用方法和 WebCrawl 类的完整代码见下文。

先要到网上下载Snoopy.class.php

调用方法:

复制代码 代码如下:

<?php

// Load the Snoopy HTTP client first, then the file holding the WebCrawl class
// (the class listing that follows this snippet).
require 'lib/Snoopy.class.php';
require 'lib/WebCrawl.class.php';

// Build a crawler for the target site and print the charset it reports.
$crawler = new WebCrawl('http://www.baidu.com');
$detectedCharset = $crawler->getCharset();

echo $detectedCharset;

?>

复制代码 代码如下:

<?php

/**
 * Fetches a page through Snoopy and reports its character encoding plus some
 * basic metadata (title, keywords, description, IPv4 address).
 *
 * The page is fetched at most once per instance. The charset is detected on
 * the raw response bytes and cached; afterwards the body is converted to
 * UTF-8 when mbstring knows the detected charset.
 */
class WebCrawl
{
    /** @var string URL given to the constructor. */
    private $url;

    /** @var Snoopy|null HTTP client holding the response; null until the first fetch. */
    private $request;

    /** @var string|false|null Charset detected on the raw body (false = unknown, null = not detected yet). */
    private $charset;

    /**
     * Lower-case charset names accepted from a <meta ... charset=...> declaration.
     *
     * NOTE(review): the list contains exact duplicates ('ibm01047', 'iso-2022-jp')
     * and the 'ibm01040'..'ibm01049' entries look like typos of 'ibm01140'..'ibm01149'.
     * The entries are kept unchanged because the property is public - confirm before editing.
     *
     * @var string[]
     */
    public $charset_arr = array(
        'gb2312', 'utf-8', 'big5', 'gbk', 'ascii', 'cp936',
        'ibm037', 'ibm437', 'ibm500', 'asmo-708', 'dos-720', 'ibm737',
        'ibm775', 'ibm850', 'ibm852', 'ibm855', 'ibm857', 'ibm00858',
        'ibm861', 'ibm860', 'dos-862', 'ibm863', 'ibm864', 'ibm865',
        'cp866', 'ibm869', 'ibm870', 'windows-874', 'cp875', 'shift_jis',
        'ks_c_5601-1987', 'ibm1026', 'ibm01047', 'ibm01047',
        'ibm01040', 'ibm01041', 'ibm01042', 'ibm01043', 'ibm01044',
        'ibm01045', 'ibm01046', 'ibm01047', 'ibm01048', 'ibm01049',
        'utf-16', 'unicodefffe',
        'windows-1250', 'windows-1251', 'windows-1252', 'windows-1253', 'windows-1254',
        'windows-1255', 'windows-1256', 'windows-1257', 'windows-1258',
        'johab', 'macintosh',
        'x-mac-japanese', 'x-mac-chinesetrad', 'x-mac-korean', 'x-mac-arabic',
        'x-mac-hebrew', 'x-mac-greek', 'x-mac-cyrillic', 'x-mac-chinesesimp',
        'x-mac-romanian', 'x-mac-ukrainian', 'x-mac-thai', 'x-mac-ce',
        'x-mac-icelandic', 'x-mac-turkish', 'x-mac-croatian',
        'x-chinese-cns', 'x-cp20001', 'x-chinese-eten', 'x-cp20003', 'x-cp20004', 'x-cp20005',
        'x-ia5', 'x-ia5-german', 'x-ia5-swedish', 'x-ia5-norwegian',
        'us-ascii', 'x-cp20261', 'x-cp20269',
        'ibm273', 'ibm277', 'ibm278', 'ibm280', 'ibm284', 'ibm285',
        'ibm290', 'ibm420', 'ibm423', 'ibm424',
        'x-ebcdic-koreanextended', 'ibm-thai', 'koi8-r',
        'ibm871', 'ibm880', 'ibm905', 'ibm00924',
        'x-cp20936', 'x-cp20949', 'cp1025', 'koi8-u',
        'iso-8859-1', 'iso-8859-2', 'iso-8859-3', 'iso-8859-4', 'iso-8859-5',
        'iso-8859-6', 'iso-8859-7', 'iso-8859-8', 'iso-8859-9',
        'iso-8859-13', 'iso-8859-15', 'x-europa', 'iso-8859-8-i',
        'iso-2022-jp', 'csiso2022jp', 'iso-2022-jp', 'iso-2022-kr',
        'x-cp50227', 'euc-jp', 'euc-cn', 'euc-kr', 'hz-gb-2312', 'gb18030',
        'x-iscii-de', 'x-iscii-be', 'x-iscii-ta', 'x-iscii-te', 'x-iscii-as',
        'x-iscii-or', 'x-iscii-ka', 'x-iscii-ma', 'x-iscii-gu', 'x-iscii-pa',
        'utf-7', 'utf-32', 'utf-32be'
    );

    /**
     * @param string $url Address of the page to inspect.
     */
    public function __construct($url)
    {
        $this->url = $url;
    }

    /**
     * Fetch the page on first use and report whether the response status is 200.
     *
     * On a successful first fetch the charset is detected on the raw bytes and
     * cached, then the body is converted to UTF-8 when possible. Later calls do
     * not fetch again; they only re-report the stored status.
     *
     * @param string $url Address to fetch.
     * @return bool True when the stored response status equals 200.
     */
    private function open($url)
    {
        if ($this->request === null) {
            $this->request = new Snoopy();
            $this->request->fetch($url);

            if ($this->request->status == 200) {
                // Detection must see the untouched bytes: lower-casing or converting
                // them first would corrupt multi-byte text and skew the result.
                $this->charset = $this->detectCharset();
                $this->request->results = $this->toUtf8($this->request->results, $this->charset);
            }
        }

        // Loose comparison on purpose: the status may be a numeric string.
        return $this->request->status == 200;
    }

    /**
     * Convert a response body to UTF-8 according to the detected charset.
     *
     * The body is returned unchanged when the charset is unknown, already
     * utf-8, or not supported by mbstring.
     *
     * @param string       $body    Raw response body.
     * @param string|false $charset Lower-case charset name, or false when unknown.
     * @return string
     */
    private function toUtf8($body, $charset)
    {
        if (!is_string($charset) || $charset === '' || $charset === 'utf-8') {
            return $body;
        }

        if ($charset === 'windows-1252') {
            // NOTE(review): this branch only decodes numeric entities; the bytes
            // themselves are not transcoded - confirm that this is intended.
            return $this->uni_decode($body);
        }

        if (!$this->isMbEncoding($charset)) {
            return $body;
        }

        $converted = mb_convert_encoding($body, 'UTF-8', $charset);

        return is_string($converted) ? $converted : $body;
    }

    /**
     * Tell whether mbstring knows the given charset under a name or an alias.
     *
     * @param string $charset Charset name; compared case-insensitively.
     * @return bool
     */
    private function isMbEncoding($charset)
    {
        foreach (mb_list_encodings() as $name) {
            if (strcasecmp($name, $charset) === 0) {
                return true;
            }
            foreach ((array) mb_encoding_aliases($name) as $alias) {
                if (strcasecmp($alias, $charset) === 0) {
                    return true;
                }
            }
        }

        return false;
    }

    /**
     * Collect the site's title, keywords, description and IPv4 address.
     *
     * @return array{title:string,keywords:string,desc:string,ip:string}
     *         Every entry is an empty string when the page could not be fetched
     *         or the corresponding piece of information was not found.
     */
    public function getWebinfo()
    {
        $info = array(
            'title'    => '',
            'keywords' => '',
            'desc'     => '',
            'ip'       => ''
        );

        if (!$this->open($this->url)) {
            return $info;
        }

        $html = $this->request->results;

        if (preg_match('/<title>([^>]*)<\/title>/si', $html, $titlematch)) {
            $info['title'] = strip_tags($titlematch[1]);
        }

        // Try <meta name=... content=...> first; fall back to the reversed
        // attribute order when neither keywords nor description turned up.
        $nameFirst    = '/<[\s]*meta[\s]*name="?([^>"]*)"?[\s]*content="?([^>"]*)"?[\s]*[\/]?[\s]*>/si';
        $contentFirst = '/<[\s]*meta[\s]*content="?([^>"]*)"?[\s]*name="?([^>"]*)"?[\s]*[\/]?[\s]*>/si';

        $metaTags = $this->collectMeta($nameFirst, $html, 1, 2);
        if (!isset($metaTags['keywords']) && !isset($metaTags['description'])) {
            $metaTags = $this->collectMeta($contentFirst, $html, 2, 1);
        }

        if (isset($metaTags['keywords'])) {
            $info['keywords'] = $metaTags['keywords'];
        }
        if (isset($metaTags['description'])) {
            $info['desc'] = $metaTags['description'];
        }

        // Resolve only the host part so that https URLs and URLs with a path work.
        $host = parse_url($this->url, PHP_URL_HOST);
        if (!is_string($host) || $host === '') {
            // No scheme in the URL: parse it again as if it were an http URL.
            $host = parse_url('http://' . $this->url, PHP_URL_HOST);
        }

        if (is_string($host) && $host !== '') {
            // gethostbyname() hands back its argument unchanged on failure,
            // so keep the result only if it really is a dotted IPv4 address.
            $ip = gethostbyname($host);
            if (ip2long($ip) !== false) {
                $info['ip'] = $ip;
            }
        }

        return $info;
    }

    /**
     * Run a meta-tag pattern and map lower-cased tag names to their content.
     *
     * @param string $pattern    Regex with one capture group for the name and one for the content.
     * @param string $html       Markup to search.
     * @param int    $nameGroup  Index of the capture group holding the name.
     * @param int    $valueGroup Index of the capture group holding the content.
     * @return array<string,string> Later tags overwrite earlier ones with the same name.
     */
    private function collectMeta($pattern, $html, $nameGroup, $valueGroup)
    {
        $tags = array();

        if (!preg_match_all($pattern, $html, $match)) {
            return $tags;
        }

        foreach ($match[$nameGroup] as $i => $name) {
            $tags[strtolower(trim($name))] = $match[$valueGroup][$i];
        }

        return $tags;
    }

    /**
     * Guess from the bytes whether a string is UTF-8 or GB2312.
     *
     * Used to double-check a page that declares gb2312: text containing
     * non-ASCII bytes that form valid UTF-8 is reported as utf-8, any other
     * non-ASCII text as gb2312.
     *
     * @param string $string Text to inspect.
     * @param mixed  $o      Unused; kept so existing callers keep working.
     * @return string 'utf-8', 'gb2312', or '' when the text is pure ASCII.
     */
    public function t($string, $o)
    {
        $string = (string) $string;

        // Pure ASCII carries no evidence either way.
        if (!preg_match('/[\x80-\xFF]/', $string)) {
            return '';
        }

        // With the u modifier an empty pattern matches only well-formed UTF-8.
        return preg_match('//u', $string) === 1 ? 'utf-8' : 'gb2312';
    }

    /**
     * Replace five-digit decimal character references (&#NNNNN;) with the
     * characters they denote.
     *
     * @param string $str  Text containing the references.
     * @param string $code Target encoding; anything other than 'utf-8' is
     *                     produced by converting the UTF-8 result with iconv().
     * @return string|false Decoded text; false when iconv() fails.
     */
    public function uni_decode($str, $code = 'utf-8')
    {
        // Decode each reference on its own, so quotes and line breaks in the
        // surrounding markup cannot break the decoding.
        $str = preg_replace_callback(
            '/&#(\d{5});/',
            function ($reference) {
                return html_entity_decode($reference[0], ENT_NOQUOTES, 'UTF-8');
            },
            $str
        );

        if ($code != 'utf-8') {
            $str = iconv('utf-8', $code, $str);
        }

        return $str;
    }

    /**
     * Report the charset of the page.
     *
     * @return string|false Lower-case charset name, or false when the page
     *                      could not be fetched or no charset was recognised.
     */
    public function getCharset()
    {
        if (!$this->open($this->url)) {
            return false;
        }

        // Detected once on the raw bytes inside open(); the stored body may
        // have been converted since, so detection must not be repeated on it.
        return $this->charset;
    }

    /**
     * Work out the charset of the stored response body.
     *
     * Order of precedence: the <meta> declaration in the markup (a gb2312
     * declaration must also pass the byte check in t()), then a charset found
     * in the response headers, then mb_detect_encoding().
     *
     * @return string|false Lower-case charset name, or false when nothing matched.
     */
    private function detectCharset()
    {
        $body = $this->request->results;

        // 1. Charset declared in the HTML itself.
        if (preg_match('/<meta.+?charset=[^\w]?([-\w]+)/i', $body, $temp)) {
            $declared = strtolower($temp[1]);
            if (in_array($declared, $this->charset_arr, true)) {
                if ($declared !== 'gb2312' || $this->t($body, $declared) === $declared) {
                    return $declared;
                }
            }
        }

        // 2. Charset announced in the response headers.
        if (!empty($this->request->headers)) {
            $hstr = strtolower(implode("|||", $this->request->headers));
            if (preg_match('/charset=[^\w]?([-\w]+)/is', $hstr, $lang)) {
                return $lang[1];
            }
        }

        // 3. Let mbstring guess from the bytes.
        $encode_arr = array("UTF-8", "GB2312", "GBK", "BIG5", "ASCII", "EUC-JP", "Shift_JIS", "CP936", "ISO-8859-1", "JIS", "eucjp-win", "sjis-win");
        $encoded = mb_detect_encoding($body, $encode_arr);

        return $encoded ? strtolower($encoded) : false;
    }
}

?>

相关阅读
推荐文章
猜你喜欢
附近的人在看
推荐阅读
拓展阅读
  • 大家都在看
  • 小编推荐
  • 猜你喜欢
  • 最新php教程学习
    热门php教程学习
    编程开发子分类