java asp分析各种搜索引擎的关键字,自动识别url 中关键字的编码
java asp分析各种搜索引擎的关键字,自动识别url 中关键字的编码
发布时间:2016-12-29 来源:查字典编辑
摘要:所以必须要通过编码后的关键字,例如“解析关键字编码”在google里面输入搜索,得到编码后的“%E8%A7%A3%E6%9E%90%E5%8...

搜索引擎结果页的URL中,关键字是以编码后的形式出现的,所以必须通过编码后的关键字来还原原始关键字。例如在google里面输入“解析关键字编码”进行搜索,URL中得到的是编码后的“%E8%A7%A3%E6%9E%90%E5%85%B3%E9%94%AE%E5%AD%97%E7%BC%96%E7%A0%81”。解析步骤如下:

1.从以上地址中解析出关键字部分。

2.通过编码后的关键字获取编码时的编码名称(如:gbk,utf-8等等)

3.用URLDecoder.decode(keywords, encodeCode)来解码得到对应的关键字。

以下是java代码的实现:

复制代码 代码如下:

package test;

import java.io.UnsupportedEncodingException;

import java.net.URLDecoder;

import java.util.regex.Matcher;

import java.util.regex.Pattern;

/**
 * Extracts the search keyword from a search-engine result URL and decodes it,
 * guessing whether the keyword was percent-encoded as UTF-8 or as GBK.
 *
 * <p>The charset guess works on the raw byte values: if the percent-decoded bytes
 * form a well-shaped UTF-8 sequence the keyword is decoded as UTF-8, otherwise GBK
 * is assumed.
 */
public class ParseURLKeyword {

    /**
     * Matches "engine host ... query-parameter=" for the supported engines; group 1 is
     * the still percent-encoded keyword. The parameter name must directly follow
     * '?', '&amp;' or '#', so that e.g. "aq=" or "oq=" is not mistaken for "q=".
     */
    private static final Pattern KEYWORD_PATTERN = Pattern.compile(
            "(?:yahoo.+?[?&#]p="
            + "|openfind.+?[?&#]query="
            + "|google.+?[?&#]q="
            + "|lycos.+?[?&#]query="
            + "|onseek.+?[?&#]keyword="
            + "|search.tom.+?[?&#]word="
            + "|search.qq.com.+?[?&#]word="
            + "|zhongsou.com.+?[?&#]word="
            + "|search.msn.com.+?[?&#]q="
            + "|yisou.com.+?[?&#]p="
            + "|sina.+?[?&#]word="
            + "|sina.+?[?&#]query="
            + "|sina.+?[?&#]_searchkey="
            + "|sohu.+?[?&#]word="
            + "|sohu.+?[?&#]key_word="
            + "|sohu.+?[?&#]query="
            + "|163.+?[?&#]q="
            + "|baidu.+?[?&#]wd="
            + "|soso.+?[?&#]w="
            + "|3721.com.+?[?&#]p="
            + "|Alltheweb.+?[?&#]q=)([^&]*)");

    /**
     * Matches a string in which every char holds one byte value (0x00-0xFF) and the
     * whole sequence has the shape of UTF-8: an ASCII byte, or a lead byte followed by
     * the right number of continuation bytes (0x80-0xBF).
     */
    private static final Pattern UTF8_BYTES_PATTERN = Pattern.compile(
            "^(?:[\\x00-\\x7f]"
            + "|[\\xfc-\\xff][\\x80-\\xbf]{5}"
            + "|[\\xf8-\\xfb][\\x80-\\xbf]{4}"
            + "|[\\xf0-\\xf7][\\x80-\\xbf]{3}"
            + "|[\\xe0-\\xef][\\x80-\\xbf]{2}"
            + "|[\\xc0-\\xdf][\\x80-\\xbf])+$");

    /**
     * Demo entry point: prints the decoded keyword of a few sample URLs, each followed
     * by an empty line.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        String[] sampleUrls = {
            "http://www.google.co.kr/search?hl=en&q=%ED%95%9C%EA%B5%AD%EC%96%B4+&btnG=Google+Search&aq=f&oq=",
            "http://www.google.cn/search?q=%E6%8F%90%E5%8F%96+%E6%90%9C%E7%B4%A2%E5%BC%95%E6%93%8E+%E5%85%B3%E9%94%AE%E5%AD%97&hl=zh-CN&newwindow=1&sa=2",
            "http://www.google.com.tw/search?hl=zh-CN&q=%E6%B9%98%E9%8B%BC%E4%B8%AD%E5%9C%8B%E9%A6%99%E7%85%99&btnG=Google+%E6%90%9C%E7%B4%A2&aq=f&oq=",
            "http://www.baidu.com/s?wd=%D6%D0%87%F8%D3%D0%BE%80%D8%9F%C8%CE%B9%AB%CB%BE",
            "http://www.baidu.com/s?wd=%C6%F3%D2%B5%CD%C6%B9%E3"
        };
        for (String url : sampleUrls) {
            System.out.println(ParseURLKeyword.getKeyword(url));
            System.out.println("");
        }
    }

    /**
     * Returns the decoded search keyword contained in {@code url}.
     *
     * <p>Only the first recognised engine/parameter pair in the URL is used.
     *
     * @param url a search-engine result URL; may be {@code null}
     * @return the decoded keyword, or an empty string when {@code url} is {@code null},
     *         no supported engine/parameter is found, the keyword is empty, the keyword
     *         contains a malformed percent escape, or the charset is unavailable
     */
    public static String getKeyword(String url) {
        if (url == null) {
            return "";
        }

        Matcher keywordMat = KEYWORD_PATTERN.matcher(url);
        if (!keywordMat.find()) {
            return "";
        }

        // Take the captured group directly; the text before the match (scheme, "www.")
        // must not leak into the keyword.
        String encodedKeyword = keywordMat.group(1);
        if (encodedKeyword.isEmpty()) {
            return "";
        }

        // One char per raw byte, so the byte pattern can be inspected with a regex.
        String rawBytes = ParseURLKeyword.unescape(encodedKeyword);
        String charsetName = UTF8_BYTES_PATTERN.matcher(rawBytes).matches() ? "utf-8" : "gbk";

        try {
            return URLDecoder.decode(encodedKeyword, charsetName);
        } catch (UnsupportedEncodingException e) {
            // The runtime lacks the guessed charset; report "no keyword".
            return "";
        } catch (IllegalArgumentException e) {
            // URLDecoder rejects malformed escapes such as a trailing '%'.
            return "";
        }
    }

    /**
     * Decodes {@code %XX} and {@code %uXXXX} escapes the way JavaScript's
     * {@code unescape} does: each {@code %XX} becomes the single char with that code
     * (0x00-0xFF) and each {@code %uXXXX} becomes that UTF-16 code unit. No charset
     * decoding is performed and {@code '+'} is left untouched.
     *
     * <p>A {@code '%'} that is not followed by a complete, valid hex escape is copied
     * through unchanged instead of raising an exception.
     *
     * @param src the escaped text; must not be {@code null}
     * @return the unescaped text
     */
    public static String unescape(String src) {
        int length = src.length();
        StringBuilder out = new StringBuilder(length);
        int i = 0;
        while (i < length) {
            char current = src.charAt(i);
            if (current == '%') {
                boolean unicodeForm = i + 1 < length && src.charAt(i + 1) == 'u';
                int digitsStart = unicodeForm ? i + 2 : i + 1;
                int digitCount = unicodeForm ? 4 : 2;
                int code = parseHex(src, digitsStart, digitCount);
                if (code >= 0) {
                    out.append((char) code);
                    i = digitsStart + digitCount;
                    continue;
                }
            }
            out.append(current);
            i++;
        }
        return out.toString();
    }

    /**
     * Parses exactly {@code digitCount} hex digits of {@code text} starting at
     * {@code start}; returns -1 when the digits are missing or not hexadecimal.
     */
    private static int parseHex(String text, int start, int digitCount) {
        if (start + digitCount > text.length()) {
            return -1;
        }
        int value = 0;
        for (int k = start; k < start + digitCount; k++) {
            int digit = Character.digit(text.charAt(k), 16);
            if (digit < 0) {
                return -1;
            }
            value = (value << 4) | digit;
        }
        return value;
    }
}

以下是Asp的实现代码:

复制代码 代码如下:

' DecodeURI: decodes a percent-encoded string, guessing whether the encoded
' bytes are UTF-8 or GBK.
'   s       - the percent-encoded text (e.g. a keyword taken from a URL).
'   returns - the decoded text.
' The bytes are first expanded with UnEscape (one character per byte), tested
' against a UTF-8 byte-shape pattern, then re-read through an ADODB.Stream
' using the detected charset.
Function DecodeURI(s)

    ' Work on a local copy: VBScript passes arguments ByRef by default, so
    ' assigning to s would overwrite the caller's variable.
    Dim raw
    raw = UnEscape(s)

    Dim reg, cs
    cs = "GBK"

    Set reg = New RegExp
    ' ASCII byte, or a UTF-8 lead byte followed by the matching number of
    ' continuation bytes (\x80-\xbf). The \x escapes are required: without the
    ' backslashes the classes match literal letters and digits instead of bytes.
    reg.Pattern = "^(?:[\x00-\x7f]|[\xfc-\xff][\x80-\xbf]{5}|[\xf8-\xfb][\x80-\xbf]{4}|[\xf0-\xf7][\x80-\xbf]{3}|[\xe0-\xef][\x80-\xbf]{2}|[\xc0-\xdf][\x80-\xbf])+$"
    If reg.Test(raw) Then cs = "UTF-8"
    Set reg = Nothing

    Dim sm
    Set sm = CreateObject("ADODB.Stream")
    With sm
        .Type = 2                   ' text stream
        .Mode = 3                   ' read/write
        .Open
        .CharSet = "iso-8859-1"     ' write each character as a single byte
        .WriteText raw
        .Position = 0
        .CharSet = cs               ' re-read the same bytes in the detected charset
        DecodeURI = .ReadText(-1)   ' -1 reads the whole stream
        .Close
    End With
    Set sm = Nothing

End Function

' Demo 1: percent-encoded bytes that are not valid UTF-8 (presumably GBK).
Call Response.Write(DecodeURI("%B8%A7%CB%B3%C7%E0%CB%C9%D2%A9%D2%B5"))

' Demo 2: percent-encoded bytes that form a valid UTF-8 sequence.
Call Response.Write(DecodeURI("%E6%8A%9A%E9%A1%BA%E9%9D%92%E6%9D%BE%E8%8D%AF%E4%B8%9A"))

推荐文章
猜你喜欢
附近的人在看
推荐阅读
拓展阅读
相关阅读
网友关注
最新其他综合学习
热门其他综合学习
编程开发子分类