分享下页面关键字抓取www.icbase.com站点代码(带asp.net参数的)
分享下页面关键字抓取www.icbase.com站点代码(带asp.net参数的)
发布时间:2016-12-29 来源:查字典编辑
摘要:复制代码代码如下:

复制代码 代码如下:

<?php

/**

* HOST: www.icbase.com

*/

//set_time_limit(0);

// base function

/**
 * Perform an HTTP GET request with cURL and return the response body.
 *
 * @param string       $url     Target URL.
 * @param array|string $data    Query parameters; an array is encoded with
 *                              http_build_query(), a string is appended as-is.
 * @param array        $header  Extra request header lines.
 * @param int          $timeout Connect timeout in seconds (CURLOPT_CONNECTTIMEOUT only).
 * @param int          $port    Remote port passed to CURLOPT_PORT.
 * @param string       $reffer  Referer header value; not sent when empty.
 * @param string       $proxy   Proxy host; no proxy is used when empty.
 * @return array 'result' holds the body (or false on failure); 'error' is set
 *               only when cURL reported an error.
 */
function curl_get($url, $data = array(), $header = array(), $timeout = 15, $port = 80, $reffer = '', $proxy = '')
{
    $ch = curl_init();

    if (!empty($data)) {
        $data = is_array($data) ? http_build_query($data) : $data;
        // strpos() returns 0 for a leading '?', which is falsy; compare with false explicitly.
        $url .= (strpos($url, '?') !== false ? '&' : '?') . $data;
    }

    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, $timeout);
    curl_setopt($ch, CURLOPT_POST, 0);
    curl_setopt($ch, CURLOPT_PORT, $port);
    curl_setopt($ch, CURLOPT_HTTPHEADER, $header);
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1); // follow redirects and return the final page

    if ($reffer) {
        curl_setopt($ch, CURLOPT_REFERER, $reffer);
    }

    if ($proxy) {
        curl_setopt($ch, CURLOPT_PROXY, $proxy);
        curl_setopt($ch, CURLOPT_PROXYPORT, 1723);
        // NOTE(review): proxy credentials are hard-coded here; move them to configuration.
        curl_setopt($ch, CURLOPT_PROXYUSERPWD, "andhm001:andhm123");
    }

    $result = array();
    $result['result'] = curl_exec($ch);

    if (0 != curl_errno($ch)) {
        // The message prefix ends with a real newline (the source had lost the backslash).
        $result['error'] = "Error:\n" . curl_error($ch);
    }

    curl_close($ch);

    return $result;
}

复制代码 代码如下:

/**
 * Perform an HTTP POST request with cURL and return the response body.
 *
 * @param string       $url     Target URL.
 * @param array|string $data    POST payload handed directly to CURLOPT_POSTFIELDS.
 * @param array        $header  Extra request header lines; only applied when non-empty.
 * @param int          $timeout Connect timeout in seconds (CURLOPT_CONNECTTIMEOUT only).
 * @param int          $port    Accepted for signature compatibility; not applied to the handle.
 * @return array 'result' holds the body (or false on failure); 'error' is set
 *               only when cURL reported an error.
 */
function curl_post($url, $data = array(), $header = array(), $timeout = 5, $port = 80)
{
    $ch = curl_init();

    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, $timeout);

    if (!empty($header)) {
        curl_setopt($ch, CURLOPT_HTTPHEADER, $header);
    }

    curl_setopt($ch, CURLOPT_POST, 1);
    curl_setopt($ch, CURLOPT_POSTFIELDS, $data);

    $result = array();
    $result['result'] = curl_exec($ch);

    if (0 != curl_errno($ch)) {
        // The message prefix ends with a real newline (the source had lost the backslash).
        $result['error'] = "Error:\n" . curl_error($ch);
    }

    curl_close($ch);

    return $result;
}

/**
 * Fetch the HTML source of one search-result list page.
 *
 * Page 1 is fetched with GET; the hidden ASP.NET form fields found in that
 * response (__VIEWSTATE, __PREVIOUSPAGE, __EVENTVALIDATION) are remembered per
 * keyword and posted back to request any later page.
 *
 * @param string $keywords Search keyword.
 * @param int    $page     Page number; 0 is treated as 1, negative values are rejected.
 * @return boolean|string  Page HTML, or false on a negative page, a transport
 *                         error, a missing hidden field on page 1, or a request
 *                         for page > 1 before page 1 was fetched for this keyword.
 */
function getListHtml($keywords, $page = 1)
{
    // Hidden form fields captured from page 1, keyed by keyword then field name.
    static $formState = array();

    if ($page < 0) {
        return false;
    }

    $page = $page == 0 ? 1 : intval($page);

    if ($page == 1) {
        $response = curl_get('http://www.icbase.com/ProResult.aspx', array('ProKey' => $keywords));

        if (isset($response['error']) || !is_string($response['result'])) {
            return false;
        }

        $html = $response['result'];

        // Collect the ASP.NET postback fields; all three are required for paging.
        $captured = array();
        foreach (array('__VIEWSTATE', '__PREVIOUSPAGE', '__EVENTVALIDATION') as $field) {
            $pattern = '/<input\s+type="hidden"\s+name="' . $field . '"\s+id="' . $field
                . '"\s+value="(.[^"]+)"/isU';

            if (!preg_match($pattern, $html, $matches)) {
                return false;
            }

            $captured[$field] = $matches[1];
        }

        // Refresh the stored state on every page-1 fetch so repeated searches keep working.
        $formState[$keywords] = $captured;

        // Still publish the constants the first time, as earlier versions of this function did.
        foreach ($captured as $field => $value) {
            if (!defined($field)) {
                define($field, $value);
            }
        }

        return $html;
    }

    if (!isset($formState[$keywords])) {
        // Page 1 has not been fetched for this keyword, so there is nothing to post back.
        return false;
    }

    $data = array(
        '__EVENTTARGET'     => 'pager',
        '__EVENTARGUMENT'   => $page,
        '__VIEWSTATE'       => $formState[$keywords]['__VIEWSTATE'],
        '__PREVIOUSPAGE'    => $formState[$keywords]['__PREVIOUSPAGE'],
        '__EVENTVALIDATION' => $formState[$keywords]['__EVENTVALIDATION'],
    );

    // Encode the keyword the same way curl_get() does for page 1.
    $postUrl = 'http://www.icbase.com/ProResult.aspx?' . http_build_query(array('ProKey' => $keywords));
    $response = curl_post($postUrl, $data);

    if (isset($response['error'])) {
        return false;
    }

    return $response['result'];
}

/**
 * Extract the detail-page link URLs from a list page.
 *
 * Matches anchors of the form <a href='...' target="_blank"> that are
 * immediately followed (whitespace aside) by a self-closed <img ... /> tag.
 *
 * @param string $html List page HTML source.
 * @return array Matched href values in document order; empty when nothing matches
 *               or when $html is not a non-empty string.
 */
function getListHref($html)
{
    if (!is_string($html) || $html === '') {
        return array();
    }

    // Quotes and the closing "/>" slash are escaped; the source had lost these backslashes.
    $pattern = '/<a\s+href=\'(.[^\']+)\'\s+target="_blank"\s*>[\s\n]*<img.+[^>]\/>/isU';

    if (preg_match_all($pattern, $html, $matches)) {
        return $matches[1];
    }

    // No matching links.
    return array();
}

/**
 * Read the page number targeted by the "next page" link of the pager.
 *
 * Looks inside <div id="Pager" ...> for an anchor whose href is
 * javascript:__doPostBack('Pager','N') and whose link text is ">".
 *
 * @param string $html List page HTML source.
 * @return number Next page number, or -1 when no such link is found.
 */
function getListNextPage($html)
{
    if (!is_string($html) || $html === '') {
        return -1;
    }

    // NOTE(review): the link text is accepted both as a literal ">" and as "&gt;",
    // because the raw markup of the site could not be confirmed - verify against a live page.
    $pattern = '/<div\s+id="Pager".+[^>]>.+<a\s+href="javascript:__doPostBack\(\'Pager\',\'(\d+)\'\)">(?:>|&gt;)<\/a>/isU';

    if (preg_match($pattern, $html, $matches)) {
        return intval($matches[1]);
    }

    return -1;
}

/**
 * Collect the detail-page hrefs from every list page of a search.
 *
 * Starts at page 1 and follows the pager's "next" link until there is none.
 *
 * @param string $keywords Search keyword.
 * @return boolean|array false when $keywords is empty; otherwise the list of
 *                       hrefs found (empty when the first page yields nothing).
 */
function getListHrefAll($keywords)
{
    if (empty($keywords)) {
        return false;
    }

    $html = getListHtml($keywords);

    if ($html === false) {
        // First page could not be fetched: treated as "no results".
        return array();
    }

    $hrefList = getListHref($html);

    if (empty($hrefList)) {
        // No results.
        return array();
    }

    // Pages already requested; guards against a pager that keeps pointing at
    // the same page, which would otherwise loop forever.
    $visited = array(1 => true);
    $nextPage = getListNextPage($html);

    while ($nextPage > 0 && !isset($visited[$nextPage])) {
        $visited[$nextPage] = true;

        $html = getListHtml($keywords, $nextPage);

        if ($html === false) {
            // Stop paging on a failed fetch and keep what was collected so far.
            break;
        }

        $hrefList = array_merge($hrefList, getListHref($html));
        $nextPage = getListNextPage($html);
    }

    return $hrefList;
}

/**
 * Parse the product details out of a detail page.
 *
 * @param string $url    Either a site-relative URL of the detail page or the
 *                       already-downloaded HTML source, depending on $is_url.
 * @param int    $is_url 1: $url is a URL to download; 0: $url is HTML to parse directly.
 * @return boolean|array false when $url is empty or the download fails;
 *                       an empty array when the part number cannot be found;
 *                       otherwise an array of detail fields. 'para' and 'price'
 *                       are arrays when data was found and '' otherwise.
 */
function getDetail($url, $is_url = 1)
{
    if (empty($url)) {
        return false;
    }

    $host = 'www.icbase.com';
    $html = $url;

    if ($is_url) {
        $url = '/' . ltrim($url, '/');
        $response = curl_get($host . $url);

        // Report failure to the caller instead of terminating the whole script.
        if (isset($response['error']) || !is_string($response['result'])) {
            return false;
        }

        $html = $response['result'];
    }

    $result = array(
        'sup_part'   => '', // supplier part number
        'sup_id'     => '', // supplier ID
        'mfg_part'   => '', // manufacturer part number
        'mfg_name'   => '', // manufacturer name
        'cat_name'   => '', // category name
        'para'       => '', // attributes
        'desc'       => '', // description
        'pdf_url'    => '', // PDF (datasheet) URL
        'sup_stock'  => '', // stock quantity
        'min_purch'  => '', // minimum order quantity
        'price'      => '', // price breaks
        'img_url'    => '', // image URL
        'createtime' => '', // creation time
        'datacode'   => '', // date code / batch
        'package'    => '', // package
        'page_url'   => '', // page URL
    );

    // All patterns use "~" as delimiter so the "/" of closing tags needs no escaping.
    // NOTE(review): the patterns contain UTF-8 literals and therefore assume the
    // page is served as UTF-8 - confirm against the live site.

    // mfg_part
    if (preg_match('~<td>产品型号</td><td>(.[^<]+)<~isU', $html, $matches)) {
        $result['mfg_part'] = trim($matches[1]);
    } else {
        // Without a part number there is nothing else worth parsing.
        return array();
    }

    // mfg_name
    if (preg_match('~<td>厂商</td>[\s\n]*<td>(.+)</td>~isU', $html, $matches)) {
        $result['mfg_name'] = trim($matches[1]);
    }

    // para: the cells of the header row come first, followed by the same number of value cells.
    $pattern = '~<tr\s+style="background-color:#E9E9E9;color:black; font-weight:bold;">(.+)</tr></table>~isU';
    if (preg_match($pattern, $html, $matches)
        && preg_match_all('~<td>(.+)</td>~isU', $matches[1], $cells)
    ) {
        $count = intval(count($cells[1]) / 2);

        for ($k = 0; $k < $count; $k++) {
            $name = trim($cells[1][$k]);
            $value = trim($cells[1][$count + $k]);

            if ($name == '描述') {
                // desc
                $result['desc'] = $value;
                continue;
            }

            // Turn the '' placeholder into an array before the first write;
            // assigning a string key onto a string is an error on PHP >= 7.1.
            if (!is_array($result['para'])) {
                $result['para'] = array();
            }
            $result['para'][$name] = $value;
        }
    }

    // pdf_url
    if (preg_match('~<td>详细资料</td><td><a\s+href="(.[^"]+)"~isU', $html, $matches)) {
        $result['pdf_url'] = trim($matches[1]);
    }

    // sup_stock
    if (preg_match('~<td>库存数量</td>[\s\n]*<td>(\d+)</td>~isU', $html, $matches)) {
        $result['sup_stock'] = trim($matches[1]);
    }

    // price: quantity => price. The quantity group is a plain (\d+); a repeated
    // group "(\d+)+" under the U flag would keep only the last digit.
    $pattern = '~<tr><td.[^>]+>(\d+)</td><td.[^>]+>.[^\d]*([\d.]+)</td></tr>~isU';
    if (preg_match_all($pattern, $html, $matches)) {
        $result['price'] = array();

        foreach ($matches[1] as $k => $quantity) {
            $result['price'][$quantity] = '¥' . $matches[2][$k];
        }
    }

    // img_url
    if (preg_match('~<td>图片</td><td><img\s+src="(.[^"]+)"~isU', $html, $matches)) {
        $result['img_url'] = trim($matches[1]);
    }

    // page_url is only known when the page was downloaded from a URL.
    if ($is_url) {
        $result['page_url'] = $host . $url;
    }

    return $result;
}

/**
 * Entry point: search for a keyword and fetch the details of every hit.
 *
 * @param string $keywords Search keyword.
 * @return array One getDetail() result per link found on the list pages;
 *               empty when the search produced no links.
 */
function getData($keywords)
{
    $hrefList = getListHrefAll($keywords);

    // getListHrefAll() returns false for an empty keyword; iterating over
    // false would raise a warning, so treat it as "no links".
    if (!is_array($hrefList)) {
        return array();
    }

    $result = array();

    foreach ($hrefList as $href) {
        $result[] = getDetail($href);
    }

    return $result;
}

// Test Script

// Read the keyword from the query string; a missing or non-string value becomes ''.
$keywords = (isset($_GET['keywords']) && is_string($_GET['keywords'])) ? trim($_GET['keywords']) : '';

$result = getData($keywords);

print_r($result);

推荐文章
猜你喜欢
附近的人在看
推荐阅读
拓展阅读
相关阅读
网友关注
最新php教程学习
热门php教程学习
编程开发子分类