分享下页面关键字抓取components.arrow.com站点代码
分享下页面关键字抓取components.arrow.com站点代码
发布时间:2016-12-29 来源:查字典编辑
摘要:复制代码代码如下:

复制代码 代码如下:

<?php

/**

* HOST: components.arrow.com

*/

//set_time_limit(0);

// base function

/**
 * Perform an HTTP GET request through cURL.
 *
 * @param string       $url     Target URL.
 * @param array|string $data    Query parameters; an array is encoded with
 *                              http_build_query(), a string is appended as-is.
 * @param array        $header  Header lines handed to CURLOPT_HTTPHEADER.
 * @param int          $timeout Connect timeout in seconds (CURLOPT_CONNECTTIMEOUT).
 * @param int          $port    Remote port handed to CURLOPT_PORT.
 * @param string       $reffer  Optional value for the Referer header.
 * @param string       $proxy   Optional proxy address; when set, the proxy port
 *                              and credentials below are applied as well.
 * @return array 'result' holds the curl_exec() return value; 'error' is only
 *               present when cURL reported a non-zero error number.
 */
function curl_get($url, $data = array(), $header = array(), $timeout = 15, $port = 80, $reffer = '', $proxy = '')
{
    $ch = curl_init();

    if (!empty($data)) {
        $data = is_array($data) ? http_build_query($data) : $data;
        // strpos() returns 0 for a match at the first character, which is falsy,
        // so the result has to be compared against false explicitly.
        $url .= (strpos($url, '?') !== false ? '&' : '?') . $data;
    }

    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, $timeout);
    curl_setopt($ch, CURLOPT_POST, 0);
    curl_setopt($ch, CURLOPT_PORT, $port);
    curl_setopt($ch, CURLOPT_HTTPHEADER, $header);
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1); // follow redirects and fetch the final page

    if ($reffer) {
        curl_setopt($ch, CURLOPT_REFERER, $reffer);
    }

    if ($proxy) {
        curl_setopt($ch, CURLOPT_PROXY, $proxy);
        curl_setopt($ch, CURLOPT_PROXYPORT, 1723);
        // NOTE(review): proxy credentials are hard-coded in source; they should be
        // moved to configuration. Kept unchanged here to preserve behavior.
        curl_setopt($ch, CURLOPT_PROXYUSERPWD, "andhm001:andhm123");
    }

    $result = array();
    $result['result'] = curl_exec($ch);

    if (0 != curl_errno($ch)) {
        $result['error'] = "Error:\n" . curl_error($ch);
    }

    curl_close($ch);

    return $result;
}

// 复制代码 代码如下:

/**
 * Perform an HTTP POST request through cURL.
 *
 * @param string       $url     Target URL.
 * @param array|string $data    Request body, passed unchanged to CURLOPT_POSTFIELDS.
 * @param array        $header  Header lines; only applied when non-empty.
 * @param int          $timeout Connect timeout in seconds (CURLOPT_CONNECTTIMEOUT).
 * @param int          $port    Remote port handed to CURLOPT_PORT.
 * @return array 'result' holds the curl_exec() return value; 'error' is only
 *               present when cURL reported a non-zero error number.
 */
function curl_post($url, $data = array(), $header = array(), $timeout = 15, $port = 80)
{
    $ch = curl_init();

    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, $timeout);
    curl_setopt($ch, CURLOPT_PORT, $port);

    if (!empty($header)) {
        curl_setopt($ch, CURLOPT_HTTPHEADER, $header);
    }

    curl_setopt($ch, CURLOPT_POST, 1);
    curl_setopt($ch, CURLOPT_POSTFIELDS, $data);

    $result = array();
    $result['result'] = curl_exec($ch);

    if (0 != curl_errno($ch)) {
        $result['error'] = "Error:\n" . curl_error($ch);
    }

    curl_close($ch);

    return $result;
}

/**
 * Fetch the HTML source of one search-result list page.
 *
 * @param string $keywords Search keywords.
 * @param int    $start    Offset of the first record to request.
 * @return string|false The response body, or false when $start is negative
 *                      or the request reported a cURL error.
 */
function getListHtml($keywords, $start = 0)
{
    if ($start < 0) {
        return false;
    }

    $postData = array(
        'search_token' => $keywords,
        'start' => $start,
        'limit' => 100,
    );

    // The keyword becomes a URL path segment, so it must be percent-encoded;
    // otherwise spaces, '/', '#' or '?' in a keyword would corrupt the URL.
    $url = 'http://components.arrow.com/part/search/' . rawurlencode($keywords);

    $response = curl_post($url, http_build_query($postData));

    if (isset($response['error'])) {
        return false;
    }

    return $response['result'];
}

/**
 * Extract the detail-page links from a list page.
 *
 * Collects the href of every anchor that directly follows a
 * <td class="col_mfr_part_num"> opening tag.
 *
 * @param string $html HTML source of the list page.
 * @return array List of href values; empty when nothing matches.
 */
function getListHref($html)
{
    // \s+ between tag name and attribute had lost its backslash, which made the
    // pattern look for the literal text "tds" / "as" and never match.
    $pattern = '/<td\s+class="col_mfr_part_num"><a\s+href="(.[^>]+)">/isU';

    if (preg_match_all($pattern, $html, $matches)) {
        return $matches[1];
    }

    // no matching rows
    return array();
}

/**
 * Read the "start" offset of the next list page.
 *
 * Looks for an inline buildPagination('a','b','c',d) script call and returns
 * its third argument as an integer.
 *
 * @param string $html HTML source of the list page.
 * @return int The captured offset, or -1 when no pagination call is found.
 */
function getListNextPage($html)
{
    // Parentheses, quotes, digits classes and the closing-tag slash must all be
    // escaped; without the backslashes the literal does not even parse as PHP.
    $pattern = '/<script\s+language="javascript">buildPagination\(\'\d+\',\'\d+\',\'(\d+)\',\d+\);<\/script>/isU';

    if (preg_match($pattern, $html, $matches)) {
        return intval($matches[1]);
    }

    return -1;
}

/**
 * Collect the detail-page links from every list page of a search.
 *
 * @param string $keywords Search keywords.
 * @return array|false List of href values (empty when the search has no
 *                     results); false when $keywords is empty.
 */
function getListHrefAll($keywords)
{
    if (empty($keywords)) {
        return false;
    }

    $html = getListHtml($keywords);
    if ($html === false) {
        // request failed: treated the same as "no results"
        return array();
    }

    $hrefList = getListHref($html);
    if (empty($hrefList)) {
        // no results
        return array();
    }

    // Offsets already requested. If the site ever reports an offset we have
    // seen before, following it again would repeat forever, so we stop instead.
    $visited = array(0 => true);

    $nextPage = getListNextPage($html);

    while ($nextPage > 0 && !isset($visited[$nextPage])) {
        $visited[$nextPage] = true;

        $html = getListHtml($keywords, $nextPage);
        if ($html === false) {
            break;
        }

        $hrefList = array_merge($hrefList, getListHref($html));
        $nextPage = getListNextPage($html);
    }

    return $hrefList;
}

/**
 * Fetch a part detail page and extract its fields.
 *
 * @param string $url Path of the detail page, relative to the site host.
 * @return array|false false when $url is empty; an empty array when the page
 *                     cannot be fetched or has no "Part No" entry; otherwise an
 *                     associative array with the keys sup_part, sup_id,
 *                     mfg_part, mfg_name, cat_name, para, desc, pdf_url,
 *                     sup_stock, min_purch, price, img_url, createtime,
 *                     datacode, package and page_url. Fields that are not
 *                     found stay ''. 'para' and 'price' become arrays when
 *                     at least one entry is found.
 */
function getDetail($url)
{
    if (empty($url)) {
        return false;
    }

    $host = 'http://components.arrow.com';
    $url = $host . $url;

    $response = curl_get($url);
    if (isset($response['error'])) {
        return array();
    }

    $html = $response['result'];
    if (!is_string($html) || $html === '') {
        return array();
    }

    $result = array(
        'sup_part' => '',   // supplier part number
        'sup_id' => '',     // supplier ID
        'mfg_part' => '',   // manufacturer part number
        'mfg_name' => '',   // manufacturer name
        'cat_name' => '',   // category name
        'para' => '',       // attributes
        'desc' => '',       // description
        'pdf_url' => '',    // datasheet (PDF) URL
        'sup_stock' => '',  // stock
        'min_purch' => '',  // minimum order quantity
        'price' => '',      // price
        'img_url' => '',    // image URL
        'createtime' => '', // creation time
        'datacode' => '',   // date code
        'package' => '',    // package
        'page_url' => '',   // page URL
    );

    // Returns the trimmed first capture group of $pattern in the page,
    // or null when the pattern does not match.
    $firstMatch = function ($pattern) use ($html) {
        return preg_match($pattern, $html, $m) ? trim($m[1]) : null;
    };

    // mfg_part -- mandatory: a page without it is treated as unparseable.
    $value = $firstMatch('/<li>[\s\n]*<strong>Part No:\s*<\/strong>(.+)<\/li>/isU');
    if ($value === null) {
        return array();
    }
    $result['mfg_part'] = $value;

    // mfg_name
    $value = $firstMatch('/<li>[\s\n]*<strong>Manufacturer: <\/strong>(.+)<\/li>/isU');
    if ($value !== null) {
        $result['mfg_name'] = $value;
    }

    // cat_name: the page uses '|' between category levels; emit '>' instead.
    $value = $firstMatch('/displayCategory\(\'(.[^\']+)\'\);/isU');
    if ($value !== null) {
        $result['cat_name'] = str_replace('|', '>', $value);
    }

    // para: name/value rows of the part_specs table.
    $tablePattern = '/<table\s+id="part_specs".[^>]*>(.+)<\/table>/isU';
    if (preg_match($tablePattern, $html, $table)) {
        $rowPattern = '/<tr>[\s\n]*<td><strong>(.+)<\/strong><\/td><td>(.+)<\/td>[\s\n]*<\/tr>/isU';
        if (preg_match_all($rowPattern, $table[1], $rows)) {
            // Collected in a real array: writing string keys into the '' default
            // is an error on PHP >= 7.1.
            $para = array();
            foreach ($rows[1] as $k => $name) {
                $name = trim($name);
                $cell = trim($rows[2][$k]);
                if ('Package Type' == $name) {
                    // "Package Type" is reported in its own field, not in para.
                    $result['package'] = $cell;
                    continue;
                }
                $para[$name] = $cell;
            }
            if (!empty($para)) {
                $result['para'] = $para;
            }
        }
    }

    // desc
    $value = $firstMatch('/<div\s+id="part_title">.+<h4>(.+)<\/h4>[\s\n]*<\/div>/isU');
    if ($value !== null) {
        $result['desc'] = $value;
    }

    // pdf_url
    $value = $firstMatch('/<li\s+class="datasheet">[\s\n]*<strong>Datasheet:<\/strong><a\s+href="(.[^"]+)"/isU');
    if ($value !== null) {
        $result['pdf_url'] = $host . $value;
    }

    // sup_stock: strip thousands separators.
    $value = $firstMatch('/<td\s+id="inv_1"\s+class="li_inv">([\d,]+)<\/td>/isU');
    if ($value !== null) {
        $result['sup_stock'] = str_replace(',', '', $value);
    }

    // min_purch
    $value = $firstMatch('/<span\s+id="multiples">[\s\n]*<strong>Multiple:\s*<\/strong>(.+)<\/span>/isU');
    if ($value !== null) {
        $result['min_purch'] = $value;
    }

    // price: keyed by minimum quantity.
    $prices = array();

    $value = $firstMatch('/<div\s+id="price_1"\s+class="li_price">(.[^<]+)<\/div>/isU');
    if ($value !== null) {
        $prices[1] = $value;
    }

    // When the price cell wraps a <span title="...">, the title is fetched as a
    // URL and the response is parsed as JSON holding the price breaks.
    $value = $firstMatch('/<div\s+id="price_1"\s+class="li_price">[\s\n]*<span.[^>]+title="(.[^"]+)">/isU');
    if ($value !== null) {
        // The attribute value is HTML-escaped; turn &amp; back into & for the request.
        $priceUrl = str_replace('&amp;', '&', $value);
        $priceResponse = curl_get($priceUrl);
        $json = $priceResponse['result'];

        if (!empty($json)) {
            $decoded = json_decode($json, true);
            // Only walk the structure when the response has the expected shape.
            if (isset($decoded['parts'][0]['webprice']['resale'])
                && is_array($decoded['parts'][0]['webprice']['resale'])
            ) {
                foreach ($decoded['parts'][0]['webprice']['resale'] as $tier) {
                    if (isset($tier['minqty'], $tier['price'])) {
                        $prices[$tier['minqty']] = $tier['price'];
                    }
                }
            }
        }
    }

    if (!empty($prices)) {
        $result['price'] = $prices;
    }

    // img_url
    $value = $firstMatch('/<div\s+id="part_image">[\s\n]*<img\s+src="(.[^"]+)"/isU');
    if ($value !== null) {
        $result['img_url'] = $value;
    }

    // page_url
    $result['page_url'] = $url;

    return $result;
}

/**
 * Entry point: search for the keywords and fetch every matching detail page.
 *
 * @param string $keywords Search keywords.
 * @return array One getDetail() result per link found; empty when the search
 *               yields no links or the keywords are rejected.
 */
function getData($keywords)
{
    $hrefList = getListHrefAll($keywords);

    // getListHrefAll() returns false for empty keywords; iterating over that
    // would raise a warning, so treat it as "nothing to fetch".
    if (!is_array($hrefList)) {
        return array();
    }

    $result = array();
    foreach ($hrefList as $href) {
        $result[] = getDetail($href);
    }

    return $result;
}

// Test script: read the search keywords from the query string and dump the result.

// The parameter may be missing or may arrive as an array (?keywords[]=x);
// fall back to an empty string in both cases instead of raising a notice/error.
$keywords = (isset($_GET['keywords']) && is_string($_GET['keywords']))
    ? trim($_GET['keywords'])
    : '';

$result = getData($keywords);

print_r($result);

推荐文章
猜你喜欢
附近的人在看
推荐阅读
拓展阅读
相关阅读
网友关注
最新php教程学习
热门php教程学习
编程开发子分类