基于 Swoole 多进程与协程实现的 PHP 网络爬虫库。
demo
快手后台
<?php
/**
* 爬取快手数据示例
* https://k.kuaishou.com/#/index
*/
namespace demo;
// 引入composer加载
require_once dirname(__FILE__)."/../vendor/autoload.php";
use \phpspiderman\phpspiderman;
// Load the local settings file, which is expected to return an array.
// Plain `require` is used on purpose: `require_once` evaluates to `true`
// (not the returned array) when the file has already been included.
// The path is anchored to this script's directory so the lookup does not
// depend on include_path or the current working directory.
$config = require __DIR__ . "/config.php";
if (!is_array($config) || !isset($config['db'])) {
    throw new \RuntimeException('config.php must return an array containing a "db" entry');
}
$mysqldb = $config['db'];

// Build the spider. The option keys below are consumed by the phpspiderman
// library; their exact semantics are defined there, not in this script.
$spider = new phpspiderman([
    "type" => 2,
    // database configuration
    "mysqlconfig" => $mysqldb,
    "worker_num" => 4,
    "totalSumField" => "total",
    "totalPageField" => '',
    "PageField" => 'currentPage',
    "table" => "spider_kuaishou",
    "PageSize" => 20,
    "ProxyUrl" => '',
    "url" => 'k.kuaishou.com',
    "urlport" => 443,
    "list" => "/rest/web/star/list",
    // NOTE(review): left empty in the demo — presumably a logged-in session
    // cookie has to be supplied here; confirm against the library docs.
    "cookie" => "",
    // Request body for the list endpoint.
    "body" => [
        'liveAvgPeopleMinNum' => '0',
        'currentPage' => 1,
        'starOrderTag' => 3,
        'liveAvgPeopleMaxNum' => null,
        'starMaxPrice' => null,
        'ugcTag' => '',
        'mmuStarTagIds' => [
            12, 3, // comedy and gaming
        ],
        'fansMinNum' => 0, // minimum follower count
        'taskType' => 1, // 1 = video, 4 = live stream
        'cityCode' => 0,
        'fansMaxNum' => null,
        'userName' => '',
        'provinceCode' => 0,
        'starOrderType' => 0,
        'isAppWithLink' => null,
        'starMinPrice' => 0,
        'gender' => '',
    ],
]);
/**
 * List handler: requests the configured list endpoint and returns its rows.
 *
 * @param object $spider     object exposing Http->getContent2() and urlconfig['list']
 * @param mixed  $header     forwarded unchanged to getContent2()
 * @param mixed  $json_array forwarded unchanged to getContent2()
 * @return array the "starList" entry of the decoded response, or an empty
 *               array when the response has no usable "starList"
 */
$spider->handleList = function($spider,$header,$json_array)
{
    $response = $spider->Http->getContent2($spider->urlconfig['list'], $header, $json_array);
    $decoded = \phpspiderman\content\json::decode($response);

    // An error payload (or an undecodable body) carries no "starList";
    // return an empty list instead of reading a missing offset.
    if (!is_array($decoded) || !isset($decoded['starList']) || !is_array($decoded['starList'])) {
        return [];
    }

    return $decoded['starList'];
};
/**
 * Content handler: maps one fetched list entry to the row that is returned
 * to the spider.
 *
 * Fields are copied through unchanged except "gender", which is encoded as
 * '0' when the source value is exactly '男' (male) and '1' for anything else.
 * A field missing from the entry becomes null instead of raising a warning.
 *
 * @param mixed $val one list entry, indexed by field name; may be null
 * @return array row keyed by field name, in the order listed below
 */
$spider->handleContent = function($val = null)
{
    // `??` keeps the handler quiet for a null $val or a partially filled entry.
    return [
        'userId'            => $val['userId'] ?? null,
        'starId'            => $val['starId'] ?? null,
        'name'              => $val['name'] ?? null,
        'kwaiId'            => $val['kwaiId'] ?? null,
        // Strict comparison: only the literal string '男' counts as male.
        'gender'            => ($val['gender'] ?? null) === '男' ? '0' : '1',
        'fansNumber'        => $val['fansNumber'] ?? null,
        'areaTag'           => $val['areaTag'] ?? null,
        'headUrl'           => $val['headUrl'] ?? null,
        'liveQuotedPrice'   => $val['liveQuotedPrice'] ?? null,
        'oneDaysOrderBid'   => $val['oneDaysOrderBid'] ?? null,
        'threeDaysOrderBid' => $val['threeDaysOrderBid'] ?? null,
        'sevenDaysOrderBid' => $val['sevenDaysOrderBid'] ?? null,
    ];
};
// Run the spider using the configuration and the two handlers assigned above.
$spider->crawl();
安装
composer require showx/phpspiderman
文档说明
后续再更新!