php爬虫

2020/06/01 posted in  脚本
Tags:  #php

基于 Swoole 的多进程与协程,撸了一个网络爬虫库。

demo

快手后台

<?php
/**
 * Example: crawling data from the Kuaishou backend.
 * https://k.kuaishou.com/#/index
 */
namespace demo;

use phpspiderman\phpspiderman;

// Load the Composer autoloader.
require_once __DIR__ . '/../vendor/autoload.php';

// Use plain `require` anchored to this script's directory:
//  - `require_once` returns `true` (not the config array) if the file was
//    already included earlier in the request;
//  - a bare "config.php" is looked up through include_path / the current
//    working directory first, so a different file could be picked up.
$config = require __DIR__ . '/config.php';
$mysqldb = $config['db'];

// Filter payload handed to the spider under the "body" key.
$searchBody = [
    'liveAvgPeopleMinNum' => '0',
    'currentPage'         => 1,
    'starOrderTag'        => 3,
    'liveAvgPeopleMaxNum' => null,
    'starMaxPrice'        => null,
    'ugcTag'              => '',
    // Tag ids: comedy and gaming.
    'mmuStarTagIds'       => [12, 3],
    // Minimum follower count.
    'fansMinNum'          => 0,
    // Task type: 1 = video, 4 = live stream.
    'taskType'            => 1,
    'cityCode'            => 0,
    'fansMaxNum'          => null,
    'userName'            => '',
    'provinceCode'        => 0,
    'starOrderType'       => 0,
    'isAppWithLink'       => null,
    'starMinPrice'        => 0,
    'gender'              => '',
];

// Build the spider from a single configuration array.
$spider = new phpspiderman([
    'type'           => 2,
    // Database configuration.
    'mysqlconfig'    => $mysqldb,
    'worker_num'     => 4,
    'totalSumField'  => 'total',
    'totalPageField' => '',
    'PageField'      => 'currentPage',
    'table'          => 'spider_kuaishou',
    'PageSize'       => 20,
    'ProxyUrl'       => '',
    'url'            => 'k.kuaishou.com',
    'urlport'        => 443,
    'list'           => '/rest/web/star/list',
    'cookie'         => '',
    'body'           => $searchBody,
]);

/**
 * List handler: requests the configured list URL and returns the
 * "starList" entry of the decoded response.
 *
 * @param mixed $spider     Spider instance; its Http client and urlconfig['list'] are used.
 * @param mixed $header     Forwarded unchanged to Http->getContent2().
 * @param mixed $json_array Forwarded unchanged to Http->getContent2().
 * @return mixed The "starList" value, or an empty array when it is absent.
 */
$spider->handleList = function ($spider, $header, $json_array) {
    $response = $spider->Http->getContent2($spider->urlconfig['list'], $header, $json_array);
    $decoded = \phpspiderman\content\json::decode($response);

    // A failed decode or a response without "starList" (for example an
    // error payload) previously raised a warning and returned null; hand
    // back an empty list instead so the caller simply has nothing to process.
    return $decoded['starList'] ?? [];
};

/**
 * Content handler: maps one fetched item to the set of fields to keep.
 *
 * @param mixed $val One item, read as an array keyed by field name.
 * @return array The selected fields; "gender" is normalised to '0' when the
 *               source value equals '男' (male) and to '1' otherwise.
 */
$spider->handleContent = function ($val = null) {
    return [
        // Identity and profile fields.
        'userId'            => $val['userId'],
        'starId'            => $val['starId'],
        'name'              => $val['name'],
        'kwaiId'            => $val['kwaiId'],
        'gender'            => ($val['gender'] == '男') ? '0' : '1',
        'fansNumber'        => $val['fansNumber'],
        'areaTag'           => $val['areaTag'],
        'headUrl'           => $val['headUrl'],

        // Pricing fields.
        'liveQuotedPrice'   => $val['liveQuotedPrice'],
        'oneDaysOrderBid'   => $val['oneDaysOrderBid'],
        'threeDaysOrderBid' => $val['threeDaysOrderBid'],
        'sevenDaysOrderBid' => $val['sevenDaysOrderBid'],
    ];
};

// Run the spider.
$spider->crawl();

安装

composer require showx/phpspiderman

文档说明

后续再更新!