1、相关文档和网站
puppeteer使用示例demo:http://www.querylist.cc/docs/guide/v4/Puppeteer
puppeteer官方原生语法大全:https://zhaoqize.github.io/puppeteer-api-zh_CN/#?product=Puppeteer&version=v5.5.0&show=api-pagewaitforselectorselector-options
华为云pupp指定谷歌浏览器镜像:https://mirrors.huaweicloud.com/
下载:
1、composer require jaeger/querylist-puppeteer:~v4
2、composer require jaeger/querylist-puppeteer:*
3、npm install @nesk/puphpeteer
安装完成后,必须上华为云【https://mirrors.huaweicloud.com】搜索【chromium】,下载与puppeteer版本对应的Chrome谷歌浏览器
【原因:puppeteer1.7.5版本后,谷歌团队将puppeteer和浏览器分开了,所以我们用npm 下的puppeteer是-core内核,浏览器需要单独下载 -- 此坑花了我3天时间】
2、下载相关npm插件
3、华为云镜像源,下载puppeteer的浏览器和-core
然后就可以愉快的玩耍了。
4、puppeteer插件在win下没问题,在linux会报错:Failed to launch the browser process!
解决办法:puppeteer的github->issue 里面有这个问题,
进入项目/node_modules/,找到chrome运行文件路径(我的是:www/wwwroot/xxx/project/node_modules/puppeteer/.local-chromium/linux-818858/chrome-linux/),执行ldd chrome | grep not,查看缺失的包,然后安装就行。
5、下面上一段php爬虫的laravel5.7代码:抓取天猫www.tmall.com模拟登陆账号并获取某个店铺列表页 -- js动态渲染数据后HTML
(备注:2020/12/25,分析天猫登陆逻辑,Jhtml->login.do->add?xx->访问目标页面并带上add页面response->headers->cookies)
<?php

namespace App\Http\Controllers;

use Illuminate\Http\Request;
use GuzzleHttp\Cookie\CookieJar;
use GuzzleHttp\Psr7\Response;
use QL\QueryList;
use QL\Ext\Chrome;
use GuzzleHttp\Client;
use RuntimeException;

/**
 * Demo controller: logs in to Tmall/Taobao over HTTP, follows the post-login
 * redirect to collect session cookies, then opens a shop category page in a
 * Puppeteer-driven Chrome (QueryList Chrome plugin) to obtain the HTML after
 * JavaScript rendering.
 *
 * Flow: login.jhtml (form defaults) -> login.do (POST) -> redirectUrl (GET)
 * -> target page opened in Chrome with the cookies returned by the redirect hop.
 */
class HomeController extends Controller
{
    /** User-Agent sent on every hop so login and browsing present the same client. */
    private const BROWSER_USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.92 Safari/537.36';

    /**
     * Value posted as the `password2` login field.
     *
     * NOTE(review): a credential-derived value is hard-coded in source. The
     * original carried a commented-out `config('cache.Tmall.password')`
     * alternative; move this to configuration once that key is confirmed to exist.
     */
    private const ENCRYPTED_PASSWORD = 'b9c3b97c2a3ab755deddb370bf2659200236b4ab9ebfcb0a44f4c8431920f7b4799bdfc1cc368769aa66c353b000d4a6b3961332dc5ea9b112e539384baa4ec6af9d5df3c84e5ef97ca07233e61dfc84ce16fa1baabf2e9257ce586fdfbd860c630e343852db4f8103ce89539a55b7ec7761ce97fd44621a89db85b0f68dc93d';

    /**
     * Value posted as the `ua` login field.
     *
     * NOTE(review): the original literal contained a line break at the point
     * reproduced below with "\n"; it is kept so the posted bytes are unchanged,
     * but it may be a copy/paste artifact - confirm against a fresh capture.
     */
    private const LOGIN_UA_TOKEN = '137#g6E9hE9o9kYFAWPMA9Oy4vcGIn9b7zb0r37oFA7TeVJPcfsFFthJcm420Q1uI3U3JfRy0DyD4EKTXVl3Uvo/HQr2q564KWWvt2PCEZcmLm1pQiJNWwz8Xx38LQMmOIIy5lXGepSQoiuSLCaJm5VaL+z+bgl81143CHYkMr1+qEV5a7ZOj1tlFXm9iRbMZuRiEwBKg1242mSCCSygmf02VgvJvpf/maKYEs+hhsfAZ7ynOSIdPJ4K3D2F89sB2Lwn+YEOspR9rjd5DVpq1/T3wdmzE8Pivm6BH9/neUPY72dzAA+CEqWbn9AfBzFwOJMJtIqvSCQhgI5pXMIiGkvKCCOqd07xJc2MinUk0oLjZyzt8Ef5e8C11G/0TKBL0ZPeFUVr9A+AM0e70T0ZlI5qLd8wSbWvceu4/VRaE9xjMlJWsDy0g8ovyCjRUt/+4mAFyUF9qS8zDBe5DBHoJLaPjiqmQofJ+GxVBp1m1ioI+/IkYrSP3lEIvEDhQefU+GDpFoyql4ei+tiVYSUS1lQyqdicQDJO/d6MpRJs1Iey+tiVYTUS8lrDJdicBAW70sceDRZjEyIRacn+bLnq1qQipXpmQAfJ+GXppRSf1Iei+piVYTUx1lgippimQOIoEV+H99qXEnF/EPLUBSzfs/9wdbVc8ByysTdY1MM2zHHu+JZXoPpTgItyRHHJ/WSHIuscdksoU0MND1yioT+MrJE0s/vfyPAb+tBO94uaky6uUe+Nt2Fc0BBypPwGSnZVVOeLQMWTuPMmjIjCvbt9Uon/ETx86o5mwlz9hqK5JBtqLm1sdWr0DDg0DaXupn4Dt6Bir6yBzetn34hfTteaUokQO2EAtBvcmiUThn3gykXcLoVzhnwW2Hmlbh0lC123Tym6lEqpKgrfNCbZM8wS9eUDufau3qZVQi80WOWdmLmN/JCUZ8GwE2SZ9RK7dEugd9R8dz2awhZF0VqvRFNF1sMlp/6rNeMOK3tHbQlGogeDfFseAwwbXdOeNEM5BnEP+6aj+ob4nVXRn6D+Ibzm28Dk1ArFoZSraC/41e0fs0/k7CJPrNyz197hJkwlv6P29Y1YaYSLdEjlq7nYa0bHpVM/ESSApV+QMRCWcHavL0Gx6eV5+Ok6rd5cYoHDvsyNjNrbFAYJSP1Eg3KRmqoIf7Hsy0C6Qb4u36HE5qzb7txcpOgqIgalG3Ot0GcwuXZ3QkyXOpzSAn9k6U+g'
        . "\n"
        . '7FiEcTdMe7Neis7pNpqvy/6RWdS/yTv9xaG4AnKWdoUuTVTKMKgaDZAtCRYnCaD6roOJAxrMnxnD9g0ghweHx31wwzBa256fafx7ljorDiKE3ENmcmtwqjnKHADOsGiQhc==';

    /**
     * Simulate a Tmall login, persist the cookies, and dump the JS-rendered
     * HTML of the shop category page.
     *
     * Side effects: writes logs/tmall2.cookie.txt (login cookies as JSON),
     * prints the redirect URL, launches a visible Chrome window, and finally
     * calls dd() on the rendered HTML - so this action does not return.
     *
     * @throws RuntimeException when the login response carries no redirect URL.
     */
    public function jargerTmallLogin2()
    {
        // Hidden form defaults scraped from the login page are merged last, so
        // they override any identically named key in the static list.
        $formParams = array_merge([
            'loginId' => config('cache.Tmall.username'),
            'password2' => self::ENCRYPTED_PASSWORD,
            'keepLogin' => 'false',
            'ua' => self::LOGIN_UA_TOKEN,
            'umidGetStatusVal' => '255',
            'navlanguage' => 'zh-CN',
            'navUserAgent' => self::BROWSER_USER_AGENT,
            'navPlatform' => 'Win32',
        ], $this->getJhtmlCookie());

        $jar = new CookieJar();
        $client = new Client(['cookies' => $jar]);
        $res = $client->request('POST', 'https://login.taobao.com/newlogin/login.do?appName=taobao&fromSite=0', [
            'form_params' => $formParams,
            'timeout' => 60,
            // NOTE(review): TLS certificate verification is disabled.
            'verify' => false,
        ]);

        // Reduce the jar to name/value pairs; that is all the redirect hop needs.
        $loginCookie = [];
        foreach ($jar->toArray() as $k => $cookie) {
            $loginCookie[$k] = [
                'name' => $cookie['Name'],
                'value' => $cookie['Value'],
            ];
        }
        file_put_contents(
            storage_path('logs/tmall2.cookie.txt'),
            json_encode($loginCookie, JSON_UNESCAPED_UNICODE)
        );

        $loginReturn = json_decode((string) $res->getBody(), true);
        $redirectUrl = $loginReturn['content']['data']['redirectUrl'] ?? null;
        if (!is_string($redirectUrl) || $redirectUrl === '') {
            // Previously this surfaced as an undefined-index notice followed by
            // a request to an empty URL.
            throw new RuntimeException('Tmall login failed: login.do response contains no redirectUrl.');
        }
        print_r($redirectUrl); // debug output, e.g. https://pass.tmall.com/add?_l_g=xx

        $browserCookies = $this->toBrowserCookies(
            $this->redirectUrlJump($redirectUrl, $loginCookie)
        );

        // Fetch the shop listing page after JavaScript rendering.
        $ql = QueryList::getInstance();
        // Registers the plugin under its default method name: chrome.
        $ql->use(Chrome::class);
        $html = $ql->chrome(function ($page, $browser) use ($browserCookies) {
            $page->setUserAgent(self::BROWSER_USER_AGENT);
            if ($browserCookies !== []) {
                $page->setCookie(...$browserCookies);
            }
            $page->goto('https://nanjirenoc.tmall.com/category.htm');
            // Wait until the async-search marker element exists.
            $page->waitFor('#J_ShopAsynSearchURL');
            // NOTE(review): fixed 100-second pause kept from the original; it
            // presumably gives the async listing time to render - confirm and
            // replace with a selector wait if possible.
            sleep(100);
            $rendered = $page->content();
            $browser->close();

            // The callback must return the page HTML.
            return $rendered;
        }, [
            'headless' => false, // visible Chrome window, convenient for debugging
            'devtools' => true,  // open the browser developer tools
            'timeout' => 100000,
            'ignoreHTTPSErrors' => true,
        ])
            ->removeHead()
            ->getHtml();

        // dd() dumps and terminates. The Guzzle request that used to follow it
        // was unreachable and referenced an undefined $jar2, so it was removed.
        dd($html);
    }

    /**
     * Follow the post-login redirect URL carrying the login cookies and return
     * the cookies the server sets in response.
     *
     * Side effects: prints the outgoing cookie string (debug) and writes the
     * returned cookies to logs/jumpUrlReturnCookie.log as JSON.
     *
     * @param string $jumpUrl     Redirect URL taken from the login.do response.
     * @param array  $loginCookie List of ['name' => ..., 'value' => ...] pairs.
     *
     * @return array Cookies collected by the jar, in Guzzle's CookieJar::toArray() shape.
     */
    public function redirectUrlJump($jumpUrl, $loginCookie = [])
    {
        // Flatten the pairs into a raw Cookie header value: "a=1;b=2".
        $pairs = [];
        foreach ($loginCookie as $cookie) {
            $pairs[] = $cookie['name'] . '=' . $cookie['value'];
        }
        $cookieStr = implode(';', $pairs);
        print_r($cookieStr); // debug output

        $headers = ['User-Agent' => self::BROWSER_USER_AGENT];
        if ($cookieStr !== '') {
            // Only send the header when there is something to send; the old
            // substr('', 1) produced false here on PHP < 8.
            $headers['Cookie'] = $cookieStr;
        }

        $returnCookie = new CookieJar();
        $client = new Client(['cookies' => $returnCookie]);
        $client->request('GET', $jumpUrl, [
            'timeout' => 60,
            // NOTE(review): TLS certificate verification is disabled.
            'verify' => false,
            'headers' => $headers,
        ]);

        $cookies = $returnCookie->toArray();
        file_put_contents(storage_path('logs/jumpUrlReturnCookie.log'), json_encode($cookies));

        return $cookies;
    }

    /**
     * Load the Taobao login page and extract its embedded "loginFormData"
     * object, i.e. the hidden form defaults that must accompany the login POST.
     *
     * Despite the method name, the return value is the decoded form data, not
     * cookies; the jar passed to the request is not used afterwards.
     * To check manually whether a login succeeded, open
     * https://i.taobao.com/my_taobao.htm
     *
     * @return array Decoded loginFormData key/value pairs.
     *
     * @throws RuntimeException when the page does not contain parsable loginFormData.
     */
    public function getJhtmlCookie()
    {
        $jar = new CookieJar();
        $url = 'https://login.taobao.com/member/login.jhtml?tpl_redirect_url=https://www.tmall.com&style=miniall&enup=true&newMini2=true&full_redirect=true&sub=true&from=tmall&allp=assets_css=3.0.10/login_pc.css';

        $html = (string) QueryList::get($url, null, ['cookies' => $jar])
            ->removeHead()
            ->getHtml();

        $match = [];
        if (preg_match('/\"loginFormData\"\:\{(.*?)\}/', $html, $match) !== 1) {
            throw new RuntimeException('Tmall login page does not contain loginFormData.');
        }

        $loginFormData = json_decode('{' . $match[1] . '}', true);
        if (!is_array($loginFormData)) {
            throw new RuntimeException('Tmall loginFormData is not valid JSON.');
        }

        return $loginFormData;
    }

    /**
     * Convert Guzzle jar cookies into the parameter objects passed to
     * Puppeteer's page.setCookie().
     *
     * @param array $jarCookies Cookies in CookieJar::toArray() shape.
     *
     * @return array List of cookie parameter arrays.
     */
    private function toBrowserCookies(array $jarCookies): array
    {
        $browserCookies = [];
        foreach ($jarCookies as $cookie) {
            $browserCookies[] = [
                'name' => $cookie['Name'],
                'value' => $cookie['Value'],
                'path' => $cookie['Path'],
                'Max-Age' => $cookie['Max-Age'],
                'expires' => floatval($cookie['Expires']),
                'secure' => $cookie['Secure'],
                'discard' => $cookie['Discard'],
                'httpOnly' => $cookie['HttpOnly'],
                // Keep the jar's own domain so a domain-wide cookie stays
                // visible to shop sub-domains instead of becoming host-only.
                'domain' => $cookie['Domain'],
                'url' => self::cookieUrl($cookie['Domain']),
            ];
        }

        return $browserCookies;
    }

    /**
     * Build an absolute https URL for a cookie domain.
     *
     * The previous inline code computed a "www"-prefixed host and then rebuilt
     * the URL from the raw domain, producing e.g. "https://.tmall.com".
     *
     * @param mixed $domain Cookie domain such as ".tmall.com" or "pass.tmall.com".
     */
    private static function cookieUrl($domain): string
    {
        $domain = (string) $domain;
        if (preg_match('#^https?://#i', $domain) === 1) {
            return $domain;
        }

        $host = ltrim($domain, '.');
        // A leading dot marks a domain-wide cookie; point the URL at its www host.
        if ($host !== $domain && strpos($host, 'www.') !== 0) {
            $host = 'www.' . $host;
        }

        return 'https://' . $host;
    }
}