您现在的位置是:网站首页> NodeJS
Puppeteer专题
- NodeJS
- 2023-12-02
- 489人已阅读
Puppeteer和Headless chrome的简介和应用
node puppeteer拦截谷歌请求、设置浏览器响应-爬取电子书链接
Puppeteer和Headless chrome的简介和应用
Puppeteer和Headless chrome的简介和应用
性能优化
// Launch headless Chromium with a set of switches that disable optional
// features (GPU, sandboxing, zygote) and run everything in a single process.
// Fix: the switches were written with typographic quotes and an en dash
// (e.g. ‘–disable-gpu’), which is neither valid JavaScript nor a valid
// Chromium switch. They must be plain strings starting with two ASCII hyphens.
const browser = await puppeteer.launch({
  headless: true,
  args: [
    '--disable-gpu',
    '--disable-dev-shm-usage',
    '--disable-setuid-sandbox',
    '--no-first-run',
    '--no-sandbox',
    '--no-zygote',
    '--single-process',
  ],
});
Puppeteer:在page.evaluate()中传参
// Passing a Node-side value into the page: every argument given to
// page.evaluate() after the function is handed to that function as a parameter,
// so the inner `evalVar` is the value of the outer `evalVar`.
// NOTE: the callback returns nothing here, so `links` resolves to undefined.
const links = await page.evaluate((evalVar) => {
console.log(evalVar); // should be defined now …
}, evalVar);
通过滚动页面获取内容(源码)
/**
* Created by Administrator on 2021-07-10.
*/
const puppeteer = require('puppeteer');
const devices = require('puppeteer/DeviceDescriptors');
const iPhone = devices['iPhone 6'];
/**
 * Print the crawl result on stdout as JSON wrapped between "####" markers.
 *
 * @param {*} result - Value to serialize with JSON.stringify.
 */
function returnCrawler(result) {
  const payload = JSON.stringify(result);
  console.log(`####${payload}####`);
}
/**
 * Remove leading and trailing whitespace from a string.
 *
 * Uses the built-in String.prototype.trim, which strips the same character
 * set as the regex class `\s`. The previous implementation,
 * `/(^\s*)|(\s*$)/g`, retried `\s*$` from every position inside a long run of
 * inner whitespace, so its cost grew quadratically with the length of the run.
 *
 * @param {string} str - Text to trim.
 * @returns {string} `str` without surrounding whitespace.
 */
function Trim(str) {
  return str.trim();
}
/**
 * Scroll the page down in fixed steps until the scrolled distance reaches the
 * document height, then resolve.
 *
 * The whole loop runs inside the page via page.evaluate(), so everything it
 * needs is declared inside the evaluated function.
 *
 * @param {object} page - Object exposing an `evaluate(fn)` method.
 * @returns {Promise<void>} Resolves once the in-page scrolling has finished.
 */
async function autoScroll(page) {
  await page.evaluate(async () => {
    const STEP_PX = 100;
    const TICK_MS = 120;

    await new Promise((done) => {
      let scrolled = 0;
      const ticker = setInterval(() => {
        // Read the height on every tick: it is sampled before this step's scroll.
        const pageHeight = document.body.scrollHeight;
        window.scrollBy(0, STEP_PX);
        scrolled += STEP_PX;

        if (scrolled >= pageHeight) {
          clearInterval(ticker);
          done();
        }
      }, TICK_MS);
    });
  });
}
/**
 * Crawler entry point.
 *
 * Opens the URL given as the first command-line argument in an emulated
 * iPhone 6, answers image requests with a small built-in PNG, keeps scrolling
 * the page and polls it until the footer reports that the end of the list was
 * reached. The collected image entries are then printed with returnCrawler()
 * and a screenshot is saved as kuaishou.png.
 *
 * Usage: node <script> <url>   (e.g. http://m.gifshow.com/s/U6kK7y0Q)
 */
(async () => {
  const targetUrl = process.argv[2];
  if (!targetUrl) {
    // Fail early instead of launching a browser and crashing in page.goto().
    console.error('Usage: node <script> <url>');
    process.exitCode = 1;
    return;
  }

  const browser = await puppeteer.launch({
    executablePath: '../chrome-win/chrome.exe',
    args: [
      // NOTE(review): the original comment here was a copy of the next line's;
      // presumably this is meant to skip image loading — confirm that Chromium
      // honors this switch.
      '--disable-images',
      '--disable-web-security', // allow cross-origin requests
      '--disable-infobars',
      '--start-maximized',
      //'-proxy-server=127.0.0.1:8888',
    ],
    headless: false,
    slowMo: 250, // slow every Puppeteer operation down by 250 ms
    ignoreDefaultArgs: ["--enable-automation"]
  });

  try {
    console.log(process.argv);
    const page = await browser.newPage();

    // Mask the usual automation fingerprints before any page script runs.
    await page.evaluateOnNewDocument(() => {
      Object.defineProperty(navigator, 'webdriver', {
        get: () => undefined,
      });
      // Bind to the Permissions object: calling the native method detached
      // from its receiver fails with "Illegal invocation".
      const originalQuery = window.navigator.permissions.query.bind(window.navigator.permissions);
      // Fix: this assignment used to be `return`ed, which made the
      // `languages` and `plugins` overrides below unreachable.
      window.navigator.permissions.query = (parameters) => (
        parameters.name === 'notifications' ?
          Promise.resolve({ state: Notification.permission }) :
          originalQuery(parameters)
      );
      Object.defineProperty(navigator, 'languages', {
        get: () => ['en-US', 'en'],
      });
      Object.defineProperty(navigator, 'plugins', {
        get: () => [1, 2, 3, 4, 5],
      });
    });

    await page.setRequestInterception(true);
    // Bytes of a small PNG (signed byte values; Buffer.from() wraps them to
    // 0-255) that is served in place of every intercepted image.
    const imageBytes = [-119, 80, 78, 71, 13, 10, 26, 10, 0, 0, 0, 13, 73, 72, 68, 82, 0, 0, 0, 64, 0,
      0, 0, 64, 8, 3, 0, 0, 0, -99, -73, -127, -20, 0, 0, 0, 4, 103, 65, 77, 65, 0, 0, -79, -113, 11, -4, 97,
      5, 0, 0, 0, 1, 115, 82, 71, 66, 0, -82, -50, 28, -23, 0, 0, 0, -64, 80, 76, 84, 69, 8, -111, -23, -124,
      -46, -1, -127, -50, -3, -125, -48, -1, 125, -52, -3, -123, -44, -1, 127, -50, -3, -1, -1, -1, -126, -50,
      -4, -121, -41, -1, -124, -49, -3, 13, -109, -22, 9, -111, -23, -116, -46, -4, 23, -103, -20, -121, -47,
      -3, 0, -116, -24, 107, -60, -6, 29, -100, -19, 114, -58, -6, 18, -106, -21, -125, -48, -2, 95, -67, -9,
      -105, -42, -4, -3, -2, -1, -48, -20, -3, 120, -54, -5, 41, -95, -18, -12, -5, -1, 81, -74, -11, -7, -3,
      -1, -95, -38, -3, -90, -37, -5, 35, -97, -18, 67, -81, -13, -126, -49, -3, 46, -92, -17, 90, -71, -11,
      -18, -8, -1, 5, -112, -23, 78, -77, -13, -71, -29, -3, -111, -44, -3, 123, -53, -4, 58, -86, -15, 51,
      -90, -17, 101, -66, -10, -36, -15, -2, -64, -26, -3, -81, -32, -3, 72, -79, -13, 55, -87, -15, -24,
      -11, -2, 58, -81, -5, 62, -83, -14, 65, -83, -15, -41, -17, -2, -58, -24, -3, -78, -33, -5, 118, -62,
      -12, -29, -13, -3, -115, -50, -9, -122, -55, -12, 42, -89, -7, 28, 71, -109, -109, 0, 0, 3, 94, 73, 68,
      65, 84, 88, -61, -19, 87, 91, 123, -94, 48, 16, 37, -102, -124, 112, -111, 59, 40, -32, 5, 11, 10, -118,
      -73, -74, -82, 90, -37, -18, -2, -1, 127, -75, -127, -86, 107, 77, 92, -30, -21, 126, 123, 30, -115, -25, 56, 57,
      51, -103, 25, 37, -23, 63, -2, -29, 31, -121, 110, -60, -117, -119, 11, -95, 27, 44, 98, 67, 127, -108, -83, -59,
      -123, 93, 2, 5, -7, -124, -8, 72, 1, -91, 61, -56, -76, 7, -24, -34, -50, -10, 107, -14, 25, 84, 4, -71, 47, -122,
      -24, -81, -17, 92, 5, -75, -56, 45, -128, 82, 62, 9, 69, 49, 94, 41, -128, 112, 1, 20, 123, -36, -52, 95, 98, -123,
      -36, 5, -62, -81, 77, -4, 39, 0, -56, 95, 0, -64, -66, -127, 127, 109, 29, 15, 62, -38, -3, -115, -65, 64, 41, 105,
      -128, -17, 103, -9, -7, 113, -22, -109, 70, -128, -24, 110, 58, 61, 23, 17, 1, 40, -63, -67, -70, 28, 40, 34, 124,
      -46, 66, 119, 82, 49, 38, -75, 1, 16, -30, 59, 76, 124, 58, 2, 46, -65, -96, 38, 85, 0, 88, 77, -114, 35, 62, 31,
      -50, -97, -97, -25, -75, -126, -78, -32, -15, 29, 88, 57, 104, 30, 100, 89, 14, -71, -4, -47, 80, -106, -5, 9, -84,
      -22, -55, -26, -123, -80, -81, 2, -128, 51, -7, 88, 76, -62, 25, 100, 5, 58, 31, 63, -117, -9, -2, -44, -84, 92, 0,
      49, -25, -7, -81, -22, 20, 76, 55, -110, 21, 127, 76, 85, 54, -128, -39, -54, -78, -34, -121, 114, 29, -126, 50, -32,
      -28, -80, -92, 55, -64, 115, 121, 98, -27, -53, -11, -90, -53, 24, -87, 38, -102, 100, -3, 60, 76, 55, -107, 0, -80,
      -39, 76, -114, 43, 7, 90, -35, 97, -106, -21, -71, 22, -113, 24, -127, -7, -69, -91, -21, -70, 118, 60, 84, 2, 126,
      -55, 22, -45, -94, -50, -63, 104, -22, -28, -70, -98, 123, 93, -26, 10, -35, -126, 10, 72, 86, 119, 93, -37, -61, 49,
      -31, -91, 22, 48, -41, 75, 43, -105, 114, -19, 76, 51, 77, -13, 100, 71, 103, 111, 73, 82, 110, 37, -57, -78, 78, -28,
      43, -73, 12, 97, 103, -76, -103, -19, 61, -55, 43, -66, -82, 109, 70, -63, 96, -30, 126, 73, -88, -10, 88, -41, -30,
      -49, 48, -103, 99, 122, 61, -27, -119, 91, 70, -101, 62, 45, -126, 94, -104, 116, 72, 68, 84, -107, -72, 123, 47, -73,
      44, -19, -43, 77, 85, 85, -123, 100, -98, -124, -43, -7, 48, -20, 96, -94, -68, 48, 2, 1, -62, 111, -14, 9, -61, -11,
      103, -80, 90, 77, -76, -36, 48, 60, -61, -77, -116, 96, 101, 7, -97, -21, -31, -7, -8, -61, 36, 74, -63, -42, 33, 30,
      -11, -28, 63, -40, -66, -123, -45, -107, -10, -125, -30, -105, -105, 108, 103, -57, -16, -6, -16, 13, -6, -82, -57,
      -44, 33, 12, -27, 91, 36, 3, 42, -80, 98, 63, -17, 119, 8, -54, 24, 11, -52, 41, -13, 69, -71, -9, 113, -40, -54, 28,
      -116, 48, -29, -30, 4, -51, -121, -78, 48, -98, 33, -29, 98, -127, -70, -113, 9, -36, -66, -24, -41, -121, 34, 24, 97,
      127, -52, 100, -127, -25, -63, 29, 12, -25, 126, 116, -101, 5, -35, 110, -121, -62, 2, 107, -84, 76, 56, 35, -27, 91,
      29, 52, 88, -128, -106, -100, -114, -122, 15, -126, -4, -87, -55, -42, 81, 85, -53, 64, 52, 15, 51, -56, -21, 72, -12,
      61, -61, 55, 33, 126, -65, -125, 57, -81, 89, -54, 34, 31, -117, -123, 16, 66, -30, -69, -52, -94, -80, 79, -85, -79,
      -66, 21, -77, -112, 54, 118, 120, -29, -30, 75, 61, -42, -95, 72, 38, 123, 117, -61, 5, -19, 111, -91, -8, -92, 124,
      77, -75, -115, -104, 5, -11, -104, 7, 87, 49, 100, 32, 61, -11, 126, 33, -127, -45, -104, -57, 23, 31, -116, 8, -112,
      -57, 5, -82, 6, -36, 101, -84, -105, 15, 9, 92, 58, -21, 56, 61, -17, 53, -16, -7, 33, -127, -45, 120, -47, -125, -53,
      94, -126, 19, -95, 66, -2, -77, -85, 20, -41, 1, -76, -38, 116, -126, 11, 8, 108, -23, 119, -37, -41, 33, -100, 28,
      -16, 65, -108, -30, 110, 79, -28, 45, 19, 18, -99, -74, -71, -54, 5, 47, -86, 55, 51, -32, -17, 98, -97, 116, -6, -51,
      2, -76, -89, -105, 78, -127, -38, -25, 68, 44, 107, 7, -108, 40, -109, -116, 50, 85, -73, 34, -107, 92, -19, 72, 59,
      92, -15, -46, -44, -111, 92, 64, 82, -124, 2, -121, 110, -23, 46, -128, 7, -111, 74, 70, 65, 101, -99, -83, -48, 123,
      -48, -34, 58, -64, 8, -37, 75, -3, 107, -70, 9, -44, -14, -10, 108, 62, -3, 87, 64, -105, 111, 58, -26, -99, -52, -47,
      47, 29, -95, 57, 13, 116, 67, 57, 119, 3, -51, -55, -66, -17, 25, 25, 106, 118, 113, -40, -59, 105, -53, -71, 34, -3,
      6, 71, 84, 119, -92, 72, -109, -15, -121, 0, 0, 0, 0, 73, 69, 78, 68, -82, 66, 96, -126];
    // Build the response body once instead of on every intercepted request.
    const placeholderImage = Buffer.from(imageBytes);

    page.on('request', (interceptedRequest) => {
      const requestUrl = interceptedRequest.url();
      const isImage = ['.png', '.jpg', '.PNG', '.JPG'].some((ext) => requestUrl.endsWith(ext));
      if (isImage) {
        // Answer image requests locally with the placeholder (an alternative
        // would be interceptedRequest.abort()).
        interceptedRequest.respond({
          status: 200,
          body: placeholderImage
        });
      } else {
        // Let everything else through unchanged.
        interceptedRequest.continue();
      }
    });

    await page.emulate(iPhone);
    await page.goto(targetUrl);
    // Inject jQuery; the in-page extraction below relies on `$`.
    await page
      .mainFrame()
      .addScriptTag({
        url: 'https://cdn.bootcss.com/jquery/3.2.0/jquery.min.js'
      });
    await page.waitFor(2000);

    // Press, drag upwards and release the mouse once.
    await page.mouse.move(126, 126);
    await page.mouse.down();
    await page.mouse.move(126, 19);
    await page.mouse.up();

    // `y` is 0 until the in-page scroll timer has been started, then 1.
    let y = 0;
    // Poll the page until it reports completion. There is no explicit delay
    // between polls; the `slowMo` launch option throttles each round trip.
    while (true) {
      const result = await page.evaluate((scrollFlag) => {
        console.log("加载完毕");
        let started = scrollFlag;
        if (started === 0) {
          // First poll only: start scrolling 100px every 200 ms. The timer is
          // never cleared; it keeps running for the lifetime of the page.
          let top = 0;
          setInterval(() => {
            console.log(window.scrollY);
            window.scrollTo(0, top += 100);
          }, 200);
          started = 1;
        }

        // Every field is initialised up front so the caller always receives a
        // list, even when the extraction below throws.
        const m_ReturnOBJ = { sMsg: "", y: started, bOK: false, m_MVList: [] };
        try {
          const m_lis = $('img[class="image-main"]');
          const name = $('div[class="user-info-name"]').text();
          const mytext = $('div[class="footer"]').text();
          // The footer shows this text once the whole list has been loaded.
          const bOver = mytext.indexOf("已经到底") !== -1;
          if (bOver) {
            for (let i = 0; i < m_lis.length; i++) {
              m_ReturnOBJ.m_MVList.push({
                mvurl: "",
                mvpic: $(m_lis[i]).attr('src'),
                nickname: name,
                type: "kuaishou",
                title: ""
              });
            }
            m_ReturnOBJ.bOK = true;
            // Fix: this was assigned to the misspelled key `sMgs`.
            m_ReturnOBJ.sMsg = mytext;
          }
        } catch (err) {
          // Report "done" on errors too so the polling loop cannot spin
          // forever; the error text travels back in sMsg.
          m_ReturnOBJ.bOK = true;
          m_ReturnOBJ.sMsg = err.message;
        }
        return m_ReturnOBJ;
      }, y);

      if (result.bOK) {
        returnCrawler(result.m_MVList);
        break;
      }
      y = result.y;
    }

    await page.screenshot({path: 'kuaishou.png'});
  } finally {
    // Always shut the browser down, including when a step above throws.
    await browser.close();
  }
})();
Puppeteer性能优化与执行速度提升
我们需要找到下面几种配置来提升速度:
如果将Dom解析和渲染放到同一进程,肯定能提升时间(进程上下文切换的时间)。对应的配置是 single-process
部分功能disable掉,比如GPU、Sandbox、插件等,减少内存的使用和相关计算。
如果启动Chromium时能绑定到某个CPU核上也能提升速度(单核上进行进程切换耗费的时间更少)。可惜没有找到对应的配置,官方文档写的是Chromium启动时会自动绑定CPU大核(ARM架构的CPU通常有大小核之分),依此推测Chromium启动时是会绑核的。(此处我并未验证)
最后配置如下:
// Final launch configuration: single-process mode plus disabled GPU/sandbox
// features, as discussed in the surrounding text.
// Fix: the switches were written with typographic quotes and an en dash
// (e.g. ‘–disable-gpu’), which is neither valid JavaScript nor a valid
// Chromium switch. They must be plain strings starting with two ASCII hyphens.
const browser = await puppeteer.launch({
  headless: true,
  args: [
    '--disable-gpu',
    '--disable-dev-shm-usage',
    '--disable-setuid-sandbox',
    '--no-first-run',
    '--no-sandbox',
    '--no-zygote',
    '--single-process',
  ],
});
Chromium 启动参数列表 文档中的配置项都可以尝试看看,我没有对所有选项做测试,但可以肯定存在某些选项能提升Chromium速度。
优化Chromium执行流程
接下来我们再单独优化Chromium对应的页面。我之前的文章中提过,如果每次请求都启动Chromium,再打开tab页,请求结束后再关闭tab页与浏览器。流程大致如下:
请求到达->启动Chromium->打开tab页->运行代码->关闭tab页->关闭Chromium->返回数据
真正运行代码的只是tab页面,理论上启动一个Chromium程序能运行成千上万的tab页,可不可以复用Chromium每次只打开一个tab页然后关闭呢?当然是可以的,Puppeteer提供了puppeteer.connect() 方法,可以连接到当前打开的浏览器。流程如下:
请求到达->连接Chromium->打开tab页->运行代码->关闭tab页->返回数据
代码如下:
// Browser pool shared by the request handler and init().
const MAX_WSE = 4; // number of browsers to launch
let WSE_LIST = []; // stores the browserWSEndpoint of every launched browser
init(); // hoisted function declared further down; it fills WSE_LIST asynchronously
/**
 * GET / — connect to a randomly chosen browser from the pool, open a tab,
 * take a screenshot of the local page and close the tab again.
 *
 * Fixes over the previous version:
 *  - failures are caught and answered with a 500 instead of leaving the
 *    request hanging with an unhandled rejection;
 *  - the tab is closed and the connection opened by puppeteer.connect() is
 *    released even when a step fails;
 *  - a 503 is returned while the pool has no endpoint in the chosen slot yet.
 */
app.get('/', async function (req, res) {
  const tmp = Math.floor(Math.random() * MAX_WSE);
  const browserWSEndpoint = WSE_LIST[tmp];
  if (!browserWSEndpoint) {
    res.status(503).send('Browser pool is not ready');
    return;
  }

  let browser;
  let page;
  try {
    browser = await puppeteer.connect({ browserWSEndpoint });
    page = await browser.newPage();
    await page.goto('file://code/screen/index.html');
    await page.setViewport({
      width: 600,
      height: 400
    });
    await page.screenshot({ path: 'example.png' });
    res.send('Hello World!');
  } catch (err) {
    console.error('screenshot request failed:', err);
    if (!res.headersSent) {
      res.status(500).send('Screenshot failed');
    }
  } finally {
    try {
      if (page) {
        await page.close();
      }
      if (browser) {
        // Only drops this connection; the pooled browser keeps running.
        await browser.disconnect();
      }
    } catch (cleanupErr) {
      console.error('cleanup failed:', cleanupErr);
    }
  }
});
/**
 * Launch MAX_WSE headless browsers one after another and store each
 * browser's WebSocket endpoint in WSE_LIST, then log the list.
 *
 * Fixes over the previous version: the endpoint is no longer written to an
 * implicit global `browserWSEndpoint`, and launch failures are logged instead
 * of surfacing as an unhandled rejection.
 *
 * @returns {Promise<void>} Settles once all launches were attempted. Existing
 *   callers that ignore the return value keep working.
 */
function init() {
  const launchAll = async () => {
    for (let i = 0; i < MAX_WSE; i += 1) {
      const browser = await puppeteer.launch({
        headless: true,
        args: [
          '--disable-gpu',
          '--disable-dev-shm-usage',
          '--disable-setuid-sandbox',
          '--no-first-run',
          '--no-sandbox',
          '--no-zygote',
          '--single-process'
        ]
      });
      WSE_LIST[i] = browser.wsEndpoint();
    }
    console.log(WSE_LIST);
  };

  return launchAll().catch((err) => {
    console.error('failed to launch the browser pool:', err);
  });
}
利用cluster优化Puppeteer
通常情况下我们会使用 .map() 搭配 Promise.all() 的方式并行处理异步,但是在使用Puppeteer批量截图时发现Promise.all会打开多个浏览器,导致机器性能急剧下降。
Promise.all() 并行处理
image
利用 Reduce 使多个Promise顺序执行
// Build one promise chain over all tasks: each doAnalyze() call (an async
// function) starts only after the previous one has fulfilled.
await tasks.reduce(
  (chain, url, idx) => chain.then(() => doAnalyze(url, idx)),
  Promise.resolve(),
);
场景:有40个URL,需要获取每个博客的首页截图
如果是Promise.all(),程序启动会同时打开20+的chromium浏览器,导致机器卡死。
使用reduce缓解了压力,但没充分利用多核性能
引入Cluster
// cluster_index.js 入口文件
const cluster = require('cluster');
// Entry point: pick the master or the worker implementation depending on the
// role of the current process, then run it.
(async () => {
  const run = cluster.isMaster
    ? require('./cluster_master')
    : require('./cluster_worker');

  try {
    await run();
  } catch (e) {
    // Print the error together with the stack trace of this call site.
    console.trace(e);
  }
})();
// cluster_master.js master进程分配任务
const cluster = require('cluster');
const numCPUs = require('os').cpus().length;
// List of tasks to process: one URL per screenshot job.
// The exported master function below removes entries from this array with
// splice() while handing them out to the workers.
let arr = [
'https://github.com/guoguoya',
'http://www.52cik.com',
'http://zhalice.com',
'https://www.yzqroom.cn',
'http://zxh.name',
'https://fogdong.github.io/',
'http://github.com/elsieyin',
'https://summer.tlb058.com',
'https://skymon4.cn',
'http://www.jiweiqing.cn',
'http://effect.im',
'http://dingkewz.com',
'http://xcdh.me',
'http://d2g.io',
'http://codingdemon.com',
'http://blog.leanote.com/dujuncheng',
'http://niexiaotao.com',
'http://zhengchengwen.com',
'http://blog.tophefei.com',
'https://zh-rocco.github.io',
'http://wangyn.net',
'http://dscdtc.ml',
'http://jweboy.github.io',
'http://www.wenghaoping.com',
'http://zhoujingchao.github.io',
'http://kyriejoshua.github.io/jo.github.io/',
'http://www.withyoufriends.com',
'http://if2er.com',
'https://github.com/zhou-yg',
// NOTE(review): the host below is just "github" — presumably github.com was intended; confirm.
'http://github/suoutsky',
'http://richardsleet.github.io',
'http://www.89io.com',
'https://guoshencheng.com',
'http://www.landluck.com.cn',
// NOTE(review): duplicate of the 'http://www.89io.com' entry two lines above.
'http://www.89io.com',
'http://myoungxue.top',
'https://github.com/Wangszzju',
'http://www.hacke2.cn',
'https://github.com/enochjs',
'https://i.jakeyu.top',
'http://muyunyun.cn',
];
module.exports = async () => {
// 每个 CPU 分配 N 个任务
const n = Math.floor(arr.length / numCPUs);
// 未分配的余数
const remainder = arr.length % numCPUs;
for (let i = 1; i <= numCPUs; i += 1) {
const tasks = arr.splice(0, n + (i > remainder ? 0 : 1));
// 将任务编号传递到 Cluster 内启动
cluster.fork({ tasks: JSON.stringify(tasks) });
}
cluster.on('exit', (worker) => {
console.log(`worker #${worker.id} PID:${worker.process.pid} died`);
});
cluster.on('error', (err) => {
console.log(`worker #${worker.id} PID ERROR: `, err);
});
};
// cluster_worker.js worker进程 完成任务
const cluster = require('cluster');
const puppeteer = require('puppeteer');
// This module must not be started directly: it only makes sense inside a
// process forked by the master.
// Fix: `cluster.worker` is undefined in the master process, so the previous
// `cluster.worker.id` threw a TypeError before process.exit() was reached.
if (cluster.isMaster) {
  console.log('----', 'cluster_worker must be forked by the master, not started directly');
  process.exit(0);
}
module.exports = async () => {
const env = process.env.tasks;
let tasks = [];
if (/^\[.*\]$/.test(env)) {
tasks = JSON.parse(env);
}
if (tasks.length === 0) {
console.log('????', tasks)
// 非法启动, 释放进程资源
process.exit(0);
}
console.log(`worker #${cluster.worker.id} PID:${process.pid} Start`);
await tasks.reduce((sequence, url, idx) => {
return sequence.then(() => {
return doAnalyze(url, idx);
});
}, Promise.resolve())
console.log(cluster.worker.id + ' 顺利完成');
process.exit(0);
};
/**
 * Open `url` in a fresh browser, wait four seconds and save a 1920x1080
 * screenshot to ./img_cluster/<workerId>-<i>.png.
 *
 * Errors are logged and swallowed so that one failing URL does not stop the
 * remaining tasks of this worker.
 *
 * Fixes over the previous version: the browser is closed (and the close is
 * awaited) even when a step throws, so failed URLs no longer leave browser
 * processes behind; the fixed delay no longer relies on the deprecated
 * page.waitFor().
 *
 * @param {string} url - Page to capture.
 * @param {number} i - Index of the task, used in the file name.
 * @returns {Promise<void>}
 */
async function doAnalyze(url, i) {
  let browser;
  try {
    browser = await puppeteer.launch({
      // A manually downloaded Chromium needs `executablePath`; by default the
      // copy bundled under node_modules/puppeteer is used.
      // Launch timeout in milliseconds.
      timeout: 30000,
      // Ignore HTTPS errors when visiting https pages.
      ignoreHTTPSErrors: true,
      // Do not open DevTools (when true, headless is always false).
      devtools: false,
      // Headless mode off: a visible browser window is opened.
      headless: false
    });
    const page = await browser.newPage();
    await page.setViewport({width: 1920, height: 1080});
    await page.goto(url);
    // Give the page four seconds to settle before capturing it.
    await new Promise((resolve) => setTimeout(resolve, 4000));
    console.log(cluster.worker.id, url, i, '截图中...');
    await page.screenshot({
      path: `./img_cluster/${cluster.worker.id}-${i}.png`,
      type: 'png',
      // `quality` only applies to jpg; `clip` and `fullPage` are mutually
      // exclusive options for capturing a region or the whole page.
    });
  } catch (error) {
    console.log(cluster.worker.id, url, i)
    console.log(error)
  } finally {
    if (browser) {
      try {
        await browser.close();
      } catch (closeError) {
        console.log(closeError)
      }
    }
  }
}
多个page轮询与多个browser轮询
为了性能,现有解决方案是初始化若干个browser,请求打过来时,直接在browserList中取一个browser实例使用。
作为对比,可以参考初始化一个browser,预先打开若干个page,请求打过来时,直接在pageList中取一个page实例使用。
node puppeteer拦截谷歌请求、设置浏览器响应-爬取电子书链接
一、拦截谷歌请求:
(1)拦截谷歌请求,启用后会激活 request.abort, request.continue 和 request.respond 方法。
await page.setRequestInterception(true);
(2)监听request事件,对请求做出操作
一旦启用请求拦截,每个请求都将停止,除非它继续,响应或中止
page.on('request',回调函数(request形参,包含上述方法))
如拦截谷歌广告:
// Enable request interception so that every request can be inspected.
await page.setRequestInterception(true);
// Called for every request; the parameter is the request object.
page.on('request', (interceptedRequest) => {
  // interceptedRequest.url() returns the request URL.
  const urlObj = Url.parse(interceptedRequest.url());
  // Some URLs (e.g. data: URLs) have no hostname; treat it as empty.
  const hostname = urlObj.hostname || '';
  // Block the Google ad host, or more broadly any host containing "google".
  // Fix: the two alternatives were joined with the Chinese word for "or",
  // which is not valid JavaScript.
  if (hostname === 'googleads.g.doubleclick.net' || hostname.indexOf('google') !== -1) {
    // Block the request.
    interceptedRequest.abort();
  } else {
    interceptedRequest.continue();
  }
});
二、如果爬取时间超时导致失败:
(1)通过延迟函数,将延迟时间增长
(2)使用page.setDefaultNavigationTimeout(0); 将浏览器响应时间改为无限长,默认为30秒
(3)在配置信息中设置,timeout=0; 效果和(2)一样
三、获取资源后关闭页面,减小性能消耗
page.close();
四、通过page.$/$$()返回的ElementHandle 获取元素属性
// Look the element up, get a handle to one of its properties, then read the
// property's value through the public jsonValue() API.
// Fix: the value used to be read from the private `_remoteObject` field,
// which is not part of Puppeteer's documented interface.
let xx = await page.$('选择器');
let xxx = await xx.getProperty('属性');
let xxxx = await xxx.jsonValue();
代码示例:
let puppeteer=require('puppeteer');
let axios =require('axios');
let Url=require('url');
let fs=require('fs');
let http='https://sobooks.cc/';
/**
 * Crawl https://sobooks.cc/: read the total page count, collect the book
 * links of list page 1, open every book's detail page and append
 * "<title>,<download url>" lines to a text file.
 *
 * Fixes over the previous version:
 *  - wait() passed `time` to the Promise constructor instead of setTimeout,
 *    so the staggering delay was never applied;
 *  - the page-count extraction used a bare `length`, which resolves to
 *    window.length in the page rather than the length of the text;
 *  - detail pages are now closed, and all pages are closed on errors too;
 *  - detail-page work is awaited and its errors are logged instead of being
 *    left as floating promises inside forEach;
 *  - the file write reports failures instead of always logging 'ok';
 *  - the link is read with jsonValue() instead of the private _remoteObject;
 *  - the browser is closed when the crawl finishes or fails.
 *
 * @returns {Promise<void>}
 */
async function run() {
  // Resolve with 'ok延迟' after `time` milliseconds.
  function wait(time) {
    return new Promise(function (resolve) {
      setTimeout(function () {
        resolve('ok延迟');
      }, time);
    });
  }

  const options = {
    headless: false,
    slowMo: 250,
    defaultViewport: {
      width: 1000,
      height: 800
    }
  };
  const browser = await puppeteer.launch(options);

  // Open a new tab in which requests to the Google ads host are aborted.
  async function newPageWithoutAds() {
    const page = await browser.newPage();
    await page.setRequestInterception(true);
    page.on('request', (interceptedRequest) => {
      const urlObj = Url.parse(interceptedRequest.url());
      if (urlObj.hostname === 'googleads.g.doubleclick.net') {
        interceptedRequest.abort();
      } else {
        interceptedRequest.continue();
      }
    });
    return page;
  }

  // Read the total number of list pages from the pagination footer.
  async function getAllNum() {
    const page = await newPageWithoutAds();
    try {
      await page.goto(http);
      return await page.$eval('.pagination li:last-child span', (ele) => {
        // Drop the two leading and two trailing characters around the number.
        const text = ele.innerText;
        return text.substring(2, text.length - 2).trim();
      });
    } finally {
      // Close the tab as soon as the value has been read.
      await page.close();
    }
  }

  // Collect all book links of list page `num` and visit each detail page.
  async function pageList(num) {
    const listUrl = 'https://sobooks.cc/page/' + num;
    const page = await newPageWithoutAds();
    // Wait for navigation without a time limit (the default is 30 seconds).
    page.setDefaultNavigationTimeout(0);

    let arr;
    try {
      await page.goto(listUrl);
      arr = await page.$$eval('.card .card-item .thumb-img>a', (ele) =>
        ele.map((item) => ({
          href: item.getAttribute('href'),
          title: item.getAttribute('title')
        }))
      );
    } finally {
      await page.close();
    }

    // Open the detail pages staggered by 300 ms each; a failing book is
    // logged and does not stop the others.
    await Promise.all(arr.map(async (item, index) => {
      await wait(300 * index);
      try {
        await getPageInfo(item);
      } catch (err) {
        console.error(item.title, err);
      }
    }));
  }

  // Open a book's detail page and append its download link to the file.
  async function getPageInfo(pageObj) {
    const page = await newPageWithoutAds();
    try {
      await page.goto(pageObj.href);
      const eleA = await page.$('.dltable tr:nth-child(3) a:last-child');
      if (!eleA) {
        throw new Error('download link not found: ' + pageObj.href);
      }
      const hrefHandle = await eleA.getProperty('href');
      const href = await hrefHandle.jsonValue();
      // The real target is carried in the `?url=` part of the link.
      const target = href.split('?url=')[1];
      if (target === undefined) {
        throw new Error('link has no ?url= part: ' + href);
      }
      const content = {
        title: pageObj.title,
        url: target + '\n'
      };
      console.log(pageObj.title);
      // Append one "<title>,<url>" line per book.
      await fs.promises.writeFile("C:/Users/10853/Desktop/爬虫电子书.txt", content.title + ',' + content.url, { flag: 'a' });
      console.log('ok');
    } finally {
      await page.close();
    }
  }

  try {
    // The page count is fetched as before; only list page 1 is crawled below.
    const pageNum = await getAllNum();
    await pageList(1);
  } finally {
    await browser.close();
  }
}

run().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});