您现在的位置是:网站首页> NodeJS
NodeJS框架里爬虫模块任务队列实现
- NodeJS
- 2023-08-23
- 458人已阅读
/**
* Created by Administrator on 2019-07-26.
*/
// Project helper module; this file uses it to register routes, send JSON responses and read the app arguments.
var baseCall=require("../../modes/baseCall");
// Number of crawler requests accepted by hookWeb whose result callback has not fired yet.
var processCnt=0;
// Upper bound for processCnt; hookWeb refuses new requests that would exceed it.
var processMax=10;
// Task queue class
/**
 * Runs asynchronous tasks while capping how many of them execute at the same time.
 * Tasks beyond the cap wait in FIFO order until a running task settles.
 */
class TaskQueue {
    /**
     * @param {number} concurrency - Maximum number of tasks allowed to run simultaneously.
     */
    constructor(concurrency) {
        this.concurrency = concurrency;
        this.running = 0;
        this.queue = [];
    }

    /**
     * Enqueues a task and starts it as soon as a slot is free.
     *
     * @param {Function} task - Called with no arguments; may return a promise.
     * @returns {Promise<void>} Resolves (without a value) once the task completes,
     *     or rejects with the error the task threw / rejected with.
     */
    runTask(task) {
        return new Promise((resolve, reject) => {
            this.queue.push(async () => {
                try {
                    await task();
                    resolve();
                } catch (error) {
                    reject(error);
                } finally {
                    // Release the slot and let the next waiting job start.
                    this.running--;
                    this.next();
                }
            });
            // next() performs the capacity check itself.
            this.next();
        });
    }

    /**
     * Starts queued jobs until the concurrency cap is reached or the queue is empty.
     */
    next() {
        while (this.running < this.concurrency && this.queue.length) {
            const job = this.queue.shift();
            // Claim the slot BEFORE invoking the job: a task that throws synchronously
            // runs its `finally` (running--, next()) inside this very call, and counting
            // it afterwards would let the nested next() start more jobs than allowed.
            this.running++;
            job().catch(error => console.error(error));
        }
    }
}
/*
// 创建一个任务队列,设置并发数为2
const queue = new TaskQueue(2);
// 示例任务函数
function simulateAsyncTask(id, duration) {
return new Promise(resolve => {
console.log(`开始执行任务 ${id}`);
setTimeout(() => {
console.log(`任务 ${id} 完成`);
resolve();
}, duration);
});
}
// 向任务队列中添加任务
queue.runTask(() => simulateAsyncTask(1, 3000));
queue.runTask(() => simulateAsyncTask(2, 2000));
queue.runTask(() => simulateAsyncTask(3, 1000));
*/
// Create the shared task queue with a concurrency of 1, so queued tasks run one at a time.
const queue = new TaskQueue(1);
/**
 * Queue task: runs one crawler script through runPuppeteer and sends its result
 * to the HTTP response held in ctx.res.
 *
 * @param {Object} ctx - Request context; ctx.res is handed to baseCall.ReturnJson.
 * @param {Object} params - Request parameters; params.jsfile and params.url are forwarded to runPuppeteer.
 * @param {Object} m_ReturnJson - Unused; kept so existing callers keep working.
 * @returns {Promise<void>} Resolves after the first result has been reported.
 */
function myTask(ctx,params,m_ReturnJson){
    return new Promise(resolve => {
        console.log(`开始执行任务`);
        // Only the first report counts: a second invocation would decrement processCnt
        // twice and try to answer the same HTTP request again.
        let reported = false;
        runPuppeteer(ctx, function (ctx, result) {
            if (reported) {
                return;
            }
            reported = true;
            console.log("HookWeb返回:", JSON.stringify(result));
            processCnt--;
            try {
                baseCall.ReturnJson(ctx.res, result);
            } finally {
                // Always release the queue slot, even if sending the response throws;
                // otherwise every later task would wait forever.
                resolve();
            }
        }, params.jsfile, params.url);
    });
}
/**
 * Route handler: validates the request and queues one crawler run.
 *
 * @param {Object} ctx - Request context; ctx.res is used later to send the crawler result.
 * @param {Object} params - Request parameters. Requires `jsfile` (crawler script name,
 *     without directory or extension) and `url` (page to crawl).
 * @returns {Object|undefined} An object `{bOK:false, sMsg}` when the request is rejected
 *     up front; undefined when the task was queued (the response is then sent
 *     asynchronously through baseCall.ReturnJson).
 */
function hookWeb(ctx,params) {
    console.log("params",params);
    console.log("获得程序参数:",baseCall.getAppArgv());
    var m_ReturnJson={};
    if(params.jsfile==undefined || params.jsfile=="")
    {
        m_ReturnJson.bOK=false;
        m_ReturnJson.sMsg="缺少参数jsfile";
        return m_ReturnJson;
    }
    if(params.url==undefined || params.url=="") {
        m_ReturnJson.bOK=false;
        m_ReturnJson.sMsg="缺少参数url";
        return m_ReturnJson;
    }
    // jsfile comes straight from the request and is later concatenated into the path of a
    // script executed with node, so only plain names are accepted (no "/", "\" or "..").
    if(!/^[\w-]+$/.test(String(params.jsfile))) {
        m_ReturnJson.bOK=false;
        m_ReturnJson.sMsg="参数jsfile不合法";
        return m_ReturnJson;
    }
    if(processCnt+1>processMax)
    {
        m_ReturnJson.bOK=false;
        m_ReturnJson.sMsg="启动蜘蛛超限,请稍候再试";
        return m_ReturnJson;
    }
    processCnt++;
    queue.runTask(() => myTask(ctx,params,m_ReturnJson)).catch(function (error) {
        // The task rejects only when it failed before reporting a result (e.g. the child
        // process could not be started), so the slot reserved above is still held and
        // the client has not been answered yet.
        console.error(error);
        processCnt--;
        baseCall.ReturnJson(ctx.res, {bOK:false, sMsg:String(error)});
    });
}
/**
 * Legacy route handler: validates the request and launches the crawler directly,
 * without going through the task queue.
 *
 * @param {Object} ctx - Request context; ctx.res receives the crawler result.
 * @param {Object} params - Request parameters; `jsfile` and `url` are required.
 * @returns {Object|undefined} `{bOK:false, sMsg}` when the request is rejected,
 *     otherwise undefined (the result is sent later via baseCall.ReturnJson).
 */
function hookWebOLD(ctx,params) {
    console.log("params",params);
    console.log("获得程序参数:",baseCall.getAppArgv());

    // Builds the rejection object returned to the caller.
    const refuse = (message) => ({bOK: false, sMsg: message});
    // A parameter counts as missing when it is absent or loosely equal to "".
    const isMissing = (value) => value == undefined || value == "";

    if (isMissing(params.jsfile)) {
        return refuse("缺少参数jsfile");
    }
    if (isMissing(params.url)) {
        return refuse("缺少参数url");
    }
    if (processCnt + 1 > processMax) {
        return refuse("启动蜘蛛超限,请稍候再试");
    }

    processCnt += 1;
    const onCrawlerDone = function (doneCtx, result) {
        console.log("HookWeb返回:", JSON.stringify(result));
        processCnt -= 1;
        baseCall.ReturnJson(doneCtx.res, result);
    };
    runPuppeteer(ctx, onCrawlerDone, params.jsfile, params.url);
}
/**
 * Debug helper: starts `notepad` with the same argument list runPuppeteer would pass
 * to node, and logs whatever the child process reports.
 *
 * @param {Object} ctx - Unused.
 * @param {Function} callBack - Unused.
 * @param {string} jsname - Script name; expanded to "crawlerjs/<jsname>.js".
 *     Any further arguments are appended to the child's argument list.
 */
function helloEXE(ctx,callBack,jsname){
    const childProcess = require('child_process');
    // First argument is the script path, followed by every extra argument after jsname.
    const extraArgs = Array.prototype.slice.call(arguments, 3);
    const childArgs = ['crawlerjs/'+jsname+".js"].concat(extraArgs);
    const options = {maxBuffer: 5000*1024};
    const logResult = (error, stdout, stderr) => {
        console.log(error);
        console.log(stdout);
        console.log(stderr);
    };
    const child = childProcess.execFile('notepad', childArgs, options, logResult);
    child.on("exit", () => console.log("exit"));
}
function runPuppeteer(ctx,callBack,jsname){
//helloEXE(ctx,callBack,jsname);
//return;
const execFile = require('child_process').execFile;
var jsArray=new Array();
jsArray[0]='crawlerjs/'+jsname+".js";
for(var i=3; i<arguments.length; i++){
jsArray[i-2]=arguments[i];
}
//const child = execFile('node', ['crawlerjs/kuaishou.js'], (error, stdout, stderr) => {
let maxSize=5000*1024;
const child = execFile('node', jsArray,{maxBuffer:maxSize},(error, stdout, stderr) => {
//const child = execFile('node', jsArray,(error, stdout, stderr) => {
if (error) {
console.log(error);
//throw error;
if(callBack!=null){
var m_ReturnJson={};
m_ReturnJson.bOK=false;
m_ReturnJson.sMsg=error.toString();
callBack(ctx,m_ReturnJson);
}
}
console.log(stdout);
var nPos1=stdout.indexOf("####");
if(nPos1!=-1){
var nPos2=stdout.indexOf("####",nPos1+4);
if(nPos2!=-1){
var m_ReturnJson={};
m_ReturnJson.bOK=true;
try {
var jsonString=stdout.substring(nPos1+4,nPos2);
console.log("jsonString:"+jsonString);
m_ReturnJson.m_ReturnOBJ=JSON.parse(jsonString);
}
catch(err){
m_ReturnJson.bOK=false;
m_ReturnJson.sMsg="解析数据错误:"+err;
}
callBack(ctx,m_ReturnJson);
}
}
});
child.on("exit", () => console.log("exit"));
}
exports.InitMode=function() {
baseCall.AddRouteForGet("netcrawler","hookWeb",{call:hookWeb,CanNoLogin:true});
baseCall.AddRouteForPost("netcrawler","hookWeb",{call:hookWeb,CanNoLogin:true});
}
//请求例子:http://1.1.1.1:3001/netcrawler/hookWeb.ajax?jsfile=tb&url=http://qq.com
jsfile.js文件内容:注意请求超时的处理代码
// Puppeteer launches and controls the Chrome instance used to load the target page.
const puppeteer = require('puppeteer');
// Device descriptor table shipped with puppeteer, indexed by device name.
const devices = require('puppeteer/DeviceDescriptors');
// Descriptor handed to page.emulate() before navigating.
const iPhone = devices['iPhone 6'];
/**
 * Prints the crawler result on stdout as JSON wrapped in "####" markers,
 * which is the framing the parent process searches for.
 *
 * @param {*} result - Value to serialize with JSON.stringify.
 */
function returnCrawler(result){
    const MARKER = "####";
    const payload = JSON.stringify(result);
    console.log(`${MARKER}${payload}${MARKER}`);
}
/**
 * Crawler entry point: opens the URL given as the first command-line argument in an
 * emulated iPhone 6 page, extracts the product image URL and prints the result with
 * returnCrawler. Exactly one result block is printed per run, and the browser is
 * closed on every path (success, extraction timeout, or error).
 */
(async () => {
    const browser = await puppeteer.launch({
        executablePath: '../chrome-win/chrome.exe',
        args: [
            // NOTE(review): presumably meant to skip image loading — confirm Chrome honors this flag.
            '--disable-images',
        ],
        headless: false
    });

    // Report only once: the timeout and the normal path can both try to report.
    let reported = false;
    const report = (result) => {
        if (reported) {
            return;
        }
        reported = true;
        returnCrawler(result);
    };

    // Close the browser only once; later calls reuse the same promise.
    let closing = null;
    const closeBrowser = () => {
        if (closing === null) {
            closing = browser.close().catch((err) => console.log("错误:", err));
        }
        return closing;
    };

    let timer = null;
    try {
        console.log(process.argv);
        const page = await browser.newPage();
        await page.emulate(iPhone);
        await page.goto(process.argv[2], {
            waitUntil: 'networkidle2' // continue only once the network has gone (almost) idle
        });
        try {
            await page.addScriptTag({
                url: 'https://cdn.bootcss.com/jquery/3.2.0/jquery.min.js'
            });
        }
        catch(er)
        {
            // Extraction below does not depend on jQuery, so a failed injection is only logged.
            console.log("错误:",er);
        }
        // Timeout handling: give the extraction 10 seconds, then report failure and shut down.
        timer = setTimeout(function(){
            var m_outOBJ={};
            m_outOBJ.bOK=false;
            m_outOBJ.pic="";
            report(m_outOBJ);
            closeBrowser();
        },10000);
        const result = await page.evaluate(() => {
            console.log("加载完毕");
            var m_ReturnOBJ={};
            try {
                var imgs=document.querySelectorAll('img.item-img');
                if(imgs.length>0) {
                    m_ReturnOBJ.pic = imgs[0].src;
                }else {
                    // Fallback selector; throws (and is caught below) when nothing matches.
                    var imgs=document.querySelectorAll('img[aria-label=商品主图]');
                    m_ReturnOBJ.pic = imgs[0].src;
                }
            }catch(err)
            {
                m_ReturnOBJ.pic=err;
                console.log(err);
            }
            return m_ReturnOBJ;
        });
        report(result);
    } catch (err) {
        // Navigation/evaluation failures (including the evaluate call aborted by the
        // timeout closing the browser) end up here instead of as an unhandled rejection.
        console.log("错误:", err);
        report({bOK: false, pic: ""});
    } finally {
        // Without clearing the timer a successful run would stay alive until it fires.
        clearTimeout(timer);
        await closeBrowser();
    }
})();
上一篇:实用代码下载
下一篇:Puppeteer专题