您现在的位置是:网站首页> C#技术
框架使用爬虫
- C#技术
- 2021-04-30
- 769人已阅读
摘要
爬虫设置位置
获得爬虫地址
// Obtain the address of a crawler service for this request context.
string neturl = GetOneNetcrawler(ctx);
XNWindowHttpClient m_XNWindowHttpClient = new XNWindowHttpClient();
// Call the hookweb endpoint: jsfile=tb selects the crawler script to run and
// url carries the page to scrape; the response text is delivered through the
// out parameter jsonString.
// NOTE(review): sLink is concatenated into the query string unescaped — if it
// can contain '&', '#' or '?', confirm whether GetUrl encodes it or whether the
// caller should escape it first.
m_XNWindowHttpClient.GetUrl(neturl+"/netcrawler/hookweb.ajax?jsfile=tb&url=" + sLink, "", out jsonString);
参数说明:jsfile 为要运行的 js 文件名(不含 .js 后缀,对应 crawlerjs 目录下的文件);url 为要抓取的地址;jsonString 为输出的 JSON 文本。
/**
* Created by Administrator on 2019-07-26.
*/
var baseCall=require("../../modes/baseCall");
/**
 * Route handler for the crawler endpoint.
 *
 * Checks that both `jsfile` (crawler script name) and `url` (page to scrape)
 * were supplied, then starts the crawler through runPuppeteer. The crawl
 * result is delivered later via baseCall.ReturnJson on ctx.res.
 *
 * @param {object} ctx - Request context; its `res` member is handed to baseCall.ReturnJson.
 * @param {object} params - Request parameters; `jsfile` and `url` are read.
 * @returns {{bOK: boolean, sMsg: string}|undefined} A failure object when a
 *   parameter is missing; otherwise undefined (the reply is sent asynchronously).
 */
function hookWeb(ctx, params) {
  console.log("params", params);
  console.log("获得程序参数:", baseCall.getAppArgv());

  // Loose equality is kept on purpose so null/undefined and empty-like values
  // are treated exactly as before.
  const isMissing = (value) => value == undefined || value == "";
  const failure = (message) => ({ bOK: false, sMsg: message });

  if (isMissing(params.jsfile)) {
    return failure("缺少参数jsfile");
  }
  if (isMissing(params.url)) {
    return failure("缺少参数url");
  }

  // Forward the crawler's result to the HTTP response once it is available.
  const sendResult = (requestCtx, crawlResult) => {
    console.log("HookWeb返回:", JSON.stringify(crawlResult));
    baseCall.ReturnJson(requestCtx.res, crawlResult);
  };

  runPuppeteer(ctx, sendResult, params.jsfile, params.url);
}
function runPuppeteer(ctx,callBack,jsname){
const execFile = require('child_process').execFile;
var jsArray=new Array();
jsArray[0]='crawlerjs/'+jsname+".js";
for(var i=3; i<arguments.length; i++){
jsArray[i-2]=arguments[i];
}
//const child = execFile('node', ['crawlerjs/kuaishou.js'], (error, stdout, stderr) => {
let maxSize=5000*1024;
const child = execFile('node', jsArray,{maxBuffer:maxSize},(error, stdout, stderr) => {
if (error) {
//throw error;
if(callBack!=null){
var m_ReturnJson={};
m_ReturnJson.bOK=false;
m_ReturnJson.sMsg=error.toString();
callBack(ctx,m_ReturnJson);
}
}
console.log(stdout);
var nPos1=stdout.indexOf("####");
if(nPos1!=-1){
var nPos2=stdout.indexOf("####",nPos1+4);
if(nPos2!=-1){
var m_ReturnJson={};
m_ReturnJson.bOK=true;
try {
var jsonString=stdout.substring(nPos1+4,nPos2);
console.log("jsonString:"+jsonString);
m_ReturnJson.m_ReturnOBJ=JSON.parse(jsonString);
}
catch(err){
m_ReturnJson.bOK=false;
m_ReturnJson.sMsg="解析数据错误:"+err;
}
callBack(ctx,m_ReturnJson);
}
}
});
child.on("exit", () => console.log("exit"));
}
exports.InitMode=function() {
baseCall.AddRouteForGet("netcrawler","hookWeb",{call:hookWeb,CanNoLogin:true});
baseCall.AddRouteForPost("netcrawler","hookWeb",{call:hookWeb,CanNoLogin:true});
crawlerjs 目录下的 js 文件
下面给出一个典型的 tb.js 文件:
/**
* Created by Administrator on 2019-09-06.
*/
const puppeteer = require('puppeteer');
const devices = require('puppeteer/DeviceDescriptors');
const iPhone = devices['iPhone 6'];
/**
 * Prints the crawl result on stdout as JSON wrapped in a pair of "####"
 * markers, so that a reader of the output can locate it among other log lines.
 *
 * @param {*} result - Value to serialise with JSON.stringify.
 */
function returnCrawler(result) {
  const payload = JSON.stringify(result);
  console.log(`####${payload}####`);
}
/**
 * Crawler entry point.
 *
 * Launches Chrome, opens the URL given as the first script argument
 * (process.argv[2]) while emulating the configured mobile device, extracts the
 * product image URL in the page context, and prints the result through
 * returnCrawler. The browser is closed on every path; any failure is logged
 * and turned into a non-zero exit code.
 */
(async () => {
  const browser = await puppeteer.launch({
    executablePath: '../chrome-win/chrome.exe',
    // Alternative launch flags kept for reference:
    //   '--disable-web-security'          allow cross-origin requests
    //   '--proxy-server=127.0.0.1:1080'   route traffic through a proxy
    args: [
      // NOTE(review): the original comment here read "allow cross-origin",
      // which describes --disable-web-security, not this flag. Presumably the
      // intent is to skip image loading — confirm Chrome honours this switch.
      '--disable-images',
    ],
    headless: false
  });

  try {
    console.log(process.argv);
    const page = await browser.newPage();
    await page.emulate(iPhone);
    await page.goto(process.argv[2], {
      waitUntil: 'networkidle2' // continue only once the network is considered idle
    });

    // Injecting jQuery is best effort: a failure is logged and the crawl continues.
    try {
      await page.addScriptTag({
        url: 'https://cdn.bootcss.com/jquery/3.2.0/jquery.min.js'
      });
    } catch (er) {
      console.log("错误:", er);
    }

    // Everything inside this callback runs in the page, not in Node.
    const result = await page.evaluate(() => {
      console.log("加载完毕");
      const m_ReturnOBJ = {};
      try {
        let imgs = document.querySelectorAll('img.item-img');
        if (imgs.length === 0) {
          // Fall back to the image labelled as the main product picture.
          // NOTE(review): if this also matches nothing, imgs[0].src throws and
          // the catch below stores the error object in `pic`.
          imgs = document.querySelectorAll('img[aria-label=商品主图]');
        }
        m_ReturnOBJ.pic = imgs[0].src;
      } catch (err) {
        m_ReturnOBJ.pic = err;
        console.log(err);
      }
      return m_ReturnOBJ;
    });

    returnCrawler(result);
  } finally {
    // Always release Chrome, even when navigation or evaluation failed;
    // awaiting lets the browser shut down before the process ends.
    await browser.close();
  }
})().catch((err) => {
  // Report the failure and exit non-zero so the parent process sees an error
  // instead of waiting for output that will never come.
  console.error("错误:", err);
  process.exitCode = 1;
});
部署位置