feat: 新增多个电力集团采购平台爬虫
- 新增4个电力集团采购平台爬虫:
  * 中国大唐集团电子商务平台 (CdtCrawler)
  * 大连能源采购平台 (CeicCrawler)
  * 华润守正采购交易平台 (SzecpCrawler)
- 更新 BidCrawlerService,集成新的爬虫到爬虫任务中
- 添加环境变量示例文件 .env.example,包含数据库和代理配置
- 优化 .env 文件,添加代理配置示例
- 为所有新爬虫添加完整的单元测试文件
- 使用与现有爬虫相同的反检测策略(人类行为模拟)
- 支持分页抓取,每个平台最多抓取5页数据
- 统一的错误处理机制,单个爬虫失败不影响其他爬虫执行
This commit is contained in:
@@ -4,6 +4,51 @@ import * as puppeteer from 'puppeteer';
|
||||
// Increase timeout to 120 seconds for manual inspection and slow sites
|
||||
jest.setTimeout(120000);
|
||||
|
||||
/**
 * Moves the mouse pointer to a series of random positions inside the page
 * viewport, pausing briefly after each move.
 *
 * Returns immediately, without touching the mouse, when `page.viewport()`
 * yields no viewport.
 *
 * @param page - Puppeteer page whose mouse is driven.
 */
async function simulateHumanMouseMovement(page: puppeteer.Page) {
  const bounds = page.viewport();
  if (!bounds) return;

  const pause = (ms: number) => new Promise(resolve => setTimeout(resolve, ms));

  // 5 to 9 moves in total.
  let remaining = 5 + Math.floor(Math.random() * 5);

  while (remaining > 0) {
    // Random target inside the viewport rectangle.
    const targetX = Math.floor(Math.random() * bounds.width);
    const targetY = Math.floor(Math.random() * bounds.height);

    // 10 to 29 intermediate steps so the pointer glides instead of jumping.
    const steps = 10 + Math.floor(Math.random() * 20);
    await page.mouse.move(targetX, targetY, { steps });

    // Rest for 100 ms up to (but not including) 500 ms before the next move.
    await pause(100 + Math.random() * 400);

    remaining -= 1;
  }
}
|
||||
|
||||
// 模拟人类滚动
|
||||
/**
 * Scrolls the page down in several random-sized smooth steps with a pause
 * after each one, then smooth-scrolls back to the top and waits one second.
 *
 * @param page - Puppeteer page in which the scrolling is evaluated.
 */
async function simulateHumanScrolling(page: puppeteer.Page) {
  const wait = (ms: number) => new Promise(resolve => setTimeout(resolve, ms));

  // 3 to 7 downward scrolls.
  const totalScrolls = 3 + Math.floor(Math.random() * 5);

  for (let done = 0; done !== totalScrolls; done += 1) {
    // Each scroll covers 100 to 499 px.
    const offset = 100 + Math.floor(Math.random() * 400);

    // The callback runs in the page context; the offset is passed as its argument.
    await page.evaluate((deltaY) => {
      window.scrollBy({ top: deltaY, behavior: 'smooth' });
    }, offset);

    // Rest for 500 ms up to (but not including) 1500 ms between scrolls.
    await wait(500 + Math.random() * 1000);
  }

  // Return to the top of the page.
  await page.evaluate(() => {
    window.scrollTo({ top: 0, behavior: 'smooth' });
  });
  await wait(1000);
}
|
||||
|
||||
describe('ChngCrawler Real Site Test', () => {
|
||||
let browser: puppeteer.Browser;
|
||||
|
||||
@@ -14,9 +59,20 @@ describe('ChngCrawler Real Site Test', () => {
|
||||
'--no-sandbox',
|
||||
'--disable-setuid-sandbox',
|
||||
'--disable-blink-features=AutomationControlled',
|
||||
'--window-size=1920,1080'
|
||||
'--window-size=1920,1080',
|
||||
"--disable-infobars",
|
||||
// "--headless=new",
|
||||
// '--disable-dev-shm-usage',
|
||||
// '--disable-accelerated-2d-canvas',
|
||||
// '--no-first-run',
|
||||
// '--no-zygote',
|
||||
// '--disable-gpu',
|
||||
// '--disable-features=VizDisplayCompositor',
|
||||
// '--disable-webgl',
|
||||
// '--disable-javascript',
|
||||
],
|
||||
defaultViewport: null
|
||||
|
||||
});
|
||||
});
|
||||
|
||||
@@ -24,7 +80,7 @@ describe('ChngCrawler Real Site Test', () => {
|
||||
if (browser) {
|
||||
// Keep open for a few seconds after test to see result
|
||||
await new Promise(r => setTimeout(r, 50000));
|
||||
// await browser.close();
|
||||
await browser.close();
|
||||
}
|
||||
});
|
||||
|
||||
@@ -33,6 +89,20 @@ describe('ChngCrawler Real Site Test', () => {
|
||||
Starting crawl for: ${ChngCrawler.name}`);
|
||||
console.log(`Target URL: ${ChngCrawler.url}`);
|
||||
|
||||
// 创建一个临时页面用于模拟人类行为
|
||||
const tempPage = await browser.newPage();
|
||||
await tempPage.setViewport({ width: 1920, height: 1080, deviceScaleFactor: 1 });
|
||||
|
||||
// 模拟人类鼠标移动
|
||||
console.log('Simulating human mouse movements...');
|
||||
await simulateHumanMouseMovement(tempPage);
|
||||
|
||||
// 模拟人类滚动
|
||||
console.log('Simulating human scrolling...');
|
||||
await simulateHumanScrolling(tempPage);
|
||||
|
||||
await tempPage.close();
|
||||
|
||||
const results = await ChngCrawler.crawl(browser);
|
||||
|
||||
console.log(`
|
||||
|
||||
Reference in New Issue
Block a user