feat: 新增多个电力集团采购平台爬虫

- 新增3个电力集团采购平台爬虫:
  * 中国大唐集团电子商务平台 (CdtCrawler)
  * 大连能源采购平台 (CeicCrawler)
  * 华润守正采购交易平台 (SzecpCrawler)
- 更新 BidCrawlerService,集成新的爬虫到爬虫任务中
- 添加环境变量示例文件 .env.example,包含数据库和代理配置
- 优化 .env 文件,添加代理配置示例
- 为所有新爬虫添加完整的单元测试文件
- 使用与现有爬虫相同的反检测策略(人类行为模拟)
- 支持分页抓取,每个平台最多抓取5页数据
- 统一的错误处理机制,单个爬虫失败不影响其他爬虫执行
This commit is contained in:
dmy
2026-01-11 22:34:38 +08:00
parent 044fd770f7
commit 6d626a0946
10 changed files with 833 additions and 4 deletions

View File

@@ -4,6 +4,51 @@ import * as puppeteer from 'puppeteer';
// Increase timeout to 120 seconds for manual inspection and slow sites
jest.setTimeout(120000);
/**
 * Drives the mouse along a short series of random paths, pausing after each
 * one, to imitate a human visitor.
 *
 * Does nothing when the page reports no viewport. Otherwise performs 5–9
 * moves; each one targets a random point inside the viewport, is interpolated
 * over 10–29 steps, and is followed by a pause of 100 ms up to (but not
 * including) 500 ms.
 *
 * @param page - Puppeteer page whose mouse is moved.
 */
async function simulateHumanMouseMovement(page: puppeteer.Page) {
  const area = page.viewport();
  if (!area) {
    return;
  }

  // Uniform random integer in [0, bound).
  const randomBelow = (bound: number) => Math.floor(Math.random() * bound);
  const pause = (ms: number) => new Promise(resolve => setTimeout(resolve, ms));

  let remaining = 5 + randomBelow(5);
  while (remaining-- > 0) {
    const targetX = randomBelow(area.width);
    const targetY = randomBelow(area.height);
    // More interpolation steps make the pointer path smoother.
    const steps = 10 + randomBelow(20);
    await page.mouse.move(targetX, targetY, { steps });
    // Random, non-integer pause between consecutive moves.
    await pause(100 + Math.random() * 400);
  }
}
/**
 * Scrolls the page down in a few irregular increments and then returns to
 * the top, imitating a human skimming the content.
 *
 * Performs 3–7 smooth scrolls of 100–499 px each, pausing 500 ms up to (but
 * not including) 1500 ms after every scroll. It then smooth-scrolls back to
 * the top and waits a fixed 1000 ms.
 *
 * @param page - Puppeteer page to scroll.
 */
async function simulateHumanScrolling(page: puppeteer.Page) {
  // Uniform random integer in [0, bound).
  const randomBelow = (bound: number) => Math.floor(Math.random() * bound);
  const pause = (ms: number) => new Promise(resolve => setTimeout(resolve, ms));

  const totalScrolls = 3 + randomBelow(5);
  let performed = 0;
  while (performed < totalScrolls) {
    const offset = 100 + randomBelow(400);
    // The callback runs in the page context, so the offset is passed as an argument.
    await page.evaluate((distance) => {
      window.scrollBy({ top: distance, behavior: 'smooth' });
    }, offset);
    // Random, non-integer pause between consecutive scrolls.
    await pause(500 + Math.random() * 1000);
    performed += 1;
  }

  // Return to the top of the page and wait a fixed second afterwards.
  await page.evaluate(() => {
    window.scrollTo({ top: 0, behavior: 'smooth' });
  });
  await pause(1000);
}
describe('ChngCrawler Real Site Test', () => {
let browser: puppeteer.Browser;
@@ -14,9 +59,20 @@ describe('ChngCrawler Real Site Test', () => {
'--no-sandbox',
'--disable-setuid-sandbox',
'--disable-blink-features=AutomationControlled',
'--window-size=1920,1080'
'--window-size=1920,1080',
"--disable-infobars",
// "--headless=new",
// '--disable-dev-shm-usage',
// '--disable-accelerated-2d-canvas',
// '--no-first-run',
// '--no-zygote',
// '--disable-gpu',
// '--disable-features=VizDisplayCompositor',
// '--disable-webgl',
// '--disable-javascript',
],
defaultViewport: null
});
});
@@ -24,7 +80,7 @@ describe('ChngCrawler Real Site Test', () => {
if (browser) {
// Keep open for a few seconds after test to see result
await new Promise(r => setTimeout(r, 50000));
// await browser.close();
await browser.close();
}
});
@@ -33,6 +89,20 @@ describe('ChngCrawler Real Site Test', () => {
Starting crawl for: ${ChngCrawler.name}`);
console.log(`Target URL: ${ChngCrawler.url}`);
// 创建一个临时页面用于模拟人类行为
const tempPage = await browser.newPage();
await tempPage.setViewport({ width: 1920, height: 1080, deviceScaleFactor: 1 });
// 模拟人类鼠标移动
console.log('Simulating human mouse movements...');
await simulateHumanMouseMovement(tempPage);
// 模拟人类滚动
console.log('Simulating human scrolling...');
await simulateHumanScrolling(tempPage);
await tempPage.close();
const results = await ChngCrawler.crawl(browser);
console.log(`