feat: 添加华能集团电子商务平台爬虫功能

- 新增 ChngCrawler 爬虫类,支持华能集团电子商务平台招标信息抓取
- 更新 BidCrawlerService,集成 ChngCrawler 到爬虫任务中
- 添加代理配置支持,从环境变量读取代理设置
- 优化爬虫逻辑,支持多个爬虫并行执行
- 新增 ChngCrawler 的单元测试文件
- 改进错误处理,单个爬虫失败不影响其他爬虫执行
- 更新 chdtp_target.ts,添加代理认证支持
This commit is contained in:
dmy
2026-01-11 18:20:43 +08:00
parent d9105797f4
commit 07a7301968
5 changed files with 350 additions and 15 deletions

View File

@@ -0,0 +1,60 @@
import { ChngCrawler } from './chng_target';
import * as puppeteer from 'puppeteer';
// Raise Jest's default timeout to 120 seconds: the suite drives a real browser
// against a live site and deliberately pauses afterwards for manual inspection.
jest.setTimeout(120000);

/**
 * Live-site smoke test for ChngCrawler.
 *
 * Launches a visible (non-headless) browser, runs the crawler once, prints
 * every item it found, and sanity-checks the shape of the first result.
 *
 * NOTE(review): because the browser is launched with `headless: false`, this
 * suite presumably needs a display and network access — confirm it is excluded
 * from unattended CI runs.
 */
describe('ChngCrawler Real Site Test', () => {
  // How long the browser window stays open after the test so a person can
  // look at the final page state. Must stay below the Jest timeout above,
  // because the afterAll hook is subject to that timeout as well.
  const INSPECTION_DELAY_MS = 50000;

  let browser: puppeteer.Browser;

  beforeAll(async () => {
    browser = await puppeteer.launch({
      headless: false, // Run in non-headless mode
      args: [
        '--no-sandbox',
        '--disable-setuid-sandbox',
        '--disable-blink-features=AutomationControlled',
        '--window-size=1920,1080',
      ],
      defaultViewport: null,
    });
  });

  afterAll(async () => {
    // `browser` is unset if puppeteer.launch() failed in beforeAll.
    if (!browser) {
      return;
    }

    // Keep the window open for a while after the test to see the result...
    await new Promise<void>((resolve) => setTimeout(resolve, INSPECTION_DELAY_MS));

    // ...then always shut the browser down so the Chromium process does not
    // outlive the test run or keep Jest from exiting.
    await browser.close();
  });

  it('should visit the website and list all found bid information', async () => {
    console.log(`\nStarting crawl for: ${ChngCrawler.name}`);
    console.log(`Target URL: ${ChngCrawler.url}`);

    const results = await ChngCrawler.crawl(browser);

    // Validate the return value before touching it, so a bad result is
    // reported as a failed expectation instead of a TypeError while logging.
    expect(results).toBeDefined();
    expect(Array.isArray(results)).toBeTruthy();

    console.log(`\nSuccessfully found ${results.length} items:\n`);
    console.log('----------------------------------------');
    results.forEach((item, index) => {
      console.log(`${index + 1}. [${item.publishDate.toLocaleDateString()}] ${item.title}`);
      console.log(` Link: ${item.url}`);
      console.log('----------------------------------------');
    });

    if (results.length === 0) {
      // An empty list is only warned about, not failed: the live site may
      // legitimately be empty or may be showing a verification challenge.
      console.warn('Warning: No items found. Observe the browser window to see if content is loading or if there is a verification challenge.');
    } else {
      const firstItem = results[0];
      expect(firstItem.title).toBeTruthy();
      expect(firstItem.url).toMatch(/^https?:\/\//);
      expect(firstItem.publishDate).toBeInstanceOf(Date);
    }
  });
});