feat: 新增中核集团电子采购平台爬虫

- 新增 CnncecpCrawler 爬虫类,支持中核集团电子采购平台招标信息抓取
- 目标平台:https://www.cnncecp.com,专注于核工业领域采购招标
- 使用正则表达式提取招标公告,支持标题、URL和发布日期解析
- 添加完整的人类行为模拟功能(鼠标移动、滚动)降低检测风险
- 支持分页抓取,最多抓取5页数据,使用 index_ 模式进行翻页
- 添加针对真实站点的集成测试文件(需联网并启动 Puppeteer 浏览器),用于验证爬虫功能正常
- 统一的错误处理机制,单个爬虫失败不影响整体任务执行
This commit is contained in:
dmy
2026-01-11 23:32:09 +08:00
parent 74a4aec363
commit bf17587bd3
3 changed files with 247 additions and 1 deletions

View File

@@ -0,0 +1,51 @@
import { CnncecpCrawler } from './cnncecp_target';
import * as puppeteer from 'puppeteer';
// Raise the Jest timeout to 5 minutes (300000 ms) to leave room for slow network operations.
jest.setTimeout(5 * 60 * 1000);
/**
 * Real-site test suite for CnncecpCrawler.
 *
 * One Puppeteer browser is launched before the suite and closed afterwards.
 * The single test runs the crawler against that browser, prints every item
 * it returned, and sanity-checks the shape of the first item.
 */
describe('CnncecpCrawler Real Site Test', () => {
  let browser: puppeteer.Browser;

  beforeAll(async () => {
    browser = await puppeteer.launch({
      // Runs with a visible browser window; set to true to run without UI.
      headless: false,
      args: ['--no-sandbox', '--disable-setuid-sandbox'],
    });
  });

  afterAll(async () => {
    // If launch failed, there is no browser to shut down.
    if (!browser) {
      return;
    }
    await browser.close();
  });

  it('should visit website and list all found bid information', async () => {
    const separator = '----------------------------------------';

    console.log(`\nStarting crawl for: ${CnncecpCrawler.name}`);
    console.log(`Target URL: ${CnncecpCrawler.url}`);

    const results = await CnncecpCrawler.crawl(browser);

    // Dump everything that was found so a human can eyeball the output.
    console.log(`\nSuccessfully found ${results.length} items:\n`);
    console.log(separator);
    for (const [position, bid] of results.entries()) {
      console.log(`${position + 1}. [${bid.publishDate.toLocaleDateString()}] ${bid.title}`);
      console.log(` Link: ${bid.url}`);
      console.log(separator);
    }

    // Minimal checks that the crawler returned a usable array.
    expect(results).toBeDefined();
    expect(Array.isArray(results)).toBeTruthy();

    // An empty list only produces a warning: the site may genuinely have
    // no entries, or its structure may have changed.
    if (results.length === 0) {
      console.warn('Warning: No items found. Check if website structure has changed or if list is currently empty.');
      return;
    }

    // Verify data integrity of the first item only.
    const { title, url, publishDate } = results[0];
    expect(title).toBeTruthy();
    expect(url).toMatch(/^https?:\/\//);
    expect(publishDate).toBeInstanceOf(Date);
  });
});