feat: 新增多个电力集团采购平台爬虫
- 新增4个电力集团采购平台爬虫: * 中国大唐集团电子商务平台 (CdtCrawler) * 大连能源采购平台 (CeicCrawler) * 华润守正采购交易平台 (SzecpCrawler) - 更新 BidCrawlerService,集成新的爬虫到爬虫任务中 - 添加环境变量示例文件 .env.example,包含数据库和代理配置 - 优化 .env 文件,添加代理配置示例 - 为所有新爬虫添加完整的单元测试文件 - 使用与现有爬虫相同的反检测策略(人类行为模拟) - 支持分页抓取,每个平台最多抓取5页数据 - 统一的错误处理机制,单个爬虫失败不影响其他爬虫执行
This commit is contained in:
61
src/crawler/services/ceic_target.spec.ts
Normal file
61
src/crawler/services/ceic_target.spec.ts
Normal file
@@ -0,0 +1,61 @@
|
||||
import { CeicCrawler } from './ceic_target';
|
||||
import * as puppeteer from 'puppeteer';
|
||||
|
||||
// Increase timeout to 120 seconds for manual inspection and slow sites
|
||||
jest.setTimeout(120000);
|
||||
|
||||
describe('CeicCrawler Real Site Test', () => {
|
||||
let browser: puppeteer.Browser;
|
||||
|
||||
beforeAll(async () => {
|
||||
browser = await puppeteer.launch({
|
||||
headless: false, // Run in non-headless mode
|
||||
args: [
|
||||
'--no-sandbox',
|
||||
'--disable-setuid-sandbox',
|
||||
'--disable-blink-features=AutomationControlled',
|
||||
'--window-size=1920,1080',
|
||||
'--disable-infobars',
|
||||
],
|
||||
defaultViewport: null
|
||||
});
|
||||
});
|
||||
|
||||
afterAll(async () => {
|
||||
if (browser) {
|
||||
// Keep open for a few seconds after test to see result
|
||||
await new Promise(r => setTimeout(r, 50000));
|
||||
await browser.close();
|
||||
}
|
||||
});
|
||||
|
||||
it('should visit website and list all found bid information', async () => {
|
||||
console.log(`
|
||||
Starting crawl for: ${CeicCrawler.name}`);
|
||||
console.log(`Target URL: ${CeicCrawler.url}`);
|
||||
|
||||
const results = await CeicCrawler.crawl(browser);
|
||||
|
||||
console.log(`
|
||||
Successfully found ${results.length} items:
|
||||
`);
|
||||
console.log('----------------------------------------');
|
||||
results.forEach((item, index) => {
|
||||
console.log(`${index + 1}. [${item.publishDate.toLocaleDateString()}] ${item.title}`);
|
||||
console.log(` Link: ${item.url}`);
|
||||
console.log('----------------------------------------');
|
||||
});
|
||||
|
||||
expect(results).toBeDefined();
|
||||
expect(Array.isArray(results)).toBeTruthy();
|
||||
|
||||
if (results.length === 0) {
|
||||
console.warn('Warning: No items found. Observe browser window to see if content is loading or if there is a verification challenge.');
|
||||
} else {
|
||||
const firstItem = results[0];
|
||||
expect(firstItem.title).toBeTruthy();
|
||||
expect(firstItem.url).toMatch(/^https?:\/\//);
|
||||
expect(firstItem.publishDate).toBeInstanceOf(Date);
|
||||
}
|
||||
});
|
||||
});
|
||||
Reference in New Issue
Block a user