feat: comprehensively optimize the crawler system and data-processing capabilities
- Strengthen the data retry mechanism: crawlers that return zero records are automatically retried, improving data completeness
- Refine front-end filtering: the date filter now constrains only the start time, allowing more flexible data browsing
- Add a recent-data endpoint: /api/bids/recent returns the latest bid data from the past 30 days (see the sketch after this list)
- Improve the statistics display: the filtered result count is shown in real time for a better user experience
- Harden the logging system: the log directory is created automatically, avoiding startup errors
- Enhance the standalone scripts: use the custom logger, with better error handling and graceful shutdown
- Polish the main program: integrate the custom logging service for a unified log format
- Extend the npm scripts: add a web command for building the front end
- Improve the Datang (CdtCrawler) crawler: extend the wait timeout to 60 seconds to raise the page-load success rate
- Optimize data filtering: today's bids now use a dedicated endpoint for better performance
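
The /api/bids/recent endpoint itself does not appear in the diff below. As a rough idea of the shape it likely takes, here is a minimal NestJS sketch; the controller path, the 30-day window calculation, and the findRecent helper are assumptions for illustration, not code from this commit.

// Sketch only — illustrates the /api/bids/recent endpoint described above;
// findRecent is an assumed service method, not taken from this commit.
import { Controller, Get } from '@nestjs/common';
import { BidsService } from './bids.service';

@Controller('api/bids')
export class BidsController {
  constructor(private readonly bidsService: BidsService) {}

  // GET /api/bids/recent — bids published within the last 30 days
  @Get('recent')
  async getRecent() {
    const since = new Date(Date.now() - 30 * 24 * 60 * 60 * 1000);
    return this.bidsService.findRecent(since);
  }
}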
@@ -32,6 +32,9 @@ export class BidCrawlerService {
     // Aggregated crawl results
     const crawlResults: Record<string, { success: number; error?: string }> = {};
 
+    // Track crawlers that returned zero records, so they can be retried
+    const zeroDataCrawlers: any[] = [];
+
     // Read proxy configuration from environment variables
     const proxyHost = this.configService.get<string>('PROXY_HOST');
     const proxyPort = this.configService.get<string>('PROXY_PORT');
@@ -83,6 +86,11 @@ export class BidCrawlerService {
           // Record the number of successfully extracted items
           crawlResults[crawler.name] = { success: results.length };
 
+          // If the crawler returned zero records, queue it for a retry
+          if (results.length === 0) {
+            zeroDataCrawlers.push(crawler);
+          }
+
           for (const item of results) {
             await this.bidsService.createOrUpdate({
               title: item.title,
@@ -98,6 +106,45 @@ export class BidCrawlerService {
           crawlResults[crawler.name] = { success: 0, error: err.message };
         }
       }
+
+      // Retry the crawlers that returned zero records
+      if (zeroDataCrawlers.length > 0) {
+        this.logger.log(`Retrying ${zeroDataCrawlers.length} crawlers with zero data...`);
+
+        for (const crawler of zeroDataCrawlers) {
+          this.logger.log(`Retrying: ${crawler.name}`);
+
+          // Check whether the overall task has timed out
+          const elapsedTime = Date.now() - startTime;
+          if (elapsedTime > maxExecutionTime) {
+            this.logger.warn(`⚠️ Crawl task exceeded maximum execution time of 3 hours. Stopping retry...`);
+            this.logger.warn(`⚠️ Total elapsed time: ${Math.floor(elapsedTime / 1000 / 60)} minutes`);
+            break;
+          }
+
+          try {
+            const results = await crawler.crawl(browser);
+            this.logger.log(`Retry extracted ${results.length} items from ${crawler.name}`);
+
+            // Update the aggregated results
+            crawlResults[crawler.name] = { success: results.length };
+
+            for (const item of results) {
+              await this.bidsService.createOrUpdate({
+                title: item.title,
+                url: item.url,
+                publishDate: item.publishDate,
+                source: crawler.name,
+                unit: '',
+              });
+            }
+          } catch (err) {
+            this.logger.error(`Error retrying ${crawler.name}: ${err.message}`);
+            // Record the error for this crawler
+            crawlResults[crawler.name] = { success: 0, error: err.message };
+          }
+        }
+      }
     } catch (error) {
       this.logger.error(`Crawl task failed: ${error.message}`);
     } finally {
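The timeout guard in the retry loop above references startTime and maxExecutionTime, which are presumably defined earlier in the method and not shown in this hunk. A minimal sketch of that setup, assuming values consistent with the 3-hour limit mentioned in the warning messages:

// Assumed setup for the timeout guard above (not part of this diff):
const startTime = Date.now();                  // captured when the crawl task starts
const maxExecutionTime = 3 * 60 * 60 * 1000;   // 3 hours, per the warning message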
@@ -89,7 +89,7 @@ export const CdtCrawler = {
     await page.waitForFunction(() => {
       const tabs = Array.from(document.querySelectorAll('span.notice-tab'));
       return tabs.some(tab => tab.textContent && tab.textContent.includes('招标公告'));
-    }, { timeout: 30000 });
+    }, { timeout: 60000 });
 
     await page.evaluate(() => {
       const tabs = Array.from(document.querySelectorAll('span.notice-tab'));