feat: comprehensively optimize the crawler system and data processing

- Enhance the data retry mechanism: crawlers that return zero items are automatically retried, improving data completeness
- Improve front-end filtering: the date filter now constrains only the start date, allowing more flexible data viewing
- Add a recent-data endpoint: new /api/bids/recent returns the latest bid data from the past 30 days (see the request sketch after this list)
- Improve the statistics display: show the filtered result count in real time for a better user experience
- Harden the logging system: the log directory is now created automatically, avoiding startup errors
- Enhance the standalone script: use the custom logger, and improve error handling and shutdown
- Optimize the main program: integrate the custom logging service for a unified log format
- Extend the npm scripts: add a web command for building the front end
- Improve the CDT (China Datang) crawler: raise the wait timeout to 60 seconds for a higher page-load success rate
- Optimize data filtering: "today's bids" now uses a dedicated endpoint for better performance
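
A minimal request sketch for the new endpoint (the base URL, port, and response shape are assumptions, not part of this commit):

    // Hypothetical client call; assumes the API listens on localhost:3000
    async function fetchRecentBids() {
      const res = await fetch('http://localhost:3000/api/bids/recent');
      if (!res.ok) throw new Error(`GET /api/bids/recent failed: ${res.status}`);
      // Field names inferred from what the crawler saves (see the bid-crawler diff below)
      return res.json() as Promise<Array<{ title: string; url: string; publishDate: string; source: string }>>;
    }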
dmy
2026-01-12 12:28:37 +08:00
parent 3e6456e120
commit 1b28a3462a
10 changed files with 104 additions and 39 deletions

View File

@@ -10,6 +10,11 @@ export class BidsController {
     return this.bidsService.findAll(query);
   }
 
+  @Get('recent')
+  getRecent() {
+    return this.bidsService.getRecentBids();
+  }
+
   @Get('high-priority')
   getHighPriority() {
     return this.bidsService.getHighPriorityCorrected();

View File

@@ -1,6 +1,6 @@
 import { Injectable } from '@nestjs/common';
 import { InjectRepository } from '@nestjs/typeorm';
-import { Repository, LessThan } from 'typeorm';
+import { Repository, LessThan, MoreThanOrEqual } from 'typeorm';
 import { BidItem } from '../entities/bid-item.entity';
 
 @Injectable()
@@ -75,4 +75,16 @@ export class BidsService {
       .getRawMany();
     return result.map((item: any) => item.source);
   }
+
+  async getRecentBids() {
+    const thirtyDaysAgo = new Date();
+    thirtyDaysAgo.setDate(thirtyDaysAgo.getDate() - 30);
+    thirtyDaysAgo.setHours(0, 0, 0, 0);
+
+    return this.bidRepository
+      .createQueryBuilder('bid')
+      .where('bid.publishDate >= :thirtyDaysAgo', { thirtyDaysAgo })
+      .orderBy('bid.publishDate', 'DESC')
+      .getMany();
+  }
 }
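
Note: MoreThanOrEqual is imported above but getRecentBids uses the query builder. An equivalent method sketch inside BidsService using the repository find() API (which is presumably why the import was added) would be:

    // Sketch only: the same 30-day window expressed via find() and MoreThanOrEqual
    async getRecentBidsAlt() {
      const thirtyDaysAgo = new Date();
      thirtyDaysAgo.setDate(thirtyDaysAgo.getDate() - 30);
      thirtyDaysAgo.setHours(0, 0, 0, 0); // midnight, server-local time
      return this.bidRepository.find({
        where: { publishDate: MoreThanOrEqual(thirtyDaysAgo) },
        order: { publishDate: 'DESC' },
      });
    }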

View File

@@ -1,9 +1,15 @@
 import * as winston from 'winston';
 import DailyRotateFile from 'winston-daily-rotate-file';
 import * as path from 'path';
+import * as fs from 'fs';
 
 const logDir = path.join(process.cwd(), 'logs');
 
+// Ensure the log directory exists
+if (!fs.existsSync(logDir)) {
+  fs.mkdirSync(logDir, { recursive: true });
+}
+
 // Log format
 const logFormat = winston.format.combine(
   winston.format.timestamp({ format: 'YYYY-MM-DD HH:mm:ss' }),
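
The hunk is cut off above; a minimal sketch of how logDir and logFormat typically feed the transports (the file name pattern, level, and retention below are assumptions, not part of this commit):

    // Sketch only: rotating file transport plus console, reusing logDir and logFormat
    const fileTransport = new DailyRotateFile({
      dirname: logDir,            // the directory ensured above
      filename: 'app-%DATE%.log', // assumed file name pattern
      datePattern: 'YYYY-MM-DD',
      maxFiles: '14d',            // assumed retention
    });

    const winstonLogger = winston.createLogger({
      level: 'info',
      format: logFormat,
      transports: [fileTransport, new winston.transports.Console()],
    });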

View File

@@ -32,6 +32,9 @@ export class BidCrawlerService {
       // Aggregate results
       const crawlResults: Record<string, { success: number; error?: string }> = {};
 
+      // Track crawlers that returned zero items, for retrying
+      const zeroDataCrawlers: any[] = [];
+
       // Read proxy configuration from environment variables
       const proxyHost = this.configService.get<string>('PROXY_HOST');
       const proxyPort = this.configService.get<string>('PROXY_PORT');
@@ -83,6 +86,11 @@ export class BidCrawlerService {
           // Record the success count
           crawlResults[crawler.name] = { success: results.length };
 
+          // If no items came back, queue this crawler for a retry
+          if (results.length === 0) {
+            zeroDataCrawlers.push(crawler);
+          }
+
           for (const item of results) {
             await this.bidsService.createOrUpdate({
               title: item.title,
@@ -98,6 +106,45 @@ export class BidCrawlerService {
          crawlResults[crawler.name] = { success: 0, error: err.message };
        }
      }
+
+      // Retry the crawlers that returned zero items
+      if (zeroDataCrawlers.length > 0) {
+        this.logger.log(`Retrying ${zeroDataCrawlers.length} crawlers with zero data...`);
+
+        for (const crawler of zeroDataCrawlers) {
+          this.logger.log(`Retrying: ${crawler.name}`);
+
+          // Check whether the overall time budget has been exceeded
+          const elapsedTime = Date.now() - startTime;
+          if (elapsedTime > maxExecutionTime) {
+            this.logger.warn(`⚠️ Crawl task exceeded maximum execution time of 3 hours. Stopping retry...`);
+            this.logger.warn(`⚠️ Total elapsed time: ${Math.floor(elapsedTime / 1000 / 60)} minutes`);
+            break;
+          }
+
+          try {
+            const results = await crawler.crawl(browser);
+            this.logger.log(`Retry extracted ${results.length} items from ${crawler.name}`);
+
+            // Update the aggregated results
+            crawlResults[crawler.name] = { success: results.length };
+
+            for (const item of results) {
+              await this.bidsService.createOrUpdate({
+                title: item.title,
+                url: item.url,
+                publishDate: item.publishDate,
+                source: crawler.name,
+                unit: '',
+              });
+            }
+          } catch (err) {
+            this.logger.error(`Error retrying ${crawler.name}: ${err.message}`);
+            // Record the error
+            crawlResults[crawler.name] = { success: 0, error: err.message };
+          }
+        }
+      }
    } catch (error) {
      this.logger.error(`Crawl task failed: ${error.message}`);
    } finally {
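
One observation on the retry pass: the crawl-and-save loop is now duplicated verbatim between the first pass and the retry pass. A possible refactor (a sketch, assuming the surrounding class members; not part of this commit) is a shared helper both passes could call:

    // Hypothetical helper that de-duplicates the crawl-and-save logic
    private async crawlAndSave(crawler: any, browser: any): Promise<number> {
      const results = await crawler.crawl(browser);
      for (const item of results) {
        await this.bidsService.createOrUpdate({
          title: item.title,
          url: item.url,
          publishDate: item.publishDate,
          source: crawler.name,
          unit: '',
        });
      }
      return results.length;
    }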

View File

@@ -89,7 +89,7 @@ export const CdtCrawler = {
     await page.waitForFunction(() => {
       const tabs = Array.from(document.querySelectorAll('span.notice-tab'));
       return tabs.some(tab => tab.textContent && tab.textContent.includes('招标公告'));
-    }, { timeout: 30000 });
+    }, { timeout: 60000 });
 
     await page.evaluate(() => {
       const tabs = Array.from(document.querySelectorAll('span.notice-tab'));
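
If the notice tabs still have not rendered after 60 seconds, waitForFunction rejects and the crawl for this source fails outright. A hedged alternative (a sketch, not what this commit does) would treat the timeout as a zero-result crawl, which the retry pass above would then pick up:

    try {
      await page.waitForFunction(() => {
        const tabs = Array.from(document.querySelectorAll('span.notice-tab'));
        return tabs.some(tab => tab.textContent && tab.textContent.includes('招标公告'));
      }, { timeout: 60000 });
    } catch {
      // Sketch: give up on this source instead of failing the whole run;
      // returning zero items lets the zero-data retry pass try it again
      return [];
    }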

View File

@@ -1,8 +1,14 @@
 import { NestFactory } from '@nestjs/core';
 import { AppModule } from './app.module';
+import { CustomLogger } from './common/logger/logger.service';
 
 async function bootstrap() {
   const app = await NestFactory.create(AppModule);
+
+  // Use the custom logging service
+  const logger = await app.resolve(CustomLogger);
+  app.useLogger(logger);
+
   await app.listen(process.env.PORT ?? 3000);
 }
 bootstrap();
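
One caveat: anything logged before app.useLogger() runs still goes through Nest's default console logger. If that matters, Nest can buffer early logs until a logger is attached (a sketch, not part of this commit):

    // Buffer framework logs emitted during module init, then flush to the custom logger
    const app = await NestFactory.create(AppModule, { bufferLogs: true });
    app.useLogger(await app.resolve(CustomLogger));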

View File

@@ -1,14 +1,18 @@
 import { NestFactory } from '@nestjs/core';
 import { AppModule } from '../app.module';
 import { BidCrawlerService } from '../crawler/services/bid-crawler.service';
-import { Logger } from '@nestjs/common';
+import { CustomLogger } from '../common/logger/logger.service';
 
 async function runCrawler() {
-  const logger = new Logger('CrawlScript');
+  const app = await NestFactory.createApplicationContext(AppModule);
+
+  // Attach the custom logger so NestJS framework logs are also written to file
+  const logger = await app.resolve(CustomLogger);
+  app.useLogger(logger);
+  logger.setContext('CrawlScript');
 
   try {
-    const app = await NestFactory.createApplicationContext(AppModule);
-    const crawlerService = app.get(BidCrawlerService);
+    const crawlerService = await app.resolve(BidCrawlerService);
 
     logger.log('Starting crawler...');
     await crawlerService.crawlAll();
@@ -18,6 +22,7 @@ async function runCrawler() {
     process.exit(0);
   } catch (error) {
     logger.error('Crawler failed:', error);
+    await app.close();
     process.exit(1);
   }
 }
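
Since app.close() must now run on both the success and the failure path, a finally block would guarantee the cleanup (a sketch with the same shape as the code above, not part of this commit):

    try {
      const crawlerService = await app.resolve(BidCrawlerService);
      logger.log('Starting crawler...');
      await crawlerService.crawlAll();
    } catch (error) {
      logger.error('Crawler failed:', error);
      process.exitCode = 1; // mark failure without skipping cleanup
    } finally {
      await app.close();
    }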