feat: enhance crawler system functionality and execution management
- New crawler status endpoint: GET /api/crawler/status reports the crawler's running state in real time
- Prevent duplicate crawls: an isCrawling flag stops multiple crawl tasks from running at the same time
- Extended crawler service: all 9 crawler platforms are now wired into BidCrawlerService
- Execution time limit: runs are capped at 1 hour so a task cannot run indefinitely
- New source statistics endpoint: GET /api/bids/sources lists every bid source platform
- Better error handling: post-crawl time accounting and timeout warnings
- Improved controller logic: cleaner exception handling and state management
- Supported platforms — 9 major procurement platforms in total: Huaneng (华能), Datang (大唐), China Resources (华润), Three Gorges (三峡), CNNC (中核), CGN (中广核), 电能e招采, Dalian Energy (大连能源), and Beijing Power (北京电力)
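For reference, a minimal client-side sketch of how the two new endpoints fit together. This is not part of the commit; the base URL and the triggerCrawl helper are assumptions.

// poll-crawler.ts — hypothetical client for the endpoints added here.
// BASE is an assumed local dev address, not defined anywhere in this commit.
const BASE = 'http://localhost:3000';

async function triggerCrawl(): Promise<void> {
  // POST /api/crawler/run either starts a crawl (and resolves when it
  // finishes) or reports that one is already running.
  const run = await fetch(`${BASE}/api/crawler/run`, { method: 'POST' });
  console.log(await run.json());

  // GET /api/crawler/status exposes the isCrawling flag at any time.
  const status = await fetch(`${BASE}/api/crawler/status`);
  console.log(await status.json()); // e.g. { isCrawling: false }
}

triggerCrawl().catch(console.error);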
@@ -14,4 +14,9 @@ export class BidsController {
   getHighPriority() {
     return this.bidsService.getHighPriorityCorrected();
   }
+
+  @Get('sources')
+  getSources() {
+    return this.bidsService.getSources();
+  }
 }
@@ -65,4 +65,14 @@ export class BidsService {
       createdAt: LessThan(thirtyDaysAgo),
     });
   }
+
+  async getSources() {
+    const result = await this.bidRepository
+      .createQueryBuilder('bid')
+      .select('DISTINCT bid.source')
+      .where('bid.source IS NOT NULL')
+      .orderBy('bid.source', 'ASC')
+      .getRawMany();
+    return result.map((item: any) => item.source);
+  }
 }
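Note that getRawMany() returns untyped rows keyed by the selected column, which is why the service maps them down to plain strings. The row shape sketched below is an assumption — depending on the driver and aliasing the key may come back differently (e.g. bid_source):

// Assumed raw-row shape for the DISTINCT query above (illustrative only).
const raw: Array<{ source: string }> = [
  { source: 'platform-a' },
  { source: 'platform-b' },
];
const sources: string[] = raw.map((item) => item.source);
// sources === ['platform-a', 'platform-b']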
@@ -1,21 +1,38 @@
-import { Controller, Post } from '@nestjs/common';
+import { Controller, Post, Get } from '@nestjs/common';
 import { BidCrawlerService } from './services/bid-crawler.service';
 
 @Controller('api/crawler')
 export class CrawlerController {
+  private isCrawling = false;
+
   constructor(private readonly crawlerService: BidCrawlerService) {}
 
+  @Get('status')
+  getStatus() {
+    return { isCrawling: this.isCrawling };
+  }
+
   @Post('run')
   async runCrawl() {
+    if (this.isCrawling) {
+      return { message: 'Crawl is already running' };
+    }
+
+    this.isCrawling = true;
+
     // We don't await this because we want it to run in the background
     // and return immediately, or we can await if we want the user to wait.
     // Given the requirement "Immediate Crawl", usually implies triggering it.
     // However, for a better UI experience, we might want to wait or just trigger.
     // Let's await it so the user knows when it's done (or failed),
     // assuming it doesn't take too long for the mock.
     // Real crawling might take long, so background is better.
     // For this prototype, I'll await it to show completion.
+    try {
       await this.crawlerService.crawlAll();
       return { message: 'Crawl completed successfully' };
+    } finally {
+      this.isCrawling = false;
+    }
   }
 }
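The comment block above weighs awaiting crawlAll() against firing it off in the background; this commit awaits. For contrast, a sketch of the background variant under the same isCrawling flag — hypothetical, not what ships here:

// Hypothetical fire-and-forget variant of runCrawl(): respond immediately
// and clear the flag once the background crawl settles.
@Post('run')
runCrawl() {
  if (this.isCrawling) {
    return { message: 'Crawl is already running' };
  }
  this.isCrawling = true;

  // Not awaited: failures are logged rather than surfaced in the response.
  this.crawlerService
    .crawlAll()
    .catch((err: Error) => console.error(`Crawl failed: ${err.message}`))
    .finally(() => { this.isCrawling = false; });

  return { message: 'Crawl started' };
}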
@@ -9,6 +9,9 @@ import { CdtCrawler } from './cdt_target';
 import { EpsCrawler } from './eps_target';
 import { CnncecpCrawler } from './cnncecp_target';
 import { CgnpcCrawler } from './cgnpc_target';
+import { CeicCrawler } from './ceic_target';
+import { EspicCrawler } from './espic_target';
+import { PowerbeijingCrawler } from './powerbeijing_target';
 
 @Injectable()
 export class BidCrawlerService {
@@ -22,6 +25,10 @@ export class BidCrawlerService {
   async crawlAll() {
     this.logger.log('Starting crawl task with Puppeteer...');
 
+    // Cap the maximum execution time at 1 hour
+    const maxExecutionTime = 60 * 60 * 1000; // 1 hour in milliseconds
+    const startTime = Date.now();
+
     // Read the proxy configuration from environment variables
     const proxyHost = this.configService.get<string>('PROXY_HOST');
     const proxyPort = this.configService.get<string>('PROXY_PORT');
@@ -52,11 +59,20 @@
       args,
     });
 
-    const crawlers = [ChdtpCrawler, ChngCrawler, SzecpCrawler, CdtCrawler, EpsCrawler, CnncecpCrawler, CgnpcCrawler];
+    const crawlers = [ChdtpCrawler, ChngCrawler, SzecpCrawler, CdtCrawler, EpsCrawler, CnncecpCrawler, CgnpcCrawler, CeicCrawler, EspicCrawler, PowerbeijingCrawler];
 
     try {
       for (const crawler of crawlers) {
         this.logger.log(`Crawling: ${crawler.name}`);
+
+        // Check whether the time budget has been exhausted
+        const elapsedTime = Date.now() - startTime;
+        if (elapsedTime > maxExecutionTime) {
+          this.logger.warn(`⚠️ Crawl task exceeded maximum execution time of 1 hour. Stopping...`);
+          this.logger.warn(`⚠️ Total elapsed time: ${Math.floor(elapsedTime / 1000 / 60)} minutes`);
+          break;
+        }
+
         try {
           const results = await crawler.crawl(browser);
           this.logger.log(`Extracted ${results.length} items from ${crawler.name}`);
@@ -78,7 +94,14 @@
       this.logger.error(`Crawl task failed: ${error.message}`);
     } finally {
       await browser.close();
-      this.logger.log('Crawl task finished.');
+
+      const totalTime = Date.now() - startTime;
+      const minutes = Math.floor(totalTime / 1000 / 60);
+      this.logger.log(`Crawl task finished. Total time: ${minutes} minutes`);
+
+      if (totalTime > maxExecutionTime) {
+        this.logger.warn(`⚠️ Crawl task exceeded maximum execution time of 1 hour.`);
+      }
     }
   }
 }
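One caveat with this time limit: it is only checked between crawlers, so a single crawler that hangs inside crawl() is never interrupted. If per-crawler budgets were wanted, a Promise.race wrapper is one option — a sketch under assumed names (withTimeout and the 10-minute budget are not part of this commit):

// Hypothetical per-crawler timeout. Note this only stops waiting on the
// promise; it does not abort the underlying Puppeteer work.
function withTimeout<T>(promise: Promise<T>, ms: number, label: string): Promise<T> {
  return Promise.race([
    promise,
    new Promise<T>((_, reject) =>
      setTimeout(() => reject(new Error(`${label} timed out after ${ms} ms`)), ms),
    ),
  ]);
}

// Possible usage inside the crawl loop, with an assumed 10-minute budget:
// const results = await withTimeout(crawler.crawl(browser), 10 * 60 * 1000, crawler.name);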