feat: 增强爬虫系统功能和性能管理

- 新增爬虫状态接口：GET /api/crawler/status 可实时查看爬虫运行状态
- 防止重复爬取：添加 isCrawling 标志位，避免同时运行多个爬虫任务
- 增强爬虫服务：集成所有9个爬虫平台到 BidCrawlerService
- 添加执行时间限制：设置最大执行时间为1小时，防止任务无限运行
- 新增来源统计功能：GET /api/bids/sources 可查看所有招标来源平台
- 优化错误处理：完善爬虫完成后的时间统计和超时警告
- 改进控制器逻辑：更好的异常处理和状态管理
- 支持的平台包括：华能、大唐、华润、三峡、中核、中广核、电能e招采、大连能源、北京电力等9大采购平台
2026-01-12 02:09:48 +08:00
parent b839779ec6
commit a1badea135
4 changed files with 62 additions and 7 deletions
--- a/src/bids/controllers/bid.controller.ts
+++ b/src/bids/controllers/bid.controller.ts
@@ -14,4 +14,9 @@ export class BidsController {
  getHighPriority() {
    return this.bidsService.getHighPriorityCorrected();
  }
+
+  @Get('sources')
+  getSources() {
+    return this.bidsService.getSources();
+  }
 }
--- a/src/bids/services/bid.service.ts
+++ b/src/bids/services/bid.service.ts
@@ -65,4 +65,14 @@ export class BidsService {
      createdAt: LessThan(thirtyDaysAgo),
    });
  }
+
+  async getSources() {
+    const result = await this.bidRepository
+      .createQueryBuilder('bid')
+      .select('DISTINCT bid.source')
+      .where('bid.source IS NOT NULL')
+      .orderBy('bid.source', 'ASC')
+      .getRawMany();
+    return result.map((item: any) => item.source);
+  }
 }
--- a/src/crawler/crawler.controller.ts
+++ b/src/crawler/crawler.controller.ts
@@ -1,21 +1,38 @@
-import { Controller, Post } from '@nestjs/common';
+import { Controller, Post, Get } from '@nestjs/common';
 import { BidCrawlerService } from './services/bid-crawler.service';

@Controller('api/crawler')
 export class CrawlerController {
+  private isCrawling = false;
+
  constructor(private readonly crawlerService: BidCrawlerService) {}

+  @Get('status')
+  getStatus() {
+    return { isCrawling: this.isCrawling };
+  }
+
  @Post('run')
  async runCrawl() {
+    if (this.isCrawling) {
+      return { message: 'Crawl is already running' };
+    }
+    
+    this.isCrawling = true;
+    
    // We don't await this because we want it to run in the background 
-    // and return immediately, or we can await if we want the user to wait.
+    // and return immediately, or we can await if we want the user to wait.
    // Given the requirement "Immediate Crawl", usually implies triggering it.
    // However, for a better UI experience, we might want to wait or just trigger.
-    // Let's await it so the user knows when it's done (or failed), 
+    // Let's await it so that the user knows when it's done (or failed), 
    // assuming it doesn't take too long for the mock. 
    // Real crawling might take long, so background is better.
    // For this prototype, I'll await it to show completion.
-    await this.crawlerService.crawlAll();
-    return { message: 'Crawl completed successfully' };
+    try {
+      await this.crawlerService.crawlAll();
+      return { message: 'Crawl completed successfully' };
+    } finally {
+      this.isCrawling = false;
+    }
  }
 }
--- a/src/crawler/services/bid-crawler.service.ts
+++ b/src/crawler/services/bid-crawler.service.ts
@@ -9,6 +9,9 @@ import { CdtCrawler } from './cdt_target';
 import { EpsCrawler } from './eps_target';
 import { CnncecpCrawler } from './cnncecp_target';
 import { CgnpcCrawler } from './cgnpc_target';
+import { CeicCrawler } from './ceic_target';
+import { EspicCrawler } from './espic_target';
+import { PowerbeijingCrawler } from './powerbeijing_target';

@Injectable()
 export class BidCrawlerService {
@@ -22,6 +25,10 @@ export class BidCrawlerService {
  async crawlAll() {
    this.logger.log('Starting crawl task with Puppeteer...');
    
+    // 设置最大执行时间为1小时
+    const maxExecutionTime = 60 * 60 * 1000; // 1小时（毫秒）
+    const startTime = Date.now();
+    
    // 从环境变量读取代理配置
    const proxyHost = this.configService.get<string>('PROXY_HOST');
    const proxyPort = this.configService.get<string>('PROXY_PORT');
@@ -52,11 +59,20 @@ export class BidCrawlerService {
      args,
    });

-    const crawlers = [ChdtpCrawler, ChngCrawler, SzecpCrawler, CdtCrawler, EpsCrawler, CnncecpCrawler, CgnpcCrawler];
+    const crawlers = [ChdtpCrawler, ChngCrawler, SzecpCrawler, CdtCrawler, EpsCrawler, CnncecpCrawler, CgnpcCrawler, CeicCrawler, EspicCrawler, PowerbeijingCrawler];

    try {
      for (const crawler of crawlers) {
        this.logger.log(`Crawling: ${crawler.name}`);
+        
+        // 检查是否超时
+        const elapsedTime = Date.now() - startTime;
+        if (elapsedTime > maxExecutionTime) {
+          this.logger.warn(`⚠️ Crawl task exceeded maximum execution time of 1 hour. Stopping...`);
+          this.logger.warn(`⚠️ Total elapsed time: ${Math.floor(elapsedTime / 1000 / 60)} minutes`);
+          break;
+        }
+        
        try {
          const results = await crawler.crawl(browser);
          this.logger.log(`Extracted ${results.length} items from ${crawler.name}`);
@@ -78,7 +94,14 @@ export class BidCrawlerService {
      this.logger.error(`Crawl task failed: ${error.message}`);
    } finally {
      await browser.close();
-      this.logger.log('Crawl task finished.');
+      
+      const totalTime = Date.now() - startTime;
+      const minutes = Math.floor(totalTime / 1000 / 60);
+      this.logger.log(`Crawl task finished. Total time: ${minutes} minutes`);
+      
+      if (totalTime > maxExecutionTime) {
+        this.logger.warn(`⚠️ Crawl task exceeded maximum execution time of 1 hour.`);
+      }
    }
  }
 }