import { Injectable, Logger } from '@nestjs/common';
import { ConfigService } from '@nestjs/config';
import * as puppeteer from 'puppeteer';
import { BidsService } from '../../bids/services/bid.service';
import { ChdtpCrawler } from './chdtp_target';
import { ChngCrawler } from './chng_target';
import { SzecpCrawler } from './szecp_target';
import { CdtCrawler } from './cdt_target';
import { EpsCrawler } from './eps_target';
import { CnncecpCrawler } from './cnncecp_target';
import { CgnpcCrawler } from './cgnpc_target';
import { CeicCrawler } from './ceic_target';
import { EspicCrawler } from './espic_target';
import { PowerbeijingCrawler } from './powerbeijing_target';

@Injectable()
export class BidCrawlerService {
  private readonly logger = new Logger(BidCrawlerService.name);

  constructor(
    private bidsService: BidsService,
    private configService: ConfigService,
  ) {}

  async crawlAll() {
    this.logger.log('Starting crawl task with Puppeteer...');

    // Cap total execution time at 3 hours (in milliseconds)
    const maxExecutionTime = 3 * 60 * 60 * 1000;
    const startTime = Date.now();

    // Per-source results: item count on success, error message on failure
    const crawlResults: Record<string, { success: number; error?: string }> = {};

    // Read proxy configuration from environment variables
    const proxyHost = this.configService.get<string>('PROXY_HOST');
    const proxyPort = this.configService.get<string>('PROXY_PORT');
    const proxyUsername = this.configService.get<string>('PROXY_USERNAME');
    const proxyPassword = this.configService.get<string>('PROXY_PASSWORD');

    // Build Chromium launch arguments
    const args = [
      '--no-sandbox',
      '--disable-setuid-sandbox',
      '--disable-blink-features=AutomationControlled',
      '--disable-infobars',
      '--window-position=0,0',
      '--ignore-certificate-errors',
      '--ignore-certificate-errors-spki-list',
    ];

    if (proxyHost && proxyPort) {
      // Chromium ignores credentials embedded in --proxy-server, so pass only
      // host:port here. If PROXY_USERNAME/PROXY_PASSWORD are set, pages must
      // authenticate via page.authenticate() (see the sketch after the class).
      args.push(`--proxy-server=http://${proxyHost}:${proxyPort}`);
      this.logger.log(`Using proxy: ${proxyHost}:${proxyPort}`);
    }

    const browser = await puppeteer.launch({
      headless: false, // launches a visible (non-headless) browser
      args,
    });

    const crawlers = [
      ChdtpCrawler,
      ChngCrawler,
      SzecpCrawler,
      CdtCrawler,
      EpsCrawler,
      CnncecpCrawler,
      CgnpcCrawler,
      CeicCrawler,
      EspicCrawler,
      PowerbeijingCrawler,
    ];

    try {
      for (const crawler of crawlers) {
        this.logger.log(`Crawling: ${crawler.name}`);

        // Stop early if the overall time budget has been exceeded
        const elapsedTime = Date.now() - startTime;
        if (elapsedTime > maxExecutionTime) {
          this.logger.warn('⚠️ Crawl task exceeded maximum execution time of 3 hours. Stopping...');
          this.logger.warn(`⚠️ Total elapsed time: ${Math.floor(elapsedTime / 1000 / 60)} minutes`);
          break;
        }

        try {
          const results = await crawler.crawl(browser);
          this.logger.log(`Extracted ${results.length} items from ${crawler.name}`);

          // Record the success count for this source
          crawlResults[crawler.name] = { success: results.length };

          for (const item of results) {
            await this.bidsService.createOrUpdate({
              title: item.title,
              url: item.url,
              publishDate: item.publishDate,
              source: crawler.name,
              unit: '',
            });
          }
        } catch (err) {
          this.logger.error(`Error crawling ${crawler.name}: ${err.message}`);
          // Record the error for this source
          crawlResults[crawler.name] = { success: 0, error: err.message };
        }
      }
    } catch (error) {
      this.logger.error(`Crawl task failed: ${error.message}`);
    } finally {
      await browser.close();

      const totalTime = Date.now() - startTime;
      const minutes = Math.floor(totalTime / 1000 / 60);
      this.logger.log(`Crawl task finished. Total time: ${minutes} minutes`);
      if (totalTime > maxExecutionTime) {
        this.logger.warn('⚠️ Crawl task exceeded maximum execution time of 3 hours.');
      }

      // Print the per-source summary
      this.logger.log('='.repeat(50));
      this.logger.log('Crawl Summary');
      this.logger.log('='.repeat(50));
      let totalSuccess = 0;
      let errorCount = 0;
      for (const [source, result] of Object.entries(crawlResults)) {
        if (result.error) {
          this.logger.error(`❌ ${source}: failed - ${result.error}`);
          errorCount++;
        } else {
          this.logger.log(`✅ ${source}: fetched ${result.success} items`);
          totalSuccess += result.success;
        }
      }
      this.logger.log('='.repeat(50));
      this.logger.log(`Total: ${totalSuccess} items, ${errorCount} sources failed`);
      this.logger.log('='.repeat(50));
    }
  }
}
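
// ---------------------------------------------------------------------------
// A minimal sketch of the contract the `*_target` crawler classes appear to
// satisfy, inferred from how they are used above: each class is passed as a
// value, `crawler.name` reads the class name, and `crawler.crawl(browser)`
// resolves to the scraped items. The names `CrawledItem` and `TargetCrawler`
// are assumptions for illustration, not part of this codebase.
export interface CrawledItem {
  title: string;
  url: string;
  publishDate: string;
}

export interface TargetCrawler {
  name: string;
  crawl(browser: puppeteer.Browser): Promise<CrawledItem[]>;
}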
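
// Proxy-authentication sketch: because Chromium drops credentials from
// --proxy-server, each crawler's page setup would need to authenticate
// explicitly. The page-creation code lives in the *_target files (not shown),
// so this is an assumed shape, not the actual implementation:
//
//   const page = await browser.newPage();
//   if (proxyUsername && proxyPassword) {
//     await page.authenticate({ username: proxyUsername, password: proxyPassword });
//   }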
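
// Usage sketch: crawlAll() is a long-running job and would typically be run on
// a schedule. Assuming @nestjs/schedule is installed and ScheduleModule.forRoot()
// is registered in the app module (neither is confirmed by this file), a
// cron-driven entry point inside this service could look like:
//
//   import { Cron } from '@nestjs/schedule';
//
//   @Cron('0 2 * * *') // illustrative: run daily at 02:00
//   async scheduledCrawl() {
//     await this.crawlAll();
//   }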