import { Injectable, Logger } from '@nestjs/common';
import { ConfigService } from '@nestjs/config';
import * as puppeteer from 'puppeteer';
import { BidsService } from '../../bids/services/bid.service';
import { ChdtpCrawler } from './chdtp_target';
import { ChngCrawler } from './chng_target';
import { SzecpCrawler } from './szecp_target';
import { CdtCrawler } from './cdt_target';
import { EpsCrawler } from './eps_target';

/** Username/password pair used to answer the proxy's authentication challenge. */
interface ProxyCredentials {
  username: string;
  password: string;
}

/** Proxy settings resolved from configuration. */
interface ProxySettings {
  /** Proxy endpoint in `http://host:port` form, without credentials. */
  server: string;
  /** `host:port`, safe to write to logs (never contains credentials). */
  label: string;
  /** Present only when both a username and a password are configured. */
  credentials?: ProxyCredentials;
}

/**
 * Runs every registered crawler inside one shared Puppeteer browser and
 * stores the extracted bid items through BidsService.
 */
@Injectable()
export class BidCrawlerService {
  private readonly logger = new Logger(BidCrawlerService.name);

  constructor(
    private readonly bidsService: BidsService,
    private readonly configService: ConfigService,
  ) {}

  /**
   * Launches a browser (optionally behind the configured proxy), runs each
   * crawler in sequence and upserts the items it returns. A failure inside
   * one crawler is logged and does not stop the remaining crawlers. The
   * browser is always closed before the method returns.
   */
  async crawlAll(): Promise<void> {
    this.logger.log('Starting crawl task with Puppeteer...');

    // Read the proxy configuration from environment variables.
    const proxy = this.resolveProxy();

    // Build the Chromium launch arguments.
    const args = [
      '--no-sandbox',
      '--disable-setuid-sandbox',
      '--disable-blink-features=AutomationControlled',
      '--disable-infobars',
      '--window-position=0,0',
      // These two switches were previously misspelled ("certifcate"), which
      // Chromium silently ignores, so certificate errors were never skipped.
      '--ignore-certificate-errors',
      '--ignore-certificate-errors-spki-list',
    ];

    if (proxy) {
      // Chromium does not accept "user:pass@" inside --proxy-server; the
      // credentials are supplied per page through page.authenticate() below.
      args.push(`--proxy-server=${proxy.server}`);
      this.logger.log(`Using proxy: ${proxy.label}`);
    }

    const browser = await puppeteer.launch({
      headless: false,
      args,
    });

    if (proxy?.credentials) {
      const credentials = proxy.credentials;
      // Crawlers open their own pages, so credentials are attached to every
      // page target as soon as the browser reports it.
      browser.on('targetcreated', (target) => {
        void this.authenticateTarget(target, credentials);
      });
    }

    const crawlers = [ChdtpCrawler, ChngCrawler, SzecpCrawler, CdtCrawler, EpsCrawler];

    try {
      for (const crawler of crawlers) {
        this.logger.log(`Crawling: ${crawler.name}`);
        try {
          const results = await crawler.crawl(browser);
          this.logger.log(`Extracted ${results.length} items from ${crawler.name}`);
          for (const item of results) {
            await this.bidsService.createOrUpdate({
              title: item.title,
              url: item.url,
              publishDate: item.publishDate,
              source: crawler.name,
              unit: '',
            });
          }
        } catch (err) {
          this.logger.error(
            `Error crawling ${crawler.name}: ${BidCrawlerService.describeError(err)}`,
          );
        }
      }
    } catch (error) {
      this.logger.error(`Crawl task failed: ${BidCrawlerService.describeError(error)}`);
    } finally {
      await browser.close();
      this.logger.log('Crawl task finished.');
    }
  }

  /**
   * Reads PROXY_HOST / PROXY_PORT / PROXY_USERNAME / PROXY_PASSWORD from the
   * configuration. Returns undefined unless both host and port are set.
   */
  private resolveProxy(): ProxySettings | undefined {
    const host = this.configService.get<string>('PROXY_HOST');
    const port = this.configService.get<string>('PROXY_PORT');
    if (!host || !port) {
      return undefined;
    }

    const username = this.configService.get<string>('PROXY_USERNAME');
    const password = this.configService.get<string>('PROXY_PASSWORD');
    const label = `${host}:${port}`;

    return {
      server: `http://${label}`,
      label,
      credentials: username && password ? { username, password } : undefined,
    };
  }

  /**
   * Attaches proxy credentials to the page behind a newly created target.
   * Targets without a page are skipped; failures are logged, never thrown,
   * because this runs from an event handler.
   */
  private async authenticateTarget(
    target: puppeteer.Target,
    credentials: ProxyCredentials,
  ): Promise<void> {
    try {
      const page = await target.page();
      if (page) {
        await page.authenticate(credentials);
      }
    } catch (err) {
      this.logger.warn(
        `Failed to apply proxy credentials: ${BidCrawlerService.describeError(err)}`,
      );
    }
  }

  /** Returns a printable message for any thrown value, not only Error instances. */
  private static describeError(err: unknown): string {
    return err instanceof Error ? err.message : String(err);
  }
}