feat: add a crawler for the China Huaneng Group e-commerce platform

- Add the ChngCrawler class to scrape tender announcements from the China Huaneng Group e-commerce platform (the contract it is expected to satisfy is sketched below)
- Update BidCrawlerService to include ChngCrawler in the crawl task
- Add proxy support, with proxy settings read from environment variables (the ConfigModule wiring this relies on is sketched after the diff)
- Rework the crawl logic so one task runs through multiple crawlers
- Add a unit test file for ChngCrawler
- Improve error handling so that one crawler's failure does not stop the others (illustrated by the spec sketch at the end)
- Update chdtp_target.ts to support proxy authentication (see the page.authenticate sketch after the diff)
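The diff below only touches BidCrawlerService, so the new ChngCrawler class itself is not shown. What the updated crawlAll() loop assumes is a class with a static crawl(browser) that resolves to items carrying title, url, and publishDate, with the class name doubling as the source label. A minimal sketch of that contract follows; the listing URL and selectors are placeholders, not the real implementation:

```ts
import type { Browser } from 'puppeteer';

// Shape of a scraped announcement, matching what crawlAll() persists.
export interface CrawledBid {
  title: string;
  url: string;
  publishDate: string;
}

// Sketch of the contract BidCrawlerService relies on: a static crawl(browser)
// returning CrawledBid[], with the class name used as the source label.
// The listing URL and selectors below are placeholders, not the real
// ChngCrawler implementation.
export class ChngCrawler {
  static async crawl(browser: Browser): Promise<CrawledBid[]> {
    const page = await browser.newPage();
    try {
      // Placeholder listing URL for illustration only.
      await page.goto('https://example.com/chng-bid-list', { waitUntil: 'networkidle2' });
      // Placeholder selectors; the real page structure will differ.
      return await page.$$eval('.bid-list .item', (rows) =>
        rows.map((row) => {
          const link = row.querySelector('a');
          return {
            title: link?.textContent?.trim() ?? '',
            url: (link as HTMLAnchorElement | null)?.href ?? '',
            publishDate: row.querySelector('.date')?.textContent?.trim() ?? '',
          };
        }),
      );
    } finally {
      await page.close();
    }
  }
}
```

The BidCrawlerService diff itself follows.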
@@ -1,7 +1,9 @@
 import { Injectable, Logger } from '@nestjs/common';
+import { ConfigService } from '@nestjs/config';
 import * as puppeteer from 'puppeteer';
 import { BidsService } from '../../bids/services/bid.service';
 import { ChdtpCrawler } from './chdtp_target';
+import { ChngCrawler } from './chng_target';
 
 @Injectable()
 export class BidCrawlerService {
@@ -9,33 +11,64 @@ export class BidCrawlerService {
 
   constructor(
     private bidsService: BidsService,
+    private configService: ConfigService,
   ) {}
 
   async crawlAll() {
     this.logger.log('Starting crawl task with Puppeteer...');
 
+    // Read proxy settings from environment variables
+    const proxyHost = this.configService.get<string>('PROXY_HOST');
+    const proxyPort = this.configService.get<string>('PROXY_PORT');
+    const proxyUsername = this.configService.get<string>('PROXY_USERNAME');
+    const proxyPassword = this.configService.get<string>('PROXY_PASSWORD');
+
+    // Build browser launch arguments, optionally including a proxy
+    const args = [
+      '--no-sandbox',
+      '--disable-setuid-sandbox',
+      '--disable-blink-features=AutomationControlled',
+      '--disable-infobars',
+      '--window-position=0,0',
+      '--ignore-certificate-errors',
+      '--ignore-certificate-errors-spki-list',
+    ];
+
+    if (proxyHost && proxyPort) {
+      const proxyUrl = proxyUsername && proxyPassword
+        ? `http://${proxyUsername}:${proxyPassword}@${proxyHost}:${proxyPort}`
+        : `http://${proxyHost}:${proxyPort}`;
+      args.push(`--proxy-server=${proxyUrl}`);
+      this.logger.log(`Using proxy: ${proxyHost}:${proxyPort}`);
+    }
+
     const browser = await puppeteer.launch({
       headless: true,
-      args: ['--no-sandbox', '--disable-setuid-sandbox'],
+      args,
     });
 
+    const crawlers = [ChdtpCrawler, ChngCrawler];
+
     try {
-      // Currently only supports ChdtpCrawler, but can be extended to a list of crawlers
-      const crawler = ChdtpCrawler;
-      this.logger.log(`Crawling: ${crawler.name}`);
-
-      const results = await crawler.crawl(browser);
-      this.logger.log(`Extracted ${results.length} items from ${crawler.name}`);
+      for (const crawler of crawlers) {
+        this.logger.log(`Crawling: ${crawler.name}`);
+        try {
+          const results = await crawler.crawl(browser);
+          this.logger.log(`Extracted ${results.length} items from ${crawler.name}`);
 
-      for (const item of results) {
-        await this.bidsService.createOrUpdate({
-          title,
-          url: itemUrl,
-          publishDate,
-          source: type || 'Unknown',
-        });
+          for (const item of results) {
+            await this.bidsService.createOrUpdate({
+              title: item.title,
+              url: item.url,
+              publishDate: item.publishDate,
+              source: crawler.name,
+              unit: '',
+            });
+          }
+        } catch (err) {
+          this.logger.error(`Error crawling ${crawler.name}: ${err.message}`);
+        }
      }
-
     } catch (error) {
       this.logger.error(`Crawl task failed: ${error.message}`);
     } finally {
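The commit message also notes proxy-authentication support in chdtp_target.ts, which is outside the hunks above. Chromium generally does not honor credentials embedded in the --proxy-server URL, so the usual complement in Puppeteer is to answer the proxy's auth challenge with page.authenticate() on each page. A sketch of a hypothetical helper, assuming the crawler reads the same PROXY_USERNAME / PROXY_PASSWORD variables (the real chdtp_target.ts change may differ):

```ts
import type { Browser, Page } from 'puppeteer';

// Hypothetical helper showing how a crawler might authenticate against the
// proxy. Chromium ignores credentials embedded in --proxy-server, so
// page.authenticate() answers the proxy's 407 challenge instead.
// Reading process.env directly here is an assumption; the real code may
// receive the values from ConfigService.
async function newAuthenticatedPage(browser: Browser): Promise<Page> {
  const page = await browser.newPage();
  const username = process.env.PROXY_USERNAME;
  const password = process.env.PROXY_PASSWORD;
  if (username && password) {
    await page.authenticate({ username, password });
  }
  return page;
}
```

A crawler would then call newAuthenticatedPage(browser) in place of browser.newPage() before navigating.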
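For configService.get('PROXY_HOST') and the other lookups to return anything, @nestjs/config has to be registered in the application module. A minimal sketch, assuming a standard .env file at the project root; the real AppModule will contain more imports than shown here:

```ts
import { Module } from '@nestjs/common';
import { ConfigModule } from '@nestjs/config';

// Minimal sketch: a global ConfigModule lets BidCrawlerService inject
// ConfigService without importing ConfigModule in every feature module.
// Expected .env keys (values are examples only):
//   PROXY_HOST=127.0.0.1
//   PROXY_PORT=8080
//   PROXY_USERNAME=user
//   PROXY_PASSWORD=secret
@Module({
  imports: [ConfigModule.forRoot({ isGlobal: true })],
})
export class AppModule {}
```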
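The new ChngCrawler spec file is not shown in this diff either. As an illustration of the error-isolation bullet (one crawler failing must not stop the others), a hypothetical spec for BidCrawlerService could look like the following; the service import path, the mocks, and the contents of the truncated finally block are all assumptions:

```ts
import * as puppeteer from 'puppeteer';
import { BidCrawlerService } from './bid_crawler.service'; // path is an assumption
import { ChdtpCrawler } from './chdtp_target';
import { ChngCrawler } from './chng_target';

// Stub puppeteer so no real browser is launched.
jest.mock('puppeteer', () => ({ launch: jest.fn() }));

describe('BidCrawlerService.crawlAll', () => {
  it('keeps crawling when one crawler fails', async () => {
    (puppeteer.launch as jest.Mock).mockResolvedValue({ close: jest.fn() });
    jest.spyOn(ChdtpCrawler, 'crawl').mockRejectedValue(new Error('boom'));
    jest.spyOn(ChngCrawler, 'crawl').mockResolvedValue([
      { title: 't', url: 'https://example.com', publishDate: '2024-01-01' },
    ] as any);

    const bidsService = { createOrUpdate: jest.fn() };
    const configService = { get: jest.fn().mockReturnValue(undefined) };
    const service = new BidCrawlerService(bidsService as any, configService as any);

    await service.crawlAll();

    // The ChdtpCrawler failure is logged and swallowed; ChngCrawler still runs
    // and its single item is persisted.
    expect(ChngCrawler.crawl).toHaveBeenCalled();
    expect(bidsService.createOrUpdate).toHaveBeenCalledTimes(1);
  });
});
```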