From 82f5a818875aea9b2eaaf12ec75d30b1dddf5541 Mon Sep 17 00:00:00 2001 From: dmy Date: Wed, 14 Jan 2026 22:26:32 +0800 Subject: [PATCH] =?UTF-8?q?chore:=20=E6=9B=B4=E6=96=B0.gitignore=E5=B9=B6?= =?UTF-8?q?=E6=B7=BB=E5=8A=A0=E6=96=B0=E6=96=87=E4=BB=B6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 在.gitignore中添加对*.png、*.log、*-lock.json、*.woff2文件的忽略规则,并新增OFL.txt文件。同时,添加vue.svg图标文件以支持前端展示。更新多个TypeScript文件以优化代码格式和增强可读性。 --- .gitignore | 6 +- frontend/src/assets/vue.svg | 1 + src/ai/ai.service.ts | 31 +- src/ai/entities/ai-recommendation.entity.ts | 7 +- src/app.module.ts | 2 +- src/bids/controllers/bid.controller.ts | 21 +- src/bids/entities/bid-item.entity.ts | 8 +- src/bids/services/bid.service.ts | 90 ++++-- src/common/logger/logger.service.ts | 46 ++- src/common/logger/winston.config.ts | 43 ++- src/crawler/crawler.controller.ts | 10 +- src/crawler/entities/crawl-info-add.entity.ts | 7 +- src/crawler/services/bid-crawler.service.ts | 245 ++++++++++----- src/crawler/services/cdt_target.spec.ts | 26 +- src/crawler/services/cdt_target.ts | 140 ++++++--- src/crawler/services/ceic_target.spec.ts | 28 +- src/crawler/services/ceic_target.ts | 89 ++++-- src/crawler/services/cgnpc_target.spec.ts | 26 +- src/crawler/services/cgnpc_target.ts | 80 +++-- src/crawler/services/chdtp_target.spec.ts | 26 +- src/crawler/services/chdtp_target.ts | 65 ++-- src/crawler/services/chng_target.spec.ts | 63 ++-- src/crawler/services/chng_target.ts | 279 ++++++++++-------- src/crawler/services/cnncecp_target.spec.ts | 26 +- src/crawler/services/cnncecp_target.ts | 80 +++-- src/crawler/services/cnooc_target.spec.ts | 26 +- src/crawler/services/cnooc_target.ts | 80 +++-- src/crawler/services/eps_target.spec.ts | 26 +- src/crawler/services/eps_target.ts | 80 +++-- src/crawler/services/espic_target.spec.ts | 26 +- src/crawler/services/espic_target.ts | 93 +++--- .../services/powerbeijing_target.spec.ts | 26 +- src/crawler/services/powerbeijing_target.ts | 80 +++-- src/crawler/services/sdicc_target.spec.ts | 26 +- src/crawler/services/sdicc_target.ts | 94 +++--- src/crawler/services/szecp_target.spec.ts | 28 +- src/crawler/services/szecp_target.ts | 129 +++++--- src/database/database.module.ts | 6 +- src/keywords/keyword.entity.ts | 8 +- src/main.ts | 13 +- src/scripts/ai-recommendations.ts | 20 +- src/scripts/crawl.ts | 8 +- src/scripts/remove-duplicates.ts | 10 +- src/scripts/sync.ts | 50 +++- src/scripts/update-source.ts | 4 +- widget/looker/sys_run/go.mod | 17 ++ widget/looker/sys_run/go.sum | 32 ++ 47 files changed, 1513 insertions(+), 814 deletions(-) create mode 100644 frontend/src/assets/vue.svg create mode 100644 widget/looker/sys_run/go.mod create mode 100644 widget/looker/sys_run/go.sum diff --git a/.gitignore b/.gitignore index 2c94515..2348b87 100644 --- a/.gitignore +++ b/.gitignore @@ -7,4 +7,8 @@ pw-browsers logs build *.exe -*.png \ No newline at end of file +*.png +*.log +*-lock.json +*.woff2 +widget/looker/frontend/src/assets/fonts/OFL.txt diff --git a/frontend/src/assets/vue.svg b/frontend/src/assets/vue.svg new file mode 100644 index 0000000..770e9d3 --- /dev/null +++ b/frontend/src/assets/vue.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/src/ai/ai.service.ts b/src/ai/ai.service.ts index 18ba0a4..90da279 100644 --- a/src/ai/ai.service.ts +++ b/src/ai/ai.service.ts @@ -31,14 +31,13 @@ export class AiService { @InjectRepository(BidItem) private readonly bidItemRepository: Repository, ) { - const apiKey = this.configService.get('ARK_API_KEY'); // this.openai = new OpenAI({ - // apiKey: apiKey || '', + // apiKey: this.configService.get('ARK_API_KEY') || '', // baseURL: 'https://ark.cn-beijing.volces.com/api/v3', // timeout: 120000, // 120秒超时 // }); this.openai = new OpenAI({ - apiKey: 'sk-5sSOxrJl31MGz76bE14d2fDbA55b44869fCcA0C813Fc893a' , + apiKey: 'sk-5sSOxrJl31MGz76bE14d2fDbA55b44869fCcA0C813Fc893a', baseURL: 'https://aihubmix.com/v1', timeout: 120000, // 120秒超时 }); @@ -49,7 +48,9 @@ export class AiService { this.logger.log(`发送给 AI 的数据数量: ${bids.length}`); try { - const prompt =PromptString+ `请根据以下投标项目标题列表,筛选出我关心的项目。请以 JSON 格式返回,格式如下: + const prompt = + PromptString + + `请根据以下投标项目标题列表,筛选出我关心的项目。请以 JSON 格式返回,格式如下: [ { "title": "项目标题", @@ -58,7 +59,11 @@ export class AiService { ] 投标项目标题列表: -${JSON.stringify(bids.map(b => b.title), null, 2)}`; +${JSON.stringify( + bids.map((b) => b.title), + null, + 2, +)}`; // this.logger.log('发给AI的内容',prompt); const completion = await this.openai.chat.completions.create({ model: 'mimo-v2-flash-free', @@ -97,7 +102,9 @@ ${JSON.stringify(bids.map(b => b.title), null, 2)}`; } } - async saveRecommendations(recommendations: AIRecommendation[]): Promise { + async saveRecommendations( + recommendations: AIRecommendation[], + ): Promise { this.logger.log('开始保存 AI 推荐结果'); try { @@ -105,7 +112,7 @@ ${JSON.stringify(bids.map(b => b.title), null, 2)}`; await this.aiRecommendationRepository.clear(); // 保存新的推荐结果(只保存 title 和 confidence) - const entities = recommendations.map(rec => { + const entities = recommendations.map((rec) => { const entity = new AiRecommendationEntity(); entity.title = rec.title; entity.confidence = rec.confidence; @@ -125,14 +132,14 @@ ${JSON.stringify(bids.map(b => b.title), null, 2)}`; try { const entities = await this.aiRecommendationRepository.find({ - order: { confidence: 'DESC' } + order: { confidence: 'DESC' }, }); // 从 bid-items 表获取 url、source 和 publishDate const result: AIRecommendation[] = []; for (const entity of entities) { const bidItem = await this.bidItemRepository.findOne({ - where: { title: entity.title } + where: { title: entity.title }, }); result.push({ @@ -140,7 +147,7 @@ ${JSON.stringify(bids.map(b => b.title), null, 2)}`; url: bidItem?.url || '', source: bidItem?.source || '', confidence: entity.confidence, - publishDate: bidItem?.publishDate + publishDate: bidItem?.publishDate, }); } @@ -148,7 +155,9 @@ ${JSON.stringify(bids.map(b => b.title), null, 2)}`; result.sort((a, b) => { if (!a.publishDate) return 1; if (!b.publishDate) return -1; - return new Date(b.publishDate).getTime() - new Date(a.publishDate).getTime(); + return ( + new Date(b.publishDate).getTime() - new Date(a.publishDate).getTime() + ); }); return result; diff --git a/src/ai/entities/ai-recommendation.entity.ts b/src/ai/entities/ai-recommendation.entity.ts index b46a124..91a6a30 100644 --- a/src/ai/entities/ai-recommendation.entity.ts +++ b/src/ai/entities/ai-recommendation.entity.ts @@ -1,4 +1,9 @@ -import { Entity, PrimaryGeneratedColumn, Column, CreateDateColumn } from 'typeorm'; +import { + Entity, + PrimaryGeneratedColumn, + Column, + CreateDateColumn, +} from 'typeorm'; @Entity('ai_recommendations') export class AiRecommendation { diff --git a/src/app.module.ts b/src/app.module.ts index e22e81b..2fd4c24 100644 --- a/src/app.module.ts +++ b/src/app.module.ts @@ -28,4 +28,4 @@ import { AiModule } from './ai/ai.module'; AiModule, ], }) -export class AppModule {} \ No newline at end of file +export class AppModule {} diff --git a/src/bids/controllers/bid.controller.ts b/src/bids/controllers/bid.controller.ts index edb564c..b450163 100644 --- a/src/bids/controllers/bid.controller.ts +++ b/src/bids/controllers/bid.controller.ts @@ -1,12 +1,19 @@ import { Controller, Get, Query, Patch, Param, Body } from '@nestjs/common'; import { BidsService } from '../services/bid.service'; +interface FindAllQuery { + page?: number; + limit?: number; + source?: string; + keyword?: string; +} + @Controller('api/bids') export class BidsController { constructor(private readonly bidsService: BidsService) {} @Get() - findAll(@Query() query: any) { + findAll(@Query() query: FindAllQuery) { return this.bidsService.findAll(query); } @@ -26,9 +33,17 @@ export class BidsController { } @Get('by-date-range') - getByDateRange(@Query('startDate') startDate: string, @Query('endDate') endDate?: string, @Query('keywords') keywords?: string) { + getByDateRange( + @Query('startDate') startDate: string, + @Query('endDate') endDate?: string, + @Query('keywords') keywords?: string, + ) { const keywordsArray = keywords ? keywords.split(',') : undefined; - return this.bidsService.getBidsByDateRange(startDate, endDate, keywordsArray); + return this.bidsService.getBidsByDateRange( + startDate, + endDate, + keywordsArray, + ); } @Get('crawl-info-stats') diff --git a/src/bids/entities/bid-item.entity.ts b/src/bids/entities/bid-item.entity.ts index 483c921..7183878 100644 --- a/src/bids/entities/bid-item.entity.ts +++ b/src/bids/entities/bid-item.entity.ts @@ -1,4 +1,10 @@ -import { Entity, PrimaryGeneratedColumn, Column, CreateDateColumn, UpdateDateColumn } from 'typeorm'; +import { + Entity, + PrimaryGeneratedColumn, + Column, + CreateDateColumn, + UpdateDateColumn, +} from 'typeorm'; @Entity('bid_items') export class BidItem { diff --git a/src/bids/services/bid.service.ts b/src/bids/services/bid.service.ts index 1dd01b8..da91425 100644 --- a/src/bids/services/bid.service.ts +++ b/src/bids/services/bid.service.ts @@ -1,9 +1,36 @@ import { Injectable } from '@nestjs/common'; import { InjectRepository } from '@nestjs/typeorm'; -import { Repository, LessThan, MoreThanOrEqual } from 'typeorm'; +import { Repository, LessThan } from 'typeorm'; import { BidItem } from '../entities/bid-item.entity'; import { CrawlInfoAdd } from '../../crawler/entities/crawl-info-add.entity'; +interface FindAllQuery { + page?: number; + limit?: number; + source?: string; + keyword?: string; +} + +interface SourceResult { + source: string; +} + +interface CrawlInfoAddStats { + source: string; + count: number; + latestUpdate: Date | string; + latestPublishDate: Date | string | null; + error: string | null; +} + +interface CrawlInfoAddRawResult { + source: string; + count: number; + latestPublishDate: Date | string | null; + error: string | null; + latestUpdate: Date | string; +} + @Injectable() export class BidsService { constructor( @@ -13,7 +40,7 @@ export class BidsService { private crawlInfoRepository: Repository, ) {} - async findAll(query?: any) { + async findAll(query?: FindAllQuery) { const { page = 1, limit = 10, source, keyword } = query || {}; const qb = this.bidRepository.createQueryBuilder('bid'); @@ -26,8 +53,8 @@ export class BidsService { } qb.orderBy('bid.publishDate', 'DESC') - .skip((page - 1) * limit) - .take(limit); + .skip((Number(page) - 1) * Number(limit)) + .take(Number(limit)); const [items, total] = await qb.getManyAndCount(); return { items, total }; @@ -35,7 +62,9 @@ export class BidsService { async createOrUpdate(data: Partial) { // Use title or a hash of title to check for duplicates - let item = await this.bidRepository.findOne({ where: { title: data.title } }); + const item = await this.bidRepository.findOne({ + where: { title: data.title }, + }); if (item) { Object.assign(item, data); return this.bidRepository.save(item); @@ -51,21 +80,21 @@ export class BidsService { }); } - async getSources() { + async getSources(): Promise { const result = await this.bidRepository .createQueryBuilder('bid') - .select('DISTINCT bid.source') + .select('DISTINCT bid.source', 'source') .where('bid.source IS NOT NULL') .orderBy('bid.source', 'ASC') - .getRawMany(); - return result.map((item: any) => item.source); + .getRawMany(); + return result.map((item) => item.source); } async getRecentBids() { const thirtyDaysAgo = new Date(); thirtyDaysAgo.setDate(thirtyDaysAgo.getDate() - 30); thirtyDaysAgo.setHours(0, 0, 0, 0); - + return this.bidRepository .createQueryBuilder('bid') .where('bid.publishDate >= :thirtyDaysAgo', { thirtyDaysAgo }) @@ -81,7 +110,11 @@ export class BidsService { .getMany(); } - async getBidsByDateRange(startDate?: string, endDate?: string, keywords?: string[]) { + async getBidsByDateRange( + startDate?: string, + endDate?: string, + keywords?: string[], + ) { const qb = this.bidRepository.createQueryBuilder('bid'); if (startDate) { @@ -97,13 +130,18 @@ export class BidsService { } if (keywords && keywords.length > 0) { - const keywordConditions = keywords.map((keyword, index) => { - return `bid.title LIKE :keyword${index}`; - }).join(' OR '); - qb.andWhere(`(${keywordConditions})`, keywords.reduce((params, keyword, index) => { - params[`keyword${index}`] = `%${keyword}%`; - return params; - }, {})); + const keywordConditions = keywords + .map((keyword, index) => { + return `bid.title LIKE :keyword${index}`; + }) + .join(' OR '); + qb.andWhere( + `(${keywordConditions})`, + keywords.reduce((params, keyword, index) => { + params[`keyword${index}`] = `%${keyword}%`; + return params; + }, {}), + ); } return qb.orderBy('bid.publishDate', 'DESC').getMany(); @@ -118,7 +156,7 @@ export class BidsService { return this.bidRepository.save(item); } - async getCrawlInfoAddStats() { + async getCrawlInfoAddStats(): Promise { // 获取每个来源的最新一次爬虫记录(按 createdAt 降序) const query = ` SELECT @@ -136,15 +174,19 @@ export class BidsService { ORDER BY source ASC `; - const results = await this.crawlInfoRepository.query(query); + const results = + await this.crawlInfoRepository.query(query); - return results.map((item: any) => ({ - source: item.source, - count: item.count, + return results.map((item) => ({ + source: String(item.source), + count: Number(item.count), latestUpdate: item.latestUpdate, latestPublishDate: item.latestPublishDate, // 确保 error 字段正确处理:null 或空字符串都转换为 null,非空字符串保留 - error: item.error && item.error.trim() !== '' ? item.error : null, + error: + item.error && String(item.error).trim() !== '' + ? String(item.error) + : null, })); } } diff --git a/src/common/logger/logger.service.ts b/src/common/logger/logger.service.ts index dbc339b..aaa5494 100644 --- a/src/common/logger/logger.service.ts +++ b/src/common/logger/logger.service.ts @@ -1,6 +1,21 @@ import { Injectable, LoggerService, Scope } from '@nestjs/common'; import { winstonLogger } from './winston.config'; +type LogMessage = string | Error | Record; + +function formatMessage(message: LogMessage): string { + if (typeof message === 'string') { + return message; + } + if (message instanceof Error) { + return message.message; + } + if (typeof message === 'object' && message !== null) { + return JSON.stringify(message); + } + return String(message); +} + @Injectable({ scope: Scope.TRANSIENT }) export class CustomLogger implements LoggerService { private context?: string; @@ -9,23 +24,34 @@ export class CustomLogger implements LoggerService { this.context = context; } - log(message: any, context?: string) { - winstonLogger.info(message, { context: context || this.context }); + log(message: LogMessage, context?: string) { + winstonLogger.info(formatMessage(message), { + context: context || this.context, + }); } - error(message: any, trace?: string, context?: string) { - winstonLogger.error(message, { context: context || this.context, trace }); + error(message: LogMessage, trace?: string, context?: string) { + winstonLogger.error(formatMessage(message), { + context: context || this.context, + trace, + }); } - warn(message: any, context?: string) { - winstonLogger.warn(message, { context: context || this.context }); + warn(message: LogMessage, context?: string) { + winstonLogger.warn(formatMessage(message), { + context: context || this.context, + }); } - debug(message: any, context?: string) { - winstonLogger.debug(message, { context: context || this.context }); + debug(message: LogMessage, context?: string) { + winstonLogger.debug(formatMessage(message), { + context: context || this.context, + }); } - verbose(message: any, context?: string) { - winstonLogger.verbose(message, { context: context || this.context }); + verbose(message: LogMessage, context?: string) { + winstonLogger.verbose(formatMessage(message), { + context: context || this.context, + }); } } diff --git a/src/common/logger/winston.config.ts b/src/common/logger/winston.config.ts index 03bca7a..bfc7d7a 100644 --- a/src/common/logger/winston.config.ts +++ b/src/common/logger/winston.config.ts @@ -16,13 +16,33 @@ const logFormat = winston.format.combine( winston.format.errors({ stack: true }), winston.format.splat(), winston.format.printf(({ timestamp, level, message, context, stack }) => { - let log = `${timestamp} [${level}]`; - if (context) { - log += ` [${context}]`; - } - log += ` ${message}`; + const timestampStr = + typeof timestamp === 'string' ? timestamp : String(timestamp); + const levelStr = typeof level === 'string' ? level : String(level); + const messageStr = typeof message === 'string' ? message : String(message); + const contextStr = context + ? typeof context === 'string' + ? context + : JSON.stringify(context) + : ''; + let stackStr = ''; if (stack) { - log += `\n${stack}`; + if (typeof stack === 'string') { + stackStr = stack; + } else if (typeof stack === 'object' && stack !== null) { + stackStr = JSON.stringify(stack); + } else { + stackStr = String(stack); + } + } + + let log = `${timestampStr} [${levelStr}]`; + if (contextStr) { + log += ` [${contextStr}]`; + } + log += ` ${messageStr}`; + if (stackStr) { + log += `\n${stackStr}`; } return log; }), @@ -30,10 +50,7 @@ const logFormat = winston.format.combine( // 控制台传输 const consoleTransport = new winston.transports.Console({ - format: winston.format.combine( - winston.format.colorize(), - logFormat, - ), + format: winston.format.combine(winston.format.colorize(), logFormat), }); // 应用日志传输(按天轮转) @@ -61,10 +78,6 @@ const errorLogTransport = new DailyRotateFile({ export const winstonLogger = winston.createLogger({ level: process.env.LOG_LEVEL || 'info', format: logFormat, - transports: [ - consoleTransport, - appLogTransport, - errorLogTransport, - ], + transports: [consoleTransport, appLogTransport, errorLogTransport], exitOnError: false, }); diff --git a/src/crawler/crawler.controller.ts b/src/crawler/crawler.controller.ts index 8b2bd33..2fb8dbc 100644 --- a/src/crawler/crawler.controller.ts +++ b/src/crawler/crawler.controller.ts @@ -12,7 +12,7 @@ export class CrawlerController { getStatus() { return { isCrawling: this.isCrawling, - crawlingSources: Array.from(this.crawlingSources) + crawlingSources: Array.from(this.crawlingSources), }; } @@ -21,9 +21,9 @@ export class CrawlerController { if (this.isCrawling) { return { message: 'Crawl is already running' }; } - + this.isCrawling = true; - + // We don't await this because we want it to run in the background // and return immediately, or we can await if we want to user to wait. // Given the requirement "Immediate Crawl", usually implies triggering it. @@ -45,9 +45,9 @@ export class CrawlerController { if (this.crawlingSources.has(sourceName)) { return { message: `Source ${sourceName} is already being crawled` }; } - + this.crawlingSources.add(sourceName); - + try { const result = await this.crawlerService.crawlSingleSource(sourceName); return result; diff --git a/src/crawler/entities/crawl-info-add.entity.ts b/src/crawler/entities/crawl-info-add.entity.ts index 976ec97..931805c 100644 --- a/src/crawler/entities/crawl-info-add.entity.ts +++ b/src/crawler/entities/crawl-info-add.entity.ts @@ -1,4 +1,9 @@ -import { Entity, PrimaryGeneratedColumn, Column, CreateDateColumn } from 'typeorm'; +import { + Entity, + PrimaryGeneratedColumn, + Column, + CreateDateColumn, +} from 'typeorm'; @Entity('crawl_info_add') export class CrawlInfoAdd { diff --git a/src/crawler/services/bid-crawler.service.ts b/src/crawler/services/bid-crawler.service.ts index 138e607..4321525 100644 --- a/src/crawler/services/bid-crawler.service.ts +++ b/src/crawler/services/bid-crawler.service.ts @@ -18,6 +18,17 @@ import { PowerbeijingCrawler } from './powerbeijing_target'; import { SdiccCrawler } from './sdicc_target'; import { CnoocCrawler } from './cnooc_target'; +interface CrawlResult { + title: string; + publishDate: Date; + url: string; +} + +interface Crawler { + name: string; + crawl(browser: puppeteer.Browser): Promise; +} + @Injectable() export class BidCrawlerService { private readonly logger = new Logger(BidCrawlerService.name); @@ -31,17 +42,15 @@ export class BidCrawlerService { async crawlAll() { this.logger.log('Starting crawl task with Puppeteer...'); - + // 设置最大执行时间为3小时 const maxExecutionTime = 3 * 60 * 60 * 1000; // 3小时(毫秒) const startTime = Date.now(); - // 统计结果 - const crawlResults: Record = {}; - + const crawlResults: Record = + {}; // 记录数据为0的爬虫,用于重试 - const zeroDataCrawlers: any[] = []; - + const zeroDataCrawlers: Crawler[] = []; // 从环境变量读取代理配置 const proxyHost = this.configService.get('PROXY_HOST'); const proxyPort = this.configService.get('PROXY_PORT'); @@ -60,9 +69,10 @@ export class BidCrawlerService { ]; if (proxyHost && proxyPort) { - const proxyUrl = proxyUsername && proxyPassword - ? `http://${proxyUsername}:${proxyPassword}@${proxyHost}:${proxyPort}` - : `http://${proxyHost}:${proxyPort}`; + const proxyUrl = + proxyUsername && proxyPassword + ? `http://${proxyUsername}:${proxyPassword}@${proxyHost}:${proxyPort}` + : `http://${proxyHost}:${proxyPort}`; args.push(`--proxy-server=${proxyUrl}`); this.logger.log(`Using proxy: ${proxyHost}:${proxyPort}`); } @@ -72,24 +82,43 @@ export class BidCrawlerService { args, }); - const crawlers = [ChdtpCrawler, ChngCrawler, SzecpCrawler, CdtCrawler, EpsCrawler, CnncecpCrawler, CgnpcCrawler, CeicCrawler, EspicCrawler, PowerbeijingCrawler, SdiccCrawler, CnoocCrawler]; + const crawlers = [ + ChdtpCrawler, + ChngCrawler, + SzecpCrawler, + CdtCrawler, + EpsCrawler, + CnncecpCrawler, + CgnpcCrawler, + CeicCrawler, + EspicCrawler, + PowerbeijingCrawler, + SdiccCrawler, + CnoocCrawler, + ]; try { for (const crawler of crawlers) { this.logger.log(`Crawling: ${crawler.name}`); - + // 检查是否超时 const elapsedTime = Date.now() - startTime; if (elapsedTime > maxExecutionTime) { - this.logger.warn(`⚠️ Crawl task exceeded maximum execution time of 3 hours. Stopping...`); - this.logger.warn(`⚠️ Total elapsed time: ${Math.floor(elapsedTime / 1000 / 60)} minutes`); + this.logger.warn( + `⚠️ Crawl task exceeded maximum execution time of 3 hours. Stopping...`, + ); + this.logger.warn( + `⚠️ Total elapsed time: ${Math.floor(elapsedTime / 1000 / 60)} minutes`, + ); break; } - + try { const results = await crawler.crawl(browser); - this.logger.log(`Extracted ${results.length} items from ${crawler.name}`); - + this.logger.log( + `Extracted ${results.length} items from ${crawler.name}`, + ); + // 记录成功数量 crawlResults[crawler.name] = { success: results.length }; @@ -99,12 +128,13 @@ export class BidCrawlerService { } // 获取最新的发布日期 - const latestPublishDate = results.length > 0 - ? results.reduce((latest, item) => { - const itemDate = new Date(item.publishDate); - return itemDate > latest ? itemDate : latest; - }, new Date(0)) - : null; + const latestPublishDate = + results.length > 0 + ? results.reduce((latest, item) => { + const itemDate = new Date(item.publishDate); + return itemDate > latest ? itemDate : latest; + }, new Date(0)) + : null; for (const item of results) { await this.bidsService.createOrUpdate({ @@ -116,46 +146,60 @@ export class BidCrawlerService { } // 保存爬虫统计信息到数据库 - await this.saveCrawlInfo(crawler.name, results.length, latestPublishDate); + await this.saveCrawlInfo( + crawler.name, + results.length, + latestPublishDate, + ); } catch (err) { - this.logger.error(`Error crawling ${crawler.name}: ${err.message}`); + const errorMessage = err instanceof Error ? err.message : String(err); + this.logger.error(`Error crawling ${crawler.name}: ${errorMessage}`); // 记录错误信息 - crawlResults[crawler.name] = { success: 0, error: err.message }; + crawlResults[crawler.name] = { success: 0, error: errorMessage }; // 保存错误信息到数据库 - await this.saveCrawlInfo(crawler.name, 0, null, err.message); + await this.saveCrawlInfo(crawler.name, 0, null, errorMessage); } } - + // 对数据为0的爬虫进行重试 if (zeroDataCrawlers.length > 0) { - this.logger.log(`Retrying ${zeroDataCrawlers.length} crawlers with zero data...`); - + this.logger.log( + `Retrying ${zeroDataCrawlers.length} crawlers with zero data...`, + ); + for (const crawler of zeroDataCrawlers) { this.logger.log(`Retrying: ${crawler.name}`); - + // 检查是否超时 const elapsedTime = Date.now() - startTime; if (elapsedTime > maxExecutionTime) { - this.logger.warn(`⚠️ Crawl task exceeded maximum execution time of 3 hours. Stopping retry...`); - this.logger.warn(`⚠️ Total elapsed time: ${Math.floor(elapsedTime / 1000 / 60)} minutes`); + this.logger.warn( + `⚠️ Crawl task exceeded maximum execution time of 3 hours. Stopping retry...`, + ); + this.logger.warn( + `⚠️ Total elapsed time: ${Math.floor(elapsedTime / 1000 / 60)} minutes`, + ); break; } - + try { const results = await crawler.crawl(browser); - this.logger.log(`Retry extracted ${results.length} items from ${crawler.name}`); - + this.logger.log( + `Retry extracted ${results.length} items from ${crawler.name}`, + ); + // 更新统计结果 crawlResults[crawler.name] = { success: results.length }; // 获取最新的发布日期 - const latestPublishDate = results.length > 0 - ? results.reduce((latest, item) => { - const itemDate = new Date(item.publishDate); - return itemDate > latest ? itemDate : latest; - }, new Date(0)) - : null; + const latestPublishDate = + results.length > 0 + ? results.reduce((latest, item) => { + const itemDate = new Date(item.publishDate); + return itemDate > latest ? itemDate : latest; + }, new Date(0)) + : null; for (const item of results) { await this.bidsService.createOrUpdate({ @@ -167,58 +211,76 @@ export class BidCrawlerService { } // 更新爬虫统计信息到数据库 - await this.saveCrawlInfo(crawler.name, results.length, latestPublishDate); + await this.saveCrawlInfo( + crawler.name, + results.length, + latestPublishDate, + ); } catch (err) { - this.logger.error(`Error retrying ${crawler.name}: ${err.message}`); + const errorMessage = + err instanceof Error ? err.message : String(err); + this.logger.error( + `Error retrying ${crawler.name}: ${errorMessage}`, + ); // 记录错误信息 - crawlResults[crawler.name] = { success: 0, error: err.message }; + crawlResults[crawler.name] = { success: 0, error: errorMessage }; // 更新错误信息到数据库 - await this.saveCrawlInfo(crawler.name, 0, null, err.message); + await this.saveCrawlInfo(crawler.name, 0, null, errorMessage); } } } } catch (error) { - this.logger.error(`Crawl task failed: ${error.message}`); + const errorMessage = + error instanceof Error ? error.message : String(error); + this.logger.error(`Crawl task failed: ${errorMessage}`); } finally { await browser.close(); - + const totalTime = Date.now() - startTime; const minutes = Math.floor(totalTime / 1000 / 60); this.logger.log(`Crawl task finished. Total time: ${minutes} minutes`); - + if (totalTime > maxExecutionTime) { - this.logger.warn(`⚠️ Crawl task exceeded maximum execution time of 3 hours.`); + this.logger.warn( + `⚠️ Crawl task exceeded maximum execution time of 3 hours.`, + ); } - + // 输出统计总结 this.logger.log('='.repeat(50)); this.logger.log('爬虫执行总结 / Crawl Summary'); this.logger.log('='.repeat(50)); - + let totalSuccess = 0; let errorCount = 0; - + for (const [source, result] of Object.entries(crawlResults)) { if (result.error) { this.logger.error(`❌ ${source}: 出错 - ${result.error}`); errorCount++; } else { - this.logger.log(`✅ ${source}: 成功获取 ${result.success} 条工程信息`); + this.logger.log( + `✅ ${source}: 成功获取 ${result.success} 条工程信息`, + ); totalSuccess += result.success; } } - + this.logger.log('='.repeat(50)); - this.logger.log(`总计: ${totalSuccess} 条工程信息, ${errorCount} 个来源出错`); - this.logger.log(`Total: ${totalSuccess} items, ${errorCount} sources failed`); + this.logger.log( + `总计: ${totalSuccess} 条工程信息, ${errorCount} 个来源出错`, + ); + this.logger.log( + `Total: ${totalSuccess} items, ${errorCount} sources failed`, + ); this.logger.log('='.repeat(50)); } } async crawlSingleSource(sourceName: string) { this.logger.log(`Starting single source crawl for: ${sourceName}`); - + // 从环境变量读取代理配置 const proxyHost = this.configService.get('PROXY_HOST'); const proxyPort = this.configService.get('PROXY_PORT'); @@ -237,9 +299,10 @@ export class BidCrawlerService { ]; if (proxyHost && proxyPort) { - const proxyUrl = proxyUsername && proxyPassword - ? `http://${proxyUsername}:${proxyPassword}@${proxyHost}:${proxyPort}` - : `http://${proxyHost}:${proxyPort}`; + const proxyUrl = + proxyUsername && proxyPassword + ? `http://${proxyUsername}:${proxyPassword}@${proxyHost}:${proxyPort}` + : `http://${proxyHost}:${proxyPort}`; args.push(`--proxy-server=${proxyUrl}`); this.logger.log(`Using proxy: ${proxyHost}:${proxyPort}`); } @@ -249,10 +312,23 @@ export class BidCrawlerService { args, }); - const crawlers = [ChdtpCrawler, ChngCrawler, SzecpCrawler, CdtCrawler, EpsCrawler, CnncecpCrawler, CgnpcCrawler, CeicCrawler, EspicCrawler, PowerbeijingCrawler, SdiccCrawler, CnoocCrawler]; - - const targetCrawler = crawlers.find(c => c.name === sourceName); - + const crawlers = [ + ChdtpCrawler, + ChngCrawler, + SzecpCrawler, + CdtCrawler, + EpsCrawler, + CnncecpCrawler, + CgnpcCrawler, + CeicCrawler, + EspicCrawler, + PowerbeijingCrawler, + SdiccCrawler, + CnoocCrawler, + ]; + + const targetCrawler = crawlers.find((c) => c.name === sourceName); + if (!targetCrawler) { await browser.close(); throw new Error(`Crawler not found for source: ${sourceName}`); @@ -260,17 +336,20 @@ export class BidCrawlerService { try { this.logger.log(`Crawling: ${targetCrawler.name}`); - + const results = await targetCrawler.crawl(browser); - this.logger.log(`Extracted ${results.length} items from ${targetCrawler.name}`); + this.logger.log( + `Extracted ${results.length} items from ${targetCrawler.name}`, + ); // 获取最新的发布日期 - const latestPublishDate = results.length > 0 - ? results.reduce((latest, item) => { - const itemDate = new Date(item.publishDate); - return itemDate > latest ? itemDate : latest; - }, new Date(0)) - : null; + const latestPublishDate = + results.length > 0 + ? results.reduce((latest, item) => { + const itemDate = new Date(item.publishDate); + return itemDate > latest ? itemDate : latest; + }, new Date(0)) + : null; for (const item of results) { await this.bidsService.createOrUpdate({ @@ -282,7 +361,11 @@ export class BidCrawlerService { } // 保存爬虫统计信息到数据库 - await this.saveCrawlInfo(targetCrawler.name, results.length, latestPublishDate); + await this.saveCrawlInfo( + targetCrawler.name, + results.length, + latestPublishDate, + ); return { success: true, @@ -291,16 +374,19 @@ export class BidCrawlerService { latestPublishDate, }; } catch (err) { - this.logger.error(`Error crawling ${targetCrawler.name}: ${err.message}`); - + const errorMessage = err instanceof Error ? err.message : String(err); + this.logger.error( + `Error crawling ${targetCrawler.name}: ${errorMessage}`, + ); + // 保存错误信息到数据库 - await this.saveCrawlInfo(targetCrawler.name, 0, null, err.message); + await this.saveCrawlInfo(targetCrawler.name, 0, null, errorMessage); return { success: false, source: targetCrawler.name, count: 0, - error: err.message, + error: errorMessage, }; } finally { await browser.close(); @@ -324,7 +410,10 @@ export class BidCrawlerService { await this.crawlInfoRepository.save(crawlInfo); this.logger.log(`Saved crawl info for ${source}: ${count} items`); } catch (err) { - this.logger.error(`Failed to save crawl info for ${source}: ${err.message}`); + const errorMessage = err instanceof Error ? err.message : String(err); + this.logger.error( + `Failed to save crawl info for ${source}: ${errorMessage}`, + ); } } } diff --git a/src/crawler/services/cdt_target.spec.ts b/src/crawler/services/cdt_target.spec.ts index 004899e..720368c 100644 --- a/src/crawler/services/cdt_target.spec.ts +++ b/src/crawler/services/cdt_target.spec.ts @@ -2,7 +2,7 @@ import { CdtCrawler } from './cdt_target'; import * as puppeteer from 'puppeteer'; // Increase timeout to 60 seconds for network operations -jest.setTimeout(60000*5); +jest.setTimeout(60000 * 5); // 获取代理配置 const getProxyArgs = (): string[] => { @@ -29,7 +29,7 @@ describe('CdtCrawler Real Site Test', () => { if (proxyArgs.length > 0) { console.log('Using proxy:', proxyArgs.join(' ')); } - + browser = await puppeteer.launch({ headless: false, // Change to false to see browser UI args: ['--no-sandbox', '--disable-setuid-sandbox', ...proxyArgs], @@ -45,13 +45,15 @@ describe('CdtCrawler Real Site Test', () => { it('should visit website and list all found bid information', async () => { console.log(`\nStarting crawl for: ${CdtCrawler.name}`); console.log(`Target URL: ${CdtCrawler.url}`); - + const results = await CdtCrawler.crawl(browser); - + console.log(`\nSuccessfully found ${results.length} items:\n`); console.log('----------------------------------------'); results.forEach((item, index) => { - console.log(`${index + 1}. [${item.publishDate.toLocaleDateString()}] ${item.title}`); + console.log( + `${index + 1}. [${item.publishDate.toLocaleDateString()}] ${item.title}`, + ); console.log(` Link: ${item.url}`); console.log('----------------------------------------'); }); @@ -61,13 +63,15 @@ describe('CdtCrawler Real Site Test', () => { expect(Array.isArray(results)).toBeTruthy(); // Warn but don't fail if site returns 0 items (could be empty or changed structure) if (results.length === 0) { - console.warn('Warning: No items found. Check if website structure has changed or if list is currently empty.'); + console.warn( + 'Warning: No items found. Check if website structure has changed or if list is currently empty.', + ); } else { - // Check data integrity of first item - const firstItem = results[0]; - expect(firstItem.title).toBeTruthy(); - expect(firstItem.url).toMatch(/^https?:\/\//); - expect(firstItem.publishDate).toBeInstanceOf(Date); + // Check data integrity of first item + const firstItem = results[0]; + expect(firstItem.title).toBeTruthy(); + expect(firstItem.url).toMatch(/^https?:\/\//); + expect(firstItem.publishDate).toBeInstanceOf(Date); } }); }); diff --git a/src/crawler/services/cdt_target.ts b/src/crawler/services/cdt_target.ts index 2661e7a..c7a67d0 100644 --- a/src/crawler/services/cdt_target.ts +++ b/src/crawler/services/cdt_target.ts @@ -13,11 +13,11 @@ async function simulateHumanMouseMovement(page: puppeteer.Page) { const y = Math.floor(Math.random() * viewport.height); await page.mouse.move(x, y, { - steps: 10 + Math.floor(Math.random() * 20) // 10-30步,使移动更平滑 + steps: 10 + Math.floor(Math.random() * 20), // 10-30步,使移动更平滑 }); // 随机停顿 100-500ms - await new Promise(r => setTimeout(r, 100 + Math.random() * 400)); + await new Promise((r) => setTimeout(r, 100 + Math.random() * 400)); } } @@ -31,19 +31,19 @@ async function simulateHumanScrolling(page: puppeteer.Page) { await page.evaluate((distance) => { window.scrollBy({ top: distance, - behavior: 'smooth' + behavior: 'smooth', }); }, scrollDistance); // 随机停顿 500-1500ms - await new Promise(r => setTimeout(r, 500 + Math.random() * 1000)); + await new Promise((r) => setTimeout(r, 500 + Math.random() * 1000)); } // 滚动回顶部 await page.evaluate(() => { window.scrollTo({ top: 0, behavior: 'smooth' }); }); - await new Promise(r => setTimeout(r, 1000)); + await new Promise((r) => setTimeout(r, 1000)); } export interface CdtResult { @@ -52,12 +52,22 @@ export interface CdtResult { url: string; } +interface CdtCrawlerType { + name: string; + url: string; + baseUrl: string; + extract(html: string): CdtResult[]; +} + export const CdtCrawler = { name: '中国大唐集团电子商务平台', url: 'https://tang.cdt-ec.com/home/index.html', baseUrl: 'https://tang.cdt-ec.com', - async crawl(browser: puppeteer.Browser): Promise { + async crawl( + this: CdtCrawlerType, + browser: puppeteer.Browser, + ): Promise { const logger = new Logger('CdtCrawler'); const page = await browser.newPage(); @@ -67,7 +77,9 @@ export const CdtCrawler = { await page.authenticate({ username, password }); } - await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36'); + await page.setUserAgent( + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36', + ); const allResults: CdtResult[] = []; let currentPage = 1; @@ -86,19 +98,26 @@ export const CdtCrawler = { // 点击"招标公告"标签 logger.log('Looking for "招标公告" tab...'); - await page.waitForFunction(() => { - const tabs = Array.from(document.querySelectorAll('span.notice-tab')); - return tabs.some(tab => tab.textContent && tab.textContent.includes('招标公告')); - }, { timeout: 60000 }); + await page.waitForFunction( + () => { + const tabs = Array.from(document.querySelectorAll('span.notice-tab')); + return tabs.some( + (tab) => tab.textContent && tab.textContent.includes('招标公告'), + ); + }, + { timeout: 60000 }, + ); await page.evaluate(() => { const tabs = Array.from(document.querySelectorAll('span.notice-tab')); - const target = tabs.find(tab => tab.textContent && tab.textContent.includes('招标公告')) as HTMLElement; + const target = tabs.find( + (tab) => tab.textContent && tab.textContent.includes('招标公告'), + ) as HTMLElement; if (target) target.click(); }); logger.log('Clicked "招标公告" tab.'); - await new Promise(r => setTimeout(r, 2000)); + await new Promise((r) => setTimeout(r, 2000)); // 模拟人类行为 logger.log('Simulating human mouse movements...'); @@ -109,26 +128,43 @@ export const CdtCrawler = { // 点击"招标公告"下的"更多+"链接 logger.log('Looking for "更多+" link under "招标公告"...'); - await page.waitForFunction(() => { - const titles = Array.from(document.querySelectorAll('span.h-notice-title')); - return titles.some(title => title.textContent && title.textContent.includes('招标公告')); - }, { timeout: 30000 }); + await page.waitForFunction( + () => { + const titles = Array.from( + document.querySelectorAll('span.h-notice-title'), + ); + return titles.some( + (title) => + title.textContent && title.textContent.includes('招标公告'), + ); + }, + { timeout: 30000 }, + ); await page.evaluate(() => { - const titles = Array.from(document.querySelectorAll('span.h-notice-title')); - const targetTitle = titles.find(title => title.textContent && title.textContent.includes('招标公告')); + const titles = Array.from( + document.querySelectorAll('span.h-notice-title'), + ); + const targetTitle = titles.find( + (title) => + title.textContent && title.textContent.includes('招标公告'), + ); if (targetTitle) { const parent = targetTitle.parentElement; if (parent) { - const moreLink = parent.querySelector('a.h-notice-more') as HTMLElement; + const moreLink = parent.querySelector( + 'a.h-notice-more', + ) as HTMLElement; if (moreLink) moreLink.click(); } } }); logger.log('Clicked "更多+" link under "招标公告".'); - await page.waitForNavigation({ waitUntil: 'networkidle2', timeout: 60000 }).catch(() => {}); - await new Promise(r => setTimeout(r, 3000)); + await page + .waitForNavigation({ waitUntil: 'networkidle2', timeout: 60000 }) + .catch(() => {}); + await new Promise((r) => setTimeout(r, 3000)); // 模拟人类行为 logger.log('Simulating human mouse movements...'); @@ -155,7 +191,9 @@ export const CdtCrawler = { } allResults.push(...pageResults); - logger.log(`Extracted ${pageResults.length} items from page ${currentPage}`); + logger.log( + `Extracted ${pageResults.length} items from page ${currentPage}`, + ); // 模拟人类行为 - 翻页前 logger.log('Simulating human mouse movements before pagination...'); @@ -172,7 +210,9 @@ export const CdtCrawler = { }, nextButtonSelector); if (!nextButtonExists) { - logger.log('Next page button not found or disabled. Reached end of list.'); + logger.log( + 'Next page button not found or disabled. Reached end of list.', + ); break; } @@ -186,18 +226,25 @@ export const CdtCrawler = { }, nextButtonSelector); // 等待 AJAX 请求完成(通过监听网络请求) - await page.waitForFunction(() => { - // 检查表格是否正在加载 - const loading = document.querySelector('.layui-table-loading'); - return !loading; - }, { timeout: 30000 }).catch(() => {}); + await page + .waitForFunction( + () => { + // 检查表格是否正在加载 + const loading = document.querySelector('.layui-table-loading'); + return !loading; + }, + { timeout: 30000 }, + ) + .catch(() => {}); // 额外等待确保数据加载完成 - await new Promise(r => setTimeout(r, 2000)); + await new Promise((r) => setTimeout(r, 2000)); // 检查是否真的翻页了(通过检查当前页码) const currentActivePage = await page.evaluate(() => { - const activeSpan = document.querySelector('.layui-laypage-curr em:last-child'); + const activeSpan = document.querySelector( + '.layui-laypage-curr em:last-child', + ); return activeSpan ? parseInt(activeSpan.textContent || '1') : 1; }); @@ -217,25 +264,29 @@ export const CdtCrawler = { // Random delay between pages const delay = Math.floor(Math.random() * (3000 - 1000 + 1)) + 1000; - await new Promise(resolve => setTimeout(resolve, delay)); - + await new Promise((resolve) => setTimeout(resolve, delay)); } catch (navError) { - logger.error(`Navigation to page ${currentPage + 1} failed: ${navError.message}`); + const navErrorMessage = + navError instanceof Error ? navError.message : String(navError); + logger.error( + `Navigation to page ${currentPage + 1} failed: ${navErrorMessage}`, + ); break; } } return allResults; - } catch (error) { - logger.error(`Failed to crawl ${this.name}: ${error.message}`); + const errorMessage = + error instanceof Error ? error.message : String(error); + logger.error(`Failed to crawl ${this.name}: ${errorMessage}`); return allResults; } finally { await page.close(); } }, - extract(html: string): CdtResult[] { + extract(this: CdtCrawlerType, html: string): CdtResult[] { const results: CdtResult[] = []; /** * Regex groups for tang.cdt-ec.com: @@ -243,23 +294,24 @@ export const CdtCrawler = { * 2: Title (项目名称) * 3: Date (发布时间) */ - const regex = /]*data-index="[^"]*"[^>]*>[\s\S]*?]*class="layui-table-link"[^>]*href="([^"]*)"[^>]*>([^<]*)<\/a>[\s\S]*?]*data-field="publish_time"[^>]*>[\s\S]*?]*class="layui-table-cell[^"]*"[^>]*>([^<]*)<\/div>[\s\S]*?<\/td>[\s\S]*?<\/tr>/gs; + const regex = + /]*data-index="[^"]*"[^>]*>[\s\S]*?]*class="layui-table-link"[^>]*href="([^"]*)"[^>]*>([^<]*)<\/a>[\s\S]*?]*data-field="publish_time"[^>]*>[\s\S]*?]*class="layui-table-cell[^"]*"[^>]*>([^<]*)<\/div>[\s\S]*?<\/td>[\s\S]*?<\/tr>/gs; - let match; + let match: RegExpExecArray | null; while ((match = regex.exec(html)) !== null) { - const url = match[1]?.trim(); - const title = match[2]?.trim(); - const dateStr = match[3]?.trim(); + const url = match[1]?.trim() ?? ''; + const title = match[2]?.trim() ?? ''; + const dateStr = match[3]?.trim() ?? ''; if (title && url) { const fullUrl = url.startsWith('http') ? url : this.baseUrl + url; results.push({ title, publishDate: dateStr ? new Date(dateStr) : new Date(), - url: fullUrl.replace(/\/\//g, '/') + url: fullUrl.replace(/\/\//g, '/'), }); } } return results; - } + }, }; diff --git a/src/crawler/services/ceic_target.spec.ts b/src/crawler/services/ceic_target.spec.ts index e37deff..6f3a941 100644 --- a/src/crawler/services/ceic_target.spec.ts +++ b/src/crawler/services/ceic_target.spec.ts @@ -29,7 +29,7 @@ describe('CeicCrawler Real Site Test', () => { if (proxyArgs.length > 0) { console.log('Using proxy:', proxyArgs.join(' ')); } - + browser = await puppeteer.launch({ headless: false, // Run in non-headless mode args: [ @@ -40,14 +40,14 @@ describe('CeicCrawler Real Site Test', () => { '--disable-infobars', ...proxyArgs, ], - defaultViewport: null + defaultViewport: null, }); }); afterAll(async () => { if (browser) { // Keep open for a few seconds after test to see result - await new Promise(r => setTimeout(r, 50000)); + await new Promise((r) => setTimeout(r, 50000)); await browser.close(); } }); @@ -56,29 +56,33 @@ describe('CeicCrawler Real Site Test', () => { console.log(` Starting crawl for: ${CeicCrawler.name}`); console.log(`Target URL: ${CeicCrawler.url}`); - + const results = await CeicCrawler.crawl(browser); - + console.log(` Successfully found ${results.length} items: `); console.log('----------------------------------------'); results.forEach((item, index) => { - console.log(`${index + 1}. [${item.publishDate.toLocaleDateString()}] ${item.title}`); + console.log( + `${index + 1}. [${item.publishDate.toLocaleDateString()}] ${item.title}`, + ); console.log(` Link: ${item.url}`); console.log('----------------------------------------'); }); expect(results).toBeDefined(); expect(Array.isArray(results)).toBeTruthy(); - + if (results.length === 0) { - console.warn('Warning: No items found. Observe browser window to see if content is loading or if there is a verification challenge.'); + console.warn( + 'Warning: No items found. Observe browser window to see if content is loading or if there is a verification challenge.', + ); } else { - const firstItem = results[0]; - expect(firstItem.title).toBeTruthy(); - expect(firstItem.url).toMatch(/^https?:\/\//); - expect(firstItem.publishDate).toBeInstanceOf(Date); + const firstItem = results[0]; + expect(firstItem.title).toBeTruthy(); + expect(firstItem.url).toMatch(/^https?:\/\//); + expect(firstItem.publishDate).toBeInstanceOf(Date); } }); }); diff --git a/src/crawler/services/ceic_target.ts b/src/crawler/services/ceic_target.ts index f31170e..c860b28 100644 --- a/src/crawler/services/ceic_target.ts +++ b/src/crawler/services/ceic_target.ts @@ -12,13 +12,13 @@ async function simulateHumanMouseMovement(page: puppeteer.Page) { for (let i = 0; i < movements; i++) { const x = Math.floor(Math.random() * viewport.width); const y = Math.floor(Math.random() * viewport.height); - + await page.mouse.move(x, y, { - steps: 10 + Math.floor(Math.random() * 20) // 10-30步,使移动更平滑 + steps: 10 + Math.floor(Math.random() * 20), // 10-30步,使移动更平滑 }); - + // 随机停顿 100-500ms - await new Promise(r => setTimeout(r, 100 + Math.random() * 400)); + await new Promise((r) => setTimeout(r, 100 + Math.random() * 400)); } } @@ -28,23 +28,29 @@ async function simulateHumanScrolling(page: puppeteer.Page) { for (let i = 0; i < scrollCount; i++) { const scrollDistance = 100 + Math.floor(Math.random() * 400); // 100-500px - + await page.evaluate((distance) => { window.scrollBy({ top: distance, - behavior: 'smooth' + behavior: 'smooth', }); }, scrollDistance); // 随机停顿 500-1500ms - await new Promise(r => setTimeout(r, 500 + Math.random() * 1000)); + await new Promise((r) => setTimeout(r, 500 + Math.random() * 1000)); } // 滚动回顶部 await page.evaluate(() => { window.scrollTo({ top: 0, behavior: 'smooth' }); }); - await new Promise(r => setTimeout(r, 1000)); + await new Promise((r) => setTimeout(r, 1000)); +} + +interface CeicCrawlerType { + name: string; + url: string; + baseUrl: string; } export const CeicCrawler = { @@ -52,7 +58,10 @@ export const CeicCrawler = { url: 'https://ceic.dlnyzb.com/3001', baseUrl: 'https://ceic.dlnyzb.com/', - async crawl(browser: puppeteer.Browser): Promise { + async crawl( + this: CeicCrawlerType, + browser: puppeteer.Browser, + ): Promise { const logger = new Logger('CeicCrawler'); const page = await browser.newPage(); @@ -65,10 +74,14 @@ export const CeicCrawler = { await page.evaluateOnNewDocument(() => { Object.defineProperty(navigator, 'webdriver', { get: () => false }); Object.defineProperty(navigator, 'language', { get: () => 'zh-CN' }); - Object.defineProperty(navigator, 'plugins', { get: () => [1, 2, 3, 4, 5] }); + Object.defineProperty(navigator, 'plugins', { + get: () => [1, 2, 3, 4, 5], + }); }); - await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36'); + await page.setUserAgent( + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36', + ); await page.setViewport({ width: 1920, height: 1080 }); const allResults: ChdtpResult[] = []; @@ -82,7 +95,7 @@ export const CeicCrawler = { // 模拟人类行为 logger.log('Simulating human mouse movements...'); await simulateHumanMouseMovement(page); - + logger.log('Simulating human scrolling...'); await simulateHumanScrolling(page); @@ -90,16 +103,25 @@ export const CeicCrawler = { logger.log(`Processing page ${currentPage}...`); // Wait for content to load - MUI list items - await page.waitForFunction(() => { - return document.querySelectorAll('li.MuiListItem-root').length > 0; - }, { timeout: 60000 }).catch(() => logger.warn('Content not found. Site might be slow.')); + await page + .waitForFunction( + () => { + return ( + document.querySelectorAll('li.MuiListItem-root').length > 0 + ); + }, + { timeout: 60000 }, + ) + .catch(() => logger.warn('Content not found. Site might be slow.')); const pageResults = await page.evaluate(() => { const results: { title: string; dateStr: string; url: string }[] = []; // Extract from MUI list items - const listItems = Array.from(document.querySelectorAll('li.MuiListItem-root')); - listItems.forEach(item => { + const listItems = Array.from( + document.querySelectorAll('li.MuiListItem-root'), + ); + listItems.forEach((item) => { // Find the title link const titleLink = item.querySelector('a.css-1vdw90h'); const title = titleLink?.textContent?.trim() || ''; @@ -125,15 +147,19 @@ export const CeicCrawler = { }); if (pageResults.length === 0) { - logger.warn(`No results found on page ${currentPage}. Extraction failed.`); + logger.warn( + `No results found on page ${currentPage}. Extraction failed.`, + ); break; } - allResults.push(...pageResults.map(r => ({ - title: r.title, - publishDate: r.dateStr ? new Date(r.dateStr) : new Date(), - url: r.url.replace(/\/\//g, '/') - }))); + allResults.push( + ...pageResults.map((r) => ({ + title: r.title, + publishDate: r.dateStr ? new Date(r.dateStr) : new Date(), + url: r.url.replace(/\/\//g, '/'), + })), + ); logger.log(`Extracted ${pageResults.length} items.`); @@ -142,27 +168,30 @@ export const CeicCrawler = { if (!nextButton) break; await nextButton.click(); - await new Promise(r => setTimeout(r, 3000)); - + await new Promise((r) => setTimeout(r, 3000)); + // 模拟人类行为 logger.log('Simulating human mouse movements...'); await simulateHumanMouseMovement(page); - + logger.log('Simulating human scrolling...'); await simulateHumanScrolling(page); - + currentPage++; } return allResults; - } catch (error) { - logger.error(`Crawl failed: ${error.message}`); + const errorMessage = + error instanceof Error ? error.message : String(error); + logger.error(`Crawl failed: ${errorMessage}`); return allResults; } finally { if (page) await page.close(); } }, - extract() { return []; } + extract() { + return []; + }, }; diff --git a/src/crawler/services/cgnpc_target.spec.ts b/src/crawler/services/cgnpc_target.spec.ts index 432474e..3060226 100644 --- a/src/crawler/services/cgnpc_target.spec.ts +++ b/src/crawler/services/cgnpc_target.spec.ts @@ -2,7 +2,7 @@ import { CgnpcCrawler } from './cgnpc_target'; import * as puppeteer from 'puppeteer'; // Increase timeout to 60 seconds for network operations -jest.setTimeout(60000*5); +jest.setTimeout(60000 * 5); // 获取代理配置 const getProxyArgs = (): string[] => { @@ -29,7 +29,7 @@ describe('CgnpcCrawler Real Site Test', () => { if (proxyArgs.length > 0) { console.log('Using proxy:', proxyArgs.join(' ')); } - + browser = await puppeteer.launch({ headless: false, // Change to false to see browser UI args: ['--no-sandbox', '--disable-setuid-sandbox', ...proxyArgs], @@ -45,13 +45,15 @@ describe('CgnpcCrawler Real Site Test', () => { it('should visit website and list all found bid information', async () => { console.log(`\nStarting crawl for: ${CgnpcCrawler.name}`); console.log(`Target URL: ${CgnpcCrawler.url}`); - + const results = await CgnpcCrawler.crawl(browser); - + console.log(`\nSuccessfully found ${results.length} items:\n`); console.log('----------------------------------------'); results.forEach((item, index) => { - console.log(`${index + 1}. [${item.publishDate.toLocaleDateString()}] ${item.title}`); + console.log( + `${index + 1}. [${item.publishDate.toLocaleDateString()}] ${item.title}`, + ); console.log(` Link: ${item.url}`); console.log('----------------------------------------'); }); @@ -61,13 +63,15 @@ describe('CgnpcCrawler Real Site Test', () => { expect(Array.isArray(results)).toBeTruthy(); // Warn but don't fail if site returns 0 items (could be empty or changed structure) if (results.length === 0) { - console.warn('Warning: No items found. Check if website structure has changed or if list is currently empty.'); + console.warn( + 'Warning: No items found. Check if website structure has changed or if list is currently empty.', + ); } else { - // Check data integrity of first item - const firstItem = results[0]; - expect(firstItem.title).toBeTruthy(); - expect(firstItem.url).toMatch(/^https?:\/\//); - expect(firstItem.publishDate).toBeInstanceOf(Date); + // Check data integrity of first item + const firstItem = results[0]; + expect(firstItem.title).toBeTruthy(); + expect(firstItem.url).toMatch(/^https?:\/\//); + expect(firstItem.publishDate).toBeInstanceOf(Date); } }); }); diff --git a/src/crawler/services/cgnpc_target.ts b/src/crawler/services/cgnpc_target.ts index 886ef2a..9b046df 100644 --- a/src/crawler/services/cgnpc_target.ts +++ b/src/crawler/services/cgnpc_target.ts @@ -11,13 +11,13 @@ async function simulateHumanMouseMovement(page: puppeteer.Page) { for (let i = 0; i < movements; i++) { const x = Math.floor(Math.random() * viewport.width); const y = Math.floor(Math.random() * viewport.height); - + await page.mouse.move(x, y, { - steps: 10 + Math.floor(Math.random() * 20) // 10-30步,使移动更平滑 + steps: 10 + Math.floor(Math.random() * 20), // 10-30步,使移动更平滑 }); - + // 随机停顿 100-500ms - await new Promise(r => setTimeout(r, 100 + Math.random() * 400)); + await new Promise((r) => setTimeout(r, 100 + Math.random() * 400)); } } @@ -27,23 +27,23 @@ async function simulateHumanScrolling(page: puppeteer.Page) { for (let i = 0; i < scrollCount; i++) { const scrollDistance = 100 + Math.floor(Math.random() * 400); // 100-500px - + await page.evaluate((distance) => { window.scrollBy({ top: distance, - behavior: 'smooth' + behavior: 'smooth', }); }, scrollDistance); // 随机停顿 500-1500ms - await new Promise(r => setTimeout(r, 500 + Math.random() * 1000)); + await new Promise((r) => setTimeout(r, 500 + Math.random() * 1000)); } // 滚动回顶部 await page.evaluate(() => { window.scrollTo({ top: 0, behavior: 'smooth' }); }); - await new Promise(r => setTimeout(r, 1000)); + await new Promise((r) => setTimeout(r, 1000)); } export interface CgnpcResult { @@ -52,12 +52,22 @@ export interface CgnpcResult { url: string; } +interface CgnpcCrawlerType { + name: string; + url: string; + baseUrl: string; + extract(html: string): CgnpcResult[]; +} + export const CgnpcCrawler = { name: '中广核电子商务平台', url: 'https://ecp.cgnpc.com.cn/zbgg.html', baseUrl: 'https://ecp.cgnpc.com.cn/', - async crawl(browser: puppeteer.Browser): Promise { + async crawl( + this: CgnpcCrawlerType, + browser: puppeteer.Browser, + ): Promise { const logger = new Logger('CgnpcCrawler'); const page = await browser.newPage(); @@ -69,11 +79,15 @@ export const CgnpcCrawler = { await page.evaluateOnNewDocument(() => { Object.defineProperty(navigator, 'webdriver', { get: () => false }); - Object.defineProperty(navigator, 'language', { get: () => "zh-CN"}); - Object.defineProperty(navigator, 'plugins', { get: () => [1,2,3,4,5]}); + Object.defineProperty(navigator, 'language', { get: () => 'zh-CN' }); + Object.defineProperty(navigator, 'plugins', { + get: () => [1, 2, 3, 4, 5], + }); }); - await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36'); + await page.setUserAgent( + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36', + ); await page.setViewport({ width: 1920, height: 1080 }); const allResults: CgnpcResult[] = []; @@ -87,7 +101,7 @@ export const CgnpcCrawler = { // 模拟人类行为 logger.log('Simulating human mouse movements...'); await simulateHumanMouseMovement(page); - + logger.log('Simulating human scrolling...'); await simulateHumanScrolling(page); @@ -103,12 +117,14 @@ export const CgnpcCrawler = { } allResults.push(...pageResults); - logger.log(`Extracted ${pageResults.length} items from page ${currentPage}`); + logger.log( + `Extracted ${pageResults.length} items from page ${currentPage}`, + ); // 模拟人类行为 - 翻页前 logger.log('Simulating human mouse movements before pagination...'); await simulateHumanMouseMovement(page); - + logger.log('Simulating human scrolling before pagination...'); await simulateHumanScrolling(page); @@ -127,9 +143,13 @@ export const CgnpcCrawler = { try { // 点击下一页按钮 await nextButton.click(); - await new Promise(r => setTimeout(r, 3000)); // 等待页面加载 + await new Promise((r) => setTimeout(r, 3000)); // 等待页面加载 } catch (navError) { - logger.error(`Navigation to page ${currentPage + 1} failed: ${navError.message}`); + const navErrorMessage = + navError instanceof Error ? navError.message : String(navError); + logger.error( + `Navigation to page ${currentPage + 1} failed: ${navErrorMessage}`, + ); break; } @@ -138,26 +158,27 @@ export const CgnpcCrawler = { // 模拟人类行为 - 翻页后 logger.log('Simulating human mouse movements after pagination...'); await simulateHumanMouseMovement(page); - + logger.log('Simulating human scrolling after pagination...'); await simulateHumanScrolling(page); // Random delay between pages const delay = Math.floor(Math.random() * (3000 - 1000 + 1)) + 1000; - await new Promise(resolve => setTimeout(resolve, delay)); + await new Promise((resolve) => setTimeout(resolve, delay)); } return allResults; - } catch (error) { - logger.error(`Failed to crawl ${this.name}: ${error.message}`); + const errorMessage = + error instanceof Error ? error.message : String(error); + logger.error(`Failed to crawl ${this.name}: ${errorMessage}`); return allResults; } finally { await page.close(); } }, - extract(html: string): CgnpcResult[] { + extract(this: CgnpcCrawlerType, html: string): CgnpcResult[] { const results: CgnpcResult[] = []; /** * Regex groups for ecp.cgnpc.com.cn: @@ -181,24 +202,25 @@ export const CgnpcCrawler = { * * */ - const regex = /
[\s\S]*?]*title="([^"]*)"[^>]*href="([^"]*)"[^>]*>[\s\S]*?
[\s\S]*?

文件获取截止时间<\/p>[\s\S]*?

\s*(\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2})\s*<\/h2>[\s\S]*?<\/div>/gs; + const regex = + /
[\s\S]*?]*title="([^"]*)"[^>]*href="([^"]*)"[^>]*>[\s\S]*?
[\s\S]*?

文件获取截止时间<\/p>[\s\S]*?

\s*(\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2})\s*<\/h2>[\s\S]*?<\/div>/gs; - let match; + let match: RegExpExecArray | null; while ((match = regex.exec(html)) !== null) { - const title = match[1]?.trim(); - const url = match[2]?.trim(); - const dateStr = match[3]?.trim(); + const title = match[1]?.trim() ?? ''; + const url = match[2]?.trim() ?? ''; + const dateStr = match[3]?.trim() ?? ''; if (title && url) { const fullUrl = url.startsWith('http') ? url : this.baseUrl + url; results.push({ title, publishDate: dateStr ? new Date(dateStr) : new Date(), - url: fullUrl.replace(/\/\//g, '/') + url: fullUrl.replace(/\/\//g, '/'), }); } } return results; - } + }, }; diff --git a/src/crawler/services/chdtp_target.spec.ts b/src/crawler/services/chdtp_target.spec.ts index eab266e..9843b83 100644 --- a/src/crawler/services/chdtp_target.spec.ts +++ b/src/crawler/services/chdtp_target.spec.ts @@ -29,7 +29,7 @@ describe('ChdtpCrawler Real Site Test', () => { if (proxyArgs.length > 0) { console.log('Using proxy:', proxyArgs.join(' ')); } - + browser = await puppeteer.launch({ headless: true, // Change to false to see the browser UI args: ['--no-sandbox', '--disable-setuid-sandbox', ...proxyArgs], @@ -45,13 +45,15 @@ describe('ChdtpCrawler Real Site Test', () => { it('should visit the website and list all found bid information', async () => { console.log(`\nStarting crawl for: ${ChdtpCrawler.name}`); console.log(`Target URL: ${ChdtpCrawler.url}`); - + const results = await ChdtpCrawler.crawl(browser); - + console.log(`\nSuccessfully found ${results.length} items:\n`); console.log('----------------------------------------'); results.forEach((item, index) => { - console.log(`${index + 1}. [${item.publishDate.toLocaleDateString()}] ${item.title}`); + console.log( + `${index + 1}. [${item.publishDate.toLocaleDateString()}] ${item.title}`, + ); console.log(` Link: ${item.url}`); console.log('----------------------------------------'); }); @@ -61,13 +63,15 @@ describe('ChdtpCrawler Real Site Test', () => { expect(Array.isArray(results)).toBeTruthy(); // Warn but don't fail if site returns 0 items (could be empty or changed structure) if (results.length === 0) { - console.warn('Warning: No items found. Check if the website structure has changed or if the list is currently empty.'); + console.warn( + 'Warning: No items found. Check if the website structure has changed or if the list is currently empty.', + ); } else { - // Check data integrity of the first item - const firstItem = results[0]; - expect(firstItem.title).toBeTruthy(); - expect(firstItem.url).toMatch(/^https?:\/\//); - expect(firstItem.publishDate).toBeInstanceOf(Date); + // Check data integrity of the first item + const firstItem = results[0]; + expect(firstItem.title).toBeTruthy(); + expect(firstItem.url).toMatch(/^https?:\/\//); + expect(firstItem.publishDate).toBeInstanceOf(Date); } }); -}); \ No newline at end of file +}); diff --git a/src/crawler/services/chdtp_target.ts b/src/crawler/services/chdtp_target.ts index 653df77..410d7a7 100644 --- a/src/crawler/services/chdtp_target.ts +++ b/src/crawler/services/chdtp_target.ts @@ -7,22 +7,34 @@ export interface ChdtpResult { url: string; // Necessary for system uniqueness } +interface ChdtpCrawlerType { + name: string; + url: string; + baseUrl: string; + extract(html: string): ChdtpResult[]; +} + export const ChdtpCrawler = { name: '华电集团电子商务平台 ', url: 'https://www.chdtp.com/webs/queryWebZbgg.action?zbggType=1', baseUrl: 'https://www.chdtp.com/webs/', - async crawl(browser: puppeteer.Browser): Promise { + async crawl( + this: ChdtpCrawlerType, + browser: puppeteer.Browser, + ): Promise { const logger = new Logger('ChdtpCrawler'); const page = await browser.newPage(); - + const username = process.env.PROXY_USERNAME; const password = process.env.PROXY_PASSWORD; if (username && password) { await page.authenticate({ username, password }); } - await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36'); + await page.setUserAgent( + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36', + ); const allResults: ChdtpResult[] = []; let currentPage = 1; @@ -35,14 +47,16 @@ export const ChdtpCrawler = { while (currentPage <= maxPages) { const content = await page.content(); const pageResults = this.extract(content); - + if (pageResults.length === 0) { logger.warn(`No results found on page ${currentPage}, stopping.`); break; } allResults.push(...pageResults); - logger.log(`Extracted ${pageResults.length} items from page ${currentPage}`); + logger.log( + `Extracted ${pageResults.length} items from page ${currentPage}`, + ); // Find the "Next Page" button // Using partial match for src to be robust against path variations @@ -58,35 +72,43 @@ export const ChdtpCrawler = { // For this specific site, we'll try to click. logger.log(`Navigating to page ${currentPage + 1}...`); - + try { await Promise.all([ - page.waitForNavigation({ waitUntil: 'networkidle2', timeout: 60000 }), + page.waitForNavigation({ + waitUntil: 'networkidle2', + timeout: 60000, + }), nextButton.click(), ]); } catch (navError) { - logger.error(`Navigation to page ${currentPage + 1} failed: ${navError.message}`); + const navErrorMessage = + navError instanceof Error ? navError.message : String(navError); + logger.error( + `Navigation to page ${currentPage + 1} failed: ${navErrorMessage}`, + ); break; } currentPage++; - + // Random delay between pages const delay = Math.floor(Math.random() * (3000 - 1000 + 1)) + 1000; - await new Promise(resolve => setTimeout(resolve, delay)); + await new Promise((resolve) => setTimeout(resolve, delay)); } return allResults; - } catch (error) { - logger.error(`Failed to crawl ${this.name}: ${error.message}`); + const errorMessage = + error instanceof Error ? error.message : String(error); + logger.error(`Failed to crawl ${this.name}: ${errorMessage}`); return allResults; // Return what we have so far } finally { await page.close(); } }, - extract(html: string): ChdtpResult[] { + extract(this: ChdtpCrawlerType, html: string): ChdtpResult[] { const results: ChdtpResult[] = []; /** * Regex groups for chdtp.com: @@ -96,23 +118,24 @@ export const ChdtpCrawler = { * 4: Business Type * 5: Date */ - const regex = /]*>\s*.*?]*>\s*(.*?)\s*<\/span>.*?<\/td>\s*\s*]*href="javascript:toGetContent\('(.*?)'\)" title="(.*?)">.*?<\/a><\/td>\s*\s*]*>\s*(.*?)\s*<\/a>\s*<\/td>\s*\[(.*?)\]<\/span><\/td>/gs; + const regex = + /]*>\s*.*?]*>\s*(.*?)\s*<\/span>.*?<\/td>\s*\s*]*href="javascript:toGetContent\('(.*?)'\)" title="(.*?)">.*?<\/a><\/td>\s*\s*]*>\s*(.*?)\s*<\/a>\s*<\/td>\s*\[(.*?)\]<\/span><\/td>/gs; - let match; + let match: RegExpExecArray | null; while ((match = regex.exec(html)) !== null) { - const urlSuffix = match[2]?.trim(); - const title = match[3]?.trim(); - const dateStr = match[5]?.trim(); + const urlSuffix = match[2]?.trim() ?? ''; + const title = match[3]?.trim() ?? ''; + const dateStr = match[5]?.trim() ?? ''; if (title && urlSuffix) { const fullUrl = this.baseUrl + urlSuffix; results.push({ title, publishDate: dateStr ? new Date(dateStr) : new Date(), - url: fullUrl.replace(/\/\//g, '/') + url: fullUrl.replace(/\/\//g, '/'), }); } } return results; - } -}; \ No newline at end of file + }, +}; diff --git a/src/crawler/services/chng_target.spec.ts b/src/crawler/services/chng_target.spec.ts index 7253db6..81f4773 100644 --- a/src/crawler/services/chng_target.spec.ts +++ b/src/crawler/services/chng_target.spec.ts @@ -31,13 +31,13 @@ async function simulateHumanMouseMovement(page: puppeteer.Page) { for (let i = 0; i < movements; i++) { const x = Math.floor(Math.random() * viewport.width); const y = Math.floor(Math.random() * viewport.height); - + await page.mouse.move(x, y, { - steps: 10 + Math.floor(Math.random() * 20) // 10-30步,使移动更平滑 + steps: 10 + Math.floor(Math.random() * 20), // 10-30步,使移动更平滑 }); - + // 随机停顿 100-500ms - await new Promise(r => setTimeout(r, 100 + Math.random() * 400)); + await new Promise((r) => setTimeout(r, 100 + Math.random() * 400)); } } @@ -47,23 +47,23 @@ async function simulateHumanScrolling(page: puppeteer.Page) { for (let i = 0; i < scrollCount; i++) { const scrollDistance = 100 + Math.floor(Math.random() * 400); // 100-500px - + await page.evaluate((distance) => { window.scrollBy({ top: distance, - behavior: 'smooth' + behavior: 'smooth', }); }, scrollDistance); // 随机停顿 500-1500ms - await new Promise(r => setTimeout(r, 500 + Math.random() * 1000)); + await new Promise((r) => setTimeout(r, 500 + Math.random() * 1000)); } // 滚动回顶部 await page.evaluate(() => { window.scrollTo({ top: 0, behavior: 'smooth' }); }); - await new Promise(r => setTimeout(r, 1000)); + await new Promise((r) => setTimeout(r, 1000)); } describe('ChngCrawler Real Site Test', () => { @@ -74,7 +74,7 @@ describe('ChngCrawler Real Site Test', () => { if (proxyArgs.length > 0) { console.log('Using proxy:', proxyArgs.join(' ')); } - + browser = await puppeteer.launch({ headless: false, // Run in non-headless mode args: [ @@ -82,7 +82,7 @@ describe('ChngCrawler Real Site Test', () => { '--disable-setuid-sandbox', '--disable-blink-features=AutomationControlled', '--window-size=1920,1080', - "--disable-infobars", + '--disable-infobars', ...proxyArgs, // "--headless=new", // '--disable-dev-shm-usage', @@ -94,15 +94,14 @@ describe('ChngCrawler Real Site Test', () => { // '--disable-webgl', // '--disable-javascript', ], - defaultViewport: null - + defaultViewport: null, }); }); afterAll(async () => { if (browser) { // Keep open for a few seconds after test to see result - await new Promise(r => setTimeout(r, 50000)); + await new Promise((r) => setTimeout(r, 50000)); await browser.close(); } }); @@ -111,43 +110,51 @@ describe('ChngCrawler Real Site Test', () => { console.log(` Starting crawl for: ${ChngCrawler.name}`); console.log(`Target URL: ${ChngCrawler.url}`); - + // 创建一个临时页面用于模拟人类行为 const tempPage = await browser.newPage(); - await tempPage.setViewport({ width: 1920, height: 1080, deviceScaleFactor: 1 }); - + await tempPage.setViewport({ + width: 1920, + height: 1080, + deviceScaleFactor: 1, + }); + // 模拟人类鼠标移动 console.log('Simulating human mouse movements...'); await simulateHumanMouseMovement(tempPage); - + // 模拟人类滚动 console.log('Simulating human scrolling...'); await simulateHumanScrolling(tempPage); - + await tempPage.close(); - + const results = await ChngCrawler.crawl(browser); - + console.log(` Successfully found ${results.length} items: `); console.log('----------------------------------------'); results.forEach((item, index) => { - console.log(`${index + 1}. [${item.publishDate.toLocaleDateString()}] ${item.title}`); + console.log( + `${index + 1}. [${item.publishDate.toLocaleDateString()}] ${item.title}`, + ); console.log(` Link: ${item.url}`); console.log('----------------------------------------'); }); expect(results).toBeDefined(); expect(Array.isArray(results)).toBeTruthy(); - + if (results.length === 0) { - console.warn('Warning: No items found. Observe the browser window to see if content is loading or if there is a verification challenge.'); + console.warn( + 'Warning: No items found. Observe the browser window to see if content is loading or if there is a verification challenge.', + ); } else { - const firstItem = results[0]; - expect(firstItem.title).toBeTruthy(); - expect(firstItem.url).toMatch(/^https?:\/\//); - expect(firstItem.publishDate).toBeInstanceOf(Date); + const firstItem = results[0]; + expect(firstItem.title).toBeTruthy(); + expect(firstItem.url).toMatch(/^https?:\/\//); + expect(firstItem.publishDate).toBeInstanceOf(Date); } }); -}); \ No newline at end of file +}); diff --git a/src/crawler/services/chng_target.ts b/src/crawler/services/chng_target.ts index d069ace..b3dfacd 100644 --- a/src/crawler/services/chng_target.ts +++ b/src/crawler/services/chng_target.ts @@ -16,19 +16,20 @@ async function simulateHumanMouseMovement(page: puppeteer.Page) { console.log('Page was closed during mouse movement simulation'); return; } - + const x = Math.floor(Math.random() * viewport.width); const y = Math.floor(Math.random() * viewport.height); - + await page.mouse.move(x, y, { - steps: 10 + Math.floor(Math.random() * 20) // 10-30步,使移动更平滑 + steps: 10 + Math.floor(Math.random() * 20), // 10-30步,使移动更平滑 }); - + // 随机停顿 100-500ms - await new Promise(r => setTimeout(r, 100 + Math.random() * 400)); + await new Promise((r) => setTimeout(r, 100 + Math.random() * 400)); } } catch (error) { - console.log('Mouse movement simulation interrupted:', error.message); + const errorMessage = error instanceof Error ? error.message : String(error); + console.log('Mouse movement simulation interrupted:', errorMessage); } } @@ -43,18 +44,18 @@ async function simulateHumanScrolling(page: puppeteer.Page) { console.log('Page was closed during scrolling simulation'); return; } - + const scrollDistance = 100 + Math.floor(Math.random() * 400); // 100-500px - + await page.evaluate((distance) => { window.scrollBy({ top: distance, - behavior: 'smooth' + behavior: 'smooth', }); }, scrollDistance); // 随机停顿 500-1500ms - await new Promise(r => setTimeout(r, 500 + Math.random() * 1000)); + await new Promise((r) => setTimeout(r, 500 + Math.random() * 1000)); } // 滚动回顶部 @@ -62,19 +63,29 @@ async function simulateHumanScrolling(page: puppeteer.Page) { await page.evaluate(() => { window.scrollTo({ top: 0, behavior: 'smooth' }); }); - await new Promise(r => setTimeout(r, 1000)); + await new Promise((r) => setTimeout(r, 1000)); } } catch (error) { - console.log('Scrolling simulation interrupted:', error.message); + const errorMessage = error instanceof Error ? error.message : String(error); + console.log('Scrolling simulation interrupted:', errorMessage); } } +interface ChngCrawlerType { + name: string; + url: string; + baseUrl: string; +} + export const ChngCrawler = { name: '华能集团电子商务平台', url: 'https://ec.chng.com.cn/channel/home/#/purchase?top=0', baseUrl: 'https://ec.chng.com.cn/channel/home/#', - async crawl(browser: puppeteer.Browser): Promise { + async crawl( + this: ChngCrawlerType, + browser: puppeteer.Browser, + ): Promise { const logger = new Logger('ChngCrawler'); let page = await browser.newPage(); // await page.setViewport({ width: 1920, height: 1080, deviceScaleFactor: 1 }); @@ -84,42 +95,48 @@ export const ChngCrawler = { if (username && password) { await page.authenticate({ username, password }); } - + await page.evaluateOnNewDocument(() => { Object.defineProperty(navigator, 'webdriver', { get: () => false }); - Object.defineProperty(navigator, 'language', { get: () => "zh-CN"}); - Object.defineProperty(navigator, 'plugins', { get: () => [1,2,3,4,5]}); + Object.defineProperty(navigator, 'language', { get: () => 'zh-CN' }); + Object.defineProperty(navigator, 'plugins', { + get: () => [1, 2, 3, 4, 5], + }); }); - await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36'); + await page.setUserAgent( + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36', + ); await page.setViewport({ width: 1920, height: 1080 }); const allResults: ChdtpResult[] = []; let currentPage = 1; - const maxPages = 5; + const maxPages = 5; try { logger.log('Navigating to Bing...'); await page.goto('https://cn.bing.com', { waitUntil: 'networkidle2' }); logger.log('Searching for target site...'); - const searchBoxSelector = 'input[name="q"]'; + const searchBoxSelector = 'input[name="q"]'; await page.waitForSelector(searchBoxSelector); await page.type(searchBoxSelector, 'https://ec.chng.com.cn/'); await page.keyboard.press('Enter'); await page.waitForNavigation({ waitUntil: 'networkidle2' }); - + logger.log('Clicking search result...'); // await page.screenshot({ path: 'bing.png' }); const firstResultSelector = '#b_results .b_algo h2 a'; await page.waitForSelector(firstResultSelector); - - const newTargetPromise = browser.waitForTarget(target => target.opener() === page.target()); + + const newTargetPromise = browser.waitForTarget( + (target) => target.opener() === page.target(), + ); await page.click(firstResultSelector); - + const newTarget = await newTargetPromise; const newPage = await newTarget.page(); - + if (newPage) { // await newPage.screenshot({ path: 'newPage.png' }); await page.close(); @@ -131,108 +148,135 @@ export const ChngCrawler = { // 模拟人类行为 logger.log('Simulating human mouse movements...'); await simulateHumanMouseMovement(page); - + logger.log('Simulating human scrolling...'); await simulateHumanScrolling(page); - + // 等待页面稳定,不强制等待导航 - await new Promise(r => setTimeout(r, 3000)); - - // 模拟人类行为 - logger.log('Simulating human mouse movements...'); - await simulateHumanMouseMovement(page); - - logger.log('Simulating human scrolling...'); - await simulateHumanScrolling(page); - - - // PAUSE 15 SECONDS as requested - logger.log('Pausing 15 seconds before looking for "采购专栏"...'); - await new Promise(r => setTimeout(r, 15000)); - // await page.screenshot({ path: 'huaneng.png' }); + await new Promise((r) => setTimeout(r, 3000)); - logger.log('Looking for "采购专栏" link...'); - await page.waitForFunction(() => { - const divs = Array.from(document.querySelectorAll('div.text')); - return divs.some(div => div.textContent && div.textContent.includes('采购专栏')); - }, { timeout: 60000 }); - - const purchaseTargetPromise = browser.waitForTarget(target => target.opener() === page.target(), { timeout: 15000 }).catch(() => null); - - await page.evaluate(() => { - const divs = Array.from(document.querySelectorAll('div.text')); - const target = divs.find(div => div.textContent && div.textContent.includes('采购专栏')) as HTMLElement; - if (target) target.click(); - }); - - const purchaseTarget = await purchaseTargetPromise; - if (purchaseTarget) { - const pPage = await purchaseTarget.page(); - if (pPage) { - logger.log('Switched to Purchase Page tab.'); - page = pPage; - if (username && password) { - await page.authenticate({ username, password }); - } - await new Promise(r => setTimeout(r, 5000)); - } - } - - logger.log(`Active URL: ${page.url()}`); - // 模拟人类行为 logger.log('Simulating human mouse movements...'); await simulateHumanMouseMovement(page); - + logger.log('Simulating human scrolling...'); await simulateHumanScrolling(page); - + + // PAUSE 15 SECONDS as requested + logger.log('Pausing 15 seconds before looking for "采购专栏"...'); + await new Promise((r) => setTimeout(r, 15000)); + // await page.screenshot({ path: 'huaneng.png' }); + + logger.log('Looking for "采购专栏" link...'); + await page.waitForFunction( + () => { + const divs = Array.from(document.querySelectorAll('div.text')); + return divs.some( + (div) => div.textContent && div.textContent.includes('采购专栏'), + ); + }, + { timeout: 60000 }, + ); + + const purchaseTargetPromise = browser + .waitForTarget((target) => target.opener() === page.target(), { + timeout: 15000, + }) + .catch(() => null); + + await page.evaluate(() => { + const divs = Array.from(document.querySelectorAll('div.text')); + const target = divs.find( + (div) => div.textContent && div.textContent.includes('采购专栏'), + ) as HTMLElement; + if (target) target.click(); + }); + + const purchaseTarget = await purchaseTargetPromise; + if (purchaseTarget) { + const pPage = await purchaseTarget.page(); + if (pPage) { + logger.log('Switched to Purchase Page tab.'); + page = pPage; + if (username && password) { + await page.authenticate({ username, password }); + } + await new Promise((r) => setTimeout(r, 5000)); + } + } + + logger.log(`Active URL: ${page.url()}`); + + // 模拟人类行为 + logger.log('Simulating human mouse movements...'); + await simulateHumanMouseMovement(page); + + logger.log('Simulating human scrolling...'); + await simulateHumanScrolling(page); + while (currentPage <= maxPages) { logger.log(`Processing page ${currentPage}...`); - + // Wait for table rows to load - await page.waitForFunction(() => { - return document.querySelectorAll('tr.ant-table-row').length > 0; - }, { timeout: 60000 }).catch(() => logger.warn('Content not found. Site might be slow.')); + await page + .waitForFunction( + () => { + return document.querySelectorAll('tr.ant-table-row').length > 0; + }, + { timeout: 60000 }, + ) + .catch(() => logger.warn('Content not found. Site might be slow.')); const pageResults = await page.evaluate((baseUrl) => { // Extract from table rows - const items = Array.from(document.querySelectorAll('tr.ant-table-row')); - return items.map(item => { - const titleSpan = item.querySelector('span.list-text'); - const dateCell = item.querySelector('td.ant-table-row-cell-break-word p'); - - if (titleSpan && dateCell) { - const title = titleSpan.textContent?.trim() || ''; - const dateStr = dateCell.textContent?.trim() || ''; - - if (title.length < 5) return null; // Filter noise - - // URL is not directly available in the table, need to construct from data-row-key - const rowKey = item.getAttribute('data-row-key'); - const url = rowKey ? `${baseUrl}#/purchase/detail?id=${rowKey}` : ''; - - return { - title, - dateStr, - url - }; - } - return null; - }).filter(i => i !== null); + const items = Array.from( + document.querySelectorAll('tr.ant-table-row'), + ); + return items + .map((item) => { + const titleSpan = item.querySelector('span.list-text'); + const dateCell = item.querySelector( + 'td.ant-table-row-cell-break-word p', + ); + + if (titleSpan && dateCell) { + const title = titleSpan.textContent?.trim() || ''; + const dateStr = dateCell.textContent?.trim() || ''; + + if (title.length < 5) return null; // Filter noise + + // URL is not directly available in the table, need to construct from data-row-key + const rowKey = item.getAttribute('data-row-key'); + const url = rowKey + ? `${baseUrl}#/purchase/detail?id=${rowKey}` + : ''; + + return { + title, + dateStr, + url, + }; + } + return null; + }) + .filter((i) => i !== null); }, this.baseUrl); if (pageResults.length === 0) { - logger.warn(`No results found on page ${currentPage}. Extraction failed.`); - break; + logger.warn( + `No results found on page ${currentPage}. Extraction failed.`, + ); + break; } - allResults.push(...pageResults.map(r => ({ - title: r!.title, - publishDate: new Date(r!.dateStr), - url: r!.url.replace(/\/\//g, '/') - }))); - + allResults.push( + ...pageResults.map((r) => ({ + title: r.title, + publishDate: new Date(r.dateStr), + url: r.url.replace(/\/\//g, '/'), + })), + ); + logger.log(`Extracted ${pageResults.length} items.`); // Pagination: look for the "right" icon SVG @@ -241,34 +285,37 @@ export const ChngCrawler = { // 点击下一页前保存当前页面状态 const currentUrl = page.url(); - + await nextButton.click(); - + // 等待页面导航完成 try { await page.waitForFunction( (oldUrl) => window.location.href !== oldUrl, { timeout: 10000 }, - currentUrl + currentUrl, ); - } catch (e) { + } catch { logger.warn('Navigation timeout, continuing anyway'); } - + // 等待页面内容加载 - await new Promise(r => setTimeout(r, 15000)); + await new Promise((r) => setTimeout(r, 15000)); currentPage++; } return allResults; - } catch (error) { - logger.error(`Crawl failed: ${error.message}`); + const errorMessage = + error instanceof Error ? error.message : String(error); + logger.error(`Crawl failed: ${errorMessage}`); return allResults; } finally { if (page) await page.close(); } }, - extract() { return []; } -}; \ No newline at end of file + extract() { + return []; + }, +}; diff --git a/src/crawler/services/cnncecp_target.spec.ts b/src/crawler/services/cnncecp_target.spec.ts index 7bb2620..68e57c4 100644 --- a/src/crawler/services/cnncecp_target.spec.ts +++ b/src/crawler/services/cnncecp_target.spec.ts @@ -2,7 +2,7 @@ import { CnncecpCrawler } from './cnncecp_target'; import * as puppeteer from 'puppeteer'; // Increase timeout to 60 seconds for network operations -jest.setTimeout(60000*5); +jest.setTimeout(60000 * 5); // 获取代理配置 const getProxyArgs = (): string[] => { @@ -29,7 +29,7 @@ describe('CnncecpCrawler Real Site Test', () => { if (proxyArgs.length > 0) { console.log('Using proxy:', proxyArgs.join(' ')); } - + browser = await puppeteer.launch({ headless: false, // Change to false to see browser UI args: ['--no-sandbox', '--disable-setuid-sandbox', ...proxyArgs], @@ -45,13 +45,15 @@ describe('CnncecpCrawler Real Site Test', () => { it('should visit website and list all found bid information', async () => { console.log(`\nStarting crawl for: ${CnncecpCrawler.name}`); console.log(`Target URL: ${CnncecpCrawler.url}`); - + const results = await CnncecpCrawler.crawl(browser); - + console.log(`\nSuccessfully found ${results.length} items:\n`); console.log('----------------------------------------'); results.forEach((item, index) => { - console.log(`${index + 1}. [${item.publishDate.toLocaleDateString()}] ${item.title}`); + console.log( + `${index + 1}. [${item.publishDate.toLocaleDateString()}] ${item.title}`, + ); console.log(` Link: ${item.url}`); console.log('----------------------------------------'); }); @@ -61,13 +63,15 @@ describe('CnncecpCrawler Real Site Test', () => { expect(Array.isArray(results)).toBeTruthy(); // Warn but don't fail if site returns 0 items (could be empty or changed structure) if (results.length === 0) { - console.warn('Warning: No items found. Check if website structure has changed or if list is currently empty.'); + console.warn( + 'Warning: No items found. Check if website structure has changed or if list is currently empty.', + ); } else { - // Check data integrity of first item - const firstItem = results[0]; - expect(firstItem.title).toBeTruthy(); - expect(firstItem.url).toMatch(/^https?:\/\//); - expect(firstItem.publishDate).toBeInstanceOf(Date); + // Check data integrity of first item + const firstItem = results[0]; + expect(firstItem.title).toBeTruthy(); + expect(firstItem.url).toMatch(/^https?:\/\//); + expect(firstItem.publishDate).toBeInstanceOf(Date); } }); }); diff --git a/src/crawler/services/cnncecp_target.ts b/src/crawler/services/cnncecp_target.ts index 514bb6e..8ed8746 100644 --- a/src/crawler/services/cnncecp_target.ts +++ b/src/crawler/services/cnncecp_target.ts @@ -11,13 +11,13 @@ async function simulateHumanMouseMovement(page: puppeteer.Page) { for (let i = 0; i < movements; i++) { const x = Math.floor(Math.random() * viewport.width); const y = Math.floor(Math.random() * viewport.height); - + await page.mouse.move(x, y, { - steps: 10 + Math.floor(Math.random() * 20) // 10-30步,使移动更平滑 + steps: 10 + Math.floor(Math.random() * 20), // 10-30步,使移动更平滑 }); - + // 随机停顿 100-500ms - await new Promise(r => setTimeout(r, 100 + Math.random() * 400)); + await new Promise((r) => setTimeout(r, 100 + Math.random() * 400)); } } @@ -27,23 +27,23 @@ async function simulateHumanScrolling(page: puppeteer.Page) { for (let i = 0; i < scrollCount; i++) { const scrollDistance = 100 + Math.floor(Math.random() * 400); // 100-500px - + await page.evaluate((distance) => { window.scrollBy({ top: distance, - behavior: 'smooth' + behavior: 'smooth', }); }, scrollDistance); // 随机停顿 500-1500ms - await new Promise(r => setTimeout(r, 500 + Math.random() * 1000)); + await new Promise((r) => setTimeout(r, 500 + Math.random() * 1000)); } // 滚动回顶部 await page.evaluate(() => { window.scrollTo({ top: 0, behavior: 'smooth' }); }); - await new Promise(r => setTimeout(r, 1000)); + await new Promise((r) => setTimeout(r, 1000)); } export interface CnncecpResult { @@ -52,12 +52,22 @@ export interface CnncecpResult { url: string; } +interface CnncecpCrawlerType { + name: string; + url: string; + baseUrl: string; + extract(html: string): CnncecpResult[]; +} + export const CnncecpCrawler = { name: '中核集团电子采购平台', url: 'https://www.cnncecp.com/xzbgg/index.jhtml', baseUrl: 'https://www.cnncecp.com/', - async crawl(browser: puppeteer.Browser): Promise { + async crawl( + this: CnncecpCrawlerType, + browser: puppeteer.Browser, + ): Promise { const logger = new Logger('CnncecpCrawler'); const page = await browser.newPage(); @@ -69,11 +79,15 @@ export const CnncecpCrawler = { await page.evaluateOnNewDocument(() => { Object.defineProperty(navigator, 'webdriver', { get: () => false }); - Object.defineProperty(navigator, 'language', { get: () => "zh-CN"}); - Object.defineProperty(navigator, 'plugins', { get: () => [1,2,3,4,5]}); + Object.defineProperty(navigator, 'language', { get: () => 'zh-CN' }); + Object.defineProperty(navigator, 'plugins', { + get: () => [1, 2, 3, 4, 5], + }); }); - await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36'); + await page.setUserAgent( + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36', + ); await page.setViewport({ width: 1920, height: 1080 }); const allResults: CnncecpResult[] = []; @@ -87,7 +101,7 @@ export const CnncecpCrawler = { // 模拟人类行为 logger.log('Simulating human mouse movements...'); await simulateHumanMouseMovement(page); - + logger.log('Simulating human scrolling...'); await simulateHumanScrolling(page); @@ -103,12 +117,14 @@ export const CnncecpCrawler = { } allResults.push(...pageResults); - logger.log(`Extracted ${pageResults.length} items from page ${currentPage}`); + logger.log( + `Extracted ${pageResults.length} items from page ${currentPage}`, + ); // 模拟人类行为 - 翻页前 logger.log('Simulating human mouse movements before pagination...'); await simulateHumanMouseMovement(page); - + logger.log('Simulating human scrolling before pagination...'); await simulateHumanScrolling(page); @@ -126,9 +142,13 @@ export const CnncecpCrawler = { try { // 点击下一页按钮 await nextButton.click(); - await new Promise(r => setTimeout(r, 3000)); // 等待页面加载 + await new Promise((r) => setTimeout(r, 3000)); // 等待页面加载 } catch (navError) { - logger.error(`Navigation to page ${currentPage + 1} failed: ${navError.message}`); + const navErrorMessage = + navError instanceof Error ? navError.message : String(navError); + logger.error( + `Navigation to page ${currentPage + 1} failed: ${navErrorMessage}`, + ); break; } @@ -137,26 +157,27 @@ export const CnncecpCrawler = { // 模拟人类行为 - 翻页后 logger.log('Simulating human mouse movements after pagination...'); await simulateHumanMouseMovement(page); - + logger.log('Simulating human scrolling after pagination...'); await simulateHumanScrolling(page); // Random delay between pages const delay = Math.floor(Math.random() * (3000 - 1000 + 1)) + 1000; - await new Promise(resolve => setTimeout(resolve, delay)); + await new Promise((resolve) => setTimeout(resolve, delay)); } return allResults; - } catch (error) { - logger.error(`Failed to crawl ${this.name}: ${error.message}`); + const errorMessage = + error instanceof Error ? error.message : String(error); + logger.error(`Failed to crawl ${this.name}: ${errorMessage}`); return allResults; } finally { await page.close(); } }, - extract(html: string): CnncecpResult[] { + extract(this: CnncecpCrawlerType, html: string): CnncecpResult[] { const results: CnncecpResult[] = []; /** * Regex groups for cnncecp.com: @@ -172,24 +193,25 @@ export const CnncecpCrawler = { * 中核四0四有限公司2026-2028年度质量流量控制器等采购项目(二次)变更公告 * */ - const regex = /
  • [\s\S]*?\s*(\d{4}-\d{2}-\d{2})\s*<\/span>[\s\S]*?]*href="([^"]*)"[^>]*>([^<]*)<\/a>[\s\S]*?<\/li>/gs; + const regex = + /
  • [\s\S]*?\s*(\d{4}-\d{2}-\d{2})\s*<\/span>[\s\S]*?]*href="([^"]*)"[^>]*>([^<]*)<\/a>[\s\S]*?<\/li>/gs; - let match; + let match: RegExpExecArray | null; while ((match = regex.exec(html)) !== null) { - const dateStr = match[1]?.trim(); - const url = match[2]?.trim(); - const title = match[3]?.trim(); + const dateStr = match[1]?.trim() ?? ''; + const url = match[2]?.trim() ?? ''; + const title = match[3]?.trim() ?? ''; if (title && url) { const fullUrl = url.startsWith('http') ? url : this.baseUrl + url; results.push({ title, publishDate: dateStr ? new Date(dateStr) : new Date(), - url: fullUrl.replace(/\/\//g, '/') + url: fullUrl.replace(/\/\//g, '/'), }); } } return results; - } + }, }; diff --git a/src/crawler/services/cnooc_target.spec.ts b/src/crawler/services/cnooc_target.spec.ts index 543f3b0..ccafd08 100644 --- a/src/crawler/services/cnooc_target.spec.ts +++ b/src/crawler/services/cnooc_target.spec.ts @@ -2,7 +2,7 @@ import { CnoocCrawler } from './cnooc_target'; import * as puppeteer from 'puppeteer'; // Increase timeout to 60 seconds for network operations -jest.setTimeout(60000*5); +jest.setTimeout(60000 * 5); // 获取代理配置 const getProxyArgs = (): string[] => { @@ -29,7 +29,7 @@ describe('CnoocCrawler Real Site Test', () => { if (proxyArgs.length > 0) { console.log('Using proxy:', proxyArgs.join(' ')); } - + browser = await puppeteer.launch({ headless: false, // Change to false to see browser UI args: ['--no-sandbox', '--disable-setuid-sandbox', ...proxyArgs], @@ -45,13 +45,15 @@ describe('CnoocCrawler Real Site Test', () => { it('should visit website and list all found bid information', async () => { console.log(`\nStarting crawl for: ${CnoocCrawler.name}`); console.log(`Target URL: ${CnoocCrawler.url}`); - + const results = await CnoocCrawler.crawl(browser); - + console.log(`\nSuccessfully found ${results.length} items:\n`); console.log('----------------------------------------'); results.forEach((item, index) => { - console.log(`${index + 1}. [${item.publishDate.toLocaleDateString()}] ${item.title}`); + console.log( + `${index + 1}. [${item.publishDate.toLocaleDateString()}] ${item.title}`, + ); console.log(` Link: ${item.url}`); console.log('----------------------------------------'); }); @@ -61,13 +63,15 @@ describe('CnoocCrawler Real Site Test', () => { expect(Array.isArray(results)).toBeTruthy(); // Warn but don't fail if site returns 0 items (could be empty or changed structure) if (results.length === 0) { - console.warn('Warning: No items found. Check if website structure has changed or if list is currently empty.'); + console.warn( + 'Warning: No items found. Check if website structure has changed or if list is currently empty.', + ); } else { - // Check data integrity of first item - const firstItem = results[0]; - expect(firstItem.title).toBeTruthy(); - expect(firstItem.url).toMatch(/^https?:\/\//); - expect(firstItem.publishDate).toBeInstanceOf(Date); + // Check data integrity of first item + const firstItem = results[0]; + expect(firstItem.title).toBeTruthy(); + expect(firstItem.url).toMatch(/^https?:\/\//); + expect(firstItem.publishDate).toBeInstanceOf(Date); } }); }); diff --git a/src/crawler/services/cnooc_target.ts b/src/crawler/services/cnooc_target.ts index cfbdbae..12d1b2a 100644 --- a/src/crawler/services/cnooc_target.ts +++ b/src/crawler/services/cnooc_target.ts @@ -11,13 +11,13 @@ async function simulateHumanMouseMovement(page: puppeteer.Page) { for (let i = 0; i < movements; i++) { const x = Math.floor(Math.random() * viewport.width); const y = Math.floor(Math.random() * viewport.height); - + await page.mouse.move(x, y, { - steps: 10 + Math.floor(Math.random() * 20) // 10-30步,使移动更平滑 + steps: 10 + Math.floor(Math.random() * 20), // 10-30步,使移动更平滑 }); - + // 随机停顿 100-500ms - await new Promise(r => setTimeout(r, 100 + Math.random() * 400)); + await new Promise((r) => setTimeout(r, 100 + Math.random() * 400)); } } @@ -27,23 +27,23 @@ async function simulateHumanScrolling(page: puppeteer.Page) { for (let i = 0; i < scrollCount; i++) { const scrollDistance = 100 + Math.floor(Math.random() * 400); // 100-500px - + await page.evaluate((distance) => { window.scrollBy({ top: distance, - behavior: 'smooth' + behavior: 'smooth', }); }, scrollDistance); // 随机停顿 500-1500ms - await new Promise(r => setTimeout(r, 500 + Math.random() * 1000)); + await new Promise((r) => setTimeout(r, 500 + Math.random() * 1000)); } // 滚动回顶部 await page.evaluate(() => { window.scrollTo({ top: 0, behavior: 'smooth' }); }); - await new Promise(r => setTimeout(r, 1000)); + await new Promise((r) => setTimeout(r, 1000)); } export interface CnoocResult { @@ -52,12 +52,22 @@ export interface CnoocResult { url: string; } +interface CnoocCrawlerType { + name: string; + url: string; + baseUrl: string; + extract(html: string): CnoocResult[]; +} + export const CnoocCrawler = { name: '中海油招标平台', url: 'https://buy.cnooc.com.cn/cbjyweb/001/001001/moreinfo.html', baseUrl: 'https://buy.cnooc.com.cn/', - async crawl(browser: puppeteer.Browser): Promise { + async crawl( + this: CnoocCrawlerType, + browser: puppeteer.Browser, + ): Promise { const logger = new Logger('CnoocCrawler'); const page = await browser.newPage(); @@ -69,11 +79,15 @@ export const CnoocCrawler = { await page.evaluateOnNewDocument(() => { Object.defineProperty(navigator, 'webdriver', { get: () => false }); - Object.defineProperty(navigator, 'language', { get: () => "zh-CN"}); - Object.defineProperty(navigator, 'plugins', { get: () => [1,2,3,4,5]}); + Object.defineProperty(navigator, 'language', { get: () => 'zh-CN' }); + Object.defineProperty(navigator, 'plugins', { + get: () => [1, 2, 3, 4, 5], + }); }); - await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36'); + await page.setUserAgent( + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36', + ); await page.setViewport({ width: 1920, height: 1080 }); const allResults: CnoocResult[] = []; @@ -87,7 +101,7 @@ export const CnoocCrawler = { // 模拟人类行为 logger.log('Simulating human mouse movements...'); await simulateHumanMouseMovement(page); - + logger.log('Simulating human scrolling...'); await simulateHumanScrolling(page); @@ -103,12 +117,14 @@ export const CnoocCrawler = { } allResults.push(...pageResults); - logger.log(`Extracted ${pageResults.length} items from page ${currentPage}`); + logger.log( + `Extracted ${pageResults.length} items from page ${currentPage}`, + ); // 模拟人类行为 - 翻页前 logger.log('Simulating human mouse movements before pagination...'); await simulateHumanMouseMovement(page); - + logger.log('Simulating human scrolling before pagination...'); await simulateHumanScrolling(page); @@ -127,9 +143,13 @@ export const CnoocCrawler = { try { // 点击下一页按钮 await nextButton.click(); - await new Promise(r => setTimeout(r, 3000)); // 等待页面加载 + await new Promise((r) => setTimeout(r, 3000)); // 等待页面加载 } catch (navError) { - logger.error(`Navigation to page ${currentPage + 1} failed: ${navError.message}`); + const navErrorMessage = + navError instanceof Error ? navError.message : String(navError); + logger.error( + `Navigation to page ${currentPage + 1} failed: ${navErrorMessage}`, + ); break; } @@ -138,26 +158,27 @@ export const CnoocCrawler = { // 模拟人类行为 - 翻页后 logger.log('Simulating human mouse movements after pagination...'); await simulateHumanMouseMovement(page); - + logger.log('Simulating human scrolling after pagination...'); await simulateHumanScrolling(page); // Random delay between pages const delay = Math.floor(Math.random() * (3000 - 1000 + 1)) + 1000; - await new Promise(resolve => setTimeout(resolve, delay)); + await new Promise((resolve) => setTimeout(resolve, delay)); } return allResults; - } catch (error) { - logger.error(`Failed to crawl ${this.name}: ${error.message}`); + const errorMessage = + error instanceof Error ? error.message : String(error); + logger.error(`Failed to crawl ${this.name}: ${errorMessage}`); return allResults; } finally { await page.close(); } }, - extract(html: string): CnoocResult[] { + extract(this: CnoocCrawlerType, html: string): CnoocResult[] { const results: CnoocResult[] = []; /** * Regex groups for buy.cnooc.com.cn: @@ -173,24 +194,25 @@ export const CnoocCrawler = { * 2026-01-12 *
  • */ - const regex = /
  • [\s\S]*?]*href="([^"]*)"[^>]*>[\s\S]*?]*>([^<]*)<\/font>[\s\S]*?]*>\s*(\d{4}-\d{2}-\d{2})\s*<\/span>[\s\S]*?<\/li>/gs; + const regex = + /
  • [\s\S]*?]*href="([^"]*)"[^>]*>[\s\S]*?]*>([^<]*)<\/font>[\s\S]*?]*>\s*(\d{4}-\d{2}-\d{2})\s*<\/span>[\s\S]*?<\/li>/gs; - let match; + let match: RegExpExecArray | null; while ((match = regex.exec(html)) !== null) { - const url = match[1]?.trim(); - const title = match[2]?.trim(); - const dateStr = match[3]?.trim(); + const url = match[1]?.trim() ?? ''; + const title = match[2]?.trim() ?? ''; + const dateStr = match[3]?.trim() ?? ''; if (title && url) { const fullUrl = url.startsWith('http') ? url : this.baseUrl + url; results.push({ title, publishDate: dateStr ? new Date(dateStr) : new Date(), - url: fullUrl.replace(/\/\//g, '/') + url: fullUrl.replace(/\/\//g, '/'), }); } } return results; - } + }, }; diff --git a/src/crawler/services/eps_target.spec.ts b/src/crawler/services/eps_target.spec.ts index 2975bae..16694af 100644 --- a/src/crawler/services/eps_target.spec.ts +++ b/src/crawler/services/eps_target.spec.ts @@ -2,7 +2,7 @@ import { EpsCrawler } from './eps_target'; import * as puppeteer from 'puppeteer'; // Increase timeout to 60 seconds for network operations -jest.setTimeout(60000*5); +jest.setTimeout(60000 * 5); // 获取代理配置 const getProxyArgs = (): string[] => { @@ -29,7 +29,7 @@ describe('EpsCrawler Real Site Test', () => { if (proxyArgs.length > 0) { console.log('Using proxy:', proxyArgs.join(' ')); } - + browser = await puppeteer.launch({ headless: false, // Change to false to see browser UI args: ['--no-sandbox', '--disable-setuid-sandbox', ...proxyArgs], @@ -45,13 +45,15 @@ describe('EpsCrawler Real Site Test', () => { it('should visit website and list all found bid information', async () => { console.log(`\nStarting crawl for: ${EpsCrawler.name}`); console.log(`Target URL: ${EpsCrawler.url}`); - + const results = await EpsCrawler.crawl(browser); - + console.log(`\nSuccessfully found ${results.length} items:\n`); console.log('----------------------------------------'); results.forEach((item, index) => { - console.log(`${index + 1}. [${item.publishDate.toLocaleDateString()}] ${item.title}`); + console.log( + `${index + 1}. [${item.publishDate.toLocaleDateString()}] ${item.title}`, + ); console.log(` Link: ${item.url}`); console.log('----------------------------------------'); }); @@ -61,13 +63,15 @@ describe('EpsCrawler Real Site Test', () => { expect(Array.isArray(results)).toBeTruthy(); // Warn but don't fail if site returns 0 items (could be empty or changed structure) if (results.length === 0) { - console.warn('Warning: No items found. Check if website structure has changed or if list is currently empty.'); + console.warn( + 'Warning: No items found. Check if website structure has changed or if list is currently empty.', + ); } else { - // Check data integrity of first item - const firstItem = results[0]; - expect(firstItem.title).toBeTruthy(); - expect(firstItem.url).toMatch(/^https?:\/\//); - expect(firstItem.publishDate).toBeInstanceOf(Date); + // Check data integrity of first item + const firstItem = results[0]; + expect(firstItem.title).toBeTruthy(); + expect(firstItem.url).toMatch(/^https?:\/\//); + expect(firstItem.publishDate).toBeInstanceOf(Date); } }); }); diff --git a/src/crawler/services/eps_target.ts b/src/crawler/services/eps_target.ts index 78b8a62..bc175ec 100644 --- a/src/crawler/services/eps_target.ts +++ b/src/crawler/services/eps_target.ts @@ -11,13 +11,13 @@ async function simulateHumanMouseMovement(page: puppeteer.Page) { for (let i = 0; i < movements; i++) { const x = Math.floor(Math.random() * viewport.width); const y = Math.floor(Math.random() * viewport.height); - + await page.mouse.move(x, y, { - steps: 10 + Math.floor(Math.random() * 20) // 10-30步,使移动更平滑 + steps: 10 + Math.floor(Math.random() * 20), // 10-30步,使移动更平滑 }); - + // 随机停顿 100-500ms - await new Promise(r => setTimeout(r, 100 + Math.random() * 400)); + await new Promise((r) => setTimeout(r, 100 + Math.random() * 400)); } } @@ -27,23 +27,23 @@ async function simulateHumanScrolling(page: puppeteer.Page) { for (let i = 0; i < scrollCount; i++) { const scrollDistance = 100 + Math.floor(Math.random() * 400); // 100-500px - + await page.evaluate((distance) => { window.scrollBy({ top: distance, - behavior: 'smooth' + behavior: 'smooth', }); }, scrollDistance); // 随机停顿 500-1500ms - await new Promise(r => setTimeout(r, 500 + Math.random() * 1000)); + await new Promise((r) => setTimeout(r, 500 + Math.random() * 1000)); } // 滚动回顶部 await page.evaluate(() => { window.scrollTo({ top: 0, behavior: 'smooth' }); }); - await new Promise(r => setTimeout(r, 1000)); + await new Promise((r) => setTimeout(r, 1000)); } export interface EpsResult { @@ -52,12 +52,22 @@ export interface EpsResult { url: string; } +interface EpsCrawlerType { + name: string; + url: string; + baseUrl: string; + extract(html: string): EpsResult[]; +} + export const EpsCrawler = { name: '中国三峡集团电子商务平台', url: 'https://eps.ctg.com.cn/cms/channel/1ywgg1/index.htm', baseUrl: 'https://eps.ctg.com.cn/', - async crawl(browser: puppeteer.Browser): Promise { + async crawl( + this: EpsCrawlerType, + browser: puppeteer.Browser, + ): Promise { const logger = new Logger('EpsCrawler'); const page = await browser.newPage(); @@ -69,11 +79,15 @@ export const EpsCrawler = { await page.evaluateOnNewDocument(() => { Object.defineProperty(navigator, 'webdriver', { get: () => false }); - Object.defineProperty(navigator, 'language', { get: () => "zh-CN"}); - Object.defineProperty(navigator, 'plugins', { get: () => [1,2,3,4,5]}); + Object.defineProperty(navigator, 'language', { get: () => 'zh-CN' }); + Object.defineProperty(navigator, 'plugins', { + get: () => [1, 2, 3, 4, 5], + }); }); - await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36'); + await page.setUserAgent( + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36', + ); await page.setViewport({ width: 1920, height: 1080 }); const allResults: EpsResult[] = []; @@ -87,7 +101,7 @@ export const EpsCrawler = { // 模拟人类行为 logger.log('Simulating human mouse movements...'); await simulateHumanMouseMovement(page); - + logger.log('Simulating human scrolling...'); await simulateHumanScrolling(page); @@ -103,12 +117,14 @@ export const EpsCrawler = { } allResults.push(...pageResults); - logger.log(`Extracted ${pageResults.length} items from page ${currentPage}`); + logger.log( + `Extracted ${pageResults.length} items from page ${currentPage}`, + ); // 模拟人类行为 - 翻页前 logger.log('Simulating human mouse movements before pagination...'); await simulateHumanMouseMovement(page); - + logger.log('Simulating human scrolling before pagination...'); await simulateHumanScrolling(page); @@ -127,9 +143,13 @@ export const EpsCrawler = { try { // 点击下一页按钮,等待页面更新 await nextButton.click(); - await new Promise(r => setTimeout(r, 3000)); // 等待页面加载 + await new Promise((r) => setTimeout(r, 3000)); // 等待页面加载 } catch (navError) { - logger.error(`Navigation to page ${currentPage + 1} failed: ${navError.message}`); + const navErrorMessage = + navError instanceof Error ? navError.message : String(navError); + logger.error( + `Navigation to page ${currentPage + 1} failed: ${navErrorMessage}`, + ); break; } @@ -138,26 +158,27 @@ export const EpsCrawler = { // 模拟人类行为 - 翻页后 logger.log('Simulating human mouse movements after pagination...'); await simulateHumanMouseMovement(page); - + logger.log('Simulating human scrolling after pagination...'); await simulateHumanScrolling(page); // Random delay between pages const delay = Math.floor(Math.random() * (3000 - 1000 + 1)) + 1000; - await new Promise(resolve => setTimeout(resolve, delay)); + await new Promise((resolve) => setTimeout(resolve, delay)); } return allResults; - } catch (error) { - logger.error(`Failed to crawl ${this.name}: ${error.message}`); + const errorMessage = + error instanceof Error ? error.message : String(error); + logger.error(`Failed to crawl ${this.name}: ${errorMessage}`); return allResults; } finally { await page.close(); } }, - extract(html: string): EpsResult[] { + extract(this: EpsCrawlerType, html: string): EpsResult[] { const results: EpsResult[] = []; /** * Regex groups for eps.ctg.com.cn: @@ -179,24 +200,25 @@ export const EpsCrawler = { * *
  • */ - const regex = /]*name="li_name"[^>]*>[\s\S]*?]*href="([^"]*)"[^>]*title="([^"]*)"[^>]*>[\s\S]*?\s*(\d{4}-\d{2}-\d{2})\s*<\/em>[\s\S]*?<\/a>[\s\S]*?<\/li>/gs; + const regex = + /]*name="li_name"[^>]*>[\s\S]*?]*href="([^"]*)"[^>]*title="([^"]*)"[^>]*>[\s\S]*?\s*(\d{4}-\d{2}-\d{2})\s*<\/em>[\s\S]*?<\/a>[\s\S]*?<\/li>/gs; - let match; + let match: RegExpExecArray | null; while ((match = regex.exec(html)) !== null) { - const url = match[1]?.trim(); - const title = match[2]?.trim(); - const dateStr = match[3]?.trim(); + const url = match[1]?.trim() ?? ''; + const title = match[2]?.trim() ?? ''; + const dateStr = match[3]?.trim() ?? ''; if (title && url) { const fullUrl = url.startsWith('http') ? url : this.baseUrl + url; results.push({ title, publishDate: dateStr ? new Date(dateStr) : new Date(), - url: fullUrl.replace(/\/\//g, '/') + url: fullUrl.replace(/\/\//g, '/'), }); } } return results; - } + }, }; diff --git a/src/crawler/services/espic_target.spec.ts b/src/crawler/services/espic_target.spec.ts index 293c251..57a057b 100644 --- a/src/crawler/services/espic_target.spec.ts +++ b/src/crawler/services/espic_target.spec.ts @@ -2,7 +2,7 @@ import { EspicCrawler } from './espic_target'; import * as puppeteer from 'puppeteer'; // Increase timeout to 60 seconds for network operations -jest.setTimeout(60000*5); +jest.setTimeout(60000 * 5); // 获取代理配置 const getProxyArgs = (): string[] => { @@ -29,7 +29,7 @@ describe('EspicCrawler Real Site Test', () => { if (proxyArgs.length > 0) { console.log('Using proxy:', proxyArgs.join(' ')); } - + browser = await puppeteer.launch({ headless: false, // Change to false to see browser UI args: ['--no-sandbox', '--disable-setuid-sandbox', ...proxyArgs], @@ -45,13 +45,15 @@ describe('EspicCrawler Real Site Test', () => { it('should visit website and list all found bid information', async () => { console.log(`\nStarting crawl for: ${EspicCrawler.name}`); console.log(`Target URL: ${EspicCrawler.getUrl()}`); - + const results = await EspicCrawler.crawl(browser); - + console.log(`\nSuccessfully found ${results.length} items:\n`); console.log('----------------------------------------'); results.forEach((item, index) => { - console.log(`${index + 1}. [${item.publishDate.toLocaleDateString()}] ${item.title}`); + console.log( + `${index + 1}. [${item.publishDate.toLocaleDateString()}] ${item.title}`, + ); console.log(` Link: ${item.url}`); console.log('----------------------------------------'); }); @@ -61,13 +63,15 @@ describe('EspicCrawler Real Site Test', () => { expect(Array.isArray(results)).toBeTruthy(); // Warn but don't fail if site returns 0 items (could be empty or changed structure) if (results.length === 0) { - console.warn('Warning: No items found. Check if website structure has changed or if list is currently empty.'); + console.warn( + 'Warning: No items found. Check if website structure has changed or if list is currently empty.', + ); } else { - // Check data integrity of first item - const firstItem = results[0]; - expect(firstItem.title).toBeTruthy(); - expect(firstItem.url).toMatch(/^https?:\/\//); - expect(firstItem.publishDate).toBeInstanceOf(Date); + // Check data integrity of first item + const firstItem = results[0]; + expect(firstItem.title).toBeTruthy(); + expect(firstItem.url).toMatch(/^https?:\/\//); + expect(firstItem.publishDate).toBeInstanceOf(Date); } }); }); diff --git a/src/crawler/services/espic_target.ts b/src/crawler/services/espic_target.ts index 9075750..f7d31c4 100644 --- a/src/crawler/services/espic_target.ts +++ b/src/crawler/services/espic_target.ts @@ -11,13 +11,13 @@ async function simulateHumanMouseMovement(page: puppeteer.Page) { for (let i = 0; i < movements; i++) { const x = Math.floor(Math.random() * viewport.width); const y = Math.floor(Math.random() * viewport.height); - + await page.mouse.move(x, y, { - steps: 10 + Math.floor(Math.random() * 20) // 10-30步,使移动更平滑 + steps: 10 + Math.floor(Math.random() * 20), // 10-30步,使移动更平滑 }); - + // 随机停顿 100-500ms - await new Promise(r => setTimeout(r, 100 + Math.random() * 400)); + await new Promise((r) => setTimeout(r, 100 + Math.random() * 400)); } } @@ -27,23 +27,23 @@ async function simulateHumanScrolling(page: puppeteer.Page) { for (let i = 0; i < scrollCount; i++) { const scrollDistance = 100 + Math.floor(Math.random() * 400); // 100-500px - + await page.evaluate((distance) => { window.scrollBy({ top: distance, - behavior: 'smooth' + behavior: 'smooth', }); }, scrollDistance); // 随机停顿 500-1500ms - await new Promise(r => setTimeout(r, 500 + Math.random() * 1000)); + await new Promise((r) => setTimeout(r, 500 + Math.random() * 1000)); } // 滚动回顶部 await page.evaluate(() => { window.scrollTo({ top: 0, behavior: 'smooth' }); }); - await new Promise(r => setTimeout(r, 1000)); + await new Promise((r) => setTimeout(r, 1000)); } export interface EspicResult { @@ -52,12 +52,19 @@ export interface EspicResult { url: string; } +interface EspicCrawlerType { + name: string; + baseUrl: string; + getUrl(page?: number): string; + extract(html: string): EspicResult[]; +} + export const EspicCrawler = { name: '电能e招采平台(国电投)', baseUrl: 'https://ebid.espic.com.cn/', // 生成动态 URL,使用当前日期 - getUrl(page: number = 1): string { + getUrl(this: EspicCrawlerType, page: number = 1): string { const now = new Date(); const year = now.getFullYear(); const month = now.getMonth() + 1; // 月份从0开始 @@ -66,7 +73,10 @@ export const EspicCrawler = { return `https://ebid.espic.com.cn/newgdtcms//category/iframe.html?dates=300&categoryId=2&tenderMethod=01&tabName=&page=${page}&time=${timeStr}`; }, - async crawl(browser: puppeteer.Browser): Promise { + async crawl( + this: EspicCrawlerType, + browser: puppeteer.Browser, + ): Promise { const logger = new Logger('EspicCrawler'); const page = await browser.newPage(); @@ -78,11 +88,15 @@ export const EspicCrawler = { await page.evaluateOnNewDocument(() => { Object.defineProperty(navigator, 'webdriver', { get: () => false }); - Object.defineProperty(navigator, 'language', { get: () => "zh-CN"}); - Object.defineProperty(navigator, 'plugins', { get: () => [1,2,3,4,5]}); + Object.defineProperty(navigator, 'language', { get: () => 'zh-CN' }); + Object.defineProperty(navigator, 'plugins', { + get: () => [1, 2, 3, 4, 5], + }); }); - await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36'); + await page.setUserAgent( + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36', + ); await page.setViewport({ width: 1920, height: 1080 }); const allResults: EspicResult[] = []; @@ -100,15 +114,18 @@ export const EspicCrawler = { () => { // 检查是否已经通过验证(页面不再是 WAF 页面) const bodyText = document.body?.textContent || ''; - return !bodyText.includes('人机识别检测') && !bodyText.includes('WEB 应用防火墙'); + return ( + !bodyText.includes('人机识别检测') && + !bodyText.includes('WEB 应用防火墙') + ); }, - { timeout: 30000 } + { timeout: 30000 }, ); // 模拟人类行为 logger.log('Simulating human mouse movements...'); await simulateHumanMouseMovement(page); - + logger.log('Simulating human scrolling...'); await simulateHumanScrolling(page); @@ -124,12 +141,14 @@ export const EspicCrawler = { } allResults.push(...pageResults); - logger.log(`Extracted ${pageResults.length} items from page ${currentPage}`); + logger.log( + `Extracted ${pageResults.length} items from page ${currentPage}`, + ); // 模拟人类行为 - 翻页前 logger.log('Simulating human mouse movements before pagination...'); await simulateHumanMouseMovement(page); - + logger.log('Simulating human scrolling before pagination...'); await simulateHumanScrolling(page); @@ -141,7 +160,7 @@ export const EspicCrawler = { 'a[aria-label="Next"]', 'a.next', 'li.next a', - 'a.layui-laypage-next:not(.layui-disabled)' + 'a.layui-laypage-next:not(.layui-disabled)', ]; let nextButton: puppeteer.ElementHandle | null = null; @@ -149,7 +168,7 @@ export const EspicCrawler = { try { nextButton = await page.$(selector); if (nextButton) break; - } catch (e) { + } catch { // 继续尝试下一个选择器 } } @@ -164,9 +183,13 @@ export const EspicCrawler = { try { // 点击下一页按钮 await nextButton.click(); - await new Promise(r => setTimeout(r, 3000)); // 等待页面加载 + await new Promise((r) => setTimeout(r, 3000)); // 等待页面加载 } catch (navError) { - logger.error(`Navigation to page ${currentPage + 1} failed: ${navError.message}`); + const navErrorMessage = + navError instanceof Error ? navError.message : String(navError); + logger.error( + `Navigation to page ${currentPage + 1} failed: ${navErrorMessage}`, + ); break; } @@ -175,26 +198,27 @@ export const EspicCrawler = { // 模拟人类行为 - 翻页后 logger.log('Simulating human mouse movements after pagination...'); await simulateHumanMouseMovement(page); - + logger.log('Simulating human scrolling after pagination...'); await simulateHumanScrolling(page); // Random delay between pages const delay = Math.floor(Math.random() * (3000 - 1000 + 1)) + 1000; - await new Promise(resolve => setTimeout(resolve, delay)); + await new Promise((resolve) => setTimeout(resolve, delay)); } return allResults; - } catch (error) { - logger.error(`Failed to crawl ${this.name}: ${error.message}`); + const errorMessage = + error instanceof Error ? error.message : String(error); + logger.error(`Failed to crawl ${this.name}: ${errorMessage}`); return allResults; } finally { await page.close(); } }, - extract(html: string): EspicResult[] { + extract(this: EspicCrawlerType, html: string): EspicResult[] { const results: EspicResult[] = []; /** * Regex groups for ebid.espic.com.cn: @@ -225,24 +249,25 @@ export const EspicCrawler = { * * */ - const regex = /
  • [\s\S]*?]*href="([^"]*)"[^>]*title="([^"]*)"[^>]*>[\s\S]*?
    [\s\S]*?
    \s*(\d{4}-\d{2}-\d{2})\s*<\/div>[\s\S]*?<\/div>[\s\S]*?<\/a>[\s\S]*?<\/li>/gs; + const regex = + /
  • [\s\S]*?]*href="([^"]*)"[^>]*title="([^"]*)"[^>]*>[\s\S]*?
    [\s\S]*?
    \s*(\d{4}-\d{2}-\d{2})\s*<\/div>[\s\S]*?<\/div>[\s\S]*?<\/a>[\s\S]*?<\/li>/gs; - let match; + let match: RegExpExecArray | null; while ((match = regex.exec(html)) !== null) { - const url = match[1]?.trim(); - const title = match[2]?.trim(); - const dateStr = match[3]?.trim(); + const url = match[1]?.trim() ?? ''; + const title = match[2]?.trim() ?? ''; + const dateStr = match[3]?.trim() ?? ''; if (title && url) { const fullUrl = url.startsWith('http') ? url : this.baseUrl + url; results.push({ title, publishDate: dateStr ? new Date(dateStr) : new Date(), - url: fullUrl.replace(/\/\//g, '/') + url: fullUrl.replace(/\/\//g, '/'), }); } } return results; - } + }, }; diff --git a/src/crawler/services/powerbeijing_target.spec.ts b/src/crawler/services/powerbeijing_target.spec.ts index b0fa11b..03a2e0d 100644 --- a/src/crawler/services/powerbeijing_target.spec.ts +++ b/src/crawler/services/powerbeijing_target.spec.ts @@ -2,7 +2,7 @@ import { PowerbeijingCrawler } from './powerbeijing_target'; import * as puppeteer from 'puppeteer'; // Increase timeout to 60 seconds for network operations -jest.setTimeout(60000*5); +jest.setTimeout(60000 * 5); // 获取代理配置 const getProxyArgs = (): string[] => { @@ -29,7 +29,7 @@ describe('PowerbeijingCrawler Real Site Test', () => { if (proxyArgs.length > 0) { console.log('Using proxy:', proxyArgs.join(' ')); } - + browser = await puppeteer.launch({ headless: false, // Change to false to see browser UI args: ['--no-sandbox', '--disable-setuid-sandbox', ...proxyArgs], @@ -45,13 +45,15 @@ describe('PowerbeijingCrawler Real Site Test', () => { it('should visit website and list all found bid information', async () => { console.log(`\nStarting crawl for: ${PowerbeijingCrawler.name}`); console.log(`Target URL: ${PowerbeijingCrawler.url}`); - + const results = await PowerbeijingCrawler.crawl(browser); - + console.log(`\nSuccessfully found ${results.length} items:\n`); console.log('----------------------------------------'); results.forEach((item, index) => { - console.log(`${index + 1}. [${item.publishDate.toLocaleDateString()}] ${item.title}`); + console.log( + `${index + 1}. [${item.publishDate.toLocaleDateString()}] ${item.title}`, + ); console.log(` Link: ${item.url}`); console.log('----------------------------------------'); }); @@ -61,13 +63,15 @@ describe('PowerbeijingCrawler Real Site Test', () => { expect(Array.isArray(results)).toBeTruthy(); // Warn but don't fail if site returns 0 items (could be empty or changed structure) if (results.length === 0) { - console.warn('Warning: No items found. Check if website structure has changed or if list is currently empty.'); + console.warn( + 'Warning: No items found. Check if website structure has changed or if list is currently empty.', + ); } else { - // Check data integrity of first item - const firstItem = results[0]; - expect(firstItem.title).toBeTruthy(); - expect(firstItem.url).toMatch(/^https?:\/\//); - expect(firstItem.publishDate).toBeInstanceOf(Date); + // Check data integrity of first item + const firstItem = results[0]; + expect(firstItem.title).toBeTruthy(); + expect(firstItem.url).toMatch(/^https?:\/\//); + expect(firstItem.publishDate).toBeInstanceOf(Date); } }); }); diff --git a/src/crawler/services/powerbeijing_target.ts b/src/crawler/services/powerbeijing_target.ts index d5aaf2b..825512e 100644 --- a/src/crawler/services/powerbeijing_target.ts +++ b/src/crawler/services/powerbeijing_target.ts @@ -11,13 +11,13 @@ async function simulateHumanMouseMovement(page: puppeteer.Page) { for (let i = 0; i < movements; i++) { const x = Math.floor(Math.random() * viewport.width); const y = Math.floor(Math.random() * viewport.height); - + await page.mouse.move(x, y, { - steps: 10 + Math.floor(Math.random() * 20) // 10-30步,使移动更平滑 + steps: 10 + Math.floor(Math.random() * 20), // 10-30步,使移动更平滑 }); - + // 随机停顿 100-500ms - await new Promise(r => setTimeout(r, 100 + Math.random() * 400)); + await new Promise((r) => setTimeout(r, 100 + Math.random() * 400)); } } @@ -27,23 +27,23 @@ async function simulateHumanScrolling(page: puppeteer.Page) { for (let i = 0; i < scrollCount; i++) { const scrollDistance = 100 + Math.floor(Math.random() * 400); // 100-500px - + await page.evaluate((distance) => { window.scrollBy({ top: distance, - behavior: 'smooth' + behavior: 'smooth', }); }, scrollDistance); // 随机停顿 500-1500ms - await new Promise(r => setTimeout(r, 500 + Math.random() * 1000)); + await new Promise((r) => setTimeout(r, 500 + Math.random() * 1000)); } // 滚动回顶部 await page.evaluate(() => { window.scrollTo({ top: 0, behavior: 'smooth' }); }); - await new Promise(r => setTimeout(r, 1000)); + await new Promise((r) => setTimeout(r, 1000)); } export interface PowerbeijingResult { @@ -52,12 +52,22 @@ export interface PowerbeijingResult { url: string; } +interface PowerbeijingCrawlerType { + name: string; + url: string; + baseUrl: string; + extract(html: string): PowerbeijingResult[]; +} + export const PowerbeijingCrawler = { name: '北京京能电子商务平台', url: 'https://www.powerbeijing-ec.com/jncms/search/bulletin.html?dates=300&categoryId=2&tabName=%E6%8B%9B%E6%A0%87%E5%85%AC%E5%91%8A&page=1', baseUrl: 'https://www.powerbeijing-ec.com/', - async crawl(browser: puppeteer.Browser): Promise { + async crawl( + this: PowerbeijingCrawlerType, + browser: puppeteer.Browser, + ): Promise { const logger = new Logger('PowerbeijingCrawler'); const page = await browser.newPage(); @@ -69,11 +79,15 @@ export const PowerbeijingCrawler = { await page.evaluateOnNewDocument(() => { Object.defineProperty(navigator, 'webdriver', { get: () => false }); - Object.defineProperty(navigator, 'language', { get: () => "zh-CN"}); - Object.defineProperty(navigator, 'plugins', { get: () => [1,2,3,4,5]}); + Object.defineProperty(navigator, 'language', { get: () => 'zh-CN' }); + Object.defineProperty(navigator, 'plugins', { + get: () => [1, 2, 3, 4, 5], + }); }); - await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36'); + await page.setUserAgent( + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36', + ); await page.setViewport({ width: 1920, height: 1080 }); const allResults: PowerbeijingResult[] = []; @@ -87,7 +101,7 @@ export const PowerbeijingCrawler = { // 模拟人类行为 logger.log('Simulating human mouse movements...'); await simulateHumanMouseMovement(page); - + logger.log('Simulating human scrolling...'); await simulateHumanScrolling(page); @@ -103,12 +117,14 @@ export const PowerbeijingCrawler = { } allResults.push(...pageResults); - logger.log(`Extracted ${pageResults.length} items from page ${currentPage}`); + logger.log( + `Extracted ${pageResults.length} items from page ${currentPage}`, + ); // 模拟人类行为 - 翻页前 logger.log('Simulating human mouse movements before pagination...'); await simulateHumanMouseMovement(page); - + logger.log('Simulating human scrolling before pagination...'); await simulateHumanScrolling(page); @@ -127,9 +143,13 @@ export const PowerbeijingCrawler = { try { // 点击下一页按钮,等待页面更新 await nextButton.click(); - await new Promise(r => setTimeout(r, 3000)); // 等待页面加载 + await new Promise((r) => setTimeout(r, 3000)); // 等待页面加载 } catch (navError) { - logger.error(`Navigation to page ${currentPage + 1} failed: ${navError.message}`); + const navErrorMessage = + navError instanceof Error ? navError.message : String(navError); + logger.error( + `Navigation to page ${currentPage + 1} failed: ${navErrorMessage}`, + ); break; } @@ -138,26 +158,27 @@ export const PowerbeijingCrawler = { // 模拟人类行为 - 翻页后 logger.log('Simulating human mouse movements after pagination...'); await simulateHumanMouseMovement(page); - + logger.log('Simulating human scrolling after pagination...'); await simulateHumanScrolling(page); // Random delay between pages const delay = Math.floor(Math.random() * (3000 - 1000 + 1)) + 1000; - await new Promise(resolve => setTimeout(resolve, delay)); + await new Promise((resolve) => setTimeout(resolve, delay)); } return allResults; - } catch (error) { - logger.error(`Failed to crawl ${this.name}: ${error.message}`); + const errorMessage = + error instanceof Error ? error.message : String(error); + logger.error(`Failed to crawl ${this.name}: ${errorMessage}`); return allResults; } finally { await page.close(); } }, - extract(html: string): PowerbeijingResult[] { + extract(this: PowerbeijingCrawlerType, html: string): PowerbeijingResult[] { const results: PowerbeijingResult[] = []; /** * Regex groups for powerbeijing-ec.com: @@ -176,24 +197,25 @@ export const PowerbeijingCrawler = { * *
  • */ - const regex = /
  • [\s\S]*?]*href="([^"]*)"[^>]*title="([^"]*)"[^>]*>[\s\S]*?
    [\s\S]*?
    \s*(\d{4}-\d{2}-\d{2})\s*<\/div>[\s\S]*?<\/div>[\s\S]*?<\/a>[\s\S]*?<\/li>/gs; + const regex = + /
  • [\s\S]*?]*href="([^"]*)"[^>]*title="([^"]*)"[^>]*>[\s\S]*?
    [\s\S]*?
    \s*(\d{4}-\d{2}-\d{2})\s*<\/div>[\s\S]*?<\/div>[\s\S]*?<\/a>[\s\S]*?<\/li>/gs; - let match; + let match: RegExpExecArray | null; while ((match = regex.exec(html)) !== null) { - const url = match[1]?.trim(); - const title = match[2]?.trim(); - const dateStr = match[3]?.trim(); + const url = match[1]?.trim() ?? ''; + const title = match[2]?.trim() ?? ''; + const dateStr = match[3]?.trim() ?? ''; if (title && url) { const fullUrl = url.startsWith('http') ? url : this.baseUrl + url; results.push({ title, publishDate: dateStr ? new Date(dateStr) : new Date(), - url: fullUrl.replace(/\/\//g, '/') + url: fullUrl.replace(/\/\//g, '/'), }); } } return results; - } + }, }; diff --git a/src/crawler/services/sdicc_target.spec.ts b/src/crawler/services/sdicc_target.spec.ts index aa461a8..81662e1 100644 --- a/src/crawler/services/sdicc_target.spec.ts +++ b/src/crawler/services/sdicc_target.spec.ts @@ -2,7 +2,7 @@ import { SdiccCrawler } from './sdicc_target'; import * as puppeteer from 'puppeteer'; // Increase timeout to 60 seconds for network operations -jest.setTimeout(60000*5); +jest.setTimeout(60000 * 5); // 获取代理配置 const getProxyArgs = (): string[] => { @@ -29,7 +29,7 @@ describe('SdiccCrawler Real Site Test', () => { if (proxyArgs.length > 0) { console.log('Using proxy:', proxyArgs.join(' ')); } - + browser = await puppeteer.launch({ headless: false, // Change to false to see browser UI args: ['--no-sandbox', '--disable-setuid-sandbox', ...proxyArgs], @@ -45,13 +45,15 @@ describe('SdiccCrawler Real Site Test', () => { it('should visit website and list all found bid information', async () => { console.log(`\nStarting crawl for: ${SdiccCrawler.name}`); console.log(`Target URL: ${SdiccCrawler.url}`); - + const results = await SdiccCrawler.crawl(browser); - + console.log(`\nSuccessfully found ${results.length} items:\n`); console.log('----------------------------------------'); results.forEach((item, index) => { - console.log(`${index + 1}. [${item.publishDate.toLocaleDateString()}] ${item.title}`); + console.log( + `${index + 1}. [${item.publishDate.toLocaleDateString()}] ${item.title}`, + ); console.log(` Link: ${item.url}`); console.log('----------------------------------------'); }); @@ -61,13 +63,15 @@ describe('SdiccCrawler Real Site Test', () => { expect(Array.isArray(results)).toBeTruthy(); // Warn but don't fail if site returns 0 items (could be empty or changed structure) if (results.length === 0) { - console.warn('Warning: No items found. Check if website structure has changed or if list is currently empty.'); + console.warn( + 'Warning: No items found. Check if website structure has changed or if list is currently empty.', + ); } else { - // Check data integrity of first item - const firstItem = results[0]; - expect(firstItem.title).toBeTruthy(); - expect(firstItem.url).toMatch(/^https?:\/\//); - expect(firstItem.publishDate).toBeInstanceOf(Date); + // Check data integrity of first item + const firstItem = results[0]; + expect(firstItem.title).toBeTruthy(); + expect(firstItem.url).toMatch(/^https?:\/\//); + expect(firstItem.publishDate).toBeInstanceOf(Date); } }); }); diff --git a/src/crawler/services/sdicc_target.ts b/src/crawler/services/sdicc_target.ts index 9f9464d..75c307b 100644 --- a/src/crawler/services/sdicc_target.ts +++ b/src/crawler/services/sdicc_target.ts @@ -11,13 +11,13 @@ async function simulateHumanMouseMovement(page: puppeteer.Page) { for (let i = 0; i < movements; i++) { const x = Math.floor(Math.random() * viewport.width); const y = Math.floor(Math.random() * viewport.height); - + await page.mouse.move(x, y, { - steps: 10 + Math.floor(Math.random() * 20) // 10-30步,使移动更平滑 + steps: 10 + Math.floor(Math.random() * 20), // 10-30步,使移动更平滑 }); - + // 随机停顿 100-500ms - await new Promise(r => setTimeout(r, 100 + Math.random() * 400)); + await new Promise((r) => setTimeout(r, 100 + Math.random() * 400)); } } @@ -27,23 +27,23 @@ async function simulateHumanScrolling(page: puppeteer.Page) { for (let i = 0; i < scrollCount; i++) { const scrollDistance = 100 + Math.floor(Math.random() * 400); // 100-500px - + await page.evaluate((distance) => { window.scrollBy({ top: distance, - behavior: 'smooth' + behavior: 'smooth', }); }, scrollDistance); // 随机停顿 500-1500ms - await new Promise(r => setTimeout(r, 500 + Math.random() * 1000)); + await new Promise((r) => setTimeout(r, 500 + Math.random() * 1000)); } // 滚动回顶部 await page.evaluate(() => { window.scrollTo({ top: 0, behavior: 'smooth' }); }); - await new Promise(r => setTimeout(r, 1000)); + await new Promise((r) => setTimeout(r, 1000)); } export interface SdiccResult { @@ -52,12 +52,22 @@ export interface SdiccResult { url: string; } +interface SdiccCrawlerType { + name: string; + url: string; + baseUrl: string; + extract(html: string): SdiccResult[]; +} + export const SdiccCrawler = { name: '国投集团电子采购平台', url: 'https://www.sdicc.com.cn/cgxx/ggList', baseUrl: 'https://www.sdicc.com.cn/', - async crawl(browser: puppeteer.Browser): Promise { + async crawl( + this: SdiccCrawlerType, + browser: puppeteer.Browser, + ): Promise { const logger = new Logger('SdiccCrawler'); const page = await browser.newPage(); @@ -69,11 +79,15 @@ export const SdiccCrawler = { await page.evaluateOnNewDocument(() => { Object.defineProperty(navigator, 'webdriver', { get: () => false }); - Object.defineProperty(navigator, 'language', { get: () => "zh-CN"}); - Object.defineProperty(navigator, 'plugins', { get: () => [1,2,3,4,5]}); + Object.defineProperty(navigator, 'language', { get: () => 'zh-CN' }); + Object.defineProperty(navigator, 'plugins', { + get: () => [1, 2, 3, 4, 5], + }); }); - await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36'); + await page.setUserAgent( + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36', + ); await page.setViewport({ width: 1920, height: 1080 }); const allResults: SdiccResult[] = []; @@ -87,15 +101,17 @@ export const SdiccCrawler = { // 模拟人类行为 logger.log('Simulating human mouse movements...'); await simulateHumanMouseMovement(page); - + logger.log('Simulating human scrolling...'); await simulateHumanScrolling(page); // 等待表格加载 logger.log('Waiting for table to load...'); - await page.waitForSelector('.tbody table tbody tr', { timeout: 30000 }).catch(() => { - logger.warn('Table rows not found, trying alternative selectors...'); - }); + await page + .waitForSelector('.tbody table tbody tr', { timeout: 30000 }) + .catch(() => { + logger.warn('Table rows not found, trying alternative selectors...'); + }); while (currentPage <= maxPages) { logger.log(`Processing page ${currentPage}...`); @@ -109,12 +125,14 @@ export const SdiccCrawler = { } allResults.push(...pageResults); - logger.log(`Extracted ${pageResults.length} items from page ${currentPage}`); + logger.log( + `Extracted ${pageResults.length} items from page ${currentPage}`, + ); // 模拟人类行为 - 翻页前 logger.log('Simulating human mouse movements before pagination...'); await simulateHumanMouseMovement(page); - + logger.log('Simulating human scrolling before pagination...'); await simulateHumanScrolling(page); @@ -132,10 +150,16 @@ export const SdiccCrawler = { try { // 点击下一页按钮 await nextButton.click(); - await page.waitForNavigation({ waitUntil: 'networkidle2', timeout: 60000 }).catch(() => {}); - await new Promise(r => setTimeout(r, 2000)); // 额外等待确保数据加载完成 + await page + .waitForNavigation({ waitUntil: 'networkidle2', timeout: 60000 }) + .catch(() => {}); + await new Promise((r) => setTimeout(r, 2000)); // 额外等待确保数据加载完成 } catch (navError) { - logger.error(`Navigation to page ${currentPage + 1} failed: ${navError.message}`); + const navErrorMessage = + navError instanceof Error ? navError.message : String(navError); + logger.error( + `Navigation to page ${currentPage + 1} failed: ${navErrorMessage}`, + ); break; } @@ -144,26 +168,27 @@ export const SdiccCrawler = { // 模拟人类行为 - 翻页后 logger.log('Simulating human mouse movements after pagination...'); await simulateHumanMouseMovement(page); - + logger.log('Simulating human scrolling after pagination...'); await simulateHumanScrolling(page); // Random delay between pages const delay = Math.floor(Math.random() * (3000 - 1000 + 1)) + 1000; - await new Promise(resolve => setTimeout(resolve, delay)); + await new Promise((resolve) => setTimeout(resolve, delay)); } return allResults; - } catch (error) { - logger.error(`Failed to crawl ${this.name}: ${error.message}`); + const errorMessage = + error instanceof Error ? error.message : String(error); + logger.error(`Failed to crawl ${this.name}: ${errorMessage}`); return allResults; } finally { await page.close(); } }, - extract(html: string): SdiccResult[] { + extract(this: SdiccCrawlerType, html: string): SdiccResult[] { const results: SdiccResult[] = []; /** * Regex groups for sdicc.com.cn: @@ -180,25 +205,26 @@ export const SdiccCrawler = { * 2026-01-09 * */ - const regex = /]*onclick="urlChange\('([^']+)','([^']+)'\)"[^>]*>[\s\S]*?]*>]*>([^<]+)<\/span><\/td>[\s\S]*?]*>]*>\s*(\d{4}-\d{2}-\d{2})\s*<\/span><\/td>[\s\S]*?<\/tr>/gs; + const regex = + /]*onclick="urlChange\('([^']+)','([^']+)'\)"[^>]*>[\s\S]*?]*>]*>([^<]+)<\/span><\/td>[\s\S]*?]*>]*>\s*(\d{4}-\d{2}-\d{2})\s*<\/span><\/td>[\s\S]*?<\/tr>/gs; - let match; + let match: RegExpExecArray | null; while ((match = regex.exec(html)) !== null) { - const ggGuid = match[1]?.trim(); - const gcGuid = match[2]?.trim(); - const title = match[3]?.trim(); - const dateStr = match[4]?.trim(); + const ggGuid = match[1]?.trim() ?? ''; + const gcGuid = match[2]?.trim() ?? ''; + const title = match[3]?.trim() ?? ''; + const dateStr = match[4]?.trim() ?? ''; if (title && ggGuid && gcGuid) { const fullUrl = `${this.baseUrl}/cgxx/ggDetail?gcGuid=${gcGuid}&ggGuid=${ggGuid}`; results.push({ title, publishDate: dateStr ? new Date(dateStr) : new Date(), - url: fullUrl.replace(/\/\//g, '/') + url: fullUrl.replace(/\/\//g, '/'), }); } } return results; - } + }, }; diff --git a/src/crawler/services/szecp_target.spec.ts b/src/crawler/services/szecp_target.spec.ts index 26455f1..a6c3f63 100644 --- a/src/crawler/services/szecp_target.spec.ts +++ b/src/crawler/services/szecp_target.spec.ts @@ -29,7 +29,7 @@ describe('SzecpCrawler Real Site Test', () => { if (proxyArgs.length > 0) { console.log('Using proxy:', proxyArgs.join(' ')); } - + browser = await puppeteer.launch({ headless: false, // Run in non-headless mode args: [ @@ -40,14 +40,14 @@ describe('SzecpCrawler Real Site Test', () => { '--disable-infobars', ...proxyArgs, ], - defaultViewport: null + defaultViewport: null, }); }); afterAll(async () => { if (browser) { // Keep open for a few seconds after test to see result - await new Promise(r => setTimeout(r, 50000)); + await new Promise((r) => setTimeout(r, 50000)); await browser.close(); } }); @@ -56,29 +56,33 @@ describe('SzecpCrawler Real Site Test', () => { console.log(` Starting crawl for: ${SzecpCrawler.name}`); console.log(`Target URL: ${SzecpCrawler.url}`); - + const results = await SzecpCrawler.crawl(browser); - + console.log(` Successfully found ${results.length} items: `); console.log('----------------------------------------'); results.forEach((item, index) => { - console.log(`${index + 1}. [${item.publishDate.toLocaleDateString()}] ${item.title}`); + console.log( + `${index + 1}. [${item.publishDate.toLocaleDateString()}] ${item.title}`, + ); console.log(` Link: ${item.url}`); console.log('----------------------------------------'); }); expect(results).toBeDefined(); expect(Array.isArray(results)).toBeTruthy(); - + if (results.length === 0) { - console.warn('Warning: No items found. Observe browser window to see if content is loading or if there is a verification challenge.'); + console.warn( + 'Warning: No items found. Observe browser window to see if content is loading or if there is a verification challenge.', + ); } else { - const firstItem = results[0]; - expect(firstItem.title).toBeTruthy(); - expect(firstItem.url).toMatch(/^https?:\/\//); - expect(firstItem.publishDate).toBeInstanceOf(Date); + const firstItem = results[0]; + expect(firstItem.title).toBeTruthy(); + expect(firstItem.url).toMatch(/^https?:\/\//); + expect(firstItem.publishDate).toBeInstanceOf(Date); } }); }); diff --git a/src/crawler/services/szecp_target.ts b/src/crawler/services/szecp_target.ts index 498f59f..16b7ce2 100644 --- a/src/crawler/services/szecp_target.ts +++ b/src/crawler/services/szecp_target.ts @@ -12,13 +12,13 @@ async function simulateHumanMouseMovement(page: puppeteer.Page) { for (let i = 0; i < movements; i++) { const x = Math.floor(Math.random() * viewport.width); const y = Math.floor(Math.random() * viewport.height); - + await page.mouse.move(x, y, { - steps: 10 + Math.floor(Math.random() * 20) // 10-30步,使移动更平滑 + steps: 10 + Math.floor(Math.random() * 20), // 10-30步,使移动更平滑 }); - + // 随机停顿 100-500ms - await new Promise(r => setTimeout(r, 100 + Math.random() * 400)); + await new Promise((r) => setTimeout(r, 100 + Math.random() * 400)); } } @@ -28,23 +28,29 @@ async function simulateHumanScrolling(page: puppeteer.Page) { for (let i = 0; i < scrollCount; i++) { const scrollDistance = 100 + Math.floor(Math.random() * 400); // 100-500px - + await page.evaluate((distance) => { window.scrollBy({ top: distance, - behavior: 'smooth' + behavior: 'smooth', }); }, scrollDistance); // 随机停顿 500-1500ms - await new Promise(r => setTimeout(r, 500 + Math.random() * 1000)); + await new Promise((r) => setTimeout(r, 500 + Math.random() * 1000)); } // 滚动回顶部 await page.evaluate(() => { window.scrollTo({ top: 0, behavior: 'smooth' }); }); - await new Promise(r => setTimeout(r, 1000)); + await new Promise((r) => setTimeout(r, 1000)); +} + +interface SzecpCrawlerType { + name: string; + url: string; + baseUrl: string; } export const SzecpCrawler = { @@ -52,7 +58,10 @@ export const SzecpCrawler = { url: 'https://www.szecp.com.cn/first_zbgg/index.html', baseUrl: 'https://www.szecp.com.cn/', - async crawl(browser: puppeteer.Browser): Promise { + async crawl( + this: SzecpCrawlerType, + browser: puppeteer.Browser, + ): Promise { const logger = new Logger('SzecpCrawler'); const page = await browser.newPage(); @@ -65,10 +74,14 @@ export const SzecpCrawler = { await page.evaluateOnNewDocument(() => { Object.defineProperty(navigator, 'webdriver', { get: () => false }); Object.defineProperty(navigator, 'language', { get: () => 'zh-CN' }); - Object.defineProperty(navigator, 'plugins', { get: () => [1, 2, 3, 4, 5] }); + Object.defineProperty(navigator, 'plugins', { + get: () => [1, 2, 3, 4, 5], + }); }); - await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36'); + await page.setUserAgent( + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36', + ); await page.setViewport({ width: 1920, height: 1080 }); const allResults: ChdtpResult[] = []; @@ -82,7 +95,7 @@ export const SzecpCrawler = { // 模拟人类行为 logger.log('Simulating human mouse movements...'); await simulateHumanMouseMovement(page); - + logger.log('Simulating human scrolling...'); await simulateHumanScrolling(page); @@ -90,52 +103,69 @@ export const SzecpCrawler = { logger.log('Clicking search button...'); await page.waitForSelector('.szb-zbcgSearch-key-v1', { timeout: 60000 }); await page.click('.szb-zbcgSearch-key-v1'); - await new Promise(r => setTimeout(r, 3000)); // Wait for results to load + await new Promise((r) => setTimeout(r, 3000)); // Wait for results to load while (currentPage <= maxPages) { logger.log(`Processing page ${currentPage}...`); // Wait for content to load - await page.waitForFunction(() => { - return document.querySelectorAll('.szb-zbcgTable-other').length > 0; - }, { timeout: 60000 }).catch(() => logger.warn('Content not found. Site might be slow.')); + await page + .waitForFunction( + () => { + return ( + document.querySelectorAll('.szb-zbcgTable-other').length > 0 + ); + }, + { timeout: 60000 }, + ) + .catch(() => logger.warn('Content not found. Site might be slow.')); const pageResults = await page.evaluate((baseUrl) => { // Extract from table rows - const items = Array.from(document.querySelectorAll('.szb-zbcgTable-other')); - return items.map(item => { - const divs = item.querySelectorAll('div'); - if (divs.length >= 5) { - const titleLink = divs[1].querySelector('a'); - const title = titleLink?.textContent?.trim() || ''; - const dateStr = divs[4].textContent?.trim() || ''; - const href = titleLink?.getAttribute('href') || ''; + const items = Array.from( + document.querySelectorAll('.szb-zbcgTable-other'), + ); + return items + .map((item) => { + const divs = item.querySelectorAll('div'); + if (divs.length >= 5) { + const titleLink = divs[1].querySelector('a'); + const title = titleLink?.textContent?.trim() || ''; + const dateStr = divs[4].textContent?.trim() || ''; + const href = titleLink?.getAttribute('href') || ''; - if (title.length < 5) return null; // Filter noise + if (title.length < 5) return null; // Filter noise - // Construct full URL if href is relative - const url = href.startsWith('http') ? href : `${baseUrl}${href}`; + // Construct full URL if href is relative + const url = href.startsWith('http') + ? href + : `${baseUrl}${href}`; - return { - title, - dateStr, - url - }; - } - return null; - }).filter(i => i !== null); + return { + title, + dateStr, + url, + }; + } + return null; + }) + .filter((i) => i !== null); }, this.baseUrl); if (pageResults.length === 0) { - logger.warn(`No results found on page ${currentPage}. Extraction failed.`); + logger.warn( + `No results found on page ${currentPage}. Extraction failed.`, + ); break; } - allResults.push(...pageResults.map(r => ({ - title: r!.title, - publishDate: new Date(r!.dateStr), - url: r!.url.replace(/\/\//g, '/') - }))); + allResults.push( + ...pageResults.map((r) => ({ + title: r.title, + publishDate: new Date(r.dateStr), + url: r.url.replace(/\/\//g, '/'), + })), + ); logger.log(`Extracted ${pageResults.length} items.`); @@ -144,27 +174,30 @@ export const SzecpCrawler = { if (!nextButton) break; await nextButton.click(); - await new Promise(r => setTimeout(r, 3000)); - + await new Promise((r) => setTimeout(r, 3000)); + // 模拟人类行为 logger.log('Simulating human mouse movements...'); await simulateHumanMouseMovement(page); - + logger.log('Simulating human scrolling...'); await simulateHumanScrolling(page); - + currentPage++; } return allResults; - } catch (error) { - logger.error(`Crawl failed: ${error.message}`); + const errorMessage = + error instanceof Error ? error.message : String(error); + logger.error(`Crawl failed: ${errorMessage}`); return allResults; } finally { if (page) await page.close(); } }, - extract() { return []; } + extract() { + return []; + }, }; diff --git a/src/database/database.module.ts b/src/database/database.module.ts index ea1112a..a067ee1 100644 --- a/src/database/database.module.ts +++ b/src/database/database.module.ts @@ -12,7 +12,11 @@ import { CrawlInfoAdd } from '../crawler/entities/crawl-info-add.entity'; imports: [ConfigModule], inject: [ConfigService], useFactory: (configService: ConfigService) => ({ - type: configService.get('DATABASE_TYPE', 'mariadb'), + type: + (configService.get('DATABASE_TYPE', 'mariadb') as + | 'mariadb' + | 'mysql' + | 'postgres') || 'mariadb', host: configService.get('DATABASE_HOST', 'localhost'), port: configService.get('DATABASE_PORT', 3306), username: configService.get('DATABASE_USERNAME', 'root'), diff --git a/src/keywords/keyword.entity.ts b/src/keywords/keyword.entity.ts index 29496df..e79dd96 100644 --- a/src/keywords/keyword.entity.ts +++ b/src/keywords/keyword.entity.ts @@ -1,4 +1,10 @@ -import { Entity, PrimaryGeneratedColumn, Column, CreateDateColumn, UpdateDateColumn } from 'typeorm'; +import { + Entity, + PrimaryGeneratedColumn, + Column, + CreateDateColumn, + UpdateDateColumn, +} from 'typeorm'; @Entity('keywords') export class Keyword { diff --git a/src/main.ts b/src/main.ts index c5a45be..3d65095 100644 --- a/src/main.ts +++ b/src/main.ts @@ -6,19 +6,20 @@ async function bootstrap() { const app = await NestFactory.create(AppModule, { bodyParser: true, }); - + // 使用自定义日志服务 const logger = await app.resolve(CustomLogger); app.useLogger(logger); - + // 增加请求体大小限制(默认 100kb,增加到 50mb) - const express = require('express'); + // eslint-disable-next-line @typescript-eslint/no-require-imports + const express = require('express') as typeof import('express'); app.use(express.json({ limit: '50mb' })); app.use(express.urlencoded({ limit: '50mb', extended: true })); - + // 启用 CORS app.enableCors(); - + await app.listen(process.env.PORT ?? 3000); } -bootstrap(); +void bootstrap(); diff --git a/src/scripts/ai-recommendations.ts b/src/scripts/ai-recommendations.ts index f4d145d..b10a7a9 100644 --- a/src/scripts/ai-recommendations.ts +++ b/src/scripts/ai-recommendations.ts @@ -16,7 +16,9 @@ async function generateAiRecommendations() { try { // 获取 BidItem 的 repository 和 AiService - const bidItemRepository = app.get>(getRepositoryToken(BidItem)); + const bidItemRepository = app.get>( + getRepositoryToken(BidItem), + ); const aiService = app.get(AiService); logger.log('开始查询 bid_items 表...'); @@ -27,11 +29,13 @@ async function generateAiRecommendations() { threeDaysAgo.setHours(0, 0, 0, 0); // 使用本地时间格式化输出,避免时区问题 - const localDateStr = threeDaysAgo.toLocaleDateString('zh-CN', { - year: 'numeric', - month: '2-digit', - day: '2-digit' - }).replace(/\//g, '-'); + const localDateStr = threeDaysAgo + .toLocaleDateString('zh-CN', { + year: 'numeric', + month: '2-digit', + day: '2-digit', + }) + .replace(/\//g, '-'); logger.log(`查询起始日期: ${localDateStr}`); // 查询起始日期3天前,截止日期不限制的所有记录 @@ -50,8 +54,8 @@ async function generateAiRecommendations() { } // 提取 title - const bidData = bidItems.map(item => ({ - title: item.title + const bidData = bidItems.map((item) => ({ + title: item.title, })); logger.log('开始调用 AI 获取推荐...'); diff --git a/src/scripts/crawl.ts b/src/scripts/crawl.ts index b97a78a..4a4428a 100644 --- a/src/scripts/crawl.ts +++ b/src/scripts/crawl.ts @@ -5,19 +5,19 @@ import { CustomLogger } from '../common/logger/logger.service'; async function runCrawler() { const app = await NestFactory.createApplicationContext(AppModule); - + // 设置自定义 logger,使 NestJS 框架日志也输出到文件 const logger = await app.resolve(CustomLogger); app.useLogger(logger); logger.setContext('CrawlScript'); - + try { const crawlerService = await app.resolve(BidCrawlerService); - + logger.log('Starting crawler...'); await crawlerService.crawlAll(); logger.log('Crawler completed successfully'); - + await app.close(); process.exit(0); } catch (error) { diff --git a/src/scripts/remove-duplicates.ts b/src/scripts/remove-duplicates.ts index 0a9cec0..9ce63be 100644 --- a/src/scripts/remove-duplicates.ts +++ b/src/scripts/remove-duplicates.ts @@ -15,7 +15,9 @@ async function removeDuplicates() { try { // 获取 BidItem 的 repository - const bidItemRepository = app.get>(getRepositoryToken(BidItem)); + const bidItemRepository = app.get>( + getRepositoryToken(BidItem), + ); logger.log('开始查找重复的title...'); @@ -56,10 +58,12 @@ async function removeDuplicates() { const itemsToDelete = items.slice(1); if (itemsToDelete.length > 0) { - const idsToDelete = itemsToDelete.map(item => item.id); + const idsToDelete = itemsToDelete.map((item) => item.id); const deleteResult = await bidItemRepository.delete(idsToDelete); totalDeleted += deleteResult.affected || 0; - logger.log(` 删除了 ${deleteResult.affected} 条重复记录,保留ID: ${items[0].id} (最晚创建)`); + logger.log( + ` 删除了 ${deleteResult.affected} 条重复记录,保留ID: ${items[0].id} (最晚创建)`, + ); } } diff --git a/src/scripts/sync.ts b/src/scripts/sync.ts index 6ca6e92..63ba8e6 100644 --- a/src/scripts/sync.ts +++ b/src/scripts/sync.ts @@ -8,7 +8,7 @@ import { CrawlInfoAdd } from '../crawler/entities/crawl-info-add.entity'; // 主数据库配置 const masterDbConfig: DataSourceOptions = { - type: process.env.DATABASE_TYPE as any || 'mariadb', + type: (process.env.DATABASE_TYPE as any) || 'mariadb', host: process.env.DATABASE_HOST || 'localhost', port: parseInt(process.env.DATABASE_PORT || '3306'), username: process.env.DATABASE_USERNAME || 'root', @@ -20,7 +20,7 @@ const masterDbConfig: DataSourceOptions = { // Slave 数据库配置 const slaveDbConfig: DataSourceOptions = { - type: process.env.SLAVE_DATABASE_TYPE as any || 'mariadb', + type: (process.env.SLAVE_DATABASE_TYPE as any) || 'mariadb', host: process.env.SLAVE_DATABASE_HOST || 'localhost', port: parseInt(process.env.SLAVE_DATABASE_PORT || '3306'), username: process.env.SLAVE_DATABASE_USERNAME || 'root', @@ -94,12 +94,17 @@ async function createDatabaseIfNotExists(config: DataSourceOptions) { password: (config as any).password, }); - await connection.query(`CREATE DATABASE IF NOT EXISTS \`${(config as any).database}\``); + await connection.query( + `CREATE DATABASE IF NOT EXISTS \`${(config as any).database}\``, + ); await connection.end(); } // 同步表结构 -async function syncSchema(masterDataSource: DataSource, slaveDataSource: DataSource): Promise { +async function syncSchema( + masterDataSource: DataSource, + slaveDataSource: DataSource, +): Promise { logger.log('开始同步表结构...'); // 获取主数据库的所有表 @@ -137,8 +142,12 @@ async function syncSchema(masterDataSource: DataSource, slaveDataSource: DataSou if (tableExists[0].count > 0) { // 表存在,先备份数据到临时表 logger.log(`备份表 ${tableName} 的数据到 ${tempTableName}...`); - await slaveDataSource.query(`CREATE TABLE ${tempTableName} AS SELECT * FROM \`${tableName}\``); - logger.log(`备份完成,共备份 ${await slaveDataSource.query(`SELECT COUNT(*) as count FROM ${tempTableName}`).then(r => r[0].count)} 条记录`); + await slaveDataSource.query( + `CREATE TABLE ${tempTableName} AS SELECT * FROM \`${tableName}\``, + ); + logger.log( + `备份完成,共备份 ${await slaveDataSource.query(`SELECT COUNT(*) as count FROM ${tempTableName}`).then((r) => r[0].count)} 条记录`, + ); } // 删除 slave 数据库中的表(如果存在) @@ -151,7 +160,7 @@ async function syncSchema(masterDataSource: DataSource, slaveDataSource: DataSou if (tableExists[0].count > 0) { try { logger.log(`从 ${tempTableName} 恢复数据到 ${tableName}...`); - + // 获取临时表的列名 const columns = await slaveDataSource.query(` SELECT COLUMN_NAME @@ -159,18 +168,22 @@ async function syncSchema(masterDataSource: DataSource, slaveDataSource: DataSou WHERE TABLE_SCHEMA = '${(slaveDbConfig as any).database}' AND TABLE_NAME = '${tempTableName}' `); - - const columnNames = columns.map((c: any) => `\`${c.COLUMN_NAME}\``).join(', '); - + + const columnNames = columns + .map((c: any) => `\`${c.COLUMN_NAME}\``) + .join(', '); + // 将数据从临时表插入到新表 await slaveDataSource.query(` INSERT INTO \`${tableName}\` (${columnNames}) SELECT ${columnNames} FROM ${tempTableName} `); - - const restoredCount = await slaveDataSource.query(`SELECT COUNT(*) as count FROM \`${tableName}\``); + + const restoredCount = await slaveDataSource.query( + `SELECT COUNT(*) as count FROM \`${tableName}\``, + ); logger.log(`数据恢复完成,共恢复 ${restoredCount[0].count} 条记录`); - + // 删除临时表 await slaveDataSource.query(`DROP TABLE IF EXISTS ${tempTableName}`); } catch (error) { @@ -181,13 +194,13 @@ async function syncSchema(masterDataSource: DataSource, slaveDataSource: DataSou } logger.log('表结构同步完成'); - + // 重新初始化 slave 数据库连接以清除 TypeORM 元数据缓存 logger.log('重新初始化 slave 数据库连接...'); await slaveDataSource.destroy(); await slaveDataSource.initialize(); logger.log('Slave 数据库连接重新初始化完成'); - + return slaveDataSource; } @@ -227,7 +240,12 @@ async function syncDatabase() { let totalSynced = 0; for (const table of tables) { - const count = await syncTable(masterDataSource, slaveDataSource, table.entity, table.name); + const count = await syncTable( + masterDataSource, + slaveDataSource, + table.entity, + table.name, + ); totalSynced += count; } diff --git a/src/scripts/update-source.ts b/src/scripts/update-source.ts index a70f551..94f4caa 100644 --- a/src/scripts/update-source.ts +++ b/src/scripts/update-source.ts @@ -15,7 +15,9 @@ async function updateSource() { try { // 获取 BidItem 的 repository - const bidItemRepository = app.get>(getRepositoryToken(BidItem)); + const bidItemRepository = app.get>( + getRepositoryToken(BidItem), + ); const oldSource = '北京电力交易平台'; const newSource = '北京京能电子商务平台'; diff --git a/widget/looker/sys_run/go.mod b/widget/looker/sys_run/go.mod new file mode 100644 index 0000000..3fc2a65 --- /dev/null +++ b/widget/looker/sys_run/go.mod @@ -0,0 +1,17 @@ +module systray_run + +go 1.23 + +require github.com/getlantern/systray v1.2.2 + +require ( + github.com/getlantern/context v0.0.0-20190109183933-c447772a6520 // indirect + github.com/getlantern/errors v0.0.0-20190325191628-abdb3e3e36f7 // indirect + github.com/getlantern/golog v0.0.0-20190830074920-4ef2e798c2d7 // indirect + github.com/getlantern/hex v0.0.0-20190417191902-c6586a6fe0b7 // indirect + github.com/getlantern/hidden v0.0.0-20190325191715-f02dbb02be55 // indirect + github.com/getlantern/ops v0.0.0-20190325191751-d70cb0d6f85f // indirect + github.com/go-stack/stack v1.8.0 // indirect + github.com/oxtoacart/bpool v0.0.0-20190530202638-03653db5a59c // indirect + golang.org/x/sys v0.1.0 // indirect +) diff --git a/widget/looker/sys_run/go.sum b/widget/looker/sys_run/go.sum new file mode 100644 index 0000000..4c056eb --- /dev/null +++ b/widget/looker/sys_run/go.sum @@ -0,0 +1,32 @@ +github.com/davecgh/go-spew v1.1.0 h1:ZDRjVQ15GmhC3fiQ8ni8+OwkZQO4DARzQgrnXU1Liz8= +github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/getlantern/context v0.0.0-20190109183933-c447772a6520 h1:NRUJuo3v3WGC/g5YiyF790gut6oQr5f3FBI88Wv0dx4= +github.com/getlantern/context v0.0.0-20190109183933-c447772a6520/go.mod h1:L+mq6/vvYHKjCX2oez0CgEAJmbq1fbb/oNJIWQkBybY= +github.com/getlantern/errors v0.0.0-20190325191628-abdb3e3e36f7 h1:6uJ+sZ/e03gkbqZ0kUG6mfKoqDb4XMAzMIwlajq19So= +github.com/getlantern/errors v0.0.0-20190325191628-abdb3e3e36f7/go.mod h1:l+xpFBrCtDLpK9qNjxs+cHU6+BAdlBaxHqikB6Lku3A= +github.com/getlantern/golog v0.0.0-20190830074920-4ef2e798c2d7 h1:guBYzEaLz0Vfc/jv0czrr2z7qyzTOGC9hiQ0VC+hKjk= +github.com/getlantern/golog v0.0.0-20190830074920-4ef2e798c2d7/go.mod h1:zx/1xUUeYPy3Pcmet8OSXLbF47l+3y6hIPpyLWoR9oc= +github.com/getlantern/hex v0.0.0-20190417191902-c6586a6fe0b7 h1:micT5vkcr9tOVk1FiH8SWKID8ultN44Z+yzd2y/Vyb0= +github.com/getlantern/hex v0.0.0-20190417191902-c6586a6fe0b7/go.mod h1:dD3CgOrwlzca8ed61CsZouQS5h5jIzkK9ZWrTcf0s+o= +github.com/getlantern/hidden v0.0.0-20190325191715-f02dbb02be55 h1:XYzSdCbkzOC0FDNrgJqGRo8PCMFOBFL9py72DRs7bmc= +github.com/getlantern/hidden v0.0.0-20190325191715-f02dbb02be55/go.mod h1:6mmzY2kW1TOOrVy+r41Za2MxXM+hhqTtY3oBKd2AgFA= +github.com/getlantern/ops v0.0.0-20190325191751-d70cb0d6f85f h1:wrYrQttPS8FHIRSlsrcuKazukx/xqO/PpLZzZXsF+EA= +github.com/getlantern/ops v0.0.0-20190325191751-d70cb0d6f85f/go.mod h1:D5ao98qkA6pxftxoqzibIBBrLSUli+kYnJqrgBf9cIA= +github.com/getlantern/systray v1.2.2 h1:dCEHtfmvkJG7HZ8lS/sLklTH4RKUcIsKrAD9sThoEBE= +github.com/getlantern/systray v1.2.2/go.mod h1:pXFOI1wwqwYXEhLPm9ZGjS2u/vVELeIgNMY5HvhHhcE= +github.com/go-stack/stack v1.8.0 h1:5SgMzNM5HxrEjV0ww2lTmX6E2Izsfxas4+YHWRs3Lsk= +github.com/go-stack/stack v1.8.0/go.mod h1:v0f6uXyyMGvRgIKkXu+yp6POWl0qKG85gN/melR3HDY= +github.com/lxn/walk v0.0.0-20210112085537-c389da54e794/go.mod h1:E23UucZGqpuUANJooIbHWCufXvOcT6E7Stq81gU+CSQ= +github.com/lxn/win v0.0.0-20210218163916-a377121e959e/go.mod h1:KxxjdtRkfNoYDCUP5ryK7XJJNTnpC8atvtmTheChOtk= +github.com/oxtoacart/bpool v0.0.0-20190530202638-03653db5a59c h1:rp5dCmg/yLR3mgFuSOe4oEnDDmGLROTvMragMUXpTQw= +github.com/oxtoacart/bpool v0.0.0-20190530202638-03653db5a59c/go.mod h1:X07ZCGwUbLaax7L0S3Tw4hpejzu63ZrrQiUe6W0hcy0= +github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/skratchdot/open-golang v0.0.0-20200116055534-eef842397966/go.mod h1:sUM3LWHvSMaG192sy56D9F7CNvL7jUJVXoqM1QKLnog= +github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= +github.com/stretchr/testify v1.3.0 h1:TivCn/peBQ7UY8ooIcPgZFpTNSz0Q2U6UrFlUfqbe0Q= +github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= +golang.org/x/sys v0.0.0-20201018230417-eeed37f84f13/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.1.0 h1:kunALQeHf1/185U1i0GOB/fy1IPRDDpuoOOqRReG57U= +golang.org/x/sys v0.1.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +gopkg.in/Knetic/govaluate.v3 v3.0.0/go.mod h1:csKLBORsPbafmSCGTEh3U7Ozmsuq8ZSIlKk1bcqph0E=