chore: 更新.gitignore并添加新文件

在.gitignore中添加对*.png、*.log、*-lock.json、*.woff2文件的忽略规则,并新增OFL.txt文件。同时,添加vue.svg图标文件以支持前端展示。更新多个TypeScript文件以优化代码格式和增强可读性。
This commit is contained in:
dmy
2026-01-14 22:26:32 +08:00
parent 10565af001
commit 82f5a81887
47 changed files with 1513 additions and 814 deletions

6
.gitignore vendored
View File

@@ -7,4 +7,8 @@ pw-browsers
logs logs
build build
*.exe *.exe
*.png *.png
*.log
*-lock.json
*.woff2
widget/looker/frontend/src/assets/fonts/OFL.txt

View File

@@ -0,0 +1 @@
<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" class="iconify iconify--logos" width="37.07" height="36" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 198"><path fill="#41B883" d="M204.8 0H256L128 220.8L0 0h97.92L128 51.2L157.44 0h47.36Z"></path><path fill="#41B883" d="m0 0l128 220.8L256 0h-51.2L128 132.48L50.56 0H0Z"></path><path fill="#35495E" d="M50.56 0L128 133.12L204.8 0h-47.36L128 51.2L97.92 0H50.56Z"></path></svg>

After

Width:  |  Height:  |  Size: 496 B

View File

@@ -31,14 +31,13 @@ export class AiService {
@InjectRepository(BidItem) @InjectRepository(BidItem)
private readonly bidItemRepository: Repository<BidItem>, private readonly bidItemRepository: Repository<BidItem>,
) { ) {
const apiKey = this.configService.get<string>('ARK_API_KEY');
// this.openai = new OpenAI({ // this.openai = new OpenAI({
// apiKey: apiKey || '', // apiKey: this.configService.get<string>('ARK_API_KEY') || '',
// baseURL: 'https://ark.cn-beijing.volces.com/api/v3', // baseURL: 'https://ark.cn-beijing.volces.com/api/v3',
// timeout: 120000, // 120秒超时 // timeout: 120000, // 120秒超时
// }); // });
this.openai = new OpenAI({ this.openai = new OpenAI({
apiKey: 'sk-5sSOxrJl31MGz76bE14d2fDbA55b44869fCcA0C813Fc893a' , apiKey: 'sk-5sSOxrJl31MGz76bE14d2fDbA55b44869fCcA0C813Fc893a',
baseURL: 'https://aihubmix.com/v1', baseURL: 'https://aihubmix.com/v1',
timeout: 120000, // 120秒超时 timeout: 120000, // 120秒超时
}); });
@@ -49,7 +48,9 @@ export class AiService {
this.logger.log(`发送给 AI 的数据数量: ${bids.length}`); this.logger.log(`发送给 AI 的数据数量: ${bids.length}`);
try { try {
const prompt =PromptString+ `请根据以下投标项目标题列表,筛选出我关心的项目。请以 JSON 格式返回,格式如下: const prompt =
PromptString +
`请根据以下投标项目标题列表,筛选出我关心的项目。请以 JSON 格式返回,格式如下:
[ [
{ {
"title": "项目标题", "title": "项目标题",
@@ -58,7 +59,11 @@ export class AiService {
] ]
投标项目标题列表: 投标项目标题列表:
${JSON.stringify(bids.map(b => b.title), null, 2)}`; ${JSON.stringify(
bids.map((b) => b.title),
null,
2,
)}`;
// this.logger.log('发给AI的内容',prompt); // this.logger.log('发给AI的内容',prompt);
const completion = await this.openai.chat.completions.create({ const completion = await this.openai.chat.completions.create({
model: 'mimo-v2-flash-free', model: 'mimo-v2-flash-free',
@@ -97,7 +102,9 @@ ${JSON.stringify(bids.map(b => b.title), null, 2)}`;
} }
} }
async saveRecommendations(recommendations: AIRecommendation[]): Promise<void> { async saveRecommendations(
recommendations: AIRecommendation[],
): Promise<void> {
this.logger.log('开始保存 AI 推荐结果'); this.logger.log('开始保存 AI 推荐结果');
try { try {
@@ -105,7 +112,7 @@ ${JSON.stringify(bids.map(b => b.title), null, 2)}`;
await this.aiRecommendationRepository.clear(); await this.aiRecommendationRepository.clear();
// 保存新的推荐结果(只保存 title 和 confidence // 保存新的推荐结果(只保存 title 和 confidence
const entities = recommendations.map(rec => { const entities = recommendations.map((rec) => {
const entity = new AiRecommendationEntity(); const entity = new AiRecommendationEntity();
entity.title = rec.title; entity.title = rec.title;
entity.confidence = rec.confidence; entity.confidence = rec.confidence;
@@ -125,14 +132,14 @@ ${JSON.stringify(bids.map(b => b.title), null, 2)}`;
try { try {
const entities = await this.aiRecommendationRepository.find({ const entities = await this.aiRecommendationRepository.find({
order: { confidence: 'DESC' } order: { confidence: 'DESC' },
}); });
// 从 bid-items 表获取 url、source 和 publishDate // 从 bid-items 表获取 url、source 和 publishDate
const result: AIRecommendation[] = []; const result: AIRecommendation[] = [];
for (const entity of entities) { for (const entity of entities) {
const bidItem = await this.bidItemRepository.findOne({ const bidItem = await this.bidItemRepository.findOne({
where: { title: entity.title } where: { title: entity.title },
}); });
result.push({ result.push({
@@ -140,7 +147,7 @@ ${JSON.stringify(bids.map(b => b.title), null, 2)}`;
url: bidItem?.url || '', url: bidItem?.url || '',
source: bidItem?.source || '', source: bidItem?.source || '',
confidence: entity.confidence, confidence: entity.confidence,
publishDate: bidItem?.publishDate publishDate: bidItem?.publishDate,
}); });
} }
@@ -148,7 +155,9 @@ ${JSON.stringify(bids.map(b => b.title), null, 2)}`;
result.sort((a, b) => { result.sort((a, b) => {
if (!a.publishDate) return 1; if (!a.publishDate) return 1;
if (!b.publishDate) return -1; if (!b.publishDate) return -1;
return new Date(b.publishDate).getTime() - new Date(a.publishDate).getTime(); return (
new Date(b.publishDate).getTime() - new Date(a.publishDate).getTime()
);
}); });
return result; return result;

View File

@@ -1,4 +1,9 @@
import { Entity, PrimaryGeneratedColumn, Column, CreateDateColumn } from 'typeorm'; import {
Entity,
PrimaryGeneratedColumn,
Column,
CreateDateColumn,
} from 'typeorm';
@Entity('ai_recommendations') @Entity('ai_recommendations')
export class AiRecommendation { export class AiRecommendation {

View File

@@ -28,4 +28,4 @@ import { AiModule } from './ai/ai.module';
AiModule, AiModule,
], ],
}) })
export class AppModule {} export class AppModule {}

View File

@@ -1,12 +1,19 @@
import { Controller, Get, Query, Patch, Param, Body } from '@nestjs/common'; import { Controller, Get, Query, Patch, Param, Body } from '@nestjs/common';
import { BidsService } from '../services/bid.service'; import { BidsService } from '../services/bid.service';
/**
 * Query-string options accepted by the `GET api/bids` handler (`findAll`),
 * which forwards the object unchanged to `BidsService.findAll`.
 */
interface FindAllQuery {
// NOTE(review): values bound through @Query() usually arrive as strings unless
// a transforming pipe is configured — confirm before relying on `number` here.
page?: number;
limit?: number;
// Presumably filters on the bid's source and a search keyword; the filtering
// itself lives in the service — verify there.
source?: string;
keyword?: string;
}
@Controller('api/bids') @Controller('api/bids')
export class BidsController { export class BidsController {
constructor(private readonly bidsService: BidsService) {} constructor(private readonly bidsService: BidsService) {}
@Get() @Get()
findAll(@Query() query: any) { findAll(@Query() query: FindAllQuery) {
return this.bidsService.findAll(query); return this.bidsService.findAll(query);
} }
@@ -26,9 +33,17 @@ export class BidsController {
} }
@Get('by-date-range') @Get('by-date-range')
getByDateRange(@Query('startDate') startDate: string, @Query('endDate') endDate?: string, @Query('keywords') keywords?: string) { getByDateRange(
@Query('startDate') startDate: string,
@Query('endDate') endDate?: string,
@Query('keywords') keywords?: string,
) {
const keywordsArray = keywords ? keywords.split(',') : undefined; const keywordsArray = keywords ? keywords.split(',') : undefined;
return this.bidsService.getBidsByDateRange(startDate, endDate, keywordsArray); return this.bidsService.getBidsByDateRange(
startDate,
endDate,
keywordsArray,
);
} }
@Get('crawl-info-stats') @Get('crawl-info-stats')

View File

@@ -1,4 +1,10 @@
import { Entity, PrimaryGeneratedColumn, Column, CreateDateColumn, UpdateDateColumn } from 'typeorm'; import {
Entity,
PrimaryGeneratedColumn,
Column,
CreateDateColumn,
UpdateDateColumn,
} from 'typeorm';
@Entity('bid_items') @Entity('bid_items')
export class BidItem { export class BidItem {

View File

@@ -1,9 +1,36 @@
import { Injectable } from '@nestjs/common'; import { Injectable } from '@nestjs/common';
import { InjectRepository } from '@nestjs/typeorm'; import { InjectRepository } from '@nestjs/typeorm';
import { Repository, LessThan, MoreThanOrEqual } from 'typeorm'; import { Repository, LessThan } from 'typeorm';
import { BidItem } from '../entities/bid-item.entity'; import { BidItem } from '../entities/bid-item.entity';
import { CrawlInfoAdd } from '../../crawler/entities/crawl-info-add.entity'; import { CrawlInfoAdd } from '../../crawler/entities/crawl-info-add.entity';
/**
 * Options accepted by `BidsService.findAll`. Every field is optional; the
 * method also accepts no query object at all.
 */
interface FindAllQuery {
// Page number used to compute the skip offset as (page - 1) * limit; defaults to 1.
page?: number;
// Page size passed to take(); defaults to 10.
// findAll coerces both page and limit with Number() before use.
limit?: number;
// Presumably restricts results to one source / a title keyword — the code
// applying these two filters is not visible here; TODO confirm.
source?: string;
keyword?: string;
}
/**
 * Raw row shape returned by the `DISTINCT bid.source` query in `getSources`,
 * where the selected column is aliased as `source`.
 */
interface SourceResult {
source: string;
}
/**
 * One entry of the array returned by `getCrawlInfoAddStats`, built from a raw
 * query row after normalisation.
 */
interface CrawlInfoAddStats {
// Coerced with String() from the raw row.
source: string;
// Coerced with Number() from the raw row.
count: number;
// Copied through from the raw row unchanged.
latestUpdate: Date | string;
latestPublishDate: Date | string | null;
// null when the raw error is missing or blank after trimming; otherwise the error text.
error: string | null;
}
/**
 * Row type supplied as the generic argument to `crawlInfoRepository.query`
 * in `getCrawlInfoAddStats`.
 */
// NOTE(review): these types are declared, not validated at runtime — the
// consumer still coerces source/count/error, so the driver may return other
// primitives; confirm against the database driver.
interface CrawlInfoAddRawResult {
source: string;
count: number;
latestPublishDate: Date | string | null;
error: string | null;
latestUpdate: Date | string;
}
@Injectable() @Injectable()
export class BidsService { export class BidsService {
constructor( constructor(
@@ -13,7 +40,7 @@ export class BidsService {
private crawlInfoRepository: Repository<CrawlInfoAdd>, private crawlInfoRepository: Repository<CrawlInfoAdd>,
) {} ) {}
async findAll(query?: any) { async findAll(query?: FindAllQuery) {
const { page = 1, limit = 10, source, keyword } = query || {}; const { page = 1, limit = 10, source, keyword } = query || {};
const qb = this.bidRepository.createQueryBuilder('bid'); const qb = this.bidRepository.createQueryBuilder('bid');
@@ -26,8 +53,8 @@ export class BidsService {
} }
qb.orderBy('bid.publishDate', 'DESC') qb.orderBy('bid.publishDate', 'DESC')
.skip((page - 1) * limit) .skip((Number(page) - 1) * Number(limit))
.take(limit); .take(Number(limit));
const [items, total] = await qb.getManyAndCount(); const [items, total] = await qb.getManyAndCount();
return { items, total }; return { items, total };
@@ -35,7 +62,9 @@ export class BidsService {
async createOrUpdate(data: Partial<BidItem>) { async createOrUpdate(data: Partial<BidItem>) {
// Use title or a hash of title to check for duplicates // Use title or a hash of title to check for duplicates
let item = await this.bidRepository.findOne({ where: { title: data.title } }); const item = await this.bidRepository.findOne({
where: { title: data.title },
});
if (item) { if (item) {
Object.assign(item, data); Object.assign(item, data);
return this.bidRepository.save(item); return this.bidRepository.save(item);
@@ -51,21 +80,21 @@ export class BidsService {
}); });
} }
async getSources() { async getSources(): Promise<string[]> {
const result = await this.bidRepository const result = await this.bidRepository
.createQueryBuilder('bid') .createQueryBuilder('bid')
.select('DISTINCT bid.source') .select('DISTINCT bid.source', 'source')
.where('bid.source IS NOT NULL') .where('bid.source IS NOT NULL')
.orderBy('bid.source', 'ASC') .orderBy('bid.source', 'ASC')
.getRawMany(); .getRawMany<SourceResult>();
return result.map((item: any) => item.source); return result.map((item) => item.source);
} }
async getRecentBids() { async getRecentBids() {
const thirtyDaysAgo = new Date(); const thirtyDaysAgo = new Date();
thirtyDaysAgo.setDate(thirtyDaysAgo.getDate() - 30); thirtyDaysAgo.setDate(thirtyDaysAgo.getDate() - 30);
thirtyDaysAgo.setHours(0, 0, 0, 0); thirtyDaysAgo.setHours(0, 0, 0, 0);
return this.bidRepository return this.bidRepository
.createQueryBuilder('bid') .createQueryBuilder('bid')
.where('bid.publishDate >= :thirtyDaysAgo', { thirtyDaysAgo }) .where('bid.publishDate >= :thirtyDaysAgo', { thirtyDaysAgo })
@@ -81,7 +110,11 @@ export class BidsService {
.getMany(); .getMany();
} }
async getBidsByDateRange(startDate?: string, endDate?: string, keywords?: string[]) { async getBidsByDateRange(
startDate?: string,
endDate?: string,
keywords?: string[],
) {
const qb = this.bidRepository.createQueryBuilder('bid'); const qb = this.bidRepository.createQueryBuilder('bid');
if (startDate) { if (startDate) {
@@ -97,13 +130,18 @@ export class BidsService {
} }
if (keywords && keywords.length > 0) { if (keywords && keywords.length > 0) {
const keywordConditions = keywords.map((keyword, index) => { const keywordConditions = keywords
return `bid.title LIKE :keyword${index}`; .map((keyword, index) => {
}).join(' OR '); return `bid.title LIKE :keyword${index}`;
qb.andWhere(`(${keywordConditions})`, keywords.reduce((params, keyword, index) => { })
params[`keyword${index}`] = `%${keyword}%`; .join(' OR ');
return params; qb.andWhere(
}, {})); `(${keywordConditions})`,
keywords.reduce((params, keyword, index) => {
params[`keyword${index}`] = `%${keyword}%`;
return params;
}, {}),
);
} }
return qb.orderBy('bid.publishDate', 'DESC').getMany(); return qb.orderBy('bid.publishDate', 'DESC').getMany();
@@ -118,7 +156,7 @@ export class BidsService {
return this.bidRepository.save(item); return this.bidRepository.save(item);
} }
async getCrawlInfoAddStats() { async getCrawlInfoAddStats(): Promise<CrawlInfoAddStats[]> {
// 获取每个来源的最新一次爬虫记录(按 createdAt 降序) // 获取每个来源的最新一次爬虫记录(按 createdAt 降序)
const query = ` const query = `
SELECT SELECT
@@ -136,15 +174,19 @@ export class BidsService {
ORDER BY source ASC ORDER BY source ASC
`; `;
const results = await this.crawlInfoRepository.query(query); const results =
await this.crawlInfoRepository.query<CrawlInfoAddRawResult[]>(query);
return results.map((item: any) => ({ return results.map((item) => ({
source: item.source, source: String(item.source),
count: item.count, count: Number(item.count),
latestUpdate: item.latestUpdate, latestUpdate: item.latestUpdate,
latestPublishDate: item.latestPublishDate, latestPublishDate: item.latestPublishDate,
// 确保 error 字段正确处理null 或空字符串都转换为 null非空字符串保留 // 确保 error 字段正确处理null 或空字符串都转换为 null非空字符串保留
error: item.error && item.error.trim() !== '' ? item.error : null, error:
item.error && String(item.error).trim() !== ''
? String(item.error)
: null,
})); }));
} }
} }

View File

@@ -1,6 +1,21 @@
import { Injectable, LoggerService, Scope } from '@nestjs/common'; import { Injectable, LoggerService, Scope } from '@nestjs/common';
import { winstonLogger } from './winston.config'; import { winstonLogger } from './winston.config';
/** Payloads accepted by the logger methods: plain text, an Error, or a structured object. */
type LogMessage = string | Error | Record<string, unknown>;

/**
 * Converts a log payload into the plain string that is handed to winston.
 *
 * - strings are returned unchanged;
 * - `Error` instances contribute their `message`;
 * - other objects are serialised with `JSON.stringify`.
 *
 * Serialisation must never make a logging call throw, so objects that
 * `JSON.stringify` cannot handle (circular references, BigInt values) or for
 * which it yields `undefined` (a `toJSON()` returning `undefined`) fall back
 * to their `Object.prototype.toString` tag, e.g. `[object Object]`.
 *
 * @param message - the value passed to a logger method
 * @returns a string representation; never throws
 */
function formatMessage(message: LogMessage): string {
  if (typeof message === 'string') {
    return message;
  }
  if (message instanceof Error) {
    return message.message;
  }
  if (typeof message === 'object' && message !== null) {
    // Object.prototype.toString never throws, even for null-prototype objects.
    const fallback = Object.prototype.toString.call(message) as string;
    try {
      const serialized: unknown = JSON.stringify(message);
      return typeof serialized === 'string' ? serialized : fallback;
    } catch {
      // Circular structure or BigInt value: keep logging instead of crashing.
      return fallback;
    }
  }
  // Reached only when a caller bypasses the type (null, undefined, number, ...).
  return String(message);
}
@Injectable({ scope: Scope.TRANSIENT }) @Injectable({ scope: Scope.TRANSIENT })
export class CustomLogger implements LoggerService { export class CustomLogger implements LoggerService {
private context?: string; private context?: string;
@@ -9,23 +24,34 @@ export class CustomLogger implements LoggerService {
this.context = context; this.context = context;
} }
log(message: any, context?: string) { log(message: LogMessage, context?: string) {
winstonLogger.info(message, { context: context || this.context }); winstonLogger.info(formatMessage(message), {
context: context || this.context,
});
} }
error(message: any, trace?: string, context?: string) { error(message: LogMessage, trace?: string, context?: string) {
winstonLogger.error(message, { context: context || this.context, trace }); winstonLogger.error(formatMessage(message), {
context: context || this.context,
trace,
});
} }
warn(message: any, context?: string) { warn(message: LogMessage, context?: string) {
winstonLogger.warn(message, { context: context || this.context }); winstonLogger.warn(formatMessage(message), {
context: context || this.context,
});
} }
debug(message: any, context?: string) { debug(message: LogMessage, context?: string) {
winstonLogger.debug(message, { context: context || this.context }); winstonLogger.debug(formatMessage(message), {
context: context || this.context,
});
} }
verbose(message: any, context?: string) { verbose(message: LogMessage, context?: string) {
winstonLogger.verbose(message, { context: context || this.context }); winstonLogger.verbose(formatMessage(message), {
context: context || this.context,
});
} }
} }

View File

@@ -16,13 +16,33 @@ const logFormat = winston.format.combine(
winston.format.errors({ stack: true }), winston.format.errors({ stack: true }),
winston.format.splat(), winston.format.splat(),
winston.format.printf(({ timestamp, level, message, context, stack }) => { winston.format.printf(({ timestamp, level, message, context, stack }) => {
let log = `${timestamp} [${level}]`; const timestampStr =
if (context) { typeof timestamp === 'string' ? timestamp : String(timestamp);
log += ` [${context}]`; const levelStr = typeof level === 'string' ? level : String(level);
} const messageStr = typeof message === 'string' ? message : String(message);
log += ` ${message}`; const contextStr = context
? typeof context === 'string'
? context
: JSON.stringify(context)
: '';
let stackStr = '';
if (stack) { if (stack) {
log += `\n${stack}`; if (typeof stack === 'string') {
stackStr = stack;
} else if (typeof stack === 'object' && stack !== null) {
stackStr = JSON.stringify(stack);
} else {
stackStr = String(stack);
}
}
let log = `${timestampStr} [${levelStr}]`;
if (contextStr) {
log += ` [${contextStr}]`;
}
log += ` ${messageStr}`;
if (stackStr) {
log += `\n${stackStr}`;
} }
return log; return log;
}), }),
@@ -30,10 +50,7 @@ const logFormat = winston.format.combine(
// 控制台传输 // 控制台传输
const consoleTransport = new winston.transports.Console({ const consoleTransport = new winston.transports.Console({
format: winston.format.combine( format: winston.format.combine(winston.format.colorize(), logFormat),
winston.format.colorize(),
logFormat,
),
}); });
// 应用日志传输(按天轮转) // 应用日志传输(按天轮转)
@@ -61,10 +78,6 @@ const errorLogTransport = new DailyRotateFile({
export const winstonLogger = winston.createLogger({ export const winstonLogger = winston.createLogger({
level: process.env.LOG_LEVEL || 'info', level: process.env.LOG_LEVEL || 'info',
format: logFormat, format: logFormat,
transports: [ transports: [consoleTransport, appLogTransport, errorLogTransport],
consoleTransport,
appLogTransport,
errorLogTransport,
],
exitOnError: false, exitOnError: false,
}); });

View File

@@ -12,7 +12,7 @@ export class CrawlerController {
getStatus() { getStatus() {
return { return {
isCrawling: this.isCrawling, isCrawling: this.isCrawling,
crawlingSources: Array.from(this.crawlingSources) crawlingSources: Array.from(this.crawlingSources),
}; };
} }
@@ -21,9 +21,9 @@ export class CrawlerController {
if (this.isCrawling) { if (this.isCrawling) {
return { message: 'Crawl is already running' }; return { message: 'Crawl is already running' };
} }
this.isCrawling = true; this.isCrawling = true;
// We don't await this because we want it to run in the background // We don't await this because we want it to run in the background
// and return immediately, or we can await if we want to user to wait. // and return immediately, or we can await if we want to user to wait.
// Given the requirement "Immediate Crawl", usually implies triggering it. // Given the requirement "Immediate Crawl", usually implies triggering it.
@@ -45,9 +45,9 @@ export class CrawlerController {
if (this.crawlingSources.has(sourceName)) { if (this.crawlingSources.has(sourceName)) {
return { message: `Source ${sourceName} is already being crawled` }; return { message: `Source ${sourceName} is already being crawled` };
} }
this.crawlingSources.add(sourceName); this.crawlingSources.add(sourceName);
try { try {
const result = await this.crawlerService.crawlSingleSource(sourceName); const result = await this.crawlerService.crawlSingleSource(sourceName);
return result; return result;

View File

@@ -1,4 +1,9 @@
import { Entity, PrimaryGeneratedColumn, Column, CreateDateColumn } from 'typeorm'; import {
Entity,
PrimaryGeneratedColumn,
Column,
CreateDateColumn,
} from 'typeorm';
@Entity('crawl_info_add') @Entity('crawl_info_add')
export class CrawlInfoAdd { export class CrawlInfoAdd {

View File

@@ -18,6 +18,17 @@ import { PowerbeijingCrawler } from './powerbeijing_target';
import { SdiccCrawler } from './sdicc_target'; import { SdiccCrawler } from './sdicc_target';
import { CnoocCrawler } from './cnooc_target'; import { CnoocCrawler } from './cnooc_target';
/**
 * A single item produced by a crawler's `crawl` call.
 */
interface CrawlResult {
title: string;
// The crawl loops wrap this in `new Date(...)` when computing the latest publish date.
publishDate: Date;
url: string;
}
/**
 * Minimal contract for a crawler target; used as the element type of the
 * zero-data retry list in `crawlAll`.
 */
interface Crawler {
// Used in log messages, as the key of the per-source result map, and as the
// source name passed to saveCrawlInfo.
name: string;
// Runs the crawl using the shared Puppeteer browser and resolves to the extracted items.
crawl(browser: puppeteer.Browser): Promise<CrawlResult[]>;
}
@Injectable() @Injectable()
export class BidCrawlerService { export class BidCrawlerService {
private readonly logger = new Logger(BidCrawlerService.name); private readonly logger = new Logger(BidCrawlerService.name);
@@ -31,17 +42,15 @@ export class BidCrawlerService {
async crawlAll() { async crawlAll() {
this.logger.log('Starting crawl task with Puppeteer...'); this.logger.log('Starting crawl task with Puppeteer...');
// 设置最大执行时间为3小时 // 设置最大执行时间为3小时
const maxExecutionTime = 3 * 60 * 60 * 1000; // 3小时毫秒 const maxExecutionTime = 3 * 60 * 60 * 1000; // 3小时毫秒
const startTime = Date.now(); const startTime = Date.now();
// 统计结果 // 统计结果
const crawlResults: Record<string, { success: number; error?: string }> = {}; const crawlResults: Record<string, { success: number; error?: string }> =
{};
// 记录数据为0的爬虫用于重试 // 记录数据为0的爬虫用于重试
const zeroDataCrawlers: any[] = []; const zeroDataCrawlers: Crawler[] = [];
// 从环境变量读取代理配置 // 从环境变量读取代理配置
const proxyHost = this.configService.get<string>('PROXY_HOST'); const proxyHost = this.configService.get<string>('PROXY_HOST');
const proxyPort = this.configService.get<string>('PROXY_PORT'); const proxyPort = this.configService.get<string>('PROXY_PORT');
@@ -60,9 +69,10 @@ export class BidCrawlerService {
]; ];
if (proxyHost && proxyPort) { if (proxyHost && proxyPort) {
const proxyUrl = proxyUsername && proxyPassword const proxyUrl =
? `http://${proxyUsername}:${proxyPassword}@${proxyHost}:${proxyPort}` proxyUsername && proxyPassword
: `http://${proxyHost}:${proxyPort}`; ? `http://${proxyUsername}:${proxyPassword}@${proxyHost}:${proxyPort}`
: `http://${proxyHost}:${proxyPort}`;
args.push(`--proxy-server=${proxyUrl}`); args.push(`--proxy-server=${proxyUrl}`);
this.logger.log(`Using proxy: ${proxyHost}:${proxyPort}`); this.logger.log(`Using proxy: ${proxyHost}:${proxyPort}`);
} }
@@ -72,24 +82,43 @@ export class BidCrawlerService {
args, args,
}); });
const crawlers = [ChdtpCrawler, ChngCrawler, SzecpCrawler, CdtCrawler, EpsCrawler, CnncecpCrawler, CgnpcCrawler, CeicCrawler, EspicCrawler, PowerbeijingCrawler, SdiccCrawler, CnoocCrawler]; const crawlers = [
ChdtpCrawler,
ChngCrawler,
SzecpCrawler,
CdtCrawler,
EpsCrawler,
CnncecpCrawler,
CgnpcCrawler,
CeicCrawler,
EspicCrawler,
PowerbeijingCrawler,
SdiccCrawler,
CnoocCrawler,
];
try { try {
for (const crawler of crawlers) { for (const crawler of crawlers) {
this.logger.log(`Crawling: ${crawler.name}`); this.logger.log(`Crawling: ${crawler.name}`);
// 检查是否超时 // 检查是否超时
const elapsedTime = Date.now() - startTime; const elapsedTime = Date.now() - startTime;
if (elapsedTime > maxExecutionTime) { if (elapsedTime > maxExecutionTime) {
this.logger.warn(`⚠️ Crawl task exceeded maximum execution time of 3 hours. Stopping...`); this.logger.warn(
this.logger.warn(`⚠️ Total elapsed time: ${Math.floor(elapsedTime / 1000 / 60)} minutes`); `⚠️ Crawl task exceeded maximum execution time of 3 hours. Stopping...`,
);
this.logger.warn(
`⚠️ Total elapsed time: ${Math.floor(elapsedTime / 1000 / 60)} minutes`,
);
break; break;
} }
try { try {
const results = await crawler.crawl(browser); const results = await crawler.crawl(browser);
this.logger.log(`Extracted ${results.length} items from ${crawler.name}`); this.logger.log(
`Extracted ${results.length} items from ${crawler.name}`,
);
// 记录成功数量 // 记录成功数量
crawlResults[crawler.name] = { success: results.length }; crawlResults[crawler.name] = { success: results.length };
@@ -99,12 +128,13 @@ export class BidCrawlerService {
} }
// 获取最新的发布日期 // 获取最新的发布日期
const latestPublishDate = results.length > 0 const latestPublishDate =
? results.reduce((latest, item) => { results.length > 0
const itemDate = new Date(item.publishDate); ? results.reduce((latest, item) => {
return itemDate > latest ? itemDate : latest; const itemDate = new Date(item.publishDate);
}, new Date(0)) return itemDate > latest ? itemDate : latest;
: null; }, new Date(0))
: null;
for (const item of results) { for (const item of results) {
await this.bidsService.createOrUpdate({ await this.bidsService.createOrUpdate({
@@ -116,46 +146,60 @@ export class BidCrawlerService {
} }
// 保存爬虫统计信息到数据库 // 保存爬虫统计信息到数据库
await this.saveCrawlInfo(crawler.name, results.length, latestPublishDate); await this.saveCrawlInfo(
crawler.name,
results.length,
latestPublishDate,
);
} catch (err) { } catch (err) {
this.logger.error(`Error crawling ${crawler.name}: ${err.message}`); const errorMessage = err instanceof Error ? err.message : String(err);
this.logger.error(`Error crawling ${crawler.name}: ${errorMessage}`);
// 记录错误信息 // 记录错误信息
crawlResults[crawler.name] = { success: 0, error: err.message }; crawlResults[crawler.name] = { success: 0, error: errorMessage };
// 保存错误信息到数据库 // 保存错误信息到数据库
await this.saveCrawlInfo(crawler.name, 0, null, err.message); await this.saveCrawlInfo(crawler.name, 0, null, errorMessage);
} }
} }
// 对数据为0的爬虫进行重试 // 对数据为0的爬虫进行重试
if (zeroDataCrawlers.length > 0) { if (zeroDataCrawlers.length > 0) {
this.logger.log(`Retrying ${zeroDataCrawlers.length} crawlers with zero data...`); this.logger.log(
`Retrying ${zeroDataCrawlers.length} crawlers with zero data...`,
);
for (const crawler of zeroDataCrawlers) { for (const crawler of zeroDataCrawlers) {
this.logger.log(`Retrying: ${crawler.name}`); this.logger.log(`Retrying: ${crawler.name}`);
// 检查是否超时 // 检查是否超时
const elapsedTime = Date.now() - startTime; const elapsedTime = Date.now() - startTime;
if (elapsedTime > maxExecutionTime) { if (elapsedTime > maxExecutionTime) {
this.logger.warn(`⚠️ Crawl task exceeded maximum execution time of 3 hours. Stopping retry...`); this.logger.warn(
this.logger.warn(`⚠️ Total elapsed time: ${Math.floor(elapsedTime / 1000 / 60)} minutes`); `⚠️ Crawl task exceeded maximum execution time of 3 hours. Stopping retry...`,
);
this.logger.warn(
`⚠️ Total elapsed time: ${Math.floor(elapsedTime / 1000 / 60)} minutes`,
);
break; break;
} }
try { try {
const results = await crawler.crawl(browser); const results = await crawler.crawl(browser);
this.logger.log(`Retry extracted ${results.length} items from ${crawler.name}`); this.logger.log(
`Retry extracted ${results.length} items from ${crawler.name}`,
);
// 更新统计结果 // 更新统计结果
crawlResults[crawler.name] = { success: results.length }; crawlResults[crawler.name] = { success: results.length };
// 获取最新的发布日期 // 获取最新的发布日期
const latestPublishDate = results.length > 0 const latestPublishDate =
? results.reduce((latest, item) => { results.length > 0
const itemDate = new Date(item.publishDate); ? results.reduce((latest, item) => {
return itemDate > latest ? itemDate : latest; const itemDate = new Date(item.publishDate);
}, new Date(0)) return itemDate > latest ? itemDate : latest;
: null; }, new Date(0))
: null;
for (const item of results) { for (const item of results) {
await this.bidsService.createOrUpdate({ await this.bidsService.createOrUpdate({
@@ -167,58 +211,76 @@ export class BidCrawlerService {
} }
// 更新爬虫统计信息到数据库 // 更新爬虫统计信息到数据库
await this.saveCrawlInfo(crawler.name, results.length, latestPublishDate); await this.saveCrawlInfo(
crawler.name,
results.length,
latestPublishDate,
);
} catch (err) { } catch (err) {
this.logger.error(`Error retrying ${crawler.name}: ${err.message}`); const errorMessage =
err instanceof Error ? err.message : String(err);
this.logger.error(
`Error retrying ${crawler.name}: ${errorMessage}`,
);
// 记录错误信息 // 记录错误信息
crawlResults[crawler.name] = { success: 0, error: err.message }; crawlResults[crawler.name] = { success: 0, error: errorMessage };
// 更新错误信息到数据库 // 更新错误信息到数据库
await this.saveCrawlInfo(crawler.name, 0, null, err.message); await this.saveCrawlInfo(crawler.name, 0, null, errorMessage);
} }
} }
} }
} catch (error) { } catch (error) {
this.logger.error(`Crawl task failed: ${error.message}`); const errorMessage =
error instanceof Error ? error.message : String(error);
this.logger.error(`Crawl task failed: ${errorMessage}`);
} finally { } finally {
await browser.close(); await browser.close();
const totalTime = Date.now() - startTime; const totalTime = Date.now() - startTime;
const minutes = Math.floor(totalTime / 1000 / 60); const minutes = Math.floor(totalTime / 1000 / 60);
this.logger.log(`Crawl task finished. Total time: ${minutes} minutes`); this.logger.log(`Crawl task finished. Total time: ${minutes} minutes`);
if (totalTime > maxExecutionTime) { if (totalTime > maxExecutionTime) {
this.logger.warn(`⚠️ Crawl task exceeded maximum execution time of 3 hours.`); this.logger.warn(
`⚠️ Crawl task exceeded maximum execution time of 3 hours.`,
);
} }
// 输出统计总结 // 输出统计总结
this.logger.log('='.repeat(50)); this.logger.log('='.repeat(50));
this.logger.log('爬虫执行总结 / Crawl Summary'); this.logger.log('爬虫执行总结 / Crawl Summary');
this.logger.log('='.repeat(50)); this.logger.log('='.repeat(50));
let totalSuccess = 0; let totalSuccess = 0;
let errorCount = 0; let errorCount = 0;
for (const [source, result] of Object.entries(crawlResults)) { for (const [source, result] of Object.entries(crawlResults)) {
if (result.error) { if (result.error) {
this.logger.error(`${source}: 出错 - ${result.error}`); this.logger.error(`${source}: 出错 - ${result.error}`);
errorCount++; errorCount++;
} else { } else {
this.logger.log(`${source}: 成功获取 ${result.success} 条工程信息`); this.logger.log(
`${source}: 成功获取 ${result.success} 条工程信息`,
);
totalSuccess += result.success; totalSuccess += result.success;
} }
} }
this.logger.log('='.repeat(50)); this.logger.log('='.repeat(50));
this.logger.log(`总计: ${totalSuccess} 条工程信息, ${errorCount} 个来源出错`); this.logger.log(
this.logger.log(`Total: ${totalSuccess} items, ${errorCount} sources failed`); `总计: ${totalSuccess} 条工程信息, ${errorCount} 个来源出错`,
);
this.logger.log(
`Total: ${totalSuccess} items, ${errorCount} sources failed`,
);
this.logger.log('='.repeat(50)); this.logger.log('='.repeat(50));
} }
} }
async crawlSingleSource(sourceName: string) { async crawlSingleSource(sourceName: string) {
this.logger.log(`Starting single source crawl for: ${sourceName}`); this.logger.log(`Starting single source crawl for: ${sourceName}`);
// 从环境变量读取代理配置 // 从环境变量读取代理配置
const proxyHost = this.configService.get<string>('PROXY_HOST'); const proxyHost = this.configService.get<string>('PROXY_HOST');
const proxyPort = this.configService.get<string>('PROXY_PORT'); const proxyPort = this.configService.get<string>('PROXY_PORT');
@@ -237,9 +299,10 @@ export class BidCrawlerService {
]; ];
if (proxyHost && proxyPort) { if (proxyHost && proxyPort) {
const proxyUrl = proxyUsername && proxyPassword const proxyUrl =
? `http://${proxyUsername}:${proxyPassword}@${proxyHost}:${proxyPort}` proxyUsername && proxyPassword
: `http://${proxyHost}:${proxyPort}`; ? `http://${proxyUsername}:${proxyPassword}@${proxyHost}:${proxyPort}`
: `http://${proxyHost}:${proxyPort}`;
args.push(`--proxy-server=${proxyUrl}`); args.push(`--proxy-server=${proxyUrl}`);
this.logger.log(`Using proxy: ${proxyHost}:${proxyPort}`); this.logger.log(`Using proxy: ${proxyHost}:${proxyPort}`);
} }
@@ -249,10 +312,23 @@ export class BidCrawlerService {
args, args,
}); });
const crawlers = [ChdtpCrawler, ChngCrawler, SzecpCrawler, CdtCrawler, EpsCrawler, CnncecpCrawler, CgnpcCrawler, CeicCrawler, EspicCrawler, PowerbeijingCrawler, SdiccCrawler, CnoocCrawler]; const crawlers = [
ChdtpCrawler,
const targetCrawler = crawlers.find(c => c.name === sourceName); ChngCrawler,
SzecpCrawler,
CdtCrawler,
EpsCrawler,
CnncecpCrawler,
CgnpcCrawler,
CeicCrawler,
EspicCrawler,
PowerbeijingCrawler,
SdiccCrawler,
CnoocCrawler,
];
const targetCrawler = crawlers.find((c) => c.name === sourceName);
if (!targetCrawler) { if (!targetCrawler) {
await browser.close(); await browser.close();
throw new Error(`Crawler not found for source: ${sourceName}`); throw new Error(`Crawler not found for source: ${sourceName}`);
@@ -260,17 +336,20 @@ export class BidCrawlerService {
try { try {
this.logger.log(`Crawling: ${targetCrawler.name}`); this.logger.log(`Crawling: ${targetCrawler.name}`);
const results = await targetCrawler.crawl(browser); const results = await targetCrawler.crawl(browser);
this.logger.log(`Extracted ${results.length} items from ${targetCrawler.name}`); this.logger.log(
`Extracted ${results.length} items from ${targetCrawler.name}`,
);
// 获取最新的发布日期 // 获取最新的发布日期
const latestPublishDate = results.length > 0 const latestPublishDate =
? results.reduce((latest, item) => { results.length > 0
const itemDate = new Date(item.publishDate); ? results.reduce((latest, item) => {
return itemDate > latest ? itemDate : latest; const itemDate = new Date(item.publishDate);
}, new Date(0)) return itemDate > latest ? itemDate : latest;
: null; }, new Date(0))
: null;
for (const item of results) { for (const item of results) {
await this.bidsService.createOrUpdate({ await this.bidsService.createOrUpdate({
@@ -282,7 +361,11 @@ export class BidCrawlerService {
} }
// 保存爬虫统计信息到数据库 // 保存爬虫统计信息到数据库
await this.saveCrawlInfo(targetCrawler.name, results.length, latestPublishDate); await this.saveCrawlInfo(
targetCrawler.name,
results.length,
latestPublishDate,
);
return { return {
success: true, success: true,
@@ -291,16 +374,19 @@ export class BidCrawlerService {
latestPublishDate, latestPublishDate,
}; };
} catch (err) { } catch (err) {
this.logger.error(`Error crawling ${targetCrawler.name}: ${err.message}`); const errorMessage = err instanceof Error ? err.message : String(err);
this.logger.error(
`Error crawling ${targetCrawler.name}: ${errorMessage}`,
);
// 保存错误信息到数据库 // 保存错误信息到数据库
await this.saveCrawlInfo(targetCrawler.name, 0, null, err.message); await this.saveCrawlInfo(targetCrawler.name, 0, null, errorMessage);
return { return {
success: false, success: false,
source: targetCrawler.name, source: targetCrawler.name,
count: 0, count: 0,
error: err.message, error: errorMessage,
}; };
} finally { } finally {
await browser.close(); await browser.close();
@@ -324,7 +410,10 @@ export class BidCrawlerService {
await this.crawlInfoRepository.save(crawlInfo); await this.crawlInfoRepository.save(crawlInfo);
this.logger.log(`Saved crawl info for ${source}: ${count} items`); this.logger.log(`Saved crawl info for ${source}: ${count} items`);
} catch (err) { } catch (err) {
this.logger.error(`Failed to save crawl info for ${source}: ${err.message}`); const errorMessage = err instanceof Error ? err.message : String(err);
this.logger.error(
`Failed to save crawl info for ${source}: ${errorMessage}`,
);
} }
} }
} }

View File

@@ -2,7 +2,7 @@ import { CdtCrawler } from './cdt_target';
import * as puppeteer from 'puppeteer'; import * as puppeteer from 'puppeteer';
// Increase timeout to 60 seconds for network operations // Increase timeout to 60 seconds for network operations
jest.setTimeout(60000*5); jest.setTimeout(60000 * 5);
// 获取代理配置 // 获取代理配置
const getProxyArgs = (): string[] => { const getProxyArgs = (): string[] => {
@@ -29,7 +29,7 @@ describe('CdtCrawler Real Site Test', () => {
if (proxyArgs.length > 0) { if (proxyArgs.length > 0) {
console.log('Using proxy:', proxyArgs.join(' ')); console.log('Using proxy:', proxyArgs.join(' '));
} }
browser = await puppeteer.launch({ browser = await puppeteer.launch({
headless: false, // Change to false to see browser UI headless: false, // Change to false to see browser UI
args: ['--no-sandbox', '--disable-setuid-sandbox', ...proxyArgs], args: ['--no-sandbox', '--disable-setuid-sandbox', ...proxyArgs],
@@ -45,13 +45,15 @@ describe('CdtCrawler Real Site Test', () => {
it('should visit website and list all found bid information', async () => { it('should visit website and list all found bid information', async () => {
console.log(`\nStarting crawl for: ${CdtCrawler.name}`); console.log(`\nStarting crawl for: ${CdtCrawler.name}`);
console.log(`Target URL: ${CdtCrawler.url}`); console.log(`Target URL: ${CdtCrawler.url}`);
const results = await CdtCrawler.crawl(browser); const results = await CdtCrawler.crawl(browser);
console.log(`\nSuccessfully found ${results.length} items:\n`); console.log(`\nSuccessfully found ${results.length} items:\n`);
console.log('----------------------------------------'); console.log('----------------------------------------');
results.forEach((item, index) => { results.forEach((item, index) => {
console.log(`${index + 1}. [${item.publishDate.toLocaleDateString()}] ${item.title}`); console.log(
`${index + 1}. [${item.publishDate.toLocaleDateString()}] ${item.title}`,
);
console.log(` Link: ${item.url}`); console.log(` Link: ${item.url}`);
console.log('----------------------------------------'); console.log('----------------------------------------');
}); });
@@ -61,13 +63,15 @@ describe('CdtCrawler Real Site Test', () => {
expect(Array.isArray(results)).toBeTruthy(); expect(Array.isArray(results)).toBeTruthy();
// Warn but don't fail if site returns 0 items (could be empty or changed structure) // Warn but don't fail if site returns 0 items (could be empty or changed structure)
if (results.length === 0) { if (results.length === 0) {
console.warn('Warning: No items found. Check if website structure has changed or if list is currently empty.'); console.warn(
'Warning: No items found. Check if website structure has changed or if list is currently empty.',
);
} else { } else {
// Check data integrity of first item // Check data integrity of first item
const firstItem = results[0]; const firstItem = results[0];
expect(firstItem.title).toBeTruthy(); expect(firstItem.title).toBeTruthy();
expect(firstItem.url).toMatch(/^https?:\/\//); expect(firstItem.url).toMatch(/^https?:\/\//);
expect(firstItem.publishDate).toBeInstanceOf(Date); expect(firstItem.publishDate).toBeInstanceOf(Date);
} }
}); });
}); });

View File

@@ -13,11 +13,11 @@ async function simulateHumanMouseMovement(page: puppeteer.Page) {
const y = Math.floor(Math.random() * viewport.height); const y = Math.floor(Math.random() * viewport.height);
await page.mouse.move(x, y, { await page.mouse.move(x, y, {
steps: 10 + Math.floor(Math.random() * 20) // 10-30步使移动更平滑 steps: 10 + Math.floor(Math.random() * 20), // 10-30步使移动更平滑
}); });
// 随机停顿 100-500ms // 随机停顿 100-500ms
await new Promise(r => setTimeout(r, 100 + Math.random() * 400)); await new Promise((r) => setTimeout(r, 100 + Math.random() * 400));
} }
} }
@@ -31,19 +31,19 @@ async function simulateHumanScrolling(page: puppeteer.Page) {
await page.evaluate((distance) => { await page.evaluate((distance) => {
window.scrollBy({ window.scrollBy({
top: distance, top: distance,
behavior: 'smooth' behavior: 'smooth',
}); });
}, scrollDistance); }, scrollDistance);
// 随机停顿 500-1500ms // 随机停顿 500-1500ms
await new Promise(r => setTimeout(r, 500 + Math.random() * 1000)); await new Promise((r) => setTimeout(r, 500 + Math.random() * 1000));
} }
// 滚动回顶部 // 滚动回顶部
await page.evaluate(() => { await page.evaluate(() => {
window.scrollTo({ top: 0, behavior: 'smooth' }); window.scrollTo({ top: 0, behavior: 'smooth' });
}); });
await new Promise(r => setTimeout(r, 1000)); await new Promise((r) => setTimeout(r, 1000));
} }
export interface CdtResult { export interface CdtResult {
@@ -52,12 +52,22 @@ export interface CdtResult {
url: string; url: string;
} }
/**
 * Shape of the `this` context that the `CdtCrawler` methods declare through
 * their explicit `this` parameter, so that member accesses such as
 * `this.name` and `this.baseUrl` are type-checked inside those methods.
 */
interface CdtCrawlerType {
// Human-readable source name; it is interpolated into the crawl failure log.
name: string;
// Address of the site's landing page (presumably where the crawl starts — confirm).
url: string;
// Origin that `extract` prepends to links which do not start with "http".
baseUrl: string;
// Parses list-page HTML into result entries.
extract(html: string): CdtResult[];
}
export const CdtCrawler = { export const CdtCrawler = {
name: '中国大唐集团电子商务平台', name: '中国大唐集团电子商务平台',
url: 'https://tang.cdt-ec.com/home/index.html', url: 'https://tang.cdt-ec.com/home/index.html',
baseUrl: 'https://tang.cdt-ec.com', baseUrl: 'https://tang.cdt-ec.com',
async crawl(browser: puppeteer.Browser): Promise<CdtResult[]> { async crawl(
this: CdtCrawlerType,
browser: puppeteer.Browser,
): Promise<CdtResult[]> {
const logger = new Logger('CdtCrawler'); const logger = new Logger('CdtCrawler');
const page = await browser.newPage(); const page = await browser.newPage();
@@ -67,7 +77,9 @@ export const CdtCrawler = {
await page.authenticate({ username, password }); await page.authenticate({ username, password });
} }
await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36'); await page.setUserAgent(
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36',
);
const allResults: CdtResult[] = []; const allResults: CdtResult[] = [];
let currentPage = 1; let currentPage = 1;
@@ -86,19 +98,26 @@ export const CdtCrawler = {
// 点击"招标公告"标签 // 点击"招标公告"标签
logger.log('Looking for "招标公告" tab...'); logger.log('Looking for "招标公告" tab...');
await page.waitForFunction(() => { await page.waitForFunction(
const tabs = Array.from(document.querySelectorAll('span.notice-tab')); () => {
return tabs.some(tab => tab.textContent && tab.textContent.includes('招标公告')); const tabs = Array.from(document.querySelectorAll('span.notice-tab'));
}, { timeout: 60000 }); return tabs.some(
(tab) => tab.textContent && tab.textContent.includes('招标公告'),
);
},
{ timeout: 60000 },
);
await page.evaluate(() => { await page.evaluate(() => {
const tabs = Array.from(document.querySelectorAll('span.notice-tab')); const tabs = Array.from(document.querySelectorAll('span.notice-tab'));
const target = tabs.find(tab => tab.textContent && tab.textContent.includes('招标公告')) as HTMLElement; const target = tabs.find(
(tab) => tab.textContent && tab.textContent.includes('招标公告'),
) as HTMLElement;
if (target) target.click(); if (target) target.click();
}); });
logger.log('Clicked "招标公告" tab.'); logger.log('Clicked "招标公告" tab.');
await new Promise(r => setTimeout(r, 2000)); await new Promise((r) => setTimeout(r, 2000));
// 模拟人类行为 // 模拟人类行为
logger.log('Simulating human mouse movements...'); logger.log('Simulating human mouse movements...');
@@ -109,26 +128,43 @@ export const CdtCrawler = {
// 点击"招标公告"下的"更多+"链接 // 点击"招标公告"下的"更多+"链接
logger.log('Looking for "更多+" link under "招标公告"...'); logger.log('Looking for "更多+" link under "招标公告"...');
await page.waitForFunction(() => { await page.waitForFunction(
const titles = Array.from(document.querySelectorAll('span.h-notice-title')); () => {
return titles.some(title => title.textContent && title.textContent.includes('招标公告')); const titles = Array.from(
}, { timeout: 30000 }); document.querySelectorAll('span.h-notice-title'),
);
return titles.some(
(title) =>
title.textContent && title.textContent.includes('招标公告'),
);
},
{ timeout: 30000 },
);
await page.evaluate(() => { await page.evaluate(() => {
const titles = Array.from(document.querySelectorAll('span.h-notice-title')); const titles = Array.from(
const targetTitle = titles.find(title => title.textContent && title.textContent.includes('招标公告')); document.querySelectorAll('span.h-notice-title'),
);
const targetTitle = titles.find(
(title) =>
title.textContent && title.textContent.includes('招标公告'),
);
if (targetTitle) { if (targetTitle) {
const parent = targetTitle.parentElement; const parent = targetTitle.parentElement;
if (parent) { if (parent) {
const moreLink = parent.querySelector('a.h-notice-more') as HTMLElement; const moreLink = parent.querySelector(
'a.h-notice-more',
) as HTMLElement;
if (moreLink) moreLink.click(); if (moreLink) moreLink.click();
} }
} }
}); });
logger.log('Clicked "更多+" link under "招标公告".'); logger.log('Clicked "更多+" link under "招标公告".');
await page.waitForNavigation({ waitUntil: 'networkidle2', timeout: 60000 }).catch(() => {}); await page
await new Promise(r => setTimeout(r, 3000)); .waitForNavigation({ waitUntil: 'networkidle2', timeout: 60000 })
.catch(() => {});
await new Promise((r) => setTimeout(r, 3000));
// 模拟人类行为 // 模拟人类行为
logger.log('Simulating human mouse movements...'); logger.log('Simulating human mouse movements...');
@@ -155,7 +191,9 @@ export const CdtCrawler = {
} }
allResults.push(...pageResults); allResults.push(...pageResults);
logger.log(`Extracted ${pageResults.length} items from page ${currentPage}`); logger.log(
`Extracted ${pageResults.length} items from page ${currentPage}`,
);
// 模拟人类行为 - 翻页前 // 模拟人类行为 - 翻页前
logger.log('Simulating human mouse movements before pagination...'); logger.log('Simulating human mouse movements before pagination...');
@@ -172,7 +210,9 @@ export const CdtCrawler = {
}, nextButtonSelector); }, nextButtonSelector);
if (!nextButtonExists) { if (!nextButtonExists) {
logger.log('Next page button not found or disabled. Reached end of list.'); logger.log(
'Next page button not found or disabled. Reached end of list.',
);
break; break;
} }
@@ -186,18 +226,25 @@ export const CdtCrawler = {
}, nextButtonSelector); }, nextButtonSelector);
// 等待 AJAX 请求完成(通过监听网络请求) // 等待 AJAX 请求完成(通过监听网络请求)
await page.waitForFunction(() => { await page
// 检查表格是否正在加载 .waitForFunction(
const loading = document.querySelector('.layui-table-loading'); () => {
return !loading; // 检查表格是否正在加载
}, { timeout: 30000 }).catch(() => {}); const loading = document.querySelector('.layui-table-loading');
return !loading;
},
{ timeout: 30000 },
)
.catch(() => {});
// 额外等待确保数据加载完成 // 额外等待确保数据加载完成
await new Promise(r => setTimeout(r, 2000)); await new Promise((r) => setTimeout(r, 2000));
// 检查是否真的翻页了(通过检查当前页码) // 检查是否真的翻页了(通过检查当前页码)
const currentActivePage = await page.evaluate(() => { const currentActivePage = await page.evaluate(() => {
const activeSpan = document.querySelector('.layui-laypage-curr em:last-child'); const activeSpan = document.querySelector(
'.layui-laypage-curr em:last-child',
);
return activeSpan ? parseInt(activeSpan.textContent || '1') : 1; return activeSpan ? parseInt(activeSpan.textContent || '1') : 1;
}); });
@@ -217,25 +264,29 @@ export const CdtCrawler = {
// Random delay between pages // Random delay between pages
const delay = Math.floor(Math.random() * (3000 - 1000 + 1)) + 1000; const delay = Math.floor(Math.random() * (3000 - 1000 + 1)) + 1000;
await new Promise(resolve => setTimeout(resolve, delay)); await new Promise((resolve) => setTimeout(resolve, delay));
} catch (navError) { } catch (navError) {
logger.error(`Navigation to page ${currentPage + 1} failed: ${navError.message}`); const navErrorMessage =
navError instanceof Error ? navError.message : String(navError);
logger.error(
`Navigation to page ${currentPage + 1} failed: ${navErrorMessage}`,
);
break; break;
} }
} }
return allResults; return allResults;
} catch (error) { } catch (error) {
logger.error(`Failed to crawl ${this.name}: ${error.message}`); const errorMessage =
error instanceof Error ? error.message : String(error);
logger.error(`Failed to crawl ${this.name}: ${errorMessage}`);
return allResults; return allResults;
} finally { } finally {
await page.close(); await page.close();
} }
}, },
/**
 * Parses the rendered notice-list HTML of tang.cdt-ec.com into bid entries.
 *
 * Every layui table row (`<tr data-index="…">`) that contains a
 * `layui-table-link` anchor followed by a `publish_time` cell yields one
 * entry. Rows whose title or link is empty are skipped.
 *
 * @param html - Markup of the list page.
 * @returns The extracted entries, in document order. `publishDate` falls back
 *   to the current time when the date cell is empty.
 */
extract(this: CdtCrawlerType, html: string): CdtResult[] {
  const results: CdtResult[] = [];

  /**
   * Regex groups for tang.cdt-ec.com:
   * 1: URL (href of the title link)
   * 2: Title (project name)
   * 3: Date (publish time)
   */
  const regex =
    /<tr[^>]*data-index="[^"]*"[^>]*>[\s\S]*?<a[^>]*class="layui-table-link"[^>]*href="([^"]*)"[^>]*>([^<]*)<\/a>[\s\S]*?<td[^>]*data-field="publish_time"[^>]*>[\s\S]*?<div[^>]*class="layui-table-cell[^"]*"[^>]*>([^<]*)<\/div>[\s\S]*?<\/td>[\s\S]*?<\/tr>/gs;

  let match: RegExpExecArray | null;
  while ((match = regex.exec(html)) !== null) {
    const url = match[1]?.trim() ?? '';
    const title = match[2]?.trim() ?? '';
    const dateStr = match[3]?.trim() ?? '';

    if (title && url) {
      // Relative links are resolved against the site origin.
      const fullUrl = url.startsWith('http') ? url : this.baseUrl + url;
      results.push({
        title,
        publishDate: dateStr ? new Date(dateStr) : new Date(),
        // Collapse repeated slashes produced by joining baseUrl and the link,
        // but leave the "//" that follows the scheme untouched. Collapsing
        // every "//" would turn "https://host/x" into "https:/host/x".
        url: fullUrl.replace(/(?<!:)\/{2,}/g, '/'),
      });
    }
  }

  return results;
},
}; };

View File

@@ -29,7 +29,7 @@ describe('CeicCrawler Real Site Test', () => {
if (proxyArgs.length > 0) { if (proxyArgs.length > 0) {
console.log('Using proxy:', proxyArgs.join(' ')); console.log('Using proxy:', proxyArgs.join(' '));
} }
browser = await puppeteer.launch({ browser = await puppeteer.launch({
headless: false, // Run in non-headless mode headless: false, // Run in non-headless mode
args: [ args: [
@@ -40,14 +40,14 @@ describe('CeicCrawler Real Site Test', () => {
'--disable-infobars', '--disable-infobars',
...proxyArgs, ...proxyArgs,
], ],
defaultViewport: null defaultViewport: null,
}); });
}); });
afterAll(async () => { afterAll(async () => {
if (browser) { if (browser) {
// Keep open for a few seconds after test to see result // Keep open for a few seconds after test to see result
await new Promise(r => setTimeout(r, 50000)); await new Promise((r) => setTimeout(r, 50000));
await browser.close(); await browser.close();
} }
}); });
@@ -56,29 +56,33 @@ describe('CeicCrawler Real Site Test', () => {
console.log(` console.log(`
Starting crawl for: ${CeicCrawler.name}`); Starting crawl for: ${CeicCrawler.name}`);
console.log(`Target URL: ${CeicCrawler.url}`); console.log(`Target URL: ${CeicCrawler.url}`);
const results = await CeicCrawler.crawl(browser); const results = await CeicCrawler.crawl(browser);
console.log(` console.log(`
Successfully found ${results.length} items: Successfully found ${results.length} items:
`); `);
console.log('----------------------------------------'); console.log('----------------------------------------');
results.forEach((item, index) => { results.forEach((item, index) => {
console.log(`${index + 1}. [${item.publishDate.toLocaleDateString()}] ${item.title}`); console.log(
`${index + 1}. [${item.publishDate.toLocaleDateString()}] ${item.title}`,
);
console.log(` Link: ${item.url}`); console.log(` Link: ${item.url}`);
console.log('----------------------------------------'); console.log('----------------------------------------');
}); });
expect(results).toBeDefined(); expect(results).toBeDefined();
expect(Array.isArray(results)).toBeTruthy(); expect(Array.isArray(results)).toBeTruthy();
if (results.length === 0) { if (results.length === 0) {
console.warn('Warning: No items found. Observe browser window to see if content is loading or if there is a verification challenge.'); console.warn(
'Warning: No items found. Observe browser window to see if content is loading or if there is a verification challenge.',
);
} else { } else {
const firstItem = results[0]; const firstItem = results[0];
expect(firstItem.title).toBeTruthy(); expect(firstItem.title).toBeTruthy();
expect(firstItem.url).toMatch(/^https?:\/\//); expect(firstItem.url).toMatch(/^https?:\/\//);
expect(firstItem.publishDate).toBeInstanceOf(Date); expect(firstItem.publishDate).toBeInstanceOf(Date);
} }
}); });
}); });

View File

@@ -12,13 +12,13 @@ async function simulateHumanMouseMovement(page: puppeteer.Page) {
for (let i = 0; i < movements; i++) { for (let i = 0; i < movements; i++) {
const x = Math.floor(Math.random() * viewport.width); const x = Math.floor(Math.random() * viewport.width);
const y = Math.floor(Math.random() * viewport.height); const y = Math.floor(Math.random() * viewport.height);
await page.mouse.move(x, y, { await page.mouse.move(x, y, {
steps: 10 + Math.floor(Math.random() * 20) // 10-30步使移动更平滑 steps: 10 + Math.floor(Math.random() * 20), // 10-30步使移动更平滑
}); });
// 随机停顿 100-500ms // 随机停顿 100-500ms
await new Promise(r => setTimeout(r, 100 + Math.random() * 400)); await new Promise((r) => setTimeout(r, 100 + Math.random() * 400));
} }
} }
@@ -28,23 +28,29 @@ async function simulateHumanScrolling(page: puppeteer.Page) {
for (let i = 0; i < scrollCount; i++) { for (let i = 0; i < scrollCount; i++) {
const scrollDistance = 100 + Math.floor(Math.random() * 400); // 100-500px const scrollDistance = 100 + Math.floor(Math.random() * 400); // 100-500px
await page.evaluate((distance) => { await page.evaluate((distance) => {
window.scrollBy({ window.scrollBy({
top: distance, top: distance,
behavior: 'smooth' behavior: 'smooth',
}); });
}, scrollDistance); }, scrollDistance);
// 随机停顿 500-1500ms // 随机停顿 500-1500ms
await new Promise(r => setTimeout(r, 500 + Math.random() * 1000)); await new Promise((r) => setTimeout(r, 500 + Math.random() * 1000));
} }
// 滚动回顶部 // 滚动回顶部
await page.evaluate(() => { await page.evaluate(() => {
window.scrollTo({ top: 0, behavior: 'smooth' }); window.scrollTo({ top: 0, behavior: 'smooth' });
}); });
await new Promise(r => setTimeout(r, 1000)); await new Promise((r) => setTimeout(r, 1000));
}
interface CeicCrawlerType {
name: string;
url: string;
baseUrl: string;
} }
export const CeicCrawler = { export const CeicCrawler = {
@@ -52,7 +58,10 @@ export const CeicCrawler = {
url: 'https://ceic.dlnyzb.com/3001', url: 'https://ceic.dlnyzb.com/3001',
baseUrl: 'https://ceic.dlnyzb.com/', baseUrl: 'https://ceic.dlnyzb.com/',
async crawl(browser: puppeteer.Browser): Promise<ChdtpResult[]> { async crawl(
this: CeicCrawlerType,
browser: puppeteer.Browser,
): Promise<ChdtpResult[]> {
const logger = new Logger('CeicCrawler'); const logger = new Logger('CeicCrawler');
const page = await browser.newPage(); const page = await browser.newPage();
@@ -65,10 +74,14 @@ export const CeicCrawler = {
await page.evaluateOnNewDocument(() => { await page.evaluateOnNewDocument(() => {
Object.defineProperty(navigator, 'webdriver', { get: () => false }); Object.defineProperty(navigator, 'webdriver', { get: () => false });
Object.defineProperty(navigator, 'language', { get: () => 'zh-CN' }); Object.defineProperty(navigator, 'language', { get: () => 'zh-CN' });
Object.defineProperty(navigator, 'plugins', { get: () => [1, 2, 3, 4, 5] }); Object.defineProperty(navigator, 'plugins', {
get: () => [1, 2, 3, 4, 5],
});
}); });
await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36'); await page.setUserAgent(
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36',
);
await page.setViewport({ width: 1920, height: 1080 }); await page.setViewport({ width: 1920, height: 1080 });
const allResults: ChdtpResult[] = []; const allResults: ChdtpResult[] = [];
@@ -82,7 +95,7 @@ export const CeicCrawler = {
// 模拟人类行为 // 模拟人类行为
logger.log('Simulating human mouse movements...'); logger.log('Simulating human mouse movements...');
await simulateHumanMouseMovement(page); await simulateHumanMouseMovement(page);
logger.log('Simulating human scrolling...'); logger.log('Simulating human scrolling...');
await simulateHumanScrolling(page); await simulateHumanScrolling(page);
@@ -90,16 +103,25 @@ export const CeicCrawler = {
logger.log(`Processing page ${currentPage}...`); logger.log(`Processing page ${currentPage}...`);
// Wait for content to load - MUI list items // Wait for content to load - MUI list items
await page.waitForFunction(() => { await page
return document.querySelectorAll('li.MuiListItem-root').length > 0; .waitForFunction(
}, { timeout: 60000 }).catch(() => logger.warn('Content not found. Site might be slow.')); () => {
return (
document.querySelectorAll('li.MuiListItem-root').length > 0
);
},
{ timeout: 60000 },
)
.catch(() => logger.warn('Content not found. Site might be slow.'));
const pageResults = await page.evaluate(() => { const pageResults = await page.evaluate(() => {
const results: { title: string; dateStr: string; url: string }[] = []; const results: { title: string; dateStr: string; url: string }[] = [];
// Extract from MUI list items // Extract from MUI list items
const listItems = Array.from(document.querySelectorAll('li.MuiListItem-root')); const listItems = Array.from(
listItems.forEach(item => { document.querySelectorAll('li.MuiListItem-root'),
);
listItems.forEach((item) => {
// Find the title link // Find the title link
const titleLink = item.querySelector('a.css-1vdw90h'); const titleLink = item.querySelector('a.css-1vdw90h');
const title = titleLink?.textContent?.trim() || ''; const title = titleLink?.textContent?.trim() || '';
@@ -125,15 +147,19 @@ export const CeicCrawler = {
}); });
if (pageResults.length === 0) { if (pageResults.length === 0) {
logger.warn(`No results found on page ${currentPage}. Extraction failed.`); logger.warn(
`No results found on page ${currentPage}. Extraction failed.`,
);
break; break;
} }
allResults.push(...pageResults.map(r => ({ allResults.push(
title: r.title, ...pageResults.map((r) => ({
publishDate: r.dateStr ? new Date(r.dateStr) : new Date(), title: r.title,
url: r.url.replace(/\/\//g, '/') publishDate: r.dateStr ? new Date(r.dateStr) : new Date(),
}))); url: r.url.replace(/\/\//g, '/'),
})),
);
logger.log(`Extracted ${pageResults.length} items.`); logger.log(`Extracted ${pageResults.length} items.`);
@@ -142,27 +168,30 @@ export const CeicCrawler = {
if (!nextButton) break; if (!nextButton) break;
await nextButton.click(); await nextButton.click();
await new Promise(r => setTimeout(r, 3000)); await new Promise((r) => setTimeout(r, 3000));
// 模拟人类行为 // 模拟人类行为
logger.log('Simulating human mouse movements...'); logger.log('Simulating human mouse movements...');
await simulateHumanMouseMovement(page); await simulateHumanMouseMovement(page);
logger.log('Simulating human scrolling...'); logger.log('Simulating human scrolling...');
await simulateHumanScrolling(page); await simulateHumanScrolling(page);
currentPage++; currentPage++;
} }
return allResults; return allResults;
} catch (error) { } catch (error) {
logger.error(`Crawl failed: ${error.message}`); const errorMessage =
error instanceof Error ? error.message : String(error);
logger.error(`Crawl failed: ${errorMessage}`);
return allResults; return allResults;
} finally { } finally {
if (page) await page.close(); if (page) await page.close();
} }
}, },
extract() { return []; } extract() {
return [];
},
}; };

View File

@@ -2,7 +2,7 @@ import { CgnpcCrawler } from './cgnpc_target';
import * as puppeteer from 'puppeteer'; import * as puppeteer from 'puppeteer';
// Increase timeout to 60 seconds for network operations // Increase timeout to 60 seconds for network operations
jest.setTimeout(60000*5); jest.setTimeout(60000 * 5);
// 获取代理配置 // 获取代理配置
const getProxyArgs = (): string[] => { const getProxyArgs = (): string[] => {
@@ -29,7 +29,7 @@ describe('CgnpcCrawler Real Site Test', () => {
if (proxyArgs.length > 0) { if (proxyArgs.length > 0) {
console.log('Using proxy:', proxyArgs.join(' ')); console.log('Using proxy:', proxyArgs.join(' '));
} }
browser = await puppeteer.launch({ browser = await puppeteer.launch({
headless: false, // Change to false to see browser UI headless: false, // Change to false to see browser UI
args: ['--no-sandbox', '--disable-setuid-sandbox', ...proxyArgs], args: ['--no-sandbox', '--disable-setuid-sandbox', ...proxyArgs],
@@ -45,13 +45,15 @@ describe('CgnpcCrawler Real Site Test', () => {
it('should visit website and list all found bid information', async () => { it('should visit website and list all found bid information', async () => {
console.log(`\nStarting crawl for: ${CgnpcCrawler.name}`); console.log(`\nStarting crawl for: ${CgnpcCrawler.name}`);
console.log(`Target URL: ${CgnpcCrawler.url}`); console.log(`Target URL: ${CgnpcCrawler.url}`);
const results = await CgnpcCrawler.crawl(browser); const results = await CgnpcCrawler.crawl(browser);
console.log(`\nSuccessfully found ${results.length} items:\n`); console.log(`\nSuccessfully found ${results.length} items:\n`);
console.log('----------------------------------------'); console.log('----------------------------------------');
results.forEach((item, index) => { results.forEach((item, index) => {
console.log(`${index + 1}. [${item.publishDate.toLocaleDateString()}] ${item.title}`); console.log(
`${index + 1}. [${item.publishDate.toLocaleDateString()}] ${item.title}`,
);
console.log(` Link: ${item.url}`); console.log(` Link: ${item.url}`);
console.log('----------------------------------------'); console.log('----------------------------------------');
}); });
@@ -61,13 +63,15 @@ describe('CgnpcCrawler Real Site Test', () => {
expect(Array.isArray(results)).toBeTruthy(); expect(Array.isArray(results)).toBeTruthy();
// Warn but don't fail if site returns 0 items (could be empty or changed structure) // Warn but don't fail if site returns 0 items (could be empty or changed structure)
if (results.length === 0) { if (results.length === 0) {
console.warn('Warning: No items found. Check if website structure has changed or if list is currently empty.'); console.warn(
'Warning: No items found. Check if website structure has changed or if list is currently empty.',
);
} else { } else {
// Check data integrity of first item // Check data integrity of first item
const firstItem = results[0]; const firstItem = results[0];
expect(firstItem.title).toBeTruthy(); expect(firstItem.title).toBeTruthy();
expect(firstItem.url).toMatch(/^https?:\/\//); expect(firstItem.url).toMatch(/^https?:\/\//);
expect(firstItem.publishDate).toBeInstanceOf(Date); expect(firstItem.publishDate).toBeInstanceOf(Date);
} }
}); });
}); });

View File

@@ -11,13 +11,13 @@ async function simulateHumanMouseMovement(page: puppeteer.Page) {
for (let i = 0; i < movements; i++) { for (let i = 0; i < movements; i++) {
const x = Math.floor(Math.random() * viewport.width); const x = Math.floor(Math.random() * viewport.width);
const y = Math.floor(Math.random() * viewport.height); const y = Math.floor(Math.random() * viewport.height);
await page.mouse.move(x, y, { await page.mouse.move(x, y, {
steps: 10 + Math.floor(Math.random() * 20) // 10-30步使移动更平滑 steps: 10 + Math.floor(Math.random() * 20), // 10-30步使移动更平滑
}); });
// 随机停顿 100-500ms // 随机停顿 100-500ms
await new Promise(r => setTimeout(r, 100 + Math.random() * 400)); await new Promise((r) => setTimeout(r, 100 + Math.random() * 400));
} }
} }
@@ -27,23 +27,23 @@ async function simulateHumanScrolling(page: puppeteer.Page) {
for (let i = 0; i < scrollCount; i++) { for (let i = 0; i < scrollCount; i++) {
const scrollDistance = 100 + Math.floor(Math.random() * 400); // 100-500px const scrollDistance = 100 + Math.floor(Math.random() * 400); // 100-500px
await page.evaluate((distance) => { await page.evaluate((distance) => {
window.scrollBy({ window.scrollBy({
top: distance, top: distance,
behavior: 'smooth' behavior: 'smooth',
}); });
}, scrollDistance); }, scrollDistance);
// 随机停顿 500-1500ms // 随机停顿 500-1500ms
await new Promise(r => setTimeout(r, 500 + Math.random() * 1000)); await new Promise((r) => setTimeout(r, 500 + Math.random() * 1000));
} }
// 滚动回顶部 // 滚动回顶部
await page.evaluate(() => { await page.evaluate(() => {
window.scrollTo({ top: 0, behavior: 'smooth' }); window.scrollTo({ top: 0, behavior: 'smooth' });
}); });
await new Promise(r => setTimeout(r, 1000)); await new Promise((r) => setTimeout(r, 1000));
} }
export interface CgnpcResult { export interface CgnpcResult {
@@ -52,12 +52,22 @@ export interface CgnpcResult {
url: string; url: string;
} }
/**
 * Shape of the `this` context that `CgnpcCrawler.crawl` declares through its
 * explicit `this` parameter, so member accesses on `this` are type-checked.
 */
interface CgnpcCrawlerType {
// Human-readable name of the source.
name: string;
// Address of the notice list page (presumably where the crawl starts — confirm).
url: string;
// Site origin (presumably used to resolve relative links — confirm).
baseUrl: string;
// Parses page HTML into result entries.
extract(html: string): CgnpcResult[];
}
export const CgnpcCrawler = { export const CgnpcCrawler = {
name: '中广核电子商务平台', name: '中广核电子商务平台',
url: 'https://ecp.cgnpc.com.cn/zbgg.html', url: 'https://ecp.cgnpc.com.cn/zbgg.html',
baseUrl: 'https://ecp.cgnpc.com.cn/', baseUrl: 'https://ecp.cgnpc.com.cn/',
async crawl(browser: puppeteer.Browser): Promise<CgnpcResult[]> { async crawl(
this: CgnpcCrawlerType,
browser: puppeteer.Browser,
): Promise<CgnpcResult[]> {
const logger = new Logger('CgnpcCrawler'); const logger = new Logger('CgnpcCrawler');
const page = await browser.newPage(); const page = await browser.newPage();
@@ -69,11 +79,15 @@ export const CgnpcCrawler = {
await page.evaluateOnNewDocument(() => { await page.evaluateOnNewDocument(() => {
Object.defineProperty(navigator, 'webdriver', { get: () => false }); Object.defineProperty(navigator, 'webdriver', { get: () => false });
Object.defineProperty(navigator, 'language', { get: () => "zh-CN"}); Object.defineProperty(navigator, 'language', { get: () => 'zh-CN' });
Object.defineProperty(navigator, 'plugins', { get: () => [1,2,3,4,5]}); Object.defineProperty(navigator, 'plugins', {
get: () => [1, 2, 3, 4, 5],
});
}); });
await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36'); await page.setUserAgent(
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36',
);
await page.setViewport({ width: 1920, height: 1080 }); await page.setViewport({ width: 1920, height: 1080 });
const allResults: CgnpcResult[] = []; const allResults: CgnpcResult[] = [];
@@ -87,7 +101,7 @@ export const CgnpcCrawler = {
// 模拟人类行为 // 模拟人类行为
logger.log('Simulating human mouse movements...'); logger.log('Simulating human mouse movements...');
await simulateHumanMouseMovement(page); await simulateHumanMouseMovement(page);
logger.log('Simulating human scrolling...'); logger.log('Simulating human scrolling...');
await simulateHumanScrolling(page); await simulateHumanScrolling(page);
@@ -103,12 +117,14 @@ export const CgnpcCrawler = {
} }
allResults.push(...pageResults); allResults.push(...pageResults);
logger.log(`Extracted ${pageResults.length} items from page ${currentPage}`); logger.log(
`Extracted ${pageResults.length} items from page ${currentPage}`,
);
// 模拟人类行为 - 翻页前 // 模拟人类行为 - 翻页前
logger.log('Simulating human mouse movements before pagination...'); logger.log('Simulating human mouse movements before pagination...');
await simulateHumanMouseMovement(page); await simulateHumanMouseMovement(page);
logger.log('Simulating human scrolling before pagination...'); logger.log('Simulating human scrolling before pagination...');
await simulateHumanScrolling(page); await simulateHumanScrolling(page);
@@ -127,9 +143,13 @@ export const CgnpcCrawler = {
try { try {
// 点击下一页按钮 // 点击下一页按钮
await nextButton.click(); await nextButton.click();
await new Promise(r => setTimeout(r, 3000)); // 等待页面加载 await new Promise((r) => setTimeout(r, 3000)); // 等待页面加载
} catch (navError) { } catch (navError) {
logger.error(`Navigation to page ${currentPage + 1} failed: ${navError.message}`); const navErrorMessage =
navError instanceof Error ? navError.message : String(navError);
logger.error(
`Navigation to page ${currentPage + 1} failed: ${navErrorMessage}`,
);
break; break;
} }
@@ -138,26 +158,27 @@ export const CgnpcCrawler = {
// 模拟人类行为 - 翻页后 // 模拟人类行为 - 翻页后
logger.log('Simulating human mouse movements after pagination...'); logger.log('Simulating human mouse movements after pagination...');
await simulateHumanMouseMovement(page); await simulateHumanMouseMovement(page);
logger.log('Simulating human scrolling after pagination...'); logger.log('Simulating human scrolling after pagination...');
await simulateHumanScrolling(page); await simulateHumanScrolling(page);
// Random delay between pages // Random delay between pages
const delay = Math.floor(Math.random() * (3000 - 1000 + 1)) + 1000; const delay = Math.floor(Math.random() * (3000 - 1000 + 1)) + 1000;
await new Promise(resolve => setTimeout(resolve, delay)); await new Promise((resolve) => setTimeout(resolve, delay));
} }
return allResults; return allResults;
} catch (error) { } catch (error) {
logger.error(`Failed to crawl ${this.name}: ${error.message}`); const errorMessage =
error instanceof Error ? error.message : String(error);
logger.error(`Failed to crawl ${this.name}: ${errorMessage}`);
return allResults; return allResults;
} finally { } finally {
await page.close(); await page.close();
} }
}, },
extract(html: string): CgnpcResult[] { extract(this: CgnpcCrawlerType, html: string): CgnpcResult[] {
const results: CgnpcResult[] = []; const results: CgnpcResult[] = [];
/** /**
* Regex groups for ecp.cgnpc.com.cn: * Regex groups for ecp.cgnpc.com.cn:
@@ -181,24 +202,25 @@ export const CgnpcCrawler = {
* </div> * </div>
* </div> * </div>
*/ */
const regex = /<div class="zbnr">[\s\S]*?<a[^>]*title="([^"]*)"[^>]*href="([^"]*)"[^>]*>[\s\S]*?<dt>[\s\S]*?<p><\/p>[\s\S]*?<h2>\s*(\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2})\s*<\/h2>[\s\S]*?<\/div>/gs; const regex =
/<div class="zbnr">[\s\S]*?<a[^>]*title="([^"]*)"[^>]*href="([^"]*)"[^>]*>[\s\S]*?<dt>[\s\S]*?<p><\/p>[\s\S]*?<h2>\s*(\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2})\s*<\/h2>[\s\S]*?<\/div>/gs;
let match; let match: RegExpExecArray | null;
while ((match = regex.exec(html)) !== null) { while ((match = regex.exec(html)) !== null) {
const title = match[1]?.trim(); const title = match[1]?.trim() ?? '';
const url = match[2]?.trim(); const url = match[2]?.trim() ?? '';
const dateStr = match[3]?.trim(); const dateStr = match[3]?.trim() ?? '';
if (title && url) { if (title && url) {
const fullUrl = url.startsWith('http') ? url : this.baseUrl + url; const fullUrl = url.startsWith('http') ? url : this.baseUrl + url;
results.push({ results.push({
title, title,
publishDate: dateStr ? new Date(dateStr) : new Date(), publishDate: dateStr ? new Date(dateStr) : new Date(),
url: fullUrl.replace(/\/\//g, '/') url: fullUrl.replace(/\/\//g, '/'),
}); });
} }
} }
return results; return results;
} },
}; };

View File

@@ -29,7 +29,7 @@ describe('ChdtpCrawler Real Site Test', () => {
if (proxyArgs.length > 0) { if (proxyArgs.length > 0) {
console.log('Using proxy:', proxyArgs.join(' ')); console.log('Using proxy:', proxyArgs.join(' '));
} }
browser = await puppeteer.launch({ browser = await puppeteer.launch({
headless: true, // Change to false to see the browser UI headless: true, // Change to false to see the browser UI
args: ['--no-sandbox', '--disable-setuid-sandbox', ...proxyArgs], args: ['--no-sandbox', '--disable-setuid-sandbox', ...proxyArgs],
@@ -45,13 +45,15 @@ describe('ChdtpCrawler Real Site Test', () => {
it('should visit the website and list all found bid information', async () => { it('should visit the website and list all found bid information', async () => {
console.log(`\nStarting crawl for: ${ChdtpCrawler.name}`); console.log(`\nStarting crawl for: ${ChdtpCrawler.name}`);
console.log(`Target URL: ${ChdtpCrawler.url}`); console.log(`Target URL: ${ChdtpCrawler.url}`);
const results = await ChdtpCrawler.crawl(browser); const results = await ChdtpCrawler.crawl(browser);
console.log(`\nSuccessfully found ${results.length} items:\n`); console.log(`\nSuccessfully found ${results.length} items:\n`);
console.log('----------------------------------------'); console.log('----------------------------------------');
results.forEach((item, index) => { results.forEach((item, index) => {
console.log(`${index + 1}. [${item.publishDate.toLocaleDateString()}] ${item.title}`); console.log(
`${index + 1}. [${item.publishDate.toLocaleDateString()}] ${item.title}`,
);
console.log(` Link: ${item.url}`); console.log(` Link: ${item.url}`);
console.log('----------------------------------------'); console.log('----------------------------------------');
}); });
@@ -61,13 +63,15 @@ describe('ChdtpCrawler Real Site Test', () => {
expect(Array.isArray(results)).toBeTruthy(); expect(Array.isArray(results)).toBeTruthy();
// Warn but don't fail if site returns 0 items (could be empty or changed structure) // Warn but don't fail if site returns 0 items (could be empty or changed structure)
if (results.length === 0) { if (results.length === 0) {
console.warn('Warning: No items found. Check if the website structure has changed or if the list is currently empty.'); console.warn(
'Warning: No items found. Check if the website structure has changed or if the list is currently empty.',
);
} else { } else {
// Check data integrity of the first item // Check data integrity of the first item
const firstItem = results[0]; const firstItem = results[0];
expect(firstItem.title).toBeTruthy(); expect(firstItem.title).toBeTruthy();
expect(firstItem.url).toMatch(/^https?:\/\//); expect(firstItem.url).toMatch(/^https?:\/\//);
expect(firstItem.publishDate).toBeInstanceOf(Date); expect(firstItem.publishDate).toBeInstanceOf(Date);
} }
}); });
}); });

View File

@@ -7,22 +7,34 @@ export interface ChdtpResult {
url: string; // Necessary for system uniqueness url: string; // Necessary for system uniqueness
} }
interface ChdtpCrawlerType {
name: string;
url: string;
baseUrl: string;
extract(html: string): ChdtpResult[];
}
export const ChdtpCrawler = { export const ChdtpCrawler = {
name: '华电集团电子商务平台 ', name: '华电集团电子商务平台 ',
url: 'https://www.chdtp.com/webs/queryWebZbgg.action?zbggType=1', url: 'https://www.chdtp.com/webs/queryWebZbgg.action?zbggType=1',
baseUrl: 'https://www.chdtp.com/webs/', baseUrl: 'https://www.chdtp.com/webs/',
async crawl(browser: puppeteer.Browser): Promise<ChdtpResult[]> { async crawl(
this: ChdtpCrawlerType,
browser: puppeteer.Browser,
): Promise<ChdtpResult[]> {
const logger = new Logger('ChdtpCrawler'); const logger = new Logger('ChdtpCrawler');
const page = await browser.newPage(); const page = await browser.newPage();
const username = process.env.PROXY_USERNAME; const username = process.env.PROXY_USERNAME;
const password = process.env.PROXY_PASSWORD; const password = process.env.PROXY_PASSWORD;
if (username && password) { if (username && password) {
await page.authenticate({ username, password }); await page.authenticate({ username, password });
} }
await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36'); await page.setUserAgent(
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36',
);
const allResults: ChdtpResult[] = []; const allResults: ChdtpResult[] = [];
let currentPage = 1; let currentPage = 1;
@@ -35,14 +47,16 @@ export const ChdtpCrawler = {
while (currentPage <= maxPages) { while (currentPage <= maxPages) {
const content = await page.content(); const content = await page.content();
const pageResults = this.extract(content); const pageResults = this.extract(content);
if (pageResults.length === 0) { if (pageResults.length === 0) {
logger.warn(`No results found on page ${currentPage}, stopping.`); logger.warn(`No results found on page ${currentPage}, stopping.`);
break; break;
} }
allResults.push(...pageResults); allResults.push(...pageResults);
logger.log(`Extracted ${pageResults.length} items from page ${currentPage}`); logger.log(
`Extracted ${pageResults.length} items from page ${currentPage}`,
);
// Find the "Next Page" button // Find the "Next Page" button
// Using partial match for src to be robust against path variations // Using partial match for src to be robust against path variations
@@ -58,35 +72,43 @@ export const ChdtpCrawler = {
// For this specific site, we'll try to click. // For this specific site, we'll try to click.
logger.log(`Navigating to page ${currentPage + 1}...`); logger.log(`Navigating to page ${currentPage + 1}...`);
try { try {
await Promise.all([ await Promise.all([
page.waitForNavigation({ waitUntil: 'networkidle2', timeout: 60000 }), page.waitForNavigation({
waitUntil: 'networkidle2',
timeout: 60000,
}),
nextButton.click(), nextButton.click(),
]); ]);
} catch (navError) { } catch (navError) {
logger.error(`Navigation to page ${currentPage + 1} failed: ${navError.message}`); const navErrorMessage =
navError instanceof Error ? navError.message : String(navError);
logger.error(
`Navigation to page ${currentPage + 1} failed: ${navErrorMessage}`,
);
break; break;
} }
currentPage++; currentPage++;
// Random delay between pages // Random delay between pages
const delay = Math.floor(Math.random() * (3000 - 1000 + 1)) + 1000; const delay = Math.floor(Math.random() * (3000 - 1000 + 1)) + 1000;
await new Promise(resolve => setTimeout(resolve, delay)); await new Promise((resolve) => setTimeout(resolve, delay));
} }
return allResults; return allResults;
} catch (error) { } catch (error) {
logger.error(`Failed to crawl ${this.name}: ${error.message}`); const errorMessage =
error instanceof Error ? error.message : String(error);
logger.error(`Failed to crawl ${this.name}: ${errorMessage}`);
return allResults; // Return what we have so far return allResults; // Return what we have so far
} finally { } finally {
await page.close(); await page.close();
} }
}, },
extract(html: string): ChdtpResult[] { extract(this: ChdtpCrawlerType, html: string): ChdtpResult[] {
const results: ChdtpResult[] = []; const results: ChdtpResult[] = [];
/** /**
* Regex groups for chdtp.com: * Regex groups for chdtp.com:
@@ -96,23 +118,24 @@ export const ChdtpCrawler = {
* 4: Business Type * 4: Business Type
* 5: Date * 5: Date
*/ */
const regex = /<tr[^>]*>\s*<td class="td_1">.*?<span[^>]*>\s*(.*?)\s*<\/span>.*?<\/td>\s*<td class="td_2">\s*<a[^>]*href="javascript:toGetContent\('(.*?)'\)" title="(.*?)">.*?<\/a><\/td>\s*<td class="td_3">\s*<a[^>]*>\s*(.*?)\s*<\/a>\s*<\/td>\s*<td class="td_4"><span>\[(.*?)\]<\/span><\/td>/gs; const regex =
/<tr[^>]*>\s*<td class="td_1">.*?<span[^>]*>\s*(.*?)\s*<\/span>.*?<\/td>\s*<td class="td_2">\s*<a[^>]*href="javascript:toGetContent\('(.*?)'\)" title="(.*?)">.*?<\/a><\/td>\s*<td class="td_3">\s*<a[^>]*>\s*(.*?)\s*<\/a>\s*<\/td>\s*<td class="td_4"><span>\[(.*?)\]<\/span><\/td>/gs;
let match; let match: RegExpExecArray | null;
while ((match = regex.exec(html)) !== null) { while ((match = regex.exec(html)) !== null) {
const urlSuffix = match[2]?.trim(); const urlSuffix = match[2]?.trim() ?? '';
const title = match[3]?.trim(); const title = match[3]?.trim() ?? '';
const dateStr = match[5]?.trim(); const dateStr = match[5]?.trim() ?? '';
if (title && urlSuffix) { if (title && urlSuffix) {
const fullUrl = this.baseUrl + urlSuffix; const fullUrl = this.baseUrl + urlSuffix;
results.push({ results.push({
title, title,
publishDate: dateStr ? new Date(dateStr) : new Date(), publishDate: dateStr ? new Date(dateStr) : new Date(),
url: fullUrl.replace(/\/\//g, '/') url: fullUrl.replace(/\/\//g, '/'),
}); });
} }
} }
return results; return results;
} },
}; };

View File

@@ -31,13 +31,13 @@ async function simulateHumanMouseMovement(page: puppeteer.Page) {
for (let i = 0; i < movements; i++) { for (let i = 0; i < movements; i++) {
const x = Math.floor(Math.random() * viewport.width); const x = Math.floor(Math.random() * viewport.width);
const y = Math.floor(Math.random() * viewport.height); const y = Math.floor(Math.random() * viewport.height);
await page.mouse.move(x, y, { await page.mouse.move(x, y, {
steps: 10 + Math.floor(Math.random() * 20) // 10-30步使移动更平滑 steps: 10 + Math.floor(Math.random() * 20), // 10-30步使移动更平滑
}); });
// 随机停顿 100-500ms // 随机停顿 100-500ms
await new Promise(r => setTimeout(r, 100 + Math.random() * 400)); await new Promise((r) => setTimeout(r, 100 + Math.random() * 400));
} }
} }
@@ -47,23 +47,23 @@ async function simulateHumanScrolling(page: puppeteer.Page) {
for (let i = 0; i < scrollCount; i++) { for (let i = 0; i < scrollCount; i++) {
const scrollDistance = 100 + Math.floor(Math.random() * 400); // 100-500px const scrollDistance = 100 + Math.floor(Math.random() * 400); // 100-500px
await page.evaluate((distance) => { await page.evaluate((distance) => {
window.scrollBy({ window.scrollBy({
top: distance, top: distance,
behavior: 'smooth' behavior: 'smooth',
}); });
}, scrollDistance); }, scrollDistance);
// 随机停顿 500-1500ms // 随机停顿 500-1500ms
await new Promise(r => setTimeout(r, 500 + Math.random() * 1000)); await new Promise((r) => setTimeout(r, 500 + Math.random() * 1000));
} }
// 滚动回顶部 // 滚动回顶部
await page.evaluate(() => { await page.evaluate(() => {
window.scrollTo({ top: 0, behavior: 'smooth' }); window.scrollTo({ top: 0, behavior: 'smooth' });
}); });
await new Promise(r => setTimeout(r, 1000)); await new Promise((r) => setTimeout(r, 1000));
} }
describe('ChngCrawler Real Site Test', () => { describe('ChngCrawler Real Site Test', () => {
@@ -74,7 +74,7 @@ describe('ChngCrawler Real Site Test', () => {
if (proxyArgs.length > 0) { if (proxyArgs.length > 0) {
console.log('Using proxy:', proxyArgs.join(' ')); console.log('Using proxy:', proxyArgs.join(' '));
} }
browser = await puppeteer.launch({ browser = await puppeteer.launch({
headless: false, // Run in non-headless mode headless: false, // Run in non-headless mode
args: [ args: [
@@ -82,7 +82,7 @@ describe('ChngCrawler Real Site Test', () => {
'--disable-setuid-sandbox', '--disable-setuid-sandbox',
'--disable-blink-features=AutomationControlled', '--disable-blink-features=AutomationControlled',
'--window-size=1920,1080', '--window-size=1920,1080',
"--disable-infobars", '--disable-infobars',
...proxyArgs, ...proxyArgs,
// "--headless=new", // "--headless=new",
// '--disable-dev-shm-usage', // '--disable-dev-shm-usage',
@@ -94,15 +94,14 @@ describe('ChngCrawler Real Site Test', () => {
// '--disable-webgl', // '--disable-webgl',
// '--disable-javascript', // '--disable-javascript',
], ],
defaultViewport: null defaultViewport: null,
}); });
}); });
afterAll(async () => { afterAll(async () => {
if (browser) { if (browser) {
// Keep open for a few seconds after test to see result // Keep open for a few seconds after test to see result
await new Promise(r => setTimeout(r, 50000)); await new Promise((r) => setTimeout(r, 50000));
await browser.close(); await browser.close();
} }
}); });
@@ -111,43 +110,51 @@ describe('ChngCrawler Real Site Test', () => {
console.log(` console.log(`
Starting crawl for: ${ChngCrawler.name}`); Starting crawl for: ${ChngCrawler.name}`);
console.log(`Target URL: ${ChngCrawler.url}`); console.log(`Target URL: ${ChngCrawler.url}`);
// 创建一个临时页面用于模拟人类行为 // 创建一个临时页面用于模拟人类行为
const tempPage = await browser.newPage(); const tempPage = await browser.newPage();
await tempPage.setViewport({ width: 1920, height: 1080, deviceScaleFactor: 1 }); await tempPage.setViewport({
width: 1920,
height: 1080,
deviceScaleFactor: 1,
});
// 模拟人类鼠标移动 // 模拟人类鼠标移动
console.log('Simulating human mouse movements...'); console.log('Simulating human mouse movements...');
await simulateHumanMouseMovement(tempPage); await simulateHumanMouseMovement(tempPage);
// 模拟人类滚动 // 模拟人类滚动
console.log('Simulating human scrolling...'); console.log('Simulating human scrolling...');
await simulateHumanScrolling(tempPage); await simulateHumanScrolling(tempPage);
await tempPage.close(); await tempPage.close();
const results = await ChngCrawler.crawl(browser); const results = await ChngCrawler.crawl(browser);
console.log(` console.log(`
Successfully found ${results.length} items: Successfully found ${results.length} items:
`); `);
console.log('----------------------------------------'); console.log('----------------------------------------');
results.forEach((item, index) => { results.forEach((item, index) => {
console.log(`${index + 1}. [${item.publishDate.toLocaleDateString()}] ${item.title}`); console.log(
`${index + 1}. [${item.publishDate.toLocaleDateString()}] ${item.title}`,
);
console.log(` Link: ${item.url}`); console.log(` Link: ${item.url}`);
console.log('----------------------------------------'); console.log('----------------------------------------');
}); });
expect(results).toBeDefined(); expect(results).toBeDefined();
expect(Array.isArray(results)).toBeTruthy(); expect(Array.isArray(results)).toBeTruthy();
if (results.length === 0) { if (results.length === 0) {
console.warn('Warning: No items found. Observe the browser window to see if content is loading or if there is a verification challenge.'); console.warn(
'Warning: No items found. Observe the browser window to see if content is loading or if there is a verification challenge.',
);
} else { } else {
const firstItem = results[0]; const firstItem = results[0];
expect(firstItem.title).toBeTruthy(); expect(firstItem.title).toBeTruthy();
expect(firstItem.url).toMatch(/^https?:\/\//); expect(firstItem.url).toMatch(/^https?:\/\//);
expect(firstItem.publishDate).toBeInstanceOf(Date); expect(firstItem.publishDate).toBeInstanceOf(Date);
} }
}); });
}); });

View File

@@ -16,19 +16,20 @@ async function simulateHumanMouseMovement(page: puppeteer.Page) {
console.log('Page was closed during mouse movement simulation'); console.log('Page was closed during mouse movement simulation');
return; return;
} }
const x = Math.floor(Math.random() * viewport.width); const x = Math.floor(Math.random() * viewport.width);
const y = Math.floor(Math.random() * viewport.height); const y = Math.floor(Math.random() * viewport.height);
await page.mouse.move(x, y, { await page.mouse.move(x, y, {
steps: 10 + Math.floor(Math.random() * 20) // 10-30步使移动更平滑 steps: 10 + Math.floor(Math.random() * 20), // 10-30步使移动更平滑
}); });
// 随机停顿 100-500ms // 随机停顿 100-500ms
await new Promise(r => setTimeout(r, 100 + Math.random() * 400)); await new Promise((r) => setTimeout(r, 100 + Math.random() * 400));
} }
} catch (error) { } catch (error) {
console.log('Mouse movement simulation interrupted:', error.message); const errorMessage = error instanceof Error ? error.message : String(error);
console.log('Mouse movement simulation interrupted:', errorMessage);
} }
} }
@@ -43,18 +44,18 @@ async function simulateHumanScrolling(page: puppeteer.Page) {
console.log('Page was closed during scrolling simulation'); console.log('Page was closed during scrolling simulation');
return; return;
} }
const scrollDistance = 100 + Math.floor(Math.random() * 400); // 100-500px const scrollDistance = 100 + Math.floor(Math.random() * 400); // 100-500px
await page.evaluate((distance) => { await page.evaluate((distance) => {
window.scrollBy({ window.scrollBy({
top: distance, top: distance,
behavior: 'smooth' behavior: 'smooth',
}); });
}, scrollDistance); }, scrollDistance);
// 随机停顿 500-1500ms // 随机停顿 500-1500ms
await new Promise(r => setTimeout(r, 500 + Math.random() * 1000)); await new Promise((r) => setTimeout(r, 500 + Math.random() * 1000));
} }
// 滚动回顶部 // 滚动回顶部
@@ -62,19 +63,29 @@ async function simulateHumanScrolling(page: puppeteer.Page) {
await page.evaluate(() => { await page.evaluate(() => {
window.scrollTo({ top: 0, behavior: 'smooth' }); window.scrollTo({ top: 0, behavior: 'smooth' });
}); });
await new Promise(r => setTimeout(r, 1000)); await new Promise((r) => setTimeout(r, 1000));
} }
} catch (error) { } catch (error) {
console.log('Scrolling simulation interrupted:', error.message); const errorMessage = error instanceof Error ? error.message : String(error);
console.log('Scrolling simulation interrupted:', errorMessage);
} }
} }
interface ChngCrawlerType {
name: string;
url: string;
baseUrl: string;
}
export const ChngCrawler = { export const ChngCrawler = {
name: '华能集团电子商务平台', name: '华能集团电子商务平台',
url: 'https://ec.chng.com.cn/channel/home/#/purchase?top=0', url: 'https://ec.chng.com.cn/channel/home/#/purchase?top=0',
baseUrl: 'https://ec.chng.com.cn/channel/home/#', baseUrl: 'https://ec.chng.com.cn/channel/home/#',
async crawl(browser: puppeteer.Browser): Promise<ChdtpResult[]> { async crawl(
this: ChngCrawlerType,
browser: puppeteer.Browser,
): Promise<ChdtpResult[]> {
const logger = new Logger('ChngCrawler'); const logger = new Logger('ChngCrawler');
let page = await browser.newPage(); let page = await browser.newPage();
// await page.setViewport({ width: 1920, height: 1080, deviceScaleFactor: 1 }); // await page.setViewport({ width: 1920, height: 1080, deviceScaleFactor: 1 });
@@ -84,42 +95,48 @@ export const ChngCrawler = {
if (username && password) { if (username && password) {
await page.authenticate({ username, password }); await page.authenticate({ username, password });
} }
await page.evaluateOnNewDocument(() => { await page.evaluateOnNewDocument(() => {
Object.defineProperty(navigator, 'webdriver', { get: () => false }); Object.defineProperty(navigator, 'webdriver', { get: () => false });
Object.defineProperty(navigator, 'language', { get: () => "zh-CN"}); Object.defineProperty(navigator, 'language', { get: () => 'zh-CN' });
Object.defineProperty(navigator, 'plugins', { get: () => [1,2,3,4,5]}); Object.defineProperty(navigator, 'plugins', {
get: () => [1, 2, 3, 4, 5],
});
}); });
await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36'); await page.setUserAgent(
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36',
);
await page.setViewport({ width: 1920, height: 1080 }); await page.setViewport({ width: 1920, height: 1080 });
const allResults: ChdtpResult[] = []; const allResults: ChdtpResult[] = [];
let currentPage = 1; let currentPage = 1;
const maxPages = 5; const maxPages = 5;
try { try {
logger.log('Navigating to Bing...'); logger.log('Navigating to Bing...');
await page.goto('https://cn.bing.com', { waitUntil: 'networkidle2' }); await page.goto('https://cn.bing.com', { waitUntil: 'networkidle2' });
logger.log('Searching for target site...'); logger.log('Searching for target site...');
const searchBoxSelector = 'input[name="q"]'; const searchBoxSelector = 'input[name="q"]';
await page.waitForSelector(searchBoxSelector); await page.waitForSelector(searchBoxSelector);
await page.type(searchBoxSelector, 'https://ec.chng.com.cn/'); await page.type(searchBoxSelector, 'https://ec.chng.com.cn/');
await page.keyboard.press('Enter'); await page.keyboard.press('Enter');
await page.waitForNavigation({ waitUntil: 'networkidle2' }); await page.waitForNavigation({ waitUntil: 'networkidle2' });
logger.log('Clicking search result...'); logger.log('Clicking search result...');
// await page.screenshot({ path: 'bing.png' }); // await page.screenshot({ path: 'bing.png' });
const firstResultSelector = '#b_results .b_algo h2 a'; const firstResultSelector = '#b_results .b_algo h2 a';
await page.waitForSelector(firstResultSelector); await page.waitForSelector(firstResultSelector);
const newTargetPromise = browser.waitForTarget(target => target.opener() === page.target()); const newTargetPromise = browser.waitForTarget(
(target) => target.opener() === page.target(),
);
await page.click(firstResultSelector); await page.click(firstResultSelector);
const newTarget = await newTargetPromise; const newTarget = await newTargetPromise;
const newPage = await newTarget.page(); const newPage = await newTarget.page();
if (newPage) { if (newPage) {
// await newPage.screenshot({ path: 'newPage.png' }); // await newPage.screenshot({ path: 'newPage.png' });
await page.close(); await page.close();
@@ -131,108 +148,135 @@ export const ChngCrawler = {
// 模拟人类行为 // 模拟人类行为
logger.log('Simulating human mouse movements...'); logger.log('Simulating human mouse movements...');
await simulateHumanMouseMovement(page); await simulateHumanMouseMovement(page);
logger.log('Simulating human scrolling...'); logger.log('Simulating human scrolling...');
await simulateHumanScrolling(page); await simulateHumanScrolling(page);
// 等待页面稳定,不强制等待导航 // 等待页面稳定,不强制等待导航
await new Promise(r => setTimeout(r, 3000)); await new Promise((r) => setTimeout(r, 3000));
// 模拟人类行为
logger.log('Simulating human mouse movements...');
await simulateHumanMouseMovement(page);
logger.log('Simulating human scrolling...');
await simulateHumanScrolling(page);
// PAUSE 15 SECONDS as requested
logger.log('Pausing 15 seconds before looking for "采购专栏"...');
await new Promise(r => setTimeout(r, 15000));
// await page.screenshot({ path: 'huaneng.png' });
logger.log('Looking for "采购专栏" link...');
await page.waitForFunction(() => {
const divs = Array.from(document.querySelectorAll('div.text'));
return divs.some(div => div.textContent && div.textContent.includes('采购专栏'));
}, { timeout: 60000 });
const purchaseTargetPromise = browser.waitForTarget(target => target.opener() === page.target(), { timeout: 15000 }).catch(() => null);
await page.evaluate(() => {
const divs = Array.from(document.querySelectorAll('div.text'));
const target = divs.find(div => div.textContent && div.textContent.includes('采购专栏')) as HTMLElement;
if (target) target.click();
});
const purchaseTarget = await purchaseTargetPromise;
if (purchaseTarget) {
const pPage = await purchaseTarget.page();
if (pPage) {
logger.log('Switched to Purchase Page tab.');
page = pPage;
if (username && password) {
await page.authenticate({ username, password });
}
await new Promise(r => setTimeout(r, 5000));
}
}
logger.log(`Active URL: ${page.url()}`);
// 模拟人类行为 // 模拟人类行为
logger.log('Simulating human mouse movements...'); logger.log('Simulating human mouse movements...');
await simulateHumanMouseMovement(page); await simulateHumanMouseMovement(page);
logger.log('Simulating human scrolling...'); logger.log('Simulating human scrolling...');
await simulateHumanScrolling(page); await simulateHumanScrolling(page);
// PAUSE 15 SECONDS as requested
logger.log('Pausing 15 seconds before looking for "采购专栏"...');
await new Promise((r) => setTimeout(r, 15000));
// await page.screenshot({ path: 'huaneng.png' });
logger.log('Looking for "采购专栏" link...');
await page.waitForFunction(
() => {
const divs = Array.from(document.querySelectorAll('div.text'));
return divs.some(
(div) => div.textContent && div.textContent.includes('采购专栏'),
);
},
{ timeout: 60000 },
);
const purchaseTargetPromise = browser
.waitForTarget((target) => target.opener() === page.target(), {
timeout: 15000,
})
.catch(() => null);
await page.evaluate(() => {
const divs = Array.from(document.querySelectorAll('div.text'));
const target = divs.find(
(div) => div.textContent && div.textContent.includes('采购专栏'),
) as HTMLElement;
if (target) target.click();
});
const purchaseTarget = await purchaseTargetPromise;
if (purchaseTarget) {
const pPage = await purchaseTarget.page();
if (pPage) {
logger.log('Switched to Purchase Page tab.');
page = pPage;
if (username && password) {
await page.authenticate({ username, password });
}
await new Promise((r) => setTimeout(r, 5000));
}
}
logger.log(`Active URL: ${page.url()}`);
// 模拟人类行为
logger.log('Simulating human mouse movements...');
await simulateHumanMouseMovement(page);
logger.log('Simulating human scrolling...');
await simulateHumanScrolling(page);
while (currentPage <= maxPages) { while (currentPage <= maxPages) {
logger.log(`Processing page ${currentPage}...`); logger.log(`Processing page ${currentPage}...`);
// Wait for table rows to load // Wait for table rows to load
await page.waitForFunction(() => { await page
return document.querySelectorAll('tr.ant-table-row').length > 0; .waitForFunction(
}, { timeout: 60000 }).catch(() => logger.warn('Content not found. Site might be slow.')); () => {
return document.querySelectorAll('tr.ant-table-row').length > 0;
},
{ timeout: 60000 },
)
.catch(() => logger.warn('Content not found. Site might be slow.'));
const pageResults = await page.evaluate((baseUrl) => { const pageResults = await page.evaluate((baseUrl) => {
// Extract from table rows // Extract from table rows
const items = Array.from(document.querySelectorAll('tr.ant-table-row')); const items = Array.from(
return items.map(item => { document.querySelectorAll('tr.ant-table-row'),
const titleSpan = item.querySelector('span.list-text'); );
const dateCell = item.querySelector('td.ant-table-row-cell-break-word p'); return items
.map((item) => {
if (titleSpan && dateCell) { const titleSpan = item.querySelector('span.list-text');
const title = titleSpan.textContent?.trim() || ''; const dateCell = item.querySelector(
const dateStr = dateCell.textContent?.trim() || ''; 'td.ant-table-row-cell-break-word p',
);
if (title.length < 5) return null; // Filter noise
if (titleSpan && dateCell) {
// URL is not directly available in the table, need to construct from data-row-key const title = titleSpan.textContent?.trim() || '';
const rowKey = item.getAttribute('data-row-key'); const dateStr = dateCell.textContent?.trim() || '';
const url = rowKey ? `${baseUrl}#/purchase/detail?id=${rowKey}` : '';
if (title.length < 5) return null; // Filter noise
return {
title, // URL is not directly available in the table, need to construct from data-row-key
dateStr, const rowKey = item.getAttribute('data-row-key');
url const url = rowKey
}; ? `${baseUrl}#/purchase/detail?id=${rowKey}`
} : '';
return null;
}).filter(i => i !== null); return {
title,
dateStr,
url,
};
}
return null;
})
.filter((i) => i !== null);
}, this.baseUrl); }, this.baseUrl);
if (pageResults.length === 0) { if (pageResults.length === 0) {
logger.warn(`No results found on page ${currentPage}. Extraction failed.`); logger.warn(
break; `No results found on page ${currentPage}. Extraction failed.`,
);
break;
} }
allResults.push(...pageResults.map(r => ({ allResults.push(
title: r!.title, ...pageResults.map((r) => ({
publishDate: new Date(r!.dateStr), title: r.title,
url: r!.url.replace(/\/\//g, '/') publishDate: new Date(r.dateStr),
}))); url: r.url.replace(/\/\//g, '/'),
})),
);
logger.log(`Extracted ${pageResults.length} items.`); logger.log(`Extracted ${pageResults.length} items.`);
// Pagination: look for the "right" icon SVG // Pagination: look for the "right" icon SVG
@@ -241,34 +285,37 @@ export const ChngCrawler = {
// 点击下一页前保存当前页面状态 // 点击下一页前保存当前页面状态
const currentUrl = page.url(); const currentUrl = page.url();
await nextButton.click(); await nextButton.click();
// 等待页面导航完成 // 等待页面导航完成
try { try {
await page.waitForFunction( await page.waitForFunction(
(oldUrl) => window.location.href !== oldUrl, (oldUrl) => window.location.href !== oldUrl,
{ timeout: 10000 }, { timeout: 10000 },
currentUrl currentUrl,
); );
} catch (e) { } catch {
logger.warn('Navigation timeout, continuing anyway'); logger.warn('Navigation timeout, continuing anyway');
} }
// 等待页面内容加载 // 等待页面内容加载
await new Promise(r => setTimeout(r, 15000)); await new Promise((r) => setTimeout(r, 15000));
currentPage++; currentPage++;
} }
return allResults; return allResults;
} catch (error) { } catch (error) {
logger.error(`Crawl failed: ${error.message}`); const errorMessage =
error instanceof Error ? error.message : String(error);
logger.error(`Crawl failed: ${errorMessage}`);
return allResults; return allResults;
} finally { } finally {
if (page) await page.close(); if (page) await page.close();
} }
}, },
extract() { return []; } extract() {
}; return [];
},
};

View File

@@ -2,7 +2,7 @@ import { CnncecpCrawler } from './cnncecp_target';
import * as puppeteer from 'puppeteer'; import * as puppeteer from 'puppeteer';
// Increase timeout to 60 seconds for network operations // Increase timeout to 60 seconds for network operations
jest.setTimeout(60000*5); jest.setTimeout(60000 * 5);
// 获取代理配置 // 获取代理配置
const getProxyArgs = (): string[] => { const getProxyArgs = (): string[] => {
@@ -29,7 +29,7 @@ describe('CnncecpCrawler Real Site Test', () => {
if (proxyArgs.length > 0) { if (proxyArgs.length > 0) {
console.log('Using proxy:', proxyArgs.join(' ')); console.log('Using proxy:', proxyArgs.join(' '));
} }
browser = await puppeteer.launch({ browser = await puppeteer.launch({
headless: false, // Change to false to see browser UI headless: false, // Change to false to see browser UI
args: ['--no-sandbox', '--disable-setuid-sandbox', ...proxyArgs], args: ['--no-sandbox', '--disable-setuid-sandbox', ...proxyArgs],
@@ -45,13 +45,15 @@ describe('CnncecpCrawler Real Site Test', () => {
it('should visit website and list all found bid information', async () => { it('should visit website and list all found bid information', async () => {
console.log(`\nStarting crawl for: ${CnncecpCrawler.name}`); console.log(`\nStarting crawl for: ${CnncecpCrawler.name}`);
console.log(`Target URL: ${CnncecpCrawler.url}`); console.log(`Target URL: ${CnncecpCrawler.url}`);
const results = await CnncecpCrawler.crawl(browser); const results = await CnncecpCrawler.crawl(browser);
console.log(`\nSuccessfully found ${results.length} items:\n`); console.log(`\nSuccessfully found ${results.length} items:\n`);
console.log('----------------------------------------'); console.log('----------------------------------------');
results.forEach((item, index) => { results.forEach((item, index) => {
console.log(`${index + 1}. [${item.publishDate.toLocaleDateString()}] ${item.title}`); console.log(
`${index + 1}. [${item.publishDate.toLocaleDateString()}] ${item.title}`,
);
console.log(` Link: ${item.url}`); console.log(` Link: ${item.url}`);
console.log('----------------------------------------'); console.log('----------------------------------------');
}); });
@@ -61,13 +63,15 @@ describe('CnncecpCrawler Real Site Test', () => {
expect(Array.isArray(results)).toBeTruthy(); expect(Array.isArray(results)).toBeTruthy();
// Warn but don't fail if site returns 0 items (could be empty or changed structure) // Warn but don't fail if site returns 0 items (could be empty or changed structure)
if (results.length === 0) { if (results.length === 0) {
console.warn('Warning: No items found. Check if website structure has changed or if list is currently empty.'); console.warn(
'Warning: No items found. Check if website structure has changed or if list is currently empty.',
);
} else { } else {
// Check data integrity of first item // Check data integrity of first item
const firstItem = results[0]; const firstItem = results[0];
expect(firstItem.title).toBeTruthy(); expect(firstItem.title).toBeTruthy();
expect(firstItem.url).toMatch(/^https?:\/\//); expect(firstItem.url).toMatch(/^https?:\/\//);
expect(firstItem.publishDate).toBeInstanceOf(Date); expect(firstItem.publishDate).toBeInstanceOf(Date);
} }
}); });
}); });

View File

@@ -11,13 +11,13 @@ async function simulateHumanMouseMovement(page: puppeteer.Page) {
for (let i = 0; i < movements; i++) { for (let i = 0; i < movements; i++) {
const x = Math.floor(Math.random() * viewport.width); const x = Math.floor(Math.random() * viewport.width);
const y = Math.floor(Math.random() * viewport.height); const y = Math.floor(Math.random() * viewport.height);
await page.mouse.move(x, y, { await page.mouse.move(x, y, {
steps: 10 + Math.floor(Math.random() * 20) // 10-30步使移动更平滑 steps: 10 + Math.floor(Math.random() * 20), // 10-30步使移动更平滑
}); });
// 随机停顿 100-500ms // 随机停顿 100-500ms
await new Promise(r => setTimeout(r, 100 + Math.random() * 400)); await new Promise((r) => setTimeout(r, 100 + Math.random() * 400));
} }
} }
@@ -27,23 +27,23 @@ async function simulateHumanScrolling(page: puppeteer.Page) {
for (let i = 0; i < scrollCount; i++) { for (let i = 0; i < scrollCount; i++) {
const scrollDistance = 100 + Math.floor(Math.random() * 400); // 100-500px const scrollDistance = 100 + Math.floor(Math.random() * 400); // 100-500px
await page.evaluate((distance) => { await page.evaluate((distance) => {
window.scrollBy({ window.scrollBy({
top: distance, top: distance,
behavior: 'smooth' behavior: 'smooth',
}); });
}, scrollDistance); }, scrollDistance);
// 随机停顿 500-1500ms // 随机停顿 500-1500ms
await new Promise(r => setTimeout(r, 500 + Math.random() * 1000)); await new Promise((r) => setTimeout(r, 500 + Math.random() * 1000));
} }
// 滚动回顶部 // 滚动回顶部
await page.evaluate(() => { await page.evaluate(() => {
window.scrollTo({ top: 0, behavior: 'smooth' }); window.scrollTo({ top: 0, behavior: 'smooth' });
}); });
await new Promise(r => setTimeout(r, 1000)); await new Promise((r) => setTimeout(r, 1000));
} }
export interface CnncecpResult { export interface CnncecpResult {
@@ -52,12 +52,22 @@ export interface CnncecpResult {
url: string; url: string;
} }
interface CnncecpCrawlerType {
name: string;
url: string;
baseUrl: string;
extract(html: string): CnncecpResult[];
}
export const CnncecpCrawler = { export const CnncecpCrawler = {
name: '中核集团电子采购平台', name: '中核集团电子采购平台',
url: 'https://www.cnncecp.com/xzbgg/index.jhtml', url: 'https://www.cnncecp.com/xzbgg/index.jhtml',
baseUrl: 'https://www.cnncecp.com/', baseUrl: 'https://www.cnncecp.com/',
async crawl(browser: puppeteer.Browser): Promise<CnncecpResult[]> { async crawl(
this: CnncecpCrawlerType,
browser: puppeteer.Browser,
): Promise<CnncecpResult[]> {
const logger = new Logger('CnncecpCrawler'); const logger = new Logger('CnncecpCrawler');
const page = await browser.newPage(); const page = await browser.newPage();
@@ -69,11 +79,15 @@ export const CnncecpCrawler = {
await page.evaluateOnNewDocument(() => { await page.evaluateOnNewDocument(() => {
Object.defineProperty(navigator, 'webdriver', { get: () => false }); Object.defineProperty(navigator, 'webdriver', { get: () => false });
Object.defineProperty(navigator, 'language', { get: () => "zh-CN"}); Object.defineProperty(navigator, 'language', { get: () => 'zh-CN' });
Object.defineProperty(navigator, 'plugins', { get: () => [1,2,3,4,5]}); Object.defineProperty(navigator, 'plugins', {
get: () => [1, 2, 3, 4, 5],
});
}); });
await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36'); await page.setUserAgent(
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36',
);
await page.setViewport({ width: 1920, height: 1080 }); await page.setViewport({ width: 1920, height: 1080 });
const allResults: CnncecpResult[] = []; const allResults: CnncecpResult[] = [];
@@ -87,7 +101,7 @@ export const CnncecpCrawler = {
// 模拟人类行为 // 模拟人类行为
logger.log('Simulating human mouse movements...'); logger.log('Simulating human mouse movements...');
await simulateHumanMouseMovement(page); await simulateHumanMouseMovement(page);
logger.log('Simulating human scrolling...'); logger.log('Simulating human scrolling...');
await simulateHumanScrolling(page); await simulateHumanScrolling(page);
@@ -103,12 +117,14 @@ export const CnncecpCrawler = {
} }
allResults.push(...pageResults); allResults.push(...pageResults);
logger.log(`Extracted ${pageResults.length} items from page ${currentPage}`); logger.log(
`Extracted ${pageResults.length} items from page ${currentPage}`,
);
// 模拟人类行为 - 翻页前 // 模拟人类行为 - 翻页前
logger.log('Simulating human mouse movements before pagination...'); logger.log('Simulating human mouse movements before pagination...');
await simulateHumanMouseMovement(page); await simulateHumanMouseMovement(page);
logger.log('Simulating human scrolling before pagination...'); logger.log('Simulating human scrolling before pagination...');
await simulateHumanScrolling(page); await simulateHumanScrolling(page);
@@ -126,9 +142,13 @@ export const CnncecpCrawler = {
try { try {
// 点击下一页按钮 // 点击下一页按钮
await nextButton.click(); await nextButton.click();
await new Promise(r => setTimeout(r, 3000)); // 等待页面加载 await new Promise((r) => setTimeout(r, 3000)); // 等待页面加载
} catch (navError) { } catch (navError) {
logger.error(`Navigation to page ${currentPage + 1} failed: ${navError.message}`); const navErrorMessage =
navError instanceof Error ? navError.message : String(navError);
logger.error(
`Navigation to page ${currentPage + 1} failed: ${navErrorMessage}`,
);
break; break;
} }
@@ -137,26 +157,27 @@ export const CnncecpCrawler = {
// 模拟人类行为 - 翻页后 // 模拟人类行为 - 翻页后
logger.log('Simulating human mouse movements after pagination...'); logger.log('Simulating human mouse movements after pagination...');
await simulateHumanMouseMovement(page); await simulateHumanMouseMovement(page);
logger.log('Simulating human scrolling after pagination...'); logger.log('Simulating human scrolling after pagination...');
await simulateHumanScrolling(page); await simulateHumanScrolling(page);
// Random delay between pages // Random delay between pages
const delay = Math.floor(Math.random() * (3000 - 1000 + 1)) + 1000; const delay = Math.floor(Math.random() * (3000 - 1000 + 1)) + 1000;
await new Promise(resolve => setTimeout(resolve, delay)); await new Promise((resolve) => setTimeout(resolve, delay));
} }
return allResults; return allResults;
} catch (error) { } catch (error) {
logger.error(`Failed to crawl ${this.name}: ${error.message}`); const errorMessage =
error instanceof Error ? error.message : String(error);
logger.error(`Failed to crawl ${this.name}: ${errorMessage}`);
return allResults; return allResults;
} finally { } finally {
await page.close(); await page.close();
} }
}, },
extract(html: string): CnncecpResult[] { extract(this: CnncecpCrawlerType, html: string): CnncecpResult[] {
const results: CnncecpResult[] = []; const results: CnncecpResult[] = [];
/** /**
* Regex groups for cnncecp.com: * Regex groups for cnncecp.com:
@@ -172,24 +193,25 @@ export const CnncecpCrawler = {
* <a href="https://www.cnncecp.com/xzbgg/1862778.jhtml">中核四0四有限公司2026-2028年度质量流量控制器等采购项目二次变更公告</a> * <a href="https://www.cnncecp.com/xzbgg/1862778.jhtml">中核四0四有限公司2026-2028年度质量流量控制器等采购项目二次变更公告</a>
* </li> * </li>
*/ */
const regex = /<li>[\s\S]*?<span class="Right Gray">\s*(\d{4}-\d{2}-\d{2})\s*<\/span>[\s\S]*?<a[^>]*href="([^"]*)"[^>]*>([^<]*)<\/a>[\s\S]*?<\/li>/gs; const regex =
/<li>[\s\S]*?<span class="Right Gray">\s*(\d{4}-\d{2}-\d{2})\s*<\/span>[\s\S]*?<a[^>]*href="([^"]*)"[^>]*>([^<]*)<\/a>[\s\S]*?<\/li>/gs;
let match; let match: RegExpExecArray | null;
while ((match = regex.exec(html)) !== null) { while ((match = regex.exec(html)) !== null) {
const dateStr = match[1]?.trim(); const dateStr = match[1]?.trim() ?? '';
const url = match[2]?.trim(); const url = match[2]?.trim() ?? '';
const title = match[3]?.trim(); const title = match[3]?.trim() ?? '';
if (title && url) { if (title && url) {
const fullUrl = url.startsWith('http') ? url : this.baseUrl + url; const fullUrl = url.startsWith('http') ? url : this.baseUrl + url;
results.push({ results.push({
title, title,
publishDate: dateStr ? new Date(dateStr) : new Date(), publishDate: dateStr ? new Date(dateStr) : new Date(),
url: fullUrl.replace(/\/\//g, '/') url: fullUrl.replace(/\/\//g, '/'),
}); });
} }
} }
return results; return results;
} },
}; };

View File

@@ -2,7 +2,7 @@ import { CnoocCrawler } from './cnooc_target';
import * as puppeteer from 'puppeteer'; import * as puppeteer from 'puppeteer';
// Increase timeout to 60 seconds for network operations // Increase timeout to 60 seconds for network operations
jest.setTimeout(60000*5); jest.setTimeout(60000 * 5);
// 获取代理配置 // 获取代理配置
const getProxyArgs = (): string[] => { const getProxyArgs = (): string[] => {
@@ -29,7 +29,7 @@ describe('CnoocCrawler Real Site Test', () => {
if (proxyArgs.length > 0) { if (proxyArgs.length > 0) {
console.log('Using proxy:', proxyArgs.join(' ')); console.log('Using proxy:', proxyArgs.join(' '));
} }
browser = await puppeteer.launch({ browser = await puppeteer.launch({
headless: false, // Change to false to see browser UI headless: false, // Change to false to see browser UI
args: ['--no-sandbox', '--disable-setuid-sandbox', ...proxyArgs], args: ['--no-sandbox', '--disable-setuid-sandbox', ...proxyArgs],
@@ -45,13 +45,15 @@ describe('CnoocCrawler Real Site Test', () => {
it('should visit website and list all found bid information', async () => { it('should visit website and list all found bid information', async () => {
console.log(`\nStarting crawl for: ${CnoocCrawler.name}`); console.log(`\nStarting crawl for: ${CnoocCrawler.name}`);
console.log(`Target URL: ${CnoocCrawler.url}`); console.log(`Target URL: ${CnoocCrawler.url}`);
const results = await CnoocCrawler.crawl(browser); const results = await CnoocCrawler.crawl(browser);
console.log(`\nSuccessfully found ${results.length} items:\n`); console.log(`\nSuccessfully found ${results.length} items:\n`);
console.log('----------------------------------------'); console.log('----------------------------------------');
results.forEach((item, index) => { results.forEach((item, index) => {
console.log(`${index + 1}. [${item.publishDate.toLocaleDateString()}] ${item.title}`); console.log(
`${index + 1}. [${item.publishDate.toLocaleDateString()}] ${item.title}`,
);
console.log(` Link: ${item.url}`); console.log(` Link: ${item.url}`);
console.log('----------------------------------------'); console.log('----------------------------------------');
}); });
@@ -61,13 +63,15 @@ describe('CnoocCrawler Real Site Test', () => {
expect(Array.isArray(results)).toBeTruthy(); expect(Array.isArray(results)).toBeTruthy();
// Warn but don't fail if site returns 0 items (could be empty or changed structure) // Warn but don't fail if site returns 0 items (could be empty or changed structure)
if (results.length === 0) { if (results.length === 0) {
console.warn('Warning: No items found. Check if website structure has changed or if list is currently empty.'); console.warn(
'Warning: No items found. Check if website structure has changed or if list is currently empty.',
);
} else { } else {
// Check data integrity of first item // Check data integrity of first item
const firstItem = results[0]; const firstItem = results[0];
expect(firstItem.title).toBeTruthy(); expect(firstItem.title).toBeTruthy();
expect(firstItem.url).toMatch(/^https?:\/\//); expect(firstItem.url).toMatch(/^https?:\/\//);
expect(firstItem.publishDate).toBeInstanceOf(Date); expect(firstItem.publishDate).toBeInstanceOf(Date);
} }
}); });
}); });

View File

@@ -11,13 +11,13 @@ async function simulateHumanMouseMovement(page: puppeteer.Page) {
for (let i = 0; i < movements; i++) { for (let i = 0; i < movements; i++) {
const x = Math.floor(Math.random() * viewport.width); const x = Math.floor(Math.random() * viewport.width);
const y = Math.floor(Math.random() * viewport.height); const y = Math.floor(Math.random() * viewport.height);
await page.mouse.move(x, y, { await page.mouse.move(x, y, {
steps: 10 + Math.floor(Math.random() * 20) // 10-30步使移动更平滑 steps: 10 + Math.floor(Math.random() * 20), // 10-30步使移动更平滑
}); });
// 随机停顿 100-500ms // 随机停顿 100-500ms
await new Promise(r => setTimeout(r, 100 + Math.random() * 400)); await new Promise((r) => setTimeout(r, 100 + Math.random() * 400));
} }
} }
@@ -27,23 +27,23 @@ async function simulateHumanScrolling(page: puppeteer.Page) {
for (let i = 0; i < scrollCount; i++) { for (let i = 0; i < scrollCount; i++) {
const scrollDistance = 100 + Math.floor(Math.random() * 400); // 100-500px const scrollDistance = 100 + Math.floor(Math.random() * 400); // 100-500px
await page.evaluate((distance) => { await page.evaluate((distance) => {
window.scrollBy({ window.scrollBy({
top: distance, top: distance,
behavior: 'smooth' behavior: 'smooth',
}); });
}, scrollDistance); }, scrollDistance);
// 随机停顿 500-1500ms // 随机停顿 500-1500ms
await new Promise(r => setTimeout(r, 500 + Math.random() * 1000)); await new Promise((r) => setTimeout(r, 500 + Math.random() * 1000));
} }
// 滚动回顶部 // 滚动回顶部
await page.evaluate(() => { await page.evaluate(() => {
window.scrollTo({ top: 0, behavior: 'smooth' }); window.scrollTo({ top: 0, behavior: 'smooth' });
}); });
await new Promise(r => setTimeout(r, 1000)); await new Promise((r) => setTimeout(r, 1000));
} }
export interface CnoocResult { export interface CnoocResult {
@@ -52,12 +52,22 @@ export interface CnoocResult {
url: string; url: string;
} }
interface CnoocCrawlerType {
name: string;
url: string;
baseUrl: string;
extract(html: string): CnoocResult[];
}
export const CnoocCrawler = { export const CnoocCrawler = {
name: '中海油招标平台', name: '中海油招标平台',
url: 'https://buy.cnooc.com.cn/cbjyweb/001/001001/moreinfo.html', url: 'https://buy.cnooc.com.cn/cbjyweb/001/001001/moreinfo.html',
baseUrl: 'https://buy.cnooc.com.cn/', baseUrl: 'https://buy.cnooc.com.cn/',
async crawl(browser: puppeteer.Browser): Promise<CnoocResult[]> { async crawl(
this: CnoocCrawlerType,
browser: puppeteer.Browser,
): Promise<CnoocResult[]> {
const logger = new Logger('CnoocCrawler'); const logger = new Logger('CnoocCrawler');
const page = await browser.newPage(); const page = await browser.newPage();
@@ -69,11 +79,15 @@ export const CnoocCrawler = {
await page.evaluateOnNewDocument(() => { await page.evaluateOnNewDocument(() => {
Object.defineProperty(navigator, 'webdriver', { get: () => false }); Object.defineProperty(navigator, 'webdriver', { get: () => false });
Object.defineProperty(navigator, 'language', { get: () => "zh-CN"}); Object.defineProperty(navigator, 'language', { get: () => 'zh-CN' });
Object.defineProperty(navigator, 'plugins', { get: () => [1,2,3,4,5]}); Object.defineProperty(navigator, 'plugins', {
get: () => [1, 2, 3, 4, 5],
});
}); });
await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36'); await page.setUserAgent(
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36',
);
await page.setViewport({ width: 1920, height: 1080 }); await page.setViewport({ width: 1920, height: 1080 });
const allResults: CnoocResult[] = []; const allResults: CnoocResult[] = [];
@@ -87,7 +101,7 @@ export const CnoocCrawler = {
// 模拟人类行为 // 模拟人类行为
logger.log('Simulating human mouse movements...'); logger.log('Simulating human mouse movements...');
await simulateHumanMouseMovement(page); await simulateHumanMouseMovement(page);
logger.log('Simulating human scrolling...'); logger.log('Simulating human scrolling...');
await simulateHumanScrolling(page); await simulateHumanScrolling(page);
@@ -103,12 +117,14 @@ export const CnoocCrawler = {
} }
allResults.push(...pageResults); allResults.push(...pageResults);
logger.log(`Extracted ${pageResults.length} items from page ${currentPage}`); logger.log(
`Extracted ${pageResults.length} items from page ${currentPage}`,
);
// 模拟人类行为 - 翻页前 // 模拟人类行为 - 翻页前
logger.log('Simulating human mouse movements before pagination...'); logger.log('Simulating human mouse movements before pagination...');
await simulateHumanMouseMovement(page); await simulateHumanMouseMovement(page);
logger.log('Simulating human scrolling before pagination...'); logger.log('Simulating human scrolling before pagination...');
await simulateHumanScrolling(page); await simulateHumanScrolling(page);
@@ -127,9 +143,13 @@ export const CnoocCrawler = {
try { try {
// 点击下一页按钮 // 点击下一页按钮
await nextButton.click(); await nextButton.click();
await new Promise(r => setTimeout(r, 3000)); // 等待页面加载 await new Promise((r) => setTimeout(r, 3000)); // 等待页面加载
} catch (navError) { } catch (navError) {
logger.error(`Navigation to page ${currentPage + 1} failed: ${navError.message}`); const navErrorMessage =
navError instanceof Error ? navError.message : String(navError);
logger.error(
`Navigation to page ${currentPage + 1} failed: ${navErrorMessage}`,
);
break; break;
} }
@@ -138,26 +158,27 @@ export const CnoocCrawler = {
// 模拟人类行为 - 翻页后 // 模拟人类行为 - 翻页后
logger.log('Simulating human mouse movements after pagination...'); logger.log('Simulating human mouse movements after pagination...');
await simulateHumanMouseMovement(page); await simulateHumanMouseMovement(page);
logger.log('Simulating human scrolling after pagination...'); logger.log('Simulating human scrolling after pagination...');
await simulateHumanScrolling(page); await simulateHumanScrolling(page);
// Random delay between pages // Random delay between pages
const delay = Math.floor(Math.random() * (3000 - 1000 + 1)) + 1000; const delay = Math.floor(Math.random() * (3000 - 1000 + 1)) + 1000;
await new Promise(resolve => setTimeout(resolve, delay)); await new Promise((resolve) => setTimeout(resolve, delay));
} }
return allResults; return allResults;
} catch (error) { } catch (error) {
logger.error(`Failed to crawl ${this.name}: ${error.message}`); const errorMessage =
error instanceof Error ? error.message : String(error);
logger.error(`Failed to crawl ${this.name}: ${errorMessage}`);
return allResults; return allResults;
} finally { } finally {
await page.close(); await page.close();
} }
}, },
extract(html: string): CnoocResult[] { extract(this: CnoocCrawlerType, html: string): CnoocResult[] {
const results: CnoocResult[] = []; const results: CnoocResult[] = [];
/** /**
* Regex groups for buy.cnooc.com.cn: * Regex groups for buy.cnooc.com.cn:
@@ -173,24 +194,25 @@ export const CnoocCrawler = {
* <span class="now-span" style="width:100px">2026-01-12</span> * <span class="now-span" style="width:100px">2026-01-12</span>
* </li> * </li>
*/ */
const regex = /<li class="now-hd-items clearfix">[\s\S]*?<a[^>]*href="([^"]*)"[^>]*>[\s\S]*?<font[^>]*>([^<]*)<\/font>[\s\S]*?<span class="now-span"[^>]*>\s*(\d{4}-\d{2}-\d{2})\s*<\/span>[\s\S]*?<\/li>/gs; const regex =
/<li class="now-hd-items clearfix">[\s\S]*?<a[^>]*href="([^"]*)"[^>]*>[\s\S]*?<font[^>]*>([^<]*)<\/font>[\s\S]*?<span class="now-span"[^>]*>\s*(\d{4}-\d{2}-\d{2})\s*<\/span>[\s\S]*?<\/li>/gs;
let match; let match: RegExpExecArray | null;
while ((match = regex.exec(html)) !== null) { while ((match = regex.exec(html)) !== null) {
const url = match[1]?.trim(); const url = match[1]?.trim() ?? '';
const title = match[2]?.trim(); const title = match[2]?.trim() ?? '';
const dateStr = match[3]?.trim(); const dateStr = match[3]?.trim() ?? '';
if (title && url) { if (title && url) {
const fullUrl = url.startsWith('http') ? url : this.baseUrl + url; const fullUrl = url.startsWith('http') ? url : this.baseUrl + url;
results.push({ results.push({
title, title,
publishDate: dateStr ? new Date(dateStr) : new Date(), publishDate: dateStr ? new Date(dateStr) : new Date(),
url: fullUrl.replace(/\/\//g, '/') url: fullUrl.replace(/\/\//g, '/'),
}); });
} }
} }
return results; return results;
} },
}; };

View File

@@ -2,7 +2,7 @@ import { EpsCrawler } from './eps_target';
import * as puppeteer from 'puppeteer'; import * as puppeteer from 'puppeteer';
// Increase timeout to 60 seconds for network operations // Increase timeout to 60 seconds for network operations
jest.setTimeout(60000*5); jest.setTimeout(60000 * 5);
// 获取代理配置 // 获取代理配置
const getProxyArgs = (): string[] => { const getProxyArgs = (): string[] => {
@@ -29,7 +29,7 @@ describe('EpsCrawler Real Site Test', () => {
if (proxyArgs.length > 0) { if (proxyArgs.length > 0) {
console.log('Using proxy:', proxyArgs.join(' ')); console.log('Using proxy:', proxyArgs.join(' '));
} }
browser = await puppeteer.launch({ browser = await puppeteer.launch({
headless: false, // Change to false to see browser UI headless: false, // Change to false to see browser UI
args: ['--no-sandbox', '--disable-setuid-sandbox', ...proxyArgs], args: ['--no-sandbox', '--disable-setuid-sandbox', ...proxyArgs],
@@ -45,13 +45,15 @@ describe('EpsCrawler Real Site Test', () => {
it('should visit website and list all found bid information', async () => { it('should visit website and list all found bid information', async () => {
console.log(`\nStarting crawl for: ${EpsCrawler.name}`); console.log(`\nStarting crawl for: ${EpsCrawler.name}`);
console.log(`Target URL: ${EpsCrawler.url}`); console.log(`Target URL: ${EpsCrawler.url}`);
const results = await EpsCrawler.crawl(browser); const results = await EpsCrawler.crawl(browser);
console.log(`\nSuccessfully found ${results.length} items:\n`); console.log(`\nSuccessfully found ${results.length} items:\n`);
console.log('----------------------------------------'); console.log('----------------------------------------');
results.forEach((item, index) => { results.forEach((item, index) => {
console.log(`${index + 1}. [${item.publishDate.toLocaleDateString()}] ${item.title}`); console.log(
`${index + 1}. [${item.publishDate.toLocaleDateString()}] ${item.title}`,
);
console.log(` Link: ${item.url}`); console.log(` Link: ${item.url}`);
console.log('----------------------------------------'); console.log('----------------------------------------');
}); });
@@ -61,13 +63,15 @@ describe('EpsCrawler Real Site Test', () => {
expect(Array.isArray(results)).toBeTruthy(); expect(Array.isArray(results)).toBeTruthy();
// Warn but don't fail if site returns 0 items (could be empty or changed structure) // Warn but don't fail if site returns 0 items (could be empty or changed structure)
if (results.length === 0) { if (results.length === 0) {
console.warn('Warning: No items found. Check if website structure has changed or if list is currently empty.'); console.warn(
'Warning: No items found. Check if website structure has changed or if list is currently empty.',
);
} else { } else {
// Check data integrity of first item // Check data integrity of first item
const firstItem = results[0]; const firstItem = results[0];
expect(firstItem.title).toBeTruthy(); expect(firstItem.title).toBeTruthy();
expect(firstItem.url).toMatch(/^https?:\/\//); expect(firstItem.url).toMatch(/^https?:\/\//);
expect(firstItem.publishDate).toBeInstanceOf(Date); expect(firstItem.publishDate).toBeInstanceOf(Date);
} }
}); });
}); });

View File

@@ -11,13 +11,13 @@ async function simulateHumanMouseMovement(page: puppeteer.Page) {
for (let i = 0; i < movements; i++) { for (let i = 0; i < movements; i++) {
const x = Math.floor(Math.random() * viewport.width); const x = Math.floor(Math.random() * viewport.width);
const y = Math.floor(Math.random() * viewport.height); const y = Math.floor(Math.random() * viewport.height);
await page.mouse.move(x, y, { await page.mouse.move(x, y, {
steps: 10 + Math.floor(Math.random() * 20) // 10-30步使移动更平滑 steps: 10 + Math.floor(Math.random() * 20), // 10-30步使移动更平滑
}); });
// 随机停顿 100-500ms // 随机停顿 100-500ms
await new Promise(r => setTimeout(r, 100 + Math.random() * 400)); await new Promise((r) => setTimeout(r, 100 + Math.random() * 400));
} }
} }
@@ -27,23 +27,23 @@ async function simulateHumanScrolling(page: puppeteer.Page) {
for (let i = 0; i < scrollCount; i++) { for (let i = 0; i < scrollCount; i++) {
const scrollDistance = 100 + Math.floor(Math.random() * 400); // 100-500px const scrollDistance = 100 + Math.floor(Math.random() * 400); // 100-500px
await page.evaluate((distance) => { await page.evaluate((distance) => {
window.scrollBy({ window.scrollBy({
top: distance, top: distance,
behavior: 'smooth' behavior: 'smooth',
}); });
}, scrollDistance); }, scrollDistance);
// 随机停顿 500-1500ms // 随机停顿 500-1500ms
await new Promise(r => setTimeout(r, 500 + Math.random() * 1000)); await new Promise((r) => setTimeout(r, 500 + Math.random() * 1000));
} }
// 滚动回顶部 // 滚动回顶部
await page.evaluate(() => { await page.evaluate(() => {
window.scrollTo({ top: 0, behavior: 'smooth' }); window.scrollTo({ top: 0, behavior: 'smooth' });
}); });
await new Promise(r => setTimeout(r, 1000)); await new Promise((r) => setTimeout(r, 1000));
} }
export interface EpsResult { export interface EpsResult {
@@ -52,12 +52,22 @@ export interface EpsResult {
url: string; url: string;
} }
interface EpsCrawlerType {
name: string;
url: string;
baseUrl: string;
extract(html: string): EpsResult[];
}
export const EpsCrawler = { export const EpsCrawler = {
name: '中国三峡集团电子商务平台', name: '中国三峡集团电子商务平台',
url: 'https://eps.ctg.com.cn/cms/channel/1ywgg1/index.htm', url: 'https://eps.ctg.com.cn/cms/channel/1ywgg1/index.htm',
baseUrl: 'https://eps.ctg.com.cn/', baseUrl: 'https://eps.ctg.com.cn/',
async crawl(browser: puppeteer.Browser): Promise<EpsResult[]> { async crawl(
this: EpsCrawlerType,
browser: puppeteer.Browser,
): Promise<EpsResult[]> {
const logger = new Logger('EpsCrawler'); const logger = new Logger('EpsCrawler');
const page = await browser.newPage(); const page = await browser.newPage();
@@ -69,11 +79,15 @@ export const EpsCrawler = {
await page.evaluateOnNewDocument(() => { await page.evaluateOnNewDocument(() => {
Object.defineProperty(navigator, 'webdriver', { get: () => false }); Object.defineProperty(navigator, 'webdriver', { get: () => false });
Object.defineProperty(navigator, 'language', { get: () => "zh-CN"}); Object.defineProperty(navigator, 'language', { get: () => 'zh-CN' });
Object.defineProperty(navigator, 'plugins', { get: () => [1,2,3,4,5]}); Object.defineProperty(navigator, 'plugins', {
get: () => [1, 2, 3, 4, 5],
});
}); });
await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36'); await page.setUserAgent(
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36',
);
await page.setViewport({ width: 1920, height: 1080 }); await page.setViewport({ width: 1920, height: 1080 });
const allResults: EpsResult[] = []; const allResults: EpsResult[] = [];
@@ -87,7 +101,7 @@ export const EpsCrawler = {
// 模拟人类行为 // 模拟人类行为
logger.log('Simulating human mouse movements...'); logger.log('Simulating human mouse movements...');
await simulateHumanMouseMovement(page); await simulateHumanMouseMovement(page);
logger.log('Simulating human scrolling...'); logger.log('Simulating human scrolling...');
await simulateHumanScrolling(page); await simulateHumanScrolling(page);
@@ -103,12 +117,14 @@ export const EpsCrawler = {
} }
allResults.push(...pageResults); allResults.push(...pageResults);
logger.log(`Extracted ${pageResults.length} items from page ${currentPage}`); logger.log(
`Extracted ${pageResults.length} items from page ${currentPage}`,
);
// 模拟人类行为 - 翻页前 // 模拟人类行为 - 翻页前
logger.log('Simulating human mouse movements before pagination...'); logger.log('Simulating human mouse movements before pagination...');
await simulateHumanMouseMovement(page); await simulateHumanMouseMovement(page);
logger.log('Simulating human scrolling before pagination...'); logger.log('Simulating human scrolling before pagination...');
await simulateHumanScrolling(page); await simulateHumanScrolling(page);
@@ -127,9 +143,13 @@ export const EpsCrawler = {
try { try {
// 点击下一页按钮,等待页面更新 // 点击下一页按钮,等待页面更新
await nextButton.click(); await nextButton.click();
await new Promise(r => setTimeout(r, 3000)); // 等待页面加载 await new Promise((r) => setTimeout(r, 3000)); // 等待页面加载
} catch (navError) { } catch (navError) {
logger.error(`Navigation to page ${currentPage + 1} failed: ${navError.message}`); const navErrorMessage =
navError instanceof Error ? navError.message : String(navError);
logger.error(
`Navigation to page ${currentPage + 1} failed: ${navErrorMessage}`,
);
break; break;
} }
@@ -138,26 +158,27 @@ export const EpsCrawler = {
// 模拟人类行为 - 翻页后 // 模拟人类行为 - 翻页后
logger.log('Simulating human mouse movements after pagination...'); logger.log('Simulating human mouse movements after pagination...');
await simulateHumanMouseMovement(page); await simulateHumanMouseMovement(page);
logger.log('Simulating human scrolling after pagination...'); logger.log('Simulating human scrolling after pagination...');
await simulateHumanScrolling(page); await simulateHumanScrolling(page);
// Random delay between pages // Random delay between pages
const delay = Math.floor(Math.random() * (3000 - 1000 + 1)) + 1000; const delay = Math.floor(Math.random() * (3000 - 1000 + 1)) + 1000;
await new Promise(resolve => setTimeout(resolve, delay)); await new Promise((resolve) => setTimeout(resolve, delay));
} }
return allResults; return allResults;
} catch (error) { } catch (error) {
logger.error(`Failed to crawl ${this.name}: ${error.message}`); const errorMessage =
error instanceof Error ? error.message : String(error);
logger.error(`Failed to crawl ${this.name}: ${errorMessage}`);
return allResults; return allResults;
} finally { } finally {
await page.close(); await page.close();
} }
}, },
/**
 * Extracts announcement entries from the HTML of one listing page.
 *
 * An entry is an `<li name="li_name">` element wrapping an `<a>` that carries
 * `href` and `title` attributes and an `<em>` holding a `YYYY-MM-DD` date.
 * Links that do not start with "http" are prefixed with `this.baseUrl`.
 *
 * @param html Raw HTML of the listing page.
 * @returns One record per matched entry that has a non-empty title and URL;
 *   an empty array when nothing matches.
 */
extract(this: EpsCrawlerType, html: string): EpsResult[] {
  const results: EpsResult[] = [];

  // Capture groups: 1 = href, 2 = title attribute, 3 = date (YYYY-MM-DD).
  const regex =
    /<li[^>]*name="li_name"[^>]*>[\s\S]*?<a[^>]*href="([^"]*)"[^>]*title="([^"]*)"[^>]*>[\s\S]*?<em>\s*(\d{4}-\d{2}-\d{2})\s*<\/em>[\s\S]*?<\/a>[\s\S]*?<\/li>/gs;

  let match: RegExpExecArray | null;
  while ((match = regex.exec(html)) !== null) {
    const url = match[1]?.trim() ?? '';
    const title = match[2]?.trim() ?? '';
    const dateStr = match[3]?.trim() ?? '';

    // Entries without a title or link are not usable; skip them.
    if (!title || !url) {
      continue;
    }

    const fullUrl = url.startsWith('http') ? url : this.baseUrl + url;
    results.push({
      title,
      publishDate: dateStr ? new Date(dateStr) : new Date(),
      // Collapse repeated slashes (baseUrl ends with "/" and an href may start
      // with one) but keep the "//" that follows the scheme; collapsing that
      // too would yield an invalid "https:/host/..." URL.
      url: fullUrl.replace(/(?<!:)\/{2,}/g, '/'),
    });
  }

  return results;
},
}; };

View File

@@ -2,7 +2,7 @@ import { EspicCrawler } from './espic_target';
import * as puppeteer from 'puppeteer'; import * as puppeteer from 'puppeteer';
// Increase timeout to 5 minutes (60000 ms * 5) for network operations
jest.setTimeout(60000*5); jest.setTimeout(60000 * 5);
// 获取代理配置 // 获取代理配置
const getProxyArgs = (): string[] => { const getProxyArgs = (): string[] => {
@@ -29,7 +29,7 @@ describe('EspicCrawler Real Site Test', () => {
if (proxyArgs.length > 0) { if (proxyArgs.length > 0) {
console.log('Using proxy:', proxyArgs.join(' ')); console.log('Using proxy:', proxyArgs.join(' '));
} }
browser = await puppeteer.launch({ browser = await puppeteer.launch({
headless: false, // Change to false to see browser UI headless: false, // Change to false to see browser UI
args: ['--no-sandbox', '--disable-setuid-sandbox', ...proxyArgs], args: ['--no-sandbox', '--disable-setuid-sandbox', ...proxyArgs],
@@ -45,13 +45,15 @@ describe('EspicCrawler Real Site Test', () => {
it('should visit website and list all found bid information', async () => { it('should visit website and list all found bid information', async () => {
console.log(`\nStarting crawl for: ${EspicCrawler.name}`); console.log(`\nStarting crawl for: ${EspicCrawler.name}`);
console.log(`Target URL: ${EspicCrawler.getUrl()}`); console.log(`Target URL: ${EspicCrawler.getUrl()}`);
const results = await EspicCrawler.crawl(browser); const results = await EspicCrawler.crawl(browser);
console.log(`\nSuccessfully found ${results.length} items:\n`); console.log(`\nSuccessfully found ${results.length} items:\n`);
console.log('----------------------------------------'); console.log('----------------------------------------');
results.forEach((item, index) => { results.forEach((item, index) => {
console.log(`${index + 1}. [${item.publishDate.toLocaleDateString()}] ${item.title}`); console.log(
`${index + 1}. [${item.publishDate.toLocaleDateString()}] ${item.title}`,
);
console.log(` Link: ${item.url}`); console.log(` Link: ${item.url}`);
console.log('----------------------------------------'); console.log('----------------------------------------');
}); });
@@ -61,13 +63,15 @@ describe('EspicCrawler Real Site Test', () => {
expect(Array.isArray(results)).toBeTruthy(); expect(Array.isArray(results)).toBeTruthy();
// Warn but don't fail if site returns 0 items (could be empty or changed structure) // Warn but don't fail if site returns 0 items (could be empty or changed structure)
if (results.length === 0) { if (results.length === 0) {
console.warn('Warning: No items found. Check if website structure has changed or if list is currently empty.'); console.warn(
'Warning: No items found. Check if website structure has changed or if list is currently empty.',
);
} else { } else {
// Check data integrity of first item // Check data integrity of first item
const firstItem = results[0]; const firstItem = results[0];
expect(firstItem.title).toBeTruthy(); expect(firstItem.title).toBeTruthy();
expect(firstItem.url).toMatch(/^https?:\/\//); expect(firstItem.url).toMatch(/^https?:\/\//);
expect(firstItem.publishDate).toBeInstanceOf(Date); expect(firstItem.publishDate).toBeInstanceOf(Date);
} }
}); });
}); });

View File

@@ -11,13 +11,13 @@ async function simulateHumanMouseMovement(page: puppeteer.Page) {
for (let i = 0; i < movements; i++) { for (let i = 0; i < movements; i++) {
const x = Math.floor(Math.random() * viewport.width); const x = Math.floor(Math.random() * viewport.width);
const y = Math.floor(Math.random() * viewport.height); const y = Math.floor(Math.random() * viewport.height);
await page.mouse.move(x, y, { await page.mouse.move(x, y, {
steps: 10 + Math.floor(Math.random() * 20) // 10-30步使移动更平滑 steps: 10 + Math.floor(Math.random() * 20), // 10-30步使移动更平滑
}); });
// 随机停顿 100-500ms // 随机停顿 100-500ms
await new Promise(r => setTimeout(r, 100 + Math.random() * 400)); await new Promise((r) => setTimeout(r, 100 + Math.random() * 400));
} }
} }
@@ -27,23 +27,23 @@ async function simulateHumanScrolling(page: puppeteer.Page) {
for (let i = 0; i < scrollCount; i++) { for (let i = 0; i < scrollCount; i++) {
const scrollDistance = 100 + Math.floor(Math.random() * 400); // 100-500px const scrollDistance = 100 + Math.floor(Math.random() * 400); // 100-500px
await page.evaluate((distance) => { await page.evaluate((distance) => {
window.scrollBy({ window.scrollBy({
top: distance, top: distance,
behavior: 'smooth' behavior: 'smooth',
}); });
}, scrollDistance); }, scrollDistance);
// 随机停顿 500-1500ms // 随机停顿 500-1500ms
await new Promise(r => setTimeout(r, 500 + Math.random() * 1000)); await new Promise((r) => setTimeout(r, 500 + Math.random() * 1000));
} }
// 滚动回顶部 // 滚动回顶部
await page.evaluate(() => { await page.evaluate(() => {
window.scrollTo({ top: 0, behavior: 'smooth' }); window.scrollTo({ top: 0, behavior: 'smooth' });
}); });
await new Promise(r => setTimeout(r, 1000)); await new Promise((r) => setTimeout(r, 1000));
} }
export interface EspicResult { export interface EspicResult {
@@ -52,12 +52,19 @@ export interface EspicResult {
url: string; url: string;
} }
/**
 * `this` type used by the EspicCrawler object's methods (`getUrl`, `crawl`
 * and `extract` all declare `this: EspicCrawlerType`).
 */
interface EspicCrawlerType {
  /** Parses the HTML of a listing page into result records. */
  extract(html: string): EspicResult[];
  /** Builds the listing URL for the given page number (optional argument). */
  getUrl(page?: number): string;
  /** Prefix that `extract` prepends to links that do not start with "http". */
  baseUrl: string;
  /** Human-readable site name; included in the crawl failure log message. */
  name: string;
}
export const EspicCrawler = { export const EspicCrawler = {
name: '电能e招采平台国电投', name: '电能e招采平台国电投',
baseUrl: 'https://ebid.espic.com.cn/', baseUrl: 'https://ebid.espic.com.cn/',
// 生成动态 URL使用当前日期 // 生成动态 URL使用当前日期
getUrl(page: number = 1): string { getUrl(this: EspicCrawlerType, page: number = 1): string {
const now = new Date(); const now = new Date();
const year = now.getFullYear(); const year = now.getFullYear();
const month = now.getMonth() + 1; // 月份从0开始 const month = now.getMonth() + 1; // 月份从0开始
@@ -66,7 +73,10 @@ export const EspicCrawler = {
return `https://ebid.espic.com.cn/newgdtcms//category/iframe.html?dates=300&categoryId=2&tenderMethod=01&tabName=&page=${page}&time=${timeStr}`; return `https://ebid.espic.com.cn/newgdtcms//category/iframe.html?dates=300&categoryId=2&tenderMethod=01&tabName=&page=${page}&time=${timeStr}`;
}, },
async crawl(browser: puppeteer.Browser): Promise<EspicResult[]> { async crawl(
this: EspicCrawlerType,
browser: puppeteer.Browser,
): Promise<EspicResult[]> {
const logger = new Logger('EspicCrawler'); const logger = new Logger('EspicCrawler');
const page = await browser.newPage(); const page = await browser.newPage();
@@ -78,11 +88,15 @@ export const EspicCrawler = {
await page.evaluateOnNewDocument(() => { await page.evaluateOnNewDocument(() => {
Object.defineProperty(navigator, 'webdriver', { get: () => false }); Object.defineProperty(navigator, 'webdriver', { get: () => false });
Object.defineProperty(navigator, 'language', { get: () => "zh-CN"}); Object.defineProperty(navigator, 'language', { get: () => 'zh-CN' });
Object.defineProperty(navigator, 'plugins', { get: () => [1,2,3,4,5]}); Object.defineProperty(navigator, 'plugins', {
get: () => [1, 2, 3, 4, 5],
});
}); });
await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36'); await page.setUserAgent(
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36',
);
await page.setViewport({ width: 1920, height: 1080 }); await page.setViewport({ width: 1920, height: 1080 });
const allResults: EspicResult[] = []; const allResults: EspicResult[] = [];
@@ -100,15 +114,18 @@ export const EspicCrawler = {
() => { () => {
// 检查是否已经通过验证(页面不再是 WAF 页面) // 检查是否已经通过验证(页面不再是 WAF 页面)
const bodyText = document.body?.textContent || ''; const bodyText = document.body?.textContent || '';
return !bodyText.includes('人机识别检测') && !bodyText.includes('WEB 应用防火墙'); return (
!bodyText.includes('人机识别检测') &&
!bodyText.includes('WEB 应用防火墙')
);
}, },
{ timeout: 30000 } { timeout: 30000 },
); );
// 模拟人类行为 // 模拟人类行为
logger.log('Simulating human mouse movements...'); logger.log('Simulating human mouse movements...');
await simulateHumanMouseMovement(page); await simulateHumanMouseMovement(page);
logger.log('Simulating human scrolling...'); logger.log('Simulating human scrolling...');
await simulateHumanScrolling(page); await simulateHumanScrolling(page);
@@ -124,12 +141,14 @@ export const EspicCrawler = {
} }
allResults.push(...pageResults); allResults.push(...pageResults);
logger.log(`Extracted ${pageResults.length} items from page ${currentPage}`); logger.log(
`Extracted ${pageResults.length} items from page ${currentPage}`,
);
// 模拟人类行为 - 翻页前 // 模拟人类行为 - 翻页前
logger.log('Simulating human mouse movements before pagination...'); logger.log('Simulating human mouse movements before pagination...');
await simulateHumanMouseMovement(page); await simulateHumanMouseMovement(page);
logger.log('Simulating human scrolling before pagination...'); logger.log('Simulating human scrolling before pagination...');
await simulateHumanScrolling(page); await simulateHumanScrolling(page);
@@ -141,7 +160,7 @@ export const EspicCrawler = {
'a[aria-label="Next"]', 'a[aria-label="Next"]',
'a.next', 'a.next',
'li.next a', 'li.next a',
'a.layui-laypage-next:not(.layui-disabled)' 'a.layui-laypage-next:not(.layui-disabled)',
]; ];
let nextButton: puppeteer.ElementHandle<Element> | null = null; let nextButton: puppeteer.ElementHandle<Element> | null = null;
@@ -149,7 +168,7 @@ export const EspicCrawler = {
try { try {
nextButton = await page.$(selector); nextButton = await page.$(selector);
if (nextButton) break; if (nextButton) break;
} catch (e) { } catch {
// 继续尝试下一个选择器 // 继续尝试下一个选择器
} }
} }
@@ -164,9 +183,13 @@ export const EspicCrawler = {
try { try {
// 点击下一页按钮 // 点击下一页按钮
await nextButton.click(); await nextButton.click();
await new Promise(r => setTimeout(r, 3000)); // 等待页面加载 await new Promise((r) => setTimeout(r, 3000)); // 等待页面加载
} catch (navError) { } catch (navError) {
logger.error(`Navigation to page ${currentPage + 1} failed: ${navError.message}`); const navErrorMessage =
navError instanceof Error ? navError.message : String(navError);
logger.error(
`Navigation to page ${currentPage + 1} failed: ${navErrorMessage}`,
);
break; break;
} }
@@ -175,26 +198,27 @@ export const EspicCrawler = {
// 模拟人类行为 - 翻页后 // 模拟人类行为 - 翻页后
logger.log('Simulating human mouse movements after pagination...'); logger.log('Simulating human mouse movements after pagination...');
await simulateHumanMouseMovement(page); await simulateHumanMouseMovement(page);
logger.log('Simulating human scrolling after pagination...'); logger.log('Simulating human scrolling after pagination...');
await simulateHumanScrolling(page); await simulateHumanScrolling(page);
// Random delay between pages // Random delay between pages
const delay = Math.floor(Math.random() * (3000 - 1000 + 1)) + 1000; const delay = Math.floor(Math.random() * (3000 - 1000 + 1)) + 1000;
await new Promise(resolve => setTimeout(resolve, delay)); await new Promise((resolve) => setTimeout(resolve, delay));
} }
return allResults; return allResults;
} catch (error) { } catch (error) {
logger.error(`Failed to crawl ${this.name}: ${error.message}`); const errorMessage =
error instanceof Error ? error.message : String(error);
logger.error(`Failed to crawl ${this.name}: ${errorMessage}`);
return allResults; return allResults;
} finally { } finally {
await page.close(); await page.close();
} }
}, },
/**
 * Extracts announcement entries from the HTML of one listing page.
 *
 * An entry is an `<li>` element wrapping an `<a>` that carries `href` and
 * `title` attributes and a `<div class="newsDate">` whose inner `<div>` holds
 * a `YYYY-MM-DD` date. Links that do not start with "http" are prefixed with
 * `this.baseUrl`.
 *
 * @param html Raw HTML of the listing page.
 * @returns One record per matched entry that has a non-empty title and URL;
 *   an empty array when nothing matches.
 */
extract(this: EspicCrawlerType, html: string): EspicResult[] {
  const results: EspicResult[] = [];

  // Capture groups: 1 = href, 2 = title attribute, 3 = date (YYYY-MM-DD).
  const regex =
    /<li>[\s\S]*?<a[^>]*href="([^"]*)"[^>]*title="([^"]*)"[^>]*>[\s\S]*?<div class="newsDate">[\s\S]*?<div>\s*(\d{4}-\d{2}-\d{2})\s*<\/div>[\s\S]*?<\/div>[\s\S]*?<\/a>[\s\S]*?<\/li>/gs;

  let match: RegExpExecArray | null;
  while ((match = regex.exec(html)) !== null) {
    const url = match[1]?.trim() ?? '';
    const title = match[2]?.trim() ?? '';
    const dateStr = match[3]?.trim() ?? '';

    // Entries without a title or link are not usable; skip them.
    if (!title || !url) {
      continue;
    }

    const fullUrl = url.startsWith('http') ? url : this.baseUrl + url;
    results.push({
      title,
      publishDate: dateStr ? new Date(dateStr) : new Date(),
      // Collapse repeated slashes (baseUrl ends with "/" and an href may start
      // with one) but keep the "//" that follows the scheme; collapsing that
      // too would yield an invalid "https:/host/..." URL.
      url: fullUrl.replace(/(?<!:)\/{2,}/g, '/'),
    });
  }

  return results;
},
}; };

View File

@@ -2,7 +2,7 @@ import { PowerbeijingCrawler } from './powerbeijing_target';
import * as puppeteer from 'puppeteer'; import * as puppeteer from 'puppeteer';
// Increase timeout to 5 minutes (60000 ms * 5) for network operations
jest.setTimeout(60000*5); jest.setTimeout(60000 * 5);
// 获取代理配置 // 获取代理配置
const getProxyArgs = (): string[] => { const getProxyArgs = (): string[] => {
@@ -29,7 +29,7 @@ describe('PowerbeijingCrawler Real Site Test', () => {
if (proxyArgs.length > 0) { if (proxyArgs.length > 0) {
console.log('Using proxy:', proxyArgs.join(' ')); console.log('Using proxy:', proxyArgs.join(' '));
} }
browser = await puppeteer.launch({ browser = await puppeteer.launch({
headless: false, // Change to false to see browser UI headless: false, // Change to false to see browser UI
args: ['--no-sandbox', '--disable-setuid-sandbox', ...proxyArgs], args: ['--no-sandbox', '--disable-setuid-sandbox', ...proxyArgs],
@@ -45,13 +45,15 @@ describe('PowerbeijingCrawler Real Site Test', () => {
it('should visit website and list all found bid information', async () => { it('should visit website and list all found bid information', async () => {
console.log(`\nStarting crawl for: ${PowerbeijingCrawler.name}`); console.log(`\nStarting crawl for: ${PowerbeijingCrawler.name}`);
console.log(`Target URL: ${PowerbeijingCrawler.url}`); console.log(`Target URL: ${PowerbeijingCrawler.url}`);
const results = await PowerbeijingCrawler.crawl(browser); const results = await PowerbeijingCrawler.crawl(browser);
console.log(`\nSuccessfully found ${results.length} items:\n`); console.log(`\nSuccessfully found ${results.length} items:\n`);
console.log('----------------------------------------'); console.log('----------------------------------------');
results.forEach((item, index) => { results.forEach((item, index) => {
console.log(`${index + 1}. [${item.publishDate.toLocaleDateString()}] ${item.title}`); console.log(
`${index + 1}. [${item.publishDate.toLocaleDateString()}] ${item.title}`,
);
console.log(` Link: ${item.url}`); console.log(` Link: ${item.url}`);
console.log('----------------------------------------'); console.log('----------------------------------------');
}); });
@@ -61,13 +63,15 @@ describe('PowerbeijingCrawler Real Site Test', () => {
expect(Array.isArray(results)).toBeTruthy(); expect(Array.isArray(results)).toBeTruthy();
// Warn but don't fail if site returns 0 items (could be empty or changed structure) // Warn but don't fail if site returns 0 items (could be empty or changed structure)
if (results.length === 0) { if (results.length === 0) {
console.warn('Warning: No items found. Check if website structure has changed or if list is currently empty.'); console.warn(
'Warning: No items found. Check if website structure has changed or if list is currently empty.',
);
} else { } else {
// Check data integrity of first item // Check data integrity of first item
const firstItem = results[0]; const firstItem = results[0];
expect(firstItem.title).toBeTruthy(); expect(firstItem.title).toBeTruthy();
expect(firstItem.url).toMatch(/^https?:\/\//); expect(firstItem.url).toMatch(/^https?:\/\//);
expect(firstItem.publishDate).toBeInstanceOf(Date); expect(firstItem.publishDate).toBeInstanceOf(Date);
} }
}); });
}); });

View File

@@ -11,13 +11,13 @@ async function simulateHumanMouseMovement(page: puppeteer.Page) {
for (let i = 0; i < movements; i++) { for (let i = 0; i < movements; i++) {
const x = Math.floor(Math.random() * viewport.width); const x = Math.floor(Math.random() * viewport.width);
const y = Math.floor(Math.random() * viewport.height); const y = Math.floor(Math.random() * viewport.height);
await page.mouse.move(x, y, { await page.mouse.move(x, y, {
steps: 10 + Math.floor(Math.random() * 20) // 10-30步使移动更平滑 steps: 10 + Math.floor(Math.random() * 20), // 10-30步使移动更平滑
}); });
// 随机停顿 100-500ms // 随机停顿 100-500ms
await new Promise(r => setTimeout(r, 100 + Math.random() * 400)); await new Promise((r) => setTimeout(r, 100 + Math.random() * 400));
} }
} }
@@ -27,23 +27,23 @@ async function simulateHumanScrolling(page: puppeteer.Page) {
for (let i = 0; i < scrollCount; i++) { for (let i = 0; i < scrollCount; i++) {
const scrollDistance = 100 + Math.floor(Math.random() * 400); // 100-500px const scrollDistance = 100 + Math.floor(Math.random() * 400); // 100-500px
await page.evaluate((distance) => { await page.evaluate((distance) => {
window.scrollBy({ window.scrollBy({
top: distance, top: distance,
behavior: 'smooth' behavior: 'smooth',
}); });
}, scrollDistance); }, scrollDistance);
// 随机停顿 500-1500ms // 随机停顿 500-1500ms
await new Promise(r => setTimeout(r, 500 + Math.random() * 1000)); await new Promise((r) => setTimeout(r, 500 + Math.random() * 1000));
} }
// 滚动回顶部 // 滚动回顶部
await page.evaluate(() => { await page.evaluate(() => {
window.scrollTo({ top: 0, behavior: 'smooth' }); window.scrollTo({ top: 0, behavior: 'smooth' });
}); });
await new Promise(r => setTimeout(r, 1000)); await new Promise((r) => setTimeout(r, 1000));
} }
export interface PowerbeijingResult { export interface PowerbeijingResult {
@@ -52,12 +52,22 @@ export interface PowerbeijingResult {
url: string; url: string;
} }
/**
 * `this` type used by the PowerbeijingCrawler object's methods (`crawl` and
 * `extract` both declare `this: PowerbeijingCrawlerType`).
 */
interface PowerbeijingCrawlerType {
  /** Parses the HTML of a listing page into result records. */
  extract(html: string): PowerbeijingResult[];
  /** Prefix that `extract` prepends to links that do not start with "http". */
  baseUrl: string;
  /** Address of the site's listing page (how `crawl` uses it is not visible here). */
  url: string;
  /** Human-readable site name; included in the crawl failure log message. */
  name: string;
}
export const PowerbeijingCrawler = { export const PowerbeijingCrawler = {
name: '北京京能电子商务平台', name: '北京京能电子商务平台',
url: 'https://www.powerbeijing-ec.com/jncms/search/bulletin.html?dates=300&categoryId=2&tabName=%E6%8B%9B%E6%A0%87%E5%85%AC%E5%91%8A&page=1', url: 'https://www.powerbeijing-ec.com/jncms/search/bulletin.html?dates=300&categoryId=2&tabName=%E6%8B%9B%E6%A0%87%E5%85%AC%E5%91%8A&page=1',
baseUrl: 'https://www.powerbeijing-ec.com/', baseUrl: 'https://www.powerbeijing-ec.com/',
async crawl(browser: puppeteer.Browser): Promise<PowerbeijingResult[]> { async crawl(
this: PowerbeijingCrawlerType,
browser: puppeteer.Browser,
): Promise<PowerbeijingResult[]> {
const logger = new Logger('PowerbeijingCrawler'); const logger = new Logger('PowerbeijingCrawler');
const page = await browser.newPage(); const page = await browser.newPage();
@@ -69,11 +79,15 @@ export const PowerbeijingCrawler = {
await page.evaluateOnNewDocument(() => { await page.evaluateOnNewDocument(() => {
Object.defineProperty(navigator, 'webdriver', { get: () => false }); Object.defineProperty(navigator, 'webdriver', { get: () => false });
Object.defineProperty(navigator, 'language', { get: () => "zh-CN"}); Object.defineProperty(navigator, 'language', { get: () => 'zh-CN' });
Object.defineProperty(navigator, 'plugins', { get: () => [1,2,3,4,5]}); Object.defineProperty(navigator, 'plugins', {
get: () => [1, 2, 3, 4, 5],
});
}); });
await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36'); await page.setUserAgent(
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36',
);
await page.setViewport({ width: 1920, height: 1080 }); await page.setViewport({ width: 1920, height: 1080 });
const allResults: PowerbeijingResult[] = []; const allResults: PowerbeijingResult[] = [];
@@ -87,7 +101,7 @@ export const PowerbeijingCrawler = {
// 模拟人类行为 // 模拟人类行为
logger.log('Simulating human mouse movements...'); logger.log('Simulating human mouse movements...');
await simulateHumanMouseMovement(page); await simulateHumanMouseMovement(page);
logger.log('Simulating human scrolling...'); logger.log('Simulating human scrolling...');
await simulateHumanScrolling(page); await simulateHumanScrolling(page);
@@ -103,12 +117,14 @@ export const PowerbeijingCrawler = {
} }
allResults.push(...pageResults); allResults.push(...pageResults);
logger.log(`Extracted ${pageResults.length} items from page ${currentPage}`); logger.log(
`Extracted ${pageResults.length} items from page ${currentPage}`,
);
// 模拟人类行为 - 翻页前 // 模拟人类行为 - 翻页前
logger.log('Simulating human mouse movements before pagination...'); logger.log('Simulating human mouse movements before pagination...');
await simulateHumanMouseMovement(page); await simulateHumanMouseMovement(page);
logger.log('Simulating human scrolling before pagination...'); logger.log('Simulating human scrolling before pagination...');
await simulateHumanScrolling(page); await simulateHumanScrolling(page);
@@ -127,9 +143,13 @@ export const PowerbeijingCrawler = {
try { try {
// 点击下一页按钮,等待页面更新 // 点击下一页按钮,等待页面更新
await nextButton.click(); await nextButton.click();
await new Promise(r => setTimeout(r, 3000)); // 等待页面加载 await new Promise((r) => setTimeout(r, 3000)); // 等待页面加载
} catch (navError) { } catch (navError) {
logger.error(`Navigation to page ${currentPage + 1} failed: ${navError.message}`); const navErrorMessage =
navError instanceof Error ? navError.message : String(navError);
logger.error(
`Navigation to page ${currentPage + 1} failed: ${navErrorMessage}`,
);
break; break;
} }
@@ -138,26 +158,27 @@ export const PowerbeijingCrawler = {
// 模拟人类行为 - 翻页后 // 模拟人类行为 - 翻页后
logger.log('Simulating human mouse movements after pagination...'); logger.log('Simulating human mouse movements after pagination...');
await simulateHumanMouseMovement(page); await simulateHumanMouseMovement(page);
logger.log('Simulating human scrolling after pagination...'); logger.log('Simulating human scrolling after pagination...');
await simulateHumanScrolling(page); await simulateHumanScrolling(page);
// Random delay between pages // Random delay between pages
const delay = Math.floor(Math.random() * (3000 - 1000 + 1)) + 1000; const delay = Math.floor(Math.random() * (3000 - 1000 + 1)) + 1000;
await new Promise(resolve => setTimeout(resolve, delay)); await new Promise((resolve) => setTimeout(resolve, delay));
} }
return allResults; return allResults;
} catch (error) { } catch (error) {
logger.error(`Failed to crawl ${this.name}: ${error.message}`); const errorMessage =
error instanceof Error ? error.message : String(error);
logger.error(`Failed to crawl ${this.name}: ${errorMessage}`);
return allResults; return allResults;
} finally { } finally {
await page.close(); await page.close();
} }
}, },
/**
 * Extracts announcement entries from the HTML of one listing page.
 *
 * An entry is an `<li>` element wrapping an `<a>` that carries `href` and
 * `title` attributes and a `<div class="newsDate">` whose inner `<div>` holds
 * a `YYYY-MM-DD` date. Links that do not start with "http" are prefixed with
 * `this.baseUrl`.
 *
 * @param html Raw HTML of the listing page.
 * @returns One record per matched entry that has a non-empty title and URL;
 *   an empty array when nothing matches.
 */
extract(this: PowerbeijingCrawlerType, html: string): PowerbeijingResult[] {
  const results: PowerbeijingResult[] = [];

  // Capture groups: 1 = href, 2 = title attribute, 3 = date (YYYY-MM-DD).
  const regex =
    /<li>[\s\S]*?<a[^>]*href="([^"]*)"[^>]*title="([^"]*)"[^>]*>[\s\S]*?<div class="newsDate">[\s\S]*?<div>\s*(\d{4}-\d{2}-\d{2})\s*<\/div>[\s\S]*?<\/div>[\s\S]*?<\/a>[\s\S]*?<\/li>/gs;

  let match: RegExpExecArray | null;
  while ((match = regex.exec(html)) !== null) {
    const url = match[1]?.trim() ?? '';
    const title = match[2]?.trim() ?? '';
    const dateStr = match[3]?.trim() ?? '';

    // Entries without a title or link are not usable; skip them.
    if (!title || !url) {
      continue;
    }

    const fullUrl = url.startsWith('http') ? url : this.baseUrl + url;
    results.push({
      title,
      publishDate: dateStr ? new Date(dateStr) : new Date(),
      // Collapse repeated slashes (baseUrl ends with "/" and an href may start
      // with one) but keep the "//" that follows the scheme; collapsing that
      // too would yield an invalid "https:/host/..." URL.
      url: fullUrl.replace(/(?<!:)\/{2,}/g, '/'),
    });
  }

  return results;
},
}; };

View File

@@ -2,7 +2,7 @@ import { SdiccCrawler } from './sdicc_target';
import * as puppeteer from 'puppeteer'; import * as puppeteer from 'puppeteer';
// Increase timeout to 5 minutes (60000 ms * 5) for network operations
jest.setTimeout(60000*5); jest.setTimeout(60000 * 5);
// 获取代理配置 // 获取代理配置
const getProxyArgs = (): string[] => { const getProxyArgs = (): string[] => {
@@ -29,7 +29,7 @@ describe('SdiccCrawler Real Site Test', () => {
if (proxyArgs.length > 0) { if (proxyArgs.length > 0) {
console.log('Using proxy:', proxyArgs.join(' ')); console.log('Using proxy:', proxyArgs.join(' '));
} }
browser = await puppeteer.launch({ browser = await puppeteer.launch({
headless: false, // Change to false to see browser UI headless: false, // Change to false to see browser UI
args: ['--no-sandbox', '--disable-setuid-sandbox', ...proxyArgs], args: ['--no-sandbox', '--disable-setuid-sandbox', ...proxyArgs],
@@ -45,13 +45,15 @@ describe('SdiccCrawler Real Site Test', () => {
it('should visit website and list all found bid information', async () => { it('should visit website and list all found bid information', async () => {
console.log(`\nStarting crawl for: ${SdiccCrawler.name}`); console.log(`\nStarting crawl for: ${SdiccCrawler.name}`);
console.log(`Target URL: ${SdiccCrawler.url}`); console.log(`Target URL: ${SdiccCrawler.url}`);
const results = await SdiccCrawler.crawl(browser); const results = await SdiccCrawler.crawl(browser);
console.log(`\nSuccessfully found ${results.length} items:\n`); console.log(`\nSuccessfully found ${results.length} items:\n`);
console.log('----------------------------------------'); console.log('----------------------------------------');
results.forEach((item, index) => { results.forEach((item, index) => {
console.log(`${index + 1}. [${item.publishDate.toLocaleDateString()}] ${item.title}`); console.log(
`${index + 1}. [${item.publishDate.toLocaleDateString()}] ${item.title}`,
);
console.log(` Link: ${item.url}`); console.log(` Link: ${item.url}`);
console.log('----------------------------------------'); console.log('----------------------------------------');
}); });
@@ -61,13 +63,15 @@ describe('SdiccCrawler Real Site Test', () => {
expect(Array.isArray(results)).toBeTruthy(); expect(Array.isArray(results)).toBeTruthy();
// Warn but don't fail if site returns 0 items (could be empty or changed structure) // Warn but don't fail if site returns 0 items (could be empty or changed structure)
if (results.length === 0) { if (results.length === 0) {
console.warn('Warning: No items found. Check if website structure has changed or if list is currently empty.'); console.warn(
'Warning: No items found. Check if website structure has changed or if list is currently empty.',
);
} else { } else {
// Check data integrity of first item // Check data integrity of first item
const firstItem = results[0]; const firstItem = results[0];
expect(firstItem.title).toBeTruthy(); expect(firstItem.title).toBeTruthy();
expect(firstItem.url).toMatch(/^https?:\/\//); expect(firstItem.url).toMatch(/^https?:\/\//);
expect(firstItem.publishDate).toBeInstanceOf(Date); expect(firstItem.publishDate).toBeInstanceOf(Date);
} }
}); });
}); });

View File

@@ -11,13 +11,13 @@ async function simulateHumanMouseMovement(page: puppeteer.Page) {
for (let i = 0; i < movements; i++) { for (let i = 0; i < movements; i++) {
const x = Math.floor(Math.random() * viewport.width); const x = Math.floor(Math.random() * viewport.width);
const y = Math.floor(Math.random() * viewport.height); const y = Math.floor(Math.random() * viewport.height);
await page.mouse.move(x, y, { await page.mouse.move(x, y, {
steps: 10 + Math.floor(Math.random() * 20) // 10-30步使移动更平滑 steps: 10 + Math.floor(Math.random() * 20), // 10-30步使移动更平滑
}); });
// 随机停顿 100-500ms // 随机停顿 100-500ms
await new Promise(r => setTimeout(r, 100 + Math.random() * 400)); await new Promise((r) => setTimeout(r, 100 + Math.random() * 400));
} }
} }
@@ -27,23 +27,23 @@ async function simulateHumanScrolling(page: puppeteer.Page) {
for (let i = 0; i < scrollCount; i++) { for (let i = 0; i < scrollCount; i++) {
const scrollDistance = 100 + Math.floor(Math.random() * 400); // 100-500px const scrollDistance = 100 + Math.floor(Math.random() * 400); // 100-500px
await page.evaluate((distance) => { await page.evaluate((distance) => {
window.scrollBy({ window.scrollBy({
top: distance, top: distance,
behavior: 'smooth' behavior: 'smooth',
}); });
}, scrollDistance); }, scrollDistance);
// 随机停顿 500-1500ms // 随机停顿 500-1500ms
await new Promise(r => setTimeout(r, 500 + Math.random() * 1000)); await new Promise((r) => setTimeout(r, 500 + Math.random() * 1000));
} }
// 滚动回顶部 // 滚动回顶部
await page.evaluate(() => { await page.evaluate(() => {
window.scrollTo({ top: 0, behavior: 'smooth' }); window.scrollTo({ top: 0, behavior: 'smooth' });
}); });
await new Promise(r => setTimeout(r, 1000)); await new Promise((r) => setTimeout(r, 1000));
} }
export interface SdiccResult { export interface SdiccResult {
@@ -52,12 +52,22 @@ export interface SdiccResult {
url: string; url: string;
} }
/**
 * Shape of the `this` context that the `SdiccCrawler` methods are typed
 * against (both `crawl` and `extract` declare `this: SdiccCrawlerType`).
 */
interface SdiccCrawlerType {
  /** Parses list-page HTML into announcement records. */
  extract(html: string): SdiccResult[];
  /** Prefix used by `extract` when building detail-page links. */
  baseUrl: string;
  /** Entry URL of the source. */
  url: string;
  /** Display name of the source; `crawl` includes it in its failure log message. */
  name: string;
}
export const SdiccCrawler = { export const SdiccCrawler = {
name: '国投集团电子采购平台', name: '国投集团电子采购平台',
url: 'https://www.sdicc.com.cn/cgxx/ggList', url: 'https://www.sdicc.com.cn/cgxx/ggList',
baseUrl: 'https://www.sdicc.com.cn/', baseUrl: 'https://www.sdicc.com.cn/',
async crawl(browser: puppeteer.Browser): Promise<SdiccResult[]> { async crawl(
this: SdiccCrawlerType,
browser: puppeteer.Browser,
): Promise<SdiccResult[]> {
const logger = new Logger('SdiccCrawler'); const logger = new Logger('SdiccCrawler');
const page = await browser.newPage(); const page = await browser.newPage();
@@ -69,11 +79,15 @@ export const SdiccCrawler = {
await page.evaluateOnNewDocument(() => { await page.evaluateOnNewDocument(() => {
Object.defineProperty(navigator, 'webdriver', { get: () => false }); Object.defineProperty(navigator, 'webdriver', { get: () => false });
Object.defineProperty(navigator, 'language', { get: () => "zh-CN"}); Object.defineProperty(navigator, 'language', { get: () => 'zh-CN' });
Object.defineProperty(navigator, 'plugins', { get: () => [1,2,3,4,5]}); Object.defineProperty(navigator, 'plugins', {
get: () => [1, 2, 3, 4, 5],
});
}); });
await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36'); await page.setUserAgent(
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36',
);
await page.setViewport({ width: 1920, height: 1080 }); await page.setViewport({ width: 1920, height: 1080 });
const allResults: SdiccResult[] = []; const allResults: SdiccResult[] = [];
@@ -87,15 +101,17 @@ export const SdiccCrawler = {
// 模拟人类行为 // 模拟人类行为
logger.log('Simulating human mouse movements...'); logger.log('Simulating human mouse movements...');
await simulateHumanMouseMovement(page); await simulateHumanMouseMovement(page);
logger.log('Simulating human scrolling...'); logger.log('Simulating human scrolling...');
await simulateHumanScrolling(page); await simulateHumanScrolling(page);
// 等待表格加载 // 等待表格加载
logger.log('Waiting for table to load...'); logger.log('Waiting for table to load...');
await page.waitForSelector('.tbody table tbody tr', { timeout: 30000 }).catch(() => { await page
logger.warn('Table rows not found, trying alternative selectors...'); .waitForSelector('.tbody table tbody tr', { timeout: 30000 })
}); .catch(() => {
logger.warn('Table rows not found, trying alternative selectors...');
});
while (currentPage <= maxPages) { while (currentPage <= maxPages) {
logger.log(`Processing page ${currentPage}...`); logger.log(`Processing page ${currentPage}...`);
@@ -109,12 +125,14 @@ export const SdiccCrawler = {
} }
allResults.push(...pageResults); allResults.push(...pageResults);
logger.log(`Extracted ${pageResults.length} items from page ${currentPage}`); logger.log(
`Extracted ${pageResults.length} items from page ${currentPage}`,
);
// 模拟人类行为 - 翻页前 // 模拟人类行为 - 翻页前
logger.log('Simulating human mouse movements before pagination...'); logger.log('Simulating human mouse movements before pagination...');
await simulateHumanMouseMovement(page); await simulateHumanMouseMovement(page);
logger.log('Simulating human scrolling before pagination...'); logger.log('Simulating human scrolling before pagination...');
await simulateHumanScrolling(page); await simulateHumanScrolling(page);
@@ -132,10 +150,16 @@ export const SdiccCrawler = {
try { try {
// 点击下一页按钮 // 点击下一页按钮
await nextButton.click(); await nextButton.click();
await page.waitForNavigation({ waitUntil: 'networkidle2', timeout: 60000 }).catch(() => {}); await page
await new Promise(r => setTimeout(r, 2000)); // 额外等待确保数据加载完成 .waitForNavigation({ waitUntil: 'networkidle2', timeout: 60000 })
.catch(() => {});
await new Promise((r) => setTimeout(r, 2000)); // 额外等待确保数据加载完成
} catch (navError) { } catch (navError) {
logger.error(`Navigation to page ${currentPage + 1} failed: ${navError.message}`); const navErrorMessage =
navError instanceof Error ? navError.message : String(navError);
logger.error(
`Navigation to page ${currentPage + 1} failed: ${navErrorMessage}`,
);
break; break;
} }
@@ -144,26 +168,27 @@ export const SdiccCrawler = {
// 模拟人类行为 - 翻页后 // 模拟人类行为 - 翻页后
logger.log('Simulating human mouse movements after pagination...'); logger.log('Simulating human mouse movements after pagination...');
await simulateHumanMouseMovement(page); await simulateHumanMouseMovement(page);
logger.log('Simulating human scrolling after pagination...'); logger.log('Simulating human scrolling after pagination...');
await simulateHumanScrolling(page); await simulateHumanScrolling(page);
// Random delay between pages // Random delay between pages
const delay = Math.floor(Math.random() * (3000 - 1000 + 1)) + 1000; const delay = Math.floor(Math.random() * (3000 - 1000 + 1)) + 1000;
await new Promise(resolve => setTimeout(resolve, delay)); await new Promise((resolve) => setTimeout(resolve, delay));
} }
return allResults; return allResults;
} catch (error) { } catch (error) {
logger.error(`Failed to crawl ${this.name}: ${error.message}`); const errorMessage =
error instanceof Error ? error.message : String(error);
logger.error(`Failed to crawl ${this.name}: ${errorMessage}`);
return allResults; return allResults;
} finally { } finally {
await page.close(); await page.close();
} }
}, },
/**
 * Parses the announcement list HTML of sdicc.com.cn into result records.
 *
 * A row is recognised by its `onclick="urlChange('<ggGuid>','<gcGuid>')"`
 * attribute. The title is taken from the first `<td><span>…</span></td>`
 * cell matched after the row start, and the publish date from a later cell
 * whose span holds a `YYYY-MM-DD` value.
 *
 * @param html - Raw HTML of a list page.
 * @returns One record per matched row that has a non-empty title and both
 *   GUIDs. `url` is an absolute detail-page link built from `this.baseUrl`.
 */
extract(this: SdiccCrawlerType, html: string): SdiccResult[] {
  const results: SdiccResult[] = [];

  const regex =
    /<tr[^>]*onclick="urlChange\('([^']+)','([^']+)'\)"[^>]*>[\s\S]*?<td[^>]*><span[^>]*>([^<]+)<\/span><\/td>[\s\S]*?<td[^>]*><span[^>]*>\s*(\d{4}-\d{2}-\d{2})\s*<\/span><\/td>[\s\S]*?<\/tr>/gs;

  let match: RegExpExecArray | null;
  while ((match = regex.exec(html)) !== null) {
    const ggGuid = match[1]?.trim() ?? '';
    const gcGuid = match[2]?.trim() ?? '';
    const title = match[3]?.trim() ?? '';
    const dateStr = match[4]?.trim() ?? '';

    if (title && ggGuid && gcGuid) {
      const fullUrl = `${this.baseUrl}/cgxx/ggDetail?gcGuid=${gcGuid}&ggGuid=${ggGuid}`;
      results.push({
        title,
        publishDate: dateStr ? new Date(dateStr) : new Date(),
        // baseUrl ends with "/", so the template yields "…cn//cgxx/…".
        // Collapse only slash runs that do NOT follow a colon: the previous
        // global "//" -> "/" replacement also rewrote "https://" to
        // "https:/", producing links that are not valid absolute URLs.
        url: fullUrl.replace(/([^:])\/{2,}/g, '$1/'),
      });
    }
  }

  return results;
},
}; };

View File

@@ -29,7 +29,7 @@ describe('SzecpCrawler Real Site Test', () => {
if (proxyArgs.length > 0) { if (proxyArgs.length > 0) {
console.log('Using proxy:', proxyArgs.join(' ')); console.log('Using proxy:', proxyArgs.join(' '));
} }
browser = await puppeteer.launch({ browser = await puppeteer.launch({
headless: false, // Run in non-headless mode headless: false, // Run in non-headless mode
args: [ args: [
@@ -40,14 +40,14 @@ describe('SzecpCrawler Real Site Test', () => {
'--disable-infobars', '--disable-infobars',
...proxyArgs, ...proxyArgs,
], ],
defaultViewport: null defaultViewport: null,
}); });
}); });
afterAll(async () => { afterAll(async () => {
if (browser) { if (browser) {
// Keep open for a few seconds after test to see result // Keep open for a few seconds after test to see result
await new Promise(r => setTimeout(r, 50000)); await new Promise((r) => setTimeout(r, 50000));
await browser.close(); await browser.close();
} }
}); });
@@ -56,29 +56,33 @@ describe('SzecpCrawler Real Site Test', () => {
console.log(` console.log(`
Starting crawl for: ${SzecpCrawler.name}`); Starting crawl for: ${SzecpCrawler.name}`);
console.log(`Target URL: ${SzecpCrawler.url}`); console.log(`Target URL: ${SzecpCrawler.url}`);
const results = await SzecpCrawler.crawl(browser); const results = await SzecpCrawler.crawl(browser);
console.log(` console.log(`
Successfully found ${results.length} items: Successfully found ${results.length} items:
`); `);
console.log('----------------------------------------'); console.log('----------------------------------------');
results.forEach((item, index) => { results.forEach((item, index) => {
console.log(`${index + 1}. [${item.publishDate.toLocaleDateString()}] ${item.title}`); console.log(
`${index + 1}. [${item.publishDate.toLocaleDateString()}] ${item.title}`,
);
console.log(` Link: ${item.url}`); console.log(` Link: ${item.url}`);
console.log('----------------------------------------'); console.log('----------------------------------------');
}); });
expect(results).toBeDefined(); expect(results).toBeDefined();
expect(Array.isArray(results)).toBeTruthy(); expect(Array.isArray(results)).toBeTruthy();
if (results.length === 0) { if (results.length === 0) {
console.warn('Warning: No items found. Observe browser window to see if content is loading or if there is a verification challenge.'); console.warn(
'Warning: No items found. Observe browser window to see if content is loading or if there is a verification challenge.',
);
} else { } else {
const firstItem = results[0]; const firstItem = results[0];
expect(firstItem.title).toBeTruthy(); expect(firstItem.title).toBeTruthy();
expect(firstItem.url).toMatch(/^https?:\/\//); expect(firstItem.url).toMatch(/^https?:\/\//);
expect(firstItem.publishDate).toBeInstanceOf(Date); expect(firstItem.publishDate).toBeInstanceOf(Date);
} }
}); });
}); });

View File

@@ -12,13 +12,13 @@ async function simulateHumanMouseMovement(page: puppeteer.Page) {
for (let i = 0; i < movements; i++) { for (let i = 0; i < movements; i++) {
const x = Math.floor(Math.random() * viewport.width); const x = Math.floor(Math.random() * viewport.width);
const y = Math.floor(Math.random() * viewport.height); const y = Math.floor(Math.random() * viewport.height);
await page.mouse.move(x, y, { await page.mouse.move(x, y, {
steps: 10 + Math.floor(Math.random() * 20) // 10-30步使移动更平滑 steps: 10 + Math.floor(Math.random() * 20), // 10-30步使移动更平滑
}); });
// 随机停顿 100-500ms // 随机停顿 100-500ms
await new Promise(r => setTimeout(r, 100 + Math.random() * 400)); await new Promise((r) => setTimeout(r, 100 + Math.random() * 400));
} }
} }
@@ -28,23 +28,29 @@ async function simulateHumanScrolling(page: puppeteer.Page) {
for (let i = 0; i < scrollCount; i++) { for (let i = 0; i < scrollCount; i++) {
const scrollDistance = 100 + Math.floor(Math.random() * 400); // 100-500px const scrollDistance = 100 + Math.floor(Math.random() * 400); // 100-500px
await page.evaluate((distance) => { await page.evaluate((distance) => {
window.scrollBy({ window.scrollBy({
top: distance, top: distance,
behavior: 'smooth' behavior: 'smooth',
}); });
}, scrollDistance); }, scrollDistance);
// 随机停顿 500-1500ms // 随机停顿 500-1500ms
await new Promise(r => setTimeout(r, 500 + Math.random() * 1000)); await new Promise((r) => setTimeout(r, 500 + Math.random() * 1000));
} }
// 滚动回顶部 // 滚动回顶部
await page.evaluate(() => { await page.evaluate(() => {
window.scrollTo({ top: 0, behavior: 'smooth' }); window.scrollTo({ top: 0, behavior: 'smooth' });
}); });
await new Promise(r => setTimeout(r, 1000)); await new Promise((r) => setTimeout(r, 1000));
}
/**
 * Shape of the `this` context that `SzecpCrawler.crawl` is typed against.
 */
interface SzecpCrawlerType {
  /** Prefix that `crawl` prepends to hrefs which do not start with "http". */
  baseUrl: string;
  /** Entry URL of the source. */
  url: string;
  /** Display name of the source. */
  name: string;
}
export const SzecpCrawler = { export const SzecpCrawler = {
@@ -52,7 +58,10 @@ export const SzecpCrawler = {
url: 'https://www.szecp.com.cn/first_zbgg/index.html', url: 'https://www.szecp.com.cn/first_zbgg/index.html',
baseUrl: 'https://www.szecp.com.cn/', baseUrl: 'https://www.szecp.com.cn/',
async crawl(browser: puppeteer.Browser): Promise<ChdtpResult[]> { async crawl(
this: SzecpCrawlerType,
browser: puppeteer.Browser,
): Promise<ChdtpResult[]> {
const logger = new Logger('SzecpCrawler'); const logger = new Logger('SzecpCrawler');
const page = await browser.newPage(); const page = await browser.newPage();
@@ -65,10 +74,14 @@ export const SzecpCrawler = {
await page.evaluateOnNewDocument(() => { await page.evaluateOnNewDocument(() => {
Object.defineProperty(navigator, 'webdriver', { get: () => false }); Object.defineProperty(navigator, 'webdriver', { get: () => false });
Object.defineProperty(navigator, 'language', { get: () => 'zh-CN' }); Object.defineProperty(navigator, 'language', { get: () => 'zh-CN' });
Object.defineProperty(navigator, 'plugins', { get: () => [1, 2, 3, 4, 5] }); Object.defineProperty(navigator, 'plugins', {
get: () => [1, 2, 3, 4, 5],
});
}); });
await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36'); await page.setUserAgent(
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36',
);
await page.setViewport({ width: 1920, height: 1080 }); await page.setViewport({ width: 1920, height: 1080 });
const allResults: ChdtpResult[] = []; const allResults: ChdtpResult[] = [];
@@ -82,7 +95,7 @@ export const SzecpCrawler = {
// 模拟人类行为 // 模拟人类行为
logger.log('Simulating human mouse movements...'); logger.log('Simulating human mouse movements...');
await simulateHumanMouseMovement(page); await simulateHumanMouseMovement(page);
logger.log('Simulating human scrolling...'); logger.log('Simulating human scrolling...');
await simulateHumanScrolling(page); await simulateHumanScrolling(page);
@@ -90,52 +103,69 @@ export const SzecpCrawler = {
logger.log('Clicking search button...'); logger.log('Clicking search button...');
await page.waitForSelector('.szb-zbcgSearch-key-v1', { timeout: 60000 }); await page.waitForSelector('.szb-zbcgSearch-key-v1', { timeout: 60000 });
await page.click('.szb-zbcgSearch-key-v1'); await page.click('.szb-zbcgSearch-key-v1');
await new Promise(r => setTimeout(r, 3000)); // Wait for results to load await new Promise((r) => setTimeout(r, 3000)); // Wait for results to load
while (currentPage <= maxPages) { while (currentPage <= maxPages) {
logger.log(`Processing page ${currentPage}...`); logger.log(`Processing page ${currentPage}...`);
// Wait for content to load // Wait for content to load
await page.waitForFunction(() => { await page
return document.querySelectorAll('.szb-zbcgTable-other').length > 0; .waitForFunction(
}, { timeout: 60000 }).catch(() => logger.warn('Content not found. Site might be slow.')); () => {
return (
document.querySelectorAll('.szb-zbcgTable-other').length > 0
);
},
{ timeout: 60000 },
)
.catch(() => logger.warn('Content not found. Site might be slow.'));
const pageResults = await page.evaluate((baseUrl) => { const pageResults = await page.evaluate((baseUrl) => {
// Extract from table rows // Extract from table rows
const items = Array.from(document.querySelectorAll('.szb-zbcgTable-other')); const items = Array.from(
return items.map(item => { document.querySelectorAll('.szb-zbcgTable-other'),
const divs = item.querySelectorAll('div'); );
if (divs.length >= 5) { return items
const titleLink = divs[1].querySelector('a'); .map((item) => {
const title = titleLink?.textContent?.trim() || ''; const divs = item.querySelectorAll('div');
const dateStr = divs[4].textContent?.trim() || ''; if (divs.length >= 5) {
const href = titleLink?.getAttribute('href') || ''; const titleLink = divs[1].querySelector('a');
const title = titleLink?.textContent?.trim() || '';
const dateStr = divs[4].textContent?.trim() || '';
const href = titleLink?.getAttribute('href') || '';
if (title.length < 5) return null; // Filter noise if (title.length < 5) return null; // Filter noise
// Construct full URL if href is relative // Construct full URL if href is relative
const url = href.startsWith('http') ? href : `${baseUrl}${href}`; const url = href.startsWith('http')
? href
: `${baseUrl}${href}`;
return { return {
title, title,
dateStr, dateStr,
url url,
}; };
} }
return null; return null;
}).filter(i => i !== null); })
.filter((i) => i !== null);
}, this.baseUrl); }, this.baseUrl);
if (pageResults.length === 0) { if (pageResults.length === 0) {
logger.warn(`No results found on page ${currentPage}. Extraction failed.`); logger.warn(
`No results found on page ${currentPage}. Extraction failed.`,
);
break; break;
} }
allResults.push(...pageResults.map(r => ({ allResults.push(
title: r!.title, ...pageResults.map((r) => ({
publishDate: new Date(r!.dateStr), title: r.title,
url: r!.url.replace(/\/\//g, '/') publishDate: new Date(r.dateStr),
}))); url: r.url.replace(/\/\//g, '/'),
})),
);
logger.log(`Extracted ${pageResults.length} items.`); logger.log(`Extracted ${pageResults.length} items.`);
@@ -144,27 +174,30 @@ export const SzecpCrawler = {
if (!nextButton) break; if (!nextButton) break;
await nextButton.click(); await nextButton.click();
await new Promise(r => setTimeout(r, 3000)); await new Promise((r) => setTimeout(r, 3000));
// 模拟人类行为 // 模拟人类行为
logger.log('Simulating human mouse movements...'); logger.log('Simulating human mouse movements...');
await simulateHumanMouseMovement(page); await simulateHumanMouseMovement(page);
logger.log('Simulating human scrolling...'); logger.log('Simulating human scrolling...');
await simulateHumanScrolling(page); await simulateHumanScrolling(page);
currentPage++; currentPage++;
} }
return allResults; return allResults;
} catch (error) { } catch (error) {
logger.error(`Crawl failed: ${error.message}`); const errorMessage =
error instanceof Error ? error.message : String(error);
logger.error(`Crawl failed: ${errorMessage}`);
return allResults; return allResults;
} finally { } finally {
if (page) await page.close(); if (page) await page.close();
} }
}, },
/**
 * Stub: always returns an empty array and takes no input.
 * For this source the rows are collected inside `crawl` through
 * `page.evaluate`, not by parsing an HTML string here.
 */
extract() { return []; } extract() {
return [];
},
}; };

View File

@@ -12,7 +12,11 @@ import { CrawlInfoAdd } from '../crawler/entities/crawl-info-add.entity';
imports: [ConfigModule], imports: [ConfigModule],
inject: [ConfigService], inject: [ConfigService],
useFactory: (configService: ConfigService) => ({ useFactory: (configService: ConfigService) => ({
type: configService.get<any>('DATABASE_TYPE', 'mariadb'), type:
(configService.get<string>('DATABASE_TYPE', 'mariadb') as
| 'mariadb'
| 'mysql'
| 'postgres') || 'mariadb',
host: configService.get<string>('DATABASE_HOST', 'localhost'), host: configService.get<string>('DATABASE_HOST', 'localhost'),
port: configService.get<number>('DATABASE_PORT', 3306), port: configService.get<number>('DATABASE_PORT', 3306),
username: configService.get<string>('DATABASE_USERNAME', 'root'), username: configService.get<string>('DATABASE_USERNAME', 'root'),

View File

@@ -1,4 +1,10 @@
import { Entity, PrimaryGeneratedColumn, Column, CreateDateColumn, UpdateDateColumn } from 'typeorm'; import {
Entity,
PrimaryGeneratedColumn,
Column,
CreateDateColumn,
UpdateDateColumn,
} from 'typeorm';
@Entity('keywords') @Entity('keywords')
export class Keyword { export class Keyword {

View File

@@ -6,19 +6,20 @@ async function bootstrap() {
const app = await NestFactory.create(AppModule, { const app = await NestFactory.create(AppModule, {
bodyParser: true, bodyParser: true,
}); });
// 使用自定义日志服务 // 使用自定义日志服务
const logger = await app.resolve(CustomLogger); const logger = await app.resolve(CustomLogger);
app.useLogger(logger); app.useLogger(logger);
// 增加请求体大小限制(默认 100kb增加到 50mb // 增加请求体大小限制(默认 100kb增加到 50mb
const express = require('express'); // eslint-disable-next-line @typescript-eslint/no-require-imports
const express = require('express') as typeof import('express');
app.use(express.json({ limit: '50mb' })); app.use(express.json({ limit: '50mb' }));
app.use(express.urlencoded({ limit: '50mb', extended: true })); app.use(express.urlencoded({ limit: '50mb', extended: true }));
// 启用 CORS // 启用 CORS
app.enableCors(); app.enableCors();
await app.listen(process.env.PORT ?? 3000); await app.listen(process.env.PORT ?? 3000);
} }
bootstrap(); void bootstrap();

View File

@@ -16,7 +16,9 @@ async function generateAiRecommendations() {
try { try {
// 获取 BidItem 的 repository 和 AiService // 获取 BidItem 的 repository 和 AiService
const bidItemRepository = app.get<Repository<BidItem>>(getRepositoryToken(BidItem)); const bidItemRepository = app.get<Repository<BidItem>>(
getRepositoryToken(BidItem),
);
const aiService = app.get(AiService); const aiService = app.get(AiService);
logger.log('开始查询 bid_items 表...'); logger.log('开始查询 bid_items 表...');
@@ -27,11 +29,13 @@ async function generateAiRecommendations() {
threeDaysAgo.setHours(0, 0, 0, 0); threeDaysAgo.setHours(0, 0, 0, 0);
// 使用本地时间格式化输出,避免时区问题 // 使用本地时间格式化输出,避免时区问题
const localDateStr = threeDaysAgo.toLocaleDateString('zh-CN', { const localDateStr = threeDaysAgo
year: 'numeric', .toLocaleDateString('zh-CN', {
month: '2-digit', year: 'numeric',
day: '2-digit' month: '2-digit',
}).replace(/\//g, '-'); day: '2-digit',
})
.replace(/\//g, '-');
logger.log(`查询起始日期: ${localDateStr}`); logger.log(`查询起始日期: ${localDateStr}`);
// 查询起始日期3天前截止日期不限制的所有记录 // 查询起始日期3天前截止日期不限制的所有记录
@@ -50,8 +54,8 @@ async function generateAiRecommendations() {
} }
// 提取 title // 提取 title
const bidData = bidItems.map(item => ({ const bidData = bidItems.map((item) => ({
title: item.title title: item.title,
})); }));
logger.log('开始调用 AI 获取推荐...'); logger.log('开始调用 AI 获取推荐...');

View File

@@ -5,19 +5,19 @@ import { CustomLogger } from '../common/logger/logger.service';
async function runCrawler() { async function runCrawler() {
const app = await NestFactory.createApplicationContext(AppModule); const app = await NestFactory.createApplicationContext(AppModule);
// 设置自定义 logger使 NestJS 框架日志也输出到文件 // 设置自定义 logger使 NestJS 框架日志也输出到文件
const logger = await app.resolve(CustomLogger); const logger = await app.resolve(CustomLogger);
app.useLogger(logger); app.useLogger(logger);
logger.setContext('CrawlScript'); logger.setContext('CrawlScript');
try { try {
const crawlerService = await app.resolve(BidCrawlerService); const crawlerService = await app.resolve(BidCrawlerService);
logger.log('Starting crawler...'); logger.log('Starting crawler...');
await crawlerService.crawlAll(); await crawlerService.crawlAll();
logger.log('Crawler completed successfully'); logger.log('Crawler completed successfully');
await app.close(); await app.close();
process.exit(0); process.exit(0);
} catch (error) { } catch (error) {

View File

@@ -15,7 +15,9 @@ async function removeDuplicates() {
try { try {
// 获取 BidItem 的 repository // 获取 BidItem 的 repository
const bidItemRepository = app.get<Repository<BidItem>>(getRepositoryToken(BidItem)); const bidItemRepository = app.get<Repository<BidItem>>(
getRepositoryToken(BidItem),
);
logger.log('开始查找重复的title...'); logger.log('开始查找重复的title...');
@@ -56,10 +58,12 @@ async function removeDuplicates() {
const itemsToDelete = items.slice(1); const itemsToDelete = items.slice(1);
if (itemsToDelete.length > 0) { if (itemsToDelete.length > 0) {
const idsToDelete = itemsToDelete.map(item => item.id); const idsToDelete = itemsToDelete.map((item) => item.id);
const deleteResult = await bidItemRepository.delete(idsToDelete); const deleteResult = await bidItemRepository.delete(idsToDelete);
totalDeleted += deleteResult.affected || 0; totalDeleted += deleteResult.affected || 0;
logger.log(` 删除了 ${deleteResult.affected} 条重复记录保留ID: ${items[0].id} (最晚创建)`); logger.log(
` 删除了 ${deleteResult.affected} 条重复记录保留ID: ${items[0].id} (最晚创建)`,
);
} }
} }

View File

@@ -8,7 +8,7 @@ import { CrawlInfoAdd } from '../crawler/entities/crawl-info-add.entity';
// 主数据库配置 // 主数据库配置
const masterDbConfig: DataSourceOptions = { const masterDbConfig: DataSourceOptions = {
type: process.env.DATABASE_TYPE as any || 'mariadb', type: (process.env.DATABASE_TYPE as any) || 'mariadb',
host: process.env.DATABASE_HOST || 'localhost', host: process.env.DATABASE_HOST || 'localhost',
port: parseInt(process.env.DATABASE_PORT || '3306'), port: parseInt(process.env.DATABASE_PORT || '3306'),
username: process.env.DATABASE_USERNAME || 'root', username: process.env.DATABASE_USERNAME || 'root',
@@ -20,7 +20,7 @@ const masterDbConfig: DataSourceOptions = {
// Slave 数据库配置 // Slave 数据库配置
const slaveDbConfig: DataSourceOptions = { const slaveDbConfig: DataSourceOptions = {
type: process.env.SLAVE_DATABASE_TYPE as any || 'mariadb', type: (process.env.SLAVE_DATABASE_TYPE as any) || 'mariadb',
host: process.env.SLAVE_DATABASE_HOST || 'localhost', host: process.env.SLAVE_DATABASE_HOST || 'localhost',
port: parseInt(process.env.SLAVE_DATABASE_PORT || '3306'), port: parseInt(process.env.SLAVE_DATABASE_PORT || '3306'),
username: process.env.SLAVE_DATABASE_USERNAME || 'root', username: process.env.SLAVE_DATABASE_USERNAME || 'root',
@@ -94,12 +94,17 @@ async function createDatabaseIfNotExists(config: DataSourceOptions) {
password: (config as any).password, password: (config as any).password,
}); });
await connection.query(`CREATE DATABASE IF NOT EXISTS \`${(config as any).database}\``); await connection.query(
`CREATE DATABASE IF NOT EXISTS \`${(config as any).database}\``,
);
await connection.end(); await connection.end();
} }
// 同步表结构 // 同步表结构
async function syncSchema(masterDataSource: DataSource, slaveDataSource: DataSource): Promise<DataSource> { async function syncSchema(
masterDataSource: DataSource,
slaveDataSource: DataSource,
): Promise<DataSource> {
logger.log('开始同步表结构...'); logger.log('开始同步表结构...');
// 获取主数据库的所有表 // 获取主数据库的所有表
@@ -137,8 +142,12 @@ async function syncSchema(masterDataSource: DataSource, slaveDataSource: DataSou
if (tableExists[0].count > 0) { if (tableExists[0].count > 0) {
// 表存在,先备份数据到临时表 // 表存在,先备份数据到临时表
logger.log(`备份表 ${tableName} 的数据到 ${tempTableName}...`); logger.log(`备份表 ${tableName} 的数据到 ${tempTableName}...`);
await slaveDataSource.query(`CREATE TABLE ${tempTableName} AS SELECT * FROM \`${tableName}\``); await slaveDataSource.query(
logger.log(`备份完成,共备份 ${await slaveDataSource.query(`SELECT COUNT(*) as count FROM ${tempTableName}`).then(r => r[0].count)} 条记录`); `CREATE TABLE ${tempTableName} AS SELECT * FROM \`${tableName}\``,
);
logger.log(
`备份完成,共备份 ${await slaveDataSource.query(`SELECT COUNT(*) as count FROM ${tempTableName}`).then((r) => r[0].count)} 条记录`,
);
} }
// 删除 slave 数据库中的表(如果存在) // 删除 slave 数据库中的表(如果存在)
@@ -151,7 +160,7 @@ async function syncSchema(masterDataSource: DataSource, slaveDataSource: DataSou
if (tableExists[0].count > 0) { if (tableExists[0].count > 0) {
try { try {
logger.log(`${tempTableName} 恢复数据到 ${tableName}...`); logger.log(`${tempTableName} 恢复数据到 ${tableName}...`);
// 获取临时表的列名 // 获取临时表的列名
const columns = await slaveDataSource.query(` const columns = await slaveDataSource.query(`
SELECT COLUMN_NAME SELECT COLUMN_NAME
@@ -159,18 +168,22 @@ async function syncSchema(masterDataSource: DataSource, slaveDataSource: DataSou
WHERE TABLE_SCHEMA = '${(slaveDbConfig as any).database}' WHERE TABLE_SCHEMA = '${(slaveDbConfig as any).database}'
AND TABLE_NAME = '${tempTableName}' AND TABLE_NAME = '${tempTableName}'
`); `);
const columnNames = columns.map((c: any) => `\`${c.COLUMN_NAME}\``).join(', '); const columnNames = columns
.map((c: any) => `\`${c.COLUMN_NAME}\``)
.join(', ');
// 将数据从临时表插入到新表 // 将数据从临时表插入到新表
await slaveDataSource.query(` await slaveDataSource.query(`
INSERT INTO \`${tableName}\` (${columnNames}) INSERT INTO \`${tableName}\` (${columnNames})
SELECT ${columnNames} FROM ${tempTableName} SELECT ${columnNames} FROM ${tempTableName}
`); `);
const restoredCount = await slaveDataSource.query(`SELECT COUNT(*) as count FROM \`${tableName}\``); const restoredCount = await slaveDataSource.query(
`SELECT COUNT(*) as count FROM \`${tableName}\``,
);
logger.log(`数据恢复完成,共恢复 ${restoredCount[0].count} 条记录`); logger.log(`数据恢复完成,共恢复 ${restoredCount[0].count} 条记录`);
// 删除临时表 // 删除临时表
await slaveDataSource.query(`DROP TABLE IF EXISTS ${tempTableName}`); await slaveDataSource.query(`DROP TABLE IF EXISTS ${tempTableName}`);
} catch (error) { } catch (error) {
@@ -181,13 +194,13 @@ async function syncSchema(masterDataSource: DataSource, slaveDataSource: DataSou
} }
logger.log('表结构同步完成'); logger.log('表结构同步完成');
// 重新初始化 slave 数据库连接以清除 TypeORM 元数据缓存 // 重新初始化 slave 数据库连接以清除 TypeORM 元数据缓存
logger.log('重新初始化 slave 数据库连接...'); logger.log('重新初始化 slave 数据库连接...');
await slaveDataSource.destroy(); await slaveDataSource.destroy();
await slaveDataSource.initialize(); await slaveDataSource.initialize();
logger.log('Slave 数据库连接重新初始化完成'); logger.log('Slave 数据库连接重新初始化完成');
return slaveDataSource; return slaveDataSource;
} }
@@ -227,7 +240,12 @@ async function syncDatabase() {
let totalSynced = 0; let totalSynced = 0;
for (const table of tables) { for (const table of tables) {
const count = await syncTable(masterDataSource, slaveDataSource, table.entity, table.name); const count = await syncTable(
masterDataSource,
slaveDataSource,
table.entity,
table.name,
);
totalSynced += count; totalSynced += count;
} }

View File

@@ -15,7 +15,9 @@ async function updateSource() {
try { try {
// 获取 BidItem 的 repository // 获取 BidItem 的 repository
const bidItemRepository = app.get<Repository<BidItem>>(getRepositoryToken(BidItem)); const bidItemRepository = app.get<Repository<BidItem>>(
getRepositoryToken(BidItem),
);
const oldSource = '北京电力交易平台'; const oldSource = '北京电力交易平台';
const newSource = '北京京能电子商务平台'; const newSource = '北京京能电子商务平台';

View File

@@ -0,0 +1,17 @@
// Module manifest for the systray_run program.
module systray_run
// Go language version this module is written against.
go 1.23
// Direct dependency: the only requirement not marked "// indirect".
// NOTE(review): presumably used to show a system-tray icon/menu — confirm against the module's Go sources.
require github.com/getlantern/systray v1.2.2
// Transitive dependencies. None are marked as imported directly by this
// module; each carries the "// indirect" marker maintained by the Go tooling.
require (
github.com/getlantern/context v0.0.0-20190109183933-c447772a6520 // indirect
github.com/getlantern/errors v0.0.0-20190325191628-abdb3e3e36f7 // indirect
github.com/getlantern/golog v0.0.0-20190830074920-4ef2e798c2d7 // indirect
github.com/getlantern/hex v0.0.0-20190417191902-c6586a6fe0b7 // indirect
github.com/getlantern/hidden v0.0.0-20190325191715-f02dbb02be55 // indirect
github.com/getlantern/ops v0.0.0-20190325191751-d70cb0d6f85f // indirect
github.com/go-stack/stack v1.8.0 // indirect
github.com/oxtoacart/bpool v0.0.0-20190530202638-03653db5a59c // indirect
golang.org/x/sys v0.1.0 // indirect
)

View File

@@ -0,0 +1,32 @@
github.com/davecgh/go-spew v1.1.0 h1:ZDRjVQ15GmhC3fiQ8ni8+OwkZQO4DARzQgrnXU1Liz8=
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/getlantern/context v0.0.0-20190109183933-c447772a6520 h1:NRUJuo3v3WGC/g5YiyF790gut6oQr5f3FBI88Wv0dx4=
github.com/getlantern/context v0.0.0-20190109183933-c447772a6520/go.mod h1:L+mq6/vvYHKjCX2oez0CgEAJmbq1fbb/oNJIWQkBybY=
github.com/getlantern/errors v0.0.0-20190325191628-abdb3e3e36f7 h1:6uJ+sZ/e03gkbqZ0kUG6mfKoqDb4XMAzMIwlajq19So=
github.com/getlantern/errors v0.0.0-20190325191628-abdb3e3e36f7/go.mod h1:l+xpFBrCtDLpK9qNjxs+cHU6+BAdlBaxHqikB6Lku3A=
github.com/getlantern/golog v0.0.0-20190830074920-4ef2e798c2d7 h1:guBYzEaLz0Vfc/jv0czrr2z7qyzTOGC9hiQ0VC+hKjk=
github.com/getlantern/golog v0.0.0-20190830074920-4ef2e798c2d7/go.mod h1:zx/1xUUeYPy3Pcmet8OSXLbF47l+3y6hIPpyLWoR9oc=
github.com/getlantern/hex v0.0.0-20190417191902-c6586a6fe0b7 h1:micT5vkcr9tOVk1FiH8SWKID8ultN44Z+yzd2y/Vyb0=
github.com/getlantern/hex v0.0.0-20190417191902-c6586a6fe0b7/go.mod h1:dD3CgOrwlzca8ed61CsZouQS5h5jIzkK9ZWrTcf0s+o=
github.com/getlantern/hidden v0.0.0-20190325191715-f02dbb02be55 h1:XYzSdCbkzOC0FDNrgJqGRo8PCMFOBFL9py72DRs7bmc=
github.com/getlantern/hidden v0.0.0-20190325191715-f02dbb02be55/go.mod h1:6mmzY2kW1TOOrVy+r41Za2MxXM+hhqTtY3oBKd2AgFA=
github.com/getlantern/ops v0.0.0-20190325191751-d70cb0d6f85f h1:wrYrQttPS8FHIRSlsrcuKazukx/xqO/PpLZzZXsF+EA=
github.com/getlantern/ops v0.0.0-20190325191751-d70cb0d6f85f/go.mod h1:D5ao98qkA6pxftxoqzibIBBrLSUli+kYnJqrgBf9cIA=
github.com/getlantern/systray v1.2.2 h1:dCEHtfmvkJG7HZ8lS/sLklTH4RKUcIsKrAD9sThoEBE=
github.com/getlantern/systray v1.2.2/go.mod h1:pXFOI1wwqwYXEhLPm9ZGjS2u/vVELeIgNMY5HvhHhcE=
github.com/go-stack/stack v1.8.0 h1:5SgMzNM5HxrEjV0ww2lTmX6E2Izsfxas4+YHWRs3Lsk=
github.com/go-stack/stack v1.8.0/go.mod h1:v0f6uXyyMGvRgIKkXu+yp6POWl0qKG85gN/melR3HDY=
github.com/lxn/walk v0.0.0-20210112085537-c389da54e794/go.mod h1:E23UucZGqpuUANJooIbHWCufXvOcT6E7Stq81gU+CSQ=
github.com/lxn/win v0.0.0-20210218163916-a377121e959e/go.mod h1:KxxjdtRkfNoYDCUP5ryK7XJJNTnpC8atvtmTheChOtk=
github.com/oxtoacart/bpool v0.0.0-20190530202638-03653db5a59c h1:rp5dCmg/yLR3mgFuSOe4oEnDDmGLROTvMragMUXpTQw=
github.com/oxtoacart/bpool v0.0.0-20190530202638-03653db5a59c/go.mod h1:X07ZCGwUbLaax7L0S3Tw4hpejzu63ZrrQiUe6W0hcy0=
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/skratchdot/open-golang v0.0.0-20200116055534-eef842397966/go.mod h1:sUM3LWHvSMaG192sy56D9F7CNvL7jUJVXoqM1QKLnog=
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
github.com/stretchr/testify v1.3.0 h1:TivCn/peBQ7UY8ooIcPgZFpTNSz0Q2U6UrFlUfqbe0Q=
github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
golang.org/x/sys v0.0.0-20201018230417-eeed37f84f13/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.1.0 h1:kunALQeHf1/185U1i0GOB/fy1IPRDDpuoOOqRReG57U=
golang.org/x/sys v0.1.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
gopkg.in/Knetic/govaluate.v3 v3.0.0/go.mod h1:csKLBORsPbafmSCGTEh3U7Ozmsuq8ZSIlKk1bcqph0E=