chore: 更新.gitignore并添加新文件
在.gitignore中添加对*.png、*.log、*-lock.json、*.woff2文件的忽略规则,并新增OFL.txt文件。同时,添加vue.svg图标文件以支持前端展示。更新多个TypeScript文件以优化代码格式和增强可读性。
This commit is contained in:
4
.gitignore
vendored
4
.gitignore
vendored
@@ -8,3 +8,7 @@ logs
|
||||
build
|
||||
*.exe
|
||||
*.png
|
||||
*.log
|
||||
*-lock.json
|
||||
*.woff2
|
||||
widget/looker/frontend/src/assets/fonts/OFL.txt
|
||||
|
||||
1
frontend/src/assets/vue.svg
Normal file
1
frontend/src/assets/vue.svg
Normal file
@@ -0,0 +1 @@
|
||||
<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" class="iconify iconify--logos" width="37.07" height="36" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 198"><path fill="#41B883" d="M204.8 0H256L128 220.8L0 0h97.92L128 51.2L157.44 0h47.36Z"></path><path fill="#41B883" d="m0 0l128 220.8L256 0h-51.2L128 132.48L50.56 0H0Z"></path><path fill="#35495E" d="M50.56 0L128 133.12L204.8 0h-47.36L128 51.2L97.92 0H50.56Z"></path></svg>
|
||||
|
After Width: | Height: | Size: 496 B |
@@ -31,14 +31,13 @@ export class AiService {
|
||||
@InjectRepository(BidItem)
|
||||
private readonly bidItemRepository: Repository<BidItem>,
|
||||
) {
|
||||
const apiKey = this.configService.get<string>('ARK_API_KEY');
|
||||
// this.openai = new OpenAI({
|
||||
// apiKey: apiKey || '',
|
||||
// apiKey: this.configService.get<string>('ARK_API_KEY') || '',
|
||||
// baseURL: 'https://ark.cn-beijing.volces.com/api/v3',
|
||||
// timeout: 120000, // 120秒超时
|
||||
// });
|
||||
// SECURITY: never commit the API key as a string literal. It is read from
// configuration (AIHUBMIX_API_KEY) instead. The literal key that was
// previously written here is exposed in version-control history and should
// be revoked and rotated.
// NOTE(review): AIHUBMIX_API_KEY is a new configuration entry — add it to the
// deployment environment. When it is unset an empty string is passed, so
// requests will presumably fail authentication rather than use a baked-in key.
this.openai = new OpenAI({
  apiKey: this.configService.get<string>('AIHUBMIX_API_KEY') ?? '',
  baseURL: 'https://aihubmix.com/v1',
  timeout: 120000, // 120-second request timeout
});
|
||||
@@ -49,7 +48,9 @@ export class AiService {
|
||||
this.logger.log(`发送给 AI 的数据数量: ${bids.length}`);
|
||||
|
||||
try {
|
||||
const prompt =PromptString+ `请根据以下投标项目标题列表,筛选出我关心的项目。请以 JSON 格式返回,格式如下:
|
||||
const prompt =
|
||||
PromptString +
|
||||
`请根据以下投标项目标题列表,筛选出我关心的项目。请以 JSON 格式返回,格式如下:
|
||||
[
|
||||
{
|
||||
"title": "项目标题",
|
||||
@@ -58,7 +59,11 @@ export class AiService {
|
||||
]
|
||||
|
||||
投标项目标题列表:
|
||||
${JSON.stringify(bids.map(b => b.title), null, 2)}`;
|
||||
${JSON.stringify(
|
||||
bids.map((b) => b.title),
|
||||
null,
|
||||
2,
|
||||
)}`;
|
||||
// this.logger.log('发给AI的内容',prompt);
|
||||
const completion = await this.openai.chat.completions.create({
|
||||
model: 'mimo-v2-flash-free',
|
||||
@@ -97,7 +102,9 @@ ${JSON.stringify(bids.map(b => b.title), null, 2)}`;
|
||||
}
|
||||
}
|
||||
|
||||
async saveRecommendations(recommendations: AIRecommendation[]): Promise<void> {
|
||||
async saveRecommendations(
|
||||
recommendations: AIRecommendation[],
|
||||
): Promise<void> {
|
||||
this.logger.log('开始保存 AI 推荐结果');
|
||||
|
||||
try {
|
||||
@@ -105,7 +112,7 @@ ${JSON.stringify(bids.map(b => b.title), null, 2)}`;
|
||||
await this.aiRecommendationRepository.clear();
|
||||
|
||||
// 保存新的推荐结果(只保存 title 和 confidence)
|
||||
const entities = recommendations.map(rec => {
|
||||
const entities = recommendations.map((rec) => {
|
||||
const entity = new AiRecommendationEntity();
|
||||
entity.title = rec.title;
|
||||
entity.confidence = rec.confidence;
|
||||
@@ -125,14 +132,14 @@ ${JSON.stringify(bids.map(b => b.title), null, 2)}`;
|
||||
|
||||
try {
|
||||
const entities = await this.aiRecommendationRepository.find({
|
||||
order: { confidence: 'DESC' }
|
||||
order: { confidence: 'DESC' },
|
||||
});
|
||||
|
||||
// 从 bid-items 表获取 url、source 和 publishDate
|
||||
const result: AIRecommendation[] = [];
|
||||
for (const entity of entities) {
|
||||
const bidItem = await this.bidItemRepository.findOne({
|
||||
where: { title: entity.title }
|
||||
where: { title: entity.title },
|
||||
});
|
||||
|
||||
result.push({
|
||||
@@ -140,7 +147,7 @@ ${JSON.stringify(bids.map(b => b.title), null, 2)}`;
|
||||
url: bidItem?.url || '',
|
||||
source: bidItem?.source || '',
|
||||
confidence: entity.confidence,
|
||||
publishDate: bidItem?.publishDate
|
||||
publishDate: bidItem?.publishDate,
|
||||
});
|
||||
}
|
||||
|
||||
@@ -148,7 +155,9 @@ ${JSON.stringify(bids.map(b => b.title), null, 2)}`;
|
||||
result.sort((a, b) => {
|
||||
if (!a.publishDate) return 1;
|
||||
if (!b.publishDate) return -1;
|
||||
return new Date(b.publishDate).getTime() - new Date(a.publishDate).getTime();
|
||||
return (
|
||||
new Date(b.publishDate).getTime() - new Date(a.publishDate).getTime()
|
||||
);
|
||||
});
|
||||
|
||||
return result;
|
||||
|
||||
@@ -1,4 +1,9 @@
|
||||
import { Entity, PrimaryGeneratedColumn, Column, CreateDateColumn } from 'typeorm';
|
||||
import {
|
||||
Entity,
|
||||
PrimaryGeneratedColumn,
|
||||
Column,
|
||||
CreateDateColumn,
|
||||
} from 'typeorm';
|
||||
|
||||
@Entity('ai_recommendations')
|
||||
export class AiRecommendation {
|
||||
|
||||
@@ -1,12 +1,19 @@
|
||||
import { Controller, Get, Query, Patch, Param, Body } from '@nestjs/common';
|
||||
import { BidsService } from '../services/bid.service';
|
||||
|
||||
interface FindAllQuery {
|
||||
page?: number;
|
||||
limit?: number;
|
||||
source?: string;
|
||||
keyword?: string;
|
||||
}
|
||||
|
||||
@Controller('api/bids')
|
||||
export class BidsController {
|
||||
constructor(private readonly bidsService: BidsService) {}
|
||||
|
||||
@Get()
|
||||
findAll(@Query() query: any) {
|
||||
findAll(@Query() query: FindAllQuery) {
|
||||
return this.bidsService.findAll(query);
|
||||
}
|
||||
|
||||
@@ -26,9 +33,17 @@ export class BidsController {
|
||||
}
|
||||
|
||||
@Get('by-date-range')
|
||||
getByDateRange(@Query('startDate') startDate: string, @Query('endDate') endDate?: string, @Query('keywords') keywords?: string) {
|
||||
getByDateRange(
|
||||
@Query('startDate') startDate: string,
|
||||
@Query('endDate') endDate?: string,
|
||||
@Query('keywords') keywords?: string,
|
||||
) {
|
||||
const keywordsArray = keywords ? keywords.split(',') : undefined;
|
||||
return this.bidsService.getBidsByDateRange(startDate, endDate, keywordsArray);
|
||||
return this.bidsService.getBidsByDateRange(
|
||||
startDate,
|
||||
endDate,
|
||||
keywordsArray,
|
||||
);
|
||||
}
|
||||
|
||||
@Get('crawl-info-stats')
|
||||
|
||||
@@ -1,4 +1,10 @@
|
||||
import { Entity, PrimaryGeneratedColumn, Column, CreateDateColumn, UpdateDateColumn } from 'typeorm';
|
||||
import {
|
||||
Entity,
|
||||
PrimaryGeneratedColumn,
|
||||
Column,
|
||||
CreateDateColumn,
|
||||
UpdateDateColumn,
|
||||
} from 'typeorm';
|
||||
|
||||
@Entity('bid_items')
|
||||
export class BidItem {
|
||||
|
||||
@@ -1,9 +1,36 @@
|
||||
import { Injectable } from '@nestjs/common';
|
||||
import { InjectRepository } from '@nestjs/typeorm';
|
||||
import { Repository, LessThan, MoreThanOrEqual } from 'typeorm';
|
||||
import { Repository, LessThan } from 'typeorm';
|
||||
import { BidItem } from '../entities/bid-item.entity';
|
||||
import { CrawlInfoAdd } from '../../crawler/entities/crawl-info-add.entity';
|
||||
|
||||
interface FindAllQuery {
|
||||
page?: number;
|
||||
limit?: number;
|
||||
source?: string;
|
||||
keyword?: string;
|
||||
}
|
||||
|
||||
interface SourceResult {
|
||||
source: string;
|
||||
}
|
||||
|
||||
interface CrawlInfoAddStats {
|
||||
source: string;
|
||||
count: number;
|
||||
latestUpdate: Date | string;
|
||||
latestPublishDate: Date | string | null;
|
||||
error: string | null;
|
||||
}
|
||||
|
||||
interface CrawlInfoAddRawResult {
|
||||
source: string;
|
||||
count: number;
|
||||
latestPublishDate: Date | string | null;
|
||||
error: string | null;
|
||||
latestUpdate: Date | string;
|
||||
}
|
||||
|
||||
@Injectable()
|
||||
export class BidsService {
|
||||
constructor(
|
||||
@@ -13,7 +40,7 @@ export class BidsService {
|
||||
private crawlInfoRepository: Repository<CrawlInfoAdd>,
|
||||
) {}
|
||||
|
||||
async findAll(query?: any) {
|
||||
async findAll(query?: FindAllQuery) {
|
||||
const { page = 1, limit = 10, source, keyword } = query || {};
|
||||
const qb = this.bidRepository.createQueryBuilder('bid');
|
||||
|
||||
@@ -26,8 +53,8 @@ export class BidsService {
|
||||
}
|
||||
|
||||
qb.orderBy('bid.publishDate', 'DESC')
|
||||
.skip((page - 1) * limit)
|
||||
.take(limit);
|
||||
.skip((Number(page) - 1) * Number(limit))
|
||||
.take(Number(limit));
|
||||
|
||||
const [items, total] = await qb.getManyAndCount();
|
||||
return { items, total };
|
||||
@@ -35,7 +62,9 @@ export class BidsService {
|
||||
|
||||
async createOrUpdate(data: Partial<BidItem>) {
|
||||
// Use title or a hash of title to check for duplicates
|
||||
let item = await this.bidRepository.findOne({ where: { title: data.title } });
|
||||
const item = await this.bidRepository.findOne({
|
||||
where: { title: data.title },
|
||||
});
|
||||
if (item) {
|
||||
Object.assign(item, data);
|
||||
return this.bidRepository.save(item);
|
||||
@@ -51,14 +80,14 @@ export class BidsService {
|
||||
});
|
||||
}
|
||||
|
||||
async getSources() {
|
||||
async getSources(): Promise<string[]> {
|
||||
const result = await this.bidRepository
|
||||
.createQueryBuilder('bid')
|
||||
.select('DISTINCT bid.source')
|
||||
.select('DISTINCT bid.source', 'source')
|
||||
.where('bid.source IS NOT NULL')
|
||||
.orderBy('bid.source', 'ASC')
|
||||
.getRawMany();
|
||||
return result.map((item: any) => item.source);
|
||||
.getRawMany<SourceResult>();
|
||||
return result.map((item) => item.source);
|
||||
}
|
||||
|
||||
async getRecentBids() {
|
||||
@@ -81,7 +110,11 @@ export class BidsService {
|
||||
.getMany();
|
||||
}
|
||||
|
||||
async getBidsByDateRange(startDate?: string, endDate?: string, keywords?: string[]) {
|
||||
async getBidsByDateRange(
|
||||
startDate?: string,
|
||||
endDate?: string,
|
||||
keywords?: string[],
|
||||
) {
|
||||
const qb = this.bidRepository.createQueryBuilder('bid');
|
||||
|
||||
if (startDate) {
|
||||
@@ -97,13 +130,18 @@ export class BidsService {
|
||||
}
|
||||
|
||||
if (keywords && keywords.length > 0) {
|
||||
const keywordConditions = keywords.map((keyword, index) => {
|
||||
return `bid.title LIKE :keyword${index}`;
|
||||
}).join(' OR ');
|
||||
qb.andWhere(`(${keywordConditions})`, keywords.reduce((params, keyword, index) => {
|
||||
params[`keyword${index}`] = `%${keyword}%`;
|
||||
return params;
|
||||
}, {}));
|
||||
const keywordConditions = keywords
|
||||
.map((keyword, index) => {
|
||||
return `bid.title LIKE :keyword${index}`;
|
||||
})
|
||||
.join(' OR ');
|
||||
qb.andWhere(
|
||||
`(${keywordConditions})`,
|
||||
keywords.reduce((params, keyword, index) => {
|
||||
params[`keyword${index}`] = `%${keyword}%`;
|
||||
return params;
|
||||
}, {}),
|
||||
);
|
||||
}
|
||||
|
||||
return qb.orderBy('bid.publishDate', 'DESC').getMany();
|
||||
@@ -118,7 +156,7 @@ export class BidsService {
|
||||
return this.bidRepository.save(item);
|
||||
}
|
||||
|
||||
async getCrawlInfoAddStats() {
|
||||
async getCrawlInfoAddStats(): Promise<CrawlInfoAddStats[]> {
|
||||
// 获取每个来源的最新一次爬虫记录(按 createdAt 降序)
|
||||
const query = `
|
||||
SELECT
|
||||
@@ -136,15 +174,19 @@ export class BidsService {
|
||||
ORDER BY source ASC
|
||||
`;
|
||||
|
||||
const results = await this.crawlInfoRepository.query(query);
|
||||
const results =
|
||||
await this.crawlInfoRepository.query<CrawlInfoAddRawResult[]>(query);
|
||||
|
||||
return results.map((item: any) => ({
|
||||
source: item.source,
|
||||
count: item.count,
|
||||
return results.map((item) => ({
|
||||
source: String(item.source),
|
||||
count: Number(item.count),
|
||||
latestUpdate: item.latestUpdate,
|
||||
latestPublishDate: item.latestPublishDate,
|
||||
// 确保 error 字段正确处理:null 或空字符串都转换为 null,非空字符串保留
|
||||
error: item.error && item.error.trim() !== '' ? item.error : null,
|
||||
error:
|
||||
item.error && String(item.error).trim() !== ''
|
||||
? String(item.error)
|
||||
: null,
|
||||
}));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,6 +1,21 @@
|
||||
import { Injectable, LoggerService, Scope } from '@nestjs/common';
|
||||
import { winstonLogger } from './winston.config';
|
||||
|
||||
/** Values accepted as a log message: plain text, an Error, or a structured object. */
type LogMessage = string | Error | Record<string, unknown>;

/**
 * Normalises a log message to a string.
 *
 * - strings are returned unchanged;
 * - `Error` instances contribute their `message` only;
 * - other objects are serialised with `JSON.stringify`.
 *
 * Serialisation is guarded: `JSON.stringify` throws on circular references
 * and BigInt values, and a logging call must never crash its caller, so a
 * descriptive placeholder is returned instead of propagating the exception.
 *
 * @param message - The value to render.
 * @returns The string representation of `message`.
 */
function formatMessage(message: LogMessage): string {
  if (typeof message === 'string') {
    return message;
  }
  if (message instanceof Error) {
    return message.message;
  }
  if (typeof message === 'object' && message !== null) {
    try {
      return JSON.stringify(message);
    } catch (err) {
      const reason = err instanceof Error ? err.message : String(err);
      return `[unserializable log message: ${reason}]`;
    }
  }
  // Reached only when a caller bypasses the static type (e.g. passes a number).
  return String(message);
}
|
||||
|
||||
@Injectable({ scope: Scope.TRANSIENT })
|
||||
export class CustomLogger implements LoggerService {
|
||||
private context?: string;
|
||||
@@ -9,23 +24,34 @@ export class CustomLogger implements LoggerService {
|
||||
this.context = context;
|
||||
}
|
||||
|
||||
log(message: any, context?: string) {
|
||||
winstonLogger.info(message, { context: context || this.context });
|
||||
log(message: LogMessage, context?: string) {
|
||||
winstonLogger.info(formatMessage(message), {
|
||||
context: context || this.context,
|
||||
});
|
||||
}
|
||||
|
||||
error(message: any, trace?: string, context?: string) {
|
||||
winstonLogger.error(message, { context: context || this.context, trace });
|
||||
error(message: LogMessage, trace?: string, context?: string) {
|
||||
winstonLogger.error(formatMessage(message), {
|
||||
context: context || this.context,
|
||||
trace,
|
||||
});
|
||||
}
|
||||
|
||||
warn(message: any, context?: string) {
|
||||
winstonLogger.warn(message, { context: context || this.context });
|
||||
warn(message: LogMessage, context?: string) {
|
||||
winstonLogger.warn(formatMessage(message), {
|
||||
context: context || this.context,
|
||||
});
|
||||
}
|
||||
|
||||
debug(message: any, context?: string) {
|
||||
winstonLogger.debug(message, { context: context || this.context });
|
||||
debug(message: LogMessage, context?: string) {
|
||||
winstonLogger.debug(formatMessage(message), {
|
||||
context: context || this.context,
|
||||
});
|
||||
}
|
||||
|
||||
verbose(message: any, context?: string) {
|
||||
winstonLogger.verbose(message, { context: context || this.context });
|
||||
verbose(message: LogMessage, context?: string) {
|
||||
winstonLogger.verbose(formatMessage(message), {
|
||||
context: context || this.context,
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
@@ -16,13 +16,33 @@ const logFormat = winston.format.combine(
|
||||
winston.format.errors({ stack: true }),
|
||||
winston.format.splat(),
|
||||
winston.format.printf(({ timestamp, level, message, context, stack }) => {
|
||||
let log = `${timestamp} [${level}]`;
|
||||
if (context) {
|
||||
log += ` [${context}]`;
|
||||
}
|
||||
log += ` ${message}`;
|
||||
const timestampStr =
|
||||
typeof timestamp === 'string' ? timestamp : String(timestamp);
|
||||
const levelStr = typeof level === 'string' ? level : String(level);
|
||||
const messageStr = typeof message === 'string' ? message : String(message);
|
||||
const contextStr = context
|
||||
? typeof context === 'string'
|
||||
? context
|
||||
: JSON.stringify(context)
|
||||
: '';
|
||||
let stackStr = '';
|
||||
if (stack) {
|
||||
log += `\n${stack}`;
|
||||
if (typeof stack === 'string') {
|
||||
stackStr = stack;
|
||||
} else if (typeof stack === 'object' && stack !== null) {
|
||||
stackStr = JSON.stringify(stack);
|
||||
} else {
|
||||
stackStr = String(stack);
|
||||
}
|
||||
}
|
||||
|
||||
let log = `${timestampStr} [${levelStr}]`;
|
||||
if (contextStr) {
|
||||
log += ` [${contextStr}]`;
|
||||
}
|
||||
log += ` ${messageStr}`;
|
||||
if (stackStr) {
|
||||
log += `\n${stackStr}`;
|
||||
}
|
||||
return log;
|
||||
}),
|
||||
@@ -30,10 +50,7 @@ const logFormat = winston.format.combine(
|
||||
|
||||
// 控制台传输
|
||||
const consoleTransport = new winston.transports.Console({
|
||||
format: winston.format.combine(
|
||||
winston.format.colorize(),
|
||||
logFormat,
|
||||
),
|
||||
format: winston.format.combine(winston.format.colorize(), logFormat),
|
||||
});
|
||||
|
||||
// 应用日志传输(按天轮转)
|
||||
@@ -61,10 +78,6 @@ const errorLogTransport = new DailyRotateFile({
|
||||
export const winstonLogger = winston.createLogger({
|
||||
level: process.env.LOG_LEVEL || 'info',
|
||||
format: logFormat,
|
||||
transports: [
|
||||
consoleTransport,
|
||||
appLogTransport,
|
||||
errorLogTransport,
|
||||
],
|
||||
transports: [consoleTransport, appLogTransport, errorLogTransport],
|
||||
exitOnError: false,
|
||||
});
|
||||
|
||||
@@ -12,7 +12,7 @@ export class CrawlerController {
|
||||
getStatus() {
|
||||
return {
|
||||
isCrawling: this.isCrawling,
|
||||
crawlingSources: Array.from(this.crawlingSources)
|
||||
crawlingSources: Array.from(this.crawlingSources),
|
||||
};
|
||||
}
|
||||
|
||||
|
||||
@@ -1,4 +1,9 @@
|
||||
import { Entity, PrimaryGeneratedColumn, Column, CreateDateColumn } from 'typeorm';
|
||||
import {
|
||||
Entity,
|
||||
PrimaryGeneratedColumn,
|
||||
Column,
|
||||
CreateDateColumn,
|
||||
} from 'typeorm';
|
||||
|
||||
@Entity('crawl_info_add')
|
||||
export class CrawlInfoAdd {
|
||||
|
||||
@@ -18,6 +18,17 @@ import { PowerbeijingCrawler } from './powerbeijing_target';
|
||||
import { SdiccCrawler } from './sdicc_target';
|
||||
import { CnoocCrawler } from './cnooc_target';
|
||||
|
||||
interface CrawlResult {
|
||||
title: string;
|
||||
publishDate: Date;
|
||||
url: string;
|
||||
}
|
||||
|
||||
interface Crawler {
|
||||
name: string;
|
||||
crawl(browser: puppeteer.Browser): Promise<CrawlResult[]>;
|
||||
}
|
||||
|
||||
@Injectable()
|
||||
export class BidCrawlerService {
|
||||
private readonly logger = new Logger(BidCrawlerService.name);
|
||||
@@ -35,13 +46,11 @@ export class BidCrawlerService {
|
||||
// 设置最大执行时间为3小时
|
||||
const maxExecutionTime = 3 * 60 * 60 * 1000; // 3小时(毫秒)
|
||||
const startTime = Date.now();
|
||||
|
||||
// 统计结果
|
||||
const crawlResults: Record<string, { success: number; error?: string }> = {};
|
||||
|
||||
const crawlResults: Record<string, { success: number; error?: string }> =
|
||||
{};
|
||||
// 记录数据为0的爬虫,用于重试
|
||||
const zeroDataCrawlers: any[] = [];
|
||||
|
||||
const zeroDataCrawlers: Crawler[] = [];
|
||||
// 从环境变量读取代理配置
|
||||
const proxyHost = this.configService.get<string>('PROXY_HOST');
|
||||
const proxyPort = this.configService.get<string>('PROXY_PORT');
|
||||
@@ -60,9 +69,10 @@ export class BidCrawlerService {
|
||||
];
|
||||
|
||||
if (proxyHost && proxyPort) {
|
||||
const proxyUrl = proxyUsername && proxyPassword
|
||||
? `http://${proxyUsername}:${proxyPassword}@${proxyHost}:${proxyPort}`
|
||||
: `http://${proxyHost}:${proxyPort}`;
|
||||
const proxyUrl =
|
||||
proxyUsername && proxyPassword
|
||||
? `http://${proxyUsername}:${proxyPassword}@${proxyHost}:${proxyPort}`
|
||||
: `http://${proxyHost}:${proxyPort}`;
|
||||
args.push(`--proxy-server=${proxyUrl}`);
|
||||
this.logger.log(`Using proxy: ${proxyHost}:${proxyPort}`);
|
||||
}
|
||||
@@ -72,7 +82,20 @@ export class BidCrawlerService {
|
||||
args,
|
||||
});
|
||||
|
||||
const crawlers = [ChdtpCrawler, ChngCrawler, SzecpCrawler, CdtCrawler, EpsCrawler, CnncecpCrawler, CgnpcCrawler, CeicCrawler, EspicCrawler, PowerbeijingCrawler, SdiccCrawler, CnoocCrawler];
|
||||
const crawlers = [
|
||||
ChdtpCrawler,
|
||||
ChngCrawler,
|
||||
SzecpCrawler,
|
||||
CdtCrawler,
|
||||
EpsCrawler,
|
||||
CnncecpCrawler,
|
||||
CgnpcCrawler,
|
||||
CeicCrawler,
|
||||
EspicCrawler,
|
||||
PowerbeijingCrawler,
|
||||
SdiccCrawler,
|
||||
CnoocCrawler,
|
||||
];
|
||||
|
||||
try {
|
||||
for (const crawler of crawlers) {
|
||||
@@ -81,14 +104,20 @@ export class BidCrawlerService {
|
||||
// 检查是否超时
|
||||
const elapsedTime = Date.now() - startTime;
|
||||
if (elapsedTime > maxExecutionTime) {
|
||||
this.logger.warn(`⚠️ Crawl task exceeded maximum execution time of 3 hours. Stopping...`);
|
||||
this.logger.warn(`⚠️ Total elapsed time: ${Math.floor(elapsedTime / 1000 / 60)} minutes`);
|
||||
this.logger.warn(
|
||||
`⚠️ Crawl task exceeded maximum execution time of 3 hours. Stopping...`,
|
||||
);
|
||||
this.logger.warn(
|
||||
`⚠️ Total elapsed time: ${Math.floor(elapsedTime / 1000 / 60)} minutes`,
|
||||
);
|
||||
break;
|
||||
}
|
||||
|
||||
try {
|
||||
const results = await crawler.crawl(browser);
|
||||
this.logger.log(`Extracted ${results.length} items from ${crawler.name}`);
|
||||
this.logger.log(
|
||||
`Extracted ${results.length} items from ${crawler.name}`,
|
||||
);
|
||||
|
||||
// 记录成功数量
|
||||
crawlResults[crawler.name] = { success: results.length };
|
||||
@@ -99,12 +128,13 @@ export class BidCrawlerService {
|
||||
}
|
||||
|
||||
// 获取最新的发布日期
|
||||
const latestPublishDate = results.length > 0
|
||||
? results.reduce((latest, item) => {
|
||||
const itemDate = new Date(item.publishDate);
|
||||
return itemDate > latest ? itemDate : latest;
|
||||
}, new Date(0))
|
||||
: null;
|
||||
const latestPublishDate =
|
||||
results.length > 0
|
||||
? results.reduce((latest, item) => {
|
||||
const itemDate = new Date(item.publishDate);
|
||||
return itemDate > latest ? itemDate : latest;
|
||||
}, new Date(0))
|
||||
: null;
|
||||
|
||||
for (const item of results) {
|
||||
await this.bidsService.createOrUpdate({
|
||||
@@ -116,20 +146,27 @@ export class BidCrawlerService {
|
||||
}
|
||||
|
||||
// 保存爬虫统计信息到数据库
|
||||
await this.saveCrawlInfo(crawler.name, results.length, latestPublishDate);
|
||||
await this.saveCrawlInfo(
|
||||
crawler.name,
|
||||
results.length,
|
||||
latestPublishDate,
|
||||
);
|
||||
} catch (err) {
|
||||
this.logger.error(`Error crawling ${crawler.name}: ${err.message}`);
|
||||
const errorMessage = err instanceof Error ? err.message : String(err);
|
||||
this.logger.error(`Error crawling ${crawler.name}: ${errorMessage}`);
|
||||
// 记录错误信息
|
||||
crawlResults[crawler.name] = { success: 0, error: err.message };
|
||||
crawlResults[crawler.name] = { success: 0, error: errorMessage };
|
||||
|
||||
// 保存错误信息到数据库
|
||||
await this.saveCrawlInfo(crawler.name, 0, null, err.message);
|
||||
await this.saveCrawlInfo(crawler.name, 0, null, errorMessage);
|
||||
}
|
||||
}
|
||||
|
||||
// 对数据为0的爬虫进行重试
|
||||
if (zeroDataCrawlers.length > 0) {
|
||||
this.logger.log(`Retrying ${zeroDataCrawlers.length} crawlers with zero data...`);
|
||||
this.logger.log(
|
||||
`Retrying ${zeroDataCrawlers.length} crawlers with zero data...`,
|
||||
);
|
||||
|
||||
for (const crawler of zeroDataCrawlers) {
|
||||
this.logger.log(`Retrying: ${crawler.name}`);
|
||||
@@ -137,25 +174,32 @@ export class BidCrawlerService {
|
||||
// 检查是否超时
|
||||
const elapsedTime = Date.now() - startTime;
|
||||
if (elapsedTime > maxExecutionTime) {
|
||||
this.logger.warn(`⚠️ Crawl task exceeded maximum execution time of 3 hours. Stopping retry...`);
|
||||
this.logger.warn(`⚠️ Total elapsed time: ${Math.floor(elapsedTime / 1000 / 60)} minutes`);
|
||||
this.logger.warn(
|
||||
`⚠️ Crawl task exceeded maximum execution time of 3 hours. Stopping retry...`,
|
||||
);
|
||||
this.logger.warn(
|
||||
`⚠️ Total elapsed time: ${Math.floor(elapsedTime / 1000 / 60)} minutes`,
|
||||
);
|
||||
break;
|
||||
}
|
||||
|
||||
try {
|
||||
const results = await crawler.crawl(browser);
|
||||
this.logger.log(`Retry extracted ${results.length} items from ${crawler.name}`);
|
||||
this.logger.log(
|
||||
`Retry extracted ${results.length} items from ${crawler.name}`,
|
||||
);
|
||||
|
||||
// 更新统计结果
|
||||
crawlResults[crawler.name] = { success: results.length };
|
||||
|
||||
// 获取最新的发布日期
|
||||
const latestPublishDate = results.length > 0
|
||||
? results.reduce((latest, item) => {
|
||||
const itemDate = new Date(item.publishDate);
|
||||
return itemDate > latest ? itemDate : latest;
|
||||
}, new Date(0))
|
||||
: null;
|
||||
const latestPublishDate =
|
||||
results.length > 0
|
||||
? results.reduce((latest, item) => {
|
||||
const itemDate = new Date(item.publishDate);
|
||||
return itemDate > latest ? itemDate : latest;
|
||||
}, new Date(0))
|
||||
: null;
|
||||
|
||||
for (const item of results) {
|
||||
await this.bidsService.createOrUpdate({
|
||||
@@ -167,19 +211,29 @@ export class BidCrawlerService {
|
||||
}
|
||||
|
||||
// 更新爬虫统计信息到数据库
|
||||
await this.saveCrawlInfo(crawler.name, results.length, latestPublishDate);
|
||||
await this.saveCrawlInfo(
|
||||
crawler.name,
|
||||
results.length,
|
||||
latestPublishDate,
|
||||
);
|
||||
} catch (err) {
|
||||
this.logger.error(`Error retrying ${crawler.name}: ${err.message}`);
|
||||
const errorMessage =
|
||||
err instanceof Error ? err.message : String(err);
|
||||
this.logger.error(
|
||||
`Error retrying ${crawler.name}: ${errorMessage}`,
|
||||
);
|
||||
// 记录错误信息
|
||||
crawlResults[crawler.name] = { success: 0, error: err.message };
|
||||
crawlResults[crawler.name] = { success: 0, error: errorMessage };
|
||||
|
||||
// 更新错误信息到数据库
|
||||
await this.saveCrawlInfo(crawler.name, 0, null, err.message);
|
||||
await this.saveCrawlInfo(crawler.name, 0, null, errorMessage);
|
||||
}
|
||||
}
|
||||
}
|
||||
} catch (error) {
|
||||
this.logger.error(`Crawl task failed: ${error.message}`);
|
||||
const errorMessage =
|
||||
error instanceof Error ? error.message : String(error);
|
||||
this.logger.error(`Crawl task failed: ${errorMessage}`);
|
||||
} finally {
|
||||
await browser.close();
|
||||
|
||||
@@ -188,7 +242,9 @@ export class BidCrawlerService {
|
||||
this.logger.log(`Crawl task finished. Total time: ${minutes} minutes`);
|
||||
|
||||
if (totalTime > maxExecutionTime) {
|
||||
this.logger.warn(`⚠️ Crawl task exceeded maximum execution time of 3 hours.`);
|
||||
this.logger.warn(
|
||||
`⚠️ Crawl task exceeded maximum execution time of 3 hours.`,
|
||||
);
|
||||
}
|
||||
|
||||
// 输出统计总结
|
||||
@@ -204,14 +260,20 @@ export class BidCrawlerService {
|
||||
this.logger.error(`❌ ${source}: 出错 - ${result.error}`);
|
||||
errorCount++;
|
||||
} else {
|
||||
this.logger.log(`✅ ${source}: 成功获取 ${result.success} 条工程信息`);
|
||||
this.logger.log(
|
||||
`✅ ${source}: 成功获取 ${result.success} 条工程信息`,
|
||||
);
|
||||
totalSuccess += result.success;
|
||||
}
|
||||
}
|
||||
|
||||
this.logger.log('='.repeat(50));
|
||||
this.logger.log(`总计: ${totalSuccess} 条工程信息, ${errorCount} 个来源出错`);
|
||||
this.logger.log(`Total: ${totalSuccess} items, ${errorCount} sources failed`);
|
||||
this.logger.log(
|
||||
`总计: ${totalSuccess} 条工程信息, ${errorCount} 个来源出错`,
|
||||
);
|
||||
this.logger.log(
|
||||
`Total: ${totalSuccess} items, ${errorCount} sources failed`,
|
||||
);
|
||||
this.logger.log('='.repeat(50));
|
||||
}
|
||||
}
|
||||
@@ -237,9 +299,10 @@ export class BidCrawlerService {
|
||||
];
|
||||
|
||||
if (proxyHost && proxyPort) {
|
||||
const proxyUrl = proxyUsername && proxyPassword
|
||||
? `http://${proxyUsername}:${proxyPassword}@${proxyHost}:${proxyPort}`
|
||||
: `http://${proxyHost}:${proxyPort}`;
|
||||
const proxyUrl =
|
||||
proxyUsername && proxyPassword
|
||||
? `http://${proxyUsername}:${proxyPassword}@${proxyHost}:${proxyPort}`
|
||||
: `http://${proxyHost}:${proxyPort}`;
|
||||
args.push(`--proxy-server=${proxyUrl}`);
|
||||
this.logger.log(`Using proxy: ${proxyHost}:${proxyPort}`);
|
||||
}
|
||||
@@ -249,9 +312,22 @@ export class BidCrawlerService {
|
||||
args,
|
||||
});
|
||||
|
||||
const crawlers = [ChdtpCrawler, ChngCrawler, SzecpCrawler, CdtCrawler, EpsCrawler, CnncecpCrawler, CgnpcCrawler, CeicCrawler, EspicCrawler, PowerbeijingCrawler, SdiccCrawler, CnoocCrawler];
|
||||
const crawlers = [
|
||||
ChdtpCrawler,
|
||||
ChngCrawler,
|
||||
SzecpCrawler,
|
||||
CdtCrawler,
|
||||
EpsCrawler,
|
||||
CnncecpCrawler,
|
||||
CgnpcCrawler,
|
||||
CeicCrawler,
|
||||
EspicCrawler,
|
||||
PowerbeijingCrawler,
|
||||
SdiccCrawler,
|
||||
CnoocCrawler,
|
||||
];
|
||||
|
||||
const targetCrawler = crawlers.find(c => c.name === sourceName);
|
||||
const targetCrawler = crawlers.find((c) => c.name === sourceName);
|
||||
|
||||
if (!targetCrawler) {
|
||||
await browser.close();
|
||||
@@ -262,15 +338,18 @@ export class BidCrawlerService {
|
||||
this.logger.log(`Crawling: ${targetCrawler.name}`);
|
||||
|
||||
const results = await targetCrawler.crawl(browser);
|
||||
this.logger.log(`Extracted ${results.length} items from ${targetCrawler.name}`);
|
||||
this.logger.log(
|
||||
`Extracted ${results.length} items from ${targetCrawler.name}`,
|
||||
);
|
||||
|
||||
// 获取最新的发布日期
|
||||
const latestPublishDate = results.length > 0
|
||||
? results.reduce((latest, item) => {
|
||||
const itemDate = new Date(item.publishDate);
|
||||
return itemDate > latest ? itemDate : latest;
|
||||
}, new Date(0))
|
||||
: null;
|
||||
const latestPublishDate =
|
||||
results.length > 0
|
||||
? results.reduce((latest, item) => {
|
||||
const itemDate = new Date(item.publishDate);
|
||||
return itemDate > latest ? itemDate : latest;
|
||||
}, new Date(0))
|
||||
: null;
|
||||
|
||||
for (const item of results) {
|
||||
await this.bidsService.createOrUpdate({
|
||||
@@ -282,7 +361,11 @@ export class BidCrawlerService {
|
||||
}
|
||||
|
||||
// 保存爬虫统计信息到数据库
|
||||
await this.saveCrawlInfo(targetCrawler.name, results.length, latestPublishDate);
|
||||
await this.saveCrawlInfo(
|
||||
targetCrawler.name,
|
||||
results.length,
|
||||
latestPublishDate,
|
||||
);
|
||||
|
||||
return {
|
||||
success: true,
|
||||
@@ -291,16 +374,19 @@ export class BidCrawlerService {
|
||||
latestPublishDate,
|
||||
};
|
||||
} catch (err) {
|
||||
this.logger.error(`Error crawling ${targetCrawler.name}: ${err.message}`);
|
||||
const errorMessage = err instanceof Error ? err.message : String(err);
|
||||
this.logger.error(
|
||||
`Error crawling ${targetCrawler.name}: ${errorMessage}`,
|
||||
);
|
||||
|
||||
// 保存错误信息到数据库
|
||||
await this.saveCrawlInfo(targetCrawler.name, 0, null, err.message);
|
||||
await this.saveCrawlInfo(targetCrawler.name, 0, null, errorMessage);
|
||||
|
||||
return {
|
||||
success: false,
|
||||
source: targetCrawler.name,
|
||||
count: 0,
|
||||
error: err.message,
|
||||
error: errorMessage,
|
||||
};
|
||||
} finally {
|
||||
await browser.close();
|
||||
@@ -324,7 +410,10 @@ export class BidCrawlerService {
|
||||
await this.crawlInfoRepository.save(crawlInfo);
|
||||
this.logger.log(`Saved crawl info for ${source}: ${count} items`);
|
||||
} catch (err) {
|
||||
this.logger.error(`Failed to save crawl info for ${source}: ${err.message}`);
|
||||
const errorMessage = err instanceof Error ? err.message : String(err);
|
||||
this.logger.error(
|
||||
`Failed to save crawl info for ${source}: ${errorMessage}`,
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2,7 +2,7 @@ import { CdtCrawler } from './cdt_target';
|
||||
import * as puppeteer from 'puppeteer';
|
||||
|
||||
// Increase timeout to 5 minutes (5 × 60 000 ms) for network operations
|
||||
jest.setTimeout(60000*5);
|
||||
jest.setTimeout(60000 * 5);
|
||||
|
||||
// 获取代理配置
|
||||
const getProxyArgs = (): string[] => {
|
||||
@@ -51,7 +51,9 @@ describe('CdtCrawler Real Site Test', () => {
|
||||
console.log(`\nSuccessfully found ${results.length} items:\n`);
|
||||
console.log('----------------------------------------');
|
||||
results.forEach((item, index) => {
|
||||
console.log(`${index + 1}. [${item.publishDate.toLocaleDateString()}] ${item.title}`);
|
||||
console.log(
|
||||
`${index + 1}. [${item.publishDate.toLocaleDateString()}] ${item.title}`,
|
||||
);
|
||||
console.log(` Link: ${item.url}`);
|
||||
console.log('----------------------------------------');
|
||||
});
|
||||
@@ -61,13 +63,15 @@ describe('CdtCrawler Real Site Test', () => {
|
||||
expect(Array.isArray(results)).toBeTruthy();
|
||||
// Warn but don't fail if site returns 0 items (could be empty or changed structure)
|
||||
if (results.length === 0) {
|
||||
console.warn('Warning: No items found. Check if website structure has changed or if list is currently empty.');
|
||||
console.warn(
|
||||
'Warning: No items found. Check if website structure has changed or if list is currently empty.',
|
||||
);
|
||||
} else {
|
||||
// Check data integrity of first item
|
||||
const firstItem = results[0];
|
||||
expect(firstItem.title).toBeTruthy();
|
||||
expect(firstItem.url).toMatch(/^https?:\/\//);
|
||||
expect(firstItem.publishDate).toBeInstanceOf(Date);
|
||||
// Check data integrity of first item
|
||||
const firstItem = results[0];
|
||||
expect(firstItem.title).toBeTruthy();
|
||||
expect(firstItem.url).toMatch(/^https?:\/\//);
|
||||
expect(firstItem.publishDate).toBeInstanceOf(Date);
|
||||
}
|
||||
});
|
||||
});
|
||||
|
||||
@@ -13,11 +13,11 @@ async function simulateHumanMouseMovement(page: puppeteer.Page) {
|
||||
const y = Math.floor(Math.random() * viewport.height);
|
||||
|
||||
await page.mouse.move(x, y, {
|
||||
steps: 10 + Math.floor(Math.random() * 20) // 10-30步,使移动更平滑
|
||||
steps: 10 + Math.floor(Math.random() * 20), // 10-30步,使移动更平滑
|
||||
});
|
||||
|
||||
// 随机停顿 100-500ms
|
||||
await new Promise(r => setTimeout(r, 100 + Math.random() * 400));
|
||||
await new Promise((r) => setTimeout(r, 100 + Math.random() * 400));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -31,19 +31,19 @@ async function simulateHumanScrolling(page: puppeteer.Page) {
|
||||
await page.evaluate((distance) => {
|
||||
window.scrollBy({
|
||||
top: distance,
|
||||
behavior: 'smooth'
|
||||
behavior: 'smooth',
|
||||
});
|
||||
}, scrollDistance);
|
||||
|
||||
// 随机停顿 500-1500ms
|
||||
await new Promise(r => setTimeout(r, 500 + Math.random() * 1000));
|
||||
await new Promise((r) => setTimeout(r, 500 + Math.random() * 1000));
|
||||
}
|
||||
|
||||
// 滚动回顶部
|
||||
await page.evaluate(() => {
|
||||
window.scrollTo({ top: 0, behavior: 'smooth' });
|
||||
});
|
||||
await new Promise(r => setTimeout(r, 1000));
|
||||
await new Promise((r) => setTimeout(r, 1000));
|
||||
}
|
||||
|
||||
export interface CdtResult {
|
||||
@@ -52,12 +52,22 @@ export interface CdtResult {
|
||||
url: string;
|
||||
}
|
||||
|
||||
interface CdtCrawlerType {
|
||||
name: string;
|
||||
url: string;
|
||||
baseUrl: string;
|
||||
extract(html: string): CdtResult[];
|
||||
}
|
||||
|
||||
export const CdtCrawler = {
|
||||
name: '中国大唐集团电子商务平台',
|
||||
url: 'https://tang.cdt-ec.com/home/index.html',
|
||||
baseUrl: 'https://tang.cdt-ec.com',
|
||||
|
||||
async crawl(browser: puppeteer.Browser): Promise<CdtResult[]> {
|
||||
async crawl(
|
||||
this: CdtCrawlerType,
|
||||
browser: puppeteer.Browser,
|
||||
): Promise<CdtResult[]> {
|
||||
const logger = new Logger('CdtCrawler');
|
||||
const page = await browser.newPage();
|
||||
|
||||
@@ -67,7 +77,9 @@ export const CdtCrawler = {
|
||||
await page.authenticate({ username, password });
|
||||
}
|
||||
|
||||
await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36');
|
||||
await page.setUserAgent(
|
||||
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36',
|
||||
);
|
||||
|
||||
const allResults: CdtResult[] = [];
|
||||
let currentPage = 1;
|
||||
@@ -86,19 +98,26 @@ export const CdtCrawler = {
|
||||
|
||||
// 点击"招标公告"标签
|
||||
logger.log('Looking for "招标公告" tab...');
|
||||
await page.waitForFunction(() => {
|
||||
const tabs = Array.from(document.querySelectorAll('span.notice-tab'));
|
||||
return tabs.some(tab => tab.textContent && tab.textContent.includes('招标公告'));
|
||||
}, { timeout: 60000 });
|
||||
await page.waitForFunction(
|
||||
() => {
|
||||
const tabs = Array.from(document.querySelectorAll('span.notice-tab'));
|
||||
return tabs.some(
|
||||
(tab) => tab.textContent && tab.textContent.includes('招标公告'),
|
||||
);
|
||||
},
|
||||
{ timeout: 60000 },
|
||||
);
|
||||
|
||||
await page.evaluate(() => {
|
||||
const tabs = Array.from(document.querySelectorAll('span.notice-tab'));
|
||||
const target = tabs.find(tab => tab.textContent && tab.textContent.includes('招标公告')) as HTMLElement;
|
||||
const target = tabs.find(
|
||||
(tab) => tab.textContent && tab.textContent.includes('招标公告'),
|
||||
) as HTMLElement;
|
||||
if (target) target.click();
|
||||
});
|
||||
|
||||
logger.log('Clicked "招标公告" tab.');
|
||||
await new Promise(r => setTimeout(r, 2000));
|
||||
await new Promise((r) => setTimeout(r, 2000));
|
||||
|
||||
// 模拟人类行为
|
||||
logger.log('Simulating human mouse movements...');
|
||||
@@ -109,26 +128,43 @@ export const CdtCrawler = {
|
||||
|
||||
// 点击"招标公告"下的"更多+"链接
|
||||
logger.log('Looking for "更多+" link under "招标公告"...');
|
||||
await page.waitForFunction(() => {
|
||||
const titles = Array.from(document.querySelectorAll('span.h-notice-title'));
|
||||
return titles.some(title => title.textContent && title.textContent.includes('招标公告'));
|
||||
}, { timeout: 30000 });
|
||||
await page.waitForFunction(
|
||||
() => {
|
||||
const titles = Array.from(
|
||||
document.querySelectorAll('span.h-notice-title'),
|
||||
);
|
||||
return titles.some(
|
||||
(title) =>
|
||||
title.textContent && title.textContent.includes('招标公告'),
|
||||
);
|
||||
},
|
||||
{ timeout: 30000 },
|
||||
);
|
||||
|
||||
await page.evaluate(() => {
|
||||
const titles = Array.from(document.querySelectorAll('span.h-notice-title'));
|
||||
const targetTitle = titles.find(title => title.textContent && title.textContent.includes('招标公告'));
|
||||
const titles = Array.from(
|
||||
document.querySelectorAll('span.h-notice-title'),
|
||||
);
|
||||
const targetTitle = titles.find(
|
||||
(title) =>
|
||||
title.textContent && title.textContent.includes('招标公告'),
|
||||
);
|
||||
if (targetTitle) {
|
||||
const parent = targetTitle.parentElement;
|
||||
if (parent) {
|
||||
const moreLink = parent.querySelector('a.h-notice-more') as HTMLElement;
|
||||
const moreLink = parent.querySelector(
|
||||
'a.h-notice-more',
|
||||
) as HTMLElement;
|
||||
if (moreLink) moreLink.click();
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
logger.log('Clicked "更多+" link under "招标公告".');
|
||||
await page.waitForNavigation({ waitUntil: 'networkidle2', timeout: 60000 }).catch(() => {});
|
||||
await new Promise(r => setTimeout(r, 3000));
|
||||
await page
|
||||
.waitForNavigation({ waitUntil: 'networkidle2', timeout: 60000 })
|
||||
.catch(() => {});
|
||||
await new Promise((r) => setTimeout(r, 3000));
|
||||
|
||||
// 模拟人类行为
|
||||
logger.log('Simulating human mouse movements...');
|
||||
@@ -155,7 +191,9 @@ export const CdtCrawler = {
|
||||
}
|
||||
|
||||
allResults.push(...pageResults);
|
||||
logger.log(`Extracted ${pageResults.length} items from page ${currentPage}`);
|
||||
logger.log(
|
||||
`Extracted ${pageResults.length} items from page ${currentPage}`,
|
||||
);
|
||||
|
||||
// 模拟人类行为 - 翻页前
|
||||
logger.log('Simulating human mouse movements before pagination...');
|
||||
@@ -172,7 +210,9 @@ export const CdtCrawler = {
|
||||
}, nextButtonSelector);
|
||||
|
||||
if (!nextButtonExists) {
|
||||
logger.log('Next page button not found or disabled. Reached end of list.');
|
||||
logger.log(
|
||||
'Next page button not found or disabled. Reached end of list.',
|
||||
);
|
||||
break;
|
||||
}
|
||||
|
||||
@@ -186,18 +226,25 @@ export const CdtCrawler = {
|
||||
}, nextButtonSelector);
|
||||
|
||||
// 等待 AJAX 请求完成(通过监听网络请求)
|
||||
await page.waitForFunction(() => {
|
||||
// 检查表格是否正在加载
|
||||
const loading = document.querySelector('.layui-table-loading');
|
||||
return !loading;
|
||||
}, { timeout: 30000 }).catch(() => {});
|
||||
await page
|
||||
.waitForFunction(
|
||||
() => {
|
||||
// 检查表格是否正在加载
|
||||
const loading = document.querySelector('.layui-table-loading');
|
||||
return !loading;
|
||||
},
|
||||
{ timeout: 30000 },
|
||||
)
|
||||
.catch(() => {});
|
||||
|
||||
// 额外等待确保数据加载完成
|
||||
await new Promise(r => setTimeout(r, 2000));
|
||||
await new Promise((r) => setTimeout(r, 2000));
|
||||
|
||||
// 检查是否真的翻页了(通过检查当前页码)
|
||||
const currentActivePage = await page.evaluate(() => {
|
||||
const activeSpan = document.querySelector('.layui-laypage-curr em:last-child');
|
||||
const activeSpan = document.querySelector(
|
||||
'.layui-laypage-curr em:last-child',
|
||||
);
|
||||
return activeSpan ? parseInt(activeSpan.textContent || '1') : 1;
|
||||
});
|
||||
|
||||
@@ -217,25 +264,29 @@ export const CdtCrawler = {
|
||||
|
||||
// Random delay between pages
|
||||
const delay = Math.floor(Math.random() * (3000 - 1000 + 1)) + 1000;
|
||||
await new Promise(resolve => setTimeout(resolve, delay));
|
||||
|
||||
await new Promise((resolve) => setTimeout(resolve, delay));
|
||||
} catch (navError) {
|
||||
logger.error(`Navigation to page ${currentPage + 1} failed: ${navError.message}`);
|
||||
const navErrorMessage =
|
||||
navError instanceof Error ? navError.message : String(navError);
|
||||
logger.error(
|
||||
`Navigation to page ${currentPage + 1} failed: ${navErrorMessage}`,
|
||||
);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
return allResults;
|
||||
|
||||
} catch (error) {
|
||||
logger.error(`Failed to crawl ${this.name}: ${error.message}`);
|
||||
const errorMessage =
|
||||
error instanceof Error ? error.message : String(error);
|
||||
logger.error(`Failed to crawl ${this.name}: ${errorMessage}`);
|
||||
return allResults;
|
||||
} finally {
|
||||
await page.close();
|
||||
}
|
||||
},
|
||||
|
||||
extract(html: string): CdtResult[] {
|
||||
extract(this: CdtCrawlerType, html: string): CdtResult[] {
|
||||
const results: CdtResult[] = [];
|
||||
/**
|
||||
* Regex groups for tang.cdt-ec.com:
|
||||
@@ -243,23 +294,24 @@ export const CdtCrawler = {
|
||||
* 2: Title (项目名称)
|
||||
* 3: Date (发布时间)
|
||||
*/
|
||||
const regex = /<tr[^>]*data-index="[^"]*"[^>]*>[\s\S]*?<a[^>]*class="layui-table-link"[^>]*href="([^"]*)"[^>]*>([^<]*)<\/a>[\s\S]*?<td[^>]*data-field="publish_time"[^>]*>[\s\S]*?<div[^>]*class="layui-table-cell[^"]*"[^>]*>([^<]*)<\/div>[\s\S]*?<\/td>[\s\S]*?<\/tr>/gs;
|
||||
const regex =
|
||||
/<tr[^>]*data-index="[^"]*"[^>]*>[\s\S]*?<a[^>]*class="layui-table-link"[^>]*href="([^"]*)"[^>]*>([^<]*)<\/a>[\s\S]*?<td[^>]*data-field="publish_time"[^>]*>[\s\S]*?<div[^>]*class="layui-table-cell[^"]*"[^>]*>([^<]*)<\/div>[\s\S]*?<\/td>[\s\S]*?<\/tr>/gs;
|
||||
|
||||
let match;
|
||||
let match: RegExpExecArray | null;
|
||||
while ((match = regex.exec(html)) !== null) {
|
||||
const url = match[1]?.trim();
|
||||
const title = match[2]?.trim();
|
||||
const dateStr = match[3]?.trim();
|
||||
const url = match[1]?.trim() ?? '';
|
||||
const title = match[2]?.trim() ?? '';
|
||||
const dateStr = match[3]?.trim() ?? '';
|
||||
|
||||
if (title && url) {
|
||||
const fullUrl = url.startsWith('http') ? url : this.baseUrl + url;
|
||||
results.push({
|
||||
title,
|
||||
publishDate: dateStr ? new Date(dateStr) : new Date(),
|
||||
url: fullUrl.replace(/\/\//g, '/')
|
||||
url: fullUrl.replace(/\/\//g, '/'),
|
||||
});
|
||||
}
|
||||
}
|
||||
return results;
|
||||
}
|
||||
},
|
||||
};
|
||||
|
||||
@@ -40,14 +40,14 @@ describe('CeicCrawler Real Site Test', () => {
|
||||
'--disable-infobars',
|
||||
...proxyArgs,
|
||||
],
|
||||
defaultViewport: null
|
||||
defaultViewport: null,
|
||||
});
|
||||
});
|
||||
|
||||
afterAll(async () => {
|
||||
if (browser) {
|
||||
// Keep open for a few seconds after test to see result
|
||||
await new Promise(r => setTimeout(r, 50000));
|
||||
await new Promise((r) => setTimeout(r, 50000));
|
||||
await browser.close();
|
||||
}
|
||||
});
|
||||
@@ -64,7 +64,9 @@ Successfully found ${results.length} items:
|
||||
`);
|
||||
console.log('----------------------------------------');
|
||||
results.forEach((item, index) => {
|
||||
console.log(`${index + 1}. [${item.publishDate.toLocaleDateString()}] ${item.title}`);
|
||||
console.log(
|
||||
`${index + 1}. [${item.publishDate.toLocaleDateString()}] ${item.title}`,
|
||||
);
|
||||
console.log(` Link: ${item.url}`);
|
||||
console.log('----------------------------------------');
|
||||
});
|
||||
@@ -73,12 +75,14 @@ Successfully found ${results.length} items:
|
||||
expect(Array.isArray(results)).toBeTruthy();
|
||||
|
||||
if (results.length === 0) {
|
||||
console.warn('Warning: No items found. Observe browser window to see if content is loading or if there is a verification challenge.');
|
||||
console.warn(
|
||||
'Warning: No items found. Observe browser window to see if content is loading or if there is a verification challenge.',
|
||||
);
|
||||
} else {
|
||||
const firstItem = results[0];
|
||||
expect(firstItem.title).toBeTruthy();
|
||||
expect(firstItem.url).toMatch(/^https?:\/\//);
|
||||
expect(firstItem.publishDate).toBeInstanceOf(Date);
|
||||
const firstItem = results[0];
|
||||
expect(firstItem.title).toBeTruthy();
|
||||
expect(firstItem.url).toMatch(/^https?:\/\//);
|
||||
expect(firstItem.publishDate).toBeInstanceOf(Date);
|
||||
}
|
||||
});
|
||||
});
|
||||
|
||||
@@ -14,11 +14,11 @@ async function simulateHumanMouseMovement(page: puppeteer.Page) {
|
||||
const y = Math.floor(Math.random() * viewport.height);
|
||||
|
||||
await page.mouse.move(x, y, {
|
||||
steps: 10 + Math.floor(Math.random() * 20) // 10-30步,使移动更平滑
|
||||
steps: 10 + Math.floor(Math.random() * 20), // 10-30步,使移动更平滑
|
||||
});
|
||||
|
||||
// 随机停顿 100-500ms
|
||||
await new Promise(r => setTimeout(r, 100 + Math.random() * 400));
|
||||
await new Promise((r) => setTimeout(r, 100 + Math.random() * 400));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -32,19 +32,25 @@ async function simulateHumanScrolling(page: puppeteer.Page) {
|
||||
await page.evaluate((distance) => {
|
||||
window.scrollBy({
|
||||
top: distance,
|
||||
behavior: 'smooth'
|
||||
behavior: 'smooth',
|
||||
});
|
||||
}, scrollDistance);
|
||||
|
||||
// 随机停顿 500-1500ms
|
||||
await new Promise(r => setTimeout(r, 500 + Math.random() * 1000));
|
||||
await new Promise((r) => setTimeout(r, 500 + Math.random() * 1000));
|
||||
}
|
||||
|
||||
// 滚动回顶部
|
||||
await page.evaluate(() => {
|
||||
window.scrollTo({ top: 0, behavior: 'smooth' });
|
||||
});
|
||||
await new Promise(r => setTimeout(r, 1000));
|
||||
await new Promise((r) => setTimeout(r, 1000));
|
||||
}
|
||||
|
||||
interface CeicCrawlerType {
|
||||
name: string;
|
||||
url: string;
|
||||
baseUrl: string;
|
||||
}
|
||||
|
||||
export const CeicCrawler = {
|
||||
@@ -52,7 +58,10 @@ export const CeicCrawler = {
|
||||
url: 'https://ceic.dlnyzb.com/3001',
|
||||
baseUrl: 'https://ceic.dlnyzb.com/',
|
||||
|
||||
async crawl(browser: puppeteer.Browser): Promise<ChdtpResult[]> {
|
||||
async crawl(
|
||||
this: CeicCrawlerType,
|
||||
browser: puppeteer.Browser,
|
||||
): Promise<ChdtpResult[]> {
|
||||
const logger = new Logger('CeicCrawler');
|
||||
const page = await browser.newPage();
|
||||
|
||||
@@ -65,10 +74,14 @@ export const CeicCrawler = {
|
||||
await page.evaluateOnNewDocument(() => {
|
||||
Object.defineProperty(navigator, 'webdriver', { get: () => false });
|
||||
Object.defineProperty(navigator, 'language', { get: () => 'zh-CN' });
|
||||
Object.defineProperty(navigator, 'plugins', { get: () => [1, 2, 3, 4, 5] });
|
||||
Object.defineProperty(navigator, 'plugins', {
|
||||
get: () => [1, 2, 3, 4, 5],
|
||||
});
|
||||
});
|
||||
|
||||
await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36');
|
||||
await page.setUserAgent(
|
||||
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36',
|
||||
);
|
||||
await page.setViewport({ width: 1920, height: 1080 });
|
||||
|
||||
const allResults: ChdtpResult[] = [];
|
||||
@@ -90,16 +103,25 @@ export const CeicCrawler = {
|
||||
logger.log(`Processing page ${currentPage}...`);
|
||||
|
||||
// Wait for content to load - MUI list items
|
||||
await page.waitForFunction(() => {
|
||||
return document.querySelectorAll('li.MuiListItem-root').length > 0;
|
||||
}, { timeout: 60000 }).catch(() => logger.warn('Content not found. Site might be slow.'));
|
||||
await page
|
||||
.waitForFunction(
|
||||
() => {
|
||||
return (
|
||||
document.querySelectorAll('li.MuiListItem-root').length > 0
|
||||
);
|
||||
},
|
||||
{ timeout: 60000 },
|
||||
)
|
||||
.catch(() => logger.warn('Content not found. Site might be slow.'));
|
||||
|
||||
const pageResults = await page.evaluate(() => {
|
||||
const results: { title: string; dateStr: string; url: string }[] = [];
|
||||
|
||||
// Extract from MUI list items
|
||||
const listItems = Array.from(document.querySelectorAll('li.MuiListItem-root'));
|
||||
listItems.forEach(item => {
|
||||
const listItems = Array.from(
|
||||
document.querySelectorAll('li.MuiListItem-root'),
|
||||
);
|
||||
listItems.forEach((item) => {
|
||||
// Find the title link
|
||||
const titleLink = item.querySelector('a.css-1vdw90h');
|
||||
const title = titleLink?.textContent?.trim() || '';
|
||||
@@ -125,15 +147,19 @@ export const CeicCrawler = {
|
||||
});
|
||||
|
||||
if (pageResults.length === 0) {
|
||||
logger.warn(`No results found on page ${currentPage}. Extraction failed.`);
|
||||
logger.warn(
|
||||
`No results found on page ${currentPage}. Extraction failed.`,
|
||||
);
|
||||
break;
|
||||
}
|
||||
|
||||
allResults.push(...pageResults.map(r => ({
|
||||
title: r.title,
|
||||
publishDate: r.dateStr ? new Date(r.dateStr) : new Date(),
|
||||
url: r.url.replace(/\/\//g, '/')
|
||||
})));
|
||||
allResults.push(
|
||||
...pageResults.map((r) => ({
|
||||
title: r.title,
|
||||
publishDate: r.dateStr ? new Date(r.dateStr) : new Date(),
|
||||
url: r.url.replace(/\/\//g, '/'),
|
||||
})),
|
||||
);
|
||||
|
||||
logger.log(`Extracted ${pageResults.length} items.`);
|
||||
|
||||
@@ -142,7 +168,7 @@ export const CeicCrawler = {
|
||||
if (!nextButton) break;
|
||||
|
||||
await nextButton.click();
|
||||
await new Promise(r => setTimeout(r, 3000));
|
||||
await new Promise((r) => setTimeout(r, 3000));
|
||||
|
||||
// 模拟人类行为
|
||||
logger.log('Simulating human mouse movements...');
|
||||
@@ -155,14 +181,17 @@ export const CeicCrawler = {
|
||||
}
|
||||
|
||||
return allResults;
|
||||
|
||||
} catch (error) {
|
||||
logger.error(`Crawl failed: ${error.message}`);
|
||||
const errorMessage =
|
||||
error instanceof Error ? error.message : String(error);
|
||||
logger.error(`Crawl failed: ${errorMessage}`);
|
||||
return allResults;
|
||||
} finally {
|
||||
if (page) await page.close();
|
||||
}
|
||||
},
|
||||
|
||||
extract() { return []; }
|
||||
extract() {
|
||||
return [];
|
||||
},
|
||||
};
|
||||
|
||||
@@ -2,7 +2,7 @@ import { CgnpcCrawler } from './cgnpc_target';
|
||||
import * as puppeteer from 'puppeteer';
|
||||
|
||||
// Increase timeout to 60 seconds for network operations
|
||||
jest.setTimeout(60000*5);
|
||||
jest.setTimeout(60000 * 5);
|
||||
|
||||
// 获取代理配置
|
||||
const getProxyArgs = (): string[] => {
|
||||
@@ -51,7 +51,9 @@ describe('CgnpcCrawler Real Site Test', () => {
|
||||
console.log(`\nSuccessfully found ${results.length} items:\n`);
|
||||
console.log('----------------------------------------');
|
||||
results.forEach((item, index) => {
|
||||
console.log(`${index + 1}. [${item.publishDate.toLocaleDateString()}] ${item.title}`);
|
||||
console.log(
|
||||
`${index + 1}. [${item.publishDate.toLocaleDateString()}] ${item.title}`,
|
||||
);
|
||||
console.log(` Link: ${item.url}`);
|
||||
console.log('----------------------------------------');
|
||||
});
|
||||
@@ -61,13 +63,15 @@ describe('CgnpcCrawler Real Site Test', () => {
|
||||
expect(Array.isArray(results)).toBeTruthy();
|
||||
// Warn but don't fail if site returns 0 items (could be empty or changed structure)
|
||||
if (results.length === 0) {
|
||||
console.warn('Warning: No items found. Check if website structure has changed or if list is currently empty.');
|
||||
console.warn(
|
||||
'Warning: No items found. Check if website structure has changed or if list is currently empty.',
|
||||
);
|
||||
} else {
|
||||
// Check data integrity of first item
|
||||
const firstItem = results[0];
|
||||
expect(firstItem.title).toBeTruthy();
|
||||
expect(firstItem.url).toMatch(/^https?:\/\//);
|
||||
expect(firstItem.publishDate).toBeInstanceOf(Date);
|
||||
// Check data integrity of first item
|
||||
const firstItem = results[0];
|
||||
expect(firstItem.title).toBeTruthy();
|
||||
expect(firstItem.url).toMatch(/^https?:\/\//);
|
||||
expect(firstItem.publishDate).toBeInstanceOf(Date);
|
||||
}
|
||||
});
|
||||
});
|
||||
|
||||
@@ -13,11 +13,11 @@ async function simulateHumanMouseMovement(page: puppeteer.Page) {
|
||||
const y = Math.floor(Math.random() * viewport.height);
|
||||
|
||||
await page.mouse.move(x, y, {
|
||||
steps: 10 + Math.floor(Math.random() * 20) // 10-30步,使移动更平滑
|
||||
steps: 10 + Math.floor(Math.random() * 20), // 10-30步,使移动更平滑
|
||||
});
|
||||
|
||||
// 随机停顿 100-500ms
|
||||
await new Promise(r => setTimeout(r, 100 + Math.random() * 400));
|
||||
await new Promise((r) => setTimeout(r, 100 + Math.random() * 400));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -31,19 +31,19 @@ async function simulateHumanScrolling(page: puppeteer.Page) {
|
||||
await page.evaluate((distance) => {
|
||||
window.scrollBy({
|
||||
top: distance,
|
||||
behavior: 'smooth'
|
||||
behavior: 'smooth',
|
||||
});
|
||||
}, scrollDistance);
|
||||
|
||||
// 随机停顿 500-1500ms
|
||||
await new Promise(r => setTimeout(r, 500 + Math.random() * 1000));
|
||||
await new Promise((r) => setTimeout(r, 500 + Math.random() * 1000));
|
||||
}
|
||||
|
||||
// 滚动回顶部
|
||||
await page.evaluate(() => {
|
||||
window.scrollTo({ top: 0, behavior: 'smooth' });
|
||||
});
|
||||
await new Promise(r => setTimeout(r, 1000));
|
||||
await new Promise((r) => setTimeout(r, 1000));
|
||||
}
|
||||
|
||||
export interface CgnpcResult {
|
||||
@@ -52,12 +52,22 @@ export interface CgnpcResult {
|
||||
url: string;
|
||||
}
|
||||
|
||||
interface CgnpcCrawlerType {
|
||||
name: string;
|
||||
url: string;
|
||||
baseUrl: string;
|
||||
extract(html: string): CgnpcResult[];
|
||||
}
|
||||
|
||||
export const CgnpcCrawler = {
|
||||
name: '中广核电子商务平台',
|
||||
url: 'https://ecp.cgnpc.com.cn/zbgg.html',
|
||||
baseUrl: 'https://ecp.cgnpc.com.cn/',
|
||||
|
||||
async crawl(browser: puppeteer.Browser): Promise<CgnpcResult[]> {
|
||||
async crawl(
|
||||
this: CgnpcCrawlerType,
|
||||
browser: puppeteer.Browser,
|
||||
): Promise<CgnpcResult[]> {
|
||||
const logger = new Logger('CgnpcCrawler');
|
||||
const page = await browser.newPage();
|
||||
|
||||
@@ -69,11 +79,15 @@ export const CgnpcCrawler = {
|
||||
|
||||
await page.evaluateOnNewDocument(() => {
|
||||
Object.defineProperty(navigator, 'webdriver', { get: () => false });
|
||||
Object.defineProperty(navigator, 'language', { get: () => "zh-CN"});
|
||||
Object.defineProperty(navigator, 'plugins', { get: () => [1,2,3,4,5]});
|
||||
Object.defineProperty(navigator, 'language', { get: () => 'zh-CN' });
|
||||
Object.defineProperty(navigator, 'plugins', {
|
||||
get: () => [1, 2, 3, 4, 5],
|
||||
});
|
||||
});
|
||||
|
||||
await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36');
|
||||
await page.setUserAgent(
|
||||
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36',
|
||||
);
|
||||
await page.setViewport({ width: 1920, height: 1080 });
|
||||
|
||||
const allResults: CgnpcResult[] = [];
|
||||
@@ -103,7 +117,9 @@ export const CgnpcCrawler = {
|
||||
}
|
||||
|
||||
allResults.push(...pageResults);
|
||||
logger.log(`Extracted ${pageResults.length} items from page ${currentPage}`);
|
||||
logger.log(
|
||||
`Extracted ${pageResults.length} items from page ${currentPage}`,
|
||||
);
|
||||
|
||||
// 模拟人类行为 - 翻页前
|
||||
logger.log('Simulating human mouse movements before pagination...');
|
||||
@@ -127,9 +143,13 @@ export const CgnpcCrawler = {
|
||||
try {
|
||||
// 点击下一页按钮
|
||||
await nextButton.click();
|
||||
await new Promise(r => setTimeout(r, 3000)); // 等待页面加载
|
||||
await new Promise((r) => setTimeout(r, 3000)); // 等待页面加载
|
||||
} catch (navError) {
|
||||
logger.error(`Navigation to page ${currentPage + 1} failed: ${navError.message}`);
|
||||
const navErrorMessage =
|
||||
navError instanceof Error ? navError.message : String(navError);
|
||||
logger.error(
|
||||
`Navigation to page ${currentPage + 1} failed: ${navErrorMessage}`,
|
||||
);
|
||||
break;
|
||||
}
|
||||
|
||||
@@ -144,20 +164,21 @@ export const CgnpcCrawler = {
|
||||
|
||||
// Random delay between pages
|
||||
const delay = Math.floor(Math.random() * (3000 - 1000 + 1)) + 1000;
|
||||
await new Promise(resolve => setTimeout(resolve, delay));
|
||||
await new Promise((resolve) => setTimeout(resolve, delay));
|
||||
}
|
||||
|
||||
return allResults;
|
||||
|
||||
} catch (error) {
|
||||
logger.error(`Failed to crawl ${this.name}: ${error.message}`);
|
||||
const errorMessage =
|
||||
error instanceof Error ? error.message : String(error);
|
||||
logger.error(`Failed to crawl ${this.name}: ${errorMessage}`);
|
||||
return allResults;
|
||||
} finally {
|
||||
await page.close();
|
||||
}
|
||||
},
|
||||
|
||||
extract(html: string): CgnpcResult[] {
|
||||
extract(this: CgnpcCrawlerType, html: string): CgnpcResult[] {
|
||||
const results: CgnpcResult[] = [];
|
||||
/**
|
||||
* Regex groups for ecp.cgnpc.com.cn:
|
||||
@@ -181,24 +202,25 @@ export const CgnpcCrawler = {
|
||||
* </div>
|
||||
* </div>
|
||||
*/
|
||||
const regex = /<div class="zbnr">[\s\S]*?<a[^>]*title="([^"]*)"[^>]*href="([^"]*)"[^>]*>[\s\S]*?<dt>[\s\S]*?<p>文件获取截止时间<\/p>[\s\S]*?<h2>\s*(\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2})\s*<\/h2>[\s\S]*?<\/div>/gs;
|
||||
const regex =
|
||||
/<div class="zbnr">[\s\S]*?<a[^>]*title="([^"]*)"[^>]*href="([^"]*)"[^>]*>[\s\S]*?<dt>[\s\S]*?<p>文件获取截止时间<\/p>[\s\S]*?<h2>\s*(\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2})\s*<\/h2>[\s\S]*?<\/div>/gs;
|
||||
|
||||
let match;
|
||||
let match: RegExpExecArray | null;
|
||||
while ((match = regex.exec(html)) !== null) {
|
||||
const title = match[1]?.trim();
|
||||
const url = match[2]?.trim();
|
||||
const dateStr = match[3]?.trim();
|
||||
const title = match[1]?.trim() ?? '';
|
||||
const url = match[2]?.trim() ?? '';
|
||||
const dateStr = match[3]?.trim() ?? '';
|
||||
|
||||
if (title && url) {
|
||||
const fullUrl = url.startsWith('http') ? url : this.baseUrl + url;
|
||||
results.push({
|
||||
title,
|
||||
publishDate: dateStr ? new Date(dateStr) : new Date(),
|
||||
url: fullUrl.replace(/\/\//g, '/')
|
||||
url: fullUrl.replace(/\/\//g, '/'),
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
return results;
|
||||
}
|
||||
},
|
||||
};
|
||||
|
||||
@@ -51,7 +51,9 @@ describe('ChdtpCrawler Real Site Test', () => {
|
||||
console.log(`\nSuccessfully found ${results.length} items:\n`);
|
||||
console.log('----------------------------------------');
|
||||
results.forEach((item, index) => {
|
||||
console.log(`${index + 1}. [${item.publishDate.toLocaleDateString()}] ${item.title}`);
|
||||
console.log(
|
||||
`${index + 1}. [${item.publishDate.toLocaleDateString()}] ${item.title}`,
|
||||
);
|
||||
console.log(` Link: ${item.url}`);
|
||||
console.log('----------------------------------------');
|
||||
});
|
||||
@@ -61,13 +63,15 @@ describe('ChdtpCrawler Real Site Test', () => {
|
||||
expect(Array.isArray(results)).toBeTruthy();
|
||||
// Warn but don't fail if site returns 0 items (could be empty or changed structure)
|
||||
if (results.length === 0) {
|
||||
console.warn('Warning: No items found. Check if the website structure has changed or if the list is currently empty.');
|
||||
console.warn(
|
||||
'Warning: No items found. Check if the website structure has changed or if the list is currently empty.',
|
||||
);
|
||||
} else {
|
||||
// Check data integrity of the first item
|
||||
const firstItem = results[0];
|
||||
expect(firstItem.title).toBeTruthy();
|
||||
expect(firstItem.url).toMatch(/^https?:\/\//);
|
||||
expect(firstItem.publishDate).toBeInstanceOf(Date);
|
||||
// Check data integrity of the first item
|
||||
const firstItem = results[0];
|
||||
expect(firstItem.title).toBeTruthy();
|
||||
expect(firstItem.url).toMatch(/^https?:\/\//);
|
||||
expect(firstItem.publishDate).toBeInstanceOf(Date);
|
||||
}
|
||||
});
|
||||
});
|
||||
@@ -7,12 +7,22 @@ export interface ChdtpResult {
|
||||
url: string; // Necessary for system uniqueness
|
||||
}
|
||||
|
||||
interface ChdtpCrawlerType {
|
||||
name: string;
|
||||
url: string;
|
||||
baseUrl: string;
|
||||
extract(html: string): ChdtpResult[];
|
||||
}
|
||||
|
||||
export const ChdtpCrawler = {
|
||||
name: '华电集团电子商务平台 ',
|
||||
url: 'https://www.chdtp.com/webs/queryWebZbgg.action?zbggType=1',
|
||||
baseUrl: 'https://www.chdtp.com/webs/',
|
||||
|
||||
async crawl(browser: puppeteer.Browser): Promise<ChdtpResult[]> {
|
||||
async crawl(
|
||||
this: ChdtpCrawlerType,
|
||||
browser: puppeteer.Browser,
|
||||
): Promise<ChdtpResult[]> {
|
||||
const logger = new Logger('ChdtpCrawler');
|
||||
const page = await browser.newPage();
|
||||
|
||||
@@ -22,7 +32,9 @@ export const ChdtpCrawler = {
|
||||
await page.authenticate({ username, password });
|
||||
}
|
||||
|
||||
await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36');
|
||||
await page.setUserAgent(
|
||||
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36',
|
||||
);
|
||||
|
||||
const allResults: ChdtpResult[] = [];
|
||||
let currentPage = 1;
|
||||
@@ -42,7 +54,9 @@ export const ChdtpCrawler = {
|
||||
}
|
||||
|
||||
allResults.push(...pageResults);
|
||||
logger.log(`Extracted ${pageResults.length} items from page ${currentPage}`);
|
||||
logger.log(
|
||||
`Extracted ${pageResults.length} items from page ${currentPage}`,
|
||||
);
|
||||
|
||||
// Find the "Next Page" button
|
||||
// Using partial match for src to be robust against path variations
|
||||
@@ -61,11 +75,18 @@ export const ChdtpCrawler = {
|
||||
|
||||
try {
|
||||
await Promise.all([
|
||||
page.waitForNavigation({ waitUntil: 'networkidle2', timeout: 60000 }),
|
||||
page.waitForNavigation({
|
||||
waitUntil: 'networkidle2',
|
||||
timeout: 60000,
|
||||
}),
|
||||
nextButton.click(),
|
||||
]);
|
||||
} catch (navError) {
|
||||
logger.error(`Navigation to page ${currentPage + 1} failed: ${navError.message}`);
|
||||
const navErrorMessage =
|
||||
navError instanceof Error ? navError.message : String(navError);
|
||||
logger.error(
|
||||
`Navigation to page ${currentPage + 1} failed: ${navErrorMessage}`,
|
||||
);
|
||||
break;
|
||||
}
|
||||
|
||||
@@ -73,20 +94,21 @@ export const ChdtpCrawler = {
|
||||
|
||||
// Random delay between pages
|
||||
const delay = Math.floor(Math.random() * (3000 - 1000 + 1)) + 1000;
|
||||
await new Promise(resolve => setTimeout(resolve, delay));
|
||||
await new Promise((resolve) => setTimeout(resolve, delay));
|
||||
}
|
||||
|
||||
return allResults;
|
||||
|
||||
} catch (error) {
|
||||
logger.error(`Failed to crawl ${this.name}: ${error.message}`);
|
||||
const errorMessage =
|
||||
error instanceof Error ? error.message : String(error);
|
||||
logger.error(`Failed to crawl ${this.name}: ${errorMessage}`);
|
||||
return allResults; // Return what we have so far
|
||||
} finally {
|
||||
await page.close();
|
||||
}
|
||||
},
|
||||
|
||||
extract(html: string): ChdtpResult[] {
|
||||
extract(this: ChdtpCrawlerType, html: string): ChdtpResult[] {
|
||||
const results: ChdtpResult[] = [];
|
||||
/**
|
||||
* Regex groups for chdtp.com:
|
||||
@@ -96,23 +118,24 @@ export const ChdtpCrawler = {
|
||||
* 4: Business Type
|
||||
* 5: Date
|
||||
*/
|
||||
const regex = /<tr[^>]*>\s*<td class="td_1">.*?<span[^>]*>\s*(.*?)\s*<\/span>.*?<\/td>\s*<td class="td_2">\s*<a[^>]*href="javascript:toGetContent\('(.*?)'\)" title="(.*?)">.*?<\/a><\/td>\s*<td class="td_3">\s*<a[^>]*>\s*(.*?)\s*<\/a>\s*<\/td>\s*<td class="td_4"><span>\[(.*?)\]<\/span><\/td>/gs;
|
||||
const regex =
|
||||
/<tr[^>]*>\s*<td class="td_1">.*?<span[^>]*>\s*(.*?)\s*<\/span>.*?<\/td>\s*<td class="td_2">\s*<a[^>]*href="javascript:toGetContent\('(.*?)'\)" title="(.*?)">.*?<\/a><\/td>\s*<td class="td_3">\s*<a[^>]*>\s*(.*?)\s*<\/a>\s*<\/td>\s*<td class="td_4"><span>\[(.*?)\]<\/span><\/td>/gs;
|
||||
|
||||
let match;
|
||||
let match: RegExpExecArray | null;
|
||||
while ((match = regex.exec(html)) !== null) {
|
||||
const urlSuffix = match[2]?.trim();
|
||||
const title = match[3]?.trim();
|
||||
const dateStr = match[5]?.trim();
|
||||
const urlSuffix = match[2]?.trim() ?? '';
|
||||
const title = match[3]?.trim() ?? '';
|
||||
const dateStr = match[5]?.trim() ?? '';
|
||||
|
||||
if (title && urlSuffix) {
|
||||
const fullUrl = this.baseUrl + urlSuffix;
|
||||
results.push({
|
||||
title,
|
||||
publishDate: dateStr ? new Date(dateStr) : new Date(),
|
||||
url: fullUrl.replace(/\/\//g, '/')
|
||||
url: fullUrl.replace(/\/\//g, '/'),
|
||||
});
|
||||
}
|
||||
}
|
||||
return results;
|
||||
}
|
||||
},
|
||||
};
|
||||
@@ -33,11 +33,11 @@ async function simulateHumanMouseMovement(page: puppeteer.Page) {
|
||||
const y = Math.floor(Math.random() * viewport.height);
|
||||
|
||||
await page.mouse.move(x, y, {
|
||||
steps: 10 + Math.floor(Math.random() * 20) // 10-30步,使移动更平滑
|
||||
steps: 10 + Math.floor(Math.random() * 20), // 10-30步,使移动更平滑
|
||||
});
|
||||
|
||||
// 随机停顿 100-500ms
|
||||
await new Promise(r => setTimeout(r, 100 + Math.random() * 400));
|
||||
await new Promise((r) => setTimeout(r, 100 + Math.random() * 400));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -51,19 +51,19 @@ async function simulateHumanScrolling(page: puppeteer.Page) {
|
||||
await page.evaluate((distance) => {
|
||||
window.scrollBy({
|
||||
top: distance,
|
||||
behavior: 'smooth'
|
||||
behavior: 'smooth',
|
||||
});
|
||||
}, scrollDistance);
|
||||
|
||||
// 随机停顿 500-1500ms
|
||||
await new Promise(r => setTimeout(r, 500 + Math.random() * 1000));
|
||||
await new Promise((r) => setTimeout(r, 500 + Math.random() * 1000));
|
||||
}
|
||||
|
||||
// 滚动回顶部
|
||||
await page.evaluate(() => {
|
||||
window.scrollTo({ top: 0, behavior: 'smooth' });
|
||||
});
|
||||
await new Promise(r => setTimeout(r, 1000));
|
||||
await new Promise((r) => setTimeout(r, 1000));
|
||||
}
|
||||
|
||||
describe('ChngCrawler Real Site Test', () => {
|
||||
@@ -82,7 +82,7 @@ describe('ChngCrawler Real Site Test', () => {
|
||||
'--disable-setuid-sandbox',
|
||||
'--disable-blink-features=AutomationControlled',
|
||||
'--window-size=1920,1080',
|
||||
"--disable-infobars",
|
||||
'--disable-infobars',
|
||||
...proxyArgs,
|
||||
// "--headless=new",
|
||||
// '--disable-dev-shm-usage',
|
||||
@@ -94,15 +94,14 @@ describe('ChngCrawler Real Site Test', () => {
|
||||
// '--disable-webgl',
|
||||
// '--disable-javascript',
|
||||
],
|
||||
defaultViewport: null
|
||||
|
||||
defaultViewport: null,
|
||||
});
|
||||
});
|
||||
|
||||
afterAll(async () => {
|
||||
if (browser) {
|
||||
// Keep open for a few seconds after test to see result
|
||||
await new Promise(r => setTimeout(r, 50000));
|
||||
await new Promise((r) => setTimeout(r, 50000));
|
||||
await browser.close();
|
||||
}
|
||||
});
|
||||
@@ -114,7 +113,11 @@ Starting crawl for: ${ChngCrawler.name}`);
|
||||
|
||||
// 创建一个临时页面用于模拟人类行为
|
||||
const tempPage = await browser.newPage();
|
||||
await tempPage.setViewport({ width: 1920, height: 1080, deviceScaleFactor: 1 });
|
||||
await tempPage.setViewport({
|
||||
width: 1920,
|
||||
height: 1080,
|
||||
deviceScaleFactor: 1,
|
||||
});
|
||||
|
||||
// 模拟人类鼠标移动
|
||||
console.log('Simulating human mouse movements...');
|
||||
@@ -133,7 +136,9 @@ Successfully found ${results.length} items:
|
||||
`);
|
||||
console.log('----------------------------------------');
|
||||
results.forEach((item, index) => {
|
||||
console.log(`${index + 1}. [${item.publishDate.toLocaleDateString()}] ${item.title}`);
|
||||
console.log(
|
||||
`${index + 1}. [${item.publishDate.toLocaleDateString()}] ${item.title}`,
|
||||
);
|
||||
console.log(` Link: ${item.url}`);
|
||||
console.log('----------------------------------------');
|
||||
});
|
||||
@@ -142,12 +147,14 @@ Successfully found ${results.length} items:
|
||||
expect(Array.isArray(results)).toBeTruthy();
|
||||
|
||||
if (results.length === 0) {
|
||||
console.warn('Warning: No items found. Observe the browser window to see if content is loading or if there is a verification challenge.');
|
||||
console.warn(
|
||||
'Warning: No items found. Observe the browser window to see if content is loading or if there is a verification challenge.',
|
||||
);
|
||||
} else {
|
||||
const firstItem = results[0];
|
||||
expect(firstItem.title).toBeTruthy();
|
||||
expect(firstItem.url).toMatch(/^https?:\/\//);
|
||||
expect(firstItem.publishDate).toBeInstanceOf(Date);
|
||||
const firstItem = results[0];
|
||||
expect(firstItem.title).toBeTruthy();
|
||||
expect(firstItem.url).toMatch(/^https?:\/\//);
|
||||
expect(firstItem.publishDate).toBeInstanceOf(Date);
|
||||
}
|
||||
});
|
||||
});
|
||||
@@ -21,14 +21,15 @@ async function simulateHumanMouseMovement(page: puppeteer.Page) {
|
||||
const y = Math.floor(Math.random() * viewport.height);
|
||||
|
||||
await page.mouse.move(x, y, {
|
||||
steps: 10 + Math.floor(Math.random() * 20) // 10-30步,使移动更平滑
|
||||
steps: 10 + Math.floor(Math.random() * 20), // 10-30步,使移动更平滑
|
||||
});
|
||||
|
||||
// 随机停顿 100-500ms
|
||||
await new Promise(r => setTimeout(r, 100 + Math.random() * 400));
|
||||
await new Promise((r) => setTimeout(r, 100 + Math.random() * 400));
|
||||
}
|
||||
} catch (error) {
|
||||
console.log('Mouse movement simulation interrupted:', error.message);
|
||||
const errorMessage = error instanceof Error ? error.message : String(error);
|
||||
console.log('Mouse movement simulation interrupted:', errorMessage);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -49,12 +50,12 @@ async function simulateHumanScrolling(page: puppeteer.Page) {
|
||||
await page.evaluate((distance) => {
|
||||
window.scrollBy({
|
||||
top: distance,
|
||||
behavior: 'smooth'
|
||||
behavior: 'smooth',
|
||||
});
|
||||
}, scrollDistance);
|
||||
|
||||
// 随机停顿 500-1500ms
|
||||
await new Promise(r => setTimeout(r, 500 + Math.random() * 1000));
|
||||
await new Promise((r) => setTimeout(r, 500 + Math.random() * 1000));
|
||||
}
|
||||
|
||||
// 滚动回顶部
|
||||
@@ -62,19 +63,29 @@ async function simulateHumanScrolling(page: puppeteer.Page) {
|
||||
await page.evaluate(() => {
|
||||
window.scrollTo({ top: 0, behavior: 'smooth' });
|
||||
});
|
||||
await new Promise(r => setTimeout(r, 1000));
|
||||
await new Promise((r) => setTimeout(r, 1000));
|
||||
}
|
||||
} catch (error) {
|
||||
console.log('Scrolling simulation interrupted:', error.message);
|
||||
const errorMessage = error instanceof Error ? error.message : String(error);
|
||||
console.log('Scrolling simulation interrupted:', errorMessage);
|
||||
}
|
||||
}
|
||||
|
||||
interface ChngCrawlerType {
|
||||
name: string;
|
||||
url: string;
|
||||
baseUrl: string;
|
||||
}
|
||||
|
||||
export const ChngCrawler = {
|
||||
name: '华能集团电子商务平台',
|
||||
url: 'https://ec.chng.com.cn/channel/home/#/purchase?top=0',
|
||||
baseUrl: 'https://ec.chng.com.cn/channel/home/#',
|
||||
|
||||
async crawl(browser: puppeteer.Browser): Promise<ChdtpResult[]> {
|
||||
async crawl(
|
||||
this: ChngCrawlerType,
|
||||
browser: puppeteer.Browser,
|
||||
): Promise<ChdtpResult[]> {
|
||||
const logger = new Logger('ChngCrawler');
|
||||
let page = await browser.newPage();
|
||||
// await page.setViewport({ width: 1920, height: 1080, deviceScaleFactor: 1 });
|
||||
@@ -87,11 +98,15 @@ export const ChngCrawler = {
|
||||
|
||||
await page.evaluateOnNewDocument(() => {
|
||||
Object.defineProperty(navigator, 'webdriver', { get: () => false });
|
||||
Object.defineProperty(navigator, 'language', { get: () => "zh-CN"});
|
||||
Object.defineProperty(navigator, 'plugins', { get: () => [1,2,3,4,5]});
|
||||
Object.defineProperty(navigator, 'language', { get: () => 'zh-CN' });
|
||||
Object.defineProperty(navigator, 'plugins', {
|
||||
get: () => [1, 2, 3, 4, 5],
|
||||
});
|
||||
});
|
||||
|
||||
await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36');
|
||||
await page.setUserAgent(
|
||||
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36',
|
||||
);
|
||||
await page.setViewport({ width: 1920, height: 1080 });
|
||||
|
||||
const allResults: ChdtpResult[] = [];
|
||||
@@ -114,7 +129,9 @@ export const ChngCrawler = {
|
||||
const firstResultSelector = '#b_results .b_algo h2 a';
|
||||
await page.waitForSelector(firstResultSelector);
|
||||
|
||||
const newTargetPromise = browser.waitForTarget(target => target.opener() === page.target());
|
||||
const newTargetPromise = browser.waitForTarget(
|
||||
(target) => target.opener() === page.target(),
|
||||
);
|
||||
await page.click(firstResultSelector);
|
||||
|
||||
const newTarget = await newTargetPromise;
|
||||
@@ -136,46 +153,56 @@ export const ChngCrawler = {
|
||||
await simulateHumanScrolling(page);
|
||||
|
||||
// 等待页面稳定,不强制等待导航
|
||||
await new Promise(r => setTimeout(r, 3000));
|
||||
await new Promise((r) => setTimeout(r, 3000));
|
||||
|
||||
// 模拟人类行为
|
||||
// 模拟人类行为
|
||||
logger.log('Simulating human mouse movements...');
|
||||
await simulateHumanMouseMovement(page);
|
||||
|
||||
logger.log('Simulating human scrolling...');
|
||||
await simulateHumanScrolling(page);
|
||||
|
||||
|
||||
// PAUSE 15 SECONDS as requested
|
||||
logger.log('Pausing 15 seconds before looking for "采购专栏"...');
|
||||
await new Promise(r => setTimeout(r, 15000));
|
||||
await new Promise((r) => setTimeout(r, 15000));
|
||||
// await page.screenshot({ path: 'huaneng.png' });
|
||||
|
||||
logger.log('Looking for "采购专栏" link...');
|
||||
await page.waitForFunction(() => {
|
||||
const divs = Array.from(document.querySelectorAll('div.text'));
|
||||
return divs.some(div => div.textContent && div.textContent.includes('采购专栏'));
|
||||
}, { timeout: 60000 });
|
||||
await page.waitForFunction(
|
||||
() => {
|
||||
const divs = Array.from(document.querySelectorAll('div.text'));
|
||||
return divs.some(
|
||||
(div) => div.textContent && div.textContent.includes('采购专栏'),
|
||||
);
|
||||
},
|
||||
{ timeout: 60000 },
|
||||
);
|
||||
|
||||
const purchaseTargetPromise = browser.waitForTarget(target => target.opener() === page.target(), { timeout: 15000 }).catch(() => null);
|
||||
const purchaseTargetPromise = browser
|
||||
.waitForTarget((target) => target.opener() === page.target(), {
|
||||
timeout: 15000,
|
||||
})
|
||||
.catch(() => null);
|
||||
|
||||
await page.evaluate(() => {
|
||||
const divs = Array.from(document.querySelectorAll('div.text'));
|
||||
const target = divs.find(div => div.textContent && div.textContent.includes('采购专栏')) as HTMLElement;
|
||||
const target = divs.find(
|
||||
(div) => div.textContent && div.textContent.includes('采购专栏'),
|
||||
) as HTMLElement;
|
||||
if (target) target.click();
|
||||
});
|
||||
|
||||
const purchaseTarget = await purchaseTargetPromise;
|
||||
if (purchaseTarget) {
|
||||
const pPage = await purchaseTarget.page();
|
||||
if (pPage) {
|
||||
logger.log('Switched to Purchase Page tab.');
|
||||
page = pPage;
|
||||
if (username && password) {
|
||||
await page.authenticate({ username, password });
|
||||
}
|
||||
await new Promise(r => setTimeout(r, 5000));
|
||||
const pPage = await purchaseTarget.page();
|
||||
if (pPage) {
|
||||
logger.log('Switched to Purchase Page tab.');
|
||||
page = pPage;
|
||||
if (username && password) {
|
||||
await page.authenticate({ username, password });
|
||||
}
|
||||
await new Promise((r) => setTimeout(r, 5000));
|
||||
}
|
||||
}
|
||||
|
||||
logger.log(`Active URL: ${page.url()}`);
|
||||
@@ -191,47 +218,64 @@ export const ChngCrawler = {
|
||||
logger.log(`Processing page ${currentPage}...`);
|
||||
|
||||
// Wait for table rows to load
|
||||
await page.waitForFunction(() => {
|
||||
return document.querySelectorAll('tr.ant-table-row').length > 0;
|
||||
}, { timeout: 60000 }).catch(() => logger.warn('Content not found. Site might be slow.'));
|
||||
await page
|
||||
.waitForFunction(
|
||||
() => {
|
||||
return document.querySelectorAll('tr.ant-table-row').length > 0;
|
||||
},
|
||||
{ timeout: 60000 },
|
||||
)
|
||||
.catch(() => logger.warn('Content not found. Site might be slow.'));
|
||||
|
||||
const pageResults = await page.evaluate((baseUrl) => {
|
||||
// Extract from table rows
|
||||
const items = Array.from(document.querySelectorAll('tr.ant-table-row'));
|
||||
return items.map(item => {
|
||||
const titleSpan = item.querySelector('span.list-text');
|
||||
const dateCell = item.querySelector('td.ant-table-row-cell-break-word p');
|
||||
const items = Array.from(
|
||||
document.querySelectorAll('tr.ant-table-row'),
|
||||
);
|
||||
return items
|
||||
.map((item) => {
|
||||
const titleSpan = item.querySelector('span.list-text');
|
||||
const dateCell = item.querySelector(
|
||||
'td.ant-table-row-cell-break-word p',
|
||||
);
|
||||
|
||||
if (titleSpan && dateCell) {
|
||||
const title = titleSpan.textContent?.trim() || '';
|
||||
const dateStr = dateCell.textContent?.trim() || '';
|
||||
if (titleSpan && dateCell) {
|
||||
const title = titleSpan.textContent?.trim() || '';
|
||||
const dateStr = dateCell.textContent?.trim() || '';
|
||||
|
||||
if (title.length < 5) return null; // Filter noise
|
||||
if (title.length < 5) return null; // Filter noise
|
||||
|
||||
// URL is not directly available in the table, need to construct from data-row-key
|
||||
const rowKey = item.getAttribute('data-row-key');
|
||||
const url = rowKey ? `${baseUrl}#/purchase/detail?id=${rowKey}` : '';
|
||||
// URL is not directly available in the table, need to construct from data-row-key
|
||||
const rowKey = item.getAttribute('data-row-key');
|
||||
const url = rowKey
|
||||
? `${baseUrl}#/purchase/detail?id=${rowKey}`
|
||||
: '';
|
||||
|
||||
return {
|
||||
title,
|
||||
dateStr,
|
||||
url
|
||||
};
|
||||
}
|
||||
return null;
|
||||
}).filter(i => i !== null);
|
||||
return {
|
||||
title,
|
||||
dateStr,
|
||||
url,
|
||||
};
|
||||
}
|
||||
return null;
|
||||
})
|
||||
.filter((i) => i !== null);
|
||||
}, this.baseUrl);
|
||||
|
||||
if (pageResults.length === 0) {
|
||||
logger.warn(`No results found on page ${currentPage}. Extraction failed.`);
|
||||
break;
|
||||
logger.warn(
|
||||
`No results found on page ${currentPage}. Extraction failed.`,
|
||||
);
|
||||
break;
|
||||
}
|
||||
|
||||
allResults.push(...pageResults.map(r => ({
|
||||
title: r!.title,
|
||||
publishDate: new Date(r!.dateStr),
|
||||
url: r!.url.replace(/\/\//g, '/')
|
||||
})));
|
||||
allResults.push(
|
||||
...pageResults.map((r) => ({
|
||||
title: r.title,
|
||||
publishDate: new Date(r.dateStr),
|
||||
url: r.url.replace(/\/\//g, '/'),
|
||||
})),
|
||||
);
|
||||
|
||||
logger.log(`Extracted ${pageResults.length} items.`);
|
||||
|
||||
@@ -249,26 +293,29 @@ export const ChngCrawler = {
|
||||
await page.waitForFunction(
|
||||
(oldUrl) => window.location.href !== oldUrl,
|
||||
{ timeout: 10000 },
|
||||
currentUrl
|
||||
currentUrl,
|
||||
);
|
||||
} catch (e) {
|
||||
} catch {
|
||||
logger.warn('Navigation timeout, continuing anyway');
|
||||
}
|
||||
|
||||
// 等待页面内容加载
|
||||
await new Promise(r => setTimeout(r, 15000));
|
||||
await new Promise((r) => setTimeout(r, 15000));
|
||||
currentPage++;
|
||||
}
|
||||
|
||||
return allResults;
|
||||
|
||||
} catch (error) {
|
||||
logger.error(`Crawl failed: ${error.message}`);
|
||||
const errorMessage =
|
||||
error instanceof Error ? error.message : String(error);
|
||||
logger.error(`Crawl failed: ${errorMessage}`);
|
||||
return allResults;
|
||||
} finally {
|
||||
if (page) await page.close();
|
||||
}
|
||||
},
|
||||
|
||||
extract() { return []; }
|
||||
extract() {
|
||||
return [];
|
||||
},
|
||||
};
|
||||
@@ -2,7 +2,7 @@ import { CnncecpCrawler } from './cnncecp_target';
|
||||
import * as puppeteer from 'puppeteer';
|
||||
|
||||
// Increase timeout to 60 seconds for network operations
|
||||
jest.setTimeout(60000*5);
|
||||
jest.setTimeout(60000 * 5);
|
||||
|
||||
// 获取代理配置
|
||||
const getProxyArgs = (): string[] => {
|
||||
@@ -51,7 +51,9 @@ describe('CnncecpCrawler Real Site Test', () => {
|
||||
console.log(`\nSuccessfully found ${results.length} items:\n`);
|
||||
console.log('----------------------------------------');
|
||||
results.forEach((item, index) => {
|
||||
console.log(`${index + 1}. [${item.publishDate.toLocaleDateString()}] ${item.title}`);
|
||||
console.log(
|
||||
`${index + 1}. [${item.publishDate.toLocaleDateString()}] ${item.title}`,
|
||||
);
|
||||
console.log(` Link: ${item.url}`);
|
||||
console.log('----------------------------------------');
|
||||
});
|
||||
@@ -61,13 +63,15 @@ describe('CnncecpCrawler Real Site Test', () => {
|
||||
expect(Array.isArray(results)).toBeTruthy();
|
||||
// Warn but don't fail if site returns 0 items (could be empty or changed structure)
|
||||
if (results.length === 0) {
|
||||
console.warn('Warning: No items found. Check if website structure has changed or if list is currently empty.');
|
||||
console.warn(
|
||||
'Warning: No items found. Check if website structure has changed or if list is currently empty.',
|
||||
);
|
||||
} else {
|
||||
// Check data integrity of first item
|
||||
const firstItem = results[0];
|
||||
expect(firstItem.title).toBeTruthy();
|
||||
expect(firstItem.url).toMatch(/^https?:\/\//);
|
||||
expect(firstItem.publishDate).toBeInstanceOf(Date);
|
||||
// Check data integrity of first item
|
||||
const firstItem = results[0];
|
||||
expect(firstItem.title).toBeTruthy();
|
||||
expect(firstItem.url).toMatch(/^https?:\/\//);
|
||||
expect(firstItem.publishDate).toBeInstanceOf(Date);
|
||||
}
|
||||
});
|
||||
});
|
||||
|
||||
@@ -13,11 +13,11 @@ async function simulateHumanMouseMovement(page: puppeteer.Page) {
|
||||
const y = Math.floor(Math.random() * viewport.height);
|
||||
|
||||
await page.mouse.move(x, y, {
|
||||
steps: 10 + Math.floor(Math.random() * 20) // 10-30步,使移动更平滑
|
||||
steps: 10 + Math.floor(Math.random() * 20), // 10-30步,使移动更平滑
|
||||
});
|
||||
|
||||
// 随机停顿 100-500ms
|
||||
await new Promise(r => setTimeout(r, 100 + Math.random() * 400));
|
||||
await new Promise((r) => setTimeout(r, 100 + Math.random() * 400));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -31,19 +31,19 @@ async function simulateHumanScrolling(page: puppeteer.Page) {
|
||||
await page.evaluate((distance) => {
|
||||
window.scrollBy({
|
||||
top: distance,
|
||||
behavior: 'smooth'
|
||||
behavior: 'smooth',
|
||||
});
|
||||
}, scrollDistance);
|
||||
|
||||
// 随机停顿 500-1500ms
|
||||
await new Promise(r => setTimeout(r, 500 + Math.random() * 1000));
|
||||
await new Promise((r) => setTimeout(r, 500 + Math.random() * 1000));
|
||||
}
|
||||
|
||||
// 滚动回顶部
|
||||
await page.evaluate(() => {
|
||||
window.scrollTo({ top: 0, behavior: 'smooth' });
|
||||
});
|
||||
await new Promise(r => setTimeout(r, 1000));
|
||||
await new Promise((r) => setTimeout(r, 1000));
|
||||
}
|
||||
|
||||
export interface CnncecpResult {
|
||||
@@ -52,12 +52,22 @@ export interface CnncecpResult {
|
||||
url: string;
|
||||
}
|
||||
|
||||
interface CnncecpCrawlerType {
|
||||
name: string;
|
||||
url: string;
|
||||
baseUrl: string;
|
||||
extract(html: string): CnncecpResult[];
|
||||
}
|
||||
|
||||
export const CnncecpCrawler = {
|
||||
name: '中核集团电子采购平台',
|
||||
url: 'https://www.cnncecp.com/xzbgg/index.jhtml',
|
||||
baseUrl: 'https://www.cnncecp.com/',
|
||||
|
||||
async crawl(browser: puppeteer.Browser): Promise<CnncecpResult[]> {
|
||||
async crawl(
|
||||
this: CnncecpCrawlerType,
|
||||
browser: puppeteer.Browser,
|
||||
): Promise<CnncecpResult[]> {
|
||||
const logger = new Logger('CnncecpCrawler');
|
||||
const page = await browser.newPage();
|
||||
|
||||
@@ -69,11 +79,15 @@ export const CnncecpCrawler = {
|
||||
|
||||
await page.evaluateOnNewDocument(() => {
|
||||
Object.defineProperty(navigator, 'webdriver', { get: () => false });
|
||||
Object.defineProperty(navigator, 'language', { get: () => "zh-CN"});
|
||||
Object.defineProperty(navigator, 'plugins', { get: () => [1,2,3,4,5]});
|
||||
Object.defineProperty(navigator, 'language', { get: () => 'zh-CN' });
|
||||
Object.defineProperty(navigator, 'plugins', {
|
||||
get: () => [1, 2, 3, 4, 5],
|
||||
});
|
||||
});
|
||||
|
||||
await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36');
|
||||
await page.setUserAgent(
|
||||
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36',
|
||||
);
|
||||
await page.setViewport({ width: 1920, height: 1080 });
|
||||
|
||||
const allResults: CnncecpResult[] = [];
|
||||
@@ -103,7 +117,9 @@ export const CnncecpCrawler = {
|
||||
}
|
||||
|
||||
allResults.push(...pageResults);
|
||||
logger.log(`Extracted ${pageResults.length} items from page ${currentPage}`);
|
||||
logger.log(
|
||||
`Extracted ${pageResults.length} items from page ${currentPage}`,
|
||||
);
|
||||
|
||||
// 模拟人类行为 - 翻页前
|
||||
logger.log('Simulating human mouse movements before pagination...');
|
||||
@@ -126,9 +142,13 @@ export const CnncecpCrawler = {
|
||||
try {
|
||||
// 点击下一页按钮
|
||||
await nextButton.click();
|
||||
await new Promise(r => setTimeout(r, 3000)); // 等待页面加载
|
||||
await new Promise((r) => setTimeout(r, 3000)); // 等待页面加载
|
||||
} catch (navError) {
|
||||
logger.error(`Navigation to page ${currentPage + 1} failed: ${navError.message}`);
|
||||
const navErrorMessage =
|
||||
navError instanceof Error ? navError.message : String(navError);
|
||||
logger.error(
|
||||
`Navigation to page ${currentPage + 1} failed: ${navErrorMessage}`,
|
||||
);
|
||||
break;
|
||||
}
|
||||
|
||||
@@ -143,20 +163,21 @@ export const CnncecpCrawler = {
|
||||
|
||||
// Random delay between pages
|
||||
const delay = Math.floor(Math.random() * (3000 - 1000 + 1)) + 1000;
|
||||
await new Promise(resolve => setTimeout(resolve, delay));
|
||||
await new Promise((resolve) => setTimeout(resolve, delay));
|
||||
}
|
||||
|
||||
return allResults;
|
||||
|
||||
} catch (error) {
|
||||
logger.error(`Failed to crawl ${this.name}: ${error.message}`);
|
||||
const errorMessage =
|
||||
error instanceof Error ? error.message : String(error);
|
||||
logger.error(`Failed to crawl ${this.name}: ${errorMessage}`);
|
||||
return allResults;
|
||||
} finally {
|
||||
await page.close();
|
||||
}
|
||||
},
|
||||
|
||||
extract(html: string): CnncecpResult[] {
|
||||
extract(this: CnncecpCrawlerType, html: string): CnncecpResult[] {
|
||||
const results: CnncecpResult[] = [];
|
||||
/**
|
||||
* Regex groups for cnncecp.com:
|
||||
@@ -172,24 +193,25 @@ export const CnncecpCrawler = {
|
||||
* <a href="https://www.cnncecp.com/xzbgg/1862778.jhtml">中核四0四有限公司2026-2028年度质量流量控制器等采购项目(二次)变更公告</a>
|
||||
* </li>
|
||||
*/
|
||||
const regex = /<li>[\s\S]*?<span class="Right Gray">\s*(\d{4}-\d{2}-\d{2})\s*<\/span>[\s\S]*?<a[^>]*href="([^"]*)"[^>]*>([^<]*)<\/a>[\s\S]*?<\/li>/gs;
|
||||
const regex =
|
||||
/<li>[\s\S]*?<span class="Right Gray">\s*(\d{4}-\d{2}-\d{2})\s*<\/span>[\s\S]*?<a[^>]*href="([^"]*)"[^>]*>([^<]*)<\/a>[\s\S]*?<\/li>/gs;
|
||||
|
||||
let match;
|
||||
let match: RegExpExecArray | null;
|
||||
while ((match = regex.exec(html)) !== null) {
|
||||
const dateStr = match[1]?.trim();
|
||||
const url = match[2]?.trim();
|
||||
const title = match[3]?.trim();
|
||||
const dateStr = match[1]?.trim() ?? '';
|
||||
const url = match[2]?.trim() ?? '';
|
||||
const title = match[3]?.trim() ?? '';
|
||||
|
||||
if (title && url) {
|
||||
const fullUrl = url.startsWith('http') ? url : this.baseUrl + url;
|
||||
results.push({
|
||||
title,
|
||||
publishDate: dateStr ? new Date(dateStr) : new Date(),
|
||||
url: fullUrl.replace(/\/\//g, '/')
|
||||
url: fullUrl.replace(/\/\//g, '/'),
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
return results;
|
||||
}
|
||||
},
|
||||
};
|
||||
|
||||
@@ -2,7 +2,7 @@ import { CnoocCrawler } from './cnooc_target';
|
||||
import * as puppeteer from 'puppeteer';
|
||||
|
||||
// Increase timeout to 60 seconds for network operations
|
||||
jest.setTimeout(60000*5);
|
||||
jest.setTimeout(60000 * 5);
|
||||
|
||||
// 获取代理配置
|
||||
const getProxyArgs = (): string[] => {
|
||||
@@ -51,7 +51,9 @@ describe('CnoocCrawler Real Site Test', () => {
|
||||
console.log(`\nSuccessfully found ${results.length} items:\n`);
|
||||
console.log('----------------------------------------');
|
||||
results.forEach((item, index) => {
|
||||
console.log(`${index + 1}. [${item.publishDate.toLocaleDateString()}] ${item.title}`);
|
||||
console.log(
|
||||
`${index + 1}. [${item.publishDate.toLocaleDateString()}] ${item.title}`,
|
||||
);
|
||||
console.log(` Link: ${item.url}`);
|
||||
console.log('----------------------------------------');
|
||||
});
|
||||
@@ -61,13 +63,15 @@ describe('CnoocCrawler Real Site Test', () => {
|
||||
expect(Array.isArray(results)).toBeTruthy();
|
||||
// Warn but don't fail if site returns 0 items (could be empty or changed structure)
|
||||
if (results.length === 0) {
|
||||
console.warn('Warning: No items found. Check if website structure has changed or if list is currently empty.');
|
||||
console.warn(
|
||||
'Warning: No items found. Check if website structure has changed or if list is currently empty.',
|
||||
);
|
||||
} else {
|
||||
// Check data integrity of first item
|
||||
const firstItem = results[0];
|
||||
expect(firstItem.title).toBeTruthy();
|
||||
expect(firstItem.url).toMatch(/^https?:\/\//);
|
||||
expect(firstItem.publishDate).toBeInstanceOf(Date);
|
||||
// Check data integrity of first item
|
||||
const firstItem = results[0];
|
||||
expect(firstItem.title).toBeTruthy();
|
||||
expect(firstItem.url).toMatch(/^https?:\/\//);
|
||||
expect(firstItem.publishDate).toBeInstanceOf(Date);
|
||||
}
|
||||
});
|
||||
});
|
||||
|
||||
@@ -13,11 +13,11 @@ async function simulateHumanMouseMovement(page: puppeteer.Page) {
|
||||
const y = Math.floor(Math.random() * viewport.height);
|
||||
|
||||
await page.mouse.move(x, y, {
|
||||
steps: 10 + Math.floor(Math.random() * 20) // 10-30步,使移动更平滑
|
||||
steps: 10 + Math.floor(Math.random() * 20), // 10-30步,使移动更平滑
|
||||
});
|
||||
|
||||
// 随机停顿 100-500ms
|
||||
await new Promise(r => setTimeout(r, 100 + Math.random() * 400));
|
||||
await new Promise((r) => setTimeout(r, 100 + Math.random() * 400));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -31,19 +31,19 @@ async function simulateHumanScrolling(page: puppeteer.Page) {
|
||||
await page.evaluate((distance) => {
|
||||
window.scrollBy({
|
||||
top: distance,
|
||||
behavior: 'smooth'
|
||||
behavior: 'smooth',
|
||||
});
|
||||
}, scrollDistance);
|
||||
|
||||
// 随机停顿 500-1500ms
|
||||
await new Promise(r => setTimeout(r, 500 + Math.random() * 1000));
|
||||
await new Promise((r) => setTimeout(r, 500 + Math.random() * 1000));
|
||||
}
|
||||
|
||||
// 滚动回顶部
|
||||
await page.evaluate(() => {
|
||||
window.scrollTo({ top: 0, behavior: 'smooth' });
|
||||
});
|
||||
await new Promise(r => setTimeout(r, 1000));
|
||||
await new Promise((r) => setTimeout(r, 1000));
|
||||
}
|
||||
|
||||
export interface CnoocResult {
|
||||
@@ -52,12 +52,22 @@ export interface CnoocResult {
|
||||
url: string;
|
||||
}
|
||||
|
||||
interface CnoocCrawlerType {
|
||||
name: string;
|
||||
url: string;
|
||||
baseUrl: string;
|
||||
extract(html: string): CnoocResult[];
|
||||
}
|
||||
|
||||
export const CnoocCrawler = {
|
||||
name: '中海油招标平台',
|
||||
url: 'https://buy.cnooc.com.cn/cbjyweb/001/001001/moreinfo.html',
|
||||
baseUrl: 'https://buy.cnooc.com.cn/',
|
||||
|
||||
async crawl(browser: puppeteer.Browser): Promise<CnoocResult[]> {
|
||||
async crawl(
|
||||
this: CnoocCrawlerType,
|
||||
browser: puppeteer.Browser,
|
||||
): Promise<CnoocResult[]> {
|
||||
const logger = new Logger('CnoocCrawler');
|
||||
const page = await browser.newPage();
|
||||
|
||||
@@ -69,11 +79,15 @@ export const CnoocCrawler = {
|
||||
|
||||
await page.evaluateOnNewDocument(() => {
|
||||
Object.defineProperty(navigator, 'webdriver', { get: () => false });
|
||||
Object.defineProperty(navigator, 'language', { get: () => "zh-CN"});
|
||||
Object.defineProperty(navigator, 'plugins', { get: () => [1,2,3,4,5]});
|
||||
Object.defineProperty(navigator, 'language', { get: () => 'zh-CN' });
|
||||
Object.defineProperty(navigator, 'plugins', {
|
||||
get: () => [1, 2, 3, 4, 5],
|
||||
});
|
||||
});
|
||||
|
||||
await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36');
|
||||
await page.setUserAgent(
|
||||
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36',
|
||||
);
|
||||
await page.setViewport({ width: 1920, height: 1080 });
|
||||
|
||||
const allResults: CnoocResult[] = [];
|
||||
@@ -103,7 +117,9 @@ export const CnoocCrawler = {
|
||||
}
|
||||
|
||||
allResults.push(...pageResults);
|
||||
logger.log(`Extracted ${pageResults.length} items from page ${currentPage}`);
|
||||
logger.log(
|
||||
`Extracted ${pageResults.length} items from page ${currentPage}`,
|
||||
);
|
||||
|
||||
// 模拟人类行为 - 翻页前
|
||||
logger.log('Simulating human mouse movements before pagination...');
|
||||
@@ -127,9 +143,13 @@ export const CnoocCrawler = {
|
||||
try {
|
||||
// 点击下一页按钮
|
||||
await nextButton.click();
|
||||
await new Promise(r => setTimeout(r, 3000)); // 等待页面加载
|
||||
await new Promise((r) => setTimeout(r, 3000)); // 等待页面加载
|
||||
} catch (navError) {
|
||||
logger.error(`Navigation to page ${currentPage + 1} failed: ${navError.message}`);
|
||||
const navErrorMessage =
|
||||
navError instanceof Error ? navError.message : String(navError);
|
||||
logger.error(
|
||||
`Navigation to page ${currentPage + 1} failed: ${navErrorMessage}`,
|
||||
);
|
||||
break;
|
||||
}
|
||||
|
||||
@@ -144,20 +164,21 @@ export const CnoocCrawler = {
|
||||
|
||||
// Random delay between pages
|
||||
const delay = Math.floor(Math.random() * (3000 - 1000 + 1)) + 1000;
|
||||
await new Promise(resolve => setTimeout(resolve, delay));
|
||||
await new Promise((resolve) => setTimeout(resolve, delay));
|
||||
}
|
||||
|
||||
return allResults;
|
||||
|
||||
} catch (error) {
|
||||
logger.error(`Failed to crawl ${this.name}: ${error.message}`);
|
||||
const errorMessage =
|
||||
error instanceof Error ? error.message : String(error);
|
||||
logger.error(`Failed to crawl ${this.name}: ${errorMessage}`);
|
||||
return allResults;
|
||||
} finally {
|
||||
await page.close();
|
||||
}
|
||||
},
|
||||
|
||||
extract(html: string): CnoocResult[] {
|
||||
extract(this: CnoocCrawlerType, html: string): CnoocResult[] {
|
||||
const results: CnoocResult[] = [];
|
||||
/**
|
||||
* Regex groups for buy.cnooc.com.cn:
|
||||
@@ -173,24 +194,25 @@ export const CnoocCrawler = {
|
||||
* <span class="now-span" style="width:100px">2026-01-12</span>
|
||||
* </li>
|
||||
*/
|
||||
const regex = /<li class="now-hd-items clearfix">[\s\S]*?<a[^>]*href="([^"]*)"[^>]*>[\s\S]*?<font[^>]*>([^<]*)<\/font>[\s\S]*?<span class="now-span"[^>]*>\s*(\d{4}-\d{2}-\d{2})\s*<\/span>[\s\S]*?<\/li>/gs;
|
||||
const regex =
|
||||
/<li class="now-hd-items clearfix">[\s\S]*?<a[^>]*href="([^"]*)"[^>]*>[\s\S]*?<font[^>]*>([^<]*)<\/font>[\s\S]*?<span class="now-span"[^>]*>\s*(\d{4}-\d{2}-\d{2})\s*<\/span>[\s\S]*?<\/li>/gs;
|
||||
|
||||
let match;
|
||||
let match: RegExpExecArray | null;
|
||||
while ((match = regex.exec(html)) !== null) {
|
||||
const url = match[1]?.trim();
|
||||
const title = match[2]?.trim();
|
||||
const dateStr = match[3]?.trim();
|
||||
const url = match[1]?.trim() ?? '';
|
||||
const title = match[2]?.trim() ?? '';
|
||||
const dateStr = match[3]?.trim() ?? '';
|
||||
|
||||
if (title && url) {
|
||||
const fullUrl = url.startsWith('http') ? url : this.baseUrl + url;
|
||||
results.push({
|
||||
title,
|
||||
publishDate: dateStr ? new Date(dateStr) : new Date(),
|
||||
url: fullUrl.replace(/\/\//g, '/')
|
||||
url: fullUrl.replace(/\/\//g, '/'),
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
return results;
|
||||
}
|
||||
},
|
||||
};
|
||||
|
||||
@@ -2,7 +2,7 @@ import { EpsCrawler } from './eps_target';
|
||||
import * as puppeteer from 'puppeteer';
|
||||
|
||||
// Increase timeout to 60 seconds for network operations
|
||||
jest.setTimeout(60000*5);
|
||||
jest.setTimeout(60000 * 5);
|
||||
|
||||
// 获取代理配置
|
||||
const getProxyArgs = (): string[] => {
|
||||
@@ -51,7 +51,9 @@ describe('EpsCrawler Real Site Test', () => {
|
||||
console.log(`\nSuccessfully found ${results.length} items:\n`);
|
||||
console.log('----------------------------------------');
|
||||
results.forEach((item, index) => {
|
||||
console.log(`${index + 1}. [${item.publishDate.toLocaleDateString()}] ${item.title}`);
|
||||
console.log(
|
||||
`${index + 1}. [${item.publishDate.toLocaleDateString()}] ${item.title}`,
|
||||
);
|
||||
console.log(` Link: ${item.url}`);
|
||||
console.log('----------------------------------------');
|
||||
});
|
||||
@@ -61,13 +63,15 @@ describe('EpsCrawler Real Site Test', () => {
|
||||
expect(Array.isArray(results)).toBeTruthy();
|
||||
// Warn but don't fail if site returns 0 items (could be empty or changed structure)
|
||||
if (results.length === 0) {
|
||||
console.warn('Warning: No items found. Check if website structure has changed or if list is currently empty.');
|
||||
console.warn(
|
||||
'Warning: No items found. Check if website structure has changed or if list is currently empty.',
|
||||
);
|
||||
} else {
|
||||
// Check data integrity of first item
|
||||
const firstItem = results[0];
|
||||
expect(firstItem.title).toBeTruthy();
|
||||
expect(firstItem.url).toMatch(/^https?:\/\//);
|
||||
expect(firstItem.publishDate).toBeInstanceOf(Date);
|
||||
// Check data integrity of first item
|
||||
const firstItem = results[0];
|
||||
expect(firstItem.title).toBeTruthy();
|
||||
expect(firstItem.url).toMatch(/^https?:\/\//);
|
||||
expect(firstItem.publishDate).toBeInstanceOf(Date);
|
||||
}
|
||||
});
|
||||
});
|
||||
|
||||
@@ -13,11 +13,11 @@ async function simulateHumanMouseMovement(page: puppeteer.Page) {
|
||||
const y = Math.floor(Math.random() * viewport.height);
|
||||
|
||||
await page.mouse.move(x, y, {
|
||||
steps: 10 + Math.floor(Math.random() * 20) // 10-30步,使移动更平滑
|
||||
steps: 10 + Math.floor(Math.random() * 20), // 10-30步,使移动更平滑
|
||||
});
|
||||
|
||||
// 随机停顿 100-500ms
|
||||
await new Promise(r => setTimeout(r, 100 + Math.random() * 400));
|
||||
await new Promise((r) => setTimeout(r, 100 + Math.random() * 400));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -31,19 +31,19 @@ async function simulateHumanScrolling(page: puppeteer.Page) {
|
||||
await page.evaluate((distance) => {
|
||||
window.scrollBy({
|
||||
top: distance,
|
||||
behavior: 'smooth'
|
||||
behavior: 'smooth',
|
||||
});
|
||||
}, scrollDistance);
|
||||
|
||||
// 随机停顿 500-1500ms
|
||||
await new Promise(r => setTimeout(r, 500 + Math.random() * 1000));
|
||||
await new Promise((r) => setTimeout(r, 500 + Math.random() * 1000));
|
||||
}
|
||||
|
||||
// 滚动回顶部
|
||||
await page.evaluate(() => {
|
||||
window.scrollTo({ top: 0, behavior: 'smooth' });
|
||||
});
|
||||
await new Promise(r => setTimeout(r, 1000));
|
||||
await new Promise((r) => setTimeout(r, 1000));
|
||||
}
|
||||
|
||||
export interface EpsResult {
|
||||
@@ -52,12 +52,22 @@ export interface EpsResult {
|
||||
url: string;
|
||||
}
|
||||
|
||||
/**
 * Shape of the EpsCrawler object as seen from inside its own methods.
 *
 * Used as the explicit `this` parameter type of `EpsCrawler.crawl` and
 * `EpsCrawler.extract`, so that `this.name` and `this.baseUrl` are type-checked
 * instead of relying on an implicitly typed `this`.
 */
interface EpsCrawlerType {
|
||||
name: string; // human-readable site name; interpolated into the crawl failure log message
|
||||
url: string; // listing page URL declared on the crawler object (presumably the crawl entry point; the navigation code is not visible here)
|
||||
baseUrl: string; // prefix prepended by extract() to hrefs that do not start with "http"
|
||||
extract(html: string): EpsResult[]; // parses listing-page HTML into result items
|
||||
}
|
||||
|
||||
export const EpsCrawler = {
|
||||
name: '中国三峡集团电子商务平台',
|
||||
url: 'https://eps.ctg.com.cn/cms/channel/1ywgg1/index.htm',
|
||||
baseUrl: 'https://eps.ctg.com.cn/',
|
||||
|
||||
async crawl(browser: puppeteer.Browser): Promise<EpsResult[]> {
|
||||
async crawl(
|
||||
this: EpsCrawlerType,
|
||||
browser: puppeteer.Browser,
|
||||
): Promise<EpsResult[]> {
|
||||
const logger = new Logger('EpsCrawler');
|
||||
const page = await browser.newPage();
|
||||
|
||||
@@ -69,11 +79,15 @@ export const EpsCrawler = {
|
||||
|
||||
await page.evaluateOnNewDocument(() => {
|
||||
Object.defineProperty(navigator, 'webdriver', { get: () => false });
|
||||
Object.defineProperty(navigator, 'language', { get: () => "zh-CN"});
|
||||
Object.defineProperty(navigator, 'plugins', { get: () => [1,2,3,4,5]});
|
||||
Object.defineProperty(navigator, 'language', { get: () => 'zh-CN' });
|
||||
Object.defineProperty(navigator, 'plugins', {
|
||||
get: () => [1, 2, 3, 4, 5],
|
||||
});
|
||||
});
|
||||
|
||||
await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36');
|
||||
await page.setUserAgent(
|
||||
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36',
|
||||
);
|
||||
await page.setViewport({ width: 1920, height: 1080 });
|
||||
|
||||
const allResults: EpsResult[] = [];
|
||||
@@ -103,7 +117,9 @@ export const EpsCrawler = {
|
||||
}
|
||||
|
||||
allResults.push(...pageResults);
|
||||
logger.log(`Extracted ${pageResults.length} items from page ${currentPage}`);
|
||||
logger.log(
|
||||
`Extracted ${pageResults.length} items from page ${currentPage}`,
|
||||
);
|
||||
|
||||
// 模拟人类行为 - 翻页前
|
||||
logger.log('Simulating human mouse movements before pagination...');
|
||||
@@ -127,9 +143,13 @@ export const EpsCrawler = {
|
||||
try {
|
||||
// 点击下一页按钮,等待页面更新
|
||||
await nextButton.click();
|
||||
await new Promise(r => setTimeout(r, 3000)); // 等待页面加载
|
||||
await new Promise((r) => setTimeout(r, 3000)); // 等待页面加载
|
||||
} catch (navError) {
|
||||
logger.error(`Navigation to page ${currentPage + 1} failed: ${navError.message}`);
|
||||
const navErrorMessage =
|
||||
navError instanceof Error ? navError.message : String(navError);
|
||||
logger.error(
|
||||
`Navigation to page ${currentPage + 1} failed: ${navErrorMessage}`,
|
||||
);
|
||||
break;
|
||||
}
|
||||
|
||||
@@ -144,20 +164,21 @@ export const EpsCrawler = {
|
||||
|
||||
// Random delay between pages
|
||||
const delay = Math.floor(Math.random() * (3000 - 1000 + 1)) + 1000;
|
||||
await new Promise(resolve => setTimeout(resolve, delay));
|
||||
await new Promise((resolve) => setTimeout(resolve, delay));
|
||||
}
|
||||
|
||||
return allResults;
|
||||
|
||||
} catch (error) {
|
||||
logger.error(`Failed to crawl ${this.name}: ${error.message}`);
|
||||
const errorMessage =
|
||||
error instanceof Error ? error.message : String(error);
|
||||
logger.error(`Failed to crawl ${this.name}: ${errorMessage}`);
|
||||
return allResults;
|
||||
} finally {
|
||||
await page.close();
|
||||
}
|
||||
},
|
||||
|
||||
extract(html: string): EpsResult[] {
|
||||
extract(this: EpsCrawlerType, html: string): EpsResult[] {
|
||||
const results: EpsResult[] = [];
|
||||
/**
|
||||
* Regex groups for eps.ctg.com.cn:
|
||||
@@ -179,24 +200,25 @@ export const EpsCrawler = {
|
||||
* </a>
|
||||
* </li>
|
||||
*/
|
||||
const regex = /<li[^>]*name="li_name"[^>]*>[\s\S]*?<a[^>]*href="([^"]*)"[^>]*title="([^"]*)"[^>]*>[\s\S]*?<em>\s*(\d{4}-\d{2}-\d{2})\s*<\/em>[\s\S]*?<\/a>[\s\S]*?<\/li>/gs;
|
||||
const regex =
|
||||
/<li[^>]*name="li_name"[^>]*>[\s\S]*?<a[^>]*href="([^"]*)"[^>]*title="([^"]*)"[^>]*>[\s\S]*?<em>\s*(\d{4}-\d{2}-\d{2})\s*<\/em>[\s\S]*?<\/a>[\s\S]*?<\/li>/gs;
|
||||
|
||||
let match;
|
||||
let match: RegExpExecArray | null;
|
||||
while ((match = regex.exec(html)) !== null) {
|
||||
const url = match[1]?.trim();
|
||||
const title = match[2]?.trim();
|
||||
const dateStr = match[3]?.trim();
|
||||
const url = match[1]?.trim() ?? '';
|
||||
const title = match[2]?.trim() ?? '';
|
||||
const dateStr = match[3]?.trim() ?? '';
|
||||
|
||||
if (title && url) {
|
||||
const fullUrl = url.startsWith('http') ? url : this.baseUrl + url;
|
||||
results.push({
|
||||
title,
|
||||
publishDate: dateStr ? new Date(dateStr) : new Date(),
|
||||
url: fullUrl.replace(/\/\//g, '/')
|
||||
url: fullUrl.replace(/\/\//g, '/'),
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
return results;
|
||||
}
|
||||
},
|
||||
};
|
||||
|
||||
@@ -2,7 +2,7 @@ import { EspicCrawler } from './espic_target';
|
||||
import * as puppeteer from 'puppeteer';
|
||||
|
||||
// Increase timeout to 60 seconds for network operations
|
||||
jest.setTimeout(60000*5);
|
||||
jest.setTimeout(60000 * 5);
|
||||
|
||||
// 获取代理配置
|
||||
const getProxyArgs = (): string[] => {
|
||||
@@ -51,7 +51,9 @@ describe('EspicCrawler Real Site Test', () => {
|
||||
console.log(`\nSuccessfully found ${results.length} items:\n`);
|
||||
console.log('----------------------------------------');
|
||||
results.forEach((item, index) => {
|
||||
console.log(`${index + 1}. [${item.publishDate.toLocaleDateString()}] ${item.title}`);
|
||||
console.log(
|
||||
`${index + 1}. [${item.publishDate.toLocaleDateString()}] ${item.title}`,
|
||||
);
|
||||
console.log(` Link: ${item.url}`);
|
||||
console.log('----------------------------------------');
|
||||
});
|
||||
@@ -61,13 +63,15 @@ describe('EspicCrawler Real Site Test', () => {
|
||||
expect(Array.isArray(results)).toBeTruthy();
|
||||
// Warn but don't fail if site returns 0 items (could be empty or changed structure)
|
||||
if (results.length === 0) {
|
||||
console.warn('Warning: No items found. Check if website structure has changed or if list is currently empty.');
|
||||
console.warn(
|
||||
'Warning: No items found. Check if website structure has changed or if list is currently empty.',
|
||||
);
|
||||
} else {
|
||||
// Check data integrity of first item
|
||||
const firstItem = results[0];
|
||||
expect(firstItem.title).toBeTruthy();
|
||||
expect(firstItem.url).toMatch(/^https?:\/\//);
|
||||
expect(firstItem.publishDate).toBeInstanceOf(Date);
|
||||
// Check data integrity of first item
|
||||
const firstItem = results[0];
|
||||
expect(firstItem.title).toBeTruthy();
|
||||
expect(firstItem.url).toMatch(/^https?:\/\//);
|
||||
expect(firstItem.publishDate).toBeInstanceOf(Date);
|
||||
}
|
||||
});
|
||||
});
|
||||
|
||||
@@ -13,11 +13,11 @@ async function simulateHumanMouseMovement(page: puppeteer.Page) {
|
||||
const y = Math.floor(Math.random() * viewport.height);
|
||||
|
||||
await page.mouse.move(x, y, {
|
||||
steps: 10 + Math.floor(Math.random() * 20) // 10-30步,使移动更平滑
|
||||
steps: 10 + Math.floor(Math.random() * 20), // 10-30步,使移动更平滑
|
||||
});
|
||||
|
||||
// 随机停顿 100-500ms
|
||||
await new Promise(r => setTimeout(r, 100 + Math.random() * 400));
|
||||
await new Promise((r) => setTimeout(r, 100 + Math.random() * 400));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -31,19 +31,19 @@ async function simulateHumanScrolling(page: puppeteer.Page) {
|
||||
await page.evaluate((distance) => {
|
||||
window.scrollBy({
|
||||
top: distance,
|
||||
behavior: 'smooth'
|
||||
behavior: 'smooth',
|
||||
});
|
||||
}, scrollDistance);
|
||||
|
||||
// 随机停顿 500-1500ms
|
||||
await new Promise(r => setTimeout(r, 500 + Math.random() * 1000));
|
||||
await new Promise((r) => setTimeout(r, 500 + Math.random() * 1000));
|
||||
}
|
||||
|
||||
// 滚动回顶部
|
||||
await page.evaluate(() => {
|
||||
window.scrollTo({ top: 0, behavior: 'smooth' });
|
||||
});
|
||||
await new Promise(r => setTimeout(r, 1000));
|
||||
await new Promise((r) => setTimeout(r, 1000));
|
||||
}
|
||||
|
||||
export interface EspicResult {
|
||||
@@ -52,12 +52,19 @@ export interface EspicResult {
|
||||
url: string;
|
||||
}
|
||||
|
||||
/**
 * Shape of the EspicCrawler object as seen from inside its own methods.
 *
 * Used as the explicit `this` parameter type of `EspicCrawler.getUrl`,
 * `EspicCrawler.crawl` and `EspicCrawler.extract`, so that member access through
 * `this` is type-checked.
 */
interface EspicCrawlerType {
|
||||
name: string; // human-readable site name; interpolated into the crawl failure log message
|
||||
baseUrl: string; // prefix prepended by extract() to hrefs that do not start with "http"
|
||||
getUrl(page?: number): string; // builds the listing URL for a page number; the implementation defaults page to 1
|
||||
extract(html: string): EspicResult[]; // parses listing-page HTML into result items
|
||||
}
|
||||
|
||||
export const EspicCrawler = {
|
||||
name: '电能e招采平台(国电投)',
|
||||
baseUrl: 'https://ebid.espic.com.cn/',
|
||||
|
||||
// 生成动态 URL,使用当前日期
|
||||
getUrl(page: number = 1): string {
|
||||
getUrl(this: EspicCrawlerType, page: number = 1): string {
|
||||
const now = new Date();
|
||||
const year = now.getFullYear();
|
||||
const month = now.getMonth() + 1; // 月份从0开始
|
||||
@@ -66,7 +73,10 @@ export const EspicCrawler = {
|
||||
return `https://ebid.espic.com.cn/newgdtcms//category/iframe.html?dates=300&categoryId=2&tenderMethod=01&tabName=&page=${page}&time=${timeStr}`;
|
||||
},
|
||||
|
||||
async crawl(browser: puppeteer.Browser): Promise<EspicResult[]> {
|
||||
async crawl(
|
||||
this: EspicCrawlerType,
|
||||
browser: puppeteer.Browser,
|
||||
): Promise<EspicResult[]> {
|
||||
const logger = new Logger('EspicCrawler');
|
||||
const page = await browser.newPage();
|
||||
|
||||
@@ -78,11 +88,15 @@ export const EspicCrawler = {
|
||||
|
||||
await page.evaluateOnNewDocument(() => {
|
||||
Object.defineProperty(navigator, 'webdriver', { get: () => false });
|
||||
Object.defineProperty(navigator, 'language', { get: () => "zh-CN"});
|
||||
Object.defineProperty(navigator, 'plugins', { get: () => [1,2,3,4,5]});
|
||||
Object.defineProperty(navigator, 'language', { get: () => 'zh-CN' });
|
||||
Object.defineProperty(navigator, 'plugins', {
|
||||
get: () => [1, 2, 3, 4, 5],
|
||||
});
|
||||
});
|
||||
|
||||
await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36');
|
||||
await page.setUserAgent(
|
||||
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36',
|
||||
);
|
||||
await page.setViewport({ width: 1920, height: 1080 });
|
||||
|
||||
const allResults: EspicResult[] = [];
|
||||
@@ -100,9 +114,12 @@ export const EspicCrawler = {
|
||||
() => {
|
||||
// 检查是否已经通过验证(页面不再是 WAF 页面)
|
||||
const bodyText = document.body?.textContent || '';
|
||||
return !bodyText.includes('人机识别检测') && !bodyText.includes('WEB 应用防火墙');
|
||||
return (
|
||||
!bodyText.includes('人机识别检测') &&
|
||||
!bodyText.includes('WEB 应用防火墙')
|
||||
);
|
||||
},
|
||||
{ timeout: 30000 }
|
||||
{ timeout: 30000 },
|
||||
);
|
||||
|
||||
// 模拟人类行为
|
||||
@@ -124,7 +141,9 @@ export const EspicCrawler = {
|
||||
}
|
||||
|
||||
allResults.push(...pageResults);
|
||||
logger.log(`Extracted ${pageResults.length} items from page ${currentPage}`);
|
||||
logger.log(
|
||||
`Extracted ${pageResults.length} items from page ${currentPage}`,
|
||||
);
|
||||
|
||||
// 模拟人类行为 - 翻页前
|
||||
logger.log('Simulating human mouse movements before pagination...');
|
||||
@@ -141,7 +160,7 @@ export const EspicCrawler = {
|
||||
'a[aria-label="Next"]',
|
||||
'a.next',
|
||||
'li.next a',
|
||||
'a.layui-laypage-next:not(.layui-disabled)'
|
||||
'a.layui-laypage-next:not(.layui-disabled)',
|
||||
];
|
||||
|
||||
let nextButton: puppeteer.ElementHandle<Element> | null = null;
|
||||
@@ -149,7 +168,7 @@ export const EspicCrawler = {
|
||||
try {
|
||||
nextButton = await page.$(selector);
|
||||
if (nextButton) break;
|
||||
} catch (e) {
|
||||
} catch {
|
||||
// 继续尝试下一个选择器
|
||||
}
|
||||
}
|
||||
@@ -164,9 +183,13 @@ export const EspicCrawler = {
|
||||
try {
|
||||
// 点击下一页按钮
|
||||
await nextButton.click();
|
||||
await new Promise(r => setTimeout(r, 3000)); // 等待页面加载
|
||||
await new Promise((r) => setTimeout(r, 3000)); // 等待页面加载
|
||||
} catch (navError) {
|
||||
logger.error(`Navigation to page ${currentPage + 1} failed: ${navError.message}`);
|
||||
const navErrorMessage =
|
||||
navError instanceof Error ? navError.message : String(navError);
|
||||
logger.error(
|
||||
`Navigation to page ${currentPage + 1} failed: ${navErrorMessage}`,
|
||||
);
|
||||
break;
|
||||
}
|
||||
|
||||
@@ -181,20 +204,21 @@ export const EspicCrawler = {
|
||||
|
||||
// Random delay between pages
|
||||
const delay = Math.floor(Math.random() * (3000 - 1000 + 1)) + 1000;
|
||||
await new Promise(resolve => setTimeout(resolve, delay));
|
||||
await new Promise((resolve) => setTimeout(resolve, delay));
|
||||
}
|
||||
|
||||
return allResults;
|
||||
|
||||
} catch (error) {
|
||||
logger.error(`Failed to crawl ${this.name}: ${error.message}`);
|
||||
const errorMessage =
|
||||
error instanceof Error ? error.message : String(error);
|
||||
logger.error(`Failed to crawl ${this.name}: ${errorMessage}`);
|
||||
return allResults;
|
||||
} finally {
|
||||
await page.close();
|
||||
}
|
||||
},
|
||||
|
||||
extract(html: string): EspicResult[] {
|
||||
extract(this: EspicCrawlerType, html: string): EspicResult[] {
|
||||
const results: EspicResult[] = [];
|
||||
/**
|
||||
* Regex groups for ebid.espic.com.cn:
|
||||
@@ -225,24 +249,25 @@ export const EspicCrawler = {
|
||||
* </a>
|
||||
* </li>
|
||||
*/
|
||||
const regex = /<li>[\s\S]*?<a[^>]*href="([^"]*)"[^>]*title="([^"]*)"[^>]*>[\s\S]*?<div class="newsDate">[\s\S]*?<div>\s*(\d{4}-\d{2}-\d{2})\s*<\/div>[\s\S]*?<\/div>[\s\S]*?<\/a>[\s\S]*?<\/li>/gs;
|
||||
const regex =
|
||||
/<li>[\s\S]*?<a[^>]*href="([^"]*)"[^>]*title="([^"]*)"[^>]*>[\s\S]*?<div class="newsDate">[\s\S]*?<div>\s*(\d{4}-\d{2}-\d{2})\s*<\/div>[\s\S]*?<\/div>[\s\S]*?<\/a>[\s\S]*?<\/li>/gs;
|
||||
|
||||
let match;
|
||||
let match: RegExpExecArray | null;
|
||||
while ((match = regex.exec(html)) !== null) {
|
||||
const url = match[1]?.trim();
|
||||
const title = match[2]?.trim();
|
||||
const dateStr = match[3]?.trim();
|
||||
const url = match[1]?.trim() ?? '';
|
||||
const title = match[2]?.trim() ?? '';
|
||||
const dateStr = match[3]?.trim() ?? '';
|
||||
|
||||
if (title && url) {
|
||||
const fullUrl = url.startsWith('http') ? url : this.baseUrl + url;
|
||||
results.push({
|
||||
title,
|
||||
publishDate: dateStr ? new Date(dateStr) : new Date(),
|
||||
url: fullUrl.replace(/\/\//g, '/')
|
||||
url: fullUrl.replace(/\/\//g, '/'),
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
return results;
|
||||
}
|
||||
},
|
||||
};
|
||||
|
||||
@@ -2,7 +2,7 @@ import { PowerbeijingCrawler } from './powerbeijing_target';
|
||||
import * as puppeteer from 'puppeteer';
|
||||
|
||||
// Increase timeout to 60 seconds for network operations
|
||||
jest.setTimeout(60000*5);
|
||||
jest.setTimeout(60000 * 5);
|
||||
|
||||
// 获取代理配置
|
||||
const getProxyArgs = (): string[] => {
|
||||
@@ -51,7 +51,9 @@ describe('PowerbeijingCrawler Real Site Test', () => {
|
||||
console.log(`\nSuccessfully found ${results.length} items:\n`);
|
||||
console.log('----------------------------------------');
|
||||
results.forEach((item, index) => {
|
||||
console.log(`${index + 1}. [${item.publishDate.toLocaleDateString()}] ${item.title}`);
|
||||
console.log(
|
||||
`${index + 1}. [${item.publishDate.toLocaleDateString()}] ${item.title}`,
|
||||
);
|
||||
console.log(` Link: ${item.url}`);
|
||||
console.log('----------------------------------------');
|
||||
});
|
||||
@@ -61,13 +63,15 @@ describe('PowerbeijingCrawler Real Site Test', () => {
|
||||
expect(Array.isArray(results)).toBeTruthy();
|
||||
// Warn but don't fail if site returns 0 items (could be empty or changed structure)
|
||||
if (results.length === 0) {
|
||||
console.warn('Warning: No items found. Check if website structure has changed or if list is currently empty.');
|
||||
console.warn(
|
||||
'Warning: No items found. Check if website structure has changed or if list is currently empty.',
|
||||
);
|
||||
} else {
|
||||
// Check data integrity of first item
|
||||
const firstItem = results[0];
|
||||
expect(firstItem.title).toBeTruthy();
|
||||
expect(firstItem.url).toMatch(/^https?:\/\//);
|
||||
expect(firstItem.publishDate).toBeInstanceOf(Date);
|
||||
// Check data integrity of first item
|
||||
const firstItem = results[0];
|
||||
expect(firstItem.title).toBeTruthy();
|
||||
expect(firstItem.url).toMatch(/^https?:\/\//);
|
||||
expect(firstItem.publishDate).toBeInstanceOf(Date);
|
||||
}
|
||||
});
|
||||
});
|
||||
|
||||
@@ -13,11 +13,11 @@ async function simulateHumanMouseMovement(page: puppeteer.Page) {
|
||||
const y = Math.floor(Math.random() * viewport.height);
|
||||
|
||||
await page.mouse.move(x, y, {
|
||||
steps: 10 + Math.floor(Math.random() * 20) // 10-30步,使移动更平滑
|
||||
steps: 10 + Math.floor(Math.random() * 20), // 10-30步,使移动更平滑
|
||||
});
|
||||
|
||||
// 随机停顿 100-500ms
|
||||
await new Promise(r => setTimeout(r, 100 + Math.random() * 400));
|
||||
await new Promise((r) => setTimeout(r, 100 + Math.random() * 400));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -31,19 +31,19 @@ async function simulateHumanScrolling(page: puppeteer.Page) {
|
||||
await page.evaluate((distance) => {
|
||||
window.scrollBy({
|
||||
top: distance,
|
||||
behavior: 'smooth'
|
||||
behavior: 'smooth',
|
||||
});
|
||||
}, scrollDistance);
|
||||
|
||||
// 随机停顿 500-1500ms
|
||||
await new Promise(r => setTimeout(r, 500 + Math.random() * 1000));
|
||||
await new Promise((r) => setTimeout(r, 500 + Math.random() * 1000));
|
||||
}
|
||||
|
||||
// 滚动回顶部
|
||||
await page.evaluate(() => {
|
||||
window.scrollTo({ top: 0, behavior: 'smooth' });
|
||||
});
|
||||
await new Promise(r => setTimeout(r, 1000));
|
||||
await new Promise((r) => setTimeout(r, 1000));
|
||||
}
|
||||
|
||||
export interface PowerbeijingResult {
|
||||
@@ -52,12 +52,22 @@ export interface PowerbeijingResult {
|
||||
url: string;
|
||||
}
|
||||
|
||||
/**
 * Shape of the PowerbeijingCrawler object as seen from inside its own methods.
 *
 * Used as the explicit `this` parameter type of `PowerbeijingCrawler.crawl` and
 * `PowerbeijingCrawler.extract`, so that `this.name` and `this.baseUrl` are
 * type-checked instead of relying on an implicitly typed `this`.
 */
interface PowerbeijingCrawlerType {
|
||||
name: string; // human-readable site name; interpolated into the crawl failure log message
|
||||
url: string; // listing page URL declared on the crawler object (presumably the crawl entry point; the navigation code is not visible here)
|
||||
baseUrl: string; // prefix prepended by extract() to hrefs that do not start with "http"
|
||||
extract(html: string): PowerbeijingResult[]; // parses listing-page HTML into result items
|
||||
}
|
||||
|
||||
export const PowerbeijingCrawler = {
|
||||
name: '北京京能电子商务平台',
|
||||
url: 'https://www.powerbeijing-ec.com/jncms/search/bulletin.html?dates=300&categoryId=2&tabName=%E6%8B%9B%E6%A0%87%E5%85%AC%E5%91%8A&page=1',
|
||||
baseUrl: 'https://www.powerbeijing-ec.com/',
|
||||
|
||||
async crawl(browser: puppeteer.Browser): Promise<PowerbeijingResult[]> {
|
||||
async crawl(
|
||||
this: PowerbeijingCrawlerType,
|
||||
browser: puppeteer.Browser,
|
||||
): Promise<PowerbeijingResult[]> {
|
||||
const logger = new Logger('PowerbeijingCrawler');
|
||||
const page = await browser.newPage();
|
||||
|
||||
@@ -69,11 +79,15 @@ export const PowerbeijingCrawler = {
|
||||
|
||||
await page.evaluateOnNewDocument(() => {
|
||||
Object.defineProperty(navigator, 'webdriver', { get: () => false });
|
||||
Object.defineProperty(navigator, 'language', { get: () => "zh-CN"});
|
||||
Object.defineProperty(navigator, 'plugins', { get: () => [1,2,3,4,5]});
|
||||
Object.defineProperty(navigator, 'language', { get: () => 'zh-CN' });
|
||||
Object.defineProperty(navigator, 'plugins', {
|
||||
get: () => [1, 2, 3, 4, 5],
|
||||
});
|
||||
});
|
||||
|
||||
await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36');
|
||||
await page.setUserAgent(
|
||||
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36',
|
||||
);
|
||||
await page.setViewport({ width: 1920, height: 1080 });
|
||||
|
||||
const allResults: PowerbeijingResult[] = [];
|
||||
@@ -103,7 +117,9 @@ export const PowerbeijingCrawler = {
|
||||
}
|
||||
|
||||
allResults.push(...pageResults);
|
||||
logger.log(`Extracted ${pageResults.length} items from page ${currentPage}`);
|
||||
logger.log(
|
||||
`Extracted ${pageResults.length} items from page ${currentPage}`,
|
||||
);
|
||||
|
||||
// 模拟人类行为 - 翻页前
|
||||
logger.log('Simulating human mouse movements before pagination...');
|
||||
@@ -127,9 +143,13 @@ export const PowerbeijingCrawler = {
|
||||
try {
|
||||
// 点击下一页按钮,等待页面更新
|
||||
await nextButton.click();
|
||||
await new Promise(r => setTimeout(r, 3000)); // 等待页面加载
|
||||
await new Promise((r) => setTimeout(r, 3000)); // 等待页面加载
|
||||
} catch (navError) {
|
||||
logger.error(`Navigation to page ${currentPage + 1} failed: ${navError.message}`);
|
||||
const navErrorMessage =
|
||||
navError instanceof Error ? navError.message : String(navError);
|
||||
logger.error(
|
||||
`Navigation to page ${currentPage + 1} failed: ${navErrorMessage}`,
|
||||
);
|
||||
break;
|
||||
}
|
||||
|
||||
@@ -144,20 +164,21 @@ export const PowerbeijingCrawler = {
|
||||
|
||||
// Random delay between pages
|
||||
const delay = Math.floor(Math.random() * (3000 - 1000 + 1)) + 1000;
|
||||
await new Promise(resolve => setTimeout(resolve, delay));
|
||||
await new Promise((resolve) => setTimeout(resolve, delay));
|
||||
}
|
||||
|
||||
return allResults;
|
||||
|
||||
} catch (error) {
|
||||
logger.error(`Failed to crawl ${this.name}: ${error.message}`);
|
||||
const errorMessage =
|
||||
error instanceof Error ? error.message : String(error);
|
||||
logger.error(`Failed to crawl ${this.name}: ${errorMessage}`);
|
||||
return allResults;
|
||||
} finally {
|
||||
await page.close();
|
||||
}
|
||||
},
|
||||
|
||||
extract(html: string): PowerbeijingResult[] {
|
||||
extract(this: PowerbeijingCrawlerType, html: string): PowerbeijingResult[] {
|
||||
const results: PowerbeijingResult[] = [];
|
||||
/**
|
||||
* Regex groups for powerbeijing-ec.com:
|
||||
@@ -176,24 +197,25 @@ export const PowerbeijingCrawler = {
|
||||
* </a>
|
||||
* </li>
|
||||
*/
|
||||
const regex = /<li>[\s\S]*?<a[^>]*href="([^"]*)"[^>]*title="([^"]*)"[^>]*>[\s\S]*?<div class="newsDate">[\s\S]*?<div>\s*(\d{4}-\d{2}-\d{2})\s*<\/div>[\s\S]*?<\/div>[\s\S]*?<\/a>[\s\S]*?<\/li>/gs;
|
||||
const regex =
|
||||
/<li>[\s\S]*?<a[^>]*href="([^"]*)"[^>]*title="([^"]*)"[^>]*>[\s\S]*?<div class="newsDate">[\s\S]*?<div>\s*(\d{4}-\d{2}-\d{2})\s*<\/div>[\s\S]*?<\/div>[\s\S]*?<\/a>[\s\S]*?<\/li>/gs;
|
||||
|
||||
let match;
|
||||
let match: RegExpExecArray | null;
|
||||
while ((match = regex.exec(html)) !== null) {
|
||||
const url = match[1]?.trim();
|
||||
const title = match[2]?.trim();
|
||||
const dateStr = match[3]?.trim();
|
||||
const url = match[1]?.trim() ?? '';
|
||||
const title = match[2]?.trim() ?? '';
|
||||
const dateStr = match[3]?.trim() ?? '';
|
||||
|
||||
if (title && url) {
|
||||
const fullUrl = url.startsWith('http') ? url : this.baseUrl + url;
|
||||
results.push({
|
||||
title,
|
||||
publishDate: dateStr ? new Date(dateStr) : new Date(),
|
||||
url: fullUrl.replace(/\/\//g, '/')
|
||||
url: fullUrl.replace(/\/\//g, '/'),
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
return results;
|
||||
}
|
||||
},
|
||||
};
|
||||
|
||||
@@ -2,7 +2,7 @@ import { SdiccCrawler } from './sdicc_target';
|
||||
import * as puppeteer from 'puppeteer';
|
||||
|
||||
// Increase timeout to 60 seconds for network operations
|
||||
jest.setTimeout(60000*5);
|
||||
jest.setTimeout(60000 * 5);
|
||||
|
||||
// 获取代理配置
|
||||
const getProxyArgs = (): string[] => {
|
||||
@@ -51,7 +51,9 @@ describe('SdiccCrawler Real Site Test', () => {
|
||||
console.log(`\nSuccessfully found ${results.length} items:\n`);
|
||||
console.log('----------------------------------------');
|
||||
results.forEach((item, index) => {
|
||||
console.log(`${index + 1}. [${item.publishDate.toLocaleDateString()}] ${item.title}`);
|
||||
console.log(
|
||||
`${index + 1}. [${item.publishDate.toLocaleDateString()}] ${item.title}`,
|
||||
);
|
||||
console.log(` Link: ${item.url}`);
|
||||
console.log('----------------------------------------');
|
||||
});
|
||||
@@ -61,13 +63,15 @@ describe('SdiccCrawler Real Site Test', () => {
|
||||
expect(Array.isArray(results)).toBeTruthy();
|
||||
// Warn but don't fail if site returns 0 items (could be empty or changed structure)
|
||||
if (results.length === 0) {
|
||||
console.warn('Warning: No items found. Check if website structure has changed or if list is currently empty.');
|
||||
console.warn(
|
||||
'Warning: No items found. Check if website structure has changed or if list is currently empty.',
|
||||
);
|
||||
} else {
|
||||
// Check data integrity of first item
|
||||
const firstItem = results[0];
|
||||
expect(firstItem.title).toBeTruthy();
|
||||
expect(firstItem.url).toMatch(/^https?:\/\//);
|
||||
expect(firstItem.publishDate).toBeInstanceOf(Date);
|
||||
// Check data integrity of first item
|
||||
const firstItem = results[0];
|
||||
expect(firstItem.title).toBeTruthy();
|
||||
expect(firstItem.url).toMatch(/^https?:\/\//);
|
||||
expect(firstItem.publishDate).toBeInstanceOf(Date);
|
||||
}
|
||||
});
|
||||
});
|
||||
|
||||
@@ -13,11 +13,11 @@ async function simulateHumanMouseMovement(page: puppeteer.Page) {
|
||||
const y = Math.floor(Math.random() * viewport.height);
|
||||
|
||||
await page.mouse.move(x, y, {
|
||||
steps: 10 + Math.floor(Math.random() * 20) // 10-30步,使移动更平滑
|
||||
steps: 10 + Math.floor(Math.random() * 20), // 10-30步,使移动更平滑
|
||||
});
|
||||
|
||||
// 随机停顿 100-500ms
|
||||
await new Promise(r => setTimeout(r, 100 + Math.random() * 400));
|
||||
await new Promise((r) => setTimeout(r, 100 + Math.random() * 400));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -31,19 +31,19 @@ async function simulateHumanScrolling(page: puppeteer.Page) {
|
||||
await page.evaluate((distance) => {
|
||||
window.scrollBy({
|
||||
top: distance,
|
||||
behavior: 'smooth'
|
||||
behavior: 'smooth',
|
||||
});
|
||||
}, scrollDistance);
|
||||
|
||||
// 随机停顿 500-1500ms
|
||||
await new Promise(r => setTimeout(r, 500 + Math.random() * 1000));
|
||||
await new Promise((r) => setTimeout(r, 500 + Math.random() * 1000));
|
||||
}
|
||||
|
||||
// 滚动回顶部
|
||||
await page.evaluate(() => {
|
||||
window.scrollTo({ top: 0, behavior: 'smooth' });
|
||||
});
|
||||
await new Promise(r => setTimeout(r, 1000));
|
||||
await new Promise((r) => setTimeout(r, 1000));
|
||||
}
|
||||
|
||||
export interface SdiccResult {
|
||||
@@ -52,12 +52,22 @@ export interface SdiccResult {
|
||||
url: string;
|
||||
}
|
||||
|
||||
/**
 * Shape of the SdiccCrawler object as seen from inside its own methods.
 *
 * Used as the explicit `this` parameter type of `SdiccCrawler.crawl` (and,
 * by analogy with the sibling crawlers, presumably `extract` — its updated
 * signature is not visible here), so that member access through `this` is
 * type-checked.
 */
interface SdiccCrawlerType {
|
||||
name: string; // human-readable site name; interpolated into the crawl failure log message
|
||||
url: string; // listing page URL declared on the crawler object (presumably the crawl entry point; the navigation code is not visible here)
|
||||
baseUrl: string; // site root URL declared on the crawler object (presumably used to resolve relative links; the extract body is not visible here)
|
||||
extract(html: string): SdiccResult[]; // takes page HTML and returns result items
|
||||
}
|
||||
|
||||
export const SdiccCrawler = {
|
||||
name: '国投集团电子采购平台',
|
||||
url: 'https://www.sdicc.com.cn/cgxx/ggList',
|
||||
baseUrl: 'https://www.sdicc.com.cn/',
|
||||
|
||||
async crawl(browser: puppeteer.Browser): Promise<SdiccResult[]> {
|
||||
async crawl(
|
||||
this: SdiccCrawlerType,
|
||||
browser: puppeteer.Browser,
|
||||
): Promise<SdiccResult[]> {
|
||||
const logger = new Logger('SdiccCrawler');
|
||||
const page = await browser.newPage();
|
||||
|
||||
@@ -69,11 +79,15 @@ export const SdiccCrawler = {
|
||||
|
||||
await page.evaluateOnNewDocument(() => {
|
||||
Object.defineProperty(navigator, 'webdriver', { get: () => false });
|
||||
Object.defineProperty(navigator, 'language', { get: () => "zh-CN"});
|
||||
Object.defineProperty(navigator, 'plugins', { get: () => [1,2,3,4,5]});
|
||||
Object.defineProperty(navigator, 'language', { get: () => 'zh-CN' });
|
||||
Object.defineProperty(navigator, 'plugins', {
|
||||
get: () => [1, 2, 3, 4, 5],
|
||||
});
|
||||
});
|
||||
|
||||
await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36');
|
||||
await page.setUserAgent(
|
||||
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36',
|
||||
);
|
||||
await page.setViewport({ width: 1920, height: 1080 });
|
||||
|
||||
const allResults: SdiccResult[] = [];
|
||||
@@ -93,9 +107,11 @@ export const SdiccCrawler = {
|
||||
|
||||
// 等待表格加载
|
||||
logger.log('Waiting for table to load...');
|
||||
await page.waitForSelector('.tbody table tbody tr', { timeout: 30000 }).catch(() => {
|
||||
logger.warn('Table rows not found, trying alternative selectors...');
|
||||
});
|
||||
await page
|
||||
.waitForSelector('.tbody table tbody tr', { timeout: 30000 })
|
||||
.catch(() => {
|
||||
logger.warn('Table rows not found, trying alternative selectors...');
|
||||
});
|
||||
|
||||
while (currentPage <= maxPages) {
|
||||
logger.log(`Processing page ${currentPage}...`);
|
||||
@@ -109,7 +125,9 @@ export const SdiccCrawler = {
|
||||
}
|
||||
|
||||
allResults.push(...pageResults);
|
||||
logger.log(`Extracted ${pageResults.length} items from page ${currentPage}`);
|
||||
logger.log(
|
||||
`Extracted ${pageResults.length} items from page ${currentPage}`,
|
||||
);
|
||||
|
||||
// 模拟人类行为 - 翻页前
|
||||
logger.log('Simulating human mouse movements before pagination...');
|
||||
@@ -132,10 +150,16 @@ export const SdiccCrawler = {
|
||||
try {
|
||||
// 点击下一页按钮
|
||||
await nextButton.click();
|
||||
await page.waitForNavigation({ waitUntil: 'networkidle2', timeout: 60000 }).catch(() => {});
|
||||
await new Promise(r => setTimeout(r, 2000)); // 额外等待确保数据加载完成
|
||||
await page
|
||||
.waitForNavigation({ waitUntil: 'networkidle2', timeout: 60000 })
|
||||
.catch(() => {});
|
||||
await new Promise((r) => setTimeout(r, 2000)); // 额外等待确保数据加载完成
|
||||
} catch (navError) {
|
||||
logger.error(`Navigation to page ${currentPage + 1} failed: ${navError.message}`);
|
||||
const navErrorMessage =
|
||||
navError instanceof Error ? navError.message : String(navError);
|
||||
logger.error(
|
||||
`Navigation to page ${currentPage + 1} failed: ${navErrorMessage}`,
|
||||
);
|
||||
break;
|
||||
}
|
||||
|
||||
@@ -150,20 +174,21 @@ export const SdiccCrawler = {
|
||||
|
||||
// Random delay between pages
|
||||
const delay = Math.floor(Math.random() * (3000 - 1000 + 1)) + 1000;
|
||||
await new Promise(resolve => setTimeout(resolve, delay));
|
||||
await new Promise((resolve) => setTimeout(resolve, delay));
|
||||
}
|
||||
|
||||
return allResults;
|
||||
|
||||
} catch (error) {
|
||||
logger.error(`Failed to crawl ${this.name}: ${error.message}`);
|
||||
const errorMessage =
|
||||
error instanceof Error ? error.message : String(error);
|
||||
logger.error(`Failed to crawl ${this.name}: ${errorMessage}`);
|
||||
return allResults;
|
||||
} finally {
|
||||
await page.close();
|
||||
}
|
||||
},
|
||||
|
||||
extract(html: string): SdiccResult[] {
|
||||
extract(this: SdiccCrawlerType, html: string): SdiccResult[] {
|
||||
const results: SdiccResult[] = [];
|
||||
/**
|
||||
* Regex groups for sdicc.com.cn:
|
||||
@@ -180,25 +205,26 @@ export const SdiccCrawler = {
|
||||
* <td colspan="1" rowspan="1"><span> 2026-01-09 </span></td>
|
||||
* </tr>
|
||||
*/
|
||||
const regex = /<tr[^>]*onclick="urlChange\('([^']+)','([^']+)'\)"[^>]*>[\s\S]*?<td[^>]*><span[^>]*>([^<]+)<\/span><\/td>[\s\S]*?<td[^>]*><span[^>]*>\s*(\d{4}-\d{2}-\d{2})\s*<\/span><\/td>[\s\S]*?<\/tr>/gs;
|
||||
const regex =
|
||||
/<tr[^>]*onclick="urlChange\('([^']+)','([^']+)'\)"[^>]*>[\s\S]*?<td[^>]*><span[^>]*>([^<]+)<\/span><\/td>[\s\S]*?<td[^>]*><span[^>]*>\s*(\d{4}-\d{2}-\d{2})\s*<\/span><\/td>[\s\S]*?<\/tr>/gs;
|
||||
|
||||
let match;
|
||||
let match: RegExpExecArray | null;
|
||||
while ((match = regex.exec(html)) !== null) {
|
||||
const ggGuid = match[1]?.trim();
|
||||
const gcGuid = match[2]?.trim();
|
||||
const title = match[3]?.trim();
|
||||
const dateStr = match[4]?.trim();
|
||||
const ggGuid = match[1]?.trim() ?? '';
|
||||
const gcGuid = match[2]?.trim() ?? '';
|
||||
const title = match[3]?.trim() ?? '';
|
||||
const dateStr = match[4]?.trim() ?? '';
|
||||
|
||||
if (title && ggGuid && gcGuid) {
|
||||
const fullUrl = `${this.baseUrl}/cgxx/ggDetail?gcGuid=${gcGuid}&ggGuid=${ggGuid}`;
|
||||
results.push({
|
||||
title,
|
||||
publishDate: dateStr ? new Date(dateStr) : new Date(),
|
||||
url: fullUrl.replace(/\/\//g, '/')
|
||||
url: fullUrl.replace(/\/\//g, '/'),
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
return results;
|
||||
}
|
||||
},
|
||||
};
|
||||
|
||||
@@ -40,14 +40,14 @@ describe('SzecpCrawler Real Site Test', () => {
|
||||
'--disable-infobars',
|
||||
...proxyArgs,
|
||||
],
|
||||
defaultViewport: null
|
||||
defaultViewport: null,
|
||||
});
|
||||
});
|
||||
|
||||
afterAll(async () => {
|
||||
if (browser) {
|
||||
// Keep open for a few seconds after test to see result
|
||||
await new Promise(r => setTimeout(r, 50000));
|
||||
await new Promise((r) => setTimeout(r, 50000));
|
||||
await browser.close();
|
||||
}
|
||||
});
|
||||
@@ -64,7 +64,9 @@ Successfully found ${results.length} items:
|
||||
`);
|
||||
console.log('----------------------------------------');
|
||||
results.forEach((item, index) => {
|
||||
console.log(`${index + 1}. [${item.publishDate.toLocaleDateString()}] ${item.title}`);
|
||||
console.log(
|
||||
`${index + 1}. [${item.publishDate.toLocaleDateString()}] ${item.title}`,
|
||||
);
|
||||
console.log(` Link: ${item.url}`);
|
||||
console.log('----------------------------------------');
|
||||
});
|
||||
@@ -73,12 +75,14 @@ Successfully found ${results.length} items:
|
||||
expect(Array.isArray(results)).toBeTruthy();
|
||||
|
||||
if (results.length === 0) {
|
||||
console.warn('Warning: No items found. Observe browser window to see if content is loading or if there is a verification challenge.');
|
||||
console.warn(
|
||||
'Warning: No items found. Observe browser window to see if content is loading or if there is a verification challenge.',
|
||||
);
|
||||
} else {
|
||||
const firstItem = results[0];
|
||||
expect(firstItem.title).toBeTruthy();
|
||||
expect(firstItem.url).toMatch(/^https?:\/\//);
|
||||
expect(firstItem.publishDate).toBeInstanceOf(Date);
|
||||
const firstItem = results[0];
|
||||
expect(firstItem.title).toBeTruthy();
|
||||
expect(firstItem.url).toMatch(/^https?:\/\//);
|
||||
expect(firstItem.publishDate).toBeInstanceOf(Date);
|
||||
}
|
||||
});
|
||||
});
|
||||
|
||||
@@ -14,11 +14,11 @@ async function simulateHumanMouseMovement(page: puppeteer.Page) {
|
||||
const y = Math.floor(Math.random() * viewport.height);
|
||||
|
||||
await page.mouse.move(x, y, {
|
||||
steps: 10 + Math.floor(Math.random() * 20) // 10-30步,使移动更平滑
|
||||
steps: 10 + Math.floor(Math.random() * 20), // 10-30步,使移动更平滑
|
||||
});
|
||||
|
||||
// 随机停顿 100-500ms
|
||||
await new Promise(r => setTimeout(r, 100 + Math.random() * 400));
|
||||
await new Promise((r) => setTimeout(r, 100 + Math.random() * 400));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -32,19 +32,25 @@ async function simulateHumanScrolling(page: puppeteer.Page) {
|
||||
await page.evaluate((distance) => {
|
||||
window.scrollBy({
|
||||
top: distance,
|
||||
behavior: 'smooth'
|
||||
behavior: 'smooth',
|
||||
});
|
||||
}, scrollDistance);
|
||||
|
||||
// 随机停顿 500-1500ms
|
||||
await new Promise(r => setTimeout(r, 500 + Math.random() * 1000));
|
||||
await new Promise((r) => setTimeout(r, 500 + Math.random() * 1000));
|
||||
}
|
||||
|
||||
// 滚动回顶部
|
||||
await page.evaluate(() => {
|
||||
window.scrollTo({ top: 0, behavior: 'smooth' });
|
||||
});
|
||||
await new Promise(r => setTimeout(r, 1000));
|
||||
await new Promise((r) => setTimeout(r, 1000));
|
||||
}
|
||||
|
||||
interface SzecpCrawlerType {
|
||||
name: string;
|
||||
url: string;
|
||||
baseUrl: string;
|
||||
}
|
||||
|
||||
export const SzecpCrawler = {
|
||||
@@ -52,7 +58,10 @@ export const SzecpCrawler = {
|
||||
url: 'https://www.szecp.com.cn/first_zbgg/index.html',
|
||||
baseUrl: 'https://www.szecp.com.cn/',
|
||||
|
||||
async crawl(browser: puppeteer.Browser): Promise<ChdtpResult[]> {
|
||||
async crawl(
|
||||
this: SzecpCrawlerType,
|
||||
browser: puppeteer.Browser,
|
||||
): Promise<ChdtpResult[]> {
|
||||
const logger = new Logger('SzecpCrawler');
|
||||
const page = await browser.newPage();
|
||||
|
||||
@@ -65,10 +74,14 @@ export const SzecpCrawler = {
|
||||
await page.evaluateOnNewDocument(() => {
|
||||
Object.defineProperty(navigator, 'webdriver', { get: () => false });
|
||||
Object.defineProperty(navigator, 'language', { get: () => 'zh-CN' });
|
||||
Object.defineProperty(navigator, 'plugins', { get: () => [1, 2, 3, 4, 5] });
|
||||
Object.defineProperty(navigator, 'plugins', {
|
||||
get: () => [1, 2, 3, 4, 5],
|
||||
});
|
||||
});
|
||||
|
||||
await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36');
|
||||
await page.setUserAgent(
|
||||
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36',
|
||||
);
|
||||
await page.setViewport({ width: 1920, height: 1080 });
|
||||
|
||||
const allResults: ChdtpResult[] = [];
|
||||
@@ -90,52 +103,69 @@ export const SzecpCrawler = {
|
||||
logger.log('Clicking search button...');
|
||||
await page.waitForSelector('.szb-zbcgSearch-key-v1', { timeout: 60000 });
|
||||
await page.click('.szb-zbcgSearch-key-v1');
|
||||
await new Promise(r => setTimeout(r, 3000)); // Wait for results to load
|
||||
await new Promise((r) => setTimeout(r, 3000)); // Wait for results to load
|
||||
|
||||
while (currentPage <= maxPages) {
|
||||
logger.log(`Processing page ${currentPage}...`);
|
||||
|
||||
// Wait for content to load
|
||||
await page.waitForFunction(() => {
|
||||
return document.querySelectorAll('.szb-zbcgTable-other').length > 0;
|
||||
}, { timeout: 60000 }).catch(() => logger.warn('Content not found. Site might be slow.'));
|
||||
await page
|
||||
.waitForFunction(
|
||||
() => {
|
||||
return (
|
||||
document.querySelectorAll('.szb-zbcgTable-other').length > 0
|
||||
);
|
||||
},
|
||||
{ timeout: 60000 },
|
||||
)
|
||||
.catch(() => logger.warn('Content not found. Site might be slow.'));
|
||||
|
||||
const pageResults = await page.evaluate((baseUrl) => {
|
||||
// Extract from table rows
|
||||
const items = Array.from(document.querySelectorAll('.szb-zbcgTable-other'));
|
||||
return items.map(item => {
|
||||
const divs = item.querySelectorAll('div');
|
||||
if (divs.length >= 5) {
|
||||
const titleLink = divs[1].querySelector('a');
|
||||
const title = titleLink?.textContent?.trim() || '';
|
||||
const dateStr = divs[4].textContent?.trim() || '';
|
||||
const href = titleLink?.getAttribute('href') || '';
|
||||
const items = Array.from(
|
||||
document.querySelectorAll('.szb-zbcgTable-other'),
|
||||
);
|
||||
return items
|
||||
.map((item) => {
|
||||
const divs = item.querySelectorAll('div');
|
||||
if (divs.length >= 5) {
|
||||
const titleLink = divs[1].querySelector('a');
|
||||
const title = titleLink?.textContent?.trim() || '';
|
||||
const dateStr = divs[4].textContent?.trim() || '';
|
||||
const href = titleLink?.getAttribute('href') || '';
|
||||
|
||||
if (title.length < 5) return null; // Filter noise
|
||||
if (title.length < 5) return null; // Filter noise
|
||||
|
||||
// Construct full URL if href is relative
|
||||
const url = href.startsWith('http') ? href : `${baseUrl}${href}`;
|
||||
// Construct full URL if href is relative
|
||||
const url = href.startsWith('http')
|
||||
? href
|
||||
: `${baseUrl}${href}`;
|
||||
|
||||
return {
|
||||
title,
|
||||
dateStr,
|
||||
url
|
||||
};
|
||||
}
|
||||
return null;
|
||||
}).filter(i => i !== null);
|
||||
return {
|
||||
title,
|
||||
dateStr,
|
||||
url,
|
||||
};
|
||||
}
|
||||
return null;
|
||||
})
|
||||
.filter((i) => i !== null);
|
||||
}, this.baseUrl);
|
||||
|
||||
if (pageResults.length === 0) {
|
||||
logger.warn(`No results found on page ${currentPage}. Extraction failed.`);
|
||||
logger.warn(
|
||||
`No results found on page ${currentPage}. Extraction failed.`,
|
||||
);
|
||||
break;
|
||||
}
|
||||
|
||||
allResults.push(...pageResults.map(r => ({
|
||||
title: r!.title,
|
||||
publishDate: new Date(r!.dateStr),
|
||||
url: r!.url.replace(/\/\//g, '/')
|
||||
})));
|
||||
allResults.push(
|
||||
...pageResults.map((r) => ({
|
||||
title: r.title,
|
||||
publishDate: new Date(r.dateStr),
|
||||
url: r.url.replace(/\/\//g, '/'),
|
||||
})),
|
||||
);
|
||||
|
||||
logger.log(`Extracted ${pageResults.length} items.`);
|
||||
|
||||
@@ -144,7 +174,7 @@ export const SzecpCrawler = {
|
||||
if (!nextButton) break;
|
||||
|
||||
await nextButton.click();
|
||||
await new Promise(r => setTimeout(r, 3000));
|
||||
await new Promise((r) => setTimeout(r, 3000));
|
||||
|
||||
// 模拟人类行为
|
||||
logger.log('Simulating human mouse movements...');
|
||||
@@ -157,14 +187,17 @@ export const SzecpCrawler = {
|
||||
}
|
||||
|
||||
return allResults;
|
||||
|
||||
} catch (error) {
|
||||
logger.error(`Crawl failed: ${error.message}`);
|
||||
const errorMessage =
|
||||
error instanceof Error ? error.message : String(error);
|
||||
logger.error(`Crawl failed: ${errorMessage}`);
|
||||
return allResults;
|
||||
} finally {
|
||||
if (page) await page.close();
|
||||
}
|
||||
},
|
||||
|
||||
extract() { return []; }
|
||||
extract() {
|
||||
return [];
|
||||
},
|
||||
};
|
||||
|
||||
@@ -12,7 +12,11 @@ import { CrawlInfoAdd } from '../crawler/entities/crawl-info-add.entity';
|
||||
imports: [ConfigModule],
|
||||
inject: [ConfigService],
|
||||
useFactory: (configService: ConfigService) => ({
|
||||
type: configService.get<any>('DATABASE_TYPE', 'mariadb'),
|
||||
type:
|
||||
(configService.get<string>('DATABASE_TYPE', 'mariadb') as
|
||||
| 'mariadb'
|
||||
| 'mysql'
|
||||
| 'postgres') || 'mariadb',
|
||||
host: configService.get<string>('DATABASE_HOST', 'localhost'),
|
||||
port: configService.get<number>('DATABASE_PORT', 3306),
|
||||
username: configService.get<string>('DATABASE_USERNAME', 'root'),
|
||||
|
||||
@@ -1,4 +1,10 @@
|
||||
import { Entity, PrimaryGeneratedColumn, Column, CreateDateColumn, UpdateDateColumn } from 'typeorm';
|
||||
import {
|
||||
Entity,
|
||||
PrimaryGeneratedColumn,
|
||||
Column,
|
||||
CreateDateColumn,
|
||||
UpdateDateColumn,
|
||||
} from 'typeorm';
|
||||
|
||||
@Entity('keywords')
|
||||
export class Keyword {
|
||||
|
||||
@@ -12,7 +12,8 @@ async function bootstrap() {
|
||||
app.useLogger(logger);
|
||||
|
||||
// 增加请求体大小限制(默认 100kb,增加到 50mb)
|
||||
const express = require('express');
|
||||
// eslint-disable-next-line @typescript-eslint/no-require-imports
|
||||
const express = require('express') as typeof import('express');
|
||||
app.use(express.json({ limit: '50mb' }));
|
||||
app.use(express.urlencoded({ limit: '50mb', extended: true }));
|
||||
|
||||
@@ -21,4 +22,4 @@ async function bootstrap() {
|
||||
|
||||
await app.listen(process.env.PORT ?? 3000);
|
||||
}
|
||||
bootstrap();
|
||||
void bootstrap();
|
||||
|
||||
@@ -16,7 +16,9 @@ async function generateAiRecommendations() {
|
||||
|
||||
try {
|
||||
// 获取 BidItem 的 repository 和 AiService
|
||||
const bidItemRepository = app.get<Repository<BidItem>>(getRepositoryToken(BidItem));
|
||||
const bidItemRepository = app.get<Repository<BidItem>>(
|
||||
getRepositoryToken(BidItem),
|
||||
);
|
||||
const aiService = app.get(AiService);
|
||||
|
||||
logger.log('开始查询 bid_items 表...');
|
||||
@@ -27,11 +29,13 @@ async function generateAiRecommendations() {
|
||||
threeDaysAgo.setHours(0, 0, 0, 0);
|
||||
|
||||
// 使用本地时间格式化输出,避免时区问题
|
||||
const localDateStr = threeDaysAgo.toLocaleDateString('zh-CN', {
|
||||
year: 'numeric',
|
||||
month: '2-digit',
|
||||
day: '2-digit'
|
||||
}).replace(/\//g, '-');
|
||||
const localDateStr = threeDaysAgo
|
||||
.toLocaleDateString('zh-CN', {
|
||||
year: 'numeric',
|
||||
month: '2-digit',
|
||||
day: '2-digit',
|
||||
})
|
||||
.replace(/\//g, '-');
|
||||
logger.log(`查询起始日期: ${localDateStr}`);
|
||||
|
||||
// 查询起始日期3天前,截止日期不限制的所有记录
|
||||
@@ -50,8 +54,8 @@ async function generateAiRecommendations() {
|
||||
}
|
||||
|
||||
// 提取 title
|
||||
const bidData = bidItems.map(item => ({
|
||||
title: item.title
|
||||
const bidData = bidItems.map((item) => ({
|
||||
title: item.title,
|
||||
}));
|
||||
|
||||
logger.log('开始调用 AI 获取推荐...');
|
||||
|
||||
@@ -15,7 +15,9 @@ async function removeDuplicates() {
|
||||
|
||||
try {
|
||||
// 获取 BidItem 的 repository
|
||||
const bidItemRepository = app.get<Repository<BidItem>>(getRepositoryToken(BidItem));
|
||||
const bidItemRepository = app.get<Repository<BidItem>>(
|
||||
getRepositoryToken(BidItem),
|
||||
);
|
||||
|
||||
logger.log('开始查找重复的title...');
|
||||
|
||||
@@ -56,10 +58,12 @@ async function removeDuplicates() {
|
||||
const itemsToDelete = items.slice(1);
|
||||
|
||||
if (itemsToDelete.length > 0) {
|
||||
const idsToDelete = itemsToDelete.map(item => item.id);
|
||||
const idsToDelete = itemsToDelete.map((item) => item.id);
|
||||
const deleteResult = await bidItemRepository.delete(idsToDelete);
|
||||
totalDeleted += deleteResult.affected || 0;
|
||||
logger.log(` 删除了 ${deleteResult.affected} 条重复记录,保留ID: ${items[0].id} (最晚创建)`);
|
||||
logger.log(
|
||||
` 删除了 ${deleteResult.affected} 条重复记录,保留ID: ${items[0].id} (最晚创建)`,
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -8,7 +8,7 @@ import { CrawlInfoAdd } from '../crawler/entities/crawl-info-add.entity';
|
||||
|
||||
// 主数据库配置
|
||||
const masterDbConfig: DataSourceOptions = {
|
||||
type: process.env.DATABASE_TYPE as any || 'mariadb',
|
||||
type: (process.env.DATABASE_TYPE as any) || 'mariadb',
|
||||
host: process.env.DATABASE_HOST || 'localhost',
|
||||
port: parseInt(process.env.DATABASE_PORT || '3306'),
|
||||
username: process.env.DATABASE_USERNAME || 'root',
|
||||
@@ -20,7 +20,7 @@ const masterDbConfig: DataSourceOptions = {
|
||||
|
||||
// Slave 数据库配置
|
||||
const slaveDbConfig: DataSourceOptions = {
|
||||
type: process.env.SLAVE_DATABASE_TYPE as any || 'mariadb',
|
||||
type: (process.env.SLAVE_DATABASE_TYPE as any) || 'mariadb',
|
||||
host: process.env.SLAVE_DATABASE_HOST || 'localhost',
|
||||
port: parseInt(process.env.SLAVE_DATABASE_PORT || '3306'),
|
||||
username: process.env.SLAVE_DATABASE_USERNAME || 'root',
|
||||
@@ -94,12 +94,17 @@ async function createDatabaseIfNotExists(config: DataSourceOptions) {
|
||||
password: (config as any).password,
|
||||
});
|
||||
|
||||
await connection.query(`CREATE DATABASE IF NOT EXISTS \`${(config as any).database}\``);
|
||||
await connection.query(
|
||||
`CREATE DATABASE IF NOT EXISTS \`${(config as any).database}\``,
|
||||
);
|
||||
await connection.end();
|
||||
}
|
||||
|
||||
// 同步表结构
|
||||
async function syncSchema(masterDataSource: DataSource, slaveDataSource: DataSource): Promise<DataSource> {
|
||||
async function syncSchema(
|
||||
masterDataSource: DataSource,
|
||||
slaveDataSource: DataSource,
|
||||
): Promise<DataSource> {
|
||||
logger.log('开始同步表结构...');
|
||||
|
||||
// 获取主数据库的所有表
|
||||
@@ -137,8 +142,12 @@ async function syncSchema(masterDataSource: DataSource, slaveDataSource: DataSou
|
||||
if (tableExists[0].count > 0) {
|
||||
// 表存在,先备份数据到临时表
|
||||
logger.log(`备份表 ${tableName} 的数据到 ${tempTableName}...`);
|
||||
await slaveDataSource.query(`CREATE TABLE ${tempTableName} AS SELECT * FROM \`${tableName}\``);
|
||||
logger.log(`备份完成,共备份 ${await slaveDataSource.query(`SELECT COUNT(*) as count FROM ${tempTableName}`).then(r => r[0].count)} 条记录`);
|
||||
await slaveDataSource.query(
|
||||
`CREATE TABLE ${tempTableName} AS SELECT * FROM \`${tableName}\``,
|
||||
);
|
||||
logger.log(
|
||||
`备份完成,共备份 ${await slaveDataSource.query(`SELECT COUNT(*) as count FROM ${tempTableName}`).then((r) => r[0].count)} 条记录`,
|
||||
);
|
||||
}
|
||||
|
||||
// 删除 slave 数据库中的表(如果存在)
|
||||
@@ -160,7 +169,9 @@ async function syncSchema(masterDataSource: DataSource, slaveDataSource: DataSou
|
||||
AND TABLE_NAME = '${tempTableName}'
|
||||
`);
|
||||
|
||||
const columnNames = columns.map((c: any) => `\`${c.COLUMN_NAME}\``).join(', ');
|
||||
const columnNames = columns
|
||||
.map((c: any) => `\`${c.COLUMN_NAME}\``)
|
||||
.join(', ');
|
||||
|
||||
// 将数据从临时表插入到新表
|
||||
await slaveDataSource.query(`
|
||||
@@ -168,7 +179,9 @@ async function syncSchema(masterDataSource: DataSource, slaveDataSource: DataSou
|
||||
SELECT ${columnNames} FROM ${tempTableName}
|
||||
`);
|
||||
|
||||
const restoredCount = await slaveDataSource.query(`SELECT COUNT(*) as count FROM \`${tableName}\``);
|
||||
const restoredCount = await slaveDataSource.query(
|
||||
`SELECT COUNT(*) as count FROM \`${tableName}\``,
|
||||
);
|
||||
logger.log(`数据恢复完成,共恢复 ${restoredCount[0].count} 条记录`);
|
||||
|
||||
// 删除临时表
|
||||
@@ -227,7 +240,12 @@ async function syncDatabase() {
|
||||
|
||||
let totalSynced = 0;
|
||||
for (const table of tables) {
|
||||
const count = await syncTable(masterDataSource, slaveDataSource, table.entity, table.name);
|
||||
const count = await syncTable(
|
||||
masterDataSource,
|
||||
slaveDataSource,
|
||||
table.entity,
|
||||
table.name,
|
||||
);
|
||||
totalSynced += count;
|
||||
}
|
||||
|
||||
|
||||
@@ -15,7 +15,9 @@ async function updateSource() {
|
||||
|
||||
try {
|
||||
// 获取 BidItem 的 repository
|
||||
const bidItemRepository = app.get<Repository<BidItem>>(getRepositoryToken(BidItem));
|
||||
const bidItemRepository = app.get<Repository<BidItem>>(
|
||||
getRepositoryToken(BidItem),
|
||||
);
|
||||
|
||||
const oldSource = '北京电力交易平台';
|
||||
const newSource = '北京京能电子商务平台';
|
||||
|
||||
17
widget/looker/sys_run/go.mod
Normal file
17
widget/looker/sys_run/go.mod
Normal file
@@ -0,0 +1,17 @@
|
||||
module systray_run
|
||||
|
||||
go 1.23
|
||||
|
||||
require github.com/getlantern/systray v1.2.2
|
||||
|
||||
require (
|
||||
github.com/getlantern/context v0.0.0-20190109183933-c447772a6520 // indirect
|
||||
github.com/getlantern/errors v0.0.0-20190325191628-abdb3e3e36f7 // indirect
|
||||
github.com/getlantern/golog v0.0.0-20190830074920-4ef2e798c2d7 // indirect
|
||||
github.com/getlantern/hex v0.0.0-20190417191902-c6586a6fe0b7 // indirect
|
||||
github.com/getlantern/hidden v0.0.0-20190325191715-f02dbb02be55 // indirect
|
||||
github.com/getlantern/ops v0.0.0-20190325191751-d70cb0d6f85f // indirect
|
||||
github.com/go-stack/stack v1.8.0 // indirect
|
||||
github.com/oxtoacart/bpool v0.0.0-20190530202638-03653db5a59c // indirect
|
||||
golang.org/x/sys v0.1.0 // indirect
|
||||
)
|
||||
32
widget/looker/sys_run/go.sum
Normal file
32
widget/looker/sys_run/go.sum
Normal file
@@ -0,0 +1,32 @@
|
||||
github.com/davecgh/go-spew v1.1.0 h1:ZDRjVQ15GmhC3fiQ8ni8+OwkZQO4DARzQgrnXU1Liz8=
|
||||
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
|
||||
github.com/getlantern/context v0.0.0-20190109183933-c447772a6520 h1:NRUJuo3v3WGC/g5YiyF790gut6oQr5f3FBI88Wv0dx4=
|
||||
github.com/getlantern/context v0.0.0-20190109183933-c447772a6520/go.mod h1:L+mq6/vvYHKjCX2oez0CgEAJmbq1fbb/oNJIWQkBybY=
|
||||
github.com/getlantern/errors v0.0.0-20190325191628-abdb3e3e36f7 h1:6uJ+sZ/e03gkbqZ0kUG6mfKoqDb4XMAzMIwlajq19So=
|
||||
github.com/getlantern/errors v0.0.0-20190325191628-abdb3e3e36f7/go.mod h1:l+xpFBrCtDLpK9qNjxs+cHU6+BAdlBaxHqikB6Lku3A=
|
||||
github.com/getlantern/golog v0.0.0-20190830074920-4ef2e798c2d7 h1:guBYzEaLz0Vfc/jv0czrr2z7qyzTOGC9hiQ0VC+hKjk=
|
||||
github.com/getlantern/golog v0.0.0-20190830074920-4ef2e798c2d7/go.mod h1:zx/1xUUeYPy3Pcmet8OSXLbF47l+3y6hIPpyLWoR9oc=
|
||||
github.com/getlantern/hex v0.0.0-20190417191902-c6586a6fe0b7 h1:micT5vkcr9tOVk1FiH8SWKID8ultN44Z+yzd2y/Vyb0=
|
||||
github.com/getlantern/hex v0.0.0-20190417191902-c6586a6fe0b7/go.mod h1:dD3CgOrwlzca8ed61CsZouQS5h5jIzkK9ZWrTcf0s+o=
|
||||
github.com/getlantern/hidden v0.0.0-20190325191715-f02dbb02be55 h1:XYzSdCbkzOC0FDNrgJqGRo8PCMFOBFL9py72DRs7bmc=
|
||||
github.com/getlantern/hidden v0.0.0-20190325191715-f02dbb02be55/go.mod h1:6mmzY2kW1TOOrVy+r41Za2MxXM+hhqTtY3oBKd2AgFA=
|
||||
github.com/getlantern/ops v0.0.0-20190325191751-d70cb0d6f85f h1:wrYrQttPS8FHIRSlsrcuKazukx/xqO/PpLZzZXsF+EA=
|
||||
github.com/getlantern/ops v0.0.0-20190325191751-d70cb0d6f85f/go.mod h1:D5ao98qkA6pxftxoqzibIBBrLSUli+kYnJqrgBf9cIA=
|
||||
github.com/getlantern/systray v1.2.2 h1:dCEHtfmvkJG7HZ8lS/sLklTH4RKUcIsKrAD9sThoEBE=
|
||||
github.com/getlantern/systray v1.2.2/go.mod h1:pXFOI1wwqwYXEhLPm9ZGjS2u/vVELeIgNMY5HvhHhcE=
|
||||
github.com/go-stack/stack v1.8.0 h1:5SgMzNM5HxrEjV0ww2lTmX6E2Izsfxas4+YHWRs3Lsk=
|
||||
github.com/go-stack/stack v1.8.0/go.mod h1:v0f6uXyyMGvRgIKkXu+yp6POWl0qKG85gN/melR3HDY=
|
||||
github.com/lxn/walk v0.0.0-20210112085537-c389da54e794/go.mod h1:E23UucZGqpuUANJooIbHWCufXvOcT6E7Stq81gU+CSQ=
|
||||
github.com/lxn/win v0.0.0-20210218163916-a377121e959e/go.mod h1:KxxjdtRkfNoYDCUP5ryK7XJJNTnpC8atvtmTheChOtk=
|
||||
github.com/oxtoacart/bpool v0.0.0-20190530202638-03653db5a59c h1:rp5dCmg/yLR3mgFuSOe4oEnDDmGLROTvMragMUXpTQw=
|
||||
github.com/oxtoacart/bpool v0.0.0-20190530202638-03653db5a59c/go.mod h1:X07ZCGwUbLaax7L0S3Tw4hpejzu63ZrrQiUe6W0hcy0=
|
||||
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
|
||||
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
|
||||
github.com/skratchdot/open-golang v0.0.0-20200116055534-eef842397966/go.mod h1:sUM3LWHvSMaG192sy56D9F7CNvL7jUJVXoqM1QKLnog=
|
||||
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
|
||||
github.com/stretchr/testify v1.3.0 h1:TivCn/peBQ7UY8ooIcPgZFpTNSz0Q2U6UrFlUfqbe0Q=
|
||||
github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
|
||||
golang.org/x/sys v0.0.0-20201018230417-eeed37f84f13/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
|
||||
golang.org/x/sys v0.1.0 h1:kunALQeHf1/185U1i0GOB/fy1IPRDDpuoOOqRReG57U=
|
||||
golang.org/x/sys v0.1.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
||||
gopkg.in/Knetic/govaluate.v3 v3.0.0/go.mod h1:csKLBORsPbafmSCGTEh3U7Ozmsuq8ZSIlKk1bcqph0E=
|
||||
Reference in New Issue
Block a user