chore: update .gitignore and add new files
Add ignore rules for *.png, *.log, *-lock.json, and *.woff2 files to .gitignore, and add the OFL.txt license file. Also add the vue.svg icon for the frontend. Reformat several TypeScript files to improve code style and readability.
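The .gitignore hunk itself is not shown in this diff; based on the description above, the added rules would look roughly like the following sketch (the exact placement within the file is an assumption):

    # assumed additions to .gitignore, per the commit description
    *.png
    *.log
    *-lock.json
    *.woff2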
@@ -18,6 +18,17 @@ import { PowerbeijingCrawler } from './powerbeijing_target';
import { SdiccCrawler } from './sdicc_target';
import { CnoocCrawler } from './cnooc_target';

interface CrawlResult {
title: string;
publishDate: Date;
url: string;
}

interface Crawler {
name: string;
crawl(browser: puppeteer.Browser): Promise<CrawlResult[]>;
}

@Injectable()
export class BidCrawlerService {
private readonly logger = new Logger(BidCrawlerService.name);
@@ -31,17 +42,15 @@ export class BidCrawlerService {

async crawlAll() {
this.logger.log('Starting crawl task with Puppeteer...');

// 设置最大执行时间为3小时
const maxExecutionTime = 3 * 60 * 60 * 1000; // 3小时(毫秒)
const startTime = Date.now();

// 统计结果
const crawlResults: Record<string, { success: number; error?: string }> = {};

const crawlResults: Record<string, { success: number; error?: string }> =
{};
// 记录数据为0的爬虫,用于重试
const zeroDataCrawlers: any[] = [];

const zeroDataCrawlers: Crawler[] = [];
// 从环境变量读取代理配置
const proxyHost = this.configService.get<string>('PROXY_HOST');
const proxyPort = this.configService.get<string>('PROXY_PORT');
@@ -60,9 +69,10 @@ export class BidCrawlerService {
];

if (proxyHost && proxyPort) {
const proxyUrl = proxyUsername && proxyPassword
? `http://${proxyUsername}:${proxyPassword}@${proxyHost}:${proxyPort}`
: `http://${proxyHost}:${proxyPort}`;
const proxyUrl =
proxyUsername && proxyPassword
? `http://${proxyUsername}:${proxyPassword}@${proxyHost}:${proxyPort}`
: `http://${proxyHost}:${proxyPort}`;
args.push(`--proxy-server=${proxyUrl}`);
this.logger.log(`Using proxy: ${proxyHost}:${proxyPort}`);
}
@@ -72,24 +82,43 @@ export class BidCrawlerService {
args,
});

const crawlers = [ChdtpCrawler, ChngCrawler, SzecpCrawler, CdtCrawler, EpsCrawler, CnncecpCrawler, CgnpcCrawler, CeicCrawler, EspicCrawler, PowerbeijingCrawler, SdiccCrawler, CnoocCrawler];
const crawlers = [
ChdtpCrawler,
ChngCrawler,
SzecpCrawler,
CdtCrawler,
EpsCrawler,
CnncecpCrawler,
CgnpcCrawler,
CeicCrawler,
EspicCrawler,
PowerbeijingCrawler,
SdiccCrawler,
CnoocCrawler,
];

try {
for (const crawler of crawlers) {
this.logger.log(`Crawling: ${crawler.name}`);

// 检查是否超时
const elapsedTime = Date.now() - startTime;
if (elapsedTime > maxExecutionTime) {
this.logger.warn(`⚠️ Crawl task exceeded maximum execution time of 3 hours. Stopping...`);
this.logger.warn(`⚠️ Total elapsed time: ${Math.floor(elapsedTime / 1000 / 60)} minutes`);
this.logger.warn(
`⚠️ Crawl task exceeded maximum execution time of 3 hours. Stopping...`,
);
this.logger.warn(
`⚠️ Total elapsed time: ${Math.floor(elapsedTime / 1000 / 60)} minutes`,
);
break;
}

try {
const results = await crawler.crawl(browser);
this.logger.log(`Extracted ${results.length} items from ${crawler.name}`);

this.logger.log(
`Extracted ${results.length} items from ${crawler.name}`,
);

// 记录成功数量
crawlResults[crawler.name] = { success: results.length };

@@ -99,12 +128,13 @@ export class BidCrawlerService {
}

// 获取最新的发布日期
const latestPublishDate = results.length > 0
? results.reduce((latest, item) => {
const itemDate = new Date(item.publishDate);
return itemDate > latest ? itemDate : latest;
}, new Date(0))
: null;
const latestPublishDate =
results.length > 0
? results.reduce((latest, item) => {
const itemDate = new Date(item.publishDate);
return itemDate > latest ? itemDate : latest;
}, new Date(0))
: null;

for (const item of results) {
await this.bidsService.createOrUpdate({
@@ -116,46 +146,60 @@ export class BidCrawlerService {
}

// 保存爬虫统计信息到数据库
await this.saveCrawlInfo(crawler.name, results.length, latestPublishDate);
await this.saveCrawlInfo(
crawler.name,
results.length,
latestPublishDate,
);
} catch (err) {
this.logger.error(`Error crawling ${crawler.name}: ${err.message}`);
const errorMessage = err instanceof Error ? err.message : String(err);
this.logger.error(`Error crawling ${crawler.name}: ${errorMessage}`);
// 记录错误信息
crawlResults[crawler.name] = { success: 0, error: err.message };
crawlResults[crawler.name] = { success: 0, error: errorMessage };

// 保存错误信息到数据库
await this.saveCrawlInfo(crawler.name, 0, null, err.message);
await this.saveCrawlInfo(crawler.name, 0, null, errorMessage);
}
}

// 对数据为0的爬虫进行重试
if (zeroDataCrawlers.length > 0) {
this.logger.log(`Retrying ${zeroDataCrawlers.length} crawlers with zero data...`);

this.logger.log(
`Retrying ${zeroDataCrawlers.length} crawlers with zero data...`,
);

for (const crawler of zeroDataCrawlers) {
this.logger.log(`Retrying: ${crawler.name}`);

// 检查是否超时
const elapsedTime = Date.now() - startTime;
if (elapsedTime > maxExecutionTime) {
this.logger.warn(`⚠️ Crawl task exceeded maximum execution time of 3 hours. Stopping retry...`);
this.logger.warn(`⚠️ Total elapsed time: ${Math.floor(elapsedTime / 1000 / 60)} minutes`);
this.logger.warn(
`⚠️ Crawl task exceeded maximum execution time of 3 hours. Stopping retry...`,
);
this.logger.warn(
`⚠️ Total elapsed time: ${Math.floor(elapsedTime / 1000 / 60)} minutes`,
);
break;
}

try {
const results = await crawler.crawl(browser);
this.logger.log(`Retry extracted ${results.length} items from ${crawler.name}`);

this.logger.log(
`Retry extracted ${results.length} items from ${crawler.name}`,
);

// 更新统计结果
crawlResults[crawler.name] = { success: results.length };

// 获取最新的发布日期
const latestPublishDate = results.length > 0
? results.reduce((latest, item) => {
const itemDate = new Date(item.publishDate);
return itemDate > latest ? itemDate : latest;
}, new Date(0))
: null;
const latestPublishDate =
results.length > 0
? results.reduce((latest, item) => {
const itemDate = new Date(item.publishDate);
return itemDate > latest ? itemDate : latest;
}, new Date(0))
: null;

for (const item of results) {
await this.bidsService.createOrUpdate({
@@ -167,58 +211,76 @@ export class BidCrawlerService {
}

// 更新爬虫统计信息到数据库
await this.saveCrawlInfo(crawler.name, results.length, latestPublishDate);
await this.saveCrawlInfo(
crawler.name,
results.length,
latestPublishDate,
);
} catch (err) {
this.logger.error(`Error retrying ${crawler.name}: ${err.message}`);
const errorMessage =
err instanceof Error ? err.message : String(err);
this.logger.error(
`Error retrying ${crawler.name}: ${errorMessage}`,
);
// 记录错误信息
crawlResults[crawler.name] = { success: 0, error: err.message };
crawlResults[crawler.name] = { success: 0, error: errorMessage };

// 更新错误信息到数据库
await this.saveCrawlInfo(crawler.name, 0, null, err.message);
await this.saveCrawlInfo(crawler.name, 0, null, errorMessage);
}
}
}
} catch (error) {
this.logger.error(`Crawl task failed: ${error.message}`);
const errorMessage =
error instanceof Error ? error.message : String(error);
this.logger.error(`Crawl task failed: ${errorMessage}`);
} finally {
await browser.close();

const totalTime = Date.now() - startTime;
const minutes = Math.floor(totalTime / 1000 / 60);
this.logger.log(`Crawl task finished. Total time: ${minutes} minutes`);

if (totalTime > maxExecutionTime) {
this.logger.warn(`⚠️ Crawl task exceeded maximum execution time of 3 hours.`);
this.logger.warn(
`⚠️ Crawl task exceeded maximum execution time of 3 hours.`,
);
}

// 输出统计总结
this.logger.log('='.repeat(50));
this.logger.log('爬虫执行总结 / Crawl Summary');
this.logger.log('='.repeat(50));

let totalSuccess = 0;
let errorCount = 0;

for (const [source, result] of Object.entries(crawlResults)) {
if (result.error) {
this.logger.error(`❌ ${source}: 出错 - ${result.error}`);
errorCount++;
} else {
this.logger.log(`✅ ${source}: 成功获取 ${result.success} 条工程信息`);
this.logger.log(
`✅ ${source}: 成功获取 ${result.success} 条工程信息`,
);
totalSuccess += result.success;
}
}

this.logger.log('='.repeat(50));
this.logger.log(`总计: ${totalSuccess} 条工程信息, ${errorCount} 个来源出错`);
this.logger.log(`Total: ${totalSuccess} items, ${errorCount} sources failed`);
this.logger.log(
`总计: ${totalSuccess} 条工程信息, ${errorCount} 个来源出错`,
);
this.logger.log(
`Total: ${totalSuccess} items, ${errorCount} sources failed`,
);
this.logger.log('='.repeat(50));
}
}

async crawlSingleSource(sourceName: string) {
this.logger.log(`Starting single source crawl for: ${sourceName}`);

// 从环境变量读取代理配置
const proxyHost = this.configService.get<string>('PROXY_HOST');
const proxyPort = this.configService.get<string>('PROXY_PORT');
@@ -237,9 +299,10 @@ export class BidCrawlerService {
];

if (proxyHost && proxyPort) {
const proxyUrl = proxyUsername && proxyPassword
? `http://${proxyUsername}:${proxyPassword}@${proxyHost}:${proxyPort}`
: `http://${proxyHost}:${proxyPort}`;
const proxyUrl =
proxyUsername && proxyPassword
? `http://${proxyUsername}:${proxyPassword}@${proxyHost}:${proxyPort}`
: `http://${proxyHost}:${proxyPort}`;
args.push(`--proxy-server=${proxyUrl}`);
this.logger.log(`Using proxy: ${proxyHost}:${proxyPort}`);
}
@@ -249,10 +312,23 @@ export class BidCrawlerService {
args,
});

const crawlers = [ChdtpCrawler, ChngCrawler, SzecpCrawler, CdtCrawler, EpsCrawler, CnncecpCrawler, CgnpcCrawler, CeicCrawler, EspicCrawler, PowerbeijingCrawler, SdiccCrawler, CnoocCrawler];

const targetCrawler = crawlers.find(c => c.name === sourceName);

const crawlers = [
ChdtpCrawler,
ChngCrawler,
SzecpCrawler,
CdtCrawler,
EpsCrawler,
CnncecpCrawler,
CgnpcCrawler,
CeicCrawler,
EspicCrawler,
PowerbeijingCrawler,
SdiccCrawler,
CnoocCrawler,
];

const targetCrawler = crawlers.find((c) => c.name === sourceName);

if (!targetCrawler) {
await browser.close();
throw new Error(`Crawler not found for source: ${sourceName}`);
@@ -260,17 +336,20 @@ export class BidCrawlerService {

try {
this.logger.log(`Crawling: ${targetCrawler.name}`);

const results = await targetCrawler.crawl(browser);
this.logger.log(`Extracted ${results.length} items from ${targetCrawler.name}`);
this.logger.log(
`Extracted ${results.length} items from ${targetCrawler.name}`,
);

// 获取最新的发布日期
const latestPublishDate = results.length > 0
? results.reduce((latest, item) => {
const itemDate = new Date(item.publishDate);
return itemDate > latest ? itemDate : latest;
}, new Date(0))
: null;
const latestPublishDate =
results.length > 0
? results.reduce((latest, item) => {
const itemDate = new Date(item.publishDate);
return itemDate > latest ? itemDate : latest;
}, new Date(0))
: null;

for (const item of results) {
await this.bidsService.createOrUpdate({
@@ -282,7 +361,11 @@ export class BidCrawlerService {
}

// 保存爬虫统计信息到数据库
await this.saveCrawlInfo(targetCrawler.name, results.length, latestPublishDate);
await this.saveCrawlInfo(
targetCrawler.name,
results.length,
latestPublishDate,
);

return {
success: true,
@@ -291,16 +374,19 @@ export class BidCrawlerService {
latestPublishDate,
};
} catch (err) {
this.logger.error(`Error crawling ${targetCrawler.name}: ${err.message}`);

const errorMessage = err instanceof Error ? err.message : String(err);
this.logger.error(
`Error crawling ${targetCrawler.name}: ${errorMessage}`,
);

// 保存错误信息到数据库
await this.saveCrawlInfo(targetCrawler.name, 0, null, err.message);
await this.saveCrawlInfo(targetCrawler.name, 0, null, errorMessage);

return {
success: false,
source: targetCrawler.name,
count: 0,
error: err.message,
error: errorMessage,
};
} finally {
await browser.close();
@@ -324,7 +410,10 @@ export class BidCrawlerService {
await this.crawlInfoRepository.save(crawlInfo);
this.logger.log(`Saved crawl info for ${source}: ${count} items`);
} catch (err) {
this.logger.error(`Failed to save crawl info for ${source}: ${err.message}`);
const errorMessage = err instanceof Error ? err.message : String(err);
this.logger.error(
`Failed to save crawl info for ${source}: ${errorMessage}`,
);
}
}
}

@@ -2,7 +2,7 @@ import { CdtCrawler } from './cdt_target';
import * as puppeteer from 'puppeteer';

// Increase timeout to 60 seconds for network operations
jest.setTimeout(60000*5);
jest.setTimeout(60000 * 5);

// 获取代理配置
const getProxyArgs = (): string[] => {
@@ -29,7 +29,7 @@ describe('CdtCrawler Real Site Test', () => {
if (proxyArgs.length > 0) {
console.log('Using proxy:', proxyArgs.join(' '));
}

browser = await puppeteer.launch({
headless: false, // Change to false to see browser UI
args: ['--no-sandbox', '--disable-setuid-sandbox', ...proxyArgs],
@@ -45,13 +45,15 @@ describe('CdtCrawler Real Site Test', () => {
it('should visit website and list all found bid information', async () => {
console.log(`\nStarting crawl for: ${CdtCrawler.name}`);
console.log(`Target URL: ${CdtCrawler.url}`);

const results = await CdtCrawler.crawl(browser);

console.log(`\nSuccessfully found ${results.length} items:\n`);
console.log('----------------------------------------');
results.forEach((item, index) => {
console.log(`${index + 1}. [${item.publishDate.toLocaleDateString()}] ${item.title}`);
console.log(
`${index + 1}. [${item.publishDate.toLocaleDateString()}] ${item.title}`,
);
console.log(` Link: ${item.url}`);
console.log('----------------------------------------');
});
@@ -61,13 +63,15 @@ describe('CdtCrawler Real Site Test', () => {
expect(Array.isArray(results)).toBeTruthy();
// Warn but don't fail if site returns 0 items (could be empty or changed structure)
if (results.length === 0) {
console.warn('Warning: No items found. Check if website structure has changed or if list is currently empty.');
console.warn(
'Warning: No items found. Check if website structure has changed or if list is currently empty.',
);
} else {
// Check data integrity of first item
const firstItem = results[0];
expect(firstItem.title).toBeTruthy();
expect(firstItem.url).toMatch(/^https?:\/\//);
expect(firstItem.publishDate).toBeInstanceOf(Date);
// Check data integrity of first item
const firstItem = results[0];
expect(firstItem.title).toBeTruthy();
expect(firstItem.url).toMatch(/^https?:\/\//);
expect(firstItem.publishDate).toBeInstanceOf(Date);
}
});
});

@@ -13,11 +13,11 @@ async function simulateHumanMouseMovement(page: puppeteer.Page) {
const y = Math.floor(Math.random() * viewport.height);

await page.mouse.move(x, y, {
steps: 10 + Math.floor(Math.random() * 20) // 10-30步,使移动更平滑
steps: 10 + Math.floor(Math.random() * 20), // 10-30步,使移动更平滑
});

// 随机停顿 100-500ms
await new Promise(r => setTimeout(r, 100 + Math.random() * 400));
await new Promise((r) => setTimeout(r, 100 + Math.random() * 400));
}
}

@@ -31,19 +31,19 @@ async function simulateHumanScrolling(page: puppeteer.Page) {
await page.evaluate((distance) => {
window.scrollBy({
top: distance,
behavior: 'smooth'
behavior: 'smooth',
});
}, scrollDistance);

// 随机停顿 500-1500ms
await new Promise(r => setTimeout(r, 500 + Math.random() * 1000));
await new Promise((r) => setTimeout(r, 500 + Math.random() * 1000));
}

// 滚动回顶部
await page.evaluate(() => {
window.scrollTo({ top: 0, behavior: 'smooth' });
});
await new Promise(r => setTimeout(r, 1000));
await new Promise((r) => setTimeout(r, 1000));
}

export interface CdtResult {
@@ -52,12 +52,22 @@ export interface CdtResult {
url: string;
}

interface CdtCrawlerType {
name: string;
url: string;
baseUrl: string;
extract(html: string): CdtResult[];
}

export const CdtCrawler = {
name: '中国大唐集团电子商务平台',
url: 'https://tang.cdt-ec.com/home/index.html',
baseUrl: 'https://tang.cdt-ec.com',

async crawl(browser: puppeteer.Browser): Promise<CdtResult[]> {
async crawl(
this: CdtCrawlerType,
browser: puppeteer.Browser,
): Promise<CdtResult[]> {
const logger = new Logger('CdtCrawler');
const page = await browser.newPage();

@@ -67,7 +77,9 @@ export const CdtCrawler = {
await page.authenticate({ username, password });
}

await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36');
await page.setUserAgent(
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36',
);

const allResults: CdtResult[] = [];
let currentPage = 1;
@@ -86,19 +98,26 @@ export const CdtCrawler = {

// 点击"招标公告"标签
logger.log('Looking for "招标公告" tab...');
await page.waitForFunction(() => {
const tabs = Array.from(document.querySelectorAll('span.notice-tab'));
return tabs.some(tab => tab.textContent && tab.textContent.includes('招标公告'));
}, { timeout: 60000 });
await page.waitForFunction(
() => {
const tabs = Array.from(document.querySelectorAll('span.notice-tab'));
return tabs.some(
(tab) => tab.textContent && tab.textContent.includes('招标公告'),
);
},
{ timeout: 60000 },
);

await page.evaluate(() => {
const tabs = Array.from(document.querySelectorAll('span.notice-tab'));
const target = tabs.find(tab => tab.textContent && tab.textContent.includes('招标公告')) as HTMLElement;
const target = tabs.find(
(tab) => tab.textContent && tab.textContent.includes('招标公告'),
) as HTMLElement;
if (target) target.click();
});

logger.log('Clicked "招标公告" tab.');
await new Promise(r => setTimeout(r, 2000));
await new Promise((r) => setTimeout(r, 2000));

// 模拟人类行为
logger.log('Simulating human mouse movements...');
@@ -109,26 +128,43 @@ export const CdtCrawler = {

// 点击"招标公告"下的"更多+"链接
logger.log('Looking for "更多+" link under "招标公告"...');
await page.waitForFunction(() => {
const titles = Array.from(document.querySelectorAll('span.h-notice-title'));
return titles.some(title => title.textContent && title.textContent.includes('招标公告'));
}, { timeout: 30000 });
await page.waitForFunction(
() => {
const titles = Array.from(
document.querySelectorAll('span.h-notice-title'),
);
return titles.some(
(title) =>
title.textContent && title.textContent.includes('招标公告'),
);
},
{ timeout: 30000 },
);

await page.evaluate(() => {
const titles = Array.from(document.querySelectorAll('span.h-notice-title'));
const targetTitle = titles.find(title => title.textContent && title.textContent.includes('招标公告'));
const titles = Array.from(
document.querySelectorAll('span.h-notice-title'),
);
const targetTitle = titles.find(
(title) =>
title.textContent && title.textContent.includes('招标公告'),
);
if (targetTitle) {
const parent = targetTitle.parentElement;
if (parent) {
const moreLink = parent.querySelector('a.h-notice-more') as HTMLElement;
const moreLink = parent.querySelector(
'a.h-notice-more',
) as HTMLElement;
if (moreLink) moreLink.click();
}
}
});

logger.log('Clicked "更多+" link under "招标公告".');
await page.waitForNavigation({ waitUntil: 'networkidle2', timeout: 60000 }).catch(() => {});
await new Promise(r => setTimeout(r, 3000));
await page
.waitForNavigation({ waitUntil: 'networkidle2', timeout: 60000 })
.catch(() => {});
await new Promise((r) => setTimeout(r, 3000));

// 模拟人类行为
logger.log('Simulating human mouse movements...');
@@ -155,7 +191,9 @@ export const CdtCrawler = {
}

allResults.push(...pageResults);
logger.log(`Extracted ${pageResults.length} items from page ${currentPage}`);
logger.log(
`Extracted ${pageResults.length} items from page ${currentPage}`,
);

// 模拟人类行为 - 翻页前
logger.log('Simulating human mouse movements before pagination...');
@@ -172,7 +210,9 @@ export const CdtCrawler = {
}, nextButtonSelector);

if (!nextButtonExists) {
logger.log('Next page button not found or disabled. Reached end of list.');
logger.log(
'Next page button not found or disabled. Reached end of list.',
);
break;
}

@@ -186,18 +226,25 @@ export const CdtCrawler = {
}, nextButtonSelector);

// 等待 AJAX 请求完成(通过监听网络请求)
await page.waitForFunction(() => {
// 检查表格是否正在加载
const loading = document.querySelector('.layui-table-loading');
return !loading;
}, { timeout: 30000 }).catch(() => {});
await page
.waitForFunction(
() => {
// 检查表格是否正在加载
const loading = document.querySelector('.layui-table-loading');
return !loading;
},
{ timeout: 30000 },
)
.catch(() => {});

// 额外等待确保数据加载完成
await new Promise(r => setTimeout(r, 2000));
await new Promise((r) => setTimeout(r, 2000));

// 检查是否真的翻页了(通过检查当前页码)
const currentActivePage = await page.evaluate(() => {
const activeSpan = document.querySelector('.layui-laypage-curr em:last-child');
const activeSpan = document.querySelector(
'.layui-laypage-curr em:last-child',
);
return activeSpan ? parseInt(activeSpan.textContent || '1') : 1;
});

@@ -217,25 +264,29 @@ export const CdtCrawler = {

// Random delay between pages
const delay = Math.floor(Math.random() * (3000 - 1000 + 1)) + 1000;
await new Promise(resolve => setTimeout(resolve, delay));

await new Promise((resolve) => setTimeout(resolve, delay));
} catch (navError) {
logger.error(`Navigation to page ${currentPage + 1} failed: ${navError.message}`);
const navErrorMessage =
navError instanceof Error ? navError.message : String(navError);
logger.error(
`Navigation to page ${currentPage + 1} failed: ${navErrorMessage}`,
);
break;
}
}

return allResults;

} catch (error) {
logger.error(`Failed to crawl ${this.name}: ${error.message}`);
const errorMessage =
error instanceof Error ? error.message : String(error);
logger.error(`Failed to crawl ${this.name}: ${errorMessage}`);
return allResults;
} finally {
await page.close();
}
},

extract(html: string): CdtResult[] {
extract(this: CdtCrawlerType, html: string): CdtResult[] {
const results: CdtResult[] = [];
/**
* Regex groups for tang.cdt-ec.com:
@@ -243,23 +294,24 @@
* 2: Title (项目名称)
* 3: Date (发布时间)
*/
const regex = /<tr[^>]*data-index="[^"]*"[^>]*>[\s\S]*?<a[^>]*class="layui-table-link"[^>]*href="([^"]*)"[^>]*>([^<]*)<\/a>[\s\S]*?<td[^>]*data-field="publish_time"[^>]*>[\s\S]*?<div[^>]*class="layui-table-cell[^"]*"[^>]*>([^<]*)<\/div>[\s\S]*?<\/td>[\s\S]*?<\/tr>/gs;
const regex =
/<tr[^>]*data-index="[^"]*"[^>]*>[\s\S]*?<a[^>]*class="layui-table-link"[^>]*href="([^"]*)"[^>]*>([^<]*)<\/a>[\s\S]*?<td[^>]*data-field="publish_time"[^>]*>[\s\S]*?<div[^>]*class="layui-table-cell[^"]*"[^>]*>([^<]*)<\/div>[\s\S]*?<\/td>[\s\S]*?<\/tr>/gs;

let match;
let match: RegExpExecArray | null;
while ((match = regex.exec(html)) !== null) {
const url = match[1]?.trim();
const title = match[2]?.trim();
const dateStr = match[3]?.trim();
const url = match[1]?.trim() ?? '';
const title = match[2]?.trim() ?? '';
const dateStr = match[3]?.trim() ?? '';

if (title && url) {
const fullUrl = url.startsWith('http') ? url : this.baseUrl + url;
results.push({
title,
publishDate: dateStr ? new Date(dateStr) : new Date(),
url: fullUrl.replace(/\/\//g, '/')
url: fullUrl.replace(/\/\//g, '/'),
});
}
}
return results;
}
},
};

@@ -29,7 +29,7 @@ describe('CeicCrawler Real Site Test', () => {
if (proxyArgs.length > 0) {
console.log('Using proxy:', proxyArgs.join(' '));
}

browser = await puppeteer.launch({
headless: false, // Run in non-headless mode
args: [
@@ -40,14 +40,14 @@ describe('CeicCrawler Real Site Test', () => {
'--disable-infobars',
...proxyArgs,
],
defaultViewport: null
defaultViewport: null,
});
});

afterAll(async () => {
if (browser) {
// Keep open for a few seconds after test to see result
await new Promise(r => setTimeout(r, 50000));
await new Promise((r) => setTimeout(r, 50000));
await browser.close();
}
});
@@ -56,29 +56,33 @@ describe('CeicCrawler Real Site Test', () => {
console.log(`
Starting crawl for: ${CeicCrawler.name}`);
console.log(`Target URL: ${CeicCrawler.url}`);

const results = await CeicCrawler.crawl(browser);

console.log(`
Successfully found ${results.length} items:
`);
console.log('----------------------------------------');
results.forEach((item, index) => {
console.log(`${index + 1}. [${item.publishDate.toLocaleDateString()}] ${item.title}`);
console.log(
`${index + 1}. [${item.publishDate.toLocaleDateString()}] ${item.title}`,
);
console.log(` Link: ${item.url}`);
console.log('----------------------------------------');
});

expect(results).toBeDefined();
expect(Array.isArray(results)).toBeTruthy();

if (results.length === 0) {
console.warn('Warning: No items found. Observe browser window to see if content is loading or if there is a verification challenge.');
console.warn(
'Warning: No items found. Observe browser window to see if content is loading or if there is a verification challenge.',
);
} else {
const firstItem = results[0];
expect(firstItem.title).toBeTruthy();
expect(firstItem.url).toMatch(/^https?:\/\//);
expect(firstItem.publishDate).toBeInstanceOf(Date);
const firstItem = results[0];
expect(firstItem.title).toBeTruthy();
expect(firstItem.url).toMatch(/^https?:\/\//);
expect(firstItem.publishDate).toBeInstanceOf(Date);
}
});
});

@@ -12,13 +12,13 @@ async function simulateHumanMouseMovement(page: puppeteer.Page) {
for (let i = 0; i < movements; i++) {
const x = Math.floor(Math.random() * viewport.width);
const y = Math.floor(Math.random() * viewport.height);

await page.mouse.move(x, y, {
steps: 10 + Math.floor(Math.random() * 20) // 10-30步,使移动更平滑
steps: 10 + Math.floor(Math.random() * 20), // 10-30步,使移动更平滑
});

// 随机停顿 100-500ms
await new Promise(r => setTimeout(r, 100 + Math.random() * 400));
await new Promise((r) => setTimeout(r, 100 + Math.random() * 400));
}
}

@@ -28,23 +28,29 @@ async function simulateHumanScrolling(page: puppeteer.Page) {

for (let i = 0; i < scrollCount; i++) {
const scrollDistance = 100 + Math.floor(Math.random() * 400); // 100-500px

await page.evaluate((distance) => {
window.scrollBy({
top: distance,
behavior: 'smooth'
behavior: 'smooth',
});
}, scrollDistance);

// 随机停顿 500-1500ms
await new Promise(r => setTimeout(r, 500 + Math.random() * 1000));
await new Promise((r) => setTimeout(r, 500 + Math.random() * 1000));
}

// 滚动回顶部
await page.evaluate(() => {
window.scrollTo({ top: 0, behavior: 'smooth' });
});
await new Promise(r => setTimeout(r, 1000));
await new Promise((r) => setTimeout(r, 1000));
}

interface CeicCrawlerType {
name: string;
url: string;
baseUrl: string;
}

export const CeicCrawler = {
@@ -52,7 +58,10 @@ export const CeicCrawler = {
url: 'https://ceic.dlnyzb.com/3001',
baseUrl: 'https://ceic.dlnyzb.com/',

async crawl(browser: puppeteer.Browser): Promise<ChdtpResult[]> {
async crawl(
this: CeicCrawlerType,
browser: puppeteer.Browser,
): Promise<ChdtpResult[]> {
const logger = new Logger('CeicCrawler');
const page = await browser.newPage();

@@ -65,10 +74,14 @@ export const CeicCrawler = {
await page.evaluateOnNewDocument(() => {
Object.defineProperty(navigator, 'webdriver', { get: () => false });
Object.defineProperty(navigator, 'language', { get: () => 'zh-CN' });
Object.defineProperty(navigator, 'plugins', { get: () => [1, 2, 3, 4, 5] });
Object.defineProperty(navigator, 'plugins', {
get: () => [1, 2, 3, 4, 5],
});
});

await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36');
await page.setUserAgent(
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36',
);
await page.setViewport({ width: 1920, height: 1080 });

const allResults: ChdtpResult[] = [];
@@ -82,7 +95,7 @@ export const CeicCrawler = {
// 模拟人类行为
logger.log('Simulating human mouse movements...');
await simulateHumanMouseMovement(page);

logger.log('Simulating human scrolling...');
await simulateHumanScrolling(page);

@@ -90,16 +103,25 @@ export const CeicCrawler = {
logger.log(`Processing page ${currentPage}...`);

// Wait for content to load - MUI list items
await page.waitForFunction(() => {
return document.querySelectorAll('li.MuiListItem-root').length > 0;
}, { timeout: 60000 }).catch(() => logger.warn('Content not found. Site might be slow.'));
await page
.waitForFunction(
() => {
return (
document.querySelectorAll('li.MuiListItem-root').length > 0
);
},
{ timeout: 60000 },
)
.catch(() => logger.warn('Content not found. Site might be slow.'));

const pageResults = await page.evaluate(() => {
const results: { title: string; dateStr: string; url: string }[] = [];

// Extract from MUI list items
const listItems = Array.from(document.querySelectorAll('li.MuiListItem-root'));
listItems.forEach(item => {
const listItems = Array.from(
document.querySelectorAll('li.MuiListItem-root'),
);
listItems.forEach((item) => {
// Find the title link
const titleLink = item.querySelector('a.css-1vdw90h');
const title = titleLink?.textContent?.trim() || '';
@@ -125,15 +147,19 @@ export const CeicCrawler = {
});

if (pageResults.length === 0) {
logger.warn(`No results found on page ${currentPage}. Extraction failed.`);
logger.warn(
`No results found on page ${currentPage}. Extraction failed.`,
);
break;
}

allResults.push(...pageResults.map(r => ({
title: r.title,
publishDate: r.dateStr ? new Date(r.dateStr) : new Date(),
url: r.url.replace(/\/\//g, '/')
})));
allResults.push(
...pageResults.map((r) => ({
title: r.title,
publishDate: r.dateStr ? new Date(r.dateStr) : new Date(),
url: r.url.replace(/\/\//g, '/'),
})),
);

logger.log(`Extracted ${pageResults.length} items.`);

@@ -142,27 +168,30 @@ export const CeicCrawler = {
if (!nextButton) break;

await nextButton.click();
await new Promise(r => setTimeout(r, 3000));

await new Promise((r) => setTimeout(r, 3000));

// 模拟人类行为
logger.log('Simulating human mouse movements...');
await simulateHumanMouseMovement(page);

logger.log('Simulating human scrolling...');
await simulateHumanScrolling(page);

currentPage++;
}

return allResults;

} catch (error) {
logger.error(`Crawl failed: ${error.message}`);
const errorMessage =
error instanceof Error ? error.message : String(error);
logger.error(`Crawl failed: ${errorMessage}`);
return allResults;
} finally {
if (page) await page.close();
}
},

extract() { return []; }
extract() {
return [];
},
};

@@ -2,7 +2,7 @@ import { CgnpcCrawler } from './cgnpc_target';
import * as puppeteer from 'puppeteer';

// Increase timeout to 60 seconds for network operations
jest.setTimeout(60000*5);
jest.setTimeout(60000 * 5);

// 获取代理配置
const getProxyArgs = (): string[] => {
@@ -29,7 +29,7 @@ describe('CgnpcCrawler Real Site Test', () => {
if (proxyArgs.length > 0) {
console.log('Using proxy:', proxyArgs.join(' '));
}

browser = await puppeteer.launch({
headless: false, // Change to false to see browser UI
args: ['--no-sandbox', '--disable-setuid-sandbox', ...proxyArgs],
@@ -45,13 +45,15 @@ describe('CgnpcCrawler Real Site Test', () => {
it('should visit website and list all found bid information', async () => {
console.log(`\nStarting crawl for: ${CgnpcCrawler.name}`);
console.log(`Target URL: ${CgnpcCrawler.url}`);

const results = await CgnpcCrawler.crawl(browser);

console.log(`\nSuccessfully found ${results.length} items:\n`);
console.log('----------------------------------------');
results.forEach((item, index) => {
console.log(`${index + 1}. [${item.publishDate.toLocaleDateString()}] ${item.title}`);
console.log(
`${index + 1}. [${item.publishDate.toLocaleDateString()}] ${item.title}`,
);
console.log(` Link: ${item.url}`);
console.log('----------------------------------------');
});
@@ -61,13 +63,15 @@ describe('CgnpcCrawler Real Site Test', () => {
expect(Array.isArray(results)).toBeTruthy();
// Warn but don't fail if site returns 0 items (could be empty or changed structure)
if (results.length === 0) {
console.warn('Warning: No items found. Check if website structure has changed or if list is currently empty.');
console.warn(
'Warning: No items found. Check if website structure has changed or if list is currently empty.',
);
} else {
// Check data integrity of first item
const firstItem = results[0];
expect(firstItem.title).toBeTruthy();
expect(firstItem.url).toMatch(/^https?:\/\//);
expect(firstItem.publishDate).toBeInstanceOf(Date);
// Check data integrity of first item
const firstItem = results[0];
expect(firstItem.title).toBeTruthy();
expect(firstItem.url).toMatch(/^https?:\/\//);
expect(firstItem.publishDate).toBeInstanceOf(Date);
}
});
});

@@ -11,13 +11,13 @@ async function simulateHumanMouseMovement(page: puppeteer.Page) {
for (let i = 0; i < movements; i++) {
const x = Math.floor(Math.random() * viewport.width);
const y = Math.floor(Math.random() * viewport.height);

await page.mouse.move(x, y, {
steps: 10 + Math.floor(Math.random() * 20) // 10-30步,使移动更平滑
steps: 10 + Math.floor(Math.random() * 20), // 10-30步,使移动更平滑
});

// 随机停顿 100-500ms
await new Promise(r => setTimeout(r, 100 + Math.random() * 400));
await new Promise((r) => setTimeout(r, 100 + Math.random() * 400));
}
}

@@ -27,23 +27,23 @@ async function simulateHumanScrolling(page: puppeteer.Page) {

for (let i = 0; i < scrollCount; i++) {
const scrollDistance = 100 + Math.floor(Math.random() * 400); // 100-500px

await page.evaluate((distance) => {
window.scrollBy({
top: distance,
behavior: 'smooth'
behavior: 'smooth',
});
}, scrollDistance);

// 随机停顿 500-1500ms
await new Promise(r => setTimeout(r, 500 + Math.random() * 1000));
await new Promise((r) => setTimeout(r, 500 + Math.random() * 1000));
}

// 滚动回顶部
await page.evaluate(() => {
window.scrollTo({ top: 0, behavior: 'smooth' });
});
await new Promise(r => setTimeout(r, 1000));
await new Promise((r) => setTimeout(r, 1000));
}

export interface CgnpcResult {
@@ -52,12 +52,22 @@ export interface CgnpcResult {
url: string;
}

interface CgnpcCrawlerType {
name: string;
url: string;
baseUrl: string;
extract(html: string): CgnpcResult[];
}

export const CgnpcCrawler = {
name: '中广核电子商务平台',
url: 'https://ecp.cgnpc.com.cn/zbgg.html',
baseUrl: 'https://ecp.cgnpc.com.cn/',

async crawl(browser: puppeteer.Browser): Promise<CgnpcResult[]> {
async crawl(
this: CgnpcCrawlerType,
browser: puppeteer.Browser,
): Promise<CgnpcResult[]> {
const logger = new Logger('CgnpcCrawler');
const page = await browser.newPage();

@@ -69,11 +79,15 @@ export const CgnpcCrawler = {

await page.evaluateOnNewDocument(() => {
Object.defineProperty(navigator, 'webdriver', { get: () => false });
Object.defineProperty(navigator, 'language', { get: () => "zh-CN"});
Object.defineProperty(navigator, 'plugins', { get: () => [1,2,3,4,5]});
Object.defineProperty(navigator, 'language', { get: () => 'zh-CN' });
Object.defineProperty(navigator, 'plugins', {
get: () => [1, 2, 3, 4, 5],
});
});

await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36');
await page.setUserAgent(
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36',
);
await page.setViewport({ width: 1920, height: 1080 });

const allResults: CgnpcResult[] = [];
@@ -87,7 +101,7 @@ export const CgnpcCrawler = {
// 模拟人类行为
logger.log('Simulating human mouse movements...');
await simulateHumanMouseMovement(page);

logger.log('Simulating human scrolling...');
await simulateHumanScrolling(page);

@@ -103,12 +117,14 @@ export const CgnpcCrawler = {
}

allResults.push(...pageResults);
logger.log(`Extracted ${pageResults.length} items from page ${currentPage}`);
logger.log(
`Extracted ${pageResults.length} items from page ${currentPage}`,
);

// 模拟人类行为 - 翻页前
logger.log('Simulating human mouse movements before pagination...');
await simulateHumanMouseMovement(page);

logger.log('Simulating human scrolling before pagination...');
await simulateHumanScrolling(page);

@@ -127,9 +143,13 @@ export const CgnpcCrawler = {
try {
// 点击下一页按钮
await nextButton.click();
await new Promise(r => setTimeout(r, 3000)); // 等待页面加载
await new Promise((r) => setTimeout(r, 3000)); // 等待页面加载
} catch (navError) {
logger.error(`Navigation to page ${currentPage + 1} failed: ${navError.message}`);
const navErrorMessage =
navError instanceof Error ? navError.message : String(navError);
logger.error(
`Navigation to page ${currentPage + 1} failed: ${navErrorMessage}`,
);
break;
}

@@ -138,26 +158,27 @@ export const CgnpcCrawler = {
// 模拟人类行为 - 翻页后
logger.log('Simulating human mouse movements after pagination...');
await simulateHumanMouseMovement(page);

logger.log('Simulating human scrolling after pagination...');
await simulateHumanScrolling(page);

// Random delay between pages
const delay = Math.floor(Math.random() * (3000 - 1000 + 1)) + 1000;
await new Promise(resolve => setTimeout(resolve, delay));
await new Promise((resolve) => setTimeout(resolve, delay));
}

return allResults;

} catch (error) {
logger.error(`Failed to crawl ${this.name}: ${error.message}`);
const errorMessage =
error instanceof Error ? error.message : String(error);
logger.error(`Failed to crawl ${this.name}: ${errorMessage}`);
return allResults;
} finally {
await page.close();
}
},

extract(html: string): CgnpcResult[] {
extract(this: CgnpcCrawlerType, html: string): CgnpcResult[] {
const results: CgnpcResult[] = [];
/**
* Regex groups for ecp.cgnpc.com.cn:
@@ -181,24 +202,25 @@
* </div>
* </div>
*/
const regex = /<div class="zbnr">[\s\S]*?<a[^>]*title="([^"]*)"[^>]*href="([^"]*)"[^>]*>[\s\S]*?<dt>[\s\S]*?<p>文件获取截止时间<\/p>[\s\S]*?<h2>\s*(\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2})\s*<\/h2>[\s\S]*?<\/div>/gs;
const regex =
/<div class="zbnr">[\s\S]*?<a[^>]*title="([^"]*)"[^>]*href="([^"]*)"[^>]*>[\s\S]*?<dt>[\s\S]*?<p>文件获取截止时间<\/p>[\s\S]*?<h2>\s*(\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2})\s*<\/h2>[\s\S]*?<\/div>/gs;

let match;
let match: RegExpExecArray | null;
while ((match = regex.exec(html)) !== null) {
const title = match[1]?.trim();
const url = match[2]?.trim();
const dateStr = match[3]?.trim();
const title = match[1]?.trim() ?? '';
const url = match[2]?.trim() ?? '';
const dateStr = match[3]?.trim() ?? '';

if (title && url) {
const fullUrl = url.startsWith('http') ? url : this.baseUrl + url;
results.push({
title,
publishDate: dateStr ? new Date(dateStr) : new Date(),
url: fullUrl.replace(/\/\//g, '/')
url: fullUrl.replace(/\/\//g, '/'),
});
}
}

return results;
}
},
};

@@ -29,7 +29,7 @@ describe('ChdtpCrawler Real Site Test', () => {
if (proxyArgs.length > 0) {
console.log('Using proxy:', proxyArgs.join(' '));
}

browser = await puppeteer.launch({
headless: true, // Change to false to see the browser UI
args: ['--no-sandbox', '--disable-setuid-sandbox', ...proxyArgs],
@@ -45,13 +45,15 @@ describe('ChdtpCrawler Real Site Test', () => {
it('should visit the website and list all found bid information', async () => {
console.log(`\nStarting crawl for: ${ChdtpCrawler.name}`);
console.log(`Target URL: ${ChdtpCrawler.url}`);

const results = await ChdtpCrawler.crawl(browser);

console.log(`\nSuccessfully found ${results.length} items:\n`);
console.log('----------------------------------------');
results.forEach((item, index) => {
console.log(`${index + 1}. [${item.publishDate.toLocaleDateString()}] ${item.title}`);
console.log(
`${index + 1}. [${item.publishDate.toLocaleDateString()}] ${item.title}`,
);
console.log(` Link: ${item.url}`);
console.log('----------------------------------------');
});
@@ -61,13 +63,15 @@ describe('ChdtpCrawler Real Site Test', () => {
expect(Array.isArray(results)).toBeTruthy();
// Warn but don't fail if site returns 0 items (could be empty or changed structure)
if (results.length === 0) {
console.warn('Warning: No items found. Check if the website structure has changed or if the list is currently empty.');
console.warn(
'Warning: No items found. Check if the website structure has changed or if the list is currently empty.',
);
} else {
// Check data integrity of the first item
const firstItem = results[0];
expect(firstItem.title).toBeTruthy();
expect(firstItem.url).toMatch(/^https?:\/\//);
expect(firstItem.publishDate).toBeInstanceOf(Date);
// Check data integrity of the first item
const firstItem = results[0];
expect(firstItem.title).toBeTruthy();
expect(firstItem.url).toMatch(/^https?:\/\//);
expect(firstItem.publishDate).toBeInstanceOf(Date);
}
});
});
});

@@ -7,22 +7,34 @@ export interface ChdtpResult {
url: string; // Necessary for system uniqueness
}

interface ChdtpCrawlerType {
name: string;
url: string;
baseUrl: string;
extract(html: string): ChdtpResult[];
}

export const ChdtpCrawler = {
name: '华电集团电子商务平台 ',
url: 'https://www.chdtp.com/webs/queryWebZbgg.action?zbggType=1',
baseUrl: 'https://www.chdtp.com/webs/',

async crawl(browser: puppeteer.Browser): Promise<ChdtpResult[]> {
async crawl(
this: ChdtpCrawlerType,
browser: puppeteer.Browser,
): Promise<ChdtpResult[]> {
const logger = new Logger('ChdtpCrawler');
const page = await browser.newPage();

const username = process.env.PROXY_USERNAME;
const password = process.env.PROXY_PASSWORD;
if (username && password) {
await page.authenticate({ username, password });
}

await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36');
await page.setUserAgent(
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36',
);

const allResults: ChdtpResult[] = [];
let currentPage = 1;
@@ -35,14 +47,16 @@ export const ChdtpCrawler = {
while (currentPage <= maxPages) {
const content = await page.content();
const pageResults = this.extract(content);

if (pageResults.length === 0) {
logger.warn(`No results found on page ${currentPage}, stopping.`);
break;
}

allResults.push(...pageResults);
logger.log(`Extracted ${pageResults.length} items from page ${currentPage}`);
logger.log(
`Extracted ${pageResults.length} items from page ${currentPage}`,
);

// Find the "Next Page" button
// Using partial match for src to be robust against path variations
@@ -58,35 +72,43 @@ export const ChdtpCrawler = {
// For this specific site, we'll try to click.

logger.log(`Navigating to page ${currentPage + 1}...`);

try {
await Promise.all([
page.waitForNavigation({ waitUntil: 'networkidle2', timeout: 60000 }),
page.waitForNavigation({
waitUntil: 'networkidle2',
timeout: 60000,
}),
nextButton.click(),
]);
} catch (navError) {
logger.error(`Navigation to page ${currentPage + 1} failed: ${navError.message}`);
const navErrorMessage =
navError instanceof Error ? navError.message : String(navError);
logger.error(
`Navigation to page ${currentPage + 1} failed: ${navErrorMessage}`,
);
break;
}

currentPage++;

// Random delay between pages
const delay = Math.floor(Math.random() * (3000 - 1000 + 1)) + 1000;
await new Promise(resolve => setTimeout(resolve, delay));
await new Promise((resolve) => setTimeout(resolve, delay));
}

return allResults;

} catch (error) {
logger.error(`Failed to crawl ${this.name}: ${error.message}`);
const errorMessage =
error instanceof Error ? error.message : String(error);
logger.error(`Failed to crawl ${this.name}: ${errorMessage}`);
return allResults; // Return what we have so far
} finally {
await page.close();
}
},

extract(html: string): ChdtpResult[] {
extract(this: ChdtpCrawlerType, html: string): ChdtpResult[] {
const results: ChdtpResult[] = [];
/**
* Regex groups for chdtp.com:
@@ -96,23 +118,24 @@ export const ChdtpCrawler = {
* 4: Business Type
* 5: Date
*/
const regex = /<tr[^>]*>\s*<td class="td_1">.*?<span[^>]*>\s*(.*?)\s*<\/span>.*?<\/td>\s*<td class="td_2">\s*<a[^>]*href="javascript:toGetContent\('(.*?)'\)" title="(.*?)">.*?<\/a><\/td>\s*<td class="td_3">\s*<a[^>]*>\s*(.*?)\s*<\/a>\s*<\/td>\s*<td class="td_4"><span>\[(.*?)\]<\/span><\/td>/gs;
const regex =
/<tr[^>]*>\s*<td class="td_1">.*?<span[^>]*>\s*(.*?)\s*<\/span>.*?<\/td>\s*<td class="td_2">\s*<a[^>]*href="javascript:toGetContent\('(.*?)'\)" title="(.*?)">.*?<\/a><\/td>\s*<td class="td_3">\s*<a[^>]*>\s*(.*?)\s*<\/a>\s*<\/td>\s*<td class="td_4"><span>\[(.*?)\]<\/span><\/td>/gs;

let match;
let match: RegExpExecArray | null;
while ((match = regex.exec(html)) !== null) {
const urlSuffix = match[2]?.trim();
const title = match[3]?.trim();
const dateStr = match[5]?.trim();
const urlSuffix = match[2]?.trim() ?? '';
const title = match[3]?.trim() ?? '';
const dateStr = match[5]?.trim() ?? '';

if (title && urlSuffix) {
const fullUrl = this.baseUrl + urlSuffix;
results.push({
title,
publishDate: dateStr ? new Date(dateStr) : new Date(),
url: fullUrl.replace(/\/\//g, '/')
url: fullUrl.replace(/\/\//g, '/'),
});
}
}
return results;
}
};
},
};

@@ -31,13 +31,13 @@ async function simulateHumanMouseMovement(page: puppeteer.Page) {
for (let i = 0; i < movements; i++) {
const x = Math.floor(Math.random() * viewport.width);
const y = Math.floor(Math.random() * viewport.height);

await page.mouse.move(x, y, {
steps: 10 + Math.floor(Math.random() * 20) // 10-30步,使移动更平滑
steps: 10 + Math.floor(Math.random() * 20), // 10-30步,使移动更平滑
});

// 随机停顿 100-500ms
await new Promise(r => setTimeout(r, 100 + Math.random() * 400));
await new Promise((r) => setTimeout(r, 100 + Math.random() * 400));
}
}

@@ -47,23 +47,23 @@ async function simulateHumanScrolling(page: puppeteer.Page) {

for (let i = 0; i < scrollCount; i++) {
const scrollDistance = 100 + Math.floor(Math.random() * 400); // 100-500px

await page.evaluate((distance) => {
window.scrollBy({
top: distance,
behavior: 'smooth'
behavior: 'smooth',
});
}, scrollDistance);

// 随机停顿 500-1500ms
await new Promise(r => setTimeout(r, 500 + Math.random() * 1000));
await new Promise((r) => setTimeout(r, 500 + Math.random() * 1000));
}

// 滚动回顶部
await page.evaluate(() => {
window.scrollTo({ top: 0, behavior: 'smooth' });
});
await new Promise(r => setTimeout(r, 1000));
await new Promise((r) => setTimeout(r, 1000));
}

describe('ChngCrawler Real Site Test', () => {
@@ -74,7 +74,7 @@ describe('ChngCrawler Real Site Test', () => {
if (proxyArgs.length > 0) {
console.log('Using proxy:', proxyArgs.join(' '));
}

browser = await puppeteer.launch({
headless: false, // Run in non-headless mode
args: [
@@ -82,7 +82,7 @@ describe('ChngCrawler Real Site Test', () => {
'--disable-setuid-sandbox',
'--disable-blink-features=AutomationControlled',
'--window-size=1920,1080',
"--disable-infobars",
'--disable-infobars',
...proxyArgs,
// "--headless=new",
// '--disable-dev-shm-usage',
@@ -94,15 +94,14 @@ describe('ChngCrawler Real Site Test', () => {
// '--disable-webgl',
// '--disable-javascript',
],
defaultViewport: null

defaultViewport: null,
});
});

afterAll(async () => {
if (browser) {
// Keep open for a few seconds after test to see result
await new Promise(r => setTimeout(r, 50000));
await new Promise((r) => setTimeout(r, 50000));
await browser.close();
}
});
@@ -111,43 +110,51 @@ describe('ChngCrawler Real Site Test', () => {
console.log(`
Starting crawl for: ${ChngCrawler.name}`);
console.log(`Target URL: ${ChngCrawler.url}`);

// 创建一个临时页面用于模拟人类行为
const tempPage = await browser.newPage();
await tempPage.setViewport({ width: 1920, height: 1080, deviceScaleFactor: 1 });

await tempPage.setViewport({
width: 1920,
height: 1080,
deviceScaleFactor: 1,
});

// 模拟人类鼠标移动
console.log('Simulating human mouse movements...');
await simulateHumanMouseMovement(tempPage);

// 模拟人类滚动
console.log('Simulating human scrolling...');
await simulateHumanScrolling(tempPage);

await tempPage.close();

const results = await ChngCrawler.crawl(browser);

console.log(`
Successfully found ${results.length} items:
`);
console.log('----------------------------------------');
results.forEach((item, index) => {
console.log(`${index + 1}. [${item.publishDate.toLocaleDateString()}] ${item.title}`);
console.log(
`${index + 1}. [${item.publishDate.toLocaleDateString()}] ${item.title}`,
);
console.log(` Link: ${item.url}`);
console.log('----------------------------------------');
});

expect(results).toBeDefined();
expect(Array.isArray(results)).toBeTruthy();

if (results.length === 0) {
console.warn('Warning: No items found. Observe the browser window to see if content is loading or if there is a verification challenge.');
console.warn(
'Warning: No items found. Observe the browser window to see if content is loading or if there is a verification challenge.',
);
} else {
const firstItem = results[0];
expect(firstItem.title).toBeTruthy();
expect(firstItem.url).toMatch(/^https?:\/\//);
expect(firstItem.publishDate).toBeInstanceOf(Date);
const firstItem = results[0];
expect(firstItem.title).toBeTruthy();
expect(firstItem.url).toMatch(/^https?:\/\//);
expect(firstItem.publishDate).toBeInstanceOf(Date);
}
});
});
});

@@ -16,19 +16,20 @@ async function simulateHumanMouseMovement(page: puppeteer.Page) {
|
||||
console.log('Page was closed during mouse movement simulation');
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
const x = Math.floor(Math.random() * viewport.width);
|
||||
const y = Math.floor(Math.random() * viewport.height);
|
||||
|
||||
|
||||
await page.mouse.move(x, y, {
|
||||
steps: 10 + Math.floor(Math.random() * 20) // 10-30步,使移动更平滑
|
||||
steps: 10 + Math.floor(Math.random() * 20), // 10-30步,使移动更平滑
|
||||
});
|
||||
|
||||
|
||||
// 随机停顿 100-500ms
|
||||
await new Promise(r => setTimeout(r, 100 + Math.random() * 400));
|
||||
await new Promise((r) => setTimeout(r, 100 + Math.random() * 400));
|
||||
}
|
||||
} catch (error) {
|
||||
console.log('Mouse movement simulation interrupted:', error.message);
|
||||
const errorMessage = error instanceof Error ? error.message : String(error);
|
||||
console.log('Mouse movement simulation interrupted:', errorMessage);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -43,18 +44,18 @@ async function simulateHumanScrolling(page: puppeteer.Page) {
|
||||
console.log('Page was closed during scrolling simulation');
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
const scrollDistance = 100 + Math.floor(Math.random() * 400); // 100-500px
|
||||
|
||||
|
||||
await page.evaluate((distance) => {
|
||||
window.scrollBy({
|
||||
top: distance,
|
||||
behavior: 'smooth'
|
||||
behavior: 'smooth',
|
||||
});
|
||||
}, scrollDistance);
|
||||
|
||||
// 随机停顿 500-1500ms
|
||||
await new Promise(r => setTimeout(r, 500 + Math.random() * 1000));
|
||||
await new Promise((r) => setTimeout(r, 500 + Math.random() * 1000));
|
||||
}
|
||||
|
||||
// 滚动回顶部
|
||||
@@ -62,19 +63,29 @@ async function simulateHumanScrolling(page: puppeteer.Page) {
|
||||
await page.evaluate(() => {
|
||||
window.scrollTo({ top: 0, behavior: 'smooth' });
|
||||
});
|
||||
await new Promise(r => setTimeout(r, 1000));
|
||||
await new Promise((r) => setTimeout(r, 1000));
|
||||
}
|
||||
} catch (error) {
|
||||
console.log('Scrolling simulation interrupted:', error.message);
|
||||
const errorMessage = error instanceof Error ? error.message : String(error);
|
||||
console.log('Scrolling simulation interrupted:', errorMessage);
|
||||
}
|
||||
}
|
||||
|
||||
interface ChngCrawlerType {
|
||||
name: string;
|
||||
url: string;
|
||||
baseUrl: string;
|
||||
}
|
||||
|
||||
export const ChngCrawler = {
|
||||
name: '华能集团电子商务平台',
|
||||
url: 'https://ec.chng.com.cn/channel/home/#/purchase?top=0',
|
||||
baseUrl: 'https://ec.chng.com.cn/channel/home/#',
|
||||
|
||||
async crawl(browser: puppeteer.Browser): Promise<ChdtpResult[]> {
|
||||
async crawl(
|
||||
this: ChngCrawlerType,
|
||||
browser: puppeteer.Browser,
|
||||
): Promise<ChdtpResult[]> {
|
||||
const logger = new Logger('ChngCrawler');
|
||||
let page = await browser.newPage();
|
||||
// await page.setViewport({ width: 1920, height: 1080, deviceScaleFactor: 1 });
|
||||
@@ -84,42 +95,48 @@ export const ChngCrawler = {
|
||||
if (username && password) {
|
||||
await page.authenticate({ username, password });
|
||||
}
|
||||
|
||||
|
||||
await page.evaluateOnNewDocument(() => {
|
||||
Object.defineProperty(navigator, 'webdriver', { get: () => false });
|
||||
Object.defineProperty(navigator, 'language', { get: () => "zh-CN"});
|
||||
Object.defineProperty(navigator, 'plugins', { get: () => [1,2,3,4,5]});
|
||||
Object.defineProperty(navigator, 'language', { get: () => 'zh-CN' });
|
||||
Object.defineProperty(navigator, 'plugins', {
|
||||
get: () => [1, 2, 3, 4, 5],
|
||||
});
|
||||
});
|
||||
|
||||
await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36');
|
||||
await page.setUserAgent(
|
||||
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36',
|
||||
);
|
||||
await page.setViewport({ width: 1920, height: 1080 });
|
||||
|
||||
const allResults: ChdtpResult[] = [];
|
||||
let currentPage = 1;
|
||||
const maxPages = 5;
|
||||
const maxPages = 5;
|
||||
|
||||
try {
|
||||
logger.log('Navigating to Bing...');
|
||||
await page.goto('https://cn.bing.com', { waitUntil: 'networkidle2' });
|
||||
|
||||
logger.log('Searching for target site...');
|
||||
const searchBoxSelector = 'input[name="q"]';
|
||||
const searchBoxSelector = 'input[name="q"]';
|
||||
await page.waitForSelector(searchBoxSelector);
|
||||
await page.type(searchBoxSelector, 'https://ec.chng.com.cn/');
|
||||
await page.keyboard.press('Enter');
|
||||
await page.waitForNavigation({ waitUntil: 'networkidle2' });
|
||||
|
||||
|
||||
logger.log('Clicking search result...');
|
||||
// await page.screenshot({ path: 'bing.png' });
|
||||
const firstResultSelector = '#b_results .b_algo h2 a';
|
||||
await page.waitForSelector(firstResultSelector);
|
||||
|
||||
const newTargetPromise = browser.waitForTarget(target => target.opener() === page.target());
|
||||
|
||||
const newTargetPromise = browser.waitForTarget(
|
||||
(target) => target.opener() === page.target(),
|
||||
);
|
||||
await page.click(firstResultSelector);
|
||||
|
||||
|
||||
const newTarget = await newTargetPromise;
|
||||
const newPage = await newTarget.page();
|
||||
|
||||
|
||||
if (newPage) {
|
||||
// await newPage.screenshot({ path: 'newPage.png' });
|
||||
await page.close();
|
||||
@@ -131,108 +148,135 @@ export const ChngCrawler = {
|
||||
// 模拟人类行为
|
||||
logger.log('Simulating human mouse movements...');
|
||||
await simulateHumanMouseMovement(page);
|
||||
|
||||
|
||||
logger.log('Simulating human scrolling...');
|
||||
await simulateHumanScrolling(page);
|
||||
|
||||
|
||||
// 等待页面稳定,不强制等待导航
|
||||
await new Promise(r => setTimeout(r, 3000));
|
||||
|
||||
// 模拟人类行为
|
||||
logger.log('Simulating human mouse movements...');
|
||||
await simulateHumanMouseMovement(page);
|
||||
|
||||
logger.log('Simulating human scrolling...');
|
||||
await simulateHumanScrolling(page);
|
||||
|
||||
|
||||
// PAUSE 15 SECONDS as requested
|
||||
logger.log('Pausing 15 seconds before looking for "采购专栏"...');
|
||||
await new Promise(r => setTimeout(r, 15000));
|
||||
// await page.screenshot({ path: 'huaneng.png' });
|
||||
await new Promise((r) => setTimeout(r, 3000));
|
||||
|
||||
logger.log('Looking for "采购专栏" link...');
|
||||
await page.waitForFunction(() => {
|
||||
const divs = Array.from(document.querySelectorAll('div.text'));
|
||||
return divs.some(div => div.textContent && div.textContent.includes('采购专栏'));
|
||||
}, { timeout: 60000 });
|
||||
|
||||
const purchaseTargetPromise = browser.waitForTarget(target => target.opener() === page.target(), { timeout: 15000 }).catch(() => null);
|
||||
|
||||
await page.evaluate(() => {
|
||||
const divs = Array.from(document.querySelectorAll('div.text'));
|
||||
const target = divs.find(div => div.textContent && div.textContent.includes('采购专栏')) as HTMLElement;
|
||||
if (target) target.click();
|
||||
});
|
||||
|
||||
const purchaseTarget = await purchaseTargetPromise;
|
||||
if (purchaseTarget) {
|
||||
const pPage = await purchaseTarget.page();
|
||||
if (pPage) {
|
||||
logger.log('Switched to Purchase Page tab.');
|
||||
page = pPage;
|
||||
if (username && password) {
|
||||
await page.authenticate({ username, password });
|
||||
}
|
||||
await new Promise(r => setTimeout(r, 5000));
|
||||
}
|
||||
}
|
||||
|
||||
logger.log(`Active URL: ${page.url()}`);
|
||||
|
||||
// 模拟人类行为
|
||||
logger.log('Simulating human mouse movements...');
|
||||
await simulateHumanMouseMovement(page);
|
||||
|
||||
|
||||
logger.log('Simulating human scrolling...');
|
||||
await simulateHumanScrolling(page);
|
||||
|
||||
|
||||
// PAUSE 15 SECONDS as requested
|
||||
logger.log('Pausing 15 seconds before looking for "采购专栏"...');
|
||||
await new Promise((r) => setTimeout(r, 15000));
|
||||
// await page.screenshot({ path: 'huaneng.png' });
|
||||
|
||||
logger.log('Looking for "采购专栏" link...');
|
||||
await page.waitForFunction(
|
||||
() => {
|
||||
const divs = Array.from(document.querySelectorAll('div.text'));
|
||||
return divs.some(
|
||||
(div) => div.textContent && div.textContent.includes('采购专栏'),
|
||||
);
|
||||
},
|
||||
{ timeout: 60000 },
|
||||
);
|
||||
|
||||
const purchaseTargetPromise = browser
|
||||
.waitForTarget((target) => target.opener() === page.target(), {
|
||||
timeout: 15000,
|
||||
})
|
||||
.catch(() => null);
|
||||
|
||||
await page.evaluate(() => {
|
||||
const divs = Array.from(document.querySelectorAll('div.text'));
|
||||
const target = divs.find(
|
||||
(div) => div.textContent && div.textContent.includes('采购专栏'),
|
||||
) as HTMLElement;
|
||||
if (target) target.click();
|
||||
});
|
||||
|
||||
const purchaseTarget = await purchaseTargetPromise;
|
||||
if (purchaseTarget) {
|
||||
const pPage = await purchaseTarget.page();
|
||||
if (pPage) {
|
||||
logger.log('Switched to Purchase Page tab.');
|
||||
page = pPage;
|
||||
if (username && password) {
|
||||
await page.authenticate({ username, password });
|
||||
}
|
||||
await new Promise((r) => setTimeout(r, 5000));
|
||||
}
|
||||
}
|
||||
|
||||
logger.log(`Active URL: ${page.url()}`);
|
||||
|
||||
// 模拟人类行为
|
||||
logger.log('Simulating human mouse movements...');
|
||||
await simulateHumanMouseMovement(page);
|
||||
|
||||
logger.log('Simulating human scrolling...');
|
||||
await simulateHumanScrolling(page);
|
||||
|
||||
while (currentPage <= maxPages) {
|
||||
logger.log(`Processing page ${currentPage}...`);
|
||||
|
||||
|
||||
// Wait for table rows to load
|
||||
await page.waitForFunction(() => {
|
||||
return document.querySelectorAll('tr.ant-table-row').length > 0;
|
||||
}, { timeout: 60000 }).catch(() => logger.warn('Content not found. Site might be slow.'));
|
||||
await page
|
||||
.waitForFunction(
|
||||
() => {
|
||||
return document.querySelectorAll('tr.ant-table-row').length > 0;
|
||||
},
|
||||
{ timeout: 60000 },
|
||||
)
|
||||
.catch(() => logger.warn('Content not found. Site might be slow.'));
|
||||
|
||||
const pageResults = await page.evaluate((baseUrl) => {
|
||||
// Extract from table rows
|
||||
const items = Array.from(document.querySelectorAll('tr.ant-table-row'));
|
||||
return items.map(item => {
|
||||
const titleSpan = item.querySelector('span.list-text');
|
||||
const dateCell = item.querySelector('td.ant-table-row-cell-break-word p');
|
||||
|
||||
if (titleSpan && dateCell) {
|
||||
const title = titleSpan.textContent?.trim() || '';
|
||||
const dateStr = dateCell.textContent?.trim() || '';
|
||||
|
||||
if (title.length < 5) return null; // Filter noise
|
||||
|
||||
// URL is not directly available in the table, need to construct from data-row-key
|
||||
const rowKey = item.getAttribute('data-row-key');
|
||||
const url = rowKey ? `${baseUrl}#/purchase/detail?id=${rowKey}` : '';
|
||||
|
||||
return {
|
||||
title,
|
||||
dateStr,
|
||||
url
|
||||
};
|
||||
}
|
||||
return null;
|
||||
}).filter(i => i !== null);
|
||||
const items = Array.from(
|
||||
document.querySelectorAll('tr.ant-table-row'),
|
||||
);
|
||||
return items
|
||||
.map((item) => {
|
||||
const titleSpan = item.querySelector('span.list-text');
|
||||
const dateCell = item.querySelector(
|
||||
'td.ant-table-row-cell-break-word p',
|
||||
);
|
||||
|
||||
if (titleSpan && dateCell) {
|
||||
const title = titleSpan.textContent?.trim() || '';
|
||||
const dateStr = dateCell.textContent?.trim() || '';
|
||||
|
||||
if (title.length < 5) return null; // Filter noise
|
||||
|
||||
// URL is not directly available in the table, need to construct from data-row-key
|
||||
const rowKey = item.getAttribute('data-row-key');
|
||||
const url = rowKey
|
||||
? `${baseUrl}#/purchase/detail?id=${rowKey}`
|
||||
: '';
|
||||
|
||||
return {
|
||||
title,
|
||||
dateStr,
|
||||
url,
|
||||
};
|
||||
}
|
||||
return null;
|
||||
})
|
||||
.filter((i) => i !== null);
|
||||
}, this.baseUrl);
|
||||
|
||||
if (pageResults.length === 0) {
|
||||
logger.warn(`No results found on page ${currentPage}. Extraction failed.`);
|
||||
break;
|
||||
logger.warn(
|
||||
`No results found on page ${currentPage}. Extraction failed.`,
|
||||
);
|
||||
break;
|
||||
}
|
||||
|
||||
allResults.push(...pageResults.map(r => ({
|
||||
title: r!.title,
|
||||
publishDate: new Date(r!.dateStr),
|
||||
url: r!.url.replace(/\/\//g, '/')
|
||||
})));
|
||||
|
||||
allResults.push(
|
||||
...pageResults.map((r) => ({
|
||||
title: r.title,
|
||||
publishDate: new Date(r.dateStr),
|
||||
url: r.url.replace(/\/\//g, '/'),
|
||||
})),
|
||||
);
|
||||
|
||||
logger.log(`Extracted ${pageResults.length} items.`);
|
||||
|
||||
// Pagination: look for the "right" icon SVG
|
||||
@@ -241,34 +285,37 @@ export const ChngCrawler = {
|
||||
|
||||
// 点击下一页前保存当前页面状态
|
||||
const currentUrl = page.url();
|
||||
|
||||
|
||||
await nextButton.click();
|
||||
|
||||
|
||||
// 等待页面导航完成
|
||||
try {
|
||||
await page.waitForFunction(
|
||||
(oldUrl) => window.location.href !== oldUrl,
|
||||
{ timeout: 10000 },
|
||||
currentUrl
|
||||
currentUrl,
|
||||
);
|
||||
} catch (e) {
|
||||
} catch {
|
||||
logger.warn('Navigation timeout, continuing anyway');
|
||||
}
|
||||
|
||||
|
||||
// 等待页面内容加载
|
||||
await new Promise(r => setTimeout(r, 15000));
|
||||
await new Promise((r) => setTimeout(r, 15000));
|
||||
currentPage++;
|
||||
}
|
||||
|
||||
return allResults;
|
||||
|
||||
} catch (error) {
|
||||
logger.error(`Crawl failed: ${error.message}`);
|
||||
const errorMessage =
|
||||
error instanceof Error ? error.message : String(error);
|
||||
logger.error(`Crawl failed: ${errorMessage}`);
|
||||
return allResults;
|
||||
} finally {
|
||||
if (page) await page.close();
|
||||
}
|
||||
},
|
||||
|
||||
extract() { return []; }
|
||||
};
|
||||
extract() {
|
||||
return [];
|
||||
},
|
||||
};
|
||||
|
||||
@@ -2,7 +2,7 @@ import { CnncecpCrawler } from './cnncecp_target';
|
||||
import * as puppeteer from 'puppeteer';
|
||||
|
||||
// Increase timeout to 60 seconds for network operations
|
||||
jest.setTimeout(60000*5);
|
||||
jest.setTimeout(60000 * 5);
|
||||
|
||||
// 获取代理配置
|
||||
const getProxyArgs = (): string[] => {
|
||||
@@ -29,7 +29,7 @@ describe('CnncecpCrawler Real Site Test', () => {
|
||||
if (proxyArgs.length > 0) {
|
||||
console.log('Using proxy:', proxyArgs.join(' '));
|
||||
}
|
||||
|
||||
|
||||
browser = await puppeteer.launch({
|
||||
headless: false, // Change to false to see browser UI
|
||||
args: ['--no-sandbox', '--disable-setuid-sandbox', ...proxyArgs],
|
||||
@@ -45,13 +45,15 @@ describe('CnncecpCrawler Real Site Test', () => {
|
||||
it('should visit website and list all found bid information', async () => {
|
||||
console.log(`\nStarting crawl for: ${CnncecpCrawler.name}`);
|
||||
console.log(`Target URL: ${CnncecpCrawler.url}`);
|
||||
|
||||
|
||||
const results = await CnncecpCrawler.crawl(browser);
|
||||
|
||||
|
||||
console.log(`\nSuccessfully found ${results.length} items:\n`);
|
||||
console.log('----------------------------------------');
|
||||
results.forEach((item, index) => {
|
||||
console.log(`${index + 1}. [${item.publishDate.toLocaleDateString()}] ${item.title}`);
|
||||
console.log(
|
||||
`${index + 1}. [${item.publishDate.toLocaleDateString()}] ${item.title}`,
|
||||
);
|
||||
console.log(` Link: ${item.url}`);
|
||||
console.log('----------------------------------------');
|
||||
});
|
||||
@@ -61,13 +63,15 @@ describe('CnncecpCrawler Real Site Test', () => {
|
||||
expect(Array.isArray(results)).toBeTruthy();
|
||||
// Warn but don't fail if site returns 0 items (could be empty or changed structure)
|
||||
if (results.length === 0) {
|
||||
console.warn('Warning: No items found. Check if website structure has changed or if list is currently empty.');
|
||||
console.warn(
|
||||
'Warning: No items found. Check if website structure has changed or if list is currently empty.',
|
||||
);
|
||||
} else {
|
||||
// Check data integrity of first item
|
||||
const firstItem = results[0];
|
||||
expect(firstItem.title).toBeTruthy();
|
||||
expect(firstItem.url).toMatch(/^https?:\/\//);
|
||||
expect(firstItem.publishDate).toBeInstanceOf(Date);
|
||||
// Check data integrity of first item
|
||||
const firstItem = results[0];
|
||||
expect(firstItem.title).toBeTruthy();
|
||||
expect(firstItem.url).toMatch(/^https?:\/\//);
|
||||
expect(firstItem.publishDate).toBeInstanceOf(Date);
|
||||
}
|
||||
});
|
||||
});
|
||||
|
||||
@@ -11,13 +11,13 @@ async function simulateHumanMouseMovement(page: puppeteer.Page) {
|
||||
for (let i = 0; i < movements; i++) {
|
||||
const x = Math.floor(Math.random() * viewport.width);
|
||||
const y = Math.floor(Math.random() * viewport.height);
|
||||
|
||||
|
||||
await page.mouse.move(x, y, {
|
||||
steps: 10 + Math.floor(Math.random() * 20) // 10-30步,使移动更平滑
|
||||
steps: 10 + Math.floor(Math.random() * 20), // 10-30步,使移动更平滑
|
||||
});
|
||||
|
||||
|
||||
// 随机停顿 100-500ms
|
||||
await new Promise(r => setTimeout(r, 100 + Math.random() * 400));
|
||||
await new Promise((r) => setTimeout(r, 100 + Math.random() * 400));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -27,23 +27,23 @@ async function simulateHumanScrolling(page: puppeteer.Page) {
|
||||
|
||||
for (let i = 0; i < scrollCount; i++) {
|
||||
const scrollDistance = 100 + Math.floor(Math.random() * 400); // 100-500px
|
||||
|
||||
|
||||
await page.evaluate((distance) => {
|
||||
window.scrollBy({
|
||||
top: distance,
|
||||
behavior: 'smooth'
|
||||
behavior: 'smooth',
|
||||
});
|
||||
}, scrollDistance);
|
||||
|
||||
// 随机停顿 500-1500ms
|
||||
await new Promise(r => setTimeout(r, 500 + Math.random() * 1000));
|
||||
await new Promise((r) => setTimeout(r, 500 + Math.random() * 1000));
|
||||
}
|
||||
|
||||
// 滚动回顶部
|
||||
await page.evaluate(() => {
|
||||
window.scrollTo({ top: 0, behavior: 'smooth' });
|
||||
});
|
||||
await new Promise(r => setTimeout(r, 1000));
|
||||
await new Promise((r) => setTimeout(r, 1000));
|
||||
}
|
||||
|
||||
export interface CnncecpResult {
|
||||
@@ -52,12 +52,22 @@ export interface CnncecpResult {
|
||||
url: string;
|
||||
}
|
||||
|
||||
interface CnncecpCrawlerType {
|
||||
name: string;
|
||||
url: string;
|
||||
baseUrl: string;
|
||||
extract(html: string): CnncecpResult[];
|
||||
}
|
||||
|
||||
export const CnncecpCrawler = {
|
||||
name: '中核集团电子采购平台',
|
||||
url: 'https://www.cnncecp.com/xzbgg/index.jhtml',
|
||||
baseUrl: 'https://www.cnncecp.com/',
|
||||
|
||||
async crawl(browser: puppeteer.Browser): Promise<CnncecpResult[]> {
|
||||
async crawl(
|
||||
this: CnncecpCrawlerType,
|
||||
browser: puppeteer.Browser,
|
||||
): Promise<CnncecpResult[]> {
|
||||
const logger = new Logger('CnncecpCrawler');
|
||||
const page = await browser.newPage();
|
||||
|
||||
@@ -69,11 +79,15 @@ export const CnncecpCrawler = {
|
||||
|
||||
await page.evaluateOnNewDocument(() => {
|
||||
Object.defineProperty(navigator, 'webdriver', { get: () => false });
|
||||
Object.defineProperty(navigator, 'language', { get: () => "zh-CN"});
|
||||
Object.defineProperty(navigator, 'plugins', { get: () => [1,2,3,4,5]});
|
||||
Object.defineProperty(navigator, 'language', { get: () => 'zh-CN' });
|
||||
Object.defineProperty(navigator, 'plugins', {
|
||||
get: () => [1, 2, 3, 4, 5],
|
||||
});
|
||||
});
|
||||
|
||||
await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36');
|
||||
await page.setUserAgent(
|
||||
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36',
|
||||
);
|
||||
await page.setViewport({ width: 1920, height: 1080 });
|
||||
|
||||
const allResults: CnncecpResult[] = [];
|
||||
@@ -87,7 +101,7 @@ export const CnncecpCrawler = {
|
||||
// 模拟人类行为
|
||||
logger.log('Simulating human mouse movements...');
|
||||
await simulateHumanMouseMovement(page);
|
||||
|
||||
|
||||
logger.log('Simulating human scrolling...');
|
||||
await simulateHumanScrolling(page);
|
||||
|
||||
@@ -103,12 +117,14 @@ export const CnncecpCrawler = {
|
||||
}
|
||||
|
||||
allResults.push(...pageResults);
|
||||
logger.log(`Extracted ${pageResults.length} items from page ${currentPage}`);
|
||||
logger.log(
|
||||
`Extracted ${pageResults.length} items from page ${currentPage}`,
|
||||
);
|
||||
|
||||
// 模拟人类行为 - 翻页前
|
||||
logger.log('Simulating human mouse movements before pagination...');
|
||||
await simulateHumanMouseMovement(page);
|
||||
|
||||
|
||||
logger.log('Simulating human scrolling before pagination...');
|
||||
await simulateHumanScrolling(page);
|
||||
|
||||
@@ -126,9 +142,13 @@ export const CnncecpCrawler = {
|
||||
try {
|
||||
// 点击下一页按钮
|
||||
await nextButton.click();
|
||||
await new Promise(r => setTimeout(r, 3000)); // 等待页面加载
|
||||
await new Promise((r) => setTimeout(r, 3000)); // 等待页面加载
|
||||
} catch (navError) {
|
||||
logger.error(`Navigation to page ${currentPage + 1} failed: ${navError.message}`);
|
||||
const navErrorMessage =
|
||||
navError instanceof Error ? navError.message : String(navError);
|
||||
logger.error(
|
||||
`Navigation to page ${currentPage + 1} failed: ${navErrorMessage}`,
|
||||
);
|
||||
break;
|
||||
}
|
||||
|
||||
@@ -137,26 +157,27 @@ export const CnncecpCrawler = {
|
||||
// 模拟人类行为 - 翻页后
|
||||
logger.log('Simulating human mouse movements after pagination...');
|
||||
await simulateHumanMouseMovement(page);
|
||||
|
||||
|
||||
logger.log('Simulating human scrolling after pagination...');
|
||||
await simulateHumanScrolling(page);
|
||||
|
||||
// Random delay between pages
|
||||
const delay = Math.floor(Math.random() * (3000 - 1000 + 1)) + 1000;
|
||||
await new Promise(resolve => setTimeout(resolve, delay));
|
||||
await new Promise((resolve) => setTimeout(resolve, delay));
|
||||
}
|
||||
|
||||
return allResults;
|
||||
|
||||
} catch (error) {
|
||||
logger.error(`Failed to crawl ${this.name}: ${error.message}`);
|
||||
const errorMessage =
|
||||
error instanceof Error ? error.message : String(error);
|
||||
logger.error(`Failed to crawl ${this.name}: ${errorMessage}`);
|
||||
return allResults;
|
||||
} finally {
|
||||
await page.close();
|
||||
}
|
||||
},
|
||||
|
||||
extract(html: string): CnncecpResult[] {
|
||||
extract(this: CnncecpCrawlerType, html: string): CnncecpResult[] {
|
||||
const results: CnncecpResult[] = [];
|
||||
/**
|
||||
* Regex groups for cnncecp.com:
|
||||
@@ -172,24 +193,25 @@ export const CnncecpCrawler = {
|
||||
* <a href="https://www.cnncecp.com/xzbgg/1862778.jhtml">中核四0四有限公司2026-2028年度质量流量控制器等采购项目(二次)变更公告</a>
|
||||
* </li>
|
||||
*/
|
||||
const regex = /<li>[\s\S]*?<span class="Right Gray">\s*(\d{4}-\d{2}-\d{2})\s*<\/span>[\s\S]*?<a[^>]*href="([^"]*)"[^>]*>([^<]*)<\/a>[\s\S]*?<\/li>/gs;
|
||||
const regex =
|
||||
/<li>[\s\S]*?<span class="Right Gray">\s*(\d{4}-\d{2}-\d{2})\s*<\/span>[\s\S]*?<a[^>]*href="([^"]*)"[^>]*>([^<]*)<\/a>[\s\S]*?<\/li>/gs;
|
||||
|
||||
let match;
|
||||
let match: RegExpExecArray | null;
|
||||
while ((match = regex.exec(html)) !== null) {
|
||||
const dateStr = match[1]?.trim();
|
||||
const url = match[2]?.trim();
|
||||
const title = match[3]?.trim();
|
||||
const dateStr = match[1]?.trim() ?? '';
|
||||
const url = match[2]?.trim() ?? '';
|
||||
const title = match[3]?.trim() ?? '';
|
||||
|
||||
if (title && url) {
|
||||
const fullUrl = url.startsWith('http') ? url : this.baseUrl + url;
|
||||
results.push({
|
||||
title,
|
||||
publishDate: dateStr ? new Date(dateStr) : new Date(),
|
||||
url: fullUrl.replace(/\/\//g, '/')
|
||||
url: fullUrl.replace(/\/\//g, '/'),
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
return results;
|
||||
}
|
||||
},
|
||||
};
|
||||
|
||||
@@ -2,7 +2,7 @@ import { CnoocCrawler } from './cnooc_target';
|
||||
import * as puppeteer from 'puppeteer';
|
||||
|
||||
// Increase timeout to 60 seconds for network operations
|
||||
jest.setTimeout(60000*5);
|
||||
jest.setTimeout(60000 * 5);
|
||||
|
||||
// 获取代理配置
|
||||
const getProxyArgs = (): string[] => {
|
||||
@@ -29,7 +29,7 @@ describe('CnoocCrawler Real Site Test', () => {
|
||||
if (proxyArgs.length > 0) {
|
||||
console.log('Using proxy:', proxyArgs.join(' '));
|
||||
}
|
||||
|
||||
|
||||
browser = await puppeteer.launch({
|
||||
headless: false, // Change to false to see browser UI
|
||||
args: ['--no-sandbox', '--disable-setuid-sandbox', ...proxyArgs],
|
||||
@@ -45,13 +45,15 @@ describe('CnoocCrawler Real Site Test', () => {
|
||||
it('should visit website and list all found bid information', async () => {
|
||||
console.log(`\nStarting crawl for: ${CnoocCrawler.name}`);
|
||||
console.log(`Target URL: ${CnoocCrawler.url}`);
|
||||
|
||||
|
||||
const results = await CnoocCrawler.crawl(browser);
|
||||
|
||||
|
||||
console.log(`\nSuccessfully found ${results.length} items:\n`);
|
||||
console.log('----------------------------------------');
|
||||
results.forEach((item, index) => {
|
||||
console.log(`${index + 1}. [${item.publishDate.toLocaleDateString()}] ${item.title}`);
|
||||
console.log(
|
||||
`${index + 1}. [${item.publishDate.toLocaleDateString()}] ${item.title}`,
|
||||
);
|
||||
console.log(` Link: ${item.url}`);
|
||||
console.log('----------------------------------------');
|
||||
});
|
||||
@@ -61,13 +63,15 @@ describe('CnoocCrawler Real Site Test', () => {
|
||||
expect(Array.isArray(results)).toBeTruthy();
|
||||
// Warn but don't fail if site returns 0 items (could be empty or changed structure)
|
||||
if (results.length === 0) {
|
||||
console.warn('Warning: No items found. Check if website structure has changed or if list is currently empty.');
|
||||
console.warn(
|
||||
'Warning: No items found. Check if website structure has changed or if list is currently empty.',
|
||||
);
|
||||
} else {
|
||||
// Check data integrity of first item
|
||||
const firstItem = results[0];
|
||||
expect(firstItem.title).toBeTruthy();
|
||||
expect(firstItem.url).toMatch(/^https?:\/\//);
|
||||
expect(firstItem.publishDate).toBeInstanceOf(Date);
|
||||
// Check data integrity of first item
|
||||
const firstItem = results[0];
|
||||
expect(firstItem.title).toBeTruthy();
|
||||
expect(firstItem.url).toMatch(/^https?:\/\//);
|
||||
expect(firstItem.publishDate).toBeInstanceOf(Date);
|
||||
}
|
||||
});
|
||||
});
|
||||
|
||||
@@ -11,13 +11,13 @@ async function simulateHumanMouseMovement(page: puppeteer.Page) {
|
||||
for (let i = 0; i < movements; i++) {
|
||||
const x = Math.floor(Math.random() * viewport.width);
|
||||
const y = Math.floor(Math.random() * viewport.height);
|
||||
|
||||
|
||||
await page.mouse.move(x, y, {
|
||||
steps: 10 + Math.floor(Math.random() * 20) // 10-30步,使移动更平滑
|
||||
steps: 10 + Math.floor(Math.random() * 20), // 10-30步,使移动更平滑
|
||||
});
|
||||
|
||||
|
||||
// 随机停顿 100-500ms
|
||||
await new Promise(r => setTimeout(r, 100 + Math.random() * 400));
|
||||
await new Promise((r) => setTimeout(r, 100 + Math.random() * 400));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -27,23 +27,23 @@ async function simulateHumanScrolling(page: puppeteer.Page) {
|
||||
|
||||
for (let i = 0; i < scrollCount; i++) {
|
||||
const scrollDistance = 100 + Math.floor(Math.random() * 400); // 100-500px
|
||||
|
||||
|
||||
await page.evaluate((distance) => {
|
||||
window.scrollBy({
|
||||
top: distance,
|
||||
behavior: 'smooth'
|
||||
behavior: 'smooth',
|
||||
});
|
||||
}, scrollDistance);
|
||||
|
||||
// 随机停顿 500-1500ms
|
||||
await new Promise(r => setTimeout(r, 500 + Math.random() * 1000));
|
||||
await new Promise((r) => setTimeout(r, 500 + Math.random() * 1000));
|
||||
}
|
||||
|
||||
// 滚动回顶部
|
||||
await page.evaluate(() => {
|
||||
window.scrollTo({ top: 0, behavior: 'smooth' });
|
||||
});
|
||||
await new Promise(r => setTimeout(r, 1000));
|
||||
await new Promise((r) => setTimeout(r, 1000));
|
||||
}
|
||||
|
||||
export interface CnoocResult {
|
||||
@@ -52,12 +52,22 @@ export interface CnoocResult {
|
||||
url: string;
|
||||
}
|
||||
|
||||
interface CnoocCrawlerType {
|
||||
name: string;
|
||||
url: string;
|
||||
baseUrl: string;
|
||||
extract(html: string): CnoocResult[];
|
||||
}
|
||||
|
||||
export const CnoocCrawler = {
|
||||
name: '中海油招标平台',
|
||||
url: 'https://buy.cnooc.com.cn/cbjyweb/001/001001/moreinfo.html',
|
||||
baseUrl: 'https://buy.cnooc.com.cn/',
|
||||
|
||||
async crawl(browser: puppeteer.Browser): Promise<CnoocResult[]> {
|
||||
async crawl(
|
||||
this: CnoocCrawlerType,
|
||||
browser: puppeteer.Browser,
|
||||
): Promise<CnoocResult[]> {
|
||||
const logger = new Logger('CnoocCrawler');
|
||||
const page = await browser.newPage();
|
||||
|
||||
@@ -69,11 +79,15 @@ export const CnoocCrawler = {
|
||||
|
||||
await page.evaluateOnNewDocument(() => {
|
||||
Object.defineProperty(navigator, 'webdriver', { get: () => false });
|
||||
Object.defineProperty(navigator, 'language', { get: () => "zh-CN"});
|
||||
Object.defineProperty(navigator, 'plugins', { get: () => [1,2,3,4,5]});
|
||||
Object.defineProperty(navigator, 'language', { get: () => 'zh-CN' });
|
||||
Object.defineProperty(navigator, 'plugins', {
|
||||
get: () => [1, 2, 3, 4, 5],
|
||||
});
|
||||
});
|
||||
|
||||
await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36');
|
||||
await page.setUserAgent(
|
||||
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36',
|
||||
);
|
||||
await page.setViewport({ width: 1920, height: 1080 });
|
||||
|
||||
const allResults: CnoocResult[] = [];
|
||||
@@ -87,7 +101,7 @@ export const CnoocCrawler = {
|
||||
// 模拟人类行为
|
||||
logger.log('Simulating human mouse movements...');
|
||||
await simulateHumanMouseMovement(page);
|
||||
|
||||
|
||||
logger.log('Simulating human scrolling...');
|
||||
await simulateHumanScrolling(page);
|
||||
|
||||
@@ -103,12 +117,14 @@ export const CnoocCrawler = {
|
||||
}
|
||||
|
||||
allResults.push(...pageResults);
|
||||
logger.log(`Extracted ${pageResults.length} items from page ${currentPage}`);
|
||||
logger.log(
|
||||
`Extracted ${pageResults.length} items from page ${currentPage}`,
|
||||
);
|
||||
|
||||
// 模拟人类行为 - 翻页前
|
||||
logger.log('Simulating human mouse movements before pagination...');
|
||||
await simulateHumanMouseMovement(page);
|
||||
|
||||
|
||||
logger.log('Simulating human scrolling before pagination...');
|
||||
await simulateHumanScrolling(page);
|
||||
|
||||
@@ -127,9 +143,13 @@ export const CnoocCrawler = {
|
||||
try {
|
||||
// 点击下一页按钮
|
||||
await nextButton.click();
|
||||
await new Promise(r => setTimeout(r, 3000)); // 等待页面加载
|
||||
await new Promise((r) => setTimeout(r, 3000)); // 等待页面加载
|
||||
} catch (navError) {
|
||||
logger.error(`Navigation to page ${currentPage + 1} failed: ${navError.message}`);
|
||||
const navErrorMessage =
|
||||
navError instanceof Error ? navError.message : String(navError);
|
||||
logger.error(
|
||||
`Navigation to page ${currentPage + 1} failed: ${navErrorMessage}`,
|
||||
);
|
||||
break;
|
||||
}
|
||||
|
||||
@@ -138,26 +158,27 @@ export const CnoocCrawler = {
|
||||
// 模拟人类行为 - 翻页后
|
||||
logger.log('Simulating human mouse movements after pagination...');
|
||||
await simulateHumanMouseMovement(page);
|
||||
|
||||
|
||||
logger.log('Simulating human scrolling after pagination...');
|
||||
await simulateHumanScrolling(page);
|
||||
|
||||
// Random delay between pages
|
||||
const delay = Math.floor(Math.random() * (3000 - 1000 + 1)) + 1000;
|
||||
await new Promise(resolve => setTimeout(resolve, delay));
|
||||
await new Promise((resolve) => setTimeout(resolve, delay));
|
||||
}
|
||||
|
||||
return allResults;
|
||||
|
||||
} catch (error) {
|
||||
logger.error(`Failed to crawl ${this.name}: ${error.message}`);
|
||||
const errorMessage =
|
||||
error instanceof Error ? error.message : String(error);
|
||||
logger.error(`Failed to crawl ${this.name}: ${errorMessage}`);
|
||||
return allResults;
|
||||
} finally {
|
||||
await page.close();
|
||||
}
|
||||
},
|
||||
|
||||
extract(html: string): CnoocResult[] {
|
||||
extract(this: CnoocCrawlerType, html: string): CnoocResult[] {
|
||||
const results: CnoocResult[] = [];
|
||||
/**
|
||||
* Regex groups for buy.cnooc.com.cn:
|
||||
@@ -173,24 +194,25 @@ export const CnoocCrawler = {
|
||||
* <span class="now-span" style="width:100px">2026-01-12</span>
|
||||
* </li>
|
||||
*/
|
||||
const regex = /<li class="now-hd-items clearfix">[\s\S]*?<a[^>]*href="([^"]*)"[^>]*>[\s\S]*?<font[^>]*>([^<]*)<\/font>[\s\S]*?<span class="now-span"[^>]*>\s*(\d{4}-\d{2}-\d{2})\s*<\/span>[\s\S]*?<\/li>/gs;
|
||||
const regex =
|
||||
/<li class="now-hd-items clearfix">[\s\S]*?<a[^>]*href="([^"]*)"[^>]*>[\s\S]*?<font[^>]*>([^<]*)<\/font>[\s\S]*?<span class="now-span"[^>]*>\s*(\d{4}-\d{2}-\d{2})\s*<\/span>[\s\S]*?<\/li>/gs;
|
||||
|
||||
let match;
|
||||
let match: RegExpExecArray | null;
|
||||
while ((match = regex.exec(html)) !== null) {
|
||||
const url = match[1]?.trim();
|
||||
const title = match[2]?.trim();
|
||||
const dateStr = match[3]?.trim();
|
||||
const url = match[1]?.trim() ?? '';
|
||||
const title = match[2]?.trim() ?? '';
|
||||
const dateStr = match[3]?.trim() ?? '';
|
||||
|
||||
if (title && url) {
|
||||
const fullUrl = url.startsWith('http') ? url : this.baseUrl + url;
|
||||
results.push({
|
||||
title,
|
||||
publishDate: dateStr ? new Date(dateStr) : new Date(),
|
||||
url: fullUrl.replace(/\/\//g, '/')
|
||||
url: fullUrl.replace(/\/\//g, '/'),
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
return results;
|
||||
}
|
||||
},
|
||||
};
|
||||
|
||||
@@ -2,7 +2,7 @@ import { EpsCrawler } from './eps_target';
|
||||
import * as puppeteer from 'puppeteer';
|
||||
|
||||
// Increase timeout to 60 seconds for network operations
|
||||
jest.setTimeout(60000*5);
|
||||
jest.setTimeout(60000 * 5);
|
||||
|
||||
// 获取代理配置
|
||||
const getProxyArgs = (): string[] => {
|
||||
@@ -29,7 +29,7 @@ describe('EpsCrawler Real Site Test', () => {
|
||||
if (proxyArgs.length > 0) {
|
||||
console.log('Using proxy:', proxyArgs.join(' '));
|
||||
}
|
||||
|
||||
|
||||
browser = await puppeteer.launch({
|
||||
headless: false, // Change to false to see browser UI
|
||||
args: ['--no-sandbox', '--disable-setuid-sandbox', ...proxyArgs],
|
||||
@@ -45,13 +45,15 @@ describe('EpsCrawler Real Site Test', () => {
|
||||
it('should visit website and list all found bid information', async () => {
|
||||
console.log(`\nStarting crawl for: ${EpsCrawler.name}`);
|
||||
console.log(`Target URL: ${EpsCrawler.url}`);
|
||||
|
||||
|
||||
const results = await EpsCrawler.crawl(browser);
|
||||
|
||||
|
||||
console.log(`\nSuccessfully found ${results.length} items:\n`);
|
||||
console.log('----------------------------------------');
|
||||
results.forEach((item, index) => {
|
||||
console.log(`${index + 1}. [${item.publishDate.toLocaleDateString()}] ${item.title}`);
|
||||
console.log(
|
||||
`${index + 1}. [${item.publishDate.toLocaleDateString()}] ${item.title}`,
|
||||
);
|
||||
console.log(` Link: ${item.url}`);
|
||||
console.log('----------------------------------------');
|
||||
});
|
||||
@@ -61,13 +63,15 @@ describe('EpsCrawler Real Site Test', () => {
|
||||
expect(Array.isArray(results)).toBeTruthy();
|
||||
// Warn but don't fail if site returns 0 items (could be empty or changed structure)
|
||||
if (results.length === 0) {
|
||||
console.warn('Warning: No items found. Check if website structure has changed or if list is currently empty.');
|
||||
console.warn(
|
||||
'Warning: No items found. Check if website structure has changed or if list is currently empty.',
|
||||
);
|
||||
} else {
|
||||
// Check data integrity of first item
|
||||
const firstItem = results[0];
|
||||
expect(firstItem.title).toBeTruthy();
|
||||
expect(firstItem.url).toMatch(/^https?:\/\//);
|
||||
expect(firstItem.publishDate).toBeInstanceOf(Date);
|
||||
// Check data integrity of first item
|
||||
const firstItem = results[0];
|
||||
expect(firstItem.title).toBeTruthy();
|
||||
expect(firstItem.url).toMatch(/^https?:\/\//);
|
||||
expect(firstItem.publishDate).toBeInstanceOf(Date);
|
||||
}
|
||||
});
|
||||
});
|
||||
|
||||
@@ -11,13 +11,13 @@ async function simulateHumanMouseMovement(page: puppeteer.Page) {
|
||||
for (let i = 0; i < movements; i++) {
|
||||
const x = Math.floor(Math.random() * viewport.width);
|
||||
const y = Math.floor(Math.random() * viewport.height);
|
||||
|
||||
|
||||
await page.mouse.move(x, y, {
|
||||
steps: 10 + Math.floor(Math.random() * 20) // 10-30步,使移动更平滑
|
||||
steps: 10 + Math.floor(Math.random() * 20), // 10-30步,使移动更平滑
|
||||
});
|
||||
|
||||
|
||||
// 随机停顿 100-500ms
|
||||
await new Promise(r => setTimeout(r, 100 + Math.random() * 400));
|
||||
await new Promise((r) => setTimeout(r, 100 + Math.random() * 400));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -27,23 +27,23 @@ async function simulateHumanScrolling(page: puppeteer.Page) {
|
||||
|
||||
for (let i = 0; i < scrollCount; i++) {
|
||||
const scrollDistance = 100 + Math.floor(Math.random() * 400); // 100-500px
|
||||
|
||||
|
||||
await page.evaluate((distance) => {
|
||||
window.scrollBy({
|
||||
top: distance,
|
||||
behavior: 'smooth'
|
||||
behavior: 'smooth',
|
||||
});
|
||||
}, scrollDistance);
|
||||
|
||||
// 随机停顿 500-1500ms
|
||||
await new Promise(r => setTimeout(r, 500 + Math.random() * 1000));
|
||||
await new Promise((r) => setTimeout(r, 500 + Math.random() * 1000));
|
||||
}
|
||||
|
||||
// 滚动回顶部
|
||||
await page.evaluate(() => {
|
||||
window.scrollTo({ top: 0, behavior: 'smooth' });
|
||||
});
|
||||
await new Promise(r => setTimeout(r, 1000));
|
||||
await new Promise((r) => setTimeout(r, 1000));
|
||||
}
|
||||
|
||||
export interface EpsResult {
|
||||
@@ -52,12 +52,22 @@ export interface EpsResult {
|
||||
url: string;
|
||||
}
|
||||
|
||||
interface EpsCrawlerType {
|
||||
name: string;
|
||||
url: string;
|
||||
baseUrl: string;
|
||||
extract(html: string): EpsResult[];
|
||||
}
|
||||
|
||||
export const EpsCrawler = {
|
||||
name: '中国三峡集团电子商务平台',
|
||||
url: 'https://eps.ctg.com.cn/cms/channel/1ywgg1/index.htm',
|
||||
baseUrl: 'https://eps.ctg.com.cn/',
|
||||
|
||||
async crawl(browser: puppeteer.Browser): Promise<EpsResult[]> {
|
||||
async crawl(
|
||||
this: EpsCrawlerType,
|
||||
browser: puppeteer.Browser,
|
||||
): Promise<EpsResult[]> {
|
||||
const logger = new Logger('EpsCrawler');
|
||||
const page = await browser.newPage();
|
||||
|
||||
@@ -69,11 +79,15 @@ export const EpsCrawler = {
|
||||
|
||||
await page.evaluateOnNewDocument(() => {
|
||||
Object.defineProperty(navigator, 'webdriver', { get: () => false });
|
||||
Object.defineProperty(navigator, 'language', { get: () => "zh-CN"});
|
||||
Object.defineProperty(navigator, 'plugins', { get: () => [1,2,3,4,5]});
|
||||
Object.defineProperty(navigator, 'language', { get: () => 'zh-CN' });
|
||||
Object.defineProperty(navigator, 'plugins', {
|
||||
get: () => [1, 2, 3, 4, 5],
|
||||
});
|
||||
});
|
||||
|
||||
await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36');
|
||||
await page.setUserAgent(
|
||||
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36',
|
||||
);
|
||||
await page.setViewport({ width: 1920, height: 1080 });
|
||||
|
||||
const allResults: EpsResult[] = [];
|
||||
@@ -87,7 +101,7 @@ export const EpsCrawler = {
|
||||
// 模拟人类行为
|
||||
logger.log('Simulating human mouse movements...');
|
||||
await simulateHumanMouseMovement(page);
|
||||
|
||||
|
||||
logger.log('Simulating human scrolling...');
|
||||
await simulateHumanScrolling(page);
|
||||
|
||||
@@ -103,12 +117,14 @@ export const EpsCrawler = {
|
||||
}
|
||||
|
||||
allResults.push(...pageResults);
|
||||
logger.log(`Extracted ${pageResults.length} items from page ${currentPage}`);
|
||||
logger.log(
|
||||
`Extracted ${pageResults.length} items from page ${currentPage}`,
|
||||
);
|
||||
|
||||
// 模拟人类行为 - 翻页前
|
||||
logger.log('Simulating human mouse movements before pagination...');
|
||||
await simulateHumanMouseMovement(page);
|
||||
|
||||
|
||||
logger.log('Simulating human scrolling before pagination...');
|
||||
await simulateHumanScrolling(page);
|
||||
|
||||
@@ -127,9 +143,13 @@ export const EpsCrawler = {
|
||||
try {
|
||||
// 点击下一页按钮,等待页面更新
|
||||
await nextButton.click();
|
||||
await new Promise(r => setTimeout(r, 3000)); // 等待页面加载
|
||||
await new Promise((r) => setTimeout(r, 3000)); // 等待页面加载
|
||||
} catch (navError) {
|
||||
logger.error(`Navigation to page ${currentPage + 1} failed: ${navError.message}`);
|
||||
const navErrorMessage =
|
||||
navError instanceof Error ? navError.message : String(navError);
|
||||
logger.error(
|
||||
`Navigation to page ${currentPage + 1} failed: ${navErrorMessage}`,
|
||||
);
|
||||
break;
|
||||
}
|
||||
|
||||
@@ -138,26 +158,27 @@ export const EpsCrawler = {
|
||||
// 模拟人类行为 - 翻页后
|
||||
logger.log('Simulating human mouse movements after pagination...');
|
||||
await simulateHumanMouseMovement(page);
|
||||
|
||||
|
||||
logger.log('Simulating human scrolling after pagination...');
|
||||
await simulateHumanScrolling(page);
|
||||
|
||||
// Random delay between pages
|
||||
const delay = Math.floor(Math.random() * (3000 - 1000 + 1)) + 1000;
|
||||
await new Promise(resolve => setTimeout(resolve, delay));
|
||||
await new Promise((resolve) => setTimeout(resolve, delay));
|
||||
}
|
||||
|
||||
return allResults;
|
||||
|
||||
} catch (error) {
|
||||
logger.error(`Failed to crawl ${this.name}: ${error.message}`);
|
||||
const errorMessage =
|
||||
error instanceof Error ? error.message : String(error);
|
||||
logger.error(`Failed to crawl ${this.name}: ${errorMessage}`);
|
||||
return allResults;
|
||||
} finally {
|
||||
await page.close();
|
||||
}
|
||||
},
|
||||
|
||||
extract(html: string): EpsResult[] {
|
||||
extract(this: EpsCrawlerType, html: string): EpsResult[] {
|
||||
const results: EpsResult[] = [];
|
||||
/**
|
||||
* Regex groups for eps.ctg.com.cn:
|
||||
@@ -179,24 +200,25 @@ export const EpsCrawler = {
|
||||
* </a>
|
||||
* </li>
|
||||
*/
|
||||
const regex = /<li[^>]*name="li_name"[^>]*>[\s\S]*?<a[^>]*href="([^"]*)"[^>]*title="([^"]*)"[^>]*>[\s\S]*?<em>\s*(\d{4}-\d{2}-\d{2})\s*<\/em>[\s\S]*?<\/a>[\s\S]*?<\/li>/gs;
|
||||
const regex =
|
||||
/<li[^>]*name="li_name"[^>]*>[\s\S]*?<a[^>]*href="([^"]*)"[^>]*title="([^"]*)"[^>]*>[\s\S]*?<em>\s*(\d{4}-\d{2}-\d{2})\s*<\/em>[\s\S]*?<\/a>[\s\S]*?<\/li>/gs;
|
||||
|
||||
let match;
|
||||
let match: RegExpExecArray | null;
|
||||
while ((match = regex.exec(html)) !== null) {
|
||||
const url = match[1]?.trim();
|
||||
const title = match[2]?.trim();
|
||||
const dateStr = match[3]?.trim();
|
||||
const url = match[1]?.trim() ?? '';
|
||||
const title = match[2]?.trim() ?? '';
|
||||
const dateStr = match[3]?.trim() ?? '';
|
||||
|
||||
if (title && url) {
|
||||
const fullUrl = url.startsWith('http') ? url : this.baseUrl + url;
|
||||
results.push({
|
||||
title,
|
||||
publishDate: dateStr ? new Date(dateStr) : new Date(),
|
||||
url: fullUrl.replace(/\/\//g, '/')
|
||||
url: fullUrl.replace(/\/\//g, '/'),
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
return results;
|
||||
}
|
||||
},
|
||||
};
|
||||
|
||||
@@ -2,7 +2,7 @@ import { EspicCrawler } from './espic_target';
|
||||
import * as puppeteer from 'puppeteer';
|
||||
|
||||
// Increase timeout to 60 seconds for network operations
|
||||
jest.setTimeout(60000*5);
|
||||
jest.setTimeout(60000 * 5);
|
||||
|
||||
// 获取代理配置
|
||||
const getProxyArgs = (): string[] => {
|
||||
@@ -29,7 +29,7 @@ describe('EspicCrawler Real Site Test', () => {
|
||||
if (proxyArgs.length > 0) {
|
||||
console.log('Using proxy:', proxyArgs.join(' '));
|
||||
}
|
||||
|
||||
|
||||
browser = await puppeteer.launch({
|
||||
headless: false, // Change to false to see browser UI
|
||||
args: ['--no-sandbox', '--disable-setuid-sandbox', ...proxyArgs],
|
||||
@@ -45,13 +45,15 @@ describe('EspicCrawler Real Site Test', () => {
|
||||
it('should visit website and list all found bid information', async () => {
|
||||
console.log(`\nStarting crawl for: ${EspicCrawler.name}`);
|
||||
console.log(`Target URL: ${EspicCrawler.getUrl()}`);
|
||||
|
||||
|
||||
const results = await EspicCrawler.crawl(browser);
|
||||
|
||||
|
||||
console.log(`\nSuccessfully found ${results.length} items:\n`);
|
||||
console.log('----------------------------------------');
|
||||
results.forEach((item, index) => {
|
||||
console.log(`${index + 1}. [${item.publishDate.toLocaleDateString()}] ${item.title}`);
|
||||
console.log(
|
||||
`${index + 1}. [${item.publishDate.toLocaleDateString()}] ${item.title}`,
|
||||
);
|
||||
console.log(` Link: ${item.url}`);
|
||||
console.log('----------------------------------------');
|
||||
});
|
||||
@@ -61,13 +63,15 @@ describe('EspicCrawler Real Site Test', () => {
|
||||
expect(Array.isArray(results)).toBeTruthy();
|
||||
// Warn but don't fail if site returns 0 items (could be empty or changed structure)
|
||||
if (results.length === 0) {
|
||||
console.warn('Warning: No items found. Check if website structure has changed or if list is currently empty.');
|
||||
console.warn(
|
||||
'Warning: No items found. Check if website structure has changed or if list is currently empty.',
|
||||
);
|
||||
} else {
|
||||
// Check data integrity of first item
|
||||
const firstItem = results[0];
|
||||
expect(firstItem.title).toBeTruthy();
|
||||
expect(firstItem.url).toMatch(/^https?:\/\//);
|
||||
expect(firstItem.publishDate).toBeInstanceOf(Date);
|
||||
// Check data integrity of first item
|
||||
const firstItem = results[0];
|
||||
expect(firstItem.title).toBeTruthy();
|
||||
expect(firstItem.url).toMatch(/^https?:\/\//);
|
||||
expect(firstItem.publishDate).toBeInstanceOf(Date);
|
||||
}
|
||||
});
|
||||
});
|
||||
|
||||
@@ -11,13 +11,13 @@ async function simulateHumanMouseMovement(page: puppeteer.Page) {
|
||||
for (let i = 0; i < movements; i++) {
|
||||
const x = Math.floor(Math.random() * viewport.width);
|
||||
const y = Math.floor(Math.random() * viewport.height);
|
||||
|
||||
|
||||
await page.mouse.move(x, y, {
|
||||
steps: 10 + Math.floor(Math.random() * 20) // 10-30步,使移动更平滑
|
||||
steps: 10 + Math.floor(Math.random() * 20), // 10-30步,使移动更平滑
|
||||
});
|
||||
|
||||
|
||||
// 随机停顿 100-500ms
|
||||
await new Promise(r => setTimeout(r, 100 + Math.random() * 400));
|
||||
await new Promise((r) => setTimeout(r, 100 + Math.random() * 400));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -27,23 +27,23 @@ async function simulateHumanScrolling(page: puppeteer.Page) {
|
||||
|
||||
for (let i = 0; i < scrollCount; i++) {
|
||||
const scrollDistance = 100 + Math.floor(Math.random() * 400); // 100-500px
|
||||
|
||||
|
||||
await page.evaluate((distance) => {
|
||||
window.scrollBy({
|
||||
top: distance,
|
||||
behavior: 'smooth'
|
||||
behavior: 'smooth',
|
||||
});
|
||||
}, scrollDistance);
|
||||
|
||||
// 随机停顿 500-1500ms
|
||||
await new Promise(r => setTimeout(r, 500 + Math.random() * 1000));
|
||||
await new Promise((r) => setTimeout(r, 500 + Math.random() * 1000));
|
||||
}
|
||||
|
||||
// 滚动回顶部
|
||||
await page.evaluate(() => {
|
||||
window.scrollTo({ top: 0, behavior: 'smooth' });
|
||||
});
|
||||
await new Promise(r => setTimeout(r, 1000));
|
||||
await new Promise((r) => setTimeout(r, 1000));
|
||||
}
|
||||
|
||||
export interface EspicResult {
|
||||
@@ -52,12 +52,19 @@ export interface EspicResult {
|
||||
url: string;
|
||||
}
|
||||
|
||||
interface EspicCrawlerType {
|
||||
name: string;
|
||||
baseUrl: string;
|
||||
getUrl(page?: number): string;
|
||||
extract(html: string): EspicResult[];
|
||||
}
|
||||
|
||||
export const EspicCrawler = {
|
||||
name: '电能e招采平台(国电投)',
|
||||
baseUrl: 'https://ebid.espic.com.cn/',
|
||||
|
||||
// 生成动态 URL,使用当前日期
|
||||
getUrl(page: number = 1): string {
|
||||
getUrl(this: EspicCrawlerType, page: number = 1): string {
|
||||
const now = new Date();
|
||||
const year = now.getFullYear();
|
||||
const month = now.getMonth() + 1; // 月份从0开始
|
||||
@@ -66,7 +73,10 @@ export const EspicCrawler = {
|
||||
return `https://ebid.espic.com.cn/newgdtcms//category/iframe.html?dates=300&categoryId=2&tenderMethod=01&tabName=&page=${page}&time=${timeStr}`;
|
||||
},
|
||||
|
||||
async crawl(browser: puppeteer.Browser): Promise<EspicResult[]> {
|
||||
async crawl(
|
||||
this: EspicCrawlerType,
|
||||
browser: puppeteer.Browser,
|
||||
): Promise<EspicResult[]> {
|
||||
const logger = new Logger('EspicCrawler');
|
||||
const page = await browser.newPage();
|
||||
|
||||
@@ -78,11 +88,15 @@ export const EspicCrawler = {
|
||||
|
||||
await page.evaluateOnNewDocument(() => {
|
||||
Object.defineProperty(navigator, 'webdriver', { get: () => false });
|
||||
Object.defineProperty(navigator, 'language', { get: () => "zh-CN"});
|
||||
Object.defineProperty(navigator, 'plugins', { get: () => [1,2,3,4,5]});
|
||||
Object.defineProperty(navigator, 'language', { get: () => 'zh-CN' });
|
||||
Object.defineProperty(navigator, 'plugins', {
|
||||
get: () => [1, 2, 3, 4, 5],
|
||||
});
|
||||
});
|
||||
|
||||
await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36');
|
||||
await page.setUserAgent(
|
||||
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36',
|
||||
);
|
||||
await page.setViewport({ width: 1920, height: 1080 });
|
||||
|
||||
const allResults: EspicResult[] = [];
|
||||
@@ -100,15 +114,18 @@ export const EspicCrawler = {
|
||||
() => {
|
||||
// 检查是否已经通过验证(页面不再是 WAF 页面)
|
||||
const bodyText = document.body?.textContent || '';
|
||||
return !bodyText.includes('人机识别检测') && !bodyText.includes('WEB 应用防火墙');
|
||||
return (
|
||||
!bodyText.includes('人机识别检测') &&
|
||||
!bodyText.includes('WEB 应用防火墙')
|
||||
);
|
||||
},
|
||||
{ timeout: 30000 }
|
||||
{ timeout: 30000 },
|
||||
);
|
||||
|
||||
// 模拟人类行为
|
||||
logger.log('Simulating human mouse movements...');
|
||||
await simulateHumanMouseMovement(page);
|
||||
|
||||
|
||||
logger.log('Simulating human scrolling...');
|
||||
await simulateHumanScrolling(page);
|
||||
|
||||
@@ -124,12 +141,14 @@ export const EspicCrawler = {
|
||||
}
|
||||
|
||||
allResults.push(...pageResults);
|
||||
logger.log(`Extracted ${pageResults.length} items from page ${currentPage}`);
|
||||
logger.log(
|
||||
`Extracted ${pageResults.length} items from page ${currentPage}`,
|
||||
);
|
||||
|
||||
// 模拟人类行为 - 翻页前
|
||||
logger.log('Simulating human mouse movements before pagination...');
|
||||
await simulateHumanMouseMovement(page);
|
||||
|
||||
|
||||
logger.log('Simulating human scrolling before pagination...');
|
||||
await simulateHumanScrolling(page);
|
||||
|
||||
@@ -141,7 +160,7 @@ export const EspicCrawler = {
|
||||
'a[aria-label="Next"]',
|
||||
'a.next',
|
||||
'li.next a',
|
||||
'a.layui-laypage-next:not(.layui-disabled)'
|
||||
'a.layui-laypage-next:not(.layui-disabled)',
|
||||
];
|
||||
|
||||
let nextButton: puppeteer.ElementHandle<Element> | null = null;
|
||||
@@ -149,7 +168,7 @@ export const EspicCrawler = {
|
||||
try {
|
||||
nextButton = await page.$(selector);
|
||||
if (nextButton) break;
|
||||
} catch (e) {
|
||||
} catch {
|
||||
// 继续尝试下一个选择器
|
||||
}
|
||||
}
|
||||
@@ -164,9 +183,13 @@ export const EspicCrawler = {
|
||||
try {
|
||||
// 点击下一页按钮
|
||||
await nextButton.click();
|
||||
await new Promise(r => setTimeout(r, 3000)); // 等待页面加载
|
||||
await new Promise((r) => setTimeout(r, 3000)); // 等待页面加载
|
||||
} catch (navError) {
|
||||
logger.error(`Navigation to page ${currentPage + 1} failed: ${navError.message}`);
|
||||
const navErrorMessage =
|
||||
navError instanceof Error ? navError.message : String(navError);
|
||||
logger.error(
|
||||
`Navigation to page ${currentPage + 1} failed: ${navErrorMessage}`,
|
||||
);
|
||||
break;
|
||||
}
|
||||
|
||||
@@ -175,26 +198,27 @@ export const EspicCrawler = {
|
||||
// 模拟人类行为 - 翻页后
|
||||
logger.log('Simulating human mouse movements after pagination...');
|
||||
await simulateHumanMouseMovement(page);
|
||||
|
||||
|
||||
logger.log('Simulating human scrolling after pagination...');
|
||||
await simulateHumanScrolling(page);
|
||||
|
||||
// Random delay between pages
|
||||
const delay = Math.floor(Math.random() * (3000 - 1000 + 1)) + 1000;
|
||||
await new Promise(resolve => setTimeout(resolve, delay));
|
||||
await new Promise((resolve) => setTimeout(resolve, delay));
|
||||
}
|
||||
|
||||
return allResults;
|
||||
|
||||
} catch (error) {
|
||||
logger.error(`Failed to crawl ${this.name}: ${error.message}`);
|
||||
const errorMessage =
|
||||
error instanceof Error ? error.message : String(error);
|
||||
logger.error(`Failed to crawl ${this.name}: ${errorMessage}`);
|
||||
return allResults;
|
||||
} finally {
|
||||
await page.close();
|
||||
}
|
||||
},
|
||||
|
||||
extract(html: string): EspicResult[] {
|
||||
extract(this: EspicCrawlerType, html: string): EspicResult[] {
|
||||
const results: EspicResult[] = [];
|
||||
/**
|
||||
* Regex groups for ebid.espic.com.cn:
|
||||
@@ -225,24 +249,25 @@ export const EspicCrawler = {
|
||||
* </a>
|
||||
* </li>
|
||||
*/
|
||||
const regex = /<li>[\s\S]*?<a[^>]*href="([^"]*)"[^>]*title="([^"]*)"[^>]*>[\s\S]*?<div class="newsDate">[\s\S]*?<div>\s*(\d{4}-\d{2}-\d{2})\s*<\/div>[\s\S]*?<\/div>[\s\S]*?<\/a>[\s\S]*?<\/li>/gs;
|
||||
const regex =
|
||||
/<li>[\s\S]*?<a[^>]*href="([^"]*)"[^>]*title="([^"]*)"[^>]*>[\s\S]*?<div class="newsDate">[\s\S]*?<div>\s*(\d{4}-\d{2}-\d{2})\s*<\/div>[\s\S]*?<\/div>[\s\S]*?<\/a>[\s\S]*?<\/li>/gs;
|
||||
|
||||
let match;
|
||||
let match: RegExpExecArray | null;
|
||||
while ((match = regex.exec(html)) !== null) {
|
||||
const url = match[1]?.trim();
|
||||
const title = match[2]?.trim();
|
||||
const dateStr = match[3]?.trim();
|
||||
const url = match[1]?.trim() ?? '';
|
||||
const title = match[2]?.trim() ?? '';
|
||||
const dateStr = match[3]?.trim() ?? '';
|
||||
|
||||
if (title && url) {
|
||||
const fullUrl = url.startsWith('http') ? url : this.baseUrl + url;
|
||||
results.push({
|
||||
title,
|
||||
publishDate: dateStr ? new Date(dateStr) : new Date(),
|
||||
url: fullUrl.replace(/\/\//g, '/')
|
||||
url: fullUrl.replace(/\/\//g, '/'),
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
return results;
|
||||
}
|
||||
},
|
||||
};
|
||||
|
||||
@@ -2,7 +2,7 @@ import { PowerbeijingCrawler } from './powerbeijing_target';
import * as puppeteer from 'puppeteer';

// Increase timeout to 60 seconds for network operations
jest.setTimeout(60000*5);
jest.setTimeout(60000 * 5);

// 获取代理配置
const getProxyArgs = (): string[] => {
@@ -29,7 +29,7 @@ describe('PowerbeijingCrawler Real Site Test', () => {
if (proxyArgs.length > 0) {
console.log('Using proxy:', proxyArgs.join(' '));
}

browser = await puppeteer.launch({
headless: false, // Change to false to see browser UI
args: ['--no-sandbox', '--disable-setuid-sandbox', ...proxyArgs],
@@ -45,13 +45,15 @@ describe('PowerbeijingCrawler Real Site Test', () => {
it('should visit website and list all found bid information', async () => {
console.log(`\nStarting crawl for: ${PowerbeijingCrawler.name}`);
console.log(`Target URL: ${PowerbeijingCrawler.url}`);

const results = await PowerbeijingCrawler.crawl(browser);

console.log(`\nSuccessfully found ${results.length} items:\n`);
console.log('----------------------------------------');
results.forEach((item, index) => {
console.log(`${index + 1}. [${item.publishDate.toLocaleDateString()}] ${item.title}`);
console.log(
`${index + 1}. [${item.publishDate.toLocaleDateString()}] ${item.title}`,
);
console.log(` Link: ${item.url}`);
console.log('----------------------------------------');
});
@@ -61,13 +63,15 @@ describe('PowerbeijingCrawler Real Site Test', () => {
expect(Array.isArray(results)).toBeTruthy();
// Warn but don't fail if site returns 0 items (could be empty or changed structure)
if (results.length === 0) {
console.warn('Warning: No items found. Check if website structure has changed or if list is currently empty.');
console.warn(
'Warning: No items found. Check if website structure has changed or if list is currently empty.',
);
} else {
// Check data integrity of first item
const firstItem = results[0];
expect(firstItem.title).toBeTruthy();
expect(firstItem.url).toMatch(/^https?:\/\//);
expect(firstItem.publishDate).toBeInstanceOf(Date);
// Check data integrity of first item
const firstItem = results[0];
expect(firstItem.title).toBeTruthy();
expect(firstItem.url).toMatch(/^https?:\/\//);
expect(firstItem.publishDate).toBeInstanceOf(Date);
}
});
});
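
The first-item integrity assertions are duplicated verbatim across the specs (and appear twice inside the same else branch after this change). A hedged sketch of a shared test helper the specs could call instead; the `expectValidItem` name and its location are assumptions, not part of this commit:

// Hypothetical shared test helper (name and location are assumptions):
// asserts the same three invariants the specs currently repeat inline.
interface BidItem {
  title: string;
  publishDate: Date;
  url: string;
}

export function expectValidItem(item: BidItem): void {
  expect(item.title).toBeTruthy();
  expect(item.url).toMatch(/^https?:\/\//);
  expect(item.publishDate).toBeInstanceOf(Date);
}

// Usage in a spec:
// if (results.length > 0) {
//   expectValidItem(results[0]);
// }
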
@@ -11,13 +11,13 @@ async function simulateHumanMouseMovement(page: puppeteer.Page) {
for (let i = 0; i < movements; i++) {
const x = Math.floor(Math.random() * viewport.width);
const y = Math.floor(Math.random() * viewport.height);

await page.mouse.move(x, y, {
steps: 10 + Math.floor(Math.random() * 20) // 10-30步,使移动更平滑
steps: 10 + Math.floor(Math.random() * 20), // 10-30步,使移动更平滑
});

// 随机停顿 100-500ms
await new Promise(r => setTimeout(r, 100 + Math.random() * 400));
await new Promise((r) => setTimeout(r, 100 + Math.random() * 400));
}
}

@@ -27,23 +27,23 @@ async function simulateHumanScrolling(page: puppeteer.Page) {

for (let i = 0; i < scrollCount; i++) {
const scrollDistance = 100 + Math.floor(Math.random() * 400); // 100-500px

await page.evaluate((distance) => {
window.scrollBy({
top: distance,
behavior: 'smooth'
behavior: 'smooth',
});
}, scrollDistance);

// 随机停顿 500-1500ms
await new Promise(r => setTimeout(r, 500 + Math.random() * 1000));
await new Promise((r) => setTimeout(r, 500 + Math.random() * 1000));
}

// 滚动回顶部
await page.evaluate(() => {
window.scrollTo({ top: 0, behavior: 'smooth' });
});
await new Promise(r => setTimeout(r, 1000));
await new Promise((r) => setTimeout(r, 1000));
}

export interface PowerbeijingResult {
@@ -52,12 +52,22 @@ export interface PowerbeijingResult {
url: string;
}

interface PowerbeijingCrawlerType {
name: string;
url: string;
baseUrl: string;
extract(html: string): PowerbeijingResult[];
}

export const PowerbeijingCrawler = {
name: '北京京能电子商务平台',
url: 'https://www.powerbeijing-ec.com/jncms/search/bulletin.html?dates=300&categoryId=2&tabName=%E6%8B%9B%E6%A0%87%E5%85%AC%E5%91%8A&page=1',
baseUrl: 'https://www.powerbeijing-ec.com/',

async crawl(browser: puppeteer.Browser): Promise<PowerbeijingResult[]> {
async crawl(
this: PowerbeijingCrawlerType,
browser: puppeteer.Browser,
): Promise<PowerbeijingResult[]> {
const logger = new Logger('PowerbeijingCrawler');
const page = await browser.newPage();

@@ -69,11 +79,15 @@ export const PowerbeijingCrawler = {

await page.evaluateOnNewDocument(() => {
Object.defineProperty(navigator, 'webdriver', { get: () => false });
Object.defineProperty(navigator, 'language', { get: () => "zh-CN"});
Object.defineProperty(navigator, 'plugins', { get: () => [1,2,3,4,5]});
Object.defineProperty(navigator, 'language', { get: () => 'zh-CN' });
Object.defineProperty(navigator, 'plugins', {
get: () => [1, 2, 3, 4, 5],
});
});

await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36');
await page.setUserAgent(
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36',
);
await page.setViewport({ width: 1920, height: 1080 });

const allResults: PowerbeijingResult[] = [];
@@ -87,7 +101,7 @@ export const PowerbeijingCrawler = {
// 模拟人类行为
logger.log('Simulating human mouse movements...');
await simulateHumanMouseMovement(page);

logger.log('Simulating human scrolling...');
await simulateHumanScrolling(page);

@@ -103,12 +117,14 @@ export const PowerbeijingCrawler = {
}

allResults.push(...pageResults);
logger.log(`Extracted ${pageResults.length} items from page ${currentPage}`);
logger.log(
`Extracted ${pageResults.length} items from page ${currentPage}`,
);

// 模拟人类行为 - 翻页前
logger.log('Simulating human mouse movements before pagination...');
await simulateHumanMouseMovement(page);

logger.log('Simulating human scrolling before pagination...');
await simulateHumanScrolling(page);

@@ -127,9 +143,13 @@ export const PowerbeijingCrawler = {
try {
// 点击下一页按钮,等待页面更新
await nextButton.click();
await new Promise(r => setTimeout(r, 3000)); // 等待页面加载
await new Promise((r) => setTimeout(r, 3000)); // 等待页面加载
} catch (navError) {
logger.error(`Navigation to page ${currentPage + 1} failed: ${navError.message}`);
const navErrorMessage =
navError instanceof Error ? navError.message : String(navError);
logger.error(
`Navigation to page ${currentPage + 1} failed: ${navErrorMessage}`,
);
break;
}

@@ -138,26 +158,27 @@ export const PowerbeijingCrawler = {
// 模拟人类行为 - 翻页后
logger.log('Simulating human mouse movements after pagination...');
await simulateHumanMouseMovement(page);

logger.log('Simulating human scrolling after pagination...');
await simulateHumanScrolling(page);

// Random delay between pages
const delay = Math.floor(Math.random() * (3000 - 1000 + 1)) + 1000;
await new Promise(resolve => setTimeout(resolve, delay));
await new Promise((resolve) => setTimeout(resolve, delay));
}

return allResults;

} catch (error) {
logger.error(`Failed to crawl ${this.name}: ${error.message}`);
const errorMessage =
error instanceof Error ? error.message : String(error);
logger.error(`Failed to crawl ${this.name}: ${errorMessage}`);
return allResults;
} finally {
await page.close();
}
},

extract(html: string): PowerbeijingResult[] {
extract(this: PowerbeijingCrawlerType, html: string): PowerbeijingResult[] {
const results: PowerbeijingResult[] = [];
/**
* Regex groups for powerbeijing-ec.com:
@@ -176,24 +197,25 @@ export const PowerbeijingCrawler = {
* </a>
* </li>
*/
const regex = /<li>[\s\S]*?<a[^>]*href="([^"]*)"[^>]*title="([^"]*)"[^>]*>[\s\S]*?<div class="newsDate">[\s\S]*?<div>\s*(\d{4}-\d{2}-\d{2})\s*<\/div>[\s\S]*?<\/div>[\s\S]*?<\/a>[\s\S]*?<\/li>/gs;
const regex =
/<li>[\s\S]*?<a[^>]*href="([^"]*)"[^>]*title="([^"]*)"[^>]*>[\s\S]*?<div class="newsDate">[\s\S]*?<div>\s*(\d{4}-\d{2}-\d{2})\s*<\/div>[\s\S]*?<\/div>[\s\S]*?<\/a>[\s\S]*?<\/li>/gs;

let match;
let match: RegExpExecArray | null;
while ((match = regex.exec(html)) !== null) {
const url = match[1]?.trim();
const title = match[2]?.trim();
const dateStr = match[3]?.trim();
const url = match[1]?.trim() ?? '';
const title = match[2]?.trim() ?? '';
const dateStr = match[3]?.trim() ?? '';

if (title && url) {
const fullUrl = url.startsWith('http') ? url : this.baseUrl + url;
results.push({
title,
publishDate: dateStr ? new Date(dateStr) : new Date(),
url: fullUrl.replace(/\/\//g, '/')
url: fullUrl.replace(/\/\//g, '/'),
});
}
}

return results;
}
},
};
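
The added `this: PowerbeijingCrawlerType` parameter is TypeScript's `this`-parameter syntax: it is erased during compilation and only tells the checker what `this.baseUrl` and `this.name` mean inside methods defined on an object literal. A minimal standalone sketch of the pattern, with illustrative names only:

// Illustrative only: a typed `this` parameter lets methods on an object
// literal use this.baseUrl without `any`; no runtime argument is added.
interface DemoCrawlerType {
  name: string;
  baseUrl: string;
}

const DemoCrawler = {
  name: 'demo',
  baseUrl: 'https://example.com/',

  buildUrl(this: DemoCrawlerType, path: string): string {
    // `this` is checked against DemoCrawlerType at compile time.
    return path.startsWith('http') ? path : this.baseUrl + path;
  },
};

// DemoCrawler.buildUrl('/detail/1') type-checks as a method call;
// a detached call (const f = DemoCrawler.buildUrl; f('/x')) is rejected
// because the `this` context would not match DemoCrawlerType.
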
@@ -2,7 +2,7 @@ import { SdiccCrawler } from './sdicc_target';
import * as puppeteer from 'puppeteer';

// Increase timeout to 60 seconds for network operations
jest.setTimeout(60000*5);
jest.setTimeout(60000 * 5);

// 获取代理配置
const getProxyArgs = (): string[] => {
@@ -29,7 +29,7 @@ describe('SdiccCrawler Real Site Test', () => {
if (proxyArgs.length > 0) {
console.log('Using proxy:', proxyArgs.join(' '));
}

browser = await puppeteer.launch({
headless: false, // Change to false to see browser UI
args: ['--no-sandbox', '--disable-setuid-sandbox', ...proxyArgs],
@@ -45,13 +45,15 @@ describe('SdiccCrawler Real Site Test', () => {
it('should visit website and list all found bid information', async () => {
console.log(`\nStarting crawl for: ${SdiccCrawler.name}`);
console.log(`Target URL: ${SdiccCrawler.url}`);

const results = await SdiccCrawler.crawl(browser);

console.log(`\nSuccessfully found ${results.length} items:\n`);
console.log('----------------------------------------');
results.forEach((item, index) => {
console.log(`${index + 1}. [${item.publishDate.toLocaleDateString()}] ${item.title}`);
console.log(
`${index + 1}. [${item.publishDate.toLocaleDateString()}] ${item.title}`,
);
console.log(` Link: ${item.url}`);
console.log('----------------------------------------');
});
@@ -61,13 +63,15 @@ describe('SdiccCrawler Real Site Test', () => {
expect(Array.isArray(results)).toBeTruthy();
// Warn but don't fail if site returns 0 items (could be empty or changed structure)
if (results.length === 0) {
console.warn('Warning: No items found. Check if website structure has changed or if list is currently empty.');
console.warn(
'Warning: No items found. Check if website structure has changed or if list is currently empty.',
);
} else {
// Check data integrity of first item
const firstItem = results[0];
expect(firstItem.title).toBeTruthy();
expect(firstItem.url).toMatch(/^https?:\/\//);
expect(firstItem.publishDate).toBeInstanceOf(Date);
// Check data integrity of first item
const firstItem = results[0];
expect(firstItem.title).toBeTruthy();
expect(firstItem.url).toMatch(/^https?:\/\//);
expect(firstItem.publishDate).toBeInstanceOf(Date);
}
});
});
@@ -11,13 +11,13 @@ async function simulateHumanMouseMovement(page: puppeteer.Page) {
for (let i = 0; i < movements; i++) {
const x = Math.floor(Math.random() * viewport.width);
const y = Math.floor(Math.random() * viewport.height);

await page.mouse.move(x, y, {
steps: 10 + Math.floor(Math.random() * 20) // 10-30步,使移动更平滑
steps: 10 + Math.floor(Math.random() * 20), // 10-30步,使移动更平滑
});

// 随机停顿 100-500ms
await new Promise(r => setTimeout(r, 100 + Math.random() * 400));
await new Promise((r) => setTimeout(r, 100 + Math.random() * 400));
}
}

@@ -27,23 +27,23 @@ async function simulateHumanScrolling(page: puppeteer.Page) {

for (let i = 0; i < scrollCount; i++) {
const scrollDistance = 100 + Math.floor(Math.random() * 400); // 100-500px

await page.evaluate((distance) => {
window.scrollBy({
top: distance,
behavior: 'smooth'
behavior: 'smooth',
});
}, scrollDistance);

// 随机停顿 500-1500ms
await new Promise(r => setTimeout(r, 500 + Math.random() * 1000));
await new Promise((r) => setTimeout(r, 500 + Math.random() * 1000));
}

// 滚动回顶部
await page.evaluate(() => {
window.scrollTo({ top: 0, behavior: 'smooth' });
});
await new Promise(r => setTimeout(r, 1000));
await new Promise((r) => setTimeout(r, 1000));
}

export interface SdiccResult {
@@ -52,12 +52,22 @@ export interface SdiccResult {
url: string;
}

interface SdiccCrawlerType {
name: string;
url: string;
baseUrl: string;
extract(html: string): SdiccResult[];
}

export const SdiccCrawler = {
name: '国投集团电子采购平台',
url: 'https://www.sdicc.com.cn/cgxx/ggList',
baseUrl: 'https://www.sdicc.com.cn/',

async crawl(browser: puppeteer.Browser): Promise<SdiccResult[]> {
async crawl(
this: SdiccCrawlerType,
browser: puppeteer.Browser,
): Promise<SdiccResult[]> {
const logger = new Logger('SdiccCrawler');
const page = await browser.newPage();

@@ -69,11 +79,15 @@ export const SdiccCrawler = {

await page.evaluateOnNewDocument(() => {
Object.defineProperty(navigator, 'webdriver', { get: () => false });
Object.defineProperty(navigator, 'language', { get: () => "zh-CN"});
Object.defineProperty(navigator, 'plugins', { get: () => [1,2,3,4,5]});
Object.defineProperty(navigator, 'language', { get: () => 'zh-CN' });
Object.defineProperty(navigator, 'plugins', {
get: () => [1, 2, 3, 4, 5],
});
});

await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36');
await page.setUserAgent(
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36',
);
await page.setViewport({ width: 1920, height: 1080 });

const allResults: SdiccResult[] = [];
@@ -87,15 +101,17 @@ export const SdiccCrawler = {
// 模拟人类行为
logger.log('Simulating human mouse movements...');
await simulateHumanMouseMovement(page);

logger.log('Simulating human scrolling...');
await simulateHumanScrolling(page);

// 等待表格加载
logger.log('Waiting for table to load...');
await page.waitForSelector('.tbody table tbody tr', { timeout: 30000 }).catch(() => {
logger.warn('Table rows not found, trying alternative selectors...');
});
await page
.waitForSelector('.tbody table tbody tr', { timeout: 30000 })
.catch(() => {
logger.warn('Table rows not found, trying alternative selectors...');
});

while (currentPage <= maxPages) {
logger.log(`Processing page ${currentPage}...`);
@@ -109,12 +125,14 @@ export const SdiccCrawler = {
}

allResults.push(...pageResults);
logger.log(`Extracted ${pageResults.length} items from page ${currentPage}`);
logger.log(
`Extracted ${pageResults.length} items from page ${currentPage}`,
);

// 模拟人类行为 - 翻页前
logger.log('Simulating human mouse movements before pagination...');
await simulateHumanMouseMovement(page);

logger.log('Simulating human scrolling before pagination...');
await simulateHumanScrolling(page);

@@ -132,10 +150,16 @@ export const SdiccCrawler = {
try {
// 点击下一页按钮
await nextButton.click();
await page.waitForNavigation({ waitUntil: 'networkidle2', timeout: 60000 }).catch(() => {});
await new Promise(r => setTimeout(r, 2000)); // 额外等待确保数据加载完成
await page
.waitForNavigation({ waitUntil: 'networkidle2', timeout: 60000 })
.catch(() => {});
await new Promise((r) => setTimeout(r, 2000)); // 额外等待确保数据加载完成
} catch (navError) {
logger.error(`Navigation to page ${currentPage + 1} failed: ${navError.message}`);
const navErrorMessage =
navError instanceof Error ? navError.message : String(navError);
logger.error(
`Navigation to page ${currentPage + 1} failed: ${navErrorMessage}`,
);
break;
}

@@ -144,26 +168,27 @@ export const SdiccCrawler = {
// 模拟人类行为 - 翻页后
logger.log('Simulating human mouse movements after pagination...');
await simulateHumanMouseMovement(page);

logger.log('Simulating human scrolling after pagination...');
await simulateHumanScrolling(page);

// Random delay between pages
const delay = Math.floor(Math.random() * (3000 - 1000 + 1)) + 1000;
await new Promise(resolve => setTimeout(resolve, delay));
await new Promise((resolve) => setTimeout(resolve, delay));
}

return allResults;

} catch (error) {
logger.error(`Failed to crawl ${this.name}: ${error.message}`);
const errorMessage =
error instanceof Error ? error.message : String(error);
logger.error(`Failed to crawl ${this.name}: ${errorMessage}`);
return allResults;
} finally {
await page.close();
}
},

extract(html: string): SdiccResult[] {
extract(this: SdiccCrawlerType, html: string): SdiccResult[] {
const results: SdiccResult[] = [];
/**
* Regex groups for sdicc.com.cn:
@@ -180,25 +205,26 @@ export const SdiccCrawler = {
* <td colspan="1" rowspan="1"><span> 2026-01-09 </span></td>
* </tr>
*/
const regex = /<tr[^>]*onclick="urlChange\('([^']+)','([^']+)'\)"[^>]*>[\s\S]*?<td[^>]*><span[^>]*>([^<]+)<\/span><\/td>[\s\S]*?<td[^>]*><span[^>]*>\s*(\d{4}-\d{2}-\d{2})\s*<\/span><\/td>[\s\S]*?<\/tr>/gs;
const regex =
/<tr[^>]*onclick="urlChange\('([^']+)','([^']+)'\)"[^>]*>[\s\S]*?<td[^>]*><span[^>]*>([^<]+)<\/span><\/td>[\s\S]*?<td[^>]*><span[^>]*>\s*(\d{4}-\d{2}-\d{2})\s*<\/span><\/td>[\s\S]*?<\/tr>/gs;

let match;
let match: RegExpExecArray | null;
while ((match = regex.exec(html)) !== null) {
const ggGuid = match[1]?.trim();
const gcGuid = match[2]?.trim();
const title = match[3]?.trim();
const dateStr = match[4]?.trim();
const ggGuid = match[1]?.trim() ?? '';
const gcGuid = match[2]?.trim() ?? '';
const title = match[3]?.trim() ?? '';
const dateStr = match[4]?.trim() ?? '';

if (title && ggGuid && gcGuid) {
const fullUrl = `${this.baseUrl}/cgxx/ggDetail?gcGuid=${gcGuid}&ggGuid=${ggGuid}`;
results.push({
title,
publishDate: dateStr ? new Date(dateStr) : new Date(),
url: fullUrl.replace(/\/\//g, '/')
url: fullUrl.replace(/\/\//g, '/'),
});
}
}

return results;
}
},
};
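
Note that `fullUrl.replace(/\/\//g, '/')`, kept unchanged in this commit, also matches the `//` inside `https://`, so a string like `https://www.sdicc.com.cn//cgxx/...` collapses to `https:/www.sdicc.com.cn/cgxx/...`. A hedged sketch of a normalization that only collapses duplicate slashes outside the scheme; the `normalizeUrl` helper is an assumption, not part of this commit:

// Hypothetical alternative (not part of this commit): collapse repeated
// slashes only when they do not follow ':', so the scheme stays intact.
// Uses an ES2018 lookbehind, which current Node releases support.
function normalizeUrl(raw: string): string {
  return raw.replace(/(?<!:)\/{2,}/g, '/');
}

// normalizeUrl('https://www.sdicc.com.cn//cgxx/ggDetail?gcGuid=1&ggGuid=2')
// => 'https://www.sdicc.com.cn/cgxx/ggDetail?gcGuid=1&ggGuid=2'
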
@@ -29,7 +29,7 @@ describe('SzecpCrawler Real Site Test', () => {
if (proxyArgs.length > 0) {
console.log('Using proxy:', proxyArgs.join(' '));
}

browser = await puppeteer.launch({
headless: false, // Run in non-headless mode
args: [
@@ -40,14 +40,14 @@ describe('SzecpCrawler Real Site Test', () => {
'--disable-infobars',
...proxyArgs,
],
defaultViewport: null
defaultViewport: null,
});
});

afterAll(async () => {
if (browser) {
// Keep open for a few seconds after test to see result
await new Promise(r => setTimeout(r, 50000));
await new Promise((r) => setTimeout(r, 50000));
await browser.close();
}
});
@@ -56,29 +56,33 @@ describe('SzecpCrawler Real Site Test', () => {
console.log(`
Starting crawl for: ${SzecpCrawler.name}`);
console.log(`Target URL: ${SzecpCrawler.url}`);

const results = await SzecpCrawler.crawl(browser);

console.log(`
Successfully found ${results.length} items:
`);
console.log('----------------------------------------');
results.forEach((item, index) => {
console.log(`${index + 1}. [${item.publishDate.toLocaleDateString()}] ${item.title}`);
console.log(
`${index + 1}. [${item.publishDate.toLocaleDateString()}] ${item.title}`,
);
console.log(` Link: ${item.url}`);
console.log('----------------------------------------');
});

expect(results).toBeDefined();
expect(Array.isArray(results)).toBeTruthy();

if (results.length === 0) {
console.warn('Warning: No items found. Observe browser window to see if content is loading or if there is a verification challenge.');
console.warn(
'Warning: No items found. Observe browser window to see if content is loading or if there is a verification challenge.',
);
} else {
const firstItem = results[0];
expect(firstItem.title).toBeTruthy();
expect(firstItem.url).toMatch(/^https?:\/\//);
expect(firstItem.publishDate).toBeInstanceOf(Date);
const firstItem = results[0];
expect(firstItem.title).toBeTruthy();
expect(firstItem.url).toMatch(/^https?:\/\//);
expect(firstItem.publishDate).toBeInstanceOf(Date);
}
});
});
@@ -12,13 +12,13 @@ async function simulateHumanMouseMovement(page: puppeteer.Page) {
for (let i = 0; i < movements; i++) {
const x = Math.floor(Math.random() * viewport.width);
const y = Math.floor(Math.random() * viewport.height);

await page.mouse.move(x, y, {
steps: 10 + Math.floor(Math.random() * 20) // 10-30步,使移动更平滑
steps: 10 + Math.floor(Math.random() * 20), // 10-30步,使移动更平滑
});

// 随机停顿 100-500ms
await new Promise(r => setTimeout(r, 100 + Math.random() * 400));
await new Promise((r) => setTimeout(r, 100 + Math.random() * 400));
}
}

@@ -28,23 +28,29 @@ async function simulateHumanScrolling(page: puppeteer.Page) {

for (let i = 0; i < scrollCount; i++) {
const scrollDistance = 100 + Math.floor(Math.random() * 400); // 100-500px

await page.evaluate((distance) => {
window.scrollBy({
top: distance,
behavior: 'smooth'
behavior: 'smooth',
});
}, scrollDistance);

// 随机停顿 500-1500ms
await new Promise(r => setTimeout(r, 500 + Math.random() * 1000));
await new Promise((r) => setTimeout(r, 500 + Math.random() * 1000));
}

// 滚动回顶部
await page.evaluate(() => {
window.scrollTo({ top: 0, behavior: 'smooth' });
});
await new Promise(r => setTimeout(r, 1000));
await new Promise((r) => setTimeout(r, 1000));
}

interface SzecpCrawlerType {
name: string;
url: string;
baseUrl: string;
}

export const SzecpCrawler = {
@@ -52,7 +58,10 @@ export const SzecpCrawler = {
url: 'https://www.szecp.com.cn/first_zbgg/index.html',
baseUrl: 'https://www.szecp.com.cn/',

async crawl(browser: puppeteer.Browser): Promise<ChdtpResult[]> {
async crawl(
this: SzecpCrawlerType,
browser: puppeteer.Browser,
): Promise<ChdtpResult[]> {
const logger = new Logger('SzecpCrawler');
const page = await browser.newPage();

@@ -65,10 +74,14 @@ export const SzecpCrawler = {
await page.evaluateOnNewDocument(() => {
Object.defineProperty(navigator, 'webdriver', { get: () => false });
Object.defineProperty(navigator, 'language', { get: () => 'zh-CN' });
Object.defineProperty(navigator, 'plugins', { get: () => [1, 2, 3, 4, 5] });
Object.defineProperty(navigator, 'plugins', {
get: () => [1, 2, 3, 4, 5],
});
});

await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36');
await page.setUserAgent(
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36',
);
await page.setViewport({ width: 1920, height: 1080 });

const allResults: ChdtpResult[] = [];
@@ -82,7 +95,7 @@ export const SzecpCrawler = {
// 模拟人类行为
logger.log('Simulating human mouse movements...');
await simulateHumanMouseMovement(page);

logger.log('Simulating human scrolling...');
await simulateHumanScrolling(page);

@@ -90,52 +103,69 @@ export const SzecpCrawler = {
logger.log('Clicking search button...');
await page.waitForSelector('.szb-zbcgSearch-key-v1', { timeout: 60000 });
await page.click('.szb-zbcgSearch-key-v1');
await new Promise(r => setTimeout(r, 3000)); // Wait for results to load
await new Promise((r) => setTimeout(r, 3000)); // Wait for results to load

while (currentPage <= maxPages) {
logger.log(`Processing page ${currentPage}...`);

// Wait for content to load
await page.waitForFunction(() => {
return document.querySelectorAll('.szb-zbcgTable-other').length > 0;
}, { timeout: 60000 }).catch(() => logger.warn('Content not found. Site might be slow.'));
await page
.waitForFunction(
() => {
return (
document.querySelectorAll('.szb-zbcgTable-other').length > 0
);
},
{ timeout: 60000 },
)
.catch(() => logger.warn('Content not found. Site might be slow.'));

const pageResults = await page.evaluate((baseUrl) => {
// Extract from table rows
const items = Array.from(document.querySelectorAll('.szb-zbcgTable-other'));
return items.map(item => {
const divs = item.querySelectorAll('div');
if (divs.length >= 5) {
const titleLink = divs[1].querySelector('a');
const title = titleLink?.textContent?.trim() || '';
const dateStr = divs[4].textContent?.trim() || '';
const href = titleLink?.getAttribute('href') || '';
const items = Array.from(
document.querySelectorAll('.szb-zbcgTable-other'),
);
return items
.map((item) => {
const divs = item.querySelectorAll('div');
if (divs.length >= 5) {
const titleLink = divs[1].querySelector('a');
const title = titleLink?.textContent?.trim() || '';
const dateStr = divs[4].textContent?.trim() || '';
const href = titleLink?.getAttribute('href') || '';

if (title.length < 5) return null; // Filter noise
if (title.length < 5) return null; // Filter noise

// Construct full URL if href is relative
const url = href.startsWith('http') ? href : `${baseUrl}${href}`;
// Construct full URL if href is relative
const url = href.startsWith('http')
? href
: `${baseUrl}${href}`;

return {
title,
dateStr,
url
};
}
return null;
}).filter(i => i !== null);
return {
title,
dateStr,
url,
};
}
return null;
})
.filter((i) => i !== null);
}, this.baseUrl);

if (pageResults.length === 0) {
logger.warn(`No results found on page ${currentPage}. Extraction failed.`);
logger.warn(
`No results found on page ${currentPage}. Extraction failed.`,
);
break;
}

allResults.push(...pageResults.map(r => ({
title: r!.title,
publishDate: new Date(r!.dateStr),
url: r!.url.replace(/\/\//g, '/')
})));
allResults.push(
...pageResults.map((r) => ({
title: r.title,
publishDate: new Date(r.dateStr),
url: r.url.replace(/\/\//g, '/'),
})),
);

logger.log(`Extracted ${pageResults.length} items.`);

@@ -144,27 +174,30 @@ export const SzecpCrawler = {
if (!nextButton) break;

await nextButton.click();
await new Promise(r => setTimeout(r, 3000));
await new Promise((r) => setTimeout(r, 3000));

// 模拟人类行为
logger.log('Simulating human mouse movements...');
await simulateHumanMouseMovement(page);

logger.log('Simulating human scrolling...');
await simulateHumanScrolling(page);

currentPage++;
}

return allResults;

} catch (error) {
logger.error(`Crawl failed: ${error.message}`);
const errorMessage =
error instanceof Error ? error.message : String(error);
logger.error(`Crawl failed: ${errorMessage}`);
return allResults;
} finally {
if (page) await page.close();
}
},

extract() { return []; }
extract() {
return [];
},
};
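
The rewritten `page.evaluate` block drops the `r!` non-null assertions because the nulls are filtered out first; whether `.filter((i) => i !== null)` alone narrows the element type depends on the compiler (TypeScript 5.5+ can infer a type predicate for it). A minimal sketch of the explicit type-guard form that narrows on older versions as well; the `RowData` name is illustrative only:

// Illustrative sketch: an explicit type predicate makes the null-filtering
// narrow the element type regardless of compiler version.
interface RowData {
  title: string;
  dateStr: string;
  url: string;
}

const rawRows: (RowData | null)[] = [
  { title: 'Example notice', dateStr: '2026-01-09', url: '/detail/1' },
  null, // rows with too few cells or short titles map to null
];

const rows: RowData[] = rawRows.filter(
  (row): row is RowData => row !== null,
);

// rows[0].title is usable without a non-null assertion.
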