chore: 更新.gitignore并添加新文件

在.gitignore中添加对*.png、*.log、*-lock.json、*.woff2文件的忽略规则,并新增OFL.txt文件。同时,添加vue.svg图标文件以支持前端展示。更新多个TypeScript文件以优化代码格式和增强可读性。
This commit is contained in:
dmy
2026-01-14 22:26:32 +08:00
parent 10565af001
commit 82f5a81887
47 changed files with 1513 additions and 814 deletions

View File

@@ -12,7 +12,7 @@ export class CrawlerController {
getStatus() {
return {
isCrawling: this.isCrawling,
crawlingSources: Array.from(this.crawlingSources)
crawlingSources: Array.from(this.crawlingSources),
};
}
@@ -21,9 +21,9 @@ export class CrawlerController {
if (this.isCrawling) {
return { message: 'Crawl is already running' };
}
this.isCrawling = true;
// We don't await this because we want it to run in the background
// and return immediately, or we can await if we want to user to wait.
// Given the requirement "Immediate Crawl", usually implies triggering it.
@@ -45,9 +45,9 @@ export class CrawlerController {
if (this.crawlingSources.has(sourceName)) {
return { message: `Source ${sourceName} is already being crawled` };
}
this.crawlingSources.add(sourceName);
try {
const result = await this.crawlerService.crawlSingleSource(sourceName);
return result;

View File

@@ -1,4 +1,9 @@
import { Entity, PrimaryGeneratedColumn, Column, CreateDateColumn } from 'typeorm';
import {
Entity,
PrimaryGeneratedColumn,
Column,
CreateDateColumn,
} from 'typeorm';
@Entity('crawl_info_add')
export class CrawlInfoAdd {

View File

@@ -18,6 +18,17 @@ import { PowerbeijingCrawler } from './powerbeijing_target';
import { SdiccCrawler } from './sdicc_target';
import { CnoocCrawler } from './cnooc_target';
/**
 * One bid/tender listing extracted from a target site.
 * Shared result shape returned by every crawler's `crawl()`.
 */
interface CrawlResult {
// Listing title as shown on the source site.
title: string;
// Publication date parsed from the listing row.
publishDate: Date;
// Absolute URL of the listing detail page.
url: string;
}
/**
 * Minimal contract each site-specific crawler module must satisfy
 * (used for the `zeroDataCrawlers` retry list instead of `any[]`).
 */
interface Crawler {
// Human-readable source name; also used as the lookup key for single-source crawls.
name: string;
// Runs the crawl against a shared Puppeteer browser and returns all extracted listings.
crawl(browser: puppeteer.Browser): Promise<CrawlResult[]>;
}
@Injectable()
export class BidCrawlerService {
private readonly logger = new Logger(BidCrawlerService.name);
@@ -31,17 +42,15 @@ export class BidCrawlerService {
async crawlAll() {
this.logger.log('Starting crawl task with Puppeteer...');
// 设置最大执行时间为3小时
const maxExecutionTime = 3 * 60 * 60 * 1000; // 3小时毫秒
const startTime = Date.now();
// 统计结果
const crawlResults: Record<string, { success: number; error?: string }> = {};
const crawlResults: Record<string, { success: number; error?: string }> =
{};
// 记录数据为0的爬虫用于重试
const zeroDataCrawlers: any[] = [];
const zeroDataCrawlers: Crawler[] = [];
// 从环境变量读取代理配置
const proxyHost = this.configService.get<string>('PROXY_HOST');
const proxyPort = this.configService.get<string>('PROXY_PORT');
@@ -60,9 +69,10 @@ export class BidCrawlerService {
];
if (proxyHost && proxyPort) {
const proxyUrl = proxyUsername && proxyPassword
? `http://${proxyUsername}:${proxyPassword}@${proxyHost}:${proxyPort}`
: `http://${proxyHost}:${proxyPort}`;
const proxyUrl =
proxyUsername && proxyPassword
? `http://${proxyUsername}:${proxyPassword}@${proxyHost}:${proxyPort}`
: `http://${proxyHost}:${proxyPort}`;
args.push(`--proxy-server=${proxyUrl}`);
this.logger.log(`Using proxy: ${proxyHost}:${proxyPort}`);
}
@@ -72,24 +82,43 @@ export class BidCrawlerService {
args,
});
const crawlers = [ChdtpCrawler, ChngCrawler, SzecpCrawler, CdtCrawler, EpsCrawler, CnncecpCrawler, CgnpcCrawler, CeicCrawler, EspicCrawler, PowerbeijingCrawler, SdiccCrawler, CnoocCrawler];
const crawlers = [
ChdtpCrawler,
ChngCrawler,
SzecpCrawler,
CdtCrawler,
EpsCrawler,
CnncecpCrawler,
CgnpcCrawler,
CeicCrawler,
EspicCrawler,
PowerbeijingCrawler,
SdiccCrawler,
CnoocCrawler,
];
try {
for (const crawler of crawlers) {
this.logger.log(`Crawling: ${crawler.name}`);
// 检查是否超时
const elapsedTime = Date.now() - startTime;
if (elapsedTime > maxExecutionTime) {
this.logger.warn(`⚠️ Crawl task exceeded maximum execution time of 3 hours. Stopping...`);
this.logger.warn(`⚠️ Total elapsed time: ${Math.floor(elapsedTime / 1000 / 60)} minutes`);
this.logger.warn(
`⚠️ Crawl task exceeded maximum execution time of 3 hours. Stopping...`,
);
this.logger.warn(
`⚠️ Total elapsed time: ${Math.floor(elapsedTime / 1000 / 60)} minutes`,
);
break;
}
try {
const results = await crawler.crawl(browser);
this.logger.log(`Extracted ${results.length} items from ${crawler.name}`);
this.logger.log(
`Extracted ${results.length} items from ${crawler.name}`,
);
// 记录成功数量
crawlResults[crawler.name] = { success: results.length };
@@ -99,12 +128,13 @@ export class BidCrawlerService {
}
// 获取最新的发布日期
const latestPublishDate = results.length > 0
? results.reduce((latest, item) => {
const itemDate = new Date(item.publishDate);
return itemDate > latest ? itemDate : latest;
}, new Date(0))
: null;
const latestPublishDate =
results.length > 0
? results.reduce((latest, item) => {
const itemDate = new Date(item.publishDate);
return itemDate > latest ? itemDate : latest;
}, new Date(0))
: null;
for (const item of results) {
await this.bidsService.createOrUpdate({
@@ -116,46 +146,60 @@ export class BidCrawlerService {
}
// 保存爬虫统计信息到数据库
await this.saveCrawlInfo(crawler.name, results.length, latestPublishDate);
await this.saveCrawlInfo(
crawler.name,
results.length,
latestPublishDate,
);
} catch (err) {
this.logger.error(`Error crawling ${crawler.name}: ${err.message}`);
const errorMessage = err instanceof Error ? err.message : String(err);
this.logger.error(`Error crawling ${crawler.name}: ${errorMessage}`);
// 记录错误信息
crawlResults[crawler.name] = { success: 0, error: err.message };
crawlResults[crawler.name] = { success: 0, error: errorMessage };
// 保存错误信息到数据库
await this.saveCrawlInfo(crawler.name, 0, null, err.message);
await this.saveCrawlInfo(crawler.name, 0, null, errorMessage);
}
}
// 对数据为0的爬虫进行重试
if (zeroDataCrawlers.length > 0) {
this.logger.log(`Retrying ${zeroDataCrawlers.length} crawlers with zero data...`);
this.logger.log(
`Retrying ${zeroDataCrawlers.length} crawlers with zero data...`,
);
for (const crawler of zeroDataCrawlers) {
this.logger.log(`Retrying: ${crawler.name}`);
// 检查是否超时
const elapsedTime = Date.now() - startTime;
if (elapsedTime > maxExecutionTime) {
this.logger.warn(`⚠️ Crawl task exceeded maximum execution time of 3 hours. Stopping retry...`);
this.logger.warn(`⚠️ Total elapsed time: ${Math.floor(elapsedTime / 1000 / 60)} minutes`);
this.logger.warn(
`⚠️ Crawl task exceeded maximum execution time of 3 hours. Stopping retry...`,
);
this.logger.warn(
`⚠️ Total elapsed time: ${Math.floor(elapsedTime / 1000 / 60)} minutes`,
);
break;
}
try {
const results = await crawler.crawl(browser);
this.logger.log(`Retry extracted ${results.length} items from ${crawler.name}`);
this.logger.log(
`Retry extracted ${results.length} items from ${crawler.name}`,
);
// 更新统计结果
crawlResults[crawler.name] = { success: results.length };
// 获取最新的发布日期
const latestPublishDate = results.length > 0
? results.reduce((latest, item) => {
const itemDate = new Date(item.publishDate);
return itemDate > latest ? itemDate : latest;
}, new Date(0))
: null;
const latestPublishDate =
results.length > 0
? results.reduce((latest, item) => {
const itemDate = new Date(item.publishDate);
return itemDate > latest ? itemDate : latest;
}, new Date(0))
: null;
for (const item of results) {
await this.bidsService.createOrUpdate({
@@ -167,58 +211,76 @@ export class BidCrawlerService {
}
// 更新爬虫统计信息到数据库
await this.saveCrawlInfo(crawler.name, results.length, latestPublishDate);
await this.saveCrawlInfo(
crawler.name,
results.length,
latestPublishDate,
);
} catch (err) {
this.logger.error(`Error retrying ${crawler.name}: ${err.message}`);
const errorMessage =
err instanceof Error ? err.message : String(err);
this.logger.error(
`Error retrying ${crawler.name}: ${errorMessage}`,
);
// 记录错误信息
crawlResults[crawler.name] = { success: 0, error: err.message };
crawlResults[crawler.name] = { success: 0, error: errorMessage };
// 更新错误信息到数据库
await this.saveCrawlInfo(crawler.name, 0, null, err.message);
await this.saveCrawlInfo(crawler.name, 0, null, errorMessage);
}
}
}
} catch (error) {
this.logger.error(`Crawl task failed: ${error.message}`);
const errorMessage =
error instanceof Error ? error.message : String(error);
this.logger.error(`Crawl task failed: ${errorMessage}`);
} finally {
await browser.close();
const totalTime = Date.now() - startTime;
const minutes = Math.floor(totalTime / 1000 / 60);
this.logger.log(`Crawl task finished. Total time: ${minutes} minutes`);
if (totalTime > maxExecutionTime) {
this.logger.warn(`⚠️ Crawl task exceeded maximum execution time of 3 hours.`);
this.logger.warn(
`⚠️ Crawl task exceeded maximum execution time of 3 hours.`,
);
}
// 输出统计总结
this.logger.log('='.repeat(50));
this.logger.log('爬虫执行总结 / Crawl Summary');
this.logger.log('='.repeat(50));
let totalSuccess = 0;
let errorCount = 0;
for (const [source, result] of Object.entries(crawlResults)) {
if (result.error) {
this.logger.error(`${source}: 出错 - ${result.error}`);
errorCount++;
} else {
this.logger.log(`${source}: 成功获取 ${result.success} 条工程信息`);
this.logger.log(
`${source}: 成功获取 ${result.success} 条工程信息`,
);
totalSuccess += result.success;
}
}
this.logger.log('='.repeat(50));
this.logger.log(`总计: ${totalSuccess} 条工程信息, ${errorCount} 个来源出错`);
this.logger.log(`Total: ${totalSuccess} items, ${errorCount} sources failed`);
this.logger.log(
`总计: ${totalSuccess} 条工程信息, ${errorCount} 个来源出错`,
);
this.logger.log(
`Total: ${totalSuccess} items, ${errorCount} sources failed`,
);
this.logger.log('='.repeat(50));
}
}
async crawlSingleSource(sourceName: string) {
this.logger.log(`Starting single source crawl for: ${sourceName}`);
// 从环境变量读取代理配置
const proxyHost = this.configService.get<string>('PROXY_HOST');
const proxyPort = this.configService.get<string>('PROXY_PORT');
@@ -237,9 +299,10 @@ export class BidCrawlerService {
];
if (proxyHost && proxyPort) {
const proxyUrl = proxyUsername && proxyPassword
? `http://${proxyUsername}:${proxyPassword}@${proxyHost}:${proxyPort}`
: `http://${proxyHost}:${proxyPort}`;
const proxyUrl =
proxyUsername && proxyPassword
? `http://${proxyUsername}:${proxyPassword}@${proxyHost}:${proxyPort}`
: `http://${proxyHost}:${proxyPort}`;
args.push(`--proxy-server=${proxyUrl}`);
this.logger.log(`Using proxy: ${proxyHost}:${proxyPort}`);
}
@@ -249,10 +312,23 @@ export class BidCrawlerService {
args,
});
const crawlers = [ChdtpCrawler, ChngCrawler, SzecpCrawler, CdtCrawler, EpsCrawler, CnncecpCrawler, CgnpcCrawler, CeicCrawler, EspicCrawler, PowerbeijingCrawler, SdiccCrawler, CnoocCrawler];
const targetCrawler = crawlers.find(c => c.name === sourceName);
const crawlers = [
ChdtpCrawler,
ChngCrawler,
SzecpCrawler,
CdtCrawler,
EpsCrawler,
CnncecpCrawler,
CgnpcCrawler,
CeicCrawler,
EspicCrawler,
PowerbeijingCrawler,
SdiccCrawler,
CnoocCrawler,
];
const targetCrawler = crawlers.find((c) => c.name === sourceName);
if (!targetCrawler) {
await browser.close();
throw new Error(`Crawler not found for source: ${sourceName}`);
@@ -260,17 +336,20 @@ export class BidCrawlerService {
try {
this.logger.log(`Crawling: ${targetCrawler.name}`);
const results = await targetCrawler.crawl(browser);
this.logger.log(`Extracted ${results.length} items from ${targetCrawler.name}`);
this.logger.log(
`Extracted ${results.length} items from ${targetCrawler.name}`,
);
// 获取最新的发布日期
const latestPublishDate = results.length > 0
? results.reduce((latest, item) => {
const itemDate = new Date(item.publishDate);
return itemDate > latest ? itemDate : latest;
}, new Date(0))
: null;
const latestPublishDate =
results.length > 0
? results.reduce((latest, item) => {
const itemDate = new Date(item.publishDate);
return itemDate > latest ? itemDate : latest;
}, new Date(0))
: null;
for (const item of results) {
await this.bidsService.createOrUpdate({
@@ -282,7 +361,11 @@ export class BidCrawlerService {
}
// 保存爬虫统计信息到数据库
await this.saveCrawlInfo(targetCrawler.name, results.length, latestPublishDate);
await this.saveCrawlInfo(
targetCrawler.name,
results.length,
latestPublishDate,
);
return {
success: true,
@@ -291,16 +374,19 @@ export class BidCrawlerService {
latestPublishDate,
};
} catch (err) {
this.logger.error(`Error crawling ${targetCrawler.name}: ${err.message}`);
const errorMessage = err instanceof Error ? err.message : String(err);
this.logger.error(
`Error crawling ${targetCrawler.name}: ${errorMessage}`,
);
// 保存错误信息到数据库
await this.saveCrawlInfo(targetCrawler.name, 0, null, err.message);
await this.saveCrawlInfo(targetCrawler.name, 0, null, errorMessage);
return {
success: false,
source: targetCrawler.name,
count: 0,
error: err.message,
error: errorMessage,
};
} finally {
await browser.close();
@@ -324,7 +410,10 @@ export class BidCrawlerService {
await this.crawlInfoRepository.save(crawlInfo);
this.logger.log(`Saved crawl info for ${source}: ${count} items`);
} catch (err) {
this.logger.error(`Failed to save crawl info for ${source}: ${err.message}`);
const errorMessage = err instanceof Error ? err.message : String(err);
this.logger.error(
`Failed to save crawl info for ${source}: ${errorMessage}`,
);
}
}
}

View File

@@ -2,7 +2,7 @@ import { CdtCrawler } from './cdt_target';
import * as puppeteer from 'puppeteer';
// Increase timeout to 60 seconds for network operations
jest.setTimeout(60000*5);
jest.setTimeout(60000 * 5);
// 获取代理配置
const getProxyArgs = (): string[] => {
@@ -29,7 +29,7 @@ describe('CdtCrawler Real Site Test', () => {
if (proxyArgs.length > 0) {
console.log('Using proxy:', proxyArgs.join(' '));
}
browser = await puppeteer.launch({
headless: false, // Change to false to see browser UI
args: ['--no-sandbox', '--disable-setuid-sandbox', ...proxyArgs],
@@ -45,13 +45,15 @@ describe('CdtCrawler Real Site Test', () => {
it('should visit website and list all found bid information', async () => {
console.log(`\nStarting crawl for: ${CdtCrawler.name}`);
console.log(`Target URL: ${CdtCrawler.url}`);
const results = await CdtCrawler.crawl(browser);
console.log(`\nSuccessfully found ${results.length} items:\n`);
console.log('----------------------------------------');
results.forEach((item, index) => {
console.log(`${index + 1}. [${item.publishDate.toLocaleDateString()}] ${item.title}`);
console.log(
`${index + 1}. [${item.publishDate.toLocaleDateString()}] ${item.title}`,
);
console.log(` Link: ${item.url}`);
console.log('----------------------------------------');
});
@@ -61,13 +63,15 @@ describe('CdtCrawler Real Site Test', () => {
expect(Array.isArray(results)).toBeTruthy();
// Warn but don't fail if site returns 0 items (could be empty or changed structure)
if (results.length === 0) {
console.warn('Warning: No items found. Check if website structure has changed or if list is currently empty.');
console.warn(
'Warning: No items found. Check if website structure has changed or if list is currently empty.',
);
} else {
// Check data integrity of first item
const firstItem = results[0];
expect(firstItem.title).toBeTruthy();
expect(firstItem.url).toMatch(/^https?:\/\//);
expect(firstItem.publishDate).toBeInstanceOf(Date);
// Check data integrity of first item
const firstItem = results[0];
expect(firstItem.title).toBeTruthy();
expect(firstItem.url).toMatch(/^https?:\/\//);
expect(firstItem.publishDate).toBeInstanceOf(Date);
}
});
});

View File

@@ -13,11 +13,11 @@ async function simulateHumanMouseMovement(page: puppeteer.Page) {
const y = Math.floor(Math.random() * viewport.height);
await page.mouse.move(x, y, {
steps: 10 + Math.floor(Math.random() * 20) // 10-30步使移动更平滑
steps: 10 + Math.floor(Math.random() * 20), // 10-30步使移动更平滑
});
// 随机停顿 100-500ms
await new Promise(r => setTimeout(r, 100 + Math.random() * 400));
await new Promise((r) => setTimeout(r, 100 + Math.random() * 400));
}
}
@@ -31,19 +31,19 @@ async function simulateHumanScrolling(page: puppeteer.Page) {
await page.evaluate((distance) => {
window.scrollBy({
top: distance,
behavior: 'smooth'
behavior: 'smooth',
});
}, scrollDistance);
// 随机停顿 500-1500ms
await new Promise(r => setTimeout(r, 500 + Math.random() * 1000));
await new Promise((r) => setTimeout(r, 500 + Math.random() * 1000));
}
// 滚动回顶部
await page.evaluate(() => {
window.scrollTo({ top: 0, behavior: 'smooth' });
});
await new Promise(r => setTimeout(r, 1000));
await new Promise((r) => setTimeout(r, 1000));
}
export interface CdtResult {
@@ -52,12 +52,22 @@ export interface CdtResult {
url: string;
}
/**
 * Shape of the CdtCrawler object literal; used as the explicit `this`
 * type so methods can safely reference `this.name` / `this.baseUrl`.
 */
interface CdtCrawlerType {
// Display name of the source (中国大唐集团电子商务平台).
name: string;
// Entry page the crawler navigates to first.
url: string;
// Prefix used to absolutize relative listing links in `extract()`.
baseUrl: string;
// Parses listing rows out of a raw HTML table string via regex.
extract(html: string): CdtResult[];
}
export const CdtCrawler = {
name: '中国大唐集团电子商务平台',
url: 'https://tang.cdt-ec.com/home/index.html',
baseUrl: 'https://tang.cdt-ec.com',
async crawl(browser: puppeteer.Browser): Promise<CdtResult[]> {
async crawl(
this: CdtCrawlerType,
browser: puppeteer.Browser,
): Promise<CdtResult[]> {
const logger = new Logger('CdtCrawler');
const page = await browser.newPage();
@@ -67,7 +77,9 @@ export const CdtCrawler = {
await page.authenticate({ username, password });
}
await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36');
await page.setUserAgent(
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36',
);
const allResults: CdtResult[] = [];
let currentPage = 1;
@@ -86,19 +98,26 @@ export const CdtCrawler = {
// 点击"招标公告"标签
logger.log('Looking for "招标公告" tab...');
await page.waitForFunction(() => {
const tabs = Array.from(document.querySelectorAll('span.notice-tab'));
return tabs.some(tab => tab.textContent && tab.textContent.includes('招标公告'));
}, { timeout: 60000 });
await page.waitForFunction(
() => {
const tabs = Array.from(document.querySelectorAll('span.notice-tab'));
return tabs.some(
(tab) => tab.textContent && tab.textContent.includes('招标公告'),
);
},
{ timeout: 60000 },
);
await page.evaluate(() => {
const tabs = Array.from(document.querySelectorAll('span.notice-tab'));
const target = tabs.find(tab => tab.textContent && tab.textContent.includes('招标公告')) as HTMLElement;
const target = tabs.find(
(tab) => tab.textContent && tab.textContent.includes('招标公告'),
) as HTMLElement;
if (target) target.click();
});
logger.log('Clicked "招标公告" tab.');
await new Promise(r => setTimeout(r, 2000));
await new Promise((r) => setTimeout(r, 2000));
// 模拟人类行为
logger.log('Simulating human mouse movements...');
@@ -109,26 +128,43 @@ export const CdtCrawler = {
// 点击"招标公告"下的"更多+"链接
logger.log('Looking for "更多+" link under "招标公告"...');
await page.waitForFunction(() => {
const titles = Array.from(document.querySelectorAll('span.h-notice-title'));
return titles.some(title => title.textContent && title.textContent.includes('招标公告'));
}, { timeout: 30000 });
await page.waitForFunction(
() => {
const titles = Array.from(
document.querySelectorAll('span.h-notice-title'),
);
return titles.some(
(title) =>
title.textContent && title.textContent.includes('招标公告'),
);
},
{ timeout: 30000 },
);
await page.evaluate(() => {
const titles = Array.from(document.querySelectorAll('span.h-notice-title'));
const targetTitle = titles.find(title => title.textContent && title.textContent.includes('招标公告'));
const titles = Array.from(
document.querySelectorAll('span.h-notice-title'),
);
const targetTitle = titles.find(
(title) =>
title.textContent && title.textContent.includes('招标公告'),
);
if (targetTitle) {
const parent = targetTitle.parentElement;
if (parent) {
const moreLink = parent.querySelector('a.h-notice-more') as HTMLElement;
const moreLink = parent.querySelector(
'a.h-notice-more',
) as HTMLElement;
if (moreLink) moreLink.click();
}
}
});
logger.log('Clicked "更多+" link under "招标公告".');
await page.waitForNavigation({ waitUntil: 'networkidle2', timeout: 60000 }).catch(() => {});
await new Promise(r => setTimeout(r, 3000));
await page
.waitForNavigation({ waitUntil: 'networkidle2', timeout: 60000 })
.catch(() => {});
await new Promise((r) => setTimeout(r, 3000));
// 模拟人类行为
logger.log('Simulating human mouse movements...');
@@ -155,7 +191,9 @@ export const CdtCrawler = {
}
allResults.push(...pageResults);
logger.log(`Extracted ${pageResults.length} items from page ${currentPage}`);
logger.log(
`Extracted ${pageResults.length} items from page ${currentPage}`,
);
// 模拟人类行为 - 翻页前
logger.log('Simulating human mouse movements before pagination...');
@@ -172,7 +210,9 @@ export const CdtCrawler = {
}, nextButtonSelector);
if (!nextButtonExists) {
logger.log('Next page button not found or disabled. Reached end of list.');
logger.log(
'Next page button not found or disabled. Reached end of list.',
);
break;
}
@@ -186,18 +226,25 @@ export const CdtCrawler = {
}, nextButtonSelector);
// 等待 AJAX 请求完成(通过监听网络请求)
await page.waitForFunction(() => {
// 检查表格是否正在加载
const loading = document.querySelector('.layui-table-loading');
return !loading;
}, { timeout: 30000 }).catch(() => {});
await page
.waitForFunction(
() => {
// 检查表格是否正在加载
const loading = document.querySelector('.layui-table-loading');
return !loading;
},
{ timeout: 30000 },
)
.catch(() => {});
// 额外等待确保数据加载完成
await new Promise(r => setTimeout(r, 2000));
await new Promise((r) => setTimeout(r, 2000));
// 检查是否真的翻页了(通过检查当前页码)
const currentActivePage = await page.evaluate(() => {
const activeSpan = document.querySelector('.layui-laypage-curr em:last-child');
const activeSpan = document.querySelector(
'.layui-laypage-curr em:last-child',
);
return activeSpan ? parseInt(activeSpan.textContent || '1') : 1;
});
@@ -217,25 +264,29 @@ export const CdtCrawler = {
// Random delay between pages
const delay = Math.floor(Math.random() * (3000 - 1000 + 1)) + 1000;
await new Promise(resolve => setTimeout(resolve, delay));
await new Promise((resolve) => setTimeout(resolve, delay));
} catch (navError) {
logger.error(`Navigation to page ${currentPage + 1} failed: ${navError.message}`);
const navErrorMessage =
navError instanceof Error ? navError.message : String(navError);
logger.error(
`Navigation to page ${currentPage + 1} failed: ${navErrorMessage}`,
);
break;
}
}
return allResults;
} catch (error) {
logger.error(`Failed to crawl ${this.name}: ${error.message}`);
const errorMessage =
error instanceof Error ? error.message : String(error);
logger.error(`Failed to crawl ${this.name}: ${errorMessage}`);
return allResults;
} finally {
await page.close();
}
},
extract(html: string): CdtResult[] {
extract(this: CdtCrawlerType, html: string): CdtResult[] {
const results: CdtResult[] = [];
/**
* Regex groups for tang.cdt-ec.com:
@@ -243,23 +294,24 @@ export const CdtCrawler = {
* 2: Title (项目名称)
* 3: Date (发布时间)
*/
const regex = /<tr[^>]*data-index="[^"]*"[^>]*>[\s\S]*?<a[^>]*class="layui-table-link"[^>]*href="([^"]*)"[^>]*>([^<]*)<\/a>[\s\S]*?<td[^>]*data-field="publish_time"[^>]*>[\s\S]*?<div[^>]*class="layui-table-cell[^"]*"[^>]*>([^<]*)<\/div>[\s\S]*?<\/td>[\s\S]*?<\/tr>/gs;
const regex =
/<tr[^>]*data-index="[^"]*"[^>]*>[\s\S]*?<a[^>]*class="layui-table-link"[^>]*href="([^"]*)"[^>]*>([^<]*)<\/a>[\s\S]*?<td[^>]*data-field="publish_time"[^>]*>[\s\S]*?<div[^>]*class="layui-table-cell[^"]*"[^>]*>([^<]*)<\/div>[\s\S]*?<\/td>[\s\S]*?<\/tr>/gs;
let match;
let match: RegExpExecArray | null;
while ((match = regex.exec(html)) !== null) {
const url = match[1]?.trim();
const title = match[2]?.trim();
const dateStr = match[3]?.trim();
const url = match[1]?.trim() ?? '';
const title = match[2]?.trim() ?? '';
const dateStr = match[3]?.trim() ?? '';
if (title && url) {
const fullUrl = url.startsWith('http') ? url : this.baseUrl + url;
results.push({
title,
publishDate: dateStr ? new Date(dateStr) : new Date(),
url: fullUrl.replace(/\/\//g, '/')
url: fullUrl.replace(/\/\//g, '/'),
});
}
}
return results;
}
},
};

View File

@@ -29,7 +29,7 @@ describe('CeicCrawler Real Site Test', () => {
if (proxyArgs.length > 0) {
console.log('Using proxy:', proxyArgs.join(' '));
}
browser = await puppeteer.launch({
headless: false, // Run in non-headless mode
args: [
@@ -40,14 +40,14 @@ describe('CeicCrawler Real Site Test', () => {
'--disable-infobars',
...proxyArgs,
],
defaultViewport: null
defaultViewport: null,
});
});
afterAll(async () => {
if (browser) {
// Keep open for a few seconds after test to see result
await new Promise(r => setTimeout(r, 50000));
await new Promise((r) => setTimeout(r, 50000));
await browser.close();
}
});
@@ -56,29 +56,33 @@ describe('CeicCrawler Real Site Test', () => {
console.log(`
Starting crawl for: ${CeicCrawler.name}`);
console.log(`Target URL: ${CeicCrawler.url}`);
const results = await CeicCrawler.crawl(browser);
console.log(`
Successfully found ${results.length} items:
`);
console.log('----------------------------------------');
results.forEach((item, index) => {
console.log(`${index + 1}. [${item.publishDate.toLocaleDateString()}] ${item.title}`);
console.log(
`${index + 1}. [${item.publishDate.toLocaleDateString()}] ${item.title}`,
);
console.log(` Link: ${item.url}`);
console.log('----------------------------------------');
});
expect(results).toBeDefined();
expect(Array.isArray(results)).toBeTruthy();
if (results.length === 0) {
console.warn('Warning: No items found. Observe browser window to see if content is loading or if there is a verification challenge.');
console.warn(
'Warning: No items found. Observe browser window to see if content is loading or if there is a verification challenge.',
);
} else {
const firstItem = results[0];
expect(firstItem.title).toBeTruthy();
expect(firstItem.url).toMatch(/^https?:\/\//);
expect(firstItem.publishDate).toBeInstanceOf(Date);
const firstItem = results[0];
expect(firstItem.title).toBeTruthy();
expect(firstItem.url).toMatch(/^https?:\/\//);
expect(firstItem.publishDate).toBeInstanceOf(Date);
}
});
});

View File

@@ -12,13 +12,13 @@ async function simulateHumanMouseMovement(page: puppeteer.Page) {
for (let i = 0; i < movements; i++) {
const x = Math.floor(Math.random() * viewport.width);
const y = Math.floor(Math.random() * viewport.height);
await page.mouse.move(x, y, {
steps: 10 + Math.floor(Math.random() * 20) // 10-30步使移动更平滑
steps: 10 + Math.floor(Math.random() * 20), // 10-30步使移动更平滑
});
// 随机停顿 100-500ms
await new Promise(r => setTimeout(r, 100 + Math.random() * 400));
await new Promise((r) => setTimeout(r, 100 + Math.random() * 400));
}
}
@@ -28,23 +28,29 @@ async function simulateHumanScrolling(page: puppeteer.Page) {
for (let i = 0; i < scrollCount; i++) {
const scrollDistance = 100 + Math.floor(Math.random() * 400); // 100-500px
await page.evaluate((distance) => {
window.scrollBy({
top: distance,
behavior: 'smooth'
behavior: 'smooth',
});
}, scrollDistance);
// 随机停顿 500-1500ms
await new Promise(r => setTimeout(r, 500 + Math.random() * 1000));
await new Promise((r) => setTimeout(r, 500 + Math.random() * 1000));
}
// 滚动回顶部
await page.evaluate(() => {
window.scrollTo({ top: 0, behavior: 'smooth' });
});
await new Promise(r => setTimeout(r, 1000));
await new Promise((r) => setTimeout(r, 1000));
}
/**
 * Shape of the CeicCrawler object literal; used as the explicit `this`
 * type for its `crawl()` method.
 */
interface CeicCrawlerType {
// Display name of the source.
name: string;
// Entry page the crawler navigates to first.
url: string;
// Site root; kept for absolutizing relative links if needed.
baseUrl: string;
}
export const CeicCrawler = {
@@ -52,7 +58,10 @@ export const CeicCrawler = {
url: 'https://ceic.dlnyzb.com/3001',
baseUrl: 'https://ceic.dlnyzb.com/',
async crawl(browser: puppeteer.Browser): Promise<ChdtpResult[]> {
async crawl(
this: CeicCrawlerType,
browser: puppeteer.Browser,
): Promise<ChdtpResult[]> {
const logger = new Logger('CeicCrawler');
const page = await browser.newPage();
@@ -65,10 +74,14 @@ export const CeicCrawler = {
await page.evaluateOnNewDocument(() => {
Object.defineProperty(navigator, 'webdriver', { get: () => false });
Object.defineProperty(navigator, 'language', { get: () => 'zh-CN' });
Object.defineProperty(navigator, 'plugins', { get: () => [1, 2, 3, 4, 5] });
Object.defineProperty(navigator, 'plugins', {
get: () => [1, 2, 3, 4, 5],
});
});
await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36');
await page.setUserAgent(
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36',
);
await page.setViewport({ width: 1920, height: 1080 });
const allResults: ChdtpResult[] = [];
@@ -82,7 +95,7 @@ export const CeicCrawler = {
// 模拟人类行为
logger.log('Simulating human mouse movements...');
await simulateHumanMouseMovement(page);
logger.log('Simulating human scrolling...');
await simulateHumanScrolling(page);
@@ -90,16 +103,25 @@ export const CeicCrawler = {
logger.log(`Processing page ${currentPage}...`);
// Wait for content to load - MUI list items
await page.waitForFunction(() => {
return document.querySelectorAll('li.MuiListItem-root').length > 0;
}, { timeout: 60000 }).catch(() => logger.warn('Content not found. Site might be slow.'));
await page
.waitForFunction(
() => {
return (
document.querySelectorAll('li.MuiListItem-root').length > 0
);
},
{ timeout: 60000 },
)
.catch(() => logger.warn('Content not found. Site might be slow.'));
const pageResults = await page.evaluate(() => {
const results: { title: string; dateStr: string; url: string }[] = [];
// Extract from MUI list items
const listItems = Array.from(document.querySelectorAll('li.MuiListItem-root'));
listItems.forEach(item => {
const listItems = Array.from(
document.querySelectorAll('li.MuiListItem-root'),
);
listItems.forEach((item) => {
// Find the title link
const titleLink = item.querySelector('a.css-1vdw90h');
const title = titleLink?.textContent?.trim() || '';
@@ -125,15 +147,19 @@ export const CeicCrawler = {
});
if (pageResults.length === 0) {
logger.warn(`No results found on page ${currentPage}. Extraction failed.`);
logger.warn(
`No results found on page ${currentPage}. Extraction failed.`,
);
break;
}
allResults.push(...pageResults.map(r => ({
title: r.title,
publishDate: r.dateStr ? new Date(r.dateStr) : new Date(),
url: r.url.replace(/\/\//g, '/')
})));
allResults.push(
...pageResults.map((r) => ({
title: r.title,
publishDate: r.dateStr ? new Date(r.dateStr) : new Date(),
url: r.url.replace(/\/\//g, '/'),
})),
);
logger.log(`Extracted ${pageResults.length} items.`);
@@ -142,27 +168,30 @@ export const CeicCrawler = {
if (!nextButton) break;
await nextButton.click();
await new Promise(r => setTimeout(r, 3000));
await new Promise((r) => setTimeout(r, 3000));
// 模拟人类行为
logger.log('Simulating human mouse movements...');
await simulateHumanMouseMovement(page);
logger.log('Simulating human scrolling...');
await simulateHumanScrolling(page);
currentPage++;
}
return allResults;
} catch (error) {
logger.error(`Crawl failed: ${error.message}`);
const errorMessage =
error instanceof Error ? error.message : String(error);
logger.error(`Crawl failed: ${errorMessage}`);
return allResults;
} finally {
if (page) await page.close();
}
},
extract() { return []; }
extract() {
return [];
},
};

View File

@@ -2,7 +2,7 @@ import { CgnpcCrawler } from './cgnpc_target';
import * as puppeteer from 'puppeteer';
// Increase timeout to 60 seconds for network operations
jest.setTimeout(60000*5);
jest.setTimeout(60000 * 5);
// 获取代理配置
const getProxyArgs = (): string[] => {
@@ -29,7 +29,7 @@ describe('CgnpcCrawler Real Site Test', () => {
if (proxyArgs.length > 0) {
console.log('Using proxy:', proxyArgs.join(' '));
}
browser = await puppeteer.launch({
headless: false, // Change to false to see browser UI
args: ['--no-sandbox', '--disable-setuid-sandbox', ...proxyArgs],
@@ -45,13 +45,15 @@ describe('CgnpcCrawler Real Site Test', () => {
it('should visit website and list all found bid information', async () => {
console.log(`\nStarting crawl for: ${CgnpcCrawler.name}`);
console.log(`Target URL: ${CgnpcCrawler.url}`);
const results = await CgnpcCrawler.crawl(browser);
console.log(`\nSuccessfully found ${results.length} items:\n`);
console.log('----------------------------------------');
results.forEach((item, index) => {
console.log(`${index + 1}. [${item.publishDate.toLocaleDateString()}] ${item.title}`);
console.log(
`${index + 1}. [${item.publishDate.toLocaleDateString()}] ${item.title}`,
);
console.log(` Link: ${item.url}`);
console.log('----------------------------------------');
});
@@ -61,13 +63,15 @@ describe('CgnpcCrawler Real Site Test', () => {
expect(Array.isArray(results)).toBeTruthy();
// Warn but don't fail if site returns 0 items (could be empty or changed structure)
if (results.length === 0) {
console.warn('Warning: No items found. Check if website structure has changed or if list is currently empty.');
console.warn(
'Warning: No items found. Check if website structure has changed or if list is currently empty.',
);
} else {
// Check data integrity of first item
const firstItem = results[0];
expect(firstItem.title).toBeTruthy();
expect(firstItem.url).toMatch(/^https?:\/\//);
expect(firstItem.publishDate).toBeInstanceOf(Date);
// Check data integrity of first item
const firstItem = results[0];
expect(firstItem.title).toBeTruthy();
expect(firstItem.url).toMatch(/^https?:\/\//);
expect(firstItem.publishDate).toBeInstanceOf(Date);
}
});
});

View File

@@ -11,13 +11,13 @@ async function simulateHumanMouseMovement(page: puppeteer.Page) {
for (let i = 0; i < movements; i++) {
const x = Math.floor(Math.random() * viewport.width);
const y = Math.floor(Math.random() * viewport.height);
await page.mouse.move(x, y, {
steps: 10 + Math.floor(Math.random() * 20) // 10-30步使移动更平滑
steps: 10 + Math.floor(Math.random() * 20), // 10-30步使移动更平滑
});
// 随机停顿 100-500ms
await new Promise(r => setTimeout(r, 100 + Math.random() * 400));
await new Promise((r) => setTimeout(r, 100 + Math.random() * 400));
}
}
@@ -27,23 +27,23 @@ async function simulateHumanScrolling(page: puppeteer.Page) {
for (let i = 0; i < scrollCount; i++) {
const scrollDistance = 100 + Math.floor(Math.random() * 400); // 100-500px
await page.evaluate((distance) => {
window.scrollBy({
top: distance,
behavior: 'smooth'
behavior: 'smooth',
});
}, scrollDistance);
// 随机停顿 500-1500ms
await new Promise(r => setTimeout(r, 500 + Math.random() * 1000));
await new Promise((r) => setTimeout(r, 500 + Math.random() * 1000));
}
// 滚动回顶部
await page.evaluate(() => {
window.scrollTo({ top: 0, behavior: 'smooth' });
});
await new Promise(r => setTimeout(r, 1000));
await new Promise((r) => setTimeout(r, 1000));
}
export interface CgnpcResult {
@@ -52,12 +52,22 @@ export interface CgnpcResult {
url: string;
}
interface CgnpcCrawlerType {
name: string;
url: string;
baseUrl: string;
extract(html: string): CgnpcResult[];
}
export const CgnpcCrawler = {
name: '中广核电子商务平台',
url: 'https://ecp.cgnpc.com.cn/zbgg.html',
baseUrl: 'https://ecp.cgnpc.com.cn/',
async crawl(browser: puppeteer.Browser): Promise<CgnpcResult[]> {
async crawl(
this: CgnpcCrawlerType,
browser: puppeteer.Browser,
): Promise<CgnpcResult[]> {
const logger = new Logger('CgnpcCrawler');
const page = await browser.newPage();
@@ -69,11 +79,15 @@ export const CgnpcCrawler = {
await page.evaluateOnNewDocument(() => {
Object.defineProperty(navigator, 'webdriver', { get: () => false });
Object.defineProperty(navigator, 'language', { get: () => "zh-CN"});
Object.defineProperty(navigator, 'plugins', { get: () => [1,2,3,4,5]});
Object.defineProperty(navigator, 'language', { get: () => 'zh-CN' });
Object.defineProperty(navigator, 'plugins', {
get: () => [1, 2, 3, 4, 5],
});
});
await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36');
await page.setUserAgent(
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36',
);
await page.setViewport({ width: 1920, height: 1080 });
const allResults: CgnpcResult[] = [];
@@ -87,7 +101,7 @@ export const CgnpcCrawler = {
// 模拟人类行为
logger.log('Simulating human mouse movements...');
await simulateHumanMouseMovement(page);
logger.log('Simulating human scrolling...');
await simulateHumanScrolling(page);
@@ -103,12 +117,14 @@ export const CgnpcCrawler = {
}
allResults.push(...pageResults);
logger.log(`Extracted ${pageResults.length} items from page ${currentPage}`);
logger.log(
`Extracted ${pageResults.length} items from page ${currentPage}`,
);
// 模拟人类行为 - 翻页前
logger.log('Simulating human mouse movements before pagination...');
await simulateHumanMouseMovement(page);
logger.log('Simulating human scrolling before pagination...');
await simulateHumanScrolling(page);
@@ -127,9 +143,13 @@ export const CgnpcCrawler = {
try {
// 点击下一页按钮
await nextButton.click();
await new Promise(r => setTimeout(r, 3000)); // 等待页面加载
await new Promise((r) => setTimeout(r, 3000)); // 等待页面加载
} catch (navError) {
logger.error(`Navigation to page ${currentPage + 1} failed: ${navError.message}`);
const navErrorMessage =
navError instanceof Error ? navError.message : String(navError);
logger.error(
`Navigation to page ${currentPage + 1} failed: ${navErrorMessage}`,
);
break;
}
@@ -138,26 +158,27 @@ export const CgnpcCrawler = {
// 模拟人类行为 - 翻页后
logger.log('Simulating human mouse movements after pagination...');
await simulateHumanMouseMovement(page);
logger.log('Simulating human scrolling after pagination...');
await simulateHumanScrolling(page);
// Random delay between pages
const delay = Math.floor(Math.random() * (3000 - 1000 + 1)) + 1000;
await new Promise(resolve => setTimeout(resolve, delay));
await new Promise((resolve) => setTimeout(resolve, delay));
}
return allResults;
} catch (error) {
logger.error(`Failed to crawl ${this.name}: ${error.message}`);
const errorMessage =
error instanceof Error ? error.message : String(error);
logger.error(`Failed to crawl ${this.name}: ${errorMessage}`);
return allResults;
} finally {
await page.close();
}
},
extract(html: string): CgnpcResult[] {
extract(this: CgnpcCrawlerType, html: string): CgnpcResult[] {
const results: CgnpcResult[] = [];
/**
* Regex groups for ecp.cgnpc.com.cn:
@@ -181,24 +202,25 @@ export const CgnpcCrawler = {
* </div>
* </div>
*/
const regex = /<div class="zbnr">[\s\S]*?<a[^>]*title="([^"]*)"[^>]*href="([^"]*)"[^>]*>[\s\S]*?<dt>[\s\S]*?<p><\/p>[\s\S]*?<h2>\s*(\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2})\s*<\/h2>[\s\S]*?<\/div>/gs;
const regex =
/<div class="zbnr">[\s\S]*?<a[^>]*title="([^"]*)"[^>]*href="([^"]*)"[^>]*>[\s\S]*?<dt>[\s\S]*?<p><\/p>[\s\S]*?<h2>\s*(\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2})\s*<\/h2>[\s\S]*?<\/div>/gs;
let match;
let match: RegExpExecArray | null;
while ((match = regex.exec(html)) !== null) {
const title = match[1]?.trim();
const url = match[2]?.trim();
const dateStr = match[3]?.trim();
const title = match[1]?.trim() ?? '';
const url = match[2]?.trim() ?? '';
const dateStr = match[3]?.trim() ?? '';
if (title && url) {
const fullUrl = url.startsWith('http') ? url : this.baseUrl + url;
results.push({
title,
publishDate: dateStr ? new Date(dateStr) : new Date(),
url: fullUrl.replace(/\/\//g, '/')
url: fullUrl.replace(/\/\//g, '/'),
});
}
}
return results;
}
},
};

View File

@@ -29,7 +29,7 @@ describe('ChdtpCrawler Real Site Test', () => {
if (proxyArgs.length > 0) {
console.log('Using proxy:', proxyArgs.join(' '));
}
browser = await puppeteer.launch({
headless: true, // Change to false to see the browser UI
args: ['--no-sandbox', '--disable-setuid-sandbox', ...proxyArgs],
@@ -45,13 +45,15 @@ describe('ChdtpCrawler Real Site Test', () => {
it('should visit the website and list all found bid information', async () => {
console.log(`\nStarting crawl for: ${ChdtpCrawler.name}`);
console.log(`Target URL: ${ChdtpCrawler.url}`);
const results = await ChdtpCrawler.crawl(browser);
console.log(`\nSuccessfully found ${results.length} items:\n`);
console.log('----------------------------------------');
results.forEach((item, index) => {
console.log(`${index + 1}. [${item.publishDate.toLocaleDateString()}] ${item.title}`);
console.log(
`${index + 1}. [${item.publishDate.toLocaleDateString()}] ${item.title}`,
);
console.log(` Link: ${item.url}`);
console.log('----------------------------------------');
});
@@ -61,13 +63,15 @@ describe('ChdtpCrawler Real Site Test', () => {
expect(Array.isArray(results)).toBeTruthy();
// Warn but don't fail if site returns 0 items (could be empty or changed structure)
if (results.length === 0) {
console.warn('Warning: No items found. Check if the website structure has changed or if the list is currently empty.');
console.warn(
'Warning: No items found. Check if the website structure has changed or if the list is currently empty.',
);
} else {
// Check data integrity of the first item
const firstItem = results[0];
expect(firstItem.title).toBeTruthy();
expect(firstItem.url).toMatch(/^https?:\/\//);
expect(firstItem.publishDate).toBeInstanceOf(Date);
// Check data integrity of the first item
const firstItem = results[0];
expect(firstItem.title).toBeTruthy();
expect(firstItem.url).toMatch(/^https?:\/\//);
expect(firstItem.publishDate).toBeInstanceOf(Date);
}
});
});
});

View File

@@ -7,22 +7,34 @@ export interface ChdtpResult {
url: string; // Necessary for system uniqueness
}
interface ChdtpCrawlerType {
name: string;
url: string;
baseUrl: string;
extract(html: string): ChdtpResult[];
}
export const ChdtpCrawler = {
name: '华电集团电子商务平台 ',
url: 'https://www.chdtp.com/webs/queryWebZbgg.action?zbggType=1',
baseUrl: 'https://www.chdtp.com/webs/',
async crawl(browser: puppeteer.Browser): Promise<ChdtpResult[]> {
async crawl(
this: ChdtpCrawlerType,
browser: puppeteer.Browser,
): Promise<ChdtpResult[]> {
const logger = new Logger('ChdtpCrawler');
const page = await browser.newPage();
const username = process.env.PROXY_USERNAME;
const password = process.env.PROXY_PASSWORD;
if (username && password) {
await page.authenticate({ username, password });
}
await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36');
await page.setUserAgent(
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36',
);
const allResults: ChdtpResult[] = [];
let currentPage = 1;
@@ -35,14 +47,16 @@ export const ChdtpCrawler = {
while (currentPage <= maxPages) {
const content = await page.content();
const pageResults = this.extract(content);
if (pageResults.length === 0) {
logger.warn(`No results found on page ${currentPage}, stopping.`);
break;
}
allResults.push(...pageResults);
logger.log(`Extracted ${pageResults.length} items from page ${currentPage}`);
logger.log(
`Extracted ${pageResults.length} items from page ${currentPage}`,
);
// Find the "Next Page" button
// Using partial match for src to be robust against path variations
@@ -58,35 +72,43 @@ export const ChdtpCrawler = {
// For this specific site, we'll try to click.
logger.log(`Navigating to page ${currentPage + 1}...`);
try {
await Promise.all([
page.waitForNavigation({ waitUntil: 'networkidle2', timeout: 60000 }),
page.waitForNavigation({
waitUntil: 'networkidle2',
timeout: 60000,
}),
nextButton.click(),
]);
} catch (navError) {
logger.error(`Navigation to page ${currentPage + 1} failed: ${navError.message}`);
const navErrorMessage =
navError instanceof Error ? navError.message : String(navError);
logger.error(
`Navigation to page ${currentPage + 1} failed: ${navErrorMessage}`,
);
break;
}
currentPage++;
// Random delay between pages
const delay = Math.floor(Math.random() * (3000 - 1000 + 1)) + 1000;
await new Promise(resolve => setTimeout(resolve, delay));
await new Promise((resolve) => setTimeout(resolve, delay));
}
return allResults;
} catch (error) {
logger.error(`Failed to crawl ${this.name}: ${error.message}`);
const errorMessage =
error instanceof Error ? error.message : String(error);
logger.error(`Failed to crawl ${this.name}: ${errorMessage}`);
return allResults; // Return what we have so far
} finally {
await page.close();
}
},
extract(html: string): ChdtpResult[] {
extract(this: ChdtpCrawlerType, html: string): ChdtpResult[] {
const results: ChdtpResult[] = [];
/**
* Regex groups for chdtp.com:
@@ -96,23 +118,24 @@ export const ChdtpCrawler = {
* 4: Business Type
* 5: Date
*/
const regex = /<tr[^>]*>\s*<td class="td_1">.*?<span[^>]*>\s*(.*?)\s*<\/span>.*?<\/td>\s*<td class="td_2">\s*<a[^>]*href="javascript:toGetContent\('(.*?)'\)" title="(.*?)">.*?<\/a><\/td>\s*<td class="td_3">\s*<a[^>]*>\s*(.*?)\s*<\/a>\s*<\/td>\s*<td class="td_4"><span>\[(.*?)\]<\/span><\/td>/gs;
const regex =
/<tr[^>]*>\s*<td class="td_1">.*?<span[^>]*>\s*(.*?)\s*<\/span>.*?<\/td>\s*<td class="td_2">\s*<a[^>]*href="javascript:toGetContent\('(.*?)'\)" title="(.*?)">.*?<\/a><\/td>\s*<td class="td_3">\s*<a[^>]*>\s*(.*?)\s*<\/a>\s*<\/td>\s*<td class="td_4"><span>\[(.*?)\]<\/span><\/td>/gs;
let match;
let match: RegExpExecArray | null;
while ((match = regex.exec(html)) !== null) {
const urlSuffix = match[2]?.trim();
const title = match[3]?.trim();
const dateStr = match[5]?.trim();
const urlSuffix = match[2]?.trim() ?? '';
const title = match[3]?.trim() ?? '';
const dateStr = match[5]?.trim() ?? '';
if (title && urlSuffix) {
const fullUrl = this.baseUrl + urlSuffix;
results.push({
title,
publishDate: dateStr ? new Date(dateStr) : new Date(),
url: fullUrl.replace(/\/\//g, '/')
url: fullUrl.replace(/\/\//g, '/'),
});
}
}
return results;
}
};
},
};

View File

@@ -31,13 +31,13 @@ async function simulateHumanMouseMovement(page: puppeteer.Page) {
for (let i = 0; i < movements; i++) {
const x = Math.floor(Math.random() * viewport.width);
const y = Math.floor(Math.random() * viewport.height);
await page.mouse.move(x, y, {
steps: 10 + Math.floor(Math.random() * 20) // 10-30步使移动更平滑
steps: 10 + Math.floor(Math.random() * 20), // 10-30步使移动更平滑
});
// 随机停顿 100-500ms
await new Promise(r => setTimeout(r, 100 + Math.random() * 400));
await new Promise((r) => setTimeout(r, 100 + Math.random() * 400));
}
}
@@ -47,23 +47,23 @@ async function simulateHumanScrolling(page: puppeteer.Page) {
for (let i = 0; i < scrollCount; i++) {
const scrollDistance = 100 + Math.floor(Math.random() * 400); // 100-500px
await page.evaluate((distance) => {
window.scrollBy({
top: distance,
behavior: 'smooth'
behavior: 'smooth',
});
}, scrollDistance);
// 随机停顿 500-1500ms
await new Promise(r => setTimeout(r, 500 + Math.random() * 1000));
await new Promise((r) => setTimeout(r, 500 + Math.random() * 1000));
}
// 滚动回顶部
await page.evaluate(() => {
window.scrollTo({ top: 0, behavior: 'smooth' });
});
await new Promise(r => setTimeout(r, 1000));
await new Promise((r) => setTimeout(r, 1000));
}
describe('ChngCrawler Real Site Test', () => {
@@ -74,7 +74,7 @@ describe('ChngCrawler Real Site Test', () => {
if (proxyArgs.length > 0) {
console.log('Using proxy:', proxyArgs.join(' '));
}
browser = await puppeteer.launch({
headless: false, // Run in non-headless mode
args: [
@@ -82,7 +82,7 @@ describe('ChngCrawler Real Site Test', () => {
'--disable-setuid-sandbox',
'--disable-blink-features=AutomationControlled',
'--window-size=1920,1080',
"--disable-infobars",
'--disable-infobars',
...proxyArgs,
// "--headless=new",
// '--disable-dev-shm-usage',
@@ -94,15 +94,14 @@ describe('ChngCrawler Real Site Test', () => {
// '--disable-webgl',
// '--disable-javascript',
],
defaultViewport: null
defaultViewport: null,
});
});
afterAll(async () => {
if (browser) {
// Keep open for a few seconds after test to see result
await new Promise(r => setTimeout(r, 50000));
await new Promise((r) => setTimeout(r, 50000));
await browser.close();
}
});
@@ -111,43 +110,51 @@ describe('ChngCrawler Real Site Test', () => {
console.log(`
Starting crawl for: ${ChngCrawler.name}`);
console.log(`Target URL: ${ChngCrawler.url}`);
// 创建一个临时页面用于模拟人类行为
const tempPage = await browser.newPage();
await tempPage.setViewport({ width: 1920, height: 1080, deviceScaleFactor: 1 });
await tempPage.setViewport({
width: 1920,
height: 1080,
deviceScaleFactor: 1,
});
// 模拟人类鼠标移动
console.log('Simulating human mouse movements...');
await simulateHumanMouseMovement(tempPage);
// 模拟人类滚动
console.log('Simulating human scrolling...');
await simulateHumanScrolling(tempPage);
await tempPage.close();
const results = await ChngCrawler.crawl(browser);
console.log(`
Successfully found ${results.length} items:
`);
console.log('----------------------------------------');
results.forEach((item, index) => {
console.log(`${index + 1}. [${item.publishDate.toLocaleDateString()}] ${item.title}`);
console.log(
`${index + 1}. [${item.publishDate.toLocaleDateString()}] ${item.title}`,
);
console.log(` Link: ${item.url}`);
console.log('----------------------------------------');
});
expect(results).toBeDefined();
expect(Array.isArray(results)).toBeTruthy();
if (results.length === 0) {
console.warn('Warning: No items found. Observe the browser window to see if content is loading or if there is a verification challenge.');
console.warn(
'Warning: No items found. Observe the browser window to see if content is loading or if there is a verification challenge.',
);
} else {
const firstItem = results[0];
expect(firstItem.title).toBeTruthy();
expect(firstItem.url).toMatch(/^https?:\/\//);
expect(firstItem.publishDate).toBeInstanceOf(Date);
const firstItem = results[0];
expect(firstItem.title).toBeTruthy();
expect(firstItem.url).toMatch(/^https?:\/\//);
expect(firstItem.publishDate).toBeInstanceOf(Date);
}
});
});
});

View File

@@ -16,19 +16,20 @@ async function simulateHumanMouseMovement(page: puppeteer.Page) {
console.log('Page was closed during mouse movement simulation');
return;
}
const x = Math.floor(Math.random() * viewport.width);
const y = Math.floor(Math.random() * viewport.height);
await page.mouse.move(x, y, {
steps: 10 + Math.floor(Math.random() * 20) // 10-30步使移动更平滑
steps: 10 + Math.floor(Math.random() * 20), // 10-30步使移动更平滑
});
// 随机停顿 100-500ms
await new Promise(r => setTimeout(r, 100 + Math.random() * 400));
await new Promise((r) => setTimeout(r, 100 + Math.random() * 400));
}
} catch (error) {
console.log('Mouse movement simulation interrupted:', error.message);
const errorMessage = error instanceof Error ? error.message : String(error);
console.log('Mouse movement simulation interrupted:', errorMessage);
}
}
@@ -43,18 +44,18 @@ async function simulateHumanScrolling(page: puppeteer.Page) {
console.log('Page was closed during scrolling simulation');
return;
}
const scrollDistance = 100 + Math.floor(Math.random() * 400); // 100-500px
await page.evaluate((distance) => {
window.scrollBy({
top: distance,
behavior: 'smooth'
behavior: 'smooth',
});
}, scrollDistance);
// 随机停顿 500-1500ms
await new Promise(r => setTimeout(r, 500 + Math.random() * 1000));
await new Promise((r) => setTimeout(r, 500 + Math.random() * 1000));
}
// 滚动回顶部
@@ -62,19 +63,29 @@ async function simulateHumanScrolling(page: puppeteer.Page) {
await page.evaluate(() => {
window.scrollTo({ top: 0, behavior: 'smooth' });
});
await new Promise(r => setTimeout(r, 1000));
await new Promise((r) => setTimeout(r, 1000));
}
} catch (error) {
console.log('Scrolling simulation interrupted:', error.message);
const errorMessage = error instanceof Error ? error.message : String(error);
console.log('Scrolling simulation interrupted:', errorMessage);
}
}
interface ChngCrawlerType {
name: string;
url: string;
baseUrl: string;
}
export const ChngCrawler = {
name: '华能集团电子商务平台',
url: 'https://ec.chng.com.cn/channel/home/#/purchase?top=0',
baseUrl: 'https://ec.chng.com.cn/channel/home/#',
async crawl(browser: puppeteer.Browser): Promise<ChdtpResult[]> {
async crawl(
this: ChngCrawlerType,
browser: puppeteer.Browser,
): Promise<ChdtpResult[]> {
const logger = new Logger('ChngCrawler');
let page = await browser.newPage();
// await page.setViewport({ width: 1920, height: 1080, deviceScaleFactor: 1 });
@@ -84,42 +95,48 @@ export const ChngCrawler = {
if (username && password) {
await page.authenticate({ username, password });
}
await page.evaluateOnNewDocument(() => {
Object.defineProperty(navigator, 'webdriver', { get: () => false });
Object.defineProperty(navigator, 'language', { get: () => "zh-CN"});
Object.defineProperty(navigator, 'plugins', { get: () => [1,2,3,4,5]});
Object.defineProperty(navigator, 'language', { get: () => 'zh-CN' });
Object.defineProperty(navigator, 'plugins', {
get: () => [1, 2, 3, 4, 5],
});
});
await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36');
await page.setUserAgent(
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36',
);
await page.setViewport({ width: 1920, height: 1080 });
const allResults: ChdtpResult[] = [];
let currentPage = 1;
const maxPages = 5;
const maxPages = 5;
try {
logger.log('Navigating to Bing...');
await page.goto('https://cn.bing.com', { waitUntil: 'networkidle2' });
logger.log('Searching for target site...');
const searchBoxSelector = 'input[name="q"]';
const searchBoxSelector = 'input[name="q"]';
await page.waitForSelector(searchBoxSelector);
await page.type(searchBoxSelector, 'https://ec.chng.com.cn/');
await page.keyboard.press('Enter');
await page.waitForNavigation({ waitUntil: 'networkidle2' });
logger.log('Clicking search result...');
// await page.screenshot({ path: 'bing.png' });
const firstResultSelector = '#b_results .b_algo h2 a';
await page.waitForSelector(firstResultSelector);
const newTargetPromise = browser.waitForTarget(target => target.opener() === page.target());
const newTargetPromise = browser.waitForTarget(
(target) => target.opener() === page.target(),
);
await page.click(firstResultSelector);
const newTarget = await newTargetPromise;
const newPage = await newTarget.page();
if (newPage) {
// await newPage.screenshot({ path: 'newPage.png' });
await page.close();
@@ -131,108 +148,135 @@ export const ChngCrawler = {
// 模拟人类行为
logger.log('Simulating human mouse movements...');
await simulateHumanMouseMovement(page);
logger.log('Simulating human scrolling...');
await simulateHumanScrolling(page);
// 等待页面稳定,不强制等待导航
await new Promise(r => setTimeout(r, 3000));
// 模拟人类行为
logger.log('Simulating human mouse movements...');
await simulateHumanMouseMovement(page);
logger.log('Simulating human scrolling...');
await simulateHumanScrolling(page);
// PAUSE 15 SECONDS as requested
logger.log('Pausing 15 seconds before looking for "采购专栏"...');
await new Promise(r => setTimeout(r, 15000));
// await page.screenshot({ path: 'huaneng.png' });
await new Promise((r) => setTimeout(r, 3000));
logger.log('Looking for "采购专栏" link...');
await page.waitForFunction(() => {
const divs = Array.from(document.querySelectorAll('div.text'));
return divs.some(div => div.textContent && div.textContent.includes('采购专栏'));
}, { timeout: 60000 });
const purchaseTargetPromise = browser.waitForTarget(target => target.opener() === page.target(), { timeout: 15000 }).catch(() => null);
await page.evaluate(() => {
const divs = Array.from(document.querySelectorAll('div.text'));
const target = divs.find(div => div.textContent && div.textContent.includes('采购专栏')) as HTMLElement;
if (target) target.click();
});
const purchaseTarget = await purchaseTargetPromise;
if (purchaseTarget) {
const pPage = await purchaseTarget.page();
if (pPage) {
logger.log('Switched to Purchase Page tab.');
page = pPage;
if (username && password) {
await page.authenticate({ username, password });
}
await new Promise(r => setTimeout(r, 5000));
}
}
logger.log(`Active URL: ${page.url()}`);
// 模拟人类行为
logger.log('Simulating human mouse movements...');
await simulateHumanMouseMovement(page);
logger.log('Simulating human scrolling...');
await simulateHumanScrolling(page);
// PAUSE 15 SECONDS as requested
logger.log('Pausing 15 seconds before looking for "采购专栏"...');
await new Promise((r) => setTimeout(r, 15000));
// await page.screenshot({ path: 'huaneng.png' });
logger.log('Looking for "采购专栏" link...');
await page.waitForFunction(
() => {
const divs = Array.from(document.querySelectorAll('div.text'));
return divs.some(
(div) => div.textContent && div.textContent.includes('采购专栏'),
);
},
{ timeout: 60000 },
);
const purchaseTargetPromise = browser
.waitForTarget((target) => target.opener() === page.target(), {
timeout: 15000,
})
.catch(() => null);
await page.evaluate(() => {
const divs = Array.from(document.querySelectorAll('div.text'));
const target = divs.find(
(div) => div.textContent && div.textContent.includes('采购专栏'),
) as HTMLElement;
if (target) target.click();
});
const purchaseTarget = await purchaseTargetPromise;
if (purchaseTarget) {
const pPage = await purchaseTarget.page();
if (pPage) {
logger.log('Switched to Purchase Page tab.');
page = pPage;
if (username && password) {
await page.authenticate({ username, password });
}
await new Promise((r) => setTimeout(r, 5000));
}
}
logger.log(`Active URL: ${page.url()}`);
// 模拟人类行为
logger.log('Simulating human mouse movements...');
await simulateHumanMouseMovement(page);
logger.log('Simulating human scrolling...');
await simulateHumanScrolling(page);
while (currentPage <= maxPages) {
logger.log(`Processing page ${currentPage}...`);
// Wait for table rows to load
await page.waitForFunction(() => {
return document.querySelectorAll('tr.ant-table-row').length > 0;
}, { timeout: 60000 }).catch(() => logger.warn('Content not found. Site might be slow.'));
await page
.waitForFunction(
() => {
return document.querySelectorAll('tr.ant-table-row').length > 0;
},
{ timeout: 60000 },
)
.catch(() => logger.warn('Content not found. Site might be slow.'));
const pageResults = await page.evaluate((baseUrl) => {
// Extract from table rows
const items = Array.from(document.querySelectorAll('tr.ant-table-row'));
return items.map(item => {
const titleSpan = item.querySelector('span.list-text');
const dateCell = item.querySelector('td.ant-table-row-cell-break-word p');
if (titleSpan && dateCell) {
const title = titleSpan.textContent?.trim() || '';
const dateStr = dateCell.textContent?.trim() || '';
if (title.length < 5) return null; // Filter noise
// URL is not directly available in the table, need to construct from data-row-key
const rowKey = item.getAttribute('data-row-key');
const url = rowKey ? `${baseUrl}#/purchase/detail?id=${rowKey}` : '';
return {
title,
dateStr,
url
};
}
return null;
}).filter(i => i !== null);
const items = Array.from(
document.querySelectorAll('tr.ant-table-row'),
);
return items
.map((item) => {
const titleSpan = item.querySelector('span.list-text');
const dateCell = item.querySelector(
'td.ant-table-row-cell-break-word p',
);
if (titleSpan && dateCell) {
const title = titleSpan.textContent?.trim() || '';
const dateStr = dateCell.textContent?.trim() || '';
if (title.length < 5) return null; // Filter noise
// URL is not directly available in the table, need to construct from data-row-key
const rowKey = item.getAttribute('data-row-key');
const url = rowKey
? `${baseUrl}#/purchase/detail?id=${rowKey}`
: '';
return {
title,
dateStr,
url,
};
}
return null;
})
.filter((i) => i !== null);
}, this.baseUrl);
if (pageResults.length === 0) {
logger.warn(`No results found on page ${currentPage}. Extraction failed.`);
break;
logger.warn(
`No results found on page ${currentPage}. Extraction failed.`,
);
break;
}
allResults.push(...pageResults.map(r => ({
title: r!.title,
publishDate: new Date(r!.dateStr),
url: r!.url.replace(/\/\//g, '/')
})));
allResults.push(
...pageResults.map((r) => ({
title: r.title,
publishDate: new Date(r.dateStr),
url: r.url.replace(/\/\//g, '/'),
})),
);
logger.log(`Extracted ${pageResults.length} items.`);
// Pagination: look for the "right" icon SVG
@@ -241,34 +285,37 @@ export const ChngCrawler = {
// 点击下一页前保存当前页面状态
const currentUrl = page.url();
await nextButton.click();
// 等待页面导航完成
try {
await page.waitForFunction(
(oldUrl) => window.location.href !== oldUrl,
{ timeout: 10000 },
currentUrl
currentUrl,
);
} catch (e) {
} catch {
logger.warn('Navigation timeout, continuing anyway');
}
// 等待页面内容加载
await new Promise(r => setTimeout(r, 15000));
await new Promise((r) => setTimeout(r, 15000));
currentPage++;
}
return allResults;
} catch (error) {
logger.error(`Crawl failed: ${error.message}`);
const errorMessage =
error instanceof Error ? error.message : String(error);
logger.error(`Crawl failed: ${errorMessage}`);
return allResults;
} finally {
if (page) await page.close();
}
},
extract() { return []; }
};
extract() {
return [];
},
};

View File

@@ -2,7 +2,7 @@ import { CnncecpCrawler } from './cnncecp_target';
import * as puppeteer from 'puppeteer';
// Increase timeout to 60 seconds for network operations
jest.setTimeout(60000*5);
jest.setTimeout(60000 * 5);
// 获取代理配置
const getProxyArgs = (): string[] => {
@@ -29,7 +29,7 @@ describe('CnncecpCrawler Real Site Test', () => {
if (proxyArgs.length > 0) {
console.log('Using proxy:', proxyArgs.join(' '));
}
browser = await puppeteer.launch({
headless: false, // Change to false to see browser UI
args: ['--no-sandbox', '--disable-setuid-sandbox', ...proxyArgs],
@@ -45,13 +45,15 @@ describe('CnncecpCrawler Real Site Test', () => {
it('should visit website and list all found bid information', async () => {
console.log(`\nStarting crawl for: ${CnncecpCrawler.name}`);
console.log(`Target URL: ${CnncecpCrawler.url}`);
const results = await CnncecpCrawler.crawl(browser);
console.log(`\nSuccessfully found ${results.length} items:\n`);
console.log('----------------------------------------');
results.forEach((item, index) => {
console.log(`${index + 1}. [${item.publishDate.toLocaleDateString()}] ${item.title}`);
console.log(
`${index + 1}. [${item.publishDate.toLocaleDateString()}] ${item.title}`,
);
console.log(` Link: ${item.url}`);
console.log('----------------------------------------');
});
@@ -61,13 +63,15 @@ describe('CnncecpCrawler Real Site Test', () => {
expect(Array.isArray(results)).toBeTruthy();
// Warn but don't fail if site returns 0 items (could be empty or changed structure)
if (results.length === 0) {
console.warn('Warning: No items found. Check if website structure has changed or if list is currently empty.');
console.warn(
'Warning: No items found. Check if website structure has changed or if list is currently empty.',
);
} else {
// Check data integrity of first item
const firstItem = results[0];
expect(firstItem.title).toBeTruthy();
expect(firstItem.url).toMatch(/^https?:\/\//);
expect(firstItem.publishDate).toBeInstanceOf(Date);
// Check data integrity of first item
const firstItem = results[0];
expect(firstItem.title).toBeTruthy();
expect(firstItem.url).toMatch(/^https?:\/\//);
expect(firstItem.publishDate).toBeInstanceOf(Date);
}
});
});

View File

@@ -11,13 +11,13 @@ async function simulateHumanMouseMovement(page: puppeteer.Page) {
for (let i = 0; i < movements; i++) {
const x = Math.floor(Math.random() * viewport.width);
const y = Math.floor(Math.random() * viewport.height);
await page.mouse.move(x, y, {
steps: 10 + Math.floor(Math.random() * 20) // 10-30步使移动更平滑
steps: 10 + Math.floor(Math.random() * 20), // 10-30步使移动更平滑
});
// 随机停顿 100-500ms
await new Promise(r => setTimeout(r, 100 + Math.random() * 400));
await new Promise((r) => setTimeout(r, 100 + Math.random() * 400));
}
}
@@ -27,23 +27,23 @@ async function simulateHumanScrolling(page: puppeteer.Page) {
for (let i = 0; i < scrollCount; i++) {
const scrollDistance = 100 + Math.floor(Math.random() * 400); // 100-500px
await page.evaluate((distance) => {
window.scrollBy({
top: distance,
behavior: 'smooth'
behavior: 'smooth',
});
}, scrollDistance);
// 随机停顿 500-1500ms
await new Promise(r => setTimeout(r, 500 + Math.random() * 1000));
await new Promise((r) => setTimeout(r, 500 + Math.random() * 1000));
}
// 滚动回顶部
await page.evaluate(() => {
window.scrollTo({ top: 0, behavior: 'smooth' });
});
await new Promise(r => setTimeout(r, 1000));
await new Promise((r) => setTimeout(r, 1000));
}
export interface CnncecpResult {
@@ -52,12 +52,22 @@ export interface CnncecpResult {
url: string;
}
interface CnncecpCrawlerType {
name: string;
url: string;
baseUrl: string;
extract(html: string): CnncecpResult[];
}
export const CnncecpCrawler = {
name: '中核集团电子采购平台',
url: 'https://www.cnncecp.com/xzbgg/index.jhtml',
baseUrl: 'https://www.cnncecp.com/',
async crawl(browser: puppeteer.Browser): Promise<CnncecpResult[]> {
async crawl(
this: CnncecpCrawlerType,
browser: puppeteer.Browser,
): Promise<CnncecpResult[]> {
const logger = new Logger('CnncecpCrawler');
const page = await browser.newPage();
@@ -69,11 +79,15 @@ export const CnncecpCrawler = {
await page.evaluateOnNewDocument(() => {
Object.defineProperty(navigator, 'webdriver', { get: () => false });
Object.defineProperty(navigator, 'language', { get: () => "zh-CN"});
Object.defineProperty(navigator, 'plugins', { get: () => [1,2,3,4,5]});
Object.defineProperty(navigator, 'language', { get: () => 'zh-CN' });
Object.defineProperty(navigator, 'plugins', {
get: () => [1, 2, 3, 4, 5],
});
});
await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36');
await page.setUserAgent(
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36',
);
await page.setViewport({ width: 1920, height: 1080 });
const allResults: CnncecpResult[] = [];
@@ -87,7 +101,7 @@ export const CnncecpCrawler = {
// 模拟人类行为
logger.log('Simulating human mouse movements...');
await simulateHumanMouseMovement(page);
logger.log('Simulating human scrolling...');
await simulateHumanScrolling(page);
@@ -103,12 +117,14 @@ export const CnncecpCrawler = {
}
allResults.push(...pageResults);
logger.log(`Extracted ${pageResults.length} items from page ${currentPage}`);
logger.log(
`Extracted ${pageResults.length} items from page ${currentPage}`,
);
// 模拟人类行为 - 翻页前
logger.log('Simulating human mouse movements before pagination...');
await simulateHumanMouseMovement(page);
logger.log('Simulating human scrolling before pagination...');
await simulateHumanScrolling(page);
@@ -126,9 +142,13 @@ export const CnncecpCrawler = {
try {
// 点击下一页按钮
await nextButton.click();
await new Promise(r => setTimeout(r, 3000)); // 等待页面加载
await new Promise((r) => setTimeout(r, 3000)); // 等待页面加载
} catch (navError) {
logger.error(`Navigation to page ${currentPage + 1} failed: ${navError.message}`);
const navErrorMessage =
navError instanceof Error ? navError.message : String(navError);
logger.error(
`Navigation to page ${currentPage + 1} failed: ${navErrorMessage}`,
);
break;
}
@@ -137,26 +157,27 @@ export const CnncecpCrawler = {
// 模拟人类行为 - 翻页后
logger.log('Simulating human mouse movements after pagination...');
await simulateHumanMouseMovement(page);
logger.log('Simulating human scrolling after pagination...');
await simulateHumanScrolling(page);
// Random delay between pages
const delay = Math.floor(Math.random() * (3000 - 1000 + 1)) + 1000;
await new Promise(resolve => setTimeout(resolve, delay));
await new Promise((resolve) => setTimeout(resolve, delay));
}
return allResults;
} catch (error) {
logger.error(`Failed to crawl ${this.name}: ${error.message}`);
const errorMessage =
error instanceof Error ? error.message : String(error);
logger.error(`Failed to crawl ${this.name}: ${errorMessage}`);
return allResults;
} finally {
await page.close();
}
},
extract(html: string): CnncecpResult[] {
extract(this: CnncecpCrawlerType, html: string): CnncecpResult[] {
const results: CnncecpResult[] = [];
/**
* Regex groups for cnncecp.com:
@@ -172,24 +193,25 @@ export const CnncecpCrawler = {
* <a href="https://www.cnncecp.com/xzbgg/1862778.jhtml">中核四0四有限公司2026-2028年度质量流量控制器等采购项目二次变更公告</a>
* </li>
*/
const regex = /<li>[\s\S]*?<span class="Right Gray">\s*(\d{4}-\d{2}-\d{2})\s*<\/span>[\s\S]*?<a[^>]*href="([^"]*)"[^>]*>([^<]*)<\/a>[\s\S]*?<\/li>/gs;
const regex =
/<li>[\s\S]*?<span class="Right Gray">\s*(\d{4}-\d{2}-\d{2})\s*<\/span>[\s\S]*?<a[^>]*href="([^"]*)"[^>]*>([^<]*)<\/a>[\s\S]*?<\/li>/gs;
let match;
let match: RegExpExecArray | null;
while ((match = regex.exec(html)) !== null) {
const dateStr = match[1]?.trim();
const url = match[2]?.trim();
const title = match[3]?.trim();
const dateStr = match[1]?.trim() ?? '';
const url = match[2]?.trim() ?? '';
const title = match[3]?.trim() ?? '';
if (title && url) {
const fullUrl = url.startsWith('http') ? url : this.baseUrl + url;
results.push({
title,
publishDate: dateStr ? new Date(dateStr) : new Date(),
url: fullUrl.replace(/\/\//g, '/')
url: fullUrl.replace(/\/\//g, '/'),
});
}
}
return results;
}
},
};

View File

@@ -2,7 +2,7 @@ import { CnoocCrawler } from './cnooc_target';
import * as puppeteer from 'puppeteer';
// Increase timeout to 60 seconds for network operations
jest.setTimeout(60000*5);
jest.setTimeout(60000 * 5);
// 获取代理配置
const getProxyArgs = (): string[] => {
@@ -29,7 +29,7 @@ describe('CnoocCrawler Real Site Test', () => {
if (proxyArgs.length > 0) {
console.log('Using proxy:', proxyArgs.join(' '));
}
browser = await puppeteer.launch({
headless: false, // Change to false to see browser UI
args: ['--no-sandbox', '--disable-setuid-sandbox', ...proxyArgs],
@@ -45,13 +45,15 @@ describe('CnoocCrawler Real Site Test', () => {
it('should visit website and list all found bid information', async () => {
console.log(`\nStarting crawl for: ${CnoocCrawler.name}`);
console.log(`Target URL: ${CnoocCrawler.url}`);
const results = await CnoocCrawler.crawl(browser);
console.log(`\nSuccessfully found ${results.length} items:\n`);
console.log('----------------------------------------');
results.forEach((item, index) => {
console.log(`${index + 1}. [${item.publishDate.toLocaleDateString()}] ${item.title}`);
console.log(
`${index + 1}. [${item.publishDate.toLocaleDateString()}] ${item.title}`,
);
console.log(` Link: ${item.url}`);
console.log('----------------------------------------');
});
@@ -61,13 +63,15 @@ describe('CnoocCrawler Real Site Test', () => {
expect(Array.isArray(results)).toBeTruthy();
// Warn but don't fail if site returns 0 items (could be empty or changed structure)
if (results.length === 0) {
console.warn('Warning: No items found. Check if website structure has changed or if list is currently empty.');
console.warn(
'Warning: No items found. Check if website structure has changed or if list is currently empty.',
);
} else {
// Check data integrity of first item
const firstItem = results[0];
expect(firstItem.title).toBeTruthy();
expect(firstItem.url).toMatch(/^https?:\/\//);
expect(firstItem.publishDate).toBeInstanceOf(Date);
// Check data integrity of first item
const firstItem = results[0];
expect(firstItem.title).toBeTruthy();
expect(firstItem.url).toMatch(/^https?:\/\//);
expect(firstItem.publishDate).toBeInstanceOf(Date);
}
});
});

View File

@@ -11,13 +11,13 @@ async function simulateHumanMouseMovement(page: puppeteer.Page) {
for (let i = 0; i < movements; i++) {
const x = Math.floor(Math.random() * viewport.width);
const y = Math.floor(Math.random() * viewport.height);
await page.mouse.move(x, y, {
steps: 10 + Math.floor(Math.random() * 20) // 10-30步使移动更平滑
steps: 10 + Math.floor(Math.random() * 20), // 10-30步使移动更平滑
});
// 随机停顿 100-500ms
await new Promise(r => setTimeout(r, 100 + Math.random() * 400));
await new Promise((r) => setTimeout(r, 100 + Math.random() * 400));
}
}
@@ -27,23 +27,23 @@ async function simulateHumanScrolling(page: puppeteer.Page) {
for (let i = 0; i < scrollCount; i++) {
const scrollDistance = 100 + Math.floor(Math.random() * 400); // 100-500px
await page.evaluate((distance) => {
window.scrollBy({
top: distance,
behavior: 'smooth'
behavior: 'smooth',
});
}, scrollDistance);
// 随机停顿 500-1500ms
await new Promise(r => setTimeout(r, 500 + Math.random() * 1000));
await new Promise((r) => setTimeout(r, 500 + Math.random() * 1000));
}
// 滚动回顶部
await page.evaluate(() => {
window.scrollTo({ top: 0, behavior: 'smooth' });
});
await new Promise(r => setTimeout(r, 1000));
await new Promise((r) => setTimeout(r, 1000));
}
export interface CnoocResult {
@@ -52,12 +52,22 @@ export interface CnoocResult {
url: string;
}
interface CnoocCrawlerType {
name: string;
url: string;
baseUrl: string;
extract(html: string): CnoocResult[];
}
export const CnoocCrawler = {
name: '中海油招标平台',
url: 'https://buy.cnooc.com.cn/cbjyweb/001/001001/moreinfo.html',
baseUrl: 'https://buy.cnooc.com.cn/',
async crawl(browser: puppeteer.Browser): Promise<CnoocResult[]> {
async crawl(
this: CnoocCrawlerType,
browser: puppeteer.Browser,
): Promise<CnoocResult[]> {
const logger = new Logger('CnoocCrawler');
const page = await browser.newPage();
@@ -69,11 +79,15 @@ export const CnoocCrawler = {
await page.evaluateOnNewDocument(() => {
Object.defineProperty(navigator, 'webdriver', { get: () => false });
Object.defineProperty(navigator, 'language', { get: () => "zh-CN"});
Object.defineProperty(navigator, 'plugins', { get: () => [1,2,3,4,5]});
Object.defineProperty(navigator, 'language', { get: () => 'zh-CN' });
Object.defineProperty(navigator, 'plugins', {
get: () => [1, 2, 3, 4, 5],
});
});
await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36');
await page.setUserAgent(
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36',
);
await page.setViewport({ width: 1920, height: 1080 });
const allResults: CnoocResult[] = [];
@@ -87,7 +101,7 @@ export const CnoocCrawler = {
// 模拟人类行为
logger.log('Simulating human mouse movements...');
await simulateHumanMouseMovement(page);
logger.log('Simulating human scrolling...');
await simulateHumanScrolling(page);
@@ -103,12 +117,14 @@ export const CnoocCrawler = {
}
allResults.push(...pageResults);
logger.log(`Extracted ${pageResults.length} items from page ${currentPage}`);
logger.log(
`Extracted ${pageResults.length} items from page ${currentPage}`,
);
// 模拟人类行为 - 翻页前
logger.log('Simulating human mouse movements before pagination...');
await simulateHumanMouseMovement(page);
logger.log('Simulating human scrolling before pagination...');
await simulateHumanScrolling(page);
@@ -127,9 +143,13 @@ export const CnoocCrawler = {
try {
// 点击下一页按钮
await nextButton.click();
await new Promise(r => setTimeout(r, 3000)); // 等待页面加载
await new Promise((r) => setTimeout(r, 3000)); // 等待页面加载
} catch (navError) {
logger.error(`Navigation to page ${currentPage + 1} failed: ${navError.message}`);
const navErrorMessage =
navError instanceof Error ? navError.message : String(navError);
logger.error(
`Navigation to page ${currentPage + 1} failed: ${navErrorMessage}`,
);
break;
}
@@ -138,26 +158,27 @@ export const CnoocCrawler = {
// 模拟人类行为 - 翻页后
logger.log('Simulating human mouse movements after pagination...');
await simulateHumanMouseMovement(page);
logger.log('Simulating human scrolling after pagination...');
await simulateHumanScrolling(page);
// Random delay between pages
const delay = Math.floor(Math.random() * (3000 - 1000 + 1)) + 1000;
await new Promise(resolve => setTimeout(resolve, delay));
await new Promise((resolve) => setTimeout(resolve, delay));
}
return allResults;
} catch (error) {
logger.error(`Failed to crawl ${this.name}: ${error.message}`);
const errorMessage =
error instanceof Error ? error.message : String(error);
logger.error(`Failed to crawl ${this.name}: ${errorMessage}`);
return allResults;
} finally {
await page.close();
}
},
extract(html: string): CnoocResult[] {
extract(this: CnoocCrawlerType, html: string): CnoocResult[] {
const results: CnoocResult[] = [];
/**
* Regex groups for buy.cnooc.com.cn:
@@ -173,24 +194,25 @@ export const CnoocCrawler = {
* <span class="now-span" style="width:100px">2026-01-12</span>
* </li>
*/
const regex = /<li class="now-hd-items clearfix">[\s\S]*?<a[^>]*href="([^"]*)"[^>]*>[\s\S]*?<font[^>]*>([^<]*)<\/font>[\s\S]*?<span class="now-span"[^>]*>\s*(\d{4}-\d{2}-\d{2})\s*<\/span>[\s\S]*?<\/li>/gs;
const regex =
/<li class="now-hd-items clearfix">[\s\S]*?<a[^>]*href="([^"]*)"[^>]*>[\s\S]*?<font[^>]*>([^<]*)<\/font>[\s\S]*?<span class="now-span"[^>]*>\s*(\d{4}-\d{2}-\d{2})\s*<\/span>[\s\S]*?<\/li>/gs;
let match;
let match: RegExpExecArray | null;
while ((match = regex.exec(html)) !== null) {
const url = match[1]?.trim();
const title = match[2]?.trim();
const dateStr = match[3]?.trim();
const url = match[1]?.trim() ?? '';
const title = match[2]?.trim() ?? '';
const dateStr = match[3]?.trim() ?? '';
if (title && url) {
const fullUrl = url.startsWith('http') ? url : this.baseUrl + url;
results.push({
title,
publishDate: dateStr ? new Date(dateStr) : new Date(),
url: fullUrl.replace(/\/\//g, '/')
url: fullUrl.replace(/\/\//g, '/'),
});
}
}
return results;
}
},
};

View File

@@ -2,7 +2,7 @@ import { EpsCrawler } from './eps_target';
import * as puppeteer from 'puppeteer';
// Increase timeout to 60 seconds for network operations
jest.setTimeout(60000*5);
jest.setTimeout(60000 * 5);
// 获取代理配置
const getProxyArgs = (): string[] => {
@@ -29,7 +29,7 @@ describe('EpsCrawler Real Site Test', () => {
if (proxyArgs.length > 0) {
console.log('Using proxy:', proxyArgs.join(' '));
}
browser = await puppeteer.launch({
headless: false, // Change to false to see browser UI
args: ['--no-sandbox', '--disable-setuid-sandbox', ...proxyArgs],
@@ -45,13 +45,15 @@ describe('EpsCrawler Real Site Test', () => {
it('should visit website and list all found bid information', async () => {
console.log(`\nStarting crawl for: ${EpsCrawler.name}`);
console.log(`Target URL: ${EpsCrawler.url}`);
const results = await EpsCrawler.crawl(browser);
console.log(`\nSuccessfully found ${results.length} items:\n`);
console.log('----------------------------------------');
results.forEach((item, index) => {
console.log(`${index + 1}. [${item.publishDate.toLocaleDateString()}] ${item.title}`);
console.log(
`${index + 1}. [${item.publishDate.toLocaleDateString()}] ${item.title}`,
);
console.log(` Link: ${item.url}`);
console.log('----------------------------------------');
});
@@ -61,13 +63,15 @@ describe('EpsCrawler Real Site Test', () => {
expect(Array.isArray(results)).toBeTruthy();
// Warn but don't fail if site returns 0 items (could be empty or changed structure)
if (results.length === 0) {
console.warn('Warning: No items found. Check if website structure has changed or if list is currently empty.');
console.warn(
'Warning: No items found. Check if website structure has changed or if list is currently empty.',
);
} else {
// Check data integrity of first item
const firstItem = results[0];
expect(firstItem.title).toBeTruthy();
expect(firstItem.url).toMatch(/^https?:\/\//);
expect(firstItem.publishDate).toBeInstanceOf(Date);
// Check data integrity of first item
const firstItem = results[0];
expect(firstItem.title).toBeTruthy();
expect(firstItem.url).toMatch(/^https?:\/\//);
expect(firstItem.publishDate).toBeInstanceOf(Date);
}
});
});

View File

@@ -11,13 +11,13 @@ async function simulateHumanMouseMovement(page: puppeteer.Page) {
for (let i = 0; i < movements; i++) {
const x = Math.floor(Math.random() * viewport.width);
const y = Math.floor(Math.random() * viewport.height);
await page.mouse.move(x, y, {
steps: 10 + Math.floor(Math.random() * 20) // 10-30步使移动更平滑
steps: 10 + Math.floor(Math.random() * 20), // 10-30步使移动更平滑
});
// 随机停顿 100-500ms
await new Promise(r => setTimeout(r, 100 + Math.random() * 400));
await new Promise((r) => setTimeout(r, 100 + Math.random() * 400));
}
}
@@ -27,23 +27,23 @@ async function simulateHumanScrolling(page: puppeteer.Page) {
for (let i = 0; i < scrollCount; i++) {
const scrollDistance = 100 + Math.floor(Math.random() * 400); // 100-500px
await page.evaluate((distance) => {
window.scrollBy({
top: distance,
behavior: 'smooth'
behavior: 'smooth',
});
}, scrollDistance);
// 随机停顿 500-1500ms
await new Promise(r => setTimeout(r, 500 + Math.random() * 1000));
await new Promise((r) => setTimeout(r, 500 + Math.random() * 1000));
}
// 滚动回顶部
await page.evaluate(() => {
window.scrollTo({ top: 0, behavior: 'smooth' });
});
await new Promise(r => setTimeout(r, 1000));
await new Promise((r) => setTimeout(r, 1000));
}
export interface EpsResult {
@@ -52,12 +52,22 @@ export interface EpsResult {
url: string;
}
interface EpsCrawlerType {
name: string;
url: string;
baseUrl: string;
extract(html: string): EpsResult[];
}
export const EpsCrawler = {
name: '中国三峡集团电子商务平台',
url: 'https://eps.ctg.com.cn/cms/channel/1ywgg1/index.htm',
baseUrl: 'https://eps.ctg.com.cn/',
async crawl(browser: puppeteer.Browser): Promise<EpsResult[]> {
async crawl(
this: EpsCrawlerType,
browser: puppeteer.Browser,
): Promise<EpsResult[]> {
const logger = new Logger('EpsCrawler');
const page = await browser.newPage();
@@ -69,11 +79,15 @@ export const EpsCrawler = {
await page.evaluateOnNewDocument(() => {
Object.defineProperty(navigator, 'webdriver', { get: () => false });
Object.defineProperty(navigator, 'language', { get: () => "zh-CN"});
Object.defineProperty(navigator, 'plugins', { get: () => [1,2,3,4,5]});
Object.defineProperty(navigator, 'language', { get: () => 'zh-CN' });
Object.defineProperty(navigator, 'plugins', {
get: () => [1, 2, 3, 4, 5],
});
});
await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36');
await page.setUserAgent(
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36',
);
await page.setViewport({ width: 1920, height: 1080 });
const allResults: EpsResult[] = [];
@@ -87,7 +101,7 @@ export const EpsCrawler = {
// 模拟人类行为
logger.log('Simulating human mouse movements...');
await simulateHumanMouseMovement(page);
logger.log('Simulating human scrolling...');
await simulateHumanScrolling(page);
@@ -103,12 +117,14 @@ export const EpsCrawler = {
}
allResults.push(...pageResults);
logger.log(`Extracted ${pageResults.length} items from page ${currentPage}`);
logger.log(
`Extracted ${pageResults.length} items from page ${currentPage}`,
);
// 模拟人类行为 - 翻页前
logger.log('Simulating human mouse movements before pagination...');
await simulateHumanMouseMovement(page);
logger.log('Simulating human scrolling before pagination...');
await simulateHumanScrolling(page);
@@ -127,9 +143,13 @@ export const EpsCrawler = {
try {
// 点击下一页按钮,等待页面更新
await nextButton.click();
await new Promise(r => setTimeout(r, 3000)); // 等待页面加载
await new Promise((r) => setTimeout(r, 3000)); // 等待页面加载
} catch (navError) {
logger.error(`Navigation to page ${currentPage + 1} failed: ${navError.message}`);
const navErrorMessage =
navError instanceof Error ? navError.message : String(navError);
logger.error(
`Navigation to page ${currentPage + 1} failed: ${navErrorMessage}`,
);
break;
}
@@ -138,26 +158,27 @@ export const EpsCrawler = {
// 模拟人类行为 - 翻页后
logger.log('Simulating human mouse movements after pagination...');
await simulateHumanMouseMovement(page);
logger.log('Simulating human scrolling after pagination...');
await simulateHumanScrolling(page);
// Random delay between pages
const delay = Math.floor(Math.random() * (3000 - 1000 + 1)) + 1000;
await new Promise(resolve => setTimeout(resolve, delay));
await new Promise((resolve) => setTimeout(resolve, delay));
}
return allResults;
} catch (error) {
logger.error(`Failed to crawl ${this.name}: ${error.message}`);
const errorMessage =
error instanceof Error ? error.message : String(error);
logger.error(`Failed to crawl ${this.name}: ${errorMessage}`);
return allResults;
} finally {
await page.close();
}
},
extract(html: string): EpsResult[] {
extract(this: EpsCrawlerType, html: string): EpsResult[] {
const results: EpsResult[] = [];
/**
* Regex groups for eps.ctg.com.cn:
@@ -179,24 +200,25 @@ export const EpsCrawler = {
* </a>
* </li>
*/
const regex = /<li[^>]*name="li_name"[^>]*>[\s\S]*?<a[^>]*href="([^"]*)"[^>]*title="([^"]*)"[^>]*>[\s\S]*?<em>\s*(\d{4}-\d{2}-\d{2})\s*<\/em>[\s\S]*?<\/a>[\s\S]*?<\/li>/gs;
const regex =
/<li[^>]*name="li_name"[^>]*>[\s\S]*?<a[^>]*href="([^"]*)"[^>]*title="([^"]*)"[^>]*>[\s\S]*?<em>\s*(\d{4}-\d{2}-\d{2})\s*<\/em>[\s\S]*?<\/a>[\s\S]*?<\/li>/gs;
let match;
let match: RegExpExecArray | null;
while ((match = regex.exec(html)) !== null) {
const url = match[1]?.trim();
const title = match[2]?.trim();
const dateStr = match[3]?.trim();
const url = match[1]?.trim() ?? '';
const title = match[2]?.trim() ?? '';
const dateStr = match[3]?.trim() ?? '';
if (title && url) {
const fullUrl = url.startsWith('http') ? url : this.baseUrl + url;
results.push({
title,
publishDate: dateStr ? new Date(dateStr) : new Date(),
url: fullUrl.replace(/\/\//g, '/')
url: fullUrl.replace(/\/\//g, '/'),
});
}
}
return results;
}
},
};

View File

@@ -2,7 +2,7 @@ import { EspicCrawler } from './espic_target';
import * as puppeteer from 'puppeteer';
// Increase timeout to 60 seconds for network operations
jest.setTimeout(60000*5);
jest.setTimeout(60000 * 5);
// 获取代理配置
const getProxyArgs = (): string[] => {
@@ -29,7 +29,7 @@ describe('EspicCrawler Real Site Test', () => {
if (proxyArgs.length > 0) {
console.log('Using proxy:', proxyArgs.join(' '));
}
browser = await puppeteer.launch({
headless: false, // Change to false to see browser UI
args: ['--no-sandbox', '--disable-setuid-sandbox', ...proxyArgs],
@@ -45,13 +45,15 @@ describe('EspicCrawler Real Site Test', () => {
it('should visit website and list all found bid information', async () => {
console.log(`\nStarting crawl for: ${EspicCrawler.name}`);
console.log(`Target URL: ${EspicCrawler.getUrl()}`);
const results = await EspicCrawler.crawl(browser);
console.log(`\nSuccessfully found ${results.length} items:\n`);
console.log('----------------------------------------');
results.forEach((item, index) => {
console.log(`${index + 1}. [${item.publishDate.toLocaleDateString()}] ${item.title}`);
console.log(
`${index + 1}. [${item.publishDate.toLocaleDateString()}] ${item.title}`,
);
console.log(` Link: ${item.url}`);
console.log('----------------------------------------');
});
@@ -61,13 +63,15 @@ describe('EspicCrawler Real Site Test', () => {
expect(Array.isArray(results)).toBeTruthy();
// Warn but don't fail if site returns 0 items (could be empty or changed structure)
if (results.length === 0) {
console.warn('Warning: No items found. Check if website structure has changed or if list is currently empty.');
console.warn(
'Warning: No items found. Check if website structure has changed or if list is currently empty.',
);
} else {
// Check data integrity of first item
const firstItem = results[0];
expect(firstItem.title).toBeTruthy();
expect(firstItem.url).toMatch(/^https?:\/\//);
expect(firstItem.publishDate).toBeInstanceOf(Date);
// Check data integrity of first item
const firstItem = results[0];
expect(firstItem.title).toBeTruthy();
expect(firstItem.url).toMatch(/^https?:\/\//);
expect(firstItem.publishDate).toBeInstanceOf(Date);
}
});
});

View File

@@ -11,13 +11,13 @@ async function simulateHumanMouseMovement(page: puppeteer.Page) {
for (let i = 0; i < movements; i++) {
const x = Math.floor(Math.random() * viewport.width);
const y = Math.floor(Math.random() * viewport.height);
await page.mouse.move(x, y, {
steps: 10 + Math.floor(Math.random() * 20) // 10-30步使移动更平滑
steps: 10 + Math.floor(Math.random() * 20), // 10-30步使移动更平滑
});
// 随机停顿 100-500ms
await new Promise(r => setTimeout(r, 100 + Math.random() * 400));
await new Promise((r) => setTimeout(r, 100 + Math.random() * 400));
}
}
@@ -27,23 +27,23 @@ async function simulateHumanScrolling(page: puppeteer.Page) {
for (let i = 0; i < scrollCount; i++) {
const scrollDistance = 100 + Math.floor(Math.random() * 400); // 100-500px
await page.evaluate((distance) => {
window.scrollBy({
top: distance,
behavior: 'smooth'
behavior: 'smooth',
});
}, scrollDistance);
// 随机停顿 500-1500ms
await new Promise(r => setTimeout(r, 500 + Math.random() * 1000));
await new Promise((r) => setTimeout(r, 500 + Math.random() * 1000));
}
// 滚动回顶部
await page.evaluate(() => {
window.scrollTo({ top: 0, behavior: 'smooth' });
});
await new Promise(r => setTimeout(r, 1000));
await new Promise((r) => setTimeout(r, 1000));
}
export interface EspicResult {
@@ -52,12 +52,19 @@ export interface EspicResult {
url: string;
}
interface EspicCrawlerType {
name: string;
baseUrl: string;
getUrl(page?: number): string;
extract(html: string): EspicResult[];
}
export const EspicCrawler = {
name: '电能e招采平台国电投',
baseUrl: 'https://ebid.espic.com.cn/',
// 生成动态 URL使用当前日期
getUrl(page: number = 1): string {
getUrl(this: EspicCrawlerType, page: number = 1): string {
const now = new Date();
const year = now.getFullYear();
const month = now.getMonth() + 1; // 月份从0开始
@@ -66,7 +73,10 @@ export const EspicCrawler = {
return `https://ebid.espic.com.cn/newgdtcms//category/iframe.html?dates=300&categoryId=2&tenderMethod=01&tabName=&page=${page}&time=${timeStr}`;
},
async crawl(browser: puppeteer.Browser): Promise<EspicResult[]> {
async crawl(
this: EspicCrawlerType,
browser: puppeteer.Browser,
): Promise<EspicResult[]> {
const logger = new Logger('EspicCrawler');
const page = await browser.newPage();
@@ -78,11 +88,15 @@ export const EspicCrawler = {
await page.evaluateOnNewDocument(() => {
Object.defineProperty(navigator, 'webdriver', { get: () => false });
Object.defineProperty(navigator, 'language', { get: () => "zh-CN"});
Object.defineProperty(navigator, 'plugins', { get: () => [1,2,3,4,5]});
Object.defineProperty(navigator, 'language', { get: () => 'zh-CN' });
Object.defineProperty(navigator, 'plugins', {
get: () => [1, 2, 3, 4, 5],
});
});
await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36');
await page.setUserAgent(
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36',
);
await page.setViewport({ width: 1920, height: 1080 });
const allResults: EspicResult[] = [];
@@ -100,15 +114,18 @@ export const EspicCrawler = {
() => {
// 检查是否已经通过验证(页面不再是 WAF 页面)
const bodyText = document.body?.textContent || '';
return !bodyText.includes('人机识别检测') && !bodyText.includes('WEB 应用防火墙');
return (
!bodyText.includes('人机识别检测') &&
!bodyText.includes('WEB 应用防火墙')
);
},
{ timeout: 30000 }
{ timeout: 30000 },
);
// 模拟人类行为
logger.log('Simulating human mouse movements...');
await simulateHumanMouseMovement(page);
logger.log('Simulating human scrolling...');
await simulateHumanScrolling(page);
@@ -124,12 +141,14 @@ export const EspicCrawler = {
}
allResults.push(...pageResults);
logger.log(`Extracted ${pageResults.length} items from page ${currentPage}`);
logger.log(
`Extracted ${pageResults.length} items from page ${currentPage}`,
);
// 模拟人类行为 - 翻页前
logger.log('Simulating human mouse movements before pagination...');
await simulateHumanMouseMovement(page);
logger.log('Simulating human scrolling before pagination...');
await simulateHumanScrolling(page);
@@ -141,7 +160,7 @@ export const EspicCrawler = {
'a[aria-label="Next"]',
'a.next',
'li.next a',
'a.layui-laypage-next:not(.layui-disabled)'
'a.layui-laypage-next:not(.layui-disabled)',
];
let nextButton: puppeteer.ElementHandle<Element> | null = null;
@@ -149,7 +168,7 @@ export const EspicCrawler = {
try {
nextButton = await page.$(selector);
if (nextButton) break;
} catch (e) {
} catch {
// 继续尝试下一个选择器
}
}
@@ -164,9 +183,13 @@ export const EspicCrawler = {
try {
// 点击下一页按钮
await nextButton.click();
await new Promise(r => setTimeout(r, 3000)); // 等待页面加载
await new Promise((r) => setTimeout(r, 3000)); // 等待页面加载
} catch (navError) {
logger.error(`Navigation to page ${currentPage + 1} failed: ${navError.message}`);
const navErrorMessage =
navError instanceof Error ? navError.message : String(navError);
logger.error(
`Navigation to page ${currentPage + 1} failed: ${navErrorMessage}`,
);
break;
}
@@ -175,26 +198,27 @@ export const EspicCrawler = {
// 模拟人类行为 - 翻页后
logger.log('Simulating human mouse movements after pagination...');
await simulateHumanMouseMovement(page);
logger.log('Simulating human scrolling after pagination...');
await simulateHumanScrolling(page);
// Random delay between pages
const delay = Math.floor(Math.random() * (3000 - 1000 + 1)) + 1000;
await new Promise(resolve => setTimeout(resolve, delay));
await new Promise((resolve) => setTimeout(resolve, delay));
}
return allResults;
} catch (error) {
logger.error(`Failed to crawl ${this.name}: ${error.message}`);
const errorMessage =
error instanceof Error ? error.message : String(error);
logger.error(`Failed to crawl ${this.name}: ${errorMessage}`);
return allResults;
} finally {
await page.close();
}
},
extract(html: string): EspicResult[] {
extract(this: EspicCrawlerType, html: string): EspicResult[] {
const results: EspicResult[] = [];
/**
* Regex groups for ebid.espic.com.cn:
@@ -225,24 +249,25 @@ export const EspicCrawler = {
* </a>
* </li>
*/
const regex = /<li>[\s\S]*?<a[^>]*href="([^"]*)"[^>]*title="([^"]*)"[^>]*>[\s\S]*?<div class="newsDate">[\s\S]*?<div>\s*(\d{4}-\d{2}-\d{2})\s*<\/div>[\s\S]*?<\/div>[\s\S]*?<\/a>[\s\S]*?<\/li>/gs;
const regex =
/<li>[\s\S]*?<a[^>]*href="([^"]*)"[^>]*title="([^"]*)"[^>]*>[\s\S]*?<div class="newsDate">[\s\S]*?<div>\s*(\d{4}-\d{2}-\d{2})\s*<\/div>[\s\S]*?<\/div>[\s\S]*?<\/a>[\s\S]*?<\/li>/gs;
let match;
let match: RegExpExecArray | null;
while ((match = regex.exec(html)) !== null) {
const url = match[1]?.trim();
const title = match[2]?.trim();
const dateStr = match[3]?.trim();
const url = match[1]?.trim() ?? '';
const title = match[2]?.trim() ?? '';
const dateStr = match[3]?.trim() ?? '';
if (title && url) {
const fullUrl = url.startsWith('http') ? url : this.baseUrl + url;
results.push({
title,
publishDate: dateStr ? new Date(dateStr) : new Date(),
url: fullUrl.replace(/\/\//g, '/')
url: fullUrl.replace(/\/\//g, '/'),
});
}
}
return results;
}
},
};

View File

@@ -2,7 +2,7 @@ import { PowerbeijingCrawler } from './powerbeijing_target';
import * as puppeteer from 'puppeteer';
// Increase timeout to 60 seconds for network operations
jest.setTimeout(60000*5);
jest.setTimeout(60000 * 5);
// 获取代理配置
const getProxyArgs = (): string[] => {
@@ -29,7 +29,7 @@ describe('PowerbeijingCrawler Real Site Test', () => {
if (proxyArgs.length > 0) {
console.log('Using proxy:', proxyArgs.join(' '));
}
browser = await puppeteer.launch({
headless: false, // Change to false to see browser UI
args: ['--no-sandbox', '--disable-setuid-sandbox', ...proxyArgs],
@@ -45,13 +45,15 @@ describe('PowerbeijingCrawler Real Site Test', () => {
it('should visit website and list all found bid information', async () => {
console.log(`\nStarting crawl for: ${PowerbeijingCrawler.name}`);
console.log(`Target URL: ${PowerbeijingCrawler.url}`);
const results = await PowerbeijingCrawler.crawl(browser);
console.log(`\nSuccessfully found ${results.length} items:\n`);
console.log('----------------------------------------');
results.forEach((item, index) => {
console.log(`${index + 1}. [${item.publishDate.toLocaleDateString()}] ${item.title}`);
console.log(
`${index + 1}. [${item.publishDate.toLocaleDateString()}] ${item.title}`,
);
console.log(` Link: ${item.url}`);
console.log('----------------------------------------');
});
@@ -61,13 +63,15 @@ describe('PowerbeijingCrawler Real Site Test', () => {
expect(Array.isArray(results)).toBeTruthy();
// Warn but don't fail if site returns 0 items (could be empty or changed structure)
if (results.length === 0) {
console.warn('Warning: No items found. Check if website structure has changed or if list is currently empty.');
console.warn(
'Warning: No items found. Check if website structure has changed or if list is currently empty.',
);
} else {
// Check data integrity of first item
const firstItem = results[0];
expect(firstItem.title).toBeTruthy();
expect(firstItem.url).toMatch(/^https?:\/\//);
expect(firstItem.publishDate).toBeInstanceOf(Date);
// Check data integrity of first item
const firstItem = results[0];
expect(firstItem.title).toBeTruthy();
expect(firstItem.url).toMatch(/^https?:\/\//);
expect(firstItem.publishDate).toBeInstanceOf(Date);
}
});
});

View File

@@ -11,13 +11,13 @@ async function simulateHumanMouseMovement(page: puppeteer.Page) {
for (let i = 0; i < movements; i++) {
const x = Math.floor(Math.random() * viewport.width);
const y = Math.floor(Math.random() * viewport.height);
await page.mouse.move(x, y, {
steps: 10 + Math.floor(Math.random() * 20) // 10-30步使移动更平滑
steps: 10 + Math.floor(Math.random() * 20), // 10-30步使移动更平滑
});
// 随机停顿 100-500ms
await new Promise(r => setTimeout(r, 100 + Math.random() * 400));
await new Promise((r) => setTimeout(r, 100 + Math.random() * 400));
}
}
@@ -27,23 +27,23 @@ async function simulateHumanScrolling(page: puppeteer.Page) {
for (let i = 0; i < scrollCount; i++) {
const scrollDistance = 100 + Math.floor(Math.random() * 400); // 100-500px
await page.evaluate((distance) => {
window.scrollBy({
top: distance,
behavior: 'smooth'
behavior: 'smooth',
});
}, scrollDistance);
// 随机停顿 500-1500ms
await new Promise(r => setTimeout(r, 500 + Math.random() * 1000));
await new Promise((r) => setTimeout(r, 500 + Math.random() * 1000));
}
// 滚动回顶部
await page.evaluate(() => {
window.scrollTo({ top: 0, behavior: 'smooth' });
});
await new Promise(r => setTimeout(r, 1000));
await new Promise((r) => setTimeout(r, 1000));
}
export interface PowerbeijingResult {
@@ -52,12 +52,22 @@ export interface PowerbeijingResult {
url: string;
}
interface PowerbeijingCrawlerType {
name: string;
url: string;
baseUrl: string;
extract(html: string): PowerbeijingResult[];
}
export const PowerbeijingCrawler = {
name: '北京京能电子商务平台',
url: 'https://www.powerbeijing-ec.com/jncms/search/bulletin.html?dates=300&categoryId=2&tabName=%E6%8B%9B%E6%A0%87%E5%85%AC%E5%91%8A&page=1',
baseUrl: 'https://www.powerbeijing-ec.com/',
async crawl(browser: puppeteer.Browser): Promise<PowerbeijingResult[]> {
async crawl(
this: PowerbeijingCrawlerType,
browser: puppeteer.Browser,
): Promise<PowerbeijingResult[]> {
const logger = new Logger('PowerbeijingCrawler');
const page = await browser.newPage();
@@ -69,11 +79,15 @@ export const PowerbeijingCrawler = {
await page.evaluateOnNewDocument(() => {
Object.defineProperty(navigator, 'webdriver', { get: () => false });
Object.defineProperty(navigator, 'language', { get: () => "zh-CN"});
Object.defineProperty(navigator, 'plugins', { get: () => [1,2,3,4,5]});
Object.defineProperty(navigator, 'language', { get: () => 'zh-CN' });
Object.defineProperty(navigator, 'plugins', {
get: () => [1, 2, 3, 4, 5],
});
});
await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36');
await page.setUserAgent(
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36',
);
await page.setViewport({ width: 1920, height: 1080 });
const allResults: PowerbeijingResult[] = [];
@@ -87,7 +101,7 @@ export const PowerbeijingCrawler = {
// 模拟人类行为
logger.log('Simulating human mouse movements...');
await simulateHumanMouseMovement(page);
logger.log('Simulating human scrolling...');
await simulateHumanScrolling(page);
@@ -103,12 +117,14 @@ export const PowerbeijingCrawler = {
}
allResults.push(...pageResults);
logger.log(`Extracted ${pageResults.length} items from page ${currentPage}`);
logger.log(
`Extracted ${pageResults.length} items from page ${currentPage}`,
);
// 模拟人类行为 - 翻页前
logger.log('Simulating human mouse movements before pagination...');
await simulateHumanMouseMovement(page);
logger.log('Simulating human scrolling before pagination...');
await simulateHumanScrolling(page);
@@ -127,9 +143,13 @@ export const PowerbeijingCrawler = {
try {
// 点击下一页按钮,等待页面更新
await nextButton.click();
await new Promise(r => setTimeout(r, 3000)); // 等待页面加载
await new Promise((r) => setTimeout(r, 3000)); // 等待页面加载
} catch (navError) {
logger.error(`Navigation to page ${currentPage + 1} failed: ${navError.message}`);
const navErrorMessage =
navError instanceof Error ? navError.message : String(navError);
logger.error(
`Navigation to page ${currentPage + 1} failed: ${navErrorMessage}`,
);
break;
}
@@ -138,26 +158,27 @@ export const PowerbeijingCrawler = {
// 模拟人类行为 - 翻页后
logger.log('Simulating human mouse movements after pagination...');
await simulateHumanMouseMovement(page);
logger.log('Simulating human scrolling after pagination...');
await simulateHumanScrolling(page);
// Random delay between pages
const delay = Math.floor(Math.random() * (3000 - 1000 + 1)) + 1000;
await new Promise(resolve => setTimeout(resolve, delay));
await new Promise((resolve) => setTimeout(resolve, delay));
}
return allResults;
} catch (error) {
logger.error(`Failed to crawl ${this.name}: ${error.message}`);
const errorMessage =
error instanceof Error ? error.message : String(error);
logger.error(`Failed to crawl ${this.name}: ${errorMessage}`);
return allResults;
} finally {
await page.close();
}
},
extract(html: string): PowerbeijingResult[] {
extract(this: PowerbeijingCrawlerType, html: string): PowerbeijingResult[] {
const results: PowerbeijingResult[] = [];
/**
* Regex groups for powerbeijing-ec.com:
@@ -176,24 +197,25 @@ export const PowerbeijingCrawler = {
* </a>
* </li>
*/
const regex = /<li>[\s\S]*?<a[^>]*href="([^"]*)"[^>]*title="([^"]*)"[^>]*>[\s\S]*?<div class="newsDate">[\s\S]*?<div>\s*(\d{4}-\d{2}-\d{2})\s*<\/div>[\s\S]*?<\/div>[\s\S]*?<\/a>[\s\S]*?<\/li>/gs;
const regex =
/<li>[\s\S]*?<a[^>]*href="([^"]*)"[^>]*title="([^"]*)"[^>]*>[\s\S]*?<div class="newsDate">[\s\S]*?<div>\s*(\d{4}-\d{2}-\d{2})\s*<\/div>[\s\S]*?<\/div>[\s\S]*?<\/a>[\s\S]*?<\/li>/gs;
let match;
let match: RegExpExecArray | null;
while ((match = regex.exec(html)) !== null) {
const url = match[1]?.trim();
const title = match[2]?.trim();
const dateStr = match[3]?.trim();
const url = match[1]?.trim() ?? '';
const title = match[2]?.trim() ?? '';
const dateStr = match[3]?.trim() ?? '';
if (title && url) {
const fullUrl = url.startsWith('http') ? url : this.baseUrl + url;
results.push({
title,
publishDate: dateStr ? new Date(dateStr) : new Date(),
url: fullUrl.replace(/\/\//g, '/')
url: fullUrl.replace(/\/\//g, '/'),
});
}
}
return results;
}
},
};

View File

@@ -2,7 +2,7 @@ import { SdiccCrawler } from './sdicc_target';
import * as puppeteer from 'puppeteer';
// Increase timeout to 60 seconds for network operations
jest.setTimeout(60000*5);
jest.setTimeout(60000 * 5);
// 获取代理配置
const getProxyArgs = (): string[] => {
@@ -29,7 +29,7 @@ describe('SdiccCrawler Real Site Test', () => {
if (proxyArgs.length > 0) {
console.log('Using proxy:', proxyArgs.join(' '));
}
browser = await puppeteer.launch({
headless: false, // Change to false to see browser UI
args: ['--no-sandbox', '--disable-setuid-sandbox', ...proxyArgs],
@@ -45,13 +45,15 @@ describe('SdiccCrawler Real Site Test', () => {
it('should visit website and list all found bid information', async () => {
console.log(`\nStarting crawl for: ${SdiccCrawler.name}`);
console.log(`Target URL: ${SdiccCrawler.url}`);
const results = await SdiccCrawler.crawl(browser);
console.log(`\nSuccessfully found ${results.length} items:\n`);
console.log('----------------------------------------');
results.forEach((item, index) => {
console.log(`${index + 1}. [${item.publishDate.toLocaleDateString()}] ${item.title}`);
console.log(
`${index + 1}. [${item.publishDate.toLocaleDateString()}] ${item.title}`,
);
console.log(` Link: ${item.url}`);
console.log('----------------------------------------');
});
@@ -61,13 +63,15 @@ describe('SdiccCrawler Real Site Test', () => {
expect(Array.isArray(results)).toBeTruthy();
// Warn but don't fail if site returns 0 items (could be empty or changed structure)
if (results.length === 0) {
console.warn('Warning: No items found. Check if website structure has changed or if list is currently empty.');
console.warn(
'Warning: No items found. Check if website structure has changed or if list is currently empty.',
);
} else {
// Check data integrity of first item
const firstItem = results[0];
expect(firstItem.title).toBeTruthy();
expect(firstItem.url).toMatch(/^https?:\/\//);
expect(firstItem.publishDate).toBeInstanceOf(Date);
// Check data integrity of first item
const firstItem = results[0];
expect(firstItem.title).toBeTruthy();
expect(firstItem.url).toMatch(/^https?:\/\//);
expect(firstItem.publishDate).toBeInstanceOf(Date);
}
});
});

View File

@@ -11,13 +11,13 @@ async function simulateHumanMouseMovement(page: puppeteer.Page) {
for (let i = 0; i < movements; i++) {
const x = Math.floor(Math.random() * viewport.width);
const y = Math.floor(Math.random() * viewport.height);
await page.mouse.move(x, y, {
steps: 10 + Math.floor(Math.random() * 20) // 10-30步使移动更平滑
steps: 10 + Math.floor(Math.random() * 20), // 10-30步使移动更平滑
});
// 随机停顿 100-500ms
await new Promise(r => setTimeout(r, 100 + Math.random() * 400));
await new Promise((r) => setTimeout(r, 100 + Math.random() * 400));
}
}
@@ -27,23 +27,23 @@ async function simulateHumanScrolling(page: puppeteer.Page) {
for (let i = 0; i < scrollCount; i++) {
const scrollDistance = 100 + Math.floor(Math.random() * 400); // 100-500px
await page.evaluate((distance) => {
window.scrollBy({
top: distance,
behavior: 'smooth'
behavior: 'smooth',
});
}, scrollDistance);
// 随机停顿 500-1500ms
await new Promise(r => setTimeout(r, 500 + Math.random() * 1000));
await new Promise((r) => setTimeout(r, 500 + Math.random() * 1000));
}
// 滚动回顶部
await page.evaluate(() => {
window.scrollTo({ top: 0, behavior: 'smooth' });
});
await new Promise(r => setTimeout(r, 1000));
await new Promise((r) => setTimeout(r, 1000));
}
export interface SdiccResult {
@@ -52,12 +52,22 @@ export interface SdiccResult {
url: string;
}
interface SdiccCrawlerType {
name: string;
url: string;
baseUrl: string;
extract(html: string): SdiccResult[];
}
export const SdiccCrawler = {
name: '国投集团电子采购平台',
url: 'https://www.sdicc.com.cn/cgxx/ggList',
baseUrl: 'https://www.sdicc.com.cn/',
async crawl(browser: puppeteer.Browser): Promise<SdiccResult[]> {
async crawl(
this: SdiccCrawlerType,
browser: puppeteer.Browser,
): Promise<SdiccResult[]> {
const logger = new Logger('SdiccCrawler');
const page = await browser.newPage();
@@ -69,11 +79,15 @@ export const SdiccCrawler = {
await page.evaluateOnNewDocument(() => {
Object.defineProperty(navigator, 'webdriver', { get: () => false });
Object.defineProperty(navigator, 'language', { get: () => "zh-CN"});
Object.defineProperty(navigator, 'plugins', { get: () => [1,2,3,4,5]});
Object.defineProperty(navigator, 'language', { get: () => 'zh-CN' });
Object.defineProperty(navigator, 'plugins', {
get: () => [1, 2, 3, 4, 5],
});
});
await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36');
await page.setUserAgent(
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36',
);
await page.setViewport({ width: 1920, height: 1080 });
const allResults: SdiccResult[] = [];
@@ -87,15 +101,17 @@ export const SdiccCrawler = {
// 模拟人类行为
logger.log('Simulating human mouse movements...');
await simulateHumanMouseMovement(page);
logger.log('Simulating human scrolling...');
await simulateHumanScrolling(page);
// 等待表格加载
logger.log('Waiting for table to load...');
await page.waitForSelector('.tbody table tbody tr', { timeout: 30000 }).catch(() => {
logger.warn('Table rows not found, trying alternative selectors...');
});
await page
.waitForSelector('.tbody table tbody tr', { timeout: 30000 })
.catch(() => {
logger.warn('Table rows not found, trying alternative selectors...');
});
while (currentPage <= maxPages) {
logger.log(`Processing page ${currentPage}...`);
@@ -109,12 +125,14 @@ export const SdiccCrawler = {
}
allResults.push(...pageResults);
logger.log(`Extracted ${pageResults.length} items from page ${currentPage}`);
logger.log(
`Extracted ${pageResults.length} items from page ${currentPage}`,
);
// 模拟人类行为 - 翻页前
logger.log('Simulating human mouse movements before pagination...');
await simulateHumanMouseMovement(page);
logger.log('Simulating human scrolling before pagination...');
await simulateHumanScrolling(page);
@@ -132,10 +150,16 @@ export const SdiccCrawler = {
try {
// 点击下一页按钮
await nextButton.click();
await page.waitForNavigation({ waitUntil: 'networkidle2', timeout: 60000 }).catch(() => {});
await new Promise(r => setTimeout(r, 2000)); // 额外等待确保数据加载完成
await page
.waitForNavigation({ waitUntil: 'networkidle2', timeout: 60000 })
.catch(() => {});
await new Promise((r) => setTimeout(r, 2000)); // 额外等待确保数据加载完成
} catch (navError) {
logger.error(`Navigation to page ${currentPage + 1} failed: ${navError.message}`);
const navErrorMessage =
navError instanceof Error ? navError.message : String(navError);
logger.error(
`Navigation to page ${currentPage + 1} failed: ${navErrorMessage}`,
);
break;
}
@@ -144,26 +168,27 @@ export const SdiccCrawler = {
// 模拟人类行为 - 翻页后
logger.log('Simulating human mouse movements after pagination...');
await simulateHumanMouseMovement(page);
logger.log('Simulating human scrolling after pagination...');
await simulateHumanScrolling(page);
// Random delay between pages
const delay = Math.floor(Math.random() * (3000 - 1000 + 1)) + 1000;
await new Promise(resolve => setTimeout(resolve, delay));
await new Promise((resolve) => setTimeout(resolve, delay));
}
return allResults;
} catch (error) {
logger.error(`Failed to crawl ${this.name}: ${error.message}`);
const errorMessage =
error instanceof Error ? error.message : String(error);
logger.error(`Failed to crawl ${this.name}: ${errorMessage}`);
return allResults;
} finally {
await page.close();
}
},
extract(html: string): SdiccResult[] {
extract(this: SdiccCrawlerType, html: string): SdiccResult[] {
const results: SdiccResult[] = [];
/**
* Regex groups for sdicc.com.cn:
@@ -180,25 +205,26 @@ export const SdiccCrawler = {
* <td colspan="1" rowspan="1"><span> 2026-01-09 </span></td>
* </tr>
*/
const regex = /<tr[^>]*onclick="urlChange\('([^']+)','([^']+)'\)"[^>]*>[\s\S]*?<td[^>]*><span[^>]*>([^<]+)<\/span><\/td>[\s\S]*?<td[^>]*><span[^>]*>\s*(\d{4}-\d{2}-\d{2})\s*<\/span><\/td>[\s\S]*?<\/tr>/gs;
const regex =
/<tr[^>]*onclick="urlChange\('([^']+)','([^']+)'\)"[^>]*>[\s\S]*?<td[^>]*><span[^>]*>([^<]+)<\/span><\/td>[\s\S]*?<td[^>]*><span[^>]*>\s*(\d{4}-\d{2}-\d{2})\s*<\/span><\/td>[\s\S]*?<\/tr>/gs;
let match;
let match: RegExpExecArray | null;
while ((match = regex.exec(html)) !== null) {
const ggGuid = match[1]?.trim();
const gcGuid = match[2]?.trim();
const title = match[3]?.trim();
const dateStr = match[4]?.trim();
const ggGuid = match[1]?.trim() ?? '';
const gcGuid = match[2]?.trim() ?? '';
const title = match[3]?.trim() ?? '';
const dateStr = match[4]?.trim() ?? '';
if (title && ggGuid && gcGuid) {
const fullUrl = `${this.baseUrl}/cgxx/ggDetail?gcGuid=${gcGuid}&ggGuid=${ggGuid}`;
results.push({
title,
publishDate: dateStr ? new Date(dateStr) : new Date(),
url: fullUrl.replace(/\/\//g, '/')
url: fullUrl.replace(/\/\//g, '/'),
});
}
}
return results;
}
},
};

View File

@@ -29,7 +29,7 @@ describe('SzecpCrawler Real Site Test', () => {
if (proxyArgs.length > 0) {
console.log('Using proxy:', proxyArgs.join(' '));
}
browser = await puppeteer.launch({
headless: false, // Run in non-headless mode
args: [
@@ -40,14 +40,14 @@ describe('SzecpCrawler Real Site Test', () => {
'--disable-infobars',
...proxyArgs,
],
defaultViewport: null
defaultViewport: null,
});
});
afterAll(async () => {
if (browser) {
// Keep open for a few seconds after test to see result
await new Promise(r => setTimeout(r, 50000));
await new Promise((r) => setTimeout(r, 50000));
await browser.close();
}
});
@@ -56,29 +56,33 @@ describe('SzecpCrawler Real Site Test', () => {
console.log(`
Starting crawl for: ${SzecpCrawler.name}`);
console.log(`Target URL: ${SzecpCrawler.url}`);
const results = await SzecpCrawler.crawl(browser);
console.log(`
Successfully found ${results.length} items:
`);
console.log('----------------------------------------');
results.forEach((item, index) => {
console.log(`${index + 1}. [${item.publishDate.toLocaleDateString()}] ${item.title}`);
console.log(
`${index + 1}. [${item.publishDate.toLocaleDateString()}] ${item.title}`,
);
console.log(` Link: ${item.url}`);
console.log('----------------------------------------');
});
expect(results).toBeDefined();
expect(Array.isArray(results)).toBeTruthy();
if (results.length === 0) {
console.warn('Warning: No items found. Observe browser window to see if content is loading or if there is a verification challenge.');
console.warn(
'Warning: No items found. Observe browser window to see if content is loading or if there is a verification challenge.',
);
} else {
const firstItem = results[0];
expect(firstItem.title).toBeTruthy();
expect(firstItem.url).toMatch(/^https?:\/\//);
expect(firstItem.publishDate).toBeInstanceOf(Date);
const firstItem = results[0];
expect(firstItem.title).toBeTruthy();
expect(firstItem.url).toMatch(/^https?:\/\//);
expect(firstItem.publishDate).toBeInstanceOf(Date);
}
});
});

View File

@@ -12,13 +12,13 @@ async function simulateHumanMouseMovement(page: puppeteer.Page) {
for (let i = 0; i < movements; i++) {
const x = Math.floor(Math.random() * viewport.width);
const y = Math.floor(Math.random() * viewport.height);
await page.mouse.move(x, y, {
steps: 10 + Math.floor(Math.random() * 20) // 10-30步使移动更平滑
steps: 10 + Math.floor(Math.random() * 20), // 10-30步使移动更平滑
});
// 随机停顿 100-500ms
await new Promise(r => setTimeout(r, 100 + Math.random() * 400));
await new Promise((r) => setTimeout(r, 100 + Math.random() * 400));
}
}
@@ -28,23 +28,29 @@ async function simulateHumanScrolling(page: puppeteer.Page) {
for (let i = 0; i < scrollCount; i++) {
const scrollDistance = 100 + Math.floor(Math.random() * 400); // 100-500px
await page.evaluate((distance) => {
window.scrollBy({
top: distance,
behavior: 'smooth'
behavior: 'smooth',
});
}, scrollDistance);
// 随机停顿 500-1500ms
await new Promise(r => setTimeout(r, 500 + Math.random() * 1000));
await new Promise((r) => setTimeout(r, 500 + Math.random() * 1000));
}
// 滚动回顶部
await page.evaluate(() => {
window.scrollTo({ top: 0, behavior: 'smooth' });
});
await new Promise(r => setTimeout(r, 1000));
await new Promise((r) => setTimeout(r, 1000));
}
interface SzecpCrawlerType {
name: string;
url: string;
baseUrl: string;
}
export const SzecpCrawler = {
@@ -52,7 +58,10 @@ export const SzecpCrawler = {
url: 'https://www.szecp.com.cn/first_zbgg/index.html',
baseUrl: 'https://www.szecp.com.cn/',
async crawl(browser: puppeteer.Browser): Promise<ChdtpResult[]> {
async crawl(
this: SzecpCrawlerType,
browser: puppeteer.Browser,
): Promise<ChdtpResult[]> {
const logger = new Logger('SzecpCrawler');
const page = await browser.newPage();
@@ -65,10 +74,14 @@ export const SzecpCrawler = {
await page.evaluateOnNewDocument(() => {
Object.defineProperty(navigator, 'webdriver', { get: () => false });
Object.defineProperty(navigator, 'language', { get: () => 'zh-CN' });
Object.defineProperty(navigator, 'plugins', { get: () => [1, 2, 3, 4, 5] });
Object.defineProperty(navigator, 'plugins', {
get: () => [1, 2, 3, 4, 5],
});
});
await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36');
await page.setUserAgent(
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36',
);
await page.setViewport({ width: 1920, height: 1080 });
const allResults: ChdtpResult[] = [];
@@ -82,7 +95,7 @@ export const SzecpCrawler = {
// 模拟人类行为
logger.log('Simulating human mouse movements...');
await simulateHumanMouseMovement(page);
logger.log('Simulating human scrolling...');
await simulateHumanScrolling(page);
@@ -90,52 +103,69 @@ export const SzecpCrawler = {
logger.log('Clicking search button...');
await page.waitForSelector('.szb-zbcgSearch-key-v1', { timeout: 60000 });
await page.click('.szb-zbcgSearch-key-v1');
await new Promise(r => setTimeout(r, 3000)); // Wait for results to load
await new Promise((r) => setTimeout(r, 3000)); // Wait for results to load
while (currentPage <= maxPages) {
logger.log(`Processing page ${currentPage}...`);
// Wait for content to load
await page.waitForFunction(() => {
return document.querySelectorAll('.szb-zbcgTable-other').length > 0;
}, { timeout: 60000 }).catch(() => logger.warn('Content not found. Site might be slow.'));
await page
.waitForFunction(
() => {
return (
document.querySelectorAll('.szb-zbcgTable-other').length > 0
);
},
{ timeout: 60000 },
)
.catch(() => logger.warn('Content not found. Site might be slow.'));
const pageResults = await page.evaluate((baseUrl) => {
// Extract from table rows
const items = Array.from(document.querySelectorAll('.szb-zbcgTable-other'));
return items.map(item => {
const divs = item.querySelectorAll('div');
if (divs.length >= 5) {
const titleLink = divs[1].querySelector('a');
const title = titleLink?.textContent?.trim() || '';
const dateStr = divs[4].textContent?.trim() || '';
const href = titleLink?.getAttribute('href') || '';
const items = Array.from(
document.querySelectorAll('.szb-zbcgTable-other'),
);
return items
.map((item) => {
const divs = item.querySelectorAll('div');
if (divs.length >= 5) {
const titleLink = divs[1].querySelector('a');
const title = titleLink?.textContent?.trim() || '';
const dateStr = divs[4].textContent?.trim() || '';
const href = titleLink?.getAttribute('href') || '';
if (title.length < 5) return null; // Filter noise
if (title.length < 5) return null; // Filter noise
// Construct full URL if href is relative
const url = href.startsWith('http') ? href : `${baseUrl}${href}`;
// Construct full URL if href is relative
const url = href.startsWith('http')
? href
: `${baseUrl}${href}`;
return {
title,
dateStr,
url
};
}
return null;
}).filter(i => i !== null);
return {
title,
dateStr,
url,
};
}
return null;
})
.filter((i) => i !== null);
}, this.baseUrl);
if (pageResults.length === 0) {
logger.warn(`No results found on page ${currentPage}. Extraction failed.`);
logger.warn(
`No results found on page ${currentPage}. Extraction failed.`,
);
break;
}
allResults.push(...pageResults.map(r => ({
title: r!.title,
publishDate: new Date(r!.dateStr),
url: r!.url.replace(/\/\//g, '/')
})));
allResults.push(
...pageResults.map((r) => ({
title: r.title,
publishDate: new Date(r.dateStr),
url: r.url.replace(/\/\//g, '/'),
})),
);
logger.log(`Extracted ${pageResults.length} items.`);
@@ -144,27 +174,30 @@ export const SzecpCrawler = {
if (!nextButton) break;
await nextButton.click();
await new Promise(r => setTimeout(r, 3000));
await new Promise((r) => setTimeout(r, 3000));
// 模拟人类行为
logger.log('Simulating human mouse movements...');
await simulateHumanMouseMovement(page);
logger.log('Simulating human scrolling...');
await simulateHumanScrolling(page);
currentPage++;
}
return allResults;
} catch (error) {
logger.error(`Crawl failed: ${error.message}`);
const errorMessage =
error instanceof Error ? error.message : String(error);
logger.error(`Crawl failed: ${errorMessage}`);
return allResults;
} finally {
if (page) await page.close();
}
},
extract() { return []; }
extract() {
return [];
},
};