Files
bidding_watcher/src/crawler/services/eps_target.ts
dmy 5edebd9d55 refactor: improve date handling and timezone consistency
- Add timezone support to database module (+08:00)
- Extract date formatting utilities to shared modules
- Standardize timezone handling across frontend and backend
- Improve date formatting consistency in UI components
- Refactor crawler page.goto options for better readability
2026-01-15 16:17:41 +08:00

285 lines
8.6 KiB
TypeScript
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import * as puppeteer from 'puppeteer';
import { Logger } from '@nestjs/common';
/**
 * Simulates human-like mouse activity on the page.
 *
 * Moves the pointer to 5-9 random positions inside the current viewport.
 * Each move is interpolated over 10-29 steps so it is not an instant jump,
 * and is followed by a random pause of roughly 100-500 ms.
 *
 * Does nothing when the page reports no viewport.
 *
 * @param page - Puppeteer page whose mouse is driven.
 */
async function simulateHumanMouseMovement(page: puppeteer.Page) {
  const bounds = page.viewport();
  if (!bounds) return;

  const pause = (ms: number) => new Promise((done) => setTimeout(done, ms));

  // 5 + floor([0, 1) * 5) yields 5..9 moves.
  let remaining = 5 + Math.floor(Math.random() * 5);
  while (remaining-- > 0) {
    const targetX = Math.floor(Math.random() * bounds.width);
    const targetY = Math.floor(Math.random() * bounds.height);
    // More steps make the pointer travel more smoothly (10..29 here).
    const steps = 10 + Math.floor(Math.random() * 20);
    await page.mouse.move(targetX, targetY, { steps });
    // Random pause between consecutive moves.
    await pause(100 + Math.random() * 400);
  }
}
/**
 * Simulates human-like scrolling on the page.
 *
 * Performs 3-7 smooth downward scrolls of 100-499 px each, pausing roughly
 * 500-1500 ms after every scroll, then smoothly scrolls back to the top and
 * waits one more second.
 *
 * @param page - Puppeteer page to scroll.
 */
async function simulateHumanScrolling(page: puppeteer.Page) {
  const pause = (ms: number) => new Promise((done) => setTimeout(done, ms));

  const totalScrolls = 3 + Math.floor(Math.random() * 5);
  let performed = 0;
  while (performed < totalScrolls) {
    const offset = 100 + Math.floor(Math.random() * 400);
    // The callback runs inside the browser context; `offset` is passed in as
    // its argument.
    await page.evaluate((distance) => {
      window.scrollBy({
        top: distance,
        behavior: 'smooth',
      });
    }, offset);
    // Random pause between consecutive scrolls.
    await pause(500 + Math.random() * 1000);
    performed += 1;
  }

  // Return to the top of the page.
  await page.evaluate(() => {
    window.scrollTo({ top: 0, behavior: 'smooth' });
  });
  await pause(1000);
}
/**
 * Reports whether `error` is a proxy tunnel connection failure.
 *
 * Only `Error` instances are inspected; any other thrown value yields `false`.
 *
 * @param error - The value caught from a failed operation.
 * @returns `true` when the error message mentions `ERR_TUNNEL_CONNECTION_FAILED`
 *   (with or without the `net::` prefix), otherwise `false`.
 */
function isTunnelConnectionFailedError(error: unknown): boolean {
  if (!(error instanceof Error)) {
    return false;
  }
  const { message } = error;
  const tunnelFailureCodes = [
    'net::ERR_TUNNEL_CONNECTION_FAILED',
    'ERR_TUNNEL_CONNECTION_FAILED',
  ];
  return tunnelFailureCodes.some((code) => message.includes(code));
}
/**
 * Runs `operation`, retrying with an increasing delay when it fails with a
 * proxy tunnel connection error.
 *
 * Any other error is rethrown immediately without retrying. The wait before
 * the next attempt grows linearly: `delayMs * attempt`.
 *
 * @param operation - Async action to execute.
 * @param maxRetries - Maximum number of attempts (including the first one).
 *   Values below 1 (or NaN) are treated as 1 so the operation always runs at
 *   least once; fractional values are rounded down.
 * @param delayMs - Base delay in milliseconds between attempts.
 * @param logger - Optional logger for retry warnings and the final failure.
 * @throws The error from `operation` when it is not a tunnel failure, or the
 *   last tunnel failure once all attempts are used up.
 */
async function delayRetry(
  operation: () => Promise<void>,
  maxRetries: number = 3,
  delayMs: number = 5000,
  logger?: Logger,
): Promise<void> {
  // Previously a value below 1 skipped the loop entirely, never ran the
  // operation and ended in `throw undefined`. Always attempt at least once.
  const maxAttempts = maxRetries >= 1 ? Math.floor(maxRetries) : 1;

  for (let attempt = 1; ; attempt++) {
    try {
      await operation();
      return;
    } catch (error) {
      // Errors unrelated to the proxy tunnel are not retried.
      if (!isTunnelConnectionFailedError(error)) {
        throw error;
      }
      if (attempt >= maxAttempts) {
        logger?.error(
          `代理隧道连接失败,已达到最大重试次数 ${maxAttempts}`,
        );
        throw error;
      }
      // Back off a little longer after every failed attempt.
      const delay = delayMs * attempt;
      logger?.warn(
        `代理隧道连接失败,第 ${attempt} 次尝试失败,${delay / 1000} 秒后重试...`,
      );
      await new Promise((resolve) => setTimeout(resolve, delay));
    }
  }
}
/** One announcement entry produced by `EpsCrawler.extract`. */
export interface EpsResult {
/** Announcement title, read from the anchor's `title` attribute. */
title: string;
/** Publication date parsed from the entry's `YYYY-MM-DD` text. */
publishDate: Date;
/** Link to the announcement; relative hrefs are prefixed with the crawler's `baseUrl`. */
url: string;
}
/** Shape of `this` expected by the `EpsCrawler` methods (`crawl` and `extract`). */
interface EpsCrawlerType {
/** Display name of the crawled site, used in log messages. */
name: string;
/** URL of the list page where crawling starts. */
url: string;
/** Prefix prepended to hrefs that do not start with `http`. */
baseUrl: string;
/** Parses a list page's HTML into result entries. */
extract(html: string): EpsResult[];
}
/**
 * Crawler for the announcement list of the China Three Gorges Corporation
 * e-commerce platform (eps.ctg.com.cn).
 */
export const EpsCrawler = {
  name: '中国三峡集团电子商务平台',
  url: 'https://eps.ctg.com.cn/cms/channel/1ywgg1/index.htm',
  baseUrl: 'https://eps.ctg.com.cn/',

  /**
   * Opens the list page in a new browser page and collects announcements from
   * up to 5 consecutive result pages.
   *
   * Paging stops early when a page yields no results, when the "next" button
   * is missing, or when clicking it fails. Human-like mouse movement and
   * scrolling are simulated before and after every page switch.
   *
   * @param browser - Puppeteer browser used to open the page.
   * @returns All entries extracted from the visited pages.
   * @throws Rethrows any error raised during setup, navigation or extraction
   *   (after logging it). The page is always closed.
   */
  async crawl(
    this: EpsCrawlerType,
    browser: puppeteer.Browser,
  ): Promise<EpsResult[]> {
    const logger = new Logger('EpsCrawler');
    const page = await browser.newPage();
    const allResults: EpsResult[] = [];
    let currentPage = 1;
    const maxPages = 5;
    try {
      // Page setup lives inside the try block so that the page is closed by
      // the `finally` below even when one of these calls fails.
      const username = process.env.PROXY_USERNAME;
      const password = process.env.PROXY_PASSWORD;
      if (username && password) {
        await page.authenticate({ username, password });
      }
      // Override a few navigator properties before any page script runs.
      await page.evaluateOnNewDocument(() => {
        Object.defineProperty(navigator, 'webdriver', { get: () => false });
        Object.defineProperty(navigator, 'language', { get: () => 'zh-CN' });
        Object.defineProperty(navigator, 'plugins', {
          get: () => [1, 2, 3, 4, 5],
        });
      });
      await page.setUserAgent(
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36',
      );
      await page.setViewport({ width: 1920, height: 1080 });

      logger.log(`Navigating to ${this.url}...`);
      await delayRetry(
        async () => {
          await page.goto(this.url, {
            waitUntil: 'networkidle2',
            timeout: 60000,
          });
        },
        3,
        5000,
        logger,
      );

      // Simulate human behaviour on the first page.
      logger.log('Simulating human mouse movements...');
      await simulateHumanMouseMovement(page);
      logger.log('Simulating human scrolling...');
      await simulateHumanScrolling(page);

      while (currentPage <= maxPages) {
        logger.log(`Processing page ${currentPage}...`);
        const content = await page.content();
        const pageResults = this.extract(content);
        if (pageResults.length === 0) {
          logger.warn(`No results found on page ${currentPage}, stopping.`);
          break;
        }
        allResults.push(...pageResults);
        logger.log(
          `Extracted ${pageResults.length} items from page ${currentPage}`,
        );

        // Simulate human behaviour before switching pages.
        logger.log('Simulating human mouse movements before pagination...');
        await simulateHumanMouseMovement(page);
        logger.log('Simulating human scrolling before pagination...');
        await simulateHumanScrolling(page);

        // Locate the "next page" link. Pagination markup on the site looks like:
        // <a href="javascript:;" aria-label="Next" class="pageItem" page="2">...</a>
        const nextButtonSelector = 'a.pageItem[aria-label="Next"]';
        const nextButton = await page.$(nextButtonSelector);
        if (!nextButton) {
          logger.log('Next page button not found. Reached end of list.');
          break;
        }

        logger.log(`Navigating to page ${currentPage + 1}...`);
        try {
          // Click the link, then give the list a fixed 3 s to refresh.
          await nextButton.click();
          await new Promise((r) => setTimeout(r, 3000));
        } catch (navError) {
          const navErrorMessage =
            navError instanceof Error ? navError.message : String(navError);
          logger.error(
            `Navigation to page ${currentPage + 1} failed: ${navErrorMessage}`,
          );
          break;
        }
        currentPage++;

        // Simulate human behaviour after switching pages.
        logger.log('Simulating human mouse movements after pagination...');
        await simulateHumanMouseMovement(page);
        logger.log('Simulating human scrolling after pagination...');
        await simulateHumanScrolling(page);

        // Random 1000-3000 ms delay between pages.
        const delay = Math.floor(Math.random() * (3000 - 1000 + 1)) + 1000;
        await new Promise((resolve) => setTimeout(resolve, delay));
      }
      return allResults;
    } catch (error) {
      const errorMessage =
        error instanceof Error ? error.message : String(error);
      logger.error(`Failed to crawl ${this.name}: ${errorMessage}`);
      throw error;
    } finally {
      await page.close();
    }
  },

  /**
   * Extracts announcement entries from the HTML of a list page.
   *
   * Entries without a title or an href are skipped. Hrefs that do not start
   * with `http` are prefixed with `baseUrl`. A date that cannot be parsed
   * falls back to the current time.
   *
   * @param html - Full HTML of a list page.
   * @returns The entries found, in document order.
   */
  extract(this: EpsCrawlerType, html: string): EpsResult[] {
    const results: EpsResult[] = [];
    /**
     * Regex groups for eps.ctg.com.cn:
     *   1: URL   (href attribute)
     *   2: Title (title attribute)
     *   3: Date  (publication date, e.g. 2026-01-09)
     *
     * Example of the HTML structure being matched:
     *   <li name="li_name">
     *     <a id="0" href="https://eps.ctg.com.cn/cms/channel/1ywgg1/240630340.htm"
     *        title="(announcement title)" target="_blank" style="">
     *       <span style="max-width: 700px;">
     *         <i class="iconfont"></i>
     *         <em style="..."></em>
     *         (announcement title)
     *       </span>
     *       <em>2026-01-09</em>
     *     </a>
     *   </li>
     */
    const regex =
      /<li[^>]*name="li_name"[^>]*>[\s\S]*?<a[^>]*href="([^"]*)"[^>]*title="([^"]*)"[^>]*>[\s\S]*?<em>\s*(\d{4}-\d{2}-\d{2})\s*<\/em>[\s\S]*?<\/a>[\s\S]*?<\/li>/gs;
    let match: RegExpExecArray | null;
    while ((match = regex.exec(html)) !== null) {
      const url = match[1]?.trim() ?? '';
      const title = match[2]?.trim() ?? '';
      const dateStr = match[3]?.trim() ?? '';
      if (!title || !url) {
        continue;
      }
      const fullUrl = url.startsWith('http') ? url : this.baseUrl + url;
      // A date-only `YYYY-MM-DD` string is parsed by `Date` as UTC midnight.
      // Fall back to "now" when the text is missing or not a real date.
      const parsedDate = new Date(dateStr);
      const publishDate = Number.isNaN(parsedDate.getTime())
        ? new Date()
        : parsedDate;
      results.push({
        title,
        publishDate,
        // Collapse accidental duplicate slashes (e.g. a baseUrl ending in "/"
        // joined with an href starting with "/") while keeping the "//" that
        // follows the scheme's colon. Replacing every "//" would turn
        // "https://host" into the malformed "https:/host".
        url: fullUrl.replace(/([^:])\/{2,}/g, '$1/'),
      });
    }
    return results;
  },
};