Files
bidding_watcher/src/crawler/services/espic_target.ts

248 lines
8.3 KiB
TypeScript
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import * as puppeteer from 'puppeteer';
import { Logger } from '@nestjs/common';
/**
 * Moves the mouse pointer to a series of random positions inside the viewport
 * to imitate a human user.
 *
 * Performs 5-9 moves. Each move is interpolated over 10-29 steps and is
 * followed by a random pause of 100-500 ms. Returns immediately when the page
 * reports no viewport.
 *
 * @param page - Puppeteer page whose mouse is driven.
 */
async function simulateHumanMouseMovement(page: puppeteer.Page) {
  const size = page.viewport();
  if (size) {
    let remaining = 5 + Math.floor(Math.random() * 5);
    while (remaining-- > 0) {
      const targetX = Math.floor(Math.random() * size.width);
      const targetY = Math.floor(Math.random() * size.height);
      // More steps make the pointer travel more smoothly.
      const steps = 10 + Math.floor(Math.random() * 20);
      await page.mouse.move(targetX, targetY, { steps });

      // Random pause between moves.
      const pauseMs = 100 + Math.random() * 400;
      await new Promise((wake) => setTimeout(wake, pauseMs));
    }
  }
}
/**
 * Scrolls the page down in several random increments and then back to the top
 * to imitate a human reader.
 *
 * Performs 3-7 smooth scrolls of 100-499 px, each followed by a random pause
 * of 500-1500 ms, then smooth-scrolls to the top and waits one second.
 *
 * @param page - Puppeteer page to scroll.
 */
async function simulateHumanScrolling(page: puppeteer.Page) {
  const sleep = (ms: number) => new Promise((done) => setTimeout(done, ms));

  const totalScrolls = 3 + Math.floor(Math.random() * 5);
  let performed = 0;
  while (performed < totalScrolls) {
    const offset = 100 + Math.floor(Math.random() * 400);
    await page.evaluate((distance) => {
      window.scrollBy({ top: distance, behavior: 'smooth' });
    }, offset);

    // Random pause between scrolls.
    await sleep(500 + Math.random() * 1000);
    performed += 1;
  }

  // Return to the top of the page.
  await page.evaluate(() => {
    window.scrollTo({ top: 0, behavior: 'smooth' });
  });
  await sleep(1000);
}
/** One entry scraped from the ebid.espic.com.cn tender listing. */
export interface EspicResult {
/** Title of the listing entry, taken from the entry link's `title` attribute. */
title: string;
/** Publication date shown for the entry in the listing. */
publishDate: Date;
/** Link to the entry's detail page, taken from the entry link's `href`. */
url: string;
}
/** Named HTML entities decoded when reading attribute values from serialized markup. */
const ESPIC_NAMED_ENTITIES: ReadonlyMap<string, string> = new Map([
  ['amp', '&'],
  ['lt', '<'],
  ['gt', '>'],
  ['quot', '"'],
  ['apos', "'"],
  ['nbsp', '\u00a0'],
]);

/** Matches one `<li>` element (with or without attributes); group 1 is its inner HTML. */
const ESPIC_LIST_ITEM_SOURCE = '<li\\b[^>]*>([\\s\\S]*?)<\\/li>';

/** Matches an `href` attribute; the value is in group 1 (double quotes) or group 2 (single quotes). */
const ESPIC_HREF_ATTR = /\shref\s*=\s*(?:"([^"]*)"|'([^']*)')/i;

/** Matches a `title` attribute; the value is in group 1 (double quotes) or group 2 (single quotes). */
const ESPIC_TITLE_ATTR = /\stitle\s*=\s*(?:"([^"]*)"|'([^']*)')/i;

/** Matches the `newsDate` container and captures the `YYYY-MM-DD` text of its inner `<div>`. */
const ESPIC_NEWS_DATE =
  /<div\b[^>]*class="[^"]*\bnewsDate\b[^"]*"[^>]*>[\s\S]*?<div>\s*(\d{4}-\d{2}-\d{2})\s*<\/div>/i;

/** Turns a caught value of unknown type into a loggable message. */
function espicErrorMessage(error: unknown): string {
  return error instanceof Error ? error.message : String(error);
}

/** Resolves after `ms` milliseconds. */
function espicPause(ms: number): Promise<void> {
  return new Promise((resolve) => setTimeout(resolve, ms));
}

/**
 * Decodes numeric entities and the basic named entities in `text`.
 * Unknown or out-of-range entities are left untouched. Decoding is done in a
 * single pass, so `&amp;lt;` becomes `&lt;` rather than `<`.
 */
function espicDecodeEntities(text: string): string {
  return text.replace(/&(#[xX][0-9a-fA-F]+|#\d+|[a-z]+);/g, (entity: string, body: string) => {
    if (body.charAt(0) !== '#') {
      return ESPIC_NAMED_ENTITIES.get(body) ?? entity;
    }
    const isHex = body.charAt(1).toLowerCase() === 'x';
    const codePoint = parseInt(body.slice(isHex ? 2 : 1), isHex ? 16 : 10);
    return codePoint > 0 && codePoint <= 0x10ffff ? String.fromCodePoint(codePoint) : entity;
  });
}

/** Returns the decoded, trimmed attribute value matched by `pattern` in `tag`, or `''` when absent. */
function espicReadAttribute(tag: string, pattern: RegExp): string {
  const found = pattern.exec(tag);
  if (!found) return '';
  return espicDecodeEntities(found[1] ?? found[2] ?? '').trim();
}

/**
 * Makes `href` absolute. `http(s)` links are returned unchanged; anything else
 * is resolved against `baseUrl`, so protocol-relative and slash-less hrefs do
 * not produce malformed URLs.
 */
function espicResolveUrl(href: string, baseUrl: string): string {
  if (/^https?:\/\//i.test(href)) return href;
  try {
    return new URL(href, baseUrl).href;
  } catch {
    // Unparseable href: fall back to plain concatenation.
    return baseUrl + href;
  }
}

/**
 * Parses the inner HTML of one `<li>`. Returns a result only when the item
 * contains both a `newsDate` date and a link carrying `href` and `title`.
 */
function espicParseListItem(itemHtml: string, baseUrl: string): EspicResult | undefined {
  const dateText = ESPIC_NEWS_DATE.exec(itemHtml)?.[1];
  if (!dateText) return undefined;

  // A fresh regex per call keeps `lastIndex` from leaking between items.
  const anchorPattern = /<a\b[^>]*>/gi;
  let anchor: RegExpExecArray | null;
  while ((anchor = anchorPattern.exec(itemHtml)) !== null) {
    const href = espicReadAttribute(anchor[0], ESPIC_HREF_ATTR);
    const title = espicReadAttribute(anchor[0], ESPIC_TITLE_ATTR);
    if (!href || !title) continue;

    const parsed = new Date(dateText);
    return {
      title,
      // A well-formed but impossible date (e.g. month 13) falls back to "now".
      publishDate: Number.isNaN(parsed.getTime()) ? new Date() : parsed,
      url: espicResolveUrl(href, baseUrl),
    };
  }
  return undefined;
}

/** Applies proxy credentials (when configured), anti-detection overrides, user agent and viewport. */
async function espicPreparePage(page: puppeteer.Page): Promise<void> {
  const username = process.env.PROXY_USERNAME;
  const password = process.env.PROXY_PASSWORD;
  if (username && password) {
    await page.authenticate({ username, password });
  }
  await page.evaluateOnNewDocument(() => {
    Object.defineProperty(navigator, 'webdriver', { get: () => false });
    Object.defineProperty(navigator, 'language', { get: () => "zh-CN"});
    Object.defineProperty(navigator, 'plugins', { get: () => [1,2,3,4,5]});
  });
  await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36');
  await page.setViewport({ width: 1920, height: 1080 });
}

/**
 * Locates the control that leads to page `nextPage`.
 *
 * The pager markup noted for this site is
 * `<a href="javascript:void(0);" onclick="turnPage(2);">下一页</a>`, so the
 * exact `turnPage(<nextPage>)` call is tried first. A bare `turnPage` match
 * could hit any other pager link. The visible "下一页" label and a few common
 * pager selectors are fallbacks.
 */
async function espicFindNextPageButton(
  page: puppeteer.Page,
  nextPage: number,
): Promise<puppeteer.ElementHandle<Element> | null> {
  const exact = await page.$(`a[onclick*="turnPage(${nextPage})"]`);
  if (exact) return exact;

  // CSS has no text selector (`:contains` is jQuery-only), so compare labels manually.
  for (const anchor of await page.$$('a')) {
    const label = await anchor.evaluate((node) => (node.textContent ?? '').trim());
    if (label === '下一页') return anchor;
  }

  const fallbackSelectors = [
    'a[aria-label="Next"]',
    'a.next',
    'li.next a',
    'a.layui-laypage-next:not(.layui-disabled)',
  ];
  for (const selector of fallbackSelectors) {
    const handle = await page.$(selector);
    if (handle) return handle;
  }
  return null;
}

/** Crawler for the tender listing on ebid.espic.com.cn. */
export const EspicCrawler = {
  name: '电能e招采平台国电投',
  baseUrl: 'https://ebid.espic.com.cn',

  /**
   * Builds the listing URL for the given page number.
   *
   * @param page - 1-based page number placed in the `page` query parameter.
   * @returns Listing URL whose `time` parameter is today's local date as
   *   `YYYY-M-D` (month and day are not zero-padded).
   */
  getUrl(page: number = 1): string {
    const now = new Date();
    const year = now.getFullYear();
    const month = now.getMonth() + 1; // getMonth() is zero-based
    const day = now.getDate();
    const timeStr = `${year}-${month}-${day}`;
    return `https://ebid.espic.com.cn/newgdtcms//category/iframe.html?dates=300&categoryId=2&tenderMethod=01&tabName=&page=${page}&time=${timeStr}`;
  },

  /**
   * Opens the listing in a new page, waits for the WAF check to clear and
   * collects entries from up to five listing pages.
   *
   * Failures during navigation or pagination are logged and whatever has been
   * collected so far is returned. Failures while opening or configuring the
   * page are rethrown. The page is always closed, and a failure to close it is
   * logged rather than thrown so collected results are not lost.
   *
   * @param browser - Puppeteer browser used to open the page.
   * @returns Collected entries, de-duplicated by URL.
   */
  async crawl(browser: puppeteer.Browser): Promise<EspicResult[]> {
    const logger = new Logger('EspicCrawler');
    const maxPages = 5;
    const allResults: EspicResult[] = [];
    const seenUrls = new Set<string>();
    const page = await browser.newPage();

    try {
      // Inside the try so the page is closed even when setup fails.
      await espicPreparePage(page);

      try {
        const url = this.getUrl(1);
        logger.log(`Navigating to ${url}...`);
        await page.goto(url, { waitUntil: 'networkidle2', timeout: 60000 });

        // Wait until the body no longer shows the WAF challenge text.
        logger.log('Waiting for WAF verification...');
        await page.waitForFunction(
          () => {
            const bodyText = document.body?.textContent || '';
            return !bodyText.includes('人机识别检测') && !bodyText.includes('WEB 应用防火墙');
          },
          { timeout: 30000 }
        );

        logger.log('Simulating human mouse movements...');
        await simulateHumanMouseMovement(page);
        logger.log('Simulating human scrolling...');
        await simulateHumanScrolling(page);

        for (let currentPage = 1; currentPage <= maxPages; currentPage++) {
          logger.log(`Processing page ${currentPage}...`);
          const pageResults = this.extract(await page.content());
          if (pageResults.length === 0) {
            logger.warn(`No results found on page ${currentPage}, stopping.`);
            break;
          }

          // If a click had no effect the same list is read again; keep only
          // unseen entries and stop once a page adds nothing.
          const freshResults = pageResults.filter((result) => {
            if (seenUrls.has(result.url)) return false;
            seenUrls.add(result.url);
            return true;
          });
          if (freshResults.length === 0) {
            logger.warn(`Page ${currentPage} contained no new items, stopping.`);
            break;
          }
          allResults.push(...freshResults);
          logger.log(`Extracted ${freshResults.length} items from page ${currentPage}`);

          // Do not load a page that will never be read.
          if (currentPage >= maxPages) {
            logger.log(`Reached page limit (${maxPages}).`);
            break;
          }

          logger.log('Simulating human mouse movements before pagination...');
          await simulateHumanMouseMovement(page);
          logger.log('Simulating human scrolling before pagination...');
          await simulateHumanScrolling(page);

          const nextButton = await espicFindNextPageButton(page, currentPage + 1);
          if (!nextButton) {
            logger.log('Next page button not found. Reached end of list.');
            break;
          }

          logger.log(`Navigating to page ${currentPage + 1}...`);
          try {
            await nextButton.click();
            await espicPause(3000); // give the next page time to load
          } catch (navError) {
            logger.error(`Navigation to page ${currentPage + 1} failed: ${espicErrorMessage(navError)}`);
            break;
          }

          logger.log('Simulating human mouse movements after pagination...');
          await simulateHumanMouseMovement(page);
          logger.log('Simulating human scrolling after pagination...');
          await simulateHumanScrolling(page);

          // Random 1-3 s delay between pages.
          await espicPause(Math.floor(Math.random() * (3000 - 1000 + 1)) + 1000);
        }
      } catch (error) {
        logger.error(`Failed to crawl ${this.name}: ${espicErrorMessage(error)}`);
      }
      return allResults;
    } finally {
      try {
        await page.close();
      } catch (closeError) {
        logger.warn(`Failed to close page: ${espicErrorMessage(closeError)}`);
      }
    }
  },

  /**
   * Extracts listing entries from listing-page HTML.
   *
   * Each `<li>` is parsed on its own, so unrelated list items cannot be merged
   * with a real entry. An item is accepted when it contains a link with both
   * `href` and `title` (in any order) and a date inside the `newsDate` block.
   *
   * Expected item shape:
   *
   *   <li>
   *     <a href="https://ebid.espic.com.cn/sdny_bulletin/2026-01-11/977309.html"
   *        title="(full tender title)" class="clearfix" target="_blank">
   *       <div class="row">
   *         <div class="col-10 ">
   *           <h5>(truncated title)...</h5>
   *           <dl class="newsinfo row">...</dl>
   *         </div>
   *         <div class="col-2 ">
   *           <div class="newsDate">
   *             <div>2026-01-11</div>
   *           </div>
   *         </div>
   *       </div>
   *     </a>
   *   </li>
   *
   * @param html - Listing-page markup, e.g. the result of `page.content()`.
   * @returns One result per accepted item, in document order. Titles and URLs
   *   have HTML entities decoded and URLs are absolute.
   */
  extract(html: string): EspicResult[] {
    const results: EspicResult[] = [];
    // Built per call so the global regex state never leaks between calls.
    const itemPattern = new RegExp(ESPIC_LIST_ITEM_SOURCE, 'gi');
    let item: RegExpExecArray | null;
    while ((item = itemPattern.exec(html)) !== null) {
      const parsed = espicParseListItem(item[1] ?? '', this.baseUrl);
      if (parsed) results.push(parsed);
    }
    return results;
  }
};