Files
bidding_watcher/src/crawler/services/chng_target.ts
dmy 044fd770f7 feat: 增强华能电商平台爬虫的反检测能力和数据结构
- 在 BidItem 实体中添加 priority 和 unit 字段,扩展数据结构
- 将爬虫浏览器模式改为非无头模式(headless: false)便于调试
- 为 ChngCrawler 添加人类行为模拟功能:
  * 模拟鼠标随机移动,增加移动步数和停顿时间
  * 模拟人类滚动行为,包括随机滚动距离和停顿
  * 添加 navigator 属性伪装,包括语言、插件等
- 在关键节点添加截图功能(bing.png, newPage.png, huaneng.png)用于调试
- 优化反检测策略,降低被目标网站识别为机器人的风险
2026-01-11 21:35:24 +08:00

234 lines
8.4 KiB
TypeScript
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import * as puppeteer from 'puppeteer';
import { Logger } from '@nestjs/common';
import { ChdtpResult } from './chdtp_target';
/**
 * Imitates a human moving the mouse by gliding the pointer to a series of
 * random positions inside the page's viewport.
 *
 * Performs 5-9 moves. Each move is interpolated over 10-29 intermediate steps
 * so the path is smooth, and is followed by a random pause of 100-500 ms.
 * Returns immediately when the page reports no viewport.
 *
 * @param page Puppeteer page whose mouse is driven.
 */
async function simulateHumanMouseMovement(page: puppeteer.Page) {
  const size = page.viewport();
  if (size == null) {
    return;
  }

  // Uniform random integer in [0, limit).
  const randomBelow = (limit: number): number => Math.floor(Math.random() * limit);
  const sleep = (ms: number) => new Promise((resolve) => setTimeout(resolve, ms));

  let remainingMoves = 5 + randomBelow(5);
  while (remainingMoves-- > 0) {
    const targetX = randomBelow(size.width);
    const targetY = randomBelow(size.height);
    const steps = 10 + randomBelow(20);
    await page.mouse.move(targetX, targetY, { steps });
    // Hesitate briefly before the next move.
    await sleep(100 + Math.random() * 400);
  }
}
/**
 * Imitates a human reading the page by scrolling down in random increments
 * and then returning to the top.
 *
 * Performs 3-7 smooth scrolls of 100-499 px, pausing 500-1500 ms after each,
 * then smooth-scrolls back to the top and waits one more second.
 *
 * @param page Puppeteer page to scroll.
 */
async function simulateHumanScrolling(page: puppeteer.Page) {
  const sleep = (ms: number) => new Promise((resolve) => setTimeout(resolve, ms));

  const totalScrolls = 3 + Math.floor(Math.random() * 5);
  for (let done = 0; done !== totalScrolls; done += 1) {
    const deltaY = 100 + Math.floor(Math.random() * 400);
    await page.evaluate((top) => {
      window.scrollBy({ top, behavior: 'smooth' });
    }, deltaY);
    // Linger on the newly revealed content.
    await sleep(500 + Math.random() * 1000);
  }

  // Return to the top of the document and let the animation settle.
  await page.evaluate(() => {
    window.scrollTo({ top: 0, behavior: 'smooth' });
  });
  await sleep(1000);
}
/**
 * Crawler for the China Huaneng Group e-commerce platform (华能集团电子商务平台).
 *
 * The crawl reaches the site the way a person would: it searches for the site
 * on Bing, follows the first result, clicks the "采购专栏" entry on the home
 * page and then reads the announcement table page by page.
 */
export const ChngCrawler = {
  name: '华能集团电子商务平台',
  url: 'https://ec.chng.com.cn/ecmall/index.html#/purchase/home?top=0',
  baseUrl: 'https://ec.chng.com.cn/ecmall/index.html',

  /**
   * Crawls up to five pages of the purchase announcement table.
   *
   * Side effects: writes the debug screenshots `bing.png`, `newPage.png` and
   * `huaneng.png` to the current working directory, and closes every tab it
   * opened before returning.
   *
   * @param browser Puppeteer browser used to open the crawl tabs.
   * @returns The items collected so far. Failures during navigation or
   *   extraction are logged and the partial list is returned instead of
   *   throwing; failures while configuring the initial tab still propagate.
   */
  async crawl(browser: puppeteer.Browser): Promise<ChdtpResult[]> {
    const logger = new Logger('ChngCrawler');
    const username = process.env.PROXY_USERNAME;
    const password = process.env.PROXY_PASSWORD;
    const maxPages = 5;
    const allResults: ChdtpResult[] = [];
    // Every tab this crawl touches, so none is left open when it finishes.
    const openedPages: puppeteer.Page[] = [];

    const pause = (ms: number): Promise<void> =>
      new Promise<void>((resolve) => setTimeout(resolve, ms));

    const describeError = (error: unknown): string =>
      error instanceof Error ? error.message : String(error);

    // Supplies proxy credentials to a tab when both are configured.
    const applyProxyAuth = async (target: puppeteer.Page): Promise<void> => {
      if (username && password) {
        await target.authenticate({ username, password });
      }
    };

    // Runs the mouse-movement and scrolling simulations back to back.
    const actLikeHuman = async (target: puppeteer.Page): Promise<void> => {
      logger.log('Simulating human mouse movements...');
      await simulateHumanMouseMovement(target);
      logger.log('Simulating human scrolling...');
      await simulateHumanScrolling(target);
    };

    let page = await browser.newPage();
    openedPages.push(page);

    try {
      // Tab setup stays outside the inner try so that setup failures keep
      // propagating to the caller, as they always have.
      await applyProxyAuth(page);
      await page.evaluateOnNewDocument(() => {
        // Mask the most common automation fingerprints.
        Object.defineProperty(navigator, 'webdriver', { get: () => false });
        Object.defineProperty(navigator, 'language', { get: () => "zh-CN"});
        Object.defineProperty(navigator, 'plugins', { get: () => [1,2,3,4,5]});
      });
      await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36');
      await page.setViewport({ width: 1920, height: 1080 });

      try {
        logger.log('Navigating to Bing...');
        await page.goto('https://cn.bing.com', { waitUntil: 'networkidle2' });

        logger.log('Searching for target site...');
        const searchBoxSelector = 'input[name="q"]';
        await page.waitForSelector(searchBoxSelector);
        await page.type(searchBoxSelector, 'https://ec.chng.com.cn/');
        // Arm the navigation wait before pressing Enter; waiting afterwards
        // can miss a navigation that has already completed.
        await Promise.all([
          page.waitForNavigation({ waitUntil: 'networkidle2' }),
          page.keyboard.press('Enter'),
        ]);

        logger.log('Clicking search result...');
        await page.screenshot({ path: 'bing.png' });
        const firstResultSelector = '#b_results .b_algo h2 a';
        await page.waitForSelector(firstResultSelector);

        // The result normally opens in a new tab. If none appears, keep
        // working in the current tab instead of aborting the crawl.
        const searchPage = page;
        const newTargetPromise = browser
          .waitForTarget((target) => target.opener() === searchPage.target())
          .catch(() => {
            logger.warn('Search result did not open a new tab; continuing in the current tab.');
            return null;
          });
        await page.click(firstResultSelector);
        const newTarget = await newTargetPromise;
        const newPage = newTarget ? await newTarget.page() : null;
        if (newPage) {
          openedPages.push(newPage);
          await newPage.screenshot({ path: 'newPage.png' });
          await page.close();
          page = newPage;
          await applyProxyAuth(page);
        }

        await actLikeHuman(page);
        // Best effort: a navigation may or may not still be in flight here,
        // so a timeout is expected and is only noted at debug level.
        await page
          .waitForNavigation({ waitUntil: 'domcontentloaded' })
          .catch(() => logger.debug('No further navigation observed; continuing.'));
        await actLikeHuman(page);

        // PAUSE 15 SECONDS as requested
        logger.log('Pausing 15 seconds before looking for "采购专栏"...');
        await pause(15000);
        await page.screenshot({ path: 'huaneng.png' });

        logger.log('Looking for "采购专栏" link...');
        await page.waitForFunction(() => {
          const divs = Array.from(document.querySelectorAll('div.text'));
          return divs.some((div) => div.textContent?.includes('采购专栏') === true);
        }, { timeout: 60000 });

        // Clicking the entry may open the purchase list in a new tab; if no
        // tab shows up within 15 s the list is assumed to be in this one.
        const homePage = page;
        const purchaseTargetPromise = browser
          .waitForTarget((target) => target.opener() === homePage.target(), { timeout: 15000 })
          .catch(() => null);
        await page.evaluate(() => {
          const divs = Array.from(document.querySelectorAll<HTMLElement>('div.text'));
          const entry = divs.find((div) => div.textContent?.includes('采购专栏') === true);
          if (entry) entry.click();
        });
        const purchaseTarget = await purchaseTargetPromise;
        const purchasePage = purchaseTarget ? await purchaseTarget.page() : null;
        if (purchasePage) {
          logger.log('Switched to Purchase Page tab.');
          // The opener tab stays registered in openedPages and is closed in
          // the outer finally; previously it was leaked.
          openedPages.push(purchasePage);
          page = purchasePage;
          await applyProxyAuth(page);
          await pause(5000);
        }
        logger.log(`Active URL: ${page.url()}`);

        await actLikeHuman(page);

        for (let currentPage = 1; currentPage <= maxPages; currentPage++) {
          logger.log(`Processing page ${currentPage}...`);

          // Wait for table rows to load
          await page
            .waitForFunction(
              () => document.querySelectorAll('tr.ant-table-row').length > 0,
              { timeout: 60000 },
            )
            .catch(() => logger.warn('Content not found. Site might be slow.'));

          const pageResults = await page.evaluate((baseUrl: string) => {
            const rows: { title: string; dateStr: string; url: string }[] = [];
            document.querySelectorAll('tr.ant-table-row').forEach((row) => {
              const titleSpan = row.querySelector('span.list-text');
              const dateCell = row.querySelector('td.ant-table-row-cell-break-word p');
              if (!titleSpan || !dateCell) return;

              const title = titleSpan.textContent?.trim() || '';
              if (title.length < 5) return; // Filter noise

              // The table has no direct link, so the detail URL is built
              // from the row's data-row-key attribute.
              const rowKey = row.getAttribute('data-row-key');
              rows.push({
                title,
                dateStr: dateCell.textContent?.trim() || '',
                url: rowKey ? `${baseUrl}#/purchase/detail?id=${rowKey}` : '',
              });
            });
            return rows;
          }, this.baseUrl);

          if (pageResults.length === 0) {
            logger.warn(`No results found on page ${currentPage}. Extraction failed.`);
            break;
          }

          for (const { title, dateStr, url } of pageResults) {
            allResults.push({ title, publishDate: new Date(dateStr), url });
          }
          logger.log(`Extracted ${pageResults.length} items.`);

          // Nothing more will be read, so skip the pointless click and wait.
          if (currentPage >= maxPages) break;

          // Pagination: look for the "right" icon SVG
          const nextButton = await page.$('svg[data-icon="right"]');
          if (!nextButton) break;

          // NOTE(review): assumes the Ant Design pager marks an exhausted
          // "next" control with `ant-pagination-disabled` / aria-disabled —
          // confirm against the live markup. Clicking a disabled control
          // would re-scrape the same page and produce duplicates.
          const nextDisabled = await nextButton.evaluate(
            (icon) => icon.closest('.ant-pagination-disabled, [aria-disabled="true"]') !== null,
          );
          if (nextDisabled) {
            logger.log('Next-page control is disabled; reached the last page.');
            break;
          }

          await nextButton.click();
          await pause(5000);
        }

        return allResults;
      } catch (error: unknown) {
        logger.error(`Crawl failed: ${describeError(error)}`);
        return allResults;
      }
    } finally {
      // Close every tab that is still open. A failing close must not mask
      // the results (or the error) already on their way to the caller.
      for (const opened of openedPages) {
        if (opened.isClosed()) continue;
        await opened
          .close()
          .catch((closeError: unknown) => logger.warn(`Failed to close page: ${describeError(closeError)}`));
      }
    }
  },

  /** Placeholder: this crawler extracts inside `crawl`, so this returns an empty list. */
  extract() { return []; }
};