Files
bidding_watcher/src/crawler/services/szecp_target.ts
dmy 6d626a0946 feat: 新增多个电力集团采购平台爬虫
- 新增4个电力集团采购平台爬虫:
  * 中国大唐集团电子商务平台 (CdtCrawler)
  * 大连能源采购平台 (CeicCrawler)
  * 华润守正采购交易平台 (SzecpCrawler)
- 更新 BidCrawlerService,集成新的爬虫到爬虫任务中
- 添加环境变量示例文件 .env.example,包含数据库和代理配置
- 优化 .env 文件,添加代理配置示例
- 为所有新爬虫添加完整的单元测试文件
- 使用与现有爬虫相同的反检测策略(人类行为模拟)
- 支持分页抓取,每个平台最多抓取5页数据
- 统一的错误处理机制,单个爬虫失败不影响其他爬虫执行
2026-01-11 22:34:38 +08:00

171 lines
5.6 KiB
TypeScript
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import * as puppeteer from 'puppeteer';
import { Logger } from '@nestjs/common';
import { ChdtpResult } from './chdtp_target';
/**
 * Simulates human-like mouse activity on the given page.
 *
 * Moves the pointer to 5–9 random positions inside the current viewport,
 * using 10–29 interpolation steps per move, and pauses 100–500 ms after
 * each move. Does nothing when the page reports no viewport.
 */
async function simulateHumanMouseMovement(page: puppeteer.Page) {
  const size = page.viewport();
  if (!size) {
    return;
  }

  let remaining = 5 + Math.floor(Math.random() * 5);
  while (remaining > 0) {
    const targetX = Math.floor(Math.random() * size.width);
    const targetY = Math.floor(Math.random() * size.height);
    // More steps make the pointer path smoother.
    const steps = 10 + Math.floor(Math.random() * 20);
    await page.mouse.move(targetX, targetY, { steps });

    // Random pause between moves.
    const pauseMs = 100 + Math.random() * 400;
    await new Promise(resolve => setTimeout(resolve, pauseMs));

    remaining -= 1;
  }
}
/**
 * Simulates human-like scrolling on the given page.
 *
 * Performs 3–7 smooth downward scrolls of 100–499 px each, pausing
 * 500–1500 ms after every scroll, then smooth-scrolls back to the top
 * and waits one second.
 */
async function simulateHumanScrolling(page: puppeteer.Page) {
  const wait = (ms: number) => new Promise(resolve => setTimeout(resolve, ms));

  const totalScrolls = 3 + Math.floor(Math.random() * 5);
  for (let remaining = totalScrolls; remaining > 0; remaining--) {
    const offset = 100 + Math.floor(Math.random() * 400);
    // The callback runs in the browser context, so the offset is passed in
    // as an argument rather than captured from this scope.
    await page.evaluate((dy) => {
      window.scrollBy({ top: dy, behavior: 'smooth' });
    }, offset);

    // Random pause between scrolls.
    await wait(500 + Math.random() * 1000);
  }

  // Return to the top of the page.
  await page.evaluate(() => {
    window.scrollTo({ top: 0, behavior: 'smooth' });
  });
  await wait(1000);
}
/**
 * Crawler for the announcement listing at `url`.
 *
 * `crawl` opens a page in the supplied browser, triggers the on-page search,
 * and collects title / publish date / link for each listed row across at
 * most five result pages.
 */
export const SzecpCrawler = {
  name: '华润守正采购交易平台',
  url: 'https://www.szecp.com.cn/first_zbgg/index.html',
  baseUrl: 'https://www.szecp.com.cn',

  /**
   * Crawls the listing and returns the extracted entries.
   *
   * Any failure after the page has been opened (setup, navigation,
   * extraction, pagination) is logged and the entries collected so far are
   * returned; the page is always closed before returning.
   *
   * @param browser Puppeteer browser used to open the crawl page.
   * @returns Entries gathered from up to five result pages (possibly empty).
   */
  async crawl(browser: puppeteer.Browser): Promise<ChdtpResult[]> {
    const logger = new Logger('SzecpCrawler');
    const allResults: ChdtpResult[] = [];
    const maxPages = 5;
    const pause = (ms: number) => new Promise<void>(resolve => setTimeout(resolve, ms));

    const page = await browser.newPage();

    try {
      // Page setup lives inside the try block so that the page is closed
      // even when one of these calls fails.
      const username = process.env.PROXY_USERNAME;
      const password = process.env.PROXY_PASSWORD;
      if (username && password) {
        await page.authenticate({ username, password });
      }

      // Override navigator properties before any page script runs.
      await page.evaluateOnNewDocument(() => {
        Object.defineProperty(navigator, 'webdriver', { get: () => false });
        Object.defineProperty(navigator, 'language', { get: () => 'zh-CN' });
        Object.defineProperty(navigator, 'plugins', { get: () => [1, 2, 3, 4, 5] });
      });

      await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36');
      await page.setViewport({ width: 1920, height: 1080 });

      logger.log(`Navigating to ${this.url}...`);
      await page.goto(this.url, { waitUntil: 'networkidle2', timeout: 60000 });

      // Simulate human behaviour before interacting with the page.
      logger.log('Simulating human mouse movements...');
      await simulateHumanMouseMovement(page);
      logger.log('Simulating human scrolling...');
      await simulateHumanScrolling(page);

      // Wait for the search button to be available and click it.
      logger.log('Clicking search button...');
      await page.waitForSelector('.szb-zbcgSearch-key-v1', { timeout: 60000 });
      await page.click('.szb-zbcgSearch-key-v1');
      await pause(3000); // Wait for results to load

      for (let currentPage = 1; currentPage <= maxPages; currentPage++) {
        logger.log(`Processing page ${currentPage}...`);

        // Wait for result rows; a timeout is only logged so that the
        // extraction below decides whether to stop.
        try {
          await page.waitForFunction(() => {
            return document.querySelectorAll('.szb-zbcgTable-other').length > 0;
          }, { timeout: 60000 });
        } catch {
          logger.warn('Content not found. Site might be slow.');
        }

        // Runs in the browser context: must not reference outer variables
        // other than the explicitly passed baseUrl.
        const pageResults = await page.evaluate((baseUrl) => {
          const rows: { title: string; dateStr: string; url: string }[] = [];
          const items = Array.from(document.querySelectorAll('.szb-zbcgTable-other'));
          for (const item of items) {
            const divs = item.querySelectorAll('div');
            if (divs.length < 5) continue;

            const titleLink = divs[1].querySelector('a');
            const title = titleLink?.textContent?.trim() || '';
            if (title.length < 5) continue; // Filter noise

            const dateStr = divs[4].textContent?.trim() || '';
            const href = titleLink?.getAttribute('href') || '';
            // Construct full URL if href is relative
            const url = href.startsWith('http') ? href : `${baseUrl}${href}`;
            rows.push({ title, dateStr, url });
          }
          return rows;
        }, this.baseUrl);

        if (pageResults.length === 0) {
          logger.warn(`No results found on page ${currentPage}. Extraction failed.`);
          break;
        }

        for (const row of pageResults) {
          allResults.push({
            title: row.title,
            // NOTE(review): an empty or unparsable date cell produces an
            // Invalid Date here — confirm the site's date format.
            publishDate: new Date(row.dateStr),
            url: row.url
          });
        }
        logger.log(`Extracted ${pageResults.length} items.`);

        // Pagination: look for next page link
        const nextButton = await page.$('.pagination li a[page="+"]');
        if (!nextButton) break;
        await nextButton.click();
        await pause(3000);

        // Simulate human behaviour after each page change.
        logger.log('Simulating human mouse movements...');
        await simulateHumanMouseMovement(page);
        logger.log('Simulating human scrolling...');
        await simulateHumanScrolling(page);
      }

      return allResults;
    } catch (error) {
      // The thrown value is not guaranteed to be an Error instance.
      const message = error instanceof Error ? error.message : String(error);
      logger.error(`Crawl failed: ${message}`);
      return allResults;
    } finally {
      // Closing can itself fail (e.g. the page is already gone); that must
      // not discard the results being returned.
      try {
        await page.close();
      } catch (closeError) {
        const message = closeError instanceof Error ? closeError.message : String(closeError);
        logger.warn(`Failed to close page: ${message}`);
      }
    }
  },

  /** Placeholder; always returns an empty array. */
  extract() { return []; }
};