Files
bidding_watcher/src/crawler/services/ceic_target.ts
dmy b3d784f1e3 feat: 添加Tailwind CSS支持并修复URL拼接问题
添加Tailwind CSS及相关配置
修复多个爬虫服务中的URL拼接问题,避免双斜杠
调整前端导航菜单项顺序
2026-01-13 18:07:00 +08:00

169 lines
5.5 KiB
TypeScript
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import * as puppeteer from 'puppeteer';
import { Logger } from '@nestjs/common';
import { ChdtpResult } from './chdtp_target';
/**
 * Moves the mouse pointer to a series of random positions inside the page
 * viewport, pausing briefly after each move.
 *
 * Does nothing when the page reports no viewport. Otherwise performs 5-9
 * moves; each move is interpolated over 10-29 steps and is followed by a
 * pause of 100-500 ms.
 *
 * @param page Puppeteer page whose mouse is driven.
 */
async function simulateHumanMouseMovement(page: puppeteer.Page) {
  const size = page.viewport();
  if (!size) {
    return;
  }

  // Uniform random integer in [0, span).
  const randomInt = (span: number): number => Math.floor(Math.random() * span);
  const pause = (ms: number) => new Promise(resolve => setTimeout(resolve, ms));

  let remaining = 5 + randomInt(5);
  while (remaining > 0) {
    const targetX = randomInt(size.width);
    const targetY = randomInt(size.height);
    // More steps make the pointer travel more smoothly.
    const steps = 10 + randomInt(20);
    await page.mouse.move(targetX, targetY, { steps });
    await pause(100 + Math.random() * 400);
    remaining -= 1;
  }
}
/**
 * Scrolls the page down in a few random increments and then back to the top.
 *
 * Performs 3-7 smooth scrolls of 100-499 px each, pausing 500-1500 ms after
 * every scroll, then smooth-scrolls to the top and waits one second.
 *
 * @param page Puppeteer page to scroll.
 */
async function simulateHumanScrolling(page: puppeteer.Page) {
  const sleep = (ms: number) => new Promise(resolve => setTimeout(resolve, ms));

  const totalScrolls = 3 + Math.floor(Math.random() * 5);
  let performed = 0;
  while (performed < totalScrolls) {
    const offset = 100 + Math.floor(Math.random() * 400);
    // The callback runs in the browser, so the offset is passed as an argument.
    await page.evaluate((dy) => {
      window.scrollBy({ top: dy, behavior: 'smooth' });
    }, offset);
    await sleep(500 + Math.random() * 1000);
    performed += 1;
  }

  // Return to the top of the page.
  await page.evaluate(() => {
    window.scrollTo({ top: 0, behavior: 'smooth' });
  });
  await sleep(1000);
}
/**
 * Turns an href scraped from the list page into an absolute URL.
 *
 * Relative hrefs are resolved against `baseUrl`; absolute hrefs are kept.
 * Runs of slashes are collapsed in the path only, so the `//` after the
 * scheme is never touched. If the href cannot be parsed it is returned as-is.
 */
function resolveCeicUrl(href: string, baseUrl: string): string {
  try {
    const resolved = new URL(href, baseUrl);
    resolved.pathname = resolved.pathname.replace(/\/{2,}/g, '/');
    return resolved.href;
  } catch {
    return href;
  }
}

/**
 * Parses the scraped publish-date text. Falls back to the current time when
 * the text is empty or does not parse to a valid date.
 */
function parseCeicPublishDate(dateStr: string): Date {
  if (!dateStr) return new Date();
  const parsed = new Date(dateStr);
  return Number.isNaN(parsed.getTime()) ? new Date() : parsed;
}

/** Applies proxy credentials, navigator overrides, user agent and viewport. */
async function prepareCeicPage(page: puppeteer.Page): Promise<void> {
  const username = process.env.PROXY_USERNAME;
  const password = process.env.PROXY_PASSWORD;
  if (username && password) {
    await page.authenticate({ username, password });
  }

  // Override navigator properties before any page script runs.
  await page.evaluateOnNewDocument(() => {
    Object.defineProperty(navigator, 'webdriver', { get: () => false });
    Object.defineProperty(navigator, 'language', { get: () => 'zh-CN' });
    Object.defineProperty(navigator, 'plugins', { get: () => [1, 2, 3, 4, 5] });
  });
  await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36');
  await page.setViewport({ width: 1920, height: 1080 });
}

/**
 * Crawler for the list page at `url`; scraped links are resolved against
 * `baseUrl`.
 */
export const CeicCrawler = {
  name: '国家能源集团生态协作平台',
  url: 'https://ceic.dlnyzb.com/3001',
  baseUrl: 'https://ceic.dlnyzb.com/',

  /**
   * Opens a new page, walks up to five result pages and collects every list
   * entry that has a link and a title of at least five characters.
   *
   * Errors raised while navigating or scraping are logged and the items
   * collected so far are returned. Errors raised while opening or configuring
   * the page propagate to the caller. The page is closed in every case.
   *
   * @param browser Puppeteer browser used to open the page.
   * @returns Collected items with absolute URLs and parsed publish dates.
   */
  async crawl(browser: puppeteer.Browser): Promise<ChdtpResult[]> {
    const logger = new Logger('CeicCrawler');
    const page = await browser.newPage();
    const allResults: ChdtpResult[] = [];
    const maxPages = 5;

    try {
      await prepareCeicPage(page);

      try {
        logger.log(`Navigating to ${this.url}...`);
        await page.goto(this.url, { waitUntil: 'networkidle2', timeout: 60000 });

        // Simulate human behaviour before reading the list.
        logger.log('Simulating human mouse movements...');
        await simulateHumanMouseMovement(page);
        logger.log('Simulating human scrolling...');
        await simulateHumanScrolling(page);

        for (let currentPage = 1; currentPage <= maxPages; currentPage++) {
          logger.log(`Processing page ${currentPage}...`);

          // Wait for the MUI list items; a timeout is only a warning because
          // the extraction below reports an empty page on its own.
          try {
            await page.waitForFunction(() => {
              return document.querySelectorAll('li.MuiListItem-root').length > 0;
            }, { timeout: 60000 });
          } catch {
            logger.warn('Content not found. Site might be slow.');
          }

          // Runs in the browser: must not reference anything from this module.
          const pageResults = await page.evaluate(() => {
            const results: { title: string; dateStr: string; url: string }[] = [];
            // Accept both the ASCII and the fullwidth colon after the label.
            const dateLabel = /发布时间[::]/;
            const listItems = Array.from(document.querySelectorAll('li.MuiListItem-root'));
            for (const item of listItems) {
              // NOTE(review): 'css-1vdw90h' looks like a generated class name
              // and may change when the site is rebuilt.
              const titleLink = item.querySelector('a.css-1vdw90h');
              const title = titleLink?.textContent?.trim() || '';
              const href = titleLink?.getAttribute('href') || '';

              // The publish date is in the first <p> carrying the label.
              let dateStr = '';
              for (const p of Array.from(item.querySelectorAll('p'))) {
                const text = p.textContent || '';
                if (dateLabel.test(text)) {
                  dateStr = text.replace(dateLabel, '').trim();
                  break;
                }
              }

              if (title.length >= 5 && href) {
                results.push({ title, dateStr, url: href });
              }
            }
            return results;
          });

          if (pageResults.length === 0) {
            logger.warn(`No results found on page ${currentPage}. Extraction failed.`);
            break;
          }

          for (const row of pageResults) {
            allResults.push({
              title: row.title,
              publishDate: parseCeicPublishDate(row.dateStr),
              url: resolveCeicUrl(row.url, this.baseUrl)
            });
          }
          logger.log(`Extracted ${pageResults.length} items.`);

          // Do not navigate past the last page that will be processed.
          if (currentPage === maxPages) break;

          // Pagination: MUI "next page" link.
          const nextButton = await page.$('a[aria-label="Go to next page"]');
          if (!nextButton) break;
          await nextButton.click();
          await new Promise(r => setTimeout(r, 3000));

          // Simulate human behaviour on the newly loaded page.
          logger.log('Simulating human mouse movements...');
          await simulateHumanMouseMovement(page);
          logger.log('Simulating human scrolling...');
          await simulateHumanScrolling(page);
        }
      } catch (error: unknown) {
        const message = error instanceof Error ? error.message : String(error);
        logger.error(`Crawl failed: ${message}`);
      }

      return allResults;
    } finally {
      // A failure to close the page must not discard the collected results.
      try {
        await page.close();
      } catch (closeError: unknown) {
        const message = closeError instanceof Error ? closeError.message : String(closeError);
        logger.warn(`Failed to close page: ${message}`);
      }
    }
  },

  /**
   * Always returns an empty array; this crawler extracts its data inside
   * `crawl`. Presumably kept for parity with other crawlers — confirm.
   */
  extract() { return []; }
};