Files
bidding_watcher/src/crawler/services/cdt_target.ts
dmy b3d784f1e3 feat: 添加Tailwind CSS支持并修复URL拼接问题
添加Tailwind CSS及相关配置
修复多个爬虫服务中的URL拼接问题,避免双斜杠
调整前端导航菜单项顺序
2026-01-13 18:07:00 +08:00

266 lines
9.2 KiB
TypeScript
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import * as puppeteer from 'puppeteer';
import { Logger } from '@nestjs/common';
/**
 * Moves the mouse pointer to a handful of random viewport positions so the
 * session looks less like an automated client.
 *
 * Performs 5–9 moves; each move is interpolated over 10–29 steps and is
 * followed by a random pause of 100–500 ms. Does nothing when the page
 * reports no viewport.
 *
 * @param page Puppeteer page whose mouse is driven.
 */
async function simulateHumanMouseMovement(page: puppeteer.Page) {
  const bounds = page.viewport();
  if (!bounds) {
    return;
  }

  const randomBelow = (limit: number): number => Math.floor(Math.random() * limit);
  const pause = (ms: number): Promise<void> =>
    new Promise((resolve) => setTimeout(resolve, ms));

  let remainingMoves = 5 + randomBelow(5);
  while (remainingMoves-- > 0) {
    const targetX = randomBelow(bounds.width);
    const targetY = randomBelow(bounds.height);
    // More interpolation steps make the pointer path smoother.
    const steps = 10 + randomBelow(20);
    await page.mouse.move(targetX, targetY, { steps });
    // Random pause between moves.
    await pause(100 + Math.random() * 400);
  }
}
/**
 * Scrolls the page down in several random increments and then back to the
 * top, imitating a person skimming the page.
 *
 * Performs 3–7 smooth downward scrolls of 100–499 px, pausing 500–1500 ms
 * after each one, then smooth-scrolls to the top and waits one second.
 *
 * @param page Puppeteer page to scroll.
 */
async function simulateHumanScrolling(page: puppeteer.Page) {
  const pause = (ms: number): Promise<void> =>
    new Promise((resolve) => setTimeout(resolve, ms));

  const totalScrolls = 3 + Math.floor(Math.random() * 5);
  let performed = 0;
  while (performed < totalScrolls) {
    const deltaY = 100 + Math.floor(Math.random() * 400);
    await page.evaluate((distance) => {
      window.scrollBy({ top: distance, behavior: 'smooth' });
    }, deltaY);
    // Random pause after each scroll.
    await pause(500 + Math.random() * 1000);
    performed += 1;
  }

  // Return to the top of the page.
  await page.evaluate(() => {
    window.scrollTo({ top: 0, behavior: 'smooth' });
  });
  await pause(1000);
}
/**
 * One announcement row extracted from the notice table by `CdtCrawler.extract`.
 */
export interface CdtResult {
/** Announcement title: the trimmed text of the row's `layui-table-link` anchor. */
title: string;
/** Publish time parsed from the row's `publish_time` cell; the extraction time when that cell is empty. */
publishDate: Date;
/** Link to the announcement, built from the anchor's `href` (prefixed with the crawler's base URL when the href does not start with `http`). */
url: string;
}
/**
 * Builds an absolute announcement URL from a scraped `href`.
 *
 * Relative hrefs are joined to `baseUrl` with exactly one slash. Runs of
 * duplicate slashes are collapsed, except for the `//` that follows the
 * scheme's colon, so `https://` is never turned into `https:/`.
 */
function normalizeCdtUrl(href: string, baseUrl: string): string {
  const absolute = /^https?:\/\//i.test(href) ? href : `${baseUrl}/${href}`;
  return absolute.replace(/([^:/])\/{2,}/g, '$1/');
}

/** Returns a printable message for any thrown value (catch variables are `unknown`). */
function describeCdtError(error: unknown): string {
  return error instanceof Error ? error.message : String(error);
}

/**
 * Crawler for the site configured in `url` (tang.cdt-ec.com).
 *
 * `crawl` drives a Puppeteer page through the home page to the tender
 * announcement ("招标公告") list and collects up to five pages of rows;
 * `extract` turns the rendered table HTML into `CdtResult` objects.
 */
export const CdtCrawler = {
  name: '中国大唐集团电子商务平台',
  url: 'https://tang.cdt-ec.com/home/index.html',
  baseUrl: 'https://tang.cdt-ec.com',

  /**
   * Crawls the tender announcement list.
   *
   * Failures after the page has been opened are logged and do not reject:
   * whatever was collected so far is returned. The page is always closed.
   *
   * @param browser Puppeteer browser used to open a new page.
   * @returns The announcements collected from at most five list pages.
   */
  async crawl(browser: puppeteer.Browser): Promise<CdtResult[]> {
    const logger = new Logger('CdtCrawler');
    const page = await browser.newPage();

    // Optional proxy credentials.
    const username = process.env.PROXY_USERNAME;
    const password = process.env.PROXY_PASSWORD;
    if (username && password) {
      await page.authenticate({ username, password });
    }
    await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36');

    const allResults: CdtResult[] = [];
    const maxPages = 5;
    let currentPage = 1;

    const sleep = (ms: number): Promise<void> =>
      new Promise((resolve) => setTimeout(resolve, ms));

    // Runs the mouse-movement and scrolling simulations, logging each step.
    const actLikeHuman = async (phase = ''): Promise<void> => {
      logger.log(`Simulating human mouse movements${phase}...`);
      await simulateHumanMouseMovement(page);
      logger.log(`Simulating human scrolling${phase}...`);
      await simulateHumanScrolling(page);
    };

    try {
      logger.log(`Navigating to ${this.url}...`);
      await page.goto(this.url, { waitUntil: 'networkidle2', timeout: 60000 });
      await actLikeHuman();

      // Click the "招标公告" (tender announcements) tab.
      logger.log('Looking for "招标公告" tab...');
      await page.waitForFunction(() => {
        const tabs = Array.from(document.querySelectorAll('span.notice-tab'));
        return tabs.some(tab => tab.textContent && tab.textContent.includes('招标公告'));
      }, { timeout: 60000 });
      await page.evaluate(() => {
        const tabs = Array.from(document.querySelectorAll('span.notice-tab'));
        const target = tabs.find(tab => tab.textContent && tab.textContent.includes('招标公告')) as HTMLElement;
        if (target) target.click();
      });
      logger.log('Clicked "招标公告" tab.');
      await sleep(2000);
      await actLikeHuman();

      // Click the "更多+" (more) link next to the "招标公告" section title.
      logger.log('Looking for "更多+" link under "招标公告"...');
      await page.waitForFunction(() => {
        const titles = Array.from(document.querySelectorAll('span.h-notice-title'));
        return titles.some(title => title.textContent && title.textContent.includes('招标公告'));
      }, { timeout: 30000 });

      // Start listening before the click so a fast navigation cannot be missed.
      // A missing navigation is tolerated (best effort) but no longer silent.
      const navigation = page
        .waitForNavigation({ waitUntil: 'networkidle2', timeout: 60000 })
        .catch((navError: unknown) => {
          logger.warn(`No navigation observed after clicking "更多+": ${describeCdtError(navError)}`);
        });
      await page.evaluate(() => {
        const titles = Array.from(document.querySelectorAll('span.h-notice-title'));
        const targetTitle = titles.find(title => title.textContent && title.textContent.includes('招标公告'));
        if (targetTitle) {
          const parent = targetTitle.parentElement;
          if (parent) {
            const moreLink = parent.querySelector('a.h-notice-more') as HTMLElement;
            if (moreLink) moreLink.click();
          }
        }
      });
      logger.log('Clicked "更多+" link under "招标公告".');
      await navigation;
      await sleep(3000);
      await actLikeHuman();

      // Wait for the list table to render.
      logger.log('Waiting for table to load...');
      await page.waitForSelector('table.layui-table', { timeout: 30000 });

      // The selector itself excludes a disabled "next" button.
      const nextButtonSelector = 'a.layui-laypage-next:not(.layui-disabled)';

      while (currentPage <= maxPages) {
        // Wait for table rows, then parse the rendered HTML of the current page.
        await page.waitForSelector('tbody tr', { timeout: 10000 });
        const pageResults = this.extract(await page.content());
        if (pageResults.length === 0) {
          logger.warn(`No results found on page ${currentPage}, stopping.`);
          break;
        }
        allResults.push(...pageResults);
        logger.log(`Extracted ${pageResults.length} items from page ${currentPage}`);

        // Do not navigate to a page that would never be extracted.
        if (currentPage >= maxPages) {
          logger.log(`Reached the ${maxPages}-page limit, stopping.`);
          break;
        }

        await actLikeHuman(' before pagination');

        const nextButtonExists = await page.evaluate(
          (selector) => document.querySelector(selector) !== null,
          nextButtonSelector,
        );
        if (!nextButtonExists) {
          logger.log('Next page button not found or disabled. Reached end of list.');
          break;
        }

        logger.log(`Navigating to page ${currentPage + 1}...`);
        try {
          await page.evaluate((selector) => {
            const btn = document.querySelector(selector) as HTMLElement;
            if (btn) btn.click();
          }, nextButtonSelector);

          // Wait until the table's loading indicator is gone (best effort).
          await page
            .waitForFunction(() => !document.querySelector('.layui-table-loading'), { timeout: 30000 })
            .catch((waitError: unknown) => {
              logger.warn(`Table still loading after timeout: ${describeCdtError(waitError)}`);
            });
          // Extra settling time for the new rows to render.
          await sleep(2000);

          // Confirm the pager really advanced by reading the active page number.
          const currentActivePage = await page.evaluate(() => {
            const activeSpan = document.querySelector('.layui-laypage-curr em:last-child');
            return activeSpan ? parseInt(activeSpan.textContent || '1', 10) : 1;
          });
          if (currentActivePage <= currentPage) {
            logger.log('Page did not change, stopping.');
            break;
          }
          currentPage++;

          await actLikeHuman(' after pagination');

          // Random 1000–3000 ms delay between pages.
          const delay = Math.floor(Math.random() * (3000 - 1000 + 1)) + 1000;
          await sleep(delay);
        } catch (navError) {
          logger.error(`Navigation to page ${currentPage + 1} failed: ${describeCdtError(navError)}`);
          break;
        }
      }
      return allResults;
    } catch (error) {
      logger.error(`Failed to crawl ${this.name}: ${describeCdtError(error)}`);
      return allResults;
    } finally {
      await page.close();
    }
  },

  /**
   * Extracts announcements from the rendered list-page HTML.
   *
   * Rows without a title or href are skipped. A missing or unparsable
   * publish time falls back to the current time.
   *
   * @param html Full HTML of a list page containing the layui table.
   * @returns One `CdtResult` per matched table row, in document order.
   */
  extract(html: string): CdtResult[] {
    /**
     * Regex groups for tang.cdt-ec.com:
     * 1: URL
     * 2: Title (project name)
     * 3: Date (publish time)
     */
    const rowPattern = /<tr[^>]*data-index="[^"]*"[^>]*>[\s\S]*?<a[^>]*class="layui-table-link"[^>]*href="([^"]*)"[^>]*>([^<]*)<\/a>[\s\S]*?<td[^>]*data-field="publish_time"[^>]*>[\s\S]*?<div[^>]*class="layui-table-cell[^"]*"[^>]*>([^<]*)<\/div>[\s\S]*?<\/td>[\s\S]*?<\/tr>/g;

    const results: CdtResult[] = [];
    for (const match of html.matchAll(rowPattern)) {
      const url = match[1]?.trim();
      const title = match[2]?.trim();
      const dateStr = match[3]?.trim();
      if (!title || !url) {
        continue;
      }

      const parsed = dateStr ? new Date(dateStr) : new Date();
      results.push({
        title,
        // Never hand an Invalid Date to callers.
        publishDate: Number.isNaN(parsed.getTime()) ? new Date() : parsed,
        url: normalizeCdtUrl(url, this.baseUrl),
      });
    }
    return results;
  }
};