From 3454d9d07f341fe1620f00782f709796d9e165db Mon Sep 17 00:00:00 2001
From: dmy
Date: Mon, 12 Jan 2026 00:01:38 +0800
Subject: [PATCH] =?UTF-8?q?feat:=20=E6=96=B0=E5=A2=9E=E7=94=B5=E8=83=BDe?=
 =?UTF-8?q?=E6=8B=9B=E9=87=87=E5=B9=B3=E5=8F=B0=E7=88=AC=E8=99=AB?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- 新增 EspicCrawler 爬虫类,支持电能e招采平台招标信息抓取
- 目标平台:https://ebid.espic.com.cn,专注于电力行业电子招投标
- 智能 URL 生成:使用 getUrl() 方法动态生成带时间戳的分页链接
- 增强反 WAF 功能:等待人机识别检测通过,避免被防火墙拦截
- 多重选择器策略:使用多种选择器组合查找下一页按钮,提高翻页成功率
- 使用正则表达式提取招标信息,支持标题、URL和发布日期解析
- 添加完整的人类行为模拟功能(鼠标移动、滚动)降低检测风险
- 支持分页抓取,最多抓取5页数据
- 添加完整的单元测试文件,确保爬虫功能正常
- 统一的错误处理机制,单个爬虫失败不影响整体任务执行
---
 src/crawler/services/espic_target.spec.ts |  51 +++
 src/crawler/services/espic_target.ts      | 344 ++++++++++++++++++++++
 2 files changed, 395 insertions(+)
 create mode 100644 src/crawler/services/espic_target.spec.ts
 create mode 100644 src/crawler/services/espic_target.ts

diff --git a/src/crawler/services/espic_target.spec.ts b/src/crawler/services/espic_target.spec.ts
new file mode 100644
index 0000000..50bbf6a
--- /dev/null
+++ b/src/crawler/services/espic_target.spec.ts
@@ -0,0 +1,51 @@
+import { EspicCrawler } from './espic_target';
+import * as puppeteer from 'puppeteer';
+
+// Allow 5 minutes per test: the crawl hits the real site and simulates human behaviour
+jest.setTimeout(60000 * 5);
+
+describe('EspicCrawler Real Site Test', () => {
+  let browser: puppeteer.Browser;
+
+  beforeAll(async () => {
+    browser = await puppeteer.launch({
+      headless: false, // Headed so the browser UI is visible; set to true to hide it
+      args: ['--no-sandbox', '--disable-setuid-sandbox'],
+    });
+  });
+
+  afterAll(async () => {
+    if (browser) {
+      await browser.close();
+    }
+  });
+
+  it('should visit website and list all found bid information', async () => {
+    console.log(`\nStarting crawl for: ${EspicCrawler.name}`);
+    console.log(`Target URL: ${EspicCrawler.getUrl()}`);
+
+    const results = await EspicCrawler.crawl(browser);
+
+    console.log(`\nSuccessfully found ${results.length} items:\n`);
+    console.log('----------------------------------------');
+    results.forEach((item, index) => {
+      console.log(`${index + 1}. [${item.publishDate.toLocaleDateString()}] ${item.title}`);
+      console.log(`   Link: ${item.url}`);
+      console.log('----------------------------------------');
+    });
+
+    // Basic assertions to ensure crawler is working
+    expect(results).toBeDefined();
+    expect(Array.isArray(results)).toBeTruthy();
+    // Warn but don't fail if site returns 0 items (could be empty or changed structure)
+    if (results.length === 0) {
+      console.warn('Warning: No items found. Check if website structure has changed or if list is currently empty.');
+    } else {
+      // Check data integrity of first item
+      const firstItem = results[0];
+      expect(firstItem.title).toBeTruthy();
+      expect(firstItem.url).toMatch(/^https?:\/\//);
+      expect(firstItem.publishDate).toBeInstanceOf(Date);
+    }
+  });
+});
diff --git a/src/crawler/services/espic_target.ts b/src/crawler/services/espic_target.ts
new file mode 100644
index 0000000..0b7959a
--- /dev/null
+++ b/src/crawler/services/espic_target.ts
@@ -0,0 +1,344 @@
+import * as puppeteer from 'puppeteer';
+import { Logger } from '@nestjs/common';
+
+/** Visible text of the "next page" pagination link. */
+const NEXT_PAGE_TEXT = '下一页';
+
+/**
+ * CSS fallbacks tried in order when no link carries the next-page text.
+ * `a[onclick*="turnPage"]` comes last because it matches any pagination link,
+ * not only the next-page one.
+ */
+const NEXT_PAGE_SELECTORS = [
+  'a[aria-label="Next"]',
+  'a.next',
+  'li.next a',
+  'a.layui-laypage-next:not(.layui-disabled)',
+  'a[onclick*="turnPage"]',
+];
+
+// Patterns used by EspicCrawler.extract(); see its doc comment for the markup.
+const LIST_ITEM_REGEX = /<li\b[^>]*>([\s\S]*?)<\/li>/gi;
+const ANCHOR_TAG_REGEX = /<a\b([^>]*)>/gi;
+const HREF_ATTR_REGEX = /(?:^|\s)href="([^"]*)"/i;
+const TITLE_ATTR_REGEX = /(?:^|\s)title="([^"]*)"/i;
+const PUBLISH_DATE_REGEX = /<div\b[^>]*>\s*(\d{4}-\d{2}-\d{2})\s*<\/div>/i;
+
+/** Resolves after `ms` milliseconds. */
+function sleep(ms: number): Promise<void> {
+  return new Promise((resolve) => setTimeout(resolve, ms));
+}
+
+/** Returns a printable message for an arbitrary thrown value. */
+function errorMessage(error: unknown): string {
+  return error instanceof Error ? error.message : String(error);
+}
+
+/** Decodes the HTML entities that commonly appear in attribute values. */
+function decodeHtmlEntities(value: string): string {
+  return value
+    .replace(/&lt;/g, '<')
+    .replace(/&gt;/g, '>')
+    .replace(/&quot;/g, '"')
+    .replace(/&#0?39;/g, "'")
+    .replace(/&amp;/g, '&'); // last, so "&amp;lt;" is not decoded twice
+}
+
+/** Reads a double-quoted attribute from the inside of an opening tag. */
+function readAttribute(attributes: string, pattern: RegExp): string | undefined {
+  const raw = pattern.exec(attributes)?.[1];
+  return raw === undefined ? undefined : decodeHtmlEntities(raw).trim();
+}
+
+/**
+ * Resolves `href` against `baseUrl`. Absolute http(s) links are returned
+ * unchanged; links that do not resolve to http(s) (e.g. `javascript:`) yield
+ * undefined.
+ */
+function resolveUrl(href: string, baseUrl: string): string | undefined {
+  if (href.startsWith('http')) return href;
+  try {
+    const resolved = new URL(href, baseUrl);
+    return /^https?:$/.test(resolved.protocol) ? resolved.href : undefined;
+  } catch {
+    return undefined;
+  }
+}
+
+/**
+ * Simulates human mouse movement: moves the pointer to 5-9 random viewport
+ * positions, pausing 100-500 ms after each move. Does nothing when the page
+ * has no viewport.
+ */
+async function simulateHumanMouseMovement(page: puppeteer.Page): Promise<void> {
+  const viewport = page.viewport();
+  if (!viewport) return;
+
+  const movements = 5 + Math.floor(Math.random() * 5);
+
+  for (let i = 0; i < movements; i++) {
+    const x = Math.floor(Math.random() * viewport.width);
+    const y = Math.floor(Math.random() * viewport.height);
+
+    // 10-29 intermediate steps make the movement smoother.
+    await page.mouse.move(x, y, {
+      steps: 10 + Math.floor(Math.random() * 20),
+    });
+
+    await sleep(100 + Math.random() * 400);
+  }
+}
+
+/**
+ * Simulates human scrolling: scrolls down 3-7 times by 100-499 px, pausing
+ * 500-1500 ms after each scroll, then scrolls back to the top.
+ */
+async function simulateHumanScrolling(page: puppeteer.Page): Promise<void> {
+  const scrollCount = 3 + Math.floor(Math.random() * 5);
+
+  for (let i = 0; i < scrollCount; i++) {
+    const scrollDistance = 100 + Math.floor(Math.random() * 400);
+
+    await page.evaluate((distance) => {
+      window.scrollBy({
+        top: distance,
+        behavior: 'smooth',
+      });
+    }, scrollDistance);
+
+    await sleep(500 + Math.random() * 1000);
+  }
+
+  // Scroll back to the top.
+  await page.evaluate(() => {
+    window.scrollTo({ top: 0, behavior: 'smooth' });
+  });
+  await sleep(1000);
+}
+
+/**
+ * Runs the mouse and scroll simulations in sequence, logging each step.
+ * `stage` is appended to the log text (e.g. " before pagination").
+ */
+async function simulateHumanBehavior(
+  page: puppeteer.Page,
+  logger: Logger,
+  stage = '',
+): Promise<void> {
+  logger.log(`Simulating human mouse movements${stage}...`);
+  await simulateHumanMouseMovement(page);
+
+  logger.log(`Simulating human scrolling${stage}...`);
+  await simulateHumanScrolling(page);
+}
+
+/**
+ * Locates the next-page control, or returns null when there is none.
+ *
+ * Links are matched by visible text first, because `:contains()` is a jQuery
+ * extension rather than valid CSS and cannot be passed to `page.$()`.
+ */
+async function findNextPageButton(
+  page: puppeteer.Page,
+): Promise<puppeteer.ElementHandle<Element> | null> {
+  let byText: puppeteer.ElementHandle<Element> | null = null;
+  for (const anchor of await page.$$('a')) {
+    if (!byText) {
+      const text = await anchor.evaluate((el) => el.textContent ?? '');
+      if (text.includes(NEXT_PAGE_TEXT)) {
+        byText = anchor;
+        continue;
+      }
+    }
+    await anchor.dispose(); // release handles that are not returned
+  }
+  if (byText) return byText;
+
+  for (const selector of NEXT_PAGE_SELECTORS) {
+    const handle = await page.$(selector);
+    if (handle) return handle;
+  }
+  return null;
+}
+
+export interface EspicResult {
+  title: string;
+  publishDate: Date;
+  url: string;
+}
+
+export const EspicCrawler = {
+  name: '电能e招采平台',
+  baseUrl: 'https://ebid.espic.com.cn',
+
+  /**
+   * Builds the listing URL for `page` (1-based). The `time` query parameter
+   * carries the current local date, not zero-padded.
+   */
+  getUrl(page: number = 1): string {
+    const now = new Date();
+    const year = now.getFullYear();
+    const month = now.getMonth() + 1; // getMonth() is zero-based
+    const day = now.getDate();
+    const timeStr = `${year}-${month}-${day}`;
+    return `https://ebid.espic.com.cn/newgdtcms//category/iframe.html?dates=300&categoryId=2&tenderMethod=01&tabName=&page=${page}&time=${timeStr}`;
+  },
+
+  /**
+   * Crawls up to five listing pages and returns the bids found.
+   *
+   * Errors raised while crawling are logged and whatever was collected so far
+   * is returned. The page opened here is closed in all cases.
+   */
+  async crawl(browser: puppeteer.Browser): Promise<EspicResult[]> {
+    const logger = new Logger('EspicCrawler');
+    const allResults: EspicResult[] = [];
+    const seenUrls = new Set<string>();
+    const maxPages = 5;
+    let currentPage = 1;
+
+    const page = await browser.newPage();
+
+    try {
+      const username = process.env.PROXY_USERNAME;
+      const password = process.env.PROXY_PASSWORD;
+      if (username && password) {
+        await page.authenticate({ username, password });
+      }
+
+      // Mask the most common automation fingerprints before any page script runs.
+      await page.evaluateOnNewDocument(() => {
+        Object.defineProperty(navigator, 'webdriver', { get: () => false });
+        Object.defineProperty(navigator, 'language', { get: () => 'zh-CN' });
+        Object.defineProperty(navigator, 'plugins', { get: () => [1, 2, 3, 4, 5] });
+      });
+
+      await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36');
+      await page.setViewport({ width: 1920, height: 1080 });
+
+      const url = this.getUrl(currentPage);
+      logger.log(`Navigating to ${url}...`);
+      await page.goto(url, { waitUntil: 'networkidle2', timeout: 60000 });
+
+      // Wait until the WAF challenge text is gone from the page body.
+      logger.log('Waiting for WAF verification...');
+      await page.waitForFunction(
+        () => {
+          const bodyText = document.body?.textContent || '';
+          return !bodyText.includes('人机识别检测') && !bodyText.includes('WEB 应用防火墙');
+        },
+        { timeout: 30000 },
+      );
+
+      await simulateHumanBehavior(page, logger);
+
+      while (currentPage <= maxPages) {
+        logger.log(`Processing page ${currentPage}...`);
+
+        const pageResults = this.extract(await page.content());
+        if (pageResults.length === 0) {
+          logger.warn(`No results found on page ${currentPage}, stopping.`);
+          break;
+        }
+
+        // Keep only unseen URLs: a pagination click that does not advance
+        // (e.g. on the last page) would otherwise append the same page again.
+        let added = 0;
+        for (const item of pageResults) {
+          if (seenUrls.has(item.url)) continue;
+          seenUrls.add(item.url);
+          allResults.push(item);
+          added++;
+        }
+        if (added === 0) {
+          logger.warn(`No new results on page ${currentPage}, stopping.`);
+          break;
+        }
+        logger.log(`Extracted ${added} items from page ${currentPage}`);
+
+        // Nothing left to fetch; skip the pointless click and waits.
+        if (currentPage === maxPages) break;
+
+        await simulateHumanBehavior(page, logger, ' before pagination');
+
+        const nextButton = await findNextPageButton(page);
+        if (!nextButton) {
+          logger.log('Next page button not found. Reached end of list.');
+          break;
+        }
+
+        logger.log(`Navigating to page ${currentPage + 1}...`);
+        try {
+          await nextButton.click();
+          await sleep(3000); // give the next page time to load
+        } catch (navError: unknown) {
+          logger.error(`Navigation to page ${currentPage + 1} failed: ${errorMessage(navError)}`);
+          break;
+        }
+
+        currentPage++;
+
+        await simulateHumanBehavior(page, logger, ' after pagination');
+
+        // Random 1-3 s delay between pages.
+        await sleep(Math.floor(Math.random() * (3000 - 1000 + 1)) + 1000);
+      }
+
+      return allResults;
+    } catch (error: unknown) {
+      logger.error(`Failed to crawl ${this.name}: ${errorMessage(error)}`);
+      return allResults;
+    } finally {
+      await page.close();
+    }
+  },
+
+  /**
+   * Extracts bid entries from listing HTML.
+   *
+   * Each entry is an `<li>` holding an `<a>` with double-quoted `href` and
+   * `title` attributes and a `<div>` whose only text is the publish date
+   * (YYYY-MM-DD). Simplified shape; attribute details are omitted:
+   *
+   *   <li>
+   *     <a href="/path/to/detail.html" title="Full tender name">
+   *       <div>
+   *         <div>Truncated tender name...</div>
+   *         <div>...tender number, method and sign-up deadline...</div>
+   *       </div>
+   *       <div><div>2026-01-11</div></div>
+   *     </a>
+   *   </li>
+   *
+   * Items are parsed one `<li>` at a time so that an entry lacking a date
+   * cannot borrow the date of a later entry. Entries without a usable link,
+   * title or date are skipped. Dates are parsed with `new Date('YYYY-MM-DD')`,
+   * i.e. as UTC midnight.
+   */
+  extract(html: string): EspicResult[] {
+    const results: EspicResult[] = [];
+
+    for (const item of html.matchAll(LIST_ITEM_REGEX)) {
+      const body = item[1] ?? '';
+
+      const dateStr = PUBLISH_DATE_REGEX.exec(body)?.[1];
+      if (!dateStr) continue;
+      const publishDate = new Date(dateStr);
+      if (Number.isNaN(publishDate.getTime())) continue;
+
+      // Use the first anchor that carries both a link and a title.
+      for (const anchor of body.matchAll(ANCHOR_TAG_REGEX)) {
+        const attributes = anchor[1] ?? '';
+        const href = readAttribute(attributes, HREF_ATTR_REGEX);
+        const title = readAttribute(attributes, TITLE_ATTR_REGEX);
+        if (!href || !title) continue;
+
+        const url = resolveUrl(href, this.baseUrl);
+        if (url) results.push({ title, publishDate, url });
+        break;
+      }
+    }
+
+    return results;
+  },
+};