From 090e4121ceb3e254e302e278b0e9d2fa27df7fce Mon Sep 17 00:00:00 2001 From: dmy Date: Mon, 12 Jan 2026 14:53:38 +0800 Subject: [PATCH] =?UTF-8?q?feat:=20=E6=B7=BB=E5=8A=A0=E5=9B=BD=E6=8A=95?= =?UTF-8?q?=E9=9B=86=E5=9B=A2=E7=94=B5=E5=AD=90=E9=87=87=E8=B4=AD=E5=B9=B3?= =?UTF-8?q?=E5=8F=B0=E7=88=AC=E8=99=AB=E5=B9=B6=E6=9B=B4=E6=96=B0=E5=85=B6?= =?UTF-8?q?=E4=BB=96=E5=B9=B3=E5=8F=B0=E5=90=8D=E7=A7=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/crawler/services/bid-crawler.service.ts | 3 +- src/crawler/services/ceic_target.ts | 2 +- src/crawler/services/chdtp_target.ts | 2 +- src/crawler/services/powerbeijing_target.ts | 2 +- src/crawler/services/sdicc_target.spec.ts | 51 +++++ src/crawler/services/sdicc_target.ts | 203 ++++++++++++++++++++ 6 files changed, 259 insertions(+), 4 deletions(-) create mode 100644 src/crawler/services/sdicc_target.spec.ts create mode 100644 src/crawler/services/sdicc_target.ts diff --git a/src/crawler/services/bid-crawler.service.ts b/src/crawler/services/bid-crawler.service.ts index a9d7911..0c846fe 100644 --- a/src/crawler/services/bid-crawler.service.ts +++ b/src/crawler/services/bid-crawler.service.ts @@ -12,6 +12,7 @@ import { CgnpcCrawler } from './cgnpc_target'; import { CeicCrawler } from './ceic_target'; import { EspicCrawler } from './espic_target'; import { PowerbeijingCrawler } from './powerbeijing_target'; +import { SdiccCrawler } from './sdicc_target'; @Injectable() export class BidCrawlerService { @@ -65,7 +66,7 @@ export class BidCrawlerService { args, }); - const crawlers = [ChdtpCrawler, ChngCrawler, SzecpCrawler, CdtCrawler, EpsCrawler, CnncecpCrawler, CgnpcCrawler, CeicCrawler, EspicCrawler, PowerbeijingCrawler]; + const crawlers = [ChdtpCrawler, ChngCrawler, SzecpCrawler, CdtCrawler, EpsCrawler, CnncecpCrawler, CgnpcCrawler, CeicCrawler, EspicCrawler, PowerbeijingCrawler, SdiccCrawler]; try { for (const crawler of crawlers) { diff --git 
a/src/crawler/services/ceic_target.ts b/src/crawler/services/ceic_target.ts index b7266b7..fcf6417 100644 --- a/src/crawler/services/ceic_target.ts +++ b/src/crawler/services/ceic_target.ts @@ -48,7 +48,7 @@ async function simulateHumanScrolling(page: puppeteer.Page) { } export const CeicCrawler = { - name: '大连能源采购平台', + name: '国家能源集团生态协作平台', url: 'https://ceic.dlnyzb.com/3001', baseUrl: 'https://ceic.dlnyzb.com', diff --git a/src/crawler/services/chdtp_target.ts b/src/crawler/services/chdtp_target.ts index 7cb5e65..b6fe939 100644 --- a/src/crawler/services/chdtp_target.ts +++ b/src/crawler/services/chdtp_target.ts @@ -8,7 +8,7 @@ export interface ChdtpResult { } export const ChdtpCrawler = { - name: '中国华能集团', + name: '华电集团电子商务平台 ', url: 'https://www.chdtp.com/webs/queryWebZbgg.action?zbggType=1', baseUrl: 'https://www.chdtp.com/webs/', diff --git a/src/crawler/services/powerbeijing_target.ts b/src/crawler/services/powerbeijing_target.ts index 8d11c16..631b499 100644 --- a/src/crawler/services/powerbeijing_target.ts +++ b/src/crawler/services/powerbeijing_target.ts @@ -53,7 +53,7 @@ export interface PowerbeijingResult { } export const PowerbeijingCrawler = { - name: '北京电力交易平台', + name: '北京京能电子商务平台', url: 'https://www.powerbeijing-ec.com/jncms/search/bulletin.html?dates=300&categoryId=2&tabName=%E6%8B%9B%E6%A0%87%E5%85%AC%E5%91%8A&page=1', baseUrl: 'https://www.powerbeijing-ec.com', diff --git a/src/crawler/services/sdicc_target.spec.ts b/src/crawler/services/sdicc_target.spec.ts new file mode 100644 index 0000000..b9b0522 --- /dev/null +++ b/src/crawler/services/sdicc_target.spec.ts @@ -0,0 +1,51 @@ +import { SdiccCrawler } from './sdicc_target'; +import * as puppeteer from 'puppeteer'; + +// Increase timeout to 5 minutes for network operations +jest.setTimeout(60000*5); + +describe('SdiccCrawler Real Site Test', () => { + let browser: puppeteer.Browser; + + beforeAll(async () => { + browser = await puppeteer.launch({ + headless: false, // Change to false to see 
browser UI + args: ['--no-sandbox', '--disable-setuid-sandbox'], + }); + }); + + afterAll(async () => { + if (browser) { + await browser.close(); + } + }); + + it('should visit website and list all found bid information', async () => { + console.log(`\nStarting crawl for: ${SdiccCrawler.name}`); + console.log(`Target URL: ${SdiccCrawler.url}`); + + const results = await SdiccCrawler.crawl(browser); + + console.log(`\nSuccessfully found ${results.length} items:\n`); + console.log('----------------------------------------'); + results.forEach((item, index) => { + console.log(`${index + 1}. [${item.publishDate.toLocaleDateString()}] ${item.title}`); + console.log(` Link: ${item.url}`); + console.log('----------------------------------------'); + }); + + // Basic assertions to ensure crawler is working + expect(results).toBeDefined(); + expect(Array.isArray(results)).toBeTruthy(); + // Warn but don't fail if site returns 0 items (could be empty or changed structure) + if (results.length === 0) { + console.warn('Warning: No items found. 
Check if website structure has changed or if list is currently empty.'); + } else { + // Check data integrity of first item + const firstItem = results[0]; + expect(firstItem.title).toBeTruthy(); + expect(firstItem.url).toMatch(/^https?:\/\//); + expect(firstItem.publishDate).toBeInstanceOf(Date); + } + }); +}); diff --git a/src/crawler/services/sdicc_target.ts b/src/crawler/services/sdicc_target.ts new file mode 100644 index 0000000..7345776 --- /dev/null +++ b/src/crawler/services/sdicc_target.ts @@ -0,0 +1,203 @@ +import * as puppeteer from 'puppeteer'; +import { Logger } from '@nestjs/common'; + +// 模拟人类鼠标移动 +async function simulateHumanMouseMovement(page: puppeteer.Page) { + const viewport = page.viewport(); + if (!viewport) return; + + const movements = 5 + Math.floor(Math.random() * 5); // 5-10次随机移动 + + for (let i = 0; i < movements; i++) { + const x = Math.floor(Math.random() * viewport.width); + const y = Math.floor(Math.random() * viewport.height); + + await page.mouse.move(x, y, { + steps: 10 + Math.floor(Math.random() * 20) // 10-30步,使移动更平滑 + }); + + // 随机停顿 100-500ms + await new Promise(r => setTimeout(r, 100 + Math.random() * 400)); + } +} + +// 模拟人类滚动 +async function simulateHumanScrolling(page: puppeteer.Page) { + const scrollCount = 3 + Math.floor(Math.random() * 5); // 3-7次滚动 + + for (let i = 0; i < scrollCount; i++) { + const scrollDistance = 100 + Math.floor(Math.random() * 400); // 100-500px + + await page.evaluate((distance) => { + window.scrollBy({ + top: distance, + behavior: 'smooth' + }); + }, scrollDistance); + + // 随机停顿 500-1500ms + await new Promise(r => setTimeout(r, 500 + Math.random() * 1000)); + } + + // 滚动回顶部 + await page.evaluate(() => { + window.scrollTo({ top: 0, behavior: 'smooth' }); + }); + await new Promise(r => setTimeout(r, 1000)); +} + +export interface SdiccResult { + title: string; + publishDate: Date; + url: string; +} + +export const SdiccCrawler = { + name: '国投集团电子采购平台', + url: 'https://www.sdicc.com.cn/cgxx/ggList', 
+ baseUrl: 'https://www.sdicc.com.cn', + + async crawl(browser: puppeteer.Browser): Promise { + const logger = new Logger('SdiccCrawler'); + const page = await browser.newPage(); + + const username = process.env.PROXY_USERNAME; + const password = process.env.PROXY_PASSWORD; + if (username && password) { + await page.authenticate({ username, password }); + } + + await page.evaluateOnNewDocument(() => { + Object.defineProperty(navigator, 'webdriver', { get: () => false }); + Object.defineProperty(navigator, 'language', { get: () => "zh-CN"}); + Object.defineProperty(navigator, 'plugins', { get: () => [1,2,3,4,5]}); + }); + + await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36'); + await page.setViewport({ width: 1920, height: 1080 }); + + const allResults: SdiccResult[] = []; + let currentPage = 1; + const maxPages = 5; + + try { + logger.log(`Navigating to ${this.url}...`); + await page.goto(this.url, { waitUntil: 'networkidle2', timeout: 60000 }); + + // 模拟人类行为 + logger.log('Simulating human mouse movements...'); + await simulateHumanMouseMovement(page); + + logger.log('Simulating human scrolling...'); + await simulateHumanScrolling(page); + + // 等待表格加载 + logger.log('Waiting for table to load...'); + await page.waitForSelector('.tbody table tbody tr', { timeout: 30000 }).catch(() => { + logger.warn('Table rows not found, trying alternative selectors...'); + }); + + while (currentPage <= maxPages) { + logger.log(`Processing page ${currentPage}...`); + + const content = await page.content(); + const pageResults = this.extract(content); + + if (pageResults.length === 0) { + logger.warn(`No results found on page ${currentPage}, stopping.`); + break; + } + + allResults.push(...pageResults); + logger.log(`Extracted ${pageResults.length} items from page ${currentPage}`); + + // 模拟人类行为 - 翻页前 + logger.log('Simulating human mouse movements before pagination...'); + await 
simulateHumanMouseMovement(page); + + logger.log('Simulating human scrolling before pagination...'); + await simulateHumanScrolling(page); + + // 查找下一页按钮 + const nextButtonSelector = '#page_btnLas'; + const nextButton = await page.$(nextButtonSelector); + + if (!nextButton) { + logger.log('Next page button not found. Reached end of list.'); + break; + } + + logger.log(`Navigating to page ${currentPage + 1}...`); + + try { + // 点击下一页按钮 + await nextButton.click(); + await page.waitForNavigation({ waitUntil: 'networkidle2', timeout: 60000 }).catch(() => {}); + await new Promise(r => setTimeout(r, 2000)); // 额外等待确保数据加载完成 + } catch (navError) { + logger.error(`Navigation to page ${currentPage + 1} failed: ${navError.message}`); + break; + } + + currentPage++; + + // 模拟人类行为 - 翻页后 + logger.log('Simulating human mouse movements after pagination...'); + await simulateHumanMouseMovement(page); + + logger.log('Simulating human scrolling after pagination...'); + await simulateHumanScrolling(page); + + // Random delay between pages + const delay = Math.floor(Math.random() * (3000 - 1000 + 1)) + 1000; + await new Promise(resolve => setTimeout(resolve, delay)); + } + + return allResults; + + } catch (error) { + logger.error(`Failed to crawl ${this.name}: ${error.message}`); + return allResults; + } finally { + await page.close(); + } + }, + + extract(html: string): SdiccResult[] { + const results: SdiccResult[] = []; + /** + * Regex groups for sdicc.com.cn: + * 1: ggGuid (announcement GUID, first urlChange argument) + * 2: gcGuid (project GUID, second urlChange argument) + * 3: Title (announcement / project name) + * 4: Date (publish date, format: 2026-01-09) + * + * HTML结构示例: + * + * 1 + * 国投罗钾公司硫酸钾厂球磨机控制系统升级项目公开招标公告 + * 服务 + * 2026-01-09 + * + */ + const regex = /]*onclick="urlChange\('([^']+)','([^']+)'\)"[^>]*>[\s\S]*?]*>]*>([^<]+)<\/span><\/td>[\s\S]*?]*>]*>\s*(\d{4}-\d{2}-\d{2})\s*<\/span><\/td>[\s\S]*?<\/tr>/gs; + + let match; + while ((match = regex.exec(html)) !== null) { + const ggGuid = match[1]?.trim(); + const gcGuid = match[2]?.trim(); + const title = 
match[3]?.trim(); + const dateStr = match[4]?.trim(); + + if (title && ggGuid && gcGuid) { + results.push({ + title, + publishDate: dateStr ? new Date(dateStr) : new Date(), + url: `${this.baseUrl}/cgxx/ggDetail?gcGuid=${gcGuid}&ggGuid=${ggGuid}` + }); + } + } + + return results; + } +};