diff --git a/src/crawler/services/bid-crawler.service.ts b/src/crawler/services/bid-crawler.service.ts index 00d7373..8f4d973 100644 --- a/src/crawler/services/bid-crawler.service.ts +++ b/src/crawler/services/bid-crawler.service.ts @@ -1,7 +1,9 @@ import { Injectable, Logger } from '@nestjs/common'; +import { ConfigService } from '@nestjs/config'; import * as puppeteer from 'puppeteer'; import { BidsService } from '../../bids/services/bid.service'; import { ChdtpCrawler } from './chdtp_target'; +import { ChngCrawler } from './chng_target'; @Injectable() export class BidCrawlerService { @@ -9,33 +11,64 @@ export class BidCrawlerService { constructor( private bidsService: BidsService, + private configService: ConfigService, ) {} async crawlAll() { this.logger.log('Starting crawl task with Puppeteer...'); + // 从环境变量读取代理配置 + const proxyHost = this.configService.get('PROXY_HOST'); + const proxyPort = this.configService.get('PROXY_PORT'); + const proxyUsername = this.configService.get('PROXY_USERNAME'); + const proxyPassword = this.configService.get('PROXY_PASSWORD'); + + // 构建代理参数 + const args = [ + '--no-sandbox', + '--disable-setuid-sandbox', + '--disable-blink-features=AutomationControlled', + '--disable-infobars', + '--window-position=0,0', + '--ignore-certifcate-errors', + '--ignore-certifcate-errors-spki-list', + ]; + + if (proxyHost && proxyPort) { + const proxyUrl = proxyUsername && proxyPassword + ? `http://${proxyUsername}:${proxyPassword}@${proxyHost}:${proxyPort}` + : `http://${proxyHost}:${proxyPort}`; + args.push(`--proxy-server=${proxyUrl}`); + this.logger.log(`Using proxy: ${proxyHost}:${proxyPort}`); + } + const browser = await puppeteer.launch({ headless: true, - args: ['--no-sandbox', '--disable-setuid-sandbox'], + args, }); + const crawlers = [ChdtpCrawler, ChngCrawler]; + try { - // Currently only supports ChdtpCrawler, but can be extended to a list of crawlers - const crawler = ChdtpCrawler; - this.logger.log(`Crawling: ${crawler.name}`); - - const results = await crawler.crawl(browser); - this.logger.log(`Extracted ${results.length} items from ${crawler.name}`); + for (const crawler of crawlers) { + this.logger.log(`Crawling: ${crawler.name}`); + try { + const results = await crawler.crawl(browser); + this.logger.log(`Extracted ${results.length} items from ${crawler.name}`); - for (const item of results) { - await this.bidsService.createOrUpdate({ - title, - url: itemUrl, - publishDate, - source: type || 'Unknown', - }); + for (const item of results) { + await this.bidsService.createOrUpdate({ + title: item.title, + url: item.url, + publishDate: item.publishDate, + source: crawler.name, + unit: '', + }); + } + } catch (err) { + this.logger.error(`Error crawling ${crawler.name}: ${err.message}`); + } } - } catch (error) { this.logger.error(`Crawl task failed: ${error.message}`); } finally { diff --git a/src/crawler/services/chdtp_target.ts b/src/crawler/services/chdtp_target.ts index 23b9e7f..7cb5e65 100644 --- a/src/crawler/services/chdtp_target.ts +++ b/src/crawler/services/chdtp_target.ts @@ -15,6 +15,13 @@ export const ChdtpCrawler = { async crawl(browser: puppeteer.Browser): Promise { const logger = new Logger('ChdtpCrawler'); const page = await browser.newPage(); + + const username = process.env.PROXY_USERNAME; + const password = process.env.PROXY_PASSWORD; + if (username && password) { + await page.authenticate({ username, password }); + } + await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36'); const allResults: ChdtpResult[] = []; diff --git a/src/crawler/services/chng_target.spec.ts b/src/crawler/services/chng_target.spec.ts new file mode 100644 index 0000000..105aa8b --- /dev/null +++ b/src/crawler/services/chng_target.spec.ts @@ -0,0 +1,60 @@ +import { ChngCrawler } from './chng_target'; +import * as puppeteer from 'puppeteer'; + +// Increase timeout to 120 seconds for manual inspection and slow sites +jest.setTimeout(120000); + +describe('ChngCrawler Real Site Test', () => { + let browser: puppeteer.Browser; + + beforeAll(async () => { + browser = await puppeteer.launch({ + headless: false, // Run in non-headless mode + args: [ + '--no-sandbox', + '--disable-setuid-sandbox', + '--disable-blink-features=AutomationControlled', + '--window-size=1920,1080' + ], + defaultViewport: null + }); + }); + + afterAll(async () => { + if (browser) { + // Keep open for a few seconds after test to see result + await new Promise(r => setTimeout(r, 50000)); + // await browser.close(); + } + }); + + it('should visit the website and list all found bid information', async () => { + console.log(` +Starting crawl for: ${ChngCrawler.name}`); + console.log(`Target URL: ${ChngCrawler.url}`); + + const results = await ChngCrawler.crawl(browser); + + console.log(` +Successfully found ${results.length} items: +`); + console.log('----------------------------------------'); + results.forEach((item, index) => { + console.log(`${index + 1}. [${item.publishDate.toLocaleDateString()}] ${item.title}`); + console.log(` Link: ${item.url}`); + console.log('----------------------------------------'); + }); + + expect(results).toBeDefined(); + expect(Array.isArray(results)).toBeTruthy(); + + if (results.length === 0) { + console.warn('Warning: No items found. Observe the browser window to see if content is loading or if there is a verification challenge.'); + } else { + const firstItem = results[0]; + expect(firstItem.title).toBeTruthy(); + expect(firstItem.url).toMatch(/^https?:\/\//); + expect(firstItem.publishDate).toBeInstanceOf(Date); + } + }); +}); \ No newline at end of file diff --git a/src/crawler/services/chng_target.ts b/src/crawler/services/chng_target.ts new file mode 100644 index 0000000..d51de6e --- /dev/null +++ b/src/crawler/services/chng_target.ts @@ -0,0 +1,163 @@ +import * as puppeteer from 'puppeteer'; +import { Logger } from '@nestjs/common'; +import { ChdtpResult } from './chdtp_target'; + +export const ChngCrawler = { + name: '华能集团电子商务平台', + url: 'https://ec.chng.com.cn/ecmall/index.html#/purchase/home?top=0', + baseUrl: 'https://ec.chng.com.cn/ecmall/index.html', + + async crawl(browser: puppeteer.Browser): Promise { + const logger = new Logger('ChngCrawler'); + let page = await browser.newPage(); + + const username = process.env.PROXY_USERNAME; + const password = process.env.PROXY_PASSWORD; + if (username && password) { + await page.authenticate({ username, password }); + } + + await page.evaluateOnNewDocument(() => { + Object.defineProperty(navigator, 'webdriver', { get: () => false }); + }); + + await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36'); + await page.setViewport({ width: 1920, height: 1080 }); + + const allResults: ChdtpResult[] = []; + let currentPage = 1; + const maxPages = 5; + + try { + logger.log('Navigating to Bing...'); + await page.goto('https://cn.bing.com', { waitUntil: 'networkidle2' }); + + logger.log('Searching for target site...'); + const searchBoxSelector = 'input[name="q"]'; + await page.waitForSelector(searchBoxSelector); + await page.type(searchBoxSelector, 'https://ec.chng.com.cn/'); + await page.keyboard.press('Enter'); + await page.waitForNavigation({ waitUntil: 'networkidle2' }); + + logger.log('Clicking search result...'); + const firstResultSelector = '#b_results .b_algo h2 a'; + await page.waitForSelector(firstResultSelector); + + const newTargetPromise = browser.waitForTarget(target => target.opener() === page.target()); + await page.click(firstResultSelector); + + const newTarget = await newTargetPromise; + const newPage = await newTarget.page(); + + if (newPage) { + await page.close(); + page = newPage; + if (username && password) { + await page.authenticate({ username, password }); + } + } + + await page.waitForNavigation({ waitUntil: 'domcontentloaded' }).catch(() => {}); + + // PAUSE 15 SECONDS as requested + logger.log('Pausing 15 seconds before looking for "采购专栏"...'); + await new Promise(r => setTimeout(r, 15000)); + + logger.log('Looking for "采购专栏" link...'); + await page.waitForFunction(() => { + const divs = Array.from(document.querySelectorAll('div.text')); + return divs.some(div => div.textContent && div.textContent.includes('采购专栏')); + }, { timeout: 60000 }); + + const purchaseTargetPromise = browser.waitForTarget(target => target.opener() === page.target(), { timeout: 15000 }).catch(() => null); + + await page.evaluate(() => { + const divs = Array.from(document.querySelectorAll('div.text')); + const target = divs.find(div => div.textContent && div.textContent.includes('采购专栏')) as HTMLElement; + if (target) target.click(); + }); + + const purchaseTarget = await purchaseTargetPromise; + if (purchaseTarget) { + const pPage = await purchaseTarget.page(); + if (pPage) { + logger.log('Switched to Purchase Page tab.'); + page = pPage; + if (username && password) { + await page.authenticate({ username, password }); + } + await new Promise(r => setTimeout(r, 5000)); + } + } + + logger.log(`Active URL: ${page.url()}`); + + while (currentPage <= maxPages) { + logger.log(`Processing page ${currentPage}...`); + + // Wait for table rows to load + await page.waitForFunction(() => { + return document.querySelectorAll('tr.ant-table-row').length > 0; + }, { timeout: 60000 }).catch(() => logger.warn('Content not found. Site might be slow.')); + + const pageResults = await page.evaluate((baseUrl) => { + // Extract from table rows + const items = Array.from(document.querySelectorAll('tr.ant-table-row')); + return items.map(item => { + const titleSpan = item.querySelector('span.list-text'); + const dateCell = item.querySelector('td.ant-table-row-cell-break-word p'); + + if (titleSpan && dateCell) { + const title = titleSpan.textContent?.trim() || ''; + const dateStr = dateCell.textContent?.trim() || ''; + + if (title.length < 5) return null; // Filter noise + + // URL is not directly available in the table, need to construct from data-row-key + const rowKey = item.getAttribute('data-row-key'); + const url = rowKey ? `${baseUrl}#/purchase/detail?id=${rowKey}` : ''; + + return { + title, + dateStr, + url + }; + } + return null; + }).filter(i => i !== null); + }, this.baseUrl); + + if (pageResults.length === 0) { + logger.warn(`No results found on page ${currentPage}. Extraction failed.`); + break; + } + + allResults.push(...pageResults.map(r => ({ + title: r!.title, + publishDate: new Date(r!.dateStr), + url: r!.url + }))); + + logger.log(`Extracted ${pageResults.length} items.`); + + // Pagination: look for the "right" icon SVG + const nextButton = await page.$('svg[data-icon="right"]'); + if (!nextButton) break; + + await nextButton.click(); + await new Promise(r => setTimeout(r, 5000)); + currentPage++; + } + + return allResults; + + } catch (error) { + logger.error(`Crawl failed: ${error.message}`); + return allResults; + } finally { + if (page) await page.close(); + } + }, + + extract() { return []; } +}; \ No newline at end of file diff --git a/src/crawler/services/chng_target_playwright.spec.ts b/src/crawler/services/chng_target_playwright.spec.ts new file mode 100644 index 0000000..949bf2c --- /dev/null +++ b/src/crawler/services/chng_target_playwright.spec.ts @@ -0,0 +1,72 @@ +import { chromium } from 'playwright'; +import { ChngCrawler } from './chng_target'; + +jest.setTimeout(120000); + +describe('ChngCrawler Playwright Test', () => { + let browser; + + beforeAll(async () => { + browser = await chromium.launch({ + headless: false, + args: ['--no-sandbox', '--disable-setuid-sandbox'] + }); + }); + + afterAll(async () => { + if (browser) { + await browser.close(); + } + }); + + it('should visit the website and list all found bid information', async () => { + console.log(` +Starting crawl for: ${ChngCrawler.name}`); + console.log(`Target URL: ${ChngCrawler.url}`); + + const context = await browser.newContext({ + viewport: { width: 1920, height: 1080 }, + userAgent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36', + }); + + const page = await context.newPage(); + + // Add stealth scripts if needed, but Playwright is often better at evasion + await page.addInitScript(() => { + Object.defineProperty(navigator, 'webdriver', { get: () => false }); + }); + + await page.goto(ChngCrawler.url, { waitUntil: 'networkidle', timeout: 60000 }); + + // Wait for content + try { + await page.waitForSelector('.ant-table-row', { timeout: 30000 }); + } catch (e) { + console.warn('Timed out waiting for .ant-table-row'); + } + + const content = await page.content(); + + // Reuse the extraction logic from the Crawler definition + const results = ChngCrawler.extract(content); + + console.log(` +Successfully found ${results.length} items: +`); + console.log('----------------------------------------'); + results.forEach((item, index) => { + console.log(`${index + 1}. [${item.publishDate.toLocaleDateString()}] ${item.title}`); + console.log(` Link: ${item.url}`); + console.log('----------------------------------------'); + }); + + if (results.length === 0) { + console.warn('No items found. Debugging content length: ' + content.length); + if (content.length < 500) { + console.log('Content dump:', content); + } + } + + expect(Array.isArray(results)).toBeTruthy(); + }); +});