import { ChdtpCrawler } from './chdtp_target';
import * as puppeteer from 'puppeteer';

// Increase timeout to 60 seconds for network operations
jest.setTimeout(60000);

/** Username/password pair used to answer a proxy authentication challenge. */
type ProxyCredentials = { username: string; password: string };

/**
 * Builds the Chromium launch arguments for the proxy configured through the
 * PROXY_HOST and PROXY_PORT environment variables.
 *
 * Credentials are deliberately NOT part of the returned arguments: Chromium
 * has no `--proxy-auth` switch, and putting the password on the command line
 * would expose it in logs and process listings.
 *
 * @returns A single `--proxy-server=host:port` argument, or an empty array
 *          when either the host or the port is not set.
 */
const getProxyArgs = (): string[] => {
  const proxyHost = process.env.PROXY_HOST;
  const proxyPort = process.env.PROXY_PORT;

  if (proxyHost && proxyPort) {
    return [`--proxy-server=${proxyHost}:${proxyPort}`];
  }
  return [];
};

/**
 * Reads the proxy credentials from PROXY_USERNAME and PROXY_PASSWORD.
 *
 * @returns The credentials, or `undefined` unless both variables are set.
 */
const getProxyCredentials = (): ProxyCredentials | undefined => {
  const username = process.env.PROXY_USERNAME;
  const password = process.env.PROXY_PASSWORD;

  return username && password ? { username, password } : undefined;
};

/**
 * Registers the proxy credentials on every page the browser opens from now on.
 *
 * The crawler receives only the browser, so the test cannot call
 * `page.authenticate()` on the crawler's pages directly; hooking the
 * `targetcreated` event is the way to reach them from here.
 * NOTE(review): this is best-effort. It relies on the handler running before
 * the crawler's first navigation on the new page; confirm against the crawler.
 *
 * @param target Browser to observe.
 * @param credentials Credentials supplied on a proxy authentication challenge.
 */
const authenticateNewPages = (
  target: puppeteer.Browser,
  credentials: ProxyCredentials,
): void => {
  const authenticate = async (created: puppeteer.Target): Promise<void> => {
    try {
      const page = await created.page();
      if (page) {
        await page.authenticate(credentials);
      }
    } catch (error: unknown) {
      // The page may already be closed; report it without failing the run.
      const reason = error instanceof Error ? error.message : String(error);
      console.warn(`Could not set proxy credentials on a new page: ${reason}`);
    }
  };

  target.on('targetcreated', (created: puppeteer.Target) => {
    if (created.type() === 'page') {
      void authenticate(created);
    }
  });
};

/**
 * Live integration test: runs the crawler against the real website, prints
 * every item it finds and sanity-checks the shape of the first one.
 */
describe('ChdtpCrawler Real Site Test', () => {
  let browser: puppeteer.Browser;

  beforeAll(async () => {
    const proxyArgs = getProxyArgs();
    if (proxyArgs.length > 0) {
      // Safe to log: the arguments contain only the proxy host and port.
      console.log('Using proxy:', proxyArgs.join(' '));
    }

    browser = await puppeteer.launch({
      headless: true, // Change to false to see the browser UI
      args: ['--no-sandbox', '--disable-setuid-sandbox', ...proxyArgs],
    });

    // As before, credentials only apply when a proxy server is configured.
    const credentials = getProxyCredentials();
    if (proxyArgs.length > 0 && credentials) {
      authenticateNewPages(browser, credentials);
    }
  });

  afterAll(async () => {
    // launch() may have failed, in which case there is nothing to close.
    if (browser) {
      await browser.close();
    }
  });

  it('should visit the website and list all found bid information', async () => {
    console.log(`\nStarting crawl for: ${ChdtpCrawler.name}`);
    console.log(`Target URL: ${ChdtpCrawler.url}`);

    const results = await ChdtpCrawler.crawl(browser);

    // Basic assertions to ensure the crawler is working. They run before the
    // results are used so a bad return value fails here with a clear message.
    expect(results).toBeDefined();
    expect(Array.isArray(results)).toBeTruthy();

    console.log(`\nSuccessfully found ${results.length} items:\n`);
    console.log('----------------------------------------');
    results.forEach((item, index) => {
      console.log(
        `${index + 1}. [${item.publishDate.toLocaleDateString()}] ${item.title}`,
      );
      console.log(` Link: ${item.url}`);
      console.log('----------------------------------------');
    });

    // Warn but don't fail if site returns 0 items (could be empty or changed structure)
    if (results.length === 0) {
      console.warn(
        'Warning: No items found. Check if the website structure has changed or if the list is currently empty.',
      );
    } else {
      // Check data integrity of the first item
      const firstItem = results[0];
      expect(firstItem.title).toBeTruthy();
      expect(firstItem.url).toMatch(/^https?:\/\//);
      expect(firstItem.publishDate).toBeInstanceOf(Date);
    }
  });
});