feat: overhaul system logging and anti-bot evasion
- Add a dedicated logging system: integrate the Winston framework with daily rotation and per-level log files
- Strengthen anti-bot evasion: integrate the puppeteer-extra-plugin-stealth plugin for better stealth
- Add a standalone crawler script: run crawl tasks independently via npm run crawl
- Improve front-end date filtering: add a date-range picker with 3-day/7-day quick filters
- Improve crawl statistics: record each platform's success/failure counts and execution time in detail
- Remove default keyword initialization to avoid recreating preset keywords
- Extend environment configuration: add a LOG_LEVEL log-level option
- Extend .gitignore: ignore log directories, build artifacts, and similar output
- Raise the execution time limit: extend the maximum execution time from 1 hour to 3 hours
- Improve error handling: better exception capture and log recording
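As a concrete illustration of the first logging bullet, here is a minimal sketch of a Winston setup with daily rotation, per-level files, and the new LOG_LEVEL option. It assumes the common winston-daily-rotate-file transport; the file names and retention window are illustrative guesses, not the repository's actual configuration.

// Hypothetical logger bootstrap (TypeScript) — names and paths are assumptions.
import * as winston from 'winston';
import DailyRotateFile from 'winston-daily-rotate-file';

export const logger = winston.createLogger({
  // LOG_LEVEL is the environment option added in this commit
  level: process.env.LOG_LEVEL || 'info',
  format: winston.format.combine(
    winston.format.timestamp(),
    winston.format.json(),
  ),
  transports: [
    // One combined log file per day
    new DailyRotateFile({
      dirname: 'logs',
      filename: 'app-%DATE%.log',
      datePattern: 'YYYY-MM-DD',
      maxFiles: '14d', // assumed retention window
    }),
    // Errors additionally go to their own daily file (per-level storage)
    new DailyRotateFile({
      dirname: 'logs',
      filename: 'error-%DATE%.log',
      datePattern: 'YYYY-MM-DD',
      level: 'error',
    }),
    new winston.transports.Console(),
  ],
});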
@@ -25,10 +25,13 @@ export class BidCrawlerService {

  async crawlAll() {
    this.logger.log('Starting crawl task with Puppeteer...');

-    // Set the maximum execution time to 1 hour
-    const maxExecutionTime = 60 * 60 * 1000; // 1 hour in milliseconds
+    // Set the maximum execution time to 3 hours
+    const maxExecutionTime = 3 * 60 * 60 * 1000; // 3 hours in milliseconds
    const startTime = Date.now();

+    // Per-source crawl statistics
+    const crawlResults: Record<string, { success: number; error?: string }> = {};
+
    // Read the proxy configuration from environment variables
    const proxyHost = this.configService.get<string>('PROXY_HOST');
    const proxyPort = this.configService.get<string>('PROXY_PORT');

@@ -68,7 +71,7 @@ export class BidCrawlerService {
      // Check whether the task has timed out
      const elapsedTime = Date.now() - startTime;
      if (elapsedTime > maxExecutionTime) {
-        this.logger.warn(`⚠️ Crawl task exceeded maximum execution time of 1 hour. Stopping...`);
+        this.logger.warn(`⚠️ Crawl task exceeded maximum execution time of 3 hours. Stopping...`);
        this.logger.warn(`⚠️ Total elapsed time: ${Math.floor(elapsedTime / 1000 / 60)} minutes`);
        break;
      }

@@ -76,18 +79,23 @@ export class BidCrawlerService {
      try {
        const results = await crawler.crawl(browser);
        this.logger.log(`Extracted ${results.length} items from ${crawler.name}`);

+        // Record the success count
+        crawlResults[crawler.name] = { success: results.length };
+
        for (const item of results) {
          await this.bidsService.createOrUpdate({
            title: item.title,
            url: item.url,
            publishDate: item.publishDate,
            source: crawler.name,
            unit: '',
          });
        }
      } catch (err) {
        this.logger.error(`Error crawling ${crawler.name}: ${err.message}`);
+        // Record the error message
+        crawlResults[crawler.name] = { success: 0, error: err.message };
      }
    }
  } catch (error) {

@@ -100,8 +108,31 @@ export class BidCrawlerService {
    this.logger.log(`Crawl task finished. Total time: ${minutes} minutes`);

    if (totalTime > maxExecutionTime) {
-      this.logger.warn(`⚠️ Crawl task exceeded maximum execution time of 1 hour.`);
+      this.logger.warn(`⚠️ Crawl task exceeded maximum execution time of 3 hours.`);
    }

+    // Print a summary of the crawl statistics
+    this.logger.log('='.repeat(50));
+    this.logger.log('爬虫执行总结 / Crawl Summary');
+    this.logger.log('='.repeat(50));
+
+    let totalSuccess = 0;
+    let errorCount = 0;
+
+    for (const [source, result] of Object.entries(crawlResults)) {
+      if (result.error) {
+        this.logger.error(`❌ ${source}: 出错 - ${result.error}`);
+        errorCount++;
+      } else {
+        this.logger.log(`✅ ${source}: 成功获取 ${result.success} 条工程信息`);
+        totalSuccess += result.success;
+      }
+    }
+
+    this.logger.log('='.repeat(50));
+    this.logger.log(`总计: ${totalSuccess} 条工程信息, ${errorCount} 个来源出错`);
+    this.logger.log(`Total: ${totalSuccess} items, ${errorCount} sources failed`);
+    this.logger.log('='.repeat(50));
  }
}
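The hunks above read PROXY_HOST and PROXY_PORT, but the browser launch itself falls outside the diff context. A plausible sketch of how the stealth plugin and proxy settings could come together at launch time — the helper name and flag handling are assumptions; only the plugin wiring follows puppeteer-extra's documented usage:

// Hypothetical launch helper — illustrative only; the service's real launch code is not shown in this diff.
import puppeteer from 'puppeteer-extra';
import StealthPlugin from 'puppeteer-extra-plugin-stealth';
import type { Browser } from 'puppeteer';

puppeteer.use(StealthPlugin());

async function launchBrowser(proxyHost?: string, proxyPort?: string): Promise<Browser> {
  const args = ['--no-sandbox', '--disable-setuid-sandbox'];
  if (proxyHost && proxyPort) {
    // Chromium's proxy flag, applied only when both env values are configured
    args.push(`--proxy-server=${proxyHost}:${proxyPort}`);
  }
  return puppeteer.launch({ headless: true, args });
}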
@@ -1,72 +0,0 @@
import { chromium } from 'playwright';
import { ChngCrawler } from './chng_target';

jest.setTimeout(120000);

describe('ChngCrawler Playwright Test', () => {
  let browser;

  beforeAll(async () => {
    browser = await chromium.launch({
      headless: false,
      args: ['--no-sandbox', '--disable-setuid-sandbox']
    });
  });

  afterAll(async () => {
    if (browser) {
      await browser.close();
    }
  });

  it('should visit the website and list all found bid information', async () => {
    console.log(`\nStarting crawl for: ${ChngCrawler.name}`);
    console.log(`Target URL: ${ChngCrawler.url}`);

    const context = await browser.newContext({
      viewport: { width: 1920, height: 1080 },
      userAgent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    });

    const page = await context.newPage();

    // Add stealth scripts if needed, but Playwright is often better at evasion
    await page.addInitScript(() => {
      Object.defineProperty(navigator, 'webdriver', { get: () => false });
    });

    await page.goto(ChngCrawler.url, { waitUntil: 'networkidle', timeout: 60000 });

    // Wait for content
    try {
      await page.waitForSelector('.ant-table-row', { timeout: 30000 });
    } catch (e) {
      console.warn('Timed out waiting for .ant-table-row');
    }

    const content = await page.content();

    // Reuse the extraction logic from the Crawler definition
    const results = ChngCrawler.extract(content);

    console.log(`\nSuccessfully found ${results.length} items:\n`);
    console.log('----------------------------------------');
    results.forEach((item, index) => {
      console.log(`${index + 1}. [${item.publishDate.toLocaleDateString()}] ${item.title}`);
      console.log(`   Link: ${item.url}`);
      console.log('----------------------------------------');
    });

    if (results.length === 0) {
      console.warn('No items found. Debugging content length: ' + content.length);
      if (content.length < 500) {
        console.log('Content dump:', content);
      }
    }

    expect(Array.isArray(results)).toBeTruthy();
  });
});
src/crawler/services/chng_target_stealth.spec.ts (new file, 134 lines)
@@ -0,0 +1,134 @@
import { ChngCrawler } from './chng_target';
import puppeteer from 'puppeteer-extra';
import StealthPlugin from 'puppeteer-extra-plugin-stealth';
import type { Browser, Page } from 'puppeteer';

// Use the stealth plugin to strengthen anti-detection
puppeteer.use(StealthPlugin());

// Increase timeout to 180 seconds for slow sites and stealth mode
jest.setTimeout(180000);

// Simulate human-like mouse movement
async function simulateHumanMouseMovement(page: Page) {
  const viewport = page.viewport();
  if (!viewport) return;

  const movements = 5 + Math.floor(Math.random() * 5); // 5-10 random movements

  for (let i = 0; i < movements; i++) {
    const x = Math.floor(Math.random() * viewport.width);
    const y = Math.floor(Math.random() * viewport.height);

    await page.mouse.move(x, y, {
      steps: 10 + Math.floor(Math.random() * 20) // 10-30 steps for smoother motion
    });

    // Random pause of 100-500ms
    await new Promise(r => setTimeout(r, 100 + Math.random() * 400));
  }
}

// Simulate human-like scrolling
async function simulateHumanScrolling(page: Page) {
  const scrollCount = 3 + Math.floor(Math.random() * 5); // 3-7 scrolls

  for (let i = 0; i < scrollCount; i++) {
    const scrollDistance = 100 + Math.floor(Math.random() * 400); // 100-500px

    await page.evaluate((distance) => {
      window.scrollBy({
        top: distance,
        behavior: 'smooth'
      });
    }, scrollDistance);

    // Random pause of 500-1500ms
    await new Promise(r => setTimeout(r, 500 + Math.random() * 1000));
  }

  // Scroll back to the top
  await page.evaluate(() => {
    window.scrollTo({ top: 0, behavior: 'smooth' });
  });
  await new Promise(r => setTimeout(r, 1000));
}

describe('ChngCrawler Stealth Test (Headless Mode with Stealth Plugin)', () => {
  let browser: Browser;

  beforeAll(async () => {
    browser = await puppeteer.launch({
      headless: true, // run in headless mode
      args: [
        '--no-sandbox',
        '--disable-setuid-sandbox',
        '--disable-blink-features=AutomationControlled',
        '--window-size=1920,1080',
        '--disable-infobars',
        '--disable-dev-shm-usage',
        '--disable-accelerated-2d-canvas',
        '--no-first-run',
        '--no-zygote',
        '--disable-gpu',
        '--disable-features=VizDisplayCompositor',
        '--disable-webgl',
      ],
      defaultViewport: null
    });
  });

  afterAll(async () => {
    if (browser) {
      await browser.close();
    }
  });

  it('should visit the website and list all found bid information with stealth plugin', async () => {
    // Give this test its own, longer timeout
    jest.setTimeout(180000);
    console.log(`\nStarting crawl for: ${ChngCrawler.name}`);
    console.log(`Target URL: ${ChngCrawler.url}`);
    console.log('Using puppeteer-extra-plugin-stealth for anti-detection');
    console.log('Running in headless mode');

    // Create a temporary page for simulating human behavior
    const tempPage = await browser.newPage();
    await tempPage.setViewport({ width: 1920, height: 1080, deviceScaleFactor: 1 });

    // Simulate human mouse movement
    console.log('Simulating human mouse movements...');
    await simulateHumanMouseMovement(tempPage);

    // Simulate human scrolling
    console.log('Simulating human scrolling...');
    await simulateHumanScrolling(tempPage);

    await tempPage.close();

    const results = await ChngCrawler.crawl(browser);

    console.log(`\nSuccessfully found ${results.length} items:\n`);
    console.log('----------------------------------------');
    results.forEach((item, index) => {
      console.log(`${index + 1}. [${item.publishDate.toLocaleDateString()}] ${item.title}`);
      console.log(`   Link: ${item.url}`);
      console.log('----------------------------------------');
    });

    expect(results).toBeDefined();
    expect(Array.isArray(results)).toBeTruthy();

    if (results.length === 0) {
      console.warn('Warning: No items found. The site might have detected the crawler or content is not loading properly.');
    } else {
      const firstItem = results[0];
      expect(firstItem.title).toBeTruthy();
      expect(firstItem.url).toMatch(/^https?:\/\//);
      expect(firstItem.publishDate).toBeInstanceOf(Date);
    }
  });
});
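For completeness: the standalone `npm run crawl` script mentioned in the commit message is not part of this diff. A minimal sketch of what such an entry point could look like in a NestJS project — the import paths are guesses; only BidCrawlerService and crawlAll() appear in the hunks above:

// Hypothetical standalone entry point for `npm run crawl` — paths are assumptions.
import { NestFactory } from '@nestjs/core';
import { AppModule } from './app.module';
import { BidCrawlerService } from './crawler/bid-crawler.service';

async function main() {
  // Boot an application context (no HTTP server) and run a single crawl pass
  const app = await NestFactory.createApplicationContext(AppModule);
  try {
    await app.get(BidCrawlerService).crawlAll();
  } finally {
    await app.close();
  }
}

main().catch((err) => {
  console.error('Crawl script failed:', err);
  process.exit(1);
});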