Compare commits

..

3 Commits

Author SHA1 Message Date
dmy
8b2f328981 feat: 调整界面文字和列宽
将标题改为中文"投标信息一览"
调整Source列的宽度
2026-01-12 13:21:49 +08:00
dmy
1b28a3462a feat: 全面优化爬虫系统和数据处理能力
- 增强数据重试机制:对数据为0的爬虫自动重试,提高数据完整性
- 优化前端筛选逻辑:改进日期筛选,只限制开始时间,更灵活的数据查看
- 新增最近数据接口:添加 /api/bids/recent 获取30天内最新招标数据
- 改进统计展示:实时显示筛选结果数量,优化用户体验
- 完善日志系统:确保日志目录自动创建,避免启动错误
- 增强独立脚本:使用自定义logger,完善错误处理和程序关闭
- 优化主程序:集成自定义日志服务,统一日志格式
- 扩展npm脚本:新增 web 命令用于构建前端
- 改进大唐爬虫:延长等待时间到60秒,提高页面加载成功率
- 优化数据筛选:今日招标改为使用独立接口,提升性能
2026-01-12 12:28:37 +08:00
dmy
3e6456e120 feat: 全面升级系统日志和反爬虫功能
- 新增专业日志系统:集成 Winston 日志框架,支持按天轮转和分级存储
- 增强反爬虫能力:集成 puppeteer-extra-plugin-stealth 插件,提升隐蔽性
- 新增独立爬虫脚本:可通过 npm run crawl 命令单独执行爬虫任务
- 优化前端日期筛选:添加日期范围选择器,支持3天/7天快速筛选
- 改进爬虫统计功能:详细记录每个平台的成功/失败情况和执行时间
- 移除默认关键词初始化:避免重复创建预设关键词
- 扩展环境配置:新增 LOG_LEVEL 日志级别配置选项
- 增强.gitignore:添加日志目录、构建产物等忽略规则
- 升级执行时间限制:将最大执行时间从1小时延长至3小时
- 完善错误处理:更好的异常捕获和日志记录机制
2026-01-12 10:46:10 +08:00
19 changed files with 567 additions and 126 deletions

5
.env
View File

@@ -8,4 +8,7 @@ DATABASE_SYNCHRONIZE=true
# 代理配置(可选)
PROXY_HOST=127.0.0.1
PROXY_PORT=3211
PROXY_PORT=3211
# 日志级别,可选值:error, warn, info, debug, verbose
LOG_LEVEL=info

View File

@@ -10,4 +10,7 @@ DATABASE_SYNCHRONIZE=true
PROXY_HOST=127.0.0.1
PROXY_PORT=6000
# PROXY_USERNAME=
# PROXY_PASSWORD=
# PROXY_PASSWORD=
# 日志级别,可选值:error, warn, info, debug, verbose
LOG_LEVEL=info

6
.gitignore vendored
View File

@@ -1 +1,7 @@
node_modules
dist
.vscode
public
*.xls*
pw-browsers
logs

View File

@@ -4,7 +4,7 @@
<meta charset="UTF-8" />
<link rel="icon" type="image/svg+xml" href="/vite.svg" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<title>frontend</title>
<title>投标</title>
</head>
<body>
<div id="app"></div>

View File

@@ -1,7 +1,7 @@
<template>
<el-container class="layout-container" style="height: 100vh">
<el-aside width="200px" style="background-color: #545c64">
<div class="logo">BID MONITOR</div>
<div class="logo">投标信息一览</div>
<el-menu
active-text-color="#ffd04b"
background-color="#545c64"
@@ -54,7 +54,7 @@
<a :href="scope.row.url" target="_blank">{{ scope.row.title }}</a>
</template>
</el-table-column>
<el-table-column prop="source" label="Source" width="120" />
<el-table-column prop="source" label="Source" width="240" />
<el-table-column prop="publishDate" label="Date" width="120">
<template #default="scope">{{ formatDate(scope.row.publishDate) }}</template>
</el-table-column>
@@ -65,22 +65,37 @@
<el-divider />
<div style="display: flex; justify-content: space-between; align-items: center; margin-bottom: 10px;">
<h3 style="margin: 0;">Today's Bids</h3>
<el-select
v-model="selectedKeywords"
multiple
collapse-tags
collapse-tags-tooltip
placeholder="Filter by Keywords"
clearable
style="width: 300px;"
>
<el-option
v-for="keyword in keywords"
:key="keyword.id"
:label="keyword.word"
:value="keyword.word"
<div style="display: flex; gap: 10px;">
<el-date-picker
v-model="dateRange"
type="daterange"
range-separator="To"
start-placeholder="Start Date"
end-placeholder="End Date"
format="YYYY-MM-DD"
value-format="YYYY-MM-DD"
clearable
style="width: 240px;"
/>
</el-select>
<el-button type="primary" @click="setLast3Days">3天</el-button>
<el-button type="primary" @click="setLast7Days">7天</el-button>
<el-select
v-model="selectedKeywords"
multiple
collapse-tags
collapse-tags-tooltip
placeholder="Filter by Keywords"
clearable
style="width: 300px;"
>
<el-option
v-for="keyword in keywords"
:key="keyword.id"
:label="keyword.word"
:value="keyword.word"
/>
</el-select>
</div>
</div>
<el-table :data="filteredTodayBids" v-loading="loading" style="width: 100%">
<el-table-column prop="title" label="Title">
@@ -88,7 +103,7 @@
<a :href="scope.row.url" target="_blank">{{ scope.row.title }}</a>
</template>
</el-table-column>
<el-table-column prop="source" label="Source" width="150" />
<el-table-column prop="source" label="Source" width="220" />
<el-table-column prop="publishDate" label="Date" width="150">
<template #default="scope">{{ formatDate(scope.row.publishDate) }}</template>
</el-table-column>
@@ -113,7 +128,7 @@
<a :href="scope.row.url" target="_blank">{{ scope.row.title }}</a>
</template>
</el-table-column>
<el-table-column prop="source" label="Source" width="150" />
<el-table-column prop="source" label="Source" width="200" />
<el-table-column prop="publishDate" label="Date" width="150">
<template #default="scope">{{ formatDate(scope.row.publishDate) }}</template>
</el-table-column>
@@ -180,6 +195,7 @@ import { DataBoard, Document, Setting, Refresh } from '@element-plus/icons-vue'
const activeIndex = ref('1')
const bids = ref<any[]>([])
const todayBids = ref<any[]>([])
const highPriorityBids = ref<any[]>([])
const keywords = ref<any[]>([])
const loading = ref(false)
@@ -192,6 +208,7 @@ const total = ref(0)
const sourceOptions = ref<string[]>([])
const isCrawling = ref(false)
const selectedKeywords = ref<string[]>([])
const dateRange = ref<[string, string] | null>(null)
// 从 localStorage 加载保存的关键字
const loadSavedKeywords = () => {
@@ -210,6 +227,16 @@ watch(selectedKeywords, (newKeywords) => {
localStorage.setItem('selectedKeywords', JSON.stringify(newKeywords))
}, { deep: true })
// Show a toast with the filtered count whenever the selected date range changes.
// The total is read from todayBids because filteredTodayBids is derived from
// that list, not from the paginated `bids` list.
// NOTE(review): the watcher on filteredTodayBids emits the same message, so a
// date change may raise two toasts — consider keeping only one of them.
watch(dateRange, () => {
  const totalBids = todayBids.value.length
  const filteredCount = filteredTodayBids.value.length

  // Only report when the filter actually hides something.
  if (totalBids > 0 && filteredCount < totalBids) {
    ElMessage.info(`筛选结果:共 ${filteredCount} 条数据(总共 ${totalBids} 条)`)
  }
})
const form = reactive({
word: '',
weight: 1
@@ -241,28 +268,128 @@ const handleSizeChange = (size: number) => {
fetchData()
}
// Point the date-range picker at the last `days` calendar days (today included)
// and warn when no bids are loaded yet. The list itself is filtered by the
// filteredTodayBids computed property, so nothing is filtered here.
const applyRecentDays = (days: number) => {
  // The picker is configured with value-format "YYYY-MM-DD", built from local date parts.
  const toPickerValue = (date: Date) => {
    const year = date.getFullYear()
    const month = String(date.getMonth() + 1).padStart(2, '0')
    const day = String(date.getDate()).padStart(2, '0')
    return `${year}-${month}-${day}`
  }

  const endDate = new Date()
  // Derive the start from the same instant so both ends agree on what "today" is.
  const startDate = new Date(endDate)
  startDate.setDate(startDate.getDate() - (days - 1))

  dateRange.value = [toPickerValue(startDate), toPickerValue(endDate)]

  if (todayBids.value.length === 0) {
    ElMessage.warning('暂无数据请先抓取数据')
  }
}

// Quick filter button: last 3 days, today included.
const setLast3Days = () => {
  applyRecentDays(3)
}

// Quick filter button: last 7 days, today included.
const setLast7Days = () => {
  applyRecentDays(7)
}
// Format a date string for table display using the runtime's default locale.
// Returns '-' when the value is empty or does not parse to a valid date, so the
// table never shows the literal text "Invalid Date".
const formatDate = (dateString: string) => {
  if (!dateString) return '-'
  const parsed = new Date(dateString)
  // An unparseable string produces a Date whose timestamp is NaN.
  if (Number.isNaN(parsed.getTime())) return '-'
  return parsed.toLocaleDateString()
}
// 过滤 Today's Bids只显示包含所选关键字的项目
// 过滤 Today's Bids只显示包含所选关键字的项目并且在日期范围内
const filteredTodayBids = computed(() => {
if (selectedKeywords.value.length === 0) {
return bids.value
let result = todayBids.value
// 按关键字筛选
if (selectedKeywords.value.length > 0) {
result = result.filter(bid => {
return selectedKeywords.value.some(keyword =>
bid.title.toLowerCase().includes(keyword.toLowerCase())
)
})
}
return bids.value.filter(bid => {
return selectedKeywords.value.some(keyword =>
bid.title.toLowerCase().includes(keyword.toLowerCase())
)
})
// 按日期范围筛选(只限制开始时间,不限制结束时间)
if (dateRange.value && dateRange.value.length === 2) {
const [startDate] = dateRange.value
result = result.filter(bid => {
if (!bid.publishDate) return false
const bidDate = new Date(bid.publishDate)
const start = new Date(startDate)
// 设置时间为当天的开始
start.setHours(0, 0, 0, 0)
return bidDate >= start
})
}
return result
})
// Whenever the filtered list changes, tell the user how many of the loaded
// bids remain visible — but only if the filter hides at least one of them.
watch(
  filteredTodayBids,
  (visibleBids) => {
    const loadedCount = todayBids.value.length
    const shownCount = visibleBids.length
    if (loadedCount <= 0 || shownCount >= loadedCount) return
    ElMessage.info(`筛选结果:共 ${shownCount} 条数据(总共 ${loadedCount} 条)`)
  },
  { deep: true },
)
const fetchData = async () => {
loading.value = true
try {
const [bidsRes, highRes, kwRes, sourcesRes, statusRes] = await Promise.all([
const [bidsRes, recentRes, highRes, kwRes, sourcesRes, statusRes] = await Promise.all([
axios.get('/api/bids', {
params: {
page: currentPage.value,
@@ -270,6 +397,7 @@ const fetchData = async () => {
source: selectedSource.value || undefined
}
}),
axios.get('/api/bids/recent'),
axios.get('/api/bids/high-priority'),
axios.get('/api/keywords'),
axios.get('/api/bids/sources'),
@@ -277,6 +405,7 @@ const fetchData = async () => {
])
bids.value = bidsRes.data.items
total.value = bidsRes.data.total
todayBids.value = recentRes.data
highPriorityBids.value = highRes.data
keywords.value = kwRes.data
sourceOptions.value = sourcesRes.data

View File

@@ -17,7 +17,9 @@
"test:watch": "jest --watch",
"test:cov": "jest --coverage",
"test:debug": "node --inspect-brk -r tsconfig-paths/register -r ts-node/register node_modules/.bin/jest --runInBand",
"test:e2e": "jest --config ./test/jest-e2e.json"
"test:e2e": "jest --config ./test/jest-e2e.json",
"crawl": "ts-node -r tsconfig-paths/register src/scripts/crawl.ts",
"web":"npm --prefix frontend run build"
},
"dependencies": {
"@nestjs/common": "^11.0.1",
@@ -32,9 +34,13 @@
"class-validator": "^0.14.3",
"mysql2": "^3.16.0",
"puppeteer": "^24.34.0",
"puppeteer-extra": "^3.3.6",
"puppeteer-extra-plugin-stealth": "^2.11.2",
"reflect-metadata": "^0.2.2",
"rxjs": "^7.8.1",
"typeorm": "^0.3.28"
"typeorm": "^0.3.28",
"winston": "^3.19.0",
"winston-daily-rotate-file": "^5.0.0"
},
"devDependencies": {
"@eslint/eslintrc": "^3.2.0",

View File

@@ -8,6 +8,7 @@ import { BidsModule } from './bids/bids.module';
import { KeywordsModule } from './keywords/keywords.module';
import { CrawlerModule } from './crawler/crawler.module';
import { TasksModule } from './schedule/schedule.module';
import { LoggerModule } from './common/logger/logger.module';
@Module({
imports: [
@@ -17,6 +18,7 @@ import { TasksModule } from './schedule/schedule.module';
rootPath: join(__dirname, '..', 'frontend', 'dist'),
exclude: ['/api*'],
}),
LoggerModule,
DatabaseModule,
BidsModule,
KeywordsModule,

View File

@@ -10,6 +10,11 @@ export class BidsController {
return this.bidsService.findAll(query);
}
/**
 * GET handler for the `recent` sub-route of this controller.
 * Delegates to BidsService.getRecentBids() and returns its result unchanged.
 */
@Get('recent')
getRecent() {
return this.bidsService.getRecentBids();
}
@Get('high-priority')
getHighPriority() {
return this.bidsService.getHighPriorityCorrected();

View File

@@ -1,6 +1,6 @@
import { Injectable } from '@nestjs/common';
import { InjectRepository } from '@nestjs/typeorm';
import { Repository, LessThan } from 'typeorm';
import { Repository, LessThan, MoreThanOrEqual } from 'typeorm';
import { BidItem } from '../entities/bid-item.entity';
@Injectable()
@@ -75,4 +75,16 @@ export class BidsService {
.getRawMany();
return result.map((item: any) => item.source);
}
/**
 * Load every bid whose publishDate is on or after local midnight 30 days ago,
 * ordered by publishDate descending. No row limit is applied.
 */
async getRecentBids() {
  const cutoff = new Date();
  cutoff.setDate(cutoff.getDate() - 30);
  cutoff.setHours(0, 0, 0, 0);

  const recentQuery = this.bidRepository
    .createQueryBuilder('bid')
    .where('bid.publishDate >= :thirtyDaysAgo', { thirtyDaysAgo: cutoff })
    .orderBy('bid.publishDate', 'DESC');

  return recentQuery.getMany();
}
}

View File

@@ -0,0 +1,9 @@
import { Module, Global } from '@nestjs/common';
import { CustomLogger } from './logger.service';
/**
 * Module that provides and exports CustomLogger.
 * It is marked @Global(), so the exported provider is available to other
 * modules without each of them importing LoggerModule.
 */
@Global()
@Module({
providers: [CustomLogger],
exports: [CustomLogger],
})
export class LoggerModule {}

View File

@@ -0,0 +1,31 @@
import { Injectable, LoggerService, Scope } from '@nestjs/common';
import { winstonLogger } from './winston.config';
/**
 * NestJS LoggerService implementation that forwards every call to the shared
 * winston logger.
 *
 * The provider is transient-scoped, so each consumer gets its own instance and
 * can set its own default context with setContext().
 */
@Injectable({ scope: Scope.TRANSIENT })
export class CustomLogger implements LoggerService {
  /** Default context label, used when a call does not pass one. */
  private context?: string;

  /** Set the default context label for later calls on this instance. */
  setContext(context: string) {
    this.context = context;
  }

  /** Log `message` at winston level "info". */
  log(message: any, context?: string) {
    winstonLogger.info(message, this.buildMeta(context));
  }

  /**
   * Log `message` at winston level "error".
   *
   * The trace is forwarded under `stack` as well as `trace`: the line format
   * in winston.config only prints a `stack` field, so a trace passed solely
   * as `trace` never appeared in the console or file output.
   */
  error(message: any, trace?: string, context?: string) {
    winstonLogger.error(message, { ...this.buildMeta(context), trace, stack: trace });
  }

  /** Log `message` at winston level "warn". */
  warn(message: any, context?: string) {
    winstonLogger.warn(message, this.buildMeta(context));
  }

  /** Log `message` at winston level "debug". */
  debug(message: any, context?: string) {
    winstonLogger.debug(message, this.buildMeta(context));
  }

  /** Log `message` at winston level "verbose". */
  verbose(message: any, context?: string) {
    winstonLogger.verbose(message, this.buildMeta(context));
  }

  /** Build the metadata object: the per-call context wins over the instance default. */
  private buildMeta(context?: string) {
    return { context: context || this.context };
  }
}

View File

@@ -0,0 +1,70 @@
import * as winston from 'winston';
import DailyRotateFile from 'winston-daily-rotate-file';
import * as path from 'path';
import * as fs from 'fs';
// Directory that receives the rotated log files, relative to the working directory.
const logsDirectory = path.join(process.cwd(), 'logs');

// Make sure the log directory exists before any file transport is created.
if (!fs.existsSync(logsDirectory)) {
  fs.mkdirSync(logsDirectory, { recursive: true });
}

// Renders one line as "<timestamp> [<level>] [<context>] <message>", followed
// by the stack on its own line(s) when a `stack` field is present.
const renderLine = winston.format.printf(({ timestamp, level, message, context, stack }) => {
  const contextTag = context ? ` [${context}]` : '';
  const stackSuffix = stack ? `\n${stack}` : '';
  return `${timestamp} [${level}]${contextTag} ${message}${stackSuffix}`;
});

// Shared format pipeline: timestamp, Error stack capture, splat interpolation, line rendering.
const lineFormat = winston.format.combine(
  winston.format.timestamp({ format: 'YYYY-MM-DD HH:mm:ss' }),
  winston.format.errors({ stack: true }),
  winston.format.splat(),
  renderLine,
);

// Daily-rotated file transport: 20m max size per file, files kept for 30d.
// `level` is only set on the transport when one is given.
const createRotatingFileTransport = (filename: string, level?: string) =>
  new DailyRotateFile({
    dirname: logsDirectory,
    filename,
    datePattern: 'YYYY-MM-DD',
    ...(level === undefined ? {} : { level }),
    maxSize: '20m',
    maxFiles: '30d',
    format: lineFormat,
  });

// Shared winston logger: colorized console output, an application log file,
// and a separate file restricted to level "error".
// NOTE(review): LOG_LEVEL is read once, when this module is first imported. If
// .env is only loaded into process.env later during bootstrap, the value from
// .env would not be seen here — confirm the load order.
export const winstonLogger = winston.createLogger({
  level: process.env.LOG_LEVEL || 'info',
  format: lineFormat,
  transports: [
    new winston.transports.Console({
      format: winston.format.combine(winston.format.colorize(), lineFormat),
    }),
    createRotatingFileTransport('application-%DATE%.log'),
    createRotatingFileTransport('error-%DATE%.log', 'error'),
  ],
  exitOnError: false,
});

View File

@@ -25,10 +25,16 @@ export class BidCrawlerService {
async crawlAll() {
this.logger.log('Starting crawl task with Puppeteer...');
// 设置最大执行时间为1小时
const maxExecutionTime = 60 * 60 * 1000; // 1小时(毫秒)
// 设置最大执行时间为3小时
const maxExecutionTime = 3 * 60 * 60 * 1000; // 3小时(毫秒)
const startTime = Date.now();
// 统计结果
const crawlResults: Record<string, { success: number; error?: string }> = {};
// 记录数据为0的爬虫用于重试
const zeroDataCrawlers: any[] = [];
// 从环境变量读取代理配置
const proxyHost = this.configService.get<string>('PROXY_HOST');
const proxyPort = this.configService.get<string>('PROXY_PORT');
@@ -68,7 +74,7 @@ export class BidCrawlerService {
// 检查是否超时
const elapsedTime = Date.now() - startTime;
if (elapsedTime > maxExecutionTime) {
this.logger.warn(`⚠️ Crawl task exceeded maximum execution time of 1 hour. Stopping...`);
this.logger.warn(`⚠️ Crawl task exceeded maximum execution time of 3 hours. Stopping...`);
this.logger.warn(`⚠️ Total elapsed time: ${Math.floor(elapsedTime / 1000 / 60)} minutes`);
break;
}
@@ -76,18 +82,67 @@ export class BidCrawlerService {
try {
const results = await crawler.crawl(browser);
this.logger.log(`Extracted ${results.length} items from ${crawler.name}`);
// 记录成功数量
crawlResults[crawler.name] = { success: results.length };
// 如果数据为0记录下来用于重试
if (results.length === 0) {
zeroDataCrawlers.push(crawler);
}
for (const item of results) {
await this.bidsService.createOrUpdate({
title: item.title,
url: item.url,
publishDate: item.publishDate,
source: crawler.name,
unit: '',
source: crawler.name,
unit: '',
});
}
} catch (err) {
this.logger.error(`Error crawling ${crawler.name}: ${err.message}`);
// 记录错误信息
crawlResults[crawler.name] = { success: 0, error: err.message };
}
}
// 对数据为0的爬虫进行重试
if (zeroDataCrawlers.length > 0) {
this.logger.log(`Retrying ${zeroDataCrawlers.length} crawlers with zero data...`);
for (const crawler of zeroDataCrawlers) {
this.logger.log(`Retrying: ${crawler.name}`);
// 检查是否超时
const elapsedTime = Date.now() - startTime;
if (elapsedTime > maxExecutionTime) {
this.logger.warn(`⚠️ Crawl task exceeded maximum execution time of 3 hours. Stopping retry...`);
this.logger.warn(`⚠️ Total elapsed time: ${Math.floor(elapsedTime / 1000 / 60)} minutes`);
break;
}
try {
const results = await crawler.crawl(browser);
this.logger.log(`Retry extracted ${results.length} items from ${crawler.name}`);
// 更新统计结果
crawlResults[crawler.name] = { success: results.length };
for (const item of results) {
await this.bidsService.createOrUpdate({
title: item.title,
url: item.url,
publishDate: item.publishDate,
source: crawler.name,
unit: '',
});
}
} catch (err) {
this.logger.error(`Error retrying ${crawler.name}: ${err.message}`);
// 记录错误信息
crawlResults[crawler.name] = { success: 0, error: err.message };
}
}
}
} catch (error) {
@@ -100,8 +155,31 @@ export class BidCrawlerService {
this.logger.log(`Crawl task finished. Total time: ${minutes} minutes`);
if (totalTime > maxExecutionTime) {
this.logger.warn(`⚠️ Crawl task exceeded maximum execution time of 1 hour.`);
this.logger.warn(`⚠️ Crawl task exceeded maximum execution time of 3 hours.`);
}
// 输出统计总结
this.logger.log('='.repeat(50));
this.logger.log('爬虫执行总结 / Crawl Summary');
this.logger.log('='.repeat(50));
let totalSuccess = 0;
let errorCount = 0;
for (const [source, result] of Object.entries(crawlResults)) {
if (result.error) {
this.logger.error(`${source}: 出错 - ${result.error}`);
errorCount++;
} else {
this.logger.log(`${source}: 成功获取 ${result.success} 条工程信息`);
totalSuccess += result.success;
}
}
this.logger.log('='.repeat(50));
this.logger.log(`总计: ${totalSuccess} 条工程信息, ${errorCount} 个来源出错`);
this.logger.log(`Total: ${totalSuccess} items, ${errorCount} sources failed`);
this.logger.log('='.repeat(50));
}
}
}

View File

@@ -89,7 +89,7 @@ export const CdtCrawler = {
await page.waitForFunction(() => {
const tabs = Array.from(document.querySelectorAll('span.notice-tab'));
return tabs.some(tab => tab.textContent && tab.textContent.includes('招标公告'));
}, { timeout: 30000 });
}, { timeout: 60000 });
await page.evaluate(() => {
const tabs = Array.from(document.querySelectorAll('span.notice-tab'));

View File

@@ -1,72 +0,0 @@
import { chromium } from 'playwright';
import { ChngCrawler } from './chng_target';
jest.setTimeout(120000);
describe('ChngCrawler Playwright Test', () => {
let browser;
beforeAll(async () => {
browser = await chromium.launch({
headless: false,
args: ['--no-sandbox', '--disable-setuid-sandbox']
});
});
afterAll(async () => {
if (browser) {
await browser.close();
}
});
it('should visit the website and list all found bid information', async () => {
console.log(`
Starting crawl for: ${ChngCrawler.name}`);
console.log(`Target URL: ${ChngCrawler.url}`);
const context = await browser.newContext({
viewport: { width: 1920, height: 1080 },
userAgent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
});
const page = await context.newPage();
// Add stealth scripts if needed, but Playwright is often better at evasion
await page.addInitScript(() => {
Object.defineProperty(navigator, 'webdriver', { get: () => false });
});
await page.goto(ChngCrawler.url, { waitUntil: 'networkidle', timeout: 60000 });
// Wait for content
try {
await page.waitForSelector('.ant-table-row', { timeout: 30000 });
} catch (e) {
console.warn('Timed out waiting for .ant-table-row');
}
const content = await page.content();
// Reuse the extraction logic from the Crawler definition
const results = ChngCrawler.extract(content);
console.log(`
Successfully found ${results.length} items:
`);
console.log('----------------------------------------');
results.forEach((item, index) => {
console.log(`${index + 1}. [${item.publishDate.toLocaleDateString()}] ${item.title}`);
console.log(` Link: ${item.url}`);
console.log('----------------------------------------');
});
if (results.length === 0) {
console.warn('No items found. Debugging content length: ' + content.length);
if (content.length < 500) {
console.log('Content dump:', content);
}
}
expect(Array.isArray(results)).toBeTruthy();
});
});

View File

@@ -0,0 +1,134 @@
import { ChngCrawler } from './chng_target';
import puppeteer from 'puppeteer-extra';
import StealthPlugin from 'puppeteer-extra-plugin-stealth';
import type { Browser, Page } from 'puppeteer';
// 使用 stealth 插件增强反爬虫能力
puppeteer.use(StealthPlugin());
// Increase timeout to 180 seconds for slow sites and stealth mode
jest.setTimeout(180000);
// Move the mouse to several random positions inside the page viewport, pausing
// briefly after each move. Does nothing when the page reports no viewport.
async function simulateHumanMouseMovement(page: Page) {
  const size = page.viewport();
  if (!size) return;

  // 5 to 9 moves.
  let remaining = 5 + Math.floor(Math.random() * 5);
  while (remaining-- > 0) {
    const targetX = Math.floor(Math.random() * size.width);
    const targetY = Math.floor(Math.random() * size.height);
    // 10 to 29 intermediate steps so the pointer glides instead of jumping.
    const steps = 10 + Math.floor(Math.random() * 20);
    await page.mouse.move(targetX, targetY, { steps });

    // Random pause of roughly 100-500 ms before the next move.
    const pauseMs = 100 + Math.random() * 400;
    await new Promise(resolve => setTimeout(resolve, pauseMs));
  }
}
// Scroll the page down in a few random smooth steps with pauses in between,
// then scroll back to the top and wait one second.
async function simulateHumanScrolling(page: Page) {
  const pause = (ms: number) => new Promise(resolve => setTimeout(resolve, ms));

  // 3 to 7 scroll steps.
  const totalScrolls = 3 + Math.floor(Math.random() * 5);
  for (let done = 0; done < totalScrolls; done += 1) {
    // 100 to 499 px per step.
    const deltaY = 100 + Math.floor(Math.random() * 400);
    await page.evaluate((distance) => {
      window.scrollBy({ top: distance, behavior: 'smooth' });
    }, deltaY);

    // Random pause of roughly 500-1500 ms between steps.
    await pause(500 + Math.random() * 1000);
  }

  // Back to the top.
  await page.evaluate(() => {
    window.scrollTo({ top: 0, behavior: 'smooth' });
  });
  await pause(1000);
}
// Runs ChngCrawler.crawl() in a headless browser launched through
// puppeteer-extra and checks the shape of the returned items.
describe('ChngCrawler Stealth Test (Headless Mode with Stealth Plugin)', () => {
  let browser: Browser;

  beforeAll(async () => {
    browser = await puppeteer.launch({
      headless: true,
      args: [
        '--no-sandbox',
        '--disable-setuid-sandbox',
        '--disable-blink-features=AutomationControlled',
        '--window-size=1920,1080',
        '--disable-infobars',
        '--disable-dev-shm-usage',
        '--disable-accelerated-2d-canvas',
        '--no-first-run',
        '--no-zygote',
        '--disable-gpu',
        '--disable-features=VizDisplayCompositor',
        '--disable-webgl',
      ],
      defaultViewport: null
    });
  });

  afterAll(async () => {
    if (browser) {
      await browser.close();
    }
  });

  it(
    'should visit the website and list all found bid information with stealth plugin',
    async () => {
      console.log(`\nStarting crawl for: ${ChngCrawler.name}`);
      console.log(`Target URL: ${ChngCrawler.url}`);
      console.log('Using puppeteer-extra-plugin-stealth for anti-detection');
      console.log('Running in headless mode');

      // Throwaway page used only to simulate human input before the crawl.
      const tempPage = await browser.newPage();
      try {
        await tempPage.setViewport({ width: 1920, height: 1080, deviceScaleFactor: 1 });

        console.log('Simulating human mouse movements...');
        await simulateHumanMouseMovement(tempPage);

        console.log('Simulating human scrolling...');
        await simulateHumanScrolling(tempPage);
      } finally {
        // Close the page even when the simulation throws, so it does not linger.
        await tempPage.close();
      }

      const results = await ChngCrawler.crawl(browser);

      console.log(`\nSuccessfully found ${results.length} items:\n`);
      console.log('----------------------------------------');
      results.forEach((item, index) => {
        console.log(`${index + 1}. [${item.publishDate.toLocaleDateString()}] ${item.title}`);
        console.log(` Link: ${item.url}`);
        console.log('----------------------------------------');
      });

      expect(results).toBeDefined();
      expect(Array.isArray(results)).toBeTruthy();

      // An empty result is only warned about, not failed.
      if (results.length === 0) {
        console.warn('Warning: No items found. The site might have detected the crawler or content is not loading properly.');
      } else {
        const firstItem = results[0];
        expect(firstItem.title).toBeTruthy();
        expect(firstItem.url).toMatch(/^https?:\/\//);
        expect(firstItem.publishDate).toBeInstanceOf(Date);
      }
    },
    // Per-test timeout in ms. A jest.setTimeout() call made inside the test
    // body comes too late to change the timeout of the test that is running.
    180000,
  );
});

View File

@@ -1,26 +1,15 @@
import { Injectable, OnModuleInit } from '@nestjs/common';
import { Injectable } from '@nestjs/common';
import { InjectRepository } from '@nestjs/typeorm';
import { Repository } from 'typeorm';
import { Keyword } from './keyword.entity';
@Injectable()
export class KeywordsService implements OnModuleInit {
export class KeywordsService {
constructor(
@InjectRepository(Keyword)
private keywordRepository: Repository<Keyword>,
) {}
async onModuleInit() {
// 初始预设关键词
const defaultKeywords = ["山东", "海", "建设", "工程", "采购"];
for (const word of defaultKeywords) {
const exists = await this.keywordRepository.findOne({ where: { word } });
if (!exists) {
await this.keywordRepository.save({ word, weight: 1 });
}
}
}
findAll() {
return this.keywordRepository.find();
}

View File

@@ -1,8 +1,14 @@
import { NestFactory } from '@nestjs/core';
import { AppModule } from './app.module';
import { CustomLogger } from './common/logger/logger.service';
/**
 * Create the Nest application, route framework logging through CustomLogger,
 * and start listening on PORT (3000 when PORT is not set).
 */
async function bootstrap() {
  // Buffer startup logs until useLogger() is called; otherwise messages
  // emitted while the application is being created go to Nest's default
  // logger and bypass CustomLogger.
  const app = await NestFactory.create(AppModule, { bufferLogs: true });

  // CustomLogger is transient-scoped, so it is obtained with resolve().
  const logger = await app.resolve(CustomLogger);
  app.useLogger(logger);

  await app.listen(process.env.PORT ?? 3000);
}
bootstrap();

30
src/scripts/crawl.ts Normal file
View File

@@ -0,0 +1,30 @@
import { NestFactory } from '@nestjs/core';
import { AppModule } from '../app.module';
import { BidCrawlerService } from '../crawler/services/bid-crawler.service';
import { CustomLogger } from '../common/logger/logger.service';
/**
 * Run one crawl from the command line, without starting the HTTP server.
 *
 * Creates a standalone application context, routes logging through
 * CustomLogger, runs BidCrawlerService.crawlAll(), closes the context, and
 * exits with code 0 on success or 1 on failure.
 */
async function runCrawler() {
  // Buffer startup logs until useLogger() is called so they also go through
  // CustomLogger.
  const app = await NestFactory.createApplicationContext(AppModule, { bufferLogs: true });

  const logger = await app.resolve(CustomLogger);
  app.useLogger(logger);
  logger.setContext('CrawlScript');

  let exitCode = 0;
  try {
    const crawlerService = await app.resolve(BidCrawlerService);
    logger.log('Starting crawler...');
    await crawlerService.crawlAll();
    logger.log('Crawler completed successfully');
  } catch (error: unknown) {
    exitCode = 1;
    // Put the reason into the message itself and pass the stack as a string:
    // error()'s second parameter is a string trace, not an Error object.
    const reason = error instanceof Error ? error.message : String(error);
    const stack = error instanceof Error ? error.stack : undefined;
    logger.error(`Crawler failed: ${reason}`, stack);
  } finally {
    // Single cleanup path for both success and failure.
    await app.close();
  }

  process.exit(exitCode);
}

// Failures outside the try block (context creation, logger resolution, close)
// reject the promise; report them and exit non-zero.
runCrawler().catch((error: unknown) => {
  console.error('Crawl script aborted:', error);
  process.exit(1);
});