feat: 全面升级系统日志和反爬虫功能
- 新增专业日志系统:集成 Winston 日志框架,支持按天轮转和分级存储
- 增强反爬虫能力:集成 puppeteer-extra-plugin-stealth 插件,提升隐蔽性
- 新增独立爬虫脚本:可通过 npm run crawl 命令单独执行爬虫任务
- 优化前端日期筛选:添加日期范围选择器,支持3天/7天快速筛选
- 改进爬虫统计功能:详细记录每个平台的成功/失败情况和执行时间
- 移除默认关键词初始化:避免重复创建预设关键词
- 扩展环境配置:新增 LOG_LEVEL 日志级别配置选项
- 增强.gitignore:添加日志目录、构建产物等忽略规则
- 升级执行时间限制:将最大执行时间从1小时延长至3小时
- 完善错误处理:更好的异常捕获和日志记录机制
This commit is contained in:
3
.env
3
.env
@@ -9,3 +9,6 @@ DATABASE_SYNCHRONIZE=true
|
|||||||
# 代理配置(可选)
|
# 代理配置(可选)
|
||||||
PROXY_HOST=127.0.0.1
|
PROXY_HOST=127.0.0.1
|
||||||
PROXY_PORT=3211
|
PROXY_PORT=3211
|
||||||
|
|
||||||
|
# 日志级别(可选):error, warn, info, debug, verbose
|
||||||
|
LOG_LEVEL=info
|
||||||
@@ -11,3 +11,6 @@ PROXY_HOST=127.0.0.1
|
|||||||
PROXY_PORT=6000
|
PROXY_PORT=6000
|
||||||
# PROXY_USERNAME=
|
# PROXY_USERNAME=
|
||||||
# PROXY_PASSWORD=
|
# PROXY_PASSWORD=
|
||||||
|
|
||||||
|
# 日志级别(可选):error, warn, info, debug, verbose
|
||||||
|
LOG_LEVEL=info
|
||||||
6
.gitignore
vendored
6
.gitignore
vendored
@@ -1 +1,7 @@
|
|||||||
node_modules
|
node_modules
|
||||||
|
dist
|
||||||
|
.vscode
|
||||||
|
public
|
||||||
|
*.xls*
|
||||||
|
pw-browsers
|
||||||
|
logs
|
||||||
@@ -65,22 +65,37 @@
|
|||||||
<el-divider />
|
<el-divider />
|
||||||
<div style="display: flex; justify-content: space-between; align-items: center; margin-bottom: 10px;">
|
<div style="display: flex; justify-content: space-between; align-items: center; margin-bottom: 10px;">
|
||||||
<h3 style="margin: 0;">Today's Bids</h3>
|
<h3 style="margin: 0;">Today's Bids</h3>
|
||||||
<el-select
|
<div style="display: flex; gap: 10px;">
|
||||||
v-model="selectedKeywords"
|
<el-date-picker
|
||||||
multiple
|
v-model="dateRange"
|
||||||
collapse-tags
|
type="daterange"
|
||||||
collapse-tags-tooltip
|
range-separator="To"
|
||||||
placeholder="Filter by Keywords"
|
start-placeholder="Start Date"
|
||||||
clearable
|
end-placeholder="End Date"
|
||||||
style="width: 300px;"
|
format="YYYY-MM-DD"
|
||||||
>
|
value-format="YYYY-MM-DD"
|
||||||
<el-option
|
clearable
|
||||||
v-for="keyword in keywords"
|
style="width: 240px;"
|
||||||
:key="keyword.id"
|
|
||||||
:label="keyword.word"
|
|
||||||
:value="keyword.word"
|
|
||||||
/>
|
/>
|
||||||
</el-select>
|
<el-button type="primary" @click="setLast3Days">3天</el-button>
|
||||||
|
<el-button type="primary" @click="setLast7Days">7天</el-button>
|
||||||
|
<el-select
|
||||||
|
v-model="selectedKeywords"
|
||||||
|
multiple
|
||||||
|
collapse-tags
|
||||||
|
collapse-tags-tooltip
|
||||||
|
placeholder="Filter by Keywords"
|
||||||
|
clearable
|
||||||
|
style="width: 300px;"
|
||||||
|
>
|
||||||
|
<el-option
|
||||||
|
v-for="keyword in keywords"
|
||||||
|
:key="keyword.id"
|
||||||
|
:label="keyword.word"
|
||||||
|
:value="keyword.word"
|
||||||
|
/>
|
||||||
|
</el-select>
|
||||||
|
</div>
|
||||||
</div>
|
</div>
|
||||||
<el-table :data="filteredTodayBids" v-loading="loading" style="width: 100%">
|
<el-table :data="filteredTodayBids" v-loading="loading" style="width: 100%">
|
||||||
<el-table-column prop="title" label="Title">
|
<el-table-column prop="title" label="Title">
|
||||||
@@ -173,13 +188,14 @@
|
|||||||
</template>
|
</template>
|
||||||
|
|
||||||
<script setup lang="ts">
|
<script setup lang="ts">
|
||||||
import { ref, onMounted, reactive, computed, watch } from 'vue'
|
import { ref, onMounted, reactive, computed, watch, nextTick } from 'vue'
|
||||||
import axios from 'axios'
|
import axios from 'axios'
|
||||||
import { ElMessage } from 'element-plus'
|
import { ElMessage } from 'element-plus'
|
||||||
import { DataBoard, Document, Setting, Refresh } from '@element-plus/icons-vue'
|
import { DataBoard, Document, Setting, Refresh } from '@element-plus/icons-vue'
|
||||||
|
|
||||||
const activeIndex = ref('1')
|
const activeIndex = ref('1')
|
||||||
const bids = ref<any[]>([])
|
const bids = ref<any[]>([])
|
||||||
|
const todayBids = ref<any[]>([])
|
||||||
const highPriorityBids = ref<any[]>([])
|
const highPriorityBids = ref<any[]>([])
|
||||||
const keywords = ref<any[]>([])
|
const keywords = ref<any[]>([])
|
||||||
const loading = ref(false)
|
const loading = ref(false)
|
||||||
@@ -192,6 +208,7 @@ const total = ref(0)
|
|||||||
const sourceOptions = ref<string[]>([])
|
const sourceOptions = ref<string[]>([])
|
||||||
const isCrawling = ref(false)
|
const isCrawling = ref(false)
|
||||||
const selectedKeywords = ref<string[]>([])
|
const selectedKeywords = ref<string[]>([])
|
||||||
|
const dateRange = ref<[string, string] | null>(null)
|
||||||
|
|
||||||
// 从 localStorage 加载保存的关键字
|
// 从 localStorage 加载保存的关键字
|
||||||
const loadSavedKeywords = () => {
|
const loadSavedKeywords = () => {
|
||||||
@@ -210,6 +227,16 @@ watch(selectedKeywords, (newKeywords) => {
|
|||||||
localStorage.setItem('selectedKeywords', JSON.stringify(newKeywords))
|
localStorage.setItem('selectedKeywords', JSON.stringify(newKeywords))
|
||||||
}, { deep: true })
|
}, { deep: true })
|
||||||
|
|
||||||
|
// 监听日期范围变化并显示提示
|
||||||
|
watch(dateRange, () => {
|
||||||
|
const totalBids = bids.value.length
|
||||||
|
const filteredCount = filteredTodayBids.value.length
|
||||||
|
|
||||||
|
if (totalBids > 0 && filteredCount < totalBids) {
|
||||||
|
ElMessage.info(`筛选结果:共 ${filteredCount} 条数据(总共 ${totalBids} 条)`)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
|
||||||
const form = reactive({
|
const form = reactive({
|
||||||
word: '',
|
word: '',
|
||||||
weight: 1
|
weight: 1
|
||||||
@@ -241,24 +268,134 @@ const handleSizeChange = (size: number) => {
|
|||||||
fetchData()
|
fetchData()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// 设置日期范围为最近3天
|
||||||
|
const setLast3Days = () => {
|
||||||
|
const endDate = new Date()
|
||||||
|
const startDate = new Date()
|
||||||
|
startDate.setDate(startDate.getDate() - 2) // 最近3天(包括今天)
|
||||||
|
|
||||||
|
const formatDateForPicker = (date: Date) => {
|
||||||
|
const year = date.getFullYear()
|
||||||
|
const month = String(date.getMonth() + 1).padStart(2, '0')
|
||||||
|
const day = String(date.getDate()).padStart(2, '0')
|
||||||
|
return `${year}-${month}-${day}`
|
||||||
|
}
|
||||||
|
|
||||||
|
dateRange.value = [formatDateForPicker(startDate), formatDateForPicker(endDate)]
|
||||||
|
|
||||||
|
console.log('setLast3Days called, todayBids:', todayBids.value.length, 'dateRange:', dateRange.value)
|
||||||
|
|
||||||
|
// 直接计算筛选结果并显示提示
|
||||||
|
const start = new Date(startDate)
|
||||||
|
start.setHours(0, 0, 0, 0)
|
||||||
|
const end = new Date(endDate)
|
||||||
|
end.setHours(23, 59, 59, 999)
|
||||||
|
|
||||||
|
let result = todayBids.value
|
||||||
|
result = result.filter(bid => {
|
||||||
|
if (!bid.publishDate) return false
|
||||||
|
const bidDate = new Date(bid.publishDate)
|
||||||
|
return bidDate >= start && bidDate <= end
|
||||||
|
})
|
||||||
|
|
||||||
|
const totalBids = todayBids.value.length
|
||||||
|
const filteredCount = result.length
|
||||||
|
|
||||||
|
console.log('setLast3Days result, totalBids:', totalBids, 'filteredCount:', filteredCount)
|
||||||
|
if (totalBids > 0) {
|
||||||
|
ElMessage.info(`筛选结果:共 ${filteredCount} 条数据(总共 ${totalBids} 条)`)
|
||||||
|
} else {
|
||||||
|
ElMessage.warning('暂无数据,请先抓取数据')
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// 设置日期范围为最近7天
|
||||||
|
const setLast7Days = () => {
|
||||||
|
const endDate = new Date()
|
||||||
|
const startDate = new Date()
|
||||||
|
startDate.setDate(startDate.getDate() - 6) // 最近7天(包括今天)
|
||||||
|
|
||||||
|
const formatDateForPicker = (date: Date) => {
|
||||||
|
const year = date.getFullYear()
|
||||||
|
const month = String(date.getMonth() + 1).padStart(2, '0')
|
||||||
|
const day = String(date.getDate()).padStart(2, '0')
|
||||||
|
return `${year}-${month}-${day}`
|
||||||
|
}
|
||||||
|
|
||||||
|
dateRange.value = [formatDateForPicker(startDate), formatDateForPicker(endDate)]
|
||||||
|
|
||||||
|
console.log('setLast7Days called, todayBids:', todayBids.value.length, 'dateRange:', dateRange.value)
|
||||||
|
|
||||||
|
// 直接计算筛选结果并显示提示
|
||||||
|
const start = new Date(startDate)
|
||||||
|
start.setHours(0, 0, 0, 0)
|
||||||
|
const end = new Date(endDate)
|
||||||
|
end.setHours(23, 59, 59, 999)
|
||||||
|
|
||||||
|
let result = todayBids.value
|
||||||
|
result = result.filter(bid => {
|
||||||
|
if (!bid.publishDate) return false
|
||||||
|
const bidDate = new Date(bid.publishDate)
|
||||||
|
return bidDate >= start && bidDate <= end
|
||||||
|
})
|
||||||
|
|
||||||
|
const totalBids = todayBids.value.length
|
||||||
|
const filteredCount = result.length
|
||||||
|
|
||||||
|
console.log('setLast7Days result, totalBids:', totalBids, 'filteredCount:', filteredCount)
|
||||||
|
if (totalBids > 0) {
|
||||||
|
ElMessage.info(`筛选结果:共 ${filteredCount} 条数据(总共 ${totalBids} 条)`)
|
||||||
|
} else {
|
||||||
|
ElMessage.warning('暂无数据,请先抓取数据')
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
const formatDate = (dateString: string) => {
|
const formatDate = (dateString: string) => {
|
||||||
if (!dateString) return '-'
|
if (!dateString) return '-'
|
||||||
return new Date(dateString).toLocaleDateString()
|
return new Date(dateString).toLocaleDateString()
|
||||||
}
|
}
|
||||||
|
|
||||||
// 过滤 Today's Bids,只显示包含所选关键字的项目
|
// 过滤 Today's Bids,只显示包含所选关键字的项目,并且在日期范围内
|
||||||
const filteredTodayBids = computed(() => {
|
const filteredTodayBids = computed(() => {
|
||||||
if (selectedKeywords.value.length === 0) {
|
let result = todayBids.value
|
||||||
return bids.value
|
|
||||||
|
// 按关键字筛选
|
||||||
|
if (selectedKeywords.value.length > 0) {
|
||||||
|
result = result.filter(bid => {
|
||||||
|
return selectedKeywords.value.some(keyword =>
|
||||||
|
bid.title.toLowerCase().includes(keyword.toLowerCase())
|
||||||
|
)
|
||||||
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
return bids.value.filter(bid => {
|
// 按日期范围筛选
|
||||||
return selectedKeywords.value.some(keyword =>
|
if (dateRange.value && dateRange.value.length === 2) {
|
||||||
bid.title.toLowerCase().includes(keyword.toLowerCase())
|
const [startDate, endDate] = dateRange.value
|
||||||
)
|
result = result.filter(bid => {
|
||||||
})
|
if (!bid.publishDate) return false
|
||||||
|
const bidDate = new Date(bid.publishDate)
|
||||||
|
const start = new Date(startDate)
|
||||||
|
const end = new Date(endDate)
|
||||||
|
// 设置时间为当天的开始和结束
|
||||||
|
start.setHours(0, 0, 0, 0)
|
||||||
|
end.setHours(23, 59, 59, 999)
|
||||||
|
return bidDate >= start && bidDate <= end
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
return result
|
||||||
})
|
})
|
||||||
|
|
||||||
|
// 监听筛选结果变化并显示提示
|
||||||
|
watch(filteredTodayBids, (newFilteredBids) => {
|
||||||
|
const totalBids = todayBids.value.length
|
||||||
|
const filteredCount = newFilteredBids.length
|
||||||
|
|
||||||
|
if (totalBids > 0 && filteredCount < totalBids) {
|
||||||
|
ElMessage.info(`筛选结果:共 ${filteredCount} 条数据(总共 ${totalBids} 条)`)
|
||||||
|
}
|
||||||
|
}, { deep: true })
|
||||||
|
|
||||||
const fetchData = async () => {
|
const fetchData = async () => {
|
||||||
loading.value = true
|
loading.value = true
|
||||||
try {
|
try {
|
||||||
@@ -281,6 +418,15 @@ const fetchData = async () => {
|
|||||||
keywords.value = kwRes.data
|
keywords.value = kwRes.data
|
||||||
sourceOptions.value = sourcesRes.data
|
sourceOptions.value = sourcesRes.data
|
||||||
isCrawling.value = statusRes.data.isCrawling
|
isCrawling.value = statusRes.data.isCrawling
|
||||||
|
|
||||||
|
// 过滤今天的数据用于 Today's Bids
|
||||||
|
const today = new Date()
|
||||||
|
today.setHours(0, 0, 0, 0)
|
||||||
|
todayBids.value = bidsRes.data.items.filter((bid: any) => {
|
||||||
|
if (!bid.publishDate) return false
|
||||||
|
const bidDate = new Date(bid.publishDate)
|
||||||
|
return bidDate >= today
|
||||||
|
})
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
ElMessage.error('Failed to fetch data')
|
ElMessage.error('Failed to fetch data')
|
||||||
} finally {
|
} finally {
|
||||||
|
|||||||
@@ -17,7 +17,8 @@
|
|||||||
"test:watch": "jest --watch",
|
"test:watch": "jest --watch",
|
||||||
"test:cov": "jest --coverage",
|
"test:cov": "jest --coverage",
|
||||||
"test:debug": "node --inspect-brk -r tsconfig-paths/register -r ts-node/register node_modules/.bin/jest --runInBand",
|
"test:debug": "node --inspect-brk -r tsconfig-paths/register -r ts-node/register node_modules/.bin/jest --runInBand",
|
||||||
"test:e2e": "jest --config ./test/jest-e2e.json"
|
"test:e2e": "jest --config ./test/jest-e2e.json",
|
||||||
|
"crawl": "ts-node -r tsconfig-paths/register src/scripts/crawl.ts"
|
||||||
},
|
},
|
||||||
"dependencies": {
|
"dependencies": {
|
||||||
"@nestjs/common": "^11.0.1",
|
"@nestjs/common": "^11.0.1",
|
||||||
@@ -32,9 +33,13 @@
|
|||||||
"class-validator": "^0.14.3",
|
"class-validator": "^0.14.3",
|
||||||
"mysql2": "^3.16.0",
|
"mysql2": "^3.16.0",
|
||||||
"puppeteer": "^24.34.0",
|
"puppeteer": "^24.34.0",
|
||||||
|
"puppeteer-extra": "^3.3.6",
|
||||||
|
"puppeteer-extra-plugin-stealth": "^2.11.2",
|
||||||
"reflect-metadata": "^0.2.2",
|
"reflect-metadata": "^0.2.2",
|
||||||
"rxjs": "^7.8.1",
|
"rxjs": "^7.8.1",
|
||||||
"typeorm": "^0.3.28"
|
"typeorm": "^0.3.28",
|
||||||
|
"winston": "^3.19.0",
|
||||||
|
"winston-daily-rotate-file": "^5.0.0"
|
||||||
},
|
},
|
||||||
"devDependencies": {
|
"devDependencies": {
|
||||||
"@eslint/eslintrc": "^3.2.0",
|
"@eslint/eslintrc": "^3.2.0",
|
||||||
|
|||||||
@@ -8,6 +8,7 @@ import { BidsModule } from './bids/bids.module';
|
|||||||
import { KeywordsModule } from './keywords/keywords.module';
|
import { KeywordsModule } from './keywords/keywords.module';
|
||||||
import { CrawlerModule } from './crawler/crawler.module';
|
import { CrawlerModule } from './crawler/crawler.module';
|
||||||
import { TasksModule } from './schedule/schedule.module';
|
import { TasksModule } from './schedule/schedule.module';
|
||||||
|
import { LoggerModule } from './common/logger/logger.module';
|
||||||
|
|
||||||
@Module({
|
@Module({
|
||||||
imports: [
|
imports: [
|
||||||
@@ -17,6 +18,7 @@ import { TasksModule } from './schedule/schedule.module';
|
|||||||
rootPath: join(__dirname, '..', 'frontend', 'dist'),
|
rootPath: join(__dirname, '..', 'frontend', 'dist'),
|
||||||
exclude: ['/api*'],
|
exclude: ['/api*'],
|
||||||
}),
|
}),
|
||||||
|
LoggerModule,
|
||||||
DatabaseModule,
|
DatabaseModule,
|
||||||
BidsModule,
|
BidsModule,
|
||||||
KeywordsModule,
|
KeywordsModule,
|
||||||
|
|||||||
9
src/common/logger/logger.module.ts
Normal file
9
src/common/logger/logger.module.ts
Normal file
@@ -0,0 +1,9 @@
|
|||||||
|
import { Module, Global } from '@nestjs/common';
|
||||||
|
import { CustomLogger } from './logger.service';
|
||||||
|
|
||||||
|
@Global()
|
||||||
|
@Module({
|
||||||
|
providers: [CustomLogger],
|
||||||
|
exports: [CustomLogger],
|
||||||
|
})
|
||||||
|
export class LoggerModule {}
|
||||||
31
src/common/logger/logger.service.ts
Normal file
31
src/common/logger/logger.service.ts
Normal file
@@ -0,0 +1,31 @@
|
|||||||
|
import { Injectable, LoggerService, Scope } from '@nestjs/common';
|
||||||
|
import { winstonLogger } from './winston.config';
|
||||||
|
|
||||||
|
@Injectable({ scope: Scope.TRANSIENT })
|
||||||
|
export class CustomLogger implements LoggerService {
|
||||||
|
private context?: string;
|
||||||
|
|
||||||
|
setContext(context: string) {
|
||||||
|
this.context = context;
|
||||||
|
}
|
||||||
|
|
||||||
|
log(message: any, context?: string) {
|
||||||
|
winstonLogger.info(message, { context: context || this.context });
|
||||||
|
}
|
||||||
|
|
||||||
|
error(message: any, trace?: string, context?: string) {
|
||||||
|
winstonLogger.error(message, { context: context || this.context, trace });
|
||||||
|
}
|
||||||
|
|
||||||
|
warn(message: any, context?: string) {
|
||||||
|
winstonLogger.warn(message, { context: context || this.context });
|
||||||
|
}
|
||||||
|
|
||||||
|
debug(message: any, context?: string) {
|
||||||
|
winstonLogger.debug(message, { context: context || this.context });
|
||||||
|
}
|
||||||
|
|
||||||
|
verbose(message: any, context?: string) {
|
||||||
|
winstonLogger.verbose(message, { context: context || this.context });
|
||||||
|
}
|
||||||
|
}
|
||||||
64
src/common/logger/winston.config.ts
Normal file
64
src/common/logger/winston.config.ts
Normal file
@@ -0,0 +1,64 @@
|
|||||||
|
import * as winston from 'winston';
|
||||||
|
import DailyRotateFile from 'winston-daily-rotate-file';
|
||||||
|
import * as path from 'path';
|
||||||
|
|
||||||
|
const logDir = path.join(process.cwd(), 'logs');
|
||||||
|
|
||||||
|
// 日志格式
|
||||||
|
const logFormat = winston.format.combine(
|
||||||
|
winston.format.timestamp({ format: 'YYYY-MM-DD HH:mm:ss' }),
|
||||||
|
winston.format.errors({ stack: true }),
|
||||||
|
winston.format.splat(),
|
||||||
|
winston.format.printf(({ timestamp, level, message, context, stack }) => {
|
||||||
|
let log = `${timestamp} [${level}]`;
|
||||||
|
if (context) {
|
||||||
|
log += ` [${context}]`;
|
||||||
|
}
|
||||||
|
log += ` ${message}`;
|
||||||
|
if (stack) {
|
||||||
|
log += `\n${stack}`;
|
||||||
|
}
|
||||||
|
return log;
|
||||||
|
}),
|
||||||
|
);
|
||||||
|
|
||||||
|
// 控制台传输
|
||||||
|
const consoleTransport = new winston.transports.Console({
|
||||||
|
format: winston.format.combine(
|
||||||
|
winston.format.colorize(),
|
||||||
|
logFormat,
|
||||||
|
),
|
||||||
|
});
|
||||||
|
|
||||||
|
// 应用日志传输(按天轮转)
|
||||||
|
const appLogTransport = new DailyRotateFile({
|
||||||
|
dirname: logDir,
|
||||||
|
filename: 'application-%DATE%.log',
|
||||||
|
datePattern: 'YYYY-MM-DD',
|
||||||
|
maxSize: '20m',
|
||||||
|
maxFiles: '30d',
|
||||||
|
format: logFormat,
|
||||||
|
});
|
||||||
|
|
||||||
|
// 错误日志传输(按天轮转)
|
||||||
|
const errorLogTransport = new DailyRotateFile({
|
||||||
|
dirname: logDir,
|
||||||
|
filename: 'error-%DATE%.log',
|
||||||
|
datePattern: 'YYYY-MM-DD',
|
||||||
|
level: 'error',
|
||||||
|
maxSize: '20m',
|
||||||
|
maxFiles: '30d',
|
||||||
|
format: logFormat,
|
||||||
|
});
|
||||||
|
|
||||||
|
// 创建 winston logger 实例
|
||||||
|
export const winstonLogger = winston.createLogger({
|
||||||
|
level: process.env.LOG_LEVEL || 'info',
|
||||||
|
format: logFormat,
|
||||||
|
transports: [
|
||||||
|
consoleTransport,
|
||||||
|
appLogTransport,
|
||||||
|
errorLogTransport,
|
||||||
|
],
|
||||||
|
exitOnError: false,
|
||||||
|
});
|
||||||
@@ -25,10 +25,13 @@ export class BidCrawlerService {
|
|||||||
async crawlAll() {
|
async crawlAll() {
|
||||||
this.logger.log('Starting crawl task with Puppeteer...');
|
this.logger.log('Starting crawl task with Puppeteer...');
|
||||||
|
|
||||||
// 设置最大执行时间为1小时
|
// 设置最大执行时间为3小时
|
||||||
const maxExecutionTime = 60 * 60 * 1000; // 1小时(毫秒)
|
const maxExecutionTime = 3 * 60 * 60 * 1000; // 3小时(毫秒)
|
||||||
const startTime = Date.now();
|
const startTime = Date.now();
|
||||||
|
|
||||||
|
// 统计结果
|
||||||
|
const crawlResults: Record<string, { success: number; error?: string }> = {};
|
||||||
|
|
||||||
// 从环境变量读取代理配置
|
// 从环境变量读取代理配置
|
||||||
const proxyHost = this.configService.get<string>('PROXY_HOST');
|
const proxyHost = this.configService.get<string>('PROXY_HOST');
|
||||||
const proxyPort = this.configService.get<string>('PROXY_PORT');
|
const proxyPort = this.configService.get<string>('PROXY_PORT');
|
||||||
@@ -68,7 +71,7 @@ export class BidCrawlerService {
|
|||||||
// 检查是否超时
|
// 检查是否超时
|
||||||
const elapsedTime = Date.now() - startTime;
|
const elapsedTime = Date.now() - startTime;
|
||||||
if (elapsedTime > maxExecutionTime) {
|
if (elapsedTime > maxExecutionTime) {
|
||||||
this.logger.warn(`⚠️ Crawl task exceeded maximum execution time of 1 hour. Stopping...`);
|
this.logger.warn(`⚠️ Crawl task exceeded maximum execution time of 3 hours. Stopping...`);
|
||||||
this.logger.warn(`⚠️ Total elapsed time: ${Math.floor(elapsedTime / 1000 / 60)} minutes`);
|
this.logger.warn(`⚠️ Total elapsed time: ${Math.floor(elapsedTime / 1000 / 60)} minutes`);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
@@ -77,6 +80,9 @@ export class BidCrawlerService {
|
|||||||
const results = await crawler.crawl(browser);
|
const results = await crawler.crawl(browser);
|
||||||
this.logger.log(`Extracted ${results.length} items from ${crawler.name}`);
|
this.logger.log(`Extracted ${results.length} items from ${crawler.name}`);
|
||||||
|
|
||||||
|
// 记录成功数量
|
||||||
|
crawlResults[crawler.name] = { success: results.length };
|
||||||
|
|
||||||
for (const item of results) {
|
for (const item of results) {
|
||||||
await this.bidsService.createOrUpdate({
|
await this.bidsService.createOrUpdate({
|
||||||
title: item.title,
|
title: item.title,
|
||||||
@@ -88,6 +94,8 @@ export class BidCrawlerService {
|
|||||||
}
|
}
|
||||||
} catch (err) {
|
} catch (err) {
|
||||||
this.logger.error(`Error crawling ${crawler.name}: ${err.message}`);
|
this.logger.error(`Error crawling ${crawler.name}: ${err.message}`);
|
||||||
|
// 记录错误信息
|
||||||
|
crawlResults[crawler.name] = { success: 0, error: err.message };
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
@@ -100,8 +108,31 @@ export class BidCrawlerService {
|
|||||||
this.logger.log(`Crawl task finished. Total time: ${minutes} minutes`);
|
this.logger.log(`Crawl task finished. Total time: ${minutes} minutes`);
|
||||||
|
|
||||||
if (totalTime > maxExecutionTime) {
|
if (totalTime > maxExecutionTime) {
|
||||||
this.logger.warn(`⚠️ Crawl task exceeded maximum execution time of 1 hour.`);
|
this.logger.warn(`⚠️ Crawl task exceeded maximum execution time of 3 hours.`);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// 输出统计总结
|
||||||
|
this.logger.log('='.repeat(50));
|
||||||
|
this.logger.log('爬虫执行总结 / Crawl Summary');
|
||||||
|
this.logger.log('='.repeat(50));
|
||||||
|
|
||||||
|
let totalSuccess = 0;
|
||||||
|
let errorCount = 0;
|
||||||
|
|
||||||
|
for (const [source, result] of Object.entries(crawlResults)) {
|
||||||
|
if (result.error) {
|
||||||
|
this.logger.error(`❌ ${source}: 出错 - ${result.error}`);
|
||||||
|
errorCount++;
|
||||||
|
} else {
|
||||||
|
this.logger.log(`✅ ${source}: 成功获取 ${result.success} 条工程信息`);
|
||||||
|
totalSuccess += result.success;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
this.logger.log('='.repeat(50));
|
||||||
|
this.logger.log(`总计: ${totalSuccess} 条工程信息, ${errorCount} 个来源出错`);
|
||||||
|
this.logger.log(`Total: ${totalSuccess} items, ${errorCount} sources failed`);
|
||||||
|
this.logger.log('='.repeat(50));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,72 +0,0 @@
|
|||||||
import { chromium } from 'playwright';
|
|
||||||
import { ChngCrawler } from './chng_target';
|
|
||||||
|
|
||||||
jest.setTimeout(120000);
|
|
||||||
|
|
||||||
describe('ChngCrawler Playwright Test', () => {
|
|
||||||
let browser;
|
|
||||||
|
|
||||||
beforeAll(async () => {
|
|
||||||
browser = await chromium.launch({
|
|
||||||
headless: false,
|
|
||||||
args: ['--no-sandbox', '--disable-setuid-sandbox']
|
|
||||||
});
|
|
||||||
});
|
|
||||||
|
|
||||||
afterAll(async () => {
|
|
||||||
if (browser) {
|
|
||||||
await browser.close();
|
|
||||||
}
|
|
||||||
});
|
|
||||||
|
|
||||||
it('should visit the website and list all found bid information', async () => {
|
|
||||||
console.log(`
|
|
||||||
Starting crawl for: ${ChngCrawler.name}`);
|
|
||||||
console.log(`Target URL: ${ChngCrawler.url}`);
|
|
||||||
|
|
||||||
const context = await browser.newContext({
|
|
||||||
viewport: { width: 1920, height: 1080 },
|
|
||||||
userAgent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
|
|
||||||
});
|
|
||||||
|
|
||||||
const page = await context.newPage();
|
|
||||||
|
|
||||||
// Add stealth scripts if needed, but Playwright is often better at evasion
|
|
||||||
await page.addInitScript(() => {
|
|
||||||
Object.defineProperty(navigator, 'webdriver', { get: () => false });
|
|
||||||
});
|
|
||||||
|
|
||||||
await page.goto(ChngCrawler.url, { waitUntil: 'networkidle', timeout: 60000 });
|
|
||||||
|
|
||||||
// Wait for content
|
|
||||||
try {
|
|
||||||
await page.waitForSelector('.ant-table-row', { timeout: 30000 });
|
|
||||||
} catch (e) {
|
|
||||||
console.warn('Timed out waiting for .ant-table-row');
|
|
||||||
}
|
|
||||||
|
|
||||||
const content = await page.content();
|
|
||||||
|
|
||||||
// Reuse the extraction logic from the Crawler definition
|
|
||||||
const results = ChngCrawler.extract(content);
|
|
||||||
|
|
||||||
console.log(`
|
|
||||||
Successfully found ${results.length} items:
|
|
||||||
`);
|
|
||||||
console.log('----------------------------------------');
|
|
||||||
results.forEach((item, index) => {
|
|
||||||
console.log(`${index + 1}. [${item.publishDate.toLocaleDateString()}] ${item.title}`);
|
|
||||||
console.log(` Link: ${item.url}`);
|
|
||||||
console.log('----------------------------------------');
|
|
||||||
});
|
|
||||||
|
|
||||||
if (results.length === 0) {
|
|
||||||
console.warn('No items found. Debugging content length: ' + content.length);
|
|
||||||
if (content.length < 500) {
|
|
||||||
console.log('Content dump:', content);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
expect(Array.isArray(results)).toBeTruthy();
|
|
||||||
});
|
|
||||||
});
|
|
||||||
134
src/crawler/services/chng_target_stealth.spec.ts
Normal file
134
src/crawler/services/chng_target_stealth.spec.ts
Normal file
@@ -0,0 +1,134 @@
|
|||||||
|
import { ChngCrawler } from './chng_target';
|
||||||
|
import puppeteer from 'puppeteer-extra';
|
||||||
|
import StealthPlugin from 'puppeteer-extra-plugin-stealth';
|
||||||
|
import type { Browser, Page } from 'puppeteer';
|
||||||
|
|
||||||
|
// 使用 stealth 插件增强反爬虫能力
|
||||||
|
puppeteer.use(StealthPlugin());
|
||||||
|
|
||||||
|
// Increase timeout to 180 seconds for slow sites and stealth mode
|
||||||
|
jest.setTimeout(180000);
|
||||||
|
|
||||||
|
// 模拟人类鼠标移动
|
||||||
|
async function simulateHumanMouseMovement(page: Page) {
|
||||||
|
const viewport = page.viewport();
|
||||||
|
if (!viewport) return;
|
||||||
|
|
||||||
|
const movements = 5 + Math.floor(Math.random() * 5); // 5-10次随机移动
|
||||||
|
|
||||||
|
for (let i = 0; i < movements; i++) {
|
||||||
|
const x = Math.floor(Math.random() * viewport.width);
|
||||||
|
const y = Math.floor(Math.random() * viewport.height);
|
||||||
|
|
||||||
|
await page.mouse.move(x, y, {
|
||||||
|
steps: 10 + Math.floor(Math.random() * 20) // 10-30步,使移动更平滑
|
||||||
|
});
|
||||||
|
|
||||||
|
// 随机停顿 100-500ms
|
||||||
|
await new Promise(r => setTimeout(r, 100 + Math.random() * 400));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// 模拟人类滚动
|
||||||
|
async function simulateHumanScrolling(page: Page) {
|
||||||
|
const scrollCount = 3 + Math.floor(Math.random() * 5); // 3-7次滚动
|
||||||
|
|
||||||
|
for (let i = 0; i < scrollCount; i++) {
|
||||||
|
const scrollDistance = 100 + Math.floor(Math.random() * 400); // 100-500px
|
||||||
|
|
||||||
|
await page.evaluate((distance) => {
|
||||||
|
window.scrollBy({
|
||||||
|
top: distance,
|
||||||
|
behavior: 'smooth'
|
||||||
|
});
|
||||||
|
}, scrollDistance);
|
||||||
|
|
||||||
|
// 随机停顿 500-1500ms
|
||||||
|
await new Promise(r => setTimeout(r, 500 + Math.random() * 1000));
|
||||||
|
}
|
||||||
|
|
||||||
|
// 滚动回顶部
|
||||||
|
await page.evaluate(() => {
|
||||||
|
window.scrollTo({ top: 0, behavior: 'smooth' });
|
||||||
|
});
|
||||||
|
await new Promise(r => setTimeout(r, 1000));
|
||||||
|
}
|
||||||
|
|
||||||
|
describe('ChngCrawler Stealth Test (Headless Mode with Stealth Plugin)', () => {
|
||||||
|
let browser: Browser;
|
||||||
|
|
||||||
|
beforeAll(async () => {
|
||||||
|
browser = await puppeteer.launch({
|
||||||
|
headless: true, // 使用 headless 模式
|
||||||
|
args: [
|
||||||
|
'--no-sandbox',
|
||||||
|
'--disable-setuid-sandbox',
|
||||||
|
'--disable-blink-features=AutomationControlled',
|
||||||
|
'--window-size=1920,1080',
|
||||||
|
'--disable-infobars',
|
||||||
|
'--disable-dev-shm-usage',
|
||||||
|
'--disable-accelerated-2d-canvas',
|
||||||
|
'--no-first-run',
|
||||||
|
'--no-zygote',
|
||||||
|
'--disable-gpu',
|
||||||
|
'--disable-features=VizDisplayCompositor',
|
||||||
|
'--disable-webgl',
|
||||||
|
],
|
||||||
|
defaultViewport: null
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
afterAll(async () => {
|
||||||
|
if (browser) {
|
||||||
|
await browser.close();
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
it('should visit the website and list all found bid information with stealth plugin', async () => {
|
||||||
|
// 为此测试单独设置更长的超时时间
|
||||||
|
jest.setTimeout(180000);
|
||||||
|
console.log(`
|
||||||
|
Starting crawl for: ${ChngCrawler.name}`);
|
||||||
|
console.log(`Target URL: ${ChngCrawler.url}`);
|
||||||
|
console.log('Using puppeteer-extra-plugin-stealth for anti-detection');
|
||||||
|
console.log('Running in headless mode');
|
||||||
|
|
||||||
|
// 创建一个临时页面用于模拟人类行为
|
||||||
|
const tempPage = await browser.newPage();
|
||||||
|
await tempPage.setViewport({ width: 1920, height: 1080, deviceScaleFactor: 1 });
|
||||||
|
|
||||||
|
// 模拟人类鼠标移动
|
||||||
|
console.log('Simulating human mouse movements...');
|
||||||
|
await simulateHumanMouseMovement(tempPage);
|
||||||
|
|
||||||
|
// 模拟人类滚动
|
||||||
|
console.log('Simulating human scrolling...');
|
||||||
|
await simulateHumanScrolling(tempPage);
|
||||||
|
|
||||||
|
await tempPage.close();
|
||||||
|
|
||||||
|
const results = await ChngCrawler.crawl(browser);
|
||||||
|
|
||||||
|
console.log(`
|
||||||
|
Successfully found ${results.length} items:
|
||||||
|
`);
|
||||||
|
console.log('----------------------------------------');
|
||||||
|
results.forEach((item, index) => {
|
||||||
|
console.log(`${index + 1}. [${item.publishDate.toLocaleDateString()}] ${item.title}`);
|
||||||
|
console.log(` Link: ${item.url}`);
|
||||||
|
console.log('----------------------------------------');
|
||||||
|
});
|
||||||
|
|
||||||
|
expect(results).toBeDefined();
|
||||||
|
expect(Array.isArray(results)).toBeTruthy();
|
||||||
|
|
||||||
|
if (results.length === 0) {
|
||||||
|
console.warn('Warning: No items found. The site might have detected the crawler or content is not loading properly.');
|
||||||
|
} else {
|
||||||
|
const firstItem = results[0];
|
||||||
|
expect(firstItem.title).toBeTruthy();
|
||||||
|
expect(firstItem.url).toMatch(/^https?:\/\//);
|
||||||
|
expect(firstItem.publishDate).toBeInstanceOf(Date);
|
||||||
|
}
|
||||||
|
});
|
||||||
|
});
|
||||||
@@ -1,26 +1,15 @@
|
|||||||
import { Injectable, OnModuleInit } from '@nestjs/common';
|
import { Injectable } from '@nestjs/common';
|
||||||
import { InjectRepository } from '@nestjs/typeorm';
|
import { InjectRepository } from '@nestjs/typeorm';
|
||||||
import { Repository } from 'typeorm';
|
import { Repository } from 'typeorm';
|
||||||
import { Keyword } from './keyword.entity';
|
import { Keyword } from './keyword.entity';
|
||||||
|
|
||||||
@Injectable()
|
@Injectable()
|
||||||
export class KeywordsService implements OnModuleInit {
|
export class KeywordsService {
|
||||||
constructor(
|
constructor(
|
||||||
@InjectRepository(Keyword)
|
@InjectRepository(Keyword)
|
||||||
private keywordRepository: Repository<Keyword>,
|
private keywordRepository: Repository<Keyword>,
|
||||||
) {}
|
) {}
|
||||||
|
|
||||||
async onModuleInit() {
|
|
||||||
// 初始预设关键词
|
|
||||||
const defaultKeywords = ["山东", "海", "建设", "工程", "采购"];
|
|
||||||
for (const word of defaultKeywords) {
|
|
||||||
const exists = await this.keywordRepository.findOne({ where: { word } });
|
|
||||||
if (!exists) {
|
|
||||||
await this.keywordRepository.save({ word, weight: 1 });
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
findAll() {
|
findAll() {
|
||||||
return this.keywordRepository.find();
|
return this.keywordRepository.find();
|
||||||
}
|
}
|
||||||
|
|||||||
25
src/scripts/crawl.ts
Normal file
25
src/scripts/crawl.ts
Normal file
@@ -0,0 +1,25 @@
|
|||||||
|
import { NestFactory } from '@nestjs/core';
|
||||||
|
import { AppModule } from '../app.module';
|
||||||
|
import { BidCrawlerService } from '../crawler/services/bid-crawler.service';
|
||||||
|
import { Logger } from '@nestjs/common';
|
||||||
|
|
||||||
|
async function runCrawler() {
|
||||||
|
const logger = new Logger('CrawlScript');
|
||||||
|
|
||||||
|
try {
|
||||||
|
const app = await NestFactory.createApplicationContext(AppModule);
|
||||||
|
const crawlerService = app.get(BidCrawlerService);
|
||||||
|
|
||||||
|
logger.log('Starting crawler...');
|
||||||
|
await crawlerService.crawlAll();
|
||||||
|
logger.log('Crawler completed successfully');
|
||||||
|
|
||||||
|
await app.close();
|
||||||
|
process.exit(0);
|
||||||
|
} catch (error) {
|
||||||
|
logger.error('Crawler failed:', error);
|
||||||
|
process.exit(1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
runCrawler();
|
||||||
Reference in New Issue
Block a user