feat: 全面优化爬虫系统和数据处理能力
- 增强数据重试机制:对数据为0的爬虫自动重试,提高数据完整性
- 优化前端筛选逻辑:改进日期筛选,只限制开始时间,更灵活的数据查看
- 新增最近数据接口:添加 /api/bids/recent 获取30天内最新招标数据
- 改进统计展示:实时显示筛选结果数量,优化用户体验
- 完善日志系统:确保日志目录自动创建,避免启动错误
- 增强独立脚本:使用自定义logger,完善错误处理和程序关闭
- 优化主程序:集成自定义日志服务,统一日志格式
- 扩展npm脚本:新增 web 命令用于构建前端
- 改进大唐爬虫:延长等待时间到60秒,提高页面加载成功率
- 优化数据筛选:今日招标改为使用独立接口,提升性能
This commit is contained in:
@@ -4,7 +4,7 @@
|
|||||||
<meta charset="UTF-8" />
|
<meta charset="UTF-8" />
|
||||||
<link rel="icon" type="image/svg+xml" href="/vite.svg" />
|
<link rel="icon" type="image/svg+xml" href="/vite.svg" />
|
||||||
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
|
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
|
||||||
<title>frontend</title>
|
<title>投标</title>
|
||||||
</head>
|
</head>
|
||||||
<body>
|
<body>
|
||||||
<div id="app"></div>
|
<div id="app"></div>
|
||||||
|
|||||||
@@ -188,7 +188,7 @@
|
|||||||
</template>
|
</template>
|
||||||
|
|
||||||
<script setup lang="ts">
|
<script setup lang="ts">
|
||||||
import { ref, onMounted, reactive, computed, watch, nextTick } from 'vue'
|
import { ref, onMounted, reactive, computed, watch } from 'vue'
|
||||||
import axios from 'axios'
|
import axios from 'axios'
|
||||||
import { ElMessage } from 'element-plus'
|
import { ElMessage } from 'element-plus'
|
||||||
import { DataBoard, Document, Setting, Refresh } from '@element-plus/icons-vue'
|
import { DataBoard, Document, Setting, Refresh } from '@element-plus/icons-vue'
|
||||||
@@ -285,26 +285,22 @@ const setLast3Days = () => {
|
|||||||
|
|
||||||
console.log('setLast3Days called, todayBids:', todayBids.value.length, 'dateRange:', dateRange.value)
|
console.log('setLast3Days called, todayBids:', todayBids.value.length, 'dateRange:', dateRange.value)
|
||||||
|
|
||||||
// 直接计算筛选结果并显示提示
|
// 直接计算筛选结果并显示提示(只限制开始时间,不限制结束时间)
|
||||||
const start = new Date(startDate)
|
const start = new Date(startDate)
|
||||||
start.setHours(0, 0, 0, 0)
|
start.setHours(0, 0, 0, 0)
|
||||||
const end = new Date(endDate)
|
|
||||||
end.setHours(23, 59, 59, 999)
|
|
||||||
|
|
||||||
let result = todayBids.value
|
let result = todayBids.value
|
||||||
result = result.filter(bid => {
|
result = result.filter(bid => {
|
||||||
if (!bid.publishDate) return false
|
if (!bid.publishDate) return false
|
||||||
const bidDate = new Date(bid.publishDate)
|
const bidDate = new Date(bid.publishDate)
|
||||||
return bidDate >= start && bidDate <= end
|
return bidDate >= start
|
||||||
})
|
})
|
||||||
|
|
||||||
const totalBids = todayBids.value.length
|
const totalBids = todayBids.value.length
|
||||||
const filteredCount = result.length
|
const filteredCount = result.length
|
||||||
|
|
||||||
console.log('setLast3Days result, totalBids:', totalBids, 'filteredCount:', filteredCount)
|
console.log('setLast3Days result, totalBids:', totalBids, 'filteredCount:', filteredCount)
|
||||||
if (totalBids > 0) {
|
if (totalBids === 0) {
|
||||||
ElMessage.info(`筛选结果:共 ${filteredCount} 条数据(总共 ${totalBids} 条)`)
|
|
||||||
} else {
|
|
||||||
ElMessage.warning('暂无数据,请先抓取数据')
|
ElMessage.warning('暂无数据,请先抓取数据')
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -326,26 +322,22 @@ const setLast7Days = () => {
|
|||||||
|
|
||||||
console.log('setLast7Days called, todayBids:', todayBids.value.length, 'dateRange:', dateRange.value)
|
console.log('setLast7Days called, todayBids:', todayBids.value.length, 'dateRange:', dateRange.value)
|
||||||
|
|
||||||
// 直接计算筛选结果并显示提示
|
// 直接计算筛选结果并显示提示(只限制开始时间,不限制结束时间)
|
||||||
const start = new Date(startDate)
|
const start = new Date(startDate)
|
||||||
start.setHours(0, 0, 0, 0)
|
start.setHours(0, 0, 0, 0)
|
||||||
const end = new Date(endDate)
|
|
||||||
end.setHours(23, 59, 59, 999)
|
|
||||||
|
|
||||||
let result = todayBids.value
|
let result = todayBids.value
|
||||||
result = result.filter(bid => {
|
result = result.filter(bid => {
|
||||||
if (!bid.publishDate) return false
|
if (!bid.publishDate) return false
|
||||||
const bidDate = new Date(bid.publishDate)
|
const bidDate = new Date(bid.publishDate)
|
||||||
return bidDate >= start && bidDate <= end
|
return bidDate >= start
|
||||||
})
|
})
|
||||||
|
|
||||||
const totalBids = todayBids.value.length
|
const totalBids = todayBids.value.length
|
||||||
const filteredCount = result.length
|
const filteredCount = result.length
|
||||||
|
|
||||||
console.log('setLast7Days result, totalBids:', totalBids, 'filteredCount:', filteredCount)
|
console.log('setLast7Days result, totalBids:', totalBids, 'filteredCount:', filteredCount)
|
||||||
if (totalBids > 0) {
|
if (totalBids === 0) {
|
||||||
ElMessage.info(`筛选结果:共 ${filteredCount} 条数据(总共 ${totalBids} 条)`)
|
|
||||||
} else {
|
|
||||||
ElMessage.warning('暂无数据,请先抓取数据')
|
ElMessage.warning('暂无数据,请先抓取数据')
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -368,18 +360,16 @@ const filteredTodayBids = computed(() => {
|
|||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
// 按日期范围筛选
|
// 按日期范围筛选(只限制开始时间,不限制结束时间)
|
||||||
if (dateRange.value && dateRange.value.length === 2) {
|
if (dateRange.value && dateRange.value.length === 2) {
|
||||||
const [startDate, endDate] = dateRange.value
|
const [startDate] = dateRange.value
|
||||||
result = result.filter(bid => {
|
result = result.filter(bid => {
|
||||||
if (!bid.publishDate) return false
|
if (!bid.publishDate) return false
|
||||||
const bidDate = new Date(bid.publishDate)
|
const bidDate = new Date(bid.publishDate)
|
||||||
const start = new Date(startDate)
|
const start = new Date(startDate)
|
||||||
const end = new Date(endDate)
|
// 设置时间为当天的开始
|
||||||
// 设置时间为当天的开始和结束
|
|
||||||
start.setHours(0, 0, 0, 0)
|
start.setHours(0, 0, 0, 0)
|
||||||
end.setHours(23, 59, 59, 999)
|
return bidDate >= start
|
||||||
return bidDate >= start && bidDate <= end
|
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -399,7 +389,7 @@ watch(filteredTodayBids, (newFilteredBids) => {
|
|||||||
const fetchData = async () => {
|
const fetchData = async () => {
|
||||||
loading.value = true
|
loading.value = true
|
||||||
try {
|
try {
|
||||||
const [bidsRes, highRes, kwRes, sourcesRes, statusRes] = await Promise.all([
|
const [bidsRes, recentRes, highRes, kwRes, sourcesRes, statusRes] = await Promise.all([
|
||||||
axios.get('/api/bids', {
|
axios.get('/api/bids', {
|
||||||
params: {
|
params: {
|
||||||
page: currentPage.value,
|
page: currentPage.value,
|
||||||
@@ -407,6 +397,7 @@ const fetchData = async () => {
|
|||||||
source: selectedSource.value || undefined
|
source: selectedSource.value || undefined
|
||||||
}
|
}
|
||||||
}),
|
}),
|
||||||
|
axios.get('/api/bids/recent'),
|
||||||
axios.get('/api/bids/high-priority'),
|
axios.get('/api/bids/high-priority'),
|
||||||
axios.get('/api/keywords'),
|
axios.get('/api/keywords'),
|
||||||
axios.get('/api/bids/sources'),
|
axios.get('/api/bids/sources'),
|
||||||
@@ -414,19 +405,11 @@ const fetchData = async () => {
|
|||||||
])
|
])
|
||||||
bids.value = bidsRes.data.items
|
bids.value = bidsRes.data.items
|
||||||
total.value = bidsRes.data.total
|
total.value = bidsRes.data.total
|
||||||
|
todayBids.value = recentRes.data
|
||||||
highPriorityBids.value = highRes.data
|
highPriorityBids.value = highRes.data
|
||||||
keywords.value = kwRes.data
|
keywords.value = kwRes.data
|
||||||
sourceOptions.value = sourcesRes.data
|
sourceOptions.value = sourcesRes.data
|
||||||
isCrawling.value = statusRes.data.isCrawling
|
isCrawling.value = statusRes.data.isCrawling
|
||||||
|
|
||||||
// 过滤今天的数据用于 Today's Bids
|
|
||||||
const today = new Date()
|
|
||||||
today.setHours(0, 0, 0, 0)
|
|
||||||
todayBids.value = bidsRes.data.items.filter((bid: any) => {
|
|
||||||
if (!bid.publishDate) return false
|
|
||||||
const bidDate = new Date(bid.publishDate)
|
|
||||||
return bidDate >= today
|
|
||||||
})
|
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
ElMessage.error('Failed to fetch data')
|
ElMessage.error('Failed to fetch data')
|
||||||
} finally {
|
} finally {
|
||||||
|
|||||||
@@ -18,7 +18,8 @@
|
|||||||
"test:cov": "jest --coverage",
|
"test:cov": "jest --coverage",
|
||||||
"test:debug": "node --inspect-brk -r tsconfig-paths/register -r ts-node/register node_modules/.bin/jest --runInBand",
|
"test:debug": "node --inspect-brk -r tsconfig-paths/register -r ts-node/register node_modules/.bin/jest --runInBand",
|
||||||
"test:e2e": "jest --config ./test/jest-e2e.json",
|
"test:e2e": "jest --config ./test/jest-e2e.json",
|
||||||
"crawl": "ts-node -r tsconfig-paths/register src/scripts/crawl.ts"
|
"crawl": "ts-node -r tsconfig-paths/register src/scripts/crawl.ts",
|
||||||
|
"web":"npm --prefix frontend run build"
|
||||||
},
|
},
|
||||||
"dependencies": {
|
"dependencies": {
|
||||||
"@nestjs/common": "^11.0.1",
|
"@nestjs/common": "^11.0.1",
|
||||||
|
|||||||
@@ -10,6 +10,11 @@ export class BidsController {
|
|||||||
return this.bidsService.findAll(query);
|
return this.bidsService.findAll(query);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Get('recent')
|
||||||
|
getRecent() {
|
||||||
|
return this.bidsService.getRecentBids();
|
||||||
|
}
|
||||||
|
|
||||||
@Get('high-priority')
|
@Get('high-priority')
|
||||||
getHighPriority() {
|
getHighPriority() {
|
||||||
return this.bidsService.getHighPriorityCorrected();
|
return this.bidsService.getHighPriorityCorrected();
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
import { Injectable } from '@nestjs/common';
|
import { Injectable } from '@nestjs/common';
|
||||||
import { InjectRepository } from '@nestjs/typeorm';
|
import { InjectRepository } from '@nestjs/typeorm';
|
||||||
import { Repository, LessThan } from 'typeorm';
|
import { Repository, LessThan, MoreThanOrEqual } from 'typeorm';
|
||||||
import { BidItem } from '../entities/bid-item.entity';
|
import { BidItem } from '../entities/bid-item.entity';
|
||||||
|
|
||||||
@Injectable()
|
@Injectable()
|
||||||
@@ -75,4 +75,16 @@ export class BidsService {
|
|||||||
.getRawMany();
|
.getRawMany();
|
||||||
return result.map((item: any) => item.source);
|
return result.map((item: any) => item.source);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
async getRecentBids() {
|
||||||
|
const thirtyDaysAgo = new Date();
|
||||||
|
thirtyDaysAgo.setDate(thirtyDaysAgo.getDate() - 30);
|
||||||
|
thirtyDaysAgo.setHours(0, 0, 0, 0);
|
||||||
|
|
||||||
|
return this.bidRepository
|
||||||
|
.createQueryBuilder('bid')
|
||||||
|
.where('bid.publishDate >= :thirtyDaysAgo', { thirtyDaysAgo })
|
||||||
|
.orderBy('bid.publishDate', 'DESC')
|
||||||
|
.getMany();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,9 +1,15 @@
|
|||||||
import * as winston from 'winston';
|
import * as winston from 'winston';
|
||||||
import DailyRotateFile from 'winston-daily-rotate-file';
|
import DailyRotateFile from 'winston-daily-rotate-file';
|
||||||
import * as path from 'path';
|
import * as path from 'path';
|
||||||
|
import * as fs from 'fs';
|
||||||
|
|
||||||
const logDir = path.join(process.cwd(), 'logs');
|
const logDir = path.join(process.cwd(), 'logs');
|
||||||
|
|
||||||
|
// 确保日志目录存在
|
||||||
|
if (!fs.existsSync(logDir)) {
|
||||||
|
fs.mkdirSync(logDir, { recursive: true });
|
||||||
|
}
|
||||||
|
|
||||||
// 日志格式
|
// 日志格式
|
||||||
const logFormat = winston.format.combine(
|
const logFormat = winston.format.combine(
|
||||||
winston.format.timestamp({ format: 'YYYY-MM-DD HH:mm:ss' }),
|
winston.format.timestamp({ format: 'YYYY-MM-DD HH:mm:ss' }),
|
||||||
|
|||||||
@@ -32,6 +32,9 @@ export class BidCrawlerService {
|
|||||||
// 统计结果
|
// 统计结果
|
||||||
const crawlResults: Record<string, { success: number; error?: string }> = {};
|
const crawlResults: Record<string, { success: number; error?: string }> = {};
|
||||||
|
|
||||||
|
// 记录数据为0的爬虫,用于重试
|
||||||
|
const zeroDataCrawlers: any[] = [];
|
||||||
|
|
||||||
// 从环境变量读取代理配置
|
// 从环境变量读取代理配置
|
||||||
const proxyHost = this.configService.get<string>('PROXY_HOST');
|
const proxyHost = this.configService.get<string>('PROXY_HOST');
|
||||||
const proxyPort = this.configService.get<string>('PROXY_PORT');
|
const proxyPort = this.configService.get<string>('PROXY_PORT');
|
||||||
@@ -83,6 +86,11 @@ export class BidCrawlerService {
|
|||||||
// 记录成功数量
|
// 记录成功数量
|
||||||
crawlResults[crawler.name] = { success: results.length };
|
crawlResults[crawler.name] = { success: results.length };
|
||||||
|
|
||||||
|
// 如果数据为0,记录下来用于重试
|
||||||
|
if (results.length === 0) {
|
||||||
|
zeroDataCrawlers.push(crawler);
|
||||||
|
}
|
||||||
|
|
||||||
for (const item of results) {
|
for (const item of results) {
|
||||||
await this.bidsService.createOrUpdate({
|
await this.bidsService.createOrUpdate({
|
||||||
title: item.title,
|
title: item.title,
|
||||||
@@ -98,6 +106,45 @@ export class BidCrawlerService {
|
|||||||
crawlResults[crawler.name] = { success: 0, error: err.message };
|
crawlResults[crawler.name] = { success: 0, error: err.message };
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// 对数据为0的爬虫进行重试
|
||||||
|
if (zeroDataCrawlers.length > 0) {
|
||||||
|
this.logger.log(`Retrying ${zeroDataCrawlers.length} crawlers with zero data...`);
|
||||||
|
|
||||||
|
for (const crawler of zeroDataCrawlers) {
|
||||||
|
this.logger.log(`Retrying: ${crawler.name}`);
|
||||||
|
|
||||||
|
// 检查是否超时
|
||||||
|
const elapsedTime = Date.now() - startTime;
|
||||||
|
if (elapsedTime > maxExecutionTime) {
|
||||||
|
this.logger.warn(`⚠️ Crawl task exceeded maximum execution time of 3 hours. Stopping retry...`);
|
||||||
|
this.logger.warn(`⚠️ Total elapsed time: ${Math.floor(elapsedTime / 1000 / 60)} minutes`);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
try {
|
||||||
|
const results = await crawler.crawl(browser);
|
||||||
|
this.logger.log(`Retry extracted ${results.length} items from ${crawler.name}`);
|
||||||
|
|
||||||
|
// 更新统计结果
|
||||||
|
crawlResults[crawler.name] = { success: results.length };
|
||||||
|
|
||||||
|
for (const item of results) {
|
||||||
|
await this.bidsService.createOrUpdate({
|
||||||
|
title: item.title,
|
||||||
|
url: item.url,
|
||||||
|
publishDate: item.publishDate,
|
||||||
|
source: crawler.name,
|
||||||
|
unit: '',
|
||||||
|
});
|
||||||
|
}
|
||||||
|
} catch (err) {
|
||||||
|
this.logger.error(`Error retrying ${crawler.name}: ${err.message}`);
|
||||||
|
// 记录错误信息
|
||||||
|
crawlResults[crawler.name] = { success: 0, error: err.message };
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
this.logger.error(`Crawl task failed: ${error.message}`);
|
this.logger.error(`Crawl task failed: ${error.message}`);
|
||||||
} finally {
|
} finally {
|
||||||
|
|||||||
@@ -89,7 +89,7 @@ export const CdtCrawler = {
|
|||||||
await page.waitForFunction(() => {
|
await page.waitForFunction(() => {
|
||||||
const tabs = Array.from(document.querySelectorAll('span.notice-tab'));
|
const tabs = Array.from(document.querySelectorAll('span.notice-tab'));
|
||||||
return tabs.some(tab => tab.textContent && tab.textContent.includes('招标公告'));
|
return tabs.some(tab => tab.textContent && tab.textContent.includes('招标公告'));
|
||||||
}, { timeout: 30000 });
|
}, { timeout: 60000 });
|
||||||
|
|
||||||
await page.evaluate(() => {
|
await page.evaluate(() => {
|
||||||
const tabs = Array.from(document.querySelectorAll('span.notice-tab'));
|
const tabs = Array.from(document.querySelectorAll('span.notice-tab'));
|
||||||
|
|||||||
@@ -1,8 +1,14 @@
|
|||||||
import { NestFactory } from '@nestjs/core';
|
import { NestFactory } from '@nestjs/core';
|
||||||
import { AppModule } from './app.module';
|
import { AppModule } from './app.module';
|
||||||
|
import { CustomLogger } from './common/logger/logger.service';
|
||||||
|
|
||||||
async function bootstrap() {
|
async function bootstrap() {
|
||||||
const app = await NestFactory.create(AppModule);
|
const app = await NestFactory.create(AppModule);
|
||||||
|
|
||||||
|
// 使用自定义日志服务
|
||||||
|
const logger = await app.resolve(CustomLogger);
|
||||||
|
app.useLogger(logger);
|
||||||
|
|
||||||
await app.listen(process.env.PORT ?? 3000);
|
await app.listen(process.env.PORT ?? 3000);
|
||||||
}
|
}
|
||||||
bootstrap();
|
bootstrap();
|
||||||
|
|||||||
@@ -1,14 +1,18 @@
|
|||||||
import { NestFactory } from '@nestjs/core';
|
import { NestFactory } from '@nestjs/core';
|
||||||
import { AppModule } from '../app.module';
|
import { AppModule } from '../app.module';
|
||||||
import { BidCrawlerService } from '../crawler/services/bid-crawler.service';
|
import { BidCrawlerService } from '../crawler/services/bid-crawler.service';
|
||||||
import { Logger } from '@nestjs/common';
|
import { CustomLogger } from '../common/logger/logger.service';
|
||||||
|
|
||||||
async function runCrawler() {
|
async function runCrawler() {
|
||||||
const logger = new Logger('CrawlScript');
|
const app = await NestFactory.createApplicationContext(AppModule);
|
||||||
|
|
||||||
|
// 设置自定义 logger,使 NestJS 框架日志也输出到文件
|
||||||
|
const logger = await app.resolve(CustomLogger);
|
||||||
|
app.useLogger(logger);
|
||||||
|
logger.setContext('CrawlScript');
|
||||||
|
|
||||||
try {
|
try {
|
||||||
const app = await NestFactory.createApplicationContext(AppModule);
|
const crawlerService = await app.resolve(BidCrawlerService);
|
||||||
const crawlerService = app.get(BidCrawlerService);
|
|
||||||
|
|
||||||
logger.log('Starting crawler...');
|
logger.log('Starting crawler...');
|
||||||
await crawlerService.crawlAll();
|
await crawlerService.crawlAll();
|
||||||
@@ -18,6 +22,7 @@ async function runCrawler() {
|
|||||||
process.exit(0);
|
process.exit(0);
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
logger.error('Crawler failed:', error);
|
logger.error('Crawler failed:', error);
|
||||||
|
await app.close();
|
||||||
process.exit(1);
|
process.exit(1);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user