feat: add crawler statistics page and functionality

Add a crawler statistics page that shows per-source metrics: number of items crawled, time of the latest update, and any error messages. On the backend, add storage and a query endpoint for crawl statistics so the result of every crawl run is recorded, and support filtering bid queries by keyword.
@@ -17,7 +17,7 @@ import { AiModule } from './ai/ai.module';
     ScheduleModule.forRoot(),
     ServeStaticModule.forRoot({
       rootPath: join(__dirname, '..', 'frontend', 'dist'),
-      exclude: ['/api/(.*)'],
+      exclude: ['/api/:path(*)'],
     }),
     LoggerModule,
     DatabaseModule,
@@ -26,7 +26,13 @@ export class BidsController {
   }

   @Get('by-date-range')
-  getByDateRange(@Query('startDate') startDate: string, @Query('endDate') endDate: string) {
-    return this.bidsService.getBidsByDateRange(startDate, endDate);
+  getByDateRange(@Query('startDate') startDate: string, @Query('endDate') endDate?: string, @Query('keywords') keywords?: string) {
+    const keywordsArray = keywords ? keywords.split(',') : undefined;
+    return this.bidsService.getBidsByDateRange(startDate, endDate, keywordsArray);
   }
+
+  @Get('crawl-info-stats')
+  getCrawlInfoStats() {
+    return this.bidsService.getCrawlInfoAddStats();
+  }
 }
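For reference, a minimal sketch of how the two endpoints touched above might be called from the frontend. The 'api' global prefix and the 'bids' controller path are assumptions; neither appears in this diff.

// Hypothetical client calls (route prefixes assumed, not shown in the diff):
const bids = await fetch(
  '/api/bids/by-date-range?' +
    new URLSearchParams({
      startDate: '2024-01-01',
      endDate: '2024-06-30',
      keywords: 'wind,solar', // comma-separated; split server-side
    }),
).then((res) => res.json());

const stats = await fetch('/api/bids/crawl-info-stats').then((res) => res.json());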
@@ -88,7 +88,7 @@ export class BidsService {
       .getMany();
   }

-  async getBidsByDateRange(startDate?: string, endDate?: string) {
+  async getBidsByDateRange(startDate?: string, endDate?: string, keywords?: string[]) {
     const qb = this.bidRepository.createQueryBuilder('bid');

     if (startDate) {
@@ -103,6 +103,46 @@ export class BidsService {
       qb.andWhere('bid.publishDate <= :endDate', { endDate: end });
     }

+    if (keywords && keywords.length > 0) {
+      const keywordConditions = keywords.map((keyword, index) => {
+        return `bid.title LIKE :keyword${index}`;
+      }).join(' OR ');
+      qb.andWhere(`(${keywordConditions})`, keywords.reduce((params, keyword, index) => {
+        params[`keyword${index}`] = `%${keyword}%`;
+        return params;
+      }, {}));
+    }
+
     return qb.orderBy('bid.publishDate', 'DESC').getMany();
   }

+  async getCrawlInfoAddStats() {
+    // Latest crawl record per source. Note: id is a UUID, so MAX(id) says
+    // nothing about recency; pick each source's newest row by createdAt.
+    const query = `
+      SELECT
+        source,
+        count,
+        latestPublishDate,
+        error,
+        createdAt as latestUpdate
+      FROM crawl_info_add
+      WHERE (source, createdAt) IN (
+        SELECT source, MAX(createdAt)
+        FROM crawl_info_add
+        GROUP BY source
+      )
+      ORDER BY source ASC
+    `;
+
+    const results = await this.bidRepository.query(query);
+
+    return results.map((item: any) => ({
+      source: item.source,
+      count: item.count,
+      latestUpdate: item.latestUpdate,
+      latestPublishDate: item.latestPublishDate,
+      error: item.error,
+    }));
+  }
 }
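To make the keyword wiring above concrete, here is what the branch builds for two keywords (names and values illustrative):

// For keywords = ['wind', 'solar'], the map/join above yields:
const conditions = ['wind', 'solar']
  .map((_, i) => `bid.title LIKE :keyword${i}`)
  .join(' OR ');
// conditions === 'bid.title LIKE :keyword0 OR bid.title LIKE :keyword1'
// bound parameters === { keyword0: '%wind%', keyword1: '%solar%' }
// so the query gains: AND (bid.title LIKE '%wind%' OR bid.title LIKE '%solar%')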
@@ -1,10 +1,12 @@
 import { Module } from '@nestjs/common';
+import { TypeOrmModule } from '@nestjs/typeorm';
 import { BidCrawlerService } from './services/bid-crawler.service';
 import { CrawlerController } from './crawler.controller';
 import { BidsModule } from '../bids/bids.module';
+import { CrawlInfoAdd } from './entities/crawl-info-add.entity';

 @Module({
-  imports: [BidsModule],
+  imports: [BidsModule, TypeOrmModule.forFeature([CrawlInfoAdd])],
   controllers: [CrawlerController],
   providers: [BidCrawlerService],
   exports: [BidCrawlerService],
src/crawler/entities/crawl-info-add.entity.ts (new file)
@@ -0,0 +1,22 @@
+import { Entity, PrimaryGeneratedColumn, Column, CreateDateColumn } from 'typeorm';
+
+@Entity('crawl_info_add')
+export class CrawlInfoAdd {
+  @PrimaryGeneratedColumn('uuid')
+  id: string;
+
+  @Column()
+  source: string;
+
+  @Column()
+  count: number;
+
+  @Column({ type: 'datetime', nullable: true })
+  latestPublishDate: Date | null;
+
+  @Column({ type: 'text', nullable: true })
+  error: string | null;
+
+  @CreateDateColumn()
+  createdAt: Date;
+}
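For reference, a row persisted through this entity might look like the following (all values illustrative; the source string is whatever crawler.name holds at runtime):

const sample: CrawlInfoAdd = {
  id: '0b9c4c1e-7f2a-4d7e-9c1b-2f1d3e4a5b6c', // generated UUID
  source: 'chng',
  count: 37,
  latestPublishDate: new Date('2024-05-03'),
  error: null,
  createdAt: new Date(),
};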
@@ -1,7 +1,10 @@
 import { Injectable, Logger } from '@nestjs/common';
 import { ConfigService } from '@nestjs/config';
+import { InjectRepository } from '@nestjs/typeorm';
+import { Repository } from 'typeorm';
 import * as puppeteer from 'puppeteer';
 import { BidsService } from '../../bids/services/bid.service';
+import { CrawlInfoAdd } from '../entities/crawl-info-add.entity';
 import { ChdtpCrawler } from './chdtp_target';
 import { ChngCrawler } from './chng_target';
 import { SzecpCrawler } from './szecp_target';
@@ -22,6 +25,8 @@ export class BidCrawlerService {
   constructor(
     private bidsService: BidsService,
     private configService: ConfigService,
+    @InjectRepository(CrawlInfoAdd)
+    private crawlInfoRepository: Repository<CrawlInfoAdd>,
   ) {}

   async crawlAll() {
@@ -93,6 +98,14 @@ export class BidCrawlerService {
         zeroDataCrawlers.push(crawler);
       }

+      // Determine the latest publish date among the crawled items
+      const latestPublishDate = results.length > 0
+        ? results.reduce((latest, item) => {
+            const itemDate = new Date(item.publishDate);
+            return itemDate > latest ? itemDate : latest;
+          }, new Date(0))
+        : null;
+
       for (const item of results) {
         await this.bidsService.createOrUpdate({
           title: item.title,
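A quick worked example of the reduce above (data illustrative): seeding with new Date(0), the Unix epoch, guarantees that any real publish date replaces the seed.

// results = [{ publishDate: '2024-05-01' }, { publishDate: '2024-05-03' }]
//   => latestPublishDate === new Date('2024-05-03')
// results = []
//   => latestPublishDate === null (the ternary short-circuits)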
@@ -102,10 +115,16 @@ export class BidCrawlerService {
           unit: '',
         });
       }
+
+      // Save crawl statistics for this source to the database
+      await this.saveCrawlInfo(crawler.name, results.length, latestPublishDate);
     } catch (err) {
       this.logger.error(`Error crawling ${crawler.name}: ${err.message}`);
       // Record the error in the run summary
       crawlResults[crawler.name] = { success: 0, error: err.message };
+
+      // Save the error to the database
+      await this.saveCrawlInfo(crawler.name, 0, null, err.message);
     }
   }
@@ -131,6 +150,14 @@ export class BidCrawlerService {
|
||||
// 更新统计结果
|
||||
crawlResults[crawler.name] = { success: results.length };
|
||||
|
||||
// 获取最新的发布日期
|
||||
const latestPublishDate = results.length > 0
|
||||
? results.reduce((latest, item) => {
|
||||
const itemDate = new Date(item.publishDate);
|
||||
return itemDate > latest ? itemDate : latest;
|
||||
}, new Date(0))
|
||||
: null;
|
||||
|
||||
for (const item of results) {
|
||||
await this.bidsService.createOrUpdate({
|
||||
title: item.title,
|
||||
@@ -140,10 +167,16 @@ export class BidCrawlerService {
            unit: '',
          });
        }
+
+        // Update crawl statistics for this source in the database
+        await this.saveCrawlInfo(crawler.name, results.length, latestPublishDate);
      } catch (err) {
        this.logger.error(`Error retrying ${crawler.name}: ${err.message}`);
        // Record the error in the run summary
        crawlResults[crawler.name] = { success: 0, error: err.message };
+
+        // Update the error record in the database
+        await this.saveCrawlInfo(crawler.name, 0, null, err.message);
      }
    }
  }
@@ -184,4 +217,24 @@ export class BidCrawlerService {
     this.logger.log('='.repeat(50));
   }

+  private async saveCrawlInfo(
+    source: string,
+    count: number,
+    latestPublishDate: Date | null,
+    error?: string,
+  ) {
+    try {
+      const crawlInfo = this.crawlInfoRepository.create({
+        source,
+        count,
+        latestPublishDate,
+        error,
+      });
+      await this.crawlInfoRepository.save(crawlInfo);
+      this.logger.log(`Saved crawl info for ${source}: ${count} items`);
+    } catch (err) {
+      this.logger.error(`Failed to save crawl info for ${source}: ${err.message}`);
+    }
+  }
 }
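Putting the stats query and the controller together, each element returned by getCrawlInfoAddStats() has the following shape. The interface name is ours, not part of the commit, and raw-query drivers typically return the date columns as strings:

interface CrawlInfoStat {
  source: string;                          // crawler name
  count: number;                           // items found in the latest run
  latestUpdate: string | Date;             // createdAt of the newest crawl_info_add row
  latestPublishDate: string | Date | null; // newest publishDate seen in that run
  error: string | null;                    // non-null when the latest run failed
}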