Compare commits

...

5 Commits

Author SHA1 Message Date
dmy
f2630ed01c feat: 添加更新数据源脚本 2026-01-12 15:29:45 +08:00
dmy
b1435523e8 feat: 为爬虫测试添加代理支持并通过环境变量配置
添加dotenv依赖,创建jest配置文件和setup文件
修改所有爬虫测试文件以支持通过环境变量配置代理
将jest配置从package.json移动到独立文件
2026-01-12 15:19:54 +08:00
dmy
f1ec37143c feat: 添加中海油招标平台爬虫支持 2026-01-12 15:06:31 +08:00
dmy
090e4121ce feat: 添加国投集团电子采购平台爬虫并更新其他平台名称 2026-01-12 14:53:38 +08:00
dmy
4f37b0fb61 refactor: 重构前端代码,拆分组件并优化README文档 2026-01-12 14:37:18 +08:00
33 changed files with 1547 additions and 506 deletions

100
README.md
View File

@@ -97,26 +97,86 @@ Nest is an MIT-licensed open source project. It can grow thanks to the sponsors
Nest is [MIT licensed](https://github.com/nestjs/nest/blob/master/LICENSE). Nest is [MIT licensed](https://github.com/nestjs/nest/blob/master/LICENSE).
How to Run: ## How to Run
1. Database Setup: Update the .env file with your PostgreSQL credentials.
1 DATABASE_TYPE=postgres ### 1. Database Setup
2 DATABASE_HOST=localhost Update the `.env` file with your PostgreSQL credentials:
3 DATABASE_PORT=5432
4 DATABASE_USERNAME=your_username
5 DATABASE_PASSWORD=your_password
6 DATABASE_NAME=bidding
7 DATABASE_SYNCHRONIZE=true
2. Install Dependencies:
1 npm install
2 cd frontend && npm install
3. Build and Start:
1 # From the root directory ```env
2 cd frontend && npm run build DATABASE_TYPE=postgres
3 cd .. DATABASE_HOST=localhost
4 npm run build DATABASE_PORT=5432
5 npm run start DATABASE_USERNAME=your_username
DATABASE_PASSWORD=your_password
DATABASE_NAME=bidding
DATABASE_SYNCHRONIZE=true
```
The system will automatically initialize with the preset keywords: "山东", "海", "建设", "工程", "采购". You can ### 2. Install Dependencies
manage these and view crawled bidding information at http://localhost:3000.
```bash
npm install
cd frontend && npm install
```
### 3. Build and Start
```bash
# From the root directory
cd frontend && npm run build
cd ..
npm run build
npm run start
```
## Features
### Frontend Features
- **Dashboard**: View high priority bids and today's bids
- **Date Filtering**:
- Click "3天" or "7天" buttons to filter bids from the last 3 or 7 days
- The filter only limits the start date, showing all data from the selected start date onwards (including data newer than the end date)
- **Keyword Filtering**: Filter bids by keywords (saved in localStorage)
- **All Bids**: View all bids with pagination and source filtering
- **Keyword Management**: Add and delete keywords with weight-based priority
### Backend Features
- **Multi-Source Crawling**: Crawls bidding information from multiple sources:
- ChdtpCrawler
- ChngCrawler
- SzecpCrawler
- CdtCrawler
- EpsCrawler
- CnncecpCrawler
- CgnpcCrawler
- CeicCrawler
- EspicCrawler
- PowerbeijingCrawler
- SdiccCrawler
- CnoocCrawler
- **Automatic Retry**: If a crawler returns 0 items, it will be retried after all crawlers complete
- **Proxy Support**: Configurable proxy settings via environment variables
- **Scheduled Tasks**: Automatic crawling at scheduled intervals
### Environment Variables
```env
# Database
DATABASE_TYPE=postgres
DATABASE_HOST=localhost
DATABASE_PORT=5432
DATABASE_USERNAME=your_username
DATABASE_PASSWORD=your_password
DATABASE_NAME=bidding
DATABASE_SYNCHRONIZE=true
# Proxy (optional)
PROXY_HOST=your_proxy_host
PROXY_PORT=your_proxy_port
PROXY_USERNAME=your_proxy_username
PROXY_PASSWORD=your_proxy_password
```
## Initial Setup
The system will automatically initialize with the preset keywords: "山东", "海", "建设", "工程", "采购". You can manage these and view crawled bidding information at http://localhost:3000.

View File

@@ -31,167 +31,43 @@
</el-header> </el-header>
<el-main> <el-main>
<div v-if="activeIndex === '1'"> <Dashboard
<div style="display: flex; justify-content: space-between; align-items: center; margin-bottom: 20px;"> v-if="activeIndex === '1'"
<h2 style="margin: 0;">Dashboard</h2> :today-bids="todayBids"
<el-button type="primary" :loading="crawling" :disabled="isCrawling" @click="handleCrawl"> :high-priority-bids="highPriorityBids"
<el-icon style="margin-right: 5px"><Refresh /></el-icon> :keywords="keywords"
立刻抓取 :loading="loading"
</el-button> :is-crawling="isCrawling"
</div> @refresh="fetchData"
<el-row :gutter="20">
<el-col :span="24">
<el-card class="box-card" shadow="hover">
<template #header>
<div class="card-header">
<span>High Priority Bids</span>
<el-tag type="danger">Top 10</el-tag>
</div>
</template>
<el-table :data="highPriorityBids" style="width: 100%" size="small">
<el-table-column prop="title" label="Title">
<template #default="scope">
<a :href="scope.row.url" target="_blank">{{ scope.row.title }}</a>
</template>
</el-table-column>
<el-table-column prop="source" label="Source" width="240" />
<el-table-column prop="publishDate" label="Date" width="120">
<template #default="scope">{{ formatDate(scope.row.publishDate) }}</template>
</el-table-column>
</el-table>
</el-card>
</el-col>
</el-row>
<el-divider />
<div style="display: flex; justify-content: space-between; align-items: center; margin-bottom: 10px;">
<h3 style="margin: 0;">Today's Bids</h3>
<div style="display: flex; gap: 10px;">
<el-date-picker
v-model="dateRange"
type="daterange"
range-separator="To"
start-placeholder="Start Date"
end-placeholder="End Date"
format="YYYY-MM-DD"
value-format="YYYY-MM-DD"
clearable
style="width: 240px;"
/> />
<el-button type="primary" @click="setLast3Days">3天</el-button>
<el-button type="primary" @click="setLast7Days">7天</el-button>
<el-select
v-model="selectedKeywords"
multiple
collapse-tags
collapse-tags-tooltip
placeholder="Filter by Keywords"
clearable
style="width: 300px;"
>
<el-option
v-for="keyword in keywords"
:key="keyword.id"
:label="keyword.word"
:value="keyword.word"
/>
</el-select>
</div>
</div>
<el-table :data="filteredTodayBids" v-loading="loading" style="width: 100%">
<el-table-column prop="title" label="Title">
<template #default="scope">
<a :href="scope.row.url" target="_blank">{{ scope.row.title }}</a>
</template>
</el-table-column>
<el-table-column prop="source" label="Source" width="220" />
<el-table-column prop="publishDate" label="Date" width="150">
<template #default="scope">{{ formatDate(scope.row.publishDate) }}</template>
</el-table-column>
</el-table>
</div>
<div v-if="activeIndex === '2'"> <Bids
<div style="display: flex; justify-content: space-between; align-items: center; margin-bottom: 20px;"> v-if="activeIndex === '2'"
<h2 style="margin: 0;">All Bids</h2> :bids="bids"
<el-select v-model="selectedSource" placeholder="Filter by Source" clearable style="width: 200px" @change="currentPage = 1; fetchData()"> :source-options="sourceOptions"
<el-option :loading="loading"
v-for="source in sourceOptions"
:key="source"
:label="source"
:value="source"
/>
</el-select>
</div>
<el-table :data="bids" v-loading="loading" style="width: 100%">
<el-table-column prop="title" label="Title">
<template #default="scope">
<a :href="scope.row.url" target="_blank">{{ scope.row.title }}</a>
</template>
</el-table-column>
<el-table-column prop="source" label="Source" width="200" />
<el-table-column prop="publishDate" label="Date" width="150">
<template #default="scope">{{ formatDate(scope.row.publishDate) }}</template>
</el-table-column>
</el-table>
<el-pagination
v-model:current-page="currentPage"
v-model:page-size="pageSize"
:page-sizes="[10, 20, 50, 100]"
:total="total" :total="total"
layout="total, sizes, prev, pager, next, jumper" @fetch="handleFetchBids"
@current-change="handlePageChange"
@size-change="handleSizeChange"
style="margin-top: 20px; justify-content: flex-end;"
/> />
</div>
<div v-if="activeIndex === '3'"> <Keywords
<div class="card-header" style="display: flex; justify-content: space-between; align-items: center; margin-bottom: 20px;"> v-if="activeIndex === '3'"
<h2>Keyword Management</h2> :keywords="keywords"
<el-button type="primary" @click="dialogVisible = true">Add Keyword</el-button> :loading="loading"
</div> @refresh="fetchData"
/>
<div v-loading="loading" style="min-height: 200px;">
<el-tag
v-for="keyword in keywords"
:key="keyword.id"
closable
:type="getTagType(keyword.weight)"
@close="handleDeleteKeyword(keyword.id)"
style="margin: 5px;"
>
{{ keyword.word }}
</el-tag>
<el-empty v-if="keywords.length === 0" description="No keywords" />
</div>
</div>
</el-main> </el-main>
</el-container> </el-container>
<el-dialog v-model="dialogVisible" title="Add Keyword" width="30%">
<el-form :model="form" label-width="120px">
<el-form-item label="Keyword">
<el-input v-model="form.word" />
</el-form-item>
<el-form-item label="Weight">
<el-input-number v-model="form.weight" :min="1" :max="5" />
</el-form-item>
</el-form>
<template #footer>
<span class="dialog-footer">
<el-button @click="dialogVisible = false">Cancel</el-button>
<el-button type="primary" @click="handleAddKeyword">Confirm</el-button>
</span>
</template>
</el-dialog>
</el-container> </el-container>
</template> </template>
<script setup lang="ts"> <script setup lang="ts">
import { ref, onMounted, reactive, computed, watch } from 'vue' import { ref, onMounted } from 'vue'
import axios from 'axios' import axios from 'axios'
import { ElMessage } from 'element-plus' import { DataBoard, Document, Setting } from '@element-plus/icons-vue'
import { DataBoard, Document, Setting, Refresh } from '@element-plus/icons-vue' import Dashboard from './components/Dashboard.vue'
import Bids from './components/Bids.vue'
import Keywords from './components/Keywords.vue'
const activeIndex = ref('1') const activeIndex = ref('1')
const bids = ref<any[]>([]) const bids = ref<any[]>([])
@@ -199,202 +75,41 @@ const todayBids = ref<any[]>([])
const highPriorityBids = ref<any[]>([]) const highPriorityBids = ref<any[]>([])
const keywords = ref<any[]>([]) const keywords = ref<any[]>([])
const loading = ref(false) const loading = ref(false)
const crawling = ref(false) const isCrawling = ref(false)
const dialogVisible = ref(false)
const selectedSource = ref('')
const currentPage = ref(1)
const pageSize = ref(10)
const total = ref(0) const total = ref(0)
const sourceOptions = ref<string[]>([]) const sourceOptions = ref<string[]>([])
const isCrawling = ref(false)
const selectedKeywords = ref<string[]>([])
const dateRange = ref<[string, string] | null>(null)
// 从 localStorage 加载保存的关键字
const loadSavedKeywords = () => {
const saved = localStorage.getItem('selectedKeywords')
if (saved) {
try {
selectedKeywords.value = JSON.parse(saved)
} catch (e) {
console.error('Failed to parse saved keywords:', e)
}
}
}
// 监听关键字变化并保存到 localStorage
watch(selectedKeywords, (newKeywords) => {
localStorage.setItem('selectedKeywords', JSON.stringify(newKeywords))
}, { deep: true })
// 监听日期范围变化并显示提示
watch(dateRange, () => {
const totalBids = bids.value.length
const filteredCount = filteredTodayBids.value.length
if (totalBids > 0 && filteredCount < totalBids) {
ElMessage.info(`筛选结果:共 ${filteredCount} 条数据(总共 ${totalBids} 条)`)
}
})
const form = reactive({
word: '',
weight: 1
})
// 根据 weight 获取 tag 类型
const getTagType = (weight: number) => {
if (weight >= 5) return 'danger'
if (weight >= 4) return 'warning'
if (weight >= 3) return 'primary'
if (weight >= 2) return 'success'
return 'info'
}
const handleSelect = (key: string) => { const handleSelect = (key: string) => {
activeIndex.value = key activeIndex.value = key
} }
// 处理分页变化 const handleFetchBids = async (page: number, limit: number, source?: string) => {
const handlePageChange = (page: number) => { loading.value = true
currentPage.value = page try {
fetchData() const res = await axios.get('/api/bids', {
} params: {
page,
// 处理每页数量变化 limit,
const handleSizeChange = (size: number) => { source: source || undefined
pageSize.value = size
currentPage.value = 1
fetchData()
}
// 设置日期范围为最近3天
const setLast3Days = () => {
const endDate = new Date()
const startDate = new Date()
startDate.setDate(startDate.getDate() - 2) // 最近3天包括今天
const formatDateForPicker = (date: Date) => {
const year = date.getFullYear()
const month = String(date.getMonth() + 1).padStart(2, '0')
const day = String(date.getDate()).padStart(2, '0')
return `${year}-${month}-${day}`
} }
dateRange.value = [formatDateForPicker(startDate), formatDateForPicker(endDate)]
console.log('setLast3Days called, todayBids:', todayBids.value.length, 'dateRange:', dateRange.value)
// 直接计算筛选结果并显示提示(只限制开始时间,不限制结束时间)
const start = new Date(startDate)
start.setHours(0, 0, 0, 0)
let result = todayBids.value
result = result.filter(bid => {
if (!bid.publishDate) return false
const bidDate = new Date(bid.publishDate)
return bidDate >= start
}) })
bids.value = res.data.items
const totalBids = todayBids.value.length total.value = res.data.total
const filteredCount = result.length } catch (error) {
console.error('Failed to fetch bids:', error)
console.log('setLast3Days result, totalBids:', totalBids, 'filteredCount:', filteredCount) } finally {
if (totalBids === 0) { loading.value = false
ElMessage.warning('暂无数据请先抓取数据')
} }
} }
// 设置日期范围为最近7天
const setLast7Days = () => {
const endDate = new Date()
const startDate = new Date()
startDate.setDate(startDate.getDate() - 6) // 最近7天包括今天
const formatDateForPicker = (date: Date) => {
const year = date.getFullYear()
const month = String(date.getMonth() + 1).padStart(2, '0')
const day = String(date.getDate()).padStart(2, '0')
return `${year}-${month}-${day}`
}
dateRange.value = [formatDateForPicker(startDate), formatDateForPicker(endDate)]
console.log('setLast7Days called, todayBids:', todayBids.value.length, 'dateRange:', dateRange.value)
// 直接计算筛选结果并显示提示(只限制开始时间,不限制结束时间)
const start = new Date(startDate)
start.setHours(0, 0, 0, 0)
let result = todayBids.value
result = result.filter(bid => {
if (!bid.publishDate) return false
const bidDate = new Date(bid.publishDate)
return bidDate >= start
})
const totalBids = todayBids.value.length
const filteredCount = result.length
console.log('setLast7Days result, totalBids:', totalBids, 'filteredCount:', filteredCount)
if (totalBids === 0) {
ElMessage.warning('暂无数据请先抓取数据')
}
}
const formatDate = (dateString: string) => {
if (!dateString) return '-'
return new Date(dateString).toLocaleDateString()
}
// 过滤 Today's Bids只显示包含所选关键字的项目并且在日期范围内
const filteredTodayBids = computed(() => {
let result = todayBids.value
// 按关键字筛选
if (selectedKeywords.value.length > 0) {
result = result.filter(bid => {
return selectedKeywords.value.some(keyword =>
bid.title.toLowerCase().includes(keyword.toLowerCase())
)
})
}
// 按日期范围筛选(只限制开始时间,不限制结束时间)
if (dateRange.value && dateRange.value.length === 2) {
const [startDate] = dateRange.value
result = result.filter(bid => {
if (!bid.publishDate) return false
const bidDate = new Date(bid.publishDate)
const start = new Date(startDate)
// 设置时间为当天的开始
start.setHours(0, 0, 0, 0)
return bidDate >= start
})
}
return result
})
// 监听筛选结果变化并显示提示
watch(filteredTodayBids, (newFilteredBids) => {
const totalBids = todayBids.value.length
const filteredCount = newFilteredBids.length
if (totalBids > 0 && filteredCount < totalBids) {
ElMessage.info(`筛选结果:共 ${filteredCount} 条数据(总共 ${totalBids} 条)`)
}
}, { deep: true })
const fetchData = async () => { const fetchData = async () => {
loading.value = true loading.value = true
try { try {
const [bidsRes, recentRes, highRes, kwRes, sourcesRes, statusRes] = await Promise.all([ const [bidsRes, recentRes, highRes, kwRes, sourcesRes, statusRes] = await Promise.all([
axios.get('/api/bids', { axios.get('/api/bids', {
params: { params: {
page: currentPage.value, page: 1,
limit: pageSize.value, limit: 10
source: selectedSource.value || undefined
} }
}), }),
axios.get('/api/bids/recent'), axios.get('/api/bids/recent'),
@@ -411,58 +126,13 @@ const fetchData = async () => {
sourceOptions.value = sourcesRes.data sourceOptions.value = sourcesRes.data
isCrawling.value = statusRes.data.isCrawling isCrawling.value = statusRes.data.isCrawling
} catch (error) { } catch (error) {
ElMessage.error('Failed to fetch data') console.error('Failed to fetch data:', error)
} finally { } finally {
loading.value = false loading.value = false
} }
} }
const handleCrawl = async () => {
if (isCrawling.value) {
ElMessage.warning('Crawl is already running')
return
}
crawling.value = true
try {
await axios.post('/api/crawler/run')
ElMessage.success('Crawl completed successfully')
fetchData() // Refresh data after crawl
} catch (error) {
ElMessage.error('Failed to run crawl task')
} finally {
crawling.value = false
}
}
const handleAddKeyword = async () => {
if (!form.word) {
ElMessage.warning('Please enter a keyword')
return
}
try {
await axios.post('/api/keywords', form)
ElMessage.success('Keyword added')
dialogVisible.value = false
form.word = ''
form.weight = 1
fetchData()
} catch (error) {
ElMessage.error('Failed to add keyword')
}
}
const handleDeleteKeyword = async (id: string) => {
try {
await axios.delete(`/api/keywords/${id}`)
ElMessage.success('Keyword deleted')
fetchData()
} catch (error) {
ElMessage.error('Failed to delete keyword')
}
}
onMounted(() => { onMounted(() => {
loadSavedKeywords()
fetchData() fetchData()
}) })
</script> </script>
@@ -486,9 +156,4 @@ onMounted(() => {
font-size: 18px; font-size: 18px;
background-color: #434a50; background-color: #434a50;
} }
.card-header {
display: flex;
justify-content: space-between;
align-items: center;
}
</style> </style>

View File

@@ -0,0 +1,78 @@
<template>
<div>
<div style="display: flex; justify-content: space-between; align-items: center; margin-bottom: 20px;">
<h2 style="margin: 0;">All Bids</h2>
<el-select v-model="selectedSource" placeholder="Filter by Source" clearable style="width: 200px" @change="handleSourceChange">
<el-option
v-for="source in sourceOptions"
:key="source"
:label="source"
:value="source"
/>
</el-select>
</div>
<el-table :data="bids" v-loading="loading" style="width: 100%">
<el-table-column prop="title" label="Title">
<template #default="scope">
<a :href="scope.row.url" target="_blank">{{ scope.row.title }}</a>
</template>
</el-table-column>
<el-table-column prop="source" label="Source" width="200" />
<el-table-column prop="publishDate" label="Date" width="150">
<template #default="scope">{{ formatDate(scope.row.publishDate) }}</template>
</el-table-column>
</el-table>
<el-pagination
v-model:current-page="currentPage"
v-model:page-size="pageSize"
:page-sizes="[10, 20, 50, 100]"
:total="total"
layout="total, sizes, prev, pager, next, jumper"
@current-change="handlePageChange"
@size-change="handleSizeChange"
style="margin-top: 20px; justify-content: flex-end;"
/>
</div>
</template>
<script setup lang="ts">
import { ref } from 'vue'
// Props supplied by the parent: the rows to show in the table, the selectable
// sources, the loading flag and the total row count used by the pagination.
interface Props {
  bids: any[]
  sourceOptions: string[]
  loading: boolean
  total: number
}

// Only the template reads the props, so the result of defineProps is not bound
// to a local; an unused `props` constant fails type-checking when
// `noUnusedLocals` is enabled.
defineProps<Props>()

// `fetch` asks the parent to load a page: (page, limit, optional source filter).
const emit = defineEmits<{
  fetch: [page: number, limit: number, source?: string]
}>()

// Local UI state for the source filter and the pagination controls.
const selectedSource = ref('')
const currentPage = ref(1)
const pageSize = ref(10)
/**
 * Format a bid's publish date for display in the table.
 *
 * @param dateString - Raw publishDate value of a bid row.
 * @returns The date in the runtime's locale format, or '-' when the value is
 *          missing or cannot be parsed (instead of rendering "Invalid Date").
 */
const formatDate = (dateString: string) => {
  if (!dateString) return '-'
  const parsed = new Date(dateString)
  // An unparseable value yields an Invalid Date whose timestamp is NaN.
  if (Number.isNaN(parsed.getTime())) return '-'
  return parsed.toLocaleDateString()
}
// Ask the parent for a page of bids using the current pagination state and the
// selected source (sent as undefined when no source is chosen).
const requestBids = () => {
  const source = selectedSource.value || undefined
  emit('fetch', currentPage.value, pageSize.value, source)
}

// Changing the source filter restarts the listing from the first page.
const handleSourceChange = () => {
  currentPage.value = 1
  requestBids()
}

// Move to the page picked in the pagination control.
const handlePageChange = (page: number) => {
  currentPage.value = page
  requestBids()
}

// A new page size resets the listing to page one before reloading.
const handleSizeChange = (size: number) => {
  pageSize.value = size
  currentPage.value = 1
  requestBids()
}
</script>

View File

@@ -0,0 +1,279 @@
<template>
<div>
<div style="display: flex; justify-content: space-between; align-items: center; margin-bottom: 20px;">
<h2 style="margin: 0;">Dashboard</h2>
<el-button type="primary" :loading="crawling" :disabled="isCrawling" @click="handleCrawl">
<el-icon style="margin-right: 5px"><Refresh /></el-icon>
立刻抓取
</el-button>
</div>
<el-row :gutter="20">
<el-col :span="24">
<el-card class="box-card" shadow="hover">
<template #header>
<div class="card-header">
<span>High Priority Bids</span>
<el-tag type="danger">Top 10</el-tag>
</div>
</template>
<el-table :data="highPriorityBids" style="width: 100%" size="small">
<el-table-column prop="title" label="Title">
<template #default="scope">
<a :href="scope.row.url" target="_blank">{{ scope.row.title }}</a>
</template>
</el-table-column>
<el-table-column prop="source" label="Source" width="240" />
<el-table-column prop="publishDate" label="Date" width="120">
<template #default="scope">{{ formatDate(scope.row.publishDate) }}</template>
</el-table-column>
</el-table>
</el-card>
</el-col>
</el-row>
<el-divider />
<div style="display: flex; justify-content: space-between; align-items: center; margin-bottom: 10px;">
<h3 style="margin: 0;">Today's Bids</h3>
<div style="display: flex; gap: 10px;">
<el-date-picker
v-model="dateRange"
type="daterange"
range-separator="To"
start-placeholder="Start Date"
end-placeholder="End Date"
format="YYYY-MM-DD"
value-format="YYYY-MM-DD"
clearable
style="width: 240px;"
/>
<el-button type="primary" @click="setLast3Days">3天</el-button>
<el-button type="primary" @click="setLast7Days">7天</el-button>
<el-select
v-model="selectedKeywords"
multiple
collapse-tags
collapse-tags-tooltip
placeholder="Filter by Keywords"
clearable
style="width: 300px;"
>
<el-option
v-for="keyword in keywords"
:key="keyword.id"
:label="keyword.word"
:value="keyword.word"
/>
</el-select>
</div>
</div>
<el-table :data="filteredTodayBids" v-loading="loading" style="width: 100%">
<el-table-column prop="title" label="Title">
<template #default="scope">
<a :href="scope.row.url" target="_blank">{{ scope.row.title }}</a>
</template>
</el-table-column>
<el-table-column prop="source" label="Source" width="220" />
<el-table-column prop="publishDate" label="Date" width="150">
<template #default="scope">{{ formatDate(scope.row.publishDate) }}</template>
</el-table-column>
</el-table>
</div>
</template>
<script setup lang="ts">
import { ref, computed, watch } from 'vue'
import axios from 'axios'
import { ElMessage } from 'element-plus'
import { Refresh } from '@element-plus/icons-vue'
// Data and status flags handed down by the parent component.
interface Props {
todayBids: any[]
highPriorityBids: any[]
keywords: any[]
loading: boolean
isCrawling: boolean
}
const props = defineProps<Props>()
// Events this component can send to its parent.
// NOTE(review): only `refresh` is emitted in this file; no emit('crawl') call
// is visible — confirm whether the `crawl` event is still needed.
const emit = defineEmits<{
crawl: []
refresh: []
}>()
// Keywords currently selected in the filter (saved to localStorage by a watcher).
const selectedKeywords = ref<string[]>([])
// Date-range filter as [start, end] in YYYY-MM-DD form, or null when unset.
const dateRange = ref<[string, string] | null>(null)
// True while a crawl request started from this component is in flight.
const crawling = ref(false)
// Restore the keyword filter saved in localStorage.
// Only a JSON array of strings is accepted: any other stored value (null, an
// object, a number, broken JSON) is reported and ignored, so the selection
// always stays an array and the filter logic can rely on that.
const loadSavedKeywords = () => {
  const saved = localStorage.getItem('selectedKeywords')
  if (!saved) return

  try {
    const parsed = JSON.parse(saved)
    const isStringList = Array.isArray(parsed) && parsed.every(item => typeof item === 'string')
    if (isStringList) {
      selectedKeywords.value = parsed
    } else {
      console.error('Failed to parse saved keywords:', parsed)
    }
  } catch (e) {
    console.error('Failed to parse saved keywords:', e)
  }
}
// Save the keyword selection to localStorage every time it changes.
const persistSelectedKeywords = (newKeywords: string[]) => {
  const serialized = JSON.stringify(newKeywords)
  localStorage.setItem('selectedKeywords', serialized)
}
watch(selectedKeywords, persistSelectedKeywords, { deep: true })
// Date-range changes are intentionally not watched on their own: the range
// feeds the filteredTodayBids computed, and the watcher on that computed
// already shows the filter-result summary. A separate watcher here made every
// date change display the same message twice.
/**
 * Format a bid's publish date for display in the tables.
 *
 * @param dateString - Raw publishDate value of a bid row.
 * @returns The date in the runtime's locale format, or '-' when the value is
 *          missing or cannot be parsed (instead of rendering "Invalid Date").
 */
const formatDate = (dateString: string) => {
  if (!dateString) return '-'
  const parsed = new Date(dateString)
  // An unparseable value yields an Invalid Date whose timestamp is NaN.
  if (Number.isNaN(parsed.getTime())) return '-'
  return parsed.toLocaleDateString()
}
// Rows for the "Today's Bids" table: bids whose title contains at least one
// selected keyword (case-insensitive) and whose publish date is on or after the
// start of the selected range. The end of the range is intentionally not
// applied, so bids newer than the end date are still shown.
const filteredTodayBids = computed(() => {
  let result = props.todayBids

  // Keyword filter
  if (selectedKeywords.value.length > 0) {
    const needles = selectedKeywords.value.map(keyword => keyword.toLowerCase())
    result = result.filter(bid => {
      // A bid without a title cannot match any keyword (and must not throw).
      const title = String(bid.title || '').toLowerCase()
      return needles.some(needle => title.includes(needle))
    })
  }

  // Date filter (start date only)
  if (dateRange.value && dateRange.value.length === 2) {
    const [startDate] = dateRange.value
    // The picker yields "YYYY-MM-DD". A bare date string is parsed as UTC
    // midnight, which falls on the previous local day in timezones behind UTC;
    // appending a time makes it local midnight of the chosen day. Computed once
    // instead of once per row.
    const start = new Date(`${startDate}T00:00:00`)
    result = result.filter(bid => {
      if (!bid.publishDate) return false
      return new Date(bid.publishDate) >= start
    })
  }

  return result
})
// Whenever the filtered list changes, tell the user how many of today's bids
// remain — but only when the filters actually hide some of them.
watch(
  filteredTodayBids,
  (visibleBids) => {
    const totalBids = props.todayBids.length
    const filteredCount = visibleBids.length
    const hidesSomething = totalBids > 0 && filteredCount < totalBids
    if (!hidesSomething) return
    ElMessage.info(`筛选结果:共 ${filteredCount} 条数据(总共 ${totalBids} 条)`)
  },
  { deep: true }
)
// Format a Date as the local-calendar "YYYY-MM-DD" string used by the date picker.
const formatDateForPicker = (date: Date) => {
  const year = date.getFullYear()
  const month = String(date.getMonth() + 1).padStart(2, '0')
  const day = String(date.getDate()).padStart(2, '0')
  return `${year}-${month}-${day}`
}

// Set the date range to the last `days` calendar days, today included.
// The table itself is filtered by the filteredTodayBids computed; this function
// only updates the range and warns when there are no bids to filter at all.
const setLastDays = (days: number) => {
  const endDate = new Date()
  const startDate = new Date()
  // "Last N days" counts today, so go back N - 1 days.
  startDate.setDate(startDate.getDate() - (days - 1))

  dateRange.value = [formatDateForPicker(startDate), formatDateForPicker(endDate)]

  if (props.todayBids.length === 0) {
    ElMessage.warning('暂无数据,请先抓取数据')
  }
}

// Quick filter: last 3 days (today and the two days before it).
const setLast3Days = () => setLastDays(3)

// Quick filter: last 7 days (today and the six days before it).
const setLast7Days = () => setLastDays(7)
// Start a crawl on the backend and, once the request succeeds, ask the parent
// to reload its data. When a crawl is already reported as running, only a
// warning is shown and no request is made.
const handleCrawl = async () => {
  const alreadyRunning = props.isCrawling
  if (alreadyRunning) {
    ElMessage.warning('Crawl is already running')
    return
  }

  // Drives the button's loading state for the duration of the request.
  crawling.value = true
  try {
    await axios.post('/api/crawler/run')
    ElMessage.success('Crawl completed successfully')
    // Let the parent refresh the lists so newly crawled bids show up.
    emit('refresh')
  } catch {
    ElMessage.error('Failed to run crawl task')
  } finally {
    crawling.value = false
  }
}
// Restore the saved keyword selection when the component is set up
loadSavedKeywords()
</script>
<style scoped>
.card-header {
display: flex;
justify-content: space-between;
align-items: center;
}
</style>

View File

@@ -1,41 +0,0 @@
<script setup lang="ts">
import { ref } from 'vue'
defineProps<{ msg: string }>()
const count = ref(0)
</script>
<template>
<h1>{{ msg }}</h1>
<div class="card">
<button type="button" @click="count++">count is {{ count }}</button>
<p>
Edit
<code>components/HelloWorld.vue</code> to test HMR
</p>
</div>
<p>
Check out
<a href="https://vuejs.org/guide/quick-start.html#local" target="_blank"
>create-vue</a
>, the official Vue + Vite starter
</p>
<p>
Learn more about IDE Support for Vue in the
<a
href="https://vuejs.org/guide/scaling-up/tooling.html#ide-support"
target="_blank"
>Vue Docs Scaling up Guide</a
>.
</p>
<p class="read-the-docs">Click on the Vite and Vue logos to learn more</p>
</template>
<style scoped>
.read-the-docs {
color: #888;
}
</style>

View File

@@ -0,0 +1,107 @@
<template>
<div>
<div class="card-header" style="display: flex; justify-content: space-between; align-items: center; margin-bottom: 20px;">
<h2>Keyword Management</h2>
<el-button type="primary" @click="dialogVisible = true">Add Keyword</el-button>
</div>
<div v-loading="loading" style="min-height: 200px;">
<el-tag
v-for="keyword in keywords"
:key="keyword.id"
closable
:type="getTagType(keyword.weight)"
@close="handleDeleteKeyword(keyword.id)"
style="margin: 5px;"
>
{{ keyword.word }}
</el-tag>
<el-empty v-if="keywords.length === 0" description="No keywords" />
</div>
<el-dialog v-model="dialogVisible" title="Add Keyword" width="30%">
<el-form :model="form" label-width="120px">
<el-form-item label="Keyword">
<el-input v-model="form.word" />
</el-form-item>
<el-form-item label="Weight">
<el-input-number v-model="form.weight" :min="1" :max="5" />
</el-form-item>
</el-form>
<template #footer>
<span class="dialog-footer">
<el-button @click="dialogVisible = false">Cancel</el-button>
<el-button type="primary" @click="handleAddKeyword">Confirm</el-button>
</span>
</template>
</el-dialog>
</div>
</template>
<script setup lang="ts">
import { ref, reactive } from 'vue'
import axios from 'axios'
import { ElMessage } from 'element-plus'
// Props supplied by the parent: the keyword list to render and a loading flag.
interface Props {
  keywords: any[]
  loading: boolean
}

// Only the template reads the props, so the result of defineProps is not bound
// to a local; an unused `props` constant fails type-checking when
// `noUnusedLocals` is enabled.
defineProps<Props>()

// `refresh` asks the parent to reload its data after a keyword was added or deleted.
const emit = defineEmits<{
  refresh: []
}>()

// Visibility of the "Add Keyword" dialog.
const dialogVisible = ref(false)

// Form model bound to the dialog inputs.
const form = reactive({
  word: '',
  weight: 1
})
// Pick the tag type for a keyword from its weight. Thresholds are checked from
// the highest down; a weight below every threshold gets 'info'.
const getTagType = (weight: number) => {
  const thresholds = [
    [5, 'danger'],
    [4, 'warning'],
    [3, 'primary'],
    [2, 'success']
  ] as const

  for (const [minWeight, tagType] of thresholds) {
    if (weight >= minWeight) return tagType
  }
  return 'info'
}
// Create a keyword from the dialog form.
// The word is trimmed first, so a whitespace-only entry is rejected like an
// empty one and stored keywords never carry stray leading/trailing spaces.
// On success the dialog is closed, the form is reset and the parent is asked
// to reload; on failure only an error message is shown and the form is kept.
const handleAddKeyword = async () => {
  const word = form.word.trim()
  if (!word) {
    ElMessage.warning('Please enter a keyword')
    return
  }

  try {
    await axios.post('/api/keywords', { word, weight: form.weight })
    ElMessage.success('Keyword added')
    dialogVisible.value = false
    form.word = ''
    form.weight = 1
    emit('refresh')
  } catch (error) {
    ElMessage.error('Failed to add keyword')
  }
}
// Delete the keyword with the given id on the server, then ask the parent to
// reload. Any failure along the way is reported with a single error message.
const handleDeleteKeyword = async (id: string) => {
  const endpoint = `/api/keywords/${id}`
  try {
    await axios.delete(endpoint)
    ElMessage.success('Keyword deleted')
    emit('refresh')
  } catch {
    ElMessage.error('Failed to delete keyword')
  }
}
</script>
<style scoped>
.card-header {
display: flex;
justify-content: space-between;
align-items: center;
}
</style>

13
jest.config.js Normal file
View File

@@ -0,0 +1,13 @@
// Jest configuration (previously the "jest" section of package.json).
module.exports = {
moduleFileExtensions: ['js', 'json', 'ts'],
// Relative paths below are resolved from src/
rootDir: 'src',
// Files ending in .spec.ts are treated as tests
testRegex: '.*\\.spec\\.ts$',
// .ts and .js files are transformed with ts-jest
transform: {
'^.+\\.(t|j)s$': 'ts-jest',
},
collectCoverageFrom: ['**/*.(t|j)s'],
// One level above rootDir
coverageDirectory: '../coverage',
testEnvironment: 'node',
// Load environment variables via jest.setup.js, located one level above rootDir
setupFiles: ['<rootDir>/../jest.setup.js'],
};

2
jest.setup.js Normal file
View File

@@ -0,0 +1,2 @@
// Load environment variables for the test run.
// The path is anchored to this file's directory rather than the current working
// directory, so the .env next to this file is used no matter where jest is
// started from.
require('dotenv').config({ path: require('path').resolve(__dirname, '.env') });

View File

@@ -19,6 +19,7 @@
"test:debug": "node --inspect-brk -r tsconfig-paths/register -r ts-node/register node_modules/.bin/jest --runInBand", "test:debug": "node --inspect-brk -r tsconfig-paths/register -r ts-node/register node_modules/.bin/jest --runInBand",
"test:e2e": "jest --config ./test/jest-e2e.json", "test:e2e": "jest --config ./test/jest-e2e.json",
"crawl": "ts-node -r tsconfig-paths/register src/scripts/crawl.ts", "crawl": "ts-node -r tsconfig-paths/register src/scripts/crawl.ts",
"update-source": "ts-node -r tsconfig-paths/register src/scripts/update-source.ts",
"web":"npm --prefix frontend run build" "web":"npm --prefix frontend run build"
}, },
"dependencies": { "dependencies": {
@@ -32,6 +33,7 @@
"axios": "^1.13.2", "axios": "^1.13.2",
"class-transformer": "^0.5.1", "class-transformer": "^0.5.1",
"class-validator": "^0.14.3", "class-validator": "^0.14.3",
"dotenv": "^16.4.7",
"mysql2": "^3.16.0", "mysql2": "^3.16.0",
"puppeteer": "^24.34.0", "puppeteer": "^24.34.0",
"puppeteer-extra": "^3.3.6", "puppeteer-extra": "^3.3.6",
@@ -66,22 +68,5 @@
"tsconfig-paths": "^4.2.0", "tsconfig-paths": "^4.2.0",
"typescript": "^5.7.3", "typescript": "^5.7.3",
"typescript-eslint": "^8.20.0" "typescript-eslint": "^8.20.0"
},
"jest": {
"moduleFileExtensions": [
"js",
"json",
"ts"
],
"rootDir": "src",
"testRegex": ".*\\.spec\\.ts$",
"transform": {
"^.+\\.(t|j)s$": "ts-jest"
},
"collectCoverageFrom": [
"**/*.(t|j)s"
],
"coverageDirectory": "../coverage",
"testEnvironment": "node"
} }
} }

View File

@@ -12,6 +12,8 @@ import { CgnpcCrawler } from './cgnpc_target';
import { CeicCrawler } from './ceic_target'; import { CeicCrawler } from './ceic_target';
import { EspicCrawler } from './espic_target'; import { EspicCrawler } from './espic_target';
import { PowerbeijingCrawler } from './powerbeijing_target'; import { PowerbeijingCrawler } from './powerbeijing_target';
import { SdiccCrawler } from './sdicc_target';
import { CnoocCrawler } from './cnooc_target';
@Injectable() @Injectable()
export class BidCrawlerService { export class BidCrawlerService {
@@ -65,7 +67,7 @@ export class BidCrawlerService {
args, args,
}); });
const crawlers = [ChdtpCrawler, ChngCrawler, SzecpCrawler, CdtCrawler, EpsCrawler, CnncecpCrawler, CgnpcCrawler, CeicCrawler, EspicCrawler, PowerbeijingCrawler]; const crawlers = [ChdtpCrawler, ChngCrawler, SzecpCrawler, CdtCrawler, EpsCrawler, CnncecpCrawler, CgnpcCrawler, CeicCrawler, EspicCrawler, PowerbeijingCrawler, SdiccCrawler, CnoocCrawler];
try { try {
for (const crawler of crawlers) { for (const crawler of crawlers) {

View File

@@ -4,13 +4,35 @@ import * as puppeteer from 'puppeteer';
// Increase timeout to 60 seconds for network operations // Increase timeout to 60 seconds for network operations
jest.setTimeout(60000*5); jest.setTimeout(60000*5);
/**
 * Builds the Chromium launch arguments that route traffic through the proxy
 * configured via environment variables.
 *
 * Reads PROXY_HOST and PROXY_PORT; when either one is missing no proxy is used.
 * PROXY_USERNAME / PROXY_PASSWORD are deliberately NOT turned into a launch
 * argument: Chromium has no `--proxy-auth` switch, so that flag had no effect,
 * and because the test setup prints the returned arguments it exposed the
 * proxy password in the test output. Proxy credentials have to be supplied
 * per page through `page.authenticate()`.
 *
 * @returns `['--proxy-server=<host>:<port>']`, or an empty array when no proxy is configured.
 */
const getProxyArgs = (): string[] => {
  const { PROXY_HOST: proxyHost, PROXY_PORT: proxyPort } = process.env;
  if (!proxyHost || !proxyPort) {
    return [];
  }
  return [`--proxy-server=${proxyHost}:${proxyPort}`];
};
describe('CdtCrawler Real Site Test', () => { describe('CdtCrawler Real Site Test', () => {
let browser: puppeteer.Browser; let browser: puppeteer.Browser;
beforeAll(async () => { beforeAll(async () => {
const proxyArgs = getProxyArgs();
if (proxyArgs.length > 0) {
console.log('Using proxy:', proxyArgs.join(' '));
}
browser = await puppeteer.launch({ browser = await puppeteer.launch({
headless: false, // Change to false to see browser UI headless: false, // Change to false to see browser UI
args: ['--no-sandbox', '--disable-setuid-sandbox'], args: ['--no-sandbox', '--disable-setuid-sandbox', ...proxyArgs],
}); });
}); });

View File

@@ -137,9 +137,18 @@ export const CdtCrawler = {
logger.log('Simulating human scrolling...'); logger.log('Simulating human scrolling...');
await simulateHumanScrolling(page); await simulateHumanScrolling(page);
// 等待表格加载完成
logger.log('Waiting for table to load...');
await page.waitForSelector('table.layui-table', { timeout: 30000 });
while (currentPage <= maxPages) { while (currentPage <= maxPages) {
// 等待表格数据加载
await page.waitForSelector('tbody tr', { timeout: 10000 });
// 获取当前页面的 HTML 内容
const content = await page.content(); const content = await page.content();
const pageResults = this.extract(content); const pageResults = this.extract(content);
if (pageResults.length === 0) { if (pageResults.length === 0) {
logger.warn(`No results found on page ${currentPage}, stopping.`); logger.warn(`No results found on page ${currentPage}, stopping.`);
break; break;
@@ -155,24 +164,45 @@ export const CdtCrawler = {
logger.log('Simulating human scrolling before pagination...'); logger.log('Simulating human scrolling before pagination...');
await simulateHumanScrolling(page); await simulateHumanScrolling(page);
// Find the "Next Page" button - layui pagination // 查找下一页按钮
const nextButtonSelector = 'a.layui-laypage-next:not(.layui-disabled)'; const nextButtonSelector = 'a.layui-laypage-next:not(.layui-disabled)';
const nextButton = await page.$(nextButtonSelector); const nextButtonExists = await page.evaluate((selector) => {
const btn = document.querySelector(selector);
return btn !== null && !btn.classList.contains('layui-disabled');
}, nextButtonSelector);
if (!nextButton) { if (!nextButtonExists) {
logger.log('Next page button not found. Reached end of list.'); logger.log('Next page button not found or disabled. Reached end of list.');
break; break;
} }
logger.log(`Navigating to page ${currentPage + 1}...`); logger.log(`Navigating to page ${currentPage + 1}...`);
try { try {
await Promise.all([ // 点击下一页按钮
page.waitForNavigation({ waitUntil: 'networkidle2', timeout: 60000 }), await page.evaluate((selector) => {
nextButton.click(), const btn = document.querySelector(selector) as HTMLElement;
]); if (btn) btn.click();
} catch (navError) { }, nextButtonSelector);
logger.error(`Navigation to page ${currentPage + 1} failed: ${navError.message}`);
// 等待 AJAX 请求完成(通过监听网络请求)
await page.waitForFunction(() => {
// 检查表格是否正在加载
const loading = document.querySelector('.layui-table-loading');
return !loading;
}, { timeout: 30000 }).catch(() => {});
// 额外等待确保数据加载完成
await new Promise(r => setTimeout(r, 2000));
// 检查是否真的翻页了(通过检查当前页码)
const currentActivePage = await page.evaluate(() => {
const activeSpan = document.querySelector('.layui-laypage-curr em:last-child');
return activeSpan ? parseInt(activeSpan.textContent || '1') : 1;
});
if (currentActivePage <= currentPage) {
logger.log('Page did not change, stopping.');
break; break;
} }
@@ -188,6 +218,11 @@ export const CdtCrawler = {
// Random delay between pages // Random delay between pages
const delay = Math.floor(Math.random() * (3000 - 1000 + 1)) + 1000; const delay = Math.floor(Math.random() * (3000 - 1000 + 1)) + 1000;
await new Promise(resolve => setTimeout(resolve, delay)); await new Promise(resolve => setTimeout(resolve, delay));
} catch (navError) {
logger.error(`Navigation to page ${currentPage + 1} failed: ${navError.message}`);
break;
}
} }
return allResults; return allResults;

View File

@@ -4,10 +4,32 @@ import * as puppeteer from 'puppeteer';
// Increase timeout to 120 seconds for manual inspection and slow sites // Increase timeout to 120 seconds for manual inspection and slow sites
jest.setTimeout(120000); jest.setTimeout(120000);
/**
 * Builds the Chromium launch arguments that route traffic through the proxy
 * configured via environment variables.
 *
 * Reads PROXY_HOST and PROXY_PORT; when either one is missing no proxy is used.
 * PROXY_USERNAME / PROXY_PASSWORD are deliberately NOT turned into a launch
 * argument: Chromium has no `--proxy-auth` switch, so that flag had no effect,
 * and because the test setup prints the returned arguments it exposed the
 * proxy password in the test output. Proxy credentials have to be supplied
 * per page through `page.authenticate()`.
 *
 * @returns `['--proxy-server=<host>:<port>']`, or an empty array when no proxy is configured.
 */
const getProxyArgs = (): string[] => {
  const { PROXY_HOST: proxyHost, PROXY_PORT: proxyPort } = process.env;
  if (!proxyHost || !proxyPort) {
    return [];
  }
  return [`--proxy-server=${proxyHost}:${proxyPort}`];
};
describe('CeicCrawler Real Site Test', () => { describe('CeicCrawler Real Site Test', () => {
let browser: puppeteer.Browser; let browser: puppeteer.Browser;
beforeAll(async () => { beforeAll(async () => {
const proxyArgs = getProxyArgs();
if (proxyArgs.length > 0) {
console.log('Using proxy:', proxyArgs.join(' '));
}
browser = await puppeteer.launch({ browser = await puppeteer.launch({
headless: false, // Run in non-headless mode headless: false, // Run in non-headless mode
args: [ args: [
@@ -16,6 +38,7 @@ describe('CeicCrawler Real Site Test', () => {
'--disable-blink-features=AutomationControlled', '--disable-blink-features=AutomationControlled',
'--window-size=1920,1080', '--window-size=1920,1080',
'--disable-infobars', '--disable-infobars',
...proxyArgs,
], ],
defaultViewport: null defaultViewport: null
}); });

View File

@@ -48,7 +48,7 @@ async function simulateHumanScrolling(page: puppeteer.Page) {
} }
export const CeicCrawler = { export const CeicCrawler = {
name: '大连能源采购平台', name: '国家能源集团生态协作平台',
url: 'https://ceic.dlnyzb.com/3001', url: 'https://ceic.dlnyzb.com/3001',
baseUrl: 'https://ceic.dlnyzb.com', baseUrl: 'https://ceic.dlnyzb.com',

View File

@@ -4,13 +4,35 @@ import * as puppeteer from 'puppeteer';
// Increase timeout to 60 seconds for network operations // Increase timeout to 60 seconds for network operations
jest.setTimeout(60000*5); jest.setTimeout(60000*5);
/**
 * Builds the Chromium launch arguments that route traffic through the proxy
 * configured via environment variables.
 *
 * Reads PROXY_HOST and PROXY_PORT; when either one is missing no proxy is used.
 * PROXY_USERNAME / PROXY_PASSWORD are deliberately NOT turned into a launch
 * argument: Chromium has no `--proxy-auth` switch, so that flag had no effect,
 * and because the test setup prints the returned arguments it exposed the
 * proxy password in the test output. Proxy credentials have to be supplied
 * per page through `page.authenticate()`.
 *
 * @returns `['--proxy-server=<host>:<port>']`, or an empty array when no proxy is configured.
 */
const getProxyArgs = (): string[] => {
  const { PROXY_HOST: proxyHost, PROXY_PORT: proxyPort } = process.env;
  if (!proxyHost || !proxyPort) {
    return [];
  }
  return [`--proxy-server=${proxyHost}:${proxyPort}`];
};
describe('CgnpcCrawler Real Site Test', () => { describe('CgnpcCrawler Real Site Test', () => {
let browser: puppeteer.Browser; let browser: puppeteer.Browser;
beforeAll(async () => { beforeAll(async () => {
const proxyArgs = getProxyArgs();
if (proxyArgs.length > 0) {
console.log('Using proxy:', proxyArgs.join(' '));
}
browser = await puppeteer.launch({ browser = await puppeteer.launch({
headless: false, // Change to false to see browser UI headless: false, // Change to false to see browser UI
args: ['--no-sandbox', '--disable-setuid-sandbox'], args: ['--no-sandbox', '--disable-setuid-sandbox', ...proxyArgs],
}); });
}); });

View File

@@ -4,13 +4,35 @@ import * as puppeteer from 'puppeteer';
// Increase timeout to 60 seconds for network operations // Increase timeout to 60 seconds for network operations
jest.setTimeout(60000); jest.setTimeout(60000);
/**
 * Builds the Chromium launch arguments that route traffic through the proxy
 * configured via environment variables.
 *
 * Reads PROXY_HOST and PROXY_PORT; when either one is missing no proxy is used.
 * PROXY_USERNAME / PROXY_PASSWORD are deliberately NOT turned into a launch
 * argument: Chromium has no `--proxy-auth` switch, so that flag had no effect,
 * and because the test setup prints the returned arguments it exposed the
 * proxy password in the test output. Proxy credentials have to be supplied
 * per page through `page.authenticate()`.
 *
 * @returns `['--proxy-server=<host>:<port>']`, or an empty array when no proxy is configured.
 */
const getProxyArgs = (): string[] => {
  const { PROXY_HOST: proxyHost, PROXY_PORT: proxyPort } = process.env;
  if (!proxyHost || !proxyPort) {
    return [];
  }
  return [`--proxy-server=${proxyHost}:${proxyPort}`];
};
describe('ChdtpCrawler Real Site Test', () => { describe('ChdtpCrawler Real Site Test', () => {
let browser: puppeteer.Browser; let browser: puppeteer.Browser;
beforeAll(async () => { beforeAll(async () => {
const proxyArgs = getProxyArgs();
if (proxyArgs.length > 0) {
console.log('Using proxy:', proxyArgs.join(' '));
}
browser = await puppeteer.launch({ browser = await puppeteer.launch({
headless: true, // Change to false to see the browser UI headless: true, // Change to false to see the browser UI
args: ['--no-sandbox', '--disable-setuid-sandbox'], args: ['--no-sandbox', '--disable-setuid-sandbox', ...proxyArgs],
}); });
}); });

View File

@@ -8,7 +8,7 @@ export interface ChdtpResult {
} }
export const ChdtpCrawler = { export const ChdtpCrawler = {
name: '中国华能集团', name: '华电集团电子商务平台 ',
url: 'https://www.chdtp.com/webs/queryWebZbgg.action?zbggType=1', url: 'https://www.chdtp.com/webs/queryWebZbgg.action?zbggType=1',
baseUrl: 'https://www.chdtp.com/webs/', baseUrl: 'https://www.chdtp.com/webs/',

View File

@@ -4,6 +4,23 @@ import * as puppeteer from 'puppeteer';
// Increase timeout to 120 seconds for manual inspection and slow sites // Increase timeout to 120 seconds for manual inspection and slow sites
jest.setTimeout(120000); jest.setTimeout(120000);
/**
 * Builds the Chromium launch arguments that route traffic through the proxy
 * configured via environment variables.
 *
 * Reads PROXY_HOST and PROXY_PORT; when either one is missing no proxy is used.
 * PROXY_USERNAME / PROXY_PASSWORD are deliberately NOT turned into a launch
 * argument: Chromium has no `--proxy-auth` switch, so that flag had no effect,
 * and because the test setup prints the returned arguments it exposed the
 * proxy password in the test output. Proxy credentials have to be supplied
 * per page through `page.authenticate()`.
 *
 * @returns `['--proxy-server=<host>:<port>']`, or an empty array when no proxy is configured.
 */
const getProxyArgs = (): string[] => {
  const { PROXY_HOST: proxyHost, PROXY_PORT: proxyPort } = process.env;
  if (!proxyHost || !proxyPort) {
    return [];
  }
  return [`--proxy-server=${proxyHost}:${proxyPort}`];
};
// 模拟人类鼠标移动 // 模拟人类鼠标移动
async function simulateHumanMouseMovement(page: puppeteer.Page) { async function simulateHumanMouseMovement(page: puppeteer.Page) {
const viewport = page.viewport(); const viewport = page.viewport();
@@ -53,6 +70,11 @@ describe('ChngCrawler Real Site Test', () => {
let browser: puppeteer.Browser; let browser: puppeteer.Browser;
beforeAll(async () => { beforeAll(async () => {
const proxyArgs = getProxyArgs();
if (proxyArgs.length > 0) {
console.log('Using proxy:', proxyArgs.join(' '));
}
browser = await puppeteer.launch({ browser = await puppeteer.launch({
headless: false, // Run in non-headless mode headless: false, // Run in non-headless mode
args: [ args: [
@@ -61,6 +83,7 @@ describe('ChngCrawler Real Site Test', () => {
'--disable-blink-features=AutomationControlled', '--disable-blink-features=AutomationControlled',
'--window-size=1920,1080', '--window-size=1920,1080',
"--disable-infobars", "--disable-infobars",
...proxyArgs,
// "--headless=new", // "--headless=new",
// '--disable-dev-shm-usage', // '--disable-dev-shm-usage',
// '--disable-accelerated-2d-canvas', // '--disable-accelerated-2d-canvas',

View File

@@ -9,6 +9,23 @@ puppeteer.use(StealthPlugin());
// Increase timeout to 180 seconds for slow sites and stealth mode // Increase timeout to 180 seconds for slow sites and stealth mode
jest.setTimeout(180000); jest.setTimeout(180000);
/**
 * Builds the Chromium launch arguments that route traffic through the proxy
 * configured via environment variables.
 *
 * Reads PROXY_HOST and PROXY_PORT; when either one is missing no proxy is used.
 * PROXY_USERNAME / PROXY_PASSWORD are deliberately NOT turned into a launch
 * argument: Chromium has no `--proxy-auth` switch, so that flag had no effect,
 * and because the test setup prints the returned arguments it exposed the
 * proxy password in the test output. Proxy credentials have to be supplied
 * per page through `page.authenticate()`.
 *
 * @returns `['--proxy-server=<host>:<port>']`, or an empty array when no proxy is configured.
 */
const getProxyArgs = (): string[] => {
  const { PROXY_HOST: proxyHost, PROXY_PORT: proxyPort } = process.env;
  if (!proxyHost || !proxyPort) {
    return [];
  }
  return [`--proxy-server=${proxyHost}:${proxyPort}`];
};
// 模拟人类鼠标移动 // 模拟人类鼠标移动
async function simulateHumanMouseMovement(page: Page) { async function simulateHumanMouseMovement(page: Page) {
const viewport = page.viewport(); const viewport = page.viewport();
@@ -58,6 +75,11 @@ describe('ChngCrawler Stealth Test (Headless Mode with Stealth Plugin)', () => {
let browser: Browser; let browser: Browser;
beforeAll(async () => { beforeAll(async () => {
const proxyArgs = getProxyArgs();
if (proxyArgs.length > 0) {
console.log('Using proxy:', proxyArgs.join(' '));
}
browser = await puppeteer.launch({ browser = await puppeteer.launch({
headless: true, // 使用 headless 模式 headless: true, // 使用 headless 模式
args: [ args: [
@@ -73,6 +95,7 @@ describe('ChngCrawler Stealth Test (Headless Mode with Stealth Plugin)', () => {
'--disable-gpu', '--disable-gpu',
'--disable-features=VizDisplayCompositor', '--disable-features=VizDisplayCompositor',
'--disable-webgl', '--disable-webgl',
...proxyArgs,
], ],
defaultViewport: null defaultViewport: null
}); });

View File

@@ -4,13 +4,35 @@ import * as puppeteer from 'puppeteer';
// Increase timeout to 60 seconds for network operations // Increase timeout to 60 seconds for network operations
jest.setTimeout(60000*5); jest.setTimeout(60000*5);
/**
 * Builds the Chromium launch arguments that route traffic through the proxy
 * configured via environment variables.
 *
 * Reads PROXY_HOST and PROXY_PORT; when either one is missing no proxy is used.
 * PROXY_USERNAME / PROXY_PASSWORD are deliberately NOT turned into a launch
 * argument: Chromium has no `--proxy-auth` switch, so that flag had no effect,
 * and because the test setup prints the returned arguments it exposed the
 * proxy password in the test output. Proxy credentials have to be supplied
 * per page through `page.authenticate()`.
 *
 * @returns `['--proxy-server=<host>:<port>']`, or an empty array when no proxy is configured.
 */
const getProxyArgs = (): string[] => {
  const { PROXY_HOST: proxyHost, PROXY_PORT: proxyPort } = process.env;
  if (!proxyHost || !proxyPort) {
    return [];
  }
  return [`--proxy-server=${proxyHost}:${proxyPort}`];
};
describe('CnncecpCrawler Real Site Test', () => { describe('CnncecpCrawler Real Site Test', () => {
let browser: puppeteer.Browser; let browser: puppeteer.Browser;
beforeAll(async () => { beforeAll(async () => {
const proxyArgs = getProxyArgs();
if (proxyArgs.length > 0) {
console.log('Using proxy:', proxyArgs.join(' '));
}
browser = await puppeteer.launch({ browser = await puppeteer.launch({
headless: false, // Change to false to see browser UI headless: false, // Change to false to see browser UI
args: ['--no-sandbox', '--disable-setuid-sandbox'], args: ['--no-sandbox', '--disable-setuid-sandbox', ...proxyArgs],
}); });
}); });

View File

@@ -0,0 +1,73 @@
import { CnoocCrawler } from './cnooc_target';
import * as puppeteer from 'puppeteer';
// Increase the Jest timeout to 5 minutes (60000 ms * 5) for network operations
jest.setTimeout(60000*5);
/**
 * Builds the Chromium launch arguments that route traffic through the proxy
 * configured via environment variables.
 *
 * Reads PROXY_HOST and PROXY_PORT; when either one is missing no proxy is used.
 * PROXY_USERNAME / PROXY_PASSWORD are deliberately NOT turned into a launch
 * argument: Chromium has no `--proxy-auth` switch, so that flag had no effect,
 * and because the test setup prints the returned arguments it exposed the
 * proxy password in the test output. Proxy credentials have to be supplied
 * per page through `page.authenticate()`.
 *
 * @returns `['--proxy-server=<host>:<port>']`, or an empty array when no proxy is configured.
 */
const getProxyArgs = (): string[] => {
  const { PROXY_HOST: proxyHost, PROXY_PORT: proxyPort } = process.env;
  if (!proxyHost || !proxyPort) {
    return [];
  }
  return [`--proxy-server=${proxyHost}:${proxyPort}`];
};
/**
 * Live-site smoke test for CnoocCrawler.
 *
 * Launches a real browser (through the configured proxy, if any), runs the
 * crawler once against the production site, prints every item found and
 * sanity-checks the first one. An empty result only produces a warning.
 */
describe('CnoocCrawler Real Site Test', () => {
  let browser: puppeteer.Browser;

  // One browser instance is shared by the whole suite.
  beforeAll(async () => {
    const proxyArgs = getProxyArgs();
    if (proxyArgs.length > 0) {
      console.log('Using proxy:', proxyArgs.join(' '));
    }

    const launchArgs = ['--no-sandbox', '--disable-setuid-sandbox', ...proxyArgs];
    browser = await puppeteer.launch({
      headless: false, // headed mode: the browser UI is visible; set to true to hide it
      args: launchArgs,
    });
  });

  // The launch may have failed, so only close a browser that actually exists.
  afterAll(async () => {
    if (!browser) {
      return;
    }
    await browser.close();
  });

  it('should visit website and list all found bid information', async () => {
    const separator = '----------------------------------------';

    console.log(`\nStarting crawl for: ${CnoocCrawler.name}`);
    console.log(`Target URL: ${CnoocCrawler.url}`);

    const results = await CnoocCrawler.crawl(browser);

    console.log(`\nSuccessfully found ${results.length} items:\n`);
    console.log(separator);
    for (const [index, item] of results.entries()) {
      console.log(`${index + 1}. [${item.publishDate.toLocaleDateString()}] ${item.title}`);
      console.log(` Link: ${item.url}`);
      console.log(separator);
    }

    // Basic assertions: the crawler returned a list at all.
    expect(results).toBeDefined();
    expect(Array.isArray(results)).toBeTruthy();

    // An empty list is tolerated (the site may be empty or its markup may have changed).
    if (results.length === 0) {
      console.warn('Warning: No items found. Check if website structure has changed or if list is currently empty.');
      return;
    }

    // Data-integrity check on the first item only.
    const [firstItem] = results;
    expect(firstItem.title).toBeTruthy();
    expect(firstItem.url).toMatch(/^https?:\/\//);
    expect(firstItem.publishDate).toBeInstanceOf(Date);
  });
});

View File

@@ -0,0 +1,195 @@
import * as puppeteer from 'puppeteer';
import { Logger } from '@nestjs/common';
/**
 * Moves the mouse pointer to several random positions inside the viewport,
 * pausing briefly after each move, to mimic a human user.
 *
 * Does nothing when the page reports no viewport.
 *
 * @param page Page whose mouse is moved.
 */
async function simulateHumanMouseMovement(page: puppeteer.Page) {
  const size = page.viewport();
  if (!size) return;

  const randomBelow = (limit: number) => Math.floor(Math.random() * limit);

  // 5-9 random moves.
  let remaining = 5 + randomBelow(5);
  while (remaining > 0) {
    const targetX = randomBelow(size.width);
    const targetY = randomBelow(size.height);
    // 10-29 intermediate steps make the movement smoother.
    const steps = 10 + randomBelow(20);
    await page.mouse.move(targetX, targetY, { steps });

    // Random pause of 100-500 ms between moves.
    await new Promise((resolve) => setTimeout(resolve, 100 + Math.random() * 400));
    remaining -= 1;
  }
}
/**
 * Scrolls the page down in a few random increments with random pauses, then
 * scrolls back to the top, to mimic a human user reading the page.
 *
 * @param page Page to scroll.
 */
async function simulateHumanScrolling(page: puppeteer.Page) {
  const pause = (ms: number) => new Promise((resolve) => setTimeout(resolve, ms));

  // 3-7 downward scrolls.
  const totalScrolls = 3 + Math.floor(Math.random() * 5);
  let done = 0;
  while (done < totalScrolls) {
    // Each scroll covers 100-499 px.
    const offset = 100 + Math.floor(Math.random() * 400);
    await page.evaluate((distance) => {
      window.scrollBy({
        top: distance,
        behavior: 'smooth'
      });
    }, offset);

    // Random pause of 500-1500 ms between scrolls.
    await pause(500 + Math.random() * 1000);
    done += 1;
  }

  // Return to the top of the page and let the smooth scroll settle.
  await page.evaluate(() => {
    window.scrollTo({ top: 0, behavior: 'smooth' });
  });
  await pause(1000);
}
/** One list entry produced by CnoocCrawler (element type of `crawl()` and `extract()`). */
export interface CnoocResult {
  /** Entry title; `extract()` takes it from the text inside the item's `<font>` tag. */
  title: string;
  /** Publication date; `extract()` parses it from the item's `YYYY-MM-DD` date text. */
  publishDate: Date;
  /** Link to the entry; `extract()` prefixes the site base URL when the href does not start with "http". */
  url: string;
}
/**
 * Crawler for the CNOOC procurement site (buy.cnooc.com.cn).
 *
 * `crawl()` drives a Puppeteer page through the listing pages and `extract()`
 * turns one page of HTML into result records.
 */
export const CnoocCrawler = {
  name: '中海油招标平台',
  url: 'https://buy.cnooc.com.cn/cbjyweb/001/001001/moreinfo.html',
  baseUrl: 'https://buy.cnooc.com.cn',

  /**
   * Opens the listing page and collects entries from up to 5 pages.
   *
   * Errors are logged rather than thrown: whatever was collected before the
   * failure is returned. The page opened here is always closed.
   *
   * @param browser Browser instance used to open a new page.
   * @returns All entries extracted from the visited pages (possibly empty).
   */
  async crawl(browser: puppeteer.Browser): Promise<CnoocResult[]> {
    const logger = new Logger('CnoocCrawler');
    const page = await browser.newPage();

    // Supply proxy credentials per page when both are configured.
    const username = process.env.PROXY_USERNAME;
    const password = process.env.PROXY_PASSWORD;
    if (username && password) {
      await page.authenticate({ username, password });
    }

    // Override a few navigator properties before any page script runs, so the
    // page sees webdriver=false, a zh-CN language and a non-empty plugin list.
    await page.evaluateOnNewDocument(() => {
      Object.defineProperty(navigator, 'webdriver', { get: () => false });
      Object.defineProperty(navigator, 'language', { get: () => "zh-CN"});
      Object.defineProperty(navigator, 'plugins', { get: () => [1,2,3,4,5]});
    });
    await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36');
    await page.setViewport({ width: 1920, height: 1080 });

    const allResults: CnoocResult[] = [];
    let currentPage = 1;
    const maxPages = 5;

    try {
      logger.log(`Navigating to ${this.url}...`);
      await page.goto(this.url, { waitUntil: 'networkidle2', timeout: 60000 });

      // Simulate human behaviour before reading the first page.
      logger.log('Simulating human mouse movements...');
      await simulateHumanMouseMovement(page);
      logger.log('Simulating human scrolling...');
      await simulateHumanScrolling(page);

      while (currentPage <= maxPages) {
        logger.log(`Processing page ${currentPage}...`);
        const content = await page.content();
        const pageResults = this.extract(content);

        if (pageResults.length === 0) {
          logger.warn(`No results found on page ${currentPage}, stopping.`);
          break;
        }

        allResults.push(...pageResults);
        logger.log(`Extracted ${pageResults.length} items from page ${currentPage}`);

        // Simulate human behaviour before paginating.
        logger.log('Simulating human mouse movements before pagination...');
        await simulateHumanMouseMovement(page);
        logger.log('Simulating human scrolling before pagination...');
        await simulateHumanScrolling(page);

        // Locate the next-page link. Expected markup:
        //   <a href="https://buy.cnooc.com.cn/cbjyweb/001/001001/2.html" class="pageLink">下页 ></a>
        // NOTE(review): this selector matches every `a.pageLink` under that path and
        // `page.$` returns the first match - confirm that this is always the
        // next-page link and never a previous-page or page-number link.
        const nextButtonSelector = 'a.pageLink[href*="/cbjyweb/001/001001/"]';
        const nextButton = await page.$(nextButtonSelector);

        if (!nextButton) {
          logger.log('Next page button not found. Reached end of list.');
          break;
        }

        logger.log(`Navigating to page ${currentPage + 1}...`);
        try {
          // Click the link, then wait a fixed 3 s for the next page to load.
          // NOTE(review): a fixed delay does not guarantee the new page has loaded - confirm it is sufficient.
          await nextButton.click();
          await new Promise(r => setTimeout(r, 3000));
        } catch (navError) {
          logger.error(`Navigation to page ${currentPage + 1} failed: ${navError.message}`);
          break;
        }

        currentPage++;

        // Simulate human behaviour after paginating.
        logger.log('Simulating human mouse movements after pagination...');
        await simulateHumanMouseMovement(page);
        logger.log('Simulating human scrolling after pagination...');
        await simulateHumanScrolling(page);

        // Random delay of 1000-3000 ms between pages.
        const delay = Math.floor(Math.random() * (3000 - 1000 + 1)) + 1000;
        await new Promise(resolve => setTimeout(resolve, delay));
      }

      return allResults;
    } catch (error) {
      logger.error(`Failed to crawl ${this.name}: ${error.message}`);
      return allResults;
    } finally {
      await page.close();
    }
  },

  /**
   * Extracts list entries from the HTML of one listing page.
   *
   * Expected markup of one entry:
   *   <li class="now-hd-items clearfix">
   *     <a href="https://buy.cnooc.com.cn/cbjyweb/001/001001/20260112/....html" target="_blank" class="now-link" title="...">
   *       <font style="font-weight:bold">(title)</font>
   *     </a>
   *     <span class="now-span" style="width:100px">2026-01-12</span>
   *   </li>
   *
   * Capture groups: 1 = URL (href attribute), 2 = title (text inside <font>),
   * 3 = publication date (YYYY-MM-DD).
   *
   * Every gap between the parts is written as `(?:(?!<\/li>)[\s\S])*?` so a
   * match can never run past the closing `</li>` of its own item. With a plain
   * `[\s\S]*?`, an item lacking one of the parts (for example no `<font>`)
   * made the match continue into the following item and combine this item's
   * URL with the next item's title and date. Such incomplete items are now
   * skipped instead.
   *
   * @param html Full HTML of a listing page.
   * @returns One record per complete entry, in document order; relative URLs are prefixed with `baseUrl`.
   */
  extract(html: string): CnoocResult[] {
    const results: CnoocResult[] = [];

    // A fresh literal per call: a /g regex keeps state in lastIndex.
    const regex = /<li class="now-hd-items clearfix">(?:(?!<\/li>)[\s\S])*?<a[^>]*href="([^"]*)"[^>]*>(?:(?!<\/li>)[\s\S])*?<font[^>]*>([^<]*)<\/font>(?:(?!<\/li>)[\s\S])*?<span class="now-span"[^>]*>\s*(\d{4}-\d{2}-\d{2})\s*<\/span>(?:(?!<\/li>)[\s\S])*?<\/li>/g;

    let match;
    while ((match = regex.exec(html)) !== null) {
      const url = match[1]?.trim();
      const title = match[2]?.trim();
      const dateStr = match[3]?.trim();

      // Entries with an empty title or href are dropped.
      if (title && url) {
        results.push({
          title,
          publishDate: dateStr ? new Date(dateStr) : new Date(),
          url: url.startsWith('http') ? url : this.baseUrl + url
        });
      }
    }

    return results;
  }
};

View File

@@ -4,13 +4,35 @@ import * as puppeteer from 'puppeteer';
// Increase timeout to 60 seconds for network operations // Increase timeout to 60 seconds for network operations
jest.setTimeout(60000*5); jest.setTimeout(60000*5);
/**
 * Builds the Chromium launch arguments that route traffic through the proxy
 * configured via environment variables.
 *
 * Reads PROXY_HOST and PROXY_PORT; when either one is missing no proxy is used.
 * PROXY_USERNAME / PROXY_PASSWORD are deliberately NOT turned into a launch
 * argument: Chromium has no `--proxy-auth` switch, so that flag had no effect,
 * and because the test setup prints the returned arguments it exposed the
 * proxy password in the test output. Proxy credentials have to be supplied
 * per page through `page.authenticate()`.
 *
 * @returns `['--proxy-server=<host>:<port>']`, or an empty array when no proxy is configured.
 */
const getProxyArgs = (): string[] => {
  const { PROXY_HOST: proxyHost, PROXY_PORT: proxyPort } = process.env;
  if (!proxyHost || !proxyPort) {
    return [];
  }
  return [`--proxy-server=${proxyHost}:${proxyPort}`];
};
describe('EpsCrawler Real Site Test', () => { describe('EpsCrawler Real Site Test', () => {
let browser: puppeteer.Browser; let browser: puppeteer.Browser;
beforeAll(async () => { beforeAll(async () => {
const proxyArgs = getProxyArgs();
if (proxyArgs.length > 0) {
console.log('Using proxy:', proxyArgs.join(' '));
}
browser = await puppeteer.launch({ browser = await puppeteer.launch({
headless: false, // Change to false to see browser UI headless: false, // Change to false to see browser UI
args: ['--no-sandbox', '--disable-setuid-sandbox'], args: ['--no-sandbox', '--disable-setuid-sandbox', ...proxyArgs],
}); });
}); });

View File

@@ -4,13 +4,35 @@ import * as puppeteer from 'puppeteer';
// Increase timeout to 60 seconds for network operations // Increase timeout to 60 seconds for network operations
jest.setTimeout(60000*5); jest.setTimeout(60000*5);
/**
 * Builds the Chromium launch arguments that route traffic through the proxy
 * configured via environment variables.
 *
 * Reads PROXY_HOST and PROXY_PORT; when either one is missing no proxy is used.
 * PROXY_USERNAME / PROXY_PASSWORD are deliberately NOT turned into a launch
 * argument: Chromium has no `--proxy-auth` switch, so that flag had no effect,
 * and because the test setup prints the returned arguments it exposed the
 * proxy password in the test output. Proxy credentials have to be supplied
 * per page through `page.authenticate()`.
 *
 * @returns `['--proxy-server=<host>:<port>']`, or an empty array when no proxy is configured.
 */
const getProxyArgs = (): string[] => {
  const { PROXY_HOST: proxyHost, PROXY_PORT: proxyPort } = process.env;
  if (!proxyHost || !proxyPort) {
    return [];
  }
  return [`--proxy-server=${proxyHost}:${proxyPort}`];
};
describe('EspicCrawler Real Site Test', () => { describe('EspicCrawler Real Site Test', () => {
let browser: puppeteer.Browser; let browser: puppeteer.Browser;
beforeAll(async () => { beforeAll(async () => {
const proxyArgs = getProxyArgs();
if (proxyArgs.length > 0) {
console.log('Using proxy:', proxyArgs.join(' '));
}
browser = await puppeteer.launch({ browser = await puppeteer.launch({
headless: false, // Change to false to see browser UI headless: false, // Change to false to see browser UI
args: ['--no-sandbox', '--disable-setuid-sandbox'], args: ['--no-sandbox', '--disable-setuid-sandbox', ...proxyArgs],
}); });
}); });

View File

@@ -53,7 +53,7 @@ export interface EspicResult {
} }
export const EspicCrawler = { export const EspicCrawler = {
name: '电能e招采平台', name: '电能e招采平台(国电投)',
baseUrl: 'https://ebid.espic.com.cn', baseUrl: 'https://ebid.espic.com.cn',
// 生成动态 URL使用当前日期 // 生成动态 URL使用当前日期

View File

@@ -4,13 +4,35 @@ import * as puppeteer from 'puppeteer';
// Increase timeout to 60 seconds for network operations // Increase timeout to 60 seconds for network operations
jest.setTimeout(60000*5); jest.setTimeout(60000*5);
/**
 * Builds the Chromium launch arguments that route traffic through the proxy
 * configured via environment variables.
 *
 * Reads PROXY_HOST and PROXY_PORT; when either one is missing no proxy is used.
 * PROXY_USERNAME / PROXY_PASSWORD are deliberately NOT turned into a launch
 * argument: Chromium has no `--proxy-auth` switch, so that flag had no effect,
 * and because the test setup prints the returned arguments it exposed the
 * proxy password in the test output. Proxy credentials have to be supplied
 * per page through `page.authenticate()`.
 *
 * @returns `['--proxy-server=<host>:<port>']`, or an empty array when no proxy is configured.
 */
const getProxyArgs = (): string[] => {
  const { PROXY_HOST: proxyHost, PROXY_PORT: proxyPort } = process.env;
  if (!proxyHost || !proxyPort) {
    return [];
  }
  return [`--proxy-server=${proxyHost}:${proxyPort}`];
};
describe('PowerbeijingCrawler Real Site Test', () => { describe('PowerbeijingCrawler Real Site Test', () => {
let browser: puppeteer.Browser; let browser: puppeteer.Browser;
beforeAll(async () => { beforeAll(async () => {
const proxyArgs = getProxyArgs();
if (proxyArgs.length > 0) {
console.log('Using proxy:', proxyArgs.join(' '));
}
browser = await puppeteer.launch({ browser = await puppeteer.launch({
headless: false, // Change to false to see browser UI headless: false, // Change to false to see browser UI
args: ['--no-sandbox', '--disable-setuid-sandbox'], args: ['--no-sandbox', '--disable-setuid-sandbox', ...proxyArgs],
}); });
}); });

View File

@@ -53,7 +53,7 @@ export interface PowerbeijingResult {
} }
export const PowerbeijingCrawler = { export const PowerbeijingCrawler = {
name: '北京电力交易平台', name: '北京京能电子商务平台',
url: 'https://www.powerbeijing-ec.com/jncms/search/bulletin.html?dates=300&categoryId=2&tabName=%E6%8B%9B%E6%A0%87%E5%85%AC%E5%91%8A&page=1', url: 'https://www.powerbeijing-ec.com/jncms/search/bulletin.html?dates=300&categoryId=2&tabName=%E6%8B%9B%E6%A0%87%E5%85%AC%E5%91%8A&page=1',
baseUrl: 'https://www.powerbeijing-ec.com', baseUrl: 'https://www.powerbeijing-ec.com',

View File

@@ -0,0 +1,73 @@
import { SdiccCrawler } from './sdicc_target';
import * as puppeteer from 'puppeteer';
// Increase the Jest timeout to 5 minutes (60000 ms * 5) for network operations
jest.setTimeout(60000*5);
/**
 * Builds the Chromium launch arguments that route traffic through the proxy
 * configured via environment variables.
 *
 * Reads PROXY_HOST and PROXY_PORT; when either one is missing no proxy is used.
 * PROXY_USERNAME / PROXY_PASSWORD are deliberately NOT turned into a launch
 * argument: Chromium has no `--proxy-auth` switch, so that flag had no effect,
 * and because the test setup prints the returned arguments it exposed the
 * proxy password in the test output. Proxy credentials have to be supplied
 * per page through `page.authenticate()`.
 *
 * @returns `['--proxy-server=<host>:<port>']`, or an empty array when no proxy is configured.
 */
const getProxyArgs = (): string[] => {
  const { PROXY_HOST: proxyHost, PROXY_PORT: proxyPort } = process.env;
  if (!proxyHost || !proxyPort) {
    return [];
  }
  return [`--proxy-server=${proxyHost}:${proxyPort}`];
};
/**
 * Live-site smoke test for SdiccCrawler.
 *
 * Launches a real browser (through the configured proxy, if any), runs the
 * crawler once against the production site, prints every item found and
 * sanity-checks the first one. An empty result only produces a warning.
 */
describe('SdiccCrawler Real Site Test', () => {
  let browser: puppeteer.Browser;

  // One browser instance is shared by the whole suite.
  beforeAll(async () => {
    const proxyArgs = getProxyArgs();
    if (proxyArgs.length > 0) {
      console.log('Using proxy:', proxyArgs.join(' '));
    }

    const launchArgs = ['--no-sandbox', '--disable-setuid-sandbox', ...proxyArgs];
    browser = await puppeteer.launch({
      headless: false, // headed mode: the browser UI is visible; set to true to hide it
      args: launchArgs,
    });
  });

  // The launch may have failed, so only close a browser that actually exists.
  afterAll(async () => {
    if (!browser) {
      return;
    }
    await browser.close();
  });

  it('should visit website and list all found bid information', async () => {
    const separator = '----------------------------------------';

    console.log(`\nStarting crawl for: ${SdiccCrawler.name}`);
    console.log(`Target URL: ${SdiccCrawler.url}`);

    const results = await SdiccCrawler.crawl(browser);

    console.log(`\nSuccessfully found ${results.length} items:\n`);
    console.log(separator);
    for (const [index, item] of results.entries()) {
      console.log(`${index + 1}. [${item.publishDate.toLocaleDateString()}] ${item.title}`);
      console.log(` Link: ${item.url}`);
      console.log(separator);
    }

    // Basic assertions: the crawler returned a list at all.
    expect(results).toBeDefined();
    expect(Array.isArray(results)).toBeTruthy();

    // An empty list is tolerated (the site may be empty or its markup may have changed).
    if (results.length === 0) {
      console.warn('Warning: No items found. Check if website structure has changed or if list is currently empty.');
      return;
    }

    // Data-integrity check on the first item only.
    const [firstItem] = results;
    expect(firstItem.title).toBeTruthy();
    expect(firstItem.url).toMatch(/^https?:\/\//);
    expect(firstItem.publishDate).toBeInstanceOf(Date);
  });
});

View File

@@ -0,0 +1,203 @@
import * as puppeteer from 'puppeteer';
import { Logger } from '@nestjs/common';
/**
 * Moves the mouse pointer to several random positions inside the viewport,
 * pausing briefly after each move, to mimic a human user.
 *
 * Does nothing when the page reports no viewport.
 *
 * @param page Page whose mouse is moved.
 */
async function simulateHumanMouseMovement(page: puppeteer.Page) {
  const size = page.viewport();
  if (!size) return;

  const randomBelow = (limit: number) => Math.floor(Math.random() * limit);

  // 5-9 random moves.
  let remaining = 5 + randomBelow(5);
  while (remaining > 0) {
    const targetX = randomBelow(size.width);
    const targetY = randomBelow(size.height);
    // 10-29 intermediate steps make the movement smoother.
    const steps = 10 + randomBelow(20);
    await page.mouse.move(targetX, targetY, { steps });

    // Random pause of 100-500 ms between moves.
    await new Promise((resolve) => setTimeout(resolve, 100 + Math.random() * 400));
    remaining -= 1;
  }
}
/**
 * Scrolls the page down in a few random increments with random pauses, then
 * scrolls back to the top, to mimic a human user reading the page.
 *
 * @param page Page to scroll.
 */
async function simulateHumanScrolling(page: puppeteer.Page) {
  const pause = (ms: number) => new Promise((resolve) => setTimeout(resolve, ms));

  // 3-7 downward scrolls.
  const totalScrolls = 3 + Math.floor(Math.random() * 5);
  let done = 0;
  while (done < totalScrolls) {
    // Each scroll covers 100-499 px.
    const offset = 100 + Math.floor(Math.random() * 400);
    await page.evaluate((distance) => {
      window.scrollBy({
        top: distance,
        behavior: 'smooth'
      });
    }, offset);

    // Random pause of 500-1500 ms between scrolls.
    await pause(500 + Math.random() * 1000);
    done += 1;
  }

  // Return to the top of the page and let the smooth scroll settle.
  await page.evaluate(() => {
    window.scrollTo({ top: 0, behavior: 'smooth' });
  });
  await pause(1000);
}
/** One list entry returned by SdiccCrawler (element type of the array `crawl()` resolves to). */
export interface SdiccResult {
  /** Title of the entry. */
  title: string;
  /** Publication date of the entry. */
  publishDate: Date;
  /** Link to the entry - presumably an absolute URL; confirm against the extraction code. */
  url: string;
}
/**
 * Crawler for the SDIC e-procurement announcement list (sdicc.com.cn).
 *
 * `crawl` drives a Puppeteer page through up to five list pages and
 * `extract` turns one page of HTML into `SdiccResult` records.
 */
export const SdiccCrawler = {
  name: '国投集团电子采购平台',
  url: 'https://www.sdicc.com.cn/cgxx/ggList',
  baseUrl: 'https://www.sdicc.com.cn',

  /**
   * Opens the announcement list, extracts every row of the current page and
   * follows the "next page" button until the page limit is reached, a page
   * yields no rows, or the button is missing.
   *
   * Errors raised while navigating or paginating are logged and the results
   * collected so far are returned. Errors raised while preparing the page
   * (proxy auth, user agent, viewport) still propagate to the caller, but the
   * page is now closed in that case as well.
   *
   * @param browser Puppeteer browser used to open a new page.
   * @returns All announcements collected before the crawl stopped.
   */
  async crawl(browser: puppeteer.Browser): Promise<SdiccResult[]> {
    const logger = new Logger('SdiccCrawler');
    const page = await browser.newPage();
    const allResults: SdiccResult[] = [];
    const maxPages = 5;

    // Outer try/finally: the page must be closed even if the setup below fails.
    try {
      // Proxy credentials are optional; authenticate only when both are set.
      const username = process.env.PROXY_USERNAME;
      const password = process.env.PROXY_PASSWORD;
      if (username && password) {
        await page.authenticate({ username, password });
      }

      // Mask the most common automation fingerprints before any site script runs.
      await page.evaluateOnNewDocument(() => {
        Object.defineProperty(navigator, 'webdriver', { get: () => false });
        Object.defineProperty(navigator, 'language', { get: () => "zh-CN"});
        Object.defineProperty(navigator, 'plugins', { get: () => [1,2,3,4,5]});
      });
      await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36');
      await page.setViewport({ width: 1920, height: 1080 });

      try {
        logger.log(`Navigating to ${this.url}...`);
        await page.goto(this.url, { waitUntil: 'networkidle2', timeout: 60000 });

        // Simulate human behaviour before touching the list.
        logger.log('Simulating human mouse movements...');
        await simulateHumanMouseMovement(page);
        logger.log('Simulating human scrolling...');
        await simulateHumanScrolling(page);

        // Wait for the table rows; a timeout is tolerated because the
        // extraction step below reports an empty page on its own.
        logger.log('Waiting for table to load...');
        await page.waitForSelector('.tbody table tbody tr', { timeout: 30000 }).catch(() => {
          logger.warn('Table rows not found, trying alternative selectors...');
        });

        for (let currentPage = 1; currentPage <= maxPages; currentPage++) {
          logger.log(`Processing page ${currentPage}...`);
          const content = await page.content();
          const pageResults = this.extract(content);
          if (pageResults.length === 0) {
            logger.warn(`No results found on page ${currentPage}, stopping.`);
            break;
          }
          allResults.push(...pageResults);
          logger.log(`Extracted ${pageResults.length} items from page ${currentPage}`);

          // The next page would never be read, so skip the click, the
          // simulated activity and the waits after the last allowed page.
          if (currentPage >= maxPages) {
            logger.log(`Reached page limit (${maxPages}), stopping.`);
            break;
          }

          // Simulate human behaviour before paginating.
          logger.log('Simulating human mouse movements before pagination...');
          await simulateHumanMouseMovement(page);
          logger.log('Simulating human scrolling before pagination...');
          await simulateHumanScrolling(page);

          // Locate the "next page" button.
          const nextButtonSelector = '#page_btnLas';
          const nextButton = await page.$(nextButtonSelector);
          if (!nextButton) {
            logger.log('Next page button not found. Reached end of list.');
            break;
          }

          logger.log(`Navigating to page ${currentPage + 1}...`);
          try {
            // Register the navigation wait BEFORE clicking so a fast
            // navigation cannot finish unobserved. The wait's own rejection
            // is ignored on purpose: the list may refresh without a real
            // navigation, in which case the wait simply times out.
            await Promise.all([
              page.waitForNavigation({ waitUntil: 'networkidle2', timeout: 60000 }).catch(() => {}),
              nextButton.click(),
            ]);
            // Extra wait so the freshly loaded rows are rendered.
            await new Promise(r => setTimeout(r, 2000));
          } catch (navError) {
            logger.error(`Navigation to page ${currentPage + 1} failed: ${navError.message}`);
            break;
          }

          // Simulate human behaviour after paginating.
          logger.log('Simulating human mouse movements after pagination...');
          await simulateHumanMouseMovement(page);
          logger.log('Simulating human scrolling after pagination...');
          await simulateHumanScrolling(page);

          // Random 1-3s delay between pages.
          const delay = Math.floor(Math.random() * (3000 - 1000 + 1)) + 1000;
          await new Promise(resolve => setTimeout(resolve, delay));
        }
      } catch (error) {
        // Best effort: keep whatever was collected before the failure.
        logger.error(`Failed to crawl ${this.name}: ${error.message}`);
      }
      return allResults;
    } finally {
      await page.close();
    }
  },

  /**
   * Parses one list page and returns its announcements.
   *
   * Regex capture groups:
   *   1: first argument of the row's `urlChange(...)` handler (used as ggGuid)
   *   2: second argument of `urlChange(...)` (used as gcGuid)
   *   3: title text of the first `<td><span>…</span></td>` cell
   *   4: publish date in YYYY-MM-DD form
   * NOTE(review): the argument-to-GUID mapping is kept exactly as the code had
   * it; confirm against the site's `urlChange` implementation.
   *
   * Example row:
   * <tr onclick="urlChange('a853e226-09bd-441c-8f05-badb945932f0','ec2ccdd8-1464-4a96-ad99-24a5396d028c')">
   *   <td colspan="1" rowspan="1" style="text-align: center;">1</td>
   *   <td colspan="1" rowspan="1"><span style="margin-right: 5px;">国投罗钾公司硫酸钾厂球磨机控制系统升级项目公开招标公告</span></td>
   *   <td colspan="1" rowspan="1"><span>服务</span></td>
   *   <td colspan="1" rowspan="1"><span> 2026-01-09 </span></td>
   * </tr>
   *
   * @param html Full HTML of a list page.
   * @returns One record per matching row; an empty array when nothing matches.
   */
  extract(html: string): SdiccResult[] {
    const results: SdiccResult[] = [];
    const regex = /<tr[^>]*onclick="urlChange\('([^']+)','([^']+)'\)"[^>]*>[\s\S]*?<td[^>]*><span[^>]*>([^<]+)<\/span><\/td>[\s\S]*?<td[^>]*><span[^>]*>\s*(\d{4}-\d{2}-\d{2})\s*<\/span><\/td>[\s\S]*?<\/tr>/gs;
    let match;
    while ((match = regex.exec(html)) !== null) {
      const ggGuid = match[1]?.trim();
      const gcGuid = match[2]?.trim();
      const title = match[3]?.trim();
      const dateStr = match[4]?.trim();
      if (title && ggGuid && gcGuid) {
        // The pattern only checks the digit layout, so an impossible calendar
        // date (e.g. month 13) would give an Invalid Date; fall back to "now",
        // the same fallback used when no date is present.
        const parsedDate = dateStr ? new Date(dateStr) : new Date();
        const publishDate = Number.isNaN(parsedDate.getTime()) ? new Date() : parsedDate;
        results.push({
          title,
          publishDate,
          url: `${this.baseUrl}/cgxx/ggDetail?gcGuid=${gcGuid}&ggGuid=${ggGuid}`
        });
      }
    }
    return results;
  }
};

View File

@@ -4,10 +4,32 @@ import * as puppeteer from 'puppeteer';
// Increase timeout to 120 seconds for manual inspection and slow sites // Increase timeout to 120 seconds for manual inspection and slow sites
jest.setTimeout(120000); jest.setTimeout(120000);
/**
 * Builds the Chromium launch flags for the optional test proxy.
 *
 * Reads PROXY_HOST and PROXY_PORT from the environment and returns a single
 * `--proxy-server=host:port` flag when both are set, otherwise an empty array.
 *
 * Proxy credentials are deliberately NOT turned into a flag: Chromium has no
 * `--proxy-auth` switch, so such an argument has no effect, and because the
 * returned args are printed by the "Using proxy:" log it would expose the
 * password in test output. Credentials (PROXY_USERNAME / PROXY_PASSWORD) have
 * to be supplied per page through `page.authenticate()`.
 * NOTE(review): confirm the crawler under test calls `page.authenticate()`.
 *
 * @returns Launch arguments to spread into `puppeteer.launch({ args })`.
 */
const getProxyArgs = (): string[] => {
  const proxyHost = process.env.PROXY_HOST;
  const proxyPort = process.env.PROXY_PORT;
  if (!proxyHost || !proxyPort) {
    return [];
  }
  return [`--proxy-server=${proxyHost}:${proxyPort}`];
};
describe('SzecpCrawler Real Site Test', () => { describe('SzecpCrawler Real Site Test', () => {
let browser: puppeteer.Browser; let browser: puppeteer.Browser;
beforeAll(async () => { beforeAll(async () => {
const proxyArgs = getProxyArgs();
if (proxyArgs.length > 0) {
console.log('Using proxy:', proxyArgs.join(' '));
}
browser = await puppeteer.launch({ browser = await puppeteer.launch({
headless: false, // Run in non-headless mode headless: false, // Run in non-headless mode
args: [ args: [
@@ -16,6 +38,7 @@ describe('SzecpCrawler Real Site Test', () => {
'--disable-blink-features=AutomationControlled', '--disable-blink-features=AutomationControlled',
'--window-size=1920,1080', '--window-size=1920,1080',
'--disable-infobars', '--disable-infobars',
...proxyArgs,
], ],
defaultViewport: null defaultViewport: null
}); });

View File

@@ -14,8 +14,8 @@ export class BidCrawlTask {
@Cron(CronExpression.EVERY_DAY_AT_MIDNIGHT) @Cron(CronExpression.EVERY_DAY_AT_MIDNIGHT)
async handleCron() { async handleCron() {
this.logger.debug('Scheduled crawl task started'); // this.logger.debug('Scheduled crawl task started');
await this.crawlerService.crawlAll(); // await this.crawlerService.crawlAll();
} }
@Cron(CronExpression.EVERY_DAY_AT_MIDNIGHT) @Cron(CronExpression.EVERY_DAY_AT_MIDNIGHT)

View File

@@ -0,0 +1,57 @@
import { NestFactory } from '@nestjs/core';
import { AppModule } from '../app.module';
import { getRepositoryToken } from '@nestjs/typeorm';
import { Repository } from 'typeorm';
import { BidItem } from '../bids/entities/bid-item.entity';
import { CustomLogger } from '../common/logger/logger.service';
/**
 * One-off maintenance script: rewrites the `source` column of existing
 * BidItem rows from the old platform name to the new one.
 *
 * Boots a Nest application context, counts the rows that still carry the old
 * name, updates them in a single query, then closes the context and exits the
 * process with code 0 on success or 1 on failure.
 */
async function updateSource() {
  const app = await NestFactory.createApplicationContext(AppModule);

  // Route all output through the project's custom logger.
  const logger = await app.resolve(CustomLogger);
  app.useLogger(logger);
  logger.setContext('UpdateSourceScript');

  let exitCode = 0;
  try {
    // Repository for BidItem.
    const bidItemRepository = app.get<Repository<BidItem>>(getRepositoryToken(BidItem));
    const oldSource = '北京电力交易平台';
    const newSource = '北京京能电子商务平台';
    logger.log(`开始更新 source 字段: "${oldSource}" -> "${newSource}"`);

    // Count first so the log states how many rows are affected.
    const count = await bidItemRepository.count({
      where: { source: oldSource },
    });
    logger.log(`找到 ${count} 条需要更新的记录`);

    if (count === 0) {
      logger.log('没有需要更新的记录');
    } else {
      // Perform the rename in one UPDATE statement.
      const result = await bidItemRepository
        .createQueryBuilder()
        .update(BidItem)
        .set({ source: newSource })
        .where('source = :oldSource', { oldSource })
        .execute();
      logger.log(`成功更新 ${result.affected} 条记录`);
    }
  } catch (error) {
    logger.error('更新失败:', error);
    exitCode = 1;
  } finally {
    // Single shutdown path: close exactly once on success and on failure.
    await app.close();
  }
  process.exit(exitCode);
}

// Failures outside the guarded section (bootstrapping the context, closing
// it) reject the promise; report them and exit non-zero instead of leaving
// the promise unhandled.
updateSource().catch((error) => {
  console.error(error);
  process.exit(1);
});

9
test/jest-e2e.json Normal file
View File

@@ -0,0 +1,9 @@
{
"moduleFileExtensions": ["js", "json", "ts"],
"rootDir": ".",
"testEnvironment": "node",
"testRegex": ".e2e-spec.ts$",
"transform": {
"^.+\\.(t|j)s$": "ts-jest"
}
}