Compare commits
10 Commits
d9105797f4
...
66f535ed0c
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
66f535ed0c | ||
|
|
a1badea135 | ||
|
|
b839779ec6 | ||
|
|
3454d9d07f | ||
|
|
bf17587bd3 | ||
|
|
74a4aec363 | ||
|
|
aa9b33bd94 | ||
|
|
6d626a0946 | ||
|
|
044fd770f7 | ||
|
|
07a7301968 |
6
.env
6
.env
@@ -4,4 +4,8 @@ DATABASE_PORT=23306
|
||||
DATABASE_USERNAME=root
|
||||
DATABASE_PASSWORD=410491
|
||||
DATABASE_NAME=bidding
|
||||
DATABASE_SYNCHRONIZE=true
|
||||
DATABASE_SYNCHRONIZE=true
|
||||
|
||||
# 代理配置(可选)
|
||||
PROXY_HOST=127.0.0.1
|
||||
PROXY_PORT=3211
|
||||
13
.env.example
Normal file
13
.env.example
Normal file
@@ -0,0 +1,13 @@
|
||||
DATABASE_TYPE=mariadb
|
||||
DATABASE_HOST=localhost
|
||||
DATABASE_PORT=3306
|
||||
DATABASE_USERNAME=root
|
||||
DATABASE_PASSWORD=root
|
||||
DATABASE_NAME=bidding
|
||||
DATABASE_SYNCHRONIZE=true
|
||||
|
||||
# 代理配置(可选)
|
||||
PROXY_HOST=127.0.0.1
|
||||
PROXY_PORT=6000
|
||||
# PROXY_USERNAME=
|
||||
# PROXY_PASSWORD=
|
||||
@@ -34,7 +34,7 @@
|
||||
<div v-if="activeIndex === '1'">
|
||||
<div style="display: flex; justify-content: space-between; align-items: center; margin-bottom: 20px;">
|
||||
<h2 style="margin: 0;">Dashboard</h2>
|
||||
<el-button type="primary" :loading="crawling" @click="handleCrawl">
|
||||
<el-button type="primary" :loading="crawling" :disabled="isCrawling" @click="handleCrawl">
|
||||
<el-icon style="margin-right: 5px"><Refresh /></el-icon>
|
||||
立刻抓取
|
||||
</el-button>
|
||||
@@ -63,8 +63,26 @@
|
||||
</el-col>
|
||||
</el-row>
|
||||
<el-divider />
|
||||
<h3>Today's Bids</h3>
|
||||
<el-table :data="bids" v-loading="loading" style="width: 100%">
|
||||
<div style="display: flex; justify-content: space-between; align-items: center; margin-bottom: 10px;">
|
||||
<h3 style="margin: 0;">Today's Bids</h3>
|
||||
<el-select
|
||||
v-model="selectedKeywords"
|
||||
multiple
|
||||
collapse-tags
|
||||
collapse-tags-tooltip
|
||||
placeholder="Filter by Keywords"
|
||||
clearable
|
||||
style="width: 300px;"
|
||||
>
|
||||
<el-option
|
||||
v-for="keyword in keywords"
|
||||
:key="keyword.id"
|
||||
:label="keyword.word"
|
||||
:value="keyword.word"
|
||||
/>
|
||||
</el-select>
|
||||
</div>
|
||||
<el-table :data="filteredTodayBids" v-loading="loading" style="width: 100%">
|
||||
<el-table-column prop="title" label="Title">
|
||||
<template #default="scope">
|
||||
<a :href="scope.row.url" target="_blank">{{ scope.row.title }}</a>
|
||||
@@ -78,7 +96,17 @@
|
||||
</div>
|
||||
|
||||
<div v-if="activeIndex === '2'">
|
||||
<h2>All Bids</h2>
|
||||
<div style="display: flex; justify-content: space-between; align-items: center; margin-bottom: 20px;">
|
||||
<h2 style="margin: 0;">All Bids</h2>
|
||||
<el-select v-model="selectedSource" placeholder="Filter by Source" clearable style="width: 200px" @change="currentPage = 1; fetchData()">
|
||||
<el-option
|
||||
v-for="source in sourceOptions"
|
||||
:key="source"
|
||||
:label="source"
|
||||
:value="source"
|
||||
/>
|
||||
</el-select>
|
||||
</div>
|
||||
<el-table :data="bids" v-loading="loading" style="width: 100%">
|
||||
<el-table-column prop="title" label="Title">
|
||||
<template #default="scope">
|
||||
@@ -90,6 +118,16 @@
|
||||
<template #default="scope">{{ formatDate(scope.row.publishDate) }}</template>
|
||||
</el-table-column>
|
||||
</el-table>
|
||||
<el-pagination
|
||||
v-model:current-page="currentPage"
|
||||
v-model:page-size="pageSize"
|
||||
:page-sizes="[10, 20, 50, 100]"
|
||||
:total="total"
|
||||
layout="total, sizes, prev, pager, next, jumper"
|
||||
@current-change="handlePageChange"
|
||||
@size-change="handleSizeChange"
|
||||
style="margin-top: 20px; justify-content: flex-end;"
|
||||
/>
|
||||
</div>
|
||||
|
||||
<div v-if="activeIndex === '3'">
|
||||
@@ -97,16 +135,20 @@
|
||||
<h2>Keyword Management</h2>
|
||||
<el-button type="primary" @click="dialogVisible = true">Add Keyword</el-button>
|
||||
</div>
|
||||
|
||||
<el-table :data="keywords" v-loading="loading" style="width: 100%">
|
||||
<el-table-column prop="word" label="Keyword" />
|
||||
<el-table-column prop="weight" label="Weight" />
|
||||
<el-table-column label="Action">
|
||||
<template #default="scope">
|
||||
<el-button type="danger" size="small" @click="handleDeleteKeyword(scope.row.id)">Delete</el-button>
|
||||
</template>
|
||||
</el-table-column>
|
||||
</el-table>
|
||||
|
||||
<div v-loading="loading" style="min-height: 200px;">
|
||||
<el-tag
|
||||
v-for="keyword in keywords"
|
||||
:key="keyword.id"
|
||||
closable
|
||||
:type="getTagType(keyword.weight)"
|
||||
@close="handleDeleteKeyword(keyword.id)"
|
||||
style="margin: 5px;"
|
||||
>
|
||||
{{ keyword.word }}
|
||||
</el-tag>
|
||||
<el-empty v-if="keywords.length === 0" description="No keywords" />
|
||||
</div>
|
||||
</div>
|
||||
</el-main>
|
||||
</el-container>
|
||||
@@ -131,7 +173,7 @@
|
||||
</template>
|
||||
|
||||
<script setup lang="ts">
|
||||
import { ref, onMounted, reactive } from 'vue'
|
||||
import { ref, onMounted, reactive, computed, watch } from 'vue'
|
||||
import axios from 'axios'
|
||||
import { ElMessage } from 'element-plus'
|
||||
import { DataBoard, Document, Setting, Refresh } from '@element-plus/icons-vue'
|
||||
@@ -143,32 +185,102 @@ const keywords = ref<any[]>([])
|
||||
const loading = ref(false)
|
||||
const crawling = ref(false)
|
||||
const dialogVisible = ref(false)
|
||||
const selectedSource = ref('')
|
||||
const currentPage = ref(1)
|
||||
const pageSize = ref(10)
|
||||
const total = ref(0)
|
||||
const sourceOptions = ref<string[]>([])
|
||||
const isCrawling = ref(false)
|
||||
const selectedKeywords = ref<string[]>([])
|
||||
|
||||
// 从 localStorage 加载保存的关键字
|
||||
const loadSavedKeywords = () => {
|
||||
const saved = localStorage.getItem('selectedKeywords')
|
||||
if (saved) {
|
||||
try {
|
||||
selectedKeywords.value = JSON.parse(saved)
|
||||
} catch (e) {
|
||||
console.error('Failed to parse saved keywords:', e)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// 监听关键字变化并保存到 localStorage
|
||||
watch(selectedKeywords, (newKeywords) => {
|
||||
localStorage.setItem('selectedKeywords', JSON.stringify(newKeywords))
|
||||
}, { deep: true })
|
||||
|
||||
const form = reactive({
|
||||
word: '',
|
||||
weight: 1
|
||||
})
|
||||
|
||||
// 根据 weight 获取 tag 类型
|
||||
const getTagType = (weight: number) => {
|
||||
if (weight >= 5) return 'danger'
|
||||
if (weight >= 4) return 'warning'
|
||||
if (weight >= 3) return 'primary'
|
||||
if (weight >= 2) return 'success'
|
||||
return 'info'
|
||||
}
|
||||
|
||||
const handleSelect = (key: string) => {
|
||||
activeIndex.value = key
|
||||
}
|
||||
|
||||
// 处理分页变化
|
||||
const handlePageChange = (page: number) => {
|
||||
currentPage.value = page
|
||||
fetchData()
|
||||
}
|
||||
|
||||
// 处理每页数量变化
|
||||
const handleSizeChange = (size: number) => {
|
||||
pageSize.value = size
|
||||
currentPage.value = 1
|
||||
fetchData()
|
||||
}
|
||||
|
||||
const formatDate = (dateString: string) => {
|
||||
if (!dateString) return '-'
|
||||
return new Date(dateString).toLocaleDateString()
|
||||
}
|
||||
|
||||
// 过滤 Today's Bids,只显示包含所选关键字的项目
|
||||
const filteredTodayBids = computed(() => {
|
||||
if (selectedKeywords.value.length === 0) {
|
||||
return bids.value
|
||||
}
|
||||
|
||||
return bids.value.filter(bid => {
|
||||
return selectedKeywords.value.some(keyword =>
|
||||
bid.title.toLowerCase().includes(keyword.toLowerCase())
|
||||
)
|
||||
})
|
||||
})
|
||||
|
||||
const fetchData = async () => {
|
||||
loading.value = true
|
||||
try {
|
||||
const [bidsRes, highRes, kwRes] = await Promise.all([
|
||||
axios.get('/api/bids'),
|
||||
const [bidsRes, highRes, kwRes, sourcesRes, statusRes] = await Promise.all([
|
||||
axios.get('/api/bids', {
|
||||
params: {
|
||||
page: currentPage.value,
|
||||
limit: pageSize.value,
|
||||
source: selectedSource.value || undefined
|
||||
}
|
||||
}),
|
||||
axios.get('/api/bids/high-priority'),
|
||||
axios.get('/api/keywords')
|
||||
axios.get('/api/keywords'),
|
||||
axios.get('/api/bids/sources'),
|
||||
axios.get('/api/crawler/status')
|
||||
])
|
||||
bids.value = bidsRes.data.items
|
||||
total.value = bidsRes.data.total
|
||||
highPriorityBids.value = highRes.data
|
||||
keywords.value = kwRes.data
|
||||
sourceOptions.value = sourcesRes.data
|
||||
isCrawling.value = statusRes.data.isCrawling
|
||||
} catch (error) {
|
||||
ElMessage.error('Failed to fetch data')
|
||||
} finally {
|
||||
@@ -177,6 +289,10 @@ const fetchData = async () => {
|
||||
}
|
||||
|
||||
const handleCrawl = async () => {
|
||||
if (isCrawling.value) {
|
||||
ElMessage.warning('Crawl is already running')
|
||||
return
|
||||
}
|
||||
crawling.value = true
|
||||
try {
|
||||
await axios.post('/api/crawler/run')
|
||||
@@ -217,6 +333,7 @@ const handleDeleteKeyword = async (id: string) => {
|
||||
}
|
||||
|
||||
onMounted(() => {
|
||||
loadSavedKeywords()
|
||||
fetchData()
|
||||
})
|
||||
</script>
|
||||
|
||||
@@ -14,4 +14,9 @@ export class BidsController {
|
||||
getHighPriority() {
|
||||
return this.bidsService.getHighPriorityCorrected();
|
||||
}
|
||||
|
||||
@Get('sources')
|
||||
getSources() {
|
||||
return this.bidsService.getSources();
|
||||
}
|
||||
}
|
||||
|
||||
@@ -20,6 +20,12 @@ export class BidItem {
|
||||
@Column({ default: false })
|
||||
isRead: boolean;
|
||||
|
||||
@Column({ default: 0 })
|
||||
priority: number;
|
||||
|
||||
@Column({ nullable: true })
|
||||
unit: string;
|
||||
|
||||
@CreateDateColumn()
|
||||
createdAt: Date;
|
||||
|
||||
|
||||
@@ -65,4 +65,14 @@ export class BidsService {
|
||||
createdAt: LessThan(thirtyDaysAgo),
|
||||
});
|
||||
}
|
||||
|
||||
async getSources() {
|
||||
const result = await this.bidRepository
|
||||
.createQueryBuilder('bid')
|
||||
.select('DISTINCT bid.source')
|
||||
.where('bid.source IS NOT NULL')
|
||||
.orderBy('bid.source', 'ASC')
|
||||
.getRawMany();
|
||||
return result.map((item: any) => item.source);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,21 +1,38 @@
|
||||
import { Controller, Post } from '@nestjs/common';
|
||||
import { Controller, Post, Get } from '@nestjs/common';
|
||||
import { BidCrawlerService } from './services/bid-crawler.service';
|
||||
|
||||
@Controller('api/crawler')
|
||||
export class CrawlerController {
|
||||
private isCrawling = false;
|
||||
|
||||
constructor(private readonly crawlerService: BidCrawlerService) {}
|
||||
|
||||
@Get('status')
|
||||
getStatus() {
|
||||
return { isCrawling: this.isCrawling };
|
||||
}
|
||||
|
||||
@Post('run')
|
||||
async runCrawl() {
|
||||
if (this.isCrawling) {
|
||||
return { message: 'Crawl is already running' };
|
||||
}
|
||||
|
||||
this.isCrawling = true;
|
||||
|
||||
// We don't await this because we want it to run in the background
|
||||
// and return immediately, or we can await if we want the user to wait.
|
||||
// and return immediately, or we can await if we want to user to wait.
|
||||
// Given the requirement "Immediate Crawl", usually implies triggering it.
|
||||
// However, for a better UI experience, we might want to wait or just trigger.
|
||||
// Let's await it so the user knows when it's done (or failed),
|
||||
// Let's await it so that user knows when it's done (or failed),
|
||||
// assuming it doesn't take too long for the mock.
|
||||
// Real crawling might take long, so background is better.
|
||||
// For this prototype, I'll await it to show completion.
|
||||
await this.crawlerService.crawlAll();
|
||||
return { message: 'Crawl completed successfully' };
|
||||
try {
|
||||
await this.crawlerService.crawlAll();
|
||||
return { message: 'Crawl completed successfully' };
|
||||
} finally {
|
||||
this.isCrawling = false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,7 +1,17 @@
|
||||
import { Injectable, Logger } from '@nestjs/common';
|
||||
import { ConfigService } from '@nestjs/config';
|
||||
import * as puppeteer from 'puppeteer';
|
||||
import { BidsService } from '../../bids/services/bid.service';
|
||||
import { ChdtpCrawler } from './chdtp_target';
|
||||
import { ChngCrawler } from './chng_target';
|
||||
import { SzecpCrawler } from './szecp_target';
|
||||
import { CdtCrawler } from './cdt_target';
|
||||
import { EpsCrawler } from './eps_target';
|
||||
import { CnncecpCrawler } from './cnncecp_target';
|
||||
import { CgnpcCrawler } from './cgnpc_target';
|
||||
import { CeicCrawler } from './ceic_target';
|
||||
import { EspicCrawler } from './espic_target';
|
||||
import { PowerbeijingCrawler } from './powerbeijing_target';
|
||||
|
||||
@Injectable()
|
||||
export class BidCrawlerService {
|
||||
@@ -9,38 +19,89 @@ export class BidCrawlerService {
|
||||
|
||||
constructor(
|
||||
private bidsService: BidsService,
|
||||
private configService: ConfigService,
|
||||
) {}
|
||||
|
||||
async crawlAll() {
|
||||
this.logger.log('Starting crawl task with Puppeteer...');
|
||||
|
||||
// 设置最大执行时间为1小时
|
||||
const maxExecutionTime = 60 * 60 * 1000; // 1小时(毫秒)
|
||||
const startTime = Date.now();
|
||||
|
||||
// 从环境变量读取代理配置
|
||||
const proxyHost = this.configService.get<string>('PROXY_HOST');
|
||||
const proxyPort = this.configService.get<string>('PROXY_PORT');
|
||||
const proxyUsername = this.configService.get<string>('PROXY_USERNAME');
|
||||
const proxyPassword = this.configService.get<string>('PROXY_PASSWORD');
|
||||
|
||||
// 构建代理参数
|
||||
const args = [
|
||||
'--no-sandbox',
|
||||
'--disable-setuid-sandbox',
|
||||
'--disable-blink-features=AutomationControlled',
|
||||
'--disable-infobars',
|
||||
'--window-position=0,0',
|
||||
'--ignore-certifcate-errors',
|
||||
'--ignore-certifcate-errors-spki-list',
|
||||
];
|
||||
|
||||
if (proxyHost && proxyPort) {
|
||||
const proxyUrl = proxyUsername && proxyPassword
|
||||
? `http://${proxyUsername}:${proxyPassword}@${proxyHost}:${proxyPort}`
|
||||
: `http://${proxyHost}:${proxyPort}`;
|
||||
args.push(`--proxy-server=${proxyUrl}`);
|
||||
this.logger.log(`Using proxy: ${proxyHost}:${proxyPort}`);
|
||||
}
|
||||
|
||||
const browser = await puppeteer.launch({
|
||||
headless: true,
|
||||
args: ['--no-sandbox', '--disable-setuid-sandbox'],
|
||||
headless: false,
|
||||
args,
|
||||
});
|
||||
|
||||
const crawlers = [ChdtpCrawler, ChngCrawler, SzecpCrawler, CdtCrawler, EpsCrawler, CnncecpCrawler, CgnpcCrawler, CeicCrawler, EspicCrawler, PowerbeijingCrawler];
|
||||
|
||||
try {
|
||||
// Currently only supports ChdtpCrawler, but can be extended to a list of crawlers
|
||||
const crawler = ChdtpCrawler;
|
||||
this.logger.log(`Crawling: ${crawler.name}`);
|
||||
|
||||
const results = await crawler.crawl(browser);
|
||||
this.logger.log(`Extracted ${results.length} items from ${crawler.name}`);
|
||||
for (const crawler of crawlers) {
|
||||
this.logger.log(`Crawling: ${crawler.name}`);
|
||||
|
||||
// 检查是否超时
|
||||
const elapsedTime = Date.now() - startTime;
|
||||
if (elapsedTime > maxExecutionTime) {
|
||||
this.logger.warn(`⚠️ Crawl task exceeded maximum execution time of 1 hour. Stopping...`);
|
||||
this.logger.warn(`⚠️ Total elapsed time: ${Math.floor(elapsedTime / 1000 / 60)} minutes`);
|
||||
break;
|
||||
}
|
||||
|
||||
try {
|
||||
const results = await crawler.crawl(browser);
|
||||
this.logger.log(`Extracted ${results.length} items from ${crawler.name}`);
|
||||
|
||||
for (const item of results) {
|
||||
await this.bidsService.createOrUpdate({
|
||||
title,
|
||||
url: itemUrl,
|
||||
publishDate,
|
||||
source: type || 'Unknown',
|
||||
});
|
||||
for (const item of results) {
|
||||
await this.bidsService.createOrUpdate({
|
||||
title: item.title,
|
||||
url: item.url,
|
||||
publishDate: item.publishDate,
|
||||
source: crawler.name,
|
||||
unit: '',
|
||||
});
|
||||
}
|
||||
} catch (err) {
|
||||
this.logger.error(`Error crawling ${crawler.name}: ${err.message}`);
|
||||
}
|
||||
}
|
||||
|
||||
} catch (error) {
|
||||
this.logger.error(`Crawl task failed: ${error.message}`);
|
||||
} finally {
|
||||
await browser.close();
|
||||
this.logger.log('Crawl task finished.');
|
||||
|
||||
const totalTime = Date.now() - startTime;
|
||||
const minutes = Math.floor(totalTime / 1000 / 60);
|
||||
this.logger.log(`Crawl task finished. Total time: ${minutes} minutes`);
|
||||
|
||||
if (totalTime > maxExecutionTime) {
|
||||
this.logger.warn(`⚠️ Crawl task exceeded maximum execution time of 1 hour.`);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
51
src/crawler/services/cdt_target.spec.ts
Normal file
51
src/crawler/services/cdt_target.spec.ts
Normal file
@@ -0,0 +1,51 @@
|
||||
import { CdtCrawler } from './cdt_target';
|
||||
import * as puppeteer from 'puppeteer';
|
||||
|
||||
// Increase timeout to 60 seconds for network operations
|
||||
jest.setTimeout(60000*5);
|
||||
|
||||
describe('CdtCrawler Real Site Test', () => {
|
||||
let browser: puppeteer.Browser;
|
||||
|
||||
beforeAll(async () => {
|
||||
browser = await puppeteer.launch({
|
||||
headless: false, // Change to false to see browser UI
|
||||
args: ['--no-sandbox', '--disable-setuid-sandbox'],
|
||||
});
|
||||
});
|
||||
|
||||
afterAll(async () => {
|
||||
if (browser) {
|
||||
await browser.close();
|
||||
}
|
||||
});
|
||||
|
||||
it('should visit website and list all found bid information', async () => {
|
||||
console.log(`\nStarting crawl for: ${CdtCrawler.name}`);
|
||||
console.log(`Target URL: ${CdtCrawler.url}`);
|
||||
|
||||
const results = await CdtCrawler.crawl(browser);
|
||||
|
||||
console.log(`\nSuccessfully found ${results.length} items:\n`);
|
||||
console.log('----------------------------------------');
|
||||
results.forEach((item, index) => {
|
||||
console.log(`${index + 1}. [${item.publishDate.toLocaleDateString()}] ${item.title}`);
|
||||
console.log(` Link: ${item.url}`);
|
||||
console.log('----------------------------------------');
|
||||
});
|
||||
|
||||
// Basic assertions to ensure crawler is working
|
||||
expect(results).toBeDefined();
|
||||
expect(Array.isArray(results)).toBeTruthy();
|
||||
// Warn but don't fail if site returns 0 items (could be empty or changed structure)
|
||||
if (results.length === 0) {
|
||||
console.warn('Warning: No items found. Check if website structure has changed or if list is currently empty.');
|
||||
} else {
|
||||
// Check data integrity of first item
|
||||
const firstItem = results[0];
|
||||
expect(firstItem.title).toBeTruthy();
|
||||
expect(firstItem.url).toMatch(/^https?:\/\//);
|
||||
expect(firstItem.publishDate).toBeInstanceOf(Date);
|
||||
}
|
||||
});
|
||||
});
|
||||
229
src/crawler/services/cdt_target.ts
Normal file
229
src/crawler/services/cdt_target.ts
Normal file
@@ -0,0 +1,229 @@
|
||||
import * as puppeteer from 'puppeteer';
|
||||
import { Logger } from '@nestjs/common';
|
||||
|
||||
// 模拟人类鼠标移动
|
||||
async function simulateHumanMouseMovement(page: puppeteer.Page) {
|
||||
const viewport = page.viewport();
|
||||
if (!viewport) return;
|
||||
|
||||
const movements = 5 + Math.floor(Math.random() * 5); // 5-10次随机移动
|
||||
|
||||
for (let i = 0; i < movements; i++) {
|
||||
const x = Math.floor(Math.random() * viewport.width);
|
||||
const y = Math.floor(Math.random() * viewport.height);
|
||||
|
||||
await page.mouse.move(x, y, {
|
||||
steps: 10 + Math.floor(Math.random() * 20) // 10-30步,使移动更平滑
|
||||
});
|
||||
|
||||
// 随机停顿 100-500ms
|
||||
await new Promise(r => setTimeout(r, 100 + Math.random() * 400));
|
||||
}
|
||||
}
|
||||
|
||||
// 模拟人类滚动
|
||||
async function simulateHumanScrolling(page: puppeteer.Page) {
|
||||
const scrollCount = 3 + Math.floor(Math.random() * 5); // 3-7次滚动
|
||||
|
||||
for (let i = 0; i < scrollCount; i++) {
|
||||
const scrollDistance = 100 + Math.floor(Math.random() * 400); // 100-500px
|
||||
|
||||
await page.evaluate((distance) => {
|
||||
window.scrollBy({
|
||||
top: distance,
|
||||
behavior: 'smooth'
|
||||
});
|
||||
}, scrollDistance);
|
||||
|
||||
// 随机停顿 500-1500ms
|
||||
await new Promise(r => setTimeout(r, 500 + Math.random() * 1000));
|
||||
}
|
||||
|
||||
// 滚动回顶部
|
||||
await page.evaluate(() => {
|
||||
window.scrollTo({ top: 0, behavior: 'smooth' });
|
||||
});
|
||||
await new Promise(r => setTimeout(r, 1000));
|
||||
}
|
||||
|
||||
export interface CdtResult {
|
||||
title: string;
|
||||
publishDate: Date;
|
||||
url: string;
|
||||
}
|
||||
|
||||
export const CdtCrawler = {
|
||||
name: '中国大唐集团电子商务平台',
|
||||
url: 'https://tang.cdt-ec.com/home/index.html',
|
||||
baseUrl: 'https://tang.cdt-ec.com',
|
||||
|
||||
async crawl(browser: puppeteer.Browser): Promise<CdtResult[]> {
|
||||
const logger = new Logger('CdtCrawler');
|
||||
const page = await browser.newPage();
|
||||
|
||||
const username = process.env.PROXY_USERNAME;
|
||||
const password = process.env.PROXY_PASSWORD;
|
||||
if (username && password) {
|
||||
await page.authenticate({ username, password });
|
||||
}
|
||||
|
||||
await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36');
|
||||
|
||||
const allResults: CdtResult[] = [];
|
||||
let currentPage = 1;
|
||||
const maxPages = 5;
|
||||
|
||||
try {
|
||||
logger.log(`Navigating to ${this.url}...`);
|
||||
await page.goto(this.url, { waitUntil: 'networkidle2', timeout: 60000 });
|
||||
|
||||
// 模拟人类行为
|
||||
logger.log('Simulating human mouse movements...');
|
||||
await simulateHumanMouseMovement(page);
|
||||
|
||||
logger.log('Simulating human scrolling...');
|
||||
await simulateHumanScrolling(page);
|
||||
|
||||
// 点击"招标公告"标签
|
||||
logger.log('Looking for "招标公告" tab...');
|
||||
await page.waitForFunction(() => {
|
||||
const tabs = Array.from(document.querySelectorAll('span.notice-tab'));
|
||||
return tabs.some(tab => tab.textContent && tab.textContent.includes('招标公告'));
|
||||
}, { timeout: 30000 });
|
||||
|
||||
await page.evaluate(() => {
|
||||
const tabs = Array.from(document.querySelectorAll('span.notice-tab'));
|
||||
const target = tabs.find(tab => tab.textContent && tab.textContent.includes('招标公告')) as HTMLElement;
|
||||
if (target) target.click();
|
||||
});
|
||||
|
||||
logger.log('Clicked "招标公告" tab.');
|
||||
await new Promise(r => setTimeout(r, 2000));
|
||||
|
||||
// 模拟人类行为
|
||||
logger.log('Simulating human mouse movements...');
|
||||
await simulateHumanMouseMovement(page);
|
||||
|
||||
logger.log('Simulating human scrolling...');
|
||||
await simulateHumanScrolling(page);
|
||||
|
||||
// 点击"招标公告"下的"更多+"链接
|
||||
logger.log('Looking for "更多+" link under "招标公告"...');
|
||||
await page.waitForFunction(() => {
|
||||
const titles = Array.from(document.querySelectorAll('span.h-notice-title'));
|
||||
return titles.some(title => title.textContent && title.textContent.includes('招标公告'));
|
||||
}, { timeout: 30000 });
|
||||
|
||||
await page.evaluate(() => {
|
||||
const titles = Array.from(document.querySelectorAll('span.h-notice-title'));
|
||||
const targetTitle = titles.find(title => title.textContent && title.textContent.includes('招标公告'));
|
||||
if (targetTitle) {
|
||||
const parent = targetTitle.parentElement;
|
||||
if (parent) {
|
||||
const moreLink = parent.querySelector('a.h-notice-more') as HTMLElement;
|
||||
if (moreLink) moreLink.click();
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
logger.log('Clicked "更多+" link under "招标公告".');
|
||||
await page.waitForNavigation({ waitUntil: 'networkidle2', timeout: 60000 }).catch(() => {});
|
||||
await new Promise(r => setTimeout(r, 3000));
|
||||
|
||||
// 模拟人类行为
|
||||
logger.log('Simulating human mouse movements...');
|
||||
await simulateHumanMouseMovement(page);
|
||||
|
||||
logger.log('Simulating human scrolling...');
|
||||
await simulateHumanScrolling(page);
|
||||
|
||||
while (currentPage <= maxPages) {
|
||||
const content = await page.content();
|
||||
const pageResults = this.extract(content);
|
||||
if (pageResults.length === 0) {
|
||||
logger.warn(`No results found on page ${currentPage}, stopping.`);
|
||||
break;
|
||||
}
|
||||
|
||||
allResults.push(...pageResults);
|
||||
logger.log(`Extracted ${pageResults.length} items from page ${currentPage}`);
|
||||
|
||||
// 模拟人类行为 - 翻页前
|
||||
logger.log('Simulating human mouse movements before pagination...');
|
||||
await simulateHumanMouseMovement(page);
|
||||
|
||||
logger.log('Simulating human scrolling before pagination...');
|
||||
await simulateHumanScrolling(page);
|
||||
|
||||
// Find the "Next Page" button - layui pagination
|
||||
const nextButtonSelector = 'a.layui-laypage-next:not(.layui-disabled)';
|
||||
const nextButton = await page.$(nextButtonSelector);
|
||||
|
||||
if (!nextButton) {
|
||||
logger.log('Next page button not found. Reached end of list.');
|
||||
break;
|
||||
}
|
||||
|
||||
logger.log(`Navigating to page ${currentPage + 1}...`);
|
||||
|
||||
try {
|
||||
await Promise.all([
|
||||
page.waitForNavigation({ waitUntil: 'networkidle2', timeout: 60000 }),
|
||||
nextButton.click(),
|
||||
]);
|
||||
} catch (navError) {
|
||||
logger.error(`Navigation to page ${currentPage + 1} failed: ${navError.message}`);
|
||||
break;
|
||||
}
|
||||
|
||||
currentPage++;
|
||||
|
||||
// 模拟人类行为 - 翻页后
|
||||
logger.log('Simulating human mouse movements after pagination...');
|
||||
await simulateHumanMouseMovement(page);
|
||||
|
||||
logger.log('Simulating human scrolling after pagination...');
|
||||
await simulateHumanScrolling(page);
|
||||
|
||||
// Random delay between pages
|
||||
const delay = Math.floor(Math.random() * (3000 - 1000 + 1)) + 1000;
|
||||
await new Promise(resolve => setTimeout(resolve, delay));
|
||||
}
|
||||
|
||||
return allResults;
|
||||
|
||||
} catch (error) {
|
||||
logger.error(`Failed to crawl ${this.name}: ${error.message}`);
|
||||
return allResults;
|
||||
} finally {
|
||||
await page.close();
|
||||
}
|
||||
},
|
||||
|
||||
extract(html: string): CdtResult[] {
|
||||
const results: CdtResult[] = [];
|
||||
/**
|
||||
* Regex groups for tang.cdt-ec.com:
|
||||
* 1: URL
|
||||
* 2: Title (项目名称)
|
||||
* 3: Date (发布时间)
|
||||
*/
|
||||
const regex = /<tr[^>]*data-index="[^"]*"[^>]*>[\s\S]*?<a[^>]*class="layui-table-link"[^>]*href="([^"]*)"[^>]*>([^<]*)<\/a>[\s\S]*?<td[^>]*data-field="publish_time"[^>]*>[\s\S]*?<div[^>]*class="layui-table-cell[^"]*"[^>]*>([^<]*)<\/div>[\s\S]*?<\/td>[\s\S]*?<\/tr>/gs;
|
||||
|
||||
let match;
|
||||
while ((match = regex.exec(html)) !== null) {
|
||||
const url = match[1]?.trim();
|
||||
const title = match[2]?.trim();
|
||||
const dateStr = match[3]?.trim();
|
||||
|
||||
if (title && url) {
|
||||
results.push({
|
||||
title,
|
||||
publishDate: dateStr ? new Date(dateStr) : new Date(),
|
||||
url: url.startsWith('http') ? url : this.baseUrl + url
|
||||
});
|
||||
}
|
||||
}
|
||||
return results;
|
||||
}
|
||||
};
|
||||
61
src/crawler/services/ceic_target.spec.ts
Normal file
61
src/crawler/services/ceic_target.spec.ts
Normal file
@@ -0,0 +1,61 @@
|
||||
import { CeicCrawler } from './ceic_target';
|
||||
import * as puppeteer from 'puppeteer';
|
||||
|
||||
// Increase timeout to 120 seconds for manual inspection and slow sites
|
||||
jest.setTimeout(120000);
|
||||
|
||||
describe('CeicCrawler Real Site Test', () => {
|
||||
let browser: puppeteer.Browser;
|
||||
|
||||
beforeAll(async () => {
|
||||
browser = await puppeteer.launch({
|
||||
headless: false, // Run in non-headless mode
|
||||
args: [
|
||||
'--no-sandbox',
|
||||
'--disable-setuid-sandbox',
|
||||
'--disable-blink-features=AutomationControlled',
|
||||
'--window-size=1920,1080',
|
||||
'--disable-infobars',
|
||||
],
|
||||
defaultViewport: null
|
||||
});
|
||||
});
|
||||
|
||||
afterAll(async () => {
|
||||
if (browser) {
|
||||
// Keep open for a few seconds after test to see result
|
||||
await new Promise(r => setTimeout(r, 50000));
|
||||
await browser.close();
|
||||
}
|
||||
});
|
||||
|
||||
it('should visit website and list all found bid information', async () => {
|
||||
console.log(`
|
||||
Starting crawl for: ${CeicCrawler.name}`);
|
||||
console.log(`Target URL: ${CeicCrawler.url}`);
|
||||
|
||||
const results = await CeicCrawler.crawl(browser);
|
||||
|
||||
console.log(`
|
||||
Successfully found ${results.length} items:
|
||||
`);
|
||||
console.log('----------------------------------------');
|
||||
results.forEach((item, index) => {
|
||||
console.log(`${index + 1}. [${item.publishDate.toLocaleDateString()}] ${item.title}`);
|
||||
console.log(` Link: ${item.url}`);
|
||||
console.log('----------------------------------------');
|
||||
});
|
||||
|
||||
expect(results).toBeDefined();
|
||||
expect(Array.isArray(results)).toBeTruthy();
|
||||
|
||||
if (results.length === 0) {
|
||||
console.warn('Warning: No items found. Observe browser window to see if content is loading or if there is a verification challenge.');
|
||||
} else {
|
||||
const firstItem = results[0];
|
||||
expect(firstItem.title).toBeTruthy();
|
||||
expect(firstItem.url).toMatch(/^https?:\/\//);
|
||||
expect(firstItem.publishDate).toBeInstanceOf(Date);
|
||||
}
|
||||
});
|
||||
});
|
||||
168
src/crawler/services/ceic_target.ts
Normal file
168
src/crawler/services/ceic_target.ts
Normal file
@@ -0,0 +1,168 @@
|
||||
import * as puppeteer from 'puppeteer';
|
||||
import { Logger } from '@nestjs/common';
|
||||
import { ChdtpResult } from './chdtp_target';
|
||||
|
||||
// 模拟人类鼠标移动
|
||||
async function simulateHumanMouseMovement(page: puppeteer.Page) {
|
||||
const viewport = page.viewport();
|
||||
if (!viewport) return;
|
||||
|
||||
const movements = 5 + Math.floor(Math.random() * 5); // 5-10次随机移动
|
||||
|
||||
for (let i = 0; i < movements; i++) {
|
||||
const x = Math.floor(Math.random() * viewport.width);
|
||||
const y = Math.floor(Math.random() * viewport.height);
|
||||
|
||||
await page.mouse.move(x, y, {
|
||||
steps: 10 + Math.floor(Math.random() * 20) // 10-30步,使移动更平滑
|
||||
});
|
||||
|
||||
// 随机停顿 100-500ms
|
||||
await new Promise(r => setTimeout(r, 100 + Math.random() * 400));
|
||||
}
|
||||
}
|
||||
|
||||
// 模拟人类滚动
|
||||
async function simulateHumanScrolling(page: puppeteer.Page) {
|
||||
const scrollCount = 3 + Math.floor(Math.random() * 5); // 3-7次滚动
|
||||
|
||||
for (let i = 0; i < scrollCount; i++) {
|
||||
const scrollDistance = 100 + Math.floor(Math.random() * 400); // 100-500px
|
||||
|
||||
await page.evaluate((distance) => {
|
||||
window.scrollBy({
|
||||
top: distance,
|
||||
behavior: 'smooth'
|
||||
});
|
||||
}, scrollDistance);
|
||||
|
||||
// 随机停顿 500-1500ms
|
||||
await new Promise(r => setTimeout(r, 500 + Math.random() * 1000));
|
||||
}
|
||||
|
||||
// 滚动回顶部
|
||||
await page.evaluate(() => {
|
||||
window.scrollTo({ top: 0, behavior: 'smooth' });
|
||||
});
|
||||
await new Promise(r => setTimeout(r, 1000));
|
||||
}
|
||||
|
||||
export const CeicCrawler = {
|
||||
name: '大连能源采购平台',
|
||||
url: 'https://ceic.dlnyzb.com/3001',
|
||||
baseUrl: 'https://ceic.dlnyzb.com',
|
||||
|
||||
async crawl(browser: puppeteer.Browser): Promise<ChdtpResult[]> {
|
||||
const logger = new Logger('CeicCrawler');
|
||||
const page = await browser.newPage();
|
||||
|
||||
const username = process.env.PROXY_USERNAME;
|
||||
const password = process.env.PROXY_PASSWORD;
|
||||
if (username && password) {
|
||||
await page.authenticate({ username, password });
|
||||
}
|
||||
|
||||
await page.evaluateOnNewDocument(() => {
|
||||
Object.defineProperty(navigator, 'webdriver', { get: () => false });
|
||||
Object.defineProperty(navigator, 'language', { get: () => 'zh-CN' });
|
||||
Object.defineProperty(navigator, 'plugins', { get: () => [1, 2, 3, 4, 5] });
|
||||
});
|
||||
|
||||
await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36');
|
||||
await page.setViewport({ width: 1920, height: 1080 });
|
||||
|
||||
const allResults: ChdtpResult[] = [];
|
||||
let currentPage = 1;
|
||||
const maxPages = 5;
|
||||
|
||||
try {
|
||||
logger.log(`Navigating to ${this.url}...`);
|
||||
await page.goto(this.url, { waitUntil: 'networkidle2', timeout: 60000 });
|
||||
|
||||
// 模拟人类行为
|
||||
logger.log('Simulating human mouse movements...');
|
||||
await simulateHumanMouseMovement(page);
|
||||
|
||||
logger.log('Simulating human scrolling...');
|
||||
await simulateHumanScrolling(page);
|
||||
|
||||
while (currentPage <= maxPages) {
|
||||
logger.log(`Processing page ${currentPage}...`);
|
||||
|
||||
// Wait for content to load - MUI list items
|
||||
await page.waitForFunction(() => {
|
||||
return document.querySelectorAll('li.MuiListItem-root').length > 0;
|
||||
}, { timeout: 60000 }).catch(() => logger.warn('Content not found. Site might be slow.'));
|
||||
|
||||
const pageResults = await page.evaluate(() => {
|
||||
const results: { title: string; dateStr: string; url: string }[] = [];
|
||||
|
||||
// Extract from MUI list items
|
||||
const listItems = Array.from(document.querySelectorAll('li.MuiListItem-root'));
|
||||
listItems.forEach(item => {
|
||||
// Find the title link
|
||||
const titleLink = item.querySelector('a.css-1vdw90h');
|
||||
const title = titleLink?.textContent?.trim() || '';
|
||||
const href = titleLink?.getAttribute('href') || '';
|
||||
|
||||
// Find the publish date - look for text containing "发布时间:"
|
||||
const paragraphs = Array.from(item.querySelectorAll('p'));
|
||||
let dateStr = '';
|
||||
for (const p of paragraphs) {
|
||||
const text = p.textContent || '';
|
||||
if (text.includes('发布时间:')) {
|
||||
dateStr = text.replace('发布时间:', '').trim();
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (title.length >= 5 && href) {
|
||||
results.push({ title, dateStr, url: href });
|
||||
}
|
||||
});
|
||||
|
||||
return results;
|
||||
});
|
||||
|
||||
if (pageResults.length === 0) {
|
||||
logger.warn(`No results found on page ${currentPage}. Extraction failed.`);
|
||||
break;
|
||||
}
|
||||
|
||||
allResults.push(...pageResults.map(r => ({
|
||||
title: r.title,
|
||||
publishDate: r.dateStr ? new Date(r.dateStr) : new Date(),
|
||||
url: r.url
|
||||
})));
|
||||
|
||||
logger.log(`Extracted ${pageResults.length} items.`);
|
||||
|
||||
// Pagination: look for next page button in MUI pagination
|
||||
const nextButton = await page.$('a[aria-label="Go to next page"]');
|
||||
if (!nextButton) break;
|
||||
|
||||
await nextButton.click();
|
||||
await new Promise(r => setTimeout(r, 3000));
|
||||
|
||||
// 模拟人类行为
|
||||
logger.log('Simulating human mouse movements...');
|
||||
await simulateHumanMouseMovement(page);
|
||||
|
||||
logger.log('Simulating human scrolling...');
|
||||
await simulateHumanScrolling(page);
|
||||
|
||||
currentPage++;
|
||||
}
|
||||
|
||||
return allResults;
|
||||
|
||||
} catch (error) {
|
||||
logger.error(`Crawl failed: ${error.message}`);
|
||||
return allResults;
|
||||
} finally {
|
||||
if (page) await page.close();
|
||||
}
|
||||
},
|
||||
|
||||
extract() { return []; }
|
||||
};
|
||||
51
src/crawler/services/cgnpc_target.spec.ts
Normal file
51
src/crawler/services/cgnpc_target.spec.ts
Normal file
@@ -0,0 +1,51 @@
|
||||
import { CgnpcCrawler } from './cgnpc_target';
|
||||
import * as puppeteer from 'puppeteer';
|
||||
|
||||
// Increase timeout to 60 seconds for network operations
|
||||
jest.setTimeout(60000*5);
|
||||
|
||||
describe('CgnpcCrawler Real Site Test', () => {
|
||||
let browser: puppeteer.Browser;
|
||||
|
||||
beforeAll(async () => {
|
||||
browser = await puppeteer.launch({
|
||||
headless: false, // Change to false to see browser UI
|
||||
args: ['--no-sandbox', '--disable-setuid-sandbox'],
|
||||
});
|
||||
});
|
||||
|
||||
afterAll(async () => {
|
||||
if (browser) {
|
||||
await browser.close();
|
||||
}
|
||||
});
|
||||
|
||||
it('should visit website and list all found bid information', async () => {
|
||||
console.log(`\nStarting crawl for: ${CgnpcCrawler.name}`);
|
||||
console.log(`Target URL: ${CgnpcCrawler.url}`);
|
||||
|
||||
const results = await CgnpcCrawler.crawl(browser);
|
||||
|
||||
console.log(`\nSuccessfully found ${results.length} items:\n`);
|
||||
console.log('----------------------------------------');
|
||||
results.forEach((item, index) => {
|
||||
console.log(`${index + 1}. [${item.publishDate.toLocaleDateString()}] ${item.title}`);
|
||||
console.log(` Link: ${item.url}`);
|
||||
console.log('----------------------------------------');
|
||||
});
|
||||
|
||||
// Basic assertions to ensure crawler is working
|
||||
expect(results).toBeDefined();
|
||||
expect(Array.isArray(results)).toBeTruthy();
|
||||
// Warn but don't fail if site returns 0 items (could be empty or changed structure)
|
||||
if (results.length === 0) {
|
||||
console.warn('Warning: No items found. Check if website structure has changed or if list is currently empty.');
|
||||
} else {
|
||||
// Check data integrity of first item
|
||||
const firstItem = results[0];
|
||||
expect(firstItem.title).toBeTruthy();
|
||||
expect(firstItem.url).toMatch(/^https?:\/\//);
|
||||
expect(firstItem.publishDate).toBeInstanceOf(Date);
|
||||
}
|
||||
});
|
||||
});
|
||||
203
src/crawler/services/cgnpc_target.ts
Normal file
203
src/crawler/services/cgnpc_target.ts
Normal file
@@ -0,0 +1,203 @@
|
||||
import * as puppeteer from 'puppeteer';
|
||||
import { Logger } from '@nestjs/common';
|
||||
|
||||
// 模拟人类鼠标移动
|
||||
async function simulateHumanMouseMovement(page: puppeteer.Page) {
|
||||
const viewport = page.viewport();
|
||||
if (!viewport) return;
|
||||
|
||||
const movements = 5 + Math.floor(Math.random() * 5); // 5-10次随机移动
|
||||
|
||||
for (let i = 0; i < movements; i++) {
|
||||
const x = Math.floor(Math.random() * viewport.width);
|
||||
const y = Math.floor(Math.random() * viewport.height);
|
||||
|
||||
await page.mouse.move(x, y, {
|
||||
steps: 10 + Math.floor(Math.random() * 20) // 10-30步,使移动更平滑
|
||||
});
|
||||
|
||||
// 随机停顿 100-500ms
|
||||
await new Promise(r => setTimeout(r, 100 + Math.random() * 400));
|
||||
}
|
||||
}
|
||||
|
||||
// 模拟人类滚动
|
||||
async function simulateHumanScrolling(page: puppeteer.Page) {
|
||||
const scrollCount = 3 + Math.floor(Math.random() * 5); // 3-7次滚动
|
||||
|
||||
for (let i = 0; i < scrollCount; i++) {
|
||||
const scrollDistance = 100 + Math.floor(Math.random() * 400); // 100-500px
|
||||
|
||||
await page.evaluate((distance) => {
|
||||
window.scrollBy({
|
||||
top: distance,
|
||||
behavior: 'smooth'
|
||||
});
|
||||
}, scrollDistance);
|
||||
|
||||
// 随机停顿 500-1500ms
|
||||
await new Promise(r => setTimeout(r, 500 + Math.random() * 1000));
|
||||
}
|
||||
|
||||
// 滚动回顶部
|
||||
await page.evaluate(() => {
|
||||
window.scrollTo({ top: 0, behavior: 'smooth' });
|
||||
});
|
||||
await new Promise(r => setTimeout(r, 1000));
|
||||
}
|
||||
|
||||
export interface CgnpcResult {
|
||||
title: string;
|
||||
publishDate: Date;
|
||||
url: string;
|
||||
}
|
||||
|
||||
export const CgnpcCrawler = {
|
||||
name: '中广核电子商务平台',
|
||||
url: 'https://ecp.cgnpc.com.cn/zbgg.html',
|
||||
baseUrl: 'https://ecp.cgnpc.com.cn',
|
||||
|
||||
async crawl(browser: puppeteer.Browser): Promise<CgnpcResult[]> {
|
||||
const logger = new Logger('CgnpcCrawler');
|
||||
const page = await browser.newPage();
|
||||
|
||||
const username = process.env.PROXY_USERNAME;
|
||||
const password = process.env.PROXY_PASSWORD;
|
||||
if (username && password) {
|
||||
await page.authenticate({ username, password });
|
||||
}
|
||||
|
||||
await page.evaluateOnNewDocument(() => {
|
||||
Object.defineProperty(navigator, 'webdriver', { get: () => false });
|
||||
Object.defineProperty(navigator, 'language', { get: () => "zh-CN"});
|
||||
Object.defineProperty(navigator, 'plugins', { get: () => [1,2,3,4,5]});
|
||||
});
|
||||
|
||||
await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36');
|
||||
await page.setViewport({ width: 1920, height: 1080 });
|
||||
|
||||
const allResults: CgnpcResult[] = [];
|
||||
let currentPage = 1;
|
||||
const maxPages = 5;
|
||||
|
||||
try {
|
||||
logger.log(`Navigating to ${this.url}...`);
|
||||
await page.goto(this.url, { waitUntil: 'networkidle2', timeout: 60000 });
|
||||
|
||||
// 模拟人类行为
|
||||
logger.log('Simulating human mouse movements...');
|
||||
await simulateHumanMouseMovement(page);
|
||||
|
||||
logger.log('Simulating human scrolling...');
|
||||
await simulateHumanScrolling(page);
|
||||
|
||||
while (currentPage <= maxPages) {
|
||||
logger.log(`Processing page ${currentPage}...`);
|
||||
|
||||
const content = await page.content();
|
||||
const pageResults = this.extract(content);
|
||||
|
||||
if (pageResults.length === 0) {
|
||||
logger.warn(`No results found on page ${currentPage}, stopping.`);
|
||||
break;
|
||||
}
|
||||
|
||||
allResults.push(...pageResults);
|
||||
logger.log(`Extracted ${pageResults.length} items from page ${currentPage}`);
|
||||
|
||||
// 模拟人类行为 - 翻页前
|
||||
logger.log('Simulating human mouse movements before pagination...');
|
||||
await simulateHumanMouseMovement(page);
|
||||
|
||||
logger.log('Simulating human scrolling before pagination...');
|
||||
await simulateHumanScrolling(page);
|
||||
|
||||
// 查找下一页按钮 - 中广核网站分页结构
|
||||
// 分页结构: <button type="button" class="btn-prev" onclick="setPageIndex(2)"><i class="fa fa-angle-right"></i></button>
|
||||
const nextButtonSelector = 'button.btn-prev:not([disabled])';
|
||||
const nextButton = await page.$(nextButtonSelector);
|
||||
|
||||
if (!nextButton) {
|
||||
logger.log('Next page button not found. Reached end of list.');
|
||||
break;
|
||||
}
|
||||
|
||||
logger.log(`Navigating to page ${currentPage + 1}...`);
|
||||
|
||||
try {
|
||||
// 点击下一页按钮
|
||||
await nextButton.click();
|
||||
await new Promise(r => setTimeout(r, 3000)); // 等待页面加载
|
||||
} catch (navError) {
|
||||
logger.error(`Navigation to page ${currentPage + 1} failed: ${navError.message}`);
|
||||
break;
|
||||
}
|
||||
|
||||
currentPage++;
|
||||
|
||||
// 模拟人类行为 - 翻页后
|
||||
logger.log('Simulating human mouse movements after pagination...');
|
||||
await simulateHumanMouseMovement(page);
|
||||
|
||||
logger.log('Simulating human scrolling after pagination...');
|
||||
await simulateHumanScrolling(page);
|
||||
|
||||
// Random delay between pages
|
||||
const delay = Math.floor(Math.random() * (3000 - 1000 + 1)) + 1000;
|
||||
await new Promise(resolve => setTimeout(resolve, delay));
|
||||
}
|
||||
|
||||
return allResults;
|
||||
|
||||
} catch (error) {
|
||||
logger.error(`Failed to crawl ${this.name}: ${error.message}`);
|
||||
return allResults;
|
||||
} finally {
|
||||
await page.close();
|
||||
}
|
||||
},
|
||||
|
||||
extract(html: string): CgnpcResult[] {
|
||||
const results: CgnpcResult[] = [];
|
||||
/**
|
||||
* Regex groups for ecp.cgnpc.com.cn:
|
||||
* 1: URL (href属性)
|
||||
* 2: Title (title属性)
|
||||
* 3: Date (发布时间,格式:2026-01-23 17:00)
|
||||
*
|
||||
* HTML结构示例:
|
||||
* <div class="zbnr">
|
||||
* <div class="zbnr_left" style="width: calc(100% - 290px);">
|
||||
* <a title="中广核新能源新疆公司2026年-2028年各场站线路运维检修服务框架协议"
|
||||
* href="https://ecp.cgnpc.com.cn/Details.html?dataId=xxx&detailId=xxx" target="_blank">
|
||||
* <h2><i>中广核新能源新疆公司2026年-2028年各场站线路运维检修服务框架协议</i></h2>
|
||||
* </a>
|
||||
* </div>
|
||||
* <div class="zbnr_right" style="width: 270px;">
|
||||
* <dl>
|
||||
* <dt><p>文件获取截止时间</p><h2>2026-01-23 17:00</h2></dt>
|
||||
* <dt><p>投标截止时间</p><h2>2026-01-30 09:00</h2></dt>
|
||||
* </dl>
|
||||
* </div>
|
||||
* </div>
|
||||
*/
|
||||
const regex = /<div class="zbnr">[\s\S]*?<a[^>]*title="([^"]*)"[^>]*href="([^"]*)"[^>]*>[\s\S]*?<dt>[\s\S]*?<p>文件获取截止时间<\/p>[\s\S]*?<h2>\s*(\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2})\s*<\/h2>[\s\S]*?<\/div>/gs;
|
||||
|
||||
let match;
|
||||
while ((match = regex.exec(html)) !== null) {
|
||||
const title = match[1]?.trim();
|
||||
const url = match[2]?.trim();
|
||||
const dateStr = match[3]?.trim();
|
||||
|
||||
if (title && url) {
|
||||
results.push({
|
||||
title,
|
||||
publishDate: dateStr ? new Date(dateStr) : new Date(),
|
||||
url: url.startsWith('http') ? url : this.baseUrl + url
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
return results;
|
||||
}
|
||||
};
|
||||
@@ -15,6 +15,13 @@ export const ChdtpCrawler = {
|
||||
async crawl(browser: puppeteer.Browser): Promise<ChdtpResult[]> {
|
||||
const logger = new Logger('ChdtpCrawler');
|
||||
const page = await browser.newPage();
|
||||
|
||||
const username = process.env.PROXY_USERNAME;
|
||||
const password = process.env.PROXY_PASSWORD;
|
||||
if (username && password) {
|
||||
await page.authenticate({ username, password });
|
||||
}
|
||||
|
||||
await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36');
|
||||
|
||||
const allResults: ChdtpResult[] = [];
|
||||
|
||||
130
src/crawler/services/chng_target.spec.ts
Normal file
130
src/crawler/services/chng_target.spec.ts
Normal file
@@ -0,0 +1,130 @@
|
||||
import { ChngCrawler } from './chng_target';
|
||||
import * as puppeteer from 'puppeteer';
|
||||
|
||||
// Increase timeout to 120 seconds for manual inspection and slow sites
|
||||
jest.setTimeout(120000);
|
||||
|
||||
// 模拟人类鼠标移动
|
||||
async function simulateHumanMouseMovement(page: puppeteer.Page) {
|
||||
const viewport = page.viewport();
|
||||
if (!viewport) return;
|
||||
|
||||
const movements = 5 + Math.floor(Math.random() * 5); // 5-10次随机移动
|
||||
|
||||
for (let i = 0; i < movements; i++) {
|
||||
const x = Math.floor(Math.random() * viewport.width);
|
||||
const y = Math.floor(Math.random() * viewport.height);
|
||||
|
||||
await page.mouse.move(x, y, {
|
||||
steps: 10 + Math.floor(Math.random() * 20) // 10-30步,使移动更平滑
|
||||
});
|
||||
|
||||
// 随机停顿 100-500ms
|
||||
await new Promise(r => setTimeout(r, 100 + Math.random() * 400));
|
||||
}
|
||||
}
|
||||
|
||||
// 模拟人类滚动
|
||||
async function simulateHumanScrolling(page: puppeteer.Page) {
|
||||
const scrollCount = 3 + Math.floor(Math.random() * 5); // 3-7次滚动
|
||||
|
||||
for (let i = 0; i < scrollCount; i++) {
|
||||
const scrollDistance = 100 + Math.floor(Math.random() * 400); // 100-500px
|
||||
|
||||
await page.evaluate((distance) => {
|
||||
window.scrollBy({
|
||||
top: distance,
|
||||
behavior: 'smooth'
|
||||
});
|
||||
}, scrollDistance);
|
||||
|
||||
// 随机停顿 500-1500ms
|
||||
await new Promise(r => setTimeout(r, 500 + Math.random() * 1000));
|
||||
}
|
||||
|
||||
// 滚动回顶部
|
||||
await page.evaluate(() => {
|
||||
window.scrollTo({ top: 0, behavior: 'smooth' });
|
||||
});
|
||||
await new Promise(r => setTimeout(r, 1000));
|
||||
}
|
||||
|
||||
describe('ChngCrawler Real Site Test', () => {
|
||||
let browser: puppeteer.Browser;
|
||||
|
||||
beforeAll(async () => {
|
||||
browser = await puppeteer.launch({
|
||||
headless: false, // Run in non-headless mode
|
||||
args: [
|
||||
'--no-sandbox',
|
||||
'--disable-setuid-sandbox',
|
||||
'--disable-blink-features=AutomationControlled',
|
||||
'--window-size=1920,1080',
|
||||
"--disable-infobars",
|
||||
// "--headless=new",
|
||||
// '--disable-dev-shm-usage',
|
||||
// '--disable-accelerated-2d-canvas',
|
||||
// '--no-first-run',
|
||||
// '--no-zygote',
|
||||
// '--disable-gpu',
|
||||
// '--disable-features=VizDisplayCompositor',
|
||||
// '--disable-webgl',
|
||||
// '--disable-javascript',
|
||||
],
|
||||
defaultViewport: null
|
||||
|
||||
});
|
||||
});
|
||||
|
||||
afterAll(async () => {
|
||||
if (browser) {
|
||||
// Keep open for a few seconds after test to see result
|
||||
await new Promise(r => setTimeout(r, 50000));
|
||||
await browser.close();
|
||||
}
|
||||
});
|
||||
|
||||
it('should visit the website and list all found bid information', async () => {
|
||||
console.log(`
|
||||
Starting crawl for: ${ChngCrawler.name}`);
|
||||
console.log(`Target URL: ${ChngCrawler.url}`);
|
||||
|
||||
// 创建一个临时页面用于模拟人类行为
|
||||
const tempPage = await browser.newPage();
|
||||
await tempPage.setViewport({ width: 1920, height: 1080, deviceScaleFactor: 1 });
|
||||
|
||||
// 模拟人类鼠标移动
|
||||
console.log('Simulating human mouse movements...');
|
||||
await simulateHumanMouseMovement(tempPage);
|
||||
|
||||
// 模拟人类滚动
|
||||
console.log('Simulating human scrolling...');
|
||||
await simulateHumanScrolling(tempPage);
|
||||
|
||||
await tempPage.close();
|
||||
|
||||
const results = await ChngCrawler.crawl(browser);
|
||||
|
||||
console.log(`
|
||||
Successfully found ${results.length} items:
|
||||
`);
|
||||
console.log('----------------------------------------');
|
||||
results.forEach((item, index) => {
|
||||
console.log(`${index + 1}. [${item.publishDate.toLocaleDateString()}] ${item.title}`);
|
||||
console.log(` Link: ${item.url}`);
|
||||
console.log('----------------------------------------');
|
||||
});
|
||||
|
||||
expect(results).toBeDefined();
|
||||
expect(Array.isArray(results)).toBeTruthy();
|
||||
|
||||
if (results.length === 0) {
|
||||
console.warn('Warning: No items found. Observe the browser window to see if content is loading or if there is a verification challenge.');
|
||||
} else {
|
||||
const firstItem = results[0];
|
||||
expect(firstItem.title).toBeTruthy();
|
||||
expect(firstItem.url).toMatch(/^https?:\/\//);
|
||||
expect(firstItem.publishDate).toBeInstanceOf(Date);
|
||||
}
|
||||
});
|
||||
});
|
||||
234
src/crawler/services/chng_target.ts
Normal file
234
src/crawler/services/chng_target.ts
Normal file
@@ -0,0 +1,234 @@
|
||||
import * as puppeteer from 'puppeteer';
|
||||
import { Logger } from '@nestjs/common';
|
||||
import { ChdtpResult } from './chdtp_target';
|
||||
|
||||
// 模拟人类鼠标移动
|
||||
async function simulateHumanMouseMovement(page: puppeteer.Page) {
|
||||
const viewport = page.viewport();
|
||||
if (!viewport) return;
|
||||
|
||||
const movements = 5 + Math.floor(Math.random() * 5); // 5-10次随机移动
|
||||
|
||||
for (let i = 0; i < movements; i++) {
|
||||
const x = Math.floor(Math.random() * viewport.width);
|
||||
const y = Math.floor(Math.random() * viewport.height);
|
||||
|
||||
await page.mouse.move(x, y, {
|
||||
steps: 10 + Math.floor(Math.random() * 20) // 10-30步,使移动更平滑
|
||||
});
|
||||
|
||||
// 随机停顿 100-500ms
|
||||
await new Promise(r => setTimeout(r, 100 + Math.random() * 400));
|
||||
}
|
||||
}
|
||||
|
||||
// 模拟人类滚动
|
||||
async function simulateHumanScrolling(page: puppeteer.Page) {
|
||||
const scrollCount = 3 + Math.floor(Math.random() * 5); // 3-7次滚动
|
||||
|
||||
for (let i = 0; i < scrollCount; i++) {
|
||||
const scrollDistance = 100 + Math.floor(Math.random() * 400); // 100-500px
|
||||
|
||||
await page.evaluate((distance) => {
|
||||
window.scrollBy({
|
||||
top: distance,
|
||||
behavior: 'smooth'
|
||||
});
|
||||
}, scrollDistance);
|
||||
|
||||
// 随机停顿 500-1500ms
|
||||
await new Promise(r => setTimeout(r, 500 + Math.random() * 1000));
|
||||
}
|
||||
|
||||
// 滚动回顶部
|
||||
await page.evaluate(() => {
|
||||
window.scrollTo({ top: 0, behavior: 'smooth' });
|
||||
});
|
||||
await new Promise(r => setTimeout(r, 1000));
|
||||
}
|
||||
|
||||
export const ChngCrawler = {
|
||||
name: '华能集团电子商务平台',
|
||||
url: 'https://ec.chng.com.cn/ecmall/index.html#/purchase/home?top=0',
|
||||
baseUrl: 'https://ec.chng.com.cn/ecmall/index.html',
|
||||
|
||||
async crawl(browser: puppeteer.Browser): Promise<ChdtpResult[]> {
|
||||
const logger = new Logger('ChngCrawler');
|
||||
let page = await browser.newPage();
|
||||
// await page.setViewport({ width: 1920, height: 1080, deviceScaleFactor: 1 });
|
||||
// await page.setViewport({ deviceScaleFactor: 1 });
|
||||
const username = process.env.PROXY_USERNAME;
|
||||
const password = process.env.PROXY_PASSWORD;
|
||||
if (username && password) {
|
||||
await page.authenticate({ username, password });
|
||||
}
|
||||
|
||||
await page.evaluateOnNewDocument(() => {
|
||||
Object.defineProperty(navigator, 'webdriver', { get: () => false });
|
||||
Object.defineProperty(navigator, 'language', { get: () => "zh-CN"});
|
||||
Object.defineProperty(navigator, 'plugins', { get: () => [1,2,3,4,5]});
|
||||
});
|
||||
|
||||
await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36');
|
||||
await page.setViewport({ width: 1920, height: 1080 });
|
||||
|
||||
const allResults: ChdtpResult[] = [];
|
||||
let currentPage = 1;
|
||||
const maxPages = 5;
|
||||
|
||||
try {
|
||||
logger.log('Navigating to Bing...');
|
||||
await page.goto('https://cn.bing.com', { waitUntil: 'networkidle2' });
|
||||
|
||||
logger.log('Searching for target site...');
|
||||
const searchBoxSelector = 'input[name="q"]';
|
||||
await page.waitForSelector(searchBoxSelector);
|
||||
await page.type(searchBoxSelector, 'https://ec.chng.com.cn/');
|
||||
await page.keyboard.press('Enter');
|
||||
await page.waitForNavigation({ waitUntil: 'networkidle2' });
|
||||
|
||||
logger.log('Clicking search result...');
|
||||
await page.screenshot({ path: 'bing.png' });
|
||||
const firstResultSelector = '#b_results .b_algo h2 a';
|
||||
await page.waitForSelector(firstResultSelector);
|
||||
|
||||
const newTargetPromise = browser.waitForTarget(target => target.opener() === page.target());
|
||||
await page.click(firstResultSelector);
|
||||
|
||||
const newTarget = await newTargetPromise;
|
||||
const newPage = await newTarget.page();
|
||||
|
||||
if (newPage) {
|
||||
await newPage.screenshot({ path: 'newPage.png' });
|
||||
await page.close();
|
||||
page = newPage;
|
||||
if (username && password) {
|
||||
await page.authenticate({ username, password });
|
||||
}
|
||||
}
|
||||
// 模拟人类行为
|
||||
logger.log('Simulating human mouse movements...');
|
||||
await simulateHumanMouseMovement(page);
|
||||
|
||||
logger.log('Simulating human scrolling...');
|
||||
await simulateHumanScrolling(page);
|
||||
|
||||
await page.waitForNavigation({ waitUntil: 'domcontentloaded' }).catch(() => {});
|
||||
// 模拟人类行为
|
||||
logger.log('Simulating human mouse movements...');
|
||||
await simulateHumanMouseMovement(page);
|
||||
|
||||
logger.log('Simulating human scrolling...');
|
||||
await simulateHumanScrolling(page);
|
||||
|
||||
|
||||
// PAUSE 15 SECONDS as requested
|
||||
logger.log('Pausing 15 seconds before looking for "采购专栏"...');
|
||||
await new Promise(r => setTimeout(r, 15000));
|
||||
await page.screenshot({ path: 'huaneng.png' });
|
||||
|
||||
logger.log('Looking for "采购专栏" link...');
|
||||
await page.waitForFunction(() => {
|
||||
const divs = Array.from(document.querySelectorAll('div.text'));
|
||||
return divs.some(div => div.textContent && div.textContent.includes('采购专栏'));
|
||||
}, { timeout: 60000 });
|
||||
|
||||
const purchaseTargetPromise = browser.waitForTarget(target => target.opener() === page.target(), { timeout: 15000 }).catch(() => null);
|
||||
|
||||
await page.evaluate(() => {
|
||||
const divs = Array.from(document.querySelectorAll('div.text'));
|
||||
const target = divs.find(div => div.textContent && div.textContent.includes('采购专栏')) as HTMLElement;
|
||||
if (target) target.click();
|
||||
});
|
||||
|
||||
const purchaseTarget = await purchaseTargetPromise;
|
||||
if (purchaseTarget) {
|
||||
const pPage = await purchaseTarget.page();
|
||||
if (pPage) {
|
||||
logger.log('Switched to Purchase Page tab.');
|
||||
page = pPage;
|
||||
if (username && password) {
|
||||
await page.authenticate({ username, password });
|
||||
}
|
||||
await new Promise(r => setTimeout(r, 5000));
|
||||
}
|
||||
}
|
||||
|
||||
logger.log(`Active URL: ${page.url()}`);
|
||||
|
||||
// 模拟人类行为
|
||||
logger.log('Simulating human mouse movements...');
|
||||
await simulateHumanMouseMovement(page);
|
||||
|
||||
logger.log('Simulating human scrolling...');
|
||||
await simulateHumanScrolling(page);
|
||||
|
||||
while (currentPage <= maxPages) {
|
||||
logger.log(`Processing page ${currentPage}...`);
|
||||
|
||||
// Wait for table rows to load
|
||||
await page.waitForFunction(() => {
|
||||
return document.querySelectorAll('tr.ant-table-row').length > 0;
|
||||
}, { timeout: 60000 }).catch(() => logger.warn('Content not found. Site might be slow.'));
|
||||
|
||||
const pageResults = await page.evaluate((baseUrl) => {
|
||||
// Extract from table rows
|
||||
const items = Array.from(document.querySelectorAll('tr.ant-table-row'));
|
||||
return items.map(item => {
|
||||
const titleSpan = item.querySelector('span.list-text');
|
||||
const dateCell = item.querySelector('td.ant-table-row-cell-break-word p');
|
||||
|
||||
if (titleSpan && dateCell) {
|
||||
const title = titleSpan.textContent?.trim() || '';
|
||||
const dateStr = dateCell.textContent?.trim() || '';
|
||||
|
||||
if (title.length < 5) return null; // Filter noise
|
||||
|
||||
// URL is not directly available in the table, need to construct from data-row-key
|
||||
const rowKey = item.getAttribute('data-row-key');
|
||||
const url = rowKey ? `${baseUrl}#/purchase/detail?id=${rowKey}` : '';
|
||||
|
||||
return {
|
||||
title,
|
||||
dateStr,
|
||||
url
|
||||
};
|
||||
}
|
||||
return null;
|
||||
}).filter(i => i !== null);
|
||||
}, this.baseUrl);
|
||||
|
||||
if (pageResults.length === 0) {
|
||||
logger.warn(`No results found on page ${currentPage}. Extraction failed.`);
|
||||
break;
|
||||
}
|
||||
|
||||
allResults.push(...pageResults.map(r => ({
|
||||
title: r!.title,
|
||||
publishDate: new Date(r!.dateStr),
|
||||
url: r!.url
|
||||
})));
|
||||
|
||||
logger.log(`Extracted ${pageResults.length} items.`);
|
||||
|
||||
// Pagination: look for the "right" icon SVG
|
||||
const nextButton = await page.$('svg[data-icon="right"]');
|
||||
if (!nextButton) break;
|
||||
|
||||
await nextButton.click();
|
||||
await new Promise(r => setTimeout(r, 5000));
|
||||
currentPage++;
|
||||
}
|
||||
|
||||
return allResults;
|
||||
|
||||
} catch (error) {
|
||||
logger.error(`Crawl failed: ${error.message}`);
|
||||
return allResults;
|
||||
} finally {
|
||||
if (page) await page.close();
|
||||
}
|
||||
},
|
||||
|
||||
extract() { return []; }
|
||||
};
src/crawler/services/chng_target_playwright.spec.ts (new file, 72 lines)
@@ -0,0 +1,72 @@
import { chromium } from 'playwright';
|
||||
import { ChngCrawler } from './chng_target';
|
||||
|
||||
jest.setTimeout(120000);
|
||||
|
||||
describe('ChngCrawler Playwright Test', () => {
|
||||
let browser;
|
||||
|
||||
beforeAll(async () => {
|
||||
browser = await chromium.launch({
|
||||
headless: false,
|
||||
args: ['--no-sandbox', '--disable-setuid-sandbox']
|
||||
});
|
||||
});
|
||||
|
||||
afterAll(async () => {
|
||||
if (browser) {
|
||||
await browser.close();
|
||||
}
|
||||
});
|
||||
|
||||
it('should visit the website and list all found bid information', async () => {
|
||||
console.log(`
|
||||
Starting crawl for: ${ChngCrawler.name}`);
|
||||
console.log(`Target URL: ${ChngCrawler.url}`);
|
||||
|
||||
const context = await browser.newContext({
|
||||
viewport: { width: 1920, height: 1080 },
|
||||
userAgent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
|
||||
});
|
||||
|
||||
const page = await context.newPage();
|
||||
|
||||
// Add stealth scripts if needed, but Playwright is often better at evasion
|
||||
await page.addInitScript(() => {
|
||||
Object.defineProperty(navigator, 'webdriver', { get: () => false });
|
||||
});
|
||||
|
||||
await page.goto(ChngCrawler.url, { waitUntil: 'networkidle', timeout: 60000 });
|
||||
|
||||
// Wait for content
|
||||
try {
|
||||
await page.waitForSelector('.ant-table-row', { timeout: 30000 });
|
||||
} catch (e) {
|
||||
console.warn('Timed out waiting for .ant-table-row');
|
||||
}
|
||||
|
||||
const content = await page.content();
|
||||
|
||||
// Reuse the extraction logic from the Crawler definition.
// Note: in this change ChngCrawler.extract() is a stub that returns [], so this spec
// will always report 0 items until real extraction logic is added there.
const results = ChngCrawler.extract(content);
|
||||
|
||||
console.log(`
|
||||
Successfully found ${results.length} items:
|
||||
`);
|
||||
console.log('----------------------------------------');
|
||||
results.forEach((item, index) => {
|
||||
console.log(`${index + 1}. [${item.publishDate.toLocaleDateString()}] ${item.title}`);
|
||||
console.log(` Link: ${item.url}`);
|
||||
console.log('----------------------------------------');
|
||||
});
|
||||
|
||||
if (results.length === 0) {
|
||||
console.warn('No items found. Debugging content length: ' + content.length);
|
||||
if (content.length < 500) {
|
||||
console.log('Content dump:', content);
|
||||
}
|
||||
}
|
||||
|
||||
expect(Array.isArray(results)).toBeTruthy();
|
||||
});
|
||||
});
src/crawler/services/cnncecp_target.spec.ts (new file, 51 lines)
@@ -0,0 +1,51 @@
import { CnncecpCrawler } from './cnncecp_target';
|
||||
import * as puppeteer from 'puppeteer';
|
||||
|
||||
// Increase timeout to 5 minutes for slow network operations
jest.setTimeout(60000 * 5);
|
||||
|
||||
describe('CnncecpCrawler Real Site Test', () => {
|
||||
let browser: puppeteer.Browser;
|
||||
|
||||
beforeAll(async () => {
|
||||
browser = await puppeteer.launch({
|
||||
headless: false, // Run with a visible browser UI; set to true for headless/CI runs
|
||||
args: ['--no-sandbox', '--disable-setuid-sandbox'],
|
||||
});
|
||||
});
|
||||
|
||||
afterAll(async () => {
|
||||
if (browser) {
|
||||
await browser.close();
|
||||
}
|
||||
});
|
||||
|
||||
it('should visit website and list all found bid information', async () => {
|
||||
console.log(`\nStarting crawl for: ${CnncecpCrawler.name}`);
|
||||
console.log(`Target URL: ${CnncecpCrawler.url}`);
|
||||
|
||||
const results = await CnncecpCrawler.crawl(browser);
|
||||
|
||||
console.log(`\nSuccessfully found ${results.length} items:\n`);
|
||||
console.log('----------------------------------------');
|
||||
results.forEach((item, index) => {
|
||||
console.log(`${index + 1}. [${item.publishDate.toLocaleDateString()}] ${item.title}`);
|
||||
console.log(` Link: ${item.url}`);
|
||||
console.log('----------------------------------------');
|
||||
});
|
||||
|
||||
// Basic assertions to ensure crawler is working
|
||||
expect(results).toBeDefined();
|
||||
expect(Array.isArray(results)).toBeTruthy();
|
||||
// Warn but don't fail if site returns 0 items (could be empty or changed structure)
|
||||
if (results.length === 0) {
|
||||
console.warn('Warning: No items found. Check if website structure has changed or if list is currently empty.');
|
||||
} else {
|
||||
// Check data integrity of first item
|
||||
const firstItem = results[0];
|
||||
expect(firstItem.title).toBeTruthy();
|
||||
expect(firstItem.url).toMatch(/^https?:\/\//);
|
||||
expect(firstItem.publishDate).toBeInstanceOf(Date);
|
||||
}
|
||||
});
|
||||
});
src/crawler/services/cnncecp_target.ts (new file, 194 lines)
@@ -0,0 +1,194 @@
import * as puppeteer from 'puppeteer';
|
||||
import { Logger } from '@nestjs/common';
|
||||
|
||||
// 模拟人类鼠标移动
|
||||
async function simulateHumanMouseMovement(page: puppeteer.Page) {
|
||||
const viewport = page.viewport();
|
||||
if (!viewport) return;
|
||||
|
||||
const movements = 5 + Math.floor(Math.random() * 5); // 5-10次随机移动
|
||||
|
||||
for (let i = 0; i < movements; i++) {
|
||||
const x = Math.floor(Math.random() * viewport.width);
|
||||
const y = Math.floor(Math.random() * viewport.height);
|
||||
|
||||
await page.mouse.move(x, y, {
|
||||
steps: 10 + Math.floor(Math.random() * 20) // 10-30步,使移动更平滑
|
||||
});
|
||||
|
||||
// 随机停顿 100-500ms
|
||||
await new Promise(r => setTimeout(r, 100 + Math.random() * 400));
|
||||
}
|
||||
}
|
||||
|
||||
// 模拟人类滚动
|
||||
async function simulateHumanScrolling(page: puppeteer.Page) {
|
||||
const scrollCount = 3 + Math.floor(Math.random() * 5); // 3-7次滚动
|
||||
|
||||
for (let i = 0; i < scrollCount; i++) {
|
||||
const scrollDistance = 100 + Math.floor(Math.random() * 400); // 100-500px
|
||||
|
||||
await page.evaluate((distance) => {
|
||||
window.scrollBy({
|
||||
top: distance,
|
||||
behavior: 'smooth'
|
||||
});
|
||||
}, scrollDistance);
|
||||
|
||||
// 随机停顿 500-1500ms
|
||||
await new Promise(r => setTimeout(r, 500 + Math.random() * 1000));
|
||||
}
|
||||
|
||||
// 滚动回顶部
|
||||
await page.evaluate(() => {
|
||||
window.scrollTo({ top: 0, behavior: 'smooth' });
|
||||
});
|
||||
await new Promise(r => setTimeout(r, 1000));
|
||||
}
|
||||
|
||||
export interface CnncecpResult {
|
||||
title: string;
|
||||
publishDate: Date;
|
||||
url: string;
|
||||
}
|
||||
|
||||
export const CnncecpCrawler = {
|
||||
name: '中核集团电子采购平台',
|
||||
url: 'https://www.cnncecp.com/xzbgg/index.jhtml',
|
||||
baseUrl: 'https://www.cnncecp.com',
|
||||
|
||||
async crawl(browser: puppeteer.Browser): Promise<CnncecpResult[]> {
|
||||
const logger = new Logger('CnncecpCrawler');
|
||||
const page = await browser.newPage();
|
||||
|
||||
const username = process.env.PROXY_USERNAME;
|
||||
const password = process.env.PROXY_PASSWORD;
|
||||
if (username && password) {
|
||||
await page.authenticate({ username, password });
|
||||
}
|
||||
|
||||
await page.evaluateOnNewDocument(() => {
|
||||
Object.defineProperty(navigator, 'webdriver', { get: () => false });
|
||||
Object.defineProperty(navigator, 'language', { get: () => "zh-CN"});
|
||||
Object.defineProperty(navigator, 'plugins', { get: () => [1,2,3,4,5]});
|
||||
});
|
||||
|
||||
await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36');
|
||||
await page.setViewport({ width: 1920, height: 1080 });
|
||||
|
||||
const allResults: CnncecpResult[] = [];
|
||||
let currentPage = 1;
|
||||
const maxPages = 5;
|
||||
|
||||
try {
|
||||
logger.log(`Navigating to ${this.url}...`);
|
||||
await page.goto(this.url, { waitUntil: 'networkidle2', timeout: 60000 });
|
||||
|
||||
// 模拟人类行为
|
||||
logger.log('Simulating human mouse movements...');
|
||||
await simulateHumanMouseMovement(page);
|
||||
|
||||
logger.log('Simulating human scrolling...');
|
||||
await simulateHumanScrolling(page);
|
||||
|
||||
while (currentPage <= maxPages) {
|
||||
logger.log(`Processing page ${currentPage}...`);
|
||||
|
||||
const content = await page.content();
|
||||
const pageResults = this.extract(content);
|
||||
|
||||
if (pageResults.length === 0) {
|
||||
logger.warn(`No results found on page ${currentPage}, stopping.`);
|
||||
break;
|
||||
}
|
||||
|
||||
allResults.push(...pageResults);
|
||||
logger.log(`Extracted ${pageResults.length} items from page ${currentPage}`);
|
||||
|
||||
// 模拟人类行为 - 翻页前
|
||||
logger.log('Simulating human mouse movements before pagination...');
|
||||
await simulateHumanMouseMovement(page);
|
||||
|
||||
logger.log('Simulating human scrolling before pagination...');
|
||||
await simulateHumanScrolling(page);
|
||||
|
||||
// 查找下一页按钮
|
||||
const nextButtonSelector = 'a[href*="index_"]';
|
||||
const nextButton = await page.$(nextButtonSelector);
|
||||
|
||||
if (!nextButton) {
|
||||
logger.log('Next page button not found. Reached end of list.');
|
||||
break;
|
||||
}
|
||||
|
||||
logger.log(`Navigating to page ${currentPage + 1}...`);
|
||||
|
||||
try {
|
||||
// 点击下一页按钮
|
||||
await nextButton.click();
|
||||
await new Promise(r => setTimeout(r, 3000)); // 等待页面加载
|
||||
} catch (navError) {
|
||||
logger.error(`Navigation to page ${currentPage + 1} failed: ${navError.message}`);
|
||||
break;
|
||||
}
|
||||
|
||||
currentPage++;
|
||||
|
||||
// 模拟人类行为 - 翻页后
|
||||
logger.log('Simulating human mouse movements after pagination...');
|
||||
await simulateHumanMouseMovement(page);
|
||||
|
||||
logger.log('Simulating human scrolling after pagination...');
|
||||
await simulateHumanScrolling(page);
|
||||
|
||||
// Random delay between pages
|
||||
const delay = Math.floor(Math.random() * (3000 - 1000 + 1)) + 1000;
|
||||
await new Promise(resolve => setTimeout(resolve, delay));
|
||||
}
|
||||
|
||||
return allResults;
|
||||
|
||||
} catch (error) {
|
||||
logger.error(`Failed to crawl ${this.name}: ${error.message}`);
|
||||
return allResults;
|
||||
} finally {
|
||||
await page.close();
|
||||
}
|
||||
},
|
||||
|
||||
extract(html: string): CnncecpResult[] {
|
||||
const results: CnncecpResult[] = [];
|
||||
/**
|
||||
* Regex groups for cnncecp.com:
|
||||
* 1: Date (发布时间,格式:2026-01-11)
|
||||
* 2: URL (href属性)
|
||||
* 3: Title (a标签文本)
|
||||
*
|
||||
* HTML结构示例:
|
||||
* <li>
|
||||
* <span class="Right Gray">2026-01-11</span>
|
||||
* <span class="Right Right20"><em class="Red">文件下载截止:2025-12-08 23:59:00</em></span>
|
||||
* <span class="Blue">[变更公告]</span>
|
||||
* <a href="https://www.cnncecp.com/xzbgg/1862778.jhtml">中核四0四有限公司2026-2028年度质量流量控制器等采购项目(二次)变更公告</a>
|
||||
* </li>
|
||||
*/
|
||||
const regex = /<li>[\s\S]*?<span class="Right Gray">\s*(\d{4}-\d{2}-\d{2})\s*<\/span>[\s\S]*?<a[^>]*href="([^"]*)"[^>]*>([^<]*)<\/a>[\s\S]*?<\/li>/gs;
|
||||
|
||||
let match;
|
||||
while ((match = regex.exec(html)) !== null) {
|
||||
const dateStr = match[1]?.trim();
|
||||
const url = match[2]?.trim();
|
||||
const title = match[3]?.trim();
|
||||
|
||||
if (title && url) {
|
||||
results.push({
|
||||
title,
|
||||
publishDate: dateStr ? new Date(dateStr) : new Date(),
|
||||
url: url.startsWith('http') ? url : this.baseUrl + url
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
return results;
|
||||
}
|
||||
};
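The regex in extract() above can be exercised offline, without hitting the live site. A minimal sketch (the spec file name and the shortened sample title are illustrative; the HTML follows the structure documented in the comment above):

// cnncecp_target.extract.spec.ts (hypothetical offline unit test)
import { CnncecpCrawler } from './cnncecp_target';

describe('CnncecpCrawler.extract', () => {
  it('parses a list item shaped like the documented sample', () => {
    const html = `
      <li>
        <span class="Right Gray">2026-01-11</span>
        <span class="Blue">[变更公告]</span>
        <a href="https://www.cnncecp.com/xzbgg/1862778.jhtml">示例变更公告标题</a>
      </li>`;

    const results = CnncecpCrawler.extract(html);

    expect(results).toHaveLength(1);
    expect(results[0].url).toBe('https://www.cnncecp.com/xzbgg/1862778.jhtml');
    expect(results[0].publishDate.getFullYear()).toBe(2026);
  });
});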
src/crawler/services/eps_target.spec.ts (new file, 51 lines)
@@ -0,0 +1,51 @@
import { EpsCrawler } from './eps_target';
|
||||
import * as puppeteer from 'puppeteer';
|
||||
|
||||
// Increase timeout to 5 minutes for slow network operations
jest.setTimeout(60000 * 5);
|
||||
|
||||
describe('EpsCrawler Real Site Test', () => {
|
||||
let browser: puppeteer.Browser;
|
||||
|
||||
beforeAll(async () => {
|
||||
browser = await puppeteer.launch({
|
||||
headless: false, // Run with a visible browser UI; set to true for headless/CI runs
|
||||
args: ['--no-sandbox', '--disable-setuid-sandbox'],
|
||||
});
|
||||
});
|
||||
|
||||
afterAll(async () => {
|
||||
if (browser) {
|
||||
await browser.close();
|
||||
}
|
||||
});
|
||||
|
||||
it('should visit website and list all found bid information', async () => {
|
||||
console.log(`\nStarting crawl for: ${EpsCrawler.name}`);
|
||||
console.log(`Target URL: ${EpsCrawler.url}`);
|
||||
|
||||
const results = await EpsCrawler.crawl(browser);
|
||||
|
||||
console.log(`\nSuccessfully found ${results.length} items:\n`);
|
||||
console.log('----------------------------------------');
|
||||
results.forEach((item, index) => {
|
||||
console.log(`${index + 1}. [${item.publishDate.toLocaleDateString()}] ${item.title}`);
|
||||
console.log(` Link: ${item.url}`);
|
||||
console.log('----------------------------------------');
|
||||
});
|
||||
|
||||
// Basic assertions to ensure crawler is working
|
||||
expect(results).toBeDefined();
|
||||
expect(Array.isArray(results)).toBeTruthy();
|
||||
// Warn but don't fail if site returns 0 items (could be empty or changed structure)
|
||||
if (results.length === 0) {
|
||||
console.warn('Warning: No items found. Check if website structure has changed or if list is currently empty.');
|
||||
} else {
|
||||
// Check data integrity of first item
|
||||
const firstItem = results[0];
|
||||
expect(firstItem.title).toBeTruthy();
|
||||
expect(firstItem.url).toMatch(/^https?:\/\//);
|
||||
expect(firstItem.publishDate).toBeInstanceOf(Date);
|
||||
}
|
||||
});
|
||||
});
src/crawler/services/eps_target.ts (new file, 201 lines)
@@ -0,0 +1,201 @@
import * as puppeteer from 'puppeteer';
|
||||
import { Logger } from '@nestjs/common';
|
||||
|
||||
// 模拟人类鼠标移动
|
||||
async function simulateHumanMouseMovement(page: puppeteer.Page) {
|
||||
const viewport = page.viewport();
|
||||
if (!viewport) return;
|
||||
|
||||
const movements = 5 + Math.floor(Math.random() * 5); // 5-10次随机移动
|
||||
|
||||
for (let i = 0; i < movements; i++) {
|
||||
const x = Math.floor(Math.random() * viewport.width);
|
||||
const y = Math.floor(Math.random() * viewport.height);
|
||||
|
||||
await page.mouse.move(x, y, {
|
||||
steps: 10 + Math.floor(Math.random() * 20) // 10-30步,使移动更平滑
|
||||
});
|
||||
|
||||
// 随机停顿 100-500ms
|
||||
await new Promise(r => setTimeout(r, 100 + Math.random() * 400));
|
||||
}
|
||||
}
|
||||
|
||||
// 模拟人类滚动
|
||||
async function simulateHumanScrolling(page: puppeteer.Page) {
|
||||
const scrollCount = 3 + Math.floor(Math.random() * 5); // 3-7次滚动
|
||||
|
||||
for (let i = 0; i < scrollCount; i++) {
|
||||
const scrollDistance = 100 + Math.floor(Math.random() * 400); // 100-500px
|
||||
|
||||
await page.evaluate((distance) => {
|
||||
window.scrollBy({
|
||||
top: distance,
|
||||
behavior: 'smooth'
|
||||
});
|
||||
}, scrollDistance);
|
||||
|
||||
// 随机停顿 500-1500ms
|
||||
await new Promise(r => setTimeout(r, 500 + Math.random() * 1000));
|
||||
}
|
||||
|
||||
// 滚动回顶部
|
||||
await page.evaluate(() => {
|
||||
window.scrollTo({ top: 0, behavior: 'smooth' });
|
||||
});
|
||||
await new Promise(r => setTimeout(r, 1000));
|
||||
}
|
||||
|
||||
export interface EpsResult {
|
||||
title: string;
|
||||
publishDate: Date;
|
||||
url: string;
|
||||
}
|
||||
|
||||
export const EpsCrawler = {
|
||||
name: '中国三峡集团电子商务平台',
|
||||
url: 'https://eps.ctg.com.cn/cms/channel/1ywgg1/index.htm',
|
||||
baseUrl: 'https://eps.ctg.com.cn',
|
||||
|
||||
async crawl(browser: puppeteer.Browser): Promise<EpsResult[]> {
|
||||
const logger = new Logger('EpsCrawler');
|
||||
const page = await browser.newPage();
|
||||
|
||||
const username = process.env.PROXY_USERNAME;
|
||||
const password = process.env.PROXY_PASSWORD;
|
||||
if (username && password) {
|
||||
await page.authenticate({ username, password });
|
||||
}
|
||||
|
||||
await page.evaluateOnNewDocument(() => {
|
||||
Object.defineProperty(navigator, 'webdriver', { get: () => false });
|
||||
Object.defineProperty(navigator, 'language', { get: () => "zh-CN"});
|
||||
Object.defineProperty(navigator, 'plugins', { get: () => [1,2,3,4,5]});
|
||||
});
|
||||
|
||||
await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36');
|
||||
await page.setViewport({ width: 1920, height: 1080 });
|
||||
|
||||
const allResults: EpsResult[] = [];
|
||||
let currentPage = 1;
|
||||
const maxPages = 5;
|
||||
|
||||
try {
|
||||
logger.log(`Navigating to ${this.url}...`);
|
||||
await page.goto(this.url, { waitUntil: 'networkidle2', timeout: 60000 });
|
||||
|
||||
// 模拟人类行为
|
||||
logger.log('Simulating human mouse movements...');
|
||||
await simulateHumanMouseMovement(page);
|
||||
|
||||
logger.log('Simulating human scrolling...');
|
||||
await simulateHumanScrolling(page);
|
||||
|
||||
while (currentPage <= maxPages) {
|
||||
logger.log(`Processing page ${currentPage}...`);
|
||||
|
||||
const content = await page.content();
|
||||
const pageResults = this.extract(content);
|
||||
|
||||
if (pageResults.length === 0) {
|
||||
logger.warn(`No results found on page ${currentPage}, stopping.`);
|
||||
break;
|
||||
}
|
||||
|
||||
allResults.push(...pageResults);
|
||||
logger.log(`Extracted ${pageResults.length} items from page ${currentPage}`);
|
||||
|
||||
// 模拟人类行为 - 翻页前
|
||||
logger.log('Simulating human mouse movements before pagination...');
|
||||
await simulateHumanMouseMovement(page);
|
||||
|
||||
logger.log('Simulating human scrolling before pagination...');
|
||||
await simulateHumanScrolling(page);
|
||||
|
||||
// 查找下一页按钮 - 根据网站实际结构调整选择器
|
||||
// 分页结构: <a href="javascript:;" aria-label="Next" class="pageItem" page="2">下页</a>
|
||||
const nextButtonSelector = 'a.pageItem[aria-label="Next"]';
|
||||
const nextButton = await page.$(nextButtonSelector);
|
||||
|
||||
if (!nextButton) {
|
||||
logger.log('Next page button not found. Reached end of list.');
|
||||
break;
|
||||
}
|
||||
|
||||
logger.log(`Navigating to page ${currentPage + 1}...`);
|
||||
|
||||
try {
|
||||
// 点击下一页按钮,等待页面更新
|
||||
await nextButton.click();
|
||||
await new Promise(r => setTimeout(r, 3000)); // 等待页面加载
|
||||
} catch (navError) {
|
||||
logger.error(`Navigation to page ${currentPage + 1} failed: ${navError.message}`);
|
||||
break;
|
||||
}
|
||||
|
||||
currentPage++;
|
||||
|
||||
// 模拟人类行为 - 翻页后
|
||||
logger.log('Simulating human mouse movements after pagination...');
|
||||
await simulateHumanMouseMovement(page);
|
||||
|
||||
logger.log('Simulating human scrolling after pagination...');
|
||||
await simulateHumanScrolling(page);
|
||||
|
||||
// Random delay between pages
|
||||
const delay = Math.floor(Math.random() * (3000 - 1000 + 1)) + 1000;
|
||||
await new Promise(resolve => setTimeout(resolve, delay));
|
||||
}
|
||||
|
||||
return allResults;
|
||||
|
||||
} catch (error) {
|
||||
logger.error(`Failed to crawl ${this.name}: ${error.message}`);
|
||||
return allResults;
|
||||
} finally {
|
||||
await page.close();
|
||||
}
|
||||
},
|
||||
|
||||
extract(html: string): EpsResult[] {
|
||||
const results: EpsResult[] = [];
|
||||
/**
|
||||
* Regex groups for eps.ctg.com.cn:
|
||||
* 1: URL (href属性)
|
||||
* 2: Title (title属性)
|
||||
* 3: Date (发布时间,格式:2026-01-09)
|
||||
*
|
||||
* HTML结构示例:
|
||||
* <li name="li_name">
|
||||
* <a id="0" href="https://eps.ctg.com.cn/cms/channel/1ywgg1/240630340.htm"
|
||||
* title="三峡福清兴化湾海上风电场一期项目金风Y6风机发电机更换施工招标公告"
|
||||
* target="_blank" style="">
|
||||
* <span style="max-width: 700px;">
|
||||
* <i class="iconfont"></i>
|
||||
* <em style="width:6.5em; color: #1e52a8;font-weight: 700;float: none;"></em>
|
||||
* 三峡福清兴化湾海上风电场一期项目金风Y6风机发电机更换施工招标公告
|
||||
* </span>
|
||||
* <em>2026-01-09</em>
|
||||
* </a>
|
||||
* </li>
|
||||
*/
|
||||
const regex = /<li[^>]*name="li_name"[^>]*>[\s\S]*?<a[^>]*href="([^"]*)"[^>]*title="([^"]*)"[^>]*>[\s\S]*?<em>\s*(\d{4}-\d{2}-\d{2})\s*<\/em>[\s\S]*?<\/a>[\s\S]*?<\/li>/gs;
|
||||
|
||||
let match;
|
||||
while ((match = regex.exec(html)) !== null) {
|
||||
const url = match[1]?.trim();
|
||||
const title = match[2]?.trim();
|
||||
const dateStr = match[3]?.trim();
|
||||
|
||||
if (title && url) {
|
||||
results.push({
|
||||
title,
|
||||
publishDate: dateStr ? new Date(dateStr) : new Date(),
|
||||
url: url.startsWith('http') ? url : this.baseUrl + url
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
return results;
|
||||
}
|
||||
};
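simulateHumanMouseMovement and simulateHumanScrolling above are copied verbatim into each crawler file (cnncecp, eps, and the espic, powerbeijing and szecp files below). One possible consolidation is a shared helper module; a sketch, assuming a new file src/crawler/services/human_behavior.ts (the module name is an assumption, the bodies mirror the existing helpers):

// human_behavior.ts - hypothetical shared module for the duplicated helpers
import * as puppeteer from 'puppeteer';

export async function simulateHumanMouseMovement(page: puppeteer.Page): Promise<void> {
  const viewport = page.viewport();
  if (!viewport) return;
  const movements = 5 + Math.floor(Math.random() * 5); // 5-10 random moves
  for (let i = 0; i < movements; i++) {
    const x = Math.floor(Math.random() * viewport.width);
    const y = Math.floor(Math.random() * viewport.height);
    await page.mouse.move(x, y, { steps: 10 + Math.floor(Math.random() * 20) });
    await new Promise(r => setTimeout(r, 100 + Math.random() * 400));
  }
}

export async function simulateHumanScrolling(page: puppeteer.Page): Promise<void> {
  const scrollCount = 3 + Math.floor(Math.random() * 5); // 3-7 scrolls
  for (let i = 0; i < scrollCount; i++) {
    const distance = 100 + Math.floor(Math.random() * 400); // 100-500px per scroll
    await page.evaluate(d => window.scrollBy({ top: d, behavior: 'smooth' }), distance);
    await new Promise(r => setTimeout(r, 500 + Math.random() * 1000));
  }
  // Scroll back to the top
  await page.evaluate(() => window.scrollTo({ top: 0, behavior: 'smooth' }));
  await new Promise(r => setTimeout(r, 1000));
}

// Each crawler would then only need:
// import { simulateHumanMouseMovement, simulateHumanScrolling } from './human_behavior';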
src/crawler/services/espic_target.spec.ts (new file, 51 lines)
@@ -0,0 +1,51 @@
import { EspicCrawler } from './espic_target';
|
||||
import * as puppeteer from 'puppeteer';
|
||||
|
||||
// Increase timeout to 5 minutes for slow network operations
jest.setTimeout(60000 * 5);
|
||||
|
||||
describe('EspicCrawler Real Site Test', () => {
|
||||
let browser: puppeteer.Browser;
|
||||
|
||||
beforeAll(async () => {
|
||||
browser = await puppeteer.launch({
|
||||
headless: false, // Run with a visible browser UI; set to true for headless/CI runs
|
||||
args: ['--no-sandbox', '--disable-setuid-sandbox'],
|
||||
});
|
||||
});
|
||||
|
||||
afterAll(async () => {
|
||||
if (browser) {
|
||||
await browser.close();
|
||||
}
|
||||
});
|
||||
|
||||
it('should visit website and list all found bid information', async () => {
|
||||
console.log(`\nStarting crawl for: ${EspicCrawler.name}`);
|
||||
console.log(`Target URL: ${EspicCrawler.getUrl()}`);
|
||||
|
||||
const results = await EspicCrawler.crawl(browser);
|
||||
|
||||
console.log(`\nSuccessfully found ${results.length} items:\n`);
|
||||
console.log('----------------------------------------');
|
||||
results.forEach((item, index) => {
|
||||
console.log(`${index + 1}. [${item.publishDate.toLocaleDateString()}] ${item.title}`);
|
||||
console.log(` Link: ${item.url}`);
|
||||
console.log('----------------------------------------');
|
||||
});
|
||||
|
||||
// Basic assertions to ensure crawler is working
|
||||
expect(results).toBeDefined();
|
||||
expect(Array.isArray(results)).toBeTruthy();
|
||||
// Warn but don't fail if site returns 0 items (could be empty or changed structure)
|
||||
if (results.length === 0) {
|
||||
console.warn('Warning: No items found. Check if website structure has changed or if list is currently empty.');
|
||||
} else {
|
||||
// Check data integrity of first item
|
||||
const firstItem = results[0];
|
||||
expect(firstItem.title).toBeTruthy();
|
||||
expect(firstItem.url).toMatch(/^https?:\/\//);
|
||||
expect(firstItem.publishDate).toBeInstanceOf(Date);
|
||||
}
|
||||
});
|
||||
});
src/crawler/services/espic_target.ts (new file, 247 lines)
@@ -0,0 +1,247 @@
import * as puppeteer from 'puppeteer';
|
||||
import { Logger } from '@nestjs/common';
|
||||
|
||||
// 模拟人类鼠标移动
|
||||
async function simulateHumanMouseMovement(page: puppeteer.Page) {
|
||||
const viewport = page.viewport();
|
||||
if (!viewport) return;
|
||||
|
||||
const movements = 5 + Math.floor(Math.random() * 5); // 5-10次随机移动
|
||||
|
||||
for (let i = 0; i < movements; i++) {
|
||||
const x = Math.floor(Math.random() * viewport.width);
|
||||
const y = Math.floor(Math.random() * viewport.height);
|
||||
|
||||
await page.mouse.move(x, y, {
|
||||
steps: 10 + Math.floor(Math.random() * 20) // 10-30步,使移动更平滑
|
||||
});
|
||||
|
||||
// 随机停顿 100-500ms
|
||||
await new Promise(r => setTimeout(r, 100 + Math.random() * 400));
|
||||
}
|
||||
}
|
||||
|
||||
// 模拟人类滚动
|
||||
async function simulateHumanScrolling(page: puppeteer.Page) {
|
||||
const scrollCount = 3 + Math.floor(Math.random() * 5); // 3-7次滚动
|
||||
|
||||
for (let i = 0; i < scrollCount; i++) {
|
||||
const scrollDistance = 100 + Math.floor(Math.random() * 400); // 100-500px
|
||||
|
||||
await page.evaluate((distance) => {
|
||||
window.scrollBy({
|
||||
top: distance,
|
||||
behavior: 'smooth'
|
||||
});
|
||||
}, scrollDistance);
|
||||
|
||||
// 随机停顿 500-1500ms
|
||||
await new Promise(r => setTimeout(r, 500 + Math.random() * 1000));
|
||||
}
|
||||
|
||||
// 滚动回顶部
|
||||
await page.evaluate(() => {
|
||||
window.scrollTo({ top: 0, behavior: 'smooth' });
|
||||
});
|
||||
await new Promise(r => setTimeout(r, 1000));
|
||||
}
|
||||
|
||||
export interface EspicResult {
|
||||
title: string;
|
||||
publishDate: Date;
|
||||
url: string;
|
||||
}
|
||||
|
||||
export const EspicCrawler = {
|
||||
name: '电能e招采平台',
|
||||
baseUrl: 'https://ebid.espic.com.cn',
|
||||
|
||||
// 生成动态 URL,使用当前日期
|
||||
getUrl(page: number = 1): string {
|
||||
const now = new Date();
|
||||
const year = now.getFullYear();
|
||||
const month = now.getMonth() + 1; // 月份从0开始
|
||||
const day = now.getDate();
|
||||
const timeStr = `${year}-${month}-${day}`;
|
||||
return `https://ebid.espic.com.cn/newgdtcms//category/iframe.html?dates=300&categoryId=2&tenderMethod=01&tabName=&page=${page}&time=${timeStr}`;
|
||||
},
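  // Illustrative example of the generated URL: for January 11, 2026 and page 2,
  // getUrl(2) yields ...&page=2&time=2026-1-11 — note that month and day are not
  // zero-padded; this is assumed to be acceptable to the endpoint. If the server
  // turns out to require 2026-01-11, pad with String(month).padStart(2, '0').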
|
||||
|
||||
async crawl(browser: puppeteer.Browser): Promise<EspicResult[]> {
|
||||
const logger = new Logger('EspicCrawler');
|
||||
const page = await browser.newPage();
|
||||
|
||||
const username = process.env.PROXY_USERNAME;
|
||||
const password = process.env.PROXY_PASSWORD;
|
||||
if (username && password) {
|
||||
await page.authenticate({ username, password });
|
||||
}
|
||||
|
||||
await page.evaluateOnNewDocument(() => {
|
||||
Object.defineProperty(navigator, 'webdriver', { get: () => false });
|
||||
Object.defineProperty(navigator, 'language', { get: () => "zh-CN"});
|
||||
Object.defineProperty(navigator, 'plugins', { get: () => [1,2,3,4,5]});
|
||||
});
|
||||
|
||||
await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36');
|
||||
await page.setViewport({ width: 1920, height: 1080 });
|
||||
|
||||
const allResults: EspicResult[] = [];
|
||||
let currentPage = 1;
|
||||
const maxPages = 5;
|
||||
|
||||
try {
|
||||
const url = this.getUrl(currentPage);
|
||||
logger.log(`Navigating to ${url}...`);
|
||||
await page.goto(url, { waitUntil: 'networkidle2', timeout: 60000 });
|
||||
|
||||
// 等待 WAF 验证通过
|
||||
logger.log('Waiting for WAF verification...');
|
||||
await page.waitForFunction(
|
||||
() => {
|
||||
// 检查是否已经通过验证(页面不再是 WAF 页面)
|
||||
const bodyText = document.body?.textContent || '';
|
||||
return !bodyText.includes('人机识别检测') && !bodyText.includes('WEB 应用防火墙');
|
||||
},
|
||||
{ timeout: 30000 }
|
||||
);
|
||||
|
||||
// 模拟人类行为
|
||||
logger.log('Simulating human mouse movements...');
|
||||
await simulateHumanMouseMovement(page);
|
||||
|
||||
logger.log('Simulating human scrolling...');
|
||||
await simulateHumanScrolling(page);
|
||||
|
||||
while (currentPage <= maxPages) {
|
||||
logger.log(`Processing page ${currentPage}...`);
|
||||
|
||||
const content = await page.content();
|
||||
const pageResults = this.extract(content);
|
||||
|
||||
if (pageResults.length === 0) {
|
||||
logger.warn(`No results found on page ${currentPage}, stopping.`);
|
||||
break;
|
||||
}
|
||||
|
||||
allResults.push(...pageResults);
|
||||
logger.log(`Extracted ${pageResults.length} items from page ${currentPage}`);
|
||||
|
||||
// 模拟人类行为 - 翻页前
|
||||
logger.log('Simulating human mouse movements before pagination...');
|
||||
await simulateHumanMouseMovement(page);
|
||||
|
||||
logger.log('Simulating human scrolling before pagination...');
|
||||
await simulateHumanScrolling(page);
|
||||
|
||||
// 查找下一页按钮 - 根据网站实际结构调整选择器
|
||||
// 实际结构: <a href="javascript:void(0);" onclick="turnPage(2);">下一页</a>
|
||||
const nextButtonSelectors = [
|
||||
'a[onclick*="turnPage"]',
|
||||
'a:contains("下一页")', // note: ':contains' is not a standard CSS selector, so page.$ rejects it; the try/catch below skips it
|
||||
'a[aria-label="Next"]',
|
||||
'a.next',
|
||||
'li.next a',
|
||||
'a.layui-laypage-next:not(.layui-disabled)'
|
||||
];
|
||||
|
||||
let nextButton: puppeteer.ElementHandle<Element> | null = null;
|
||||
for (const selector of nextButtonSelectors) {
|
||||
try {
|
||||
nextButton = await page.$(selector);
|
||||
if (nextButton) break;
|
||||
} catch (e) {
|
||||
// 继续尝试下一个选择器
|
||||
}
|
||||
}
|
||||
|
||||
if (!nextButton) {
|
||||
logger.log('Next page button not found. Reached end of list.');
|
||||
break;
|
||||
}
|
||||
|
||||
logger.log(`Navigating to page ${currentPage + 1}...`);
|
||||
|
||||
try {
|
||||
// 点击下一页按钮
|
||||
await nextButton.click();
|
||||
await new Promise(r => setTimeout(r, 3000)); // 等待页面加载
|
||||
} catch (navError) {
|
||||
logger.error(`Navigation to page ${currentPage + 1} failed: ${navError.message}`);
|
||||
break;
|
||||
}
|
||||
|
||||
currentPage++;
|
||||
|
||||
// 模拟人类行为 - 翻页后
|
||||
logger.log('Simulating human mouse movements after pagination...');
|
||||
await simulateHumanMouseMovement(page);
|
||||
|
||||
logger.log('Simulating human scrolling after pagination...');
|
||||
await simulateHumanScrolling(page);
|
||||
|
||||
// Random delay between pages
|
||||
const delay = Math.floor(Math.random() * (3000 - 1000 + 1)) + 1000;
|
||||
await new Promise(resolve => setTimeout(resolve, delay));
|
||||
}
|
||||
|
||||
return allResults;
|
||||
|
||||
} catch (error) {
|
||||
logger.error(`Failed to crawl ${this.name}: ${error.message}`);
|
||||
return allResults;
|
||||
} finally {
|
||||
await page.close();
|
||||
}
|
||||
},
|
||||
|
||||
extract(html: string): EspicResult[] {
|
||||
const results: EspicResult[] = [];
|
||||
/**
|
||||
* Regex groups for ebid.espic.com.cn:
|
||||
* 1: URL (href属性)
|
||||
* 2: Title (title属性,完整的招标工程名称)
|
||||
* 3: Date (发布时间,格式:2026-01-11)
|
||||
*
|
||||
* HTML结构示例:
|
||||
* <li>
|
||||
* <a href="https://ebid.espic.com.cn/sdny_bulletin/2026-01-11/977309.html"
|
||||
* title="GJDTHN2025225-国家电投集团河南公司平顶山发电2026年-2028年(两年期)旺河灰场运维项目招标公告"
|
||||
* class="clearfix" target="_blank">
|
||||
* <div class="row">
|
||||
* <div class="col-10 ">
|
||||
* <h5>GJDTHN2025225-国家电投集团河南公司平顶山发电2026年-2028年(两年期)旺...</h5>
|
||||
* <dl class="newsinfo row">
|
||||
* <dd class="col">招标编号:<span>DNYZC-2026-01-11-001</span></dd>
|
||||
* <dd class="col">招标方式:<span>公开招标</span></dd>
|
||||
* <dd class="col">报名截止时间:<span>2026-01-19</span></dd>
|
||||
* </dl>
|
||||
* </div>
|
||||
* <div class="col-2 ">
|
||||
* <div class="newsDate">
|
||||
* <div>2026-01-11</div>
|
||||
* </div>
|
||||
* </div>
|
||||
* </div>
|
||||
* </a>
|
||||
* </li>
|
||||
*/
|
||||
const regex = /<li>[\s\S]*?<a[^>]*href="([^"]*)"[^>]*title="([^"]*)"[^>]*>[\s\S]*?<div class="newsDate">[\s\S]*?<div>\s*(\d{4}-\d{2}-\d{2})\s*<\/div>[\s\S]*?<\/div>[\s\S]*?<\/a>[\s\S]*?<\/li>/gs;
|
||||
|
||||
let match;
|
||||
while ((match = regex.exec(html)) !== null) {
|
||||
const url = match[1]?.trim();
|
||||
const title = match[2]?.trim();
|
||||
const dateStr = match[3]?.trim();
|
||||
|
||||
if (title && url) {
|
||||
results.push({
|
||||
title,
|
||||
publishDate: dateStr ? new Date(dateStr) : new Date(),
|
||||
url: url.startsWith('http') ? url : this.baseUrl + url
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
return results;
|
||||
}
|
||||
};
src/crawler/services/powerbeijing_target.spec.ts (new file, 51 lines)
@@ -0,0 +1,51 @@
import { PowerbeijingCrawler } from './powerbeijing_target';
|
||||
import * as puppeteer from 'puppeteer';
|
||||
|
||||
// Increase timeout to 5 minutes for slow network operations
jest.setTimeout(60000 * 5);
|
||||
|
||||
describe('PowerbeijingCrawler Real Site Test', () => {
|
||||
let browser: puppeteer.Browser;
|
||||
|
||||
beforeAll(async () => {
|
||||
browser = await puppeteer.launch({
|
||||
headless: false, // Run with a visible browser UI; set to true for headless/CI runs
|
||||
args: ['--no-sandbox', '--disable-setuid-sandbox'],
|
||||
});
|
||||
});
|
||||
|
||||
afterAll(async () => {
|
||||
if (browser) {
|
||||
await browser.close();
|
||||
}
|
||||
});
|
||||
|
||||
it('should visit website and list all found bid information', async () => {
|
||||
console.log(`\nStarting crawl for: ${PowerbeijingCrawler.name}`);
|
||||
console.log(`Target URL: ${PowerbeijingCrawler.url}`);
|
||||
|
||||
const results = await PowerbeijingCrawler.crawl(browser);
|
||||
|
||||
console.log(`\nSuccessfully found ${results.length} items:\n`);
|
||||
console.log('----------------------------------------');
|
||||
results.forEach((item, index) => {
|
||||
console.log(`${index + 1}. [${item.publishDate.toLocaleDateString()}] ${item.title}`);
|
||||
console.log(` Link: ${item.url}`);
|
||||
console.log('----------------------------------------');
|
||||
});
|
||||
|
||||
// Basic assertions to ensure crawler is working
|
||||
expect(results).toBeDefined();
|
||||
expect(Array.isArray(results)).toBeTruthy();
|
||||
// Warn but don't fail if site returns 0 items (could be empty or changed structure)
|
||||
if (results.length === 0) {
|
||||
console.warn('Warning: No items found. Check if website structure has changed or if list is currently empty.');
|
||||
} else {
|
||||
// Check data integrity of first item
|
||||
const firstItem = results[0];
|
||||
expect(firstItem.title).toBeTruthy();
|
||||
expect(firstItem.url).toMatch(/^https?:\/\//);
|
||||
expect(firstItem.publishDate).toBeInstanceOf(Date);
|
||||
}
|
||||
});
|
||||
});
src/crawler/services/powerbeijing_target.ts (new file, 198 lines)
@@ -0,0 +1,198 @@
import * as puppeteer from 'puppeteer';
|
||||
import { Logger } from '@nestjs/common';
|
||||
|
||||
// 模拟人类鼠标移动
|
||||
async function simulateHumanMouseMovement(page: puppeteer.Page) {
|
||||
const viewport = page.viewport();
|
||||
if (!viewport) return;
|
||||
|
||||
const movements = 5 + Math.floor(Math.random() * 5); // 5-10次随机移动
|
||||
|
||||
for (let i = 0; i < movements; i++) {
|
||||
const x = Math.floor(Math.random() * viewport.width);
|
||||
const y = Math.floor(Math.random() * viewport.height);
|
||||
|
||||
await page.mouse.move(x, y, {
|
||||
steps: 10 + Math.floor(Math.random() * 20) // 10-30步,使移动更平滑
|
||||
});
|
||||
|
||||
// 随机停顿 100-500ms
|
||||
await new Promise(r => setTimeout(r, 100 + Math.random() * 400));
|
||||
}
|
||||
}
|
||||
|
||||
// 模拟人类滚动
|
||||
async function simulateHumanScrolling(page: puppeteer.Page) {
|
||||
const scrollCount = 3 + Math.floor(Math.random() * 5); // 3-7次滚动
|
||||
|
||||
for (let i = 0; i < scrollCount; i++) {
|
||||
const scrollDistance = 100 + Math.floor(Math.random() * 400); // 100-500px
|
||||
|
||||
await page.evaluate((distance) => {
|
||||
window.scrollBy({
|
||||
top: distance,
|
||||
behavior: 'smooth'
|
||||
});
|
||||
}, scrollDistance);
|
||||
|
||||
// 随机停顿 500-1500ms
|
||||
await new Promise(r => setTimeout(r, 500 + Math.random() * 1000));
|
||||
}
|
||||
|
||||
// 滚动回顶部
|
||||
await page.evaluate(() => {
|
||||
window.scrollTo({ top: 0, behavior: 'smooth' });
|
||||
});
|
||||
await new Promise(r => setTimeout(r, 1000));
|
||||
}
|
||||
|
||||
export interface PowerbeijingResult {
|
||||
title: string;
|
||||
publishDate: Date;
|
||||
url: string;
|
||||
}
|
||||
|
||||
export const PowerbeijingCrawler = {
|
||||
name: '北京电力交易平台',
|
||||
url: 'https://www.powerbeijing-ec.com/jncms/search/bulletin.html?dates=300&categoryId=2&tabName=%E6%8B%9B%E6%A0%87%E5%85%AC%E5%91%8A&page=1',
|
||||
baseUrl: 'https://www.powerbeijing-ec.com',
|
||||
|
||||
async crawl(browser: puppeteer.Browser): Promise<PowerbeijingResult[]> {
|
||||
const logger = new Logger('PowerbeijingCrawler');
|
||||
const page = await browser.newPage();
|
||||
|
||||
const username = process.env.PROXY_USERNAME;
|
||||
const password = process.env.PROXY_PASSWORD;
|
||||
if (username && password) {
|
||||
await page.authenticate({ username, password });
|
||||
}
|
||||
|
||||
await page.evaluateOnNewDocument(() => {
|
||||
Object.defineProperty(navigator, 'webdriver', { get: () => false });
|
||||
Object.defineProperty(navigator, 'language', { get: () => "zh-CN"});
|
||||
Object.defineProperty(navigator, 'plugins', { get: () => [1,2,3,4,5]});
|
||||
});
|
||||
|
||||
await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36');
|
||||
await page.setViewport({ width: 1920, height: 1080 });
|
||||
|
||||
const allResults: PowerbeijingResult[] = [];
|
||||
let currentPage = 1;
|
||||
const maxPages = 5;
|
||||
|
||||
try {
|
||||
logger.log(`Navigating to ${this.url}...`);
|
||||
await page.goto(this.url, { waitUntil: 'networkidle2', timeout: 60000 });
|
||||
|
||||
// 模拟人类行为
|
||||
logger.log('Simulating human mouse movements...');
|
||||
await simulateHumanMouseMovement(page);
|
||||
|
||||
logger.log('Simulating human scrolling...');
|
||||
await simulateHumanScrolling(page);
|
||||
|
||||
while (currentPage <= maxPages) {
|
||||
logger.log(`Processing page ${currentPage}...`);
|
||||
|
||||
const content = await page.content();
|
||||
const pageResults = this.extract(content);
|
||||
|
||||
if (pageResults.length === 0) {
|
||||
logger.warn(`No results found on page ${currentPage}, stopping.`);
|
||||
break;
|
||||
}
|
||||
|
||||
allResults.push(...pageResults);
|
||||
logger.log(`Extracted ${pageResults.length} items from page ${currentPage}`);
|
||||
|
||||
// 模拟人类行为 - 翻页前
|
||||
logger.log('Simulating human mouse movements before pagination...');
|
||||
await simulateHumanMouseMovement(page);
|
||||
|
||||
logger.log('Simulating human scrolling before pagination...');
|
||||
await simulateHumanScrolling(page);
|
||||
|
||||
// 查找下一页按钮 - 根据网站实际结构调整选择器
|
||||
// 分页结构: <a href="javascript:void(0);" onclick="turnPage(2);">下一页</a>
|
||||
const nextButtonSelector = 'a[onclick*="turnPage"]';
|
||||
const nextButton = await page.$(nextButtonSelector);
|
||||
|
||||
if (!nextButton) {
|
||||
logger.log('Next page button not found. Reached end of list.');
|
||||
break;
|
||||
}
|
||||
|
||||
logger.log(`Navigating to page ${currentPage + 1}...`);
|
||||
|
||||
try {
|
||||
// 点击下一页按钮,等待页面更新
|
||||
await nextButton.click();
|
||||
await new Promise(r => setTimeout(r, 3000)); // 等待页面加载
|
||||
} catch (navError) {
|
||||
logger.error(`Navigation to page ${currentPage + 1} failed: ${navError.message}`);
|
||||
break;
|
||||
}
|
||||
|
||||
currentPage++;
|
||||
|
||||
// 模拟人类行为 - 翻页后
|
||||
logger.log('Simulating human mouse movements after pagination...');
|
||||
await simulateHumanMouseMovement(page);
|
||||
|
||||
logger.log('Simulating human scrolling after pagination...');
|
||||
await simulateHumanScrolling(page);
|
||||
|
||||
// Random delay between pages
|
||||
const delay = Math.floor(Math.random() * (3000 - 1000 + 1)) + 1000;
|
||||
await new Promise(resolve => setTimeout(resolve, delay));
|
||||
}
|
||||
|
||||
return allResults;
|
||||
|
||||
} catch (error) {
|
||||
logger.error(`Failed to crawl ${this.name}: ${error.message}`);
|
||||
return allResults;
|
||||
} finally {
|
||||
await page.close();
|
||||
}
|
||||
},
|
||||
|
||||
extract(html: string): PowerbeijingResult[] {
|
||||
const results: PowerbeijingResult[] = [];
|
||||
/**
|
||||
* Regex groups for powerbeijing-ec.com:
|
||||
* 1: URL (href属性)
|
||||
* 2: Title (title属性)
|
||||
* 3: Date (发布时间,格式:2026-01-09)
|
||||
*
|
||||
* HTML结构示例:
|
||||
* <li>
|
||||
* <a href="https://www.powerbeijing-ec.com/biddingBulletin/2026-01-09/302075.html"
|
||||
* title="内蒙古京隆发电有限责任公司#1、#2机组锅炉受热面、空预器、MGGH高压水冲洗招标公告">
|
||||
* <h1>内蒙古京隆发电有限责任公司#1、#2机组锅炉受热面、空预器、MGGH高压水冲洗招标公告</h1>
|
||||
* <div class="newsDate">
|
||||
* <div>2026-01-09</div>
|
||||
* </div>
|
||||
* </a>
|
||||
* </li>
|
||||
*/
|
||||
const regex = /<li>[\s\S]*?<a[^>]*href="([^"]*)"[^>]*title="([^"]*)"[^>]*>[\s\S]*?<div class="newsDate">[\s\S]*?<div>\s*(\d{4}-\d{2}-\d{2})\s*<\/div>[\s\S]*?<\/div>[\s\S]*?<\/a>[\s\S]*?<\/li>/gs;
|
||||
|
||||
let match;
|
||||
while ((match = regex.exec(html)) !== null) {
|
||||
const url = match[1]?.trim();
|
||||
const title = match[2]?.trim();
|
||||
const dateStr = match[3]?.trim();
|
||||
|
||||
if (title && url) {
|
||||
results.push({
|
||||
title,
|
||||
publishDate: dateStr ? new Date(dateStr) : new Date(),
|
||||
url: url.startsWith('http') ? url : this.baseUrl + url
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
return results;
|
||||
}
|
||||
};
src/crawler/services/szecp_target.spec.ts (new file, 61 lines)
@@ -0,0 +1,61 @@
import { SzecpCrawler } from './szecp_target';
|
||||
import * as puppeteer from 'puppeteer';
|
||||
|
||||
// Increase timeout to 120 seconds for manual inspection and slow sites
|
||||
jest.setTimeout(120000);
|
||||
|
||||
describe('SzecpCrawler Real Site Test', () => {
|
||||
let browser: puppeteer.Browser;
|
||||
|
||||
beforeAll(async () => {
|
||||
browser = await puppeteer.launch({
|
||||
headless: false, // Run in non-headless mode
|
||||
args: [
|
||||
'--no-sandbox',
|
||||
'--disable-setuid-sandbox',
|
||||
'--disable-blink-features=AutomationControlled',
|
||||
'--window-size=1920,1080',
|
||||
'--disable-infobars',
|
||||
],
|
||||
defaultViewport: null
|
||||
});
|
||||
});
|
||||
|
||||
afterAll(async () => {
|
||||
if (browser) {
|
||||
// Keep the browser open for 50 seconds after the tests so the result can be inspected manually
await new Promise(r => setTimeout(r, 50000));
|
||||
await browser.close();
|
||||
}
|
||||
});
|
||||
|
||||
it('should visit website and list all found bid information', async () => {
|
||||
console.log(`
|
||||
Starting crawl for: ${SzecpCrawler.name}`);
|
||||
console.log(`Target URL: ${SzecpCrawler.url}`);
|
||||
|
||||
const results = await SzecpCrawler.crawl(browser);
|
||||
|
||||
console.log(`
|
||||
Successfully found ${results.length} items:
|
||||
`);
|
||||
console.log('----------------------------------------');
|
||||
results.forEach((item, index) => {
|
||||
console.log(`${index + 1}. [${item.publishDate.toLocaleDateString()}] ${item.title}`);
|
||||
console.log(` Link: ${item.url}`);
|
||||
console.log('----------------------------------------');
|
||||
});
|
||||
|
||||
expect(results).toBeDefined();
|
||||
expect(Array.isArray(results)).toBeTruthy();
|
||||
|
||||
if (results.length === 0) {
|
||||
console.warn('Warning: No items found. Observe browser window to see if content is loading or if there is a verification challenge.');
|
||||
} else {
|
||||
const firstItem = results[0];
|
||||
expect(firstItem.title).toBeTruthy();
|
||||
expect(firstItem.url).toMatch(/^https?:\/\//);
|
||||
expect(firstItem.publishDate).toBeInstanceOf(Date);
|
||||
}
|
||||
});
|
||||
});
src/crawler/services/szecp_target.ts (new file, 170 lines)
@@ -0,0 +1,170 @@
import * as puppeteer from 'puppeteer';
|
||||
import { Logger } from '@nestjs/common';
|
||||
import { ChdtpResult } from './chdtp_target';
|
||||
|
||||
// 模拟人类鼠标移动
|
||||
async function simulateHumanMouseMovement(page: puppeteer.Page) {
|
||||
const viewport = page.viewport();
|
||||
if (!viewport) return;
|
||||
|
||||
const movements = 5 + Math.floor(Math.random() * 5); // 5-10次随机移动
|
||||
|
||||
for (let i = 0; i < movements; i++) {
|
||||
const x = Math.floor(Math.random() * viewport.width);
|
||||
const y = Math.floor(Math.random() * viewport.height);
|
||||
|
||||
await page.mouse.move(x, y, {
|
||||
steps: 10 + Math.floor(Math.random() * 20) // 10-30步,使移动更平滑
|
||||
});
|
||||
|
||||
// 随机停顿 100-500ms
|
||||
await new Promise(r => setTimeout(r, 100 + Math.random() * 400));
|
||||
}
|
||||
}
|
||||
|
||||
// 模拟人类滚动
|
||||
async function simulateHumanScrolling(page: puppeteer.Page) {
|
||||
const scrollCount = 3 + Math.floor(Math.random() * 5); // 3-7次滚动
|
||||
|
||||
for (let i = 0; i < scrollCount; i++) {
|
||||
const scrollDistance = 100 + Math.floor(Math.random() * 400); // 100-500px
|
||||
|
||||
await page.evaluate((distance) => {
|
||||
window.scrollBy({
|
||||
top: distance,
|
||||
behavior: 'smooth'
|
||||
});
|
||||
}, scrollDistance);
|
||||
|
||||
// 随机停顿 500-1500ms
|
||||
await new Promise(r => setTimeout(r, 500 + Math.random() * 1000));
|
||||
}
|
||||
|
||||
// 滚动回顶部
|
||||
await page.evaluate(() => {
|
||||
window.scrollTo({ top: 0, behavior: 'smooth' });
|
||||
});
|
||||
await new Promise(r => setTimeout(r, 1000));
|
||||
}
|
||||
|
||||
export const SzecpCrawler = {
|
||||
name: '华润守正采购交易平台',
|
||||
url: 'https://www.szecp.com.cn/first_zbgg/index.html',
|
||||
baseUrl: 'https://www.szecp.com.cn',
|
||||
|
||||
async crawl(browser: puppeteer.Browser): Promise<ChdtpResult[]> {
|
||||
const logger = new Logger('SzecpCrawler');
|
||||
const page = await browser.newPage();
|
||||
|
||||
const username = process.env.PROXY_USERNAME;
|
||||
const password = process.env.PROXY_PASSWORD;
|
||||
if (username && password) {
|
||||
await page.authenticate({ username, password });
|
||||
}
|
||||
|
||||
await page.evaluateOnNewDocument(() => {
|
||||
Object.defineProperty(navigator, 'webdriver', { get: () => false });
|
||||
Object.defineProperty(navigator, 'language', { get: () => 'zh-CN' });
|
||||
Object.defineProperty(navigator, 'plugins', { get: () => [1, 2, 3, 4, 5] });
|
||||
});
|
||||
|
||||
await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36');
|
||||
await page.setViewport({ width: 1920, height: 1080 });
|
||||
|
||||
const allResults: ChdtpResult[] = [];
|
||||
let currentPage = 1;
|
||||
const maxPages = 5;
|
||||
|
||||
try {
|
||||
logger.log(`Navigating to ${this.url}...`);
|
||||
await page.goto(this.url, { waitUntil: 'networkidle2', timeout: 60000 });
|
||||
|
||||
// 模拟人类行为
|
||||
logger.log('Simulating human mouse movements...');
|
||||
await simulateHumanMouseMovement(page);
|
||||
|
||||
logger.log('Simulating human scrolling...');
|
||||
await simulateHumanScrolling(page);
|
||||
|
||||
// Wait for search button to be available and click it
|
||||
logger.log('Clicking search button...');
|
||||
await page.waitForSelector('.szb-zbcgSearch-key-v1', { timeout: 60000 });
|
||||
await page.click('.szb-zbcgSearch-key-v1');
|
||||
await new Promise(r => setTimeout(r, 3000)); // Wait for results to load
|
||||
|
||||
while (currentPage <= maxPages) {
|
||||
logger.log(`Processing page ${currentPage}...`);
|
||||
|
||||
// Wait for content to load
|
||||
await page.waitForFunction(() => {
|
||||
return document.querySelectorAll('.szb-zbcgTable-other').length > 0;
|
||||
}, { timeout: 60000 }).catch(() => logger.warn('Content not found. Site might be slow.'));
|
||||
|
||||
const pageResults = await page.evaluate((baseUrl) => {
|
||||
// Extract from table rows
|
||||
const items = Array.from(document.querySelectorAll('.szb-zbcgTable-other'));
|
||||
return items.map(item => {
|
||||
const divs = item.querySelectorAll('div');
|
||||
if (divs.length >= 5) {
|
||||
const titleLink = divs[1].querySelector('a');
|
||||
const title = titleLink?.textContent?.trim() || '';
|
||||
const dateStr = divs[4].textContent?.trim() || '';
|
||||
const href = titleLink?.getAttribute('href') || '';
|
||||
|
||||
if (title.length < 5) return null; // Filter noise
|
||||
|
||||
// Construct full URL if href is relative
|
||||
const url = href.startsWith('http') ? href : `${baseUrl}${href}`;
|
||||
|
||||
return {
|
||||
title,
|
||||
dateStr,
|
||||
url
|
||||
};
|
||||
}
|
||||
return null;
|
||||
}).filter(i => i !== null);
|
||||
}, this.baseUrl);
|
||||
|
||||
if (pageResults.length === 0) {
|
||||
logger.warn(`No results found on page ${currentPage}. Extraction failed.`);
|
||||
break;
|
||||
}
|
||||
|
||||
allResults.push(...pageResults.map(r => ({
|
||||
title: r!.title,
|
||||
publishDate: new Date(r!.dateStr),
|
||||
url: r!.url
|
||||
})));
|
||||
|
||||
logger.log(`Extracted ${pageResults.length} items.`);
|
||||
|
||||
// Pagination: look for next page link
|
||||
const nextButton = await page.$('.pagination li a[page="+"]');
|
||||
if (!nextButton) break;
|
||||
|
||||
await nextButton.click();
|
||||
await new Promise(r => setTimeout(r, 3000));
|
||||
|
||||
// 模拟人类行为
|
||||
logger.log('Simulating human mouse movements...');
|
||||
await simulateHumanMouseMovement(page);
|
||||
|
||||
logger.log('Simulating human scrolling...');
|
||||
await simulateHumanScrolling(page);
|
||||
|
||||
currentPage++;
|
||||
}
|
||||
|
||||
return allResults;
|
||||
|
||||
} catch (error) {
|
||||
logger.error(`Crawl failed: ${error.message}`);
|
||||
return allResults;
|
||||
} finally {
|
||||
if (page) await page.close();
|
||||
}
|
||||
},
|
||||
|
||||
extract() { return []; }
|
||||
};
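The scheduled task below calls crawlerService.crawlAll(); that service is not part of this change, but since every crawler above takes a shared Puppeteer browser and returns { title, publishDate, url } items, the orchestration could look roughly like the following sketch (all names except the crawler objects themselves are assumptions):

// Hypothetical crawlAll-style orchestration over the crawler objects above.
import * as puppeteer from 'puppeteer';
import { CnncecpCrawler } from './services/cnncecp_target';
import { EpsCrawler } from './services/eps_target';
import { EspicCrawler } from './services/espic_target';
import { PowerbeijingCrawler } from './services/powerbeijing_target';
import { SzecpCrawler } from './services/szecp_target';

export async function crawlAll() {
  const browser = await puppeteer.launch({
    headless: true,
    args: ['--no-sandbox', '--disable-setuid-sandbox'],
  });
  const crawlers = [CnncecpCrawler, EpsCrawler, EspicCrawler, PowerbeijingCrawler, SzecpCrawler];
  const all: Array<{ title: string; publishDate: Date; url: string; source: string }> = [];
  try {
    for (const crawler of crawlers) {
      // Each crawl() already catches and logs its own failures and returns whatever it
      // collected, so one slow or blocked site does not abort the whole run.
      const results = await crawler.crawl(browser);
      all.push(...results.map(r => ({ ...r, source: crawler.name })));
    }
  } finally {
    await browser.close();
  }
  return all;
}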
@@ -12,7 +12,7 @@ export class BidCrawlTask {
     private bidsService: BidsService,
   ) {}

-  @Cron(CronExpression.EVERY_30_MINUTES)
+  @Cron(CronExpression.EVERY_DAY_AT_MIDNIGHT)
   async handleCron() {
     this.logger.debug('Scheduled crawl task started');
     await this.crawlerService.crawlAll();