feat(crawler): add human-like behavior simulation to prevent detection

refactor(manifest): update appid and format manifest file
This commit is contained in:
dmy
2026-01-15 15:34:00 +08:00
parent 36cbb6fda1
commit eba5c7e5c5
3 changed files with 155 additions and 75 deletions

View File

@@ -1,10 +1,50 @@
import * as puppeteer from 'puppeteer';
import { Logger } from '@nestjs/common';
/**
 * Drives the page's mouse through a random sequence of points inside the
 * current viewport, pausing after every move.
 *
 * When `page.viewport()` returns a falsy value the function returns without
 * touching the mouse. Otherwise it performs between 5 and 9 moves. Each move
 * targets a random integer coordinate within the viewport's width/height,
 * passes a `steps` option between 10 and 29, and is followed by a pause of
 * 100 ms up to (but not including) 500 ms.
 *
 * @param page - Puppeteer page whose mouse is moved.
 */
async function simulateHumanMouseMovement(page: puppeteer.Page) {
  const bounds = page.viewport();
  if (bounds) {
    // Small local sleep helper so each pause is a single awaited call.
    const pause = (ms: number) => new Promise((done) => setTimeout(done, ms));

    let remaining = 5 + Math.floor(Math.random() * 5);
    while (remaining > 0) {
      const targetX = Math.floor(Math.random() * bounds.width);
      const targetY = Math.floor(Math.random() * bounds.height);
      const stepCount = 10 + Math.floor(Math.random() * 20);

      await page.mouse.move(targetX, targetY, { steps: stepCount });
      await pause(100 + Math.random() * 400);

      remaining -= 1;
    }
  }
}
/**
 * Scrolls the page down in several random increments and then back to the
 * top, pausing between actions.
 *
 * Performs between 3 and 7 downward `window.scrollBy` calls with smooth
 * behavior, each by 100–499 px and each followed by a pause of 500 ms up to
 * (but not including) 1500 ms. Afterwards it smooth-scrolls to `top: 0` and
 * waits a fixed 1000 ms.
 *
 * @param page - Puppeteer page in which the scrolling is evaluated.
 */
async function simulateHumanScrolling(page: puppeteer.Page) {
  // Small local sleep helper so each pause is a single awaited call.
  const pause = (ms: number) => new Promise((done) => setTimeout(done, ms));

  const totalScrolls = 3 + Math.floor(Math.random() * 5);
  let performed = 0;
  while (performed < totalScrolls) {
    const offset = 100 + Math.floor(Math.random() * 400);

    // The callback is handed to page.evaluate, so it relies only on its own
    // argument and the page's `window`, never on variables from this scope.
    await page.evaluate((amount) => {
      window.scrollBy({ top: amount, behavior: 'smooth' });
    }, offset);
    await pause(500 + Math.random() * 1000);

    performed += 1;
  }

  // Return to the top of the document before handing control back.
  await page.evaluate(() => {
    window.scrollTo({ top: 0, behavior: 'smooth' });
  });
  await pause(1000);
}
/**
 * One result record with a title, a publish date and a URL.
 */
export interface ChdtpResult {
  /** Title text of the record. */
  title: string;
  /** Publish date of the record. */
  publishDate: Date;
  /** URL of the record; necessary for system uniqueness. */
  url: string;
}
interface ChdtpCrawlerType {
@@ -101,6 +141,12 @@ export const ChdtpCrawler = {
logger,
);
logger.log('Simulating human mouse movements...');
await simulateHumanMouseMovement(page);
logger.log('Simulating human scrolling...');
await simulateHumanScrolling(page);
while (currentPage <= maxPages) {
const content = await page.content();
const pageResults = this.extract(content);
@@ -115,6 +161,12 @@ export const ChdtpCrawler = {
`Extracted ${pageResults.length} items from page ${currentPage}`,
);
logger.log('Simulating human mouse movements before pagination...');
await simulateHumanMouseMovement(page);
logger.log('Simulating human scrolling before pagination...');
await simulateHumanScrolling(page);
// Find the "Next Page" button
// Using partial match for src to be robust against path variations
const nextButtonSelector = 'input[type="image"][src*="page-next.png"]';
@@ -125,9 +177,6 @@ export const ChdtpCrawler = {
break;
}
// Optional: Check if the button is disabled (though image inputs usually aren't "disabled" in the same way)
// For this specific site, we'll try to click.
logger.log(`Navigating to page ${currentPage + 1}...`);
try {
@@ -149,6 +198,12 @@ export const ChdtpCrawler = {
currentPage++;
logger.log('Simulating human mouse movements after pagination...');
await simulateHumanMouseMovement(page);
logger.log('Simulating human scrolling after pagination...');
await simulateHumanScrolling(page);
// Random delay between pages
const delay = Math.floor(Math.random() * (3000 - 1000 + 1)) + 1000;
await new Promise((resolve) => setTimeout(resolve, delay));