feat(crawler): add human-like behavior simulation to prevent detection
refactor(manifest): update appid and format manifest file
This commit is contained in:
@@ -1,10 +1,50 @@
|
||||
import * as puppeteer from 'puppeteer';
|
||||
import { Logger } from '@nestjs/common';
|
||||
|
||||
/**
 * Drags the mouse pointer across the page through a series of random points.
 *
 * Performs between 5 and 9 moves. Each move targets a random coordinate inside
 * the current viewport, is interpolated over 10 to 29 intermediate steps, and
 * is followed by a pause of roughly 100 to 500 ms. When the page reports no
 * viewport, the function returns without doing anything.
 *
 * @param page Puppeteer page whose mouse is moved.
 */
async function simulateHumanMouseMovement(page: puppeteer.Page) {
  // Small local helper: resolve after the given number of milliseconds.
  const pause = (ms: number) =>
    new Promise((resolve) => setTimeout(resolve, ms));

  const bounds = page.viewport();
  if (bounds) {
    const totalMoves = 5 + Math.floor(Math.random() * 5);

    for (let remaining = totalMoves; remaining > 0; remaining--) {
      // Pick the target first, then the step count, so the random draws keep
      // the same order on every iteration.
      const targetX = Math.floor(Math.random() * bounds.width);
      const targetY = Math.floor(Math.random() * bounds.height);
      const stepCount = 10 + Math.floor(Math.random() * 20);

      await page.mouse.move(targetX, targetY, { steps: stepCount });

      // Idle briefly before the next move.
      await pause(100 + Math.random() * 400);
    }
  }
}
|
||||
|
||||
/**
 * Scrolls the page downwards in several random increments, then back to the top.
 *
 * Performs between 3 and 7 smooth scrolls of 100 to 499 pixels each, pausing
 * roughly 500 to 1500 ms after every scroll. Afterwards it smooth-scrolls back
 * to the top of the page and waits one more second.
 *
 * @param page Puppeteer page in which the scrolling is executed.
 */
async function simulateHumanScrolling(page: puppeteer.Page) {
  // Small local helper: resolve after the given number of milliseconds.
  const pause = (ms: number) =>
    new Promise((resolve) => setTimeout(resolve, ms));

  let scrollsLeft = 3 + Math.floor(Math.random() * 5);

  while (scrollsLeft > 0) {
    const pixels = 100 + Math.floor(Math.random() * 400);

    // Runs inside the browser context; the distance is passed as an argument.
    await page.evaluate((offset) => {
      window.scrollBy({ top: offset, behavior: 'smooth' });
    }, pixels);

    await pause(500 + Math.random() * 1000);
    scrollsLeft -= 1;
  }

  // Return to the top of the page and wait a fixed second afterwards.
  await page.evaluate(() => {
    window.scrollTo({ top: 0, behavior: 'smooth' });
  });
  await pause(1000);
}
|
||||
|
||||
/**
 * A result record consisting of a title, a publish date, and a URL.
 */
export interface ChdtpResult {
  /** Title text of the entry. */
  title: string;
  /** Publication date of the entry. */
  publishDate: Date;
  /** Necessary for system uniqueness. */
  url: string;
}
|
||||
|
||||
interface ChdtpCrawlerType {
|
||||
@@ -101,6 +141,12 @@ export const ChdtpCrawler = {
|
||||
logger,
|
||||
);
|
||||
|
||||
logger.log('Simulating human mouse movements...');
|
||||
await simulateHumanMouseMovement(page);
|
||||
|
||||
logger.log('Simulating human scrolling...');
|
||||
await simulateHumanScrolling(page);
|
||||
|
||||
while (currentPage <= maxPages) {
|
||||
const content = await page.content();
|
||||
const pageResults = this.extract(content);
|
||||
@@ -115,6 +161,12 @@ export const ChdtpCrawler = {
|
||||
`Extracted ${pageResults.length} items from page ${currentPage}`,
|
||||
);
|
||||
|
||||
logger.log('Simulating human mouse movements before pagination...');
|
||||
await simulateHumanMouseMovement(page);
|
||||
|
||||
logger.log('Simulating human scrolling before pagination...');
|
||||
await simulateHumanScrolling(page);
|
||||
|
||||
// Find the "Next Page" button
|
||||
// Using partial match for src to be robust against path variations
|
||||
const nextButtonSelector = 'input[type="image"][src*="page-next.png"]';
|
||||
@@ -125,9 +177,6 @@ export const ChdtpCrawler = {
|
||||
break;
|
||||
}
|
||||
|
||||
// Optional: Check if the button is disabled (though image inputs usually aren't "disabled" in the same way)
|
||||
// For this specific site, we'll try to click.
|
||||
|
||||
logger.log(`Navigating to page ${currentPage + 1}...`);
|
||||
|
||||
try {
|
||||
@@ -149,6 +198,12 @@ export const ChdtpCrawler = {
|
||||
|
||||
currentPage++;
|
||||
|
||||
logger.log('Simulating human mouse movements after pagination...');
|
||||
await simulateHumanMouseMovement(page);
|
||||
|
||||
logger.log('Simulating human scrolling after pagination...');
|
||||
await simulateHumanScrolling(page);
|
||||
|
||||
// Random delay between pages
|
||||
const delay = Math.floor(Math.random() * (3000 - 1000 + 1)) + 1000;
|
||||
await new Promise((resolve) => setTimeout(resolve, delay));
|
||||
|
||||
Reference in New Issue
Block a user