diff --git a/package-lock.json b/package-lock.json index 85c1ecdeb..735ec34e6 100644 --- a/package-lock.json +++ b/package-lock.json @@ -30,7 +30,7 @@ "@adobe/spacecat-shared-slack-client": "1.5.32", "@adobe/spacecat-shared-tier-client": "1.3.10", "@adobe/spacecat-shared-tokowaka-client": "1.4.3", - "@adobe/spacecat-shared-utils": "1.86.0", + "@adobe/spacecat-shared-utils": "https://gist.github.com/tkotthakota-adobe/0bcfeb9e5daac09bb328ae94bc9dfdd7/raw/b63b067b1b5b516b65784280aa6770290626f974/adobe-spacecat-shared-utils-1.86.0.tgz", "@aws-sdk/client-s3": "3.940.0", "@aws-sdk/client-sfn": "3.940.0", "@aws-sdk/client-sqs": "3.940.0", @@ -544,6 +544,7 @@ "resolved": "https://registry.npmjs.org/@adobe/helix-universal/-/helix-universal-5.3.0.tgz", "integrity": "sha512-1eKFpKZMNamJHhq6eFm9gMLhgQunsf34mEFbaqg9ChEXZYk18SYgUu5GeNTvzk5Rzo0h9AuSwLtnI2Up2OSiSA==", "license": "Apache-2.0", + "peer": true, "dependencies": { "@adobe/fetch": "4.2.3", "aws4": "1.13.2" @@ -2651,8 +2652,8 @@ }, "node_modules/@adobe/spacecat-shared-utils": { "version": "1.86.0", - "resolved": "https://registry.npmjs.org/@adobe/spacecat-shared-utils/-/spacecat-shared-utils-1.86.0.tgz", - "integrity": "sha512-8xd3nr56K1leWGAEUE0f7UpVqfDyD5TnVXf1Ilsk4n73+BqOnD8zeowJVsL6PdZDOCRR/qgIBM1rv8jewYkvcA==", + "resolved": "https://gist.github.com/tkotthakota-adobe/0bcfeb9e5daac09bb328ae94bc9dfdd7/raw/b63b067b1b5b516b65784280aa6770290626f974/adobe-spacecat-shared-utils-1.86.0.tgz", + "integrity": "sha512-p2f+i+LBFTu8EI325TSeQNL8bU8sgcWmnITTtJ7meY4sP9uWSTzlHFGbeiLr198PE7We2Kck37hciLLltvLoDg==", "license": "Apache-2.0", "dependencies": { "@adobe/fetch": "4.2.3", @@ -3695,6 +3696,7 @@ "resolved": "https://registry.npmjs.org/@aws-sdk/client-dynamodb/-/client-dynamodb-3.940.0.tgz", "integrity": "sha512-u2sXsNJazJbuHeWICvsj6RvNyJh3isedEfPvB21jK/kxcriK+dE/izlKC2cyxUjERCmku0zTFNzY9FhrLbYHjQ==", "license": "Apache-2.0", + "peer": true, "dependencies": { "@aws-crypto/sha256-browser": "5.2.0", "@aws-crypto/sha256-js": 
"5.2.0", @@ -7662,6 +7664,7 @@ "resolved": "https://registry.npmjs.org/@langchain/core/-/core-0.3.79.tgz", "integrity": "sha512-ZLAs5YMM5N2UXN3kExMglltJrKKoW7hs3KMZFlXUnD7a5DFKBYxPFMeXA4rT+uvTxuJRZPCYX0JKI5BhyAWx4A==", "license": "MIT", + "peer": true, "dependencies": { "@cfworker/json-schema": "^4.0.2", "ansi-styles": "^5.0.0", @@ -7888,6 +7891,7 @@ "resolved": "https://registry.npmjs.org/@octokit/core/-/core-7.0.6.tgz", "integrity": "sha512-DhGl4xMVFGVIyMwswXeyzdL4uXD5OGILGX5N8Y+f6W7LhC1Ze2poSNrkF/fedpVDHEEZ+PHFW0vL14I+mm8K3Q==", "license": "MIT", + "peer": true, "dependencies": { "@octokit/auth-token": "^6.0.0", "@octokit/graphql": "^9.0.3", @@ -8094,6 +8098,7 @@ "integrity": "sha512-3giAOQvZiH5F9bMlMiv8+GSPMeqg0dbaeo58/0SlA9sxSqZhnUtxzX9/2FzyhS9sWQf5S0GJE0AKBrFqjpeYcg==", "devOptional": true, "license": "Apache-2.0", + "peer": true, "engines": { "node": ">=8.0.0" } @@ -8257,6 +8262,7 @@ "integrity": "sha512-xYLlvk/xdScGx1aEqvxLwf6sXQLXCjk3/1SQT9X9AoN5rXRhkdvIFShuNNmtTEPRBqcsMbS4p/gJLNI2wXaDuQ==", "devOptional": true, "license": "Apache-2.0", + "peer": true, "dependencies": { "@opentelemetry/core": "2.0.1", "@opentelemetry/resources": "2.0.1", @@ -10176,6 +10182,7 @@ "resolved": "https://registry.npmjs.org/@types/express/-/express-5.0.6.tgz", "integrity": "sha512-sKYVuV7Sv9fbPIt/442koC7+IIwK5olP1KWeD88e/idgoJqDm3JV/YUiPwkoKK92ylff2MGxSz1CSjsXelx0YA==", "license": "MIT", + "peer": true, "dependencies": { "@types/body-parser": "*", "@types/express-serve-static-core": "^5.0.0", @@ -10482,6 +10489,7 @@ "integrity": "sha512-NZyJarBfL7nWwIq+FDL6Zp/yHEhePMNnnJ0y3qfieCrmNvYct8uvtiV41UvlSe6apAfk0fY1FbWx+NwfmpvtTg==", "dev": true, "license": "MIT", + "peer": true, "bin": { "acorn": "bin/acorn" }, @@ -10528,6 +10536,7 @@ "integrity": "sha512-B/gBuNg5SiMTrPkC+A2+cW0RszwxYmn6VYxB/inlBStS5nx6xHIt/ehKRhIMhqusl7a8LjQoZnjCs5vhwxOQ1g==", "dev": true, "license": "MIT", + "peer": true, "dependencies": { "fast-deep-equal": "^3.1.3", "fast-uri": "^3.0.1", @@ -11003,6 +11012,7 @@ 
"resolved": "https://registry.npmjs.org/aws-xray-sdk-core/-/aws-xray-sdk-core-3.12.0.tgz", "integrity": "sha512-lwalRdxXRy+Sn49/vN7W507qqmBRk5Fy2o0a9U6XTjL9IV+oR5PUiiptoBrOcaYCiVuGld8OEbNqhm6wvV3m6A==", "license": "Apache-2.0", + "peer": true, "dependencies": { "@aws-sdk/types": "^3.4.1", "@smithy/service-error-classification": "^2.0.4", @@ -11653,6 +11663,7 @@ "integrity": "sha512-p4Z49OGG5W/WBCPSS/dH3jQ73kD6tiMmUM+bckNK6Jr5JHMG3k9bg/BvKR8lKmtVBKmOiuVaV2ws8s9oSbwysg==", "dev": true, "license": "MIT", + "peer": true, "engines": { "node": ">=18" } @@ -13829,6 +13840,7 @@ "integrity": "sha512-BhHmn2yNOFA9H9JmmIVKJmd288g9hrVRDkdoIgRCRuSySRUHH7r/DI6aAXW9T1WwUuY3DFgrcaqB+deURBLR5g==", "dev": true, "license": "MIT", + "peer": true, "dependencies": { "@eslint-community/eslint-utils": "^4.8.0", "@eslint-community/regexpp": "^4.12.1", @@ -17810,6 +17822,7 @@ "integrity": "sha512-PRsaiG84bK+AMvxziE/lCFss8juXjNaWzVbN5tXAm4XjeaS9NAHhop+PjQxz2A9h8Q4M/xGmzP8vqNwy6JeK0A==", "dev": true, "license": "MIT", + "peer": true, "bin": { "marked": "bin/marked.js" }, @@ -18937,6 +18950,7 @@ "integrity": "sha512-UczzB+0nnwGotYSgllfARAqWCJ5e/skuV2K/l+Zyck/H6pJIhLXuBnz+6vn2i211o7DtbE78HQtsYEKICHGI+g==", "dev": true, "license": "MIT", + "peer": true, "funding": { "type": "opencollective", "url": "https://opencollective.com/mobx" @@ -22083,6 +22097,7 @@ "dev": true, "inBundle": true, "license": "MIT", + "peer": true, "engines": { "node": ">=12" }, @@ -22752,6 +22767,7 @@ "resolved": "https://registry.npmjs.org/openai/-/openai-5.12.2.tgz", "integrity": "sha512-xqzHHQch5Tws5PcKR2xsZGX9xtch+JQFz5zb14dGqlshmmDAFBFEWmeIpf7wVqWV+w7Emj7jRgkNJakyKE0tYQ==", "license": "Apache-2.0", + "peer": true, "bin": { "openai": "bin/cli" }, @@ -23869,6 +23885,7 @@ "integrity": "sha512-DGrYcCWK7tvYMnWh79yrPHt+vdx9tY+1gPZa7nJQtO/p8bLTDaHp4dzwEhQB7pZ4Xe3ok4XKuEPrVuc+wlpkmw==", "devOptional": true, "license": "MIT", + "peer": true, "engines": { "node": ">=0.10.0" } @@ -23879,6 +23896,7 @@ "integrity": 
"sha512-ibrK8llX2a4eOskq1mXKu/TGZj9qzomO+sNfO98M6d9zIPOEhlBkMkBUBLd1vgS0gQsLDBzA+8jJBVXDnfHmJg==", "devOptional": true, "license": "MIT", + "peer": true, "dependencies": { "scheduler": "^0.27.0" }, @@ -24588,6 +24606,7 @@ "integrity": "sha512-phCkJ6pjDi9ANdhuF5ElS10GGdAKY6R1Pvt9lT3SFhOwM4T7QZE7MLpBDbNruUx/Q3gFD92/UOFringGipRqZA==", "dev": true, "license": "MIT", + "peer": true, "dependencies": { "@semantic-release/commit-analyzer": "^13.0.0-beta.1", "@semantic-release/error": "^4.0.0", @@ -25474,6 +25493,7 @@ "integrity": "sha512-TOgRcwFPbfGtpqvZw+hyqJDvqfapr1qUlOizROIk4bBLjlsjlB00Pg6wMFXNtJRpu+eCZuVOaLatG7M8105kAw==", "dev": true, "license": "BSD-3-Clause", + "peer": true, "dependencies": { "@sinonjs/commons": "^3.0.1", "@sinonjs/fake-timers": "^13.0.5", @@ -26039,6 +26059,7 @@ "integrity": "sha512-1v/e3Dl1BknC37cXMhwGomhO8AkYmN41CqyX9xhUDxry1ns3BFQy2lLDRQXJRdVVWB9OHemv/53xaStimvWyuA==", "dev": true, "license": "MIT", + "peer": true, "dependencies": { "@emotion/is-prop-valid": "1.2.2", "@emotion/unitless": "0.8.1", @@ -27104,6 +27125,7 @@ "resolved": "https://registry.npmjs.org/unified/-/unified-11.0.5.tgz", "integrity": "sha512-xKvGhPWw3k84Qjh8bI3ZeJjqnyadK+GEFtazSfZv/rKeTkTjOJho6mFqh2SM96iIcZokxiOpg78GazTSg8+KHA==", "license": "MIT", + "peer": true, "dependencies": { "@types/unist": "^3.0.0", "bail": "^2.0.0", @@ -27847,6 +27869,7 @@ "resolved": "https://registry.npmjs.org/ws/-/ws-8.18.3.tgz", "integrity": "sha512-PEIGCY5tSlUt50cqyMXfCzX+oOPqN0vuGqWzbcJ2xvnkzkq46oOpz7dQaTDBdfICb4N14+GARUDw2XV2N4tvzg==", "license": "MIT", + "peer": true, "engines": { "node": ">=10.0.0" }, @@ -28108,6 +28131,7 @@ "resolved": "https://registry.npmjs.org/zod/-/zod-3.25.76.tgz", "integrity": "sha512-gzUt/qt81nXsFGKIFcC3YnfEAx5NkunCfnDlvuBSSFS02bcXu4Lmea0AFIUwbLWxWPx3d9p8S5QoaujKcNQxcQ==", "license": "MIT", + "peer": true, "funding": { "url": "https://github.com/sponsors/colinhacks" } @@ -28117,6 +28141,7 @@ "resolved": 
/**
 * Builds the Slack (mrkdwn) notification explaining that onboarding was halted
 * because the site is shielded by bot protection, including the User-Agent and
 * IP addresses the customer has to allowlist.
 *
 * @param {Object} options - Options
 * @param {string} options.siteUrl - Site URL shown in the message
 * @param {Object} options.botProtection - Detection result; `type` and `confidence`
 *   (a 0..1 fraction rendered as a whole percentage) are always printed, `reason`
 *   only when it is truthy
 * @param {string} [options.environment='prod'] - 'prod' selects the production IP
 *   list; any other value selects the development list
 * @returns {string} Formatted Slack message
 */
export function formatBotProtectionSlackMessage({
  siteUrl,
  botProtection,
  environment = 'prod',
}) {
  const isProd = environment === 'prod';
  const addresses = isProd ? SPACECAT_BOT_IPS.production : SPACECAT_BOT_IPS.development;
  const bulletedIps = addresses.map((address) => `• \`${address}\``).join('\n');
  const environmentLabel = isProd ? 'Production' : 'Development';
  const confidencePercent = (botProtection.confidence * 100).toFixed(0);

  // Every entry is one line of the final message; '' entries become blank lines.
  const lines = [
    ':warning: *Bot Protection Detected*',
    '',
    `*Site:* ${siteUrl}`,
    `*Protection Type:* ${botProtection.type}`,
    `*Confidence:* ${confidencePercent}%`,
  ];

  // The reason line is optional and sits directly under the confidence line.
  if (botProtection.reason) {
    lines.push(`*Reason:* ${botProtection.reason}`);
  }

  lines.push(
    '',
    '*Onboarding stopped due to the following reasons:*',
    '• SpaceCat bot cannot access the site due to bot protection',
    '• Scraper would receive challenge pages instead of real content',
    '• Audits and opportunities cannot be generated without site access',
    '',
    '*Action Required:*',
    `Customer must allowlist SpaceCat in their ${botProtection.type} configuration:`,
    '',
    '*User-Agent to allowlist:*',
    `\`${SPACECAT_BOT_USER_AGENT}\``,
    '',
    `*${environmentLabel} IPs to allowlist:*`,
    bulletedIps,
    '',
    '_After allowlisting, re-run the onboard command to complete onboarding._',
  );

  return lines.join('\n');
}
onboardSiteModal(lambdaContext) { thread_ts: responseThreadTs, }); + const botProtectionResult = await checkBotProtectionDuringOnboarding(siteUrl, log); + + if (botProtectionResult.blocked) { + log.warn(`Bot protection detected for ${siteUrl} - stopping onboarding`, botProtectionResult); + + const environment = env.AWS_REGION?.includes('us-east') ? 'prod' : 'dev'; + const botProtectionMessage = formatBotProtectionSlackMessage({ + siteUrl, + botProtection: botProtectionResult, + environment, + }); + + await client.chat.postMessage({ + channel: responseChannel, + text: `:warning: *Bot Protection Detected for ${siteUrl}*`, + blocks: [ + { + type: 'section', + text: { + type: 'mrkdwn', + text: botProtectionMessage, + }, + }, + ], + thread_ts: responseThreadTs, + }); + + await client.chat.postMessage({ + channel: responseChannel, + text: ':x: *Onboarding stopped.* Please allowlist SpaceCat IPs and User-Agent as shown above, then re-run the onboard command.', + thread_ts: responseThreadTs, + }); + + return; + } + const reportLine = await onboardSingleSiteFromModal( siteUrl, imsOrgId, diff --git a/src/support/utils/bot-protection-check.js b/src/support/utils/bot-protection-check.js new file mode 100644 index 000000000..f042c4d4c --- /dev/null +++ b/src/support/utils/bot-protection-check.js @@ -0,0 +1,224 @@ +/* + * Copyright 2025 Adobe. All rights reserved. + * This file is licensed to you under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. You may obtain a copy + * of the License at http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR REPRESENTATIONS + * OF ANY KIND, either express or implied. See the License for the specific language + * governing permissions and limitations under the License. 
/** Error codes that this check treats as an HTTP/2 connection failure. */
const HTTP2_ERROR_CODES = [
  'NGHTTP2_INTERNAL_ERROR',
  'ERR_HTTP2_STREAM_ERROR',
  'ERR_HTTP2_STREAM_CANCEL',
];

/** Message fragments that mark an HTTP/2 failure when the error carries no code. */
const HTTP2_ERROR_MESSAGE_MARKERS = ['NGHTTP2_INTERNAL_ERROR', 'HTTP2_STREAM_ERROR'];

/** Per-request timeout for the probe fetches, in milliseconds. */
const PROBE_TIMEOUT_MS = 10000;

/** Extracts `code` and `message` from any thrown value, tolerating null/undefined. */
function describeError(error) {
  return {
    code: error?.code || '',
    message: String(error?.message || ''),
  };
}

/** Tells whether a thrown value looks like an HTTP/2 stream/connection failure. */
function isHttp2ConnectionError(error) {
  const { code, message } = describeError(error);
  return HTTP2_ERROR_CODES.includes(code)
    || HTTP2_ERROR_MESSAGE_MARKERS.some((marker) => message.includes(marker));
}

/** Builds the "blocked by HTTP/2 error" result for a single thrown error. */
function buildHttp2BlockResult(error) {
  const { code, message } = describeError(error);
  return {
    blocked: true,
    type: 'http2-block',
    confidence: 0.9,
    reason: `HTTP/2 connection error: ${message}`,
    details: {
      error: message,
      code,
    },
  };
}

/** Fetches one probe URL and reports the outcome as a value instead of throwing. */
async function probeEndpoint({ url, name }, log) {
  try {
    const response = await fetch(url, {
      method: 'GET',
      headers: {
        'User-Agent': SPACECAT_BOT_USER_AGENT,
      },
      signal: AbortSignal.timeout(PROBE_TIMEOUT_MS),
    });

    // Reading the body is part of the probe: a failure here lands in the catch below.
    const html = await response.text();

    return {
      name,
      url,
      success: true,
      response,
      html,
    };
  } catch (error) {
    const { code, message } = describeError(error);
    const isHttp2Error = isHttp2ConnectionError(error);

    log.debug(`Fetch failed for ${name}: code=${code}, message=${message}, isHttp2=${isHttp2Error}`);

    return {
      name,
      url,
      success: false,
      error,
      isHttp2Error,
    };
  }
}

/**
 * Performs a lightweight bot protection check by fetching the homepage.
 * This is a minimal check used during onboarding to determine if audits should be skipped.
 * Besides the homepage it also requests /robots.txt and /sitemap.xml, because some
 * sites let the first request through and block subsequent automated requests with
 * HTTP/2 errors.
 *
 * The function never rejects: failures that do not look like bot blocking
 * (timeouts, DNS, generic network errors) fail open with `blocked: false`.
 *
 * @param {string} baseUrl - Site base URL
 * @param {object} log - Logger exposing `info`, `warn`, `error` and `debug`
 * @returns {Promise<{blocked: boolean, type: string, confidence: number,
 *   reason?: string, details?: object, error?: string}>} Bot protection status
 */
export async function checkBotProtectionDuringOnboarding(baseUrl, log) {
  log.info(`Performing lightweight bot protection check for ${baseUrl}`);

  try {
    // The homepage MUST stay first: its result is read back by position below.
    const requests = [
      { url: baseUrl, name: 'homepage' },
      { url: new URL('/robots.txt', baseUrl).toString(), name: 'robots.txt' },
      { url: new URL('/sitemap.xml', baseUrl).toString(), name: 'sitemap.xml' },
    ];

    const results = await Promise.allSettled(
      requests.map((request) => probeEndpoint(request, log)),
    );

    // Any probe that died with an HTTP/2 error is treated as bot blocking.
    const http2Failures = results
      .filter((result) => result.status === 'fulfilled'
        && result.value?.success === false
        && result.value.isHttp2Error === true)
      .map((result) => result.value);

    if (http2Failures.length > 0) {
      log.warn(`HTTP/2 errors detected for ${baseUrl} - likely bot protection`);
      const [firstFailure] = http2Failures;
      return {
        blocked: true,
        type: 'http2-block',
        confidence: 0.9,
        reason: `HTTP/2 connection error: ${firstFailure.error?.message || 'bot blocking detected'}`,
        details: {
          failedRequests: http2Failures.map((failure) => ({
            name: failure.name,
            url: failure.url,
            error: failure.error?.message,
            code: failure.error?.code,
          })),
        },
      };
    }

    const [homepageResult] = results;
    if (homepageResult.status === 'rejected' || !homepageResult.value?.success) {
      // Homepage fetch failed completely
      const error = homepageResult.reason || homepageResult.value?.error;

      /* c8 ignore start */
      // Defensive check - in practice HTTP/2 errors are already caught by the
      // http2Failures filter above. This is a safety net in case the shape of the
      // probe result changes.
      if (error && isHttp2ConnectionError(error)) {
        log.warn(`HTTP/2 error detected on homepage for ${baseUrl} - likely bot protection`);
        return buildHttp2BlockResult(error);
      }
      /* c8 ignore stop */

      // A rejection without a reason would otherwise surface as `throw undefined`.
      throw error ?? new Error(`Homepage fetch failed for ${baseUrl}`);
    }

    const { response, html } = homepageResult.value;

    // Analyze homepage content for bot protection patterns
    const botProtection = analyzeBotProtection({
      status: response.status,
      headers: response.headers,
      html,
    });

    log.info(`Bot protection check complete for ${baseUrl}`, {
      crawlable: botProtection.crawlable,
      type: botProtection.type,
      confidence: botProtection.confidence,
    });

    return {
      blocked: !botProtection.crawlable,
      type: botProtection.type,
      confidence: botProtection.confidence,
      reason: botProtection.reason,
      details: {
        httpStatus: response.status,
        htmlSize: html.length,
      },
    };
  } catch (error) {
    log.error(`Bot protection check failed for ${baseUrl}:`, error);

    if (isHttp2ConnectionError(error)) {
      log.warn(`HTTP/2 error detected for ${baseUrl} - likely bot protection`);
      return buildHttp2BlockResult(error);
    }

    // Null-safe on purpose: anything can be thrown, including undefined.
    const { message } = describeError(error);

    // Check if error suggests bot blocking (403, 401, etc.)
    const isBotBlocking = message.includes('403')
      || message.includes('401')
      || message.includes('Forbidden')
      || error?.status === 403
      || error?.status === 401;

    if (isBotBlocking) {
      log.warn(`HTTP error suggests bot protection for ${baseUrl}`);
      return {
        blocked: true,
        type: 'http-error',
        confidence: 0.7,
        reason: `HTTP error suggests bot protection: ${message}`,
        details: {
          error: message,
        },
      };
    }

    // Other errors (timeout, DNS, network) - fail open
    // Better to try audits than block unnecessarily
    return {
      blocked: false,
      type: 'unknown',
      confidence: 0,
      error: message,
    };
  }
}
expect(result).to.include('95%'); + expect(result).to.include('Challenge page detected'); + expect(result).to.include('Production IPs to allowlist'); + expect(result).to.include('Spacecat/1.0'); + expect(result).to.include('Onboarding stopped due to the following reasons:'); + expect(result).to.include('Action Required:'); + }); + + it('formats bot protection message for development environment', () => { + const result = formatBotProtectionSlackMessage({ + siteUrl: 'https://example.com', + botProtection: { + type: 'imperva', + confidence: 0.85, + }, + environment: 'dev', + }); + + expect(result).to.include('Bot Protection Detected'); + expect(result).to.include('imperva'); + expect(result).to.include('85%'); + expect(result).to.include('Development IPs to allowlist'); + }); + + it('defaults to production environment when not specified', () => { + const result = formatBotProtectionSlackMessage({ + siteUrl: 'https://example.com', + botProtection: { + type: 'akamai', + confidence: 0.9, + }, + }); + + expect(result).to.include('Production IPs to allowlist'); + }); + + it('handles missing reason gracefully', () => { + const result = formatBotProtectionSlackMessage({ + siteUrl: 'https://example.com', + botProtection: { + type: 'datadome', + confidence: 0.8, + }, + environment: 'prod', + }); + + expect(result).to.include('datadome'); + expect(result).to.include('80%'); + expect(result).not.to.include('*Reason:*'); + }); + + it('includes all required sections', () => { + const result = formatBotProtectionSlackMessage({ + siteUrl: 'https://example.com', + botProtection: { + type: 'cloudflare', + confidence: 0.95, + }, + environment: 'prod', + }); + + expect(result).to.include('*Site:*'); + expect(result).to.include('*Protection Type:*'); + expect(result).to.include('*Confidence:*'); + expect(result).to.include('*Onboarding stopped due to the following reasons:*'); + expect(result).to.include('cannot access the site'); + expect(result).to.include('*Action Required:*'); + 
expect(result).to.include('*User-Agent to allowlist:*'); + expect(result).to.include('re-run the onboard command'); + }); + }); }); diff --git a/test/support/slack/actions/onboard-modal.test.js b/test/support/slack/actions/onboard-modal.test.js index 64e151645..dd00f12f7 100644 --- a/test/support/slack/actions/onboard-modal.test.js +++ b/test/support/slack/actions/onboard-modal.test.js @@ -27,6 +27,7 @@ let startOnboarding; let onboardSiteModal; let extractDeliveryConfigFromPreviewUrl; let triggerBrandProfileAgentStub; +let checkBotProtectionStub; describe('onboard-modal', () => { let sandbox; @@ -34,6 +35,11 @@ describe('onboard-modal', () => { before(async () => { // Mock the network-dependent modules before importing triggerBrandProfileAgentStub = sinon.stub().resolves('exec-123'); + checkBotProtectionStub = sinon.stub().resolves({ + blocked: false, + type: 'none', + confidence: 0, + }); const mockedModule = await esmock('../../../../src/support/slack/actions/onboard-modal.js', { '../../../../src/utils/slack/base.js': { @@ -64,6 +70,9 @@ describe('onboard-modal', () => { '../../../../src/support/brand-profile-trigger.js': { triggerBrandProfileAgent: (...args) => triggerBrandProfileAgentStub(...args), }, + '../../../../src/support/utils/bot-protection-check.js': { + checkBotProtectionDuringOnboarding: (...args) => checkBotProtectionStub(...args), + }, }); ({ startOnboarding, onboardSiteModal, extractDeliveryConfigFromPreviewUrl } = mockedModule); @@ -74,6 +83,12 @@ describe('onboard-modal', () => { nock.disableNetConnect(); sandbox = sinon.createSandbox(); triggerBrandProfileAgentStub.resetHistory(); + checkBotProtectionStub.resetHistory(); + checkBotProtectionStub.resolves({ + blocked: false, + type: 'none', + confidence: 0, + }); }); afterEach(() => { @@ -1106,5 +1121,136 @@ describe('onboard-modal', () => { expect(ackMock).to.have.been.called; }); + + it('should detect bot protection and stop onboarding', async () => { + // Mock bot protection detected + 
checkBotProtectionStub.resolves({ + blocked: true, + type: 'cloudflare', + confidence: 0.95, + reason: 'Challenge page detected', + details: { + httpStatus: 200, + htmlSize: 5000, + }, + }); + + const onboardSiteModalAction = onboardSiteModal(context); + + await onboardSiteModalAction({ + ack: ackMock, + body, + client: clientMock, + }); + + // Should NOT call ack() since we return early + expect(checkBotProtectionStub).to.have.been.calledOnce; + expect(checkBotProtectionStub).to.have.been.calledWith('https://example.com', sinon.match.object); + + // Verify Slack messages posted + expect(clientMock.chat.postMessage).to.have.been.called; + + // Find the bot protection alert message + const botProtectionCall = clientMock.chat.postMessage.getCalls().find( + (call) => call.args[0].text && call.args[0].text.includes('Bot Protection Detected'), + ); + expect(botProtectionCall).to.exist; + expect(botProtectionCall.args[0].blocks).to.exist; + expect(botProtectionCall.args[0].blocks[0].text.text).to.include('cloudflare'); + expect(botProtectionCall.args[0].blocks[0].text.text).to.include('95%'); + + // Verify "Onboarding stopped" message was sent + const calls = clientMock.chat.postMessage.getCalls(); + const hasStoppedMessage = calls.some( + (call) => call.args[0].text && call.args[0].text.includes('Onboarding stopped'), + ); + expect(hasStoppedMessage).to.be.true; + + // Verify allowlist instructions in stopped message + const stoppedCall = calls.find( + (call) => call.args[0].text && call.args[0].text.includes('Onboarding stopped'), + ); + expect(stoppedCall.args[0].text).to.include('allowlist SpaceCat'); + expect(stoppedCall.args[0].text).to.include('re-run the onboard command'); + }); + + it('should proceed normally when no bot protection detected', async () => { + // Mock no bot protection detected (default behavior) + checkBotProtectionStub.resolves({ + blocked: false, + type: 'none', + confidence: 0, + }); + + const onboardSiteModalAction = 
onboardSiteModal(context); + + await onboardSiteModalAction({ + ack: ackMock, + body, + client: clientMock, + }); + + expect(ackMock).to.have.been.called; + expect(checkBotProtectionStub).to.have.been.calledOnce; + + // Should still post success message (from onboardSingleSite success path) + expect(clientMock.chat.postMessage).to.have.been.called; + }); + + it('should use correct environment for bot protection message', async () => { + // Set prod environment + context.env.AWS_REGION = 'us-east-1'; + + checkBotProtectionStub.resolves({ + blocked: true, + type: 'cloudflare', + confidence: 0.95, + }); + + const onboardSiteModalAction = onboardSiteModal(context); + + await onboardSiteModalAction({ + ack: ackMock, + body, + client: clientMock, + }); + + expect(ackMock).to.have.been.called; + + // Verify the bot protection message was sent + const botProtectionCall = clientMock.chat.postMessage.getCalls().find( + (call) => call.args[0].text && call.args[0].text.includes('Bot Protection Detected'), + ); + expect(botProtectionCall).to.exist; + expect(botProtectionCall.args[0].blocks[0].text.text).to.include('Production IPs'); + }); + + it('should use dev environment when AWS_REGION does not include us-east', async () => { + // Set dev environment + context.env.AWS_REGION = 'us-west-2'; + + checkBotProtectionStub.resolves({ + blocked: true, + type: 'imperva', + confidence: 0.85, + }); + + const onboardSiteModalAction = onboardSiteModal(context); + + await onboardSiteModalAction({ + ack: ackMock, + body, + client: clientMock, + }); + + expect(ackMock).to.have.been.called; + + // Verify dev environment message + const botProtectionCall = clientMock.chat.postMessage.getCalls().find( + (call) => call.args[0].text && call.args[0].text.includes('Bot Protection Detected'), + ); + expect(botProtectionCall).to.exist; + expect(botProtectionCall.args[0].blocks[0].text.text).to.include('Development IPs'); + }); }); }); diff --git a/test/support/utils/bot-protection-check.test.js 
b/test/support/utils/bot-protection-check.test.js new file mode 100644 index 000000000..fad082246 --- /dev/null +++ b/test/support/utils/bot-protection-check.test.js @@ -0,0 +1,668 @@ +/* + * Copyright 2025 Adobe. All rights reserved. + * This file is licensed to you under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. You may obtain a copy + * of the License at http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR REPRESENTATIONS + * OF ANY KIND, either express or implied. See the License for the specific language + * governing permissions and limitations under the License. + */ + +/* eslint-env mocha */ + +import { expect, use } from 'chai'; +import sinon from 'sinon'; +import sinonChai from 'sinon-chai'; +import { checkBotProtectionDuringOnboarding } from '../../../src/support/utils/bot-protection-check.js'; + +use(sinonChai); + +describe('Bot Protection Check', () => { + let log; + let fetchStub; + let originalFetch; + + before(() => { + originalFetch = global.fetch; + }); + + after(() => { + global.fetch = originalFetch; + }); + + beforeEach(() => { + global.fetch = originalFetch; + + log = { + info: sinon.stub(), + error: sinon.stub(), + warn: sinon.stub(), + debug: sinon.stub(), + }; + + fetchStub = sinon.stub(); + global.fetch = fetchStub; + }); + + afterEach(() => { + sinon.restore(); + }); + + describe('checkBotProtectionDuringOnboarding', () => { + it('detects bot protection when challenge page is returned', async () => { + const baseUrl = 'https://example.com'; + const challengeHtml = 'Just a moment...
'; + + fetchStub.callsFake((url) => { + // Homepage returns challenge + if (url === baseUrl) { + return Promise.resolve({ + status: 200, + headers: new Headers({ + 'content-type': 'text/html', + server: 'cloudflare', + 'cf-ray': '12345', + }), + text: sinon.stub().resolves(challengeHtml), + }); + } + // Other URLs return 404 + return Promise.resolve({ + status: 404, + headers: new Headers({}), + text: sinon.stub().resolves('Not Found'), + }); + }); + + const result = await checkBotProtectionDuringOnboarding(baseUrl, log); + + expect(result.blocked).to.be.true; + expect(result.type).to.equal('cloudflare'); + expect(result.confidence).to.be.greaterThan(0.8); + expect(result.details.httpStatus).to.equal(200); + expect(result.details.htmlSize).to.equal(challengeHtml.length); + expect(log.info).to.have.been.calledWith( + `Performing lightweight bot protection check for ${baseUrl}`, + ); + }); + + it('detects no bot protection when site returns normal content', async () => { + const baseUrl = 'https://example.com'; + const normalHtml = 'Welcome

Hello World

This is normal content with plenty of text to avoid being flagged as suspiciously short.

'; + + fetchStub.callsFake((_) => Promise.resolve({ + status: 200, + headers: new Headers({ + 'content-type': 'text/html', + }), + text: sinon.stub().resolves(normalHtml), + })); + + const result = await checkBotProtectionDuringOnboarding(baseUrl, log); + + expect(result.blocked).to.be.false; + expect(result.type).to.equal('none'); + expect(result.details.httpStatus).to.equal(200); + expect(log.info).to.have.been.calledWith( + `Bot protection check complete for ${baseUrl}`, + sinon.match({ + crawlable: true, + type: 'none', + confidence: 1, + }), + ); + }); + + it('detects bot protection with 403 status', async () => { + const baseUrl = 'https://example.com'; + + fetchStub.resolves({ + status: 403, + headers: new Headers({ + server: 'cloudflare', + 'cf-ray': '12345', + }), + text: sinon.stub().resolves('Forbidden'), + }); + + const result = await checkBotProtectionDuringOnboarding(baseUrl, log); + + expect(result.blocked).to.be.true; + expect(result.type).to.equal('cloudflare'); + expect(result.details.httpStatus).to.equal(403); + }); + + it('handles fetch errors gracefully (fail open)', async () => { + const baseUrl = 'https://example.com'; + const error = new Error('Network error'); + + fetchStub.rejects(error); + + const result = await checkBotProtectionDuringOnboarding(baseUrl, log); + + expect(result.blocked).to.be.false; + expect(result.type).to.equal('unknown'); + expect(result.confidence).to.equal(0); + expect(result.error).to.equal('Network error'); + expect(log.error).to.have.been.calledWith( + `Bot protection check failed for ${baseUrl}:`, + error, + ); + }); + + it('handles timeout errors gracefully', async () => { + const baseUrl = 'https://example.com'; + const error = new Error('The operation was aborted'); + + fetchStub.rejects(error); + + const result = await checkBotProtectionDuringOnboarding(baseUrl, log); + + expect(result.blocked).to.be.false; + expect(result.type).to.equal('unknown'); + expect(result.error).to.equal('The operation was
aborted'); + }); + + it('includes reason when provided by analyzeBotProtection', async () => { + const baseUrl = 'https://example.com'; + const challengeHtml = 'Just a moment...Challenge page'; + + fetchStub.resolves({ + status: 200, + headers: new Headers({ + server: 'cloudflare', + }), + text: sinon.stub().resolves(challengeHtml), + }); + + const result = await checkBotProtectionDuringOnboarding(baseUrl, log); + + expect(result.blocked).to.be.true; + expect(result.reason).to.exist; + }); + + it('handles errors with undefined message', async () => { + const baseUrl = 'https://example.com'; + const error = new Error(); + delete error.message; // No-op: message is inherited from Error.prototype and stays '' + + fetchStub.rejects(error); + + const result = await checkBotProtectionDuringOnboarding(baseUrl, log); + + expect(result.blocked).to.be.false; + expect(result.type).to.equal('unknown'); + expect(result.confidence).to.equal(0); + expect(result.error).to.equal(''); + }); + + it('handles errors with null message', async () => { + const baseUrl = 'https://example.com'; + const error = new Error(); + error.message = null; + + fetchStub.rejects(error); + + const result = await checkBotProtectionDuringOnboarding(baseUrl, log); + + expect(result.blocked).to.be.false; + expect(result.type).to.equal('unknown'); + expect(result.confidence).to.equal(0); + expect(result.error).to.equal(''); + }); + + it('detects HTTP/2 error (NGHTTP2_INTERNAL_ERROR) on homepage', async () => { + const baseUrl = 'https://bmw.fr'; + const http2Error = new Error('Stream closed with error code NGHTTP2_INTERNAL_ERROR'); + http2Error.code = 'NGHTTP2_INTERNAL_ERROR'; + + fetchStub.rejects(http2Error); + + const result = await checkBotProtectionDuringOnboarding(baseUrl, log); + + expect(result.blocked).to.be.true; + expect(result.type).to.equal('http2-block'); + expect(result.confidence).to.equal(0.9); + expect(result.reason).to.include('HTTP/2 connection error'); + expect(result.details.failedRequests).to.be.an('array'); +
expect(result.details.failedRequests[0].code).to.equal('NGHTTP2_INTERNAL_ERROR'); + expect(log.warn).to.have.been.calledWith( + `HTTP/2 errors detected for ${baseUrl} - likely bot protection`, + ); + }); + + it('detects HTTP/2 error (ERR_HTTP2_STREAM_ERROR) on homepage', async () => { + const baseUrl = 'https://example.com'; + const http2Error = new Error('HTTP/2 stream error'); + http2Error.code = 'ERR_HTTP2_STREAM_ERROR'; + + fetchStub.rejects(http2Error); + + const result = await checkBotProtectionDuringOnboarding(baseUrl, log); + + expect(result.blocked).to.be.true; + expect(result.type).to.equal('http2-block'); + expect(result.confidence).to.equal(0.9); + expect(result.reason).to.include('HTTP/2 connection error'); + }); + + it('detects HTTP/2 error on subsequent requests (robots.txt)', async () => { + const baseUrl = 'https://bmw.fr'; + const normalHtml = 'Welcome

BMW

'; + + fetchStub.callsFake((url) => { + // Homepage succeeds + if (url === baseUrl) { + return Promise.resolve({ + status: 200, + headers: new Headers({ 'content-type': 'text/html' }), + text: sinon.stub().resolves(normalHtml), + }); + } + // Every other URL (e.g. robots.txt) rejects with an HTTP/2 error + const http2Error = new Error('Stream closed with error code NGHTTP2_INTERNAL_ERROR'); + http2Error.code = 'NGHTTP2_INTERNAL_ERROR'; + return Promise.reject(http2Error); + }); + + const result = await checkBotProtectionDuringOnboarding(baseUrl, log); + + expect(result.blocked).to.be.true; + expect(result.type).to.equal('http2-block'); + expect(result.confidence).to.equal(0.9); + expect(result.reason).to.include('HTTP/2 connection error'); + expect(result.details.failedRequests).to.be.an('array'); + expect(result.details.failedRequests.length).to.be.greaterThan(0); + expect(log.warn).to.have.been.calledWith( + `HTTP/2 errors detected for ${baseUrl} - likely bot protection`, + ); + }); + + it('detects HTTP/2 error in error message (without code)', async () => { + const baseUrl = 'https://example.com'; + const http2Error = new Error('Fetch failed: NGHTTP2_INTERNAL_ERROR stream closed'); + + fetchStub.rejects(http2Error); + + const result = await checkBotProtectionDuringOnboarding(baseUrl, log); + + expect(result.blocked).to.be.true; + expect(result.type).to.equal('http2-block'); + expect(result.confidence).to.equal(0.9); + expect(result.reason).to.include('HTTP/2 connection error'); + }); + + it('detects multiple HTTP/2 errors across requests', async () => { + const baseUrl = 'https://example.com'; + const http2Error = new Error('HTTP2_STREAM_ERROR'); + http2Error.code = 'ERR_HTTP2_STREAM_ERROR'; + + fetchStub.rejects(http2Error); + + const result = await checkBotProtectionDuringOnboarding(baseUrl, log); + + expect(result.blocked).to.be.true; + expect(result.type).to.equal('http2-block'); + expect(result.confidence).to.equal(0.9); + }); + + it('continues normally if only non-critical requests
fail', async () => { + const baseUrl = 'https://example.com'; + const normalHtml = 'Welcome

Normal Site

With plenty of content

'; + + fetchStub.callsFake((url) => { + // Homepage succeeds + if (url === baseUrl) { + return Promise.resolve({ + status: 200, + headers: new Headers({ 'content-type': 'text/html' }), + text: sinon.stub().resolves(normalHtml), + }); + } + // Every other URL resolves with a 404 response (an HTTP status, not an HTTP/2 error) + return Promise.resolve({ + status: 404, + headers: new Headers({}), + text: sinon.stub().resolves('Not Found'), + }); + }); + + const result = await checkBotProtectionDuringOnboarding(baseUrl, log); + + expect(result.blocked).to.be.false; + expect(result.type).to.equal('none'); + }); + + it('detects HTTP/2 error after Promise.allSettled completes', async () => { + const baseUrl = 'https://example.com'; + const normalHtml = 'Welcome

Normal Site

'; + const http2Error = new Error('Stream closed with error code NGHTTP2_INTERNAL_ERROR'); + http2Error.code = 'NGHTTP2_INTERNAL_ERROR'; + + fetchStub.callsFake((url) => { + // Homepage fails with HTTP/2 error in text() + if (url === baseUrl) { + return Promise.resolve({ + status: 200, + headers: new Headers({ 'content-type': 'text/html' }), + text: sinon.stub().rejects(http2Error), + }); + } + // Other URLs succeed + return Promise.resolve({ + status: 200, + headers: new Headers({}), + text: sinon.stub().resolves(normalHtml), + }); + }); + + const result = await checkBotProtectionDuringOnboarding(baseUrl, log); + + expect(result.blocked).to.be.true; + expect(result.type).to.equal('http2-block'); + expect(result.confidence).to.equal(0.9); + expect(result.reason).to.include('HTTP/2 connection error'); + expect(result.details.failedRequests).to.be.an('array'); + expect(result.details.failedRequests[0].code).to.equal('NGHTTP2_INTERNAL_ERROR'); + expect(log.warn).to.have.been.calledWith( + `HTTP/2 errors detected for ${baseUrl} - likely bot protection`, + ); + }); + + it('detects HTTP error (403) in outer catch block', async () => { + const baseUrl = 'https://example.com'; + const error403 = new Error('Request failed with status 403'); + error403.status = 403; + + fetchStub.callsFake((url) => { + // Homepage returns response but text() throws 403 error + if (url === baseUrl) { + return Promise.resolve({ + status: 403, + headers: new Headers({}), + text: sinon.stub().rejects(error403), + }); + } + return Promise.resolve({ + status: 200, + headers: new Headers({}), + text: sinon.stub().resolves('OK'), + }); + }); + + const result = await checkBotProtectionDuringOnboarding(baseUrl, log); + + expect(result.blocked).to.be.true; + expect(result.type).to.equal('http-error'); + expect(result.confidence).to.equal(0.7); + expect(result.reason).to.include('HTTP error suggests bot protection'); + expect(log.warn).to.have.been.calledWith( + `HTTP error suggests bot protection for
${baseUrl}`, + ); + }); + + it('detects HTTP error (401) message in outer catch block', async () => { + const baseUrl = 'https://example.com'; + const error401 = new Error('401 Unauthorized'); + + fetchStub.callsFake((url) => { + // Homepage returns response but text() throws 401 error + if (url === baseUrl) { + return Promise.resolve({ + status: 200, + headers: new Headers({}), + text: sinon.stub().rejects(error401), + }); + } + return Promise.resolve({ + status: 200, + headers: new Headers({}), + text: sinon.stub().resolves('OK'), + }); + }); + + const result = await checkBotProtectionDuringOnboarding(baseUrl, log); + + expect(result.blocked).to.be.true; + expect(result.type).to.equal('http-error'); + expect(result.confidence).to.equal(0.7); + expect(result.reason).to.include('HTTP error suggests bot protection'); + }); + + it('detects Forbidden error in outer catch block', async () => { + const baseUrl = 'https://example.com'; + const forbiddenError = new Error('Forbidden'); + + fetchStub.callsFake((url) => { + // Homepage returns response but text() throws Forbidden error + if (url === baseUrl) { + return Promise.resolve({ + status: 200, + headers: new Headers({}), + text: sinon.stub().rejects(forbiddenError), + }); + } + return Promise.resolve({ + status: 200, + headers: new Headers({}), + text: sinon.stub().resolves('OK'), + }); + }); + + const result = await checkBotProtectionDuringOnboarding(baseUrl, log); + + expect(result.blocked).to.be.true; + expect(result.type).to.equal('http-error'); + expect(result.confidence).to.equal(0.7); + expect(result.reason).to.include('HTTP error suggests bot protection'); + }); + + it('detects HTTP/2 error with ERR_HTTP2_STREAM_CANCEL code in outer catch', async () => { + const baseUrl = 'https://example.com'; + const http2Error = new Error('HTTP/2 stream cancelled'); + http2Error.code = 'ERR_HTTP2_STREAM_CANCEL'; + + fetchStub.callsFake((url) => { + // Homepage returns response but text() throws HTTP/2 error + if (url ===
baseUrl) { + return Promise.resolve({ + status: 200, + headers: new Headers({}), + text: sinon.stub().rejects(http2Error), + }); + } + return Promise.resolve({ + status: 200, + headers: new Headers({}), + text: sinon.stub().resolves('OK'), + }); + }); + + const result = await checkBotProtectionDuringOnboarding(baseUrl, log); + + expect(result.blocked).to.be.true; + expect(result.type).to.equal('http2-block'); + expect(result.confidence).to.equal(0.9); + expect(result.reason).to.include('HTTP/2 connection error'); + }); + + it('detects HTTP/2 error when analyzeBotProtection accesses response properties that throw', async () => { + const baseUrl = 'https://example.com'; + const http2Error = new Error('ERR_HTTP2_STREAM_ERROR accessing response'); + http2Error.code = 'ERR_HTTP2_STREAM_ERROR'; + const normalHtml = 'Normal content'; + + fetchStub.callsFake((url) => { + if (url === baseUrl) { + // Create response with getter that throws when analyzeBotProtection accesses .status + return Promise.resolve({ + get status() { throw http2Error; }, + headers: new Headers({}), + text: sinon.stub().resolves(normalHtml), + }); + } + return Promise.resolve({ + status: 200, + headers: new Headers({}), + text: sinon.stub().resolves(normalHtml), + }); + }); + + const result = await checkBotProtectionDuringOnboarding(baseUrl, log); + + expect(result.blocked).to.be.true; + expect(result.type).to.equal('http2-block'); + expect(result.confidence).to.equal(0.9); + expect(result.reason).to.include('HTTP/2 connection error'); + expect(log.warn).to.have.been.calledWith( + `HTTP/2 error detected for ${baseUrl} - likely bot protection`, + ); + }); + + it('detects HTTP/2 error with NGHTTP2 in message during analysis', async () => { + const baseUrl = 'https://example.com'; + const http2Error = new Error('Stream error: NGHTTP2_INTERNAL_ERROR'); + const normalHtml = 'Content'; + + fetchStub.callsFake((url) => { + if (url === baseUrl) { + // Response succeeds but accessing headers throws + return
Promise.resolve({ + status: 200, + get headers() { throw http2Error; }, + text: sinon.stub().resolves(normalHtml), + }); + } + return Promise.resolve({ + status: 200, + headers: new Headers({}), + text: sinon.stub().resolves(normalHtml), + }); + }); + + const result = await checkBotProtectionDuringOnboarding(baseUrl, log); + + expect(result.blocked).to.be.true; + expect(result.type).to.equal('http2-block'); + expect(result.confidence).to.equal(0.9); + expect(result.reason).to.include('HTTP/2 connection error'); + expect(result.reason).to.include('NGHTTP2_INTERNAL_ERROR'); + }); + + it('detects HTTP/2 error in homepage check when first filter misses it', async () => { + const baseUrl = 'https://example.com'; + const normalHtml = 'Content'; + + // Create an error whose message contains no HTTP/2 keyword; + // only the code assigned further below marks it as HTTP/2 + const subtleError = new Error('Request failed'); + // error.code is assigned after the stub is configured (see below) + + let firstCall = true; + fetchStub.callsFake((url) => { + if (url === baseUrl) { + if (firstCall) { + firstCall = false; + // fetch resolves, but text() rejects with subtleError; the error's + // message ('Request failed') contains no HTTP/2 keyword + return Promise.resolve({ + status: 200, + headers: new Headers({}), + text: sinon.stub().rejects(subtleError), + }); + } + } + // Other requests succeed + return Promise.resolve({ + status: 200, + headers: new Headers({}), + text: sinon.stub().resolves(normalHtml), + }); + }); + + // NOTE: the HTTP/2 code is assigned before the function under test is called, + // so the rejection already carries the code when it is first inspected; + // nothing changes the error while the check is running + subtleError.code = 'ERR_HTTP2_STREAM_CANCEL'; + + const result = await checkBotProtectionDuringOnboarding(baseUrl, log); + + expect(result.blocked).to.be.true; + expect(result.type).to.equal('http2-block'); + expect(result.confidence).to.equal(0.9); + expect(result.reason).to.include('HTTP/2 connection error'); + // When
caught by first filter, code is in failedRequests array + expect(result.details.failedRequests).to.be.an('array'); + expect(result.details.failedRequests[0].code).to.equal('ERR_HTTP2_STREAM_CANCEL'); + expect(log.warn).to.have.been.calledWith( + `HTTP/2 errors detected for ${baseUrl} - likely bot protection`, + ); + }); + + it('detects HTTP/2 error in homepage check via message pattern only', async () => { + const baseUrl = 'https://example.com'; + const normalHtml = 'Content'; + + // Create error with an HTTP/2 keyword in its message and no code; + // detection therefore has to rely on the message text alone + const messageError = new Error('Connection terminated: HTTP2_STREAM_ERROR detected'); + // NO error.code set + + fetchStub.callsFake((url) => { + if (url === baseUrl) { + return Promise.resolve({ + status: 200, + headers: new Headers({}), + text: sinon.stub().rejects(messageError), + }); + } + return Promise.resolve({ + status: 200, + headers: new Headers({}), + text: sinon.stub().resolves(normalHtml), + }); + }); + + const result = await checkBotProtectionDuringOnboarding(baseUrl, log); + + expect(result.blocked).to.be.true; + expect(result.type).to.equal('http2-block'); + expect(result.confidence).to.equal(0.9); + expect(result.reason).to.include('HTTP2_STREAM_ERROR'); + }); + + it('uses fallback reason when error message is undefined', async () => { + const baseUrl = 'https://example.com'; + const normalHtml = 'Content'; + + // Create error with HTTP/2 code and an empty message ('') + const noMessageError = new Error(); + delete noMessageError.message; // No-op: message is inherited ('') from Error.prototype + noMessageError.code = 'NGHTTP2_INTERNAL_ERROR'; + + fetchStub.callsFake((url) => { + if (url === baseUrl) { + return Promise.resolve({ + status: 200, + headers: new Headers({}), + text: sinon.stub().rejects(noMessageError), + }); + } + return Promise.resolve({ + status: 200, + headers: new Headers({}), + text: sinon.stub().resolves(normalHtml), + }); + }); + + const result = await
checkBotProtectionDuringOnboarding(baseUrl, log); + + expect(result.blocked).to.be.true; + expect(result.type).to.equal('http2-block'); + expect(result.confidence).to.equal(0.9); + expect(result.reason).to.include('bot blocking detected'); + }); + }); +});