From 8c03adc2fbde7c97feab1f8311af0c6b7ad66b17 Mon Sep 17 00:00:00 2001
From: MQ
Date: Mon, 31 Mar 2025 15:42:57 +0200
Subject: [PATCH 1/3] Update Dockerfile to use distroless image and streamline build process

---
 .actor/Dockerfile | 62 ++++++++++-------------------------------------
 1 file changed, 13 insertions(+), 49 deletions(-)

diff --git a/.actor/Dockerfile b/.actor/Dockerfile
index 7bd8102..9eb57a4 100644
--- a/.actor/Dockerfile
+++ b/.actor/Dockerfile
@@ -1,65 +1,29 @@
-# Specify the base Docker image. You can read more about
-# the available images at https://crawlee.dev/docs/guides/docker-images
-# You can also use any other image from Docker Hub.
-FROM apify/actor-node-playwright-chrome:22-1.46.0 AS builder
+FROM node:22-bookworm AS builder
+
+WORKDIR /app
 
 # Copy just package.json and package-lock.json
 # to speed up the build using Docker layer cache.
-COPY --chown=myuser package*.json ./
+COPY package*.json ./
 
 # Install all dependencies. Don't audit to speed up the installation.
-RUN npm install --include=dev --audit=false
+RUN npm ci
+RUN npx -y playwright install --with-deps firefox
 
-# Next, copy the source files using the user set
-# in the base image.
-COPY --chown=myuser . ./
+# Next, copy the source files.
+COPY . ./
 
 # Install all dependencies and build the project.
 # Don't audit to speed up the installation.
 RUN npm run build
 
 # Create final image
-FROM apify/actor-node-playwright-firefox:22-1.46.0
-
-# Copy just package.json and package-lock.json
-# to speed up the build using Docker layer cache.
-COPY --chown=myuser package*.json ./
-
-# Install NPM packages, skip optional and development dependencies to
-# keep the image small. Avoid logging too much and print the dependency
-# tree for debugging
-RUN npm --quiet set progress=false \
-    && npm install --omit=dev --omit=optional \
-    && echo "Installed NPM packages:" \
-    && (npm list --omit=dev --all || true) \
-    && echo "Node.js version:" \
-    && node --version \
-    && echo "NPM version:" \
-    && npm --version \
-    && rm -r ~/.npm
-
-# Remove the existing firefox installation
-RUN rm -rf ${PLAYWRIGHT_BROWSERS_PATH}/*
-
-# Install all required playwright dependencies for firefox
-RUN npx playwright install firefox
-# symlink the firefox binary to the root folder in order to bypass the versioning and resulting browser launch crashes.
-RUN ln -s ${PLAYWRIGHT_BROWSERS_PATH}/firefox-*/firefox/firefox ${PLAYWRIGHT_BROWSERS_PATH}/
-
-# Overrides the dynamic library used by Firefox to determine trusted root certificates with p11-kit-trust.so, which loads the system certificates.
-RUN rm $PLAYWRIGHT_BROWSERS_PATH/firefox-*/firefox/libnssckbi.so
-RUN ln -s /usr/lib/x86_64-linux-gnu/pkcs11/p11-kit-trust.so $(ls -d $PLAYWRIGHT_BROWSERS_PATH/firefox-*)/firefox/libnssckbi.so
-
-# Copy built JS files from builder image
-COPY --from=builder --chown=myuser /home/myuser/dist ./dist
+FROM gcr.io/distroless/nodejs22-debian12
 
-# Next, copy the remaining files and directories with the source code.
-# Since we do this after NPM install, quick build will be really fast
-# for most source file changes.
-COPY --chown=myuser . ./
+WORKDIR /app
 
-# Disable experimental feature warning from Node.js
-ENV NODE_NO_WARNINGS=1
+# Copy the node_modules and built app from the build stage
+COPY --from=builder /app /app
 
 # Run the image.
-CMD npm run start:prod --silent
+CMD ["/app/dist/src/main.js", "--silent"]

From a8330b0b0d4bb543724a711a17eced0143bede92 Mon Sep 17 00:00:00 2001
From: MQ
Date: Wed, 2 Apr 2025 16:55:29 +0200
Subject: [PATCH 2/3] fix dockerfile

---
 .actor/Dockerfile | 22 +++++++++++++++++++---
 1 file changed, 19 insertions(+), 3 deletions(-)

diff --git a/.actor/Dockerfile b/.actor/Dockerfile
index 9eb57a4..1d650f9 100644
--- a/.actor/Dockerfile
+++ b/.actor/Dockerfile
@@ -8,15 +8,25 @@ COPY package*.json ./
 
 # Install all dependencies. Don't audit to speed up the installation.
 RUN npm ci
-RUN npx -y playwright install --with-deps firefox
 
-# Next, copy the source files.
+# Copy the source files.
 COPY . ./
 
 # Install all dependencies and build the project.
-# Don't audit to speed up the installation.
 RUN npm run build
 
+# Install Playwright and Firefox.
+RUN npx -y playwright install --with-deps firefox
+
+# Move the installed browser to a separate directory.
+RUN mkdir -p /app/ms-playwright \
+    && mv $(find ~/.cache/ms-playwright -type d -name "firefox" -path "*/firefox" | head -n 1) /app/ms-playwright/firefox
+
+# Move Firefox libs to a separate directory for later copy.
+RUN apt-get -y install strace && \
+    mkdir -p /firefox/lib/ && \
+    strace -e openat /app/ms-playwright/firefox/firefox 2>&1 | grep /lib/x86 | awk -F'"' '{print $2}' | xargs -I {} cp {} /firefox/lib/
+
 # Create final image
 FROM gcr.io/distroless/nodejs22-debian12
 
@@ -25,5 +35,11 @@ WORKDIR /app
 # Copy the node_modules and built app from the build stage
 COPY --from=builder /app /app
 
+# Copy Firefox libs
+COPY --from=builder /firefox/lib/* /lib/x86_64-linux-gnu/
+
+# Set the default browser path to the installed Firefox.
+ENV APIFY_DEFAULT_BROWSER_PATH=/app/ms-playwright/firefox/firefox
+
 # Run the image.
 CMD ["/app/dist/src/main.js", "--silent"]

From a9d93437fe4698d7fd033791e29cf1a00d5cab83 Mon Sep 17 00:00:00 2001
From: MQ
Date: Tue, 6 May 2025 14:55:57 +0200
Subject: [PATCH 3/3] add runtime benchmark

---
 benchmark/runtime.ts | 236 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 236 insertions(+)
 create mode 100644 benchmark/runtime.ts

diff --git a/benchmark/runtime.ts b/benchmark/runtime.ts
new file mode 100644
index 0000000..b17db72
--- /dev/null
+++ b/benchmark/runtime.ts
@@ -0,0 +1,236 @@
+/*
+Benchmark for Actor runtime
+
+This benchmark was mainly created for testing Actor run time performance to compare previous and distroless build.
+*/
+import { ActorRun, ApifyClient } from 'apify-client';
+
+// Configuration constants
+const API_TOKEN = process.env.APIFY_TOKEN;
+// Actor under test; set BENCHMARK_ACTOR_NAME (e.g. to 'apify/rag-web-browser') to benchmark another build
+const ACTOR_NAME = process.env.BENCHMARK_ACTOR_NAME ?? 'jakub.kopecky/rag-web-browser';
+const MAX_RUNS = 500;
+const MAX_MEMORY_GB = 64; // Total memory available
+const ACTOR_MEMORY_GB = 1; // Memory per actor run
+const ACTOR_INPUT = {
+    query: 'apify ai',
+    maxResults: 1,
+};
+const POLL_INTERVAL_MS = 1000; // Pause between two status sweeps over the in-flight runs
+// Statuses after which a run never changes state again
+const TERMINAL_STATUSES = new Set<string>(['SUCCEEDED', 'FAILED', 'TIMED-OUT', 'ABORTED']);
+
+// Initialize Apify client
+const client = new ApifyClient({ token: API_TOKEN });
+
+interface LogTimes {
+    pullToStartTime: number;
+    startToSystemTime: number;
+}
+
+interface Summary {
+    average: number;
+    median: number;
+    min: number;
+    max: number;
+    stdDev: number;
+}
+
+/**
+ * Parses the timestamp that prefixes a log line (the text before the first space).
+ *
+ * @throws if the prefix is not a parseable date, so that NaN never leaks into the statistics.
+ */
+function parseLogTimestamp(line: string, runId: string): number {
+    const timestamp = new Date(line.split(' ')[0]).getTime();
+    if (Number.isNaN(timestamp)) {
+        throw new Error(`Failed to parse log timestamp for run ${runId}: "${line}"`);
+    }
+    return timestamp;
+}
+
+/**
+ * Returns the timestamp of the first log line that contains `marker`.
+ *
+ * @throws if no line contains the marker; `description` names the missing line in the error.
+ */
+function findLogTimestamp(lines: string[], marker: string, description: string, runId: string): number {
+    const line = lines.find((candidate) => candidate.includes(marker));
+    if (!line) {
+        throw new Error(`Failed to find ${description} line in logs for run ${runId}`);
+    }
+    return parseLogTimestamp(line, runId);
+}
+
+/**
+ * Derives startup timings from the log of a run.
+ *
+ * @returns seconds from the first log line to the "Starting Docker container" line (pullToStartTime)
+ *          and from there to the first "System info" line (startToSystemTime).
+ * @throws if the log is unavailable, a marker line is missing or a timestamp cannot be parsed.
+ */
+async function computeLogTimes(run: ActorRun): Promise<LogTimes> {
+    const log = await client.run(run.id).log().get();
+    if (!log) {
+        throw new Error(`Failed to get logs for run ${run.id}`);
+    }
+
+    const lines = log.split('\n');
+    const startTime = parseLogTimestamp(lines[0], run.id);
+    const startContainerTime = findLogTimestamp(lines, 'Starting Docker container', 'start container', run.id);
+    const systemInfoTime = findLogTimestamp(lines, 'System info', 'system info', run.id);
+
+    return {
+        pullToStartTime: (startContainerTime - startTime) / 1000, // in seconds
+        startToSystemTime: (systemInfoTime - startContainerTime) / 1000, // in seconds
+    };
+}
+
+/**
+ * Fetches the given runs one by one and returns the first one that reached a terminal status.
+ *
+ * @returns the finished run, or undefined when every run is still in progress.
+ * @throws if a run cannot be fetched.
+ */
+async function findFinishedRun(runIDs: string[]): Promise<ActorRun | undefined> {
+    for (const id of runIDs) {
+        const run = await client.run(id).get();
+        if (!run) {
+            throw new Error(`Failed to get run ${id}`);
+        }
+        if (TERMINAL_STATUSES.has(run.status)) {
+            return run;
+        }
+    }
+    return undefined;
+}
+
+/**
+ * Waits until any of the in-flight runs finishes, removes its ID from `concurrentRunIDs`
+ * (mutated in place) and returns the run together with its log-derived timings.
+ *
+ * @throws if `concurrentRunIDs` is empty, because polling would otherwise never end.
+ */
+async function waitForRunFinishAndHandle(concurrentRunIDs: string[]): Promise<LogTimes & { run: ActorRun }> {
+    if (concurrentRunIDs.length === 0) {
+        throw new Error('No runs in progress to wait for');
+    }
+
+    let run = await findFinishedRun(concurrentRunIDs);
+    while (!run) {
+        await new Promise((resolve) => setTimeout(resolve, POLL_INTERVAL_MS));
+        run = await findFinishedRun(concurrentRunIDs);
+    }
+
+    console.log(`Run ${run.id} finished in ${run.stats.runTimeSecs} seconds`);
+    const { pullToStartTime, startToSystemTime } = await computeLogTimes(run);
+    // Remove exactly the run that finished; it is not necessarily the oldest one
+    const finishedIndex = concurrentRunIDs.indexOf(run.id);
+    if (finishedIndex !== -1) {
+        concurrentRunIDs.splice(finishedIndex, 1);
+    }
+
+    return {
+        pullToStartTime,
+        startToSystemTime,
+        run,
+    };
+}
+
+/**
+ * Computes average, median, min, max and population standard deviation of `values`.
+ * Every field is NaN when `values` is empty.
+ */
+function summarize(values: number[]): Summary {
+    if (values.length === 0) {
+        return { average: NaN, median: NaN, min: NaN, max: NaN, stdDev: NaN };
+    }
+
+    const sorted = [...values].sort((a, b) => a - b);
+    const mid = Math.floor(sorted.length / 2);
+    const average = values.reduce((sum, value) => sum + value, 0) / values.length;
+    const variance = values.reduce((sum, value) => sum + (value - average) ** 2, 0) / values.length;
+
+    return {
+        average,
+        median: sorted.length % 2 === 0 ? (sorted[mid - 1] + sorted[mid]) / 2 : sorted[mid],
+        min: sorted[0],
+        max: sorted[sorted.length - 1],
+        stdDev: Math.sqrt(variance),
+    };
+}
+
+/**
+ * Prints the statistics of one metric; `label` is the singular metric name, e.g. 'total run time'.
+ */
+function printSummary(label: string, { average, median, min, max, stdDev }: Summary): void {
+    console.log(`Average ${label}: ${average.toFixed(2)} seconds`);
+    console.log(`Median ${label}: ${median.toFixed(2)} seconds`);
+    console.log(`Min ${label}: ${min.toFixed(2)} seconds`);
+    console.log(`Max ${label}: ${max.toFixed(2)} seconds`);
+    console.log(`Standard deviation of ${label}s: ${stdDev.toFixed(2)} seconds`);
+}
+
+/**
+ * Starts MAX_RUNS runs of ACTOR_NAME, keeping at most MAX_MEMORY_GB / ACTOR_MEMORY_GB of them
+ * in flight, and prints timing statistics once all of them have finished.
+ *
+ * @throws if APIFY_TOKEN is not set or any run cannot be started, fetched or analysed.
+ */
+async function runBenchmark() {
+    if (!API_TOKEN) {
+        throw new Error('APIFY_TOKEN environment variable is not set');
+    }
+
+    // Calculate max concurrent runs to avoid memory overload
+    const maxConcurrentRuns = Math.floor(MAX_MEMORY_GB / ACTOR_MEMORY_GB);
+    console.log(`Starting ${MAX_RUNS} runs with ${ACTOR_MEMORY_GB}GB per run, max ${maxConcurrentRuns} concurrent`);
+
+    // Track runs
+    const finishedRuns: ActorRun[] = [];
+    const concurrentRunIDs: string[] = [];
+
+    // Extracted times from logs
+    // Time to pull the Actor container
+    const logPullToStartTimes: number[] = [];
+    // Time from starting the container to the first system log
+    const logStartToSystemTimes: number[] = [];
+
+    // Waits for one in-flight run to finish and records its results
+    const collectFinishedRun = async (): Promise<void> => {
+        const { pullToStartTime, startToSystemTime, run } = await waitForRunFinishAndHandle(concurrentRunIDs);
+        finishedRuns.push(run);
+        logPullToStartTimes.push(pullToStartTime);
+        logStartToSystemTimes.push(startToSystemTime);
+    };
+
+    // Actor run loop
+    while (finishedRuns.length + concurrentRunIDs.length < MAX_RUNS) {
+        if (concurrentRunIDs.length >= maxConcurrentRuns) {
+            await collectFinishedRun();
+        }
+
+        const run = await client.actor(ACTOR_NAME).start(ACTOR_INPUT, { memory: ACTOR_MEMORY_GB * 1024 });
+        console.log(`Started run ${run.id} with ${ACTOR_MEMORY_GB}GB memory (${finishedRuns.length} finished, ${concurrentRunIDs.length} concurrent)`);
+        concurrentRunIDs.push(run.id);
+    }
+    // Wait for remaining runs to finish
+    while (concurrentRunIDs.length > 0) {
+        await collectFinishedRun();
+    }
+
+    // Log results
+    console.log(`Completed ${finishedRuns.length} runs`);
+    console.log('------------------------------------------------------');
+    printSummary('pull to start time', summarize(logPullToStartTimes));
+    console.log('------------------------------------------------------');
+    printSummary('start to system time', summarize(logStartToSystemTimes));
+    console.log('------------------------------------------------------');
+    printSummary('total run time', summarize(finishedRuns.map((run) => run.stats.runTimeSecs)));
+}
+
+// Execute benchmark
+runBenchmark().catch((error) => {
+    console.error('Benchmark failed:', error.message);
+    process.exit(1);
+});