diff --git a/.vale.ini b/.vale.ini index 3ecece96f..9e634e1dc 100644 --- a/.vale.ini +++ b/.vale.ini @@ -11,8 +11,10 @@ mdx = md [*.md] BasedOnStyles = Vale, Apify, write-good, Microsoft -# Ignore URLs, HTML/XML tags starting with capital letter, lines containing = sign, http & https URL ending with ] or ) & email addresses -TokenIgnores = (<\/?[A-Z].+>), ([^\n]+=[^\n]*), (\[[^\]]+\]\([^\)]+\)), ([^\n]+@[^\n]+\.[^\n]), ({[^}]*}), (`[^`]*`), (`\w+`) +# Ignore URLs, HTML/XML tags starting with capital letter, lines containing = sign, http & https URL ending with ] or ), email addresses, inline code +TokenIgnores = (<\/?[A-Z].+>), ([^\n]+=[^\n]*), (\[[^\]]+\]\([^\)]+\)), ([^\n]+@[^\n]+\.[^\n]), ({[^}]*}), `[^`]+` +# Ignore HTML comments and code blocks +BlockIgnores = (?s) (<!--.*?-->)|(```.*?```) Vale.Spelling = YES diff --git a/apify-api/openapi/components/schemas/actor-builds/GetOpenApiResponse.yaml b/apify-api/openapi/components/schemas/actor-builds/GetOpenApiResponse.yaml new file mode 100644 index 000000000..a5a159db6 --- /dev/null +++ b/apify-api/openapi/components/schemas/actor-builds/GetOpenApiResponse.yaml @@ -0,0 +1,348 @@ +title: GetOpenApiResponse +type: object +properties: + openapi: + type: string + example: 3.0.1 + info: + type: object + properties: + title: + type: string + example: Your Magic Actor + version: + type: string + example: '1.0' + x-build-id: + type: string + example: 'ID of build' + servers: + type: array + items: + type: object + properties: + url: + type: string + example: https://api.apify.com/v2 + paths: + type: object + properties: + /acts/~/run-sync-get-dataset-items: + type: object + properties: + post: + type: object + properties: + operationId: + type: string + example: run-sync-get-dataset-items + x-openai-isConsequential: + type: boolean + example: false + summary: + type: string + example: Executes an Actor, waits for its completion, and returns Actor's dataset items in response. 
+ tags: + type: array + items: + type: string + example: ['Run Actor'] + requestBody: + type: object + properties: + required: + type: boolean + example: true + content: + type: object + properties: + application/json: + type: object + properties: + schema: + type: object + properties: + $ref: + type: string + example: '#/components/schemas/inputSchema' + parameters: + type: array + items: + type: object + properties: + name: + type: string + example: token + in: + type: string + example: query + required: + type: boolean + example: true + schema: + type: object + properties: + type: + type: string + example: string + description: + type: string + example: Enter your Apify token here + responses: + type: object + properties: + '200': + type: object + properties: + description: + type: string + example: OK + /acts/~/runs: + type: object + properties: + post: + type: object + properties: + operationId: + type: string + example: runs + x-openai-isConsequential: + type: boolean + example: false + summary: + type: string + example: Executes an Actor and returns information about the initiated run in response. 
+ tags: + type: array + items: + type: string + example: ['Run Actor'] + requestBody: + type: object + properties: + required: + type: boolean + example: true + content: + type: object + properties: + application/json: + type: object + properties: + schema: + type: object + properties: + $ref: + type: string + example: '#/components/schemas/inputSchema' + parameters: + type: array + items: + type: object + properties: + name: + type: string + in: + type: string + example: query + required: + type: boolean + schema: + type: object + properties: + type: + type: string + description: + type: string + responses: + type: object + properties: + '200': + type: object + properties: + description: + type: string + example: OK + content: + type: object + properties: + application/json: + type: object + properties: + schema: + type: object + properties: + $ref: + type: string + example: '#/components/schemas/runsResponseSchema' + /acts/~/run-sync: + type: object + properties: + post: + type: object + properties: + operationId: + type: string + example: run-sync + x-openai-isConsequential: + type: boolean + example: false + summary: + type: string + example: Executes an Actor, waits for completion, and returns the OUTPUT from Key-value store in response. 
+ tags: + type: array + items: + type: string + example: ['Run Actor'] + requestBody: + type: object + properties: + required: + type: boolean + example: true + content: + type: object + properties: + application/json: + type: object + properties: + schema: + type: object + properties: + $ref: + type: string + example: '#/components/schemas/inputSchema' + parameters: + type: array + items: + type: object + properties: + name: + type: string + in: + type: string + example: query + required: + type: boolean + schema: + type: object + properties: + type: + type: string + description: + type: string + responses: + type: object + properties: + '200': + type: object + properties: + description: + type: string + example: OK + components: + type: object + properties: + schemas: + type: object + properties: + inputSchema: + type: object + properties: + type: + type: string + example: object + runsResponseSchema: + type: object + properties: + type: + type: string + example: object + properties: + type: object + properties: + data: + type: object + properties: + type: + type: string + example: object + properties: + type: object + properties: + id: + type: object + properties: + type: + type: string + example: string + actId: + type: object + properties: + type: + type: string + example: string + userId: + type: object + properties: + type: + type: string + example: string + startedAt: + type: object + properties: + type: + type: string + example: string + format: + type: string + example: date-time + example: + type: string + example: '2025-01-08T00:00:00.000Z' + finishedAt: + type: object + properties: + type: + type: string + example: string + format: + type: string + example: date-time + example: + type: string + example: '2025-01-08T00:00:00.000Z' + status: + type: object + properties: + type: + type: string + example: string + example: + type: string + example: 'READY' + meta: + type: object + properties: + type: + type: string + example: object + properties: + type: 
object + properties: + origin: + type: object + properties: + type: + type: string + example: string + example: + type: string + example: 'API' + userAgent: + type: object + properties: + type: + type: string + example: string diff --git a/apify-api/openapi/components/tags.yaml b/apify-api/openapi/components/tags.yaml index baa466a58..06cd107df 100644 --- a/apify-api/openapi/components/tags.yaml +++ b/apify-api/openapi/components/tags.yaml @@ -127,6 +127,10 @@ x-trait: 'true' description: '**[DEPRECATED]** API endpoints related to build of the actor were moved under new namespace [`actor-builds`](#/reference/actor-builds).' +- name: Actors/Default build object + x-displayName: Default build object + x-parent-tag-name: Actor builds + x-trait: 'true' - name: Actors/Abort build x-displayName: Abort build x-parent-tag-name: Actors @@ -251,6 +255,15 @@ ``` In order to save new items to the dataset, send HTTP POST request with JSON payload to the same URL. +- name: Actors/Get OpenAPI specification + x-displayName: Get OpenAPI specification + x-parent-tag-name: Actors + x-trait: 'true' + description: | + Get the OpenAPI specification for Actor builds. Two similar endpoints are available: + + - [First endpoint](/api/v2/act-openapi-specification-get): Requires both `actorId` and `buildId`. Use `default` as the `buildId` to get the OpenAPI schema for the default Actor build. + - [Second endpoint](/api/v2/actor-build-openapi-specification-get): Requires only `buildId`. - name: Actor tasks x-displayName: Actor tasks x-legacy-doc-urls: @@ -548,6 +561,15 @@ - '#tag/Actor-buildsBuild-log' x-trait: 'true' description: Check out [Logs](#/reference/logs) for full reference. +- name: Actor builds/Get OpenAPI specification + x-displayName: Get OpenAPI specification + x-parent-tag-name: Actor builds + x-trait: 'true' + description: | + Get the OpenAPI specification for Actor builds. 
Two similar endpoints are available: + + - [First endpoint](/api/v2/act-openapi-specification-get): Requires both `actorId` and `buildId`. Use `default` as the `buildId` to get the OpenAPI schema for the default Actor build. + - [Second endpoint](/api/v2/actor-build-openapi-specification-get): Requires only `buildId`. - name: Key-value stores x-displayName: Key-value stores x-legacy-doc-urls: diff --git a/apify-api/openapi/components/x-tag-groups.yaml b/apify-api/openapi/components/x-tag-groups.yaml index 9a85e48d2..d87d49741 100644 --- a/apify-api/openapi/components/x-tag-groups.yaml +++ b/apify-api/openapi/components/x-tag-groups.yaml @@ -10,6 +10,7 @@ - Actors/Webhook collection - Actors/Build collection - Actors/Build object + - Actors/Default build object - Actors/Abort build - Actors/Run collection - Actors/Run actor synchronously @@ -19,6 +20,7 @@ - Actors/Metamorph run - Actors/Resurrect run - Actors/Last run object and its storages + - Actors/Get OpenAPI specification - name: Actor tasks tags: - Actor tasks @@ -50,6 +52,7 @@ - Actor builds/Delete build - Actor builds/Abort build - Actor builds/Build log + - Actor builds/Get OpenAPI specification - name: Key-value stores tags: - Key-value stores diff --git a/apify-api/openapi/openapi.yaml b/apify-api/openapi/openapi.yaml index a0f3b23bf..1491a534e 100644 --- a/apify-api/openapi/openapi.yaml +++ b/apify-api/openapi/openapi.yaml @@ -10,7 +10,9 @@ info: The Apify API (version 2) provides programmatic access to the [Apify platform](https://docs.apify.com). The API is organized around [RESTful](https://en.wikipedia.org/wiki/Representational_state_transfer) - HTTP endpoints. + HTTP endpoints. + + You can download the complete OpenAPI schema of the Apify API in the [YAML](https://docs.apify.com/api/openapi.yaml) or [JSON](https://docs.apify.com/api/openapi.json) formats. The source code is also available on [GitHub](https://github.com/apify/apify-docs/tree/master/apify-api/openapi). 
All requests and responses (including errors) are encoded in [JSON](http://www.json.org/) format with UTF-8 encoding, @@ -499,8 +501,12 @@ paths: $ref: 'paths/actors/acts@{actorId}@builds.yaml' '/v2/acts/{actorId}/builds/{buildId}': $ref: 'paths/actors/acts@{actorId}@builds@{buildId}.yaml' + '/v2/acts/{actorId}/builds/default': + $ref: 'paths/actors/acts@{actorId}@builds@default.yaml' '/v2/acts/{actorId}/builds/{buildId}/abort': $ref: 'paths/actors/acts@{actorId}@builds@{buildId}@abort.yaml' + '/v2/acts/{actorId}/builds/{buildId}/openapi-specification': + $ref: 'paths/actors/acts@{actorId}@builds@{buildId}@openapi-specification.yaml' '/v2/acts/{actorId}/runs': $ref: 'paths/actors/acts@{actorId}@runs.yaml' '/v2/acts/{actorId}/run-sync': @@ -553,6 +559,8 @@ paths: $ref: 'paths/actor-builds/actor-builds@{buildId}@abort.yaml' '/v2/actor-builds/{buildId}/log': $ref: 'paths/actor-builds/actor-builds@{buildId}@log.yaml' + '/v2/actor-builds/{buildId}/openapi-specification': + $ref: 'paths/actor-builds/actor-builds@{buildId}@openapi-specification.yaml' /v2/key-value-stores: $ref: paths/key-value-stores/key-value-stores.yaml '/v2/key-value-stores/{storeId}': diff --git a/apify-api/openapi/paths/actor-builds/actor-builds@{buildId}@openapi-specification.yaml b/apify-api/openapi/paths/actor-builds/actor-builds@{buildId}@openapi-specification.yaml new file mode 100644 index 000000000..da6fb545c --- /dev/null +++ b/apify-api/openapi/paths/actor-builds/actor-builds@{buildId}@openapi-specification.yaml @@ -0,0 +1,30 @@ +get: + tags: + - Actor builds/Get OpenAPI specification + summary: Get OpenAPI specification + description: | + Get the OpenAPI specification for a specific Actor build. + Authentication is based on the build's unique ID. No authentication token is required. + + **Note**: You can also use the `/api/v2/act-openapi-specification-get` endpoint to get the OpenAPI specification for a build. 
+ operationId: actorBuild_openapiSpecification_get + security: + - apiKeyActorBuilds: [] + - httpBearerActorBuilds: [] + parameters: + - name: buildId + in: path + description: ID of the build you want to get, found in the build's `Info` tab. + required: true + style: simple + schema: + type: string + example: soSkq9ekdmfOslopH + responses: + '200': + description: '' + headers: {} + content: + application/json: + schema: + $ref: '../../components/schemas/actor-builds/GetOpenApiResponse.yaml' diff --git a/apify-api/openapi/paths/actors/acts@{actorId}@builds@default.yaml b/apify-api/openapi/paths/actors/acts@{actorId}@builds@default.yaml new file mode 100644 index 000000000..f4de87c06 --- /dev/null +++ b/apify-api/openapi/paths/actors/acts@{actorId}@builds@default.yaml @@ -0,0 +1,76 @@ +get: + tags: + - Actors/Default build object + summary: Get default build + description: | + Get the default build for an Actor. + + Use the optional `waitForFinish` parameter to synchronously wait for the build to finish. + This avoids the need for periodic polling when waiting for the build to complete. + + This endpoint does not require an authentication token. Instead, calls are authenticated using the build's unique ID. + However, if you access the endpoint without a token, certain attributes (e.g., `usageUsd` and `usageTotalUsd`) will be hidden. + operationId: act_build_default_get + security: + - apiKeyActorBuilds: [] + - httpBearerActorBuilds: [] + parameters: + - name: actorId + in: path + description: Actor ID or a tilde-separated owner's username and Actor name. + required: true + style: simple + schema: + type: string + example: janedoe~my-actor + - name: waitForFinish + in: query + description: | + The maximum number of seconds the server waits for the build to finish. + If the build finishes within this time, the returned build object will have a terminal status (e.g. `SUCCEEDED`), + otherwise it will have a transitional status (e.g. `RUNNING`). 
+ + By default it is `0`, the maximum value is `60`. + style: form + explode: true + schema: + type: number + format: double + example: 60 + responses: + '200': + description: '' + headers: {} + content: + application/json: + schema: + $ref: '../../components/schemas/actor-builds/GetBuildResponse.yaml' + example: + data: + id: HG7ML7M8z78YcAPEB + actId: janedoe~my-actor + userId: klmdEpoiojmdEMlk3 + startedAt: '2019-11-30T07:34:24.202Z' + finishedAt: '2019-12-12T09:30:12.202Z' + status: SUCCEEDED + meta: + origin: WEB + clientIp: 172.234.12.34 + userAgent: Mozilla/5.0 (iPad) + stats: + durationMillis: 1000 + runTimeSecs: 45.718 + computeUnits: 0.012699444444444444 + options: + useCache: false + betaPackages: false + memoryMbytes: 1024 + diskMbytes: 2048 + usage: + ACTOR_COMPUTE_UNITS: 0.08 + usageTotalUsd: 0.02 + usageUsd: + ACTOR_COMPUTE_UNITS: 0.02 + inputSchema: '{\n \"title\": \"Schema for ... }' + readme: '# Magic Actor\nThis Actor is magic.' + buildNumber: 0.1.1 diff --git a/apify-api/openapi/paths/actors/acts@{actorId}@builds@{buildId}@openapi-specification.yaml b/apify-api/openapi/paths/actors/acts@{actorId}@builds@{buildId}@openapi-specification.yaml new file mode 100644 index 000000000..6a929cfcd --- /dev/null +++ b/apify-api/openapi/paths/actors/acts@{actorId}@builds@{buildId}@openapi-specification.yaml @@ -0,0 +1,40 @@ +get: + tags: + - Actors/Get OpenAPI specification + summary: Get OpenAPI specification + description: | + Get the OpenAPI specification for a specific Actor build. + + To fetch the default Actor build, simply pass `default` as the `buildId`. + Authentication is based on the build's unique ID. No authentication token is required. + + **Note**: You can also use the `/api/v2/actor-build-openapi-specification-get` endpoint to get the OpenAPI specification for a build. 
+ operationId: act_openapiSpecification_get + security: + - apiKeyActorBuilds: [] + - httpBearerActorBuilds: [] + parameters: + - name: actorId + in: path + description: Actor ID or a tilde-separated owner's username and Actor name. + required: true + style: simple + schema: + type: string + example: janedoe~my-actor + - name: buildId + in: path + description: ID of the build you want to get, found in the build's `Info` tab. Pass `default` for default Actor build. + required: true + style: simple + schema: + type: string + example: soSkq9ekdmfOslopH + responses: + '200': + description: '' + headers: {} + content: + application/json: + schema: + $ref: '../../components/schemas/actor-builds/GetOpenApiResponse.yaml' diff --git a/apify-docs-theme/package.json b/apify-docs-theme/package.json index b7ae65d42..ac7bd2349 100644 --- a/apify-docs-theme/package.json +++ b/apify-docs-theme/package.json @@ -1,6 +1,6 @@ { "name": "@apify/docs-theme", - "version": "1.0.163", + "version": "1.0.165", "description": "", "main": "./src/index.js", "files": [ diff --git a/apify-docs-theme/src/markdown.js b/apify-docs-theme/src/markdown.js index d1d5b295b..deb748f8a 100644 --- a/apify-docs-theme/src/markdown.js +++ b/apify-docs-theme/src/markdown.js @@ -1,7 +1,7 @@ const remarkParse = require('remark-parse'); const remarkStringify = require('remark-stringify'); const { unified } = require('unified'); -const visitParents = require('unist-util-visit-parents'); +const { visitParents } = require('unist-util-visit-parents'); /** * Updates the markdown content for better UX and compatibility with Docusaurus v3. 
@@ -11,6 +11,7 @@ const visitParents = require('unist-util-visit-parents'); function updateChangelog(changelog) { const pipeline = unified() .use(remarkParse) + .use(removeGitCliffMarkers) .use(incrementHeadingLevels) .use(prettifyPRLinks) .use(linkifyUserTags) @@ -31,10 +32,21 @@ function updateChangelog(changelog) { */ const incrementHeadingLevels = () => (tree) => { visitParents(tree, 'heading', (node) => { + if (node.depth === 1 && node.children[0].value === 'Changelog') return; + node.depth += 1; }); }; +const removeGitCliffMarkers = () => (tree) => { + visitParents(tree, 'html', (node) => { + const gitCliffMarkerRegex = /generated by git-cliff/ig; + const match = gitCliffMarkerRegex.exec(node.value); + + if (match) node.value = ''; + }); +}; + /** * Links user tags in the markdown content. This function replaces the user tags * (e.g. `@username`) with a link to the user's GitHub profile (just like GitHub's UI). @@ -46,9 +58,10 @@ const linkifyUserTags = () => (tree) => { const userTagRegex = /@([a-zA-Z0-9-]+)(\s|$)/g; const match = userTagRegex.exec(node.value); - if (!match) return; - const directParent = parents[parents.length - 1]; + + if (!match || directParent.type === 'link') return; + const nodeIndexInParent = directParent.children.findIndex((x) => x === node); const username = match[1]; diff --git a/docusaurus.config.js b/docusaurus.config.js index cd0bad273..d9ab9fb03 100644 --- a/docusaurus.config.js +++ b/docusaurus.config.js @@ -194,7 +194,7 @@ module.exports = { config: { /** @type {import('docusaurus-plugin-openapi-docs').Options} */ v2: { - downloadUrl: 'openapi.yaml', + downloadUrl: '/api/openapi.yaml', specPath: 'apify-api.yaml', outputDir: './sources/api', markdownGenerators: { diff --git a/sources/academy/platform/expert_scraping_with_apify/apify_api_and_client.md b/sources/academy/platform/expert_scraping_with_apify/apify_api_and_client.md index 2cdffe51f..05a8a438c 100644 --- 
a/sources/academy/platform/expert_scraping_with_apify/apify_api_and_client.md +++ b/sources/academy/platform/expert_scraping_with_apify/apify_api_and_client.md @@ -30,7 +30,7 @@ You can use one of the two main ways to programmatically interact with the Apify ## Our task -In the previous lesson, we created a **task** for the Amazon Actor we built in the first two lessons of this course. Now, we'll be creating another new Actor, which will have two jobs: +We'll be creating another new Actor, which will have two jobs: 1. Programmatically call the task for the Amazon Actor. 2. Export its results into CSV format under a new key called **OUTPUT.csv** in the default key-value store. diff --git a/sources/academy/platform/expert_scraping_with_apify/saving_useful_stats.md b/sources/academy/platform/expert_scraping_with_apify/saving_useful_stats.md index 6c1432650..bcb7cee71 100644 --- a/sources/academy/platform/expert_scraping_with_apify/saving_useful_stats.md +++ b/sources/academy/platform/expert_scraping_with_apify/saving_useful_stats.md @@ -28,9 +28,7 @@ Before moving on, give these valuable resources a quick lookover: 1. Why might you want to store statistics about an Actor's run (or a specific request)? 2. In our Amazon scraper, we are trying to store the number of retries of a request once its data is pushed to the dataset. Where would you get this information? Where would you store it? -3. We are building a new imaginary scraper for a website that sometimes displays captchas at unexpected times, rather than displaying the content we want. How would you keep a count of the total number of captchas hit for the entire run? Where would you store this data? Why? -4. Is storing these types of values necessary for every single Actor? -5. What is the difference between the `failedRequestHandler` and `errorHandler`? +3. What is the difference between the `failedRequestHandler` and `errorHandler`? 
## Our task diff --git a/sources/academy/platform/expert_scraping_with_apify/solutions/saving_stats.md b/sources/academy/platform/expert_scraping_with_apify/solutions/saving_stats.md index b9befd671..a730536c7 100644 --- a/sources/academy/platform/expert_scraping_with_apify/solutions/saving_stats.md +++ b/sources/academy/platform/expert_scraping_with_apify/solutions/saving_stats.md @@ -154,14 +154,6 @@ router.addHandler(labels.OFFERS, async ({ $, request }) => { **A:** This information is available directly on the request object under the property **retryCount**. -**Q: We are building a new imaginary scraper for a website that sometimes displays captchas at unexpected times, rather than displaying the content we want. How would you keep a count of the total number of captchas hit for the entire run? Where would you store this data? Why?** - -**A:** First, build a function that detects if the captcha has been hit. If so, it will throw an error and add to the **numberOfCaptchas** count. This data might be stored on a persisted state object to help better assess the anti-scraping mitigation techniques the scraper should be used. - -**Q: Is storing these types of values necessary for every single Actor?** - -**A:** For small Actors, it might be a waste of time to do this. For large-scale Actors, it can be extremely helpful when debugging and most definitely worth the extra 10–20 minutes of development time. Usually though, the default statistics from the Crawlee and the SDK might be enough for simple run stats. - **Q: What is the difference between the `failedRequestHandler` and `errorHandler`?** **A:** `failedRequestHandler` runs after a request has failed and reached its `maxRetries` count. `errorHandler` runs on every failure and retry. 
diff --git a/sources/academy/platform/expert_scraping_with_apify/solutions/using_storage_creating_tasks.md b/sources/academy/platform/expert_scraping_with_apify/solutions/using_storage_creating_tasks.md index 5de6c4340..5c01c45a8 100644 --- a/sources/academy/platform/expert_scraping_with_apify/solutions/using_storage_creating_tasks.md +++ b/sources/academy/platform/expert_scraping_with_apify/solutions/using_storage_creating_tasks.md @@ -1,251 +1,12 @@ --- title: III - Using storage & creating tasks -description: Follow along with step-by-step instructions on how to complete the task outlined in the previous lesson. Use different storage types, and create a task. +description: Get quiz answers and explanations for the lesson about using storage and creating tasks on the Apify platform. sidebar_position: 3 slug: /expert-scraping-with-apify/solutions/using-storage-creating-tasks --- # Using storage & creating tasks {#using-storage-creating-tasks} -**Follow along with step-by-step instructions on how to complete the task outlined in the previous lesson. Use different storage types, and create a task.** - ---- - -Last lesson, our task was outlined for us. In this lesson, we'll be completing that task by making our Amazon Actor push to a **named dataset** and use the **default key-value store** to store the cheapest item found by the scraper. Finally, we'll create a task for the Actor back on the Apify platform. - -## Using a named dataset {#using-named-dataset} - -Something important to understand is that, in the Apify SDK, when you use `Actor.pushData()`, the data will always be pushed to the default dataset. To open up a named dataset, we'll use the `Actor.openDataset()` function: - -```js -// main.js -// ... - -await Actor.init(); - -const { keyword } = await Actor.getInput(); - -// Open a dataset with a custom named based on the -// keyword which was inputted by the user -const dataset = await Actor.openDataset(`amazon-offers-${keyword.replace(' ', '-')}`); -// ... 
-``` - -If we remember correctly, we are pushing data to the dataset in the `labels.OFFERS` handler we created in **routes.js**. Let's export the `dataset` variable pointing to our named dataset so we can import it in **routes.js** and use it in the handler: - -```js -export const dataset = await Actor.openDataset(`amazon-offers-${keyword.replace(' ', '-')}`); -``` - -Finally, let's modify the function to use the new `dataset` variable rather than the `Actor` class: - -```js -// Import the dataset pointer -import { dataset } from './main.js'; - -// ... - -router.addHandler(labels.OFFERS, async ({ $, request }) => { - const { data } = request.userData; - - for (const offer of $('#aod-offer')) { - const element = $(offer); - - // Replace "Actor" with "dataset" - await dataset.pushData({ - ...data, - sellerName: element.find('div[id*="soldBy"] a[aria-label]').text().trim(), - offer: element.find('.a-price .a-offscreen').text().trim(), - }); - } -}); -``` - -That's it! Now, our Actor will push its data to a dataset named **amazon-offers-KEYWORD**! - -## Using a key-value store {#using-key-value-store} - -We now want to store the cheapest item in the default key-value store under a key named **CHEAPEST-ITEM**. The most efficient and practical way of doing this is by filtering through all of the newly named dataset's items and pushing the cheapest one to the store. - -Let's add the following code to the bottom of the Actor after **Crawl finished** is logged to the console: - -```js -// ... 
- -const cheapest = items.reduce((prev, curr) => { - // If there is no previous offer price, or the previous is more - // expensive, set the cheapest to our current item - if (!prev?.offer || +prev.offer.slice(1) > +curr.offer.slice(1)) return curr; - // Otherwise, keep our previous item - return prev; -}); - -// Set the "CHEAPEST-ITEM" key in the key-value store to be the -// newly discovered cheapest item -await Actor.setValue(CHEAPEST_ITEM, cheapest); - -await Actor.exit(); -``` - -> If you start receiving a linting error after adding the following code to your **main.js** file, add `"parserOptions": { "ecmaVersion": "latest" }` to the **.eslintrc** file in the root directory of your project. - -You might have noticed that we are using a variable instead of a string for the key name in the key-value store. This is because we're using an exported variable from **constants.js** (which is best practice, as discussed in the [**modularity**](../../../webscraping/scraping_basics_javascript/challenge/modularity.md)) lesson back in the **Web scraping for beginners** course. 
Here is what our **constants.js** file looks like: - -```js -// constants.js -export const BASE_URL = 'https://www.amazon.com'; - -export const OFFERS_URL = (asin) => `${BASE_URL}/gp/aod/ajax/ref=auto_load_aod?asin=${asin}&pc=dp`; - -export const labels = { - START: 'START', - PRODUCT: 'PRODUCT', - OFFERS: 'OFFERS', -}; - -export const CHEAPEST_ITEM = 'CHEAPEST-ITEM'; -``` - -## Code check-in {#code-check-in} - -Here is what the **main.js** file looks like now: - -```js -// main.js -import { Actor } from 'apify'; -import { CheerioCrawler, log } from '@crawlee/cheerio'; - -import { router } from './routes.js'; -import { BASE_URL, CHEAPEST_ITEM } from './constants'; - -await Actor.init(); - -const { keyword } = await Actor.getInput(); - -export const dataset = await Actor.openDataset(`amazon-offers-${keyword.replace(' ', '-')}`); - -const proxyConfiguration = await Actor.createProxyConfiguration({ - groups: ['RESIDENTIAL'], -}); - -const crawler = new Actor.CheerioCrawler({ - proxyConfiguration, - useSessionPool: true, - maxConcurrency: 50, - requestHandler: router, -}); - -await crawler.addRequests([ - { - url: `${BASE_URL}/s/ref=nb_sb_noss?url=search-alias%3Daps&field-keywords=${keyword}`, - label: 'START', - userData: { - keyword, - }, - }, -]); - -log.info('Starting the crawl.'); -await crawler.run(); -log.info('Crawl finished.'); - -const { items } = await dataset.getData(); - -const cheapest = items.reduce((prev, curr) => { - if (!prev?.offer) return curr; - if (+prev.offer.slice(1) > +curr.offer.slice(1)) return curr; - return prev; -}); - -await Actor.setValue(CHEAPEST_ITEM, cheapest); - -await Actor.exit(); -``` - -And here is **routes.js**: - -```js -// routes.js -import { createCheerioRouter } from '@crawlee/cheerio'; -import { dataset } from './main.js'; -import { BASE_URL, OFFERS_URL, labels } from './constants'; - -export const router = createCheerioRouter(); - -router.addHandler(labels.START, async ({ $, crawler, request }) => { - const { keyword } = 
request.userData; - - const products = $('div > div[data-asin]:not([data-asin=""])'); - - for (const product of products) { - const element = $(product); - const titleElement = $(element.find('.a-text-normal[href]')); - - const url = `${BASE_URL}${titleElement.attr('href')}`; - - await crawler.addRequests([{ - url, - label: labels.PRODUCT, - userData: { - data: { - title: titleElement.first().text().trim(), - asin: element.attr('data-asin'), - itemUrl: url, - keyword, - }, - }, - }]); - } -}); - -router.addHandler(labels.PRODUCT, async ({ $, crawler, request }) => { - const { data } = request.userData; - - const element = $('div#productDescription'); - - await crawler.addRequests([{ - url: OFFERS_URL(data.asin), - label: labels.OFFERS, - userData: { - data: { - ...data, - description: element.text().trim(), - }, - }, - }]); -}); - -router.addHandler(labels.OFFERS, async ({ $, request }) => { - const { data } = request.userData; - - for (const offer of $('#aod-offer')) { - const element = $(offer); - - await dataset.pushData({ - ...data, - sellerName: element.find('div[id*="soldBy"] a[aria-label]').text().trim(), - offer: element.find('.a-price .a-offscreen').text().trim(), - }); - } -}); -``` - -Don't forget to push your changes to GitHub using `git push origin MAIN_BRANCH_NAME` to see them on the Apify platform! - -## Creating a task {#creating-task} - -Back on the platform, on your Actor's page, you can see a button in the top right hand corner that says **Create new task**: - -![Create new task button](./images/create-new-task.jpg) - -Then, configure the task to use **google pixel** as a keyword and click **Save**. - -> You can also add a custom name and description for the task in the **Settings** tab! - -![Creating a task](./images/creating-task.png) - -After saving it, you'll be able to see the newly created task in the **Tasks** tab on the Apify Console. Go ahead and run it. Did it work? 
- ## Quiz answers 📝 {#quiz-answers} **Q: What is the relationship between Actors and tasks?** diff --git a/sources/academy/platform/expert_scraping_with_apify/tasks_and_storage.md b/sources/academy/platform/expert_scraping_with_apify/tasks_and_storage.md index 2ee07fc6b..d18009c24 100644 --- a/sources/academy/platform/expert_scraping_with_apify/tasks_and_storage.md +++ b/sources/academy/platform/expert_scraping_with_apify/tasks_and_storage.md @@ -34,14 +34,6 @@ Storage allows us to save persistent data for further processing. As you'll lear 2. What are the differences between default (unnamed) and named storage? Which one would you use for everyday usage? 3. What is data retention, and how does it work for all types of storages (default and named)? -## Our task {#our-task} - -Once again, we'll be adding onto our main Amazon-scraping Actor in this activity, but don't worry - this lesson will be quite light, just like the last one. - -We have decided that we want to retain the data scraped by the Actor for a long period of time, so instead of pushing to the default dataset, we will be pushing to a named dataset. Additionally, we want to save the absolute cheapest item found by the scraper into the default key-value store under a key named **CHEAPEST-ITEM**. - -Finally, we'll create a task for the Actor that saves the configuration with the **keyword** set to **google pixel**. - [**Solution**](./solutions/using_storage_creating_tasks.md) ## Next up {#next} diff --git a/sources/academy/webscraping/scraping_basics_python/12_framework.md b/sources/academy/webscraping/scraping_basics_python/12_framework.md index d711bdf39..d6c1184cb 100644 --- a/sources/academy/webscraping/scraping_basics_python/12_framework.md +++ b/sources/academy/webscraping/scraping_basics_python/12_framework.md @@ -46,7 +46,6 @@ Successfully installed Jinja2-0.0.0 ... ... ... crawlee-0.0.0 ... ... ... Now let's use the framework to create a new version of our scraper. 
In the same project directory where our `main.py` file lives, create a file `newmain.py`. This way, we can keep peeking at the original implementation while working on the new one. The initial content will look like this: - ```py title="newmain.py" import asyncio from crawlee.beautifulsoup_crawler import BeautifulSoupCrawler @@ -63,7 +62,6 @@ async def main(): if __name__ == '__main__': asyncio.run(main()) ``` - In the code, we do the following: @@ -427,7 +425,7 @@ If you export the dataset as JSON, it should look something like this: { "url": "https://www.f1academy.com/Racing-Series/Drivers/29/Emely-De-Heus", "name": "Emely De Heus", - "team": "MP Motorsport" + "team": "MP Motorsport", "nationality": "Dutch", "dob": "2003-02-10", "instagram_url": "https://www.instagram.com/emely.de.heus/", @@ -435,7 +433,7 @@ If you export the dataset as JSON, it should look something like this: { "url": "https://www.f1academy.com/Racing-Series/Drivers/28/Hamda-Al-Qubaisi", "name": "Hamda Al Qubaisi", - "team": "MP Motorsport" + "team": "MP Motorsport", "nationality": "Emirati", "dob": "2002-08-08", "instagram_url": "https://www.instagram.com/hamdaalqubaisi_official/", @@ -501,7 +499,7 @@ Hints: The [Global Top 10](https://www.netflix.com/tudum/top10) page has a table listing the most popular Netflix films worldwide. Scrape the movie names from this page, then search for each movie on [IMDb](https://www.imdb.com/). Assume the first search result is correct and retrieve the film's rating. 
Each item you push to Crawlee's default dataset should include the following data: -- URL of the film's imdb.com page +- URL of the film's IMDb page - Title - Rating diff --git a/sources/legal/latest/policies/privacy-policy.md b/sources/legal/latest/policies/privacy-policy.md index f46a6a638..258849f1b 100644 --- a/sources/legal/latest/policies/privacy-policy.md +++ b/sources/legal/latest/policies/privacy-policy.md @@ -108,14 +108,14 @@ We keep your personal data for no longer than necessary for the purposes for whi Upon your request and authentication of your identity, Apify will provide you with information about the personal data we have collected from you, whether we hold your personal data or process your personal data on behalf of a third party. Requests to access, change, or delete personal data made to Apify will be addressed within 30 days or earlier if required by applicable laws or regulations. -If your name, e-mail or postal address, telephone number, or other personal data changes, you may update, correct, or omit the relevant information by contacting Apify at privacy@apify.com or by updating your personal data on the Account settings page on the Website. +If your name, e-mail or postal address, telephone number, or other personal data changes, you may update, correct, or omit the relevant information by contacting Apify at privacy[at]apify[dot]com or by updating your personal data on the Account settings page on the Website. In some situations, we may not be able to provide access to certain personal data. Where an access request is refused, we will notify you in writing, document the reasons for refusal and outline further steps which are available to you. When a challenge regarding the accuracy of personal data is not resolved to your satisfaction, we will annotate the personal data under our control with a note that the correction was requested but not made. 
### Removal and Objection If you prefer not to receive newsletters or other marketing emails from Apify, please let us know by clicking on the unsubscribe link within any newsletter or marketing email you receive. Please note that, regardless of your request, we may still use and disclose certain personal data as permitted by this Privacy Policy or as required by applicable law. For example, you may not opt out of certain transactional emails from us, such as those confirming your requests or providing you with updates regarding our legal terms. -If you prefer not to receive marketing mail via the mail carrier, please let us know by contacting User service at support@apify.com. Please note that such requests may take up to ten (10) days to become effective. +If you prefer not to receive marketing mail via the mail carrier, please let us know by contacting User service at support[at]apify[dot]com. Please note that such requests may take up to ten (10) days to become effective. For more information about your rights under EEA and U.K. GDPR, please refer to Clause “Territory-Specific Terms” below. ## Third-Party Links and Features @@ -183,7 +183,7 @@ We update this Privacy Policy from time to time and encourage you to review it p ## Contact Us -Any notices or requests to Apify under this Privacy Policy shall be made to [privacy@apify.com](mailto:privacy@apify.com) or: +Any notices or requests to Apify under this Privacy Policy shall be made to privacy[at]apify[dot]com or: By mail: diff --git a/sources/legal/latest/terms/data-processing-addendum.md b/sources/legal/latest/terms/data-processing-addendum.md index d151d9a09..4bba8d5aa 100644 --- a/sources/legal/latest/terms/data-processing-addendum.md +++ b/sources/legal/latest/terms/data-processing-addendum.md @@ -201,7 +201,7 @@ Data exporter: Name: Apify Technologies s.r.o. 
Address: Vodičkova 704/36, Nové Město, 110 00 Praha 1 Contact person’s name, position and contact details: -Apify Privacy Team, privacy@apify.com +Apify Privacy Team, privacy[at]apify[dot]com Activities relevant to the data transferred under these Clauses: Processing necessary to provide the Apify Platform and other Services by Apify to Customer and for any disclosures of Personal Data in accordance with the Agreement. Role: Processor or Subprocessor, as applicable