cleanup: fix bugs with accepting webhook info on reqPayload + passing thru to API
skeptrunedev committed Dec 12, 2024
1 parent d4e42a2 commit 81cb5e5
Showing 11 changed files with 36 additions and 75 deletions.
2 changes: 0 additions & 2 deletions README.md
@@ -69,7 +69,6 @@ services:
- BULL_AUTH_KEY=${BULL_AUTH_KEY}
- TEST_API_KEY=${TEST_API_KEY}
- HOST=${HOST:-0.0.0.0}
- - SELF_HOSTED_WEBHOOK_URL=${SELF_HOSTED_WEBHOOK_URL}
- LOGGING_LEVEL=${LOGGING_LEVEL}
extra_hosts:
- "host.docker.internal:host-gateway"
@@ -93,7 +92,6 @@ services:
- TEST_API_KEY=${TEST_API_KEY}
- SCRAPING_BEE_API_KEY=${SCRAPING_BEE_API_KEY}
- HOST=${HOST:-0.0.0.0}
- - SELF_HOSTED_WEBHOOK_URL=${SELF_HOSTED_WEBHOOK_URL}
- LOGGING_LEVEL=${LOGGING_LEVEL}
extra_hosts:
- "host.docker.internal:host-gateway"
3 changes: 0 additions & 3 deletions apps/api/.env.example
@@ -42,9 +42,6 @@ TWOCAPTCHA_TOKEN=
# <number> Maximal number of parallel workers. Defaults to 1.
MAX_CONCURRENCY=20

- # Set this to the URL of your webhook when using the self-hosted version of FireCrawl
- SELF_HOSTED_WEBHOOK_URL=
-
# LOGGING_LEVEL determines the verbosity of logs that the system will output.
# Available levels are:
# NONE - No logs will be output.
36 changes: 2 additions & 34 deletions apps/api/src/__tests__/e2e_v1_withAuth/index.test.ts
@@ -112,7 +112,7 @@ describe("E2E Tests for v1 API Routes", () => {
async () => {
const scrapeRequest: ScrapeRequest = {
url: "https://roastmywebsite.ai",
formats: ["markdown", "html"],
formats: ["markdown", "rawHtml"],
};

const response: ScrapeResponseRequestTest = await request(TEST_URL)
@@ -349,7 +349,7 @@ describe("E2E Tests for v1 API Routes", () => {
async () => {
const scrapeRequest: ScrapeRequest = {
url: "https://roastmywebsite.ai",
formats: ["html","rawHtml"],
formats: ["rawHtml"],
};

const response: ScrapeResponseRequestTest = await request(TEST_URL)
@@ -407,38 +407,6 @@ describe("E2E Tests for v1 API Routes", () => {
},
30000
);
-
- it.concurrent(
- "should return a successful response with a valid links on page",
- async () => {
- const scrapeRequest: ScrapeRequest = {
- url: "https://roastmywebsite.ai",
- formats: ["links"],
- };
-
- const response: ScrapeResponseRequestTest = await request(TEST_URL)
- .post("/v1/scrape")
- .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
- .set("Content-Type", "application/json")
- .send(scrapeRequest);
-
- expect(response.statusCode).toBe(200);
- expect(response.body).toHaveProperty("data");
- if (!("data" in response.body)) {
- throw new Error("Expected response body to have 'data' property");
- }
- expect(response.body.data).not.toHaveProperty("html");
- expect(response.body.data).not.toHaveProperty("rawHtml");
- expect(response.body.data).toHaveProperty("links");
- expect(response.body.data).toHaveProperty("metadata");
- expect(response.body.data.links).toContain("https://firecrawl.dev");
- expect(response.body.data.metadata.statusCode).toBe(200);
- expect(response.body.data.metadata.error).toBeUndefined();
- },
- 30000
- );
-
-
});

describe("POST /v1/map", () => {
7 changes: 4 additions & 3 deletions apps/api/src/controllers/v1/crawl.ts
@@ -1,4 +1,3 @@
-
import { Response } from "express";
import { v4 as uuidv4 } from "uuid";
import {
@@ -172,10 +171,11 @@ export async function crawlController(
team_id: req.auth.team_id,
crawlerOptions,
pageOptions,
+ webhookUrl: req.body.webhookUrl,
+ webhookMetadata: req.body.webhookMetadata,
origin: "api",
crawl_id: id,
sitemapped: true,
- webhook: req.body.webhook,
v1: true,
},
opts: {
@@ -203,9 +203,10 @@
crawlerOptions: crawlerOptions,
team_id: req.auth.team_id,
pageOptions: pageOptions,
+ webhookUrl: req.body.webhookUrl,
+ webhookMetadata: req.body.webhookMetadata,
origin: "api",
crawl_id: id,
- webhook: req.body.webhook,
v1: true,
},
{
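The two call sites above now read the webhook target and its metadata straight off the request body instead of the old single `webhook` field. A minimal sketch of what a client request might look like under the new payload shape — the endpoint path follows the v1 controller layout, while the base URL, API key, and metadata values are purely illustrative:

```ts
// Hypothetical client call; webhookUrl/webhookMetadata match the
// crawlRequestSchema change below, everything else is illustrative.
const res = await fetch("https://api.firecrawl.dev/v1/crawl", {
  method: "POST",
  headers: {
    Authorization: `Bearer ${process.env.FIRECRAWL_API_KEY}`,
    "Content-Type": "application/json",
  },
  body: JSON.stringify({
    url: "https://example.com",
    limit: 100,
    webhookUrl: "https://my-app.example.com/hooks/firecrawl",
    webhookMetadata: { tenantId: "abc-123" }, // passed through to the webhook
  }),
});
const body = await res.json(); // presumably includes the crawl id generated above
```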
3 changes: 2 additions & 1 deletion apps/api/src/controllers/v1/types.ts
@@ -104,7 +104,8 @@ export const crawlRequestSchema = crawlerOptions
url,
origin: z.string().optional().default("api"),
scrapeOptions: scrapeOptions.omit({ timeout: true }).default({}),
- webhook: z.string().url().optional(),
+ webhookUrl: z.string().url().optional(),
+ webhookMetadata: z.any().optional(),
limit: z.number().default(10000),
})
.strict(strictMessage);
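Because the request schema is declared with `.strict(strictMessage)`, the rename is not just additive: payloads still sending the old `webhook` key are now rejected as unknown. A small sketch of the expected parse behavior, assuming the surrounding schema pieces shown in the hunk:

```ts
// Assumes crawlRequestSchema as defined above; values are illustrative.
const legacy = crawlRequestSchema.safeParse({
  url: "https://example.com",
  webhook: "https://my-app.example.com/hooks/firecrawl", // old field name
});
console.log(legacy.success); // false — .strict() rejects unrecognized keys

const renamed = crawlRequestSchema.safeParse({
  url: "https://example.com",
  webhookUrl: "https://my-app.example.com/hooks/firecrawl",
  webhookMetadata: { tenantId: "abc-123" },
});
console.log(renamed.success); // true — both fields are optional and validated
```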
6 changes: 4 additions & 2 deletions apps/api/src/main/runWebScraper.ts
@@ -74,17 +74,19 @@ export async function runWebScraper({
try {
const provider = new WebScraperDataProvider();
if (mode === "crawl") {
- await provider.setOptions({
+ provider.setOptions({
jobId: bull_job_id,
mode: mode,
urls: [url],
crawlerOptions: crawlerOptions,
pageOptions: pageOptions,
+ webhookUrl: webhookUrl,
+ webhookMetadata: webhookMetadata,
bullJobId: bull_job_id,
priority,
});
} else {
- await provider.setOptions({
+ provider.setOptions({
jobId: bull_job_id,
mode: mode,
urls: url.split(","),
6 changes: 3 additions & 3 deletions apps/api/src/scraper/WebScraper/__tests__/single_url.test.ts
@@ -15,8 +15,8 @@ describe('scrapSingleUrl', () => {
const pageOptionsWithHtml: PageOptions = { includeHtml: true };
const pageOptionsWithoutHtml: PageOptions = { includeHtml: false };

- const resultWithHtml = await scrapeSingleUrl("TEST", url, pageOptionsWithHtml);
- const resultWithoutHtml = await scrapeSingleUrl("TEST", url, pageOptionsWithoutHtml);
+ const resultWithHtml = await scrapeSingleUrl(url, pageOptionsWithHtml);
+ const resultWithoutHtml = await scrapeSingleUrl(url, pageOptionsWithoutHtml);

expect(resultWithHtml.html).toBeDefined();
expect(resultWithoutHtml.html).toBeUndefined();
@@ -27,7 +27,7 @@ it('should return a list of links on the firecrawl.ai page', async () => {
const url = 'https://flutterbricks.com';
const pageOptions: PageOptions = { includeHtml: true };

- const result = await scrapeSingleUrl("TEST", url, pageOptions);
+ const result = await scrapeSingleUrl(url, pageOptions);

// Check if the result contains a list of links
expect(result.linksOnPage).toBeDefined();
22 changes: 11 additions & 11 deletions apps/api/src/scraper/WebScraper/crawler.ts
@@ -218,17 +218,17 @@ export class WebCrawler {
return fullUrl;
}

- Logger.debug(
- `Link filtered out: ${fullUrl} with tests: isInternalLink: ${this.isInternalLink(
- fullUrl
- )}, allowExternalLinks: ${
- this.allowExternalLinks
- }, isSocialMediaOrEmail: ${this.isSocialMediaOrEmail(
- fullUrl
- )}, matchesExcludes: ${this.matchesExcludes(
- fullUrl
- )}, matchesIncludes: ${this.matchesIncludes(fullUrl)}`
- );
+ // Logger.debug(
+ //   `Link filtered out: ${fullUrl} with tests: isInternalLink: ${this.isInternalLink(
+ //     fullUrl
+ //   )}, allowExternalLinks: ${
+ //     this.allowExternalLinks
+ //   }, isSocialMediaOrEmail: ${this.isSocialMediaOrEmail(
+ //     fullUrl
+ //   )}, matchesExcludes: ${this.matchesExcludes(
+ //     fullUrl
+ //   )}, matchesIncludes: ${this.matchesIncludes(fullUrl)}`
+ // );
return null;
}

8 changes: 0 additions & 8 deletions apps/api/src/scraper/WebScraper/index.ts
@@ -30,14 +30,6 @@ export class WebScraperDataProvider {
private crawlerMode: string = "default";
private allowExternalLinks: boolean = false;

- authorize(): void {
- throw new Error("Method not implemented.");
- }
-
- authorizeNango(): Promise<void> {
- throw new Error("Method not implemented.");
- }
-
private async convertUrlsToDocuments(
urls: string[],
inProgress?: (progress: Progress) => void,
17 changes: 10 additions & 7 deletions apps/api/src/scraper/WebScraper/single_url.ts
@@ -20,10 +20,8 @@ export const callWebhook = async (
metadata: any,
scrapeId?: string
) => {
- let retries = 0;
- while (retries < 3) {
- retries++;
-
+ let retryCount = 0;
+ while (retryCount < 3) {
try {
await axios.post(
webhookUrl,
@@ -44,11 +42,11 @@
break;
} catch (error) {
Logger.debug(
- `Error sending webhook to ${webhookUrl} for scrape ID: ${scrapeId}, retry ${
- retries + 1
- }`
+ `Error sending webhook to ${webhookUrl} for scrape ID: ${scrapeId}, retry ${retryCount}`
);
}
+
+ retryCount++;
}
};

@@ -318,7 +316,12 @@
}

if (webhookUrl) {
+ Logger.debug(
+ `Sending webhook for scrape ID ${scrapeId} to ${webhookUrl}`
+ );
await callWebhook(webhookUrl, document, webhookMetadata, scrapeId);
+ } else {
+ Logger.debug(`No webhook URL provided, skipping webhook`);
}

return document;
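On the receiving side, delivery is a plain HTTP POST with up to three attempts (the request body is elided in the hunk above, but the call site passes the scraped document, the caller's `webhookMetadata`, and the scrape ID). A minimal Express-style receiver sketch under that assumption — the route path and body field names are guesses, not the documented contract:

```ts
import express from "express";

const app = express();
app.use(express.json({ limit: "10mb" })); // scraped documents can be large

// Hypothetical receiver; the body shape is inferred from the
// callWebhook(webhookUrl, document, webhookMetadata, scrapeId) call site.
app.post("/hooks/firecrawl", (req, res) => {
  const { data, metadata, scrapeId } = req.body ?? {};
  console.log(`received scrape ${scrapeId}`, metadata);
  res.sendStatus(200); // reply 2xx promptly; failures are retried up to 3 times
});

app.listen(3000);
```

The retry rewrite also fixes an off-by-one in the log message: the old loop incremented `retries` before the attempt and then logged `retries + 1`, so the first failure was reported as retry 2; `retryCount` now increments after the attempt and is logged as-is.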
1 change: 0 additions & 1 deletion docker-compose.yaml
@@ -13,7 +13,6 @@ x-common-service: &common-service
- BULL_AUTH_KEY=${BULL_AUTH_KEY}
- TEST_API_KEY=${TEST_API_KEY}
- HOST=${HOST:-0.0.0.0}
- - SELF_HOSTED_WEBHOOK_URL=${SELF_HOSTED_WEBHOOK_URL}
- LOGGING_LEVEL=${LOGGING_LEVEL}
extra_hosts:
- "host.docker.internal:host-gateway"
