From f00c41ceb65fcb8be190c3d223c138d984bde9d3 Mon Sep 17 00:00:00 2001 From: Justin Duke Date: Mon, 2 Sep 2024 18:04:29 -0400 Subject: [PATCH] Fix bug with subdomain parsing --- justfile | 4 ++-- lib/data.test.ts | 20 ++++++++++++++++++++ lib/parsers/html.ts | 2 +- 3 files changed, 23 insertions(+), 3 deletions(-) diff --git a/justfile b/justfile index 7ee8052..d4d20ce 100644 --- a/justfile +++ b/justfile @@ -7,5 +7,5 @@ install: bootstrap: python3 scripts/bootstrap.py -test: - PINO_LEVEL=silent DISABLE_DATABASE=true DISABLE_PUPPETEER=true bun test +test *args: + PINO_LEVEL=silent DISABLE_DATABASE=true DISABLE_PUPPETEER=true bun test {{args}} diff --git a/lib/data.test.ts b/lib/data.test.ts index 637cb5f..37a7add 100644 --- a/lib/data.test.ts +++ b/lib/data.test.ts @@ -2,6 +2,17 @@ import fetch from "@/lib/data"; import { describe, expect, test } from "vitest"; import { DetectedTechnology } from "./parsers/types"; +const DOMAIN_TO_UNEXPECTED_DATA: Record = { + "changelog.com": [ + { + identifier: "subdomain", + metadata: { + value: "op3.dev", + }, + }, + ], +}; + const DOMAIN_TO_EXPECTED_DATA: Record = { "formkeep.com": [ { @@ -69,6 +80,15 @@ describe("fetching", () => { }); }); + Object.entries(DOMAIN_TO_UNEXPECTED_DATA).forEach(([domain, unexpectedData]) => { + unexpectedData.forEach((data) => { + test(`does not fetch ${data.identifier} for ${domain}`, async () => { + const { detected_technologies } = await fetch(domain); + expect(detected_technologies).not.toContainEqual(data); + }); + }); + }); + test("deduping identical records", async () => { const { detected_technologies } = await fetch("zed.dev"); expect(detected_technologies.filter((tech) => tech.identifier === "twitter")).toHaveLength(1); diff --git a/lib/parsers/html.ts b/lib/parsers/html.ts index 77bd728..367f5e9 100644 --- a/lib/parsers/html.ts +++ b/lib/parsers/html.ts @@ -160,7 +160,7 @@ const SUBDOMAIN_RULE = (html: string, domain: string) => { })) .filter( (v) => - v.value && v.value.startsWith("http") && v.value.includes(`.${domain}`) + v.value && v.value.startsWith("http") && new URL(v.value).hostname.includes(domain) ) .map((v) => ({ value: new URL(v.value || "").hostname,