From c025fd8b5a6ed6695e5724e12bbefabadf202262 Mon Sep 17 00:00:00 2001 From: Justin Duke Date: Sat, 14 Sep 2024 09:33:04 -0400 Subject: [PATCH] Finish flushing out 'affiliations' abstraction --- app/api/domains/[domain]/route.ts | 61 ++-- app/domain/[domain]/page.tsx | 285 ++++++++--------- lib/affiliations/loaders/osspledge.ts | 23 ++ lib/affiliations/loaders/tranco.ts | 28 +- lib/affiliations/loaders/ycombinator.ts | 60 ++-- lib/affiliations/registry.ts | 33 +- lib/affiliations/types.ts | 4 +- lib/data.ts | 103 ++++--- lib/db/types.ts | 39 ++- lib/loaders/html.ts | 105 ++++--- lib/loaders/tranco.ts | 25 -- lib/parsers/affiliations.ts | 23 ++ lib/parsers/dns.ts | 254 +++++++-------- lib/parsers/headers.ts | 38 ++- lib/parsers/html.ts | 391 ++++++++++++------------ lib/parsers/types.ts | 13 +- lib/utils.ts | 38 ++- scripts/refresh-affiliation.ts | 72 +++++ 18 files changed, 886 insertions(+), 709 deletions(-) create mode 100644 lib/affiliations/loaders/osspledge.ts delete mode 100644 lib/loaders/tranco.ts create mode 100644 lib/parsers/affiliations.ts create mode 100644 scripts/refresh-affiliation.ts diff --git a/app/api/domains/[domain]/route.ts b/app/api/domains/[domain]/route.ts index f915cd4..fec60ea 100644 --- a/app/api/domains/[domain]/route.ts +++ b/app/api/domains/[domain]/route.ts @@ -2,31 +2,48 @@ import fetch from "@/lib/data"; import { reify } from "@/lib/db/domains"; const SOCIAL_MEDIA_SERVICES = [ - "facebook", - "twitter", - "instagram", - "linkedin", - "youtube", - "github", + "facebook", + "twitter", + "instagram", + "linkedin", + "youtube", + "github", ]; export async function GET( - request: Request, - context: { - params: { - domain: string; - }; - } + request: Request, + context: { + params: { + domain: string; + }; + }, ) { - const rawResponse = await fetch(context.params.domain); - await reify(context.params.domain, rawResponse); + const rawResponse = await fetch(context.params.domain); + await reify(context.params.domain, rawResponse); + console.log(rawResponse.detected_technologies); - return Response.json({ - domain: context.params.domain, - records: rawResponse.data.filter((datum) => datum.label === "DNS").flatMap((datum) => datum.data), - ranking: rawResponse.data.find((datum) => datum.label === "Tranco")?.data[0]?.value, - services: rawResponse.detected_technologies.filter((technology) => technology.identifier !== "subdomain").map((technology) => technology.identifier).sort(), - subdomains: rawResponse.detected_technologies.filter((technology) => technology.identifier === "subdomain").map((technology) => technology.metadata.value).sort(), - social_media: Object.fromEntries(SOCIAL_MEDIA_SERVICES.map(service => [service, rawResponse.detected_technologies.find((note) => note.identifier === service)?.metadata.username])) - }); + return Response.json({ + domain: context.params.domain, + records: rawResponse.data + .filter((datum) => datum.label === "DNS") + .flatMap((datum) => datum.data), + ranking: rawResponse.data.find((datum) => datum.label === "Tranco")?.data[0] + ?.value, + services: rawResponse.detected_technologies + .filter((technology) => technology.identifier !== "subdomain") + .map((technology) => technology.identifier) + .sort(), + subdomains: rawResponse.detected_technologies + .filter((technology) => technology.identifier === "subdomain") + .map((technology) => technology.metadata.value) + .sort(), + social_media: Object.fromEntries( + SOCIAL_MEDIA_SERVICES.map((service) => [ + service, + rawResponse.detected_technologies.find( + (note) => note.identifier === service, + )?.metadata.username, + ]), + ), + }); } diff --git a/app/domain/[domain]/page.tsx b/app/domain/[domain]/page.tsx index 5383e99..f69aca4 100644 --- a/app/domain/[domain]/page.tsx +++ b/app/domain/[domain]/page.tsx @@ -1,171 +1,174 @@ import DomainIcon from "@/components/DomainIcon"; import Grid from "@/components/Grid"; import SectionHeader from "@/components/SectionHeader"; +import { REGISTRY as AFFILIATIONS_REGISTRY } from "@/lib/affiliations/registry"; import fetch from "@/lib/data"; import { reify } from "@/lib/db/domains"; import { GENRE_REGISTRY, REGISTRY } from "@/lib/services"; -import { Metadata, ResolvingMetadata } from "next"; +import type { Metadata, ResolvingMetadata } from "next"; type Props = { - params: { domain: string }; + params: { domain: string }; }; const SOCIAL_MEDIA_URL_TEMPLATES: { [key: string]: string } = { - twitter: "https://twitter.com/", - linkedin: "https://linkedin.com/in/", - facebook: "https://facebook.com/", - instagram: "https://instagram.com/", - youtube: "https://youtube.com/", - tiktok: "https://tiktok.com/@", - bluesky: "https://bsky.social/", - github: "https://github.com/", + twitter: "https://twitter.com/", + linkedin: "https://linkedin.com/in/", + facebook: "https://facebook.com/", + instagram: "https://instagram.com/", + youtube: "https://youtube.com/", + tiktok: "https://tiktok.com/@", + bluesky: "https://bsky.social/", + github: "https://github.com/", }; const generateURLForSocialMedia = ( - service: string, - username: string + service: string, + username: string, ): string => { - const template = SOCIAL_MEDIA_URL_TEMPLATES[service]; - return template ? `${template}${username}` : ""; + const template = SOCIAL_MEDIA_URL_TEMPLATES[service]; + return template ? `${template}${username}` : ""; }; export async function generateMetadata( - { params }: Props, - parent: ResolvingMetadata + { params }: Props, + parent: ResolvingMetadata, ): Promise { - return { - title: params.domain + " - shovel.report", - description: - "Information about " + - params.domain + - " and its DNS records, technologies, social media and more.", - alternates: { - canonical: `/domain/${params.domain}`, - }, - }; + return { + title: `${params.domain} - shovel.report`, + description: `Information about ${params.domain} and its DNS records, technologies, social media and more.`, + alternates: { + canonical: `/domain/${params.domain}`, + }, + }; } function formatJson(json: string) { - try { - return { - valid: true, - value: JSON.stringify(JSON.parse(json || "{}"), null, 2), - }; - } catch { - return { valid: false, value: json }; - } + try { + return { + valid: true, + value: JSON.stringify(JSON.parse(json || "{}"), null, 2), + }; + } catch { + return { valid: false, value: json }; + } } export default async function Page({ - params, + params, }: { - params: { - domain: string; - }; + params: { + domain: string; + }; }) { - const data = await fetch(params.domain); - if (!process.env.DISABLE_DATABASE) { - await reify(params.domain, data); - } + const data = await fetch(params.domain); + if (!process.env.DISABLE_DATABASE) { + await reify(params.domain, data); + } - const jsonld = data.detected_technologies.find( - (datum) => datum.identifier === "jsonld" - )?.metadata.value; - const formattedJsonLd = formatJson(jsonld ?? "{}"); + const jsonld = data.detected_technologies.find( + (datum) => datum.identifier === "jsonld", + )?.metadata.value; + const formattedJsonLd = formatJson(jsonld ?? "{}"); - return ( -
-

- - - {params.domain} - -

- DNS Records - - - {data.data - .filter((datum) => datum.label === "DNS") - .flatMap((datum) => - datum.data.map((record) => ( - - - - - )) - )} - -
{record.type}{record.value}
- Tranco ranking -
    - {data.data - .filter((datum) => datum.label === "Tranco") - .flatMap((datum) => - datum.data.map((record) => ( -
  • #{record.value}
  • - )) - )} -
      No Tranco record found
    -
- - {data.detected_technologies - .filter((datum) => datum.identifier === "subdomain") - .map((note, i) => ( - - {note.metadata.value} - - ))} - - - {data.detected_technologies - .filter((datum) => datum.identifier !== "subdomain") - .filter((note) => REGISTRY[note.identifier]) - .map((note, i) => ( - -
{REGISTRY[note.identifier]?.name}
-
- {GENRE_REGISTRY[REGISTRY[note.identifier]?.genre].name} -
-
- ))} -
- - {data.detected_technologies - .filter((note) => REGISTRY[note.identifier]?.genre === "social_media") - .map((note, i) => ( - -
{note.metadata.username}
-
- {REGISTRY[note.identifier]?.name} -
-
- ))} -
- {jsonld && ( - <> - JSON+LD -
-            {formattedJsonLd.value}
-          
- {!formattedJsonLd.valid &&

(this JSON isn't valid)

} - - )} -
- ); + return ( +
+

+ + + {params.domain} + +

+ DNS Records + + + {data.data + .filter((datum) => datum.label === "DNS") + .flatMap((datum) => + datum.data.map((record) => ( + + + + + )), + )} + +
{record.type}{record.value}
+ + {data.detected_technologies + .filter((datum) => datum.identifier in AFFILIATIONS_REGISTRY) + .map((affiliation, i) => ( + + {AFFILIATIONS_REGISTRY[affiliation.identifier].name} + + ))} + + + {data.detected_technologies + .filter((datum) => datum.identifier === "subdomain") + .map((note, i) => ( + + {note.metadata.value} + + ))} + + + {data.detected_technologies + .filter((datum) => datum.identifier !== "subdomain") + .filter((note) => REGISTRY[note.identifier]) + .map((note, i) => ( + +
{REGISTRY[note.identifier]?.name}
+
+ {GENRE_REGISTRY[REGISTRY[note.identifier]?.genre].name} +
+
+ ))} +
+ + {data.detected_technologies + .filter((note) => REGISTRY[note.identifier]?.genre === "social_media") + .map((note, i) => ( + +
{note.metadata.username}
+
+ {REGISTRY[note.identifier]?.name} +
+
+ ))} +
+ {jsonld && ( + <> + JSON+LD +
+						{formattedJsonLd.value}
+					
+ {!formattedJsonLd.valid &&

(this JSON isn't valid)

} + + )} +
+ ); } diff --git a/lib/affiliations/loaders/osspledge.ts b/lib/affiliations/loaders/osspledge.ts new file mode 100644 index 0000000..22d7d08 --- /dev/null +++ b/lib/affiliations/loaders/osspledge.ts @@ -0,0 +1,23 @@ +import { extractDomain } from "../../utils"; +import type { Affiliation } from "../types"; + +const OSSPLEDGE_URL = + "https://raw.githubusercontent.com/opensourcepledge/osspledge.com/main/members.csv"; + +export default async function* load(): AsyncGenerator { + const response = await fetch(OSSPLEDGE_URL); + const data = await response.text(); + const lines = data + .split("\n") + .map((line) => line.split(",")[1]) + .filter((l) => l !== undefined); + + for (const url of lines) { + yield { + domain: extractDomain(url), + metadata: { + url: url, + }, + }; + } +} diff --git a/lib/affiliations/loaders/tranco.ts b/lib/affiliations/loaders/tranco.ts index b19c602..2b99dba 100644 --- a/lib/affiliations/loaders/tranco.ts +++ b/lib/affiliations/loaders/tranco.ts @@ -1,15 +1,21 @@ -import { Affiliation } from "../types"; +import type { Affiliation } from "../types"; const TRANCO_URL = "https://tranco-list.eu/download/KJ94W/1000000"; -export default async function load(): Promise { - const response = await fetch(TRANCO_URL); - const data = await response.text(); - const lines = data.split('\n').filter(line => line.trim() !== '').map(line => line.split(',')); - return lines.map(([rank, domain]) => ({ - identifier: domain, - metadata: { - rank: rank, - }, - })); +export default async function* load(): AsyncGenerator { + const response = await fetch(TRANCO_URL); + const data = await response.text(); + const lines = data + .split("\n") + .filter((line) => line.trim() !== "") + .map((line) => line.split(",")); + + for (const [rank, domain] of lines) { + yield { + domain, + metadata: { + rank: rank, + }, + }; + } } diff --git a/lib/affiliations/loaders/ycombinator.ts b/lib/affiliations/loaders/ycombinator.ts index 0a1b5a8..2506733 100644 --- a/lib/affiliations/loaders/ycombinator.ts +++ b/lib/affiliations/loaders/ycombinator.ts @@ -1,34 +1,40 @@ import { XMLParser } from "fast-xml-parser"; -import { Affiliation } from "../types"; +import { extractDomain } from "../../utils"; +import type { Affiliation } from "../types"; const SITEMAP_URL = "https://www.ycombinator.com/companies/sitemap.xml"; -export default async function load(): Promise { - const response = await fetch(SITEMAP_URL); - const data = await response.text(); - const parser = new XMLParser(); - const result = parser.parse(data); - const relevantURLs = result.urlset.url.filter((url: { loc: string }) => url.loc.endsWith(".com") && !url.loc.includes("/industry/")); - const affiliations: Affiliation[] = []; - for (const url of relevantURLs) { - try { - const companyResponse = await fetch(url.loc); - const companyHtml = await companyResponse.text(); - const hrefMatch = companyHtml.match(/href="([^"]*)"[^>]*class="[^"]*mb-2[^"]*whitespace-nowrap[^"]*"/); +export default async function* load(): AsyncGenerator { + const response = await fetch(SITEMAP_URL); + const data = await response.text(); + const parser = new XMLParser(); + const result = parser.parse(data); + const relevantURLs = result.urlset.url.filter( + (url: { loc: string }) => !url.loc.includes("/industry/"), + ); - if (hrefMatch && hrefMatch[1]) { - affiliations.push({ - identifier: hrefMatch[1], - metadata: { - source: 'ycombinator', - originalUrl: url.loc - } - }); - } - } catch (error) { - console.error(`Error fetching ${url.loc}:`, error); - } - } + for (const url of relevantURLs) { + try { + const companyResponse = await fetch(url.loc); + const companyHtml = await companyResponse.text(); + const hrefMatch = companyHtml.match( + /href="([^"]*)"[^>]*class="[^"]*mb-2[^"]*whitespace-nowrap[^"]*"/, + ); - return affiliations; + if (hrefMatch?.[1]) { + if (hrefMatch[1] === "https://") { + continue; + } + + yield { + domain: extractDomain(hrefMatch[1]), + metadata: { + originalUrl: url.loc, + }, + }; + } + } catch (error) { + console.error(`Error fetching ${url.loc}:`, error); + } + } } diff --git a/lib/affiliations/registry.ts b/lib/affiliations/registry.ts index 05882f0..f94bc63 100644 --- a/lib/affiliations/registry.ts +++ b/lib/affiliations/registry.ts @@ -1,16 +1,31 @@ +import osspledge from "./loaders/osspledge"; import tranco from "./loaders/tranco"; -import { Affiliation } from "./types"; +import ycombinator from "./loaders/ycombinator"; +import type { Affiliation } from "./types"; type RegisteredAffiliation = { - identifier: string; - name: string; - load: () => Promise; + identifier: string; + name: string; + domain?: string; + load: () => AsyncGenerator; }; export const REGISTRY: { [key in string]: RegisteredAffiliation } = { - "tranco": { - identifier: "tranco", - name: "Tranco", - load: tranco, - }, + tranco: { + identifier: "tranco", + name: "Tranco", + load: tranco, + }, + ycombinator: { + identifier: "ycombinator", + name: "Y Combinator", + load: ycombinator, + domain: "ycombinator.com", + }, + osspledge: { + identifier: "osspledge", + name: "OSS Pledge", + load: osspledge, + domain: "osspledge.com", + }, }; diff --git a/lib/affiliations/types.ts b/lib/affiliations/types.ts index 8bae348..f18e8a0 100644 --- a/lib/affiliations/types.ts +++ b/lib/affiliations/types.ts @@ -1,4 +1,4 @@ export type Affiliation = { - identifier: string; - metadata: Record; + domain: string; + metadata: Record; }; diff --git a/lib/data.ts b/lib/data.ts index 22a3c63..d0208d4 100644 --- a/lib/data.ts +++ b/lib/data.ts @@ -1,70 +1,77 @@ import dns from "@/lib/loaders/dns"; import dns_prefix from "@/lib/loaders/dns_prefix"; import html from "@/lib/loaders/html"; -import tranco from "@/lib/loaders/tranco"; +import affiliations from "@/lib/parsers/affiliations"; import records from "@/lib/parsers/dns"; import headers from "@/lib/parsers/headers"; import htmlRecords from "@/lib/parsers/html"; import { unique } from "@/lib/utils"; import pino from "pino"; -import { Loader, RecordGroup } from "./loaders/types"; -import { DetectedTechnology } from "./parsers/types"; +import type { Loader, RecordGroup } from "./loaders/types"; +import type { DetectedTechnology } from "./parsers/types"; -const LOADERS = [dns, html, dns_prefix, tranco]; -const PARSERS = [records, htmlRecords, headers]; +const LOADERS = [dns, html, dns_prefix]; +const PARSERS = [records, htmlRecords, headers, affiliations]; const logger = pino({ - level: process.env.PINO_LEVEL || "warn", + level: process.env.PINO_LEVEL || "warn", }); const load = async ( - domain: string, - loader: { - load: Loader; - name: string; - } + domain: string, + loader: { + load: Loader; + name: string; + }, ) => { - logger.info({ message: "loader.started", domain, loader: loader.name }); - const data = await loader.load(domain); - logger.info({ message: "loader.ended", domain, loader: loader.name }); - return data; + logger.info({ message: "loader.started", domain, loader: loader.name }); + const data = await loader.load(domain); + logger.info({ message: "loader.ended", domain, loader: loader.name }); + return data; }; -const fetch = async (domain: string): Promise<{ - domain: string; - data: RecordGroup[]; - detected_technologies: DetectedTechnology[]; +const fetch = async ( + domain: string, +): Promise<{ + domain: string; + data: RecordGroup[]; + detected_technologies: DetectedTechnology[]; }> => { - const data = [ - ...(await Promise.all(LOADERS.map((loader) => load(domain, loader)))), - { - label: "URL", - data: [ - { - value: `${domain}`, - type: "text/url", - }, - ], - }, - ]; + const data = [ + ...(await Promise.all(LOADERS.map((loader) => load(domain, loader)))), + { + label: "URL", + data: [ + { + value: `${domain}`, + type: "text/url", + }, + ], + }, + ]; - const detected_technologies = PARSERS.flatMap((parser) => parser.parse(data)); - return { - domain, - data: unique(data), - detected_technologies: [ - ...unique(detected_technologies, (n) => n.identifier === "subdomain" ? n.metadata.value : n.identifier), - ...data - .filter((d) => d.label === "SERVICE") - .flatMap((d) => d.data) - .map((d) => { - return { - identifier: d.type, - metadata: {}, - }; - }), - ], - }; + const detected_technologies = ( + await Promise.all(PARSERS.map((parser) => parser.parse(domain, data))) + ).flat(); + + return { + domain, + data: unique(data), + detected_technologies: [ + ...unique(detected_technologies, (n) => + n.identifier === "subdomain" ? n.metadata.value : n.identifier, + ), + ...data + .filter((d) => d.label === "SERVICE") + .flatMap((d) => d.data) + .map((d) => { + return { + identifier: d.type, + metadata: {}, + }; + }), + ], + }; }; export default fetch; diff --git a/lib/db/types.ts b/lib/db/types.ts index 60ffc71..7781f73 100644 --- a/lib/db/types.ts +++ b/lib/db/types.ts @@ -1,28 +1,34 @@ -import { ColumnType, Insertable, JSONColumnType, Selectable } from "kysely"; +import type { + ColumnType, + Insertable, + JSONColumnType, + Selectable, +} from "kysely"; export interface Database { - domains: DomainTable; - detected_technologies: DetectedTechnologyTable; - tranco: TrancoTable; + domains: DomainTable; + detected_technologies: DetectedTechnologyTable; + affiliations: AffiliationTable; } -export interface TrancoTable { - ranking: number; - domain: string; - creation_date: ColumnType; +export interface AffiliationTable { + domain: string; + identifier: string; + metadata: JSONColumnType; + creation_date: ColumnType; } export interface DomainTable { - domain: string; - data: JSONColumnType; - creation_date: ColumnType; + domain: string; + data: JSONColumnType; + creation_date: ColumnType; } export interface DetectedTechnologyTable { - domain: string; - technology: string; - data: JSONColumnType; - creation_date: ColumnType; + domain: string; + technology: string; + data: JSONColumnType; + creation_date: ColumnType; } export type Domain = Selectable; @@ -31,4 +37,5 @@ export type NewDomain = Insertable; export type DetectedTechnology = Selectable; export type NewDetectedTechnology = Insertable; -export type Tranco = Selectable; +export type Affiliation = Selectable; +export type NewAffiliation = Insertable; diff --git a/lib/loaders/html.ts b/lib/loaders/html.ts index 370eb81..26596c0 100644 --- a/lib/loaders/html.ts +++ b/lib/loaders/html.ts @@ -1,59 +1,58 @@ import puppeteer from "puppeteer"; -import { Loader } from "./types"; +import type { Loader } from "./types"; const load: Loader = async (domain: string) => { - try { - if (process.env.DISABLE_PUPPETEER !== "true") { - const browser = await puppeteer.launch(); - const page = await browser.newPage(); - const response = await page.goto(`https://${domain}`, { - waitUntil: "networkidle0", - }); - const html = await page.content(); - const headers = await response?.headers(); - await browser.close(); - return { - label: "HTML", - data: [ - { - value: html, - type: "text/html", - }, - ...Object.entries(headers || {}).map(([key, value]) => ({ - value: value || "", - type: `text/headers/${key}`, - })), - ], - }; - } else { - const response = await fetch(`https://${domain}`); - const html = await response.text(); - const headers = response.headers; - return { - label: "HTML", - data: [ - { - value: html, - type: "text/html", - }, - ...Object.entries(headers).map(([key, value]) => ({ - value: value[0] || "", - type: `text/headers/${key}`, - })), - ], - }; - } - } catch (error) { - return { - label: "HTML", - data: [ - { - value: "Error loading HTML", - type: "text/error", - }, - ], - }; - } + try { + if (process.env.DISABLE_PUPPETEER !== "true") { + const browser = await puppeteer.launch(); + const page = await browser.newPage(); + const response = await page.goto(`https://${domain}`, { + waitUntil: "networkidle0", + }); + const html = await page.content(); + const headers = await response?.headers(); + await browser.close(); + return { + label: "HTML", + data: [ + { + value: html, + type: "text/html", + }, + ...Object.entries(headers || {}).map(([key, value]) => ({ + value: value || "", + type: `text/headers/${key}`, + })), + ], + }; + } + const response = await fetch(`https://${domain}`); + const html = await response.text(); + const headers = response.headers; + return { + label: "HTML", + data: [ + { + value: html, + type: "text/html", + }, + ...Object.entries(headers).map(([key, value]) => ({ + value: value[0] || "", + type: `text/headers/${key}`, + })), + ], + }; + } catch (error) { + return { + label: "HTML", + data: [ + { + value: "Error loading HTML", + type: "text/error", + }, + ], + }; + } }; const exports = { load, name: "html" }; diff --git a/lib/loaders/tranco.ts b/lib/loaders/tranco.ts deleted file mode 100644 index 282efc1..0000000 --- a/lib/loaders/tranco.ts +++ /dev/null @@ -1,25 +0,0 @@ - -import { db } from "@/lib/db/connection"; -import { Loader } from "./types"; - -const load: Loader = async (domain: string) => { - if (process.env.DISABLE_DATABASE === "true") { - return { - label: "Tranco", - data: [], - }; - } - - const tranco = await db.selectFrom("tranco").where("domain", "=", domain).selectAll().executeTakeFirst(); - - return { - label: "Tranco", - data: tranco ? [{ - value: tranco.ranking.toString(), - type: "text/number", - }] : [], - }; -}; - -const exports = { load, name: "tranco" }; -export default exports; diff --git a/lib/parsers/affiliations.ts b/lib/parsers/affiliations.ts new file mode 100644 index 0000000..afb3021 --- /dev/null +++ b/lib/parsers/affiliations.ts @@ -0,0 +1,23 @@ +import { db } from "@/lib/db/connection"; +import type { RecordGroup } from "../loaders/types"; +import type { Parser } from "./types"; + +const parse: Parser = async (domain: string, data: RecordGroup[]) => { + if (process.env.DISABLE_DATABASE === "true") { + return []; + } + + const affiliations = await db + .selectFrom("affiliations") + .where("domain", "=", domain) + .selectAll() + .execute(); + + return affiliations.map((affiliation) => ({ + identifier: affiliation.identifier, + metadata: affiliation.metadata, + })); +}; + +const exports = { parse }; +export default exports; diff --git a/lib/parsers/dns.ts b/lib/parsers/dns.ts index 9fbf323..9365ea3 100644 --- a/lib/parsers/dns.ts +++ b/lib/parsers/dns.ts @@ -1,157 +1,159 @@ -import { Record } from "../loaders/types"; +import type { Record } from "../loaders/types"; import { REGISTRY } from "../services"; -import { DetectedTechnology, Parser } from "./types"; +import type { DetectedTechnology, Parser } from "./types"; const NAMESERVER_RULE = (record: Record): DetectedTechnology[] => { - if (record.type !== "NS") { - return []; - } - return Object.values(REGISTRY).flatMap((service) => { - if (service.ns_values === undefined) { - return []; - } - if (record.value.includes(service.ns_values[0])) { - return [ - { - identifier: service.identifier, - metadata: { - genre: "nameserver", - }, - }, - ]; - } - return []; - }); + if (record.type !== "NS") { + return []; + } + return Object.values(REGISTRY).flatMap((service) => { + if (service.ns_values === undefined) { + return []; + } + if (record.value.includes(service.ns_values[0])) { + return [ + { + identifier: service.identifier, + metadata: { + genre: "nameserver", + }, + }, + ]; + } + return []; + }); }; const TXT_RULE = (record: Record): DetectedTechnology[] => { - if (record.type !== "TXT") { - return []; - } - return Object.values(REGISTRY).flatMap((service) => { - if (service.txt_values === undefined) { - return []; - } - if (service.txt_values.some((value) => record.value.includes(value))) { - return [ - { - identifier: service.identifier, - metadata: { - via: "TXT", - }, - }, - ]; - } - return []; - }); + if (record.type !== "TXT") { + return []; + } + return Object.values(REGISTRY).flatMap((service) => { + if (service.txt_values === undefined) { + return []; + } + if (service.txt_values.some((value) => record.value.includes(value))) { + return [ + { + identifier: service.identifier, + metadata: { + via: "TXT", + }, + }, + ]; + } + return []; + }); }; const MX_RULE = (record: Record): DetectedTechnology[] => { - if (record.type !== "MX") { - return []; - } - return Object.values(REGISTRY).flatMap((service) => { - if ( - (service.mx_values || []).some((value) => record.value.includes(value)) - ) { - return [ - { - identifier: service.identifier, - metadata: { - genre: "Mailserver", - }, - }, - ]; - } - return []; - }); + if (record.type !== "MX") { + return []; + } + return Object.values(REGISTRY).flatMap((service) => { + if ( + (service.mx_values || []).some((value) => record.value.includes(value)) + ) { + return [ + { + identifier: service.identifier, + metadata: { + genre: "Mailserver", + }, + }, + ]; + } + return []; + }); }; const CNAME_RULE = (record: Record): DetectedTechnology[] => { - if (record.type !== "CNAME") { - return []; - } - return Object.values(REGISTRY).flatMap((service) => { - if ( - (service.cname_values || []).some((value) => record.value.includes(value)) - ) { - return [ - { - identifier: service.identifier, - metadata: { - via: "CNAME", - }, - }, - ]; - } - return []; - }); + if (record.type !== "CNAME") { + return []; + } + return Object.values(REGISTRY).flatMap((service) => { + if ( + (service.cname_values || []).some((value) => record.value.includes(value)) + ) { + return [ + { + identifier: service.identifier, + metadata: { + via: "CNAME", + }, + }, + ]; + } + return []; + }); }; const extractURLsOrIPsFromSPF = (record: string): string[] => { - return record - .split(" ") - .filter((part) => part.includes("include:") || part.includes("ip4:")) - .map((part) => part.split(":")[1]) - .map( - (part) => - Object.values(REGISTRY).find((s) => - s.spf_values?.some((v) => - v.includes("*") ? part.includes(v.replace("*", "")) : v === part - ) - )?.identifier || part - ); + return record + .split(" ") + .filter((part) => part.includes("include:") || part.includes("ip4:")) + .map((part) => part.split(":")[1]) + .map( + (part) => + Object.values(REGISTRY).find((s) => + s.spf_values?.some((v) => + v.includes("*") ? part.includes(v.replace("*", "")) : v === part, + ), + )?.identifier || part, + ); }; const isIPAddress = (value: string): boolean => { - // Catch both 127.0.0.1 _and_ 127.0.0.1/17. - return value.match(/^(?:[0-9]{1,3}\.){3}[0-9]{1,3}(?:\/[0-9]{1,2})?$/) - ? true - : false; + // Catch both 127.0.0.1 _and_ 127.0.0.1/17. + return ( + value.match(/^(?:[0-9]{1,3}\.){3}[0-9]{1,3}(?:\/[0-9]{1,2})?$/) !== null + ); }; const SPF_RULE = (record: Record): DetectedTechnology[] => { - if (record.type !== "TXT") { - return []; - } - if (record.value.startsWith("v=spf1")) { - return extractURLsOrIPsFromSPF(record.value).flatMap((value) => { - if (isIPAddress(value)) { - return []; - } - return [ - { - identifier: value, - metadata: { - via: "SPF", - }, - }, - ]; - }); - } - return []; + if (record.type !== "TXT") { + return []; + } + if (record.value.startsWith("v=spf1")) { + return extractURLsOrIPsFromSPF(record.value).flatMap((value) => { + if (isIPAddress(value)) { + return []; + } + return [ + { + identifier: value, + metadata: { + via: "SPF", + }, + }, + ]; + }); + } + return []; }; const RULES = [NAMESERVER_RULE, MX_RULE, SPF_RULE, CNAME_RULE, TXT_RULE]; const filterToUnique = (values: DetectedTechnology[]): DetectedTechnology[] => { - const seen = new Set(); - return values.filter((value) => { - const key = JSON.stringify(value); - if (seen.has(key)) { - return false; - } - seen.add(key); - return true; - }); + const seen = new Set(); + return values.filter((value) => { + const key = JSON.stringify(value); + if (seen.has(key)) { + return false; + } + seen.add(key); + return true; + }); }; -const parse: Parser = (data) => { - return filterToUnique( - data - .filter((datum) => datum.label === "DNS") - .flatMap((datum) => RULES.flatMap((rule) => datum.data.flatMap(rule))) - ); +const parse: Parser = (domain, data) => { + return Promise.resolve( + filterToUnique( + data + .filter((datum) => datum.label === "DNS") + .flatMap((datum) => RULES.flatMap((rule) => datum.data.flatMap(rule))), + ), + ); }; const exports = { parse }; export default exports; diff --git a/lib/parsers/headers.ts b/lib/parsers/headers.ts index 25ab3a7..5876783 100644 --- a/lib/parsers/headers.ts +++ b/lib/parsers/headers.ts @@ -1,19 +1,29 @@ +import type { RecordGroup } from "../loaders/types"; import { REGISTRY } from "../services"; -import { Parser } from "./types"; +import type { Parser } from "./types"; -const parse: Parser = (data) => { - return data - .flatMap((datum) => datum.data) - .flatMap((d) => { - const servicesWithHeaders = Object.values(REGISTRY).filter((service) => service.headers); - return servicesWithHeaders.filter((service) => d.type.includes(service.headers?.key || '') && (service.headers?.value === '*' || d.value.includes(service.headers?.value || ''))) - }) - .map((service) => ({ - identifier: service.identifier, - metadata: { - via: "headers", - }, - })); +const parse: Parser = (domain: string, data: RecordGroup[]) => { + return Promise.resolve( + data + .flatMap((datum) => datum.data) + .flatMap((d) => { + const servicesWithHeaders = Object.values(REGISTRY).filter( + (service) => service.headers, + ); + return servicesWithHeaders.filter( + (service) => + d.type.includes(service.headers?.key || "") && + (service.headers?.value === "*" || + d.value.includes(service.headers?.value || "")), + ); + }) + .map((service) => ({ + identifier: service.identifier, + metadata: { + via: "headers", + }, + })), + ); }; const exports = { parse }; diff --git a/lib/parsers/html.ts b/lib/parsers/html.ts index 5d282cd..4e1a0df 100644 --- a/lib/parsers/html.ts +++ b/lib/parsers/html.ts @@ -1,226 +1,225 @@ import { parse as parseHTML } from "node-html-parser"; import { REGISTRY } from "../services"; -import { DetectedTechnology, Parser } from "./types"; +import type { DetectedTechnology, Parser } from "./types"; const GENERIC_SOCIAL_MEDIA_PROVIDER = (html: string) => { - const socialMediaProviders = Object.values(REGISTRY).filter( - (service) => service.genre === "social_media" - ); - const potentialMatches = socialMediaProviders.filter((provider) => - provider.urlSubstrings?.some((substring) => html.includes(substring)) - ); - return potentialMatches - .flatMap((service) => - service.urlSubstrings?.map((s) => { - return { - identifier: service.identifier, - substring: s, - }; - }) - ) - .flatMap((potentialMatch) => { - const match = html.match( - new RegExp( - `href=["']https?://(www\.)?${potentialMatch?.substring}/([^/"^%]+?)/?["']` - ) - ); - if (match) { - const username = match[match.length - 1]; - return [ - { - identifier: potentialMatch?.identifier, - metadata: { - username: username.split("?")[0], - }, - }, - ]; - } - return []; - }); + const socialMediaProviders = Object.values(REGISTRY).filter( + (service) => service.genre === "social_media", + ); + const potentialMatches = socialMediaProviders.filter((provider) => + provider.urlSubstrings?.some((substring) => html.includes(substring)), + ); + return potentialMatches + .flatMap((service) => + service.urlSubstrings?.map((s) => { + return { + identifier: service.identifier, + substring: s, + }; + }), + ) + .flatMap((potentialMatch) => { + const match = html.match( + new RegExp( + `href=["']https?://(www\.)?${potentialMatch?.substring}/([^/"^%]+?)/?["']`, + ), + ); + if (match) { + const username = match[match.length - 1]; + return [ + { + identifier: potentialMatch?.identifier, + metadata: { + username: username.split("?")[0], + }, + }, + ]; + } + return []; + }); }; const TWITTER_RULE = (html: string) => { - // Match on ` and pull out the username. - // Make sure to avoid matching on twitter.com/intent. - const match = html.match(/href="https:\/\/twitter.com\/([^\/"]+)"/); - if (match) { - const username = match[1]; - // Also remove query parameters from the username. - const usernameWithoutQuery = username.split("?")[0]; - return [ - { - identifier: "twitter", - metadata: { username: usernameWithoutQuery }, - }, - ]; - } + // Match on ` and pull out the username. + // Make sure to avoid matching on twitter.com/intent. + const match = html.match(/href="https:\/\/twitter.com\/([^\/"]+)"/); + if (match) { + const username = match[1]; + // Also remove query parameters from the username. + const usernameWithoutQuery = username.split("?")[0]; + return [ + { + identifier: "twitter", + metadata: { username: usernameWithoutQuery }, + }, + ]; + } - // Also check for `rel="me"` links that have twitter in them, like: - // - const match2 = html.match( - / + const match2 = html.match( + / { - // Match on ` and pull out the username. - const match = html.match(/ and pull out the username. + const match = html.match(/ { - const tag = parseHTML(html).querySelector( - "script[type='application/ld+json']" - ); - if (tag) { - const text = tag.text; - const baseRule = [ - { - identifier: "jsonld", - metadata: { value: text }, - }, - ...((() => { - try { - return JSON.parse(text) - } catch (error) { - console.error("Error parsing JSON-LD:", error); - return {}; - } - })() - ["@graph"]?.filter((i: { sameAs: string[] }) => i.sameAs) - .flatMap((i: any) => { - return i.sameAs.flatMap((url: string) => { - const service = Object.values(REGISTRY).find((service) => - url.includes(service.urlSubstrings?.[0] || "") - ); - if (!service) { - return []; - } - return [ - { - identifier: service.identifier.split("?")[0], - metadata: { - username: url.split("/").pop(), - }, - }, - ]; - }); - }) || []), - ]; - return baseRule; - } - return []; + const tag = parseHTML(html).querySelector( + "script[type='application/ld+json']", + ); + if (tag) { + const text = tag.text; + const baseRule = [ + { + identifier: "jsonld", + metadata: { value: text }, + }, + ...((() => { + try { + return JSON.parse(text); + } catch (error) { + console.error("Error parsing JSON-LD:", error); + return {}; + } + })() + ["@graph"]?.filter((i: { sameAs: string[] }) => i.sameAs) + .flatMap((i: any) => { + return i.sameAs.flatMap((url: string) => { + const service = Object.values(REGISTRY).find((service) => + url.includes(service.urlSubstrings?.[0] || ""), + ); + if (!service) { + return []; + } + return [ + { + identifier: service.identifier.split("?")[0], + metadata: { + username: url.split("/").pop(), + }, + }, + ]; + }); + }) || []), + ]; + return baseRule; + } + return []; }; const RSS_RULE = (html: string): DetectedTechnology[] => { - const tag = parseHTML(html).querySelector("link[type='application/rss+xml']"); - if (tag) { - const href = tag.getAttribute("href") || ""; - return [ - { - identifier: "rss", - metadata: { url: href }, - }, - ]; - } + const tag = parseHTML(html).querySelector("link[type='application/rss+xml']"); + if (tag) { + const href = tag.getAttribute("href") || ""; + return [ + { + identifier: "rss", + metadata: { url: href }, + }, + ]; + } - const tag2 = parseHTML(html).querySelector("a[href*='feed.xml']"); - if (tag2) { - const href = tag2.getAttribute("href") || ""; - return [ - { - identifier: "rss", - metadata: { url: href }, - }, - ]; - } + const tag2 = parseHTML(html).querySelector("a[href*='feed.xml']"); + if (tag2) { + const href = tag2.getAttribute("href") || ""; + return [ + { + identifier: "rss", + metadata: { url: href }, + }, + ]; + } - return []; + return []; }; const SUBDOMAIN_RULE = (html: string, domain: string) => { - const subdomains = parseHTML(html) - .querySelectorAll("a") - .map((a) => ({ - value: a.getAttribute("href"), - })) - .filter( - (v) => - v.value && - v.value.startsWith("http") && - new URL(v.value).hostname.includes(domain) && - new URL(v.value).hostname !== "www." + domain && - new URL(v.value).hostname !== domain - ) - .map((v) => ({ - value: new URL(v.value || "").hostname, - })) - .filter((v, i, a) => a.findIndex((t) => t.value === v.value) === i); - return subdomains.map((subdomain) => ({ - // Subdomains aren't a technology, but it's kind of a weird case. We do need - // a better abstraction here, though. - identifier: "subdomain", - metadata: { - value: subdomain.value, - }, - })); + const subdomains = parseHTML(html) + .querySelectorAll("a") + .map((a) => ({ + value: a.getAttribute("href"), + })) + .filter( + (v) => + v.value && + v.value.startsWith("http") && + new URL(v.value).hostname.includes(domain) && + new URL(v.value).hostname !== "www." + domain && + new URL(v.value).hostname !== domain, + ) + .map((v) => ({ + value: new URL(v.value || "").hostname, + })) + .filter((v, i, a) => a.findIndex((t) => t.value === v.value) === i); + return subdomains.map((subdomain) => ({ + // Subdomains aren't a technology, but it's kind of a weird case. We do need + // a better abstraction here, though. + identifier: "subdomain", + metadata: { + value: subdomain.value, + }, + })); }; const RULES: ((html: string, domain: string) => DetectedTechnology[])[] = [ - ...Object.values(REGISTRY).map((service) => { - return (html: string, domain: string) => { - const potentialMatches = service.substrings?.filter((substring) => - html.includes(substring) - ); - return ( - potentialMatches?.map(() => { - return { - identifier: service.identifier, - metadata: { - value: service.identifier, - via: "URL", - }, - }; - }) || [] - ); - }; - }), - TWITTER_RULE, - GENERIC_SOCIAL_MEDIA_PROVIDER, - EMAIL_ADDRESS_RULE, - RSS_RULE, - JSONLD_RULE, - SUBDOMAIN_RULE, + ...Object.values(REGISTRY).map((service) => { + return (html: string, domain: string) => { + const potentialMatches = service.substrings?.filter((substring) => + html.includes(substring), + ); + return ( + potentialMatches?.map(() => { + return { + identifier: service.identifier, + metadata: { + value: service.identifier, + via: "URL", + }, + }; + }) || [] + ); + }; + }), + TWITTER_RULE, + GENERIC_SOCIAL_MEDIA_PROVIDER, + EMAIL_ADDRESS_RULE, + RSS_RULE, + JSONLD_RULE, + SUBDOMAIN_RULE, ]; -const parse: Parser = (data) => { - const domain = data.find((datum) => datum.label === "URL")?.data[0].value; - const html = data.find((datum) => datum.label === "HTML")?.data[0].value; - if (!domain || !html) { - return []; - } - return RULES.flatMap((rule) => rule(html, domain)); +const parse: Parser = (domain, data) => { + const html = data.find((datum) => datum.label === "HTML")?.data[0].value; + if (!domain || !html) { + return Promise.resolve([]); + } + return Promise.resolve(RULES.flatMap((rule) => rule(html, domain))); }; const exports = { parse }; export default exports; diff --git a/lib/parsers/types.ts b/lib/parsers/types.ts index f367e35..fd19595 100644 --- a/lib/parsers/types.ts +++ b/lib/parsers/types.ts @@ -1,9 +1,12 @@ -import { RecordGroup } from "../loaders/types"; -import { REGISTRY } from "../services"; +import type { RecordGroup } from "../loaders/types"; +import type { REGISTRY } from "../services"; export type DetectedTechnology = { - identifier: keyof typeof REGISTRY; - metadata: Record; + identifier: keyof typeof REGISTRY; + metadata: Record; }; -export type Parser = (data: RecordGroup[]) => DetectedTechnology[]; +export type Parser = ( + domain: string, + data: RecordGroup[], +) => Promise; diff --git a/lib/utils.ts b/lib/utils.ts index 02ba83c..b55b182 100644 --- a/lib/utils.ts +++ b/lib/utils.ts @@ -1,18 +1,28 @@ const recursivelyStringify = (obj: T): string => { - const allKeys: Set = new Set(); - JSON.stringify(obj, (key, value) => { - allKeys.add(key); - return value; - }); - return JSON.stringify(obj, Array.from(allKeys).sort()); + const allKeys: Set = new Set(); + JSON.stringify(obj, (key, value) => { + allKeys.add(key); + return value; + }); + return JSON.stringify(obj, Array.from(allKeys).sort()); }; -export const unique = (arr: T[], keyFn?: (obj: T) => string) => { - // Objects can be complex, so we can't use Set here - return arr.filter( - (v, i, a) => - a.findIndex( - (t) => recursivelyStringify(keyFn ? keyFn(t) : t) === recursivelyStringify(keyFn ? keyFn(v) : v) - ) === i - ); +export const unique = ( + arr: T[], + keyFn?: (obj: T) => string, +) => { + // Objects can be complex, so we can't use Set here + return arr.filter( + (v, i, a) => + a.findIndex( + (t) => + recursivelyStringify(keyFn ? keyFn(t) : t) === + recursivelyStringify(keyFn ? keyFn(v) : v), + ) === i, + ); +}; + +export const extractDomain = (url: string) => { + const parsedUrl = new URL(url); + return parsedUrl.hostname.replace(/^www\./, ""); }; diff --git a/scripts/refresh-affiliation.ts b/scripts/refresh-affiliation.ts new file mode 100644 index 0000000..cb84e51 --- /dev/null +++ b/scripts/refresh-affiliation.ts @@ -0,0 +1,72 @@ +import { REGISTRY } from "../lib/affiliations/registry"; +import type { Affiliation } from "../lib/affiliations/types"; +import { db } from "../lib/db/connection"; + +async function refreshAffiliation(identifier: string) { + const affiliation = REGISTRY[identifier]; + + if (!affiliation) { + console.error(`Affiliation "${identifier}" not found in the registry.`); + process.exit(1); + } + + if (!affiliation.load) { + console.error( + `Affiliation "${identifier}" does not have a loader function.`, + ); + process.exit(1); + } + + try { + console.log(`Refreshing affiliation: ${identifier}`); + const generator = affiliation.load(); + let batch: Affiliation[] = []; + + for await (const result of generator) { + batch.push(result); + + if (batch.length === 10) { + await db + .insertInto("affiliations") + .values( + batch.map((item) => ({ + domain: item.domain, + identifier, + metadata: JSON.stringify(item.metadata), + creation_date: new Date().toISOString(), + })), + ) + .execute(); + batch = []; + } + } + + // Insert any remaining items + if (batch.length > 0) { + await db + .insertInto("affiliations") + .values( + batch.map((item) => ({ + domain: item.domain, + identifier, + metadata: JSON.stringify(item.metadata), + creation_date: new Date().toISOString(), + })), + ) + .execute(); + } + } catch (error) { + console.error(`Error refreshing affiliation ${identifier}:`, error); + process.exit(1); + } +} + +// Check if an affiliation name was provided as a command-line argument +const affiliationArg = process.argv[2]; + +if (!affiliationArg) { + console.error("Please provide an affiliation name as an argument."); + process.exit(1); +} + +refreshAffiliation(affiliationArg);