Skip to content

Commit

Permalink
add docsearch config for backup
Browse files Browse the repository at this point in the history
  • Loading branch information
mashehu committed Dec 12, 2023
1 parent 0d9e386 commit 9b93b5b
Showing 1 changed file with 145 additions and 0 deletions.
145 changes: 145 additions & 0 deletions docsearch.crawler.config.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,145 @@
////////////////////////////////////////////////////
//❗❗❗ manually synced might be out of date ❗❗❗//
//❗❗❗ API keys not included in this file ❗❗❗//
////////////////////////////////////////////////////

// This file is used to configure the DocSearch crawler.
// For more information, see https://docsearch.algolia.com/docs/what-is-docsearch

new Crawler({
rateLimit: 8, // Maximum number of requests per second
maxDepth: 10, // Maximum depth of crawling
maxUrls: 10000, // Maximum number of URLs to crawl
startUrls: ["https://nf-co.re/"], // URLs to start crawling from
renderJavaScript: false, // Whether to render JavaScript on crawled pages
sitemaps: ["https://nf-co.re/sitemap-index.xml"], // Sitemaps to crawl
ignoreCanonicalTo: false, // Whether to ignore canonical URLs
discoveryPatterns: ["https://nf-co.re/**"], // URL patterns to discover new pages
exclusionPatterns: [
"https://nf-co.re/**/results/**", // URL patterns to exclude from crawling
"https://nf-co.re/**/latest/**",
"https://nf-co.re/launch/**",
"https://nf-co.re/**/*.html",
],
schedule: "every 1 day at 10:00 am", // Schedule for crawling
actions: [
{
indexName: "nf-co", // Index name for storing crawled data
pathsToMatch: ["https://nf-co.re/**"], // URL patterns to match for indexing
recordExtractor: ({ $, helpers }) => {
const version = $("meta[name='docsearch:version']").attr("content"); // Extract version from meta tag
let page_type =
$("meta[name='docsearch:page_type']").attr("content") || ""; // Extract page type from meta tag
let metaPageRank =
Number($("meta[name='docsearch:page_rank']").attr("content")) || 0; // Extract page rank from meta tag
if (page_type === "Pipeline") {
if (version && version.includes("latest")) { // don't crawl older pipeline releases
const lvl3Placeholder = $("title").text().includes(": Introduction") // get topic entries as level 3 data, but only for the pipeline index pages
? [".mainpage-heading .topics .topic"]
: ""; // Placeholder for level 3 data
return helpers.docsearch({
recordProps: {
lvl0: {
selectors: "",
defaultValue: page_type,
},
lvl1: ["title"], // Extract level 1 data
lvl2: [".mainpage-heading .lead p"], // Extract level 2 data
lvl3: lvl3Placeholder, // Extract level 3 data
content: [".markdown-content p"], // Extract content data
pageRank:
(version && version.includes("latest") ? 100 : 0) +
metaPageRank, // Calculate page rank
},
aggregateContent: true, // Whether to aggregate content from different levels
recordVersion: "v3", // Version of the record
});
}
} else {
const pageTypeRankMap = {
Docs: 70,
Tools: 60,
Modules: 50,
Subworkflows: 40,
};

let pageTypeRank = pageTypeRankMap[page_type] || 0; // Get page type rank
metaPageRank = metaPageRank + pageTypeRank; // Calculate page rank
return helpers.docsearch({
recordProps: {
lvl0: {
selectors: "",
defaultValue: page_type,
},
lvl1: [".mainpage-heading h1 span"], // Extract page heading
lvl2: [".mainpage-heading .lead p"], // Extract the page description
lvl3: [".mainpage-heading .topics .topic"], // Extract the topics
lvl4: [".markdown-content h2"], // Extract headings in main text
content: [".markdown-content p"], // Extract content data
pageRank: metaPageRank, // Calculate page rank
},
aggregateContent: true, // Whether to aggregate content from different levels
recordVersion: "v3", // Version of the record
});
}
},
},
],
safetyChecks: { beforeIndexPublishing: { maxLostRecordsPercentage: 30 } }, // Safety checks before publishing index
initialIndexSettings: {
"nf-core": {
attributesForFaceting: ["filterOnly(page_type)", "filterOnly(version)"], // Attributes for faceting
attributesToRetrieve: [
"hierarchy",
"content",
"anchor",
"url",
"url_without_anchor",
"type",
"page_type",
"version",
], // Attributes to retrieve
attributesToHighlight: ["hierarchy", "content"], // Attributes to highlight
attributesToSnippet: ["content:10"], // Attributes to snippet
camelCaseAttributes: ["hierarchy", "content"], // Camel case attributes
searchableAttributes: [
"unordered(hierarchy.lvl0)",
"unordered(hierarchy.lvl1)",
"unordered(hierarchy.lvl2)",
"unordered(hierarchy.lvl3)",
"unordered(hierarchy.lvl4)",
"unordered(hierarchy.lvl5)",
"unordered(hierarchy.lvl6)",
"content",
], // Searchable attributes
distinct: true, // Whether to enable distinct
attributeForDistinct: "url", // Attribute for distinct
customRanking: [
"desc(weight.pageRank)",
"desc(weight.level)",
"asc(weight.position)",
], // Custom ranking
ranking: [
"words",
"filters",
"typo",
"attribute",
"proximity",
"exact",
"custom",
], // Ranking criteria
highlightPreTag: '<span class="algolia-docsearch-suggestion--highlight">', // Highlight pre tag
highlightPostTag: "</span>", // Highlight post tag
minWordSizefor1Typo: 3, // Minimum word size for 1 typo
minWordSizefor2Typos: 7, // Minimum word size for 2 typos
allowTyposOnNumericTokens: false, // Whether to allow typos on numeric tokens
minProximity: 1, // Minimum proximity
ignorePlurals: true, // Whether to ignore plurals
advancedSyntax: true, // Whether to enable advanced syntax
attributeCriteriaComputedByMinProximity: true, // Whether to compute attribute criteria by minimum proximity
removeWordsIfNoResults: "allOptional", // Remove words if no results
},
},
appId: "XXX", // Algolia App ID
apiKey: "XXX", // Algolia API Key
});

0 comments on commit 9b93b5b

Please sign in to comment.