Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat!: search improvements #10113

Open
wants to merge 6 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 42 additions & 0 deletions src/Docfx.Build.ManagedReference/FillMetadata.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.

using System.Collections.Immutable;
using System.Composition;
using Docfx.Build.Common;
using Docfx.DataContracts.ManagedReference;
using Docfx.Plugins;

namespace Docfx.Build.ManagedReference;

/// <summary>
/// Build step that copies the full name and summary of each article's first item
/// into the model's manifest properties and flags the model as a managed reference page.
/// </summary>
[Export(nameof(ManagedReferenceDocumentProcessor), typeof(IDocumentBuildStep))]
public class FillMetadata : BaseDocumentBuildStep
{
    public override string Name => nameof(FillMetadata);

    public override int BuildOrder => 0x30;

    /// <summary>
    /// Visits every model of type <see cref="DocumentType.Article"/> and fills its manifest properties.
    /// Models of any other type are left untouched.
    /// </summary>
    /// <param name="models">Models produced by the build.</param>
    /// <param name="host">Host service; not used by this step.</param>
    public override void Postbuild(ImmutableList<FileModel> models, IHostService host)
    {
        foreach (var fileModel in models)
        {
            if (fileModel.Type == DocumentType.Article)
            {
                FillManifestProperties(fileModel);
            }
        }
    }

    /// <summary>
    /// Clears the uid and, when the page has at least one item, records the
    /// managed-reference flag, title and summary taken from the first item.
    /// </summary>
    private static void FillManifestProperties(FileModel fileModel)
    {
        // The uid is reset for every article, including pages that turn out to have no items.
        fileModel.ManifestProperties.Uid = null;

        var page = (PageViewModel)fileModel.Content;
        if (page.Items.Count == 0)
        {
            return;
        }

        fileModel.ManifestProperties.IsMRef = true;

        var primaryItem = page.Items[0];
        fileModel.ManifestProperties.Title = primaryItem.FullName;
        fileModel.ManifestProperties.Summary = primaryItem.Summary;
    }
}
88 changes: 80 additions & 8 deletions src/Docfx.Build/PostProcessors/ExtractSearchIndex.cs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// Licensed to the .NET Foundation under one or more agreements.
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.

using System.Collections.Immutable;
Expand All @@ -17,6 +17,7 @@ namespace Docfx.Build.Engine;
class ExtractSearchIndex : IPostProcessor
{
private static readonly Regex s_regexWhiteSpace = new(@"\s+", RegexOptions.Compiled);
private static readonly Regex s_regexCase = new(@"[a-z0-9]+|[A-Z0-9]+[a-z0-9]*|[0-9]+", RegexOptions.Compiled);
private static readonly HashSet<string> s_htmlInlineTags = new(StringComparer.OrdinalIgnoreCase)
{
"a", "area", "del", "ins", "link", "map", "meta", "abbr", "audio", "b", "bdo", "button", "canvas", "cite", "code", "command", "data",
Expand All @@ -28,12 +29,20 @@ class ExtractSearchIndex : IPostProcessor
public string Name => nameof(ExtractSearchIndex);
public const string IndexFileName = "index.json";

internal bool UseMetadata { get; set; } = false;
internal bool UseMetadataTitle { get; set; } = true;

/// <summary>
/// Turns search on by default and reads the search-index options from the supplied metadata.
/// </summary>
/// <param name="metadata">Metadata handed to the post-processor.</param>
/// <returns>
/// The metadata, with <c>_enableSearch</c> added as <c>true</c> when the key was absent;
/// otherwise the instance that was passed in.
/// </returns>
public ImmutableDictionary<string, object> PrepareMetadata(ImmutableDictionary<string, object> metadata)
{
    if (!metadata.ContainsKey("_enableSearch"))
    {
        metadata = metadata.Add("_enableSearch", true);
    }

    // _searchIndexUseMetadata defaults to false, _searchIndexUseMetadataTitle defaults to true.
    UseMetadata = ReadFlag("_searchIndexUseMetadata", false);
    UseMetadataTitle = ReadFlag("_searchIndexUseMetadataTitle", true);

    Logger.LogInfo($"{Name}: {nameof(UseMetadata)} = {UseMetadata}, {nameof(UseMetadataTitle)} = {UseMetadataTitle}");
    return metadata;

    // Reads a boolean option. A missing key yields the fallback; a null or non-boolean
    // value is reported and also yields the fallback instead of throwing on unboxing.
    bool ReadFlag(string key, bool fallback)
    {
        if (!metadata.TryGetValue(key, out var raw))
        {
            return fallback;
        }

        if (raw is bool flag)
        {
            return flag;
        }

        Logger.LogWarning($"{Name}: metadata '{key}' is not a boolean, using default value {fallback}");
        return fallback;
    }
}

Expand All @@ -48,14 +57,15 @@ public Manifest Process(Manifest manifest, string outputFolder, CancellationToke
var htmlFiles = (from item in manifest.Files ?? Enumerable.Empty<ManifestItem>()
from output in item.Output
where item.Type != "Toc" && output.Key.Equals(".html", StringComparison.OrdinalIgnoreCase)
select output.Value.RelativePath).ToList();
select (output.Value.RelativePath, item.Metadata)).ToList();

if (htmlFiles.Count == 0)
{
return manifest;
}

Logger.LogInfo($"Extracting index data from {htmlFiles.Count} html files");
foreach (var relativePath in htmlFiles)
foreach ((string relativePath, Dictionary<string, object> metadata) in htmlFiles)
{
cancellationToken.ThrowIfCancellationRequested();

Expand All @@ -75,7 +85,7 @@ from output in item.Output
Logger.LogWarning($"Warning: Can't load content from {filePath}: {ex.Message}");
continue;
}
var indexItem = ExtractItem(html, relativePath);
var indexItem = ExtractItem(html, relativePath, metadata);
if (indexItem != null)
{
indexData[relativePath] = indexItem;
Expand All @@ -98,7 +108,7 @@ from output in item.Output
return manifest;
}

internal SearchIndexItem ExtractItem(HtmlDocument html, string href)
internal SearchIndexItem ExtractItem(HtmlDocument html, string href, Dictionary<string, object> metadata = null)
{
var contentBuilder = new StringBuilder();

Expand All @@ -116,10 +126,37 @@ internal SearchIndexItem ExtractItem(HtmlDocument html, string href)
ExtractTextFromNode(node, contentBuilder);
}

var content = NormalizeContent(contentBuilder.ToString());
var title = ExtractTitleFromHtml(html);
string title;
string summary = null;
string keywords = null;

var isMRef = metadata != null && metadata.TryGetValue("IsMRef", out var isMRefMetadata) && (bool)isMRefMetadata;
if (UseMetadata && isMRef)
{
title = UseMetadataTitle
? (string)metadata["Title"] ?? ExtractTitleFromHtml(html)
: ExtractTitleFromHtml(html);

var htmlSummary = (string)metadata["Summary"];
if (!string.IsNullOrEmpty(htmlSummary))
{
var htmlDocument = new HtmlDocument();
htmlDocument.LoadHtml(htmlSummary);
var htmlRootNode = htmlDocument.DocumentNode.FirstChild;
var summaryBuilder = new StringBuilder();
ExtractTextFromNode(htmlRootNode, summaryBuilder);
summary = NormalizeContent(summaryBuilder.ToString());
}

keywords = string.Join(' ', title.Split(' ').Select(word => string.Join(' ', GetStemAggregations(word.Split('.')[^1]))));
}
else
{
title = ExtractTitleFromHtml(html);
summary = NormalizeContent(contentBuilder.ToString());
}

return new SearchIndexItem { Href = href, Title = title, Keywords = content };
return new SearchIndexItem { Href = href, Title = title, Summary = summary, Keywords = keywords };
}

private static string ExtractTitleFromHtml(HtmlDocument html)
Expand All @@ -139,6 +176,41 @@ private static string NormalizeContent(string str)
return s_regexWhiteSpace.Replace(str, " ").Trim();
}

/// <summary>
/// HTML-decodes <paramref name="str"/> and splits it into the pieces matched by <c>s_regexCase</c>.
/// </summary>
/// <param name="str">Text to split; may be null or empty.</param>
/// <returns>
/// The matched pieces in order of appearance, or a single empty string when
/// <paramref name="str"/> is null or empty.
/// </returns>
private static string[] GetStems(string str)
{
    if (!string.IsNullOrEmpty(str))
    {
        var decoded = WebUtility.HtmlDecode(str);
        var pieces = new List<string>();
        foreach (Match match in s_regexCase.Matches(decoded))
        {
            pieces.Add(match.Value);
        }

        return pieces.ToArray();
    }

    return new[] { string.Empty };
}

/// <summary>
/// Builds every in-order combination of the stems of <paramref name="str"/>,
/// each combination concatenated without a separator.
/// </summary>
/// <param name="str">Text whose stems are combined.</param>
/// <returns>
/// The combinations in depth-first order; for stems A, B, C that is
/// A, AB, ABC, AC, B, BC, C. The list holds 2^n - 1 entries for n stems.
/// </returns>
private static List<string> GetStemAggregations(string str)
{
    string[] stems = GetStems(str);
    List<string> combinations = new();
    Extend(stems, string.Empty, 0, combinations);
    return combinations;

    // Appends each remaining stem to the prefix, records the result, then
    // extends that result with the stems that follow it.
    static void Extend(string[] parts, string prefix, int start, List<string> sink)
    {
        for (int next = start; next < parts.Length; next++)
        {
            string combined = prefix + parts[next];
            sink.Add(combined);
            Extend(parts, combined, next + 1, sink);
        }
    }
}

private static void ExtractTextFromNode(HtmlNode node, StringBuilder contentBuilder)
{
if (node == null)
Expand Down
14 changes: 12 additions & 2 deletions src/Docfx.Build/PostProcessors/SearchIndexItem.cs
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,10 @@ class SearchIndexItem
[JsonPropertyName("keywords")]
public string Keywords { get; set; }

[JsonProperty("summary")]
[JsonPropertyName("summary")]
public string Summary { get; set; }

public override bool Equals(object obj)
{
return Equals(obj as SearchIndexItem);
Expand All @@ -35,11 +39,17 @@ public bool Equals(SearchIndexItem other)
{
return true;
}
return string.Equals(Title, other.Title) && string.Equals(Href, other.Href) && string.Equals(Keywords, other.Keywords);
return string.Equals(Title, other.Title) &&
string.Equals(Href, other.Href) &&
string.Equals(Summary, other.Summary) &&
string.Equals(Keywords, other.Keywords);
}

/// <summary>
/// Computes a hash from the title, href, summary and keywords.
/// </summary>
/// <returns>A hash code that is equal for items whose four members are equal.</returns>
public override int GetHashCode()
{
    // HashCode.Combine accepts null members, whereas calling GetHashCode() on each
    // member directly throws when Summary or Keywords has not been set.
    return HashCode.Combine(Title, Href, Summary, Keywords);
}
}
5 changes: 3 additions & 2 deletions templates/default/src/search-worker.js
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@
var results = [];
hits.forEach(function (hit) {
var item = searchData[hit.ref];
results.push({ 'href': item.href, 'title': item.title, 'keywords': item.keywords });
results.push({ 'href': item.href, 'title': item.title, 'summary': item.summary, 'keywords': item.keywords });
});
postMessage({ e: 'query-ready', q: q, d: results });
}
Expand All @@ -51,7 +51,8 @@
this.pipeline.remove(lunr.stopWordFilter);
this.ref('href');
this.field('title', { boost: 50 });
this.field('keywords', { boost: 20 });
this.field('keywords', { boost: 40 });
this.field('summary', { boost: 20 });

for (var prop in searchData) {
if (searchData.hasOwnProperty(prop)) {
Expand Down
5 changes: 4 additions & 1 deletion templates/default/styles/docfx.js
Original file line number Diff line number Diff line change
Expand Up @@ -250,6 +250,9 @@ $(function () {
}

function extractContentBrief(content) {
if (!content) {
return
}
var briefOffset = 512;
var words = query.split(/\s+/g);
var queryIndex = content.indexOf(words[0]);
Expand Down Expand Up @@ -285,7 +288,7 @@ $(function () {
var itemRawHref = relativeUrlToAbsoluteUrl(currentUrl, relHref + hit.href);
var itemHref = relHref + hit.href + "?q=" + query;
var itemTitle = hit.title;
var itemBrief = extractContentBrief(hit.keywords);
var itemBrief = extractContentBrief(hit.summary || '');

var itemNode = $('<div>').attr('class', 'sr-item');
var itemTitleNode = $('<div>').attr('class', 'item-title').append($('<a>').attr('href', itemHref).attr("target", "_blank").attr("rel", "noopener noreferrer").text(itemTitle));
Expand Down
4 changes: 3 additions & 1 deletion templates/modern/src/search-worker.ts
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ import { get, set, createStore } from 'idb-keyval'
type SearchHit = {
href: string
title: string
summary: string
keywords: string
}

Expand Down Expand Up @@ -47,7 +48,8 @@ async function loadIndex({ lunrLanguages }: { lunrLanguages?: string[] }) {

this.ref('href')
this.field('title', { boost: 50 })
this.field('keywords', { boost: 20 })
this.field('keywords', { boost: 40 })
this.field('summary', { boost: 20 })

if (lunrLanguages && lunrLanguages.length > 0) {
this.use(lunr.multiLanguage(...lunrLanguages))
Expand Down
11 changes: 9 additions & 2 deletions templates/modern/src/search.ts
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import { classMap } from 'lit-html/directives/class-map.js'
type SearchHit = {
href: string
title: string
summary: string
keywords: string
}

Expand All @@ -34,6 +35,11 @@ export async function enableSearch() {
case 'index-ready':
searchQuery.disabled = false
searchQuery.addEventListener('input', onSearchQueryInput)
// Cancel the default action of the Enter key in the search box
// (presumably to stop an enclosing form from submitting; confirm against the markup).
searchQuery.addEventListener('keypress', function(e) {
  if (e.key === 'Enter') {
    // Use the handler's own argument rather than the deprecated global `event`.
    e.preventDefault()
  }
})
window.docfx.searchReady = true
break
case 'query-ready':
Expand All @@ -56,7 +62,8 @@ export async function enableSearch() {
if (query === '') {
document.body.removeAttribute('data-search')
} else {
worker.postMessage({ q: query })
const additiveQuery = query.replace(/\s+/g, ' ').split(' ').map(w => '+' + w).join(' ')
worker.postMessage({ q: additiveQuery })
}
}

Expand Down Expand Up @@ -108,7 +115,7 @@ export async function enableSearch() {
const currentUrl = window.location.href
const itemRawHref = relativeUrlToAbsoluteUrl(currentUrl, relHref + hit.href)
const itemHref = relHref + hit.href + '?q=' + query
const itemBrief = extractContentBrief(hit.keywords)
const itemBrief = hit.summary ? extractContentBrief(hit.summary) : ''

return html`
<div class="sr-item">
Expand Down
Loading