Skip to content
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion server/indexer/files.go
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,7 @@ func IndexFile(path string) error {
fileURL := files.PathToFileURL(absPath)

// Skip if already indexed with the same modification time
existing := GetByURL(fileURL)
existing := GetByURLAndUser(fileURL, 0)
if existing != nil && existing.Added == info.ModTime().Unix() {
return nil
}
Expand Down
128 changes: 122 additions & 6 deletions server/indexer/indexer.go
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,94 @@ type Query struct {
SemanticWeight float64 `json:"semantic_weight"`
PageKey string `json:"page_key"`
IncludeHTML bool `json:"include_html"`
cfg *config.Config
Facets bool `json:"facets,omitempty"`
// FacetTermSize overrides the default top-N cap for term facets
// (domain, language). Zero uses the default. Useful for completion
// callers that want to post-filter a larger pool by prefix.
FacetTermSize int `json:"facet_term_size,omitempty"`
// MatchAll bypasses the text-DSL builder and runs a match-all query.
// Combine with UserID / Facets / DateFrom / DateTo for cheap aggregate
// queries (e.g. completion sources). Text is ignored when set.
MatchAll bool `json:"match_all,omitempty"`
cfg *config.Config
}

// defaultFacetTermSize is the term-facet size addFacets falls back to when
// the requested size is zero or negative.
const defaultFacetTermSize = 10

// TermCount is a single term-facet bucket, as returned by Search when
// Query.Facets is true: one facet term together with its count. RangeCount
// is the analogous shape for named ranges.
type TermCount struct {
	Term  string `json:"term"`
	Count int    `json:"count"`
}

// RangeCount is a single named-range facet bucket: the range's name and the
// count reported for it.
type RangeCount struct {
	Name  string `json:"name"`
	Count int    `json:"count"`
}

// FacetsResult groups the facet buckets attached to search results: term
// buckets for domains and languages, and named-range buckets for the date
// histogram. Every field is omitted from the JSON encoding when empty.
type FacetsResult struct {
	Domains       []TermCount  `json:"domains,omitempty"`
	Languages     []TermCount  `json:"languages,omitempty"`
	DateHistogram []RangeCount `json:"date_histogram,omitempty"`
}

// dateFacetBuckets lists the named age cut-offs behind the "added"
// histogram, ordered from most recent to oldest. addFacets walks the list in
// this order and uses each entry's lower bound as the upper bound of the
// entry that follows, so the ranges do not overlap; a final "older" range is
// appended there rather than listed here. Do not reorder the entries.
var dateFacetBuckets = []struct {
	name string
	age  time.Duration
}{
	{name: "last_24h", age: 24 * time.Hour},
	{name: "last_7d", age: 7 * 24 * time.Hour},
	{name: "last_30d", age: 30 * 24 * time.Hour},
	{name: "last_year", age: 365 * 24 * time.Hour},
}

// addFacets registers three facet requests on req: "domains" over the domain
// field, "languages" over the language field, and "added", a numeric-range
// histogram over the added field. A non-positive termSize selects
// defaultFacetTermSize for the two term facets.
func addFacets(req *bleve.SearchRequest, termSize int) {
	if termSize < 1 {
		termSize = defaultFacetTermSize
	}
	req.AddFacet("domains", bleve.NewFacetRequest("domain", termSize))
	req.AddFacet("languages", bleve.NewFacetRequest("language", termSize))

	now := time.Now()
	histogram := bleve.NewFacetRequest("added", len(dateFacetBuckets)+1)

	// upper carries the previous bucket's lower bound forward so consecutive
	// ranges share a boundary; it is nil for the first (most recent) bucket.
	var upper *float64
	for i := range dateFacetBuckets {
		bucket := dateFacetBuckets[i]
		// lower is declared inside the loop so every range keeps a pointer
		// to its own value.
		lower := float64(now.Add(-bucket.age).Unix())
		histogram.AddNumericRange(bucket.name, &lower, upper)
		upper = &lower
	}
	// Catch-all range for anything past the last configured cut-off.
	histogram.AddNumericRange("older", nil, upper)
	req.AddFacet("added", histogram)
}

// extractTermFacet converts the term buckets of f into TermCount values,
// preserving the order in which f reports them. It returns nil when f is nil
// or carries no term data.
func extractTermFacet(f *search.FacetResult) []TermCount {
	if f == nil || f.Terms == nil {
		return nil
	}
	buckets := f.Terms.Terms()
	counts := make([]TermCount, len(buckets))
	for i := range buckets {
		counts[i] = TermCount{Term: buckets[i].Term, Count: buckets[i].Count}
	}
	return counts
}

// extractFacets builds a FacetsResult from the "domains", "languages" and
// "added" entries of facets. Missing entries leave the corresponding field
// empty; the returned pointer is never nil.
func extractFacets(facets search.FacetResults) *FacetsResult {
	var histogram []RangeCount
	if added := facets["added"]; added != nil {
		for _, bucket := range added.NumericRanges {
			histogram = append(histogram, RangeCount{Name: bucket.Name, Count: bucket.Count})
		}
	}
	return &FacetsResult{
		Domains:       extractTermFacet(facets["domains"]),
		Languages:     extractTermFacet(facets["languages"]),
		DateHistogram: histogram,
	}
}

// SemanticHit represents a document found via vector similarity search.
Expand All @@ -91,6 +178,7 @@ type Results struct {
PageKey string `json:"page_key"`
SemanticHits []SemanticHit `json:"semantic_hits,omitempty"`
SemanticEnabled bool `json:"semantic_enabled"`
Facets *FacetsResult `json:"facets,omitempty"`
}

type MultiBatch struct {
Expand Down Expand Up @@ -705,6 +793,10 @@ func Search(cfg *config.Config, q *Query) (*Results, error) {
}
}

if q.Facets {
addFacets(req, q.FacetTermSize)
}

res, err := i.idx.Search(req)
if err != nil {
return nil, err
Expand All @@ -722,6 +814,9 @@ func Search(cfg *config.Config, q *Query) (*Results, error) {
Query: q,
Documents: matches,
}
if q.Facets && len(res.Facets) > 0 {
r.Facets = extractFacets(res.Facets)
}
if len(res.Hits) > 0 {
lastHit := res.Hits[len(res.Hits)-1]
lastSort := lastHit.Sort
Expand Down Expand Up @@ -791,7 +886,13 @@ func Search(cfg *config.Config, q *Query) (*Results, error) {
MatchedChunk: truncateText(dh.chunkText, semanticTextPreviewLen),
}
// For semantic-only hits, populate the document with a truncated text preview.
d := GetByURL(docID)
// vectorStore.Search was scoped to q.UserID, so every docID
// here has that uid prefix (see document.GetDocID).
url := docID
if q.UserID > 0 {
url = strings.TrimPrefix(docID, fmt.Sprintf("%d:", q.UserID))
}
d := GetByURLAndUser(url, q.UserID)
if d != nil {
if _, inKeyword := keywordURLs[d.URL]; !inKeyword {
d.Text = truncateText(d.Text, semanticTextPreviewLen)
Expand All @@ -807,9 +908,20 @@ func Search(cfg *config.Config, q *Query) (*Results, error) {
return r, nil
}

func GetByURL(u string) *document.Document {
q := query.NewTermQuery(strings.ToLower(u))
q.SetField("url")
// GetByURLAndUser returns the document at u owned by uid. The url field is
// shared across owners in multi-user mode, so callers must pass their own
// UserID to avoid returning another user's copy of the same URL. A uid of 0
// adds no owner filter, so the lookup matches on url alone; it is intended
// for the global (single-user) owner.
func GetByURLAndUser(u string, uid uint) *document.Document {
urlQ := query.NewTermQuery(strings.ToLower(u))
urlQ.SetField("url")
var q query.Query = urlQ
if uid > 0 {
Comment thread
FlameFlag marked this conversation as resolved.
Outdated
f := float64(uid)
userQ := bleve.NewNumericRangeInclusiveQuery(&f, &f, new(true), new(true))
userQ.SetField("user_id")
q = bleve.NewConjunctionQuery(urlQ, userQ)
}
req := bleve.NewSearchRequest(q)
req.Fields = allFields
req.Highlight = bleve.NewHighlight()
Expand Down Expand Up @@ -897,7 +1009,11 @@ func docFromHit(h *search.DocumentMatch) *document.Document {

func (q *Query) create() query.Query {
var sq query.Query
sq = querybuilder.Build(q.Text)
if q.MatchAll {
sq = query.NewMatchAllQuery()
} else {
sq = querybuilder.Build(q.Text)
}

if q.DateFrom != 0 || q.DateTo != 0 {
if q.DateFrom != 0 && q.DateTo == 0 {
Expand Down
6 changes: 3 additions & 3 deletions server/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -996,7 +996,7 @@ func serveRules(c *webContext) {

func serveGet(c *webContext) {
u := c.Request.URL.Query().Get("url")
doc := indexer.GetByURL(u)
doc := indexer.GetByURLAndUser(u, c.UserID)
if doc == nil {
http.Error(c.Response, "document not found", http.StatusNotFound)
return
Expand All @@ -1011,7 +1011,7 @@ func serveGet(c *webContext) {

func servePreview(c *webContext) {
u := c.Request.URL.Query().Get("url")
doc := indexer.GetByURL(u)
doc := indexer.GetByURLAndUser(u, c.UserID)
if doc == nil {
serve500(c)
return
Expand Down Expand Up @@ -1397,7 +1397,7 @@ func serveBatch(c *webContext) {
results[i] = batchOpResult{Status: http.StatusBadRequest, Error: "missing url"}
continue
}
d := indexer.GetByURL(op.URL)
d := indexer.GetByURLAndUser(op.URL, c.UserID)
if d == nil {
results[i] = batchOpResult{Status: http.StatusNotFound, Error: "document not found"}
} else {
Expand Down