all repos — searchix @ b650b993e38f9919d2e65b732a5dee0267c71b84

Search engine for NixOS, nix-darwin, home-manager and NUR users

feat: split compound words in names into n-grams

Implements: https://todo.sr.ht/~alanpearce/searchix/9
Alan Pearce alan@alanpearce.eu
Tue, 18 Mar 2025 19:21:57 +0100
commit

b650b993e38f9919d2e65b732a5dee0267c71b84

parent

1e3868ef2bdedb72a9ec535489da9aeb8ae93c16

1 file changed, 27 insertions(+), 3 deletions(-)

jump to
M internal/index/indexer.go
@@ -16,13 +16,15 @@ "go.alanpearce.eu/x/log" 	"go.uber.org/zap"
 
 	"github.com/blevesearch/bleve/v2"
+	"github.com/blevesearch/bleve/v2/analysis/analyzer/custom"
 	"github.com/blevesearch/bleve/v2/analysis/analyzer/keyword"
 	"github.com/blevesearch/bleve/v2/analysis/analyzer/simple"
 	"github.com/blevesearch/bleve/v2/analysis/analyzer/web"
-	"github.com/blevesearch/bleve/v2/analysis/lang/en"
 	"github.com/blevesearch/bleve/v2/analysis/token/camelcase"
+	"github.com/blevesearch/bleve/v2/analysis/token/ngram"
 	"github.com/blevesearch/bleve/v2/analysis/token/porter"
 	"github.com/blevesearch/bleve/v2/analysis/tokenizer/letter"
+	"github.com/blevesearch/bleve/v2/analysis/tokenizer/unicode"
 	"github.com/blevesearch/bleve/v2/document"
 	"github.com/blevesearch/bleve/v2/mapping"
 	indexAPI "github.com/blevesearch/bleve_index_api"
@@ -58,6 +60,28 @@ descriptionFieldMapping.Store = false 	descriptionFieldMapping.Analyzer = web.Name
 
 	var err error
+
+	err = indexMapping.AddCustomTokenFilter("ngram", map[string]any{
+		"type": ngram.Name,
+		"min":  3.0,
+		"max":  25.0,
+	})
+	if err != nil {
+		return nil, errors.WithMessage(err, "failed to add ngram token filter")
+	}
+
+	err = indexMapping.AddCustomAnalyzer("c_name", map[string]any{
+		"type":      custom.Name,
+		"tokenizer": unicode.Name,
+		"token_filters": []string{
+			camelcase.Name,
+			"ngram",
+		},
+	})
+	if err != nil {
+		return nil, errors.WithMessage(err, "could not add custom analyser")
+	}
+
 	err = indexMapping.AddCustomAnalyzer("loc", map[string]any{
 		"type":      keyword.Name,
 		"tokenizer": letter.Name,
@@ -83,7 +107,7 @@ keywordFieldMapping := bleve.NewKeywordFieldMapping() 	keywordFieldMapping.Analyzer = simple.Name
 
 	nameMapping := bleve.NewTextFieldMapping()
-	nameMapping.Analyzer = en.AnalyzerName
+	nameMapping.Analyzer = "c_name"
 	nameMapping.IncludeTermVectors = true
 	nameMapping.Store = false
 
@@ -110,7 +134,7 @@ 	packageMapping := bleve.NewDocumentStaticMapping()
 
 	packageMapping.AddFieldMappingsAt("Name", nameMapping)
-	packageMapping.AddFieldMappingsAt("Attribute", keywordFieldMapping)
+	packageMapping.AddFieldMappingsAt("Attribute", nameMapping)
 	packageMapping.AddFieldMappingsAt("Source", keywordFieldMapping)
 	packageMapping.AddFieldMappingsAt("Description", descriptionFieldMapping)
 	packageMapping.AddFieldMappingsAt("MainProgram", keywordFieldMapping)