feat: add support to opt-in for fuzzy search (#10378)

Fuzzy search is kept behind a flag because it is computationally intensive (see #5261).
Admins may opt in by setting the `[indexer].REPO_INDEXER_FUZZY_ENABLED` flag to true.

Closes #10331

Reviewed-on: https://codeberg.org/forgejo/forgejo/pulls/10378
Reviewed-by: Gusted <gusted@noreply.codeberg.org>
Co-authored-by: Shiny Nematoda <snematoda.751k2@aleeas.com>
Co-committed-by: Shiny Nematoda <snematoda.751k2@aleeas.com>
This commit is contained in:
Shiny Nematoda 2025-12-17 13:51:48 +01:00 committed by Gusted
parent 32429c0b13
commit cdc27b0d62
13 changed files with 175 additions and 52 deletions

View file

@ -39,6 +39,10 @@ const (
// llu:TrKeysSuffix search.
var GrepSearchOptions = [3]string{"exact", "union", "regexp"}
// String returns the entry of GrepSearchOptions located at index mode.
// A mode outside the bounds of GrepSearchOptions causes an index-out-of-range
// panic.
func (mode GrepMode) String() string {
	name := GrepSearchOptions[mode]
	return name
}
type GrepOptions struct {
RefName string
MaxResultLimit int

View file

@ -259,12 +259,16 @@ func (b *Indexer) Search(ctx context.Context, opts *internal.SearchOptions) (int
if opts.Mode == internal.CodeSearchModeUnion {
query := bleve.NewDisjunctionQuery()
for _, field := range strings.Fields(opts.Keyword) {
for field := range strings.FieldsSeq(opts.Keyword) {
query.AddQuery(inner_bleve.MatchPhraseQuery(field, "Content", repoIndexerAnalyzer, false, 1.0))
}
keywordQuery = query
} else {
keywordQuery = inner_bleve.MatchPhraseQuery(opts.Keyword, "Content", repoIndexerAnalyzer, false, 1.0)
keywordQuery = inner_bleve.MatchPhraseQuery(opts.Keyword,
"Content",
repoIndexerAnalyzer,
opts.Mode == internal.CodeSearchModeFuzzy,
1.0)
}
if len(opts.RepoIDs) > 0 {

View file

@ -335,11 +335,14 @@ func extractAggs(searchResult *elastic.SearchResult) []*internal.SearchResultLan
// Search searches for codes and language stats by given conditions.
func (b *Indexer) Search(ctx context.Context, opts *internal.SearchOptions) (int64, []*internal.SearchResult, []*internal.SearchResultLanguages, error) {
searchType := esMultiMatchTypePhrase
if opts.Mode == internal.CodeSearchModeUnion {
if opts.Mode == internal.CodeSearchModeUnion || opts.Mode == internal.CodeSearchModeFuzzy {
searchType = esMultiMatchTypeBestFields
}
kwQuery := elastic.NewMultiMatchQuery(opts.Keyword, "content").Type(searchType)
if opts.Mode == internal.CodeSearchModeFuzzy {
kwQuery = kwQuery.Fuzziness("AUTO")
}
query := elastic.NewBoolQuery()
query = query.Must(kwQuery)
if len(opts.RepoIDs) > 0 {

View file

@ -91,12 +91,23 @@ func index(ctx context.Context, indexer internal.Indexer, repoID int64) error {
return repo_model.UpdateIndexerStatus(ctx, repo, repo_model.RepoIndexerTypeCode, sha)
}
// setSearchOption adds val to, or removes val from, the package-level
// CodeSearchOptions list.
//
// When set is true, val is appended unless it is already present, so repeated
// calls never introduce duplicates. When set is false, the first occurrence of
// val is removed; if val is not present the list is left untouched.
func setSearchOption(set bool, val string) {
	// A single lookup answers both "is it present?" and "where is it?".
	i := slices.Index(CodeSearchOptions, val)
	switch {
	case set && i < 0:
		CodeSearchOptions = append(CodeSearchOptions, val)
	case !set && i >= 0:
		// slices.Delete shifts the tail down in place and zeroes the vacated
		// trailing slot, so no stale entry lingers in the backing array.
		CodeSearchOptions = slices.Delete(CodeSearchOptions, i, i+1)
	}
}
// Init initialize the repo indexer
func Init() {
if !setting.Indexer.RepoIndexerEnabled {
(*globalIndexer.Load()).Close()
return
}
setSearchOption(setting.Indexer.RepoIndexerEnableFuzzy, "fuzzy")
ctx, cancel, finished := process.GetManager().AddTypedContext(context.Background(), "Service: CodeIndexer", process.SystemProcessType, false)

View file

@ -116,6 +116,57 @@ func testIndexer(name string, t *testing.T, indexer internal.Indexer) {
})
}
t.Run("Fuzzy", func(t *testing.T) {
for _, kw := range []struct {
keyword string
ids []int64
}{
{
keyword: "reppo1", // should match repo1
ids: []int64{repoID},
},
{
keyword: "1", // must not be fuzzy match only repo1
ids: []int64{repoID},
},
{
keyword: "Description!", // should match "Description"
ids: []int64{repoID},
},
{
keyword: "escription", // should match "Description"
ids: []int64{repoID},
},
{
keyword: "form", // should match "for"
ids: []int64{repoID},
},
{
keyword: "invalid", // should not match anything
ids: []int64{},
},
} {
t.Run(kw.keyword, func(t *testing.T) {
_, res, _, err := indexer.Search(t.Context(), &internal.SearchOptions{
Keyword: kw.keyword,
Paginator: &db.ListOptions{
Page: 1,
PageSize: 10,
},
Mode: SearchModeFuzzy,
})
require.NoError(t, err)
ids := make([]int64, 0, len(res))
for _, hit := range res {
ids = append(ids, hit.RepoID)
}
assert.Equal(t, kw.ids, ids)
})
}
})
require.NoError(t, indexer.Delete(t.Context(), repoID))
})
}

View file

@ -25,13 +25,18 @@ type CodeSearchMode int
const (
CodeSearchModeExact CodeSearchMode = iota
CodeSearchModeUnion
CodeSearchModeFuzzy
)
func (mode CodeSearchMode) String() string {
if mode == CodeSearchModeUnion {
switch mode {
case CodeSearchModeFuzzy:
return "fuzzy"
case CodeSearchModeUnion:
return "union"
default:
return "exact"
}
return "exact"
}
type SearchOptions struct {

View file

@ -36,13 +36,14 @@ type SearchResultLanguages = internal.SearchResultLanguages
type SearchOptions = internal.SearchOptions
// llu:TrKeysSuffix search.
var CodeSearchOptions = [2]string{"exact", "union"}
var CodeSearchOptions = []string{"exact", "union", "fuzzy"}
type SearchMode = internal.CodeSearchMode
const (
SearchModeExact = internal.CodeSearchModeExact
SearchModeUnion = internal.CodeSearchModeUnion
SearchModeFuzzy = internal.CodeSearchModeFuzzy
)
func indices(content string, selectionStartIndex, selectionEndIndex int) (int, int) {

View file

@ -23,16 +23,17 @@ var Indexer = struct {
IssueIndexerName string
StartupTimeout time.Duration
RepoIndexerEnabled bool
RepoIndexerRepoTypes []string
RepoType string
RepoPath string
RepoConnStr string
RepoIndexerName string
MaxIndexerFileSize int64
IncludePatterns []Glob
ExcludePatterns []Glob
ExcludeVendored bool
RepoIndexerEnabled bool
RepoIndexerRepoTypes []string
RepoIndexerEnableFuzzy bool
RepoType string
RepoPath string
RepoConnStr string
RepoIndexerName string
MaxIndexerFileSize int64
IncludePatterns []Glob
ExcludePatterns []Glob
ExcludeVendored bool
}{
IssueType: "bleve",
IssuePath: "indexers/issues.bleve",
@ -40,14 +41,15 @@ var Indexer = struct {
IssueConnAuth: "",
IssueIndexerName: "gitea_issues",
RepoIndexerEnabled: false,
RepoIndexerRepoTypes: []string{"sources", "forks", "mirrors", "templates"},
RepoType: "bleve",
RepoPath: "indexers/repos.bleve",
RepoConnStr: "",
RepoIndexerName: "gitea_codes",
MaxIndexerFileSize: 1024 * 1024,
ExcludeVendored: true,
RepoIndexerEnabled: false,
RepoIndexerRepoTypes: []string{"sources", "forks", "mirrors", "templates"},
RepoIndexerEnableFuzzy: false,
RepoType: "bleve",
RepoPath: "indexers/repos.bleve",
RepoConnStr: "",
RepoIndexerName: "gitea_codes",
MaxIndexerFileSize: 1024 * 1024,
ExcludeVendored: true,
}
type Glob struct {
@ -87,6 +89,7 @@ func loadIndexerFrom(rootCfg ConfigProvider) {
Indexer.RepoIndexerEnabled = sec.Key("REPO_INDEXER_ENABLED").MustBool(false)
Indexer.RepoIndexerRepoTypes = strings.Split(sec.Key("REPO_INDEXER_REPO_TYPES").MustString("sources,forks,mirrors,templates"), ",")
Indexer.RepoIndexerEnableFuzzy = sec.Key("REPO_INDEXER_FUZZY_ENABLED").MustBool(false)
Indexer.RepoType = sec.Key("REPO_INDEXER_TYPE").MustString("bleve")
Indexer.RepoPath = filepath.ToSlash(sec.Key("REPO_INDEXER_PATH").MustString(filepath.ToSlash(filepath.Join(AppDataPath, "indexers/repos.bleve"))))
if !filepath.IsAbs(Indexer.RepoPath) {

View file

@ -100,6 +100,8 @@
"repo.issue_indexer.title": "Issue Indexer",
"search.milestone_kind": "Search milestones…",
"search.syntax": "Search syntax",
"search.fuzzy": "Fuzzy",
"search.fuzzy_tooltip": "Include results that are an approximate match to the search term",
"repo.settings.push_mirror.branch_filter.label": "Branch filter (optional)",
"repo.settings.push_mirror.branch_filter.description": "Branches to be mirrored. Leave blank to mirror all branches. See <a href=\"%[1]s\">%[2]s documentation</a> for syntax. Examples: <code>main, release/*</code>",
"incorrect_root_url": "This Forgejo instance is configured to be served on \"%s\". You are currently viewing Forgejo through a different URL, which may cause parts of the application to break. The canonical URL is controlled by Forgejo admins via the ROOT_URL setting in the app.ini.",

View file

@ -38,10 +38,14 @@ func Code(ctx *context.Context) {
path := ctx.FormTrim("path")
mode := code_indexer.SearchModeExact
if m := ctx.FormTrim("mode"); m == "union" ||
m == "fuzzy" ||
ctx.FormBool("fuzzy") {
if m := ctx.FormTrim("mode"); m == "union" {
mode = code_indexer.SearchModeUnion
} else if m == "fuzzy" || ctx.FormBool("fuzzy") {
if setting.Indexer.RepoIndexerEnableFuzzy {
mode = code_indexer.SearchModeFuzzy
} else {
mode = code_indexer.SearchModeUnion
}
}
ctx.Data["Keyword"] = keyword

View file

@ -22,13 +22,16 @@ type searchMode int
const (
ExactSearchMode searchMode = iota
UnionSearchMode
FuzzySearchMode
RegExpSearchMode
)
func searchModeFromString(s string) searchMode {
switch s {
case "fuzzy", "union":
case "union":
return UnionSearchMode
case "fuzzy":
return FuzzySearchMode
case "regexp":
return RegExpSearchMode
default:
@ -36,23 +39,13 @@ func searchModeFromString(s string) searchMode {
}
}
// String returns the textual name of the search mode: "exact", "union",
// "fuzzy" or "regexp". These are the same names searchModeFromString accepts
// for the non-default modes.
//
// It panics when m is not one of the declared searchMode constants.
func (m searchMode) String() string {
	switch m {
	case ExactSearchMode:
		return "exact"
	case UnionSearchMode:
		return "union"
	case FuzzySearchMode:
		// FuzzySearchMode is a declared constant that searchModeFromString
		// can return, so it must be named here instead of reaching the panic.
		return "fuzzy"
	case RegExpSearchMode:
		return "regexp"
	default:
		panic("cannot happen")
	}
}
// ToIndexer translates the mode into the code indexer's search mode.
//
// ExactSearchMode maps to SearchModeExact. FuzzySearchMode maps to
// SearchModeFuzzy only while setting.Indexer.RepoIndexerEnableFuzzy is true.
// Every other case, including fuzzy while the setting is off, yields
// SearchModeUnion.
func (m searchMode) ToIndexer() code_indexer.SearchMode {
	switch {
	case m == ExactSearchMode:
		return code_indexer.SearchModeExact
	case m == FuzzySearchMode && setting.Indexer.RepoIndexerEnableFuzzy:
		return code_indexer.SearchModeFuzzy
	default:
		return code_indexer.SearchModeUnion
	}
}
@ -83,7 +76,6 @@ func Search(ctx *context.Context) {
ctx.Data["Keyword"] = keyword
ctx.Data["Language"] = language
ctx.Data["CodeSearchPath"] = path
ctx.Data["CodeSearchMode"] = mode.String()
ctx.Data["PageIsViewCode"] = true
ctx.Data["CodeIndexerDisabled"] = !setting.Indexer.RepoIndexerEnabled
if setting.Indexer.RepoIndexerEnabled {
@ -106,11 +98,14 @@ func Search(ctx *context.Context) {
var searchResults []*code_indexer.Result
var searchResultLanguages []*code_indexer.SearchResultLanguages
if setting.Indexer.RepoIndexerEnabled {
m := mode.ToIndexer()
ctx.Data["CodeSearchMode"] = m.String()
var err error
total, searchResults, searchResultLanguages, err = code_indexer.PerformSearch(ctx, &code_indexer.SearchOptions{
RepoIDs: []int64{ctx.Repo.Repository.ID},
Keyword: keyword,
Mode: mode.ToIndexer(),
Mode: m,
Language: language,
Filename: path,
Paginator: &db.ListOptions{
@ -128,11 +123,14 @@ func Search(ctx *context.Context) {
ctx.Data["CodeIndexerUnavailable"] = !code_indexer.IsAvailable(ctx)
}
} else {
m := mode.ToGitGrep()
ctx.Data["CodeSearchMode"] = m.String()
res, err := git.GrepSearch(ctx, ctx.Repo.GitRepo, keyword, git.GrepOptions{
ContextLineNumber: 1,
RefName: ctx.Repo.RefName,
Filename: path,
Mode: mode.ToGitGrep(),
Mode: m,
})
if err != nil {
ctx.ServerError("GrepSearch", err)

View file

@ -42,10 +42,14 @@ func CodeSearch(ctx *context.Context) {
path := ctx.FormTrim("path")
mode := code_indexer.SearchModeExact
if m := ctx.FormTrim("mode"); m == "union" ||
m == "fuzzy" ||
ctx.FormBool("fuzzy") {
if m := ctx.FormTrim("mode"); m == "union" {
mode = code_indexer.SearchModeUnion
} else if m == "fuzzy" || ctx.FormBool("fuzzy") {
if setting.Indexer.RepoIndexerEnableFuzzy {
mode = code_indexer.SearchModeFuzzy
} else {
mode = code_indexer.SearchModeUnion
}
}
ctx.Data["Keyword"] = keyword

View file

@ -4,6 +4,7 @@ import (
"net/http"
"testing"
code_indexer "forgejo.org/modules/indexer/code"
"forgejo.org/modules/setting"
"forgejo.org/modules/test"
"forgejo.org/tests"
@ -16,11 +17,43 @@ func TestExploreCodeSearchIndexer(t *testing.T) {
defer tests.PrepareTestEnv(t)()
defer test.MockVariableValue(&setting.Indexer.RepoIndexerEnabled, true)()
req := NewRequest(t, "GET", "/explore/code?q=file&fuzzy=true")
resp := MakeRequest(t, req, http.StatusOK)
doc := NewHTMLParser(t, resp.Body).Find(".explore")
t.Run("Exact", func(t *testing.T) {
req := NewRequest(t, "GET", "/explore/code?q=file&mode=exact")
resp := MakeRequest(t, req, http.StatusOK)
doc := NewHTMLParser(t, resp.Body).Find(".explore")
doc.Find(".file-body").Each(func(i int, sel *goquery.Selection) {
assert.Positive(t, sel.Find(".code-inner").Find(".search-highlight").Length(), 0)
active, ok := doc.Find("[data-test-tag=fuzzy-dropdown] .active input").Attr("value")
assert.True(t, ok)
assert.Equal(t, "exact", active)
doc.Find(".file-body").Each(func(i int, sel *goquery.Selection) {
assert.Positive(t, sel.Find(".code-inner").Find(".search-highlight").Length())
})
})
t.Run("Fuzzy", func(t *testing.T) {
defer test.MockVariableValue(&setting.Indexer.RepoIndexerEnableFuzzy, true)()
code_indexer.CodeSearchOptions = []string{"exact", "union", "fuzzy"} // usually set by Init
req := NewRequest(t, "GET", "/explore/code?q=file&mode=fuzzy")
resp := MakeRequest(t, req, http.StatusOK)
doc := NewHTMLParser(t, resp.Body).Find(".explore")
active, ok := doc.Find("[data-test-tag=fuzzy-dropdown] .active input").Attr("value")
assert.True(t, ok)
assert.Equal(t, "fuzzy", active)
})
t.Run("No Fuzzy", func(t *testing.T) {
defer test.MockVariableValue(&setting.Indexer.RepoIndexerEnableFuzzy, false)()
code_indexer.CodeSearchOptions = []string{"exact", "union"} // usually set by Init
req := NewRequest(t, "GET", "/explore/code?q=file&mode=fuzzy")
resp := MakeRequest(t, req, http.StatusOK)
doc := NewHTMLParser(t, resp.Body).Find(".explore")
active, ok := doc.Find("[data-test-tag=fuzzy-dropdown] .active input").Attr("value")
assert.True(t, ok)
assert.Equal(t, "union", active)
})
}