Skip to content

Commit

Permalink
Tackle the issue of XML files filtered as binaries in search results (#…
Browse files Browse the repository at this point in the history
…910)

When skipping a doc, we currently report the detected language as "binary" (if
it looks like binary) or "skipped" (if it's skipped for any other reason).
Skipped docs are still added to the index and can still be returned as search
results, for example if you only match on filename. So sometimes file matches
are returned with "skipped" as their language, even though the file path is
clearly some other language like XML.

This PR updates the indexing logic to still detect the language even if the
document is skipped. However, we avoid passing the contents to the language
detection library to avoid running detection on huge files.

---------

Co-authored-by: Julie Tibshirani <[email protected]>
  • Loading branch information
srijansk and jtibshirani authored Feb 6, 2025
1 parent 70647ba commit 6b8e10c
Show file tree
Hide file tree
Showing 3 changed files with 56 additions and 6 deletions.
1 change: 0 additions & 1 deletion index/builder.go
Original file line number Diff line number Diff line change
Expand Up @@ -610,7 +610,6 @@ func (b *Builder) Add(doc Document) error {
doc.SkipReason = fmt.Sprintf("document size %d larger than limit %d", len(doc.Content), b.opts.SizeMax)
} else if err := b.docChecker.Check(doc.Content, b.opts.TrigramMax, allowLargeFile); err != nil {
doc.SkipReason = err.Error()
doc.Language = "binary"
}

b.todo = append(b.todo, &doc)
Expand Down
15 changes: 10 additions & 5 deletions index/shard_builder.go
Original file line number Diff line number Diff line change
Expand Up @@ -396,7 +396,16 @@ func (b *ShardBuilder) addSymbols(symbols []*zoekt.Symbol) {
}

func DetermineLanguageIfUnknown(doc *Document) {
if doc.Language == "" {
if doc.Language != "" {
return
}

if doc.SkipReason != "" {
// If this document has been skipped, it's likely very large, or it's a non-code file like binary.
// In this case, we just guess the language based on file name to avoid examining the contents.
// Note: passing nil content is allowed by the go-enry contract (the underlying library we use here).
doc.Language = languages.GetLanguage(doc.Name, nil)
} else {
doc.Language = languages.GetLanguage(doc.Name, doc.Content)
}
}
Expand All @@ -407,16 +416,12 @@ func (b *ShardBuilder) Add(doc Document) error {

if idx := bytes.IndexByte(doc.Content, 0); idx >= 0 {
doc.SkipReason = fmt.Sprintf("binary content at byte offset %d", idx)
doc.Language = "binary"
}

if doc.SkipReason != "" {
doc.Content = []byte(notIndexedMarker + doc.SkipReason)
doc.Symbols = nil
doc.SymbolsMetaData = nil
if doc.Language == "" {
doc.Language = "skipped"
}
}

DetermineLanguageIfUnknown(&doc)
Expand Down
46 changes: 46 additions & 0 deletions index/shard_builder_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -47,3 +47,49 @@ func TestShardName(t *testing.T) {
})
}
}

func TestDetermineLanguageIfUnknown(t *testing.T) {
tests := []struct {
name string
doc Document
wantLang string
skipContent bool
}{
{
name: "already has language",
doc: Document{
Name: "test.java",
Language: "Go",
Content: []byte("package main"),
},
wantLang: "Go",
},
{
name: "skipped file",
doc: Document{
Name: "large.js",
SkipReason: "too large",
Content: []byte(notIndexedMarker + "too large"),
},
wantLang: "JavaScript",
},
{
name: "skipped file with unknown extension",
doc: Document{
Name: "deadb33f",
SkipReason: "binary",
Content: []byte(notIndexedMarker + "binary"),
},
wantLang: "",
},
}

for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
DetermineLanguageIfUnknown(&tt.doc)
if tt.doc.Language != tt.wantLang {
t.Errorf("DetermineLanguageIfUnknown() got language = %v, want %v", tt.doc.Language, tt.wantLang)
}
})
}
}

0 comments on commit 6b8e10c

Please sign in to comment.