Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 32 additions & 12 deletions gitindex/index.go
Original file line number Diff line number Diff line change
Expand Up @@ -586,23 +586,43 @@ func indexGitRepo(opts Options, config gitIndexConfig) (bool, error) {
names = uniq(names)

log.Printf("attempting to index %d total files", totalFiles)
for idx, name := range names {
keys := fileKeys[name]

for _, key := range keys {
// Flatten keys in sorted order for pipeline processing.
allKeys := make([]fileKey, 0, totalFiles)
for _, name := range names {
allKeys = append(allKeys, fileKeys[name]...)
}

// Pre-fetch documents using a pipeline: a goroutine reads blobs
// ahead of the main loop, overlapping I/O with builder processing.
type docResult struct {
doc index.Document
key fileKey
err error
}
ch := make(chan docResult, 64)
go func() {
defer close(ch)
for _, key := range allKeys {
doc, err := createDocument(key, repos, opts.BuildOptions)
if err != nil {
return false, err
}
ch <- docResult{doc: doc, key: key, err: err}
}
}()

if err := builder.Add(doc); err != nil {
return false, fmt.Errorf("error adding document with name %s: %w", key.FullPath(), err)
}
idx := 0
for result := range ch {
if result.err != nil {
return false, result.err
}

if idx%10_000 == 0 {
builder.CheckMemoryUsage()
}
if err := builder.Add(result.doc); err != nil {
return false, fmt.Errorf("error adding document with name %s: %w", result.key.FullPath(), err)
}

if idx%10_000 == 0 {
builder.CheckMemoryUsage()
}
idx++
}
return true, builder.Finish()
}
Expand Down
35 changes: 27 additions & 8 deletions index/shard_builder.go
Original file line number Diff line number Diff line change
Expand Up @@ -59,9 +59,17 @@ func HostnameBestEffort() string {
// Store character (unicode codepoint) offset (in bytes) this often.
const runeOffsetFrequency = 100

// postingEntry bundles everything the builder tracks for a single trigram:
// the encoded posting list and the most recent offset appended to it.
//
// Keeping both values behind one pointer lets the per-rune hot loop perform a
// single map lookup and then mutate the entry in place, rather than reading
// and writing two separate maps (four map operations per rune).
type postingEntry struct {
	// lastOff is the offset most recently encoded into data; the next offset
	// is stored as a uvarint delta relative to this value.
	lastOff uint32
	// data is the concatenation of the uvarint-encoded offset deltas.
	data []byte
}

type postingsBuilder struct {
postings map[ngram][]byte
lastOffsets map[ngram]uint32
postings map[ngram]*postingEntry

// To support UTF-8 searching, we must map back runes to byte
// offsets. As a first attempt, we sample regularly. The
Expand All @@ -77,9 +85,12 @@ type postingsBuilder struct {
}

// newPostingsBuilder returns a postingsBuilder whose postings map is empty
// and whose isPlainASCII flag starts out true.
func newPostingsBuilder() *postingsBuilder {
// Pre-allocate map with a reasonable capacity hint.
// A typical shard (~100MB) contains 50K-200K unique trigrams
// (author's estimate; not verified here).
// Pre-allocating avoids repeated map growth during indexing.
// NOTE(review): the hint is applied unconditionally, so every builder pays
// for it regardless of how much content it ends up indexing.
const initialTrigramCapacity = 200_000
return &postingsBuilder{
// NOTE(review): the next two initializers (map[ngram][]byte and lastOffsets)
// appear to be the pre-change side of this diff shown next to its
// replacement; confirm that only the postingEntry-based map below remains.
postings: map[ngram][]byte{},
lastOffsets: map[ngram]uint32{},
// Single map keyed by trigram; each value is a *postingEntry.
postings: make(map[ngram]*postingEntry, initialTrigramCapacity),
isPlainASCII: true,
}
}
Expand Down Expand Up @@ -130,12 +141,20 @@ func (s *postingsBuilder) newSearchableString(data []byte, byteSections []Docume
}

ng := runesToNGram(runeGram)
lastOff := s.lastOffsets[ng]
newOff := endRune + uint32(runeIndex) - 2

m := binary.PutUvarint(buf[:], uint64(newOff-lastOff))
s.postings[ng] = append(s.postings[ng], buf[:m]...)
s.lastOffsets[ng] = newOff
e := s.postings[ng]
if e == nil {
// Pre-allocate data slice. Most trigrams appear many times
// across a shard, so starting with 64 bytes avoids several
// small reallocations during early appends.
e = &postingEntry{data: make([]byte, 0, 64)}
s.postings[ng] = e
}

m := binary.PutUvarint(buf[:], uint64(newOff-e.lastOff))
e.data = append(e.data, buf[:m]...)
e.lastOff = newOff
}
s.runeCount += runeIndex

Expand Down
2 changes: 1 addition & 1 deletion index/write.go
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,7 @@ func writePostings(w *writer, s *postingsBuilder, ngramText *simpleSection,

postings.start(w)
for _, k := range keys {
postings.addItem(w, s.postings[k])
postings.addItem(w, s.postings[k].data)
}
postings.end(w)

Expand Down
Loading