Skip to content

Commit 689f32d

Browse files
committed
optimize line number tracking for XML element start tags
1 parent 86aa304 commit 689f32d

File tree

1 file changed

+32
-296
lines changed

1 file changed

+32
-296
lines changed

parse.go

Lines changed: 32 additions & 296 deletions
Original file line number | Diff line number | Diff line change
@@ -38,12 +38,11 @@ func Parse(r io.Reader) (*Node, error) {
3838

3939
// ParseWithOptions is like Parse, but with custom options.
4040
func ParseWithOptions(r io.Reader, options ParserOptions) (*Node, error) {
41-
var data []byte
4241
var lineStarts []int
43-
4442
// If line numbers are requested, read all data for position tracking
4543
if options.WithLineNumbers {
4644
var err error
45+
var data []byte
4746
data, err = io.ReadAll(r)
4847
if err != nil {
4948
return nil, err
@@ -60,7 +59,11 @@ func ParseWithOptions(r io.Reader, options ParserOptions) (*Node, error) {
6059
}
6160

6261
p := createParser(r)
62+
if options.WithLineNumbers {
63+
p.lineStarts = lineStarts
64+
}
6365
options.apply(p)
66+
6467
var err error
6568
for err == nil {
6669
_, err = p.parse()
@@ -83,19 +86,6 @@ func ParseWithOptions(r io.Reader, options ParserOptions) (*Node, error) {
8386
return nil, fmt.Errorf("xmlquery: invalid XML document")
8487
}
8588

86-
// If line numbers were requested, annotate the parsed document
87-
if options.WithLineNumbers {
88-
annotator := &lineNumberAnnotator{
89-
data: data,
90-
lineStarts: lineStarts,
91-
}
92-
93-
err = annotator.annotateLineNumbers(p.doc)
94-
if err != nil {
95-
return nil, err
96-
}
97-
}
98-
9989
return p.doc, nil
10090
}
10191

@@ -116,6 +106,8 @@ type parser struct {
116106
space2prefix map[string]*xmlnsPrefix
117107
currentLine int // Track current line number during parsing
118108
lastProcessedPos int // Track how much cached data we've already processed for line counting
109+
110+
lineStarts []int
119111
}
120112

121113
type xmlnsPrefix struct {
@@ -132,6 +124,7 @@ func createParser(r io.Reader) *parser {
132124
reader: reader,
133125
currentLine: 0,
134126
lastProcessedPos: 0,
127+
lineStarts: nil,
135128
}
136129
if p.decoder.CharsetReader == nil {
137130
p.decoder.CharsetReader = charset.NewReaderLabel
@@ -142,17 +135,33 @@ func createParser(r io.Reader) *parser {
142135

143136
// updateLineNumber scans only new cached data for newlines to update current line position
144137
func (p *parser) updateLineNumber() {
145-
cached := p.reader.CacheWithLimit(-1) // Get all cached data
146-
147-
// Only process data we haven't seen before
148-
for i := p.lastProcessedPos; i < len(cached); i++ {
149-
if cached[i] == '\n' {
150-
p.currentLine++
138+
if p.lineStarts == nil {
139+
return
140+
}
141+
offset := int(p.decoder.InputOffset())
142+
for i := p.currentLine; i < len(p.lineStarts); i++ {
143+
if offset > p.lineStarts[i] && p.lineStarts[i] >= p.lastProcessedPos {
144+
p.currentLine = i + 1
145+
break
146+
}
147+
if offset <= p.lineStarts[i] {
148+
break
151149
}
152150
}
151+
p.lastProcessedPos = offset
152+
/*
153+
cached := p.reader.CacheWithLimit(-1) // Get all cached data
154+
155+
// Only process data we haven't seen before
156+
for i := p.lastProcessedPos; i < len(cached); i++ {
157+
if cached[i] == '\n' {
158+
p.currentLine++
159+
}
160+
}
153161
154-
// Update our position to avoid reprocessing this data
155-
p.lastProcessedPos = len(cached)
162+
// Update our position to avoid reprocessing this data
163+
p.lastProcessedPos = len(cached)
164+
*/
156165
}
157166

158167
func (p *parser) parse() (*Node, error) {
@@ -492,276 +501,3 @@ func (sp *StreamParser) Read() (*Node, error) {
492501
}
493502
return sp.p.parse()
494503
}
495-
496-
// lineNumberAnnotator handles post-processing line number annotation
497-
type lineNumberAnnotator struct {
498-
data []byte
499-
lineStarts []int
500-
tracker *positionTracker
501-
}
502-
503-
// getLineForPosition returns the line number for a given byte position
504-
func (p *lineNumberAnnotator) getLineForPosition(pos int) int {
505-
if pos < 0 {
506-
return 1
507-
}
508-
509-
line := 1
510-
for i, start := range p.lineStarts {
511-
if pos < start {
512-
return i // i is the line number (1-based because lineStarts[0] = 0 for line 1)
513-
}
514-
line = i + 1
515-
}
516-
return line
517-
}
518-
519-
// annotateLineNumbers walks through the XML data and annotates nodes with line numbers
520-
func (p *lineNumberAnnotator) annotateLineNumbers(doc *Node) error {
521-
// First reset all line numbers to ensure clean state
522-
p.resetLineNumbers(doc)
523-
// Use a simpler approach: walk through the document in order and match with positions
524-
p.annotateNodesByPosition(doc)
525-
return nil
526-
}
527-
528-
// resetLineNumbers recursively resets all line numbers to 0
529-
func (p *lineNumberAnnotator) resetLineNumbers(node *Node) {
530-
if node == nil {
531-
return
532-
}
533-
node.LineNumber = 0
534-
for child := node.FirstChild; child != nil; child = child.NextSibling {
535-
p.resetLineNumbers(child)
536-
}
537-
}
538-
539-
// annotateNodesByPosition recursively annotates nodes by finding their positions in source
540-
func (p *lineNumberAnnotator) annotateNodesByPosition(node *Node) {
541-
if node == nil {
542-
return
543-
}
544-
545-
// Annotate current node if not already done
546-
if node.LineNumber == 0 {
547-
switch node.Type {
548-
case ElementNode:
549-
node.LineNumber = p.findElementPosition(node.Data)
550-
case CommentNode:
551-
node.LineNumber = p.findCommentPosition(node.Data)
552-
case DeclarationNode:
553-
node.LineNumber = p.findDeclarationLine()
554-
case ProcessingInstruction:
555-
node.LineNumber = p.findProcessingInstructionPosition(node.Data)
556-
case TextNode, CharDataNode:
557-
text := strings.TrimSpace(node.Data)
558-
if text != "" {
559-
node.LineNumber = p.findTextPosition(text)
560-
}
561-
}
562-
}
563-
564-
// Recursively annotate children
565-
for child := node.FirstChild; child != nil; child = child.NextSibling {
566-
p.annotateNodesByPosition(child)
567-
}
568-
}
569-
570-
// State to track positions as we traverse the document
571-
type positionTracker struct {
572-
currentPos int
573-
elementCounts map[string]int
574-
commentCounts map[string]int
575-
textCounts map[string]int
576-
}
577-
578-
// findElementPosition finds the line number for the next occurrence of an element
579-
func (p *lineNumberAnnotator) findElementPosition(name string) int {
580-
if p.tracker == nil {
581-
p.tracker = &positionTracker{
582-
elementCounts: make(map[string]int),
583-
commentCounts: make(map[string]int),
584-
textCounts: make(map[string]int),
585-
}
586-
}
587-
588-
p.tracker.elementCounts[name]++
589-
return p.findNthElementOccurrence(name, p.tracker.elementCounts[name])
590-
}
591-
592-
// findNthElementOccurrence finds the nth occurrence of an element
593-
func (p *lineNumberAnnotator) findNthElementOccurrence(name string, n int) int {
594-
count := 0
595-
pos := 0
596-
dataStr := string(p.data)
597-
598-
// Look for both prefixed and non-prefixed versions
599-
patterns := []string{
600-
fmt.Sprintf("<%s", name), // <name
601-
fmt.Sprintf(":%s", name), // prefix:name
602-
}
603-
604-
for {
605-
earliestPos := len(p.data)
606-
foundPattern := ""
607-
608-
// Find the earliest occurrence of any pattern
609-
for _, pattern := range patterns {
610-
foundPos := strings.Index(dataStr[pos:], pattern)
611-
if foundPos >= 0 {
612-
absolutePos := pos + foundPos
613-
if absolutePos < earliestPos {
614-
earliestPos = absolutePos
615-
foundPattern = pattern
616-
}
617-
}
618-
}
619-
620-
if earliestPos == len(p.data) {
621-
break // No more occurrences found
622-
}
623-
624-
// Validate the match
625-
nextCharPos := earliestPos + len(foundPattern)
626-
isValidMatch := false
627-
628-
if foundPattern[0] == '<' {
629-
// Direct element match like <name
630-
if nextCharPos < len(p.data) {
631-
ch := p.data[nextCharPos]
632-
if ch == '>' || ch == ' ' || ch == '\t' || ch == '\n' || ch == '\r' {
633-
isValidMatch = true
634-
}
635-
}
636-
} else {
637-
// Namespace prefix match like :name
638-
// Make sure it's preceded by < and some prefix
639-
if earliestPos > 0 && nextCharPos < len(p.data) {
640-
ch := p.data[nextCharPos]
641-
if ch == '>' || ch == ' ' || ch == '\t' || ch == '\n' || ch == '\r' {
642-
// Look backwards to find the <
643-
foundOpenTag := false
644-
for i := earliestPos - 1; i >= 0; i-- {
645-
if p.data[i] == '<' {
646-
foundOpenTag = true
647-
break
648-
} else if p.data[i] == '>' {
649-
break // Found closing tag first, not valid
650-
}
651-
}
652-
if foundOpenTag {
653-
isValidMatch = true
654-
}
655-
}
656-
}
657-
}
658-
659-
if isValidMatch {
660-
count++
661-
if count == n {
662-
// For namespace prefix matches, return the position of the <
663-
if foundPattern[0] == ':' {
664-
for i := earliestPos - 1; i >= 0; i-- {
665-
if p.data[i] == '<' {
666-
return p.getLineForPosition(i)
667-
}
668-
}
669-
}
670-
return p.getLineForPosition(earliestPos)
671-
}
672-
}
673-
674-
pos = earliestPos + 1
675-
}
676-
677-
return 1
678-
}
679-
680-
// findCommentPosition finds the line number for the next occurrence of a comment
681-
func (p *lineNumberAnnotator) findCommentPosition(content string) int {
682-
if p.tracker == nil {
683-
p.tracker = &positionTracker{
684-
elementCounts: make(map[string]int),
685-
commentCounts: make(map[string]int),
686-
textCounts: make(map[string]int),
687-
}
688-
}
689-
690-
p.tracker.commentCounts[content]++
691-
return p.findNthCommentOccurrence(content, p.tracker.commentCounts[content])
692-
}
693-
694-
// findNthCommentOccurrence finds the nth occurrence of a comment
695-
func (p *lineNumberAnnotator) findNthCommentOccurrence(content string, n int) int {
696-
pattern := fmt.Sprintf("<!--%s-->", content)
697-
count := 0
698-
pos := 0
699-
700-
for {
701-
foundPos := strings.Index(string(p.data[pos:]), pattern)
702-
if foundPos < 0 {
703-
break
704-
}
705-
count++
706-
absolutePos := pos + foundPos
707-
if count == n {
708-
return p.getLineForPosition(absolutePos)
709-
}
710-
pos = absolutePos + len(pattern)
711-
}
712-
return 1
713-
}
714-
715-
// findDeclarationLine finds the line number of the XML declaration
716-
func (p *lineNumberAnnotator) findDeclarationLine() int {
717-
pattern := "<?xml"
718-
pos := bytes.Index(p.data, []byte(pattern))
719-
if pos >= 0 {
720-
return p.getLineForPosition(pos)
721-
}
722-
return 1
723-
}
724-
725-
// findTextPosition finds the line number for the next occurrence of text
726-
func (p *lineNumberAnnotator) findTextPosition(text string) int {
727-
if p.tracker == nil {
728-
p.tracker = &positionTracker{
729-
elementCounts: make(map[string]int),
730-
commentCounts: make(map[string]int),
731-
textCounts: make(map[string]int),
732-
}
733-
}
734-
735-
p.tracker.textCounts[text]++
736-
return p.findNthTextOccurrence(text, p.tracker.textCounts[text])
737-
}
738-
739-
// findNthTextOccurrence finds the nth occurrence of text
740-
func (p *lineNumberAnnotator) findNthTextOccurrence(text string, n int) int {
741-
count := 0
742-
pos := 0
743-
744-
for {
745-
foundPos := strings.Index(string(p.data[pos:]), text)
746-
if foundPos < 0 {
747-
break
748-
}
749-
count++
750-
absolutePos := pos + foundPos
751-
if count == n {
752-
return p.getLineForPosition(absolutePos)
753-
}
754-
pos = absolutePos + len(text)
755-
}
756-
return 1
757-
}
758-
759-
// findProcessingInstructionPosition finds the line number for a processing instruction
760-
func (p *lineNumberAnnotator) findProcessingInstructionPosition(target string) int {
761-
pattern := fmt.Sprintf("<?%s", target)
762-
pos := strings.Index(string(p.data), pattern)
763-
if pos >= 0 {
764-
return p.getLineForPosition(pos)
765-
}
766-
return 1
767-
}

0 commit comments

Comments
 (0)