@@ -38,12 +38,11 @@ func Parse(r io.Reader) (*Node, error) {
3838
3939// ParseWithOptions is like Parse, but with custom options.
4040func ParseWithOptions (r io.Reader , options ParserOptions ) (* Node , error ) {
41- var data []byte
4241 var lineStarts []int
43-
4442 // If line numbers are requested, read all data for position tracking
4543 if options .WithLineNumbers {
4644 var err error
45+ var data []byte
4746 data , err = io .ReadAll (r )
4847 if err != nil {
4948 return nil , err
@@ -60,7 +59,11 @@ func ParseWithOptions(r io.Reader, options ParserOptions) (*Node, error) {
6059 }
6160
6261 p := createParser (r )
62+ if options .WithLineNumbers {
63+ p .lineStarts = lineStarts
64+ }
6365 options .apply (p )
66+
6467 var err error
6568 for err == nil {
6669 _ , err = p .parse ()
@@ -83,19 +86,6 @@ func ParseWithOptions(r io.Reader, options ParserOptions) (*Node, error) {
8386 return nil , fmt .Errorf ("xmlquery: invalid XML document" )
8487 }
8588
86- // If line numbers were requested, annotate the parsed document
87- if options .WithLineNumbers {
88- annotator := & lineNumberAnnotator {
89- data : data ,
90- lineStarts : lineStarts ,
91- }
92-
93- err = annotator .annotateLineNumbers (p .doc )
94- if err != nil {
95- return nil , err
96- }
97- }
98-
9989 return p .doc , nil
10090 }
10191
@@ -116,6 +106,8 @@ type parser struct {
116106 space2prefix map [string ]* xmlnsPrefix
117107 currentLine int // Track current line number during parsing
118108 lastProcessedPos int // Track how much cached data we've already processed for line counting
109+
110+ lineStarts []int
119111}
120112
121113type xmlnsPrefix struct {
@@ -132,6 +124,7 @@ func createParser(r io.Reader) *parser {
132124 reader : reader ,
133125 currentLine : 0 ,
134126 lastProcessedPos : 0 ,
127+ lineStarts : nil ,
135128 }
136129 if p .decoder .CharsetReader == nil {
137130 p .decoder .CharsetReader = charset .NewReaderLabel
@@ -142,17 +135,33 @@ func createParser(r io.Reader) *parser {
142135
143136// updateLineNumber advances currentLine by comparing the decoder's input offset against the precomputed lineStarts; it does nothing when lineStarts is nil
144137func (p * parser ) updateLineNumber () {
145- cached := p .reader .CacheWithLimit (- 1 ) // Get all cached data
146-
147- // Only process data we haven't seen before
148- for i := p .lastProcessedPos ; i < len (cached ); i ++ {
149- if cached [i ] == '\n' {
150- p .currentLine ++
138+ if p .lineStarts == nil {
139+ return
140+ }
141+ offset := int (p .decoder .InputOffset ())
142+ for i := p .currentLine ; i < len (p .lineStarts ); i ++ {
143+ if offset > p .lineStarts [i ] && p .lineStarts [i ] >= p .lastProcessedPos {
144+ p .currentLine = i + 1
145+ break
146+ }
147+ if offset <= p .lineStarts [i ] {
148+ break
151149 }
152150 }
151+ p .lastProcessedPos = offset
152+ /*
153+ cached := p.reader.CacheWithLimit(-1) // Get all cached data
154+
155+ // Only process data we haven't seen before
156+ for i := p.lastProcessedPos; i < len(cached); i++ {
157+ if cached[i] == '\n' {
158+ p.currentLine++
159+ }
160+ }
153161
154- // Update our position to avoid reprocessing this data
155- p .lastProcessedPos = len (cached )
162+ // Update our position to avoid reprocessing this data
163+ p.lastProcessedPos = len(cached)
164+ */
156165}
157166
158167func (p * parser ) parse () (* Node , error ) {
@@ -492,276 +501,3 @@ func (sp *StreamParser) Read() (*Node, error) {
492501 }
493502 return sp .p .parse ()
494503}
495-
496- // lineNumberAnnotator handles post-processing line number annotation
497- type lineNumberAnnotator struct {
498- data []byte
499- lineStarts []int
500- tracker * positionTracker
501- }
502-
503- // getLineForPosition returns the line number for a given byte position
504- func (p * lineNumberAnnotator ) getLineForPosition (pos int ) int {
505- if pos < 0 {
506- return 1
507- }
508-
509- line := 1
510- for i , start := range p .lineStarts {
511- if pos < start {
512- return i // i is the line number (1-based because lineStarts[0] = 0 for line 1)
513- }
514- line = i + 1
515- }
516- return line
517- }
518-
519- // annotateLineNumbers walks through the XML data and annotates nodes with line numbers
520- func (p * lineNumberAnnotator ) annotateLineNumbers (doc * Node ) error {
521- // First reset all line numbers to ensure clean state
522- p .resetLineNumbers (doc )
523- // Use a simpler approach: walk through the document in order and match with positions
524- p .annotateNodesByPosition (doc )
525- return nil
526- }
527-
528- // resetLineNumbers recursively resets all line numbers to 0
529- func (p * lineNumberAnnotator ) resetLineNumbers (node * Node ) {
530- if node == nil {
531- return
532- }
533- node .LineNumber = 0
534- for child := node .FirstChild ; child != nil ; child = child .NextSibling {
535- p .resetLineNumbers (child )
536- }
537- }
538-
539- // annotateNodesByPosition recursively annotates nodes by finding their positions in source
540- func (p * lineNumberAnnotator ) annotateNodesByPosition (node * Node ) {
541- if node == nil {
542- return
543- }
544-
545- // Annotate current node if not already done
546- if node .LineNumber == 0 {
547- switch node .Type {
548- case ElementNode :
549- node .LineNumber = p .findElementPosition (node .Data )
550- case CommentNode :
551- node .LineNumber = p .findCommentPosition (node .Data )
552- case DeclarationNode :
553- node .LineNumber = p .findDeclarationLine ()
554- case ProcessingInstruction :
555- node .LineNumber = p .findProcessingInstructionPosition (node .Data )
556- case TextNode , CharDataNode :
557- text := strings .TrimSpace (node .Data )
558- if text != "" {
559- node .LineNumber = p .findTextPosition (text )
560- }
561- }
562- }
563-
564- // Recursively annotate children
565- for child := node .FirstChild ; child != nil ; child = child .NextSibling {
566- p .annotateNodesByPosition (child )
567- }
568- }
569-
570- // State to track positions as we traverse the document
571- type positionTracker struct {
572- currentPos int
573- elementCounts map [string ]int
574- commentCounts map [string ]int
575- textCounts map [string ]int
576- }
577-
578- // findElementPosition finds the line number for the next occurrence of an element
579- func (p * lineNumberAnnotator ) findElementPosition (name string ) int {
580- if p .tracker == nil {
581- p .tracker = & positionTracker {
582- elementCounts : make (map [string ]int ),
583- commentCounts : make (map [string ]int ),
584- textCounts : make (map [string ]int ),
585- }
586- }
587-
588- p .tracker .elementCounts [name ]++
589- return p .findNthElementOccurrence (name , p .tracker .elementCounts [name ])
590- }
591-
592- // findNthElementOccurrence finds the nth occurrence of an element
593- func (p * lineNumberAnnotator ) findNthElementOccurrence (name string , n int ) int {
594- count := 0
595- pos := 0
596- dataStr := string (p .data )
597-
598- // Look for both prefixed and non-prefixed versions
599- patterns := []string {
600- fmt .Sprintf ("<%s" , name ), // <name
601- fmt .Sprintf (":%s" , name ), // prefix:name
602- }
603-
604- for {
605- earliestPos := len (p .data )
606- foundPattern := ""
607-
608- // Find the earliest occurrence of any pattern
609- for _ , pattern := range patterns {
610- foundPos := strings .Index (dataStr [pos :], pattern )
611- if foundPos >= 0 {
612- absolutePos := pos + foundPos
613- if absolutePos < earliestPos {
614- earliestPos = absolutePos
615- foundPattern = pattern
616- }
617- }
618- }
619-
620- if earliestPos == len (p .data ) {
621- break // No more occurrences found
622- }
623-
624- // Validate the match
625- nextCharPos := earliestPos + len (foundPattern )
626- isValidMatch := false
627-
628- if foundPattern [0 ] == '<' {
629- // Direct element match like <name
630- if nextCharPos < len (p .data ) {
631- ch := p .data [nextCharPos ]
632- if ch == '>' || ch == ' ' || ch == '\t' || ch == '\n' || ch == '\r' {
633- isValidMatch = true
634- }
635- }
636- } else {
637- // Namespace prefix match like :name
638- // Make sure it's preceded by < and some prefix
639- if earliestPos > 0 && nextCharPos < len (p .data ) {
640- ch := p .data [nextCharPos ]
641- if ch == '>' || ch == ' ' || ch == '\t' || ch == '\n' || ch == '\r' {
642- // Look backwards to find the <
643- foundOpenTag := false
644- for i := earliestPos - 1 ; i >= 0 ; i -- {
645- if p .data [i ] == '<' {
646- foundOpenTag = true
647- break
648- } else if p .data [i ] == '>' {
649- break // Found closing tag first, not valid
650- }
651- }
652- if foundOpenTag {
653- isValidMatch = true
654- }
655- }
656- }
657- }
658-
659- if isValidMatch {
660- count ++
661- if count == n {
662- // For namespace prefix matches, return the position of the <
663- if foundPattern [0 ] == ':' {
664- for i := earliestPos - 1 ; i >= 0 ; i -- {
665- if p .data [i ] == '<' {
666- return p .getLineForPosition (i )
667- }
668- }
669- }
670- return p .getLineForPosition (earliestPos )
671- }
672- }
673-
674- pos = earliestPos + 1
675- }
676-
677- return 1
678- }
679-
680- // findCommentPosition finds the line number for the next occurrence of a comment
681- func (p * lineNumberAnnotator ) findCommentPosition (content string ) int {
682- if p .tracker == nil {
683- p .tracker = & positionTracker {
684- elementCounts : make (map [string ]int ),
685- commentCounts : make (map [string ]int ),
686- textCounts : make (map [string ]int ),
687- }
688- }
689-
690- p .tracker .commentCounts [content ]++
691- return p .findNthCommentOccurrence (content , p .tracker .commentCounts [content ])
692- }
693-
694- // findNthCommentOccurrence finds the nth occurrence of a comment
695- func (p * lineNumberAnnotator ) findNthCommentOccurrence (content string , n int ) int {
696- pattern := fmt .Sprintf ("<!--%s-->" , content )
697- count := 0
698- pos := 0
699-
700- for {
701- foundPos := strings .Index (string (p .data [pos :]), pattern )
702- if foundPos < 0 {
703- break
704- }
705- count ++
706- absolutePos := pos + foundPos
707- if count == n {
708- return p .getLineForPosition (absolutePos )
709- }
710- pos = absolutePos + len (pattern )
711- }
712- return 1
713- }
714-
715- // findDeclarationLine finds the line number of the XML declaration
716- func (p * lineNumberAnnotator ) findDeclarationLine () int {
717- pattern := "<?xml"
718- pos := bytes .Index (p .data , []byte (pattern ))
719- if pos >= 0 {
720- return p .getLineForPosition (pos )
721- }
722- return 1
723- }
724-
725- // findTextPosition finds the line number for the next occurrence of text
726- func (p * lineNumberAnnotator ) findTextPosition (text string ) int {
727- if p .tracker == nil {
728- p .tracker = & positionTracker {
729- elementCounts : make (map [string ]int ),
730- commentCounts : make (map [string ]int ),
731- textCounts : make (map [string ]int ),
732- }
733- }
734-
735- p .tracker .textCounts [text ]++
736- return p .findNthTextOccurrence (text , p .tracker .textCounts [text ])
737- }
738-
739- // findNthTextOccurrence finds the nth occurrence of text
740- func (p * lineNumberAnnotator ) findNthTextOccurrence (text string , n int ) int {
741- count := 0
742- pos := 0
743-
744- for {
745- foundPos := strings .Index (string (p .data [pos :]), text )
746- if foundPos < 0 {
747- break
748- }
749- count ++
750- absolutePos := pos + foundPos
751- if count == n {
752- return p .getLineForPosition (absolutePos )
753- }
754- pos = absolutePos + len (text )
755- }
756- return 1
757- }
758-
759- // findProcessingInstructionPosition finds the line number for a processing instruction
760- func (p * lineNumberAnnotator ) findProcessingInstructionPosition (target string ) int {
761- pattern := fmt .Sprintf ("<?%s" , target )
762- pos := strings .Index (string (p .data ), pattern )
763- if pos >= 0 {
764- return p .getLineForPosition (pos )
765- }
766- return 1
767- }
0 commit comments