From f9d8bba25e995940a8285754165c8bc807288e9a Mon Sep 17 00:00:00 2001 From: Allie Date: Tue, 16 Sep 2025 16:29:38 -0400 Subject: [PATCH 1/6] add method for grouping embed and writing blank tags as one complete word --- lib/htmldiff/list_of_words.rb | 38 +++++++++++++++++++++++++++++++++++ lib/htmldiff/word.rb | 4 ++++ 2 files changed, 42 insertions(+) diff --git a/lib/htmldiff/list_of_words.rb b/lib/htmldiff/list_of_words.rb index feb4634..9a99497 100644 --- a/lib/htmldiff/list_of_words.rb +++ b/lib/htmldiff/list_of_words.rb @@ -13,6 +13,7 @@ def initialize(string, options = {}) @words = string else convert_html_to_list_of_words string.chars + group_embed_or_blank_tags! end end @@ -72,6 +73,43 @@ def contains_unclosed_tag? private + # Group our-embed tags and Writing Blank spans, which are + # intentionally left blank, into single words + def group_embed_or_blank_tags! + return if @words.empty? + new_words = [] + i = 0 + + while i < @words.length + current_word = @words[i] + + if current_word.embed_or_blank_opening_tag? + word_group = [current_word] + tag_name = current_word.to_s.match(/^<(span|our-embed)/)[1] + i += 1 + + # Collect words until the appropriate closing tag is reached + while i < @words.length + word = @words[i] + word_group << word + + if word.to_s.match?(/^<\/#{tag_name}>$/) + i += 1 + break + end + + i += 1 + end + # Create a single word from the entire element + new_words << Word.new(word_group.map(&:to_s).join) + else + new_words << current_word + i += 1 + end + end + @words = new_words + end + def convert_html_to_list_of_words(character_array) @mode = :char @current_word = Word.new diff --git a/lib/htmldiff/word.rb b/lib/htmldiff/word.rb index a29b44a..5fce0f9 100644 --- a/lib/htmldiff/word.rb +++ b/lib/htmldiff/word.rb @@ -26,6 +26,10 @@ def iframe_tag? (@word[0..7].downcase =~ %r{^<\/?iframe ?}) end + def embed_or_blank_opening_tag? + @word =~ Regexp.union(/^]*class="[^"]*blank[^"]*"[^>]*>$/i, /^]*>$/i) + end + def tag? opening_tag? || closing_tag? || standalone_tag? end From da4b694f629f14202e666618214952aa88c19f82 Mon Sep 17 00:00:00 2001 From: Allie Date: Tue, 16 Sep 2025 16:33:32 -0400 Subject: [PATCH 2/6] ensure embed tags are ignored by same_tag? method --- lib/htmldiff/diff_builder.rb | 1 + lib/htmldiff/operation.rb | 8 ++++++++ spec/operation_spec.rb | 9 +++++++++ 3 files changed, 18 insertions(+) diff --git a/lib/htmldiff/diff_builder.rb b/lib/htmldiff/diff_builder.rb index 404a0f8..ef6a8ee 100644 --- a/lib/htmldiff/diff_builder.rb +++ b/lib/htmldiff/diff_builder.rb @@ -54,6 +54,7 @@ def replace(operation) # added e.g.

becomes

due to an editor button # press. For this, we just show the new version, otherwise it gets messy # trying to find the closing tag. + # our-embed tags are the exception and will be replaced in full. if operation.same_tag? equal(operation) else diff --git a/lib/htmldiff/operation.rb b/lib/htmldiff/operation.rb index 7635c6b..f36d3a2 100644 --- a/lib/htmldiff/operation.rb +++ b/lib/htmldiff/operation.rb @@ -16,7 +16,11 @@ class Operation # Ignores any attributes and tells us if the tag is the same e.g.

and #

are the same. + # The exception to this rule is our-embed tags, where we + # always want a full replacement. def same_tag? + return false if contains_our_embed_tags? + pattern = /<([^>\s]+)[\s>].*/ first_tagname = pattern.match(old_text) # nil means they are not tags first_tagname = first_tagname[1] if first_tagname @@ -27,6 +31,10 @@ def same_tag? first_tagname && (first_tagname == second_tagname) end + def contains_our_embed_tags? + old_text.match?(%r{^<\/?our-embed}) && new_text.match?(%r{^<\/?our-embed}) + end + def old_text old_words.join end diff --git a/spec/operation_spec.rb b/spec/operation_spec.rb index b022257..0f6bc9e 100644 --- a/spec/operation_spec.rb +++ b/spec/operation_spec.rb @@ -24,6 +24,15 @@ end end + context 'with `our-embed` tags that may have different attributes' do + let(:old_tag) { '' } + let(:new_tag) { '' } + + it 'returns false for matching and non-matching `our-embed` tags' do + expect(operation.same_tag?).to be_false + end + end + context 'with two different tags' do let(:old_tag) { '

' } let(:new_tag) { '' } From 6452b79c479c57cba02fa4e4dbae4b37d98ce877 Mon Sep 17 00:00:00 2001 From: Allie Date: Tue, 16 Sep 2025 16:40:43 -0400 Subject: [PATCH 3/6] add rules for closed embed and writing blank tags to DiffBuilder --- lib/htmldiff/diff_builder.rb | 12 +++++++----- lib/htmldiff/word.rb | 4 ++++ 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/lib/htmldiff/diff_builder.rb b/lib/htmldiff/diff_builder.rb index ef6a8ee..7d2ffb8 100644 --- a/lib/htmldiff/diff_builder.rb +++ b/lib/htmldiff/diff_builder.rb @@ -100,10 +100,12 @@ def insert_tag(tagname, cssclass, words) loop do break if words.empty? - if words.first.standalone_tag? - tag_words = words.extract_consecutive_words! do |word| - word.standalone_tag? - end + # Handle our-embeds and writing blank spans as single blocks + if words.first.closed_embed_or_blank_tag? + tag_words = words.extract_consecutive_words! { |word| word.closed_embed_or_blank_tag? } + @content << wrap_text_in_diff_tag(tag_words.join, tagname, cssclass) + elsif words.first.standalone_tag? + tag_words = words.extract_consecutive_words! { |word| word.standalone_tag? } @content << wrap_text_in_diff_tag(tag_words.join, tagname, cssclass) elsif words.first.iframe_tag? tag_words = words.extract_consecutive_words! { |word| word.iframe_tag? } @@ -125,7 +127,7 @@ def insert_tag(tagname, cssclass, words) wrapped = true end @content += words.extract_consecutive_words! do |word| - word.tag? && !word.standalone_tag? && !word.iframe_tag? + word.tag? && !word.standalone_tag? && !word.iframe_tag? && !word.closed_embed_or_blank_tag? end else non_tags = words.extract_consecutive_words! do |word| diff --git a/lib/htmldiff/word.rb b/lib/htmldiff/word.rb index 5fce0f9..89c112d 100644 --- a/lib/htmldiff/word.rb +++ b/lib/htmldiff/word.rb @@ -30,6 +30,10 @@ def embed_or_blank_opening_tag? @word =~ Regexp.union(/^]*class="[^"]*blank[^"]*"[^>]*>$/i, /^]*>$/i) end + def closed_embed_or_blank_tag? + @word =~ /^<(span[^>]*class="[^"]*blank[^"]*"[^>]*|our-embed[^>]*)><\/(span|our-embed)>$/i + end + def tag? opening_tag? || closing_tag? || standalone_tag? end From f9ffc4948558cd8645b0adb7f5566a7e15f3472f Mon Sep 17 00:00:00 2001 From: Allie Date: Tue, 16 Sep 2025 19:45:43 -0400 Subject: [PATCH 4/6] revert changes to `same_tag?` --- lib/htmldiff/diff_builder.rb | 1 - lib/htmldiff/operation.rb | 8 -------- spec/operation_spec.rb | 9 --------- 3 files changed, 18 deletions(-) diff --git a/lib/htmldiff/diff_builder.rb b/lib/htmldiff/diff_builder.rb index 7d2ffb8..fd4a0a3 100644 --- a/lib/htmldiff/diff_builder.rb +++ b/lib/htmldiff/diff_builder.rb @@ -54,7 +54,6 @@ def replace(operation) # added e.g.

becomes

due to an editor button # press. For this, we just show the new version, otherwise it gets messy # trying to find the closing tag. - # our-embed tags are the exception and will be replaced in full. if operation.same_tag? equal(operation) else diff --git a/lib/htmldiff/operation.rb b/lib/htmldiff/operation.rb index f36d3a2..7635c6b 100644 --- a/lib/htmldiff/operation.rb +++ b/lib/htmldiff/operation.rb @@ -16,11 +16,7 @@ class Operation # Ignores any attributes and tells us if the tag is the same e.g.

and #

are the same. - # The exception to this rule is our-embed tags, where we - # always want a full replacement. def same_tag? - return false if contains_our_embed_tags? - pattern = /<([^>\s]+)[\s>].*/ first_tagname = pattern.match(old_text) # nil means they are not tags first_tagname = first_tagname[1] if first_tagname @@ -31,10 +27,6 @@ def same_tag? first_tagname && (first_tagname == second_tagname) end - def contains_our_embed_tags? - old_text.match?(%r{^<\/?our-embed}) && new_text.match?(%r{^<\/?our-embed}) - end - def old_text old_words.join end diff --git a/spec/operation_spec.rb b/spec/operation_spec.rb index 0f6bc9e..b022257 100644 --- a/spec/operation_spec.rb +++ b/spec/operation_spec.rb @@ -24,15 +24,6 @@ end end - context 'with `our-embed` tags that may have different attributes' do - let(:old_tag) { '' } - let(:new_tag) { '' } - - it 'returns false for matching and non-matching `our-embed` tags' do - expect(operation.same_tag?).to be_false - end - end - context 'with two different tags' do let(:old_tag) { '

' } let(:new_tag) { '' } From 6afcb649fb20be2daea99c6012a4293ae402ae7a Mon Sep 17 00:00:00 2001 From: Allie Date: Fri, 19 Sep 2025 11:58:21 -0400 Subject: [PATCH 5/6] replace tag name specific rules with rules for grouping any empty tag --- lib/htmldiff/diff_builder.rb | 4 +-- lib/htmldiff/list_of_words.rb | 50 ++++++++++++++++++++++------------- lib/htmldiff/word.rb | 8 ++---- 3 files changed, 34 insertions(+), 28 deletions(-) diff --git a/lib/htmldiff/diff_builder.rb b/lib/htmldiff/diff_builder.rb index fd4a0a3..7a3f0be 100644 --- a/lib/htmldiff/diff_builder.rb +++ b/lib/htmldiff/diff_builder.rb @@ -99,9 +99,7 @@ def insert_tag(tagname, cssclass, words) loop do break if words.empty? - # Handle our-embeds and writing blank spans as single blocks - if words.first.closed_embed_or_blank_tag? - tag_words = words.extract_consecutive_words! { |word| word.closed_embed_or_blank_tag? } + # Handle empty tags as single blocks @content << wrap_text_in_diff_tag(tag_words.join, tagname, cssclass) elsif words.first.standalone_tag? tag_words = words.extract_consecutive_words! { |word| word.standalone_tag? } diff --git a/lib/htmldiff/list_of_words.rb b/lib/htmldiff/list_of_words.rb index 9a99497..0fcfeba 100644 --- a/lib/htmldiff/list_of_words.rb +++ b/lib/htmldiff/list_of_words.rb @@ -13,7 +13,7 @@ def initialize(string, options = {}) @words = string else convert_html_to_list_of_words string.chars - group_embed_or_blank_tags! + group_empty_tags! end end @@ -73,9 +73,7 @@ def contains_unclosed_tag? private - # Group our-embed tags and Writing Blank spans, which are - # intentionally left blank, into single words - def group_embed_or_blank_tags! + def group_empty_tags! return if @words.empty? new_words = [] i = 0 @@ -83,33 +81,47 @@ def group_embed_or_blank_tags! while i < @words.length current_word = @words[i] - if current_word.embed_or_blank_opening_tag? - word_group = [current_word] - tag_name = current_word.to_s.match(/^<(span|our-embed)/)[1] - i += 1 - - # Collect words until the appropriate closing tag is reached - while i < @words.length - word = @words[i] - word_group << word - - if word.to_s.match?(/^<\/#{tag_name}>$/) - i += 1 - break + # Check if this is an opening tag + if (tag_match = current_word.to_s.match(/^<([^\s>\/]+)[^>]*>$/i)) + tag_name = tag_match[1] + + # Look ahead to see if the very next word (after any whitespace) is the closing tag + # next_non_whitespace_index = find_next_non_whitespace_word(i + 1) + next_index = i + 1 + # If the very next word is the closing tag, group the empty tag pair + if @words[next_index]&.to_s&.match?(/^<\/#{Regexp.escape(tag_name)}>$/i) + word_group = [] + (i..next_index).each do |idx| + word_group << @words[idx] end + new_words << Word.new(word_group.map(&:to_s).join) + i = next_index + 1 + else + # Otherwise, add as individual word + new_words << current_word i += 1 end - # Create a single word from the entire element - new_words << Word.new(word_group.map(&:to_s).join) else + # Not an opening tag - keep individual word new_words << current_word i += 1 end end + @words = new_words end + def find_next_non_whitespace_word(start_index) + i = start_index + while i < @words.length + word_str = @words[i].to_s.strip + return i unless word_str.empty? + i += 1 + end + nil + end + def convert_html_to_list_of_words(character_array) @mode = :char @current_word = Word.new diff --git a/lib/htmldiff/word.rb b/lib/htmldiff/word.rb index 89c112d..4a71ea8 100644 --- a/lib/htmldiff/word.rb +++ b/lib/htmldiff/word.rb @@ -26,12 +26,8 @@ def iframe_tag? (@word[0..7].downcase =~ %r{^<\/?iframe ?}) end - def embed_or_blank_opening_tag? - @word =~ Regexp.union(/^]*class="[^"]*blank[^"]*"[^>]*>$/i, /^]*>$/i) - end - - def closed_embed_or_blank_tag? - @word =~ /^<(span[^>]*class="[^"]*blank[^"]*"[^>]*|our-embed[^>]*)><\/(span|our-embed)>$/i + def closed_empty_tag? + @word.match?(/^<([^\/\s>]+)(?:\s[^>]*)?>(?:\s*)<\/\1>$/) end def tag? From f9dceccfb910da440da823d3c89a5d92d2fb2bc6 Mon Sep 17 00:00:00 2001 From: Allie Date: Fri, 19 Sep 2025 12:01:39 -0400 Subject: [PATCH 6/6] add arg to `same_tag?` for optionally comparing attributes --- lib/htmldiff/diff_builder.rb | 14 +++++++++++--- lib/htmldiff/operation.rb | 26 +++++++++++++++++--------- 2 files changed, 28 insertions(+), 12 deletions(-) diff --git a/lib/htmldiff/diff_builder.rb b/lib/htmldiff/diff_builder.rb index 7a3f0be..0cc4485 100644 --- a/lib/htmldiff/diff_builder.rb +++ b/lib/htmldiff/diff_builder.rb @@ -16,10 +16,15 @@ def initialize(old_version, new_version, options = {}) def default_options { - block_tag_classes: [] + block_tag_classes: [], + compare_tag_attributes: false } end + def compare_tag_attributes? + @options[:compare_tag_attributes] + end + def build perform_operations content.join @@ -54,7 +59,8 @@ def replace(operation) # added e.g.

becomes

due to an editor button # press. For this, we just show the new version, otherwise it gets messy # trying to find the closing tag. - if operation.same_tag? + + if operation.same_tag?(compare_tag_attributes?) equal(operation) else delete(operation, 'diffmod') @@ -100,6 +106,8 @@ def insert_tag(tagname, cssclass, words) break if words.empty? # Handle empty tags as single blocks + if words.first.closed_empty_tag? + tag_words = words.extract_consecutive_words! { |word| word.closed_empty_tag? } @content << wrap_text_in_diff_tag(tag_words.join, tagname, cssclass) elsif words.first.standalone_tag? tag_words = words.extract_consecutive_words! { |word| word.standalone_tag? } @@ -124,7 +132,7 @@ def insert_tag(tagname, cssclass, words) wrapped = true end @content += words.extract_consecutive_words! do |word| - word.tag? && !word.standalone_tag? && !word.iframe_tag? && !word.closed_embed_or_blank_tag? + word.tag? && !word.standalone_tag? && !word.iframe_tag? && !word.closed_empty_tag? end else non_tags = words.extract_consecutive_words! do |word| diff --git a/lib/htmldiff/operation.rb b/lib/htmldiff/operation.rb index 7635c6b..9cf0f26 100644 --- a/lib/htmldiff/operation.rb +++ b/lib/htmldiff/operation.rb @@ -14,17 +14,25 @@ class Operation # @!method old_words # @!method new_words - # Ignores any attributes and tells us if the tag is the same e.g.

and - #

are the same. - def same_tag? - pattern = /<([^>\s]+)[\s>].*/ - first_tagname = pattern.match(old_text) # nil means they are not tags - first_tagname = first_tagname[1] if first_tagname + # Unless `compare_attributes` is true, Ignores any attributes and tells us + # if the tag is the same e.g.

and

are the same. + def same_tag?(compare_attributes = false) + pattern = /<(?[^>\s]+)\s(?.*)[\s>].*/ + first_tag = pattern.match(old_text) # nil means they are not tags + first_tagname = first_tag[:name] if first_tag - second_tagname = pattern.match(new_text) - second_tagname = second_tagname[1] if second_tagname + second_tag = pattern.match(new_text) + second_tagname = second_tag[:name] if second_tag - first_tagname && (first_tagname == second_tagname) + return false unless first_tag && second_tag + + if compare_attributes + first_attrs = first_tag[:attrs] + second_attrs = first_tag[:attrs] + return false if first_attrs && (first_attrs == second_attrs) + end + + first_tagname == second_tagname end def old_text