From e14847cee53d26eb162ad786ba12e3cd7a86fce0 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Mon, 12 Aug 2024 10:03:34 +0900 Subject: [PATCH 01/91] Bump version --- lib/rexml/rexml.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/rexml/rexml.rb b/lib/rexml/rexml.rb index bb804b0e..99d574b3 100644 --- a/lib/rexml/rexml.rb +++ b/lib/rexml/rexml.rb @@ -31,7 +31,7 @@ module REXML COPYRIGHT = "Copyright © 2001-2008 Sean Russell " DATE = "2008/019" - VERSION = "3.3.5" + VERSION = "3.3.6" REVISION = "" Copyright = COPYRIGHT From 2f019f9b33594b561e1ef39c42fab1f2fda51891 Mon Sep 17 00:00:00 2001 From: Viktor Ivarsson Date: Fri, 16 Aug 2024 03:47:25 +0200 Subject: [PATCH 02/91] Improve `BaseParser#unnormalize` (#194) The current implementation of `#unnormalize` iterates over matched entity references that already has been substituted. With these changes we will reduce the number of redundant calls to `rv.gsub!`. * Reject filtered matches earlier in the loop * Improve `#unnormalize` by removing redundant calls to `rv.gsub!` * Improve `entity_expansion_limit` tests --- Example: ```ruby require "rexml/parsers/baseparser" entity_less_than = "<" entitiy_length = 100 filler_text = "A" filler_length = 100 feed = "#{entity_less_than * entitiy_length}#{filler_text * filler_length}" base_parser = REXML::Parsers::BaseParser.new("") base_parser.unnormalize(feed) # => "<" * 100 + "A" * 100 ``` Before this PR, the example above would require 100 iterations. After this PR, 1 iteration. --------- Co-authored-by: Sutou Kouhei --- lib/rexml/parsers/baseparser.rb | 53 ++++++++++++++++++++++++--------- test/test_pullparser.rb | 14 +++++---- test/test_sax.rb | 12 ++++---- 3 files changed, 54 insertions(+), 25 deletions(-) diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb index 342f9482..b471feff 100644 --- a/lib/rexml/parsers/baseparser.rb +++ b/lib/rexml/parsers/baseparser.rb @@ -8,6 +8,22 @@ module REXML module Parsers + unless [].respond_to?(:tally) + module EnumerableTally + refine Enumerable do + def tally + counts = {} + each do |item| + counts[item] ||= 0 + counts[item] += 1 + end + counts + end + end + end + using EnumerableTally + end + if StringScanner::Version < "3.0.8" module StringScannerCaptures refine StringScanner do @@ -547,20 +563,29 @@ def unnormalize( string, entities=nil, filter=nil ) [Integer(m)].pack('U*') } matches.collect!{|x|x[0]}.compact! + if filter + matches.reject! do |entity_reference| + filter.include?(entity_reference) + end + end if matches.size > 0 - matches.each do |entity_reference| - unless filter and filter.include?(entity_reference) - entity_value = entity( entity_reference, entities ) - if entity_value - re = Private::DEFAULT_ENTITIES_PATTERNS[entity_reference] || /&#{entity_reference};/ - rv.gsub!( re, entity_value ) - if rv.bytesize > Security.entity_expansion_text_limit - raise "entity expansion has grown too large" - end - else - er = DEFAULT_ENTITIES[entity_reference] - rv.gsub!( er[0], er[2] ) if er + matches.tally.each do |entity_reference, n| + entity_expansion_count_before = @entity_expansion_count + entity_value = entity( entity_reference, entities ) + if entity_value + if n > 1 + entity_expansion_count_delta = + @entity_expansion_count - entity_expansion_count_before + record_entity_expansion(entity_expansion_count_delta * (n - 1)) + end + re = Private::DEFAULT_ENTITIES_PATTERNS[entity_reference] || /&#{entity_reference};/ + rv.gsub!( re, entity_value ) + if rv.bytesize > Security.entity_expansion_text_limit + raise "entity expansion has grown too large" end + else + er = DEFAULT_ENTITIES[entity_reference] + rv.gsub!( er[0], er[2] ) if er end end rv.gsub!( Private::DEFAULT_ENTITIES_PATTERNS['amp'], '&' ) @@ -570,8 +595,8 @@ def unnormalize( string, entities=nil, filter=nil ) private - def record_entity_expansion - @entity_expansion_count += 1 + def record_entity_expansion(delta=1) + @entity_expansion_count += delta if @entity_expansion_count > Security.entity_expansion_limit raise "number of entity expansions exceeded, processing aborted." end diff --git a/test/test_pullparser.rb b/test/test_pullparser.rb index 827fad1d..dbde8779 100644 --- a/test/test_pullparser.rb +++ b/test/test_pullparser.rb @@ -206,21 +206,23 @@ def test_empty_value XML + REXML::Security.entity_expansion_limit = 100000 parser = REXML::Parsers::PullParser.new(source) - assert_raise(RuntimeError.new("number of entity expansions exceeded, processing aborted.")) do - while parser.has_next? - parser.pull - end + while parser.has_next? + parser.pull end + assert_equal(11111, parser.entity_expansion_count) - REXML::Security.entity_expansion_limit = 100 + REXML::Security.entity_expansion_limit = @default_entity_expansion_limit parser = REXML::Parsers::PullParser.new(source) assert_raise(RuntimeError.new("number of entity expansions exceeded, processing aborted.")) do while parser.has_next? parser.pull end end - assert_equal(101, parser.entity_expansion_count) + assert do + parser.entity_expansion_count > @default_entity_expansion_limit + end end def test_with_default_entity diff --git a/test/test_sax.rb b/test/test_sax.rb index f452de50..d31de183 100644 --- a/test/test_sax.rb +++ b/test/test_sax.rb @@ -147,17 +147,19 @@ def test_empty_value XML + REXML::Security.entity_expansion_limit = 100000 sax = REXML::Parsers::SAX2Parser.new(source) - assert_raise(RuntimeError.new("number of entity expansions exceeded, processing aborted.")) do - sax.parse - end + sax.parse + assert_equal(11111, sax.entity_expansion_count) - REXML::Security.entity_expansion_limit = 100 + REXML::Security.entity_expansion_limit = @default_entity_expansion_limit sax = REXML::Parsers::SAX2Parser.new(source) assert_raise(RuntimeError.new("number of entity expansions exceeded, processing aborted.")) do sax.parse end - assert_equal(101, sax.entity_expansion_count) + assert do + sax.entity_expansion_count > @default_entity_expansion_limit + end end def test_with_default_entity From 1c76dbbb7c5f001f4b931b8055e8e2d7578da760 Mon Sep 17 00:00:00 2001 From: Viktor Ivarsson Date: Sat, 17 Aug 2024 04:09:54 +0200 Subject: [PATCH 03/91] Fix RuntimeError in `REXML::Parsers::BaseParser` for valid feeds (#199) GitHub: fix GH-198 Change `#entity` to not match against default entities After this change, the following example will not raise a RuntimeError: ```ruby # rexml/refactor_entity_example.rb $LOAD_PATH.unshift(File.expand_path("lib")) require "rexml/parsers/baseparser" valid_feed = "<p>#{'A' * 10_240}</p>" base_parser = REXML::Parsers::BaseParser.new("") base_parser.unnormalize(valid_feed) # => "

" + "A" * 10_240 + "

" ``` Default entities now gets substituted by this block instead https://github.com/ruby/rexml/blob/e14847cee53d26eb162ad786ba12e3cd7a86fce0/lib/rexml/parsers/baseparser.rb#L560-L563 --------- Co-authored-by: Sutou Kouhei Co-authored-by: NAITOH Jun --- lib/rexml/parsers/baseparser.rb | 16 +++++++------- test/test_pullparser.rb | 30 ++++++++++++++++++++++++++ test/test_sax.rb | 24 +++++++++++++++++++++ test/test_stream.rb | 37 +++++++++++++++++++++++++++++++++ 4 files changed, 98 insertions(+), 9 deletions(-) diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb index b471feff..c03f375f 100644 --- a/lib/rexml/parsers/baseparser.rb +++ b/lib/rexml/parsers/baseparser.rb @@ -521,15 +521,13 @@ def pull_event private :pull_event def entity( reference, entities ) - value = nil - value = entities[ reference ] if entities - if value - record_entity_expansion - else - value = DEFAULT_ENTITIES[ reference ] - value = value[2] if value - end - unnormalize( value, entities ) if value + return unless entities + + value = entities[ reference ] + return if value.nil? + + record_entity_expansion + unnormalize( value, entities ) end # Escapes all possible entities diff --git a/test/test_pullparser.rb b/test/test_pullparser.rb index dbde8779..005a106a 100644 --- a/test/test_pullparser.rb +++ b/test/test_pullparser.rb @@ -254,6 +254,36 @@ def test_with_default_entity end end + def test_with_only_default_entities + member_value = "<p>#{'A' * @default_entity_expansion_text_limit}</p>" + source = <<-XML + + +#{member_value} + + XML + + parser = REXML::Parsers::PullParser.new(source) + events = {} + element_name = '' + while parser.has_next? + event = parser.pull + case event.event_type + when :start_element + element_name = event[0] + when :text + events[element_name] = event[1] + end + end + + expected_value = "

#{'A' * @default_entity_expansion_text_limit}

" + assert_equal(expected_value, events['member'].strip) + assert_equal(0, parser.entity_expansion_count) + assert do + events['member'].bytesize > @default_entity_expansion_text_limit + end + end + def test_entity_expansion_text_limit source = <<-XML + +#{member_value} + + XML + + sax = REXML::Parsers::SAX2Parser.new(source) + text_value = nil + sax.listen(:characters, ["member"]) do |text| + text_value = text + end + sax.parse + + expected_value = "

#{'A' * @default_entity_expansion_text_limit}

" + assert_equal(expected_value, text_value.strip) + assert_equal(0, sax.entity_expansion_count) + assert do + text_value.bytesize > @default_entity_expansion_text_limit + end + end + def test_entity_expansion_text_limit source = <<-XML + +#{member_value} + + XML + + listener = MyListener.new + class << listener + attr_accessor :text_value + def text(text) + @text_value << text + end + end + listener.text_value = "" + REXML::Document.parse_stream(source, listener) + + expected_value = "

#{'A' * @default_entity_expansion_text_limit}

" + assert_equal(expected_value, listener.text_value.strip) + assert do + listener.text_value.bytesize > @default_entity_expansion_text_limit + end + end + end # For test_listener class RequestReader From c8110b4830c990453e167ccca934e65858308fa1 Mon Sep 17 00:00:00 2001 From: NAITOH Jun Date: Sat, 17 Aug 2024 11:25:28 +0900 Subject: [PATCH 04/91] Fix to not allow parameter entity references at internal subsets (#191) ## Why? In the internal subset of DTD, references to parameter entities are not allowed within markup declarations. See: https://www.w3.org/TR/xml/#wfc-PEinInternalSubset > Well-formedness constraint: PEs in Internal Subset > In the internal DTD subset, parameter-entity references MUST NOT occur within markup declarations; they may occur where markup declarations can occur. (This does not apply to references that occur in external parameter entities or to the external subset.) --- lib/rexml/entity.rb | 52 ++-------------- lib/rexml/parsers/baseparser.rb | 3 + test/test_document.rb | 50 ---------------- test/test_entity.rb | 102 +++++++++++++++++++++++++------- 4 files changed, 89 insertions(+), 118 deletions(-) diff --git a/lib/rexml/entity.rb b/lib/rexml/entity.rb index 573db691..12bbad3f 100644 --- a/lib/rexml/entity.rb +++ b/lib/rexml/entity.rb @@ -12,6 +12,7 @@ class Entity < Child EXTERNALID = "(?:(?:(SYSTEM)\\s+#{SYSTEMLITERAL})|(?:(PUBLIC)\\s+#{PUBIDLITERAL}\\s+#{SYSTEMLITERAL}))" NDATADECL = "\\s+NDATA\\s+#{NAME}" PEREFERENCE = "%#{NAME};" + PEREFERENCE_RE = /#{PEREFERENCE}/um ENTITYVALUE = %Q{((?:"(?:[^%&"]|#{PEREFERENCE}|#{REFERENCE})*")|(?:'([^%&']|#{PEREFERENCE}|#{REFERENCE})*'))} PEDEF = "(?:#{ENTITYVALUE}|#{EXTERNALID})" ENTITYDEF = "(?:#{ENTITYVALUE}|(?:#{EXTERNALID}(#{NDATADECL})?))" @@ -19,7 +20,7 @@ class Entity < Child GEDECL = "" ENTITYDECL = /\s*(?:#{GEDECL})|(?:#{PEDECL})/um - attr_reader :name, :external, :ref, :ndata, :pubid + attr_reader :name, :external, :ref, :ndata, :pubid, :value # Create a new entity. Simple entities can be constructed by passing a # name, value to the constructor; this creates a generic, plain entity @@ -68,14 +69,11 @@ def Entity::matches? string end # Evaluates to the unnormalized value of this entity; that is, replacing - # all entities -- both %ent; and &ent; entities. This differs from - # +value()+ in that +value+ only replaces %ent; entities. + # &ent; entities. def unnormalized document.record_entity_expansion unless document.nil? - v = value() - return nil if v.nil? - @unnormalized = Text::unnormalize(v, parent) - @unnormalized + return nil if @value.nil? + @unnormalized = Text::unnormalize(@value, parent) end #once :unnormalized @@ -121,46 +119,6 @@ def to_s write rv rv end - - PEREFERENCE_RE = /#{PEREFERENCE}/um - # Returns the value of this entity. At the moment, only internal entities - # are processed. If the value contains internal references (IE, - # %blah;), those are replaced with their values. IE, if the doctype - # contains: - # - # - # then: - # doctype.entity('yada').value #-> "nanoo bar nanoo" - def value - @resolved_value ||= resolve_value - end - - def parent=(other) - @resolved_value = nil - super - end - - private - def resolve_value - return nil if @value.nil? - return @value unless @value.match?(PEREFERENCE_RE) - - matches = @value.scan(PEREFERENCE_RE) - rv = @value.clone - if @parent - sum = 0 - matches.each do |entity_reference| - entity_value = @parent.entity( entity_reference[0] ) - if sum + entity_value.bytesize > Security.entity_expansion_text_limit - raise "entity expansion has grown too large" - else - sum += entity_value.bytesize - end - rv.gsub!( /%#{entity_reference.join};/um, entity_value ) - end - end - rv - end end # This is a set of entity constants -- the ones defined in the XML diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb index c03f375f..e38ff86e 100644 --- a/lib/rexml/parsers/baseparser.rb +++ b/lib/rexml/parsers/baseparser.rb @@ -141,6 +141,7 @@ class BaseParser } module Private + PEREFERENCE_PATTERN = /#{PEREFERENCE}/um TAG_PATTERN = /((?>#{QNAME_STR}))\s*/um CLOSE_PATTERN = /(#{QNAME_STR})\s*>/um ATTLISTDECL_END = /\s+#{NAME}(?:#{ATTDEF})*\s*>/um @@ -350,6 +351,8 @@ def pull_event match[4] = match[4][1..-2] # HREF match.delete_at(5) if match.size > 5 # Chop out NDATA decl # match is [ :entity, name, PUBLIC, pubid, href(, ndata)? ] + elsif Private::PEREFERENCE_PATTERN.match?(match[2]) + raise REXML::ParseException.new("Parameter entity references forbidden in internal subset: #{match[2]}", @source) else match[2] = match[2][1..-2] match.pop if match.size == 4 diff --git a/test/test_document.rb b/test/test_document.rb index 72ec3579..25a8828f 100644 --- a/test/test_document.rb +++ b/test/test_document.rb @@ -147,56 +147,6 @@ def test_entity_expansion_text_limit assert_equal(90, doc.root.children.first.value.bytesize) end end - - class ParameterEntityTest < self - def test_have_value - xml = < - - - - - - - -]> - -EOF - - assert_raise(REXML::ParseException) do - REXML::Document.new(xml) - end - REXML::Security.entity_expansion_limit = 100 - assert_equal(100, REXML::Security.entity_expansion_limit) - assert_raise(REXML::ParseException) do - REXML::Document.new(xml) - end - end - - def test_empty_value - xml = < - - - - - - - -]> - -EOF - - REXML::Document.new(xml) - REXML::Security.entity_expansion_limit = 90 - assert_equal(90, REXML::Security.entity_expansion_limit) - assert_raise(REXML::ParseException) do - REXML::Document.new(xml) - end - end - end end def test_tag_in_cdata_with_not_ascii_only_but_ascii8bit_encoding_source diff --git a/test/test_entity.rb b/test/test_entity.rb index a2b262f7..89f83894 100644 --- a/test/test_entity.rb +++ b/test/test_entity.rb @@ -59,8 +59,7 @@ def test_parse_entity def test_constructor one = [ %q{}, - %q{}, - %q{}, + %q{}, '', '' ] source = %q{ - - + ', + "a", + "B", + "B", + "B", + ], + [ + entity.to_s, + entity.name, + entity.value, + entity.normalized, + entity.unnormalized, + ]) + end + + def test_readers_without_reference + entity = REXML::Entity.new([:entitydecl, "a", "&b;"]) + assert_equal([ + '', + "a", + "&b;", + "&b;", + "&b;", + ], + [ + entity.to_s, + entity.name, + entity.value, + entity.normalized, + entity.unnormalized, + ]) + end + + def test_readers_with_nested_references + doctype = REXML::DocType.new('root') + doctype.add(REXML::Entity.new([:entitydecl, "a", "&b;"])) + doctype.add(REXML::Entity.new([:entitydecl, "b", "X"])) + assert_equal([ + "a", + "&b;", + "&b;", + "X", + "b", + "X", + "X", + "X", + ], + [ + doctype.entities["a"].name, + doctype.entities["a"].value, + doctype.entities["a"].normalized, + doctype.entities["a"].unnormalized, + doctype.entities["b"].name, + doctype.entities["b"].value, + doctype.entities["b"].normalized, + doctype.entities["b"].unnormalized, + ]) + end + + def test_parameter_entity_reference_forbidden_by_internal_subset_in_parser + source = ' ]>' + parser = REXML::Parsers::BaseParser.new(source) + exception = assert_raise(REXML::ParseException) do + while parser.has_next? + parser.pull + end + end + assert_equal(<<-DETAIL, exception.to_s) +Parameter entity references forbidden in internal subset: "%a;" +Line: 1 +Position: 54 +Last 80 unconsumed characters: + DETAIL + end + def test_entity_string_limit template = ' ]> $' len = 5120 # 5k per entity @@ -122,22 +198,6 @@ def test_entity_string_limit end end - def test_entity_string_limit_for_parameter_entity - template = ' ]>' - len = 5120 # 5k per entity - template.sub!(/\^/, "B" * len) - - # 10k is OK - entities = '%a;' * 2 # 5k entity * 2 = 10k - REXML::Document.new(template.sub(/\$/, entities)) - - # above 10k explodes - entities = '%a;' * 3 # 5k entity * 2 = 15k - assert_raise(REXML::ParseException) do - REXML::Document.new(template.sub(/\$/, entities)) - end - end - def test_raw source = ' @@ -161,7 +221,7 @@ def test_lazy_evaluation def test_entity_replacement source = %q{ - ]> + ]> &WhatHeSaid;} d = REXML::Document.new( source ) From 790ad5c8530d1b6f6ad7445c085a7403119c5150 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Sat, 17 Aug 2024 15:54:14 +0900 Subject: [PATCH 05/91] test: split duplicated attribute case and namespace conflict case --- test/test_core.rb | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/test/test_core.rb b/test/test_core.rb index e1fba8a7..b079c203 100644 --- a/test/test_core.rb +++ b/test/test_core.rb @@ -114,22 +114,35 @@ def test_attribute name4='test4'/>).join(' '), e.to_s end - def test_attribute_namespace_conflict + def test_attribute_duplicated # https://www.w3.org/TR/xml-names/#uniqAttrs message = <<-MESSAGE.chomp Duplicate attribute "a" -Line: 4 -Position: 140 +Line: 2 +Position: 24 Last 80 unconsumed characters: /> MESSAGE assert_raise(REXML::ParseException.new(message)) do Document.new(<<-XML) + + + + XML + end + end + + def test_attribute_namespace_conflict + # https://www.w3.org/TR/xml-names/#uniqAttrs + message = <<-MESSAGE.chomp +Namespace conflict in adding attribute "a": Prefix "n1" = "http://www.w3.org" and prefix "n2" = "http://www.w3.org" + MESSAGE + assert_raise(REXML::ParseException.new(message)) do + Document.new(<<-XML) - - + xmlns:n2="http://www.w3.org"> + XML end From 6422fa34494fd4145d7bc68fbbe9525d42becf62 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Sat, 17 Aug 2024 16:10:05 +0900 Subject: [PATCH 06/91] Use loop instead of recursive call for Element#root It's for performance and avoiding stack level too deep. --- lib/rexml/element.rb | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/lib/rexml/element.rb b/lib/rexml/element.rb index a5808d7c..27132926 100644 --- a/lib/rexml/element.rb +++ b/lib/rexml/element.rb @@ -441,9 +441,14 @@ def root_node # Related: #root_node, #document. # def root - return elements[1] if self.kind_of? Document - return self if parent.kind_of? Document or parent.nil? - return parent.root + target = self + while target + return target.elements[1] if target.kind_of? Document + parent = target.parent + return target if parent.kind_of? Document or parent.nil? + target = parent + end + nil end # :call-seq: From fdbffe744b38811be8b1cf6a9eec3eea4d71c412 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Sat, 17 Aug 2024 16:14:19 +0900 Subject: [PATCH 07/91] Use loop instead of recursive call for Element#namespace It's for performance and avoiding stack level too deep. --- lib/rexml/element.rb | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/lib/rexml/element.rb b/lib/rexml/element.rb index 27132926..eb802165 100644 --- a/lib/rexml/element.rb +++ b/lib/rexml/element.rb @@ -624,8 +624,12 @@ def namespace(prefix=nil) else prefix = "xmlns:#{prefix}" unless prefix[0,5] == 'xmlns' end - ns = attributes[ prefix ] - ns = parent.namespace(prefix) if ns.nil? and parent + ns = nil + target = self + while ns.nil? and target + ns = target.attributes[prefix] + target = target.parent + end ns = '' if ns.nil? and prefix == 'xmlns' return ns end From df3a0cc83013f3cde7b7c2044e3ce00bcad321cb Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Sat, 17 Aug 2024 16:18:58 +0900 Subject: [PATCH 08/91] test: fix indent --- test/parser/test_sax2.rb | 276 ++++++++++++++++---------------- test/parser/test_tree.rb | 48 +++--- test/parser/test_ultra_light.rb | 94 +++++------ 3 files changed, 209 insertions(+), 209 deletions(-) diff --git a/test/parser/test_sax2.rb b/test/parser/test_sax2.rb index 91d135f5..9cc76ffb 100644 --- a/test/parser/test_sax2.rb +++ b/test/parser/test_sax2.rb @@ -4,200 +4,200 @@ require "rexml/sax2listener" module REXMLTests -class TestSAX2Parser < Test::Unit::TestCase - class TestDocumentTypeDeclaration < self - private - def xml(internal_subset) - <<-XML + class TestSAX2Parser < Test::Unit::TestCase + class TestDocumentTypeDeclaration < self + private + def xml(internal_subset) + <<-XML - XML - end + XML + end - class TestEntityDeclaration < self - class Listener - include REXML::SAX2Listener - attr_reader :entity_declarations - def initialize - @entity_declarations = [] - end + class TestEntityDeclaration < self + class Listener + include REXML::SAX2Listener + attr_reader :entity_declarations + def initialize + @entity_declarations = [] + end - def entitydecl(declaration) - super - @entity_declarations << declaration + def entitydecl(declaration) + super + @entity_declarations << declaration + end end - end - private - def parse(internal_subset) - listener = Listener.new - parser = REXML::Parsers::SAX2Parser.new(xml(internal_subset)) - parser.listen(listener) - parser.parse - listener.entity_declarations - end + private + def parse(internal_subset) + listener = Listener.new + parser = REXML::Parsers::SAX2Parser.new(xml(internal_subset)) + parser.listen(listener) + parser.parse + listener.entity_declarations + end - class TestGeneralEntity < self - class TestValue < self - def test_double_quote - assert_equal([["name", "value"]], parse(<<-INTERNAL_SUBSET)) + class TestGeneralEntity < self + class TestValue < self + def test_double_quote + assert_equal([["name", "value"]], parse(<<-INTERNAL_SUBSET)) - INTERNAL_SUBSET - end + INTERNAL_SUBSET + end - def test_single_quote - assert_equal([["name", "value"]], parse(<<-INTERNAL_SUBSET)) + def test_single_quote + assert_equal([["name", "value"]], parse(<<-INTERNAL_SUBSET)) - INTERNAL_SUBSET + INTERNAL_SUBSET + end end - end - class TestExternlID < self - class TestSystem < self - def test_with_ndata - declaration = [ - "name", - "SYSTEM", "system-literal", - "NDATA", "ndata-name", - ] - assert_equal([declaration], - parse(<<-INTERNAL_SUBSET)) + class TestExternlID < self + class TestSystem < self + def test_with_ndata + declaration = [ + "name", + "SYSTEM", "system-literal", + "NDATA", "ndata-name", + ] + assert_equal([declaration], + parse(<<-INTERNAL_SUBSET)) + INTERNAL_SUBSET + end + + def test_without_ndata + declaration = [ + "name", + "SYSTEM", "system-literal", + ] + assert_equal([declaration], + parse(<<-INTERNAL_SUBSET)) + + INTERNAL_SUBSET + end + end + + class TestPublic < self + def test_with_ndata + declaration = [ + "name", + "PUBLIC", "public-literal", "system-literal", + "NDATA", "ndata-name", + ] + assert_equal([declaration], + parse(<<-INTERNAL_SUBSET)) + + INTERNAL_SUBSET + end + + def test_without_ndata + declaration = [ + "name", + "PUBLIC", "public-literal", "system-literal", + ] + assert_equal([declaration], parse(<<-INTERNAL_SUBSET)) + + INTERNAL_SUBSET + end + end + end + end + + class TestParameterEntity < self + class TestValue < self + def test_double_quote + assert_equal([["%", "name", "value"]], parse(<<-INTERNAL_SUBSET)) + INTERNAL_SUBSET end - def test_without_ndata - declaration = [ - "name", - "SYSTEM", "system-literal", - ] - assert_equal([declaration], - parse(<<-INTERNAL_SUBSET)) - + def test_single_quote + assert_equal([["%", "name", "value"]], parse(<<-INTERNAL_SUBSET)) + INTERNAL_SUBSET end end - class TestPublic < self - def test_with_ndata + class TestExternlID < self + def test_system declaration = [ + "%", "name", - "PUBLIC", "public-literal", "system-literal", - "NDATA", "ndata-name", + "SYSTEM", "system-literal", ] assert_equal([declaration], - parse(<<-INTERNAL_SUBSET)) - + parse(<<-INTERNAL_SUBSET)) + INTERNAL_SUBSET end - def test_without_ndata + def test_public declaration = [ + "%", "name", "PUBLIC", "public-literal", "system-literal", ] assert_equal([declaration], parse(<<-INTERNAL_SUBSET)) - + INTERNAL_SUBSET end end end end - class TestParameterEntity < self - class TestValue < self - def test_double_quote - assert_equal([["%", "name", "value"]], parse(<<-INTERNAL_SUBSET)) - - INTERNAL_SUBSET + class TestNotationDeclaration < self + class Listener + include REXML::SAX2Listener + attr_reader :notation_declarations + def initialize + @notation_declarations = [] end - def test_single_quote - assert_equal([["%", "name", "value"]], parse(<<-INTERNAL_SUBSET)) - - INTERNAL_SUBSET + def notationdecl(*declaration) + super + @notation_declarations << declaration end end + private + def parse(internal_subset) + listener = Listener.new + parser = REXML::Parsers::SAX2Parser.new(xml(internal_subset)) + parser.listen(listener) + parser.parse + listener.notation_declarations + end + class TestExternlID < self def test_system - declaration = [ - "%", - "name", - "SYSTEM", "system-literal", - ] + declaration = ["name", "SYSTEM", nil, "system-literal"] assert_equal([declaration], - parse(<<-INTERNAL_SUBSET)) - - INTERNAL_SUBSET - end - - def test_public - declaration = [ - "%", - "name", - "PUBLIC", "public-literal", "system-literal", - ] - assert_equal([declaration], parse(<<-INTERNAL_SUBSET)) - - INTERNAL_SUBSET - end - end - end - end - - class TestNotationDeclaration < self - class Listener - include REXML::SAX2Listener - attr_reader :notation_declarations - def initialize - @notation_declarations = [] - end - - def notationdecl(*declaration) - super - @notation_declarations << declaration - end - end - - private - def parse(internal_subset) - listener = Listener.new - parser = REXML::Parsers::SAX2Parser.new(xml(internal_subset)) - parser.listen(listener) - parser.parse - listener.notation_declarations - end - - class TestExternlID < self - def test_system - declaration = ["name", "SYSTEM", nil, "system-literal"] - assert_equal([declaration], - parse(<<-INTERNAL_SUBSET)) + parse(<<-INTERNAL_SUBSET)) - INTERNAL_SUBSET - end + INTERNAL_SUBSET + end - def test_public - declaration = ["name", "PUBLIC", "public-literal", "system-literal"] - assert_equal([declaration], parse(<<-INTERNAL_SUBSET)) + def test_public + declaration = ["name", "PUBLIC", "public-literal", "system-literal"] + assert_equal([declaration], parse(<<-INTERNAL_SUBSET)) - INTERNAL_SUBSET + INTERNAL_SUBSET + end end - end - class TestPublicID < self - def test_literal - declaration = ["name", "PUBLIC", "public-literal", nil] - assert_equal([declaration], - parse(<<-INTERNAL_SUBSET)) + class TestPublicID < self + def test_literal + declaration = ["name", "PUBLIC", "public-literal", nil] + assert_equal([declaration], + parse(<<-INTERNAL_SUBSET)) - INTERNAL_SUBSET + INTERNAL_SUBSET + end end end end end end -end diff --git a/test/parser/test_tree.rb b/test/parser/test_tree.rb index 8a5d9d12..88bc075c 100644 --- a/test/parser/test_tree.rb +++ b/test/parser/test_tree.rb @@ -4,40 +4,40 @@ require "rexml/parsers/treeparser" module REXMLTests -class TestTreeParser < Test::Unit::TestCase - class TestInvalid < self - def test_unmatched_close_tag - xml = "" - exception = assert_raise(REXML::ParseException) do - parse(xml) - end - assert_equal(<<-MESSAGE, exception.to_s) + class TestTreeParser < Test::Unit::TestCase + class TestInvalid < self + def test_unmatched_close_tag + xml = "" + exception = assert_raise(REXML::ParseException) do + parse(xml) + end + assert_equal(<<-MESSAGE, exception.to_s) Missing end tag for 'root' (got 'not-root') Line: 1 Position: #{xml.bytesize} Last 80 unconsumed characters: - MESSAGE - end - - def test_no_close_tag - xml = "" - exception = assert_raise(REXML::ParseException) do - parse(xml) + MESSAGE end - assert_equal(<<-MESSAGE, exception.to_s) + + def test_no_close_tag + xml = "" + exception = assert_raise(REXML::ParseException) do + parse(xml) + end + assert_equal(<<-MESSAGE, exception.to_s) No close tag for /root Line: 1 Position: #{xml.bytesize} Last 80 unconsumed characters: - MESSAGE - end + MESSAGE + end - private - def parse(xml) - document = REXML::Document.new - parser = REXML::Parsers::TreeParser.new(xml, document) - parser.parse + private + def parse(xml) + document = REXML::Document.new + parser = REXML::Parsers::TreeParser.new(xml, document) + parser.parse + end end end end -end diff --git a/test/parser/test_ultra_light.rb b/test/parser/test_ultra_light.rb index b3f576ff..d1364d6a 100644 --- a/test/parser/test_ultra_light.rb +++ b/test/parser/test_ultra_light.rb @@ -3,66 +3,66 @@ require "rexml/parsers/ultralightparser" module REXMLTests -class TestUltraLightParser < Test::Unit::TestCase - class TestDocumentTypeDeclaration < self - def test_entity_declaration - assert_equal([ - [ - :start_doctype, - :parent, - "root", - "SYSTEM", - "urn:x-test", - nil, - [:entitydecl, "name", "value"] + class TestUltraLightParser < Test::Unit::TestCase + class TestDocumentTypeDeclaration < self + def test_entity_declaration + assert_equal([ + [ + :start_doctype, + :parent, + "root", + "SYSTEM", + "urn:x-test", + nil, + [:entitydecl, "name", "value"] + ], + [:start_element, :parent, "root", {}], ], - [:start_element, :parent, "root", {}], - ], - parse(<<-INTERNAL_SUBSET)) + parse(<<-INTERNAL_SUBSET)) - INTERNAL_SUBSET - end + INTERNAL_SUBSET + end - private - def xml(internal_subset) - <<-XML + private + def xml(internal_subset) + <<-XML - XML - end + XML + end - def parse(internal_subset) - parser = REXML::Parsers::UltraLightParser.new(xml(internal_subset)) - normalize(parser.parse) - end + def parse(internal_subset) + parser = REXML::Parsers::UltraLightParser.new(xml(internal_subset)) + normalize(parser.parse) + end - def normalize(root) - root.collect do |child| - normalize_child(child) + def normalize(root) + root.collect do |child| + normalize_child(child) + end end - end - def normalize_child(child) - tag = child.first - case tag - when :start_doctype - normalized_parent = :parent - normalized_doctype = child.dup - normalized_doctype[1] = normalized_parent - normalized_doctype - when :start_element - tag, _parent, name, attributes, *children = child - normalized_parent = :parent - normalized_children = children.collect do |sub_child| - normalize_child(sub_child) + def normalize_child(child) + tag = child.first + case tag + when :start_doctype + normalized_parent = :parent + normalized_doctype = child.dup + normalized_doctype[1] = normalized_parent + normalized_doctype + when :start_element + tag, _parent, name, attributes, *children = child + normalized_parent = :parent + normalized_children = children.collect do |sub_child| + normalize_child(sub_child) + end + [tag, normalized_parent, name, attributes, *normalized_children] + else + child end - [tag, normalized_parent, name, attributes, *normalized_children] - else - child end end end end -end From 6e00a14daf2f901df535eafe96cc94d43a957ffe Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Sat, 17 Aug 2024 16:20:50 +0900 Subject: [PATCH 09/91] test: fix indent --- test/parser/test_sax2.rb | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/test/parser/test_sax2.rb b/test/parser/test_sax2.rb index 9cc76ffb..c2548907 100644 --- a/test/parser/test_sax2.rb +++ b/test/parser/test_sax2.rb @@ -177,12 +177,12 @@ def test_system assert_equal([declaration], parse(<<-INTERNAL_SUBSET)) - INTERNAL_SUBSET - end + INTERNAL_SUBSET + end - def test_public - declaration = ["name", "PUBLIC", "public-literal", "system-literal"] - assert_equal([declaration], parse(<<-INTERNAL_SUBSET)) + def test_public + declaration = ["name", "PUBLIC", "public-literal", "system-literal"] + assert_equal([declaration], parse(<<-INTERNAL_SUBSET)) INTERNAL_SUBSET end From 35e1681a179c28d5b6ec97d4ab1c110e5ac00303 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Sat, 17 Aug 2024 16:21:19 +0900 Subject: [PATCH 10/91] test tree-parser: move common method to base class --- test/parser/test_tree.rb | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/test/parser/test_tree.rb b/test/parser/test_tree.rb index 88bc075c..cdd28d2c 100644 --- a/test/parser/test_tree.rb +++ b/test/parser/test_tree.rb @@ -5,6 +5,12 @@ module REXMLTests class TestTreeParser < Test::Unit::TestCase + private def parse(xml) + document = REXML::Document.new + parser = REXML::Parsers::TreeParser.new(xml, document) + parser.parse + end + class TestInvalid < self def test_unmatched_close_tag xml = "" @@ -31,13 +37,6 @@ def test_no_close_tag Last 80 unconsumed characters: MESSAGE end - - private - def parse(xml) - document = REXML::Document.new - parser = REXML::Parsers::TreeParser.new(xml, document) - parser.parse - end end end end From 2b47b161db19c38c5e45e36c2008c045543e976e Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Sat, 17 Aug 2024 17:05:29 +0900 Subject: [PATCH 11/91] parser: move duplicated end tag check to BaseParser --- lib/rexml/parsers/baseparser.rb | 4 ++++ lib/rexml/parsers/streamparser.rb | 8 -------- lib/rexml/parsers/treeparser.rb | 7 ------- test/parser/test_tree.rb | 2 +- 4 files changed, 5 insertions(+), 16 deletions(-) diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb index e38ff86e..093af36a 100644 --- a/lib/rexml/parsers/baseparser.rb +++ b/lib/rexml/parsers/baseparser.rb @@ -249,6 +249,10 @@ def pull_event if @document_status == :in_doctype raise ParseException.new("Malformed DOCTYPE: unclosed", @source) end + unless @tags.empty? + path = "/" + @tags.join("/") + raise ParseException.new("Missing end tag for '#{path}'", @source) + end return [ :end_document ] end return @stack.shift if @stack.size > 0 diff --git a/lib/rexml/parsers/streamparser.rb b/lib/rexml/parsers/streamparser.rb index fa3ac496..e2da2a7d 100644 --- a/lib/rexml/parsers/streamparser.rb +++ b/lib/rexml/parsers/streamparser.rb @@ -7,7 +7,6 @@ class StreamParser def initialize source, listener @listener = listener @parser = BaseParser.new( source ) - @tag_stack = [] end def add_listener( listener ) @@ -20,21 +19,14 @@ def parse event = @parser.pull case event[0] when :end_document - unless @tag_stack.empty? - tag_path = "/" + @tag_stack.join("/") - raise ParseException.new("Missing end tag for '#{tag_path}'", - @parser.source) - end return when :start_element - @tag_stack << event[1] attrs = event[2].each do |n, v| event[2][n] = @parser.unnormalize( v ) end @listener.tag_start( event[1], attrs ) when :end_element @listener.tag_end( event[1] ) - @tag_stack.pop when :text unnormalized = @parser.unnormalize( event[1] ) @listener.text( unnormalized ) diff --git a/lib/rexml/parsers/treeparser.rb b/lib/rexml/parsers/treeparser.rb index 0cb6f7cc..4565a406 100644 --- a/lib/rexml/parsers/treeparser.rb +++ b/lib/rexml/parsers/treeparser.rb @@ -15,7 +15,6 @@ def add_listener( listener ) end def parse - tag_stack = [] entities = nil begin while true @@ -23,19 +22,13 @@ def parse #STDERR.puts "TREEPARSER GOT #{event.inspect}" case event[0] when :end_document - unless tag_stack.empty? - raise ParseException.new("No close tag for #{@build_context.xpath}", - @parser.source, @parser) - end return when :start_element - tag_stack.push(event[1]) el = @build_context = @build_context.add_element( event[1] ) event[2].each do |key, value| el.attributes[key]=Attribute.new(key,value,self) end when :end_element - tag_stack.pop @build_context = @build_context.parent when :text if @build_context[-1].instance_of? Text diff --git a/test/parser/test_tree.rb b/test/parser/test_tree.rb index cdd28d2c..315be9c2 100644 --- a/test/parser/test_tree.rb +++ b/test/parser/test_tree.rb @@ -31,7 +31,7 @@ def test_no_close_tag parse(xml) end assert_equal(<<-MESSAGE, exception.to_s) -No close tag for /root +Missing end tag for '/root' Line: 1 Position: #{xml.bytesize} Last 80 unconsumed characters: From cb158582f18cebb3bf7b3f21f230e2fb17d435aa Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Sat, 17 Aug 2024 17:39:14 +0900 Subject: [PATCH 12/91] parser: keep the current namespaces instead of stack of Set It improves namespace resolution performance for deep element. --- lib/rexml/parsers/baseparser.rb | 45 +++++++++++++++++++++++++-------- 1 file changed, 35 insertions(+), 10 deletions(-) diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb index 093af36a..9ed032d3 100644 --- a/lib/rexml/parsers/baseparser.rb +++ b/lib/rexml/parsers/baseparser.rb @@ -181,7 +181,8 @@ def stream=( source ) @tags = [] @stack = [] @entities = [] - @nsstack = [] + @namespaces = {} + @namespaces_restore_stack = [] end def position @@ -285,7 +286,6 @@ def pull_event @source.position = start_position raise REXML::ParseException.new(message, @source) end - @nsstack.unshift(Set.new) name = parse_name(base_error_message) if @source.match(/\s*\[/um, true) id = [nil, nil, nil] @@ -379,7 +379,7 @@ def pull_event val = attdef[4] if val == "#FIXED " pairs[attdef[0]] = val if attdef[0] =~ /^xmlns:(.*)/ - @nsstack[0] << $1 + @namespaces[$1] = val end end end @@ -432,7 +432,7 @@ def pull_event # here explicitly. @source.ensure_buffer if @source.match("/", true) - @nsstack.shift + @namespaces_restore_stack.pop last_tag = @tags.pop md = @source.match(Private::CLOSE_PATTERN, true) if md and !last_tag @@ -477,18 +477,18 @@ def pull_event @document_status = :in_element @prefixes.clear @prefixes << md[2] if md[2] - @nsstack.unshift(curr_ns=Set.new) - attributes, closed = parse_attributes(@prefixes, curr_ns) + push_namespaces_restore + attributes, closed = parse_attributes(@prefixes) # Verify that all of the prefixes have been defined for prefix in @prefixes - unless @nsstack.find{|k| k.member?(prefix)} + unless @namespaces.key?(prefix) raise UndefinedNamespaceException.new(prefix,@source,self) end end if closed @closed = tag - @nsstack.shift + pop_namespaces_restore else if @tags.empty? and @have_root raise ParseException.new("Malformed XML: Extra tag at the end of the document (got '<#{tag}')", @source) @@ -599,6 +599,31 @@ def unnormalize( string, entities=nil, filter=nil ) end private + def add_namespace(prefix, uri) + @namespaces_restore_stack.last[prefix] = @namespaces[prefix] + if uri.nil? + @namespaces.delete(prefix) + else + @namespaces[prefix] = uri + end + end + + def push_namespaces_restore + namespaces_restore = {} + @namespaces_restore_stack.push(namespaces_restore) + namespaces_restore + end + + def pop_namespaces_restore + namespaces_restore = @namespaces_restore_stack.pop + namespaces_restore.each do |prefix, uri| + if uri.nil? + @namespaces.delete(prefix) + else + @namespaces[prefix] = uri + end + end + end def record_entity_expansion(delta=1) @entity_expansion_count += delta @@ -727,7 +752,7 @@ def process_instruction [:processing_instruction, name, content] end - def parse_attributes(prefixes, curr_ns) + def parse_attributes(prefixes) attributes = {} closed = false while true @@ -770,7 +795,7 @@ def parse_attributes(prefixes, curr_ns) "(http://www.w3.org/TR/REC-xml-names/#ns-decl)" raise REXML::ParseException.new( msg, @source, self) end - curr_ns << local_part + add_namespace(local_part, value) elsif prefix prefixes << prefix unless prefix == "xml" end From 6109e0183cecf4f8b587d76209716cb1bbcd6bd5 Mon Sep 17 00:00:00 2001 From: NAITOH Jun Date: Wed, 21 Aug 2024 15:23:00 +0900 Subject: [PATCH 13/91] Fix a bug that Stream parser doesn't expand the user-defined entity references for "text" (#200) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Why? Pull parser expands character references and predefined entity references, but doesn't expand user-defined entity references. ## Change - text_stream_unnormalize.rb ``` $LOAD_PATH.unshift(File.expand_path("lib")) require 'rexml/document' require 'rexml/parsers/sax2parser' require 'rexml/parsers/pullparser' require 'rexml/parsers/streamparser' require 'rexml/streamlistener' xml = < ]>&la;&lala;<P> <I> <B> Text </B> </I>test™ EOS class StListener include REXML::StreamListener def text(text) puts text end end puts "REXML(DOM)" REXML::Document.new(xml).elements.each("/root/*") {|element| puts element.text} puts "" puts "REXML(Pull)" parser = REXML::Parsers::PullParser.new(xml) while parser.has_next? event = parser.pull case event.event_type when :text puts event[1] end end puts "" puts "REXML(Stream)" parser = REXML::Parsers::StreamParser.new(xml, StListener.new).parse puts "" puts "REXML(SAX)" sax = REXML::Parsers::SAX2Parser.new(xml) sax.listen(:characters) {|x| puts x } sax.parse ``` ## Before (master) ``` $ ruby text_stream_unnormalize.rb REXML(DOM) 1234 --1234--

Text test™ REXML(Pull) 1234 --1234--

Text test™ REXML(Stream) &la; #<= This &lala; #<= This

Text test™ REXML(SAX) 1234 --1234--

Text test™ ``` ## After(This PR) ``` $ ruby text_stream_unnormalize.rb REXML(DOM) 1234 --1234--

Text test™ REXML(Pull) 1234 --1234--

Text test™ REXML(Stream) 1234 --1234--

Text test™ REXML(SAX) 1234 --1234--

Text test™ ``` --- lib/rexml/parsers/streamparser.rb | 8 +- test/test_stream.rb | 141 +++++++++++++++++++++++++++++- 2 files changed, 147 insertions(+), 2 deletions(-) diff --git a/lib/rexml/parsers/streamparser.rb b/lib/rexml/parsers/streamparser.rb index e2da2a7d..7781fe44 100644 --- a/lib/rexml/parsers/streamparser.rb +++ b/lib/rexml/parsers/streamparser.rb @@ -7,12 +7,17 @@ class StreamParser def initialize source, listener @listener = listener @parser = BaseParser.new( source ) + @entities = {} end def add_listener( listener ) @parser.add_listener( listener ) end + def entity_expansion_count + @parser.entity_expansion_count + end + def parse # entity string while true @@ -28,7 +33,7 @@ def parse when :end_element @listener.tag_end( event[1] ) when :text - unnormalized = @parser.unnormalize( event[1] ) + unnormalized = @parser.unnormalize( event[1], @entities ) @listener.text( unnormalized ) when :processing_instruction @listener.instruction( *event[1,2] ) @@ -40,6 +45,7 @@ def parse when :comment, :attlistdecl, :cdata, :xmldecl, :elementdecl @listener.send( event[0].to_s, *event[1..-1] ) when :entitydecl, :notationdecl + @entities[ event[1] ] = event[2] if event.size == 3 @listener.send( event[0].to_s, event[1..-1] ) when :externalentity entity_reference = event[1] diff --git a/test/test_stream.rb b/test/test_stream.rb index 615d497f..782066c2 100644 --- a/test/test_stream.rb +++ b/test/test_stream.rb @@ -87,6 +87,42 @@ def entity(content) assert_equal(["ISOLat2"], listener.entities) end + + def test_entity_replacement + source = <<-XML + + + +]>&la;&lala; + XML + + listener = MyListener.new + class << listener + attr_accessor :text_values + def text(text) + @text_values << text + end + end + listener.text_values = [] + REXML::Document.parse_stream(source, listener) + assert_equal(["1234", "--1234--"], listener.text_values) + end + + def test_characters_predefined_entities + source = '<P> <I> <B> Text </B> </I>' + + listener = MyListener.new + class << listener + attr_accessor :text_value + def text(text) + @text_value << text + end + end + listener.text_value = "" + REXML::Document.parse_stream(source, listener) + assert_equal("

Text ", listener.text_value) + end end class EntityExpansionLimitTest < Test::Unit::TestCase @@ -100,6 +136,81 @@ def teardown REXML::Security.entity_expansion_text_limit = @default_entity_expansion_text_limit end + def test_have_value + source = <<-XML + + + + + + +]> + +&a; + + XML + + assert_raise(RuntimeError.new("entity expansion has grown too large")) do + REXML::Document.parse_stream(source, MyListener.new) + end + end + + def test_empty_value + source = <<-XML + + + + + + +]> + +&a; + + XML + + listener = MyListener.new + REXML::Security.entity_expansion_limit = 100000 + parser = REXML::Parsers::StreamParser.new( source, listener ) + parser.parse + assert_equal(11111, parser.entity_expansion_count) + + REXML::Security.entity_expansion_limit = @default_entity_expansion_limit + parser = REXML::Parsers::StreamParser.new( source, listener ) + assert_raise(RuntimeError.new("number of entity expansions exceeded, processing aborted.")) do + parser.parse + end + assert do + parser.entity_expansion_count > @default_entity_expansion_limit + end + end + + def test_with_default_entity + source = <<-XML + + + +]> + +&a; +&a2; +< + + XML + + listener = MyListener.new + REXML::Security.entity_expansion_limit = 4 + REXML::Document.parse_stream(source, listener) + + REXML::Security.entity_expansion_limit = 3 + assert_raise(RuntimeError.new("number of entity expansions exceeded, processing aborted.")) do + REXML::Document.parse_stream(source, listener) + end + end + def test_with_only_default_entities member_value = "<p>#{'A' * @default_entity_expansion_text_limit}</p>" source = <<-XML @@ -117,14 +228,42 @@ def text(text) end end listener.text_value = "" - REXML::Document.parse_stream(source, listener) + parser = REXML::Parsers::StreamParser.new( source, listener ) + parser.parse expected_value = "

#{'A' * @default_entity_expansion_text_limit}

" assert_equal(expected_value, listener.text_value.strip) + assert_equal(0, parser.entity_expansion_count) assert do listener.text_value.bytesize > @default_entity_expansion_text_limit end end + + def test_entity_expansion_text_limit + source = <<-XML + + + + + +]> +&a; + XML + + listener = MyListener.new + class << listener + attr_accessor :text_value + def text(text) + @text_value << text + end + end + listener.text_value = "" + REXML::Security.entity_expansion_text_limit = 90 + REXML::Document.parse_stream(source, listener) + + assert_equal(90, listener.text_value.size) + end end # For test_listener From 7cb5eaeb221c322b9912f724183294d8ce96bae3 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Sat, 17 Aug 2024 17:45:52 +0900 Subject: [PATCH 14/91] parser tree: improve namespace conflicted attribute check performance It was slow for deep element. Reported by l33thaxor. Thanks!!! --- lib/rexml/element.rb | 11 ----------- lib/rexml/parsers/baseparser.rb | 15 +++++++++++++++ test/parse/test_element.rb | 14 ++++++++++++++ test/test_core.rb | 4 ++++ 4 files changed, 33 insertions(+), 11 deletions(-) diff --git a/lib/rexml/element.rb b/lib/rexml/element.rb index eb802165..4e3a60b9 100644 --- a/lib/rexml/element.rb +++ b/lib/rexml/element.rb @@ -2384,17 +2384,6 @@ def []=( name, value ) elsif old_attr.kind_of? Hash old_attr[value.prefix] = value elsif old_attr.prefix != value.prefix - # Check for conflicting namespaces - if value.prefix != "xmlns" and old_attr.prefix != "xmlns" - old_namespace = old_attr.namespace - new_namespace = value.namespace - if old_namespace == new_namespace - raise ParseException.new( - "Namespace conflict in adding attribute \"#{value.name}\": "+ - "Prefix \"#{old_attr.prefix}\" = \"#{old_namespace}\" and "+ - "prefix \"#{value.prefix}\" = \"#{new_namespace}\"") - end - end store value.name, {old_attr.prefix => old_attr, value.prefix => value} else diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb index 9ed032d3..d11c2766 100644 --- a/lib/rexml/parsers/baseparser.rb +++ b/lib/rexml/parsers/baseparser.rb @@ -754,6 +754,7 @@ def process_instruction def parse_attributes(prefixes) attributes = {} + expanded_names = {} closed = false while true if @source.match(">", true) @@ -805,6 +806,20 @@ def parse_attributes(prefixes) raise REXML::ParseException.new(msg, @source, self) end + unless prefix == "xmlns" + uri = @namespaces[prefix] + expanded_name = [uri, local_part] + existing_prefix = expanded_names[expanded_name] + if existing_prefix + message = "Namespace conflict in adding attribute " + + "\"#{local_part}\": " + + "Prefix \"#{existing_prefix}\" = \"#{uri}\" and " + + "prefix \"#{prefix}\" = \"#{uri}\"" + raise REXML::ParseException.new(message, @source, self) + end + expanded_names[expanded_name] = prefix + end + attributes[name] = value else message = "Invalid attribute name: <#{@source.buffer.split(%r{[/>\s]}).first}>" diff --git a/test/parse/test_element.rb b/test/parse/test_element.rb index 2b0746ea..ab4818da 100644 --- a/test/parse/test_element.rb +++ b/test/parse/test_element.rb @@ -131,5 +131,19 @@ def test_linear_performance_attribute_value_gt REXML::Document.new('" * n + '">') end end + + def test_linear_performance_deep_same_name_attributes + seq = [100, 500, 1000, 1500, 2000] + assert_linear_performance(seq, rehearsal: 10) do |n| + xml = <<-XML + + +#{"\n" * n} +#{"\n" * n} + + XML + REXML::Document.new(xml) + end + end end end diff --git a/test/test_core.rb b/test/test_core.rb index b079c203..48666c86 100644 --- a/test/test_core.rb +++ b/test/test_core.rb @@ -136,6 +136,10 @@ def test_attribute_namespace_conflict # https://www.w3.org/TR/xml-names/#uniqAttrs message = <<-MESSAGE.chomp Namespace conflict in adding attribute "a": Prefix "n1" = "http://www.w3.org" and prefix "n2" = "http://www.w3.org" +Line: 4 +Position: 140 +Last 80 unconsumed characters: +/> MESSAGE assert_raise(REXML::ParseException.new(message)) do Document.new(<<-XML) From 95871f399eda642a022b03550479b7994895c742 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Thu, 22 Aug 2024 09:54:49 +0900 Subject: [PATCH 15/91] Add 3.3.6 entry --- NEWS.md | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/NEWS.md b/NEWS.md index 165b1c76..6c290678 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,44 @@ # News +## 3.3.6 - 2024-08-22 {#version-3-3-6} + +### Improvements + + * Removed duplicated entity expansions for performance. + * GH-194 + * Patch by Viktor Ivarsson. + + * Improved namespace conflicted attribute check performance. It was + too slow for deep elements. + * Reported by l33thaxor. + +### Fixes + + * Fixed a bug that default entity expansions are counted for + security check. Default entity expansions should not be counted + because they don't have a security risk. + * GH-198 + * GH-199 + * Patch Viktor Ivarsson + + * Fixed a parser bug that parameter entity references in internal + subsets are expanded. It's not allowed in the XML specification. + * GH-191 + * Patch by NAITOH Jun. + + * Fixed a stream parser bug that user-defined entity references in + text aren't expanded. + * GH-200 + * Patch by NAITOH Jun. + +### Thanks + + * Viktor Ivarsson + + * NAITOH Jun + + * l33thaxor + ## 3.3.5 - 2024-08-12 {#version-3-3-5} ### Fixes From 1c694d1e7f72d31fd11dcd13a0d7918384e320c9 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Thu, 22 Aug 2024 10:06:03 +0900 Subject: [PATCH 16/91] Bump version --- lib/rexml/rexml.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/rexml/rexml.rb b/lib/rexml/rexml.rb index 99d574b3..37331199 100644 --- a/lib/rexml/rexml.rb +++ b/lib/rexml/rexml.rb @@ -31,7 +31,7 @@ module REXML COPYRIGHT = "Copyright © 2001-2008 Sean Russell " DATE = "2008/019" - VERSION = "3.3.6" + VERSION = "3.3.7" REVISION = "" Copyright = COPYRIGHT From caec1879433e86914755245116d4acb416864e0d Mon Sep 17 00:00:00 2001 From: NAITOH Jun Date: Mon, 26 Aug 2024 18:49:02 +0900 Subject: [PATCH 17/91] Add local entity expansion limit methods (#202) GitHub: fix GH-192 Add local entity expansion limit methods. - `REXML::Document#entity_expansion_limit=` - `REXML::Document#entity_expansion_text_limit=` - `REXML::Parsers::SAX2Parser#entity_expansion_limit=` - `REXML::Parsers::SAX2Parser#entity_expansion_text_limit=` - `REXML::Parsers::StreamParser#entity_expansion_limit=` - `REXML::Parsers::StreamParser#entity_expansion_text_limit=` - `REXML::Parsers::PullParser#entity_expansion_limit=` - `REXML::Parsers::PullParser#entity_expansion_text_limit=` --------- Co-authored-by: Sutou Kouhei --- lib/rexml/attribute.rb | 5 +++-- lib/rexml/document.rb | 6 ++++- lib/rexml/entity.rb | 7 ++++-- lib/rexml/parsers/baseparser.rb | 8 +++++-- lib/rexml/parsers/pullparser.rb | 8 +++++++ lib/rexml/parsers/sax2parser.rb | 8 +++++++ lib/rexml/parsers/streamparser.rb | 8 +++++++ lib/rexml/text.rb | 8 ++++--- test/test_document.rb | 22 +++++------------- test/test_pullparser.rb | 27 +++++++--------------- test/test_sax.rb | 27 +++++++--------------- test/test_stream.rb | 37 ++++++++++++------------------- 12 files changed, 83 insertions(+), 88 deletions(-) diff --git a/lib/rexml/attribute.rb b/lib/rexml/attribute.rb index 11893a95..fe48745c 100644 --- a/lib/rexml/attribute.rb +++ b/lib/rexml/attribute.rb @@ -148,8 +148,9 @@ def to_s # have been expanded to their values def value return @unnormalized if @unnormalized - @unnormalized = Text::unnormalize( @normalized, doctype ) - @unnormalized + + @unnormalized = Text::unnormalize(@normalized, doctype, + entity_expansion_text_limit: @element&.document&.entity_expansion_text_limit) end # The normalized value of this attribute. That is, the attribute with diff --git a/lib/rexml/document.rb b/lib/rexml/document.rb index b1caa020..d1747dd4 100644 --- a/lib/rexml/document.rb +++ b/lib/rexml/document.rb @@ -91,6 +91,8 @@ class Document < Element # def initialize( source = nil, context = {} ) @entity_expansion_count = 0 + @entity_expansion_limit = Security.entity_expansion_limit + @entity_expansion_text_limit = Security.entity_expansion_text_limit super() @context = context return if source.nil? @@ -431,10 +433,12 @@ def Document::entity_expansion_text_limit end attr_reader :entity_expansion_count + attr_writer :entity_expansion_limit + attr_accessor :entity_expansion_text_limit def record_entity_expansion @entity_expansion_count += 1 - if @entity_expansion_count > Security.entity_expansion_limit + if @entity_expansion_count > @entity_expansion_limit raise "number of entity expansions exceeded, processing aborted." end end diff --git a/lib/rexml/entity.rb b/lib/rexml/entity.rb index 12bbad3f..1ba5a7bb 100644 --- a/lib/rexml/entity.rb +++ b/lib/rexml/entity.rb @@ -71,9 +71,12 @@ def Entity::matches? string # Evaluates to the unnormalized value of this entity; that is, replacing # &ent; entities. def unnormalized - document.record_entity_expansion unless document.nil? + document&.record_entity_expansion + return nil if @value.nil? - @unnormalized = Text::unnormalize(@value, parent) + + @unnormalized = Text::unnormalize(@value, parent, + entity_expansion_text_limit: document&.entity_expansion_text_limit) end #once :unnormalized diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb index d11c2766..89a9d0b6 100644 --- a/lib/rexml/parsers/baseparser.rb +++ b/lib/rexml/parsers/baseparser.rb @@ -164,6 +164,8 @@ def initialize( source ) @listeners = [] @prefixes = Set.new @entity_expansion_count = 0 + @entity_expansion_limit = Security.entity_expansion_limit + @entity_expansion_text_limit = Security.entity_expansion_text_limit end def add_listener( listener ) @@ -172,6 +174,8 @@ def add_listener( listener ) attr_reader :source attr_reader :entity_expansion_count + attr_writer :entity_expansion_limit + attr_writer :entity_expansion_text_limit def stream=( source ) @source = SourceFactory.create_from( source ) @@ -585,7 +589,7 @@ def unnormalize( string, entities=nil, filter=nil ) end re = Private::DEFAULT_ENTITIES_PATTERNS[entity_reference] || /&#{entity_reference};/ rv.gsub!( re, entity_value ) - if rv.bytesize > Security.entity_expansion_text_limit + if rv.bytesize > @entity_expansion_text_limit raise "entity expansion has grown too large" end else @@ -627,7 +631,7 @@ def pop_namespaces_restore def record_entity_expansion(delta=1) @entity_expansion_count += delta - if @entity_expansion_count > Security.entity_expansion_limit + if @entity_expansion_count > @entity_expansion_limit raise "number of entity expansions exceeded, processing aborted." end end diff --git a/lib/rexml/parsers/pullparser.rb b/lib/rexml/parsers/pullparser.rb index 36b45953..a331eff5 100644 --- a/lib/rexml/parsers/pullparser.rb +++ b/lib/rexml/parsers/pullparser.rb @@ -51,6 +51,14 @@ def entity_expansion_count @parser.entity_expansion_count end + def entity_expansion_limit=( limit ) + @parser.entity_expansion_limit = limit + end + + def entity_expansion_text_limit=( limit ) + @parser.entity_expansion_text_limit = limit + end + def each while has_next? yield self.pull diff --git a/lib/rexml/parsers/sax2parser.rb b/lib/rexml/parsers/sax2parser.rb index cec9d2fc..5452d4b8 100644 --- a/lib/rexml/parsers/sax2parser.rb +++ b/lib/rexml/parsers/sax2parser.rb @@ -26,6 +26,14 @@ def entity_expansion_count @parser.entity_expansion_count end + def entity_expansion_limit=( limit ) + @parser.entity_expansion_limit = limit + end + + def entity_expansion_text_limit=( limit ) + @parser.entity_expansion_text_limit = limit + end + def add_listener( listener ) @parser.add_listener( listener ) end diff --git a/lib/rexml/parsers/streamparser.rb b/lib/rexml/parsers/streamparser.rb index 7781fe44..6c64d978 100644 --- a/lib/rexml/parsers/streamparser.rb +++ b/lib/rexml/parsers/streamparser.rb @@ -18,6 +18,14 @@ def entity_expansion_count @parser.entity_expansion_count end + def entity_expansion_limit=( limit ) + @parser.entity_expansion_limit = limit + end + + def entity_expansion_text_limit=( limit ) + @parser.entity_expansion_text_limit = limit + end + def parse # entity string while true diff --git a/lib/rexml/text.rb b/lib/rexml/text.rb index 7e0befe9..997f77d3 100644 --- a/lib/rexml/text.rb +++ b/lib/rexml/text.rb @@ -268,7 +268,8 @@ def inspect # u = Text.new( "sean russell", false, nil, true ) # u.value #-> "sean russell" def value - @unnormalized ||= Text::unnormalize( @string, doctype ) + @unnormalized ||= Text::unnormalize(@string, doctype, + entity_expansion_text_limit: document&.entity_expansion_text_limit) end # Sets the contents of this text node. This expects the text to be @@ -411,11 +412,12 @@ def Text::normalize( input, doctype=nil, entity_filter=nil ) end # Unescapes all possible entities - def Text::unnormalize( string, doctype=nil, filter=nil, illegal=nil ) + def Text::unnormalize( string, doctype=nil, filter=nil, illegal=nil, entity_expansion_text_limit: nil ) + entity_expansion_text_limit ||= Security.entity_expansion_text_limit sum = 0 string.gsub( /\r\n?/, "\n" ).gsub( REFERENCE ) { s = Text.expand($&, doctype, filter) - if sum + s.bytesize > Security.entity_expansion_text_limit + if sum + s.bytesize > entity_expansion_text_limit raise "entity expansion has grown too large" else sum += s.bytesize diff --git a/test/test_document.rb b/test/test_document.rb index 25a8828f..cda4354f 100644 --- a/test/test_document.rb +++ b/test/test_document.rb @@ -31,16 +31,6 @@ def test_new end class EntityExpansionLimitTest < Test::Unit::TestCase - def setup - @default_entity_expansion_limit = REXML::Security.entity_expansion_limit - @default_entity_expansion_text_limit = REXML::Security.entity_expansion_text_limit - end - - def teardown - REXML::Security.entity_expansion_limit = @default_entity_expansion_limit - REXML::Security.entity_expansion_text_limit = @default_entity_expansion_text_limit - end - class GeneralEntityTest < self def test_have_value xml = < XML - REXML::Security.entity_expansion_limit = 4 doc = REXML::Document.new(xml) + doc.entity_expansion_limit = 4 assert_equal("\na\na a\n<\n", doc.root.children.first.value) - REXML::Security.entity_expansion_limit = 3 doc = REXML::Document.new(xml) + doc.entity_expansion_limit = 3 assert_raise(RuntimeError.new("number of entity expansions exceeded, processing aborted.")) do doc.root.children.first.value end @@ -142,8 +130,8 @@ def test_entity_expansion_text_limit &a; XML - REXML::Security.entity_expansion_text_limit = 90 doc = REXML::Document.new(xml) + doc.entity_expansion_text_limit = 90 assert_equal(90, doc.root.children.first.value.bytesize) end end diff --git a/test/test_pullparser.rb b/test/test_pullparser.rb index 005a106a..bdf8be17 100644 --- a/test/test_pullparser.rb +++ b/test/test_pullparser.rb @@ -157,16 +157,6 @@ def test_peek end class EntityExpansionLimitTest < Test::Unit::TestCase - def setup - @default_entity_expansion_limit = REXML::Security.entity_expansion_limit - @default_entity_expansion_text_limit = REXML::Security.entity_expansion_text_limit - end - - def teardown - REXML::Security.entity_expansion_limit = @default_entity_expansion_limit - REXML::Security.entity_expansion_text_limit = @default_entity_expansion_text_limit - end - class GeneralEntityTest < self def test_have_value source = <<-XML @@ -206,14 +196,13 @@ def test_empty_value XML - REXML::Security.entity_expansion_limit = 100000 parser = REXML::Parsers::PullParser.new(source) + parser.entity_expansion_limit = 100000 while parser.has_next? parser.pull end assert_equal(11111, parser.entity_expansion_count) - REXML::Security.entity_expansion_limit = @default_entity_expansion_limit parser = REXML::Parsers::PullParser.new(source) assert_raise(RuntimeError.new("number of entity expansions exceeded, processing aborted.")) do while parser.has_next? @@ -221,7 +210,7 @@ def test_empty_value end end assert do - parser.entity_expansion_count > @default_entity_expansion_limit + parser.entity_expansion_count > REXML::Security.entity_expansion_limit end end @@ -239,14 +228,14 @@ def test_with_default_entity XML - REXML::Security.entity_expansion_limit = 4 parser = REXML::Parsers::PullParser.new(source) + parser.entity_expansion_limit = 4 while parser.has_next? parser.pull end - REXML::Security.entity_expansion_limit = 3 parser = REXML::Parsers::PullParser.new(source) + parser.entity_expansion_limit = 3 assert_raise(RuntimeError.new("number of entity expansions exceeded, processing aborted.")) do while parser.has_next? parser.pull @@ -255,7 +244,7 @@ def test_with_default_entity end def test_with_only_default_entities - member_value = "<p>#{'A' * @default_entity_expansion_text_limit}</p>" + member_value = "<p>#{'A' * REXML::Security.entity_expansion_text_limit}</p>" source = <<-XML @@ -276,11 +265,11 @@ def test_with_only_default_entities end end - expected_value = "

#{'A' * @default_entity_expansion_text_limit}

" + expected_value = "

#{'A' * REXML::Security.entity_expansion_text_limit}

" assert_equal(expected_value, events['member'].strip) assert_equal(0, parser.entity_expansion_count) assert do - events['member'].bytesize > @default_entity_expansion_text_limit + events['member'].bytesize > REXML::Security.entity_expansion_text_limit end end @@ -296,8 +285,8 @@ def test_entity_expansion_text_limit &a; XML - REXML::Security.entity_expansion_text_limit = 90 parser = REXML::Parsers::PullParser.new(source) + parser.entity_expansion_text_limit = 90 events = {} element_name = '' while parser.has_next? diff --git a/test/test_sax.rb b/test/test_sax.rb index ae17e364..6aaeb618 100644 --- a/test/test_sax.rb +++ b/test/test_sax.rb @@ -100,16 +100,6 @@ def test_sax2 end class EntityExpansionLimitTest < Test::Unit::TestCase - def setup - @default_entity_expansion_limit = REXML::Security.entity_expansion_limit - @default_entity_expansion_text_limit = REXML::Security.entity_expansion_text_limit - end - - def teardown - REXML::Security.entity_expansion_limit = @default_entity_expansion_limit - REXML::Security.entity_expansion_text_limit = @default_entity_expansion_text_limit - end - class GeneralEntityTest < self def test_have_value source = <<-XML @@ -147,18 +137,17 @@ def test_empty_value
XML - REXML::Security.entity_expansion_limit = 100000 sax = REXML::Parsers::SAX2Parser.new(source) + sax.entity_expansion_limit = 100000 sax.parse assert_equal(11111, sax.entity_expansion_count) - REXML::Security.entity_expansion_limit = @default_entity_expansion_limit sax = REXML::Parsers::SAX2Parser.new(source) assert_raise(RuntimeError.new("number of entity expansions exceeded, processing aborted.")) do sax.parse end assert do - sax.entity_expansion_count > @default_entity_expansion_limit + sax.entity_expansion_count > REXML::Security.entity_expansion_limit end end @@ -176,19 +165,19 @@ def test_with_default_entity XML - REXML::Security.entity_expansion_limit = 4 sax = REXML::Parsers::SAX2Parser.new(source) + sax.entity_expansion_limit = 4 sax.parse - REXML::Security.entity_expansion_limit = 3 sax = REXML::Parsers::SAX2Parser.new(source) + sax.entity_expansion_limit = 3 assert_raise(RuntimeError.new("number of entity expansions exceeded, processing aborted.")) do sax.parse end end def test_with_only_default_entities - member_value = "<p>#{'A' * @default_entity_expansion_text_limit}</p>" + member_value = "<p>#{'A' * REXML::Security.entity_expansion_text_limit}</p>" source = <<-XML @@ -203,11 +192,11 @@ def test_with_only_default_entities end sax.parse - expected_value = "

#{'A' * @default_entity_expansion_text_limit}

" + expected_value = "

#{'A' * REXML::Security.entity_expansion_text_limit}

" assert_equal(expected_value, text_value.strip) assert_equal(0, sax.entity_expansion_count) assert do - text_value.bytesize > @default_entity_expansion_text_limit + text_value.bytesize > REXML::Security.entity_expansion_text_limit end end @@ -223,8 +212,8 @@ def test_entity_expansion_text_limit &a; XML - REXML::Security.entity_expansion_text_limit = 90 sax = REXML::Parsers::SAX2Parser.new(source) + sax.entity_expansion_text_limit = 90 text_size = nil sax.listen(:characters, ["member"]) do |text| text_size = text.size diff --git a/test/test_stream.rb b/test/test_stream.rb index 782066c2..7917760a 100644 --- a/test/test_stream.rb +++ b/test/test_stream.rb @@ -126,16 +126,6 @@ def text(text) end class EntityExpansionLimitTest < Test::Unit::TestCase - def setup - @default_entity_expansion_limit = REXML::Security.entity_expansion_limit - @default_entity_expansion_text_limit = REXML::Security.entity_expansion_text_limit - end - - def teardown - REXML::Security.entity_expansion_limit = @default_entity_expansion_limit - REXML::Security.entity_expansion_text_limit = @default_entity_expansion_text_limit - end - def test_have_value source = <<-XML @@ -172,18 +162,17 @@ def test_empty_value XML listener = MyListener.new - REXML::Security.entity_expansion_limit = 100000 parser = REXML::Parsers::StreamParser.new( source, listener ) + parser.entity_expansion_limit = 100000 parser.parse assert_equal(11111, parser.entity_expansion_count) - REXML::Security.entity_expansion_limit = @default_entity_expansion_limit parser = REXML::Parsers::StreamParser.new( source, listener ) assert_raise(RuntimeError.new("number of entity expansions exceeded, processing aborted.")) do parser.parse end assert do - parser.entity_expansion_count > @default_entity_expansion_limit + parser.entity_expansion_count > REXML::Security.entity_expansion_limit end end @@ -202,17 +191,19 @@ def test_with_default_entity XML listener = MyListener.new - REXML::Security.entity_expansion_limit = 4 - REXML::Document.parse_stream(source, listener) + parser = REXML::Parsers::StreamParser.new( source, listener ) + parser.entity_expansion_limit = 4 + parser.parse - REXML::Security.entity_expansion_limit = 3 + parser = REXML::Parsers::StreamParser.new( source, listener ) + parser.entity_expansion_limit = 3 assert_raise(RuntimeError.new("number of entity expansions exceeded, processing aborted.")) do - REXML::Document.parse_stream(source, listener) + parser.parse end end def test_with_only_default_entities - member_value = "<p>#{'A' * @default_entity_expansion_text_limit}</p>" + member_value = "<p>#{'A' * REXML::Security.entity_expansion_text_limit}</p>" source = <<-XML @@ -231,11 +222,11 @@ def text(text) parser = REXML::Parsers::StreamParser.new( source, listener ) parser.parse - expected_value = "

#{'A' * @default_entity_expansion_text_limit}

" + expected_value = "

#{'A' * REXML::Security.entity_expansion_text_limit}

" assert_equal(expected_value, listener.text_value.strip) assert_equal(0, parser.entity_expansion_count) assert do - listener.text_value.bytesize > @default_entity_expansion_text_limit + listener.text_value.bytesize > REXML::Security.entity_expansion_text_limit end end @@ -259,9 +250,9 @@ def text(text) end end listener.text_value = "" - REXML::Security.entity_expansion_text_limit = 90 - REXML::Document.parse_stream(source, listener) - + parser = REXML::Parsers::StreamParser.new( source, listener ) + parser.entity_expansion_text_limit = 90 + parser.parse assert_equal(90, listener.text_value.size) end end From ad02f99c616385bca1b84e161b93a144a99f71bf Mon Sep 17 00:00:00 2001 From: Bo Anderson Date: Wed, 4 Sep 2024 04:03:39 +0100 Subject: [PATCH 18/91] Remove strscan dependency declaration from gemspec (#204) `strscan` is a part of the Ruby standard library in all versions of Ruby supported by REXML. So we don't need to declare it as a dependency explicitly. See also: https://github.com/ruby/rexml/issues/140#issuecomment-2327645303 --- rexml.gemspec | 2 -- 1 file changed, 2 deletions(-) diff --git a/rexml.gemspec b/rexml.gemspec index 0de3e845..e5cf8581 100644 --- a/rexml.gemspec +++ b/rexml.gemspec @@ -58,6 +58,4 @@ Gem::Specification.new do |spec| spec.extra_rdoc_files = rdoc_files spec.required_ruby_version = '>= 2.5.0' - - spec.add_runtime_dependency("strscan") end From 6246ba112140372ee3e40cb3bfb1fabef65130e6 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Wed, 4 Sep 2024 13:54:24 +0900 Subject: [PATCH 19/91] ci document: fix method forwarding with recent Ruby ArgumentError: wrong number of arguments (given 2, expected 1) (ArgumentError) /home/runner/work/rexml/rexml/Rakefile:18:in `warn' /home/runner/work/rexml/rexml/Rakefile:18:in `warn' /opt/hostedtoolcache/Ruby/3.3.5/x64/bin/bundle:25:in `load' /opt/hostedtoolcache/Ruby/3.3.5/x64/bin/bundle:25:in `
' --- Rakefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Rakefile b/Rakefile index 76a56296..4676930b 100644 --- a/Rakefile +++ b/Rakefile @@ -14,7 +14,7 @@ task :default => :test namespace :warning do desc "Treat warning as error" task :error do - def Warning.warn(*message) + def Warning.warn(*message, **) super raise "Treat warning as error:\n" + message.join("\n") end From 9294410f6eb90834a69a3fa363de61f5a3f6a927 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Wed, 4 Sep 2024 14:02:34 +0900 Subject: [PATCH 20/91] ci document: suppress a ostruct warning --- Gemfile | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/Gemfile b/Gemfile index 67f21dfb..1710ec99 100644 --- a/Gemfile +++ b/Gemfile @@ -7,6 +7,14 @@ gemspec group :development do gem "bundler" + # This is for suppressing the following warning: + # + # warning: ostruct was loaded from the standard library, but will + # no longer be part of the default gems starting from Ruby 3.5.0. + # + # This should be part of "json". We can remove this when "json" + # depends on "ostruct" explicitly. + gem "ostruct" gem "rake" end From 86a11c05f53dbb3dfbe504a365f1412f2e691c25 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Wed, 4 Sep 2024 14:13:15 +0900 Subject: [PATCH 21/91] Add 3.3.7 entry --- NEWS.md | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/NEWS.md b/NEWS.md index 6c290678..844eeb94 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,27 @@ # News +## 3.3.7 - 2024-09-04 {#version-3-3-7} + +### Improvements + + * Added local entity expansion limit methods + * GH-192 + * GH-202 + * Reported by takuya kodama. + * Patch by NAITOH Jun. + + * Removed explicit strscan dependency + * GH-204 + * Patch by Bo Anderson. + +### Thanks + + * takuya kodama + + * NAITOH Jun + + * Bo Anderson + ## 3.3.6 - 2024-08-22 {#version-3-3-6} ### Improvements From 35ee73e0cd125633cfcb53996c0bcb7897e97cd2 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Wed, 4 Sep 2024 14:13:49 +0900 Subject: [PATCH 22/91] Bump version --- lib/rexml/rexml.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/rexml/rexml.rb b/lib/rexml/rexml.rb index 37331199..f27b4261 100644 --- a/lib/rexml/rexml.rb +++ b/lib/rexml/rexml.rb @@ -31,7 +31,7 @@ module REXML COPYRIGHT = "Copyright © 2001-2008 Sean Russell " DATE = "2008/019" - VERSION = "3.3.7" + VERSION = "3.3.8" REVISION = "" Copyright = COPYRIGHT From 2e1cd64f2f9c0667a840a0e31f9bb99f9e1c2b33 Mon Sep 17 00:00:00 2001 From: NAITOH Jun Date: Wed, 25 Sep 2024 06:29:02 +0900 Subject: [PATCH 23/91] Optimize SAX2Parser#get_namespace (#207) ``` RUBYLIB= BUNDLER_ORIG_RUBYLIB= /Users/naitoh/.rbenv/versions/3.3.4/bin/ruby -v -S benchmark-driver /Users/naitoh/ghq/github.com/naitoh/rexml/benchmark/parse.yaml ruby 3.3.4 (2024-07-09 revision be1089c8ec) [arm64-darwin22] Calculating ------------------------------------- before after before(YJIT) after(YJIT) dom 18.085 17.677 33.086 32.778 i/s - 100.000 times in 5.529372s 5.657097s 3.022471s 3.050832s sax 25.450 26.182 44.797 47.916 i/s - 100.000 times in 3.929249s 3.819475s 2.232309s 2.086982s pull 29.160 29.089 55.407 53.531 i/s - 100.000 times in 3.429304s 3.437757s 1.804825s 1.868072s stream 29.137 29.055 52.780 51.368 i/s - 100.000 times in 3.432007s 3.441754s 1.894649s 1.946724s Comparison: dom before(YJIT): 33.1 i/s after(YJIT): 32.8 i/s - 1.01x slower before: 18.1 i/s - 1.83x slower after: 17.7 i/s - 1.87x slower sax after(YJIT): 47.9 i/s before(YJIT): 44.8 i/s - 1.07x slower after: 26.2 i/s - 1.83x slower before: 25.5 i/s - 1.88x slower pull before(YJIT): 55.4 i/s after(YJIT): 53.5 i/s - 1.04x slower before: 29.2 i/s - 1.90x slower after: 29.1 i/s - 1.90x slower stream before(YJIT): 52.8 i/s after(YJIT): 51.4 i/s - 1.03x slower before: 29.1 i/s - 1.81x slower after: 29.1 i/s - 1.82x slower ``` - sax - YJIT=ON : 1.07x faster - YJIT=OFF : 1.03x faster --- lib/rexml/parsers/sax2parser.rb | 2 ++ test/test_sax.rb | 46 +++++++++++++++++++++++++++++++++ 2 files changed, 48 insertions(+) diff --git a/lib/rexml/parsers/sax2parser.rb b/lib/rexml/parsers/sax2parser.rb index 5452d4b8..a51477de 100644 --- a/lib/rexml/parsers/sax2parser.rb +++ b/lib/rexml/parsers/sax2parser.rb @@ -259,6 +259,8 @@ def add( pair ) end def get_namespace( prefix ) + return nil if @namespace_stack.empty? + uris = (@namespace_stack.find_all { |ns| not ns[prefix].nil? }) || (@namespace_stack.find { |ns| not ns[nil].nil? }) uris[-1][prefix] unless uris.nil? or 0 == uris.size diff --git a/test/test_sax.rb b/test/test_sax.rb index 6aaeb618..caec983b 100644 --- a/test/test_sax.rb +++ b/test/test_sax.rb @@ -99,6 +99,52 @@ def test_sax2 end end + def test_without_namespace + xml = <<-XML + + + + + + XML + + parser = REXML::Parsers::SAX2Parser.new(xml) + elements = [] + parser.listen(:start_element) do |uri, localname, qname, attrs| + elements << [uri, localname, qname, attrs] + end + parser.parse + assert_equal([ + [nil, "root", "root", {}], + [nil, "a", "a", {"att1"=>"1", "att2"=>"2", "att3"=>"<"}], + [nil, "b", "b", {}] + ], elements) + end + + def test_with_namespace + xml = <<-XML + + + + + + XML + + parser = REXML::Parsers::SAX2Parser.new(xml) + elements = [] + parser.listen(:start_element) do |uri, localname, qname, attrs| + elements << [uri, localname, qname, attrs] + end + parser.parse + assert_equal([ + ["http://example.org/default", "root", "root", {"xmlns"=>"http://example.org/default", "xmlns:bar"=>"http://example.org/bar", "xmlns:foo"=>"http://example.org/foo"}], + ["http://example.org/default", "a", "a", {"att"=>"<", "bar:att"=>"2", "foo:att"=>"1"}], + ["http://example.org/bar", "b", "bar:b", {}] + ], elements) + end + class EntityExpansionLimitTest < Test::Unit::TestCase class GeneralEntityTest < self def test_have_value From 78f8712dccad773a51dc5eef31c02d523e994570 Mon Sep 17 00:00:00 2001 From: KITAITI Makoto Date: Sun, 29 Sep 2024 15:57:03 +0900 Subject: [PATCH 24/91] Fix handling with "xml:" prefixed namespace (#208) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit I found parsing XHTML documents like below fails since v3.3.3: ```xml XHTML Document

XHTML Document

この段落は日本語です。

``` [XML namespace spec][spec] is a little bit ambiguous but document above is valid according to an [article W3C serves][article]. I fixed the parsing algorithm. Can you review it? As an aside, `` style language declaration is often used in XHTML files included in EPUB files because [sample EPUB files][samples] provided by IDPF, former EPUB spec authority, use the style. [spec]: https://www.w3.org/TR/REC-xml-names/#defaulting [article]: https://www.w3.org/International/questions/qa-html-language-declarations#attributes [samples]: https://github.com/IDPF/epub3-samples --- lib/rexml/parsers/baseparser.rb | 5 +++-- test/parser/test_base_parser.rb | 35 +++++++++++++++++++++++++++++++++ 2 files changed, 38 insertions(+), 2 deletions(-) diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb index 89a9d0b6..a567e045 100644 --- a/lib/rexml/parsers/baseparser.rb +++ b/lib/rexml/parsers/baseparser.rb @@ -156,6 +156,7 @@ module Private default_entities.each do |term| DEFAULT_ENTITIES_PATTERNS[term] = /&#{term};/ end + XML_PREFIXED_NAMESPACE = "http://www.w3.org/XML/1998/namespace" end private_constant :Private @@ -185,7 +186,7 @@ def stream=( source ) @tags = [] @stack = [] @entities = [] - @namespaces = {} + @namespaces = {"xml" => Private::XML_PREFIXED_NAMESPACE} @namespaces_restore_stack = [] end @@ -790,7 +791,7 @@ def parse_attributes(prefixes) @source.match(/\s*/um, true) if prefix == "xmlns" if local_part == "xml" - if value != "http://www.w3.org/XML/1998/namespace" + if value != Private::XML_PREFIXED_NAMESPACE msg = "The 'xml' prefix must not be bound to any other namespace "+ "(http://www.w3.org/TR/REC-xml-names/#ns-decl)" raise REXML::ParseException.new( msg, @source, self ) diff --git a/test/parser/test_base_parser.rb b/test/parser/test_base_parser.rb index 17d01979..da169a25 100644 --- a/test/parser/test_base_parser.rb +++ b/test/parser/test_base_parser.rb @@ -23,5 +23,40 @@ def test_large_xml parser.position < xml.bytesize end end + + def test_attribute_prefixed_by_xml + xml = <<-XML + + + + + XHTML Document + + +

XHTML Document

+

この段落は日本語です。

+ + + XML + + parser = REXML::Parsers::BaseParser.new(xml) + 5.times {parser.pull} + + html = parser.pull + assert_equal([:start_element, + "html", + {"xmlns" => "http://www.w3.org/1999/xhtml", + "xml:lang" => "en", + "lang" => "en"}], + html) + + 15.times {parser.pull} + + p = parser.pull + assert_equal([:start_element, + "p", + {"xml:lang" => "ja", "lang" => "ja"}], + p) + end end end From 4197054a19e65511fb51983518a134a5c65aa840 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Sun, 29 Sep 2024 16:03:58 +0900 Subject: [PATCH 25/91] Add 3.3.8 entry --- NEWS.md | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/NEWS.md b/NEWS.md index 844eeb94..5f0f2e01 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,26 @@ # News +## 3.3.8 - 2024-09-29 {#version-3-3-8} + +### Improvements + + * SAX2: Improve parse performance. + * GH-207 + * Patch by NAITOH Jun. + +### Fixes + + * Fixed a bug that unexpected attribute namespace conflict error for + the predefined "xml" namespace is reported. + * GH-208 + * Patch by KITAITI Makoto + +### Thanks + + * NAITOH Jun + + * KITAITI Makoto + ## 3.3.7 - 2024-09-04 {#version-3-3-7} ### Improvements From 036d50851ce091c797db0b9ba3ed8e5a39c3918c Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Sun, 29 Sep 2024 16:04:43 +0900 Subject: [PATCH 26/91] test: avoid using needless non ASCII characters --- test/parser/test_base_parser.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/parser/test_base_parser.rb b/test/parser/test_base_parser.rb index da169a25..6f213978 100644 --- a/test/parser/test_base_parser.rb +++ b/test/parser/test_base_parser.rb @@ -34,7 +34,7 @@ def test_attribute_prefixed_by_xml

XHTML Document

-

この段落は日本語です。

+

For Japanese

XML From 622011f25ac1519fd553d6c56da52d7eba14a787 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Sun, 29 Sep 2024 16:05:32 +0900 Subject: [PATCH 27/91] Bump version --- lib/rexml/rexml.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/rexml/rexml.rb b/lib/rexml/rexml.rb index f27b4261..0fbd5eb2 100644 --- a/lib/rexml/rexml.rb +++ b/lib/rexml/rexml.rb @@ -31,7 +31,7 @@ module REXML COPYRIGHT = "Copyright © 2001-2008 Sean Russell " DATE = "2008/019" - VERSION = "3.3.8" + VERSION = "3.3.9" REVISION = "" Copyright = COPYRIGHT From 1d0c362526f6e25e2abcd13e2fcefcc718c20e78 Mon Sep 17 00:00:00 2001 From: NAITOH Jun Date: Wed, 9 Oct 2024 10:21:44 +0900 Subject: [PATCH 28/91] Optimize `IOSource#read_until` method (#210) ## Why? The result of `encode(term)` can be cached. ## Benchmark ``` RUBYLIB= BUNDLER_ORIG_RUBYLIB= /Users/naitoh/.rbenv/versions/3.3.4/bin/ruby -v -S benchmark-driver /Users/naitoh/ghq/github.com/naitoh/rexml/benchmark/parse.yaml ruby 3.3.4 (2024-07-09 revision be1089c8ec) [arm64-darwin22] Calculating ------------------------------------- before after before(YJIT) after(YJIT) dom 17.546 18.512 32.282 32.306 i/s - 100.000 times in 5.699323s 5.402026s 3.097658s 3.095448s sax 25.435 28.294 47.526 50.074 i/s - 100.000 times in 3.931613s 3.534310s 2.104122s 1.997057s pull 29.471 31.870 54.400 57.554 i/s - 100.000 times in 3.393211s 3.137793s 1.838222s 1.737494s stream 29.169 31.153 51.613 52.898 i/s - 100.000 times in 3.428318s 3.209941s 1.937508s 1.890424s Comparison: dom after(YJIT): 32.3 i/s before(YJIT): 32.3 i/s - 1.00x slower after: 18.5 i/s - 1.75x slower before: 17.5 i/s - 1.84x slower sax after(YJIT): 50.1 i/s before(YJIT): 47.5 i/s - 1.05x slower after: 28.3 i/s - 1.77x slower before: 25.4 i/s - 1.97x slower pull after(YJIT): 57.6 i/s before(YJIT): 54.4 i/s - 1.06x slower after: 31.9 i/s - 1.81x slower before: 29.5 i/s - 1.95x slower stream after(YJIT): 52.9 i/s before(YJIT): 51.6 i/s - 1.02x slower after: 31.2 i/s - 1.70x slower before: 29.2 i/s - 1.81x slower ``` - YJIT=ON : 1.00x - 1.06x faster - YJIT=OFF : 1.05x - 1.11x faster --- lib/rexml/source.rb | 3 ++- test/test_document.rb | 34 ++++++++++++++++++++++++++++++++++ 2 files changed, 36 insertions(+), 1 deletion(-) diff --git a/lib/rexml/source.rb b/lib/rexml/source.rb index ff887fc0..e1a466e9 100644 --- a/lib/rexml/source.rb +++ b/lib/rexml/source.rb @@ -77,6 +77,7 @@ def initialize(arg, encoding=nil) detect_encoding end @line = 0 + @term_encord = {} end # The current buffer (what we're going to read next) @@ -227,7 +228,7 @@ def read(term = nil, min_bytes = 1) def read_until(term) pattern = Private::PRE_DEFINED_TERM_PATTERNS[term] || /#{Regexp.escape(term)}/ - term = encode(term) + term = @term_encord[term] ||= encode(term) until str = @scanner.scan_until(pattern) break if @source.nil? break if @source.eof? diff --git a/test/test_document.rb b/test/test_document.rb index cda4354f..609aeba2 100644 --- a/test/test_document.rb +++ b/test/test_document.rb @@ -403,6 +403,40 @@ def test_utf_16 assert_equal(expected_xml, actual_xml) end end + + class ReadUntilTest < Test::Unit::TestCase + def test_utf_8 + xml = <<-EOX.force_encoding("ASCII-8BIT") + +Hello world! +EOX + document = REXML::Document.new(xml) + assert_equal("UTF-8", document.encoding) + assert_equal(">", REXML::XPath.match(document, "/message")[0].attribute("testing").value) + end + + def test_utf_16le + xml = <<-EOX.encode("UTF-16LE").force_encoding("ASCII-8BIT") + +Hello world! +EOX + bom = "\ufeff".encode("UTF-16LE").force_encoding("ASCII-8BIT") + document = REXML::Document.new(bom + xml) + assert_equal("UTF-16", document.encoding) + assert_equal(">", REXML::XPath.match(document, "/message")[0].attribute("testing").value) + end + + def test_utf_16be + xml = <<-EOX.encode("UTF-16BE").force_encoding("ASCII-8BIT") + +Hello world! +EOX + bom = "\ufeff".encode("UTF-16BE").force_encoding("ASCII-8BIT") + document = REXML::Document.new(bom + xml) + assert_equal("UTF-16", document.encoding) + assert_equal(">", REXML::XPath.match(document, "/message")[0].attribute("testing").value) + end + end end end end From cf0fb9c9ca3dc0d725c8e4644aa0e728025f42ce Mon Sep 17 00:00:00 2001 From: NAITOH Jun Date: Sat, 19 Oct 2024 15:27:25 +0900 Subject: [PATCH 29/91] Fix `IOSource#readline` for `@pending_buffer` (#215) ## Why? Fixed a problem that `@pending_buffer` is not processed when `IOError` occurs in `@source.readline` although `@pending_buffer` exists when reading XML file. --- lib/rexml/parsers/baseparser.rb | 1 + lib/rexml/source.rb | 7 ++++++- test/parse/test_text.rb | 17 +++++++++++++++++ 3 files changed, 24 insertions(+), 1 deletion(-) diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb index a567e045..7bd8adf8 100644 --- a/lib/rexml/parsers/baseparser.rb +++ b/lib/rexml/parsers/baseparser.rb @@ -167,6 +167,7 @@ def initialize( source ) @entity_expansion_count = 0 @entity_expansion_limit = Security.entity_expansion_limit @entity_expansion_text_limit = Security.entity_expansion_text_limit + @source.ensure_buffer end def add_listener( listener ) diff --git a/lib/rexml/source.rb b/lib/rexml/source.rb index e1a466e9..dc0b5323 100644 --- a/lib/rexml/source.rb +++ b/lib/rexml/source.rb @@ -295,14 +295,19 @@ def current_line private def readline(term = nil) - str = @source.readline(term || @line_break) if @pending_buffer + begin + str = @source.readline(term || @line_break) + rescue IOError + end if str.nil? str = @pending_buffer else str = @pending_buffer + str end @pending_buffer = nil + else + str = @source.readline(term || @line_break) end return nil if str.nil? diff --git a/test/parse/test_text.rb b/test/parse/test_text.rb index 04f553ae..bb208d47 100644 --- a/test/parse/test_text.rb +++ b/test/parse/test_text.rb @@ -4,6 +4,23 @@ module REXMLTests class TestParseText < Test::Unit::TestCase class TestInvalid < self + def test_text_only + exception = assert_raise(REXML::ParseException) do + parser = REXML::Parsers::BaseParser.new('a') + while parser.has_next? + parser.pull + end + end + + assert_equal(<<~DETAIL.chomp, exception.to_s) + Malformed XML: Content at the start of the document (got 'a') + Line: 1 + Position: 1 + Last 80 unconsumed characters: + + DETAIL + end + def test_before_root exception = assert_raise(REXML::ParseException) do parser = REXML::Parsers::BaseParser.new('b') From a09646d395a07399cbf9bc3bc8d6d8bb1d13ecea Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Thu, 24 Oct 2024 14:40:13 +0900 Subject: [PATCH 30/91] test: fix indent --- test/test_document.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_document.rb b/test/test_document.rb index 609aeba2..39b6c337 100644 --- a/test/test_document.rb +++ b/test/test_document.rb @@ -405,7 +405,7 @@ def test_utf_16 end class ReadUntilTest < Test::Unit::TestCase - def test_utf_8 + def test_utf_8 xml = <<-EOX.force_encoding("ASCII-8BIT") Hello world! From ce59f2eb1aeb371fe1643414f06618dbe031979f Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Thu, 24 Oct 2024 14:45:31 +0900 Subject: [PATCH 31/91] parser: fix a bug that �x...; is accepted as a character reference --- lib/rexml/parsers/baseparser.rb | 10 +++++++--- test/parse/test_character_reference.rb | 6 ++++++ 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb index 7bd8adf8..b4547ba3 100644 --- a/lib/rexml/parsers/baseparser.rb +++ b/lib/rexml/parsers/baseparser.rb @@ -150,7 +150,7 @@ module Private PEDECL_PATTERN = "\\s+(%)\\s+#{NAME}\\s+#{PEDEF}\\s*>" ENTITYDECL_PATTERN = /(?:#{GEDECL_PATTERN})|(?:#{PEDECL_PATTERN})/um CARRIAGE_RETURN_NEWLINE_PATTERN = /\r\n?/ - CHARACTER_REFERENCES = /�*((?:\d+)|(?:x[a-fA-F0-9]+));/ + CHARACTER_REFERENCES = /&#((?:\d+)|(?:x[a-fA-F0-9]+));/ DEFAULT_ENTITIES_PATTERNS = {} default_entities = ['gt', 'lt', 'quot', 'apos', 'amp'] default_entities.each do |term| @@ -570,8 +570,12 @@ def unnormalize( string, entities=nil, filter=nil ) return rv if matches.size == 0 rv.gsub!( Private::CHARACTER_REFERENCES ) { m=$1 - m = "0#{m}" if m[0] == ?x - [Integer(m)].pack('U*') + if m.start_with?("x") + code_point = Integer(m[1..-1], 16) + else + code_point = Integer(m, 10) + end + [code_point].pack('U*') } matches.collect!{|x|x[0]}.compact! if filter diff --git a/test/parse/test_character_reference.rb b/test/parse/test_character_reference.rb index bf8d2190..4bb5da5c 100644 --- a/test/parse/test_character_reference.rb +++ b/test/parse/test_character_reference.rb @@ -13,5 +13,11 @@ def test_linear_performance_many_preceding_zeros REXML::Document.new('') end end + + def test_hex_precedding_zero + parser = REXML::Parsers::PullParser.new("a�x61;") + parser.pull # :start_element + assert_equal("a�x61;", parser.pull[1]) # :text + end end end From 38eaa86ac7abe0d31cf49d8df57ad239fdeb80e9 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Thu, 24 Oct 2024 14:47:38 +0900 Subject: [PATCH 32/91] Add 3.3.9 entry --- NEWS.md | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/NEWS.md b/NEWS.md index 5f0f2e01..3d17c287 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,26 @@ # News +## 3.3.9 - 2024-10-24 {#version-3-3-9} + +### Improvements + + * Improved performance. + * GH-210 + * Patch by NAITOH Jun. + +### Fixes + + * Fixed a parse bug for text only invalid XML. + * GH-215 + * Patch by NAITOH Jun. + + * Fixed a parse bug that `�x...;` is accepted as a character + reference. + +### Thanks + + * NAITOH Jun + ## 3.3.8 - 2024-09-29 {#version-3-3-8} ### Improvements From 7495d18b2c5cfc322cec347f9f9fcc7a28e774a8 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Thu, 24 Oct 2024 14:57:05 +0900 Subject: [PATCH 33/91] Bump version --- lib/rexml/rexml.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/rexml/rexml.rb b/lib/rexml/rexml.rb index 0fbd5eb2..42623b08 100644 --- a/lib/rexml/rexml.rb +++ b/lib/rexml/rexml.rb @@ -31,7 +31,7 @@ module REXML COPYRIGHT = "Copyright © 2001-2008 Sean Russell " DATE = "2008/019" - VERSION = "3.3.9" + VERSION = "3.4.0" REVISION = "" Copyright = COPYRIGHT From 6a8c041d825c7d16e76b056bd63b20edb92febc6 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Thu, 24 Oct 2024 15:15:39 +0900 Subject: [PATCH 34/91] test jruby: omit fragile test --- test/parse/test_element.rb | 1 + 1 file changed, 1 insertion(+) diff --git a/test/parse/test_element.rb b/test/parse/test_element.rb index ab4818da..f07a7d5a 100644 --- a/test/parse/test_element.rb +++ b/test/parse/test_element.rb @@ -133,6 +133,7 @@ def test_linear_performance_attribute_value_gt end def test_linear_performance_deep_same_name_attributes + omit("This is fragile on JRuby") if RUBY_ENGINE == "jruby" seq = [100, 500, 1000, 1500, 2000] assert_linear_performance(seq, rehearsal: 10) do |n| xml = <<-XML From 8ef75024b96d3e5279b39fdd1692821cdbcd84b5 Mon Sep 17 00:00:00 2001 From: NAITOH Jun Date: Sun, 27 Oct 2024 15:02:18 +0900 Subject: [PATCH 35/91] Add `IOSource#match?` method (#216) ## Why? `StringScanner#match?` is faster than `StringScanner#check`. See: https://github.com/ruby/strscan/pull/111 ## Benchmark ``` RUBYLIB= BUNDLER_ORIG_RUBYLIB= /Users/naitoh/.rbenv/versions/3.3.4/bin/ruby -v -S benchmark-driver /Users/naitoh/ghq/github.com/naitoh/rexml/benchmark/parse.yaml ruby 3.3.4 (2024-07-09 revision be1089c8ec) [arm64-darwin22] Calculating ------------------------------------- before after before(YJIT) after(YJIT) dom 18.819 19.362 32.846 34.708 i/s - 100.000 times in 5.313905s 5.164791s 3.044500s 2.881200s sax 28.188 29.982 48.386 52.554 i/s - 100.000 times in 3.547597s 3.335304s 2.066732s 1.902809s pull 31.962 33.902 57.868 60.662 i/s - 100.000 times in 3.128689s 2.949690s 1.728071s 1.648467s stream 31.436 33.030 52.808 56.647 i/s - 100.000 times in 3.181095s 3.027574s 1.893635s 1.765304s Comparison: dom after(YJIT): 34.7 i/s before(YJIT): 32.8 i/s - 1.06x slower after: 19.4 i/s - 1.79x slower before: 18.8 i/s - 1.84x slower sax after(YJIT): 52.6 i/s before(YJIT): 48.4 i/s - 1.09x slower after: 30.0 i/s - 1.75x slower before: 28.2 i/s - 1.86x slower pull after(YJIT): 60.7 i/s before(YJIT): 57.9 i/s - 1.05x slower after: 33.9 i/s - 1.79x slower before: 32.0 i/s - 1.90x slower stream after(YJIT): 56.6 i/s before(YJIT): 52.8 i/s - 1.07x slower after: 33.0 i/s - 1.72x slower before: 31.4 i/s - 1.80x slower ``` - YJIT=ON : 1.05x - 1.09x faster - YJIT=OFF : 1.02x - 1.06x faster --------- Co-authored-by: Sutou Kouhei --- lib/rexml/parsers/baseparser.rb | 84 ++++++++++++++++----------------- lib/rexml/source.rb | 35 ++++++++++++++ 2 files changed, 77 insertions(+), 42 deletions(-) diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb index b4547ba3..ff72ce44 100644 --- a/lib/rexml/parsers/baseparser.rb +++ b/lib/rexml/parsers/baseparser.rb @@ -269,10 +269,10 @@ def pull_event @source.ensure_buffer if @document_status == nil start_position = @source.position - if @source.match("/um, true) if md.nil? raise REXML::ParseException.new("Unclosed comment", @source) @@ -281,10 +281,10 @@ def pull_event raise REXML::ParseException.new("Malformed comment", @source) end return [ :comment, md[1] ] - elsif @source.match("DOCTYPE", true) + elsif @source.match?("DOCTYPE", true) base_error_message = "Malformed DOCTYPE" - unless @source.match(/\s+/um, true) - if @source.match(">") + unless @source.match?(/\s+/um, true) + if @source.match?(">") message = "#{base_error_message}: name is missing" else message = "#{base_error_message}: invalid name" @@ -293,10 +293,10 @@ def pull_event raise REXML::ParseException.new(message, @source) end name = parse_name(base_error_message) - if @source.match(/\s*\[/um, true) + if @source.match?(/\s*\[/um, true) id = [nil, nil, nil] @document_status = :in_doctype - elsif @source.match(/\s*>/um, true) + elsif @source.match?(/\s*>/um, true) id = [nil, nil, nil] @document_status = :after_doctype @source.ensure_buffer @@ -308,9 +308,9 @@ def pull_event # For backward compatibility id[1], id[2] = id[2], nil end - if @source.match(/\s*\[/um, true) + if @source.match?(/\s*\[/um, true) @document_status = :in_doctype - elsif @source.match(/\s*>/um, true) + elsif @source.match?(/\s*>/um, true) @document_status = :after_doctype @source.ensure_buffer else @@ -320,7 +320,7 @@ def pull_event end args = [:start_doctype, name, *id] if @document_status == :after_doctype - @source.match(/\s*/um, true) + @source.match?(/\s*/um, true) @stack << [ :end_doctype ] end return args @@ -331,14 +331,14 @@ def pull_event end end if @document_status == :in_doctype - @source.match(/\s*/um, true) # skip spaces + @source.match?(/\s*/um, true) # skip spaces start_position = @source.position - if @source.match("/um, true) raise REXML::ParseException.new( "Bad ELEMENT declaration!", @source ) if md.nil? return [ :elementdecl, "") + unless @source.match?(/\s+/um, true) + if @source.match?(">") message = "#{base_error_message}: name is missing" else message = "#{base_error_message}: invalid name" @@ -405,7 +405,7 @@ def pull_event id = parse_id(base_error_message, accept_external_id: true, accept_public_id: true) - unless @source.match(/\s*>/um, true) + unless @source.match?(/\s*>/um, true) message = "#{base_error_message}: garbage before end >" raise REXML::ParseException.new(message, @source) end @@ -419,7 +419,7 @@ def pull_event end elsif match = @source.match(/(%.*?;)\s*/um, true) return [ :externalentity, match[1] ] - elsif @source.match(/\]\s*>/um, true) + elsif @source.match?(/\]\s*>/um, true) @document_status = :after_doctype return [ :end_doctype ] end @@ -428,16 +428,16 @@ def pull_event end end if @document_status == :after_doctype - @source.match(/\s*/um, true) + @source.match?(/\s*/um, true) end begin start_position = @source.position - if @source.match("<", true) + if @source.match?("<", true) # :text's read_until may remain only "<" in buffer. In the # case, buffer is empty here. So we need to fill buffer # here explicitly. @source.ensure_buffer - if @source.match("/", true) + if @source.match?("/", true) @namespaces_restore_stack.pop last_tag = @tags.pop md = @source.match(Private::CLOSE_PATTERN, true) @@ -452,7 +452,7 @@ def pull_event raise REXML::ParseException.new(message, @source) end return [ :end_element, last_tag ] - elsif @source.match("!", true) + elsif @source.match?("!", true) md = @source.match(/([^>]*>)/um) #STDERR.puts "SOURCE BUFFER = #{source.buffer}, #{source.buffer.size}" raise REXML::ParseException.new("Malformed node", @source) unless md @@ -470,7 +470,7 @@ def pull_event end raise REXML::ParseException.new( "Declarations can only occur "+ "in the doctype declaration.", @source) - elsif @source.match("?", true) + elsif @source.match?("?", true) return process_instruction else # Get the next tag @@ -651,7 +651,7 @@ def need_source_encoding_update?(xml_declaration_encoding) def parse_name(base_error_message) md = @source.match(Private::NAME_PATTERN, true) unless md - if @source.match(/\S/um) + if @source.match?(/\S/um) message = "#{base_error_message}: invalid name" else message = "#{base_error_message}: name is missing" @@ -693,34 +693,34 @@ def parse_id_invalid_details(accept_external_id:, accept_public_id:) public = /\A\s*PUBLIC/um system = /\A\s*SYSTEM/um - if (accept_external_id or accept_public_id) and @source.match(/#{public}/um) - if @source.match(/#{public}(?:\s+[^'"]|\s*[\[>])/um) + if (accept_external_id or accept_public_id) and @source.match?(/#{public}/um) + if @source.match?(/#{public}(?:\s+[^'"]|\s*[\[>])/um) return "public ID literal is missing" end - unless @source.match(/#{public}\s+#{PUBIDLITERAL}/um) + unless @source.match?(/#{public}\s+#{PUBIDLITERAL}/um) return "invalid public ID literal" end if accept_public_id - if @source.match(/#{public}\s+#{PUBIDLITERAL}\s+[^'"]/um) + if @source.match?(/#{public}\s+#{PUBIDLITERAL}\s+[^'"]/um) return "system ID literal is missing" end - unless @source.match(/#{public}\s+#{PUBIDLITERAL}\s+#{SYSTEMLITERAL}/um) + unless @source.match?(/#{public}\s+#{PUBIDLITERAL}\s+#{SYSTEMLITERAL}/um) return "invalid system literal" end "garbage after system literal" else "garbage after public ID literal" end - elsif accept_external_id and @source.match(/#{system}/um) - if @source.match(/#{system}(?:\s+[^'"]|\s*[\[>])/um) + elsif accept_external_id and @source.match?(/#{system}/um) + if @source.match?(/#{system}(?:\s+[^'"]|\s*[\[>])/um) return "system literal is missing" end - unless @source.match(/#{system}\s+#{SYSTEMLITERAL}/um) + unless @source.match?(/#{system}\s+#{SYSTEMLITERAL}/um) return "invalid system literal" end "garbage after system literal" else - unless @source.match(/\A\s*(?:PUBLIC|SYSTEM)\s/um) + unless @source.match?(/\A\s*(?:PUBLIC|SYSTEM)\s/um) return "invalid ID type" end "ID type is missing" @@ -729,7 +729,7 @@ def parse_id_invalid_details(accept_external_id:, def process_instruction name = parse_name("Malformed XML: Invalid processing instruction node") - if @source.match(/\s+/um, true) + if @source.match?(/\s+/um, true) match_data = @source.match(/(.*?)\?>/um, true) unless match_data raise ParseException.new("Malformed XML: Unclosed processing instruction", @source) @@ -737,7 +737,7 @@ def process_instruction content = match_data[1] else content = nil - unless @source.match("?>", true) + unless @source.match?("?>", true) raise ParseException.new("Malformed XML: Unclosed processing instruction", @source) end end @@ -767,9 +767,9 @@ def parse_attributes(prefixes) expanded_names = {} closed = false while true - if @source.match(">", true) + if @source.match?(">", true) return attributes, closed - elsif @source.match("/>", true) + elsif @source.match?("/>", true) closed = true return attributes, closed elsif match = @source.match(QNAME, true) @@ -777,7 +777,7 @@ def parse_attributes(prefixes) prefix = match[2] local_part = match[3] - unless @source.match(/\s*=\s*/um, true) + unless @source.match?(/\s*=\s*/um, true) message = "Missing attribute equal: <#{name}>" raise REXML::ParseException.new(message, @source) end @@ -793,7 +793,7 @@ def parse_attributes(prefixes) message = "Missing attribute value end quote: <#{name}>: <#{quote}>" raise REXML::ParseException.new(message, @source) end - @source.match(/\s*/um, true) + @source.match?(/\s*/um, true) if prefix == "xmlns" if local_part == "xml" if value != Private::XML_PREFIXED_NAMESPACE diff --git a/lib/rexml/source.rb b/lib/rexml/source.rb index dc0b5323..27a6349a 100644 --- a/lib/rexml/source.rb +++ b/lib/rexml/source.rb @@ -18,6 +18,16 @@ def scan(pattern) pattern = /#{Regexp.escape(pattern)}/ if pattern.is_a?(String) super(pattern) end + + def match?(pattern) + pattern = /#{Regexp.escape(pattern)}/ if pattern.is_a?(String) + super(pattern) + end + + def skip(pattern) + pattern = /#{Regexp.escape(pattern)}/ if pattern.is_a?(String) + super(pattern) + end end end using StringScannerCheckScanString @@ -126,6 +136,14 @@ def match(pattern, cons=false) end end + def match?(pattern, cons=false) + if cons + !@scanner.skip(pattern).nil? + else + !@scanner.match?(pattern).nil? + end + end + def position @scanner.pos end @@ -267,6 +285,23 @@ def match( pattern, cons=false ) md.nil? ? nil : @scanner end + def match?( pattern, cons=false ) + # To avoid performance issue, we need to increase bytes to read per scan + min_bytes = 1 + while true + if cons + n_matched_bytes = @scanner.skip(pattern) + else + n_matched_bytes = @scanner.match?(pattern) + end + return true if n_matched_bytes + return false if pattern.is_a?(String) + return false if @source.nil? + return false unless read(nil, min_bytes) + min_bytes *= 2 + end + end + def empty? super and ( @source.nil? || @source.eof? ) end From 519ae6c0f3c0b08101e7251fddcc941308d9be87 Mon Sep 17 00:00:00 2001 From: Kevin Ebaugh Date: Tue, 29 Oct 2024 20:20:16 -0400 Subject: [PATCH 36/91] Clarify variable name (#218) Co-authored-by: Olle Jonsson --- lib/rexml/source.rb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/rexml/source.rb b/lib/rexml/source.rb index 27a6349a..9a9b8f5d 100644 --- a/lib/rexml/source.rb +++ b/lib/rexml/source.rb @@ -87,7 +87,7 @@ def initialize(arg, encoding=nil) detect_encoding end @line = 0 - @term_encord = {} + @encoded_terms = {} end # The current buffer (what we're going to read next) @@ -246,7 +246,7 @@ def read(term = nil, min_bytes = 1) def read_until(term) pattern = Private::PRE_DEFINED_TERM_PATTERNS[term] || /#{Regexp.escape(term)}/ - term = @term_encord[term] ||= encode(term) + term = @encoded_terms[term] ||= encode(term) until str = @scanner.scan_until(pattern) break if @source.nil? break if @source.eof? From ed9168e187c6c8a0cb244f863c841d8f8fc8effd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jo=C3=A3o=20Duarte?= Date: Wed, 6 Nov 2024 00:06:51 +0000 Subject: [PATCH 37/91] Stop requiring stringio dynamically (#219) `SourceFactory::create_from(String)` will always run the `require 'stringio'` operation. This prevents a multi-threaded JRuby application from parsing xml on separate threads concurrently given that `require` will pass through a synchronized piece of code. An experiment in removing this `require` lead to a 10x performance improvement on 10 threads parsing incoming strings on xml. For more details see https://github.com/logstash-plugins/logstash-filter-xml/issues/83 --- lib/rexml/source.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/rexml/source.rb b/lib/rexml/source.rb index 9a9b8f5d..655164f3 100644 --- a/lib/rexml/source.rb +++ b/lib/rexml/source.rb @@ -1,6 +1,7 @@ # coding: US-ASCII # frozen_string_literal: false +require "stringio" require "strscan" require_relative 'encoding' @@ -45,7 +46,6 @@ def SourceFactory::create_from(arg) arg.respond_to? :eof? IOSource.new(arg) elsif arg.respond_to? :to_str - require 'stringio' IOSource.new(StringIO.new(arg)) elsif arg.kind_of? Source arg From 20562ec7bb15226b1820d53e16770a446089026e Mon Sep 17 00:00:00 2001 From: Dmitry Pogrebnoy <39134692+DmitryPogrebnoy@users.noreply.github.com> Date: Fri, 8 Nov 2024 06:11:34 +0100 Subject: [PATCH 38/91] parser pull: Add support for reusing parser (#220) GitHub: Fix GH-214 This is for parsing XML documents stream. We can use one parser to parse multiple XML documents with this feature. Co-authored-by: Dmitry Pogrebnoy --- lib/rexml/parsers/baseparser.rb | 4 ++++ lib/rexml/parsers/pullparser.rb | 4 ++++ test/test_pullparser.rb | 33 +++++++++++++++++++++++++++++++++ 3 files changed, 41 insertions(+) diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb index ff72ce44..90851bb1 100644 --- a/lib/rexml/parsers/baseparser.rb +++ b/lib/rexml/parsers/baseparser.rb @@ -181,6 +181,10 @@ def add_listener( listener ) def stream=( source ) @source = SourceFactory.create_from( source ) + reset + end + + def reset @closed = nil @have_root = false @document_status = nil diff --git a/lib/rexml/parsers/pullparser.rb b/lib/rexml/parsers/pullparser.rb index a331eff5..e0b1e94d 100644 --- a/lib/rexml/parsers/pullparser.rb +++ b/lib/rexml/parsers/pullparser.rb @@ -93,6 +93,10 @@ def pull def unshift token @my_stack.unshift token end + + def reset + @parser.reset + end end # A parsing event. The contents of the event are accessed as an +Array?, diff --git a/test/test_pullparser.rb b/test/test_pullparser.rb index bdf8be17..4471df4b 100644 --- a/test/test_pullparser.rb +++ b/test/test_pullparser.rb @@ -156,6 +156,39 @@ def test_peek assert_equal( 0, names.length ) end + def test_reset + xml_chunks = [ + "First valid and complete message", + "Second valid and complete message", + "Third valid and complete message" + ] + + messages = [] + + IO.pipe do |reader, writer| + xml_chunks.each do |chunk| + writer.write(chunk) + end + writer.close + + parser = REXML::Parsers::PullParser.new(reader) + while parser.has_next? + parser.pull + message_text = parser.pull + messages << message_text[0] + parser.pull + parser.reset + end + end + + assert_equal( + messages, + ["First valid and complete message", + "Second valid and complete message", + "Third valid and complete message"] + ) + end + class EntityExpansionLimitTest < Test::Unit::TestCase class GeneralEntityTest < self def test_have_value From 963ccdf830d64044676830b87ef2b4189fbefe2b Mon Sep 17 00:00:00 2001 From: NAITOH Jun Date: Tue, 19 Nov 2024 09:35:04 +0900 Subject: [PATCH 39/91] Fix error handling when parsing XML via IO.pipe (#221) ## Why? If via IO.pipe, `IOError` exception is not raised, but `Errno::ESPIPE` or `Errno::EPIPE` or `Errno::EINVAL` exception is raised. - CRuby ``` @er_source.pos #=> Errno::ESPIPE Exception: Illegal seek ``` - CRuby (Windows (Ruby 2.6 or earlier)) ``` @er_source.pos #=> Errno::EINVAL: Invalid argument ``` - JRuby ``` @er_source.pos #=> Errno::EPIPE: Broken pipe - No message available ``` Co-authored-by: Sutou Kouhei --- lib/rexml/source.rb | 2 +- test/parse/test_text.rb | 24 ++++++++++++++++++++---- 2 files changed, 21 insertions(+), 5 deletions(-) diff --git a/lib/rexml/source.rb b/lib/rexml/source.rb index 655164f3..b0b89b71 100644 --- a/lib/rexml/source.rb +++ b/lib/rexml/source.rb @@ -321,7 +321,7 @@ def current_line rescue end @er_source.seek(pos) - rescue IOError + rescue IOError, SystemCallError pos = -1 line = -1 end diff --git a/test/parse/test_text.rb b/test/parse/test_text.rb index bb208d47..eb6de0cb 100644 --- a/test/parse/test_text.rb +++ b/test/parse/test_text.rb @@ -6,10 +6,7 @@ class TestParseText < Test::Unit::TestCase class TestInvalid < self def test_text_only exception = assert_raise(REXML::ParseException) do - parser = REXML::Parsers::BaseParser.new('a') - while parser.has_next? - parser.pull - end + REXML::Parsers::BaseParser.new('a').pull end assert_equal(<<~DETAIL.chomp, exception.to_s) @@ -21,6 +18,25 @@ def test_text_only DETAIL end + def test_text_only_with_io_pipe + IO.pipe do |reader, writer| + writer.write('a') + writer.close + + exception = assert_raise(REXML::ParseException) do + REXML::Parsers::BaseParser.new(reader).pull + end + + assert_equal(<<~DETAIL.chomp, exception.to_s) + Malformed XML: Content at the start of the document (got 'a') + Line: -1 + Position: -1 + Last 80 unconsumed characters: + + DETAIL + end + end + def test_before_root exception = assert_raise(REXML::ParseException) do parser = REXML::Parsers::BaseParser.new('b') From 46dd810e001763e42ee08346e71c46dbc8cfa7fd Mon Sep 17 00:00:00 2001 From: Jun Aruga Date: Thu, 28 Nov 2024 02:24:27 +0100 Subject: [PATCH 40/91] test: Fix NameError: uninitialized constant REXML::Parsers::PullParser (#222) This commit fixes the following NameError when running only specific test file test/parse/test_character_reference.rb. ``` $ ruby test/parse/test_character_reference.rb -v Loaded suite test/parse/test_character_reference Started REXMLTests::TestParseCharacterReference: test_hex_precedding_zero: E =========================================================================================== Error: test_hex_precedding_zero(REXMLTests::TestParseCharacterReference): NameError: uninitialized constant REXML::Parsers::PullParser test/parse/test_character_reference.rb:18:in 'REXMLTests::TestParseCharacterReference#test_hex_precedding_zero' =========================================================================================== : (0.001470) test_linear_performance_many_preceding_zeros: .: (0.033970) Finished in 0.036068018 seconds. ------------------------------------------------------------------------------------------- 2 tests, 15 assertions, 0 failures, 1 errors, 0 pendings, 0 omissions, 0 notifications 50% passed ------------------------------------------------------------------------------------------- 55.45 tests/s, 415.88 assertions/s ``` --- test/parse/test_character_reference.rb | 1 + 1 file changed, 1 insertion(+) diff --git a/test/parse/test_character_reference.rb b/test/parse/test_character_reference.rb index 4bb5da5c..bf2b1938 100644 --- a/test/parse/test_character_reference.rb +++ b/test/parse/test_character_reference.rb @@ -2,6 +2,7 @@ require "core_assertions" require "rexml/document" +require "rexml/parsers/pullparser" module REXMLTests class TestParseCharacterReference < Test::Unit::TestCase From 91305c191152107bd9ea8d026f3777e5f00c80b5 Mon Sep 17 00:00:00 2001 From: NAITOH Jun Date: Mon, 9 Dec 2024 09:58:05 +0900 Subject: [PATCH 41/91] Remove old code for Ruby 1.8 (#223) ## Why? `String#encode` is supported in Ruby 1.9 and later. --- lib/rexml/text.rb | 55 +++++++++++++---------------------------------- 1 file changed, 15 insertions(+), 40 deletions(-) diff --git a/lib/rexml/text.rb b/lib/rexml/text.rb index 997f77d3..2bf480fb 100644 --- a/lib/rexml/text.rb +++ b/lib/rexml/text.rb @@ -29,31 +29,16 @@ class Text < Child (0x10000..0x10FFFF) ] - if String.method_defined? :encode - VALID_XML_CHARS = Regexp.new('^['+ - VALID_CHAR.map { |item| - case item - when Integer - [item].pack('U').force_encoding('utf-8') - when Range - [item.first, '-'.ord, item.last].pack('UUU').force_encoding('utf-8') - end - }.join + - ']*$') - else - VALID_XML_CHARS = /^( - [\x09\x0A\x0D\x20-\x7E] # ASCII - | [\xC2-\xDF][\x80-\xBF] # non-overlong 2-byte - | \xE0[\xA0-\xBF][\x80-\xBF] # excluding overlongs - | [\xE1-\xEC\xEE][\x80-\xBF]{2} # straight 3-byte - | \xEF[\x80-\xBE]{2} # - | \xEF\xBF[\x80-\xBD] # excluding U+fffe and U+ffff - | \xED[\x80-\x9F][\x80-\xBF] # excluding surrogates - | \xF0[\x90-\xBF][\x80-\xBF]{2} # planes 1-3 - | [\xF1-\xF3][\x80-\xBF]{3} # planes 4-15 - | \xF4[\x80-\x8F][\x80-\xBF]{2} # plane 16 - )*$/nx; - end + VALID_XML_CHARS = Regexp.new('^['+ + VALID_CHAR.map { |item| + case item + when Integer + [item].pack('U').force_encoding('utf-8') + when Range + [item.first, '-'.ord, item.last].pack('UUU').force_encoding('utf-8') + end + }.join + + ']*$') # Constructor # +arg+ if a String, the content is set to the String. If a Text, @@ -132,21 +117,11 @@ def Text.check string, pattern, doctype # illegal anywhere if !string.match?(VALID_XML_CHARS) - if String.method_defined? :encode - string.chars.each do |c| - case c.ord - when *VALID_CHAR - else - raise "Illegal character #{c.inspect} in raw string #{string.inspect}" - end - end - else - string.scan(/[\x00-\x7F]|[\x80-\xBF][\xC0-\xF0]*|[\xC0-\xF0]/n) do |c| - case c.unpack('U') - when *VALID_CHAR - else - raise "Illegal character #{c.inspect} in raw string #{string.inspect}" - end + string.chars.each do |c| + case c.ord + when *VALID_CHAR + else + raise "Illegal character #{c.inspect} in raw string #{string.inspect}" end end end From dfc775343b3b4aec4c046b9df17bbe571612a861 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Sun, 15 Dec 2024 11:14:48 +0900 Subject: [PATCH 42/91] release: use Trusted Publishing --- .github/workflows/release.yml | 18 ++++++++++++++++++ Rakefile | 9 +++++++++ 2 files changed, 27 insertions(+) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 20ff87e7..76269f44 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -28,3 +28,21 @@ jobs: --title "${title}" env: GH_TOKEN: ${{ github.token }} + + rubygems: + name: RubyGems + runs-on: ubuntu-latest + timeout-minutes: 10 + permissions: + id-token: write + environment: release + steps: + - uses: actions/checkout@v4 + - uses: ruby/setup-ruby@v1 + with: + ruby-version: ruby + bundler-cache: true + - uses: rubygems/configure-rubygems-credentials@v1.0.0 + - name: Push gems + run: | + bundle exec rake release:rubygem_push diff --git a/Rakefile b/Rakefile index 4676930b..8f42da1f 100644 --- a/Rakefile +++ b/Rakefile @@ -67,3 +67,12 @@ end desc "Run all benchmarks" task :benchmark => benchmark_tasks + +release_task = Rake.application["release"] +release_task.prerequisites.delete("build") +release_task.prerequisites.delete("release:rubygem_push") +release_task_comment = release_task.comment +if release_task_comment + release_task.clear_comments + release_task.comment = release_task_comment.gsub(/ and build.*$/, "") +end From 19d8ebfbcfb3816fdad050baa74da40d6f04ffac Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Sun, 15 Dec 2024 11:18:54 +0900 Subject: [PATCH 43/91] Add 3.4.0 entry --- NEWS.md | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/NEWS.md b/NEWS.md index 3d17c287..f25a33f2 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,34 @@ # News +## 3.4.0 - 2024-12-15 {#version-3-4-0} + +### Improvement + + * Improved performance. + * GH-216 + * Patch by NAITOH Jun + + * JRuby: Improved parse performance. + * GH-219 + * Patch by João Duarte + + * Added support for reusing pull parser. + * GH-214 + * GH-220 + * Patch by Dmitry Pogrebnoy + + * Improved error handling when source is `IO`. + * GH-221 + * Patch by NAITOH Jun + +### Thanks + + * NAITOH Jun + + * João Duarte + + * Dmitry Pogrebnoy + ## 3.3.9 - 2024-10-24 {#version-3-3-9} ### Improvements From a1d875b23340df6b33d3bbe6b17cca807eb0e3d2 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Sun, 15 Dec 2024 11:19:55 +0900 Subject: [PATCH 44/91] Bump version --- lib/rexml/rexml.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/rexml/rexml.rb b/lib/rexml/rexml.rb index 42623b08..a653f028 100644 --- a/lib/rexml/rexml.rb +++ b/lib/rexml/rexml.rb @@ -31,7 +31,7 @@ module REXML COPYRIGHT = "Copyright © 2001-2008 Sean Russell " DATE = "2008/019" - VERSION = "3.4.0" + VERSION = "3.4.1" REVISION = "" Copyright = COPYRIGHT From bb0bedd25dbb69b247b0894a6c357f8903a2b9a2 Mon Sep 17 00:00:00 2001 From: NAITOH Jun Date: Thu, 19 Dec 2024 11:18:52 +0900 Subject: [PATCH 45/91] Optimize `IOSource#read_until` method by using `StringScanner#check_until(string)` (#226) ## Why? `StringScanner#check_until(string)` is faster than `StringScanner#check_until(regex)`. See: - https://github.com/ruby/strscan/pull/106 - https://github.com/ruby/strscan/pull/111 ## Benchmark ``` RUBYLIB= BUNDLER_ORIG_RUBYLIB= /Users/naitoh/.rbenv/versions/3.3.4/bin/ruby -v -S benchmark-driver /Users/naitoh/ghq/github.com/naitoh/rexml/benchmark/parse.yaml ruby 3.3.4 (2024-07-09 revision be1089c8ec) [arm64-darwin22] Calculating ------------------------------------- before after before(YJIT) after(YJIT) dom 19.459 19.840 35.035 35.786 i/s - 100.000 times in 5.139034s 5.040369s 2.854304s 2.794367s sax 30.057 30.026 52.986 53.716 i/s - 100.000 times in 3.326998s 3.330499s 1.887303s 1.861652s pull 33.777 34.415 62.294 64.020 i/s - 100.000 times in 2.960622s 2.905668s 1.605284s 1.562002s stream 33.789 34.003 60.174 60.411 i/s - 100.000 times in 2.959521s 2.940916s 1.661845s 1.655334s Comparison: dom after(YJIT): 35.8 i/s before(YJIT): 35.0 i/s - 1.02x slower after: 19.8 i/s - 1.80x slower before: 19.5 i/s - 1.84x slower sax after(YJIT): 53.7 i/s before(YJIT): 53.0 i/s - 1.01x slower before: 30.1 i/s - 1.79x slower after: 30.0 i/s - 1.79x slower pull after(YJIT): 64.0 i/s before(YJIT): 62.3 i/s - 1.03x slower after: 34.4 i/s - 1.86x slower before: 33.8 i/s - 1.90x slower stream after(YJIT): 60.4 i/s before(YJIT): 60.2 i/s - 1.00x slower after: 34.0 i/s - 1.78x slower before: 33.8 i/s - 1.79x slower ``` - YJIT=ON : 1.00x - 1.03x faster - YJIT=OFF : 1.00x - 1.02x faster --- lib/rexml/source.rb | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/lib/rexml/source.rb b/lib/rexml/source.rb index b0b89b71..2409f76e 100644 --- a/lib/rexml/source.rb +++ b/lib/rexml/source.rb @@ -68,8 +68,14 @@ module Private SCANNER_RESET_SIZE = 100000 PRE_DEFINED_TERM_PATTERNS = {} pre_defined_terms = ["'", '"', "<"] - pre_defined_terms.each do |term| - PRE_DEFINED_TERM_PATTERNS[term] = /#{Regexp.escape(term)}/ + if StringScanner::Version < "3.1.1" + pre_defined_terms.each do |term| + PRE_DEFINED_TERM_PATTERNS[term] = /#{Regexp.escape(term)}/ + end + else + pre_defined_terms.each do |term| + PRE_DEFINED_TERM_PATTERNS[term] = term + end end end private_constant :Private From b70388c2638d90ebd2ae471bd85239d8469b8e62 Mon Sep 17 00:00:00 2001 From: NAITOH Jun Date: Sat, 21 Dec 2024 07:59:47 +0900 Subject: [PATCH 46/91] Use `StringScanner#peek_byte` to get double or single quotation mark (#227) ## Why? `StringScanner#peek_byte` is fast, because it does not generate String object. ## Benchmark ``` RUBYLIB= BUNDLER_ORIG_RUBYLIB= /Users/naitoh/.rbenv/versions/3.3.4/bin/ruby -v -S benchmark-driver /Users/naitoh/ghq/github.com/naitoh/rexml/benchmark/parse.yaml ruby 3.3.4 (2024-07-09 revision be1089c8ec) [arm64-darwin22] Calculating ------------------------------------- before after before(YJIT) after(YJIT) dom 19.753 19.888 35.641 35.928 i/s - 100.000 times in 5.062402s 5.028121s 2.805792s 2.783339s sax 30.349 30.978 53.485 57.885 i/s - 100.000 times in 3.295012s 3.228103s 1.869671s 1.727567s pull 34.170 35.436 61.713 66.534 i/s - 100.000 times in 2.926534s 2.821955s 1.620404s 1.502996s stream 33.121 35.268 60.751 63.276 i/s - 100.000 times in 3.019222s 2.835443s 1.646065s 1.580374s Comparison: dom after(YJIT): 35.9 i/s before(YJIT): 35.6 i/s - 1.01x slower after: 19.9 i/s - 1.81x slower before: 19.8 i/s - 1.82x slower sax after(YJIT): 57.9 i/s before(YJIT): 53.5 i/s - 1.08x slower after: 31.0 i/s - 1.87x slower before: 30.3 i/s - 1.91x slower pull after(YJIT): 66.5 i/s before(YJIT): 61.7 i/s - 1.08x slower after: 35.4 i/s - 1.88x slower before: 34.2 i/s - 1.95x slower stream after(YJIT): 63.3 i/s before(YJIT): 60.8 i/s - 1.04x slower after: 35.3 i/s - 1.79x slower before: 33.1 i/s - 1.91x slower ``` - YJIT=ON : 1.01x - 1.08x faster - YJIT=OFF : 1.00x - 1.06x faster Co-authored-by: Sutou Kouhei --- lib/rexml/parsers/baseparser.rb | 22 ++++++++++++++++++++-- lib/rexml/source.rb | 8 ++++++++ 2 files changed, 28 insertions(+), 2 deletions(-) diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb index 90851bb1..13cdd821 100644 --- a/lib/rexml/parsers/baseparser.rb +++ b/lib/rexml/parsers/baseparser.rb @@ -766,6 +766,25 @@ def process_instruction [:processing_instruction, name, content] end + if StringScanner::Version < "3.1.1" + def scan_quote + @source.match(/(['"])/, true)&.[](1) + end + else + def scan_quote + case @source.peek_byte + when 34 # '"'.ord + @source.scan_byte + '"' + when 39 # "'".ord + @source.scan_byte + "'" + else + nil + end + end + end + def parse_attributes(prefixes) attributes = {} expanded_names = {} @@ -785,11 +804,10 @@ def parse_attributes(prefixes) message = "Missing attribute equal: <#{name}>" raise REXML::ParseException.new(message, @source) end - unless match = @source.match(/(['"])/, true) + unless quote = scan_quote message = "Missing attribute value start quote: <#{name}>" raise REXML::ParseException.new(message, @source) end - quote = match[1] start_position = @source.position value = @source.read_until(quote) unless value.chomp!(quote) diff --git a/lib/rexml/source.rb b/lib/rexml/source.rb index 2409f76e..5ba5ab12 100644 --- a/lib/rexml/source.rb +++ b/lib/rexml/source.rb @@ -158,6 +158,14 @@ def position=(pos) @scanner.pos = pos end + def peek_byte + @scanner.peek_byte + end + + def scan_byte + @scanner.scan_byte + end + # @return true if the Source is exhausted def empty? @scanner.eos? From a4bf93a65e03c6bf26c688a8a616ad135f89244f Mon Sep 17 00:00:00 2001 From: OlofKalufs Date: Mon, 20 Jan 2025 15:38:08 +0100 Subject: [PATCH 47/91] Added rdoc as a development dependency (for Ruby 3.5+) (#235) Ruby 3.5+ requires that rdoc explicitly be declared as a dependency Should sort out GitHub Actions that are failing due to this --- Gemfile | 1 + 1 file changed, 1 insertion(+) diff --git a/Gemfile b/Gemfile index 1710ec99..d323e2c5 100644 --- a/Gemfile +++ b/Gemfile @@ -16,6 +16,7 @@ group :development do # depends on "ostruct" explicitly. gem "ostruct" gem "rake" + gem "rdoc" end group :benchmark do From 107e273337b2e2160d6b0b15e10d0a9da0b9e164 Mon Sep 17 00:00:00 2001 From: OlofKalufs Date: Mon, 20 Jan 2025 23:13:00 +0100 Subject: [PATCH 48/91] Fix serialization of ATTLIST is incorrect (#234) GitHub: fix #233 Changed so that " EOL + assert_equal '', doc.doctype.children[0].to_s.gsub(/\s+/, " ") assert_equal 'gobble', doc.root.attributes['bar'] assert_equal 'xxx', doc.root.elements[2].namespace assert_equal 'two', doc.root.elements[1].namespace From f63c510287d29c2d6261ad94a641cb93f731be4a Mon Sep 17 00:00:00 2001 From: NAITOH Jun Date: Fri, 24 Jan 2025 09:55:57 +0900 Subject: [PATCH 49/91] Changed benchmark target to Ruby latest (#236) Ruby 3.4 has been released, we will change our benchmark target to Ruby latest(3.4). Co-authored-by: Sutou Kouhei --- .github/workflows/benchmark.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index 52349b44..2c638b03 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -11,7 +11,7 @@ jobs: fail-fast: false matrix: ruby-version: - - '3.3' + - 'ruby' runs-on: - ubuntu-latest runs-on: ${{ matrix.runs-on }} From 67d21be36c87d23b7a00c4f50017d9db977319d2 Mon Sep 17 00:00:00 2001 From: NAITOH Jun Date: Sun, 26 Jan 2025 19:56:59 +0900 Subject: [PATCH 50/91] Reduced regular expression processing in the form of processing white space first (#237) ## Benchmark ``` RUBYLIB= BUNDLER_ORIG_RUBYLIB= /Users/naitoh/.rbenv/versions/3.4.1/bin/ruby -v -S benchmark-driver /Users/naitoh/ghq/github.com/naitoh/rexml/benchmark/parse.yaml ruby 3.4.1 (2024-12-25 revision 48d4efcb85) +PRISM [arm64-darwin24] Calculating ------------------------------------- before after before(YJIT) after(YJIT) dom 19.849 20.109 36.064 38.655 i/s - 100.000 times in 5.038102s 4.972864s 2.772838s 2.586981s sax 30.339 30.449 52.946 54.873 i/s - 100.000 times in 3.296102s 3.284176s 1.888722s 1.822391s pull 34.785 34.916 65.808 65.219 i/s - 100.000 times in 2.874810s 2.863976s 1.519581s 1.533305s stream 34.766 34.921 61.920 63.277 i/s - 100.000 times in 2.876359s 2.863571s 1.615000s 1.580354s Comparison: dom after(YJIT): 38.7 i/s before(YJIT): 36.1 i/s - 1.07x slower after: 20.1 i/s - 1.92x slower before: 19.8 i/s - 1.95x slower sax after(YJIT): 54.9 i/s before(YJIT): 52.9 i/s - 1.04x slower after: 30.4 i/s - 1.80x slower before: 30.3 i/s - 1.81x slower pull before(YJIT): 65.8 i/s after(YJIT): 65.2 i/s - 1.01x slower after: 34.9 i/s - 1.88x slower before: 34.8 i/s - 1.89x slower stream after(YJIT): 63.3 i/s before(YJIT): 61.9 i/s - 1.02x slower after: 34.9 i/s - 1.81x slower before: 34.8 i/s - 1.82x slower ``` - YJIT=ON : 0.99x - 1.07x faster - YJIT=OFF : 1.00x - 1.01x faster --- lib/rexml/parsers/baseparser.rb | 13 ++++++++----- test/parse/test_document_type_declaration.rb | 10 +++++----- 2 files changed, 13 insertions(+), 10 deletions(-) diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb index 87f50f09..44aacfa2 100644 --- a/lib/rexml/parsers/baseparser.rb +++ b/lib/rexml/parsers/baseparser.rb @@ -297,10 +297,11 @@ def pull_event raise REXML::ParseException.new(message, @source) end name = parse_name(base_error_message) - if @source.match?(/\s*\[/um, true) + @source.match?(/\s*/um, true) # skip spaces + if @source.match?("[", true) id = [nil, nil, nil] @document_status = :in_doctype - elsif @source.match?(/\s*>/um, true) + elsif @source.match?(">", true) id = [nil, nil, nil] @document_status = :after_doctype @source.ensure_buffer @@ -312,9 +313,10 @@ def pull_event # For backward compatibility id[1], id[2] = id[2], nil end - if @source.match?(/\s*\[/um, true) + @source.match?(/\s*/um, true) # skip spaces + if @source.match?("[", true) @document_status = :in_doctype - elsif @source.match?(/\s*>/um, true) + elsif @source.match?(">", true) @document_status = :after_doctype @source.ensure_buffer else @@ -409,7 +411,8 @@ def pull_event id = parse_id(base_error_message, accept_external_id: true, accept_public_id: true) - unless @source.match?(/\s*>/um, true) + @source.match?(/\s*/um, true) # skip spaces + unless @source.match?(">", true) message = "#{base_error_message}: garbage before end >" raise REXML::ParseException.new(message, @source) end diff --git a/test/parse/test_document_type_declaration.rb b/test/parse/test_document_type_declaration.rb index 99c23745..b22863a9 100644 --- a/test/parse/test_document_type_declaration.rb +++ b/test/parse/test_document_type_declaration.rb @@ -153,7 +153,7 @@ def test_no_literal Line: 3 Position: 26 Last 80 unconsumed characters: - SYSTEM> +SYSTEM> DETAIL end @@ -200,7 +200,7 @@ def test_content_double_quote Line: 3 Position: 62 Last 80 unconsumed characters: - PUBLIC 'double quote " is invalid' "r.dtd"> +PUBLIC 'double quote " is invalid' "r.dtd"> DETAIL end @@ -228,10 +228,10 @@ def test_garbage_after_literal end assert_equal(<<-DETAIL.chomp, exception.to_s) Malformed DOCTYPE: garbage after external ID -Line: 3 -Position: 65 +Line: 1 +Position: 58 Last 80 unconsumed characters: -x'> +x'> DETAIL end From bfb37e9ca4cb974c9bb2dc2f06e1202719d1bc4d Mon Sep 17 00:00:00 2001 From: NAITOH Jun Date: Sun, 16 Feb 2025 10:57:37 +0900 Subject: [PATCH 51/91] Add 3.4.1 entry (#239) --- NEWS.md | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/NEWS.md b/NEWS.md index f25a33f2..51a45cab 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,29 @@ # News +## 3.4.1 - 2025-02-16 {#version-3-4-1} + +### Improvement + + * Improved performance. + * GH-226 + * GH-227 + * GH-237 + * Patch by NAITOH Jun + +### Fixes + + * Fix serialization of ATTLIST is incorrect + * GH-233 + * GH-234 + * Patch by OlofKalufs + * Reported by OlofKalufs + +### Thanks + + * NAITOH Jun + + * OlofKalufs + ## 3.4.0 - 2024-12-15 {#version-3-4-0} ### Improvement From b97e454ceb2e1719a487bfebaae3da4a706a854b Mon Sep 17 00:00:00 2001 From: NAITOH Jun Date: Sun, 16 Feb 2025 16:48:06 +0900 Subject: [PATCH 52/91] Bump version --- lib/rexml/rexml.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/rexml/rexml.rb b/lib/rexml/rexml.rb index a653f028..bf3c0d32 100644 --- a/lib/rexml/rexml.rb +++ b/lib/rexml/rexml.rb @@ -31,7 +31,7 @@ module REXML COPYRIGHT = "Copyright © 2001-2008 Sean Russell " DATE = "2008/019" - VERSION = "3.4.1" + VERSION = "3.4.2" REVISION = "" Copyright = COPYRIGHT From 64a709e74551d5968f2241a772876f4b0c8dea22 Mon Sep 17 00:00:00 2001 From: NAITOH Jun Date: Sun, 2 Mar 2025 11:38:54 +0900 Subject: [PATCH 53/91] Improve CDATA parse performance (#244) ## Why? GitHub: fix #243 ## Benchmark (Comparison with rexml 3.4.1) ``` $ benchmark-driver benchmark/parse_cdata.yaml Calculating ------------------------------------- rexml 3.4.1 master 3.4.1(YJIT) master(YJIT) dom 648.361 1.178k 591.590 1.046k i/s - 100.000 times in 0.154235s 0.084913s 0.169036s 0.095627s sax 699.061 1.378k 651.148 1.196k i/s - 100.000 times in 0.143049s 0.072549s 0.153575s 0.083611s pull 699.271 1.379k 660.275 1.210k i/s - 100.000 times in 0.143006s 0.072527s 0.151452s 0.082622s stream 701.725 1.383k 659.483 1.228k i/s - 100.000 times in 0.142506s 0.072307s 0.151634s 0.081455s Comparison: dom master: 1177.7 i/s master(YJIT): 1045.7 i/s - 1.13x slower rexml 3.4.1: 648.4 i/s - 1.82x slower 3.4.1(YJIT): 591.6 i/s - 1.99x slower sax master: 1378.4 i/s master(YJIT): 1196.0 i/s - 1.15x slower rexml 3.4.1: 699.1 i/s - 1.97x slower 3.4.1(YJIT): 651.1 i/s - 2.12x slower pull master: 1378.8 i/s master(YJIT): 1210.3 i/s - 1.14x slower rexml 3.4.1: 699.3 i/s - 1.97x slower 3.4.1(YJIT): 660.3 i/s - 2.09x slower stream master: 1383.0 i/s master(YJIT): 1227.7 i/s - 1.13x slower rexml 3.4.1: 701.7 i/s - 1.97x slower 3.4.1(YJIT): 659.5 i/s - 2.10x slower ``` - YJIT=ON : 1.76x - 1.83x faster - YJIT=OFF : 1.82x - 1.97x faster Reported by Masamune. Thanks!!! Co-authored-by: Sutou Kouhei --- benchmark/parse_cdata.yaml | 50 +++++++++++++++++++++++++++++++++ lib/rexml/parsers/baseparser.rb | 10 +++++-- lib/rexml/source.rb | 2 +- test/parse/test_cdata.rb | 20 ++++++++++++- 4 files changed, 77 insertions(+), 5 deletions(-) create mode 100644 benchmark/parse_cdata.yaml diff --git a/benchmark/parse_cdata.yaml b/benchmark/parse_cdata.yaml new file mode 100644 index 00000000..cde04306 --- /dev/null +++ b/benchmark/parse_cdata.yaml @@ -0,0 +1,50 @@ +loop_count: 100 +contexts: + - gems: + rexml: 3.2.6 + require: false + prelude: require 'rexml' + - name: master + prelude: | + $LOAD_PATH.unshift(File.expand_path("lib")) + require 'rexml' + - name: 3.2.6(YJIT) + gems: + rexml: 3.2.6 + require: false + prelude: | + require 'rexml' + RubyVM::YJIT.enable + - name: master(YJIT) + prelude: | + $LOAD_PATH.unshift(File.expand_path("lib")) + require 'rexml' + RubyVM::YJIT.enable + +prelude: | + require 'rexml/document' + require 'rexml/parsers/sax2parser' + require 'rexml/parsers/pullparser' + require 'rexml/parsers/streamparser' + require 'rexml/streamlistener' + + def build_xml(size) + xml = "\n" + + "Test\n" + + "\n" + end + xml = build_xml(100000) + + class Listener + include REXML::StreamListener + end + +benchmark: + 'dom' : REXML::Document.new(xml) + 'sax' : REXML::Parsers::SAX2Parser.new(xml).parse + 'pull' : | + parser = REXML::Parsers::PullParser.new(xml) + while parser.has_next? + parser.pull + end + 'stream' : REXML::Parsers::StreamParser.new(xml, Listener.new).parse diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb index 44aacfa2..e666c2af 100644 --- a/lib/rexml/parsers/baseparser.rb +++ b/lib/rexml/parsers/baseparser.rb @@ -471,9 +471,13 @@ def pull_event end return [ :comment, md[1] ] - else - md = @source.match(/\[CDATA\[(.*?)\]\]>/um, true) - return [ :cdata, md[1] ] if md + elsif @source.match?("[CDATA[", true) + text = @source.read_until("]]>") + if text.chomp!("]]>") + return [ :cdata, text ] + else + raise REXML::ParseException.new("Malformed CDATA: Missing end ']]>'", @source) + end end raise REXML::ParseException.new( "Declarations can only occur "+ "in the doctype declaration.", @source) diff --git a/lib/rexml/source.rb b/lib/rexml/source.rb index 5ba5ab12..3ec1141e 100644 --- a/lib/rexml/source.rb +++ b/lib/rexml/source.rb @@ -67,7 +67,7 @@ class Source module Private SCANNER_RESET_SIZE = 100000 PRE_DEFINED_TERM_PATTERNS = {} - pre_defined_terms = ["'", '"', "<"] + pre_defined_terms = ["'", '"', "<", "]]>"] if StringScanner::Version < "3.1.1" pre_defined_terms.each do |term| PRE_DEFINED_TERM_PATTERNS[term] = /#{Regexp.escape(term)}/ diff --git a/test/parse/test_cdata.rb b/test/parse/test_cdata.rb index b5f1a3bc..c742d6a1 100644 --- a/test/parse/test_cdata.rb +++ b/test/parse/test_cdata.rb @@ -7,10 +7,28 @@ module REXMLTests class TestParseCData < Test::Unit::TestCase include Test::Unit::CoreAssertions + def parse(xml) + REXML::Document.new(xml) + end + def test_linear_performance_gt seq = [10000, 50000, 100000, 150000, 200000] assert_linear_performance(seq, rehearsal: 10) do |n| - REXML::Document.new('" * n + ' ]]>') + parse('" * n + ' ]]>') + end + end + + class TestInvalid < self + def test_unclosed_cdata + exception = assert_raise(REXML::ParseException) do + parse("") + end + assert_equal(<<~DETAIL, exception.to_s) + Malformed CDATA: Missing end ']]>' + Line: 1 + Position: 25 + Last 80 unconsumed characters: + DETAIL end end end From 434909171ef3756c1ca2b84f5c90923a72c6a591 Mon Sep 17 00:00:00 2001 From: NAITOH Jun Date: Mon, 3 Mar 2025 13:47:31 +0900 Subject: [PATCH 54/91] Improve comment parse performance (#245) ## Benchmark (Comparison with rexml 3.4.1) ``` $ benchmark-driver benchmark/parse_comment.yaml Calculating ------------------------------------- rexml 3.4.1 master 3.4.1(YJIT) master(YJIT) top_level 999.440 5.058k 922.416 3.340k i/s - 100.000 times in 0.100056s 0.019770s 0.108411s 0.029936s in_doctype 1.063k 4.890k 980.498 3.341k i/s - 100.000 times in 0.094116s 0.020449s 0.101989s 0.029927s after_doctype 638.321 1.304k 603.952 1.153k i/s - 100.000 times in 0.156661s 0.076710s 0.165576s 0.086748s Comparison: top_level master: 5058.2 i/s master(YJIT): 3340.5 i/s - 1.51x slower rexml 3.4.1: 999.4 i/s - 5.06x slower 3.4.1(YJIT): 922.4 i/s - 5.48x slower in_doctype master: 4890.2 i/s master(YJIT): 3341.5 i/s - 1.46x slower rexml 3.4.1: 1062.5 i/s - 4.60x slower 3.4.1(YJIT): 980.5 i/s - 4.99x slower after_doctype master: 1303.6 i/s master(YJIT): 1152.8 i/s - 1.13x slower rexml 3.4.1: 638.3 i/s - 2.04x slower 3.4.1(YJIT): 604.0 i/s - 2.16x slower ``` - YJIT=ON : 1.90x - 3.62x faster - YJIT=OFF : 2.04x - 5.06x faster --- benchmark/parse_comment.yaml | 36 ++++++++++++++++++++++++++++++ lib/rexml/parsers/baseparser.rb | 39 ++++++++++++++------------------- test/parse/test_comment.rb | 21 +++++++++++++----- 3 files changed, 69 insertions(+), 27 deletions(-) create mode 100644 benchmark/parse_comment.yaml diff --git a/benchmark/parse_comment.yaml b/benchmark/parse_comment.yaml new file mode 100644 index 00000000..a0a3a771 --- /dev/null +++ b/benchmark/parse_comment.yaml @@ -0,0 +1,36 @@ +loop_count: 100 +contexts: + - gems: + rexml: 3.2.6 + require: false + prelude: require 'rexml' + - name: master + prelude: | + $LOAD_PATH.unshift(File.expand_path("lib")) + require 'rexml' + - name: 3.2.6(YJIT) + gems: + rexml: 3.2.6 + require: false + prelude: | + require 'rexml' + RubyVM::YJIT.enable + - name: master(YJIT) + prelude: | + $LOAD_PATH.unshift(File.expand_path("lib")) + require 'rexml' + RubyVM::YJIT.enable + +prelude: | + require 'rexml/document' + + SIZE = 100000 + + top_level_xml = "\n" + in_doctype_xml = "]>" + after_doctype_xml = "" + +benchmark: + 'top_level' : REXML::Document.new(top_level_xml) + 'in_doctype' : REXML::Document.new(in_doctype_xml) + 'after_doctype' : REXML::Document.new(after_doctype_xml) diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb index e666c2af..61d38ae2 100644 --- a/lib/rexml/parsers/baseparser.rb +++ b/lib/rexml/parsers/baseparser.rb @@ -277,14 +277,7 @@ def pull_event return process_instruction elsif @source.match?("/um, true) - if md.nil? - raise REXML::ParseException.new("Unclosed comment", @source) - end - if /--|-\z/.match?(md[1]) - raise REXML::ParseException.new("Malformed comment", @source) - end - return [ :comment, md[1] ] + return [ :comment, process_comment ] elsif @source.match?("DOCTYPE", true) base_error_message = "Malformed DOCTYPE" unless @source.match?(/\s+/um, true) @@ -417,12 +410,8 @@ def pull_event raise REXML::ParseException.new(message, @source) end return [:notationdecl, name, *id] - elsif md = @source.match(/--(.*?)-->/um, true) - case md[1] - when /--/, /-\z/ - raise REXML::ParseException.new("Malformed comment", @source) - end - return [ :comment, md[1] ] if md + elsif @source.match?("--", true) + return [ :comment, process_comment ] end elsif match = @source.match(/(%.*?;)\s*/um, true) return [ :externalentity, match[1] ] @@ -463,14 +452,8 @@ def pull_event md = @source.match(/([^>]*>)/um) #STDERR.puts "SOURCE BUFFER = #{source.buffer}, #{source.buffer.size}" raise REXML::ParseException.new("Malformed node", @source) unless md - if md[0][0] == ?- - md = @source.match(/--(.*?)-->/um, true) - - if md.nil? || /--|-\z/.match?(md[1]) - raise REXML::ParseException.new("Malformed comment", @source) - end - - return [ :comment, md[1] ] + if @source.match?("--", true) + return [ :comment, process_comment ] elsif @source.match?("[CDATA[", true) text = @source.read_until("]]>") if text.chomp!("]]>") @@ -738,6 +721,18 @@ def parse_id_invalid_details(accept_external_id:, end end + def process_comment + text = @source.read_until("-->") + unless text.chomp!("-->") + raise REXML::ParseException.new("Unclosed comment: Missing end '-->'", @source) + end + + if text.include? "--" or text.end_with?("-") + raise REXML::ParseException.new("Malformed comment", @source) + end + text + end + def process_instruction name = parse_name("Malformed XML: Invalid processing instruction node") if @source.match?(/\s+/um, true) diff --git a/test/parse/test_comment.rb b/test/parse/test_comment.rb index 4475dca7..c573e711 100644 --- a/test/parse/test_comment.rb +++ b/test/parse/test_comment.rb @@ -17,7 +17,7 @@ def test_toplevel_unclosed_comment parse("' Line: 1 Position: 4 Last 80 unconsumed characters: @@ -48,6 +48,18 @@ def test_toplevel_malformed_comment_end DETAIL end + def test_doctype_unclosed_comment + exception = assert_raise(REXML::ParseException) do + parse("' + Line: 1 + Position: 19 + Last 80 unconsumed characters: + DETAIL + end + def test_doctype_malformed_comment_inner exception = assert_raise(REXML::ParseException) do parse("") @@ -72,16 +84,15 @@ def test_doctype_malformed_comment_end DETAIL end - def test_after_doctype_malformed_comment_short + def test_after_doctype_unclosed_comment exception = assert_raise(REXML::ParseException) do parse("") end - assert_equal(<<~DETAIL.chomp, exception.to_s) - Malformed comment + assert_equal(<<~DETAIL, exception.to_s) + Unclosed comment: Missing end '-->' Line: 1 Position: 8 Last 80 unconsumed characters: - --> DETAIL end From a5f31c49be106011c4d96cb0e308ebbba118d192 Mon Sep 17 00:00:00 2001 From: NAITOH Jun Date: Wed, 5 Mar 2025 06:20:42 +0900 Subject: [PATCH 55/91] Improve CDATA and comment parse performance (#246) ## Why? Since `` are malformed node, they do not need to be checked before comments and CDATA. ## Benchmark : comment (after_doctype) ``` $ benchmark-driver benchmark/parse_comment.yaml Calculating ------------------------------------- before after before(YJIT) after(YJIT) after_doctype 1.306k 5.586k 1.152k 3.569k i/s - 100.000 times in 0.076563s 0.017903s 0.086822s 0.028020s Comparison: after_doctype after: 5585.7 i/s after(YJIT): 3568.9 i/s - 1.57x slower before: 1306.1 i/s - 4.28x slower before(YJIT): 1151.8 i/s - 4.85x slower ``` - YJIT=ON : 3.09x faster - YJIT=OFF : 4.28x faster ## Benchmark : CDATA ``` $ benchmark-driver benchmark/parse_cdata.yaml Calculating ------------------------------------- before after before(YJIT) after(YJIT) dom 1.269k 5.548k 1.053k 3.072k i/s - 100.000 times in 0.078808s 0.018026s 0.094976s 0.032553s sax 1.399k 8.244k 1.220k 4.460k i/s - 100.000 times in 0.071458s 0.012130s 0.081958s 0.022422s pull 1.411k 8.319k 1.260k 4.806k i/s - 100.000 times in 0.070883s 0.012021s 0.079335s 0.020809s stream 1.420k 8.320k 1.254k 4.728k i/s - 100.000 times in 0.070406s 0.012019s 0.079738s 0.021149s Comparison: dom after: 5547.5 i/s after(YJIT): 3071.9 i/s - 1.81x slower before: 1268.9 i/s - 4.37x slower before(YJIT): 1052.9 i/s - 5.27x slower sax after: 8244.0 i/s after(YJIT): 4459.9 i/s - 1.85x slower before: 1399.4 i/s - 5.89x slower before(YJIT): 1220.1 i/s - 6.76x slower pull after: 8318.8 i/s after(YJIT): 4805.6 i/s - 1.73x slower before: 1410.8 i/s - 5.90x slower before(YJIT): 1260.5 i/s - 6.60x slower stream after: 8320.2 i/s after(YJIT): 4728.4 i/s - 1.76x slower before: 1420.3 i/s - 5.86x slower before(YJIT): 1254.1 i/s - 6.63x slower ``` - YJIT=ON : 2.91x - 3.80x faster - YJIT=OFF : 4.37x - 5.90x faster Co-authored-by: Sutou Kouhei --- lib/rexml/parsers/baseparser.rb | 6 ++---- test/parse/test_comment.rb | 13 +++++++++++++ 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb index 61d38ae2..de85aebd 100644 --- a/lib/rexml/parsers/baseparser.rb +++ b/lib/rexml/parsers/baseparser.rb @@ -449,9 +449,7 @@ def pull_event end return [ :end_element, last_tag ] elsif @source.match?("!", true) - md = @source.match(/([^>]*>)/um) #STDERR.puts "SOURCE BUFFER = #{source.buffer}, #{source.buffer.size}" - raise REXML::ParseException.new("Malformed node", @source) unless md if @source.match?("--", true) return [ :comment, process_comment ] elsif @source.match?("[CDATA[", true) @@ -461,9 +459,9 @@ def pull_event else raise REXML::ParseException.new("Malformed CDATA: Missing end ']]>'", @source) end + else + raise REXML::ParseException.new("Malformed node: Started with '") From a85203e88c8f50f64140fb50492cf9dbe3d79301 Mon Sep 17 00:00:00 2001 From: NAITOH Jun Date: Wed, 5 Mar 2025 09:45:19 +0900 Subject: [PATCH 56/91] Raise appropriate exception when failing to match start tag in DOCTYPE (#247) ## Why? Added exception to make the process easier to understand. --- lib/rexml/parsers/baseparser.rb | 5 +++-- test/parse/test_comment.rb | 13 +++++++++++++ 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb index de85aebd..750b1697 100644 --- a/lib/rexml/parsers/baseparser.rb +++ b/lib/rexml/parsers/baseparser.rb @@ -412,14 +412,15 @@ def pull_event return [:notationdecl, name, *id] elsif @source.match?("--", true) return [ :comment, process_comment ] + else + raise REXML::ParseException.new("Malformed node: Started with '/um, true) @document_status = :after_doctype return [ :end_doctype ] - end - if @document_status == :in_doctype + else raise ParseException.new("Malformed DOCTYPE: invalid declaration", @source) end end diff --git a/test/parse/test_comment.rb b/test/parse/test_comment.rb index 5349c18e..6339835d 100644 --- a/test/parse/test_comment.rb +++ b/test/parse/test_comment.rb @@ -48,6 +48,19 @@ def test_toplevel_malformed_comment_end DETAIL end + def test_doctype_malformed_node + exception = assert_raise(REXML::ParseException) do + parse(" Date: Thu, 3 Apr 2025 03:45:35 -0400 Subject: [PATCH 57/91] Fix docs typo in code example (#248) --- lib/rexml/document.rb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/rexml/document.rb b/lib/rexml/document.rb index d1747dd4..1960012c 100644 --- a/lib/rexml/document.rb +++ b/lib/rexml/document.rb @@ -309,8 +309,8 @@ def stand_alone? end # :call-seq: - # doc.write(output=$stdout, indent=-1, transtive=false, ie_hack=false, encoding=nil) - # doc.write(options={:output => $stdout, :indent => -1, :transtive => false, :ie_hack => false, :encoding => nil}) + # doc.write(output=$stdout, indent=-1, transitive=false, ie_hack=false, encoding=nil) + # doc.write(options={:output => $stdout, :indent => -1, :transitive => false, :ie_hack => false, :encoding => nil}) # # Write the XML tree out, optionally with indent. This writes out the # entire XML document, including XML declarations, doctype declarations, From d944fa478a972febe9c3ad2cf35232223d391597 Mon Sep 17 00:00:00 2001 From: NAITOH Jun Date: Sat, 3 May 2025 09:03:12 +0900 Subject: [PATCH 58/91] NEWS.md : Fix the mentioned of the PR in CVE-2024-35176. (#253) I think the mentioned of CVE-2024-35176 in NEWS.md is incorrect. ``` - Improved parse performance when an attribute has many ' characters. --- NEWS.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index 51a45cab..7f95d829 100644 --- a/NEWS.md +++ b/NEWS.md @@ -386,7 +386,7 @@ * Patch by NAITOH Jun. - * Improved parse performance when an attribute has many `<`s. + * Improved parse performance when an attribute has many `>`s. * GH-126 From de6f40ed8749dd6ab4b7c4b80494a824f7f9027a Mon Sep 17 00:00:00 2001 From: tomoya ishida Date: Sat, 3 May 2025 09:21:27 +0900 Subject: [PATCH 59/91] Fix reverse sort in xpath_parser (#251) The code below was failing with `REXML::XPathParser#sort': undefined method '-@' for an instance of Array` ```ruby d = REXML::Document.new("") matches = REXML::XPath.match(d, "a/b/x/preceding-sibling::node()") # Before: error # After: [, , ] ``` This pull request will fix it. --- lib/rexml/xpath_parser.rb | 2 +- test/xpath/test_base.rb | 6 ++++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/lib/rexml/xpath_parser.rb b/lib/rexml/xpath_parser.rb index 5eb1e5a9..f86a87e6 100644 --- a/lib/rexml/xpath_parser.rb +++ b/lib/rexml/xpath_parser.rb @@ -671,7 +671,7 @@ def sort(array_of_nodes, order) if order == :forward index else - -index + index.map(&:-@) end end ordered.collect do |_index, node| diff --git a/test/xpath/test_base.rb b/test/xpath/test_base.rb index 1dacd69d..53264a9e 100644 --- a/test/xpath/test_base.rb +++ b/test/xpath/test_base.rb @@ -416,6 +416,12 @@ def test_preceding assert_equal( 4, cs.length ) end + def test_preceding_sibling + d = REXML::Document.new("") + matches = REXML::XPath.match(d, "a/b/x/preceding-sibling::node()") + assert_equal(["e", "d", "c"], matches.map(&:name)) + end + def test_following d = Document.new "" start = XPath.first( d, "/a/b[@id='0']" ) From 249d770b4ead129abf475708e84e3f1f7908962a Mon Sep 17 00:00:00 2001 From: NAITOH Jun Date: Tue, 6 May 2025 21:33:00 +0900 Subject: [PATCH 60/91] Fix duplicate responses in XPath following, following-sibling, preceding, preceding-sibling (#255) ## Why? See: https://github.com/ruby/rexml/pull/251#issuecomment-2845103143 ## Expected values - XPath : a/d/preceding::* => ["d", "c", "b"] ```xml ``` - XPath : a/d/following::* => ["d", "e", "f"] ```xml ``` - XPath : a/b/x/following-sibling:* => ["c", "d", "e"] ```xml ``` - XPath : a/b/x/following-sibling:* => ["c", "d", "x", "e"] ```xml ``` - XPath : a/b/x/preceding-sibling::* => ["e", "d", "c"] ```xml ``` - XPath : a/b/x/preceding-sibling::* => ["e", "x", "d", "c"] ```xml ``` - XPath : //a/following-sibling:*[1] => ["w", "x", "y", "z"] ```xml ``` --- lib/rexml/xpath_parser.rb | 2 +- test/xpath/test_base.rb | 97 +++++++++++++++++++++++++++++++++++++-- 2 files changed, 95 insertions(+), 4 deletions(-) diff --git a/lib/rexml/xpath_parser.rb b/lib/rexml/xpath_parser.rb index f86a87e6..cde2e5d5 100644 --- a/lib/rexml/xpath_parser.rb +++ b/lib/rexml/xpath_parser.rb @@ -144,7 +144,7 @@ def match(path_stack, nodeset) result = expr(path_stack, nodeset) case result when Array # nodeset - unnode(result) + unnode(result).uniq else [result] end diff --git a/test/xpath/test_base.rb b/test/xpath/test_base.rb index 53264a9e..b923eed2 100644 --- a/test/xpath/test_base.rb +++ b/test/xpath/test_base.rb @@ -416,12 +416,103 @@ def test_preceding assert_equal( 4, cs.length ) end - def test_preceding_sibling - d = REXML::Document.new("") - matches = REXML::XPath.match(d, "a/b/x/preceding-sibling::node()") + def test_preceding_multiple + source = <<-XML + + + + XML + doc = REXML::Document.new(source) + matches = REXML::XPath.match(doc, "a/d/preceding::*") + assert_equal(["d", "c", "b"], matches.map(&:name)) + end + + def test_following_multiple + source = <<-XML + + + + XML + doc = REXML::Document.new(source) + matches = REXML::XPath.match(doc, "a/d/following::*") + assert_equal(["d", "e", "f"], matches.map(&:name)) + end + + def test_following_sibling_across_multiple_nodes + source = <<-XML + + + + + + + + + XML + doc = REXML::Document.new(source) + matches = REXML::XPath.match(doc, "a/b/x/following-sibling::*") + assert_equal(["c", "d", "e"], matches.map(&:name)) + end + + def test_following_sibling_within_single_node + source = <<-XML + + + + + + XML + doc = REXML::Document.new(source) + matches = REXML::XPath.match(doc, "a/b/x/following-sibling::*") + assert_equal(["c", "d", "x", "e"], matches.map(&:name)) + end + + def test_following_sibling_predicates + source = <<-XML + + XML + doc = REXML::Document.new(source) + # Finds a node flowing + matches = REXML::XPath.match(doc, "//a/following-sibling::*[1]") + assert_equal(["w", "x", "y", "z"], matches.map(&:name)) + end + + def test_preceding_sibling_across_multiple_nodes + source = <<-XML + + + + + + + + + XML + doc = REXML::Document.new(source) + matches = REXML::XPath.match(doc, "a/b/x/preceding-sibling::*") assert_equal(["e", "d", "c"], matches.map(&:name)) end + def test_preceding_sibling_within_single_node + source = <<-XML + + + + + + XML + doc = REXML::Document.new(source) + matches = REXML::XPath.match(doc, "a/b/x/preceding-sibling::*") + assert_equal(["e", "x", "d", "c"], matches.map(&:name)) + end + def test_following d = Document.new "" start = XPath.first( d, "/a/b[@id='0']" ) From cd575a10cac58eb47f235ed186060ac65ffb5284 Mon Sep 17 00:00:00 2001 From: tomoya ishida Date: Wed, 7 May 2025 21:02:31 +0900 Subject: [PATCH 61/91] Deprecate accepting array as an element in XPath.match, first and each (#252) `XPath.match`, `XPath.first`, `XPath.each`, `XPathParser#parse` and `XPathParser#match` accepted nodeset as element. This pull request changes the first parameter of these method to be an element instead of nodeset. Passing nodeset will be deprecated. ```ruby # Documented usage. OK REXML::XPath.match(element, xpath) # Undocumented usage. Deprecate in this pull request nodeset = [element] REXML::XPath.match(nodeset, xpath) ``` ### Background #249 will introduce a temporary cache. ```ruby def parse path, nodeset path_stack = @parser.parse( path ) nodeset.first.document.send(:enable_cache) do match( path_stack, nodeset ) end end ``` But the signature `XPathParser#match(path, nodeset)` does not guarantee that all nodes in the nodeset has the same root document. So cache does not work in the code below. It's still slow. ```ruby REXML::XPath.match(2.times.map { REXML::Document.new(''*400+''*400) }, 'a//a') ``` The interface is holding our back, so I propose to drop accepting array as element. This change is a backward incompatibility, but it just drops undocumented feature. I think only the test code was unintentionally using this feature. ### XPath.match with array XPath.match only traverse the first element of the array for some selectors. ```ruby nodeset = [REXML::Document.new(""), REXML::Document.new("")] REXML::XPath.match(nodeset, "a/*") #=> [, ] REXML::XPath.match(nodeset, "//a/*") #=> [] # I expect [, ] but the second document is ignored ``` It indicates that `XPath.match` is not designed to search inside multiple nodes/documents. --------- Co-authored-by: Sutou Kouhei --- lib/rexml/xpath.rb | 3 --- lib/rexml/xpath_parser.rb | 22 ++++++++++++---------- test/test_jaxen.rb | 16 ++++++++++------ test/xpath/test_base.rb | 17 ++++++++++++++--- 4 files changed, 36 insertions(+), 22 deletions(-) diff --git a/lib/rexml/xpath.rb b/lib/rexml/xpath.rb index a0921bd8..666d764f 100644 --- a/lib/rexml/xpath.rb +++ b/lib/rexml/xpath.rb @@ -35,7 +35,6 @@ def XPath::first(element, path=nil, namespaces=nil, variables={}, options={}) parser.namespaces = namespaces parser.variables = variables path = "*" unless path - element = [element] unless element.kind_of? Array parser.parse(path, element).flatten[0] end @@ -64,7 +63,6 @@ def XPath::each(element, path=nil, namespaces=nil, variables={}, options={}, &bl parser.namespaces = namespaces parser.variables = variables path = "*" unless path - element = [element] unless element.kind_of? Array parser.parse(path, element).each( &block ) end @@ -74,7 +72,6 @@ def XPath::match(element, path=nil, namespaces=nil, variables={}, options={}) parser.namespaces = namespaces parser.variables = variables path = "*" unless path - element = [element] unless element.kind_of? Array parser.parse(path,element) end end diff --git a/lib/rexml/xpath_parser.rb b/lib/rexml/xpath_parser.rb index cde2e5d5..8440015b 100644 --- a/lib/rexml/xpath_parser.rb +++ b/lib/rexml/xpath_parser.rb @@ -76,19 +76,19 @@ def variables=( vars={} ) @variables = vars end - def parse path, nodeset + def parse path, node path_stack = @parser.parse( path ) - match( path_stack, nodeset ) + match( path_stack, node ) end - def get_first path, nodeset + def get_first path, node path_stack = @parser.parse( path ) - first( path_stack, nodeset ) + first( path_stack, node ) end - def predicate path, nodeset + def predicate path, node path_stack = @parser.parse( path ) - match( path_stack, nodeset ) + match( path_stack, node ) end def []=( variable_name, value ) @@ -136,11 +136,13 @@ def first( path_stack, node ) end - def match(path_stack, nodeset) - nodeset = nodeset.collect.with_index do |node, i| - position = i + 1 - XPathNode.new(node, position: position) + def match(path_stack, node) + if node.is_a?(Array) + Kernel.warn("REXML::XPath.each, REXML::XPath.first, REXML::XPath.match dropped support for nodeset...", uplevel: 1) + return [] if node.empty? + node = node.first end + nodeset = [XPathNode.new(node, position: 1)] result = expr(path_stack, nodeset) case result when Array # nodeset diff --git a/test/test_jaxen.rb b/test/test_jaxen.rb index 6038e88e..548120d6 100644 --- a/test/test_jaxen.rb +++ b/test/test_jaxen.rb @@ -56,7 +56,9 @@ def process_test_case(name) # processes a tests/document/context node def process_context(doc, context) - test_context = XPath.match(doc, context.attributes["select"]) + matched = XPath.match(doc, context.attributes["select"]) + assert_equal(1, matched.size) + test_context = matched.first namespaces = context.namespaces namespaces.delete("var") namespaces = nil if namespaces.empty? @@ -101,10 +103,14 @@ def process_nominal_test(context, variables, namespaces, test) assert_equal(Integer(expected, 10), matched.size, user_message(context, xpath, matched)) + else + assert_operator(matched.size, :>, 0, user_message(context, xpath, matched)) end XPath.each(test, "valueOf") do |value_of| - process_value_of(matched, variables, namespaces, value_of) + matched.each do |subcontext| + process_value_of(subcontext, variables, namespaces, value_of) + end end end @@ -118,10 +124,8 @@ def process_exceptional_test(context, variables, namespaces, test) def user_message(context, xpath, matched) message = "" - context.each_with_index do |node, i| - message << "Node#{i}:\n" - message << "#{node}\n" - end + message << "Node:\n" + message << "#{context}\n" message << "XPath: <#{xpath}>\n" message << "Matched <#{matched}>" message diff --git a/test/xpath/test_base.rb b/test/xpath/test_base.rb index b923eed2..ab22f6f9 100644 --- a/test/xpath/test_base.rb +++ b/test/xpath/test_base.rb @@ -411,9 +411,10 @@ def test_preceding s = "" d = REXML::Document.new(s) - c = REXML::XPath.match( d, "//c[@id = '5']") - cs = REXML::XPath.match( c, "preceding::c" ) - assert_equal( 4, cs.length ) + c = REXML::XPath.match(d, "//c[@id = '5']") + assert_equal(1, c.length) + cs = REXML::XPath.match(c.first, "preceding::c") + assert_equal(4, cs.length) end def test_preceding_multiple @@ -1255,5 +1256,15 @@ def test_or_and end assert_equal(["/"], hrefs, "Bug #3842 [ruby-core:32447]") end + + def test_match_with_deprecated_usage + verbose, $VERBOSE = $VERBOSE, nil + doc = Document.new("") + assert_equal(['b'], XPath.match([doc, doc], '//b').map(&:name)) + assert_equal(['b'], XPath.match([doc], '//b').map(&:name)) + assert_equal([], XPath.match([], '//b').map(&:name)) + ensure + $VERBOSE = verbose + end end end From e80ffdd12713cd138dbe33f26968452dc33d20df Mon Sep 17 00:00:00 2001 From: NAITOH Jun Date: Mon, 12 May 2025 10:22:11 +0900 Subject: [PATCH 62/91] Improve using `//` in XPath performance (#249) When using `//` in XPath, the deeper the tag hierarchy, the slower it becomes due to the namespace acquisition process. Caching namespace information improves performance when using `//` with XPath. ## Benchmark (Comparison with rexml 3.4.1) ``` $ benchmark-driver benchmark/xpath.yaml Calculating ------------------------------------- rexml 3.4.1 master 3.4.1(YJIT) master(YJIT) REXML::XPath.match(REXML::Document.new(xml), 'a//a') 29.215 234.909 108.945 492.410 i/s - 100.000 times in 3.422925s 0.425697s 0.917898s 0.203083s Comparison: REXML::XPath.match(REXML::Document.new(xml), 'a//a') master(YJIT): 492.4 i/s master: 234.9 i/s - 2.10x slower 3.4.1(YJIT): 108.9 i/s - 4.52x slower rexml 3.4.1: 29.2 i/s - 16.85x slower ``` - YJIT=ON : 4.52x faster - YJIT=OFF : 8.04x faster --------- Co-authored-by: tomoya ishida Co-authored-by: Sutou Kouhei --- benchmark/xpath.yaml | 32 ++++++++++++++++++++++++++++++++ lib/rexml/attribute.rb | 4 ++++ lib/rexml/document.rb | 14 ++++++++++++++ lib/rexml/element.rb | 33 +++++++++++++++++---------------- lib/rexml/xpath_parser.rb | 27 ++++++++++++--------------- test/test_core.rb | 23 +++++++++++++++++------ test/xpath/test_base.rb | 10 ++++++++++ 7 files changed, 106 insertions(+), 37 deletions(-) create mode 100644 benchmark/xpath.yaml diff --git a/benchmark/xpath.yaml b/benchmark/xpath.yaml new file mode 100644 index 00000000..d6e970eb --- /dev/null +++ b/benchmark/xpath.yaml @@ -0,0 +1,32 @@ +loop_count: 100 +contexts: + - gems: + rexml: 3.2.6 + require: false + prelude: require 'rexml' + - name: master + prelude: | + $LOAD_PATH.unshift(File.expand_path("lib")) + require 'rexml' + - name: 3.2.6(YJIT) + gems: + rexml: 3.2.6 + require: false + prelude: | + require 'rexml' + RubyVM::YJIT.enable + - name: master(YJIT) + prelude: | + $LOAD_PATH.unshift(File.expand_path("lib")) + require 'rexml' + RubyVM::YJIT.enable + +prelude: | + require 'rexml/document' + + DEPTH = 100 + xml = '' * DEPTH + '' * DEPTH + doc = REXML::Document.new(xml) + +benchmark: + "REXML::XPath.match(REXML::Document.new(xml), 'a//a')" : REXML::XPath.match(doc, "a//a") diff --git a/lib/rexml/attribute.rb b/lib/rexml/attribute.rb index fe48745c..7a190225 100644 --- a/lib/rexml/attribute.rb +++ b/lib/rexml/attribute.rb @@ -206,6 +206,10 @@ def xpath path += "/@#{self.expanded_name}" return path end + + def document + @element&.document + end end end #vim:ts=2 sw=2 noexpandtab: diff --git a/lib/rexml/document.rb b/lib/rexml/document.rb index 1960012c..1c678bef 100644 --- a/lib/rexml/document.rb +++ b/lib/rexml/document.rb @@ -448,6 +448,20 @@ def document end private + + attr_accessor :namespaces_cache + + # New document level cache is created and available in this block. + # This API is thread unsafe. Users can't change this document in this block. + def enable_cache + @namespaces_cache = {} + begin + yield + ensure + @namespaces_cache = nil + end + end + def build( source ) Parsers::TreeParser.new( source, self ).parse end diff --git a/lib/rexml/element.rb b/lib/rexml/element.rb index 4e3a60b9..b62b6cc2 100644 --- a/lib/rexml/element.rb +++ b/lib/rexml/element.rb @@ -589,10 +589,12 @@ def prefixes # d.elements['//c'].namespaces # => {"x"=>"1", "y"=>"2", "z"=>"3"} # def namespaces - namespaces = {} - namespaces = parent.namespaces if parent - namespaces = namespaces.merge( attributes.namespaces ) - return namespaces + namespaces_cache = document&.__send__(:namespaces_cache) + if namespaces_cache + namespaces_cache[self] ||= calculate_namespaces + else + calculate_namespaces + end end # :call-seq: @@ -619,17 +621,9 @@ def namespace(prefix=nil) if prefix.nil? prefix = prefix() end - if prefix == '' - prefix = "xmlns" - else - prefix = "xmlns:#{prefix}" unless prefix[0,5] == 'xmlns' - end - ns = nil - target = self - while ns.nil? and target - ns = target.attributes[prefix] - target = target.parent - end + prefix = (prefix == '') ? 'xmlns' : prefix.delete_prefix("xmlns:") + ns = namespaces[prefix] + ns = '' if ns.nil? and prefix == 'xmlns' return ns end @@ -1516,8 +1510,15 @@ def write(output=$stdout, indent=-1, transitive=false, ie_hack=false) formatter.write( self, output ) end - private + def calculate_namespaces + if parent + parent.namespaces.merge(attributes.namespaces) + else + attributes.namespaces + end + end + def __to_xpath_helper node rv = node.expanded_name.clone if node.parent diff --git a/lib/rexml/xpath_parser.rb b/lib/rexml/xpath_parser.rb index 8440015b..70ae8919 100644 --- a/lib/rexml/xpath_parser.rb +++ b/lib/rexml/xpath_parser.rb @@ -78,7 +78,15 @@ def variables=( vars={} ) def parse path, node path_stack = @parser.parse( path ) - match( path_stack, node ) + if node.is_a?(Array) + Kernel.warn("REXML::XPath.each, REXML::XPath.first, REXML::XPath.match dropped support for nodeset...", uplevel: 1) + return [] if node.empty? + node = node.first + end + + node.document.__send__(:enable_cache) do + match( path_stack, node ) + end end def get_first path, node @@ -137,11 +145,6 @@ def first( path_stack, node ) def match(path_stack, node) - if node.is_a?(Array) - Kernel.warn("REXML::XPath.each, REXML::XPath.first, REXML::XPath.match dropped support for nodeset...", uplevel: 1) - return [] if node.empty? - node = node.first - end nodeset = [XPathNode.new(node, position: 1)] result = expr(path_stack, nodeset) case result @@ -494,14 +497,10 @@ def node_test(path_stack, nodesets, any_type: :element) if strict? raw_node.name == name and raw_node.namespace == "" else - # FIXME: This DOUBLES the time XPath searches take - ns = get_namespace(raw_node, prefix) - raw_node.name == name and raw_node.namespace == ns + raw_node.name == name and raw_node.namespace == get_namespace(raw_node, prefix) end else - # FIXME: This DOUBLES the time XPath searches take - ns = get_namespace(raw_node, prefix) - raw_node.name == name and raw_node.namespace == ns + raw_node.name == name and raw_node.namespace == get_namespace(raw_node, prefix) end when :attribute if prefix.nil? @@ -509,9 +508,7 @@ def node_test(path_stack, nodesets, any_type: :element) elsif prefix.empty? raw_node.name == name and raw_node.namespace == "" else - # FIXME: This DOUBLES the time XPath searches take - ns = get_namespace(raw_node.element, prefix) - raw_node.name == name and raw_node.namespace == ns + raw_node.name == name and raw_node.namespace == get_namespace(raw_node.element, prefix) end else false diff --git a/test/test_core.rb b/test/test_core.rb index 34fe9e07..651056f2 100644 --- a/test/test_core.rb +++ b/test/test_core.rb @@ -653,18 +653,23 @@ def test_namespace assert_equal "Some text", out end - def test_add_namespace e = Element.new 'a' + assert_equal("", e.namespace) + assert_nil(e.namespace('foo')) e.add_namespace 'someuri' e.add_namespace 'foo', 'otheruri' e.add_namespace 'xmlns:bar', 'thirduri' - assert_equal 'someuri', e.attributes['xmlns'] - assert_equal 'otheruri', e.attributes['xmlns:foo'] - assert_equal 'thirduri', e.attributes['xmlns:bar'] + assert_equal("someuri", e.namespace) + assert_equal("otheruri", e.namespace('foo')) + assert_equal("otheruri", e.namespace('xmlns:foo')) + assert_equal("thirduri", e.namespace('bar')) + assert_equal("thirduri", e.namespace('xmlns:bar')) + assert_equal('someuri', e.attributes['xmlns']) + assert_equal('otheruri', e.attributes['xmlns:foo']) + assert_equal('thirduri', e.attributes['xmlns:bar']) end - def test_big_documentation d = File.open(fixture_path("documentation.xml")) {|f| Document.new f } assert_equal "Sean Russell", d.elements["documentation/head/author"].text.tr("\n\t", " ").squeeze(" ") @@ -764,9 +769,15 @@ def test_attributes_each def test_delete_namespace doc = Document.new "" + assert_equal("1", doc.root.namespace) + assert_equal("2", doc.root.namespace('x')) + assert_equal("2", doc.root.namespace('xmlns:x')) doc.root.delete_namespace doc.root.delete_namespace 'x' - assert_equal "", doc.to_s + assert_equal("", doc.to_s) + assert_equal("", doc.root.namespace) + assert_nil(doc.root.namespace('x')) + assert_nil(doc.root.namespace('xmlns:x')) end def test_each_element_with_attribute diff --git a/test/xpath/test_base.rb b/test/xpath/test_base.rb index ab22f6f9..764171ab 100644 --- a/test/xpath/test_base.rb +++ b/test/xpath/test_base.rb @@ -1193,6 +1193,16 @@ def test_namespaces_0 assert_equal( 1, XPath.match( d, "//x:*" ).size ) end + def test_namespaces_cache + doc = Document.new("") + assert_equal("", XPath.first(doc, "//b[namespace-uri()='1']").to_s) + assert_nil(XPath.first(doc, "//b[namespace-uri()='']")) + + doc.root.delete_namespace + assert_nil(XPath.first(doc, "//b[namespace-uri()='1']")) + assert_equal("", XPath.first(doc, "//b[namespace-uri()='']").to_s) + end + def test_ticket_71 doc = Document.new(%Q{}) el = doc.root.elements[1] From 3dc9eca877f8444b7ac1d6008feb724cbfdc239a Mon Sep 17 00:00:00 2001 From: NAITOH Jun Date: Thu, 29 May 2025 10:14:32 +0900 Subject: [PATCH 63/91] Improve `Text.check` performance (#256) The doctype parameter of Text.check is not being used. Changing the doctype parameter to an optional parameter improves the parsing speed of the DOM. ## Benchmark ``` before after before(YJIT) after(YJIT) dom 19.854 23.805 33.969 37.712 i/s - 100.000 times in 5.036779s 4.200839s 2.943877s 2.651709s sax 29.436 30.494 54.070 55.089 i/s - 100.000 times in 3.397155s 3.279348s 1.849463s 1.815255s pull 34.908 34.857 62.969 64.895 i/s - 100.000 times in 2.864651s 2.868842s 1.588082s 1.540939s stream 34.570 34.281 60.616 60.355 i/s - 100.000 times in 2.892656s 2.917080s 1.649737s 1.656866s Comparison: dom after(YJIT): 37.7 i/s before(YJIT): 34.0 i/s - 1.11x slower after: 23.8 i/s - 1.58x slower before: 19.9 i/s - 1.90x slower sax after(YJIT): 55.1 i/s before(YJIT): 54.1 i/s - 1.02x slower after: 30.5 i/s - 1.81x slower before: 29.4 i/s - 1.87x slower pull after(YJIT): 64.9 i/s before(YJIT): 63.0 i/s - 1.03x slower before: 34.9 i/s - 1.86x slower after: 34.9 i/s - 1.86x slower stream before(YJIT): 60.6 i/s after(YJIT): 60.4 i/s - 1.00x slower before: 34.6 i/s - 1.75x slower after: 34.3 i/s - 1.77x slower ``` - YJIT=ON : 1.00x - 1.11x faster (dom: 1.11x faster) - YJIT=OFF : 1.00x - 1.20x faster (dom: 1.20x faster) --- lib/rexml/attribute.rb | 2 +- lib/rexml/text.rb | 6 +++--- test/test_text_check.rb | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/lib/rexml/attribute.rb b/lib/rexml/attribute.rb index 7a190225..ba49207c 100644 --- a/lib/rexml/attribute.rb +++ b/lib/rexml/attribute.rb @@ -173,7 +173,7 @@ def element=( element ) @element = element if @normalized - Text.check( @normalized, NEEDS_A_SECOND_CHECK, doctype ) + Text.check( @normalized, NEEDS_A_SECOND_CHECK ) end self diff --git a/lib/rexml/text.rb b/lib/rexml/text.rb index 2bf480fb..6f821472 100644 --- a/lib/rexml/text.rb +++ b/lib/rexml/text.rb @@ -104,16 +104,16 @@ def initialize(arg, respect_whitespace=false, parent=nil, raw=nil, @entity_filter = entity_filter if entity_filter clear_cache - Text.check(@string, illegal, doctype) if @raw + Text.check(@string, illegal) if @raw end def parent= parent super(parent) - Text.check(@string, NEEDS_A_SECOND_CHECK, doctype) if @raw and @parent + Text.check(@string, NEEDS_A_SECOND_CHECK) if @raw and @parent end # check for illegal characters - def Text.check string, pattern, doctype + def Text.check string, pattern, doctype = nil # illegal anywhere if !string.match?(VALID_XML_CHARS) diff --git a/test/test_text_check.rb b/test/test_text_check.rb index 11cf65a3..3f2f7864 100644 --- a/test/test_text_check.rb +++ b/test/test_text_check.rb @@ -4,7 +4,7 @@ module REXMLTests class TextCheckTester < Test::Unit::TestCase def check(string) - REXML::Text.check(string, REXML::Text::NEEDS_A_SECOND_CHECK, nil) + REXML::Text.check(string, REXML::Text::NEEDS_A_SECOND_CHECK) end def assert_check(string) From 95b8ef8d8549eb98763477e6e5307bf97c1dc4c5 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Mon, 7 Jul 2025 14:16:56 +0900 Subject: [PATCH 64/91] Fix wrong Encoding resolution (#258) In this context, `Encoding` means `REXML::Encoding` not `Encoding`. --- lib/rexml/encoding.rb | 2 +- test/test_source.rb | 21 +++++++++++++++++++++ 2 files changed, 22 insertions(+), 1 deletion(-) create mode 100644 test/test_source.rb diff --git a/lib/rexml/encoding.rb b/lib/rexml/encoding.rb index da2d70d6..f8459316 100644 --- a/lib/rexml/encoding.rb +++ b/lib/rexml/encoding.rb @@ -5,7 +5,7 @@ module Encoding # ID ---> Encoding name attr_reader :encoding def encoding=(encoding) - encoding = encoding.name if encoding.is_a?(Encoding) + encoding = encoding.name if encoding.is_a?(::Encoding) if encoding.is_a?(String) original_encoding = encoding encoding = find_encoding(encoding) diff --git a/test/test_source.rb b/test/test_source.rb new file mode 100644 index 00000000..b309105a --- /dev/null +++ b/test/test_source.rb @@ -0,0 +1,21 @@ +require "rexml/source" + +module REXMLTests + class TestSource < Test::Unit::TestCase + def setup + @source = REXML::Source.new(+"") + end + + sub_test_case("#encoding=") do + test("String") do + @source.encoding = "UTF-8" + assert_equal("UTF-8", @source.encoding) + end + + test("Encoding") do + @source.encoding = Encoding::UTF_8 + assert_equal("UTF-8", @source.encoding) + end + end + end +end From 548172637b8eb106ea38f3b91f54d0fc2e6e8e08 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Wed, 9 Jul 2025 06:14:08 +0900 Subject: [PATCH 65/91] Don't call needless encoding_updated (#259) Needless encoding_updated call may have performance penalty a bit. --- lib/rexml/encoding.rb | 7 ++----- test/test_source.rb | 15 +++++++++++++++ 2 files changed, 17 insertions(+), 5 deletions(-) diff --git a/lib/rexml/encoding.rb b/lib/rexml/encoding.rb index f8459316..7eb05f4d 100644 --- a/lib/rexml/encoding.rb +++ b/lib/rexml/encoding.rb @@ -13,12 +13,9 @@ def encoding=(encoding) raise ArgumentError, "Bad encoding name #{original_encoding}" end end + encoding = encoding.upcase if encoding return false if defined?(@encoding) and encoding == @encoding - if encoding - @encoding = encoding.upcase - else - @encoding = 'UTF-8' - end + @encoding = encoding || "UTF-8" true end diff --git a/test/test_source.rb b/test/test_source.rb index b309105a..86755f37 100644 --- a/test/test_source.rb +++ b/test/test_source.rb @@ -12,6 +12,21 @@ def setup assert_equal("UTF-8", @source.encoding) end + test("encoding_updated") do + def @source.n_encoding_updated_called + @n_encoding_updated_called + end + def @source.encoding_updated + super + @n_encoding_updated_called ||= 0 + @n_encoding_updated_called += 1 + end + @source.encoding = "shift-jis" + assert_equal(1, @source.n_encoding_updated_called) + @source.encoding = "Shift-JIS" + assert_equal(1, @source.n_encoding_updated_called) + end + test("Encoding") do @source.encoding = Encoding::UTF_8 assert_equal("UTF-8", @source.encoding) From ec410a0d5e5e5daddca82fd1455824219403f676 Mon Sep 17 00:00:00 2001 From: "|7eter l-|. l3oling" Date: Wed, 9 Jul 2025 07:06:45 +0700 Subject: [PATCH 66/91] Reuse XPath.match (#263) `XPath.each` and `XPath.first` can reuse `XPath.match`. --- lib/rexml/xpath.rb | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/lib/rexml/xpath.rb b/lib/rexml/xpath.rb index 666d764f..eed0300c 100644 --- a/lib/rexml/xpath.rb +++ b/lib/rexml/xpath.rb @@ -31,11 +31,7 @@ class XPath def XPath::first(element, path=nil, namespaces=nil, variables={}, options={}) raise "The namespaces argument, if supplied, must be a hash object." unless namespaces.nil? or namespaces.kind_of?(Hash) raise "The variables argument, if supplied, must be a hash object." unless variables.kind_of?(Hash) - parser = XPathParser.new(**options) - parser.namespaces = namespaces - parser.variables = variables - path = "*" unless path - parser.parse(path, element).flatten[0] + match(element, path, namespaces, variables, options).flatten[0] end # Iterates over nodes that match the given path, calling the supplied @@ -59,11 +55,7 @@ def XPath::first(element, path=nil, namespaces=nil, variables={}, options={}) def XPath::each(element, path=nil, namespaces=nil, variables={}, options={}, &block) raise "The namespaces argument, if supplied, must be a hash object." unless namespaces.nil? or namespaces.kind_of?(Hash) raise "The variables argument, if supplied, must be a hash object." unless variables.kind_of?(Hash) - parser = XPathParser.new(**options) - parser.namespaces = namespaces - parser.variables = variables - path = "*" unless path - parser.parse(path, element).each( &block ) + match(element, path, namespaces, variables, options).each( &block ) end # Returns an array of nodes matching a given XPath. From 2271fd374403bcdfb0b9f288cc0d97c92af9d886 Mon Sep 17 00:00:00 2001 From: "|7eter l-|. l3oling" Date: Wed, 9 Jul 2025 07:18:20 +0700 Subject: [PATCH 67/91] docs: Use # to reference instance methods (#270) Fixes #269 We should use `XXX#method` not `XXX.method` to reference instance methods. --- lib/rexml/cdata.rb | 2 +- lib/rexml/comment.rb | 2 +- lib/rexml/element.rb | 2 +- lib/rexml/instruction.rb | 2 +- lib/rexml/node.rb | 2 +- lib/rexml/text.rb | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/lib/rexml/cdata.rb b/lib/rexml/cdata.rb index 997f5a08..264ad642 100644 --- a/lib/rexml/cdata.rb +++ b/lib/rexml/cdata.rb @@ -58,7 +58,7 @@ def value # c = CData.new( " Some text " ) # c.write( $stdout ) #-> def write( output=$stdout, indent=-1, transitive=false, ie_hack=false ) - Kernel.warn( "#{self.class.name}.write is deprecated", uplevel: 1) + Kernel.warn( "#{self.class.name}#write is deprecated", uplevel: 1) indent( output, indent ) output << START output << @string diff --git a/lib/rexml/comment.rb b/lib/rexml/comment.rb index 52c58b46..e7e104d4 100644 --- a/lib/rexml/comment.rb +++ b/lib/rexml/comment.rb @@ -48,7 +48,7 @@ def clone # ie_hack:: # Needed for conformity to the child API, but not used by this class. def write( output, indent=-1, transitive=false, ie_hack=false ) - Kernel.warn("Comment.write is deprecated. See REXML::Formatters", uplevel: 1) + Kernel.warn("#{self.class.name}#write is deprecated. See REXML::Formatters", uplevel: 1) indent( output, indent ) output << START output << @string diff --git a/lib/rexml/element.rb b/lib/rexml/element.rb index b62b6cc2..4311d58f 100644 --- a/lib/rexml/element.rb +++ b/lib/rexml/element.rb @@ -1496,7 +1496,7 @@ def texts # doc.write( out ) #-> doc is written to the string 'out' # doc.write( $stdout ) #-> doc written to the console def write(output=$stdout, indent=-1, transitive=false, ie_hack=false) - Kernel.warn("#{self.class.name}.write is deprecated. See REXML::Formatters", uplevel: 1) + Kernel.warn("#{self.class.name}#write is deprecated. See REXML::Formatters", uplevel: 1) formatter = if indent > -1 if transitive require_relative "formatters/transitive" diff --git a/lib/rexml/instruction.rb b/lib/rexml/instruction.rb index 318741f0..a3dfbbec 100644 --- a/lib/rexml/instruction.rb +++ b/lib/rexml/instruction.rb @@ -49,7 +49,7 @@ def clone # See the rexml/formatters package # def write writer, indent=-1, transitive=false, ie_hack=false - Kernel.warn( "#{self.class.name}.write is deprecated", uplevel: 1) + Kernel.warn( "#{self.class.name}#write is deprecated", uplevel: 1) indent(writer, indent) writer << START writer << @target diff --git a/lib/rexml/node.rb b/lib/rexml/node.rb index c771db70..033b740d 100644 --- a/lib/rexml/node.rb +++ b/lib/rexml/node.rb @@ -26,7 +26,7 @@ def previous_sibling_node # REXML::Formatters package for changing the output style. def to_s indent=nil unless indent.nil? - Kernel.warn( "#{self.class.name}.to_s(indent) parameter is deprecated", uplevel: 1) + Kernel.warn( "#{self.class.name}#to_s(indent) parameter is deprecated", uplevel: 1) f = REXML::Formatters::Pretty.new( indent ) f.write( self, rv = "" ) else diff --git a/lib/rexml/text.rb b/lib/rexml/text.rb index 6f821472..e03ce9d1 100644 --- a/lib/rexml/text.rb +++ b/lib/rexml/text.rb @@ -287,7 +287,7 @@ def indent_text(string, level=1, style="\t", indentfirstline=true) # See REXML::Formatters # def write( writer, indent=-1, transitive=false, ie_hack=false ) - Kernel.warn("#{self.class.name}.write is deprecated. See REXML::Formatters", uplevel: 1) + Kernel.warn("#{self.class.name}#write is deprecated. See REXML::Formatters", uplevel: 1) formatter = if indent > -1 REXML::Formatters::Pretty.new( indent ) else From d427fc5914fcc17d7247c5ff9099ee38639d6702 Mon Sep 17 00:00:00 2001 From: "|7eter l-|. l3oling" Date: Wed, 9 Jul 2025 07:20:21 +0700 Subject: [PATCH 68/91] Avoid redundant calls for doctype (#264) We can avoid calling `Document#doctype` by keeping `Document#doctype` result in a local variable. --- lib/rexml/element.rb | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/lib/rexml/element.rb b/lib/rexml/element.rb index 4311d58f..1c580577 100644 --- a/lib/rexml/element.rb +++ b/lib/rexml/element.rb @@ -2325,11 +2325,11 @@ def get_attribute( name ) return attr end end - element_document = @element.document - if element_document and element_document.doctype + doctype = @element.document&.doctype + if doctype expn = @element.expanded_name - expn = element_document.doctype.name if expn.size == 0 - attr_val = element_document.doctype.attribute_of(expn, name) + expn = doctype.name if expn.size == 0 + attr_val = doctype.attribute_of(expn, name) return Attribute.new( name, attr_val ) if attr_val end return nil @@ -2371,8 +2371,9 @@ def []=( name, value ) end unless value.kind_of? Attribute - if @element.document and @element.document.doctype - value = Text::normalize( value, @element.document.doctype ) + doctype = @element.document&.doctype + if doctype + value = Text::normalize( value, doctype ) else value = Text::normalize( value, nil ) end @@ -2409,10 +2410,11 @@ def prefixes each_attribute do |attribute| ns << attribute.name if attribute.prefix == 'xmlns' end - if @element.document and @element.document.doctype + doctype = @element.document&.doctype + if doctype expn = @element.expanded_name - expn = @element.document.doctype.name if expn.size == 0 - @element.document.doctype.attributes_of(expn).each { + expn = doctype.name if expn.size == 0 + doctype.attributes_of(expn).each { |attribute| ns << attribute.name if attribute.prefix == 'xmlns' } @@ -2434,10 +2436,11 @@ def namespaces each_attribute do |attribute| namespaces[attribute.name] = attribute.value if attribute.prefix == 'xmlns' or attribute.name == 'xmlns' end - if @element.document and @element.document.doctype + doctype = @element.document&.doctype + if doctype expn = @element.expanded_name - expn = @element.document.doctype.name if expn.size == 0 - @element.document.doctype.attributes_of(expn).each { + expn = doctype.name if expn.size == 0 + doctype.attributes_of(expn).each { |attribute| namespaces[attribute.name] = attribute.value if attribute.prefix == 'xmlns' or attribute.name == 'xmlns' } From 63f3e9772595a64b036953f0ab026d2ea5560a3b Mon Sep 17 00:00:00 2001 From: "|7eter l-|. l3oling" Date: Wed, 9 Jul 2025 07:21:10 +0700 Subject: [PATCH 69/91] Use Safe Navigation (&.) from Ruby 2.3 (#265) We can simplify our code by using `&.`. --- lib/rexml/attribute.rb | 5 +---- lib/rexml/child.rb | 3 +-- lib/rexml/doctype.rb | 11 +++-------- lib/rexml/element.rb | 3 +-- lib/rexml/text.rb | 5 +---- 5 files changed, 7 insertions(+), 20 deletions(-) diff --git a/lib/rexml/attribute.rb b/lib/rexml/attribute.rb index ba49207c..1326563a 100644 --- a/lib/rexml/attribute.rb +++ b/lib/rexml/attribute.rb @@ -130,10 +130,7 @@ def to_string end def doctype - if @element - doc = @element.document - doc.doctype if doc - end + @element&.document&.doctype end # Returns the attribute value, with entities replaced diff --git a/lib/rexml/child.rb b/lib/rexml/child.rb index cc6e9a47..40abde87 100644 --- a/lib/rexml/child.rb +++ b/lib/rexml/child.rb @@ -83,8 +83,7 @@ def previous_sibling=(other) # Returns:: the document this child belongs to, or nil if this child # belongs to no document def document - return parent.document unless parent.nil? - nil + parent&.document end # This doesn't yet handle encodings diff --git a/lib/rexml/doctype.rb b/lib/rexml/doctype.rb index f3590484..a9cf9f7e 100644 --- a/lib/rexml/doctype.rb +++ b/lib/rexml/doctype.rb @@ -171,15 +171,11 @@ def write( output, indent=0, transitive=false, ie_hack=false ) end def context - if @parent - @parent.context - else - nil - end + @parent&.context end def entity( name ) - @entities[name].unnormalized if @entities[name] + @entities[name]&.unnormalized end def add child @@ -288,8 +284,7 @@ def initialize name, middle, pub, sys end def to_s - context = nil - context = parent.context if parent + context = parent&.context notation = "( other ) end def doctype - if @parent - doc = @parent.document - doc.doctype if doc - end + @parent&.document&.doctype end REFERENCE = /#{Entity::REFERENCE}/ From 66232eaf680d0937ae59bea285cdb8e4d3d88a93 Mon Sep 17 00:00:00 2001 From: "|7eter l-|. l3oling" Date: Wed, 9 Jul 2025 08:12:02 +0700 Subject: [PATCH 70/91] Remove redundant return statements (#266) Very slight behavior change here in `REXML::Valdiation::Event#matches?`, which is to align the predicate method's return value with the expected behavior of a predicate method (which is to return one of true or false). --- lib/rexml/attribute.rb | 4 +-- lib/rexml/document.rb | 4 +-- lib/rexml/element.rb | 47 +++++++++++--------------- lib/rexml/functions.rb | 6 ++-- lib/rexml/namespace.rb | 8 ++--- lib/rexml/node.rb | 2 +- lib/rexml/parsers/baseparser.rb | 7 ++-- lib/rexml/parsers/xpathparser.rb | 8 ++--- lib/rexml/quickpath.rb | 37 +++++++++++---------- lib/rexml/security.rb | 4 +-- lib/rexml/text.rb | 14 ++++---- lib/rexml/validation/relaxng.rb | 53 +++++++++++++++--------------- lib/rexml/validation/validation.rb | 16 ++++----- lib/rexml/xpath_parser.rb | 38 ++++++++++----------- 14 files changed, 118 insertions(+), 130 deletions(-) diff --git a/lib/rexml/attribute.rb b/lib/rexml/attribute.rb index 1326563a..c5673249 100644 --- a/lib/rexml/attribute.rb +++ b/lib/rexml/attribute.rb @@ -199,9 +199,7 @@ def inspect end def xpath - path = @element.xpath - path += "/@#{self.expanded_name}" - return path + @element.xpath + "/@#{self.expanded_name}" end def document diff --git a/lib/rexml/document.rb b/lib/rexml/document.rb index 1c678bef..96ae5b75 100644 --- a/lib/rexml/document.rb +++ b/lib/rexml/document.rb @@ -415,7 +415,7 @@ def Document::entity_expansion_limit=( val ) # # Deprecated. Use REXML::Security.entity_expansion_limit= instead. def Document::entity_expansion_limit - return Security.entity_expansion_limit + Security.entity_expansion_limit end # Set the entity expansion limit. By default the limit is set to 10240. @@ -429,7 +429,7 @@ def Document::entity_expansion_text_limit=( val ) # # Deprecated. Use REXML::Security.entity_expansion_text_limit instead. def Document::entity_expansion_text_limit - return Security.entity_expansion_text_limit + Security.entity_expansion_text_limit end attr_reader :entity_expansion_count diff --git a/lib/rexml/element.rb b/lib/rexml/element.rb index e9ca684e..0d74811e 100644 --- a/lib/rexml/element.rb +++ b/lib/rexml/element.rb @@ -565,7 +565,7 @@ def prefixes prefixes = [] prefixes = parent.prefixes if parent prefixes |= attributes.prefixes - return prefixes + prefixes end # :call-seq: @@ -624,7 +624,7 @@ def namespace(prefix=nil) ns = namespaces[prefix] ns = '' if ns.nil? and prefix == 'xmlns' - return ns + ns end # :call-seq: @@ -956,7 +956,7 @@ def get_elements( xpath ) def next_element element = next_sibling element = element.next_sibling until element.nil? or element.kind_of? Element - return element + element end # :call-seq: @@ -972,7 +972,7 @@ def next_element def previous_element element = previous_sibling element = element.previous_sibling until element.nil? or element.kind_of? Element - return element + element end @@ -1022,8 +1022,7 @@ def has_text? # def text( path = nil ) rv = get_text(path) - return rv.value unless rv.nil? - nil + rv&.value end # :call-seq: @@ -1051,7 +1050,7 @@ def get_text path = nil else rv = @children.find { |node| node.kind_of? Text } end - return rv + rv end # :call-seq: @@ -1095,7 +1094,7 @@ def text=( text ) old_text.replace_with( text ) end end - return self + self end # :call-seq: @@ -1146,7 +1145,7 @@ def add_text( text ) text = Text.new( text, whitespace(), nil, raw() ) end self << text unless text.nil? - return self + self end # :call-seq: @@ -1190,7 +1189,7 @@ def xpath cur = cur.parent path_elements << __to_xpath_helper( cur ) end - return path_elements.reverse.join( "/" ) + path_elements.reverse.join( "/" ) end ################################################# @@ -1292,7 +1291,6 @@ def attribute( name, namespace=nil ) return nil unless ( namespaces[ prefix ] == namespaces[ 'xmlns' ] ) attributes.get_attribute( name ) - end # :call-seq: @@ -1306,7 +1304,7 @@ def attribute( name, namespace=nil ) # b.has_attributes? # => false # def has_attributes? - return !@attributes.empty? + !@attributes.empty? end # :call-seq: @@ -1684,11 +1682,7 @@ def []( index, name=nil) (num += 1) == index } else - return XPath::first( @element, index ) - #{ |element| - # return element if element.kind_of? Element - #} - #return nil + XPath::first( @element, index ) end end @@ -1735,7 +1729,7 @@ def []=( index, element ) else previous.replace_with element end - return previous + previous end # :call-seq: @@ -1774,7 +1768,7 @@ def index element child == element end return rv if found == element - return -1 + -1 end # :call-seq: @@ -1853,7 +1847,7 @@ def delete_all( xpath ) @element.delete element element.remove end - return rv + rv end # :call-seq: @@ -2180,8 +2174,7 @@ def initialize element # def [](name) attr = get_attribute(name) - return attr.value unless attr.nil? - return nil + attr&.value end # :call-seq: @@ -2336,7 +2329,7 @@ def get_attribute( name ) if attr.kind_of? Hash attr = attr[ @element.prefix ] end - return attr + attr end # :call-seq: @@ -2390,7 +2383,7 @@ def []=( name, value ) else store value.name, value end - return @element + @element end # :call-seq: @@ -2494,9 +2487,7 @@ def delete( attribute ) old.each_value{|v| repl = v} store name, repl end - elsif old.nil? - return @element - else # the supplied attribute is a top-level one + elsif old # the supplied attribute is a top-level one super(name) end @element @@ -2550,7 +2541,7 @@ def delete_all( name ) rv << attribute if attribute.expanded_name == name } rv.each{ |attr| attr.remove } - return rv + rv end # :call-seq: diff --git a/lib/rexml/functions.rb b/lib/rexml/functions.rb index 4c114616..60ae34e7 100644 --- a/lib/rexml/functions.rb +++ b/lib/rexml/functions.rb @@ -39,11 +39,11 @@ def Functions::context=(value); @@context = value; end def Functions::text( ) if @@context[:node].node_type == :element - return @@context[:node].find_all{|n| n.node_type == :text}.collect{|n| n.value} + @@context[:node].find_all{|n| n.node_type == :text}.collect{|n| n.value} elsif @@context[:node].node_type == :text - return @@context[:node].value + @@context[:node].value else - return false + false end end diff --git a/lib/rexml/namespace.rb b/lib/rexml/namespace.rb index 2e67252a..232b7ca4 100644 --- a/lib/rexml/namespace.rb +++ b/lib/rexml/namespace.rb @@ -42,11 +42,11 @@ def name=( name ) # Compares names optionally WITH namespaces def has_name?( other, ns=nil ) if ns - return (namespace() == ns and name() == other) + namespace() == ns and name() == other elsif other.include? ":" - return fully_expanded_name == other + fully_expanded_name == other else - return name == other + name == other end end @@ -57,7 +57,7 @@ def has_name?( other, ns=nil ) def fully_expanded_name ns = prefix return "#{ns}:#@name" if ns.size > 0 - return @name + @name end end end diff --git a/lib/rexml/node.rb b/lib/rexml/node.rb index 033b740d..bccacc51 100644 --- a/lib/rexml/node.rb +++ b/lib/rexml/node.rb @@ -68,7 +68,7 @@ def find_first_recursive(&block) # :yields: node each_recursive {|node| return node if block.call(node) } - return nil + nil end # Returns the position that +self+ holds in its parent's array, indexed diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb index 750b1697..a87657b5 100644 --- a/lib/rexml/parsers/baseparser.rb +++ b/lib/rexml/parsers/baseparser.rb @@ -206,12 +206,12 @@ def position # Returns true if there are no more events def empty? - return (@source.empty? and @stack.empty?) + (@source.empty? and @stack.empty?) end # Returns true if there are more events. Synonymous with !empty? def has_next? - return !(@source.empty? and @stack.empty?) + !(@source.empty? and @stack.empty?) end # Push an event back on the head of the stream. This method @@ -522,7 +522,8 @@ def pull_event raise REXML::ParseException.new( "Exception parsing", @source, self, (error ? error : $!) ) end - return [ :dummy ] + # NOTE: The end of the method never runs, because it is unreachable. + # All branches of code above have explicit unconditional return or raise statements. end private :pull_event diff --git a/lib/rexml/parsers/xpathparser.rb b/lib/rexml/parsers/xpathparser.rb index bd3b6856..a6d76fdc 100644 --- a/lib/rexml/parsers/xpathparser.rb +++ b/lib/rexml/parsers/xpathparser.rb @@ -215,7 +215,7 @@ def predicate_to_path(parsed, &block) else path << yield( parsed ) end - return path.squeeze(" ") + path.squeeze(" ") end # For backward compatibility alias_method :preciate_to_string, :predicate_to_path @@ -252,7 +252,7 @@ def LocationPath path, parsed path = path[1..-1] end end - return RelativeLocationPath( path, parsed ) if path.size > 0 + RelativeLocationPath( path, parsed ) if path.size > 0 end #RelativeLocationPath @@ -388,7 +388,7 @@ def NodeTest path, parsed else path = original_path end - return path + path end # Filters the supplied nodeset on the predicate(s) @@ -600,7 +600,7 @@ def PathExpr path, parsed end rest = LocationPath(rest, n) if rest =~ /\A[\/\.\@\[\w*]/ parsed.concat(n) - return rest + rest end #| FilterExpr Predicate diff --git a/lib/rexml/quickpath.rb b/lib/rexml/quickpath.rb index a0466b25..cded06f5 100644 --- a/lib/rexml/quickpath.rb +++ b/lib/rexml/quickpath.rb @@ -41,7 +41,7 @@ def QuickPath::match element, path, namespaces=EMPTY_HASH else results = filter([element], path) end - return results + results end # Given an array of nodes it filters the array based on the path. The @@ -51,18 +51,18 @@ def QuickPath::filter elements, path return elements if path.nil? or path == '' or elements.size == 0 case path when /^\/\//u # Descendant - return axe( elements, "descendant-or-self", $' ) + axe( elements, "descendant-or-self", $' ) when /^\/?\b(\w[-\w]*)\b::/u # Axe - return axe( elements, $1, $' ) + axe( elements, $1, $' ) when /^\/(?=\b([:!\w][-\.\w]*:)?[-!\*\.\w]*\b([^:(]|$)|\*)/u # Child rest = $' results = [] elements.each do |element| results |= filter( element.to_a, rest ) end - return results + results when /^\/?(\w[-\w]*)\(/u # / Function - return function( elements, $1, $' ) + function( elements, $1, $' ) when Namespace::NAMESPLIT # Element name name = $2 ns = $1 @@ -73,21 +73,21 @@ def QuickPath::filter elements, path (element.name == name and element.namespace == Functions.namespace_context[ns]))) end - return filter( elements, rest ) + filter( elements, rest ) when /^\/\[/u matches = [] elements.each do |element| matches |= predicate( element.to_a, path[1..-1] ) if element.kind_of? Element end - return matches + matches when /^\[/u # Predicate - return predicate( elements, path ) + predicate( elements, path ) when /^\/?\.\.\./u # Ancestor - return axe( elements, "ancestor", $' ) + axe( elements, "ancestor", $' ) when /^\/?\.\./u # Parent - return filter( elements.collect{|e|e.parent}, $' ) + filter( elements.collect{|e|e.parent}, $' ) when /^\/?\./u # Self - return filter( elements, $' ) + filter( elements, $' ) when /^\*/u # Any results = [] elements.each do |element| @@ -98,9 +98,10 @@ def QuickPath::filter elements, path # results |= filter( children, $' ) #end end - return results + results + else + [] end - return [] end def QuickPath::axe( elements, axe_name, rest ) @@ -138,7 +139,7 @@ def QuickPath::axe( elements, axe_name, rest ) matches = filter(elements.collect{|element| element.previous_sibling}.uniq, rest ) end - return matches.uniq + matches.uniq end OPERAND_ = '((?=(?:(?!and|or).)*[^\s<>=])[^\s<>=]+)' @@ -200,15 +201,15 @@ def QuickPath::predicate( elements, path ) results << element end end - return filter( results, rest ) + filter( results, rest ) end def QuickPath::attribute( name ) - return Functions.node.attributes[name] if Functions.node.kind_of? Element + Functions.node.attributes[name] if Functions.node.kind_of? Element end def QuickPath::name() - return Functions.node.name if Functions.node.kind_of? Element + Functions.node.name if Functions.node.kind_of? Element end def QuickPath::method_missing( id, *args ) @@ -234,7 +235,7 @@ def QuickPath::function( elements, fname, rest ) results << element if Functions.pair[0] == res end end - return results + results end def QuickPath::parse_args( element, string ) diff --git a/lib/rexml/security.rb b/lib/rexml/security.rb index 99b74607..e8e8c6b4 100644 --- a/lib/rexml/security.rb +++ b/lib/rexml/security.rb @@ -10,7 +10,7 @@ def self.entity_expansion_limit=( val ) # Get the entity expansion limit. By default the limit is set to 10000. def self.entity_expansion_limit - return @@entity_expansion_limit + @@entity_expansion_limit end @@entity_expansion_text_limit = 10_240 @@ -22,7 +22,7 @@ def self.entity_expansion_text_limit=( val ) # Get the entity expansion limit. By default the limit is set to 10240. def self.entity_expansion_text_limit - return @@entity_expansion_text_limit + @@entity_expansion_text_limit end end end diff --git a/lib/rexml/text.rb b/lib/rexml/text.rb index c70f73f2..8799d89d 100644 --- a/lib/rexml/text.rb +++ b/lib/rexml/text.rb @@ -177,7 +177,7 @@ def empty? def clone - return Text.new(self, true) + Text.new(self, true) end @@ -261,10 +261,10 @@ def wrap(string, width, addnewline=false) # Recursively wrap string at width. return string if string.length <= width place = string.rindex(' ', width) # Position in string with last ' ' before cutoff - if addnewline then - return "\n" + string[0,place] + "\n" + wrap(string[place+1..-1], width) + if addnewline + "\n" + string[0,place] + "\n" + wrap(string[place+1..-1], width) else - return string[0,place] + "\n" + wrap(string[place+1..-1], width) + string[0,place] + "\n" + wrap(string[place+1..-1], width) end end @@ -277,7 +277,7 @@ def indent_text(string, level=1, style="\t", indentfirstline=true) new_string << new_line } new_string.strip! unless indentfirstline - return new_string + new_string end # == DEPRECATED @@ -296,9 +296,7 @@ def write( writer, indent=-1, transitive=false, ie_hack=false ) # FIXME # This probably won't work properly def xpath - path = @parent.xpath - path += "/text()" - return path + @parent.xpath + "/text()" end # Writes out text, substituting special characters beforehand. diff --git a/lib/rexml/validation/relaxng.rb b/lib/rexml/validation/relaxng.rb index f29a2c05..c6894dcb 100644 --- a/lib/rexml/validation/relaxng.rb +++ b/lib/rexml/validation/relaxng.rb @@ -157,16 +157,16 @@ def next( event ) if ( @events[@current].matches?(event) ) @current += 1 if @events[@current].nil? - return @previous.pop + @previous.pop elsif @events[@current].kind_of? State @current += 1 @events[@current-1].previous = self - return @events[@current-1] + @events[@current-1] else - return self + self end else - return nil + nil end end @@ -186,7 +186,7 @@ def inspect end def expected - return [@events[@current]] + [@events[@current]] end def <<( event ) @@ -244,7 +244,7 @@ def generate_event( event ) evt = :end_attribute end end - return Event.new( evt, arg ) + Event.new( evt, arg ) end end @@ -262,9 +262,10 @@ def next( event ) rv = super return rv if rv @prior = @previous.pop - return @prior.next( event ) + @prior.next( event ) + else + super end - super end def matches?(event) @@ -274,7 +275,7 @@ def matches?(event) def expected return [ @prior.expected, @events[0] ].flatten if @current == 0 - return [@events[@current]] + [@events[@current]] end end @@ -286,24 +287,24 @@ def next( event ) @current += 1 if @events[@current].nil? @current = 0 - return self + self elsif @events[@current].kind_of? State @current += 1 @events[@current-1].previous = self - return @events[@current-1] + @events[@current-1] else - return self + self end else @prior = @previous.pop return @prior.next( event ) if @current == 0 - return nil + nil end end def expected return [ @prior.expected, @events[0] ].flatten if @current == 0 - return [@events[@current]] + [@events[@current]] end end @@ -326,17 +327,17 @@ def next( event ) @ord += 1 if @events[@current].nil? @current = 0 - return self + self elsif @events[@current].kind_of? State @current += 1 @events[@current-1].previous = self - return @events[@current-1] + @events[@current-1] else - return self + self end else return @previous.pop.next( event ) if @current == 0 and @ord > 0 - return nil + nil end end @@ -347,9 +348,9 @@ def matches?( event ) def expected if @current == 0 and @ord > 0 - return [@previous[-1].expected, @events[0]].flatten + [@previous[-1].expected, @events[0]].flatten else - return [@events[@current]] + [@events[@current]] end end end @@ -403,7 +404,7 @@ def matches?( event ) def expected return [@events[@current]] if @events.size > 0 - return @choices.collect do |x| + @choices.collect do |x| if x[0].kind_of? State x[0].expected else @@ -490,16 +491,16 @@ def next( event ) @current += 1 if @events[@current].nil? return self unless @choices[@choice].nil? - return @previous.pop + @previous.pop elsif @events[@current].kind_of? State @current += 1 @events[@current-1].previous = self - return @events[@current-1] + @events[@current-1] else - return self + self end else - return nil + nil end end @@ -510,7 +511,7 @@ def matches?( event ) def expected return [@events[@current]] if @events[@current] - return @choices[@choice..-1].collect do |x| + @choices[@choice..-1].collect do |x| if x[0].kind_of? State x[0].expected else diff --git a/lib/rexml/validation/validation.rb b/lib/rexml/validation/validation.rb index 0ad6ada4..6475c628 100644 --- a/lib/rexml/validation/validation.rb +++ b/lib/rexml/validation/validation.rb @@ -80,26 +80,26 @@ def done? end def single? - return (@event_type != :start_element and @event_type != :start_attribute) + (@event_type != :start_element and @event_type != :start_attribute) end def matches?( event ) return false unless event[0] == @event_type case event[0] when nil - return true + true when :start_element - return true if event[1] == @event_arg + event[1] == @event_arg when :end_element - return true + true when :start_attribute - return true if event[1] == @event_arg + event[1] == @event_arg when :end_attribute - return true + true when :end_document - return true + true when :text - return (@event_arg.nil? or @event_arg == event[1]) + @event_arg.nil? || @event_arg == event[1] =begin when :processing_instruction false diff --git a/lib/rexml/xpath_parser.rb b/lib/rexml/xpath_parser.rb index 70ae8919..5cf3f28c 100644 --- a/lib/rexml/xpath_parser.rb +++ b/lib/rexml/xpath_parser.rb @@ -114,7 +114,7 @@ def first( path_stack, node ) case path[0] when :document # do nothing - return first( path[1..-1], node ) + first( path[1..-1], node ) when :child for c in node.children r = first( path[1..-1], c ) @@ -124,9 +124,9 @@ def first( path_stack, node ) name = path[2] if node.name == name return node if path.size == 3 - return first( path[3..-1], node ) + first( path[3..-1], node ) else - return nil + nil end when :descendant_or_self r = first( path[1..-1], node ) @@ -136,11 +136,12 @@ def first( path_stack, node ) return r if r end when :node - return first( path[1..-1], node ) + first( path[1..-1], node ) when :any - return first( path[1..-1], node ) + first( path[1..-1], node ) + else + nil end - return nil end @@ -167,10 +168,10 @@ def strict? # 2. If no mapping was supplied, use the context node to look up the namespace def get_namespace( node, prefix ) if @namespaces - return @namespaces[prefix] || '' + @namespaces[prefix] || '' else return node.namespace( prefix ) if node.node_type == :element - return '' + '' end end @@ -757,22 +758,19 @@ def following(node) end def following_node_of( node ) - if node.kind_of? Element and node.children.size > 0 - return node.children[0] - end - return next_sibling_node(node) + return node.children[0] if node.kind_of?(Element) and node.children.size > 0 + + next_sibling_node(node) end def next_sibling_node(node) psn = node.next_sibling_node while psn.nil? - if node.parent.nil? or node.parent.class == Document - return nil - end + return nil if node.parent.nil? or node.parent.class == Document node = node.parent psn = node.next_sibling_node end - return psn + psn end def child(nodeset) @@ -805,13 +803,13 @@ def child(nodeset) def norm b case b when true, false - return b + b when 'true', 'false' - return Functions::boolean( b ) + Functions::boolean( b ) when /^\d+(\.\d+)?$/, Numeric - return Functions::number( b ) + Functions::number( b ) else - return Functions::string( b ) + Functions::string( b ) end end From 04a589a61bf4e366abee8764ee74b03f4aecc4aa Mon Sep 17 00:00:00 2001 From: "|7eter l-|. l3oling" Date: Wed, 9 Jul 2025 08:17:16 +0700 Subject: [PATCH 71/91] Fix a bug that XPath can't be used for no document element (#268) Fixes #267 #249 improved performance by introducing cache. It requires document but we should not break backward compatibility for performance improvement. This restores the previous behavior but no document case doesn't have performance improvement introduced by #249. --- lib/rexml/xpath_parser.rb | 7 ++++++- test/parser/test_xpath.rb | 2 +- test/test_xpath_parser.rb | 35 +++++++++++++++++++++++++++++++++++ 3 files changed, 42 insertions(+), 2 deletions(-) create mode 100644 test/test_xpath_parser.rb diff --git a/lib/rexml/xpath_parser.rb b/lib/rexml/xpath_parser.rb index 5cf3f28c..64c8846a 100644 --- a/lib/rexml/xpath_parser.rb +++ b/lib/rexml/xpath_parser.rb @@ -84,7 +84,12 @@ def parse path, node node = node.first end - node.document.__send__(:enable_cache) do + document = node.document + if document + document.__send__(:enable_cache) do + match( path_stack, node ) + end + else match( path_stack, node ) end end diff --git a/test/parser/test_xpath.rb b/test/parser/test_xpath.rb index 9143d25c..5d62afee 100644 --- a/test/parser/test_xpath.rb +++ b/test/parser/test_xpath.rb @@ -4,7 +4,7 @@ require "rexml/parsers/xpathparser" module REXMLTests - class TestXPathParser < Test::Unit::TestCase + class TestParserXPathParser < Test::Unit::TestCase sub_test_case("#abbreviate") do def abbreviate(xpath) parser = REXML::Parsers::XPathParser.new diff --git a/test/test_xpath_parser.rb b/test/test_xpath_parser.rb new file mode 100644 index 00000000..bcb14c34 --- /dev/null +++ b/test/test_xpath_parser.rb @@ -0,0 +1,35 @@ +# frozen_string_literal: true + +module REXMLTests + class TestXPathParser < Test::Unit::TestCase + def setup + @root_element = make_service_element(["urn:type1", "urn:type2"], ["http://uri"]) + @element = @root_element.children[0] + @parser = REXML::XPathParser.new + end + + def make_service_element(types, uris) + root_element = REXML::Element.new + element = root_element.add_element("Service") + types.each do |type_text| + element.add_element("Type").text = type_text + end + uris.each do |uri_text| + element.add_element("URI").text = uri_text + end + root_element + end + + def test_found + res = @parser.parse("/Service", @root_element) + assert_equal([@element], + res) + end + + def test_not_found + res = @parser.parse("/nonexistent", @root_element) + assert_equal([], + res) + end + end +end From 9b084d78708638cedff54743edc0907c4bd6574a Mon Sep 17 00:00:00 2001 From: "|7eter l-|. l3oling" Date: Tue, 15 Jul 2025 08:23:42 +0700 Subject: [PATCH 72/91] Fix & Deprecate REXML::Text#text_indent (#275) - Fixes #273 - "Fix" in the sense that it restores the original behavior pre-v3.2.6, regardless of its fitness for purpose. - Regression Test Added --------- Co-authored-by: Sutou Kouhei --- lib/rexml/child.rb | 2 +- lib/rexml/text.rb | 4 +++- test/test_text.rb | 8 ++++++++ 3 files changed, 12 insertions(+), 2 deletions(-) diff --git a/lib/rexml/child.rb b/lib/rexml/child.rb index 40abde87..2718040f 100644 --- a/lib/rexml/child.rb +++ b/lib/rexml/child.rb @@ -88,7 +88,7 @@ def document # This doesn't yet handle encodings def bytes - document.encoding + document&.encoding to_s end diff --git a/lib/rexml/text.rb b/lib/rexml/text.rb index 8799d89d..8d5281cd 100644 --- a/lib/rexml/text.rb +++ b/lib/rexml/text.rb @@ -269,8 +269,10 @@ def wrap(string, width, addnewline=false) end def indent_text(string, level=1, style="\t", indentfirstline=true) + Kernel.warn("#{self.class.name}#indent_text is deprecated. See REXML::Formatters", uplevel: 1) return string if level < 0 - new_string = '' + + new_string = +'' string.each_line { |line| indent_string = style * level new_line = (indent_string + line).sub(/[\s]+$/,'') diff --git a/test/test_text.rb b/test/test_text.rb index bae21656..c1f5765e 100644 --- a/test/test_text.rb +++ b/test/test_text.rb @@ -2,6 +2,7 @@ module REXMLTests class TextTester < Test::Unit::TestCase + include Helper::Global include REXML def test_new_text_response_whitespace_default @@ -69,5 +70,12 @@ def test_clone assert_equal(text.to_s, text.clone.to_s) end + + def test_indent_text + text = Text.new("") + suppress_warning do + assert_equal("\tline1\tline2\tline3", text.indent_text("line1\r\nline2\r\nline3\r\n")) + end + end end end From c60ae027a3c20f359fdf76fa41ae64d22313f482 Mon Sep 17 00:00:00 2001 From: "|7eter l-|. l3oling" Date: Wed, 16 Jul 2025 08:22:47 +0700 Subject: [PATCH 73/91] Remove bundler from dev deps (#277) Fixes #276 It's redundant. --- Gemfile | 1 - 1 file changed, 1 deletion(-) diff --git a/Gemfile b/Gemfile index d323e2c5..a680c133 100644 --- a/Gemfile +++ b/Gemfile @@ -6,7 +6,6 @@ git_source(:github) {|repo_name| "https://github.com/#{repo_name}" } gemspec group :development do - gem "bundler" # This is for suppressing the following warning: # # warning: ostruct was loaded from the standard library, but will From c87bda8bb8773da7e5a0faf9f16ff165eb052a35 Mon Sep 17 00:00:00 2001 From: "|7eter l-|. l3oling" Date: Wed, 16 Jul 2025 08:39:31 +0700 Subject: [PATCH 74/91] Remove ostruct from dev deps (#281) Fixes #280 It seems that it's no longer needed. --- Gemfile | 8 -------- 1 file changed, 8 deletions(-) diff --git a/Gemfile b/Gemfile index a680c133..22520c65 100644 --- a/Gemfile +++ b/Gemfile @@ -6,14 +6,6 @@ git_source(:github) {|repo_name| "https://github.com/#{repo_name}" } gemspec group :development do - # This is for suppressing the following warning: - # - # warning: ostruct was loaded from the standard library, but will - # no longer be part of the default gems starting from Ruby 3.5.0. - # - # This should be part of "json". We can remove this when "json" - # depends on "ostruct" explicitly. - gem "ostruct" gem "rake" gem "rdoc" end From 1d876e3bf658b7b4ec7c3372867521695e8eb023 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 19 Aug 2025 09:38:38 +0900 Subject: [PATCH 75/91] Bump actions/checkout from 4 to 5 (#283) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bumps [actions/checkout](https://github.com/actions/checkout) from 4 to 5.
Release notes

Sourced from actions/checkout's releases.

v5.0.0

What's Changed

⚠️ Minimum Compatible Runner Version

v2.327.1
Release Notes

Make sure your runner is updated to this version or newer to use this release.

Full Changelog: https://github.com/actions/checkout/compare/v4...v5.0.0

v4.3.0

What's Changed

New Contributors

Full Changelog: https://github.com/actions/checkout/compare/v4...v4.3.0

v4.2.2

What's Changed

Full Changelog: https://github.com/actions/checkout/compare/v4.2.1...v4.2.2

v4.2.1

What's Changed

New Contributors

Full Changelog: https://github.com/actions/checkout/compare/v4.2.0...v4.2.1

... (truncated)

Changelog

Sourced from actions/checkout's changelog.

Changelog

V5.0.0

V4.3.0

v4.2.2

v4.2.1

v4.2.0

v4.1.7

v4.1.6

v4.1.5

v4.1.4

v4.1.3

... (truncated)

Commits

[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=actions/checkout&package-manager=github_actions&previous-version=4&new-version=5)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---
Dependabot commands and options
You can trigger Dependabot actions by commenting on this PR: - `@dependabot rebase` will rebase this PR - `@dependabot recreate` will recreate this PR, overwriting any edits that have been made to it - `@dependabot merge` will merge this PR after your CI passes on it - `@dependabot squash and merge` will squash and merge this PR after your CI passes on it - `@dependabot cancel merge` will cancel a previously requested merge and block automerging - `@dependabot reopen` will reopen this PR if it is closed - `@dependabot close` will close this PR and stop Dependabot recreating it. You can achieve the same result by closing it manually - `@dependabot show ignore conditions` will show all of the ignore conditions of the specified dependency - `@dependabot ignore this major version` will close this PR and stop Dependabot creating any more for this major version (unless you reopen the PR or upgrade to it yourself) - `@dependabot ignore this minor version` will close this PR and stop Dependabot creating any more for this minor version (unless you reopen the PR or upgrade to it yourself) - `@dependabot ignore this dependency` will close this PR and stop Dependabot creating any more for this dependency (unless you reopen the PR or upgrade to it yourself)
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/benchmark.yml | 2 +- .github/workflows/release.yml | 4 ++-- .github/workflows/test.yml | 10 +++++----- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index 2c638b03..651df879 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -16,7 +16,7 @@ jobs: - ubuntu-latest runs-on: ${{ matrix.runs-on }} steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v5 - uses: ruby/setup-ruby@v1 with: ruby-version: ${{ matrix.ruby-version }} diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 76269f44..f3dffca7 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -9,7 +9,7 @@ jobs: runs-on: ubuntu-latest timeout-minutes: 10 steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v5 - name: Extract release note run: | ruby \ @@ -37,7 +37,7 @@ jobs: id-token: write environment: release steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v5 - uses: ruby/setup-ruby@v1 with: ruby-version: ruby diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 0bd43457..31dc02a2 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -27,7 +27,7 @@ jobs: # - runs-on: ubuntu-latest # ruby-version: truffleruby steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v5 - uses: ruby/setup-ruby@v1 with: ruby-version: ${{ matrix.ruby-version }} @@ -39,7 +39,7 @@ jobs: name: frozen-string-literal runs-on: ubuntu-latest steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v5 - uses: ruby/setup-ruby@v1 with: ruby-version: ruby @@ -66,7 +66,7 @@ jobs: - windows-latest ruby-version: ${{ fromJson(needs.ruby-versions-gems.outputs.versions) }} steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v5 - uses: ruby/setup-ruby@v1 with: ruby-version: ${{ matrix.ruby-version }} @@ -95,7 +95,7 @@ jobs: name: "Document" runs-on: ubuntu-latest steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v5 - uses: ruby/setup-ruby@v1 with: ruby-version: ruby @@ -105,7 +105,7 @@ jobs: - name: Build document run: | bundle exec rake warning:error rdoc - - uses: actions/checkout@v4 + - uses: actions/checkout@v5 if: | github.event_name == 'push' with: From 5859bdeac792687eaf93d8e8f0b7e3c1e2ed5c23 Mon Sep 17 00:00:00 2001 From: NAITOH Jun Date: Sat, 23 Aug 2025 08:11:58 +0900 Subject: [PATCH 76/91] Added XML declaration check & `Source#skip_spaces` method (#282) ## Why? ### Added XML declaration check - The version attribute is required in XML declaration. - Only version attribute, encoding attribute, and standalone attribute are allowed in XML declaration. - XML declaration is only allowed once. See: https://www.w3.org/TR/xml/#NT-XMLDecl ### Added `Source#skip_spaces` method In the case of `@source.match?(/\s+/um, true)`, if there are no spaces at the beginning, I want to stop reading immediately. However, it continues to read the buffer until it finds a match, but it never finds a match. As a result, it continues reading until the end of the file. In the case of large XML files, drop_parsed_content occur frequently until the buffer is cleared, which may affect performance. ## Benchmark ``` before after before(YJIT) after(YJIT) dom 32.534 35.130 54.559 53.528 i/s - 100.000 times in 3.073715s 2.846540s 1.832883s 1.868189s sax 44.785 44.089 78.303 77.842 i/s - 100.000 times in 2.232907s 2.268138s 1.277093s 1.284657s pull 51.750 51.105 90.819 90.658 i/s - 100.000 times in 1.932351s 1.956759s 1.101094s 1.103050s stream 51.427 51.444 89.820 88.971 i/s - 100.000 times in 1.944502s 1.943855s 1.113340s 1.123960s Comparison: dom before(YJIT): 54.6 i/s after(YJIT): 53.5 i/s - 1.02x slower after: 35.1 i/s - 1.55x slower before: 32.5 i/s - 1.68x slower sax before(YJIT): 78.3 i/s after(YJIT): 77.8 i/s - 1.01x slower before: 44.8 i/s - 1.75x slower after: 44.1 i/s - 1.78x slower pull before(YJIT): 90.8 i/s after(YJIT): 90.7 i/s - 1.00x slower before: 51.8 i/s - 1.75x slower after: 51.1 i/s - 1.78x slower stream before(YJIT): 89.8 i/s after(YJIT): 89.0 i/s - 1.01x slower after: 51.4 i/s - 1.75x slower before: 51.4 i/s - 1.75x slower ``` - YJIT=ON : 0.98x - 1.00x faster - YJIT=OFF : 0.98x - 1.07x faster --- lib/rexml/parsers/baseparser.rb | 156 +++++++++++++------ lib/rexml/source.rb | 7 +- test/parse/test_document_type_declaration.rb | 6 +- test/parse/test_processing_instruction.rb | 130 +++++++++++++++- test/test_xml_declaration.rb | 2 +- 5 files changed, 244 insertions(+), 57 deletions(-) diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb index a87657b5..9304e96d 100644 --- a/lib/rexml/parsers/baseparser.rb +++ b/lib/rexml/parsers/baseparser.rb @@ -144,6 +144,7 @@ module Private PEREFERENCE_PATTERN = /#{PEREFERENCE}/um TAG_PATTERN = /((?>#{QNAME_STR}))\s*/um CLOSE_PATTERN = /(#{QNAME_STR})\s*>/um + EQUAL_PATTERN = /\s*=\s*/um ATTLISTDECL_END = /\s+#{NAME}(?:#{ATTDEF})*\s*>/um NAME_PATTERN = /#{NAME}/um GEDECL_PATTERN = "\\s+#{NAME}\\s+#{ENTITYDEF}\\s*>" @@ -168,6 +169,7 @@ def initialize( source ) @entity_expansion_limit = Security.entity_expansion_limit @entity_expansion_text_limit = Security.entity_expansion_text_limit @source.ensure_buffer + @version = nil end def add_listener( listener ) @@ -280,7 +282,7 @@ def pull_event return [ :comment, process_comment ] elsif @source.match?("DOCTYPE", true) base_error_message = "Malformed DOCTYPE" - unless @source.match?(/\s+/um, true) + unless @source.skip_spaces if @source.match?(">") message = "#{base_error_message}: name is missing" else @@ -290,7 +292,7 @@ def pull_event raise REXML::ParseException.new(message, @source) end name = parse_name(base_error_message) - @source.match?(/\s*/um, true) # skip spaces + @source.skip_spaces if @source.match?("[", true) id = [nil, nil, nil] @document_status = :in_doctype @@ -306,7 +308,7 @@ def pull_event # For backward compatibility id[1], id[2] = id[2], nil end - @source.match?(/\s*/um, true) # skip spaces + @source.skip_spaces if @source.match?("[", true) @document_status = :in_doctype elsif @source.match?(">", true) @@ -319,7 +321,7 @@ def pull_event end args = [:start_doctype, name, *id] if @document_status == :after_doctype - @source.match?(/\s*/um, true) + @source.skip_spaces @stack << [ :end_doctype ] end return args @@ -330,7 +332,7 @@ def pull_event end end if @document_status == :in_doctype - @source.match?(/\s*/um, true) # skip spaces + @source.skip_spaces start_position = @source.position if @source.match?("") message = "#{base_error_message}: name is missing" else @@ -404,7 +406,7 @@ def pull_event id = parse_id(base_error_message, accept_external_id: true, accept_public_id: true) - @source.match?(/\s*/um, true) # skip spaces + @source.skip_spaces unless @source.match?(">", true) message = "#{base_error_message}: garbage before end >" raise REXML::ParseException.new(message, @source) @@ -425,7 +427,7 @@ def pull_event end end if @document_status == :after_doctype - @source.match?(/\s*/um, true) + @source.skip_spaces end begin start_position = @source.position @@ -642,6 +644,10 @@ def need_source_encoding_update?(xml_declaration_encoding) true end + def normalize_xml_declaration_encoding(xml_declaration_encoding) + /\AUTF-16(?:BE|LE)\z/i.match?(xml_declaration_encoding) ? "UTF-16" : nil + end + def parse_name(base_error_message) md = @source.match(Private::NAME_PATTERN, true) unless md @@ -735,37 +741,85 @@ def process_comment def process_instruction name = parse_name("Malformed XML: Invalid processing instruction node") - if @source.match?(/\s+/um, true) - match_data = @source.match(/(.*?)\?>/um, true) - unless match_data - raise ParseException.new("Malformed XML: Unclosed processing instruction", @source) + if name == "xml" + xml_declaration + else # PITarget + if @source.skip_spaces # e.g. + start_position = @source.position + content = @source.read_until("?>") + unless content.chomp!("?>") + @source.position = start_position + raise ParseException.new("Malformed XML: Unclosed processing instruction: <#{name}>", @source) + end + else # e.g. + content = nil + unless @source.match?("?>", true) + raise ParseException.new("Malformed XML: Unclosed processing instruction: <#{name}>", @source) + end end - content = match_data[1] - else - content = nil + [:processing_instruction, name, content] + end + end + + def xml_declaration + unless @version.nil? + raise ParseException.new("Malformed XML: XML declaration is duplicated", @source) + end + if @document_status + raise ParseException.new("Malformed XML: XML declaration is not at the start", @source) + end + unless @source.skip_spaces + raise ParseException.new("Malformed XML: XML declaration misses spaces before version", @source) + end + unless @source.match?("version", true) + raise ParseException.new("Malformed XML: XML declaration misses version", @source) + end + @version = parse_attribute_value_with_equal("xml") + unless @source.skip_spaces unless @source.match?("?>", true) - raise ParseException.new("Malformed XML: Unclosed processing instruction", @source) + raise ParseException.new("Malformed XML: Unclosed XML declaration", @source) end + encoding = normalize_xml_declaration_encoding(@source.encoding) + return [ :xmldecl, @version, encoding, nil ] # e.g. end - if name == "xml" - if @document_status - raise ParseException.new("Malformed XML: XML declaration is not at the start", @source) - end - version = VERSION.match(content) - version = version[1] unless version.nil? - encoding = ENCODING.match(content) - encoding = encoding[1] unless encoding.nil? - if need_source_encoding_update?(encoding) - @source.encoding = encoding + + if @source.match?("encoding", true) + encoding = parse_attribute_value_with_equal("xml") + unless @source.skip_spaces + unless @source.match?("?>", true) + raise ParseException.new("Malformed XML: Unclosed XML declaration", @source) + end + if need_source_encoding_update?(encoding) + @source.encoding = encoding + end + encoding ||= normalize_xml_declaration_encoding(@source.encoding) + return [ :xmldecl, @version, encoding, nil ] # e.g. end - if encoding.nil? and /\AUTF-16(?:BE|LE)\z/i =~ @source.encoding - encoding = "UTF-16" + end + + if @source.match?("standalone", true) + standalone = parse_attribute_value_with_equal("xml") + case standalone + when "yes", "no" + else + raise ParseException.new("Malformed XML: XML declaration standalone is not yes or no : <#{standalone}>", @source) end - standalone = STANDALONE.match(content) - standalone = standalone[1] unless standalone.nil? - return [ :xmldecl, version, encoding, standalone ] end - [:processing_instruction, name, content] + @source.skip_spaces + unless @source.match?("?>", true) + raise ParseException.new("Malformed XML: Unclosed XML declaration", @source) + end + + if need_source_encoding_update?(encoding) + @source.encoding = encoding + end + encoding ||= normalize_xml_declaration_encoding(@source.encoding) + + # e.g. + # + # + # + [ :xmldecl, @version, encoding, standalone ] end if StringScanner::Version < "3.1.1" @@ -787,6 +841,25 @@ def scan_quote end end + def parse_attribute_value_with_equal(name) + unless @source.match?(Private::EQUAL_PATTERN, true) + message = "Missing attribute equal: <#{name}>" + raise REXML::ParseException.new(message, @source) + end + unless quote = scan_quote + message = "Missing attribute value start quote: <#{name}>" + raise REXML::ParseException.new(message, @source) + end + start_position = @source.position + value = @source.read_until(quote) + unless value.chomp!(quote) + @source.position = start_position + message = "Missing attribute value end quote: <#{name}>: <#{quote}>" + raise REXML::ParseException.new(message, @source) + end + value + end + def parse_attributes(prefixes) attributes = {} expanded_names = {} @@ -801,23 +874,8 @@ def parse_attributes(prefixes) name = match[1] prefix = match[2] local_part = match[3] - - unless @source.match?(/\s*=\s*/um, true) - message = "Missing attribute equal: <#{name}>" - raise REXML::ParseException.new(message, @source) - end - unless quote = scan_quote - message = "Missing attribute value start quote: <#{name}>" - raise REXML::ParseException.new(message, @source) - end - start_position = @source.position - value = @source.read_until(quote) - unless value.chomp!(quote) - @source.position = start_position - message = "Missing attribute value end quote: <#{name}>: <#{quote}>" - raise REXML::ParseException.new(message, @source) - end - @source.match?(/\s*/um, true) + value = parse_attribute_value_with_equal(name) + @source.skip_spaces if prefix == "xmlns" if local_part == "xml" if value != Private::XML_PREFIXED_NAMESPACE diff --git a/lib/rexml/source.rb b/lib/rexml/source.rb index 3ec1141e..99500072 100644 --- a/lib/rexml/source.rb +++ b/lib/rexml/source.rb @@ -65,9 +65,10 @@ class Source attr_reader :encoding module Private + SPACES_PATTERN = /\s+/um SCANNER_RESET_SIZE = 100000 PRE_DEFINED_TERM_PATTERNS = {} - pre_defined_terms = ["'", '"', "<", "]]>"] + pre_defined_terms = ["'", '"', "<", "]]>", "?>"] if StringScanner::Version < "3.1.1" pre_defined_terms.each do |term| PRE_DEFINED_TERM_PATTERNS[term] = /#{Regexp.escape(term)}/ @@ -150,6 +151,10 @@ def match?(pattern, cons=false) end end + def skip_spaces + @scanner.skip(Private::SPACES_PATTERN) ? true : false + end + def position @scanner.pos end diff --git a/test/parse/test_document_type_declaration.rb b/test/parse/test_document_type_declaration.rb index b22863a9..d4658b9e 100644 --- a/test/parse/test_document_type_declaration.rb +++ b/test/parse/test_document_type_declaration.rb @@ -49,10 +49,10 @@ def test_no_name end assert_equal(<<-DETAIL.chomp, exception.to_s) Malformed DOCTYPE: name is missing -Line: 3 -Position: 17 +Line: 1 +Position: 10 Last 80 unconsumed characters: - + DETAIL end end diff --git a/test/parse/test_processing_instruction.rb b/test/parse/test_processing_instruction.rb index ba381dc4..70d17747 100644 --- a/test/parse/test_processing_instruction.rb +++ b/test/parse/test_processing_instruction.rb @@ -30,7 +30,7 @@ def test_unclosed_content parse(" Line: 1 Position: 14 Last 80 unconsumed characters: @@ -43,7 +43,7 @@ def test_unclosed_no_content parse(" Line: 1 Position: 6 Last 80 unconsumed characters: @@ -51,6 +51,19 @@ def test_unclosed_no_content DETAIL end + def test_xml_declaration_duplicated + exception = assert_raise(REXML::ParseException) do + parse('') + end + assert_equal(<<-DETAIL.chomp, exception.to_s) +Malformed XML: XML declaration is duplicated +Line: 1 +Position: 42 +Last 80 unconsumed characters: + version="1.0"?> + DETAIL + end + def test_xml_declaration_not_at_document_start exception = assert_raise(REXML::ParseException) do parser = REXML::Parsers::BaseParser.new('') @@ -64,7 +77,118 @@ def test_xml_declaration_not_at_document_start Line: 1 Position: 25 Last 80 unconsumed characters: + version="1.0" ?> + DETAIL + end + + def test_xml_declaration_missing_spaces + exception = assert_raise(REXML::ParseException) do + parser = REXML::Parsers::BaseParser.new('') + while parser.has_next? + parser.pull + end + end + + assert_equal(<<~DETAIL.chomp, exception.to_s) + Malformed XML: XML declaration misses spaces before version + Line: 1 + Position: 7 + Last 80 unconsumed characters: + ?> + DETAIL + end + + def test_xml_declaration_missing_version + exception = assert_raise(REXML::ParseException) do + parser = REXML::Parsers::BaseParser.new('') + while parser.has_next? + parser.pull + end + end + + assert_equal(<<~DETAIL.chomp, exception.to_s) + Malformed XML: XML declaration misses version + Line: 1 + Position: 8 + Last 80 unconsumed characters: + ?> + DETAIL + end + + def test_xml_declaration_unclosed_content + exception = assert_raise(REXML::ParseException) do + parse('') + while parser.has_next? + parser.pull + end + end + + assert_equal(<<~DETAIL.chomp, exception.to_s) + Malformed XML: Unclosed XML declaration + Line: 1 + Position: 37 + Last 80 unconsumed characters: + encoding="UTF-8"?> + DETAIL + end + + def test_xml_declaration_unclosed_content_missing_space_after_encoding + exception = assert_raise(REXML::ParseException) do + parser = REXML::Parsers::BaseParser.new('') + while parser.has_next? + parser.pull + end + end + + assert_equal(<<~DETAIL.chomp, exception.to_s) + Malformed XML: Unclosed XML declaration + Line: 1 + Position: 53 + Last 80 unconsumed characters: + standalone="no"?> + DETAIL + end + + def test_xml_declaration_unclosed_content_with_unknown_attributes + exception = assert_raise(REXML::ParseException) do + parser = REXML::Parsers::BaseParser.new('') + while parser.has_next? + parser.pull + end + end + + assert_equal(<<~DETAIL.chomp, exception.to_s) + Malformed XML: Unclosed XML declaration + Line: 1 + Position: 31 + Last 80 unconsumed characters: + test="no"?> + DETAIL + end + + def test_xml_declaration_standalone_no_yes_or_no + exception = assert_raise(REXML::ParseException) do + parse('') + end + assert_equal(<<-DETAIL.chomp, exception.to_s) +Malformed XML: XML declaration standalone is not yes or no : +Line: 1 +Position: 38 +Last 80 unconsumed characters: +?> DETAIL end end @@ -113,7 +237,7 @@ def test_content_question def test_linear_performance_gt seq = [10000, 50000, 100000, 150000, 200000] assert_linear_performance(seq, rehearsal: 10) do |n| - REXML::Document.new("" * n + " ?>") + REXML::Document.new("" * n + " ?>") end end diff --git a/test/test_xml_declaration.rb b/test/test_xml_declaration.rb index 6a1f4df0..4503a90e 100644 --- a/test/test_xml_declaration.rb +++ b/test/test_xml_declaration.rb @@ -7,7 +7,7 @@ module REXMLTests class TestXmlDeclaration < Test::Unit::TestCase def setup xml = <<~XML - + XML From f36916fe1c66b8cdc1fe482263115625e084d8fe Mon Sep 17 00:00:00 2001 From: NAITOH Jun Date: Tue, 26 Aug 2025 14:29:47 +0900 Subject: [PATCH 77/91] Add 3.4.2 entry (#284) --- NEWS.md | 102 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 102 insertions(+) diff --git a/NEWS.md b/NEWS.md index 7f95d829..313b07d5 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,107 @@ # News +## 3.4.2 - 2025-08-26 {#version-3-4-2} + +### Improvement + + * Improved performance. + * GH-244 + * GH-245 + * GH-246 + * GH-249 + * GH-256 + * Patch by NAITOH Jun + + * Raise appropriate exception when failing to match start tag in DOCTYPE + * GH-247 + * Patch by NAITOH Jun + + * Deprecate accepting array as an element in XPath.match, first and each + * GH-252 + * Patch by tomoya ishida + + * Don't call needless encoding_updated + * GH-259 + * Patch by Sutou Kouhei + + * Reuse XPath::match + * GH-263 + * Patch by pboling + + * Cache redundant calls for doctype + * GH-264 + * Patch by pboling + + * Use Safe Navigation (&.) from Ruby 2.3 + * GH-265 + * Patch by pboling + + * Remove redundant return statements + * GH-266 + * Patch by pboling + + * Added XML declaration check & Source#skip_spaces method + * GH-282 + * Patch by NAITOH Jun + * Reported by Sofi Aberegg + +### Fixes + + * Fix docs typo + * GH-248 + * Patch by James Coleman + + * Fix reverse sort in xpath_parser + * GH-251 + * Patch by tomoya ishida + + * Fix duplicate responses in XPath following, following-sibling, preceding, preceding-sibling + * GH-255 + * Patch by NAITOH Jun + + * Fix wrong Encoding resolution + * GH-258 + * Patch by Sutou Kouhei + + * Handle nil when parsing fragment + * GH-267 + * GH-268 + * Patch by pboling + + * [Documentation] Use # to reference instance methods + * GH-269 + * GH-270 + * Patch by pboling + + * Fix & Deprecate REXML::Text#text_indent + * GH-273 + * GH-275 + * Patch by pboling + + * remove bundler from dev deps + * GH-276 + * GH-277 + * Patch by pboling + + * remove ostruct from dev deps + * GH-280 + * GH-281 + * Patch by pboling + +### Thanks + + * NAITOH Jun + + * tomoya ishida + + * James Coleman + + * pboling + + * Sutou Kouhei + + * Sofi Aberegg + ## 3.4.1 - 2025-02-16 {#version-3-4-1} ### Improvement From 185bdc737da406ba4f9564726849ad3477858eb2 Mon Sep 17 00:00:00 2001 From: NAITOH Jun Date: Tue, 26 Aug 2025 14:36:22 +0900 Subject: [PATCH 78/91] Bump version --- lib/rexml/rexml.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/rexml/rexml.rb b/lib/rexml/rexml.rb index bf3c0d32..91d2a6ff 100644 --- a/lib/rexml/rexml.rb +++ b/lib/rexml/rexml.rb @@ -31,7 +31,7 @@ module REXML COPYRIGHT = "Copyright © 2001-2008 Sean Russell " DATE = "2008/019" - VERSION = "3.4.2" + VERSION = "3.4.3" REVISION = "" Copyright = COPYRIGHT From 1531862d18ec3ecd659060d60b8bb49accee5a42 Mon Sep 17 00:00:00 2001 From: NAITOH Jun Date: Thu, 4 Sep 2025 20:17:55 +0900 Subject: [PATCH 79/91] Fixed an issue with `IOSource#read_until` when reaching the end of a file (#288) Fix GH-287 ## Why? If `@source = nil` is set at the end of the file, prevent `IOSource#read_until` from raising an error. > /.gems/gems/rexml-3.4.2/lib/rexml/parsers/baseparser.rb:524:in 'REXML::Parsers::BaseParser#pull_event': # (REXML::ParseException) > .gems/gems/rexml-3.4.2/lib/rexml/source.rb:275:in 'REXML::IOSource#read_until' https://github.com/ruby/rexml/blob/185bdc737da406ba4f9564726849ad3477858eb2/lib/rexml/source.rb#L266-L276 1. Reaching the end of a file in `IOSource#read`. 2. If the string remains in `@scanner.rest`, use `@scanner.scan_until(pattern)` to return `str`. 3. If `@source` is `nil`, an exception occurs. --- lib/rexml/source.rb | 2 +- test/test_io_source.rb | 19 +++++++++++++++++++ 2 files changed, 20 insertions(+), 1 deletion(-) create mode 100644 test/test_io_source.rb diff --git a/lib/rexml/source.rb b/lib/rexml/source.rb index 99500072..8b8ba0da 100644 --- a/lib/rexml/source.rb +++ b/lib/rexml/source.rb @@ -272,7 +272,7 @@ def read_until(term) @scanner << readline(term) end if str - read if @scanner.eos? and !@source.eof? + read if @scanner.eos? and @source and !@source.eof? str else rest = @scanner.rest diff --git a/test/test_io_source.rb b/test/test_io_source.rb new file mode 100644 index 00000000..11776efc --- /dev/null +++ b/test/test_io_source.rb @@ -0,0 +1,19 @@ +# frozen_string_literal: false + +require "rexml/source" + +module REXMLTests + class TestIOSource < Test::Unit::TestCase + def setup + @source = REXML::SourceFactory.create_from('') + end + + sub_test_case("#read_until") do + test("eof") do + assert_true(@source.read("nonexistent")) # Consume all data + assert_false(@source.read("nonexistent")) # Set EOF + assert_equal('', @source.read_until(">")) + end + end + end +end From b5b148ed3c8a02fb53e971e312cee94b5301555a Mon Sep 17 00:00:00 2001 From: NAITOH Jun Date: Sun, 7 Sep 2025 16:11:36 +0900 Subject: [PATCH 80/91] The Zlib::GzipReader in JRuby does not behave as expected with REXML, so the test is skipped (#292) ## Why? JRuby's `Zlib::GzipReader#readline` does not support arguments, so when using `Zlib::GzipReader.open`, it always returns an empty string and does not function. We will temporarily exclude `Zlib::GzipReader` from test coverage. ``` $ test/run.rb --location test/test_order.rb:49 Failure: test_more_ordering(REXMLTests::OrderTester) /Users/naitoh/ghq/github.com/naitoh/rexml/test/test_order.rb:110:in `test_more_ordering' 107: count += 1 108: } 109: => 110: assert_equal( actual.size, count ) 111: end if defined?(Zlib::GzipReader) 112: end 113: end <46> expected but was <0> ``` See: https://github.com/jruby/jruby/issues/8997 --- test/test_order.rb | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/test/test_order.rb b/test/test_order.rb index f3f9cc5d..9d08ea1e 100644 --- a/test/test_order.rb +++ b/test/test_order.rb @@ -47,6 +47,8 @@ def test_order end # Provided by Tom Talbott def test_more_ordering + omit("not supported on JRuby") if RUBY_ENGINE == "jruby" + doc = Zlib::GzipReader.open(fixture_path('LostineRiver.kml.gz'), encoding: 'utf-8') do |f| REXML::Document.new(f) end @@ -104,6 +106,8 @@ def test_more_ordering assert_equal( actual[count], n ) unless n =~ /Arrive at/ count += 1 } + + assert_equal( actual.size, count ) end if defined?(Zlib::GzipReader) end end From 6ba286cfd402e4040627615e2623f5b097261543 Mon Sep 17 00:00:00 2001 From: NAITOH Jun Date: Sun, 7 Sep 2025 16:51:14 +0900 Subject: [PATCH 81/91] Reject no root element XML as an invalid XML (#291) ## Why? GitHub: fix https://github.com/ruby/rexml/issues/289 We must reject all well-formed XMLs: https://www.w3.org/TR/2006/REC-xml11-20060816/#proc-types > Validating and non-validating processors alike MUST report violations of this specification's well-formedness constraints in the content of the [document entity](https://www.w3.org/TR/2006/REC-xml11-20060816/#dt-docent) and any other [parsed entities](https://www.w3.org/TR/2006/REC-xml11-20060816/#dt-parsedent) that they read. No root element XML is not well-formed because `document` requires one `element`: https://www.w3.org/TR/2006/REC-xml11-20060816/#sec-well-formed > [1] document ::= ( [prolog](https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-prolog) [element](https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-element) [Misc](https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-Misc)* ) - ( [Char](https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-Char)* [RestrictedChar](https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-RestrictedChar) [Char](https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-Char)* ) --- lib/rexml/parsers/baseparser.rb | 5 +++ test/parse/test_attribute_list_declaration.rb | 6 +-- test/parse/test_comment.rb | 2 +- test/parse/test_element.rb | 39 +++++++++++++++++++ test/parse/test_entity_declaration.rb | 8 ++-- test/parse/test_processing_instruction.rb | 4 +- test/test_contrib.rb | 2 +- test/test_core.rb | 18 ++++----- test/test_document.rb | 6 +-- test/test_entity.rb | 2 +- 10 files changed, 66 insertions(+), 26 deletions(-) diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb index 9304e96d..8fe287a7 100644 --- a/lib/rexml/parsers/baseparser.rb +++ b/lib/rexml/parsers/baseparser.rb @@ -266,6 +266,11 @@ def pull_event path = "/" + @tags.join("/") raise ParseException.new("Missing end tag for '#{path}'", @source) end + + unless @document_status == :in_element + raise ParseException.new("Malformed XML: No root element", @source) + end + return [ :end_document ] end return @stack.shift if @stack.size > 0 diff --git a/test/parse/test_attribute_list_declaration.rb b/test/parse/test_attribute_list_declaration.rb index 43882528..4fa54218 100644 --- a/test/parse/test_attribute_list_declaration.rb +++ b/test/parse/test_attribute_list_declaration.rb @@ -10,9 +10,9 @@ class TestParseAttributeListDeclaration < Test::Unit::TestCase def test_linear_performance_space seq = [10000, 50000, 100000, 150000, 200000] assert_linear_performance(seq, rehearsal: 10) do |n| - REXML::Document.new("]>") + " root v CDATA #FIXED \"test\">]>") end end @@ -23,7 +23,7 @@ def test_linear_performance_tab_and_gt "\t" * n + "root value CDATA \"" + ">" * n + - "\">]>") + "\">]>") end end end diff --git a/test/parse/test_comment.rb b/test/parse/test_comment.rb index 6339835d..d00c8f32 100644 --- a/test/parse/test_comment.rb +++ b/test/parse/test_comment.rb @@ -174,7 +174,7 @@ def test_after_root def test_linear_performance_top_level_gt seq = [10000, 50000, 100000, 150000, 200000] assert_linear_performance(seq, rehearsal: 10) do |n| - REXML::Document.new('') + REXML::Document.new('') end end diff --git a/test/parse/test_element.rb b/test/parse/test_element.rb index f07a7d5a..de66447d 100644 --- a/test/parse/test_element.rb +++ b/test/parse/test_element.rb @@ -12,6 +12,45 @@ def parse(xml) end class TestInvalid < self + def test_top_level_no_tag + exception = assert_raise(REXML::ParseException) do + parse("") + end + assert_equal(<<-DETAIL.chomp, exception.to_s) +Malformed XML: No root element +Line: 0 +Position: 0 +Last 80 unconsumed characters: + + DETAIL + end + + def test_top_level_no_tag_with_xml_declaration + exception = assert_raise(REXML::ParseException) do + parse("") + end + assert_equal(<<-DETAIL.chomp, exception.to_s) +Malformed XML: No root element +Line: 1 +Position: 21 +Last 80 unconsumed characters: + + DETAIL + end + + def test_top_level_no_tag_with_comment + exception = assert_raise(REXML::ParseException) do + parse("") + end + assert_equal(<<-DETAIL.chomp, exception.to_s) +Malformed XML: No root element +Line: 1 +Position: 16 +Last 80 unconsumed characters: + + DETAIL + end + def test_top_level_end_tag exception = assert_raise(REXML::ParseException) do parse("") diff --git a/test/parse/test_entity_declaration.rb b/test/parse/test_entity_declaration.rb index 81d95b58..0cbd4550 100644 --- a/test/parse/test_entity_declaration.rb +++ b/test/parse/test_entity_declaration.rb @@ -523,7 +523,7 @@ def test_linear_performance_entity_value_gt assert_linear_performance(seq, rehearsal: 10) do |n| REXML::Document.new("" * n + - "\">]>") + "\">]>") end end @@ -532,7 +532,7 @@ def test_linear_performance_entity_value_gt_right_bracket assert_linear_performance(seq, rehearsal: 10) do |n| REXML::Document.new("]" * n + - "\">]>") + "\">]>") end end @@ -541,7 +541,7 @@ def test_linear_performance_system_literal_in_system_gt_right_bracket assert_linear_performance(seq, rehearsal: 10) do |n| REXML::Document.new("]" * n + - "\">]>") + "\">]>") end end @@ -550,7 +550,7 @@ def test_linear_performance_system_literal_in_public_gt_right_bracket assert_linear_performance(seq, rehearsal: 10) do |n| REXML::Document.new("]" * n + - "\">]>") + "\">]>") end end end diff --git a/test/parse/test_processing_instruction.rb b/test/parse/test_processing_instruction.rb index 70d17747..723a8cd0 100644 --- a/test/parse/test_processing_instruction.rb +++ b/test/parse/test_processing_instruction.rb @@ -237,14 +237,14 @@ def test_content_question def test_linear_performance_gt seq = [10000, 50000, 100000, 150000, 200000] assert_linear_performance(seq, rehearsal: 10) do |n| - REXML::Document.new("" * n + " ?>") + REXML::Document.new("" * n + " ?>") end end def test_linear_performance_tab seq = [10000, 50000, 100000, 150000, 200000] assert_linear_performance(seq, rehearsal: 10) do |n| - REXML::Document.new(" ?>") + REXML::Document.new(" ?>") end end end diff --git a/test/test_contrib.rb b/test/test_contrib.rb index 23ee35b1..0b66672c 100644 --- a/test/test_contrib.rb +++ b/test/test_contrib.rb @@ -472,7 +472,7 @@ def test_maintain_dtd %extern-packages; %extern-common; -]>} +]>} doc = Document.new( src ) doc.write( out="" ) src = src.tr('"', "'") diff --git a/test/test_core.rb b/test/test_core.rb index 651056f2..c998490e 100644 --- a/test/test_core.rb +++ b/test/test_core.rb @@ -329,7 +329,7 @@ def test_instruction REXML::Formatters::Default.new.write( instruction, out = "" ) assert_equal(source, out) - d = Document.new( source ) + d = Document.new( source + "") instruction2 = d[0] assert_equal(instruction.to_s, instruction2.to_s) @@ -875,7 +875,7 @@ def test_entities def test_element_decl element_decl = Source.new(" -]>") +]>") doc = Document.new( element_decl ) d = doc[0] assert_equal("", d.to_s.split(/\n/)[1].strip) @@ -1329,7 +1329,7 @@ def test_ticket_53 end def test_ticket_52 - source = "" + source = "" d = REXML::Document.new(source) d.write(k="") assert_equal( source, k ) @@ -1408,10 +1408,10 @@ def test_ticket_48_part_II end def test_ticket_88 - doc = REXML::Document.new("") - assert_equal("", doc.to_s) - doc = REXML::Document.new("") - assert_equal("", doc.to_s) + doc = REXML::Document.new("") + assert_equal("", doc.to_s) + doc = REXML::Document.new("") + assert_equal("", doc.to_s) end def test_ticket_85 @@ -1550,10 +1550,6 @@ def test_ticket_138 REXML::Document.new(doc.root.to_s).root.attributes.to_h) end - def test_empty_doc - assert(REXML::Document.new('').children.empty?) - end - private def attribute(name, value) REXML::Attribute.new(name, value) diff --git a/test/test_document.rb b/test/test_document.rb index 39b6c337..71fed190 100644 --- a/test/test_document.rb +++ b/test/test_document.rb @@ -151,11 +151,11 @@ def test_tag_in_cdata_with_not_ascii_only_but_ascii8bit_encoding_source def test_xml_declaration_standalone bug2539 = '[ruby-core:27345]' - doc = REXML::Document.new('') + doc = REXML::Document.new('') assert_equal('no', doc.stand_alone?, bug2539) - doc = REXML::Document.new('') + doc = REXML::Document.new('') assert_equal('no', doc.stand_alone?, bug2539) - doc = REXML::Document.new('') + doc = REXML::Document.new('') assert_equal('no', doc.stand_alone?, bug2539) end diff --git a/test/test_entity.rb b/test/test_entity.rb index 89f83894..03f268dd 100644 --- a/test/test_entity.rb +++ b/test/test_entity.rb @@ -79,7 +79,7 @@ def test_constructor - ]>} + ]>} d = REXML::Document.new( source ) dt = d.doctype From 822530c70f898bd33c075622fe068c6d6433e6c9 Mon Sep 17 00:00:00 2001 From: NAITOH Jun Date: Sun, 7 Sep 2025 17:46:02 +0900 Subject: [PATCH 82/91] Add 3.4.3 entry (#293) --- NEWS.md | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/NEWS.md b/NEWS.md index 313b07d5..7ee78b15 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,31 @@ # News +## 3.4.3 - 2025-09-07 {#version-3-4-3} + +### Improvement + + * Reject no root element XML as an invalid XML + * GH-289 + * GH-291 + * Patch by NAITOH Jun + * Reported by Sutou Kouhei + +### Fixes + + * Fixed an issue with `IOSource#read_until` when reaching the end of a file + * GH-287 + * GH-288 + * Patch by NAITOH Jun + * Reported by Jason Thomas + +### Thanks + + * NAITOH Jun + + * Sutou Kouhei + + * Jason Thomas + ## 3.4.2 - 2025-08-26 {#version-3-4-2} ### Improvement From 4ffe211b501614e769a8bf37d63a7037bb5d2e73 Mon Sep 17 00:00:00 2001 From: NAITOH Jun Date: Sun, 7 Sep 2025 18:06:08 +0900 Subject: [PATCH 83/91] Bump version --- lib/rexml/rexml.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/rexml/rexml.rb b/lib/rexml/rexml.rb index 91d2a6ff..226e1986 100644 --- a/lib/rexml/rexml.rb +++ b/lib/rexml/rexml.rb @@ -31,7 +31,7 @@ module REXML COPYRIGHT = "Copyright © 2001-2008 Sean Russell " DATE = "2008/019" - VERSION = "3.4.3" + VERSION = "3.4.4" REVISION = "" Copyright = COPYRIGHT From 37cde3f4e660f9748f90a933daf7a9e51337d013 Mon Sep 17 00:00:00 2001 From: NAITOH Jun Date: Wed, 10 Sep 2025 10:23:35 +0900 Subject: [PATCH 84/91] Accept `REXML::Document.new("")` for backward compatibility (#295) ## Why? GitHub: Fix GH-296 XML without a root element is invalid XML, but to maintain backward compatibility, we will change the behavior of `REXML::Document.new("")` to be the same as `REXML::Document.new(nil)`. Reported by Joe Rafaniello. Thanks!!! --- lib/rexml/document.rb | 4 +++- test/parse/test_element.rb | 6 +++--- test/test_core.rb | 20 ++++++++++++++++++++ 3 files changed, 26 insertions(+), 4 deletions(-) diff --git a/lib/rexml/document.rb b/lib/rexml/document.rb index 96ae5b75..bd3a0045 100644 --- a/lib/rexml/document.rb +++ b/lib/rexml/document.rb @@ -95,7 +95,9 @@ def initialize( source = nil, context = {} ) @entity_expansion_text_limit = Security.entity_expansion_text_limit super() @context = context - return if source.nil? + # `source = ""` is an invalid usage because no root element XML is an invalid XML. + # But we accept `""` for backward compatibility. + return if source.nil? or source == "" if source.kind_of? Document @context = source.context super source diff --git a/test/parse/test_element.rb b/test/parse/test_element.rb index de66447d..d1c1c8d3 100644 --- a/test/parse/test_element.rb +++ b/test/parse/test_element.rb @@ -14,12 +14,12 @@ def parse(xml) class TestInvalid < self def test_top_level_no_tag exception = assert_raise(REXML::ParseException) do - parse("") + parse(" ") end assert_equal(<<-DETAIL.chomp, exception.to_s) Malformed XML: No root element -Line: 0 -Position: 0 +Line: 1 +Position: 1 Last 80 unconsumed characters: DETAIL diff --git a/test/test_core.rb b/test/test_core.rb index c998490e..beaaeed7 100644 --- a/test/test_core.rb +++ b/test/test_core.rb @@ -1550,6 +1550,26 @@ def test_ticket_138 REXML::Document.new(doc.root.to_s).root.attributes.to_h) end + def test_document_with_no_arguments + assert do + REXML::Document.new(nil).children.empty? + end + end + + def test_document_with_nil_arguments + assert do + REXML::Document.new(nil).children.empty? + end + end + + def test_empty_doc + # This is an invalid usage because no root element XML is an invalid XML. + # But we accept `""` for backward compatibility. + assert do + REXML::Document.new('').children.empty? + end + end + private def attribute(name, value) REXML::Attribute.new(name, value) From 4f32ea33bc3f71cced67487659beef58edcf6d56 Mon Sep 17 00:00:00 2001 From: NAITOH Jun Date: Wed, 10 Sep 2025 20:41:35 +0900 Subject: [PATCH 85/91] Add 3.4.4 entry (#297) --- NEWS.md | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/NEWS.md b/NEWS.md index 7ee78b15..14627121 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,21 @@ # News +## 3.4.4 - 2025-09-10 {#version-3-4-4} + +### Improvement + + * Accept `REXML::Document.new("")` for backward compatibility + * GH-296 + * GH-295 + * Patch by NAITOH Jun + * Reported by Joe Rafaniello + +### Thanks + + * NAITOH Jun + + * Joe Rafaniello + ## 3.4.3 - 2025-09-07 {#version-3-4-3} ### Improvement From 39e7e28f79705d8e50513330f26e67195bece819 Mon Sep 17 00:00:00 2001 From: NAITOH Jun Date: Thu, 11 Sep 2025 07:20:45 +0900 Subject: [PATCH 86/91] Bump version --- lib/rexml/rexml.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/rexml/rexml.rb b/lib/rexml/rexml.rb index 226e1986..cd865f25 100644 --- a/lib/rexml/rexml.rb +++ b/lib/rexml/rexml.rb @@ -31,7 +31,7 @@ module REXML COPYRIGHT = "Copyright © 2001-2008 Sean Russell " DATE = "2008/019" - VERSION = "3.4.4" + VERSION = "3.4.5" REVISION = "" Copyright = COPYRIGHT From 5474ab2ac700234289e6b3890ec51fc2c32a42b6 Mon Sep 17 00:00:00 2001 From: NAITOH Jun Date: Sun, 14 Sep 2025 20:59:54 +0900 Subject: [PATCH 87/91] Use more StringScanner based API for parse_id (#299) ## Why? Improve maintainability by optimizing the process so that the parsing process proceeds using StringScanner. --- lib/rexml/parsers/baseparser.rb | 92 +++++++------------- test/parse/test_document_type_declaration.rb | 40 +++++++-- test/parse/test_notation_declaration.rb | 38 ++++---- 3 files changed, 87 insertions(+), 83 deletions(-) diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb index 8fe287a7..0b48ec14 100644 --- a/lib/rexml/parsers/baseparser.rb +++ b/lib/rexml/parsers/baseparser.rb @@ -158,6 +158,9 @@ module Private DEFAULT_ENTITIES_PATTERNS[term] = /&#{term};/ end XML_PREFIXED_NAMESPACE = "http://www.w3.org/XML/1998/namespace" + EXTERNAL_ID_PUBLIC_PATTERN = /\s+#{PUBIDLITERAL}\s+#{SYSTEMLITERAL}/um + EXTERNAL_ID_SYSTEM_PATTERN = /\s+#{SYSTEMLITERAL}/um + PUBLIC_ID_PATTERN = /\s+#{PUBIDLITERAL}/um end private_constant :Private @@ -307,7 +310,6 @@ def pull_event @source.ensure_buffer else id = parse_id(base_error_message, - accept_external_id: true, accept_public_id: false) if id[0] == "SYSTEM" # For backward compatibility @@ -409,7 +411,6 @@ def pull_event end name = parse_name(base_error_message) id = parse_id(base_error_message, - accept_external_id: true, accept_public_id: true) @source.skip_spaces unless @source.match?(">", true) @@ -667,68 +668,41 @@ def parse_name(base_error_message) end def parse_id(base_error_message, - accept_external_id:, accept_public_id:) - if accept_external_id and (md = @source.match(EXTERNAL_ID_PUBLIC, true)) - pubid = system = nil - pubid_literal = md[1] - pubid = pubid_literal[1..-2] if pubid_literal # Remove quote - system_literal = md[2] - system = system_literal[1..-2] if system_literal # Remove quote - ["PUBLIC", pubid, system] - elsif accept_public_id and (md = @source.match(PUBLIC_ID, true)) - pubid = system = nil - pubid_literal = md[1] - pubid = pubid_literal[1..-2] if pubid_literal # Remove quote - ["PUBLIC", pubid, nil] - elsif accept_external_id and (md = @source.match(EXTERNAL_ID_SYSTEM, true)) - system = nil - system_literal = md[1] - system = system_literal[1..-2] if system_literal # Remove quote - ["SYSTEM", nil, system] - else - details = parse_id_invalid_details(accept_external_id: accept_external_id, - accept_public_id: accept_public_id) - message = "#{base_error_message}: #{details}" - raise REXML::ParseException.new(message, @source) - end - end - - def parse_id_invalid_details(accept_external_id:, - accept_public_id:) - public = /\A\s*PUBLIC/um - system = /\A\s*SYSTEM/um - if (accept_external_id or accept_public_id) and @source.match?(/#{public}/um) - if @source.match?(/#{public}(?:\s+[^'"]|\s*[\[>])/um) - return "public ID literal is missing" - end - unless @source.match?(/#{public}\s+#{PUBIDLITERAL}/um) - return "invalid public ID literal" - end - if accept_public_id - if @source.match?(/#{public}\s+#{PUBIDLITERAL}\s+[^'"]/um) - return "system ID literal is missing" - end - unless @source.match?(/#{public}\s+#{PUBIDLITERAL}\s+#{SYSTEMLITERAL}/um) - return "invalid system literal" - end - "garbage after system literal" + @source.skip_spaces + if @source.match?("PUBLIC", true) + if (md = @source.match(Private::EXTERNAL_ID_PUBLIC_PATTERN, true)) + pubid = system = nil + pubid_literal = md[1] + pubid = pubid_literal[1..-2] if pubid_literal # Remove quote + system_literal = md[2] + system = system_literal[1..-2] if system_literal # Remove quote + ["PUBLIC", pubid, system] + elsif accept_public_id and (md = @source.match(Private::PUBLIC_ID_PATTERN, true)) + pubid = system = nil + pubid_literal = md[1] + pubid = pubid_literal[1..-2] if pubid_literal # Remove quote + ["PUBLIC", pubid, nil] + elsif @source.match?(/(?:\s+[^'"]|\s*[\[>])/um) + raise REXML::ParseException.new("#{base_error_message}: public ID literal is missing", @source) + elsif !@source.match?(Private::PUBLIC_ID_PATTERN) + raise REXML::ParseException.new("#{base_error_message}: invalid public ID literal", @source) else - "garbage after public ID literal" + raise REXML::ParseException.new("#{base_error_message}: garbage after public ID literal", @source) end - elsif accept_external_id and @source.match?(/#{system}/um) - if @source.match?(/#{system}(?:\s+[^'"]|\s*[\[>])/um) - return "system literal is missing" - end - unless @source.match?(/#{system}\s+#{SYSTEMLITERAL}/um) - return "invalid system literal" + elsif @source.match?("SYSTEM", true) + if (md = @source.match(Private::EXTERNAL_ID_SYSTEM_PATTERN, true)) + system = nil + system_literal = md[1] + system = system_literal[1..-2] if system_literal # Remove quote + ["SYSTEM", nil, system] + elsif @source.match?(/(?:\s+[^'"]|\s*[\[>])/um) + raise REXML::ParseException.new("#{base_error_message}: system literal is missing", @source) + else + raise REXML::ParseException.new("#{base_error_message}: invalid system literal", @source) end - "garbage after system literal" else - unless @source.match?(/\A\s*(?:PUBLIC|SYSTEM)\s/um) - return "invalid ID type" - end - "ID type is missing" + raise REXML::ParseException.new("#{base_error_message}: invalid ID type", @source) end end diff --git a/test/parse/test_document_type_declaration.rb b/test/parse/test_document_type_declaration.rb index d4658b9e..cc37ad3f 100644 --- a/test/parse/test_document_type_declaration.rb +++ b/test/parse/test_document_type_declaration.rb @@ -153,7 +153,22 @@ def test_no_literal Line: 3 Position: 26 Last 80 unconsumed characters: -SYSTEM> +> + DETAIL + end + + def test_garbage_invalid_system_literal + exception = assert_raise(REXML::ParseException) do + parse(<<-DOCTYPE) + DETAIL end @@ -165,10 +180,10 @@ def test_garbage_after_literal end assert_equal(<<-DETAIL.chomp, exception.to_s) Malformed DOCTYPE: garbage after external ID -Line: 3 -Position: 36 +Line: 1 +Position: 29 Last 80 unconsumed characters: -x'> +x'> DETAIL end @@ -200,7 +215,7 @@ def test_content_double_quote Line: 3 Position: 62 Last 80 unconsumed characters: -PUBLIC 'double quote " is invalid' "r.dtd"> + 'double quote " is invalid' "r.dtd"> DETAIL end @@ -220,6 +235,21 @@ def test_double_quote end class TestSystemLiteral < self + def test_garbage_after_public_ID_literal + exception = assert_raise(REXML::ParseException) do + parse(<<-DOCTYPE) + + DOCTYPE + end + assert_equal(<<-DETAIL.chomp, exception.to_s) +Malformed DOCTYPE: garbage after public ID literal +Line: 3 +Position: 54 +Last 80 unconsumed characters: + "public-id-literal" 'system> + DETAIL + end + def test_garbage_after_literal exception = assert_raise(REXML::ParseException) do parse(<<-DOCTYPE) diff --git a/test/parse/test_notation_declaration.rb b/test/parse/test_notation_declaration.rb index 9e81b6a4..11914d37 100644 --- a/test/parse/test_notation_declaration.rb +++ b/test/parse/test_notation_declaration.rb @@ -32,10 +32,10 @@ def test_no_name end assert_equal(<<-DETAIL.chomp, exception.to_s) Malformed notation declaration: name is missing -Line: 5 -Position: 72 +Line: 2 +Position: 62 Last 80 unconsumed characters: - ]> + DETAIL end @@ -62,10 +62,10 @@ def test_no_id_type end assert_equal(<<-DETAIL.chomp, exception.to_s) Malformed notation declaration: invalid ID type -Line: 5 -Position: 77 +Line: 2 +Position: 67 Last 80 unconsumed characters: -> ]> +> DETAIL end @@ -77,10 +77,10 @@ def test_invalid_id_type end assert_equal(<<-DETAIL.chomp, exception.to_s) Malformed notation declaration: invalid ID type -Line: 5 -Position: 85 +Line: 2 +Position: 75 Last 80 unconsumed characters: - INVALID> ]> +INVALID> DETAIL end end @@ -98,7 +98,7 @@ def test_no_literal Line: 5 Position: 84 Last 80 unconsumed characters: - SYSTEM> ]> +> ]> DETAIL end @@ -110,10 +110,10 @@ def test_garbage_after_literal end assert_equal(<<-DETAIL.chomp, exception.to_s) Malformed notation declaration: garbage before end > -Line: 5 -Position: 103 +Line: 2 +Position: 93 Last 80 unconsumed characters: -x'> ]> +x'> DETAIL end @@ -145,7 +145,7 @@ def test_content_double_quote Line: 5 Position: 129 Last 80 unconsumed characters: - PUBLIC 'double quote " is invalid' "system-literal"> ]> + 'double quote " is invalid' "system-literal"> ]> DETAIL end @@ -173,10 +173,10 @@ def test_garbage_after_literal end assert_equal(<<-DETAIL.chomp, exception.to_s) Malformed notation declaration: garbage before end > -Line: 5 -Position: 123 +Line: 2 +Position: 113 Last 80 unconsumed characters: -x'> ]> +x'> DETAIL end @@ -229,7 +229,7 @@ def test_no_literal Line: 5 Position: 84 Last 80 unconsumed characters: - PUBLIC> ]> +> ]> DETAIL end @@ -244,7 +244,7 @@ def test_literal_content_double_quote Line: 5 Position: 128 Last 80 unconsumed characters: - PUBLIC 'double quote \" is invalid in PubidLiteral'> ]> + 'double quote \" is invalid in PubidLiteral'> ]> DETAIL end From fd7735d5ac7eda93587872d43d080216d8e82068 Mon Sep 17 00:00:00 2001 From: NAITOH Jun Date: Wed, 17 Sep 2025 11:09:24 +0900 Subject: [PATCH 88/91] Optimization of REXML::Parsers::SAX2Parser#get_namespace (#300) ## Benchmark ``` before after before(YJIT) after(YJIT) sax 1.415k 2.297k 1.755k 2.958k i/s - 100.000 times in 0.070687s 0.043539s 0.056966s 0.033810s Comparison: sax after(YJIT): 2957.7 i/s after: 2296.8 i/s - 1.29x slower before(YJIT): 1755.4 i/s - 1.68x slower before: 1414.7 i/s - 2.09x slower ``` - YJIT=ON : 1.68x faster - YJIT=OFF : 1.62x faster --- benchmark/sax.yaml | 34 +++++++++++++++++++++++++++++++++ lib/rexml/parsers/sax2parser.rb | 11 ++++------- 2 files changed, 38 insertions(+), 7 deletions(-) create mode 100644 benchmark/sax.yaml diff --git a/benchmark/sax.yaml b/benchmark/sax.yaml new file mode 100644 index 00000000..1cc27d7e --- /dev/null +++ b/benchmark/sax.yaml @@ -0,0 +1,34 @@ +loop_count: 100 +contexts: + - gems: + rexml: 3.2.6 + require: false + prelude: require 'rexml' + - name: master + prelude: | + $LOAD_PATH.unshift(File.expand_path("lib")) + require 'rexml' + - name: 3.2.6(YJIT) + gems: + rexml: 3.2.6 + require: false + prelude: | + require 'rexml' + RubyVM::YJIT.enable + - name: master(YJIT) + prelude: | + $LOAD_PATH.unshift(File.expand_path("lib")) + require 'rexml' + RubyVM::YJIT.enable + +prelude: | + require 'rexml/parsers/sax2parser' + + DEPTH = 100 + xml = '' * DEPTH + '' * DEPTH + +benchmark: + 'sax' : | + parser = REXML::Parsers::SAX2Parser.new(xml) + parser.listen(:start_element){|uri, localname, qname, attrs|} + parser.parse diff --git a/lib/rexml/parsers/sax2parser.rb b/lib/rexml/parsers/sax2parser.rb index a51477de..79838528 100644 --- a/lib/rexml/parsers/sax2parser.rb +++ b/lib/rexml/parsers/sax2parser.rb @@ -12,7 +12,7 @@ def initialize source @parser = BaseParser.new(source) @listeners = [] @procs = [] - @namespace_stack = [] + @namespace_stack = [{}] @has_listeners = false @tag_stack = [] @entities = {} @@ -122,7 +122,7 @@ def parse event[2].each {|n, v| event[2][n] = @parser.normalize(v)} nsdecl = event[2].find_all { |n, value| n =~ /^xmlns(:|$)/ } nsdecl.collect! { |n, value| [ n[6..-1], value ] } - @namespace_stack.push({}) + @namespace_stack.push(@namespace_stack[-1].dup) nsdecl.each do |n,v| @namespace_stack[-1][n] = v # notify observers of namespaces @@ -259,11 +259,8 @@ def add( pair ) end def get_namespace( prefix ) - return nil if @namespace_stack.empty? - - uris = (@namespace_stack.find_all { |ns| not ns[prefix].nil? }) || - (@namespace_stack.find { |ns| not ns[nil].nil? }) - uris[-1][prefix] unless uris.nil? or 0 == uris.size + current_namespace = @namespace_stack[-1] + current_namespace[prefix] || current_namespace[nil] end end end From aae683cc66c764061a4f1ce10646f310e9a26adf Mon Sep 17 00:00:00 2001 From: NAITOH Jun Date: Sun, 21 Sep 2025 14:51:49 +0900 Subject: [PATCH 89/91] Fixed an issue when executing `REXML::Parsers::SAX2Parser#parse` without specifying `#listen` (#302) ## Why? ``` > require 'rexml/parsers/sax2parser' > REXML::Parsers::SAX2Parser.new("").parse lib/rexml/parsers/sax2parser.rb:263:in 'REXML::Parsers::SAX2Parser#get_namespace': undefined method '[]' for nil (NoMethodError) current_namespace[prefix] || current_namespace[nil] ^^^^^^^^ from lib/rexml/parsers/sax2parser.rb:150:in 'REXML::Parsers::SAX2Parser#parse' ``` --- lib/rexml/parsers/sax2parser.rb | 2 +- test/test_sax.rb | 6 ++++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/lib/rexml/parsers/sax2parser.rb b/lib/rexml/parsers/sax2parser.rb index 79838528..f12494d3 100644 --- a/lib/rexml/parsers/sax2parser.rb +++ b/lib/rexml/parsers/sax2parser.rb @@ -157,7 +157,7 @@ def parse ob.end_element( uri, local, event[1] ) } if listeners - namespace_mapping = @namespace_stack.pop + namespace_mapping = @namespace_stack.pop if procs or listeners # find the observers for namespaces procs = get_procs( :end_prefix_mapping, event[1] ) listeners = get_listeners( :end_prefix_mapping, event[1] ) diff --git a/test/test_sax.rb b/test/test_sax.rb index caec983b..9fa27713 100644 --- a/test/test_sax.rb +++ b/test/test_sax.rb @@ -7,6 +7,12 @@ module REXMLTests class SAX2Tester < Test::Unit::TestCase include Helper::Fixture include REXML + + def test_without_listien + p = Parsers::SAX2Parser.new "" + p.parse + end + def test_characters d = Document.new( "@blah@" ) txt = d.root.text From 8c41d35b62a7dd0b5e9c7ed725ee5f90a2024b6d Mon Sep 17 00:00:00 2001 From: NAITOH Jun Date: Mon, 22 Sep 2025 09:43:31 +0900 Subject: [PATCH 90/91] Optimization of REXML::Parsers::SAX2Parser#get_procs, #get_listeners (#303) ## Benchmark ``` $ benchmark-driver benchmark/sax.yaml before after before(YJIT) after(YJIT) sax 2.309k 2.317k 2.760k 2.879k i/s - 100.000 times in 0.043300s 0.043155s 0.036238s 0.034738s Comparison: sax after(YJIT): 2878.7 i/s before(YJIT): 2759.5 i/s - 1.04x slower after: 2317.2 i/s - 1.24x slower before: 2309.5 i/s - 1.25x slower ``` - YJIT=ON : 1.04x faster - YJIT=OFF : 1.00x faster --- lib/rexml/parsers/sax2parser.rb | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/lib/rexml/parsers/sax2parser.rb b/lib/rexml/parsers/sax2parser.rb index f12494d3..ef1c4d9a 100644 --- a/lib/rexml/parsers/sax2parser.rb +++ b/lib/rexml/parsers/sax2parser.rb @@ -226,12 +226,8 @@ def get_procs( symbol, name ) return nil if @procs.size == 0 @procs.find_all do |sym, match, block| ( - (sym.nil? or symbol == sym) and - ((name.nil? and match.nil?) or match.nil? or ( - (name == match) or - (match.kind_of? Regexp and name =~ match) - ) - ) + (sym.nil? or (symbol == sym)) and + (match.nil? or (name == match) or (match.kind_of? Regexp and match.match?(name))) ) end.collect{|x| x[-1]} end @@ -239,12 +235,8 @@ def get_listeners( symbol, name ) return nil if @listeners.size == 0 @listeners.find_all do |sym, match, block| ( - (sym.nil? or symbol == sym) and - ((name.nil? and match.nil?) or match.nil? or ( - (name == match) or - (match.kind_of? Regexp and name =~ match) - ) - ) + (sym.nil? or (symbol == sym)) and + (match.nil? or (name == match) or (match.kind_of? Regexp and match.match?(name))) ) end.collect{|x| x[-1]} end From 6c1814718eb26080f94633f0ec9b08bd69382cdc Mon Sep 17 00:00:00 2001 From: NAITOH Jun Date: Tue, 23 Sep 2025 06:21:39 +0900 Subject: [PATCH 91/91] Fix invalid XML without root element in Benchmark (#304) ## Why? A fix was missed in https://github.com/ruby/rexml/pull/291. --- benchmark/parse_comment.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmark/parse_comment.yaml b/benchmark/parse_comment.yaml index a0a3a771..b9d7fdc0 100644 --- a/benchmark/parse_comment.yaml +++ b/benchmark/parse_comment.yaml @@ -26,8 +26,8 @@ prelude: | SIZE = 100000 - top_level_xml = "\n" - in_doctype_xml = "]>" + top_level_xml = "\n" + in_doctype_xml = "]>" after_doctype_xml = "" benchmark: