Skip to content

Commit

Permalink
Fix a bug that invalid document declaration may be accepted
Browse files Browse the repository at this point in the history
HackerOne: HO-1104077

It's caused by ignoring garbage before "\n<!DOCTYPE..." and after
"<!DOCTYPE\n".

Reported by Juho Nurminen. Thanks!!!
  • Loading branch information
kou authored and mame committed Apr 5, 2021
1 parent f9d88e4 commit 9b311e5
Show file tree
Hide file tree
Showing 3 changed files with 326 additions and 95 deletions.
200 changes: 126 additions & 74 deletions lib/rexml/parsers/baseparser.rb
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,6 @@ class BaseParser

DOCTYPE_START = /\A\s*<!DOCTYPE\s/um
DOCTYPE_END = /\A\s*\]\s*>/um
DOCTYPE_PATTERN = /\s*<!DOCTYPE\s+(.*?)(\[|>)/um
ATTRIBUTE_PATTERN = /\s*(#{QNAME_STR})\s*=\s*(["'])(.*?)\4/um
COMMENT_START = /\A<!--/u
COMMENT_PATTERN = /<!--(.*?)-->/um
Expand All @@ -69,7 +68,6 @@ class BaseParser
STANDALONE = /\bstandalone\s*=\s*["'](.*?)['"]/um

ENTITY_START = /\A\s*<!ENTITY/
IDENTITY = /^([!\*\w\-]+)(\s+#{NCNAME_STR})?(\s+["'](.*?)['"])?(\s+['"](.*?)["'])?/u
ELEMENTDECL_START = /\A\s*<!ELEMENT/um
ELEMENTDECL_PATTERN = /\A\s*(<!ELEMENT.*?)>/um
SYSTEMENTITY = /\A\s*(%.*?;)\s*$/um
Expand Down Expand Up @@ -101,8 +99,9 @@ class BaseParser
ENTITYDECL = /\s*(?:#{GEDECL})|(?:#{PEDECL})/um

NOTATIONDECL_START = /\A\s*<!NOTATION/um
PUBLIC = /\A\s*<!NOTATION\s+#{NAME}\s+(PUBLIC)\s+#{PUBIDLITERAL}(?:\s+#{SYSTEMLITERAL})?\s*>/um
SYSTEM = /\A\s*<!NOTATION\s+#{NAME}\s+(SYSTEM)\s+#{SYSTEMLITERAL}\s*>/um
EXTERNAL_ID_PUBLIC = /\A\s*PUBLIC\s+#{PUBIDLITERAL}\s+#{SYSTEMLITERAL}\s*/um
EXTERNAL_ID_SYSTEM = /\A\s*SYSTEM\s+#{SYSTEMLITERAL}\s*/um
PUBLIC_ID = /\A\s*PUBLIC\s+#{PUBIDLITERAL}\s*/um

EREFERENCE = /&(?!#{NAME};)/

Expand Down Expand Up @@ -225,24 +224,37 @@ def pull_event
when INSTRUCTION_START
return process_instruction
when DOCTYPE_START
md = @source.match( DOCTYPE_PATTERN, true )
base_error_message = "Malformed DOCTYPE"
@source.match(DOCTYPE_START, true)
@nsstack.unshift(curr_ns=Set.new)
identity = md[1]
close = md[2]
identity =~ IDENTITY
name = $1
raise REXML::ParseException.new("DOCTYPE is missing a name") if name.nil?
pub_sys = $2.nil? ? nil : $2.strip
long_name = $4.nil? ? nil : $4.strip
uri = $6.nil? ? nil : $6.strip
args = [ :start_doctype, name, pub_sys, long_name, uri ]
if close == ">"
name = parse_name(base_error_message)
if @source.match(/\A\s*\[/um, true)
id = [nil, nil, nil]
@document_status = :in_doctype
elsif @source.match(/\A\s*>/um, true)
id = [nil, nil, nil]
@document_status = :after_doctype
@source.read if @source.buffer.size<2
md = @source.match(/^\s*/um, true)
@stack << [ :end_doctype ]
else
@document_status = :in_doctype
id = parse_id(base_error_message,
accept_external_id: true,
accept_public_id: false)
if id[0] == "SYSTEM"
# For backward compatibility
id[1], id[2] = id[2], nil
end
if @source.match(/\A\s*\[/um, true)
@document_status = :in_doctype
elsif @source.match(/\A\s*>/um, true)
@document_status = :after_doctype
else
message = "#{base_error_message}: garbage after external ID"
raise REXML::ParseException.new(message, @source)
end
end
args = [:start_doctype, name, *id]
if @document_status == :after_doctype
@source.match(/\A\s*/um, true)
@stack << [ :end_doctype ]
end
return args
when /^\s+/
Expand Down Expand Up @@ -313,27 +325,24 @@ def pull_event
end
return [ :attlistdecl, element, pairs, contents ]
when NOTATIONDECL_START
md = nil
if @source.match( PUBLIC )
md = @source.match( PUBLIC, true )
pubid = system = nil
pubid_literal = md[3]
pubid = pubid_literal[1..-2] if pubid_literal # Remove quote
system_literal = md[4]
system = system_literal[1..-2] if system_literal # Remove quote
vals = [md[1], md[2], pubid, system]
elsif @source.match( SYSTEM )
md = @source.match( SYSTEM, true )
system = nil
system_literal = md[3]
system = system_literal[1..-2] if system_literal # Remove quote
vals = [md[1], md[2], nil, system]
else
details = notation_decl_invalid_details
message = "Malformed notation declaration: #{details}"
base_error_message = "Malformed notation declaration"
unless @source.match(/\A\s*<!NOTATION\s+/um, true)
if @source.match(/\A\s*<!NOTATION\s*>/um)
message = "#{base_error_message}: name is missing"
else
message = "#{base_error_message}: invalid declaration name"
end
raise REXML::ParseException.new(message, @source)
end
name = parse_name(base_error_message)
id = parse_id(base_error_message,
accept_external_id: true,
accept_public_id: true)
unless @source.match(/\A\s*>/um, true)
message = "#{base_error_message}: garbage before end >"
raise REXML::ParseException.new(message, @source)
end
return [ :notationdecl, *vals ]
return [:notationdecl, name, *id]
when DOCTYPE_END
@document_status = :after_doctype
@source.match( DOCTYPE_END, true )
Expand Down Expand Up @@ -488,6 +497,85 @@ def need_source_encoding_update?(xml_declaration_encoding)
true
end

def parse_name(base_error_message)
md = @source.match(/\A\s*#{NAME}/um, true)
unless md
if @source.match(/\A\s*\S/um)
message = "#{base_error_message}: invalid name"
else
message = "#{base_error_message}: name is missing"
end
raise REXML::ParseException.new(message, @source)
end
md[1]
end

def parse_id(base_error_message,
accept_external_id:,
accept_public_id:)
if accept_external_id and (md = @source.match(EXTERNAL_ID_PUBLIC, true))
pubid = system = nil
pubid_literal = md[1]
pubid = pubid_literal[1..-2] if pubid_literal # Remove quote
system_literal = md[2]
system = system_literal[1..-2] if system_literal # Remove quote
["PUBLIC", pubid, system]
elsif accept_public_id and (md = @source.match(PUBLIC_ID, true))
pubid = system = nil
pubid_literal = md[1]
pubid = pubid_literal[1..-2] if pubid_literal # Remove quote
["PUBLIC", pubid, nil]
elsif accept_external_id and (md = @source.match(EXTERNAL_ID_SYSTEM, true))
system = nil
system_literal = md[1]
system = system_literal[1..-2] if system_literal # Remove quote
["SYSTEM", nil, system]
else
details = parse_id_invalid_details(accept_external_id: accept_external_id,
accept_public_id: accept_public_id)
message = "#{base_error_message}: #{details}"
raise REXML::ParseException.new(message, @source)
end
end

def parse_id_invalid_details(accept_external_id:,
accept_public_id:)
public = /\A\s*PUBLIC/um
system = /\A\s*SYSTEM/um
if (accept_external_id or accept_public_id) and @source.match(/#{public}/um)
if @source.match(/#{public}(?:\s+[^'"]|\s*[\[>])/um)
return "public ID literal is missing"
end
unless @source.match(/#{public}\s+#{PUBIDLITERAL}/um)
return "invalid public ID literal"
end
if accept_public_id
if @source.match(/#{public}\s+#{PUBIDLITERAL}\s+[^'"]/um)
return "system ID literal is missing"
end
unless @source.match(/#{public}\s+#{PUBIDLITERAL}\s+#{SYSTEMLITERAL}/um)
return "invalid system literal"
end
"garbage after system literal"
else
"garbage after public ID literal"
end
elsif accept_external_id and @source.match(/#{system}/um)
if @source.match(/#{system}(?:\s+[^'"]|\s*[\[>])/um)
return "system literal is missing"
end
unless @source.match(/#{system}\s+#{SYSTEMLITERAL}/um)
return "invalid system literal"
end
"garbage after system literal"
else
unless @source.match(/\A\s*(?:PUBLIC|SYSTEM)\s/um)
return "invalid ID type"
end
"ID type is missing"
end
end

def process_instruction
match_data = @source.match(INSTRUCTION_PATTERN, true)
unless match_data
Expand Down Expand Up @@ -580,42 +668,6 @@ def parse_attributes(prefixes, curr_ns)
end
return attributes, closed
end

def notation_decl_invalid_details
name = /#{NOTATIONDECL_START}\s+#{NAME}/um
public = /#{name}\s+PUBLIC/um
system = /#{name}\s+SYSTEM/um
if @source.match(/#{NOTATIONDECL_START}\s*>/um)
return "name is missing"
elsif not @source.match(/#{name}[\s>]/um)
return "invalid name"
elsif @source.match(/#{name}\s*>/um)
return "ID type is missing"
elsif not @source.match(/#{name}\s+(?:PUBLIC|SYSTEM)[\s>]/um)
return "invalid ID type"
elsif @source.match(/#{public}/um)
if @source.match(/#{public}\s*>/um)
return "public ID literal is missing"
elsif not @source.match(/#{public}\s+#{PUBIDLITERAL}/um)
return "invalid public ID literal"
elsif @source.match(/#{public}\s+#{PUBIDLITERAL}[^\s>]/um)
return "garbage after public ID literal"
elsif not @source.match(/#{public}\s+#{PUBIDLITERAL}\s+#{SYSTEMLITERAL}/um)
return "invalid system literal"
elsif not @source.match(/#{public}\s+#{PUBIDLITERAL}\s+#{SYSTEMLITERAL}\s*>/um)
return "garbage after system literal"
end
elsif @source.match(/#{system}/um)
if @source.match(/#{system}\s*>/um)
return "system literal is missing"
elsif not @source.match(/#{system}\s+#{SYSTEMLITERAL}/um)
return "invalid system literal"
elsif not @source.match(/#{system}\s+#{SYSTEMLITERAL}\s*>/um)
return "garbage after system literal"
end
end
"end > is missing"
end
end
end
end
Expand Down
Loading

0 comments on commit 9b311e5

Please sign in to comment.