From 3da03397ce1dca2eaf21149dcdc88a212b6afdc2 Mon Sep 17 00:00:00 2001 From: akira Date: Wed, 10 Sep 2008 09:34:49 +0000 Subject: * lib/uri/common.rb (URI::Parser): new class. * lib/uri/mailto.rb, lib/uri/generic.rb: follow the above change. * test/uri/test_parser.rb: added tests for URI::Parser. git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@19282 b2dd03c8-39d4-4d8f-98ff-823fe69b080e --- ChangeLog | 8 + lib/uri/common.rb | 530 +++++++++++++++++++++++++++++------------------ lib/uri/generic.rb | 46 ++-- lib/uri/mailto.rb | 14 +- test/uri/test_generic.rb | 101 +++------ test/uri/test_parser.rb | 41 ++++ 6 files changed, 435 insertions(+), 305 deletions(-) create mode 100644 test/uri/test_parser.rb diff --git a/ChangeLog b/ChangeLog index 998204a04b..683545bfad 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,11 @@ +Wed Sep 10 18:25:19 2008 akira yamada + + * lib/uri/common.rb (URI::Parser): new class. + + * lib/uri/mailto.rb, lib/uri/generic.rb: follow the above change. + + * test/uri/test_parser.rb: added tests for URI::Parser. + Wed Sep 10 10:35:32 2008 Takeyuki Fujioka * lib/cgi/cookie.rb (CGI::Cookie#to_s): performance improvement diff --git a/lib/uri/common.rb b/lib/uri/common.rb index f0d68884de..98dda2a350 100644 --- a/lib/uri/common.rb +++ b/lib/uri/common.rb @@ -38,22 +38,232 @@ module URI # "$" | "," | "[" | "]" (RFC 2732) RESERVED = ";/?:@&=+$,\\[\\]" + # domainlabel = alphanum | alphanum *( alphanum | "-" ) alphanum + DOMLABEL = "(?:[#{ALNUM}](?:[-#{ALNUM}]*[#{ALNUM}])?)" + # toplabel = alpha | alpha *( alphanum | "-" ) alphanum + TOPLABEL = "(?:[#{ALPHA}](?:[-#{ALNUM}]*[#{ALNUM}])?)" + # hostname = *( domainlabel "." ) toplabel [ "." ] + HOSTNAME = "(?:#{DOMLABEL}\\.)*#{TOPLABEL}\\.?" + + # :startdoc: + end # PATTERN + + # :startdoc: + end # REGEXP + + class Parser + include REGEXP + + # + # == Synopsis + # + # URI::Parser.new([opts]) + # + # == Args + # + # The constructor accepts a hash as options for parser. + # Keys of options are pattern names of URI components + # and values of options are pattern strings. + # The constructor generetes set of regexps for parsing URIs. + # + # You can use the following keys: + # + # * :ESCAPED (URI::PATTERN::ESCAPED in default) + # * :UNRESERVED (URI::PATTERN::UNRESERVED in default) + # * :DOMLABEL (URI::PATTERN::DOMLABEL in default) + # * :TOPLABEL (URI::PATTERN::TOPLABEL in default) + # * :HOSTNAME (URI::PATTERN::HOSTNAME in default) + # + # == Examples + # + # p = URI::Parser.new(:ESCPAED => "(?:%[a-fA-F0-9]{2}|%u[a-fA-F0-9]{4})" + # u = p.parse("http://example.jp/%uABCD") #=> # + # URI.parse(u.to_s) #=> raises URI::InvalidURIError + # + # s = "http://examle.com/ABCD" + # u1 = p.parse(s) #=> # + # u2 = URI.parse(s) #=> # + # u1 == u2 #=> true + # u1.eql?(u2) #=> false + # + def initialize(opts = {}) + @pattern = initialize_pattern(opts) + @pattern.each_value {|v| v.freeze} + @pattern.freeze + + @regexp = initialize_regexp(@pattern) + @regexp.each_value {|v| v.freeze} + @regexp.freeze + end + attr_reader :pattern, :regexp + + def split(uri) + case uri + when '' + # null uri + + when @regexp[:ABS_URI] + scheme, opaque, userinfo, host, port, + registry, path, query, fragment = $~[1..-1] + + # URI-reference = [ absoluteURI | relativeURI ] [ "#" fragment ] + + # absoluteURI = scheme ":" ( hier_part | opaque_part ) + # hier_part = ( net_path | abs_path ) [ "?" query ] + # opaque_part = uric_no_slash *uric + + # abs_path = "/" path_segments + # net_path = "//" authority [ abs_path ] + + # authority = server | reg_name + # server = [ [ userinfo "@" ] hostport ] + + if !scheme + raise InvalidURIError, + "bad URI(absolute but no scheme): #{uri}" + end + if !opaque && (!path && (!host && !registry)) + raise InvalidURIError, + "bad URI(absolute but no path): #{uri}" + end + + when @regexp[:REL_URI] + scheme = nil + opaque = nil + + userinfo, host, port, registry, + rel_segment, abs_path, query, fragment = $~[1..-1] + if rel_segment && abs_path + path = rel_segment + abs_path + elsif rel_segment + path = rel_segment + elsif abs_path + path = abs_path + end + + # URI-reference = [ absoluteURI | relativeURI ] [ "#" fragment ] + + # relativeURI = ( net_path | abs_path | rel_path ) [ "?" query ] + + # net_path = "//" authority [ abs_path ] + # abs_path = "/" path_segments + # rel_path = rel_segment [ abs_path ] + + # authority = server | reg_name + # server = [ [ userinfo "@" ] hostport ] + + else + raise InvalidURIError, "bad URI(is not URI?): #{uri}" + end + + path = '' if !path && !opaque # (see RFC2396 Section 5.2) + ret = [ + scheme, + userinfo, host, port, # X + registry, # X + path, # Y + opaque, # Y + query, + fragment + ] + return ret + end + + def parse(uri) + scheme, userinfo, host, port, + registry, path, opaque, query, fragment = self.split(uri) + + if scheme && URI.scheme_list.include?(scheme.upcase) + URI.scheme_list[scheme.upcase].new(scheme, userinfo, host, port, + registry, path, opaque, query, + fragment, self) + else + Generic.new(scheme, userinfo, host, port, + registry, path, opaque, query, + fragment, self) + end + end + + def join(*str) + u = self.parse(str[0]) + str[1 .. -1].each do |x| + u = u.merge(x) + end + u + end + + def extract(str, schemes = nil, &block) + if block_given? + str.scan(make_regexp(schemes)) { yield $& } + nil + else + result = [] + str.scan(make_regexp(schemes)) { result.push $& } + result + end + end + + def make_regexp(schemes = nil) + unless schemes + @regexp[:ABS_URI_REF] + else + /(?=#{Regexp.union(*schemes)}:)#{@pattern[:X_ABS_URI]}/x + end + end + + def escape(str, unsafe = @regexp[:UNSAFE]) + unless unsafe.kind_of?(Regexp) + # perhaps unsafe is String object + unsafe = Regexp.new("[#{Regexp.quote(unsafe)}]", false) + end + str.gsub(unsafe) do + us = $& + tmp = '' + us.each_byte do |uc| + tmp << sprintf('%%%02X', uc) + end + tmp + end + end + + def unescape(str, escaped = @regexp[:ESCAPED]) + str.gsub(escaped) { [$&.hex].pack('U') } + end + + @@to_s = Kernel.instance_method(:to_s) + def inspect + @@to_s.bind(self).call + end + + private + + def initialize_pattern(opts = {}) + ret = {} + ret[:ESCAPED] = escaped = (opts.delete(:ESCAPED) || PATTERN::ESCAPED) + ret[:UNRESERVED] = unreserved = opts.delete(:UNRESERVED) || PATTERN::UNRESERVED + ret[:RESERVED] = reserved = opts.delete(:RESERVED) || PATTERN::RESERVED + ret[:DOMLABEL] = domlabel = opts.delete(:DOMLABEL) || PATTERN::DOMLABEL + ret[:TOPLABEL] = toplabel = opts.delete(:TOPLABEL) || PATTERN::TOPLABEL + ret[:HOSTNAME] = hostname = opts.delete(:HOSTNAME) + + # RFC 2396 (URI Generic Syntax) + # RFC 2732 (IPv6 Literal Addresses in URL's) + # RFC 2373 (IPv6 Addressing Architecture) + # uric = reserved | unreserved | escaped - URIC = "(?:[#{UNRESERVED}#{RESERVED}]|#{ESCAPED})" + ret[:URIC] = uric = "(?:[#{unreserved}#{reserved}]|#{escaped})" # uric_no_slash = unreserved | escaped | ";" | "?" | ":" | "@" | # "&" | "=" | "+" | "$" | "," - URIC_NO_SLASH = "(?:[#{UNRESERVED};?:@&=+$,]|#{ESCAPED})" + ret[:URIC_NO_SLASH] = uric_no_slash = "(?:[#{unreserved};?:@&=+$,]|#{escaped})" # query = *uric - QUERY = "#{URIC}*" + ret[:QUERY] = query = "#{uric}*" # fragment = *uric - FRAGMENT = "#{URIC}*" + ret[:FRAGMENT] = fragment = "#{uric}*" - # domainlabel = alphanum | alphanum *( alphanum | "-" ) alphanum - DOMLABEL = "(?:[#{ALNUM}](?:[-#{ALNUM}]*[#{ALNUM}])?)" - # toplabel = alpha | alpha *( alphanum | "-" ) alphanum - TOPLABEL = "(?:[#{ALPHA}](?:[-#{ALNUM}]*[#{ALNUM}])?)" # hostname = *( domainlabel "." ) toplabel [ "." ] - HOSTNAME = "(?:#{DOMLABEL}\\.)*#{TOPLABEL}\\.?" + unless hostname + ret[:HOSTNAME] = hostname = "(?:#{domlabel}\\.)*#{toplabel}\\.?" + end # RFC 2373, APPENDIX B: # IPv6address = hexpart [ ":" IPv4address ] @@ -66,152 +276,164 @@ module URI # allowed too. Here is a replacement. # # IPv4address = 1*3DIGIT "." 1*3DIGIT "." 1*3DIGIT "." 1*3DIGIT - IPV4ADDR = "\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}" + ret[:IPV4ADDR] = ipv4addr = "\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}" # hex4 = 1*4HEXDIG - HEX4 = "[#{HEX}]{1,4}" + hex4 = "[#{PATTERN::HEX}]{1,4}" # lastpart = hex4 | IPv4address - LASTPART = "(?:#{HEX4}|#{IPV4ADDR})" + lastpart = "(?:#{hex4}|#{ipv4addr})" # hexseq1 = *( hex4 ":" ) hex4 - HEXSEQ1 = "(?:#{HEX4}:)*#{HEX4}" + hexseq1 = "(?:#{hex4}:)*#{hex4}" # hexseq2 = *( hex4 ":" ) lastpart - HEXSEQ2 = "(?:#{HEX4}:)*#{LASTPART}" + hexseq2 = "(?:#{hex4}:)*#{lastpart}" # IPv6address = hexseq2 | [ hexseq1 ] "::" [ hexseq2 ] - IPV6ADDR = "(?:#{HEXSEQ2}|(?:#{HEXSEQ1})?::(?:#{HEXSEQ2})?)" + ret[:IPV6ADDR] = ipv6addr = "(?:#{hexseq2}|(?:#{hexseq1})?::(?:#{hexseq2})?)" # IPv6prefix = ( hexseq1 | [ hexseq1 ] "::" [ hexseq1 ] ) "/" 1*2DIGIT # unused # ipv6reference = "[" IPv6address "]" (RFC 2732) - IPV6REF = "\\[#{IPV6ADDR}\\]" + ret[:IPV6REF] = ipv6ref = "\\[#{ipv6addr}\\]" # host = hostname | IPv4address # host = hostname | IPv4address | IPv6reference (RFC 2732) - HOST = "(?:#{HOSTNAME}|#{IPV4ADDR}|#{IPV6REF})" + ret[:HOST] = host = "(?:#{hostname}|#{ipv4addr}|#{ipv6ref})" # port = *digit - PORT = '\d*' + port = '\d*' # hostport = host [ ":" port ] - HOSTPORT = "#{HOST}(?::#{PORT})?" + ret[:HOSTPORT] = hostport = "#{host}(?::#{port})?" # userinfo = *( unreserved | escaped | # ";" | ":" | "&" | "=" | "+" | "$" | "," ) - USERINFO = "(?:[#{UNRESERVED};:&=+$,]|#{ESCAPED})*" + ret[:USERINFO] = userinfo = "(?:[#{unreserved};:&=+$,]|#{escaped})*" # pchar = unreserved | escaped | # ":" | "@" | "&" | "=" | "+" | "$" | "," - PCHAR = "(?:[#{UNRESERVED}:@&=+$,]|#{ESCAPED})" + pchar = "(?:[#{unreserved}:@&=+$,]|#{escaped})" # param = *pchar - PARAM = "#{PCHAR}*" + param = "#{pchar}*" # segment = *pchar *( ";" param ) - SEGMENT = "#{PCHAR}*(?:;#{PARAM})*" + segment = "#{pchar}*(?:;#{param})*" # path_segments = segment *( "/" segment ) - PATH_SEGMENTS = "#{SEGMENT}(?:/#{SEGMENT})*" + ret[:PATH_SEGMENTS] = path_segments = "#{segment}(?:/#{segment})*" # server = [ [ userinfo "@" ] hostport ] - SERVER = "(?:#{USERINFO}@)?#{HOSTPORT}" + server = "(?:#{userinfo}@)?#{hostport}" # reg_name = 1*( unreserved | escaped | "$" | "," | # ";" | ":" | "@" | "&" | "=" | "+" ) - REG_NAME = "(?:[#{UNRESERVED}$,;:@&=+]|#{ESCAPED})+" + ret[:REG_NAME] = reg_name = "(?:[#{unreserved}$,;:@&=+]|#{escaped})+" # authority = server | reg_name - AUTHORITY = "(?:#{SERVER}|#{REG_NAME})" + authority = "(?:#{server}|#{reg_name})" # rel_segment = 1*( unreserved | escaped | # ";" | "@" | "&" | "=" | "+" | "$" | "," ) - REL_SEGMENT = "(?:[#{UNRESERVED};@&=+$,]|#{ESCAPED})+" + ret[:REL_SEGMENT] = rel_segment = "(?:[#{unreserved};@&=+$,]|#{escaped})+" # scheme = alpha *( alpha | digit | "+" | "-" | "." ) - SCHEME = "[#{ALPHA}][-+.#{ALPHA}\\d]*" + ret[:SCHEME] = scheme = "[#{PATTERN::ALPHA}][-+.#{PATTERN::ALPHA}\\d]*" # abs_path = "/" path_segments - ABS_PATH = "/#{PATH_SEGMENTS}" + ret[:ABS_PATH] = abs_path = "/#{path_segments}" # rel_path = rel_segment [ abs_path ] - REL_PATH = "#{REL_SEGMENT}(?:#{ABS_PATH})?" + ret[:REL_PATH] = rel_path = "#{rel_segment}(?:#{abs_path})?" # net_path = "//" authority [ abs_path ] - NET_PATH = "//#{AUTHORITY}(?:#{ABS_PATH})?" + ret[:NET_PATH] = net_path = "//#{authority}(?:#{abs_path})?" # hier_part = ( net_path | abs_path ) [ "?" query ] - HIER_PART = "(?:#{NET_PATH}|#{ABS_PATH})(?:\\?(?:#{QUERY}))?" + ret[:HIER_PART] = hier_part = "(?:#{net_path}|#{abs_path})(?:\\?(?:#{query}))?" # opaque_part = uric_no_slash *uric - OPAQUE_PART = "#{URIC_NO_SLASH}#{URIC}*" + ret[:OPAQUE_PART] = opaque_part = "#{uric_no_slash}#{uric}*" # absoluteURI = scheme ":" ( hier_part | opaque_part ) - ABS_URI = "#{SCHEME}:(?:#{HIER_PART}|#{OPAQUE_PART})" + ret[:ABS_URI] = abs_uri = "#{scheme}:(?:#{hier_part}|#{opaque_part})" # relativeURI = ( net_path | abs_path | rel_path ) [ "?" query ] - REL_URI = "(?:#{NET_PATH}|#{ABS_PATH}|#{REL_PATH})(?:\\?#{QUERY})?" + ret[:REL_URI] = rel_uri = "(?:#{net_path}|#{abs_path}|#{rel_path})(?:\\?#{query})?" # URI-reference = [ absoluteURI | relativeURI ] [ "#" fragment ] - URI_REF = "(?:#{ABS_URI}|#{REL_URI})?(?:##{FRAGMENT})?" + ret[:URI_REF] = uri_ref = "(?:#{abs_uri}|#{rel_uri})?(?:##{fragment})?" - # XXX: - X_ABS_URI = " - (#{PATTERN::SCHEME}): (?# 1: scheme) + ret[:X_ABS_URI] = " + (#{scheme}): (?# 1: scheme) (?: - (#{PATTERN::OPAQUE_PART}) (?# 2: opaque) + (#{opaque_part}) (?# 2: opaque) | (?:(?: //(?: - (?:(?:(#{PATTERN::USERINFO})@)? (?# 3: userinfo) - (?:(#{PATTERN::HOST})(?::(\\d*))?))?(?# 4: host, 5: port) + (?:(?:(#{userinfo})@)? (?# 3: userinfo) + (?:(#{host})(?::(\\d*))?))? (?# 4: host, 5: port) | - (#{PATTERN::REG_NAME}) (?# 6: registry) + (#{reg_name}) (?# 6: registry) ) | - (?!//)) (?# XXX: '//' is the mark for hostport) - (#{PATTERN::ABS_PATH})? (?# 7: path) - )(?:\\?(#{PATTERN::QUERY}))? (?# 8: query) + (?!//)) (?# XXX: '//' is the mark for hostport) + (#{abs_path})? (?# 7: path) + )(?:\\?(#{query}))? (?# 8: query) ) - (?:\\#(#{PATTERN::FRAGMENT}))? (?# 9: fragment) + (?:\\#(#{fragment}))? (?# 9: fragment) " - X_REL_URI = " + + ret[:X_REL_URI] = " (?: (?: // (?: - (?:(#{PATTERN::USERINFO})@)? (?# 1: userinfo) - (#{PATTERN::HOST})?(?::(\\d*))? (?# 2: host, 3: port) + (?:(#{userinfo})@)? (?# 1: userinfo) + (#{host})?(?::(\\d*))? (?# 2: host, 3: port) | - (#{PATTERN::REG_NAME}) (?# 4: registry) + (#{reg_name}) (?# 4: registry) ) ) | - (#{PATTERN::REL_SEGMENT}) (?# 5: rel_segment) + (#{rel_segment}) (?# 5: rel_segment) )? - (#{PATTERN::ABS_PATH})? (?# 6: abs_path) - (?:\\?(#{PATTERN::QUERY}))? (?# 7: query) - (?:\\#(#{PATTERN::FRAGMENT}))? (?# 8: fragment) + (#{abs_path})? (?# 6: abs_path) + (?:\\?(#{query}))? (?# 7: query) + (?:\\#(#{fragment}))? (?# 8: fragment) " - # :startdoc: - end # PATTERN - # :stopdoc: - - # for URI::split - ABS_URI = Regexp.new('^' + PATTERN::X_ABS_URI + '$', #' - Regexp::EXTENDED).freeze - REL_URI = Regexp.new('^' + PATTERN::X_REL_URI + '$', #' - Regexp::EXTENDED).freeze - - # for URI::extract - URI_REF = Regexp.new(PATTERN::URI_REF).freeze - ABS_URI_REF = Regexp.new(PATTERN::X_ABS_URI, Regexp::EXTENDED).freeze - REL_URI_REF = Regexp.new(PATTERN::X_REL_URI, Regexp::EXTENDED).freeze - - # for URI::escape/unescape - ESCAPED = Regexp.new(PATTERN::ESCAPED).freeze - UNSAFE = Regexp.new("[^#{PATTERN::UNRESERVED}#{PATTERN::RESERVED}]").freeze - - # for Generic#initialize - SCHEME = Regexp.new("^#{PATTERN::SCHEME}$").freeze #" - USERINFO = Regexp.new("^#{PATTERN::USERINFO}$").freeze #" - HOST = Regexp.new("^#{PATTERN::HOST}$").freeze #" - PORT = Regexp.new("^#{PATTERN::PORT}$").freeze #" - OPAQUE = Regexp.new("^#{PATTERN::OPAQUE_PART}$").freeze #" - REGISTRY = Regexp.new("^#{PATTERN::REG_NAME}$").freeze #" - ABS_PATH = Regexp.new("^#{PATTERN::ABS_PATH}$").freeze #" - REL_PATH = Regexp.new("^#{PATTERN::REL_PATH}$").freeze #" - QUERY = Regexp.new("^#{PATTERN::QUERY}$").freeze #" - FRAGMENT = Regexp.new("^#{PATTERN::FRAGMENT}$").freeze #" - # :startdoc: - end # REGEXP + ret + end + + def initialize_regexp(pattern) + ret = {} + + # for URI::split + ret[:ABS_URI] = Regexp.new('^' + pattern[:X_ABS_URI] + '$', Regexp::EXTENDED) + ret[:REL_URI] = Regexp.new('^' + pattern[:X_REL_URI] + '$', Regexp::EXTENDED) + + # for URI::extract + ret[:URI_REF] = Regexp.new(pattern[:URI_REF]) + ret[:ABS_URI_REF] = Regexp.new(pattern[:X_ABS_URI], Regexp::EXTENDED) + ret[:REL_URI_REF] = Regexp.new(pattern[:X_REL_URI], Regexp::EXTENDED) + + # for URI::escape/unescape + ret[:ESCAPED] = Regexp.new(pattern[:ESCAPED]) + ret[:UNSAFE] = Regexp.new("[^#{pattern[:UNRESERVED]}#{pattern[:RESERVED]}]") + + # for Generic#initialize + ret[:SCHEME] = Regexp.new("^#{pattern[:SCHEME]}$") + ret[:USERINFO] = Regexp.new("^#{pattern[:USERINFO]}$") + ret[:HOST] = Regexp.new("^#{pattern[:HOST]}$") + ret[:PORT] = Regexp.new("^#{pattern[:PORT]}$") + ret[:OPAQUE] = Regexp.new("^#{pattern[:OPAQUE_PART]}$") + ret[:REGISTRY] = Regexp.new("^#{pattern[:REG_NAME]}$") + ret[:ABS_PATH] = Regexp.new("^#{pattern[:ABS_PATH]}$") + ret[:REL_PATH] = Regexp.new("^#{pattern[:REL_PATH]}$") + ret[:QUERY] = Regexp.new("^#{pattern[:QUERY]}$") + ret[:FRAGMENT] = Regexp.new("^#{pattern[:FRAGMENT]}$") + + ret + end + end # class Parser + + DEFAULT_PARSER = Parser.new + DEFAULT_PARSER.pattern.each_pair do |sym, str| + unless REGEXP::PATTERN.const_defined?(sym) + REGEXP::PATTERN.const_set(sym, str) + end + end + DEFAULT_PARSER.regexp.each_pair do |sym, str| + const_set(sym, str) + end module Util # :nodoc: def make_components_hash(klass, array_hash) @@ -246,8 +468,6 @@ module URI end module Escape - include REGEXP - # # == Synopsis # @@ -280,19 +500,8 @@ module URI # p URI.escape("@?@!", "!?") # # => "@%3F@%21" # - def escape(str, unsafe = UNSAFE) - unless unsafe.kind_of?(Regexp) - # perhaps unsafe is String object - unsafe = Regexp.new("[#{Regexp.quote(unsafe)}]", false, 'N') - end - str.gsub(unsafe) do - us = $& - tmp = '' - us.each_byte do |uc| - tmp << sprintf('%%%02X', uc) - end - tmp - end + def escape(*arg) + DEFAULT_PARSER.escape(*arg) end alias encode escape # @@ -316,18 +525,19 @@ module URI # p URI.unescape(enc_uri) # # => "http://example.com/?a=\t\r" # - def unescape(str) - str.gsub(ESCAPED) do - $&[1,2].hex.chr - end + def unescape(*arg) + DEFAULT_PARSER.unescape(*arg) end alias decode unescape end - include REGEXP extend Escape + include REGEXP @@schemes = {} + def self.scheme_list + @@schemes + end # # Base class for all URI exceptions. @@ -378,75 +588,7 @@ module URI # # => ["http", nil, "www.ruby-lang.org", nil, nil, "/", nil, nil, nil] # def self.split(uri) - case uri - when '' - # null uri - - when ABS_URI - scheme, opaque, userinfo, host, port, - registry, path, query, fragment = $~[1..-1] - - # URI-reference = [ absoluteURI | relativeURI ] [ "#" fragment ] - - # absoluteURI = scheme ":" ( hier_part | opaque_part ) - # hier_part = ( net_path | abs_path ) [ "?" query ] - # opaque_part = uric_no_slash *uric - - # abs_path = "/" path_segments - # net_path = "//" authority [ abs_path ] - - # authority = server | reg_name - # server = [ [ userinfo "@" ] hostport ] - - if !scheme - raise InvalidURIError, - "bad URI(absolute but no scheme): #{uri}" - end - if !opaque && (!path && (!host && !registry)) - raise InvalidURIError, - "bad URI(absolute but no path): #{uri}" - end - - when REL_URI - scheme = nil - opaque = nil - - userinfo, host, port, registry, - rel_segment, abs_path, query, fragment = $~[1..-1] - if rel_segment && abs_path - path = rel_segment + abs_path - elsif rel_segment - path = rel_segment - elsif abs_path - path = abs_path - end - - # URI-reference = [ absoluteURI | relativeURI ] [ "#" fragment ] - - # relativeURI = ( net_path | abs_path | rel_path ) [ "?" query ] - - # net_path = "//" authority [ abs_path ] - # abs_path = "/" path_segments - # rel_path = rel_segment [ abs_path ] - - # authority = server | reg_name - # server = [ [ userinfo "@" ] hostport ] - - else - raise InvalidURIError, "bad URI(is not URI?): #{uri}" - end - - path = '' if !path && !opaque # (see RFC2396 Section 5.2) - ret = [ - scheme, - userinfo, host, port, # X - registry, # X - path, # Y - opaque, # Y - query, - fragment - ] - return ret + DEFAULT_PARSER.split(uri) end # @@ -481,18 +623,7 @@ module URI # # => "www.ruby-lang.org" # def self.parse(uri) - scheme, userinfo, host, port, - registry, path, opaque, query, fragment = self.split(uri) - - if scheme && @@schemes.include?(scheme.upcase) - @@schemes[scheme.upcase].new(scheme, userinfo, host, port, - registry, path, opaque, query, - fragment) - else - Generic.new(scheme, userinfo, host, port, - registry, path, opaque, query, - fragment) - end + DEFAULT_PARSER.parse(uri) end # @@ -517,11 +648,7 @@ module URI # # => # # def self.join(*str) - u = self.parse(str[0]) - str[1 .. -1].each do |x| - u = u.merge(x) - end - u + DEFAULT_PARSER.join(*str) end # @@ -549,14 +676,7 @@ module URI # # => ["http://foo.example.com/bla", "mailto:test@example.com"] # def self.extract(str, schemes = nil, &block) - if block_given? - str.scan(regexp(schemes)) { yield $& } - nil - else - result = [] - str.scan(regexp(schemes)) { result.push $& } - result - end + DEFAULT_PARSER.extract(str, schemes, &block) end # @@ -591,11 +711,7 @@ module URI # end # def self.regexp(schemes = nil) - unless schemes - ABS_URI_REF - else - /(?=#{Regexp.union(*schemes)}:)#{PATTERN::X_ABS_URI}/xn - end + DEFAULT_PARSER.make_regexp(schemes) end end diff --git a/lib/uri/generic.rb b/lib/uri/generic.rb index d907e0b4b2..8f5ab6375d 100644 --- a/lib/uri/generic.rb +++ b/lib/uri/generic.rb @@ -16,7 +16,6 @@ module URI # class Generic include URI - include REGEXP DEFAULT_PORT = nil @@ -74,7 +73,7 @@ module URI if args.kind_of?(Array) return self.build(args.collect{|x| if x - URI.escape(x) + @parser.escape(x) else x end @@ -83,7 +82,7 @@ module URI tmp = {} args.each do |key, value| tmp[key] = if value - URI.escape(value) + @parser.escape(value) else value end @@ -122,6 +121,7 @@ module URI "expected Array of or Hash of components of #{self.class} (#{self.class.component.join(', ')})" end + tmp << DEFAULT_PARSER tmp << true return self.new(*tmp) end @@ -146,6 +146,8 @@ module URI # Query data # +fragment+:: # A part of URI after '#' sign + # +parser+:: + # Parser for internal use [URI::DEFAULT_PARSER by default] # +arg_check+:: # Check arguments [false by default] # @@ -158,6 +160,7 @@ module URI path, opaque, query, fragment, + parser = DEFAULT_PARSER, arg_check = false) @scheme = nil @user = nil @@ -169,6 +172,7 @@ module URI @opaque = nil @registry = nil @fragment = nil + @parser = parser if arg_check self.scheme = scheme @@ -208,6 +212,7 @@ module URI attr_reader :query attr_reader :opaque attr_reader :fragment + attr_reader :parser # replace self by other URI object def replace!(oth) @@ -226,7 +231,7 @@ module URI end def check_scheme(v) - if v && SCHEME !~ v + if v && @parser.regexp[:SCHEME] !~ v raise InvalidComponentError, "bad component(expected scheme component): #{v}" end @@ -265,7 +270,7 @@ module URI return v unless v - if USERINFO !~ v + if @parser.regexp[:USERINFO] !~ v raise InvalidComponentError, "bad component(expected userinfo component or user component): #{v}" end @@ -286,7 +291,7 @@ module URI "password component depends user component" end - if USERINFO !~ v + if @parser.regexp[:USERINFO] !~ v raise InvalidComponentError, "bad component(expected user component): #{v}" end @@ -351,7 +356,7 @@ module URI private :split_userinfo def escape_userpass(v) - v = URI.escape(v, /[@:\/]/o) # RFC 1738 section 3.1 #/ + v = @parser.escape(v, /[@:\/]/o) # RFC 1738 section 3.1 #/ end private :escape_userpass @@ -379,7 +384,7 @@ module URI if @registry || @opaque raise InvalidURIError, "can not set host with registry or opaque" - elsif HOST !~ v + elsif @parser.regexp[:HOST] !~ v raise InvalidComponentError, "bad component(expected host component): #{v}" end @@ -405,7 +410,7 @@ module URI if @registry || @opaque raise InvalidURIError, "can not set port with registry or opaque" - elsif !v.kind_of?(Fixnum) && PORT !~ v + elsif !v.kind_of?(Fixnum) && @parser.regexp[:PORT] !~ v raise InvalidComponentError, "bad component(expected port component): #{v}" end @@ -441,7 +446,7 @@ module URI if @host || @port || @user # userinfo = @user + ':' + @password raise InvalidURIError, "can not set registry with host, port, or userinfo" - elsif v && REGISTRY !~ v + elsif v && @parser.regexp[:REGISTRY] !~ v raise InvalidComponentError, "bad component(expected registry component): #{v}" end @@ -471,12 +476,12 @@ module URI end if @scheme - if v && v != '' && ABS_PATH !~ v + if v && v != '' && @parser.regexp[:ABS_PATH] !~ v raise InvalidComponentError, "bad component(expected absolute path component): #{v}" end else - if v && v != '' && ABS_PATH !~ v && REL_PATH !~ v + if v && v != '' && @parser.regexp[:ABS_PATH] !~ v && @parser.regexp[REL_PATH] !~ v raise InvalidComponentError, "bad component(expected relative path component): #{v}" end @@ -508,7 +513,7 @@ module URI "query conflicts with opaque" end - if v && v != '' && QUERY !~ v + if v && v != '' && @parser.regexp[:QUERY] !~ v raise InvalidComponentError, "bad component(expected query component): #{v}" end @@ -537,7 +542,7 @@ module URI if @host || @port || @user || @path # userinfo = @user + ':' + @password raise InvalidURIError, "can not set opaque with host, port, userinfo or path" - elsif v && OPAQUE !~ v + elsif v && @parser.regexp[:OPAQUE] !~ v raise InvalidComponentError, "bad component(expected opaque component): #{v}" end @@ -560,7 +565,7 @@ module URI def check_fragment(v) return v unless v - if v && v != '' && FRAGMENT !~ v + if v && v != '' && @parser.regexp[:FRAGMENT] !~ v raise InvalidComponentError, "bad component(expected fragment component): #{v}" end @@ -772,7 +777,7 @@ module URI case oth when Generic when String - oth = URI.parse(oth) + oth = @parser.parse(oth) else raise ArgumentError, "bad argument(expected URI object or URI string)" @@ -843,7 +848,7 @@ module URI case oth when Generic when String - oth = URI.parse(oth) + oth = @parser.parse(oth) else raise ArgumentError, "bad argument(expected URI object or URI string)" @@ -864,7 +869,7 @@ module URI rel = URI::Generic.new(nil, # it is relative URI self.userinfo, self.host, self.port, self.registry, self.path, self.opaque, - self.query, self.fragment) + self.query, self.fragment, @parser) if rel.userinfo != oth.userinfo || rel.host.to_s.downcase != oth.host.to_s.downcase || @@ -955,7 +960,7 @@ module URI case oth when Generic when String - oth = URI.parse(oth) + oth = @parser.parse(oth) else raise ArgumentError, "bad argument(expected URI object or URI string)" @@ -1054,6 +1059,7 @@ module URI end def eql?(oth) + @parser == oth.parser && self.component_ary.eql?(oth.component_ary) end @@ -1111,7 +1117,7 @@ module URI def coerce(oth) case oth when String - oth = URI.parse(oth) + oth = @parser.parse(oth) else super end diff --git a/lib/uri/mailto.rb b/lib/uri/mailto.rb index 3a9d15318f..c66c95fcd0 100644 --- a/lib/uri/mailto.rb +++ b/lib/uri/mailto.rb @@ -159,7 +159,7 @@ module URI return true unless v return true if v.size == 0 - if OPAQUE !~ v || /\A#{MAILBOX_PATTERN}*\z/o !~ v + if @parser.regexp[:OPAQUE] !~ v || /\A#{MAILBOX_PATTERN}*\z/o !~ v raise InvalidComponentError, "bad component(expected opaque component): #{v}" end @@ -183,7 +183,7 @@ module URI return true unless v return true if v.size == 0 - if OPAQUE !~ v || + if @parser.regexp[:OPAQUE] !~ v || /\A(#{HEADER_PATTERN}(?:\&#{HEADER_PATTERN})*)\z/o !~ v raise InvalidComponentError, "bad component(expected opaque component): #{v}" @@ -239,18 +239,18 @@ module URI # # => "To: ruby-list@ruby-lang.org\nSubject: subscribe\nCc: myaddr\n\n\n" # def to_mailtext - to = URI::unescape(@to) + to = @parser.unescape(@to) head = '' body = '' @headers.each do |x| case x[0] when 'body' - body = URI::unescape(x[1]) + body = @parser.unescape(x[1]) when 'to' - to << ', ' + URI::unescape(x[1]) + to << ', ' + @parser.unescape(x[1]) else - head << URI::unescape(x[0]).capitalize + ': ' + - URI::unescape(x[1]) + "\n" + head << @parser.unescape(x[0]).capitalize + ': ' + + @parser.unescape(x[1]) + "\n" end end diff --git a/test/uri/test_generic.rb b/test/uri/test_generic.rb index 6661b4a5d2..e8024c8bf2 100644 --- a/test/uri/test_generic.rb +++ b/test/uri/test_generic.rb @@ -39,6 +39,17 @@ class URI::TestGeneric < Test::Unit::TestCase ] ary = uri_to_ary(url) assert_equal(exp, ary) + # 1' + url = URI.parse('ftp://ftp.is.co.za/%2Frfc/rfc1808.txt') + assert_kind_of(URI::FTP, url) + + exp = [ + 'ftp', + nil, 'ftp.is.co.za', URI::FTP.default_port, + '/rfc/rfc1808.txt', nil, + ] + ary = uri_to_ary(url) + assert_equal(exp, ary) # 2 url = URI.parse('gopher://spinaltap.micro.umn.edu/00/Weather/California/Los%20Angeles') @@ -230,7 +241,7 @@ class URI::TestGeneric < Test::Unit::TestCase assert_equal('', url.to_s) end - def test_rfc2396_examples + def test_rfc3986_examples # http://a/b/c/d;p?q # g:h = g:h url = @base_url.merge('g:h') @@ -306,11 +317,11 @@ class URI::TestGeneric < Test::Unit::TestCase assert_equal('g?y', url.to_s) # http://a/b/c/d;p?q -# #s = (current document)#s +# #s = http://a/b/c/d;p?q#s url = @base_url.merge('#s') assert_kind_of(URI::HTTP, url) - assert_equal(@base_url.to_s + '#s', url.to_s) - url = @base_url.route_to(@base_url.to_s + '#s') + assert_equal('http://a/b/c/d;p?q#s', url.to_s) + url = @base_url.route_to('http://a/b/c/d;p?q#s') assert_kind_of(URI::Generic, url) assert_equal('#s', url.to_s) @@ -448,18 +459,18 @@ class URI::TestGeneric < Test::Unit::TestCase url = @base_url.merge('/./g') assert_kind_of(URI::HTTP, url) assert_equal('http://a/g', url.to_s) - url = @base_url.route_to('http://a/./g') - assert_kind_of(URI::Generic, url) - assert_equal('/./g', url.to_s) +# url = @base_url.route_to('http://a/./g') +# assert_kind_of(URI::Generic, url) +# assert_equal('/./g', url.to_s) # http://a/b/c/d;p?q -# /../g = http://a/../g +# /../g = http://a/g url = @base_url.merge('/../g') assert_kind_of(URI::HTTP, url) assert_equal('http://a/g', url.to_s) - url = @base_url.route_to('http://a/../g') - assert_kind_of(URI::Generic, url) - assert_equal('/../g', url.to_s) +# url = @base_url.route_to('http://a/../g') +# assert_kind_of(URI::Generic, url) +# assert_equal('/../g', url.to_s) # http://a/b/c/d;p?q # g. = http://a/b/c/g. @@ -502,20 +513,20 @@ class URI::TestGeneric < Test::Unit::TestCase url = @base_url.merge('../../../g') assert_kind_of(URI::HTTP, url) assert_equal('http://a/g', url.to_s) - url = @base_url.route_to('http://a/../g') + url = @base_url.route_to('http://a/g') assert_kind_of(URI::Generic, url) - assert('../../../g' != url.to_s) # ok? yes, it confuses you - assert_equal('/../g', url.to_s) # and it is clearly + assert('../../../g' != url.to_s) # ok? yes, it confuses you + assert_equal('../../g', url.to_s) # and it is clearly # http://a/b/c/d;p?q -# ../../../../g = http://a/../../g +# ../../../../g = http://a/g url = @base_url.merge('../../../../g') assert_kind_of(URI::HTTP, url) assert_equal('http://a/g', url.to_s) - url = @base_url.route_to('http://a/../../g') + url = @base_url.route_to('http://a/g') assert_kind_of(URI::Generic, url) assert('../../../../g' != url.to_s) # ok? yes, it confuses you - assert_equal('/../../g', url.to_s) # and it is clearly + assert_equal('../../g', url.to_s) # and it is clearly # http://a/b/c/d;p?q # ./../g = http://a/b/g @@ -644,9 +655,10 @@ class URI::TestGeneric < Test::Unit::TestCase assert_equal(URI.parse('http://foo/hoge'), URI.join('http://foo', 'bar/baz', '/hoge')) end + # ruby-dev:16728 def test_set_component uri = URI.parse('http://foo:bar@baz') - assert_equal('oof', uri.user = 'oof', "[ruby-dev:16728]") + assert_equal('oof', uri.user = 'oof') assert_equal('http://oof:bar@baz', uri.to_s) assert_equal('rab', uri.password = 'rab') assert_equal('http://oof:rab@baz', uri.to_s) @@ -683,57 +695,4 @@ class URI::TestGeneric < Test::Unit::TestCase assert_raises(URI::InvalidURIError) { uri.path = 'bar' } assert_raises(URI::InvalidURIError) { uri.query = 'bar' } end - - def m(s) - @base_url.merge(s).to_s -end - - def test_rfc3986_examples - assert_equal("g:h", m("g:h")) - assert_equal("http://a/b/c/g", m("g")) - assert_equal("http://a/b/c/g", m("./g")) - assert_equal("http://a/b/c/g/", m("g/")) - assert_equal("http://a/g", m("/g")) - assert_equal("http://g", m("//g")) - assert_equal("http://a/b/c/d;p?y", m("?y")) - assert_equal("http://a/b/c/g?y", m("g?y")) - assert_equal("http://a/b/c/d;p?q#s", m("#s")) - assert_equal("http://a/b/c/g#s", m("g#s")) - assert_equal("http://a/b/c/g?y#s", m("g?y#s")) - assert_equal("http://a/b/c/;x", m(";x")) - assert_equal("http://a/b/c/g;x", m("g;x")) - assert_equal("http://a/b/c/g;x?y#s", m("g;x?y#s")) - assert_equal("http://a/b/c/d;p?q", m("")) - assert_equal("http://a/b/c/", m(".")) - assert_equal("http://a/b/c/", m("./")) - assert_equal("http://a/b/", m("..")) - assert_equal("http://a/b/", m("../")) - assert_equal("http://a/b/g", m("../g")) - assert_equal("http://a/", m("../..")) - assert_equal("http://a/", m("../../")) - assert_equal("http://a/g", m("../../g")) - assert_equal("http://a/g", m("../../../g")) - assert_equal("http://a/g", m("../../../../g")) - - assert_equal("http://a/g", m("/./g")) - assert_equal("http://a/g", m("/../g")) - assert_equal("http://a/b/c/g.", m("g.")) - assert_equal("http://a/b/c/.g", m(".g")) - assert_equal("http://a/b/c/g..", m("g..")) - assert_equal("http://a/b/c/..g", m("..g")) - - assert_equal("http://a/b/g", m("./../g")) - assert_equal("http://a/b/c/g/", m("./g/.")) - assert_equal("http://a/b/c/g/h", m("g/./h")) - assert_equal("http://a/b/c/h", m("g/../h")) - assert_equal("http://a/b/c/g;x=1/y", m("g;x=1/./y")) - assert_equal("http://a/b/c/y", m("g;x=1/../y")) - - assert_equal("http://a/b/c/g?y/./x", m("g?y/./x")) - assert_equal("http://a/b/c/g?y/../x", m("g?y/../x")) - assert_equal("http://a/b/c/g#s/./x", m("g#s/./x")) - assert_equal("http://a/b/c/g#s/../x", m("g#s/../x")) - - assert_equal("http:g", m("http:g")) - end end diff --git a/test/uri/test_parser.rb b/test/uri/test_parser.rb new file mode 100644 index 0000000000..adf8a1292c --- /dev/null +++ b/test/uri/test_parser.rb @@ -0,0 +1,41 @@ +require 'test/unit' +require 'uri' + +class URI::TestParser < Test::Unit::TestCase + def uri_to_ary(uri) + uri.class.component.collect {|c| uri.send(c)} + end + + def test_compare + url = 'http://a/b/c/d;p?q' + u0 = URI.parse(url) + u1 = URI.parse(url) + p = URI::Parser.new + u2 = p.parse(url) + u3 = p.parse(url) + + assert(u0 == u1) + assert(u0.eql?(u1)) + assert(!u0.equal?(u1)) + + assert(u1 == u2) + assert(!u1.eql?(u2)) + assert(!u1.equal?(u2)) + + assert(u2 == u3) + assert(u2.eql?(u3)) + assert(!u2.equal?(u3)) + end + + def test_parse + escaped = URI::REGEXP::PATTERN::ESCAPED + hex = URI::REGEXP::PATTERN::HEX + p1 = URI::Parser.new(:ESCAPED => "(?:#{escaped}|%u[#{hex}]{4})") + u1 = p1.parse('http://a/b/%uABCD') + assert_equal(['http', nil, 'a', URI::HTTP.default_port, '/b/%uABCD', nil, nil], + uri_to_ary(u1)) + u1.path = '/%uDCBA' + assert_equal(['http', nil, 'a', URI::HTTP.default_port, '/%uDCBA', nil, nil], + uri_to_ary(u1)) + end +end -- cgit v1.2.3