diff options
Diffstat (limited to 'url/url_canon_host.cc')
-rw-r--r-- | url/url_canon_host.cc | 49 |
1 files changed, 36 insertions, 13 deletions
diff --git a/url/url_canon_host.cc b/url/url_canon_host.cc index d3b1222f1..4e45839f8 100644 --- a/url/url_canon_host.cc +++ b/url/url_canon_host.cc @@ -6,11 +6,14 @@ #include "base/cpu_reduction_experiment.h" #include "url/url_canon.h" #include "url/url_canon_internal.h" +#include "url/url_features.h" namespace url { namespace { +// clang-format off +// // For reference, here's what IE supports: // Key: 0 (disallowed: failure if present in the input) // + (allowed either escaped or unescaped, and unmodified) @@ -37,19 +40,15 @@ namespace { // I also didn't test if characters affecting HTML parsing are allowed // unescaped, e.g. (") or (#), which would indicate the beginning of the path. // Surprisingly, space is accepted in the input and always escaped. - +// +// TODO(https://crbug.com/1416013): Remove the above historical reference +// information once we are 100% standard compliant to the URL Standard. +// // This table lists the canonical version of all characters we allow in the // input, with 0 indicating it is disallowed. We use the magic kEscapedHostChar // value to indicate that this character should be escaped. We are a little more // restrictive than IE, but less restrictive than Firefox. // -// Note that we disallow the % character. We will allow it when part of an -// escape sequence, of course, but this disallows "%25". Even though IE allows -// it, allowing it would put us in a funny state. If there was an invalid -// escape sequence like "%zz", we'll add "%25zz" to the output and fail. -// Allowing percents means we'll succeed a second time, so validity would change -// based on how many times you run the canonicalizer. We prefer to always report -// the same vailidity, so reject this. const unsigned char kEsc = 0xff; const unsigned char kHostCharLookup[0x80] = { // 00-1f: all are invalid @@ -68,6 +67,27 @@ const unsigned char kHostCharLookup[0x80] = { // p q r s t u v w x y z { | } ~ 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',kEsc,kEsc,kEsc, 0 , 0 }; +// The following table is used when kStandardCompliantHostCharLookup feature is +// enabled. See https://crbug.com/1416013 for details. At present, ' ' (SPACE) +// and '*' (asterisk) are still non-compliant to the URL Standard. +const unsigned char kStandardCompliantHostCharLookup[0x80] = { +// 00-1f: all are invalid + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +// ' ' ! " # $ % & ' ( ) * + , - . / + kEsc,'!', '"', 0, '$', 0, '&', '\'','(', ')', kEsc, '+', ',', '-', '.', 0, +// 0 1 2 3 4 5 6 7 8 9 : ; < = > ? + '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';' , 0, '=', 0, 0, +// @ A B C D E F G H I J K L M N O + 0, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', +// P Q R S T U V W X Y Z [ \ ] ^ _ + 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '[', 0, ']', 0, '_', +// ` a b c d e f g h i j k l m n o + '`', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', +// p q r s t u v w x y z { | } ~ + 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '{', 0, '}', '~', 0 }; +// clang-format on + // RFC1034 maximum FQDN length. constexpr size_t kMaxHostLength = 253; @@ -149,7 +169,12 @@ bool DoSimpleHost(const INCHAR* host, if (source < 0x80) { // We have ASCII input, we can use our lookup table. - unsigned char replacement = kHostCharLookup[source]; + unsigned char replacement; + if (url::IsUsingStandardCompliantHostCharacters()) { + replacement = kStandardCompliantHostCharLookup[source]; + } else { + replacement = kHostCharLookup[source]; + } if (!replacement) { // Invalid character, add it as percent-escaped and mark as failed. AppendEscapedChar(source, output); @@ -189,9 +214,7 @@ bool DoIDNHost(const char16_t* src, size_t src_len, CanonOutput* output) { } StackBufferW wide_output; - if (!IDNToASCII(url_escaped_host.data(), - url_escaped_host.length(), - &wide_output)) { + if (!IDNToASCII(url_escaped_host.view(), &wide_output)) { // Some error, give up. This will write some reasonable looking // representation of the string to the output. AppendInvalidNarrowString(src, 0, src_len, output); @@ -381,7 +404,7 @@ void DoHost(const CHAR* spec, // we just leave it in place. if (host_info->IsIPAddress()) { output->set_length(output_begin); - output->Append(canon_ip.data(), canon_ip.length()); + output->Append(canon_ip.view()); } } else { // Canonicalization failed. Set BROKEN to notify the caller. |