diff options
author | btolsch <btolsch@chromium.org> | 2020-02-20 14:55:18 -0800 |
---|---|---|
committer | Commit Bot <commit-bot@chromium.org> | 2020-02-21 00:01:46 +0000 |
commit | e060aaf770ca8bc122fca0a561a219fda1cfdba9 (patch) | |
tree | 2f9c0d0864fec6db89bc32ce18163384dd6568b0 /third_party | |
parent | 88012faee163f41058bca9ea035aa81bd7226965 (diff) | |
download | openscreen-e060aaf770ca8bc122fca0a561a219fda1cfdba9.tar.gz |
Add URL parsing support
This change introduces URL parsing by copying parsing functions in from
Chromium, which were themselves previously ported from Mozilla.
Bug: openscreen:60
Change-Id: Ic3631365a6bf03682f314dc309be160612720313
Reviewed-on: https://chromium-review.googlesource.com/c/openscreen/+/2053046
Commit-Queue: Brandon Tolsch <btolsch@chromium.org>
Reviewed-by: Ryan Keane <rwkeane@google.com>
Reviewed-by: Takumi Fujimoto <takumif@chromium.org>
Diffstat (limited to 'third_party')
-rw-r--r-- | third_party/mozilla/BUILD.gn | 14 | ||||
-rw-r--r-- | third_party/mozilla/LICENSE.txt | 65 | ||||
-rw-r--r-- | third_party/mozilla/README.md | 7 | ||||
-rw-r--r-- | third_party/mozilla/url_parse.cc | 858 | ||||
-rw-r--r-- | third_party/mozilla/url_parse.h | 322 | ||||
-rw-r--r-- | third_party/mozilla/url_parse_internal.cc | 87 | ||||
-rw-r--r-- | third_party/mozilla/url_parse_internal.h | 50 |
7 files changed, 1403 insertions, 0 deletions
diff --git a/third_party/mozilla/BUILD.gn b/third_party/mozilla/BUILD.gn new file mode 100644 index 00000000..051917d2 --- /dev/null +++ b/third_party/mozilla/BUILD.gn @@ -0,0 +1,14 @@ +# Copyright 2020 The Chromium Authors. All rights reserved. +# Use of this source code is governed by a BSD-style license that can be +# found in the LICENSE file. + +source_set("mozilla") { + sources = [ + "url_parse.cc", + "url_parse.h", + "url_parse_internal.cc", + "url_parse_internal.h", + ] + + public_configs = [ "../../build:openscreen_include_dirs" ] +} diff --git a/third_party/mozilla/LICENSE.txt b/third_party/mozilla/LICENSE.txt new file mode 100644 index 00000000..ac408378 --- /dev/null +++ b/third_party/mozilla/LICENSE.txt @@ -0,0 +1,65 @@ +Copyright 2007, Google Inc. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above +copyright notice, this list of conditions and the following disclaimer +in the documentation and/or other materials provided with the +distribution. + * Neither the name of Google Inc. nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +------------------------------------------------------------------------------- + +The file url_parse.cc is based on nsURLParsers.cc from Mozilla. This file is +licensed separately as follows: + +The contents of this file are subject to the Mozilla Public License Version +1.1 (the "License"); you may not use this file except in compliance with +the License. You may obtain a copy of the License at +http://www.mozilla.org/MPL/ + +Software distributed under the License is distributed on an "AS IS" basis, +WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License +for the specific language governing rights and limitations under the +License. + +The Original Code is mozilla.org code. + +The Initial Developer of the Original Code is +Netscape Communications Corporation. +Portions created by the Initial Developer are Copyright (C) 1998 +the Initial Developer. All Rights Reserved. + +Contributor(s): + Darin Fisher (original author) + +Alternatively, the contents of this file may be used under the terms of +either the GNU General Public License Version 2 or later (the "GPL"), or +the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), +in which case the provisions of the GPL or the LGPL are applicable instead +of those above. 
If you wish to allow use of your version of this file only +under the terms of either the GPL or the LGPL, and not to allow others to +use your version of this file under the terms of the MPL, indicate your +decision by deleting the provisions above and replace them with the notice +and other provisions required by the GPL or the LGPL. If you do not delete +the provisions above, a recipient may use your version of this file under +the terms of any one of the MPL, the GPL or the LGPL. diff --git a/third_party/mozilla/README.md b/third_party/mozilla/README.md new file mode 100644 index 00000000..ed4c24d8 --- /dev/null +++ b/third_party/mozilla/README.md @@ -0,0 +1,7 @@ +# url_parse + +`url_parse.{h,cc}` are based on the same files in Chromium under +`//url/third_party/mozilla` but have been slightly modified for our use case. +`url_parse_internal.{h,cc}` contain additional functions that the former +files need but that are not provided directly. These are also ported from Chromium's +version. diff --git a/third_party/mozilla/url_parse.cc b/third_party/mozilla/url_parse.cc new file mode 100644 index 00000000..e6efd9e7 --- /dev/null +++ b/third_party/mozilla/url_parse.cc @@ -0,0 +1,858 @@ +/* Based on nsURLParsers.cc from Mozilla + * ------------------------------------- + * The contents of this file are subject to the Mozilla Public License Version + * 1.1 (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS IS" basis, + * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License + * for the specific language governing rights and limitations under the + * License. + * + * The Original Code is mozilla.org code. + * + * The Initial Developer of the Original Code is + * Netscape Communications Corporation. 
+ * Portions created by the Initial Developer are Copyright (C) 1998 + * the Initial Developer. All Rights Reserved. + * + * Contributor(s): + * Darin Fisher (original author) + * + * Alternatively, the contents of this file may be used under the terms of + * either the GNU General Public License Version 2 or later (the "GPL"), or + * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), + * in which case the provisions of the GPL or the LGPL are applicable instead + * of those above. If you wish to allow use of your version of this file only + * under the terms of either the GPL or the LGPL, and not to allow others to + * use your version of this file under the terms of the MPL, indicate your + * decision by deleting the provisions above and replace them with the notice + * and other provisions required by the GPL or the LGPL. If you do not delete + * the provisions above, a recipient may use your version of this file under + * the terms of any one of the MPL, the GPL or the LGPL. + * + * ***** END LICENSE BLOCK ***** */ + +#include "third_party/mozilla/url_parse.h" + +#include <assert.h> +#include <ctype.h> +#include <stdlib.h> + +#include "third_party/mozilla/url_parse_internal.h" + +namespace openscreen { +namespace { + +// Returns true if the given character is a valid digit to use in a port. +bool IsPortDigit(char ch) { + return ch >= '0' && ch <= '9'; +} + +// Returns the offset of the next authority terminator in the input starting +// from start_offset. If no terminator is found, the return value will be equal +// to spec_len. +int FindNextAuthorityTerminator(const char* spec, + int start_offset, + int spec_len) { + for (int i = start_offset; i < spec_len; i++) { + if (IsAuthorityTerminator(spec[i])) + return i; + } + return spec_len; // Not found. 
+} + +// Splits the user-info range |user| of |spec| at its first ':' into +// |username| and |password|. Without a ':' the whole range becomes the +// username and |password| is left invalid (len == -1). +void ParseUserInfo(const char* spec, + const Component& user, + Component* username, + Component* password) { + // Find the first colon in the user section, which separates the username and + // password. + int colon_offset = 0; + while (colon_offset < user.len && spec[user.begin + colon_offset] != ':') + colon_offset++; + + if (colon_offset < user.len) { + // Found separator: <username>:<password> + *username = Component(user.begin, colon_offset); + *password = MakeRange(user.begin + colon_offset + 1, user.begin + user.len); + } else { + // No separator, treat everything as the username + *username = user; + *password = Component(); + } +} + +// Splits the server-info range |serverinfo| of |spec| into |hostname| and +// |port_num|. A ':' only counts as the port separator when it comes after the +// last ']', so colons inside a bracketed IPv6 literal are ignored. Both +// outputs are reset when |serverinfo| is empty, and |hostname| is reset when +// nothing precedes the port separator. +void ParseServerInfo(const char* spec, + const Component& serverinfo, + Component* hostname, + Component* port_num) { + if (serverinfo.len == 0) { + // No server info, host name is empty. + hostname->reset(); + port_num->reset(); + return; + } + + // If the host starts with a left-bracket, assume the entire host is an + // IPv6 literal. Otherwise, assume none of the host is an IPv6 literal. + // This assumption will be overridden if we find a right-bracket. + // + // Our IPv6 address canonicalization code requires both brackets to exist, + // but the ability to locate an incomplete address can still be useful. + int ipv6_terminator = spec[serverinfo.begin] == '[' ? serverinfo.end() : -1; + int colon = -1; + + // Find the last right-bracket, and the last colon. 
+ for (int i = serverinfo.begin; i < serverinfo.end(); i++) { + switch (spec[i]) { + case ']': + ipv6_terminator = i; + break; + case ':': + colon = i; + break; + } + } + + if (colon > ipv6_terminator) { + // Found a port number: <hostname>:<port> + *hostname = MakeRange(serverinfo.begin, colon); + if (hostname->len == 0) + hostname->reset(); + *port_num = MakeRange(colon + 1, serverinfo.end()); + } else { + // No port: <hostname> + *hostname = serverinfo; + port_num->reset(); + } +} + +// Given an already-identified auth section, breaks it into its constituent +// parts. The port is only located, not converted: |port_num| receives the +// range of the port text, or is reset when there is no port. Use ParsePort() +// to turn that range into an integer. +void DoParseAuthority(const char* spec, + const Component& auth, + Component* username, + Component* password, + Component* hostname, + Component* port_num) { + assert(auth.is_valid()); + if (auth.len == 0) { + username->reset(); + password->reset(); + hostname->reset(); + port_num->reset(); + return; + } + + // Search backwards for @, which is the separator between the user info and + // the server info. + int i = auth.begin + auth.len - 1; + while (i > auth.begin && spec[i] != '@') + i--; + + if (spec[i] == '@') { + // Found user info: <user-info>@<server-info> + ParseUserInfo(spec, Component(auth.begin, i - auth.begin), username, + password); + ParseServerInfo(spec, MakeRange(i + 1, auth.begin + auth.len), hostname, + port_num); + } else { + // No user info, everything is server info. 
+ username->reset(); + password->reset(); + ParseServerInfo(spec, auth, hostname, port_num); + } +} + +// Scans |path| within |spec| for the first '?' and the first '#' and stores +// their indices in |*query_separator| and |*ref_separator|. Only outputs that +// are negative on entry are overwritten, so the caller must initialize both to +// a negative value. Scanning then stops at the first '#', so a '?' after it is +// never reported. +inline void FindQueryAndRefParts(const char* spec, + const Component& path, + int* query_separator, + int* ref_separator) { + int path_end = path.begin + path.len; + for (int i = path.begin; i < path_end; i++) { + switch (spec[i]) { + case '?': + // Only match the query string if it precedes the reference fragment + // and when we haven't found one already. + if (*query_separator < 0) + *query_separator = i; + break; + case '#': + // Record the first # sign only. + if (*ref_separator < 0) { + *ref_separator = i; + return; + } + break; + } + } +} + +// Splits |path| (a range of |spec|) into |filepath|, |query| and |ref|. A +// |path| with len == -1 resets all three outputs; otherwise |path| is expected +// to be non-empty (see the assert below). +void ParsePath(const char* spec, + const Component& path, + Component* filepath, + Component* query, + Component* ref) { + // path = [/]<segment1>/<segment2>/<...>/<segmentN>;<param>?<query>#<ref> + + // Special case when there is no path. + if (path.len == -1) { + filepath->reset(); + query->reset(); + ref->reset(); + return; + } + assert(path.len > 0); + + // Search for first occurrence of either ? or #. + int query_separator = -1; // Index of the '?' + int ref_separator = -1; // Index of the '#' + FindQueryAndRefParts(spec, path, &query_separator, &ref_separator); + + // Markers pointing to the character after each of these corresponding + // components. The code below works from the end back to the beginning, + // and will update these indices as it finds components that exist. + int file_end, query_end; + + // Ref fragment: from the # to the end of the path. + int path_end = path.begin + path.len; + if (ref_separator >= 0) { + file_end = query_end = ref_separator; + *ref = MakeRange(ref_separator + 1, path_end); + } else { + file_end = query_end = path_end; + ref->reset(); + } + + // Query fragment: everything from the ? to the next boundary (either the end + // of the path or the ref fragment). 
+ if (query_separator >= 0) { + file_end = query_separator; + *query = MakeRange(query_separator + 1, query_end); + } else { + query->reset(); + } + + // File path: treat an empty file path as no file path. + if (file_end != path.begin) + *filepath = MakeRange(path.begin, file_end); + else + filepath->reset(); +} + +// Finds the scheme of |url|: the range from the first character that +// ShouldTrimFromURL() does not reject up to (not including) the first ':'. +// Returns false and leaves |scheme| untouched when the input is empty after +// trimming or contains no ':'. +bool DoExtractScheme(const char* url, int url_len, Component* scheme) { + // Skip leading whitespace and control characters. + int begin = 0; + while (begin < url_len && ShouldTrimFromURL(url[begin])) + begin++; + if (begin == url_len) + return false; // Input is empty or all whitespace. + + // Find the first colon character. + for (int i = begin; i < url_len; i++) { + if (url[i] == ':') { + *scheme = MakeRange(begin, i); + return true; + } + } + return false; // No colon found: no scheme +} + +// Fills in all members of the Parsed structure except for the scheme. +// +// |spec| is the full spec being parsed, of length |spec_len|. +// |after_scheme| is the character immediately following the scheme (after the +// colon) where we'll begin parsing. +// +// Compatibility data points. I list "host", "path" extracted: +// Input IE6 Firefox Us +// ----- -------------- -------------- -------------- +// http://foo.com/ "foo.com", "/" "foo.com", "/" "foo.com", "/" +// http:foo.com/ "foo.com", "/" "foo.com", "/" "foo.com", "/" +// http:/foo.com/ fail(*) "foo.com", "/" "foo.com", "/" +// http:\foo.com/ fail(*) "\foo.com", "/"(fail) "foo.com", "/" +// http:////foo.com/ "foo.com", "/" "foo.com", "/" "foo.com", "/" +// +// (*) Interestingly, although IE fails to load these URLs, its history +// canonicalizer handles them, meaning if you've been to the corresponding +// "http://foo.com/" link, it will be colored. 
+void DoParseAfterScheme(const char* spec, + int spec_len, + int after_scheme, + Parsed* parsed) { + int num_slashes = CountConsecutiveSlashes(spec, after_scheme, spec_len); + int after_slashes = after_scheme + num_slashes; + + // First split into two main parts, the authority (username, password, host, + // and port) and the full path (path, query, and reference). + Component authority; + Component full_path; + + // Found "//<some data>", looks like an authority section. Treat everything + // from there to the next authority terminator (a slash, '?' or '#') or the + // end of spec as the authority. Note that we ignore the number of slashes + // and treat it as the authority. + int end_auth = FindNextAuthorityTerminator(spec, after_slashes, spec_len); + authority = Component(after_slashes, end_auth - after_slashes); + + if (end_auth == spec_len) // No beginning of path found. + full_path = Component(); + else // Everything from the terminator to the end is the path. + full_path = Component(end_auth, spec_len - end_auth); + + // Now parse those two sub-parts. + DoParseAuthority(spec, authority, &parsed->username, &parsed->password, + &parsed->host, &parsed->port); + ParsePath(spec, full_path, &parsed->path, &parsed->query, &parsed->ref); +} + +// The main parsing function for standard URLs. Standard URLs have a scheme, +// host, path, etc. +void DoParseStandardURL(const char* spec, int spec_len, Parsed* parsed) { + assert(spec_len >= 0); + + // Strip leading & trailing spaces and control characters. + int begin = 0; + TrimURL(spec, &begin, &spec_len); + + int after_scheme; + if (DoExtractScheme(spec, spec_len, &parsed->scheme)) { + after_scheme = parsed->scheme.end() + 1; // Skip past the colon. + } else { + // Say there's no scheme when there is no colon. We could also say that + // everything is the scheme. Both would produce an invalid URL, but this way + // seems less wrong in more cases. 
+ parsed->scheme.reset(); + after_scheme = begin; + } + DoParseAfterScheme(spec, spec_len, after_scheme, parsed); +} + +// Parses a nested "outer-scheme:inner-url" spec. The outer scheme goes into +// |parsed->scheme|; the remainder is parsed as its own URL and stored as +// |parsed|'s inner Parsed, with its query and ref (and the tail of its path) +// moved up into |parsed|. Returns early, leaving the remaining components +// reset, whenever the outer or inner scheme is missing or nothing follows it. +void DoParseFileSystemURL(const char* spec, int spec_len, Parsed* parsed) { + assert(spec_len >= 0); + + // Get the unused parts of the URL out of the way. + parsed->username.reset(); + parsed->password.reset(); + parsed->host.reset(); + parsed->port.reset(); + parsed->path.reset(); // May use this; reset for convenience. + parsed->ref.reset(); // May use this; reset for convenience. + parsed->query.reset(); // May use this; reset for convenience. + parsed->clear_inner_parsed(); // May use this; reset for convenience. + + // Strip leading & trailing spaces and control characters. + int begin = 0; + TrimURL(spec, &begin, &spec_len); + + // Handle empty specs or ones that contain only whitespace or control chars. + if (begin == spec_len) { + parsed->scheme.reset(); + return; + } + + int inner_start = -1; + + // Extract the scheme. We also handle the case where there is no scheme. + if (DoExtractScheme(&spec[begin], spec_len - begin, &parsed->scheme)) { + // Offset the results since we gave ExtractScheme a substring. + parsed->scheme.begin += begin; + + if (parsed->scheme.end() == spec_len - 1) + return; // Nothing follows the outer scheme's colon. + + inner_start = parsed->scheme.end() + 1; + } else { + // No scheme found; that's not valid for filesystem URLs. + parsed->scheme.reset(); + return; + } + + Component inner_scheme; + const char* inner_spec = &spec[inner_start]; + int inner_spec_len = spec_len - inner_start; + + if (DoExtractScheme(inner_spec, inner_spec_len, &inner_scheme)) { + // Offset the results since we gave ExtractScheme a substring. + inner_scheme.begin += inner_start; + + if (inner_scheme.end() == spec_len - 1) + return; // Nothing follows the inner scheme's colon. + } else { + // No scheme found; that's not valid for filesystem URLs. + // The best we can do is return "filesystem://". + return; + } + + Parsed inner_parsed; + + if (CompareSchemeComponent(spec, inner_scheme, kFileScheme)) { + // File URLs are special. 
+ ParseFileURL(inner_spec, inner_spec_len, &inner_parsed); + } else if (CompareSchemeComponent(spec, inner_scheme, kFileSystemScheme)) { + // Filesystem URLs don't nest. + return; + } else if (IsStandard(spec, inner_scheme)) { + // All "normal" URLs. + DoParseStandardURL(inner_spec, inner_spec_len, &inner_parsed); + } else { + // Any other inner scheme is not parsed further. + return; + } + + // All members of inner_parsed need to be offset by inner_start. + // If we had any scheme that supported nesting more than one level deep, + // we'd have to recurse into the inner_parsed's inner_parsed when + // adjusting by inner_start. + // NOTE(review): components that are invalid (len == -1) are shifted as well, + // so afterwards their |begin| is inner_start rather than 0. + inner_parsed.scheme.begin += inner_start; + inner_parsed.username.begin += inner_start; + inner_parsed.password.begin += inner_start; + inner_parsed.host.begin += inner_start; + inner_parsed.port.begin += inner_start; + inner_parsed.query.begin += inner_start; + inner_parsed.ref.begin += inner_start; + inner_parsed.path.begin += inner_start; + + // Query and ref move from inner_parsed to parsed. + parsed->query = inner_parsed.query; + inner_parsed.query.reset(); + parsed->ref = inner_parsed.ref; + inner_parsed.ref.reset(); + + parsed->set_inner_parsed(inner_parsed); + // Only split the path when the inner URL has a scheme and a path and is not + // itself nested. + if (!inner_parsed.scheme.is_valid() || !inner_parsed.path.is_valid() || + inner_parsed.inner_parsed()) { + return; + } + + // The path in inner_parsed should start with a slash, then have a filesystem + // type followed by a slash. From the first slash up to but excluding the + // second should be what it keeps; the rest goes to parsed. If the path ends + // before the second slash, it's still pretty clear what the user meant, so + // we'll let that through. 
+ if (!IsURLSlash(spec[inner_parsed.path.begin])) { + return; + } + int inner_path_end = inner_parsed.path.begin + 1; // skip the leading slash + // NOTE(review): this scan is bounded by spec_len, not by the end of the + // inner path. If the inner path has no second slash, it looks like the scan + // can run on into the query/ref text and make parsed->path.len negative - + // confirm. + while (inner_path_end < spec_len && !IsURLSlash(spec[inner_path_end])) + ++inner_path_end; + parsed->path.begin = inner_path_end; + int new_inner_path_length = inner_path_end - inner_parsed.path.begin; + parsed->path.len = inner_parsed.path.len - new_inner_path_length; + parsed->inner_parsed()->path.len = new_inner_path_length; +} + +// Initializes a path URL which is merely a scheme followed by a path. Examples +// include "about:foo" and "javascript:alert('bar');" +void DoParsePathURL(const char* spec, + int spec_len, + bool trim_path_end, + Parsed* parsed) { + // Get the non-path and non-scheme parts of the URL out of the way, we never + // use them. path, query and ref are reset as well; ParsePath() fills them in + // further down when anything follows the scheme. + parsed->username.reset(); + parsed->password.reset(); + parsed->host.reset(); + parsed->port.reset(); + parsed->path.reset(); + parsed->query.reset(); + parsed->ref.reset(); + + // Strip leading & trailing spaces and control characters. + int scheme_begin = 0; + TrimURL(spec, &scheme_begin, &spec_len, trim_path_end); + + // Handle empty specs or ones that contain only whitespace or control chars. + if (scheme_begin == spec_len) { + parsed->scheme.reset(); + parsed->path.reset(); + return; + } + + int path_begin; + // Extract the scheme, with the path being everything following. We also + // handle the case where there is no scheme. + if (ExtractScheme(&spec[scheme_begin], spec_len - scheme_begin, + &parsed->scheme)) { + // Offset the results since we gave ExtractScheme a substring. + parsed->scheme.begin += scheme_begin; + path_begin = parsed->scheme.end() + 1; + } else { + // No scheme case. 
+ parsed->scheme.reset(); + path_begin = scheme_begin; + } + + if (path_begin == spec_len) + return; + assert(path_begin < spec_len); + + ParsePath(spec, MakeRange(path_begin, spec_len), &parsed->path, + &parsed->query, &parsed->ref); +} + +// Parses a mailto-style spec: an optional scheme followed by a path and an +// optional '?'-introduced query. No ref is extracted; everything after the +// first '?' becomes the query. +void DoParseMailtoURL(const char* spec, int spec_len, Parsed* parsed) { + assert(spec_len >= 0); + + // Get the non-path and non-scheme parts of the URL out of the way, we never + // use them. + parsed->username.reset(); + parsed->password.reset(); + parsed->host.reset(); + parsed->port.reset(); + parsed->ref.reset(); + parsed->query.reset(); // May use this; reset for convenience. + + // Strip leading & trailing spaces and control characters. + int begin = 0; + TrimURL(spec, &begin, &spec_len); + + // Handle empty specs or ones that contain only whitespace or control chars. + if (begin == spec_len) { + parsed->scheme.reset(); + parsed->path.reset(); + return; + } + + int path_begin = -1; + int path_end = -1; + + // Extract the scheme, with the path being everything following. We also + // handle the case where there is no scheme. + if (ExtractScheme(&spec[begin], spec_len - begin, &parsed->scheme)) { + // Offset the results since we gave ExtractScheme a substring. + parsed->scheme.begin += begin; + + if (parsed->scheme.end() != spec_len - 1) { + path_begin = parsed->scheme.end() + 1; + path_end = spec_len; + } + } else { + // No scheme found, just path. + parsed->scheme.reset(); + path_begin = begin; + path_end = spec_len; + } + + // Split [path_begin, path_end) into a path + query. + for (int i = path_begin; i < path_end; ++i) { + if (spec[i] == '?') { + parsed->query = MakeRange(i + 1, path_end); + path_end = i; + break; + } + } + + // For compatibility with the standard URL parser, treat no path as + // -1, rather than having a length of 0 + if (path_begin == path_end) { + parsed->path.reset(); + } else { + parsed->path = MakeRange(path_begin, path_end); + } +} + +// Converts a port number in a string to an integer. 
We'd like to just call +// atoi but our input is not NULL-terminated, which atoi requires. Instead, +// we copy the digits to a small stack buffer (since we know the maximum number +// of digits in a valid port number) that we can NULL terminate. +// +// Returns PORT_UNSPECIFIED when |component| is empty or invalid, and +// PORT_INVALID when it contains a non-digit, has more than five digits after +// leading zeros, or exceeds 65535. +int DoParsePort(const char* spec, const Component& component) { + // Easy success case when there is no port. + const int kMaxDigits = 5; // "65535" is the longest valid port. + if (!component.is_nonempty()) + return PORT_UNSPECIFIED; + + // Skip over any leading 0s. + Component digits_comp(component.end(), 0); + for (int i = 0; i < component.len; i++) { + if (spec[component.begin + i] != '0') { + digits_comp = MakeRange(component.begin + i, component.end()); + break; + } + } + if (digits_comp.len == 0) + return 0; // All digits were 0. + + // Verify we don't have too many digits (we'll be copying to our buffer so + // we need to double-check). + if (digits_comp.len > kMaxDigits) + return PORT_INVALID; + + // Copy valid digits to the buffer. + char digits[kMaxDigits + 1]; // +1 for null terminator + for (int i = 0; i < digits_comp.len; i++) { + char ch = spec[digits_comp.begin + i]; + if (!IsPortDigit(ch)) { + // Invalid port digit, fail. + return PORT_INVALID; + } + digits[i] = static_cast<char>(ch); + } + + // Null-terminate the string and convert to integer. Since we guarantee + // only digits, atoi's lack of error handling is OK. + digits[digits_comp.len] = 0; + int port = atoi(digits); + if (port > 65535) + return PORT_INVALID; // Out of range. + return port; +} + +// Sets |file_name| to the part of |path| after its last slash, cut off at the +// first ';' that follows that slash. With no slash at all, the file name +// starts at the beginning of |path|. An empty or invalid |path| resets +// |file_name|. +void DoExtractFileName(const char* spec, + const Component& path, + Component* file_name) { + // Handle empty paths: they have no file names. + if (!path.is_nonempty()) { + file_name->reset(); + return; + } + + // Extract the filename range from the path which is between + // the last slash and the following semicolon. 
+ int file_end = path.end(); + for (int i = path.end() - 1; i >= path.begin; i--) { + if (spec[i] == ';') { + file_end = i; + } else if (IsURLSlash(spec[i])) { + // File name is everything following this character up to file_end + *file_name = MakeRange(i + 1, file_end); + return; + } + } + + // No slash found, this means the input was degenerate (generally paths + // will start with a slash). Let's call everything the file name. + *file_name = MakeRange(path.begin, file_end); + return; +} + +// Extracts the next "key=value" pair from |query|, where pairs are separated +// by '&'. On success |key| and |value| receive the pair's ranges (either may +// be empty) and |query| is advanced to the text after the pair, so the +// function can be called in a loop until it returns false, which it does once +// |query| is empty or invalid. +bool DoExtractQueryKeyValue(const char* spec, + Component* query, + Component* key, + Component* value) { + if (!query->is_nonempty()) + return false; + + int start = query->begin; + int cur = start; + int end = query->end(); + + // We assume the beginning of the input is the beginning of the "key" and we + // skip to the end of it. + key->begin = cur; + while (cur < end && spec[cur] != '&' && spec[cur] != '=') + cur++; + key->len = cur - key->begin; + + // Skip the separator after the key (if any). + if (cur < end && spec[cur] == '=') + cur++; + + // Find the value part. 
+ value->begin = cur; + while (cur < end && spec[cur] != '&') + cur++; + value->len = cur - value->begin; + + // Finally skip the next separator if any + if (cur < end && spec[cur] == '&') + cur++; + + // Save the new query + *query = MakeRange(cur, end); + return true; +} + +} // namespace + +Parsed::Parsed() : potentially_dangling_markup(false), inner_parsed_(NULL) {} + +// Copies every component. inner_parsed_ is owned by this object (it is deleted +// in the destructor), so the copy starts out null and set_inner_parsed() is +// used to take over |other|'s inner Parsed; presumably that stores a separate +// copy - confirm against the header. +Parsed::Parsed(const Parsed& other) + : scheme(other.scheme), + username(other.username), + password(other.password), + host(other.host), + port(other.port), + path(other.path), + query(other.query), + ref(other.ref), + potentially_dangling_markup(other.potentially_dangling_markup), + inner_parsed_(NULL) { + if (other.inner_parsed_) + set_inner_parsed(*other.inner_parsed_); +} + +Parsed& Parsed::operator=(const Parsed& other) { + if (this != &other) { + scheme = other.scheme; + username = other.username; + password = other.password; + host = other.host; + port = other.port; + path = other.path; + query = other.query; + ref = other.ref; + potentially_dangling_markup = other.potentially_dangling_markup; + if (other.inner_parsed_) + set_inner_parsed(*other.inner_parsed_); + else + clear_inner_parsed(); + } + return *this; +} + +Parsed::~Parsed() { + delete inner_parsed_; +} + +// Returns the end of the ref when there is one; otherwise whatever +// CountCharactersBefore(REF, false) reports as the place a ref would go. +int Parsed::Length() const { + if (ref.is_valid()) + return ref.end(); + return CountCharactersBefore(REF, false); +} + +int Parsed::CountCharactersBefore(ComponentType type, + bool include_delimiter) const { + if (type == SCHEME) + return scheme.begin; + + // There will be some characters after the scheme like "://" and we don't + // know how many. Search forwards for the next thing until we find one. + int cur = 0; + if (scheme.is_valid()) + cur = scheme.end() + 1; // Advance over the ':' at the end of the scheme. + + if (username.is_valid()) { + if (type <= USERNAME) + return username.begin; + cur = username.end() + 1; // Advance over the '@' or ':' at the end. 
+ } + + if (password.is_valid()) { + if (type <= PASSWORD) + return password.begin; + cur = password.end() + 1; // Advance over the '@' at the end. + } + + if (host.is_valid()) { + if (type <= HOST) + return host.begin; + cur = host.end(); + } + + if (port.is_valid()) { + if (type < PORT || (type == PORT && include_delimiter)) + return port.begin - 1; // Back over delimiter. + if (type == PORT) + return port.begin; // Don't want delimiter counted. + cur = port.end(); + } + + if (path.is_valid()) { + if (type <= PATH) + return path.begin; + cur = path.end(); + } + + if (query.is_valid()) { + if (type < QUERY || (type == QUERY && include_delimiter)) + return query.begin - 1; // Back over delimiter. + if (type == QUERY) + return query.begin; // Don't want delimiter counted. + cur = query.end(); + } + + if (ref.is_valid()) { + if (type == REF && !include_delimiter) + return ref.begin; // Don't want delimiter counted. + + // When there is a ref and we get here, the component we wanted was before + // this and not found, so we always know the beginning of the ref is right. + return ref.begin - 1; // Back over delimiter. + } + + return cur; +} + +Component Parsed::GetContent() const { + const int begin = CountCharactersBefore(USERNAME, false); + const int len = Length() - begin; + // For compatibility with the standard URL parser, we treat no content as + // -1, rather than having a length of 0 (we normally wouldn't care so + // much for these non-standard URLs). + return len ? Component(begin, len) : Component(); +} + +bool ExtractScheme(const char* url, int url_len, Component* scheme) { + return DoExtractScheme(url, url_len, scheme); +} + +// This handles everything that may be an authority terminator, including +// backslash. NOTE(review): the DoParseAfterScheme in this file has no special +// backslash handling of its own, so the earlier pointer to it looked stale. +bool IsAuthorityTerminator(char ch) { + return IsURLSlash(ch) || ch == '?' 
|| ch == '#'; +} + +// The functions below are the public entry points; each one simply forwards +// to the corresponding helper defined earlier in this file. +void ExtractFileName(const char* url, + const Component& path, + Component* file_name) { + DoExtractFileName(url, path, file_name); +} + +bool ExtractQueryKeyValue(const char* url, + Component* query, + Component* key, + Component* value) { + return DoExtractQueryKeyValue(url, query, key, value); +} + +void ParseAuthority(const char* spec, + const Component& auth, + Component* username, + Component* password, + Component* hostname, + Component* port_num) { + DoParseAuthority(spec, auth, username, password, hostname, port_num); +} + +int ParsePort(const char* url, const Component& port) { + return DoParsePort(url, port); +} + +void ParseStandardURL(const char* url, int url_len, Parsed* parsed) { + DoParseStandardURL(url, url_len, parsed); +} + +void ParsePathURL(const char* url, + int url_len, + bool trim_path_end, + Parsed* parsed) { + DoParsePathURL(url, url_len, trim_path_end, parsed); +} + +void ParseFileSystemURL(const char* url, int url_len, Parsed* parsed) { + DoParseFileSystemURL(url, url_len, parsed); +} + +void ParseMailtoURL(const char* url, int url_len, Parsed* parsed) { + DoParseMailtoURL(url, url_len, parsed); +} + +void ParsePathInternal(const char* spec, + const Component& path, + Component* filepath, + Component* query, + Component* ref) { + ParsePath(spec, path, filepath, query, ref); +} + +void ParseAfterScheme(const char* spec, + int spec_len, + int after_scheme, + Parsed* parsed) { + DoParseAfterScheme(spec, spec_len, after_scheme, parsed); +} + +} // namespace openscreen diff --git a/third_party/mozilla/url_parse.h b/third_party/mozilla/url_parse.h new file mode 100644 index 00000000..70e97adf --- /dev/null +++ b/third_party/mozilla/url_parse.h @@ -0,0 +1,322 @@ +// Copyright 2013 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+ +#ifndef THIRD_PARTY_MOZILLA_URL_PARSE_H_ +#define THIRD_PARTY_MOZILLA_URL_PARSE_H_ + +namespace openscreen { + +// Component ------------------------------------------------------------------ + +// Represents a substring for URL parsing. +struct Component { + Component() : begin(0), len(-1) {} + + // Normal constructor: takes an offset and a length. + Component(int b, int l) : begin(b), len(l) {} + + int end() const { return begin + len; } + + // Returns true if this component is valid, meaning the length is given. Even + // valid components may be empty to record the fact that they exist. + bool is_valid() const { return (len != -1); } + + // Returns true if the given component is non-empty. On false, the component + // is either empty or invalid. + bool is_nonempty() const { return (len > 0); } + + void reset() { + begin = 0; + len = -1; + } + + bool operator==(const Component& other) const { + return begin == other.begin && len == other.len; + } + + int begin; // Byte offset in the string of this component. + int len; // Will be -1 if the component is unspecified. +}; + +// Helper that returns a component created with the given begin and ending +// points. The ending point is non-inclusive. +inline Component MakeRange(int begin, int end) { + return Component(begin, end - begin); +} + +// Parsed --------------------------------------------------------------------- + +// A structure that holds the identified parts of an input URL. This structure +// does NOT store the URL itself. The caller will have to store the URL text +// and its corresponding Parsed structure separately.
+// +// Typical usage would be: +// +// Parsed parsed; +// Component scheme; +// if (!ExtractScheme(url, url_len, &scheme)) +// return I_CAN_NOT_FIND_THE_SCHEME_DUDE; +// +// if (IsStandardScheme(url, scheme)) // Not provided by this component +// ParseStandardURL(url, url_len, &parsed); +// else if (IsFileURL(url, scheme)) // Not provided by this component +// ParseFileURL(url, url_len, &parsed); +// else +// ParsePathURL(url, url_len, /*trim_path_end=*/true, &parsed); +// +struct Parsed { + // Identifies different components. + enum ComponentType { + SCHEME, + USERNAME, + PASSWORD, + HOST, + PORT, + PATH, + QUERY, + REF, + }; + + // The default constructor is sufficient for the components, but inner_parsed_ + // requires special handling. + Parsed(); + Parsed(const Parsed&); + Parsed& operator=(const Parsed&); + ~Parsed(); + + // Returns the length of the URL (the end of the last component). + // + // Note that for some invalid, non-canonical URLs, this may not be the length + // of the string. For example "http://": the parsed structure will only + // contain an entry for the four-character scheme, and it doesn't know about + // the "://". For all other last-components, it will return the real length. + int Length() const; + + // Returns the number of characters before the given component if it exists, + // or where the component would be if it did exist. This will return the + // string length if the component would be appended to the end. + // + // Note that this can get a little funny for the port, query, and ref + // components which have a delimiter that is not counted as part of the + // component. The |include_delimiter| flag controls if you want this counted + // as part of the component or not when the component exists. + // + // This example shows the difference between the two flag values for two of + // these delimited components that are present (the port and query) and one + // that isn't (the reference). The components that this flag affects are marked + // with a *.
+ // 0 1 2 + // 012345678901234567890 + // Example input: http://foo:80/?query + // include_delim=true, ...=false ("<-" indicates different) + // SCHEME: 0 0 + // USERNAME: 5 5 + // PASSWORD: 5 5 + // HOST: 7 7 + // *PORT: 10 11 <- + // PATH: 13 13 + // *QUERY: 14 15 <- + // *REF: 20 20 + // + int CountCharactersBefore(ComponentType type, bool include_delimiter) const; + + // Scheme without the colon: "http://foo/" would have a scheme of "http". + // The length will be -1 if no scheme is specified ("foo.com"), or 0 if there + // is a colon but no scheme (":foo"). Note that the scheme is not guaranteed + // to start at the beginning of the string if there are preceding whitespace + // or control characters. + Component scheme; + + // Username. Specified in URLs with an @ sign before the host. See |password| + Component username; + + // Password. The length will be -1 if unspecified, 0 if specified but empty. + // Not all URLs with a username have a password, as in "http://me@host/". + // The password is separated from the username with a colon, as in + // "http://me:secret@host/" + Component password; + + // Host name. + Component host; + + // Port number. + Component port; + + // Path, this is everything following the host name, stopping at the query or + // ref delimiter (if any). Length will be -1 if unspecified. This includes + // the preceding slash, so the path on "http://www.google.com/asdf" is + // "/asdf". As a result, it is impossible to have a 0 length path, it will + // be -1 in cases like "http://host?foo". + // Note that we treat backslashes the same as slashes. + Component path; + + // Stuff between the ? and the # after the path. This does not include the + // preceding ? character. Length will be -1 if unspecified, 0 if there is + // a question mark but no query string. + Component query; + + // Indicated by a #, this is everything following the hash sign (not + // including it). If there are multiple hash signs, we'll use the last one.
+ // Length will be -1 if there is no hash sign, or 0 if there is one but + // nothing follows it. + Component ref; + + // The URL spec from the character after the scheme: until the end of the + // URL, regardless of the scheme. This is mostly useful for 'opaque' non- + // hierarchical schemes like data: and javascript: as a convenient way to get + // the string with the scheme stripped off. + Component GetContent() const; + + // True if the URL's source contained a raw `<` character, and whitespace was + // removed from the URL during parsing. + // + // TODO(mkwst): Link this to something in a spec if + // https://github.com/whatwg/url/pull/284 lands. + bool potentially_dangling_markup; + + // This is used for nested URL types, currently only filesystem. If you + // parse a filesystem URL, the resulting Parsed will have a nested + // inner_parsed_ to hold the parsed inner URL's component information. + // For all other url types [including the inner URL], it will be NULL. + Parsed* inner_parsed() const { return inner_parsed_; } + + void set_inner_parsed(const Parsed& inner_parsed) { + if (!inner_parsed_) + inner_parsed_ = new Parsed(inner_parsed); + else + *inner_parsed_ = inner_parsed; + } + + void clear_inner_parsed() { + if (inner_parsed_) { + delete inner_parsed_; + inner_parsed_ = nullptr; + } + } + + private: + Parsed* inner_parsed_; // This object is owned and managed by this struct. +}; + +// Initialization functions --------------------------------------------------- +// +// These functions parse the given URL, filling in all of the structure's +// components. These functions can not fail, they will always do their best +// at interpreting the input given. +// +// The string length of the URL MUST be specified, we do not check for NULLs +// at any point in the process, and will actually handle embedded NULLs. +// +// IMPORTANT: These functions do NOT hang on to the given pointer or copy it +// in any way. See the comment above the struct.
+// +// The 8-bit versions require UTF-8 encoding. + +// StandardURL is for when the scheme is known to be one that has an +// authority (host) like "http". This function will not handle weird ones +// like "about:" and "javascript:", or do the right thing for "file:" URLs. +void ParseStandardURL(const char* url, int url_len, Parsed* parsed); + +// PathURL is for URLs whose scheme is known not to have an authority (host) +// section but that aren't file URLs either. The scheme is parsed, and +// everything after the scheme is considered as the path. This is used for +// things like "about:" and "javascript:". +void ParsePathURL(const char* url, + int url_len, + bool trim_path_end, + Parsed* parsed); + +// FileURL is for file URLs. There are some special rules for interpreting +// these. NOTE: currently an empty stub; file URLs are not yet supported. +void ParseFileURL(const char* url, int url_len, Parsed* parsed); + +// Filesystem URLs are structured differently than other URLs. +void ParseFileSystemURL(const char* url, int url_len, Parsed* parsed); + +// MailtoURL is for mailto: urls. They are made up of a scheme, path, and query. +void ParseMailtoURL(const char* url, int url_len, Parsed* parsed); + +// Helper functions ----------------------------------------------------------- + +// Locates the scheme according to the URL parser's rules. This function is +// designed so the caller can find the scheme and call the correct Parse* +// function according to their known scheme types. +// +// It also does not perform any validation on the scheme. +// +// This function will return true if the scheme is found and will put the +// scheme's range into *scheme. False means no scheme could be found. Note +// that a URL beginning with a colon has a scheme, but it is empty, so this +// function will return true but *scheme will = (0,0). +// +// The scheme is found by skipping spaces and control characters at the +// beginning, and taking everything from there to the first colon to be the +// scheme.
The character at scheme.end() will be the colon (we may enhance +// this to handle full width colons or something, so don't count on the +// actual character value). The character at scheme.end()+1 will be the +// beginning of the rest of the URL, be it the authority or the path (or the +// end of the string). +// +// The 8-bit version requires UTF-8 encoding. +bool ExtractScheme(const char* url, int url_len, Component* scheme); + +// Returns true if ch is a character that terminates the authority segment +// of a URL. +bool IsAuthorityTerminator(char ch); + +// Does a best effort parse of input |spec|, in range |auth|. If a particular +// component is not found, it will be set to invalid. +void ParseAuthority(const char* spec, + const Component& auth, + Component* username, + Component* password, + Component* hostname, + Component* port_num); + +// Computes the integer port value from the given port component. The port +// component should have been identified by one of the parse functions on +// |Parsed| for the given input url. +// +// The return value will be a non-negative integer between 0 and 64K, or one of +// the two special values below. +enum SpecialPort { PORT_UNSPECIFIED = -1, PORT_INVALID = -2 }; +int ParsePort(const char* url, const Component& port); + +// Extracts the range of the file name in the given url. The path must +// already have been computed by the parse function, and the matching URL +// and extracted path are provided to this function. The filename is +// defined as being everything from the last slash/backslash of the path +// to the end of the path. +// +// The file name will be empty if the path is empty or there is nothing +// following the last slash. +// +// The 8-bit version requires UTF-8 encoding. +void ExtractFileName(const char* url, + const Component& path, + Component* file_name); + +// Extract the first key/value from the range defined by |*query|. Updates +// |*query| to start at the end of the extracted key/value pair.
This is +// designed for use in a loop: you can keep calling it with the same query +// object and it will iterate over all items in the query. +// +// Some key/value pairs may have the key, the value, or both be empty (for +// example, the query string "?&"). These will be returned. Note that an empty +// last parameter "foo.com?" or "foo.com?a&" will not be returned, this case +// is the same as "done." +// +// The initial query component should not include the '?' (this is the default +// for parsed URLs). +// +// If no key/value are found |*key| and |*value| will be unchanged and it will +// return false. +bool ExtractQueryKeyValue(const char* url, + Component* query, + Component* key, + Component* value); + +} // namespace openscreen + +#endif // THIRD_PARTY_MOZILLA_URL_PARSE_H_ diff --git a/third_party/mozilla/url_parse_internal.cc b/third_party/mozilla/url_parse_internal.cc new file mode 100644 index 00000000..136bc62d --- /dev/null +++ b/third_party/mozilla/url_parse_internal.cc @@ -0,0 +1,87 @@ +// Copyright 2020 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "third_party/mozilla/url_parse_internal.h" + +#include <ctype.h> + +#include "third_party/mozilla/url_parse.h" + +namespace openscreen { + +namespace { + +static const char* g_standard_schemes[] = { + kHttpsScheme, kHttpScheme, kFileScheme, kFtpScheme, + kWssScheme, kWsScheme, kFileSystemScheme, +}; + +} // namespace + +bool IsURLSlash(char ch) { + return ch == '/' || ch == '\\'; +} + +bool ShouldTrimFromURL(char ch) { + return static_cast<unsigned char>(ch) <= ' '; // Unsigned compare: bytes >= 0x80 (UTF-8) must never be trimmed. +} + +void TrimURL(const char* spec, int* begin, int* len, bool trim_path_end) { + // Strip leading whitespace and control characters. + while (*begin < *len && ShouldTrimFromURL(spec[*begin])) { + (*begin)++; + } + + if (trim_path_end) { + // Strip trailing whitespace and control characters.
We need the |*len > *begin| test + // for when the input string is all blanks; we don't want to back past the + // input. + while (*len > *begin && ShouldTrimFromURL(spec[*len - 1])) { + (*len)--; + } + } +} + +int CountConsecutiveSlashes(const char* str, int begin_offset, int str_len) { + int count = 0; + while ((begin_offset + count) < str_len && + IsURLSlash(str[begin_offset + count])) { + ++count; + } + return count; +} + +bool CompareSchemeComponent(const char* spec, + const Component& component, + const char* compare_to) { + if (!component.is_nonempty()) { + return compare_to[0] == 0; // When component is empty, match empty scheme. + } + for (int i = 0; i < component.len; ++i) { + if (compare_to[i] == 0 || tolower(static_cast<unsigned char>(spec[component.begin + i])) != compare_to[i]) { // Read relative to component.begin; stop at end of |compare_to|; cast keeps tolower() defined for bytes >= 0x80. + return false; + } + } + return compare_to[component.len] == 0; // Whole |compare_to| must be consumed: a prefix is not a match. +} + +bool IsStandard(const char* spec, const Component& component) { + if (!component.is_nonempty()) { + return false; + } + + constexpr int scheme_count = + sizeof(g_standard_schemes) / sizeof(g_standard_schemes[0]); + for (int i = 0; i < scheme_count; ++i) { + if (CompareSchemeComponent(spec, component, g_standard_schemes[i])) { + return true; + } + } + return false; +} + +// NOTE: Not implemented because file URLs are currently unsupported. +void ParseFileURL(const char* url, int url_len, Parsed* parsed) {} + +} // namespace openscreen diff --git a/third_party/mozilla/url_parse_internal.h b/third_party/mozilla/url_parse_internal.h new file mode 100644 index 00000000..58f9f75b --- /dev/null +++ b/third_party/mozilla/url_parse_internal.h @@ -0,0 +1,50 @@ +// Copyright 2020 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file.
+ +#ifndef THIRD_PARTY_MOZILLA_URL_PARSE_INTERNAL_H_ +#define THIRD_PARTY_MOZILLA_URL_PARSE_INTERNAL_H_ + +namespace openscreen { + +struct Component; + +static constexpr char kHttpsScheme[] = "https"; +static constexpr char kHttpScheme[] = "http"; +static constexpr char kFileScheme[] = "file"; +static constexpr char kFtpScheme[] = "ftp"; +static constexpr char kWssScheme[] = "wss"; +static constexpr char kWsScheme[] = "ws"; +static constexpr char kFileSystemScheme[] = "filesystem"; +static constexpr char kMailtoScheme[] = "mailto"; + +// Returns whether the character |ch| should be treated as a slash. +bool IsURLSlash(char ch); + +// Returns whether the character |ch| can be safely removed from the URL. +bool ShouldTrimFromURL(char ch); + +// Given an already-initialized begin index and length, this shrinks the range +// to eliminate "should-be-trimmed" characters. Note that the length does *not* +// indicate the length of untrimmed data from |*begin|, but rather the position +// in the input string (so the string starts at character |*begin| in the spec, +// and goes until |*len|). The end is trimmed only if |trim_path_end| is true. +void TrimURL(const char* spec, int* begin, int* len, bool trim_path_end = true); + +// Returns the number of consecutive slashes in |str| starting from offset +// |begin_offset|. +int CountConsecutiveSlashes(const char* str, int begin_offset, int str_len); + +// Given a string and a range inside the string, compares it to the given +// lower-case |compare_to| buffer. +bool CompareSchemeComponent(const char* spec, + const Component& component, + const char* compare_to); + +// Returns whether the scheme given by (spec, component) is a standard scheme +// (i.e. https://url.spec.whatwg.org/#special-scheme). +bool IsStandard(const char* spec, const Component& component); + +} // namespace openscreen + +#endif // THIRD_PARTY_MOZILLA_URL_PARSE_INTERNAL_H_ |