url: forbid certain confusable changes from being introduced by toASCII

The legacy url.parse() function attempts to convert Unicode domains (IDNs) into their ASCII/Punycode form through the use of the toASCII function. However, toASCII can introduce or remove various characters that at best invalidate the parsed URL, and at worst cause hostname spoofing: url.parse('http://bad.c℀.good.com/').href === 'http://bad.ca/c.good.com/' (from [1]) url.parse('http://\u00AD/bad.com').href === 'http:///bad.com/' While changes to the legacy URL parser are discouraged in general, the security implications here outweigh the desire for strict compatibility. This is since this commit only changes behavior when non-ASCII characters appear in the hostname, an unusual situation for most use cases. Additionally, despite the availability of the WHATWG URL API, url.parse remain widely deployed in the Node.js ecosystem, as exemplified by the recent un-deprecation of the legacy API. This change is similar in spirit to CPython 3.8's change [2] fixing bpo-36216 [3] aka CVE-2019-9636, which also occurred despite potential compatibility concerns. [1]: https://hackerone.com/reports/678487 [2]: https://github.com/python/cpython/commit/16e6f7dee7f02bb81aa6b385b982dcdda5b99286 [3]: https://bugs.python.org/issue36216 PR-URL: https://github.com/nodejs/node/pull/38631 Reviewed-By: James M Snell <jasnell@gmail.com> Reviewed-By: Rich Trott <rtrott@gmail.com> Reviewed-By: Matteo Collina <matteo.collina@gmail.com> Reviewed-By: Joyee Cheung <joyeec9h3@gmail.com>
author: Timothy Gu <timothygu99@gmail.com> 2021-05-11 10:59:30 +0300
committer: Timothy Gu <timothygu99@gmail.com> 2021-05-14 09:04:23 +0300
commit: 70157b9cb7d703ee9c44ff56522c65829a599d67 (patch)
tree: 199384c093fb57dcfd54862e2c76cdad947a7e3e /lib
parent: 41fab0d6cba15c8e149d2a4b2d1adde98d5ac3ad (diff)
1 files changed, 30 insertions, 2 deletions
diff --git a/lib/url.js b/lib/url.js
index 177bd9eb59c..125fa11ebd4 100644
--- a/lib/url.js
+++ b/lib/url.js
@@ -34,7 +34,8 @@ const { toASCII } = require('internal/idna');
 const { encodeStr, hexTable } = require('internal/querystring');
 
 const {
-  ERR_INVALID_ARG_TYPE
+  ERR_INVALID_ARG_TYPE,
+  ERR_INVALID_URL,
 } = require('internal/errors').codes;
 const { validateString } = require('internal/validators');
 
@@ -167,6 +168,20 @@ function isIpv6Hostname(hostname) {
   );
 }
 
+// This prevents some common spoofing bugs due to our use of IDNA toASCII. For
+// compatibility, the set of characters we use here is the *intersection* of
+// "forbidden host code point" in the WHATWG URL Standard [1] and the
+// characters in the host parsing loop in Url.prototype.parse, with the
+// following additions:
+//
+// - ':' since this could cause a "protocol spoofing" bug
+// - '@' since this could cause parts of the hostname to be confused with auth
+// - '[' and ']' since this could cause a non-IPv6 hostname to be interpreted
+//   as IPv6 by isIpv6Hostname above
+//
+// [1]: https://url.spec.whatwg.org/#forbidden-host-code-point
+const forbiddenHostChars = /[\t\n\r #%/:<>?@[\\\]^|]/;
+
 Url.prototype.parse = function parse(url, parseQueryString, slashesDenoteHost) {
   validateString(url, 'url');
 
@@ -389,7 +404,7 @@ Url.prototype.parse = function parse(url, parseQueryString, slashesDenoteHost) {
       this.hostname = this.hostname.toLowerCase();
     }
 
-    if (!ipv6Hostname) {
+    if (!ipv6Hostname && this.hostname !== '') {
       // IDNA Support: Returns a punycoded representation of "domain".
       // It only converts parts of the domain name that
       // have non-ASCII characters, i.e. it doesn't matter if
@@ -398,6 +413,19 @@ Url.prototype.parse = function parse(url, parseQueryString, slashesDenoteHost) {
       // Use lenient mode (`true`) to try to support even non-compliant
       // URLs.
       this.hostname = toASCII(this.hostname, true);
+
+      // Prevent two potential routes of hostname spoofing.
+      // 1. If this.hostname is empty, it must have become empty due to toASCII
+      //    since we checked this.hostname above.
+      // 2. If any of forbiddenHostChars appears in this.hostname, it must have
+      //    also gotten in due to toASCII. This is since getHostname would have
+      //    filtered them out otherwise.
+      // Rather than trying to correct this by moving the non-host part into
+      // the pathname as we've done in getHostname, throw an exception to
+      // convey the severity of this issue.
+      if (this.hostname === '' || forbiddenHostChars.test(this.hostname)) {
+        throw new ERR_INVALID_URL(url);
+      }
     }
 
     const p = this.port ? ':' + this.port : '';
author	Timothy Gu <timothygu99@gmail.com>	2021-05-11 10:59:30 +0300
committer	Timothy Gu <timothygu99@gmail.com>	2021-05-14 09:04:23 +0300
commit	70157b9cb7d703ee9c44ff56522c65829a599d67 (patch)
tree	199384c093fb57dcfd54862e2c76cdad947a7e3e /lib
parent	41fab0d6cba15c8e149d2a4b2d1adde98d5ac3ad (diff)