// 561 lines · 20 KiB · JavaScript
'use strict';
|
|
|
|
/**
 * Tell whether `vhost` matches the end of `hostname` on a label boundary.
 *
 * `vhost` has to be a textual suffix of `hostname`, and it must either cover
 * `hostname` entirely or start right after a '.', so that a partial label is
 * never accepted:
 *
 *  * hostname = 'not.evil.com', vhost = 'vil.com'      => false
 *  * hostname = 'not.evil.com', vhost = 'evil.com'     => true
 *  * hostname = 'not.evil.com', vhost = 'not.evil.com' => true
 */
function shareSameDomainSuffix(hostname, vhost) {
  if (!hostname.endsWith(vhost)) {
    return false;
  }

  // Index in `hostname` where `vhost` begins; 0 means both strings are equal.
  const suffixStart = hostname.length - vhost.length;
  return suffixStart === 0 || hostname[suffixStart - 1] === '.';
}
|
|
/**
 * Given a hostname and its public suffix, return the general domain: the
 * public suffix plus the single label that precedes it.
 *
 * Examples (public suffix 'co.uk'):
 *   'not.evil.co.uk' => 'evil.co.uk'    (a '.' exists before 'evil')
 *   'example.co.uk'  => 'example.co.uk' (no '.' before 'example')
 */
function extractDomainWithSuffix(hostname, publicSuffix) {
  // Last character of the label sitting right before the public suffix; the
  // backwards search for '.' starts there so the separator between that label
  // and the suffix itself is skipped.
  const searchFrom = hostname.length - publicSuffix.length - 2;
  const separatorIndex = hostname.lastIndexOf('.', searchFrom);

  // Without a separator the whole hostname already is the general domain.
  return separatorIndex === -1 ? hostname : hostname.slice(separatorIndex + 1);
}
|
|
/**
 * Compute the general domain of `hostname` from its public `suffix`.
 *
 * Returns the first entry of `options.validHosts` that matches `hostname` on a
 * label boundary, `null` when `hostname` (ignoring leading dots) is nothing
 * but the public suffix, and otherwise the suffix extended by one label.
 */
function getDomain(suffix, hostname, options) {
  // Explicitly allowed hosts take precedence over the public suffix.
  const { validHosts } = options;
  if (validHosts !== null) {
    for (const vhost of validHosts) {
      if (/*@__INLINE__*/ shareSameDomainSuffix(hostname, vhost)) {
        return vhost;
      }
    }
  }

  // Count the dots in front of the first label.
  let leadingDots = 0;
  while (leadingDots < hostname.length && hostname[leadingDots] === '.') {
    leadingDots += 1;
  }

  // `suffix` is known to be a suffix of `hostname`, so comparing sizes is
  // enough to detect "hostname is only a public suffix": no domain then.
  if (hostname.length - leadingDots === suffix.length) {
    return null;
  }

  // The domain is the public suffix plus one more level, e.g. hostname
  // `not.evil.co.uk` with suffix `co.uk` gives `evil.co.uk`.
  return /*@__INLINE__*/ extractDomainWithSuffix(hostname, suffix);
}
|
|
|
|
/**
 * Strip the public suffix (and the '.' in front of it) from a domain.
 *
 * Example: domain 'foo.com' with suffix 'com' yields 'foo'.
 */
function getDomainWithoutSuffix(domain, suffix) {
  // Callers set `domain` to `null` when it would equal `suffix`, so `suffix`
  // is expected to be strictly shorter than `domain` here. `substring` clamps
  // a negative end to 0, so an oversized suffix still yields ''.
  const labelEnd = domain.length - suffix.length - 1;
  return domain.substring(0, labelEnd);
}
|
|
|
|
/**
 * Extract the hostname portion of `url` with a single pass over its characters.
 *
 * @param url - URL we want to extract a hostname from.
 * @param urlIsValidHostname - hint from caller; true if `url` is already a
 *   valid hostname, in which case only trailing dots are trimmed.
 * @returns The hostname, lower-cased if an upper-case ASCII letter was seen
 *   while scanning; the content between brackets (lower-cased) for an IPv6
 *   literal; or `null` for `data:` URLs, schemes containing invalid
 *   characters and IPv6 literals without a closing bracket.
 */
function extractHostname(url, urlIsValidHostname) {
  let start = 0;
  let end = url.length;
  let hasUpper = false;

  // If url is not already a valid hostname, then try to extract hostname.
  if (!urlIsValidHostname) {
    // Special handling of data URLs
    if (url.startsWith('data:')) {
      return null;
    }

    // Trim leading spaces (and any other control character)
    while (start < url.length && url.charCodeAt(start) <= 32) {
      start += 1;
    }

    // Trim trailing spaces
    while (end > start + 1 && url.charCodeAt(end - 1) <= 32) {
      end -= 1;
    }

    // Skip scheme.
    if (
      url.charCodeAt(start) === 47 /* '/' */ &&
      url.charCodeAt(start + 1) === 47 /* '/' */
    ) {
      // Protocol-relative URL: '//host/...'
      start += 2;
    } else {
      const indexOfProtocol = url.indexOf(':/', start);
      if (indexOfProtocol !== -1) {
        // Fast-path for the most common schemes, so that the more expensive
        // character-by-character validity check is skipped most of the time.
        const protocolSize = indexOfProtocol - start;
        const c0 = url.charCodeAt(start);
        const c1 = url.charCodeAt(start + 1);
        const c2 = url.charCodeAt(start + 2);
        const c3 = url.charCodeAt(start + 3);
        const c4 = url.charCodeAt(start + 4);

        const isCommonScheme =
          // 'https'
          (protocolSize === 5 &&
            c0 === 104 /* 'h' */ &&
            c1 === 116 /* 't' */ &&
            c2 === 116 /* 't' */ &&
            c3 === 112 /* 'p' */ &&
            c4 === 115) /* 's' */ ||
          // 'http'
          (protocolSize === 4 &&
            c0 === 104 /* 'h' */ &&
            c1 === 116 /* 't' */ &&
            c2 === 116 /* 't' */ &&
            c3 === 112) /* 'p' */ ||
          // 'wss'
          (protocolSize === 3 &&
            c0 === 119 /* 'w' */ &&
            c1 === 115 /* 's' */ &&
            c2 === 115) /* 's' */ ||
          // 'ws'
          (protocolSize === 2 &&
            c0 === 119 /* 'w' */ &&
            c1 === 115) /* 's' */;

        if (!isCommonScheme) {
          // Check that scheme is valid: letters, digits, '.', '-' or '+'.
          for (let i = start; i < indexOfProtocol; i += 1) {
            // Setting bit 32 folds 'A'-'Z' onto 'a'-'z' and leaves digits,
            // '.', '-' and '+' unchanged.
            const lowerCaseCode = url.charCodeAt(i) | 32;
            const isSchemeChar =
              (lowerCaseCode >= 97 && lowerCaseCode <= 122) || // [a, z]
              (lowerCaseCode >= 48 && lowerCaseCode <= 57) || // [0, 9]
              lowerCaseCode === 46 || // '.'
              lowerCaseCode === 45 || // '-'
              lowerCaseCode === 43; // '+'
            if (!isSchemeChar) {
              return null;
            }
          }
        }

        // Skip 0, 1 or more '/' after ':/'
        start = indexOfProtocol + 2;
        while (url.charCodeAt(start) === 47 /* '/' */) {
          start += 1;
        }
      }
    }

    // Detect first occurrence of '/', '?' or '#'. We also keep track of the
    // last occurrence of '@', ']' or ':' to speed-up subsequent parsing of
    // (respectively), identifier, ipv6 or port.
    let indexOfIdentifier = -1;
    let indexOfClosingBracket = -1;
    let indexOfPort = -1;
    for (let i = start; i < end; i += 1) {
      const code = url.charCodeAt(i);
      if (
        code === 35 || // '#'
        code === 47 || // '/'
        code === 63 // '?'
      ) {
        end = i;
        break;
      } else if (code === 64) {
        // '@'
        indexOfIdentifier = i;
      } else if (code === 93) {
        // ']'
        indexOfClosingBracket = i;
      } else if (code === 58) {
        // ':'
        indexOfPort = i;
      } else if (code >= 65 && code <= 90) {
        // 'A'-'Z'
        hasUpper = true;
      }
    }

    // Detect identifier: '@'
    if (
      indexOfIdentifier !== -1 &&
      indexOfIdentifier > start &&
      indexOfIdentifier < end
    ) {
      start = indexOfIdentifier + 1;
    }

    // Handle ipv6 addresses
    if (url.charCodeAt(start) === 91 /* '[' */) {
      // The closing bracket must come after the opening one. A ']' recorded
      // earlier (e.g. inside the identifier part) does not close this literal
      // and would otherwise produce an empty hostname.
      if (indexOfClosingBracket > start) {
        return url.slice(start + 1, indexOfClosingBracket).toLowerCase();
      }
      return null;
    } else if (indexOfPort !== -1 && indexOfPort > start && indexOfPort < end) {
      // Detect port: ':'
      end = indexOfPort;
    }
  }

  // Trim trailing dots
  while (end > start + 1 && url.charCodeAt(end - 1) === 46 /* '.' */) {
    end -= 1;
  }

  // Avoid a copy when the hostname spans the whole input.
  const hostname =
    start !== 0 || end !== url.length ? url.slice(start, end) : url;

  if (hasUpper) {
    return hostname.toLowerCase();
  }

  return hostname;
}
|
|
|
|
/**
 * Heuristic IPv4 detection: 7 to 15 characters made only of digits and
 * exactly three dots, neither leading nor trailing. The value of each octet is
 * not checked; the heuristic is only meaningful because `hostname` is already
 * guaranteed to be a valid hostname.
 */
function isProbablyIpv4(hostname) {
  const size = hostname.length;

  // Cannot be shorter than '1.1.1.1' nor longer than '255.255.255.255'.
  if (size < 7 || size > 15) {
    return false;
  }

  // A dot on either edge can never belong to an address.
  if (
    hostname.charCodeAt(0) === 46 /* '.' */ ||
    hostname.charCodeAt(size - 1) === 46 /* '.' */
  ) {
    return false;
  }

  let dotCount = 0;
  for (let i = 0; i < size; i += 1) {
    const code = hostname.charCodeAt(i);
    if (code === 46 /* '.' */) {
      dotCount += 1;
    } else if (code < 48 /* '0' */ || code > 57 /* '9' */) {
      return false;
    }
  }

  return dotCount === 3;
}
|
|
/**
 * Heuristic IPv6 detection, similar to isProbablyIpv4: after dropping an
 * optional leading '[' and trailing ']', the text must be at most 39
 * characters long, contain only hexadecimal digits and ':', and include at
 * least one ':'.
 *
 * @param hostname - candidate string; brackets are optional.
 * @returns true if `hostname` looks like an IPv6 address.
 */
function isProbablyIpv6(hostname) {
  if (hostname.length < 3) {
    return false;
  }

  let start = hostname.startsWith('[') ? 1 : 0;
  let end = hostname.length;
  if (hostname[end - 1] === ']') {
    end -= 1;
  }

  // We only consider the maximum size of a normal IPV6. Note that this will
  // fail on so-called "IPv4 mapped IPv6 addresses" but this is a corner-case
  // and a proper validation library should be used for these.
  if (end - start > 39) {
    return false;
  }

  let hasColon = false;
  for (; start < end; start += 1) {
    const code = hostname.charCodeAt(start);
    if (code === 58 /* ':' */) {
      hasColon = true;
    } else if (
      !(
        (code >= 48 && code <= 57) || // 0-9
        (code >= 97 && code <= 102) || // a-f
        // Upper bound is 'F' (70): letters G-Z are not hexadecimal digits and
        // must not let inputs such as 'HOST:80' pass as an address.
        (code >= 65 && code <= 70) // A-F
      )
    ) {
      return false;
    }
  }

  return hasColon;
}
|
|
/**
 * Tell whether `hostname` is *probably* an IP address, IPv6 or IPv4.
 * The underlying heuristics assume `hostname` is a valid hostname and are not
 * reliable on arbitrary strings.
 */
function isIp(hostname) {
  if (isProbablyIpv6(hostname)) {
    return true;
  }
  return isProbablyIpv4(hostname);
}
|
|
|
|
/**
 * Implements fast shallow verification of hostnames. This does not perform a
 * strict check on the content of labels (classes of Unicode characters, etc.)
 * but instead checks that the structure is valid (number of labels, length of
 * labels, etc.).
 *
 * If you need stricter validation, consider using an external library.
 */
|
|
/**
 * Tell whether a character code is accepted inside a hostname label:
 * a lower-case letter 'a'-'z', a digit '0'-'9', or any code above 127.
 */
function isValidAscii(code) {
  if (code > 127) {
    return true;
  }
  if (code >= 97 && code <= 122) {
    // 'a'-'z'
    return true;
  }
  // '0'-'9'
  return code >= 48 && code <= 57;
}
|
|
/**
 * Structural validation of a hostname string, usually run as a preliminary
 * check before getDomain and friends.
 *
 * Checks performed: total length between 1 and 255, every label at most 63
 * characters, no empty label between two dots, no label ending with '-' or
 * '_' before a dot, no trailing '-', and only characters accepted by
 * `isValidAscii` plus '-', '_' and '.'.
 *
 * Beware: it does not check if the TLD exists.
 */
function isValidHostname(hostname) {
  const len = hostname.length;
  if (len === 0 || len > 255) {
    return false;
  }

  // The first character may be a regular label character, '.' or '_'.
  const firstCode = hostname.charCodeAt(0);
  const firstIsAllowed =
    firstCode === 46 /* '.' */ ||
    firstCode === 95 /* '_' */ ||
    /*@__INLINE__*/ isValidAscii(firstCode);
  if (!firstIsAllowed) {
    return false;
  }

  // Index of the first character of the label currently being scanned.
  let labelStart = 0;
  let previousCode = -1;
  for (let i = 0; i < len; i += 1) {
    const code = hostname.charCodeAt(i);
    if (code === 46 /* '.' */) {
      const labelIsInvalid =
        // Label that just ended is longer than 63 characters
        i - labelStart > 63 ||
        // Previous character was already a '.'
        previousCode === 46 ||
        // Label ends with '-' (dash)
        previousCode === 45 ||
        // Label ends with '_' (underscore)
        previousCode === 95;
      if (labelIsInvalid) {
        return false;
      }
      labelStart = i + 1;
    } else if (
      code !== 45 /* '-' */ &&
      code !== 95 /* '_' */ &&
      !(/*@__INLINE__*/ isValidAscii(code))
    ) {
      // Forbidden character inside a label
      return false;
    }
    previousCode = code;
  }

  // The last label must fit in 63 characters and must not end with '-'; all
  // other character constraints were already enforced in the loop.
  return len - labelStart <= 63 && previousCode !== 45;
}
|
|
|
|
/**
 * Build a complete options object from a partial one; any option left
 * `undefined` receives its default value.
 */
function setDefaultsImpl(options) {
  const {
    allowIcannDomains = true,
    allowPrivateDomains = false,
    detectIp = true,
    extractHostname = true,
    mixedInputs = true,
    validHosts = null,
    validateHostname = true,
  } = options;

  return {
    allowIcannDomains,
    allowPrivateDomains,
    detectIp,
    extractHostname,
    mixedInputs,
    validHosts,
    validateHostname,
  };
}

// Computed once and shared by every call made without options.
const DEFAULT_OPTIONS = /*@__INLINE__*/ setDefaultsImpl({});

/**
 * Resolve user-provided options: the shared default object when `options` is
 * `undefined`, otherwise a fresh object with defaults filled in.
 */
function setDefaults(options) {
  return options === undefined
    ? DEFAULT_OPTIONS
    : /*@__INLINE__*/ setDefaultsImpl(options);
}
|
|
|
|
/**
 * Return the part of `hostname` located before `domain` (without the
 * separating '.'), or '' when `hostname` has no sub-domain.
 */
function getSubdomain(hostname, domain) {
  // Same size means `hostname` is the domain itself.
  if (hostname.length === domain.length) {
    return '';
  }

  // Drop `domain` and the '.' preceding it; `substring` clamps a negative
  // end to 0.
  const subdomainEnd = hostname.length - domain.length - 1;
  return hostname.substring(0, subdomainEnd);
}
|
|
|
|
/**
 * Implement a factory allowing to plug different implementations of suffix
 * lookup (e.g.: using a trie or the packed hashes data structures). This is
 * used and exposed in `tldts.ts` and `tldts-experimental.ts` bundle entrypoints.
 */
|
|
/**
 * Create a fresh result object with every field set to `null`.
 */
function getEmptyResult() {
  const emptyResult = {
    domain: null,
    domainWithoutSuffix: null,
    hostname: null,
    isIcann: null,
    isIp: null,
    isPrivate: null,
    publicSuffix: null,
    subdomain: null,
  };
  return emptyResult;
}
|
|
// Fields cleared by `resetResult`, in assignment order.
const RESULT_FIELDS = Object.freeze([
  'domain',
  'domainWithoutSuffix',
  'hostname',
  'isIcann',
  'isIp',
  'isPrivate',
  'publicSuffix',
  'subdomain',
]);

/**
 * Set every result field of `result` back to `null`, in place, so the same
 * object can be reused.
 */
function resetResult(result) {
  for (const field of RESULT_FIELDS) {
    result[field] = null;
  }
}
|
|
/**
 * Shared parsing pipeline: fills `result` step by step (hostname, IP
 * detection, hostname validation, public suffix, domain, sub-domain, domain
 * without suffix) and returns it as soon as the requested `step` is reached or
 * a stage yields `null`.
 *
 * `step` values compared below: 0 = hostname, 2 = public suffix, 3 = domain,
 * 4 = sub-domain; any other value runs the pipeline to the end.
 * `suffixLookup(hostname, options, result)` is expected to fill
 * `result.publicSuffix`.
 */
function parseImpl(url, step, suffixLookup, partialOptions, result) {
  const options = /*@__INLINE__*/ setDefaults(partialOptions);

  // The library is not necessarily used in a typed setup, so values of
  // arbitrary types may be given; anything but a string is left unparsed.
  if (typeof url !== 'string') {
    return result;
  }

  // Hostname extraction is optional: `extractHostname: false` means inputs
  // are trusted to be hostnames already. With `mixedInputs` the input is
  // first tested as a hostname so that extraction can take its cheap path;
  // this is only a hint and does not change the outcome.
  result.hostname = options.extractHostname
    ? extractHostname(url, options.mixedInputs ? isValidHostname(url) : false)
    : url;

  if (step === 0 /* FLAG.HOSTNAME */ || result.hostname === null) {
    return result;
  }

  // An IP address has neither public suffix nor domain: stop here.
  if (options.detectIp) {
    result.isIp = isIp(result.hostname);
    if (result.isIp) {
      return result;
    }
  }

  // Optional validation, only applied to hostnames we extracted ourselves.
  // An invalid hostname cannot have a valid domain or sub-domain.
  const mustValidate = options.validateHostname && options.extractHostname;
  if (mustValidate && !isValidHostname(result.hostname)) {
    result.hostname = null;
    return result;
  }

  // Public suffix
  suffixLookup(result.hostname, options, result);
  if (step === 2 /* FLAG.PUBLIC_SUFFIX */ || result.publicSuffix === null) {
    return result;
  }

  // Domain
  result.domain = getDomain(result.publicSuffix, result.hostname, options);
  if (step === 3 /* FLAG.DOMAIN */ || result.domain === null) {
    return result;
  }

  // Sub-domain
  result.subdomain = getSubdomain(result.hostname, result.domain);

  // Domain without suffix, unless the caller only asked for the sub-domain.
  if (step !== 4 /* FLAG.SUB_DOMAIN */) {
    result.domainWithoutSuffix = getDomainWithoutSuffix(
      result.domain,
      result.publicSuffix,
    );
  }

  return result;
}
|
|
|
|
/**
 * Fast path for very popular suffixes ('com', 'org', 'edu', 'gov', 'net' and
 * 'de'), which bypasses the regular suffix lookup entirely.
 *
 * Only used when private domains are not allowed and `hostname` is longer
 * than 3 characters. On a match, `out.isIcann`, `out.isPrivate` and
 * `out.publicSuffix` are filled and `true` is returned; otherwise `out` is
 * left untouched and `false` is returned.
 */
function fastPath(hostname, options, out) {
  if (options.allowPrivateDomains || hostname.length <= 3) {
    return false;
  }

  let publicSuffix;
  if (hostname.endsWith('.com')) {
    publicSuffix = 'com';
  } else if (hostname.endsWith('.org')) {
    publicSuffix = 'org';
  } else if (hostname.endsWith('.edu')) {
    publicSuffix = 'edu';
  } else if (hostname.endsWith('.gov')) {
    publicSuffix = 'gov';
  } else if (hostname.endsWith('.net')) {
    publicSuffix = 'net';
  } else if (hostname.endsWith('.de')) {
    publicSuffix = 'de';
  } else {
    return false;
  }

  out.isIcann = true;
  out.isPrivate = false;
  out.publicSuffix = publicSuffix;
  return true;
}
|
|
|
|
exports.fastPathLookup = fastPath;
|
|
exports.getEmptyResult = getEmptyResult;
|
|
exports.parseImpl = parseImpl;
|
|
exports.resetResult = resetResult;
|
|
exports.setDefaults = setDefaults;
|
|
//# sourceMappingURL=index.js.map
|