Automata/packages/workflow/src/Extensions/StringExtensions.ts

// import { createHash } from 'crypto';
import { titleCase } from 'title-case';
import * as ExpressionError from '../ExpressionError';
import type { ExtensionMap } from './Extensions';
import CryptoJS from 'crypto-js';
import { encode } from 'js-base64';
import { transliterate } from 'transliteration';

const hashFunctions: Record<string, typeof CryptoJS.MD5> = {
	md5: CryptoJS.MD5,
	sha1: CryptoJS.SHA1,
	sha224: CryptoJS.SHA224,
	sha256: CryptoJS.SHA256,
	sha384: CryptoJS.SHA384,
	sha512: CryptoJS.SHA512,
	sha3: CryptoJS.SHA3,
	ripemd160: CryptoJS.RIPEMD160,
};

// All symbols from https://www.xe.com/symbols/ as for 2022/11/09
const CURRENCY_REGEXP =
	/(\u004c\u0065\u006b|\u060b|\u0024|\u0192|\u20bc|\u0042\u0072|\u0042\u005a\u0024|\u0024\u0062|\u004b\u004d|\u0050|\u043b\u0432|\u0052\u0024|\u17db|\u00a5|\u20a1|\u006b\u006e|\u20b1|\u004b\u010d|\u006b\u0072|\u0052\u0044\u0024|\u00a3|\u20ac|\u00a2|\u0051|\u004c|\u0046\u0074|\u20b9|\u0052\u0070|\ufdfc|\u20aa|\u004a\u0024|\u20a9|\u20ad|\u0434\u0435\u043d|\u0052\u004d|\u20a8|\u20ae|\u004d\u0054|\u0043\u0024|\u20a6|\u0042\u002f\u002e|\u0047\u0073|\u0053\u002f\u002e|\u007a\u0142|\u006c\u0065\u0069|\u20bd|\u0414\u0438\u043d\u002e|\u0053|\u0052|\u0043\u0048\u0046|\u004e\u0054\u0024|\u0e3f|\u0054\u0054\u0024|\u20ba|\u20b4|\u0024\u0055|\u0042\u0073|\u20ab|\u005a\u0024)/gu;

/*
	Extract the domain part from various inputs, including URLs, email addresses, and plain domains.

	/^(?:(?:https?|ftp):\/\/)? 								// Match optional http, https, or ftp protocols
  (?:mailto:)?               								// Match optional mailto:
  (?:\/\/)?                  								// Match optional double slashes
  (?:www\.)?                 								// Match optional www prefix
  (?:[-\w]*\.)?              								// Match any optional subdomain
  (                           							// Capture the domain part
    (?:(?:[-\w]+\.)+          							// Match one or more subdomains
      (?:[a-zA-Z]{2,}|xn--[a-zA-Z0-9]+) 		// Match top-level domain or Punycode encoded IDN(xn--80aswg.xn--p1ai)
      |localhost              							// Match localhost
      |\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3} 	// Match IPv4 addresses
    )
  )
  (?::\d+)?                   							// Match optional port number
  (?:\/[^\s?]*)?              							// Match optional path
  (?:\?[^\s#]*)?              							// Match optional query string
  (?:#[^\s]*)?$/i;            							// Match optional hash fragment
*/
const DOMAIN_EXTRACT_REGEXP =
	/^(?:(?:https?|ftp):\/\/)?(?:mailto:)?(?:\/\/)?((?:www\.)?(?:(?:[-\w]+\.)+(?:[a-zA-Z]{2,}|xn--[a-zA-Z0-9]+)|localhost|\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}))(?::\d+)?(?:\/[^\s?]*)?(?:\?[^\s#]*)?(?:#[^\s]*)?$/i;

/*
	Matches domain names without the protocol or optional subdomains

	/^(?:www\.)? 															// Match optional www prefix
  (                         								// Capture the domain part
    (?:(?:[-\w]+\.)+        								// Match one or more subdomains
      (?:[a-zA-Z]{2,}|xn--[a-zA-Z0-9]+) 		// Match top-level domain or Punycode encoded IDN
      |localhost            								// Match localhost
      |\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3} 	// Match IPv4 addresses
    )
  )
  (?::\d+)?                 								// Match optional port number
  (?:\/[^\s?]*)?            								// Match optional path
  (?:\?[^\s#]*)?            								// Match optional query string
  (?:#[^\s]*)?$/i;          								// Match optional fragment at the end of the string
*/
const DOMAIN_REGEXP =
	/^(?:www\.)?((?:(?:[-\w]+\.)+(?:[a-zA-Z]{2,}|xn--[a-zA-Z0-9]+)|localhost|\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}))(?::\d+)?(?:\/[^\s?]*)?(?:\?[^\s#]*)?(?:#[^\s]*)?$/i;

/*
	Matches email addresses

	/(
    ( 																											// Capture local part of the email address
      ([^<>()\[\]\\.,;:\s@"]+(\.[^<>()\[\]\\.,;:\s@"]+)*) 	// One or more characters not in the set, followed by
																														a	period, followed by one or more characters not in the set
      |(".+") 																							// Or one or more characters inside quotes
    )
  )
  @                             														// Match @ symbol
  (?<domain>(                   														// Capture the domain part of the email address
    \[[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\] 			// Match IPv4 address inside brackets
    |(([a-zA-Z\-0-9]+\.)+[a-zA-Z]{2,}) 											// Or match domain with at least two subdomains and TLD
  ))/;
*/
const EMAIL_REGEXP =
	/(([^<>()\[\]\\.,;:\s@"]+(\.[^<>()\[\]\\.,;:\s@"]+)*)|(".+"))@(?<domain>(\[[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}])|(([a-zA-Z\-0-9]+\.)+[a-zA-Z]{2,}))/;

/*
	Matches URLs with strict beginning and end of the string checks

	/^(?:(?:https?|ftp):\/\/) 							// Match http, https, or ftp protocols at the start of the string
  (?:www\.)?               								// Match optional www prefix
  (                        								// Capture the domain part
    (?:(?:[-\w]+\.)+       								// Match one or more subdomains
      (?:[a-zA-Z]{2,}|xn--[a-zA-Z0-9]+) 	// Match top-level domain or Punycode encoded IDN
      |localhost           								// Match localhost
      |\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3} // Match IPv4 addresses
    )
  )
  (?::\d+)?                								// Match optional port number
  (?:\/[^\s?#]*)?          								// Match optional path
  (?:\?[^\s#]*)?           								// Match optional query string
  (?=([^\s]+#.*)?)         								// Positive lookahead for the fragment identifier
  #?[^\s]*$/i;              							// Match optional fragment at the end of the string
*/
const URL_REGEXP_EXACT =
	/^(?:(?:https?|ftp):\/\/)(?:www\.)?((?:(?:[-\w]+\.)+(?:[a-zA-Z]{2,}|xn--[a-zA-Z0-9]+)|localhost|\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}))(?::\d+)?(?:\/[^\s?#]*)?(?:\?[^\s#]*)?(?=([^\s]+#.*)?)#?[^\s]*$/i;

/*
	Same as URL_REGEXP_EXACT but without the strict beginning and end of the string checks to allow for
	matching URLs in the middle of a string
*/
const URL_REGEXP =
	/(?:(?:https?|ftp):\/\/)(?:www\.)?((?:(?:[-\w]+\.)+(?:[a-zA-Z]{2,}|xn--[a-zA-Z0-9]+)|localhost|\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}))(?::\d+)?(?:\/[^\s?#]*)?(?:\?[^\s#]*)?(?=([^\s]+#.*)?)#?[^\s]*/i;

const CHAR_TEST_REGEXP = /\p{L}/u;
const PUNC_TEST_REGEXP = /[!?.]/;

function hash(value: string, extraArgs?: unknown): string {
	const [algorithm = 'MD5'] = extraArgs as string[];
	if (algorithm.toLowerCase() === 'base64') {
		// We're using a library instead of btoa because btoa only
		// works on ASCII
		return encode(value);
	}
	const hashFunction = hashFunctions[algorithm.toLowerCase()];
	if (!hashFunction) {
		throw new ExpressionError.ExpressionExtensionError(
			`Unknown algorithm ${algorithm}. Available algorithms are: ${Object.keys(hashFunctions)
				.map((s) => s.toUpperCase())
				.join(', ')}, and Base64.`,
		);
	}
	return hashFunction(value.toString()).toString();
	// return createHash(format).update(value.toString()).digest('hex');
}

function isEmpty(value: string): boolean {
	return value === '';
}

function isNotEmpty(value: string): boolean {
	return !isEmpty(value);
}

function length(value: string): number {
	return value.length;
}

function removeMarkdown(value: string): string {
	let output = value;
	try {
		output = output.replace(/^([\s\t]*)([*\-+]|\d\.)\s+/gm, '$1');

		output = output
			// Header
			.replace(/\n={2,}/g, '\n')
			// Strikethrough
			.replace(/~~/g, '')
			// Fenced codeblocks
			.replace(/`{3}.*\n/g, '');

		output = output
			// Remove HTML tags
			.replace(/<[\w|\s|=|'|"|:|(|)|,|;|/|0-9|.|-]+[>|\\>]/g, '')
			// Remove setext-style headers
			.replace(/^[=-]{2,}\s*$/g, '')
			// Remove footnotes?
			.replace(/\[\^.+?\](: .*?$)?/g, '')
			.replace(/\s{0,2}\[.*?\]: .*?$/g, '')
			// Remove images
			.replace(/!\[.*?\][[(].*?[\])]/g, '')
			// Remove inline links
			.replace(/\[(.*?)\][[(].*?[\])]/g, '$1')
			// Remove Blockquotes
			.replace(/>/g, '')
			// Remove reference-style links?
			.replace(/^\s{1,2}\[(.*?)\]: (\S+)( ".*?")?\s*$/g, '')
			// Remove atx-style headers
			.replace(/^#{1,6}\s*([^#]*)\s*(#{1,6})?/gm, '$1')
			.replace(/([*_]{1,3})(\S.*?\S)\1/g, '$2')
			.replace(/(`{3,})(.*?)\1/gm, '$2')
			.replace(/^-{3,}\s*$/g, '')
			.replace(/`(.+?)`/g, '$1')
			.replace(/\n{2,}/g, '\n\n');
	} catch (e) {
		return value;
	}
	return output;
}

function removeTags(value: string): string {
	return value.replace(/<[^>]*>?/gm, '');
}

function toDate(value: string): Date {
	const date = new Date(Date.parse(value));

	if (date.toString() === 'Invalid Date') {
		throw new ExpressionError.ExpressionExtensionError('cannot convert to date');
	}
	// If time component is not specified, force 00:00h
	if (!/:/.test(value)) {
		date.setHours(0, 0, 0);
	}
	return date;
}

function urlDecode(value: string, extraArgs: boolean[]): string {
	const [entireString = false] = extraArgs;
	if (entireString) {
		return decodeURI(value.toString());
	}
	return decodeURIComponent(value.toString());
}

function urlEncode(value: string, extraArgs: boolean[]): string {
	const [entireString = false] = extraArgs;
	if (entireString) {
		return encodeURI(value.toString());
	}
	return encodeURIComponent(value.toString());
}

function toInt(value: string, extraArgs: Array<number | undefined>) {
	const [radix] = extraArgs;
	const int = parseInt(value.replace(CURRENCY_REGEXP, ''), radix);

	if (isNaN(int)) {
		throw new ExpressionError.ExpressionExtensionError('cannot convert to integer');
	}

	return int;
}

function toFloat(value: string) {
	if (value.includes(',')) {
		throw new ExpressionError.ExpressionExtensionError(
			'cannot convert to float, expected . as decimal separator',
		);
	}

	const float = parseFloat(value.replace(CURRENCY_REGEXP, ''));

	if (isNaN(float)) {
		throw new ExpressionError.ExpressionExtensionError('cannot convert to float');
	}

	return float;
}

function quote(value: string, extraArgs: string[]) {
	const [quoteChar = '"'] = extraArgs;
	return `${quoteChar}${value
		.replace(/\\/g, '\\\\')
		.replace(new RegExp(`\\${quoteChar}`, 'g'), `\\${quoteChar}`)}${quoteChar}`;
}

function isNumeric(value: string) {
	if (value.includes(' ')) return false;

	return !isNaN(value as unknown as number) && !isNaN(parseFloat(value));
}

function isUrl(value: string) {
	return URL_REGEXP_EXACT.test(value);
}

function isDomain(value: string) {
	return DOMAIN_REGEXP.test(value);
}

function isEmail(value: string) {
	const result = EMAIL_REGEXP.test(value);

	// email regex is loose so check manually for now
	if (result && value.includes(' ')) {
		return false;
	}

	return result;
}

function toTitleCase(value: string) {
	return titleCase(value);
}

function replaceSpecialChars(value: string) {
	return transliterate(value, { unknown: '?' });
}

function toSentenceCase(value: string) {
	let current = value.slice();
	let buffer = '';

	while (CHAR_TEST_REGEXP.test(current)) {
		const charIndex = current.search(CHAR_TEST_REGEXP);
		current =
			current.slice(0, charIndex) +
			current[charIndex]!.toLocaleUpperCase() +
			current.slice(charIndex + 1).toLocaleLowerCase();
		const puncIndex = current.search(PUNC_TEST_REGEXP);
		if (puncIndex === -1) {
			buffer += current;
			current = '';
			break;
		}
		buffer += current.slice(0, puncIndex + 1);
		current = current.slice(puncIndex + 1);
	}

	return buffer;
}

function toSnakeCase(value: string) {
	return value
		.toLocaleLowerCase()
		.replace(/[ \-]/g, '_')
		.replace(/[\u2000-\u206F\u2E00-\u2E7F\\'!"#$%&()*+,.\/:;<=>?@\[\]^`{|}~]/g, '');
}

function extractEmail(value: string) {
	const matched = EMAIL_REGEXP.exec(value);
	if (!matched) {
		return undefined;
	}
	return matched[0];
}

function extractDomain(value: string) {
	if (isEmail(value)) {
		const matched = EMAIL_REGEXP.exec(value);
		// This shouldn't happen
		if (!matched) {
			return undefined;
		}
		return matched.groups?.domain;
	}

	const domainMatch = value.match(DOMAIN_EXTRACT_REGEXP);
	if (domainMatch) {
		return domainMatch[1];
	}

	return undefined;
}

function extractUrl(value: string) {
	const matched = URL_REGEXP.exec(value);
	if (!matched) {
		return undefined;
	}
	return matched[0];
}

removeMarkdown.doc = {
	name: 'removeMarkdown',
	description: 'Removes Markdown formatting from a string.',
	returnType: 'string',
	docURL:
		'https://docs.n8n.io/code-examples/expressions/data-transformation-functions/strings/#string-removeMarkdown',
};

removeTags.doc = {
	name: 'removeTags',
	description: 'Removes tags, such as HTML or XML, from a string.',
	returnType: 'string',
	docURL:
		'https://docs.n8n.io/code-examples/expressions/data-transformation-functions/strings/#string-removeTags',
};

toDate.doc = {
	name: 'toDate',
	description: 'Converts a string to a date.',
	returnType: 'Date',
	docURL:
		'https://docs.n8n.io/code-examples/expressions/data-transformation-functions/strings/#string-toDate',
};

toFloat.doc = {
	name: 'toFloat',
	description: 'Converts a string to a decimal number.',
	returnType: 'number',
	aliases: ['toDecimalNumber'],
	docURL:
		'https://docs.n8n.io/code-examples/expressions/data-transformation-functions/strings/#string-toDecimalNumber',
};

toInt.doc = {
	name: 'toInt',
	description: 'Converts a string to an integer.',
	returnType: 'number',
	args: [{ name: 'radix?', type: 'number' }],
	aliases: ['toWholeNumber'],
	docURL:
		'https://docs.n8n.io/code-examples/expressions/data-transformation-functions/strings/#string-toInt',
};

toSentenceCase.doc = {
	name: 'toSentenceCase',
	description: 'Formats a string to sentence case. Example: "This is a sentence".',
	returnType: 'string',
	docURL:
		'https://docs.n8n.io/code-examples/expressions/data-transformation-functions/strings/#string-toSentenceCase',
};

toSnakeCase.doc = {
	name: 'toSnakeCase',
	description: 'Formats a string to snake case. Example: "this_is_snake_case".',
	returnType: 'string',
	docURL:
		'https://docs.n8n.io/code-examples/expressions/data-transformation-functions/strings/#string-toSnakeCase',
};

toTitleCase.doc = {
	name: 'toTitleCase',
	description: 'Formats a string to title case. Example: "This Is a Title".',
	returnType: 'string',
	docURL:
		'https://docs.n8n.io/code-examples/expressions/data-transformation-functions/strings/#string-toTitleCase',
};

urlEncode.doc = {
	name: 'urlEncode',
	description: 'Encodes a string to be used/included in a URL.',
	args: [{ name: 'entireString?', type: 'boolean' }],
	returnType: 'string',
	docURL:
		'https://docs.n8n.io/code-examples/expressions/data-transformation-functions/strings/#string-urlEncode',
};

urlDecode.doc = {
	name: 'urlDecode',
	description:
		'Decodes a URL-encoded string. It decodes any percent-encoded characters in the input string, and replaces them with their original characters.',
	returnType: 'string',
	docURL:
		'https://docs.n8n.io/code-examples/expressions/data-transformation-functions/strings/#string-urlDecode',
};

replaceSpecialChars.doc = {
	name: 'replaceSpecialChars',
	description: 'Replaces non-ASCII characters in a string with an ASCII representation.',
	returnType: 'string',
	docURL:
		'https://docs.n8n.io/code-examples/expressions/data-transformation-functions/strings/#string-replaceSpecialChars',
};

length.doc = {
	name: 'length',
	description: 'Returns the character count of a string.',
	returnType: 'number',
	docURL: 'https://docs.n8n.io/code-examples/expressions/data-transformation-functions/strings',
};

isDomain.doc = {
	name: 'isDomain',
	description: 'Checks if a string is a domain.',
	returnType: 'boolean',
	docURL:
		'https://docs.n8n.io/code-examples/expressions/data-transformation-functions/strings/#string-isDomain',
};

isEmail.doc = {
	name: 'isEmail',
	description: 'Checks if a string is an email.',
	returnType: 'boolean',
	docURL:
		'https://docs.n8n.io/code-examples/expressions/data-transformation-functions/strings/#string-isEmail',
};

isNumeric.doc = {
	name: 'isEmail',
	description: 'Checks if a string only contains digits.',
	returnType: 'boolean',
	docURL:
		'https://docs.n8n.io/code-examples/expressions/data-transformation-functions/strings/#string-isNumeric',
};

isUrl.doc = {
	name: 'isUrl',
	description: 'Checks if a string is a valid URL.',
	returnType: 'boolean',
	docURL:
		'https://docs.n8n.io/code-examples/expressions/data-transformation-functions/strings/#string-isUrl',
};

isEmpty.doc = {
	name: 'isEmpty',
	description: 'Checks if a string is empty.',
	returnType: 'boolean',
	docURL:
		'https://docs.n8n.io/code-examples/expressions/data-transformation-functions/strings/#string-isEmpty',
};

isNotEmpty.doc = {
	name: 'isNotEmpty',
	description: 'Checks if a string has content.',
	returnType: 'boolean',
	docURL:
		'https://docs.n8n.io/code-examples/expressions/data-transformation-functions/strings/#string-isNotEmpty',
};

extractEmail.doc = {
	name: 'extractEmail',
	description: 'Extracts an email from a string. Returns undefined if none is found.',
	returnType: 'string',
	docURL:
		'https://docs.n8n.io/code-examples/expressions/data-transformation-functions/strings/#string-extractEmail',
};

extractDomain.doc = {
	name: 'extractDomain',
	description:
		'Extracts a domain from a string containing a valid URL. Returns undefined if none is found.',
	returnType: 'string',
	docURL:
		'https://docs.n8n.io/code-examples/expressions/data-transformation-functions/strings/#string-extractDomain',
};

extractUrl.doc = {
	name: 'extractUrl',
	description: 'Extracts a URL from a string. Returns undefined if none is found.',
	returnType: 'string',
	docURL:
		'https://docs.n8n.io/code-examples/expressions/data-transformation-functions/strings/#string-extractUrl',
};

hash.doc = {
	name: 'hash',
	description: 'Returns a string hashed with the given algorithm. Default algorithm is `md5`.',
	returnType: 'string',
	args: [{ name: 'algo?', type: 'Algorithm' }],
	docURL:
		'https://docs.n8n.io/code-examples/expressions/data-transformation-functions/strings/#string-hash',
};

quote.doc = {
	name: 'quote',
	description: 'Returns a string wrapped in the quotation marks. Default quotation is `"`.',
	returnType: 'string',
	args: [{ name: 'mark?', type: 'string' }],
	docURL:
		'https://docs.n8n.io/code-examples/expressions/data-transformation-functions/strings/#string-quote',
};

export const stringExtensions: ExtensionMap = {
	typeName: 'String',
	functions: {
		hash,
		removeMarkdown,
		removeTags,
		toDate,
		toDecimalNumber: toFloat,
		toFloat,
		toInt,
		toWholeNumber: toInt,
		toSentenceCase,
		toSnakeCase,
		toTitleCase,
		urlDecode,
		urlEncode,
		quote,
		replaceSpecialChars,
		length,
		isDomain,
		isEmail,
		isNumeric,
		isUrl,
		isEmpty,
		isNotEmpty,
		extractEmail,
		extractDomain,
		extractUrl,
	},
};