import { hp2Builder } from '@selderee/plugin-htmlparser2'; import { parseDocument } from 'htmlparser2'; import { DecisionTree } from 'selderee'; import merge from 'deepmerge'; import { render } from 'dom-serializer'; /** * Make a recursive function that will only run to a given depth * and switches to an alternative function at that depth. \ * No limitation if `n` is `undefined` (Just wraps `f` in that case). * * @param { number | undefined } n Allowed depth of recursion. `undefined` for no limitation. * @param { Function } f Function that accepts recursive callback as the first argument. * @param { Function } [g] Function to run instead, when maximum depth was reached. Do nothing by default. * @returns { Function } */ function limitedDepthRecursive (n, f, g = () => undefined) { if (n === undefined) { const f1 = function (...args) { return f(f1, ...args); }; return f1; } if (n >= 0) { return function (...args) { return f(limitedDepthRecursive(n - 1, f, g), ...args); }; } return g; } /** * Return the same string or a substring with * the given character occurrences removed from each side. * * @param { string } str A string to trim. * @param { string } char A character to be trimmed. * @returns { string } */ function trimCharacter (str, char) { let start = 0; let end = str.length; while (start < end && str[start] === char) { ++start; } while (end > start && str[end - 1] === char) { --end; } return (start > 0 || end < str.length) ? str.substring(start, end) : str; } /** * Return the same string or a substring with * the given character occurrences removed from the end only. * * @param { string } str A string to trim. * @param { string } char A character to be trimmed. * @returns { string } */ function trimCharacterEnd (str, char) { let end = str.length; while (end > 0 && str[end - 1] === char) { --end; } return (end < str.length) ? str.substring(0, end) : str; } /** * Return a new string will all characters replaced with unicode escape sequences. 
* This extreme kind of escaping can used to be safely compose regular expressions. * * @param { string } str A string to escape. * @returns { string } A string of unicode escape sequences. */ function unicodeEscape (str) { return str.replace(/[\s\S]/g, c => '\\u' + c.charCodeAt().toString(16).padStart(4, '0')); } /** * Deduplicate an array by a given key callback. * Item properties are merged recursively and with the preference for last defined values. * Of items with the same key, merged item takes the place of the last item, * others are omitted. * * @param { any[] } items An array to deduplicate. * @param { (x: any) => string } getKey Callback to get a value that distinguishes unique items. * @returns { any[] } */ function mergeDuplicatesPreferLast (items, getKey) { const map = new Map(); for (let i = items.length; i-- > 0;) { const item = items[i]; const key = getKey(item); map.set( key, (map.has(key)) ? merge(item, map.get(key), { arrayMerge: overwriteMerge$1 }) : item ); } return [...map.values()].reverse(); } const overwriteMerge$1 = (acc, src, options) => [...src]; /** * Get a nested property from an object. * * @param { object } obj The object to query for the value. * @param { string[] } path The path to the property. * @returns { any } */ function get (obj, path) { for (const key of path) { if (!obj) { return undefined; } obj = obj[key]; } return obj; } /** * Convert a number into alphabetic sequence representation (Sequence without zeroes). * * For example: `a, ..., z, aa, ..., zz, aaa, ...`. * * @param { number } num Number to convert. Must be >= 1. * @param { string } [baseChar = 'a'] Character for 1 in the sequence. * @param { number } [base = 26] Number of characters in the sequence. 
* @returns { string } */ function numberToLetterSequence (num, baseChar = 'a', base = 26) { const digits = []; do { num -= 1; digits.push(num % base); num = (num / base) >> 0; // quick `floor` } while (num > 0); const baseCode = baseChar.charCodeAt(0); return digits .reverse() .map(n => String.fromCharCode(baseCode + n)) .join(''); } const I = ['I', 'X', 'C', 'M']; const V = ['V', 'L', 'D']; /** * Convert a number to it's Roman representation. No large numbers extension. * * @param { number } num Number to convert. `0 < num <= 3999`. * @returns { string } */ function numberToRoman (num) { return [...(num) + ''] .map(n => +n) .reverse() .map((v, i) => ((v % 5 < 4) ? (v < 5 ? '' : V[i]) + I[i].repeat(v % 5) : I[i] + (v < 5 ? V[i] : I[i + 1]))) .reverse() .join(''); } /** * Helps to build text from words. */ class InlineTextBuilder { /** * Creates an instance of InlineTextBuilder. * * If `maxLineLength` is not provided then it is either `options.wordwrap` or unlimited. * * @param { Options } options HtmlToText options. * @param { number } [ maxLineLength ] This builder will try to wrap text to fit this line length. */ constructor (options, maxLineLength = undefined) { /** @type { string[][] } */ this.lines = []; /** @type { string[] } */ this.nextLineWords = []; this.maxLineLength = maxLineLength || options.wordwrap || Number.MAX_VALUE; this.nextLineAvailableChars = this.maxLineLength; this.wrapCharacters = get(options, ['longWordSplit', 'wrapCharacters']) || []; this.forceWrapOnLimit = get(options, ['longWordSplit', 'forceWrapOnLimit']) || false; this.stashedSpace = false; this.wordBreakOpportunity = false; } /** * Add a new word. * * @param { string } word A word to add. * @param { boolean } [noWrap] Don't wrap text even if the line is too long. */ pushWord (word, noWrap = false) { if (this.nextLineAvailableChars <= 0 && !noWrap) { this.startNewLine(); } const isLineStart = this.nextLineWords.length === 0; const cost = word.length + (isLineStart ? 
0 : 1); if ((cost <= this.nextLineAvailableChars) || noWrap) { // Fits into available budget this.nextLineWords.push(word); this.nextLineAvailableChars -= cost; } else { // Does not fit - try to split the word // The word is moved to a new line - prefer to wrap between words. const [first, ...rest] = this.splitLongWord(word); if (!isLineStart) { this.startNewLine(); } this.nextLineWords.push(first); this.nextLineAvailableChars -= first.length; for (const part of rest) { this.startNewLine(); this.nextLineWords.push(part); this.nextLineAvailableChars -= part.length; } } } /** * Pop a word from the currently built line. * This doesn't affect completed lines. * * @returns { string } */ popWord () { const lastWord = this.nextLineWords.pop(); if (lastWord !== undefined) { const isLineStart = this.nextLineWords.length === 0; const cost = lastWord.length + (isLineStart ? 0 : 1); this.nextLineAvailableChars += cost; } return lastWord; } /** * Concat a word to the last word already in the builder. * Adds a new word in case there are no words yet in the last line. * * @param { string } word A word to be concatenated. * @param { boolean } [noWrap] Don't wrap text even if the line is too long. */ concatWord (word, noWrap = false) { if (this.wordBreakOpportunity && word.length > this.nextLineAvailableChars) { this.pushWord(word, noWrap); this.wordBreakOpportunity = false; } else { const lastWord = this.popWord(); this.pushWord((lastWord) ? lastWord.concat(word) : word, noWrap); } } /** * Add current line (and more empty lines if provided argument > 1) to the list of complete lines and start a new one. * * @param { number } n Number of line breaks that will be added to the resulting string. */ startNewLine (n = 1) { this.lines.push(this.nextLineWords); if (n > 1) { this.lines.push(...Array.from({ length: n - 1 }, () => [])); } this.nextLineWords = []; this.nextLineAvailableChars = this.maxLineLength; } /** * No words in this builder. 
* * @returns { boolean } */ isEmpty () { return this.lines.length === 0 && this.nextLineWords.length === 0; } clear () { this.lines.length = 0; this.nextLineWords.length = 0; this.nextLineAvailableChars = this.maxLineLength; } /** * Join all lines of words inside the InlineTextBuilder into a complete string. * * @returns { string } */ toString () { return [...this.lines, this.nextLineWords] .map(words => words.join(' ')) .join('\n'); } /** * Split a long word up to fit within the word wrap limit. * Use either a character to split looking back from the word wrap limit, * or truncate to the word wrap limit. * * @param { string } word Input word. * @returns { string[] } Parts of the word. */ splitLongWord (word) { const parts = []; let idx = 0; while (word.length > this.maxLineLength) { const firstLine = word.substring(0, this.maxLineLength); const remainingChars = word.substring(this.maxLineLength); const splitIndex = firstLine.lastIndexOf(this.wrapCharacters[idx]); if (splitIndex > -1) { // Found a character to split on word = firstLine.substring(splitIndex + 1) + remainingChars; parts.push(firstLine.substring(0, splitIndex + 1)); } else { // Not found a character to split on idx++; if (idx < this.wrapCharacters.length) { // There is next character to try word = firstLine + remainingChars; } else { // No more characters to try if (this.forceWrapOnLimit) { parts.push(firstLine); word = remainingChars; if (word.length > this.maxLineLength) { continue; } } else { word = firstLine + remainingChars; } break; } } } parts.push(word); // Add remaining part to array return parts; } } /* eslint-disable max-classes-per-file */ class StackItem { constructor (next = null) { this.next = next; } getRoot () { return (this.next) ? 
this.next : this; } } class BlockStackItem extends StackItem { constructor (options, next = null, leadingLineBreaks = 1, maxLineLength = undefined) { super(next); this.leadingLineBreaks = leadingLineBreaks; this.inlineTextBuilder = new InlineTextBuilder(options, maxLineLength); this.rawText = ''; this.stashedLineBreaks = 0; this.isPre = next && next.isPre; this.isNoWrap = next && next.isNoWrap; } } class ListStackItem extends BlockStackItem { constructor ( options, next = null, { interRowLineBreaks = 1, leadingLineBreaks = 2, maxLineLength = undefined, maxPrefixLength = 0, prefixAlign = 'left', } = {} ) { super(options, next, leadingLineBreaks, maxLineLength); this.maxPrefixLength = maxPrefixLength; this.prefixAlign = prefixAlign; this.interRowLineBreaks = interRowLineBreaks; } } class ListItemStackItem extends BlockStackItem { constructor ( options, next = null, { leadingLineBreaks = 1, maxLineLength = undefined, prefix = '', } = {} ) { super(options, next, leadingLineBreaks, maxLineLength); this.prefix = prefix; } } class TableStackItem extends StackItem { constructor (next = null) { super(next); this.rows = []; this.isPre = next && next.isPre; this.isNoWrap = next && next.isNoWrap; } } class TableRowStackItem extends StackItem { constructor (next = null) { super(next); this.cells = []; this.isPre = next && next.isPre; this.isNoWrap = next && next.isNoWrap; } } class TableCellStackItem extends StackItem { constructor (options, next = null, maxColumnWidth = undefined) { super(next); this.inlineTextBuilder = new InlineTextBuilder(options, maxColumnWidth); this.rawText = ''; this.stashedLineBreaks = 0; this.isPre = next && next.isPre; this.isNoWrap = next && next.isNoWrap; } } class TransformerStackItem extends StackItem { constructor (next = null, transform) { super(next); this.transform = transform; } } function charactersToCodes (str) { return [...str] .map(c => '\\u' + c.charCodeAt(0).toString(16).padStart(4, '0')) .join(''); } /** * Helps to handle HTML 
whitespaces. * * @class WhitespaceProcessor */ class WhitespaceProcessor { /** * Creates an instance of WhitespaceProcessor. * * @param { Options } options HtmlToText options. * @memberof WhitespaceProcessor */ constructor (options) { this.whitespaceChars = (options.preserveNewlines) ? options.whitespaceCharacters.replace(/\n/g, '') : options.whitespaceCharacters; const whitespaceCodes = charactersToCodes(this.whitespaceChars); this.leadingWhitespaceRe = new RegExp(`^[${whitespaceCodes}]`); this.trailingWhitespaceRe = new RegExp(`[${whitespaceCodes}]$`); this.allWhitespaceOrEmptyRe = new RegExp(`^[${whitespaceCodes}]*$`); this.newlineOrNonWhitespaceRe = new RegExp(`(\\n|[^\\n${whitespaceCodes}])`, 'g'); this.newlineOrNonNewlineStringRe = new RegExp(`(\\n|[^\\n]+)`, 'g'); if (options.preserveNewlines) { const wordOrNewlineRe = new RegExp(`\\n|[^\\n${whitespaceCodes}]+`, 'gm'); /** * Shrink whitespaces and wrap text, add to the builder. * * @param { string } text Input text. * @param { InlineTextBuilder } inlineTextBuilder A builder to receive processed text. * @param { (str: string) => string } [ transform ] A transform to be applied to words. * @param { boolean } [noWrap] Don't wrap text even if the line is too long. 
*/ this.shrinkWrapAdd = function (text, inlineTextBuilder, transform = (str => str), noWrap = false) { if (!text) { return; } const previouslyStashedSpace = inlineTextBuilder.stashedSpace; let anyMatch = false; let m = wordOrNewlineRe.exec(text); if (m) { anyMatch = true; if (m[0] === '\n') { inlineTextBuilder.startNewLine(); } else if (previouslyStashedSpace || this.testLeadingWhitespace(text)) { inlineTextBuilder.pushWord(transform(m[0]), noWrap); } else { inlineTextBuilder.concatWord(transform(m[0]), noWrap); } while ((m = wordOrNewlineRe.exec(text)) !== null) { if (m[0] === '\n') { inlineTextBuilder.startNewLine(); } else { inlineTextBuilder.pushWord(transform(m[0]), noWrap); } } } inlineTextBuilder.stashedSpace = (previouslyStashedSpace && !anyMatch) || (this.testTrailingWhitespace(text)); // No need to stash a space in case last added item was a new line, // but that won't affect anything later anyway. }; } else { const wordRe = new RegExp(`[^${whitespaceCodes}]+`, 'g'); this.shrinkWrapAdd = function (text, inlineTextBuilder, transform = (str => str), noWrap = false) { if (!text) { return; } const previouslyStashedSpace = inlineTextBuilder.stashedSpace; let anyMatch = false; let m = wordRe.exec(text); if (m) { anyMatch = true; if (previouslyStashedSpace || this.testLeadingWhitespace(text)) { inlineTextBuilder.pushWord(transform(m[0]), noWrap); } else { inlineTextBuilder.concatWord(transform(m[0]), noWrap); } while ((m = wordRe.exec(text)) !== null) { inlineTextBuilder.pushWord(transform(m[0]), noWrap); } } inlineTextBuilder.stashedSpace = (previouslyStashedSpace && !anyMatch) || this.testTrailingWhitespace(text); }; } } /** * Add text with only minimal processing. * Everything between newlines considered a single word. * No whitespace is trimmed. * Not affected by preserveNewlines option - `\n` always starts a new line. * * `noWrap` argument is `true` by default - this won't start a new line * even if there is not enough space left in the current line. 
* * @param { string } text Input text. * @param { InlineTextBuilder } inlineTextBuilder A builder to receive processed text. * @param { boolean } [noWrap] Don't wrap text even if the line is too long. */ addLiteral (text, inlineTextBuilder, noWrap = true) { if (!text) { return; } const previouslyStashedSpace = inlineTextBuilder.stashedSpace; let anyMatch = false; let m = this.newlineOrNonNewlineStringRe.exec(text); if (m) { anyMatch = true; if (m[0] === '\n') { inlineTextBuilder.startNewLine(); } else if (previouslyStashedSpace) { inlineTextBuilder.pushWord(m[0], noWrap); } else { inlineTextBuilder.concatWord(m[0], noWrap); } while ((m = this.newlineOrNonNewlineStringRe.exec(text)) !== null) { if (m[0] === '\n') { inlineTextBuilder.startNewLine(); } else { inlineTextBuilder.pushWord(m[0], noWrap); } } } inlineTextBuilder.stashedSpace = (previouslyStashedSpace && !anyMatch); } /** * Test whether the given text starts with HTML whitespace character. * * @param { string } text The string to test. * @returns { boolean } */ testLeadingWhitespace (text) { return this.leadingWhitespaceRe.test(text); } /** * Test whether the given text ends with HTML whitespace character. * * @param { string } text The string to test. * @returns { boolean } */ testTrailingWhitespace (text) { return this.trailingWhitespaceRe.test(text); } /** * Test whether the given text contains any non-whitespace characters. * * @param { string } text The string to test. * @returns { boolean } */ testContainsWords (text) { return !this.allWhitespaceOrEmptyRe.test(text); } /** * Return the number of newlines if there are no words. * * If any word is found then return zero regardless of the actual number of newlines. * * @param { string } text Input string. 
* @returns { number } */ countNewlinesNoWords (text) { this.newlineOrNonWhitespaceRe.lastIndex = 0; let counter = 0; let match; while ((match = this.newlineOrNonWhitespaceRe.exec(text)) !== null) { if (match[0] === '\n') { counter++; } else { return 0; } } return counter; } } /** * Helps to build text from inline and block elements. * * @class BlockTextBuilder */ class BlockTextBuilder { /** * Creates an instance of BlockTextBuilder. * * @param { Options } options HtmlToText options. * @param { import('selderee').Picker } picker Selectors decision tree picker. * @param { any} [metadata] Optional metadata for HTML document, for use in formatters. */ constructor (options, picker, metadata = undefined) { this.options = options; this.picker = picker; this.metadata = metadata; this.whitespaceProcessor = new WhitespaceProcessor(options); /** @type { StackItem } */ this._stackItem = new BlockStackItem(options); /** @type { TransformerStackItem } */ this._wordTransformer = undefined; } /** * Put a word-by-word transform function onto the transformations stack. * * Mainly used for uppercasing. Can be bypassed to add unformatted text such as URLs. * * Word transformations applied before wrapping. * * @param { (str: string) => string } wordTransform Word transformation function. */ pushWordTransform (wordTransform) { this._wordTransformer = new TransformerStackItem(this._wordTransformer, wordTransform); } /** * Remove a function from the word transformations stack. * * @returns { (str: string) => string } A function that was removed. */ popWordTransform () { if (!this._wordTransformer) { return undefined; } const transform = this._wordTransformer.transform; this._wordTransformer = this._wordTransformer.next; return transform; } /** * Ignore wordwrap option in followup inline additions and disable automatic wrapping. */ startNoWrap () { this._stackItem.isNoWrap = true; } /** * Return automatic wrapping to behavior defined by options. 
*/ stopNoWrap () { this._stackItem.isNoWrap = false; } /** @returns { (str: string) => string } */ _getCombinedWordTransformer () { const wt = (this._wordTransformer) ? ((str) => applyTransformer(str, this._wordTransformer)) : undefined; const ce = this.options.encodeCharacters; return (wt) ? ((ce) ? (str) => ce(wt(str)) : wt) : ce; } _popStackItem () { const item = this._stackItem; this._stackItem = item.next; return item; } /** * Add a line break into currently built block. */ addLineBreak () { if (!( this._stackItem instanceof BlockStackItem || this._stackItem instanceof ListItemStackItem || this._stackItem instanceof TableCellStackItem )) { return; } if (this._stackItem.isPre) { this._stackItem.rawText += '\n'; } else { this._stackItem.inlineTextBuilder.startNewLine(); } } /** * Allow to break line in case directly following text will not fit. */ addWordBreakOpportunity () { if ( this._stackItem instanceof BlockStackItem || this._stackItem instanceof ListItemStackItem || this._stackItem instanceof TableCellStackItem ) { this._stackItem.inlineTextBuilder.wordBreakOpportunity = true; } } /** * Add a node inline into the currently built block. * * @param { string } str * Text content of a node to add. * * @param { object } [param1] * Object holding the parameters of the operation. * * @param { boolean } [param1.noWordTransform] * Ignore word transformers if there are any. * Don't encode characters as well. * (Use this for things like URL addresses). 
*/ addInline (str, { noWordTransform = false } = {}) { if (!( this._stackItem instanceof BlockStackItem || this._stackItem instanceof ListItemStackItem || this._stackItem instanceof TableCellStackItem )) { return; } if (this._stackItem.isPre) { this._stackItem.rawText += str; return; } if ( str.length === 0 || // empty string ( this._stackItem.stashedLineBreaks && // stashed linebreaks make whitespace irrelevant !this.whitespaceProcessor.testContainsWords(str) // no words to add ) ) { return; } if (this.options.preserveNewlines) { const newlinesNumber = this.whitespaceProcessor.countNewlinesNoWords(str); if (newlinesNumber > 0) { this._stackItem.inlineTextBuilder.startNewLine(newlinesNumber); // keep stashedLineBreaks unchanged return; } } if (this._stackItem.stashedLineBreaks) { this._stackItem.inlineTextBuilder.startNewLine(this._stackItem.stashedLineBreaks); } this.whitespaceProcessor.shrinkWrapAdd( str, this._stackItem.inlineTextBuilder, (noWordTransform) ? undefined : this._getCombinedWordTransformer(), this._stackItem.isNoWrap ); this._stackItem.stashedLineBreaks = 0; // inline text doesn't introduce line breaks } /** * Add a string inline into the currently built block. * * Use this for markup elements that don't have to adhere * to text layout rules. * * @param { string } str Text to add. */ addLiteral (str) { if (!( this._stackItem instanceof BlockStackItem || this._stackItem instanceof ListItemStackItem || this._stackItem instanceof TableCellStackItem )) { return; } if (str.length === 0) { return; } if (this._stackItem.isPre) { this._stackItem.rawText += str; return; } if (this._stackItem.stashedLineBreaks) { this._stackItem.inlineTextBuilder.startNewLine(this._stackItem.stashedLineBreaks); } this.whitespaceProcessor.addLiteral( str, this._stackItem.inlineTextBuilder, this._stackItem.isNoWrap ); this._stackItem.stashedLineBreaks = 0; } /** * Start building a new block. * * @param { object } [param0] * Object holding the parameters of the block. 
* * @param { number } [param0.leadingLineBreaks] * This block should have at least this number of line breaks to separate it from any preceding block. * * @param { number } [param0.reservedLineLength] * Reserve this number of characters on each line for block markup. * * @param { boolean } [param0.isPre] * Should HTML whitespace be preserved inside this block. */ openBlock ({ leadingLineBreaks = 1, reservedLineLength = 0, isPre = false } = {}) { const maxLineLength = Math.max(20, this._stackItem.inlineTextBuilder.maxLineLength - reservedLineLength); this._stackItem = new BlockStackItem( this.options, this._stackItem, leadingLineBreaks, maxLineLength ); if (isPre) { this._stackItem.isPre = true; } } /** * Finalize currently built block, add it's content to the parent block. * * @param { object } [param0] * Object holding the parameters of the block. * * @param { number } [param0.trailingLineBreaks] * This block should have at least this number of line breaks to separate it from any following block. * * @param { (str: string) => string } [param0.blockTransform] * A function to transform the block text before adding to the parent block. * This happens after word wrap and should be used in combination with reserved line length * in order to keep line lengths correct. * Used for whole block markup. */ closeBlock ({ trailingLineBreaks = 1, blockTransform = undefined } = {}) { const block = this._popStackItem(); const blockText = (blockTransform) ? blockTransform(getText(block)) : getText(block); addText(this._stackItem, blockText, block.leadingLineBreaks, Math.max(block.stashedLineBreaks, trailingLineBreaks)); } /** * Start building a new list. * * @param { object } [param0] * Object holding the parameters of the list. * * @param { number } [param0.maxPrefixLength] * Length of the longest list item prefix. * If not supplied or too small then list items won't be aligned properly. 
* * @param { 'left' | 'right' } [param0.prefixAlign] * Specify how prefixes of different lengths have to be aligned * within a column. * * @param { number } [param0.interRowLineBreaks] * Minimum number of line breaks between list items. * * @param { number } [param0.leadingLineBreaks] * This list should have at least this number of line breaks to separate it from any preceding block. */ openList ({ maxPrefixLength = 0, prefixAlign = 'left', interRowLineBreaks = 1, leadingLineBreaks = 2 } = {}) { this._stackItem = new ListStackItem(this.options, this._stackItem, { interRowLineBreaks: interRowLineBreaks, leadingLineBreaks: leadingLineBreaks, maxLineLength: this._stackItem.inlineTextBuilder.maxLineLength, maxPrefixLength: maxPrefixLength, prefixAlign: prefixAlign }); } /** * Start building a new list item. * * @param {object} param0 * Object holding the parameters of the list item. * * @param { string } [param0.prefix] * Prefix for this list item (item number, bullet point, etc). */ openListItem ({ prefix = '' } = {}) { if (!(this._stackItem instanceof ListStackItem)) { throw new Error('Can\'t add a list item to something that is not a list! Check the formatter.'); } const list = this._stackItem; const prefixLength = Math.max(prefix.length, list.maxPrefixLength); const maxLineLength = Math.max(20, list.inlineTextBuilder.maxLineLength - prefixLength); this._stackItem = new ListItemStackItem(this.options, list, { prefix: prefix, maxLineLength: maxLineLength, leadingLineBreaks: list.interRowLineBreaks }); } /** * Finalize currently built list item, add it's content to the parent list. */ closeListItem () { const listItem = this._popStackItem(); const list = listItem.next; const prefixLength = Math.max(listItem.prefix.length, list.maxPrefixLength); const spacing = '\n' + ' '.repeat(prefixLength); const prefix = (list.prefixAlign === 'right') ? 
listItem.prefix.padStart(prefixLength) : listItem.prefix.padEnd(prefixLength); const text = prefix + getText(listItem).replace(/\n/g, spacing); addText( list, text, listItem.leadingLineBreaks, Math.max(listItem.stashedLineBreaks, list.interRowLineBreaks) ); } /** * Finalize currently built list, add it's content to the parent block. * * @param { object } param0 * Object holding the parameters of the list. * * @param { number } [param0.trailingLineBreaks] * This list should have at least this number of line breaks to separate it from any following block. */ closeList ({ trailingLineBreaks = 2 } = {}) { const list = this._popStackItem(); const text = getText(list); if (text) { addText(this._stackItem, text, list.leadingLineBreaks, trailingLineBreaks); } } /** * Start building a table. */ openTable () { this._stackItem = new TableStackItem(this._stackItem); } /** * Start building a table row. */ openTableRow () { if (!(this._stackItem instanceof TableStackItem)) { throw new Error('Can\'t add a table row to something that is not a table! Check the formatter.'); } this._stackItem = new TableRowStackItem(this._stackItem); } /** * Start building a table cell. * * @param { object } [param0] * Object holding the parameters of the cell. * * @param { number } [param0.maxColumnWidth] * Wrap cell content to this width. Fall back to global wordwrap value if undefined. */ openTableCell ({ maxColumnWidth = undefined } = {}) { if (!(this._stackItem instanceof TableRowStackItem)) { throw new Error('Can\'t add a table cell to something that is not a table row! Check the formatter.'); } this._stackItem = new TableCellStackItem(this.options, this._stackItem, maxColumnWidth); } /** * Finalize currently built table cell and add it to parent table row's cells. * * @param { object } [param0] * Object holding the parameters of the cell. * * @param { number } [param0.colspan] How many columns this cell should occupy. * @param { number } [param0.rowspan] How many rows this cell should occupy. 
*/ closeTableCell ({ colspan = 1, rowspan = 1 } = {}) { const cell = this._popStackItem(); const text = trimCharacter(getText(cell), '\n'); cell.next.cells.push({ colspan: colspan, rowspan: rowspan, text: text }); } /** * Finalize currently built table row and add it to parent table's rows. */ closeTableRow () { const row = this._popStackItem(); row.next.rows.push(row.cells); } /** * Finalize currently built table and add the rendered text to the parent block. * * @param { object } param0 * Object holding the parameters of the table. * * @param { TablePrinter } param0.tableToString * A function to convert a table of stringified cells into a complete table. * * @param { number } [param0.leadingLineBreaks] * This table should have at least this number of line breaks to separate if from any preceding block. * * @param { number } [param0.trailingLineBreaks] * This table should have at least this number of line breaks to separate it from any following block. */ closeTable ({ tableToString, leadingLineBreaks = 2, trailingLineBreaks = 2 }) { const table = this._popStackItem(); const output = tableToString(table.rows); if (output) { addText(this._stackItem, output, leadingLineBreaks, trailingLineBreaks); } } /** * Return the rendered text content of this builder. * * @returns { string } */ toString () { return getText(this._stackItem.getRoot()); // There should only be the root item if everything is closed properly. } } function getText (stackItem) { if (!( stackItem instanceof BlockStackItem || stackItem instanceof ListItemStackItem || stackItem instanceof TableCellStackItem )) { throw new Error('Only blocks, list items and table cells can be requested for text contents.'); } return (stackItem.inlineTextBuilder.isEmpty()) ? 
stackItem.rawText : stackItem.rawText + stackItem.inlineTextBuilder.toString(); } function addText (stackItem, text, leadingLineBreaks, trailingLineBreaks) { if (!( stackItem instanceof BlockStackItem || stackItem instanceof ListItemStackItem || stackItem instanceof TableCellStackItem )) { throw new Error('Only blocks, list items and table cells can contain text.'); } const parentText = getText(stackItem); const lineBreaks = Math.max(stackItem.stashedLineBreaks, leadingLineBreaks); stackItem.inlineTextBuilder.clear(); if (parentText) { stackItem.rawText = parentText + '\n'.repeat(lineBreaks) + text; } else { stackItem.rawText = text; stackItem.leadingLineBreaks = lineBreaks; } stackItem.stashedLineBreaks = trailingLineBreaks; } /** * @param { string } str A string to transform. * @param { TransformerStackItem } transformer A transformer item (with possible continuation). * @returns { string } */ function applyTransformer (str, transformer) { return ((transformer) ? applyTransformer(transformer.transform(str), transformer.next) : str); } /** * Compile selectors into a decision tree, * return a function intended for batch processing. * * @param { Options } [options = {}] HtmlToText options (defaults, formatters, user options merged, deduplicated). * @returns { (html: string, metadata?: any) => string } Pre-configured converter function. 
* @static */ function compile$1 (options = {}) { const selectorsWithoutFormat = options.selectors.filter(s => !s.format); if (selectorsWithoutFormat.length) { throw new Error( 'Following selectors have no specified format: ' + selectorsWithoutFormat.map(s => `\`${s.selector}\``).join(', ') ); } const picker = new DecisionTree( options.selectors.map(s => [s.selector, s]) ).build(hp2Builder); if (typeof options.encodeCharacters !== 'function') { options.encodeCharacters = makeReplacerFromDict(options.encodeCharacters); } const baseSelectorsPicker = new DecisionTree( options.baseElements.selectors.map((s, i) => [s, i + 1]) ).build(hp2Builder); function findBaseElements (dom) { return findBases(dom, options, baseSelectorsPicker); } const limitedWalk = limitedDepthRecursive( options.limits.maxDepth, recursiveWalk, function (dom, builder) { builder.addInline(options.limits.ellipsis || ''); } ); return function (html, metadata = undefined) { return process(html, metadata, options, picker, findBaseElements, limitedWalk); }; } /** * Convert given HTML according to preprocessed options. * * @param { string } html HTML content to convert. * @param { any } metadata Optional metadata for HTML document, for use in formatters. * @param { Options } options HtmlToText options (preprocessed). * @param { import('selderee').Picker } picker * Tag definition picker for DOM nodes processing. * @param { (dom: DomNode[]) => DomNode[] } findBaseElements * Function to extract elements from HTML DOM * that will only be present in the output text. * @param { RecursiveCallback } walk Recursive callback. * @returns { string } */ function process (html, metadata, options, picker, findBaseElements, walk) { const maxInputLength = options.limits.maxInputLength; if (maxInputLength && html && html.length > maxInputLength) { console.warn( `Input length ${html.length} is above allowed limit of ${maxInputLength}. 
Truncating without ellipsis.` ); html = html.substring(0, maxInputLength); } const document = parseDocument(html, { decodeEntities: options.decodeEntities }); const bases = findBaseElements(document.children); const builder = new BlockTextBuilder(options, picker, metadata); walk(bases, builder); return builder.toString(); } function findBases (dom, options, baseSelectorsPicker) { const results = []; function recursiveWalk (walk, /** @type { DomNode[] } */ dom) { dom = dom.slice(0, options.limits.maxChildNodes); for (const elem of dom) { if (elem.type !== 'tag') { continue; } const pickedSelectorIndex = baseSelectorsPicker.pick1(elem); if (pickedSelectorIndex > 0) { results.push({ selectorIndex: pickedSelectorIndex, element: elem }); } else if (elem.children) { walk(elem.children); } if (results.length >= options.limits.maxBaseElements) { return; } } } const limitedWalk = limitedDepthRecursive( options.limits.maxDepth, recursiveWalk ); limitedWalk(dom); if (options.baseElements.orderBy !== 'occurrence') { // 'selectors' results.sort((a, b) => a.selectorIndex - b.selectorIndex); } return (options.baseElements.returnDomByDefault && results.length === 0) ? dom : results.map(x => x.element); } /** * Function to walk through DOM nodes and accumulate their string representations. * * @param { RecursiveCallback } walk Recursive callback. * @param { DomNode[] } [dom] Nodes array to process. * @param { BlockTextBuilder } builder Passed around to accumulate output text. 
* @private */ function recursiveWalk (walk, dom, builder) { if (!dom) { return; } const options = builder.options; const tooManyChildNodes = dom.length > options.limits.maxChildNodes; if (tooManyChildNodes) { dom = dom.slice(0, options.limits.maxChildNodes); dom.push({ data: options.limits.ellipsis, type: 'text' }); } for (const elem of dom) { switch (elem.type) { case 'text': { builder.addInline(elem.data); break; } case 'tag': { const tagDefinition = builder.picker.pick1(elem); const format = options.formatters[tagDefinition.format]; format(elem, walk, builder, tagDefinition.options || {}); break; } } } return; } /** * @param { Object } dict * A dictionary where keys are characters to replace * and values are replacement strings. * * First code point from dict keys is used. * Compound emojis with ZWJ are not supported (not until Node 16). * * @returns { ((str: string) => string) | undefined } */ function makeReplacerFromDict (dict) { if (!dict || Object.keys(dict).length === 0) { return undefined; } /** @type { [string, string][] } */ const entries = Object.entries(dict).filter(([, v]) => v !== false); const regex = new RegExp( entries .map(([c]) => `(${unicodeEscape([...c][0])})`) .join('|'), 'g' ); const values = entries.map(([, v]) => v); const replacer = (m, ...cgs) => values[cgs.findIndex(cg => cg)]; return (str) => str.replace(regex, replacer); } /** * Dummy formatter that discards the input and does nothing. * * @type { FormatCallback } */ function formatSkip (elem, walk, builder, formatOptions) { /* do nothing */ } /** * Insert the given string literal inline instead of a tag. * * @type { FormatCallback } */ function formatInlineString (elem, walk, builder, formatOptions) { builder.addLiteral(formatOptions.string || ''); } /** * Insert a block with the given string literal instead of a tag. 
* * @type { FormatCallback } */ function formatBlockString (elem, walk, builder, formatOptions) { builder.openBlock({ leadingLineBreaks: formatOptions.leadingLineBreaks || 2 }); builder.addLiteral(formatOptions.string || ''); builder.closeBlock({ trailingLineBreaks: formatOptions.trailingLineBreaks || 2 }); } /** * Process an inline-level element. * * @type { FormatCallback } */ function formatInline (elem, walk, builder, formatOptions) { walk(elem.children, builder); } /** * Process a block-level container. * * @type { FormatCallback } */ function formatBlock$1 (elem, walk, builder, formatOptions) { builder.openBlock({ leadingLineBreaks: formatOptions.leadingLineBreaks || 2 }); walk(elem.children, builder); builder.closeBlock({ trailingLineBreaks: formatOptions.trailingLineBreaks || 2 }); } function renderOpenTag (elem) { const attrs = (elem.attribs && elem.attribs.length) ? ' ' + Object.entries(elem.attribs) .map(([k, v]) => ((v === '') ? k : `${k}=${v.replace(/"/g, '"')}`)) .join(' ') : ''; return `<${elem.name}${attrs}>`; } function renderCloseTag (elem) { return ``; } /** * Render an element as inline HTML tag, walk through it's children. * * @type { FormatCallback } */ function formatInlineTag (elem, walk, builder, formatOptions) { builder.startNoWrap(); builder.addLiteral(renderOpenTag(elem)); builder.stopNoWrap(); walk(elem.children, builder); builder.startNoWrap(); builder.addLiteral(renderCloseTag(elem)); builder.stopNoWrap(); } /** * Render an element as HTML block bag, walk through it's children. 
* * @type { FormatCallback } */ function formatBlockTag (elem, walk, builder, formatOptions) { builder.openBlock({ leadingLineBreaks: formatOptions.leadingLineBreaks || 2 }); builder.startNoWrap(); builder.addLiteral(renderOpenTag(elem)); builder.stopNoWrap(); walk(elem.children, builder); builder.startNoWrap(); builder.addLiteral(renderCloseTag(elem)); builder.stopNoWrap(); builder.closeBlock({ trailingLineBreaks: formatOptions.trailingLineBreaks || 2 }); } /** * Render an element with all it's children as inline HTML. * * @type { FormatCallback } */ function formatInlineHtml (elem, walk, builder, formatOptions) { builder.startNoWrap(); builder.addLiteral( render(elem, { decodeEntities: builder.options.decodeEntities }) ); builder.stopNoWrap(); } /** * Render an element with all it's children as HTML block. * * @type { FormatCallback } */ function formatBlockHtml (elem, walk, builder, formatOptions) { builder.openBlock({ leadingLineBreaks: formatOptions.leadingLineBreaks || 2 }); builder.startNoWrap(); builder.addLiteral( render(elem, { decodeEntities: builder.options.decodeEntities }) ); builder.stopNoWrap(); builder.closeBlock({ trailingLineBreaks: formatOptions.trailingLineBreaks || 2 }); } /** * Render inline element wrapped with given strings. 
* * @type { FormatCallback } */ function formatInlineSurround (elem, walk, builder, formatOptions) { builder.addLiteral(formatOptions.prefix || ''); walk(elem.children, builder); builder.addLiteral(formatOptions.suffix || ''); } var genericFormatters = /*#__PURE__*/Object.freeze({ __proto__: null, block: formatBlock$1, blockHtml: formatBlockHtml, blockString: formatBlockString, blockTag: formatBlockTag, inline: formatInline, inlineHtml: formatInlineHtml, inlineString: formatInlineString, inlineSurround: formatInlineSurround, inlineTag: formatInlineTag, skip: formatSkip }); function getRow (matrix, j) { if (!matrix[j]) { matrix[j] = []; } return matrix[j]; } function findFirstVacantIndex (row, x = 0) { while (row[x]) { x++; } return x; } function transposeInPlace (matrix, maxSize) { for (let i = 0; i < maxSize; i++) { const rowI = getRow(matrix, i); for (let j = 0; j < i; j++) { const rowJ = getRow(matrix, j); if (rowI[j] || rowJ[i]) { const temp = rowI[j]; rowI[j] = rowJ[i]; rowJ[i] = temp; } } } } function putCellIntoLayout (cell, layout, baseRow, baseCol) { for (let r = 0; r < cell.rowspan; r++) { const layoutRow = getRow(layout, baseRow + r); for (let c = 0; c < cell.colspan; c++) { layoutRow[baseCol + c] = cell; } } } function getOrInitOffset (offsets, index) { if (offsets[index] === undefined) { offsets[index] = (index === 0) ? 0 : 1 + getOrInitOffset(offsets, index - 1); } return offsets[index]; } function updateOffset (offsets, base, span, value) { offsets[base + span] = Math.max( getOrInitOffset(offsets, base + span), getOrInitOffset(offsets, base) + value ); } /** * Render a table into a string. * Cells can contain multiline text and span across multiple rows and columns. * * Modifies cells to add lines array. * * @param { TablePrinterCell[][] } tableRows Table to render. * @param { number } rowSpacing Number of spaces between columns. * @param { number } colSpacing Number of empty lines between rows. 
* @returns { string }
 */
function tableToString (tableRows, rowSpacing, colSpacing) {
  // `layout[row][col]` refers to the cell covering that position
  // (the same cell object is repeated for every position it spans).
  const layout = [];
  let colNumber = 0;
  const rowNumber = tableRows.length;
  // `rowOffsets[j]` is the index of the output line where table row `j` begins.
  const rowOffsets = [0];
  // Fill the layout table and row offsets row-by-row.
  for (let j = 0; j < rowNumber; j++) {
    const layoutRow = getRow(layout, j);
    const cells = tableRows[j];
    let x = 0;
    for (let i = 0; i < cells.length; i++) {
      const cell = cells[i];
      // Skip positions already taken by cells spanning from the rows above.
      x = findFirstVacantIndex(layoutRow, x);
      putCellIntoLayout(cell, layout, j, x);
      x += cell.colspan;
      cell.lines = cell.text.split('\n');
      const cellHeight = cell.lines.length;
      // `rowSpacing` is added to the cell height, i.e. it is measured in output lines.
      updateOffset(rowOffsets, j, cell.rowspan, cellHeight + rowSpacing);
    }
    colNumber = (layoutRow.length > colNumber) ? layoutRow.length : colNumber;
  }

  // From here on the layout is indexed as `layout[col][row]`.
  transposeInPlace(layout, (rowNumber > colNumber) ? rowNumber : colNumber);

  const outputLines = [];
  // `colOffsets[x]` is the character position where column `x` begins.
  const colOffsets = [0];
  // Fill column offsets and output lines column-by-column.
  for (let x = 0; x < colNumber; x++) {
    let y = 0;
    let cell;
    const rowsInThisColumn = Math.min(rowNumber, layout[x].length);
    while (y < rowsInThisColumn) {
      cell = layout[x][y];
      if (cell) {
        // A spanning cell occurs at several positions but is printed only once.
        if (!cell.rendered) {
          let cellWidth = 0;
          for (let j = 0; j < cell.lines.length; j++) {
            const line = cell.lines[j];
            const lineOffset = rowOffsets[y] + j;
            // Pad what is already on the line up to the column start, then append.
            outputLines[lineOffset] = (outputLines[lineOffset] || '').padEnd(colOffsets[x]) + line;
            cellWidth = (line.length > cellWidth) ? line.length : cellWidth;
          }
          // `colSpacing` is added to the cell width, i.e. it is measured in characters.
          updateOffset(colOffsets, x, cell.colspan, cellWidth + colSpacing);
          cell.rendered = true;
        }
        y += cell.rowspan;
      } else {
        // Vacant position: make sure the first output line of the row exists.
        const lineOffset = rowOffsets[y];
        outputLines[lineOffset] = (outputLines[lineOffset] || '');
        y++;
      }
    }
  }

  return outputLines.join('\n');
}

/**
 * Process a line-break.
 *
 * @type { FormatCallback }
 */
function formatLineBreak (elem, walk, builder, formatOptions) {
  builder.addLineBreak();
}

/**
 * Process a `wbr` tag (word break opportunity).
* * @type { FormatCallback } */ function formatWbr (elem, walk, builder, formatOptions) { builder.addWordBreakOpportunity(); } /** * Process a horizontal line. * * @type { FormatCallback } */ function formatHorizontalLine (elem, walk, builder, formatOptions) { builder.openBlock({ leadingLineBreaks: formatOptions.leadingLineBreaks || 2 }); builder.addInline('-'.repeat(formatOptions.length || builder.options.wordwrap || 40)); builder.closeBlock({ trailingLineBreaks: formatOptions.trailingLineBreaks || 2 }); } /** * Process a paragraph. * * @type { FormatCallback } */ function formatParagraph (elem, walk, builder, formatOptions) { builder.openBlock({ leadingLineBreaks: formatOptions.leadingLineBreaks || 2 }); walk(elem.children, builder); builder.closeBlock({ trailingLineBreaks: formatOptions.trailingLineBreaks || 2 }); } /** * Process a preformatted content. * * @type { FormatCallback } */ function formatPre (elem, walk, builder, formatOptions) { builder.openBlock({ isPre: true, leadingLineBreaks: formatOptions.leadingLineBreaks || 2 }); walk(elem.children, builder); builder.closeBlock({ trailingLineBreaks: formatOptions.trailingLineBreaks || 2 }); } /** * Process a heading. * * @type { FormatCallback } */ function formatHeading (elem, walk, builder, formatOptions) { builder.openBlock({ leadingLineBreaks: formatOptions.leadingLineBreaks || 2 }); if (formatOptions.uppercase !== false) { builder.pushWordTransform(str => str.toUpperCase()); walk(elem.children, builder); builder.popWordTransform(); } else { walk(elem.children, builder); } builder.closeBlock({ trailingLineBreaks: formatOptions.trailingLineBreaks || 2 }); } /** * Process a blockquote. 
* * @type { FormatCallback } */ function formatBlockquote (elem, walk, builder, formatOptions) { builder.openBlock({ leadingLineBreaks: formatOptions.leadingLineBreaks || 2, reservedLineLength: 2 }); walk(elem.children, builder); builder.closeBlock({ trailingLineBreaks: formatOptions.trailingLineBreaks || 2, blockTransform: str => ((formatOptions.trimEmptyLines !== false) ? trimCharacter(str, '\n') : str) .split('\n') .map(line => '> ' + line) .join('\n') }); } function withBrackets (str, brackets) { if (!brackets) { return str; } const lbr = (typeof brackets[0] === 'string') ? brackets[0] : '['; const rbr = (typeof brackets[1] === 'string') ? brackets[1] : ']'; return lbr + str + rbr; } function pathRewrite (path, rewriter, baseUrl, metadata, elem) { const modifiedPath = (typeof rewriter === 'function') ? rewriter(path, metadata, elem) : path; return (modifiedPath[0] === '/' && baseUrl) ? trimCharacterEnd(baseUrl, '/') + modifiedPath : modifiedPath; } /** * Process an image. * * @type { FormatCallback } */ function formatImage (elem, walk, builder, formatOptions) { const attribs = elem.attribs || {}; const alt = (attribs.alt) ? attribs.alt : ''; const src = (!attribs.src) ? '' : pathRewrite(attribs.src, formatOptions.pathRewrite, formatOptions.baseUrl, builder.metadata, elem); const text = (!src) ? alt : (!alt) ? withBrackets(src, formatOptions.linkBrackets) : alt + ' ' + withBrackets(src, formatOptions.linkBrackets); builder.addInline(text, { noWordTransform: true }); } // a img baseUrl // a img pathRewrite // a img linkBrackets // a ignoreHref: false // ignoreText ? // a noAnchorUrl: true // can be replaced with selector // a hideLinkHrefIfSameAsText: false // how to compare, what to show (text, href, normalized) ? // a mailto protocol removed without options // a protocols: mailto, tel, ... // can be matched with selector? // anchors, protocols - only if no pathRewrite fn is provided // normalize-url ? 
// a // a[href^="#"] - format:skip by default // a[href^="mailto:"] - ? /** * Process an anchor. * * @type { FormatCallback } */ function formatAnchor (elem, walk, builder, formatOptions) { function getHref () { if (formatOptions.ignoreHref) { return ''; } if (!elem.attribs || !elem.attribs.href) { return ''; } let href = elem.attribs.href.replace(/^mailto:/, ''); if (formatOptions.noAnchorUrl && href[0] === '#') { return ''; } href = pathRewrite(href, formatOptions.pathRewrite, formatOptions.baseUrl, builder.metadata, elem); return href; } const href = getHref(); if (!href) { walk(elem.children, builder); } else { let text = ''; builder.pushWordTransform( str => { if (str) { text += str; } return str; } ); walk(elem.children, builder); builder.popWordTransform(); const hideSameLink = formatOptions.hideLinkHrefIfSameAsText && href === text; if (!hideSameLink) { builder.addInline( (!text) ? href : ' ' + withBrackets(href, formatOptions.linkBrackets), { noWordTransform: true } ); } } } /** * @param { DomNode } elem List items with their prefixes. * @param { RecursiveCallback } walk Recursive callback to process child nodes. * @param { BlockTextBuilder } builder Passed around to accumulate output text. * @param { FormatOptions } formatOptions Options specific to a formatter. * @param { () => string } nextPrefixCallback Function that returns increasing index each time it is called. */ function formatList (elem, walk, builder, formatOptions, nextPrefixCallback) { const isNestedList = get(elem, ['parent', 'name']) === 'li'; // With Roman numbers, index length is not as straightforward as with Arabic numbers or letters, // so the dumb length comparison is the most robust way to get the correct value. 
let maxPrefixLength = 0; const listItems = (elem.children || []) // it might be more accurate to check only for html spaces here, but no significant benefit .filter(child => child.type !== 'text' || !/^\s*$/.test(child.data)) .map(function (child) { if (child.name !== 'li') { return { node: child, prefix: '' }; } const prefix = (isNestedList) ? nextPrefixCallback().trimStart() : nextPrefixCallback(); if (prefix.length > maxPrefixLength) { maxPrefixLength = prefix.length; } return { node: child, prefix: prefix }; }); if (!listItems.length) { return; } builder.openList({ interRowLineBreaks: 1, leadingLineBreaks: isNestedList ? 1 : (formatOptions.leadingLineBreaks || 2), maxPrefixLength: maxPrefixLength, prefixAlign: 'left' }); for (const { node, prefix } of listItems) { builder.openListItem({ prefix: prefix }); walk([node], builder); builder.closeListItem(); } builder.closeList({ trailingLineBreaks: isNestedList ? 1 : (formatOptions.trailingLineBreaks || 2) }); } /** * Process an unordered list. * * @type { FormatCallback } */ function formatUnorderedList (elem, walk, builder, formatOptions) { const prefix = formatOptions.itemPrefix || ' * '; return formatList(elem, walk, builder, formatOptions, () => prefix); } /** * Process an ordered list. * * @type { FormatCallback } */ function formatOrderedList (elem, walk, builder, formatOptions) { let nextIndex = Number(elem.attribs.start || '1'); const indexFunction = getOrderedListIndexFunction(elem.attribs.type); const nextPrefixCallback = () => ' ' + indexFunction(nextIndex++) + '. '; return formatList(elem, walk, builder, formatOptions, nextPrefixCallback); } /** * Return a function that can be used to generate index markers of a specified format. * * @param { string } [olType='1'] Marker type. 
* @returns { (i: number) => string } */ function getOrderedListIndexFunction (olType = '1') { switch (olType) { case 'a': return (i) => numberToLetterSequence(i, 'a'); case 'A': return (i) => numberToLetterSequence(i, 'A'); case 'i': return (i) => numberToRoman(i).toLowerCase(); case 'I': return (i) => numberToRoman(i); case '1': default: return (i) => (i).toString(); } } /** * Given a list of class and ID selectors (prefixed with '.' and '#'), * return them as separate lists of names without prefixes. * * @param { string[] } selectors Class and ID selectors (`[".class", "#id"]` etc). * @returns { { classes: string[], ids: string[] } } */ function splitClassesAndIds (selectors) { const classes = []; const ids = []; for (const selector of selectors) { if (selector.startsWith('.')) { classes.push(selector.substring(1)); } else if (selector.startsWith('#')) { ids.push(selector.substring(1)); } } return { classes: classes, ids: ids }; } function isDataTable (attr, tables) { if (tables === true) { return true; } if (!attr) { return false; } const { classes, ids } = splitClassesAndIds(tables); const attrClasses = (attr['class'] || '').split(' '); const attrIds = (attr['id'] || '').split(' '); return attrClasses.some(x => classes.includes(x)) || attrIds.some(x => ids.includes(x)); } /** * Process a table (either as a container or as a data table, depending on options). * * @type { FormatCallback } */ function formatTable (elem, walk, builder, formatOptions) { return isDataTable(elem.attribs, builder.options.tables) ? formatDataTable(elem, walk, builder, formatOptions) : formatBlock(elem, walk, builder, formatOptions); } function formatBlock (elem, walk, builder, formatOptions) { builder.openBlock({ leadingLineBreaks: formatOptions.leadingLineBreaks }); walk(elem.children, builder); builder.closeBlock({ trailingLineBreaks: formatOptions.trailingLineBreaks }); } /** * Process a data table. 
*
 * @type { FormatCallback }
 */
function formatDataTable (elem, walk, builder, formatOptions) {
  builder.openTable();
  // `walkTable` and `formatCell` are function declarations below; they are hoisted.
  elem.children.forEach(walkTable);
  builder.closeTable({
    // Spacing defaults apply only when the option is `null` or `undefined`, so `0` is respected.
    tableToString: (rows) => tableToString(rows, formatOptions.rowSpacing ?? 0, formatOptions.colSpacing ?? 3),
    leadingLineBreaks: formatOptions.leadingLineBreaks,
    trailingLineBreaks: formatOptions.trailingLineBreaks
  });

  // Emit a single cell; missing, zero or non-numeric spans count as 1.
  function formatCell (cellNode) {
    const colspan = +get(cellNode, ['attribs', 'colspan']) || 1;
    const rowspan = +get(cellNode, ['attribs', 'rowspan']) || 1;
    builder.openTableCell({ maxColumnWidth: formatOptions.maxColumnWidth });
    walk(cellNode.children, builder);
    builder.closeTableCell({ colspan: colspan, rowspan: rowspan });
  }

  // Process a child of the table: descend into grouping elements, emit rows,
  // ignore everything else.
  function walkTable (elem) {
    if (elem.type !== 'tag') { return; }

    // Header cells are uppercased unless `uppercaseHeaderCells` is `false`.
    const formatHeaderCell = (formatOptions.uppercaseHeaderCells !== false)
      ? (cellNode) => {
        builder.pushWordTransform(str => str.toUpperCase());
        formatCell(cellNode);
        builder.popWordTransform();
      }
      : formatCell;

    switch (elem.name) {
      case 'thead':
      case 'tbody':
      case 'tfoot':
      case 'center':
        elem.children.forEach(walkTable);
        return;

      case 'tr': {
        builder.openTableRow();
        for (const childOfTr of elem.children) {
          if (childOfTr.type !== 'tag') { continue; }
          switch (childOfTr.name) {
            case 'th': {
              formatHeaderCell(childOfTr);
              break;
            }
            case 'td': {
              formatCell(childOfTr);
              break;
            }
            // do nothing
          }
        }
        builder.closeTableRow();
        break;
      }
      // do nothing
    }
  }
}

var textFormatters = /*#__PURE__*/Object.freeze({
  __proto__: null,
  anchor: formatAnchor,
  blockquote: formatBlockquote,
  dataTable: formatDataTable,
  heading: formatHeading,
  horizontalLine: formatHorizontalLine,
  image: formatImage,
  lineBreak: formatLineBreak,
  orderedList: formatOrderedList,
  paragraph: formatParagraph,
  pre: formatPre,
  table: formatTable,
  unorderedList: formatUnorderedList,
  wbr: formatWbr
});

/**
 * Default options.
*
 * @constant
 * @type { Options }
 * @default
 * @private
 */
const DEFAULT_OPTIONS = {
  baseElements: {
    selectors: [ 'body' ],
    orderBy: 'selectors', // 'selectors' | 'occurrence'
    returnDomByDefault: true
  },
  decodeEntities: true,
  encodeCharacters: {},
  formatters: {},
  limits: {
    ellipsis: '...',
    maxBaseElements: undefined,
    maxChildNodes: undefined,
    maxDepth: undefined,
    maxInputLength: (1 << 24) // 16_777_216
  },
  longWordSplit: {
    forceWrapOnLimit: false,
    wrapCharacters: []
  },
  preserveNewlines: false,
  selectors: [
    // Catch-all entry: any element without its own selector is treated as inline.
    { selector: '*', format: 'inline' },
    {
      selector: 'a',
      format: 'anchor',
      options: {
        baseUrl: null,
        hideLinkHrefIfSameAsText: false,
        ignoreHref: false,
        linkBrackets: ['[', ']'],
        noAnchorUrl: true
      }
    },
    { selector: 'article', format: 'block', options: { leadingLineBreaks: 1, trailingLineBreaks: 1 } },
    { selector: 'aside', format: 'block', options: { leadingLineBreaks: 1, trailingLineBreaks: 1 } },
    {
      selector: 'blockquote',
      format: 'blockquote',
      options: { leadingLineBreaks: 2, trailingLineBreaks: 2, trimEmptyLines: true }
    },
    { selector: 'br', format: 'lineBreak' },
    { selector: 'div', format: 'block', options: { leadingLineBreaks: 1, trailingLineBreaks: 1 } },
    { selector: 'footer', format: 'block', options: { leadingLineBreaks: 1, trailingLineBreaks: 1 } },
    { selector: 'form', format: 'block', options: { leadingLineBreaks: 1, trailingLineBreaks: 1 } },
    { selector: 'h1', format: 'heading', options: { leadingLineBreaks: 3, trailingLineBreaks: 2, uppercase: true } },
    { selector: 'h2', format: 'heading', options: { leadingLineBreaks: 3, trailingLineBreaks: 2, uppercase: true } },
    { selector: 'h3', format: 'heading', options: { leadingLineBreaks: 3, trailingLineBreaks: 2, uppercase: true } },
    { selector: 'h4', format: 'heading', options: { leadingLineBreaks: 2, trailingLineBreaks: 2, uppercase: true } },
    { selector: 'h5', format: 'heading', options: { leadingLineBreaks: 2, trailingLineBreaks: 2, uppercase: true } },
    { selector: 'h6', format: 'heading', options: { leadingLineBreaks: 2, trailingLineBreaks: 2, uppercase: true } },
    { selector: 'header', format: 'block', options: { leadingLineBreaks: 1, trailingLineBreaks: 1 } },
    {
      selector: 'hr',
      format: 'horizontalLine',
      options: { leadingLineBreaks: 2, length: undefined, trailingLineBreaks: 2 }
    },
    {
      selector: 'img',
      format: 'image',
      options: { baseUrl: null, linkBrackets: ['[', ']'] }
    },
    { selector: 'main', format: 'block', options: { leadingLineBreaks: 1, trailingLineBreaks: 1 } },
    { selector: 'nav', format: 'block', options: { leadingLineBreaks: 1, trailingLineBreaks: 1 } },
    {
      selector: 'ol',
      format: 'orderedList',
      options: { leadingLineBreaks: 2, trailingLineBreaks: 2 }
    },
    { selector: 'p', format: 'paragraph', options: { leadingLineBreaks: 2, trailingLineBreaks: 2 } },
    { selector: 'pre', format: 'pre', options: { leadingLineBreaks: 2, trailingLineBreaks: 2 } },
    { selector: 'section', format: 'block', options: { leadingLineBreaks: 1, trailingLineBreaks: 1 } },
    {
      selector: 'table',
      format: 'table',
      options: {
        colSpacing: 3,
        leadingLineBreaks: 2,
        maxColumnWidth: 60,
        rowSpacing: 0,
        trailingLineBreaks: 2,
        uppercaseHeaderCells: true
      }
    },
    {
      selector: 'ul',
      format: 'unorderedList',
      options: { itemPrefix: ' * ', leadingLineBreaks: 2, trailingLineBreaks: 2 }
    },
    { selector: 'wbr', format: 'wbr' },
  ],
  tables: [], // deprecated
  whitespaceCharacters: ' \t\r\n\f\u200b',
  wordwrap: 80
};

// Array merge strategy: keep the items of both arrays, in order.
const concatMerge = (acc, src, options) => [...acc, ...src];
// Array merge strategy: the source array replaces the accumulated one.
const overwriteMerge = (acc, src, options) => [...src];
// Strategy for any key named `selectors`: arrays with object items are concatenated,
// arrays without object items are overwritten.
const selectorsMerge = (acc, src, options) => (
  (acc.some(s => typeof s === 'object'))
    ? concatMerge(acc, src) // selectors
    : overwriteMerge(acc, src) // baseElements.selectors
);

/**
 * Preprocess options, compile selectors into a decision tree,
 * return a function intended for batch processing.
 *
 * @param { Options } [options = {}] HtmlToText options.
 * @returns { (html: string, metadata?: any) => string } Pre-configured converter function.
* @static */ function compile (options = {}) { options = merge( DEFAULT_OPTIONS, options, { arrayMerge: overwriteMerge, customMerge: (key) => ((key === 'selectors') ? selectorsMerge : undefined) } ); options.formatters = Object.assign({}, genericFormatters, textFormatters, options.formatters); options.selectors = mergeDuplicatesPreferLast(options.selectors, (s => s.selector)); handleDeprecatedOptions(options); return compile$1(options); } /** * Convert given HTML content to plain text string. * * @param { string } html HTML content to convert. * @param { Options } [options = {}] HtmlToText options. * @param { any } [metadata] Optional metadata for HTML document, for use in formatters. * @returns { string } Plain text string. * @static * * @example * const { convert } = require('html-to-text'); * const text = convert('

Hello World

', { * wordwrap: 130 * }); * console.log(text); // HELLO WORLD */ function convert (html, options = {}, metadata = undefined) { return compile(options)(html, metadata); } /** * Map previously existing and now deprecated options to the new options layout. * This is a subject for cleanup in major releases. * * @param { Options } options HtmlToText options. */ function handleDeprecatedOptions (options) { if (options.tags) { const tagDefinitions = Object.entries(options.tags).map( ([selector, definition]) => ({ ...definition, selector: selector || '*' }) ); options.selectors.push(...tagDefinitions); options.selectors = mergeDuplicatesPreferLast(options.selectors, (s => s.selector)); } function set (obj, path, value) { const valueKey = path.pop(); for (const key of path) { let nested = obj[key]; if (!nested) { nested = {}; obj[key] = nested; } obj = nested; } obj[valueKey] = value; } if (options['baseElement']) { const baseElement = options['baseElement']; set( options, ['baseElements', 'selectors'], (Array.isArray(baseElement) ? baseElement : [baseElement]) ); } if (options['returnDomByDefault'] !== undefined) { set(options, ['baseElements', 'returnDomByDefault'], options['returnDomByDefault']); } for (const definition of options.selectors) { if (definition.format === 'anchor' && get(definition, ['options', 'noLinkBrackets'])) { set(definition, ['options', 'linkBrackets'], false); } } } export { compile, convert, convert as htmlToText };