2158 lines
65 KiB
JavaScript
2158 lines
65 KiB
JavaScript
|
'use strict';
|
||
|
|
||
|
Object.defineProperty(exports, '__esModule', { value: true });
|
||
|
|
||
|
var pluginHtmlparser2 = require('@selderee/plugin-htmlparser2');
|
||
|
var htmlparser2 = require('htmlparser2');
|
||
|
var selderee = require('selderee');
|
||
|
var merge = require('deepmerge');
|
||
|
var domSerializer = require('dom-serializer');
|
||
|
|
||
|
/**
 * Normalize a required module so that it can always be read through `.default`.
 *
 * An object that already has a `default` property is returned unchanged;
 * anything else is wrapped into `{ default: value }`.
 *
 * @param { any } e A value returned by `require`.
 * @returns { { default: any } }
 */
function _interopDefaultLegacy (e) {
  const hasDefaultProperty = e && typeof e === 'object' && 'default' in e;
  if (hasDefaultProperty) { return e; }
  return { 'default': e };
}
|
||
|
|
||
|
var merge__default = /*#__PURE__*/_interopDefaultLegacy(merge);
|
||
|
|
||
|
/**
 * Build a recursive function whose recursion depth is capped.
 *
 * `f` receives the "recurse" callback as its first argument, followed by
 * the arguments of the call. Once the allowed depth is used up, the
 * callback handed to `f` is `g` instead.
 * When `n` is `undefined` there is no cap: `f` simply receives itself (wrapped).
 *
 * @param { number | undefined } n Allowed depth of recursion. `undefined` for no limitation.
 * @param { Function } f Function that accepts recursive callback as the first argument.
 * @param { Function } [g] Function to run instead, when maximum depth was reached. Do nothing by default.
 * @returns { Function }
 */
function limitedDepthRecursive (n, f, g = () => undefined) {
  if (n === undefined) {
    // Unlimited: the wrapper passes itself as the recursion callback.
    const unbounded = (...args) => f(unbounded, ...args);
    return unbounded;
  }
  // Depth budget exhausted (or `n` is not comparable): hand out the fallback.
  if (!(n >= 0)) { return g; }
  // Each level creates the next, shallower level lazily - only when called.
  return (...args) => f(limitedDepthRecursive(n - 1, f, g), ...args);
}
|
||
|
|
||
|
/**
 * Strip every leading and trailing occurrence of one character.
 *
 * The original string is returned as is when there is nothing to strip.
 *
 * @param { string } str A string to trim.
 * @param { string } char A character to be trimmed.
 * @returns { string }
 */
function trimCharacter (str, char) {
  let from = 0;
  while (from < str.length && str[from] === char) { from += 1; }

  // Never move the right edge past the left one (string made of `char` only).
  let to = str.length;
  while (to > from && str[to - 1] === char) { to -= 1; }

  if (from === 0 && to === str.length) { return str; }
  return str.substring(from, to);
}
|
||
|
|
||
|
/**
 * Strip every trailing occurrence of one character.
 *
 * The original string is returned as is when it doesn't end with `char`.
 *
 * @param { string } str A string to trim.
 * @param { string } char A character to be trimmed.
 * @returns { string }
 */
function trimCharacterEnd (str, char) {
  let keep = str.length;
  for (; keep > 0; keep -= 1) {
    if (str[keep - 1] !== char) { break; }
  }
  if (keep === str.length) { return str; }
  return str.substring(0, keep);
}
|
||
|
|
||
|
/**
 * Return a new string with all characters replaced with unicode escape sequences.
 * This extreme kind of escaping can be used to safely compose regular expressions.
 *
 * Every UTF-16 code unit of the input becomes one `\uXXXX` sequence
 * (four lowercase hexadecimal digits).
 *
 * @param { string } str A string to escape.
 * @returns { string } A string of unicode escape sequences.
 */
function unicodeEscape (str) {
  let escaped = '';
  for (let i = 0; i < str.length; i++) {
    const hex = str.charCodeAt(i).toString(16);
    escaped += '\\u' + hex.padStart(4, '0');
  }
  return escaped;
}
|
||
|
|
||
|
/**
 * Deduplicate an array by a given key callback.
 *
 * Items sharing a key are merged recursively; where they disagree,
 * the value from the later item wins (arrays are replaced, not concatenated).
 * The merged item is placed where the last item with that key was,
 * earlier duplicates are dropped.
 *
 * @param { any[] } items An array to deduplicate.
 * @param { (x: any) => string } getKey Callback to get a value that distinguishes unique items.
 * @returns { any[] }
 */
function mergeDuplicatesPreferLast (items, getKey) {
  // Walk backwards so that the first time a key is seen is its last occurrence;
  // the Map then remembers keys in reversed output order.
  const mergedByKey = new Map();
  for (let index = items.length - 1; index >= 0; index--) {
    const current = items[index];
    const key = getKey(current);
    if (mergedByKey.has(key)) {
      // `current` is the earlier item - values accumulated so far override it.
      const laterValues = mergedByKey.get(key);
      mergedByKey.set(
        key,
        merge__default["default"](current, laterValues, { arrayMerge: overwriteMerge$1 })
      );
    } else {
      mergedByKey.set(key, current);
    }
  }
  return Array.from(mergedByKey.values()).reverse();
}
|
||
|
|
||
|
/**
 * Array merge strategy that discards the accumulated array
 * and returns a shallow copy of the source array.
 * Passed as the `arrayMerge` option of the merge call.
 */
const overwriteMerge$1 = (acc, src, options) => Array.from(src);
|
||
|
|
||
|
/**
 * Read a nested property by following a list of keys.
 *
 * The walk stops with `undefined` as soon as a falsy value is met
 * while there are still keys left to follow.
 *
 * @param { object } obj The object to query for the value.
 * @param { string[] } path The path to the property.
 * @returns { any }
 */
function get (obj, path) {
  let current = obj;
  for (const segment of path) {
    if (current) {
      current = current[segment];
    } else {
      return undefined;
    }
  }
  return current;
}
|
||
|
|
||
|
/**
 * Convert a number into alphabetic sequence representation (Sequence without zeroes).
 *
 * For example: `a, ..., z, aa, ..., zz, aaa, ...`.
 *
 * @param { number } num Number to convert. Must be >= 1.
 * @param { string } [baseChar = 'a'] Character for 1 in the sequence.
 * @param { number } [base = 26] Number of characters in the sequence.
 * @returns { string }
 */
function numberToLetterSequence (num, baseChar = 'a', base = 26) {
  const digits = [];
  let remaining = num;
  do {
    // The sequence has no zero digit, so shift by one before taking the remainder.
    remaining -= 1;
    digits.push(remaining % base);
    // `Math.trunc` instead of `>> 0`: the shift converts to a 32-bit integer
    // and produces wrong results once the quotient reaches 2^31.
    remaining = Math.trunc(remaining / base);
    // The finiteness check keeps degenerate inputs (`Infinity`, `base` of 0)
    // from looping forever - they end after one digit, as they did with `>> 0`.
  } while (remaining > 0 && Number.isFinite(remaining));
  const baseCode = baseChar.charCodeAt(0);
  return digits
    .reverse()
    .map(n => String.fromCharCode(baseCode + n))
    .join('');
}
|
||
|
|
||
|
// Roman symbols for 1 and 5 at each decimal position (ones, tens, hundreds, thousands).
const I = ['I', 'X', 'C', 'M'];
const V = ['V', 'L', 'D'];

/**
 * Convert a number to its Roman representation. No large numbers extension.
 *
 * Each decimal digit is translated on its own, using the "one" and "five"
 * symbols of its position and the "one" symbol of the next position for nine.
 *
 * @param { number } num Number to convert. `0 < num <= 3999`.
 * @returns { string }
 */
function numberToRoman (num) {
  const decimalDigits = [...(num + '')];
  const highestPower = decimalDigits.length - 1;
  let roman = '';
  for (let position = 0; position <= highestPower; position++) {
    const power = highestPower - position;
    const digit = +decimalDigits[position];
    if (digit % 5 < 4) {
      // 0-3 and 5-8: optional "five" symbol followed by repeated "one" symbols.
      roman += (digit < 5 ? '' : V[power]) + I[power].repeat(digit % 5);
    } else {
      // 4 and 9: subtractive notation.
      roman += I[power] + (digit < 5 ? V[power] : I[power + 1]);
    }
  }
  return roman;
}
|
||
|
|
||
|
/**
 * Helps to build text from words.
 *
 * Words are collected into the line currently being built (`nextLineWords`),
 * completed lines are kept in `lines`. `nextLineAvailableChars` is the number
 * of characters still available in the current line.
 */
class InlineTextBuilder {
  /**
   * Creates an instance of InlineTextBuilder.
   *
   * If `maxLineLength` is not provided then it is either `options.wordwrap` or unlimited.
   *
   * @param { Options } options HtmlToText options.
   * @param { number } [ maxLineLength ] This builder will try to wrap text to fit this line length.
   */
  constructor (options, maxLineLength = undefined) {
    /** @type { string[][] } */
    this.lines = [];
    /** @type { string[] } */
    this.nextLineWords = [];
    this.maxLineLength = maxLineLength || options.wordwrap || Number.MAX_VALUE;
    this.nextLineAvailableChars = this.maxLineLength;
    this.wrapCharacters = get(options, ['longWordSplit', 'wrapCharacters']) || [];
    this.forceWrapOnLimit = get(options, ['longWordSplit', 'forceWrapOnLimit']) || false;

    this.stashedSpace = false;
    this.wordBreakOpportunity = false;
  }

  /**
   * Add a new word.
   *
   * @param { string } word A word to add.
   * @param { boolean } [noWrap] Don't wrap text even if the line is too long.
   */
  pushWord (word, noWrap = false) {
    if (this.nextLineAvailableChars <= 0 && !noWrap) {
      this.startNewLine();
    }
    const isLineStart = this.nextLineWords.length === 0;
    // Every word except the first one in a line is preceded by a space.
    const cost = word.length + (isLineStart ? 0 : 1);
    if ((cost <= this.nextLineAvailableChars) || noWrap) { // Fits into available budget

      this.nextLineWords.push(word);
      this.nextLineAvailableChars -= cost;

    } else { // Does not fit - try to split the word

      // The word is moved to a new line - prefer to wrap between words.
      const [first, ...rest] = this.splitLongWord(word);
      if (!isLineStart) { this.startNewLine(); }
      this.nextLineWords.push(first);
      this.nextLineAvailableChars -= first.length;
      for (const part of rest) {
        this.startNewLine();
        this.nextLineWords.push(part);
        this.nextLineAvailableChars -= part.length;
      }

    }
  }

  /**
   * Pop a word from the currently built line.
   * This doesn't affect completed lines.
   *
   * @returns { string }
   */
  popWord () {
    const lastWord = this.nextLineWords.pop();
    if (lastWord !== undefined) {
      // Give back exactly what `pushWord` charged for this word.
      const isLineStart = this.nextLineWords.length === 0;
      const cost = lastWord.length + (isLineStart ? 0 : 1);
      this.nextLineAvailableChars += cost;
    }
    return lastWord;
  }

  /**
   * Concat a word to the last word already in the builder.
   * Adds a new word in case there are no words yet in the last line.
   *
   * @param { string } word A word to be concatenated.
   * @param { boolean } [noWrap] Don't wrap text even if the line is too long.
   */
  concatWord (word, noWrap = false) {
    if (this.wordBreakOpportunity && word.length > this.nextLineAvailableChars) {
      // A break is allowed here and the addition doesn't fit - add it as a separate word.
      this.pushWord(word, noWrap);
      this.wordBreakOpportunity = false;
    } else {
      const lastWord = this.popWord();
      this.pushWord((lastWord) ? lastWord.concat(word) : word, noWrap);
    }
  }

  /**
   * Add current line (and more empty lines if provided argument > 1) to the list of complete lines and start a new one.
   *
   * @param { number } n Number of line breaks that will be added to the resulting string.
   */
  startNewLine (n = 1) {
    this.lines.push(this.nextLineWords);
    if (n > 1) {
      this.lines.push(...Array.from({ length: n - 1 }, () => []));
    }
    this.nextLineWords = [];
    this.nextLineAvailableChars = this.maxLineLength;
  }

  /**
   * No words in this builder.
   *
   * @returns { boolean }
   */
  isEmpty () {
    return this.lines.length === 0
        && this.nextLineWords.length === 0;
  }

  /**
   * Drop all completed lines and the line being built,
   * restore the full character budget for the current line.
   */
  clear () {
    this.lines.length = 0;
    this.nextLineWords.length = 0;
    this.nextLineAvailableChars = this.maxLineLength;
  }

  /**
   * Join all lines of words inside the InlineTextBuilder into a complete string.
   *
   * @returns { string }
   */
  toString () {
    return [...this.lines, this.nextLineWords]
      .map(words => words.join(' '))
      .join('\n');
  }

  /**
   * Split a long word up to fit within the word wrap limit.
   * Use either a character to split looking back from the word wrap limit,
   * or truncate to the word wrap limit.
   *
   * Wrap characters are tried in their order in `wrapCharacters`.
   * When none of them is found, the rest of the word is either cut
   * at the limit (`forceWrapOnLimit`) or left as is.
   *
   * @param { string } word Input word.
   * @returns { string[] } Parts of the word.
   */
  splitLongWord (word) {
    const parts = [];
    let idx = 0;
    while (word.length > this.maxLineLength) {

      const firstLine = word.substring(0, this.maxLineLength);
      const remainingChars = word.substring(this.maxLineLength);

      // Only search while there is a wrap character left to try.
      // `lastIndexOf(undefined)` would look for the literal text "undefined"
      // and split words that happen to contain it.
      const splitIndex = (idx < this.wrapCharacters.length)
        ? firstLine.lastIndexOf(this.wrapCharacters[idx])
        : -1;

      if (splitIndex > -1) { // Found a character to split on

        word = firstLine.substring(splitIndex + 1) + remainingChars;
        parts.push(firstLine.substring(0, splitIndex + 1));

      } else { // Not found a character to split on

        idx++;
        if (idx >= this.wrapCharacters.length) { // No more characters to try

          if (this.forceWrapOnLimit) {
            parts.push(firstLine);
            word = remainingChars;
            if (word.length > this.maxLineLength) {
              continue;
            }
          }
          break;

        }
        // Otherwise: there is next character to try on the same word.

      }

    }
    parts.push(word); // Add remaining part to array
    return parts;
  }
}
|
||
|
|
||
|
/* eslint-disable max-classes-per-file */
|
||
|
|
||
|
|
||
|
/**
 * Base item of a singly linked stack.
 * Each item refers to the item below it through `next`.
 */
class StackItem {
  /**
   * @param { StackItem | null } [next] The item below this one in the stack.
   */
  constructor (next = null) { this.next = next; }

  /**
   * Get the bottom item of the stack this item belongs to.
   *
   * Follows the `next` links until an item without `next` is found
   * (previously only one step was made, which returned the direct parent
   * instead of the root for stacks deeper than two items).
   *
   * @returns { StackItem } The root item; this item itself when it has no `next`.
   */
  getRoot () {
    let item = this;
    while (item.next) { item = item.next; }
    return item;
  }
}
|
||
|
|
||
|
/**
 * Stack item for a block of text.
 * Owns an inline text builder for wrapped text and a raw text buffer,
 * and takes over the `isPre` / `isNoWrap` flags of the item below it.
 */
class BlockStackItem extends StackItem {
  /**
   * @param { Options } options HtmlToText options.
   * @param { StackItem | null } [next] The item below this one in the stack.
   * @param { number } [leadingLineBreaks] Stored as is on the item.
   * @param { number } [maxLineLength] Line length passed to the inline text builder.
   */
  constructor (options, next = null, leadingLineBreaks = 1, maxLineLength = undefined) {
    super(next);
    Object.assign(this, {
      leadingLineBreaks,
      inlineTextBuilder: new InlineTextBuilder(options, maxLineLength),
      rawText: '',
      stashedLineBreaks: 0,
      // Falsy when there is no item below.
      isPre: next && next.isPre,
      isNoWrap: next && next.isNoWrap,
    });
  }
}
|
||
|
|
||
|
/**
 * Stack item for a list.
 * Adds prefix layout settings and the spacing between rows to a block item.
 */
class ListStackItem extends BlockStackItem {
  /**
   * @param { Options } options HtmlToText options.
   * @param { StackItem | null } [next] The item below this one in the stack.
   * @param { object } [settings] List parameters; every property is optional.
   */
  constructor (options, next = null, settings = {}) {
    const {
      interRowLineBreaks = 1,
      leadingLineBreaks = 2,
      maxLineLength = undefined,
      maxPrefixLength = 0,
      prefixAlign = 'left',
    } = settings;
    super(options, next, leadingLineBreaks, maxLineLength);
    Object.assign(this, { maxPrefixLength, prefixAlign, interRowLineBreaks });
  }
}
|
||
|
|
||
|
/**
 * Stack item for a single list item.
 * A block item that additionally remembers the prefix of the item.
 */
class ListItemStackItem extends BlockStackItem {
  /**
   * @param { Options } options HtmlToText options.
   * @param { StackItem | null } [next] The item below this one in the stack.
   * @param { object } [settings] List item parameters; every property is optional.
   */
  constructor (options, next = null, settings = {}) {
    const {
      leadingLineBreaks = 1,
      maxLineLength = undefined,
      prefix = '',
    } = settings;
    super(options, next, leadingLineBreaks, maxLineLength);
    this.prefix = prefix;
  }
}
|
||
|
|
||
|
/**
 * Stack item for a table: collects rows and
 * takes over the `isPre` / `isNoWrap` flags of the item below it.
 */
class TableStackItem extends StackItem {
  /**
   * @param { StackItem | null } [next] The item below this one in the stack.
   */
  constructor (next = null) {
    super(next);
    Object.assign(this, {
      rows: [],
      // Falsy when there is no item below.
      isPre: next && next.isPre,
      isNoWrap: next && next.isNoWrap,
    });
  }
}
|
||
|
|
||
|
/**
 * Stack item for a table row: collects cells and
 * takes over the `isPre` / `isNoWrap` flags of the item below it.
 */
class TableRowStackItem extends StackItem {
  /**
   * @param { StackItem | null } [next] The item below this one in the stack.
   */
  constructor (next = null) {
    super(next);
    Object.assign(this, {
      cells: [],
      // Falsy when there is no item below.
      isPre: next && next.isPre,
      isNoWrap: next && next.isNoWrap,
    });
  }
}
|
||
|
|
||
|
/**
 * Stack item for a table cell.
 * Owns an inline text builder and a raw text buffer, and
 * takes over the `isPre` / `isNoWrap` flags of the item below it.
 */
class TableCellStackItem extends StackItem {
  /**
   * @param { Options } options HtmlToText options.
   * @param { StackItem | null } [next] The item below this one in the stack.
   * @param { number } [maxColumnWidth] Line length passed to the inline text builder.
   */
  constructor (options, next = null, maxColumnWidth = undefined) {
    super(next);
    Object.assign(this, {
      inlineTextBuilder: new InlineTextBuilder(options, maxColumnWidth),
      rawText: '',
      stashedLineBreaks: 0,
      // Falsy when there is no item below.
      isPre: next && next.isPre,
      isNoWrap: next && next.isNoWrap,
    });
  }
}
|
||
|
|
||
|
/**
 * Stack item that carries a transform function.
 */
class TransformerStackItem extends StackItem {
  /**
   * @param { StackItem | null } [next] The item below this one in the stack.
   * @param { Function } transform The function stored on this item.
   */
  constructor (next = null, transform) {
    super(next);
    Object.assign(this, { transform });
  }
}
|
||
|
|
||
|
/**
 * Turn each character of a string into a `\uXXXX` escape sequence
 * (four lowercase hexadecimal digits), for use inside a regular expression.
 *
 * The string is iterated character by character and the first
 * UTF-16 code unit of each character is encoded.
 *
 * @param { string } str Characters to encode.
 * @returns { string } Concatenated escape sequences.
 */
function charactersToCodes (str) {
  let codes = '';
  for (const character of str) {
    const hex = character.charCodeAt(0).toString(16);
    codes += '\\u' + hex.padStart(4, '0');
  }
  return codes;
}
|
||
|
|
||
|
/**
 * Helps to handle HTML whitespaces.
 *
 * The set of whitespace characters comes from `options.whitespaceCharacters`.
 * With `options.preserveNewlines` the newline character is taken out of that
 * set and handled as a line break instead.
 *
 * @class WhitespaceProcessor
 */
class WhitespaceProcessor {

  /**
   * Creates an instance of WhitespaceProcessor.
   *
   * @param { Options } options HtmlToText options.
   * @memberof WhitespaceProcessor
   */
  constructor (options) {
    this.whitespaceChars = (options.preserveNewlines)
      ? options.whitespaceCharacters.replace(/\n/g, '')
      : options.whitespaceCharacters;
    const whitespaceCodes = charactersToCodes(this.whitespaceChars);
    this.leadingWhitespaceRe = new RegExp(`^[${whitespaceCodes}]`);
    this.trailingWhitespaceRe = new RegExp(`[${whitespaceCodes}]$`);
    this.allWhitespaceOrEmptyRe = new RegExp(`^[${whitespaceCodes}]*$`);
    this.newlineOrNonWhitespaceRe = new RegExp(`(\\n|[^\\n${whitespaceCodes}])`, 'g');
    this.newlineOrNonNewlineStringRe = new RegExp(`(\\n|[^\\n]+)`, 'g');

    // Newlines are treated as line breaks only when they have to be preserved;
    // otherwise a newline is whatever the whitespace character set says it is.
    const newlinesBreakLines = Boolean(options.preserveNewlines);
    const wordRe = (newlinesBreakLines)
      ? new RegExp(`\\n|[^\\n${whitespaceCodes}]+`, 'gm')
      : new RegExp(`[^${whitespaceCodes}]+`, 'g');

    /**
     * Shrink whitespaces and wrap text, add to the builder.
     *
     * The first word is concatenated to the last word of the builder unless
     * a space was stashed before or the text starts with whitespace.
     * Every following word is added as a new word.
     *
     * @param { string } text Input text.
     * @param { InlineTextBuilder } inlineTextBuilder A builder to receive processed text.
     * @param { (str: string) => string } [ transform ] A transform to be applied to words.
     * @param { boolean } [noWrap] Don't wrap text even if the line is too long.
     */
    this.shrinkWrapAdd = function (text, inlineTextBuilder, transform = (str => str), noWrap = false) {
      if (!text) { return; }
      const previouslyStashedSpace = inlineTextBuilder.stashedSpace;
      // The regex is shared between calls. If a previous call was interrupted
      // by an exception, `lastIndex` is left in the middle of that text -
      // always start scanning from the beginning.
      wordRe.lastIndex = 0;
      let isFirstMatch = true;
      let m;
      while ((m = wordRe.exec(text)) !== null) {
        if (newlinesBreakLines && m[0] === '\n') {
          inlineTextBuilder.startNewLine();
        } else if (!isFirstMatch || previouslyStashedSpace || this.testLeadingWhitespace(text)) {
          inlineTextBuilder.pushWord(transform(m[0]), noWrap);
        } else {
          inlineTextBuilder.concatWord(transform(m[0]), noWrap);
        }
        isFirstMatch = false;
      }
      // `isFirstMatch` is still true here only if nothing was matched at all.
      inlineTextBuilder.stashedSpace = (previouslyStashedSpace && isFirstMatch) || (this.testTrailingWhitespace(text));
      // No need to stash a space in case last added item was a new line,
      // but that won't affect anything later anyway.
    };
  }

  /**
   * Add text with only minimal processing.
   * Everything between newlines considered a single word.
   * No whitespace is trimmed.
   * Not affected by preserveNewlines option - `\n` always starts a new line.
   *
   * `noWrap` argument is `true` by default - this won't start a new line
   * even if there is not enough space left in the current line.
   *
   * @param { string } text Input text.
   * @param { InlineTextBuilder } inlineTextBuilder A builder to receive processed text.
   * @param { boolean } [noWrap] Don't wrap text even if the line is too long.
   */
  addLiteral (text, inlineTextBuilder, noWrap = true) {
    if (!text) { return; }
    const previouslyStashedSpace = inlineTextBuilder.stashedSpace;
    const pieceRe = this.newlineOrNonNewlineStringRe;
    // Shared stateful regex - don't continue from where an interrupted call stopped.
    pieceRe.lastIndex = 0;
    let isFirstMatch = true;
    let m;
    while ((m = pieceRe.exec(text)) !== null) {
      if (m[0] === '\n') {
        inlineTextBuilder.startNewLine();
      } else if (!isFirstMatch || previouslyStashedSpace) {
        inlineTextBuilder.pushWord(m[0], noWrap);
      } else {
        inlineTextBuilder.concatWord(m[0], noWrap);
      }
      isFirstMatch = false;
    }
    inlineTextBuilder.stashedSpace = (previouslyStashedSpace && isFirstMatch);
  }

  /**
   * Test whether the given text starts with HTML whitespace character.
   *
   * @param { string } text The string to test.
   * @returns { boolean }
   */
  testLeadingWhitespace (text) {
    return this.leadingWhitespaceRe.test(text);
  }

  /**
   * Test whether the given text ends with HTML whitespace character.
   *
   * @param { string } text The string to test.
   * @returns { boolean }
   */
  testTrailingWhitespace (text) {
    return this.trailingWhitespaceRe.test(text);
  }

  /**
   * Test whether the given text contains any non-whitespace characters.
   *
   * @param { string } text The string to test.
   * @returns { boolean }
   */
  testContainsWords (text) {
    return !this.allWhitespaceOrEmptyRe.test(text);
  }

  /**
   * Return the number of newlines if there are no words.
   *
   * If any word is found then return zero regardless of the actual number of newlines.
   *
   * @param { string } text Input string.
   * @returns { number }
   */
  countNewlinesNoWords (text) {
    // The early return below leaves the shared regex mid-text, hence the reset.
    this.newlineOrNonWhitespaceRe.lastIndex = 0;
    let counter = 0;
    let match;
    while ((match = this.newlineOrNonWhitespaceRe.exec(text)) !== null) {
      if (match[0] === '\n') {
        counter++;
      } else {
        return 0;
      }
    }
    return counter;
  }

}
|
||
|
|
||
|
/**
|
||
|
* Helps to build text from inline and block elements.
|
||
|
*
|
||
|
* @class BlockTextBuilder
|
||
|
*/
|
||
|
class BlockTextBuilder {
|
||
|
|
||
|
/**
|
||
|
* Creates an instance of BlockTextBuilder.
|
||
|
*
|
||
|
* @param { Options } options HtmlToText options.
|
||
|
* @param { import('selderee').Picker<DomNode, TagDefinition> } picker Selectors decision tree picker.
|
||
|
* @param { any} [metadata] Optional metadata for HTML document, for use in formatters.
|
||
|
*/
|
||
|
constructor (options, picker, metadata = undefined) {
|
||
|
this.options = options;
|
||
|
this.picker = picker;
|
||
|
this.metadata = metadata;
|
||
|
this.whitespaceProcessor = new WhitespaceProcessor(options);
|
||
|
/** @type { StackItem } */
|
||
|
this._stackItem = new BlockStackItem(options);
|
||
|
/** @type { TransformerStackItem } */
|
||
|
this._wordTransformer = undefined;
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Put a word-by-word transform function onto the transformations stack.
|
||
|
*
|
||
|
* Mainly used for uppercasing. Can be bypassed to add unformatted text such as URLs.
|
||
|
*
|
||
|
* Word transformations applied before wrapping.
|
||
|
*
|
||
|
* @param { (str: string) => string } wordTransform Word transformation function.
|
||
|
*/
|
||
|
pushWordTransform (wordTransform) {
|
||
|
this._wordTransformer = new TransformerStackItem(this._wordTransformer, wordTransform);
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Remove a function from the word transformations stack.
|
||
|
*
|
||
|
* @returns { (str: string) => string } A function that was removed.
|
||
|
*/
|
||
|
popWordTransform () {
|
||
|
if (!this._wordTransformer) { return undefined; }
|
||
|
const transform = this._wordTransformer.transform;
|
||
|
this._wordTransformer = this._wordTransformer.next;
|
||
|
return transform;
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Ignore wordwrap option in followup inline additions and disable automatic wrapping.
|
||
|
*/
|
||
|
startNoWrap () {
|
||
|
this._stackItem.isNoWrap = true;
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Return automatic wrapping to behavior defined by options.
|
||
|
*/
|
||
|
stopNoWrap () {
|
||
|
this._stackItem.isNoWrap = false;
|
||
|
}
|
||
|
|
||
|
/** @returns { (str: string) => string } */
|
||
|
_getCombinedWordTransformer () {
|
||
|
const wt = (this._wordTransformer)
|
||
|
? ((str) => applyTransformer(str, this._wordTransformer))
|
||
|
: undefined;
|
||
|
const ce = this.options.encodeCharacters;
|
||
|
return (wt)
|
||
|
? ((ce) ? (str) => ce(wt(str)) : wt)
|
||
|
: ce;
|
||
|
}
|
||
|
|
||
|
_popStackItem () {
|
||
|
const item = this._stackItem;
|
||
|
this._stackItem = item.next;
|
||
|
return item;
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Add a line break into currently built block.
|
||
|
*/
|
||
|
addLineBreak () {
|
||
|
if (!(
|
||
|
this._stackItem instanceof BlockStackItem
|
||
|
|| this._stackItem instanceof ListItemStackItem
|
||
|
|| this._stackItem instanceof TableCellStackItem
|
||
|
)) { return; }
|
||
|
if (this._stackItem.isPre) {
|
||
|
this._stackItem.rawText += '\n';
|
||
|
} else {
|
||
|
this._stackItem.inlineTextBuilder.startNewLine();
|
||
|
}
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Allow to break line in case directly following text will not fit.
|
||
|
*/
|
||
|
addWordBreakOpportunity () {
|
||
|
if (
|
||
|
this._stackItem instanceof BlockStackItem
|
||
|
|| this._stackItem instanceof ListItemStackItem
|
||
|
|| this._stackItem instanceof TableCellStackItem
|
||
|
) {
|
||
|
this._stackItem.inlineTextBuilder.wordBreakOpportunity = true;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Add a node inline into the currently built block.
|
||
|
*
|
||
|
* @param { string } str
|
||
|
* Text content of a node to add.
|
||
|
*
|
||
|
* @param { object } [param1]
|
||
|
* Object holding the parameters of the operation.
|
||
|
*
|
||
|
* @param { boolean } [param1.noWordTransform]
|
||
|
* Ignore word transformers if there are any.
|
||
|
* Don't encode characters as well.
|
||
|
* (Use this for things like URL addresses).
|
||
|
*/
|
||
|
addInline (str, { noWordTransform = false } = {}) {
|
||
|
if (!(
|
||
|
this._stackItem instanceof BlockStackItem
|
||
|
|| this._stackItem instanceof ListItemStackItem
|
||
|
|| this._stackItem instanceof TableCellStackItem
|
||
|
)) { return; }
|
||
|
|
||
|
if (this._stackItem.isPre) {
|
||
|
this._stackItem.rawText += str;
|
||
|
return;
|
||
|
}
|
||
|
|
||
|
if (
|
||
|
str.length === 0 || // empty string
|
||
|
(
|
||
|
this._stackItem.stashedLineBreaks && // stashed linebreaks make whitespace irrelevant
|
||
|
!this.whitespaceProcessor.testContainsWords(str) // no words to add
|
||
|
)
|
||
|
) { return; }
|
||
|
|
||
|
if (this.options.preserveNewlines) {
|
||
|
const newlinesNumber = this.whitespaceProcessor.countNewlinesNoWords(str);
|
||
|
if (newlinesNumber > 0) {
|
||
|
this._stackItem.inlineTextBuilder.startNewLine(newlinesNumber);
|
||
|
// keep stashedLineBreaks unchanged
|
||
|
return;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
if (this._stackItem.stashedLineBreaks) {
|
||
|
this._stackItem.inlineTextBuilder.startNewLine(this._stackItem.stashedLineBreaks);
|
||
|
}
|
||
|
this.whitespaceProcessor.shrinkWrapAdd(
|
||
|
str,
|
||
|
this._stackItem.inlineTextBuilder,
|
||
|
(noWordTransform) ? undefined : this._getCombinedWordTransformer(),
|
||
|
this._stackItem.isNoWrap
|
||
|
);
|
||
|
this._stackItem.stashedLineBreaks = 0; // inline text doesn't introduce line breaks
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Add a string inline into the currently built block.
|
||
|
*
|
||
|
* Use this for markup elements that don't have to adhere
|
||
|
* to text layout rules.
|
||
|
*
|
||
|
* @param { string } str Text to add.
|
||
|
*/
|
||
|
addLiteral (str) {
|
||
|
if (!(
|
||
|
this._stackItem instanceof BlockStackItem
|
||
|
|| this._stackItem instanceof ListItemStackItem
|
||
|
|| this._stackItem instanceof TableCellStackItem
|
||
|
)) { return; }
|
||
|
|
||
|
if (str.length === 0) { return; }
|
||
|
|
||
|
if (this._stackItem.isPre) {
|
||
|
this._stackItem.rawText += str;
|
||
|
return;
|
||
|
}
|
||
|
|
||
|
if (this._stackItem.stashedLineBreaks) {
|
||
|
this._stackItem.inlineTextBuilder.startNewLine(this._stackItem.stashedLineBreaks);
|
||
|
}
|
||
|
this.whitespaceProcessor.addLiteral(
|
||
|
str,
|
||
|
this._stackItem.inlineTextBuilder,
|
||
|
this._stackItem.isNoWrap
|
||
|
);
|
||
|
this._stackItem.stashedLineBreaks = 0;
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Start building a new block.
|
||
|
*
|
||
|
* @param { object } [param0]
|
||
|
* Object holding the parameters of the block.
|
||
|
*
|
||
|
* @param { number } [param0.leadingLineBreaks]
|
||
|
* This block should have at least this number of line breaks to separate it from any preceding block.
|
||
|
*
|
||
|
* @param { number } [param0.reservedLineLength]
|
||
|
* Reserve this number of characters on each line for block markup.
|
||
|
*
|
||
|
* @param { boolean } [param0.isPre]
|
||
|
* Should HTML whitespace be preserved inside this block.
|
||
|
*/
|
||
|
openBlock ({ leadingLineBreaks = 1, reservedLineLength = 0, isPre = false } = {}) {
|
||
|
const maxLineLength = Math.max(20, this._stackItem.inlineTextBuilder.maxLineLength - reservedLineLength);
|
||
|
this._stackItem = new BlockStackItem(
|
||
|
this.options,
|
||
|
this._stackItem,
|
||
|
leadingLineBreaks,
|
||
|
maxLineLength
|
||
|
);
|
||
|
if (isPre) { this._stackItem.isPre = true; }
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Finalize currently built block, add it's content to the parent block.
|
||
|
*
|
||
|
* @param { object } [param0]
|
||
|
* Object holding the parameters of the block.
|
||
|
*
|
||
|
* @param { number } [param0.trailingLineBreaks]
|
||
|
* This block should have at least this number of line breaks to separate it from any following block.
|
||
|
*
|
||
|
* @param { (str: string) => string } [param0.blockTransform]
|
||
|
* A function to transform the block text before adding to the parent block.
|
||
|
* This happens after word wrap and should be used in combination with reserved line length
|
||
|
* in order to keep line lengths correct.
|
||
|
* Used for whole block markup.
|
||
|
*/
|
||
|
closeBlock ({ trailingLineBreaks = 1, blockTransform = undefined } = {}) {
|
||
|
const block = this._popStackItem();
|
||
|
const blockText = (blockTransform) ? blockTransform(getText(block)) : getText(block);
|
||
|
addText(this._stackItem, blockText, block.leadingLineBreaks, Math.max(block.stashedLineBreaks, trailingLineBreaks));
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Start building a new list.
|
||
|
*
|
||
|
* @param { object } [param0]
|
||
|
* Object holding the parameters of the list.
|
||
|
*
|
||
|
* @param { number } [param0.maxPrefixLength]
|
||
|
* Length of the longest list item prefix.
|
||
|
* If not supplied or too small then list items won't be aligned properly.
|
||
|
*
|
||
|
* @param { 'left' | 'right' } [param0.prefixAlign]
|
||
|
* Specify how prefixes of different lengths have to be aligned
|
||
|
* within a column.
|
||
|
*
|
||
|
* @param { number } [param0.interRowLineBreaks]
|
||
|
* Minimum number of line breaks between list items.
|
||
|
*
|
||
|
* @param { number } [param0.leadingLineBreaks]
|
||
|
* This list should have at least this number of line breaks to separate it from any preceding block.
|
||
|
*/
|
||
|
openList ({ maxPrefixLength = 0, prefixAlign = 'left', interRowLineBreaks = 1, leadingLineBreaks = 2 } = {}) {
|
||
|
this._stackItem = new ListStackItem(this.options, this._stackItem, {
|
||
|
interRowLineBreaks: interRowLineBreaks,
|
||
|
leadingLineBreaks: leadingLineBreaks,
|
||
|
maxLineLength: this._stackItem.inlineTextBuilder.maxLineLength,
|
||
|
maxPrefixLength: maxPrefixLength,
|
||
|
prefixAlign: prefixAlign
|
||
|
});
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Start building a new list item.
|
||
|
*
|
||
|
* @param {object} param0
|
||
|
* Object holding the parameters of the list item.
|
||
|
*
|
||
|
* @param { string } [param0.prefix]
|
||
|
* Prefix for this list item (item number, bullet point, etc).
|
||
|
*/
|
||
|
openListItem ({ prefix = '' } = {}) {
|
||
|
if (!(this._stackItem instanceof ListStackItem)) {
|
||
|
throw new Error('Can\'t add a list item to something that is not a list! Check the formatter.');
|
||
|
}
|
||
|
const list = this._stackItem;
|
||
|
const prefixLength = Math.max(prefix.length, list.maxPrefixLength);
|
||
|
const maxLineLength = Math.max(20, list.inlineTextBuilder.maxLineLength - prefixLength);
|
||
|
this._stackItem = new ListItemStackItem(this.options, list, {
|
||
|
prefix: prefix,
|
||
|
maxLineLength: maxLineLength,
|
||
|
leadingLineBreaks: list.interRowLineBreaks
|
||
|
});
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Finalize currently built list item, add it's content to the parent list.
|
||
|
*/
|
||
|
closeListItem () {
|
||
|
const listItem = this._popStackItem();
|
||
|
const list = listItem.next;
|
||
|
|
||
|
const prefixLength = Math.max(listItem.prefix.length, list.maxPrefixLength);
|
||
|
const spacing = '\n' + ' '.repeat(prefixLength);
|
||
|
const prefix = (list.prefixAlign === 'right')
|
||
|
? listItem.prefix.padStart(prefixLength)
|
||
|
: listItem.prefix.padEnd(prefixLength);
|
||
|
const text = prefix + getText(listItem).replace(/\n/g, spacing);
|
||
|
|
||
|
addText(
|
||
|
list,
|
||
|
text,
|
||
|
listItem.leadingLineBreaks,
|
||
|
Math.max(listItem.stashedLineBreaks, list.interRowLineBreaks)
|
||
|
);
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Finalize currently built list, add it's content to the parent block.
|
||
|
*
|
||
|
* @param { object } param0
|
||
|
* Object holding the parameters of the list.
|
||
|
*
|
||
|
* @param { number } [param0.trailingLineBreaks]
|
||
|
* This list should have at least this number of line breaks to separate it from any following block.
|
||
|
*/
|
||
|
closeList ({ trailingLineBreaks = 2 } = {}) {
|
||
|
const list = this._popStackItem();
|
||
|
const text = getText(list);
|
||
|
if (text) {
|
||
|
addText(this._stackItem, text, list.leadingLineBreaks, trailingLineBreaks);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Start building a table.
|
||
|
*/
|
||
|
openTable () {
|
||
|
this._stackItem = new TableStackItem(this._stackItem);
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Start building a table row.
|
||
|
*/
|
||
|
openTableRow () {
|
||
|
if (!(this._stackItem instanceof TableStackItem)) {
|
||
|
throw new Error('Can\'t add a table row to something that is not a table! Check the formatter.');
|
||
|
}
|
||
|
this._stackItem = new TableRowStackItem(this._stackItem);
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Start building a table cell.
|
||
|
*
|
||
|
* @param { object } [param0]
|
||
|
* Object holding the parameters of the cell.
|
||
|
*
|
||
|
* @param { number } [param0.maxColumnWidth]
|
||
|
* Wrap cell content to this width. Fall back to global wordwrap value if undefined.
|
||
|
*/
|
||
|
openTableCell ({ maxColumnWidth = undefined } = {}) {
|
||
|
if (!(this._stackItem instanceof TableRowStackItem)) {
|
||
|
throw new Error('Can\'t add a table cell to something that is not a table row! Check the formatter.');
|
||
|
}
|
||
|
this._stackItem = new TableCellStackItem(this.options, this._stackItem, maxColumnWidth);
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Finalize currently built table cell and add it to parent table row's cells.
|
||
|
*
|
||
|
* @param { object } [param0]
|
||
|
* Object holding the parameters of the cell.
|
||
|
*
|
||
|
* @param { number } [param0.colspan] How many columns this cell should occupy.
|
||
|
* @param { number } [param0.rowspan] How many rows this cell should occupy.
|
||
|
*/
|
||
|
closeTableCell ({ colspan = 1, rowspan = 1 } = {}) {
|
||
|
const cell = this._popStackItem();
|
||
|
const text = trimCharacter(getText(cell), '\n');
|
||
|
cell.next.cells.push({ colspan: colspan, rowspan: rowspan, text: text });
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Finalize currently built table row and add it to parent table's rows.
|
||
|
*/
|
||
|
closeTableRow () {
|
||
|
const row = this._popStackItem();
|
||
|
row.next.rows.push(row.cells);
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Finalize currently built table and add the rendered text to the parent block.
|
||
|
*
|
||
|
* @param { object } param0
|
||
|
* Object holding the parameters of the table.
|
||
|
*
|
||
|
* @param { TablePrinter } param0.tableToString
|
||
|
* A function to convert a table of stringified cells into a complete table.
|
||
|
*
|
||
|
* @param { number } [param0.leadingLineBreaks]
|
||
|
* This table should have at least this number of line breaks to separate if from any preceding block.
|
||
|
*
|
||
|
* @param { number } [param0.trailingLineBreaks]
|
||
|
* This table should have at least this number of line breaks to separate it from any following block.
|
||
|
*/
|
||
|
closeTable ({ tableToString, leadingLineBreaks = 2, trailingLineBreaks = 2 }) {
|
||
|
const table = this._popStackItem();
|
||
|
const output = tableToString(table.rows);
|
||
|
if (output) {
|
||
|
addText(this._stackItem, output, leadingLineBreaks, trailingLineBreaks);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Return the rendered text content of this builder.
|
||
|
*
|
||
|
* @returns { string }
|
||
|
*/
|
||
|
toString () {
|
||
|
return getText(this._stackItem.getRoot());
|
||
|
// There should only be the root item if everything is closed properly.
|
||
|
}
|
||
|
|
||
|
}
|
||
|
|
||
|
/**
 * Get the text accumulated by a stack item: its raw text followed by
 * whatever is still pending in its inline text builder.
 *
 * @param { BlockStackItem | ListItemStackItem | TableCellStackItem } stackItem Item to read.
 * @returns { string }
 * @throws { Error } When the item is not a block, a list item or a table cell.
 */
function getText (stackItem) {
  const holdsText = stackItem instanceof BlockStackItem
    || stackItem instanceof ListItemStackItem
    || stackItem instanceof TableCellStackItem;
  if (!holdsText) {
    throw new Error('Only blocks, list items and table cells can be requested for text contents.');
  }
  const pending = stackItem.inlineTextBuilder;
  if (pending.isEmpty()) {
    return stackItem.rawText;
  }
  return stackItem.rawText + pending.toString();
}
|
||
|
|
||
|
/**
 * Append a finished piece of text to a stack item.
 *
 * The item's pending inline text is folded into its raw text first.
 * When the item already has text, the new text is separated from it by
 * the larger of the stashed and the requested leading line breaks;
 * otherwise that number becomes the item's own leading line breaks.
 *
 * @param { BlockStackItem | ListItemStackItem | TableCellStackItem } stackItem Item to add to.
 * @param { string } text Text to add.
 * @param { number } leadingLineBreaks Line breaks required before the text.
 * @param { number } trailingLineBreaks Line breaks to stash for whatever comes next.
 * @throws { Error } When the item is not a block, a list item or a table cell.
 */
function addText (stackItem, text, leadingLineBreaks, trailingLineBreaks) {
  const canHoldText = stackItem instanceof BlockStackItem
    || stackItem instanceof ListItemStackItem
    || stackItem instanceof TableCellStackItem;
  if (!canHoldText) {
    throw new Error('Only blocks, list items and table cells can contain text.');
  }
  const existingText = getText(stackItem);
  const separatorCount = Math.max(stackItem.stashedLineBreaks, leadingLineBreaks);
  stackItem.inlineTextBuilder.clear();
  if (existingText) {
    stackItem.rawText = existingText + '\n'.repeat(separatorCount) + text;
  } else {
    stackItem.rawText = text;
    stackItem.leadingLineBreaks = separatorCount;
  }
  stackItem.stashedLineBreaks = trailingLineBreaks;
}
|
||
|
|
||
|
/**
 * Run a string through a chain of transformers, following `next` links
 * until the chain ends.
 *
 * @param { string } str A string to transform.
 * @param { TransformerStackItem } transformer A transformer item (with possible continuation).
 * @returns { string }
 */
function applyTransformer (str, transformer) {
  let result = str;
  for (let current = transformer; current; current = current.next) {
    result = current.transform(result);
  }
  return result;
}
|
||
|
|
||
|
/**
 * Compile selectors into a decision tree,
 * return a function intended for batch processing.
 *
 * Note: when `options.encodeCharacters` is not a function it is replaced
 * in place with a replacer built from the dictionary.
 *
 * @param { Options } [options = {}] HtmlToText options (defaults, formatters, user options merged, deduplicated).
 * @returns { (html: string, metadata?: any) => string } Pre-configured converter function.
 * @throws { Error } When some selectors have no format specified.
 * @static
 */
function compile$1 (options = {}) {
  const unformatted = options.selectors.filter(s => !s.format);
  if (unformatted.length) {
    const names = unformatted.map(s => `\`${s.selector}\``).join(', ');
    throw new Error('Following selectors have no specified format: ' + names);
  }

  const selectorEntries = options.selectors.map(s => [s.selector, s]);
  const picker = new selderee.DecisionTree(selectorEntries).build(pluginHtmlparser2.hp2Builder);

  if (typeof options.encodeCharacters !== 'function') {
    options.encodeCharacters = makeReplacerFromDict(options.encodeCharacters);
  }

  // Base selectors are mapped to their 1-based position.
  const baseEntries = options.baseElements.selectors.map((s, i) => [s, i + 1]);
  const baseSelectorsPicker = new selderee.DecisionTree(baseEntries).build(pluginHtmlparser2.hp2Builder);
  const findBaseElements = (dom) => findBases(dom, options, baseSelectorsPicker);

  // Runs instead of the walk once the maximum depth is reached.
  const onDepthLimit = (dom, builder) => {
    builder.addInline(options.limits.ellipsis || '');
  };
  const limitedWalk = limitedDepthRecursive(options.limits.maxDepth, recursiveWalk, onDepthLimit);

  return (html, metadata = undefined) => process(html, metadata, options, picker, findBaseElements, limitedWalk);
}
|
||
|
|
||
|
|
||
|
/**
 * Convert given HTML according to preprocessed options.
 *
 * Input longer than `options.limits.maxInputLength` is truncated
 * (with a console warning) before parsing.
 *
 * @param { string } html HTML content to convert.
 * @param { any } metadata Optional metadata for HTML document, for use in formatters.
 * @param { Options } options HtmlToText options (preprocessed).
 * @param { import('selderee').Picker<DomNode, TagDefinition> } picker
 * Tag definition picker for DOM nodes processing.
 * @param { (dom: DomNode[]) => DomNode[] } findBaseElements
 * Function to extract elements from HTML DOM
 * that will only be present in the output text.
 * @param { RecursiveCallback } walk Recursive callback.
 * @returns { string }
 */
function process (html, metadata, options, picker, findBaseElements, walk) {
  const { maxInputLength } = options.limits;
  let input = html;
  if (maxInputLength && html && html.length > maxInputLength) {
    console.warn(
      `Input length ${html.length} is above allowed limit of ${maxInputLength}. Truncating without ellipsis.`
    );
    input = html.substring(0, maxInputLength);
  }

  const parsed = htmlparser2.parseDocument(input, { decodeEntities: options.decodeEntities });
  const bases = findBaseElements(parsed.children);
  const builder = new BlockTextBuilder(options, picker, metadata);
  walk(bases, builder);
  return builder.toString();
}
|
||
|
|
||
|
|
||
|
/**
 * Collect the elements matched by the base selectors.
 *
 * Tag nodes are visited in document order; a matched element is recorded
 * and not descended into. Collection stops once `limits.maxBaseElements`
 * matches are found.
 *
 * @param { DomNode[] } dom Nodes to search in.
 * @param { Options } options HtmlToText options (preprocessed).
 * @param { import('selderee').Picker<DomNode, number> } baseSelectorsPicker
 * Picker returning the 1-based index of the matched base selector.
 * @returns { DomNode[] }
 * Matched elements, or `dom` itself when nothing matched and
 * `baseElements.returnDomByDefault` is set.
 */
function findBases (dom, options, baseSelectorsPicker) {
  const matches = [];

  const collect = (descend, /** @type { DomNode[] } */ nodes) => {
    for (const node of nodes.slice(0, options.limits.maxChildNodes)) {
      if (node.type !== 'tag') { continue; }
      const selectorIndex = baseSelectorsPicker.pick1(node);
      if (selectorIndex > 0) {
        matches.push({ selectorIndex, element: node });
      } else if (node.children) {
        descend(node.children);
      }
      if (matches.length >= options.limits.maxBaseElements) { return; }
    }
  };

  const limitedCollect = limitedDepthRecursive(options.limits.maxDepth, collect);
  limitedCollect(dom);

  if (options.baseElements.orderBy !== 'occurrence') { // 'selectors'
    matches.sort((a, b) => a.selectorIndex - b.selectorIndex);
  }
  if (options.baseElements.returnDomByDefault && matches.length === 0) {
    return dom;
  }
  return matches.map(match => match.element);
}
|
||
|
|
||
|
/**
 * Walk through DOM nodes and accumulate their string representations.
 *
 * Text nodes are added inline; tag nodes are handed to the formatter
 * selected by the builder's picker. Nodes beyond `limits.maxChildNodes`
 * are replaced by a single ellipsis text node (the input array is not modified).
 *
 * @param { RecursiveCallback } walk Recursive callback.
 * @param { DomNode[] } [dom] Nodes array to process.
 * @param { BlockTextBuilder } builder Passed around to accumulate output text.
 * @private
 */
function recursiveWalk (walk, dom, builder) {
  if (!dom) { return; }

  const { options } = builder;
  const { limits } = options;

  let nodes = dom;
  if (dom.length > limits.maxChildNodes) {
    nodes = dom.slice(0, limits.maxChildNodes);
    nodes.push({ data: limits.ellipsis, type: 'text' });
  }

  for (const elem of nodes) {
    if (elem.type === 'text') {
      builder.addInline(elem.data);
    } else if (elem.type === 'tag') {
      const tagDefinition = builder.picker.pick1(elem);
      const format = options.formatters[tagDefinition.format];
      format(elem, walk, builder, tagDefinition.options || {});
    }
  }
}
|
||
|
|
||
|
/**
 * Build a function that replaces characters according to a dictionary.
 *
 * First code point from dict keys is used.
 * Compound emojis with ZWJ are not supported (not until Node 16).
 *
 * Entries whose value is `false` are ignored. When no usable entries
 * remain, no replacer is created: an empty pattern would match at every
 * position and insert the string "undefined" there.
 *
 * @param { Object<string,string | false> } dict
 * A dictionary where keys are characters to replace
 * and values are replacement strings.
 *
 * @returns { ((str: string) => string) | undefined }
 */
function makeReplacerFromDict (dict) {
  if (!dict) {
    return undefined;
  }
  /** @type { [string, string][] } */
  const entries = Object.entries(dict).filter(([, v]) => v !== false);
  if (entries.length === 0) {
    return undefined;
  }
  // One capture group per key, so the matched group index selects the replacement.
  const regex = new RegExp(
    entries
      .map(([c]) => `(${unicodeEscape([...c][0])})`)
      .join('|'),
    'g'
  );
  const values = entries.map(([, v]) => v);
  const replacer = (m, ...cgs) => values[cgs.findIndex(cg => cg)];
  return (str) => str.replace(regex, replacer);
}
|
||
|
|
||
|
/**
 * Formatter that ignores the element entirely: nothing is walked
 * and nothing is added to the output.
 *
 * @type { FormatCallback }
 */
function formatSkip (elem, walk, builder, formatOptions) {
  // Intentionally empty.
}
|
||
|
|
||
|
/**
 * Replace the tag with a string literal added inline.
 * The literal comes from `formatOptions.string` (empty string when not set).
 *
 * @type { FormatCallback }
 */
function formatInlineString (elem, walk, builder, formatOptions) {
  const literal = formatOptions.string || '';
  builder.addLiteral(literal);
}
|
||
|
|
||
|
/**
 * Replace the tag with a block containing a string literal.
 * The literal comes from `formatOptions.string` (empty string when not set).
 *
 * @type { FormatCallback }
 */
function formatBlockString (elem, walk, builder, formatOptions) {
  const leadingLineBreaks = formatOptions.leadingLineBreaks || 2;
  builder.openBlock({ leadingLineBreaks });
  builder.addLiteral(formatOptions.string || '');
  const trailingLineBreaks = formatOptions.trailingLineBreaks || 2;
  builder.closeBlock({ trailingLineBreaks });
}
|
||
|
|
||
|
/**
 * Process an inline-level element: its children are walked
 * and nothing is added for the element itself.
 *
 * @type { FormatCallback }
 */
function formatInline (elem, walk, builder, formatOptions) {
  const { children } = elem;
  walk(children, builder);
}
|
||
|
|
||
|
/**
 * Process a block-level container: the children are walked inside a block.
 * Leading and trailing line breaks fall back to 2 when not set.
 *
 * @type { FormatCallback }
 */
function formatBlock$1 (elem, walk, builder, formatOptions) {
  const leadingLineBreaks = formatOptions.leadingLineBreaks || 2;
  builder.openBlock({ leadingLineBreaks });
  walk(elem.children, builder);
  const trailingLineBreaks = formatOptions.trailingLineBreaks || 2;
  builder.closeBlock({ trailingLineBreaks });
}
|
||
|
|
||
|
/**
 * Render the opening HTML tag of an element, including its attributes.
 *
 * Attributes with an empty value are rendered as bare names; other values
 * are wrapped in double quotes with inner double quotes escaped as `&quot;`.
 *
 * @param { DomNode } elem Element to render the tag for.
 * @returns { string }
 */
function renderOpenTag (elem) {
  // `attribs` is a plain object, so it has no `length`; go through its entries instead.
  const attrs = Object.entries(elem.attribs || {})
    .map(([k, v]) => ((v === '') ? ` ${k}` : ` ${k}="${String(v).replace(/"/g, '&quot;')}"`))
    .join('');
  return `<${elem.name}${attrs}>`;
}
|
||
|
|
||
|
/**
 * Render the closing HTML tag of an element.
 *
 * @param { DomNode } elem Element to render the tag for.
 * @returns { string }
 */
function renderCloseTag (elem) {
  const { name } = elem;
  return `</${name}>`;
}
|
||
|
|
||
|
/**
 * Render an element as an inline HTML tag and walk through its children.
 * The opening and closing tags are added as literals with wrapping turned off.
 *
 * @type { FormatCallback }
 */
function formatInlineTag (elem, walk, builder, formatOptions) {
  // Adds the markup produced by `render` as a single non-wrapping literal.
  const emitUnwrapped = (render) => {
    builder.startNoWrap();
    builder.addLiteral(render(elem));
    builder.stopNoWrap();
  };

  emitUnwrapped(renderOpenTag);
  walk(elem.children, builder);
  emitUnwrapped(renderCloseTag);
}
|
||
|
|
||
|
/**
 * Render an element as an HTML block tag and walk through its children.
 * The opening and closing tags are added as literals with wrapping turned off.
 *
 * @type { FormatCallback }
 */
function formatBlockTag (elem, walk, builder, formatOptions) {
  // Adds the markup produced by `render` as a single non-wrapping literal.
  const emitUnwrapped = (render) => {
    builder.startNoWrap();
    builder.addLiteral(render(elem));
    builder.stopNoWrap();
  };

  builder.openBlock({ leadingLineBreaks: formatOptions.leadingLineBreaks || 2 });
  emitUnwrapped(renderOpenTag);
  walk(elem.children, builder);
  emitUnwrapped(renderCloseTag);
  builder.closeBlock({ trailingLineBreaks: formatOptions.trailingLineBreaks || 2 });
}
|
||
|
|
||
|
/**
 * Render an element with all its children as inline HTML.
 * The serialized markup is added as a literal with wrapping turned off.
 *
 * @type { FormatCallback }
 */
function formatInlineHtml (elem, walk, builder, formatOptions) {
  builder.startNoWrap();
  const markup = domSerializer.render(elem, { decodeEntities: builder.options.decodeEntities });
  builder.addLiteral(markup);
  builder.stopNoWrap();
}
|
||
|
|
||
|
/**
 * Render an element with all its children as an HTML block.
 * The serialized markup is added as a literal with wrapping turned off.
 *
 * @type { FormatCallback }
 */
function formatBlockHtml (elem, walk, builder, formatOptions) {
  builder.openBlock({ leadingLineBreaks: formatOptions.leadingLineBreaks || 2 });
  builder.startNoWrap();
  const markup = domSerializer.render(elem, { decodeEntities: builder.options.decodeEntities });
  builder.addLiteral(markup);
  builder.stopNoWrap();
  builder.closeBlock({ trailingLineBreaks: formatOptions.trailingLineBreaks || 2 });
}
|
||
|
|
||
|
/**
 * Render an inline element wrapped with the given strings:
 * `formatOptions.prefix` before its children and `formatOptions.suffix` after
 * (each an empty string when not set).
 *
 * @type { FormatCallback }
 */
function formatInlineSurround (elem, walk, builder, formatOptions) {
  const opening = formatOptions.prefix || '';
  builder.addLiteral(opening);
  walk(elem.children, builder);
  const closing = formatOptions.suffix || '';
  builder.addLiteral(closing);
}
|
||
|
|
||
|
// Frozen, prototype-less lookup of the generic formatters by format name.
var genericFormatters = /*#__PURE__*/Object.freeze({
  __proto__: null,
  block: formatBlock$1,
  blockHtml: formatBlockHtml,
  blockString: formatBlockString,
  blockTag: formatBlockTag,
  inline: formatInline,
  inlineHtml: formatInlineHtml,
  inlineString: formatInlineString,
  inlineSurround: formatInlineSurround,
  inlineTag: formatInlineTag,
  skip: formatSkip
});
|
||
|
|
||
|
/**
 * Get row `j` of a matrix, creating (and storing) an empty row when it is missing.
 *
 * @param { any[][] } matrix Array of rows.
 * @param { number } j Row index.
 * @returns { any[] }
 */
function getRow (matrix, j) {
  let row = matrix[j];
  if (!row) {
    row = [];
    matrix[j] = row;
  }
  return row;
}
|
||
|
|
||
|
/**
 * Find the first index at or after `x` whose slot in the row is falsy.
 *
 * @param { any[] } row Row to scan.
 * @param { number } [x = 0] Index to start from.
 * @returns { number }
 */
function findFirstVacantIndex (row, x = 0) {
  let index = x;
  while (row[index]) {
    index += 1;
  }
  return index;
}
|
||
|
|
||
|
/**
 * Transpose the top-left `maxSize` x `maxSize` part of a matrix in place.
 * Missing rows are created; a pair of slots is swapped only when
 * at least one of them holds a truthy value.
 *
 * @param { any[][] } matrix Array of rows (can be ragged).
 * @param { number } maxSize Number of rows and columns to transpose.
 */
function transposeInPlace (matrix, maxSize) {
  // Returns the row at the index, creating it when missing.
  const rowAt = (index) => {
    if (!matrix[index]) { matrix[index] = []; }
    return matrix[index];
  };

  for (let i = 0; i < maxSize; i++) {
    const lower = rowAt(i);
    for (let j = 0; j < i; j++) {
      const upper = rowAt(j);
      if (lower[j] || upper[i]) {
        [lower[j], upper[i]] = [upper[i], lower[j]];
      }
    }
  }
}
|
||
|
|
||
|
/**
 * Put a cell into every layout slot it occupies, starting at
 * (`baseRow`, `baseCol`) and covering `cell.rowspan` rows and
 * `cell.colspan` columns. Missing layout rows are created.
 *
 * @param { TablePrinterCell } cell Cell to place.
 * @param { TablePrinterCell[][] } layout Layout matrix to fill.
 * @param { number } baseRow Row index of the top left corner.
 * @param { number } baseCol Column index of the top left corner.
 */
function putCellIntoLayout (cell, layout, baseRow, baseCol) {
  for (let rowShift = 0; rowShift < cell.rowspan; rowShift++) {
    const rowIndex = baseRow + rowShift;
    if (!layout[rowIndex]) { layout[rowIndex] = []; }
    const layoutRow = layout[rowIndex];
    for (let colShift = 0; colShift < cell.colspan; colShift++) {
      layoutRow[baseCol + colShift] = cell;
    }
  }
}
|
||
|
|
||
|
/**
 * Get the offset stored at the index. A missing offset is initialized first:
 * to 0 at index 0, otherwise to one more than the (possibly also
 * initialized) offset at the previous index.
 *
 * @param { number[] } offsets Offsets array, updated in place.
 * @param { number } index Index of the requested offset.
 * @returns { number }
 */
function getOrInitOffset (offsets, index) {
  const known = offsets[index];
  if (known !== undefined) {
    return known;
  }
  const initial = (index === 0)
    ? 0
    : 1 + getOrInitOffset(offsets, index - 1);
  offsets[index] = initial;
  return initial;
}
|
||
|
|
||
|
/**
 * Raise the offset at `base + span` so that it is at least
 * `value` past the offset at `base`. An already larger offset is kept.
 *
 * @param { number[] } offsets Offsets array, updated in place.
 * @param { number } base Index of the starting offset.
 * @param { number } span Distance from the base to the offset being updated.
 * @param { number } value Minimum distance required between the two offsets.
 */
function updateOffset (offsets, base, span, value) {
  const target = base + span;
  const current = getOrInitOffset(offsets, target);
  const required = getOrInitOffset(offsets, base) + value;
  offsets[target] = Math.max(current, required);
}
|
||
|
|
||
|
/**
 * Render a table into a string.
 * Cells can contain multiline text and span across multiple rows and columns.
 *
 * Modifies cells: adds a `lines` array and a `rendered` flag to each of them.
 *
 * @param { TablePrinterCell[][] } tableRows Table to render.
 * @param { number } rowSpacing Number of empty lines between rows.
 * @param { number } colSpacing Number of spaces between columns.
 * @returns { string }
 */
function tableToString (tableRows, rowSpacing, colSpacing) {
  const layout = [];
  let colNumber = 0;
  const rowNumber = tableRows.length;
  const rowOffsets = [0];
  // Fill the layout table and row offsets row-by-row.
  for (let j = 0; j < rowNumber; j++) {
    const layoutRow = getRow(layout, j);
    let x = 0;
    for (const cell of tableRows[j]) {
      // Slots can already be taken by cells spanning down from the rows above.
      x = findFirstVacantIndex(layoutRow, x);
      putCellIntoLayout(cell, layout, j, x);
      x += cell.colspan;
      cell.lines = cell.text.split('\n');
      updateOffset(rowOffsets, j, cell.rowspan, cell.lines.length + rowSpacing);
    }
    colNumber = Math.max(colNumber, layoutRow.length);
  }

  // After this, `layout[x]` holds column `x`.
  transposeInPlace(layout, Math.max(rowNumber, colNumber));

  const outputLines = [];
  const colOffsets = [0];
  // Fill column offsets and output lines column-by-column.
  for (let x = 0; x < colNumber; x++) {
    const column = layout[x];
    const rowsInThisColumn = Math.min(rowNumber, column.length);
    let y = 0;
    while (y < rowsInThisColumn) {
      const cell = column[y];
      if (!cell) {
        // Vacant slot: only make sure the output line exists.
        const lineOffset = rowOffsets[y];
        outputLines[lineOffset] = (outputLines[lineOffset] || '');
        y++;
        continue;
      }
      // A cell spanning several columns is printed once, in its first column.
      if (!cell.rendered) {
        let cellWidth = 0;
        for (let j = 0; j < cell.lines.length; j++) {
          const line = cell.lines[j];
          const lineOffset = rowOffsets[y] + j;
          outputLines[lineOffset] = (outputLines[lineOffset] || '').padEnd(colOffsets[x]) + line;
          cellWidth = Math.max(cellWidth, line.length);
        }
        updateOffset(colOffsets, x, cell.colspan, cellWidth + colSpacing);
        cell.rendered = true;
      }
      y += cell.rowspan;
    }
  }

  return outputLines.join('\n');
}
|
||
|
|
||
|
/**
 * Process a line-break: a single line break is added to the builder.
 *
 * @type { FormatCallback }
 */
function formatLineBreak (elem, walk, builder, formatOptions) {
  // The element, its children and the options are not used.
  builder.addLineBreak();
}
|
||
|
|
||
|
/**
 * Process a `wbr` tag: a word break opportunity is added to the builder.
 *
 * @type { FormatCallback }
 */
function formatWbr (elem, walk, builder, formatOptions) {
  // The element, its children and the options are not used.
  builder.addWordBreakOpportunity();
}
|
||
|
|
||
|
/**
 * Process a horizontal line: a block holding a run of dashes.
 * The length is taken from `formatOptions.length`, then from the
 * `wordwrap` option, then falls back to 40.
 *
 * @type { FormatCallback }
 */
function formatHorizontalLine (elem, walk, builder, formatOptions) {
  builder.openBlock({ leadingLineBreaks: formatOptions.leadingLineBreaks || 2 });
  const lineLength = formatOptions.length || builder.options.wordwrap || 40;
  builder.addInline('-'.repeat(lineLength));
  builder.closeBlock({ trailingLineBreaks: formatOptions.trailingLineBreaks || 2 });
}
|
||
|
|
||
|
/**
 * Process a paragraph: the children are walked inside a block.
 * Leading and trailing line breaks fall back to 2 when not set.
 *
 * @type { FormatCallback }
 */
function formatParagraph (elem, walk, builder, formatOptions) {
  const leadingLineBreaks = formatOptions.leadingLineBreaks || 2;
  builder.openBlock({ leadingLineBreaks });
  walk(elem.children, builder);
  const trailingLineBreaks = formatOptions.trailingLineBreaks || 2;
  builder.closeBlock({ trailingLineBreaks });
}
|
||
|
|
||
|
/**
 * Process preformatted content: the children are walked inside
 * a block opened with the `isPre` flag.
 *
 * @type { FormatCallback }
 */
function formatPre (elem, walk, builder, formatOptions) {
  const blockParams = {
    isPre: true,
    leadingLineBreaks: formatOptions.leadingLineBreaks || 2
  };
  builder.openBlock(blockParams);
  walk(elem.children, builder);
  builder.closeBlock({ trailingLineBreaks: formatOptions.trailingLineBreaks || 2 });
}
|
||
|
|
||
|
/**
 * Process a heading: the children are walked inside a block.
 * Words are transformed to upper case unless `formatOptions.uppercase` is `false`.
 *
 * @type { FormatCallback }
 */
function formatHeading (elem, walk, builder, formatOptions) {
  builder.openBlock({ leadingLineBreaks: formatOptions.leadingLineBreaks || 2 });
  const shouldUppercase = formatOptions.uppercase !== false;
  if (shouldUppercase) {
    builder.pushWordTransform(str => str.toUpperCase());
  }
  walk(elem.children, builder);
  if (shouldUppercase) {
    builder.popWordTransform();
  }
  builder.closeBlock({ trailingLineBreaks: formatOptions.trailingLineBreaks || 2 });
}
|
||
|
|
||
|
/**
 * Process a blockquote: the children are walked inside a block that
 * reserves 2 characters of line length, and every line of the resulting
 * text is prefixed with `> `. Newlines are trimmed from both ends first,
 * unless `formatOptions.trimEmptyLines` is `false`.
 *
 * @type { FormatCallback }
 */
function formatBlockquote (elem, walk, builder, formatOptions) {
  const quoteLines = (str) => {
    const body = (formatOptions.trimEmptyLines !== false) ? trimCharacter(str, '\n') : str;
    return body
      .split('\n')
      .map(line => '> ' + line)
      .join('\n');
  };

  builder.openBlock({
    leadingLineBreaks: formatOptions.leadingLineBreaks || 2,
    reservedLineLength: 2
  });
  walk(elem.children, builder);
  builder.closeBlock({
    trailingLineBreaks: formatOptions.trailingLineBreaks || 2,
    blockTransform: quoteLines
  });
}
|
||
|
|
||
|
/**
 * Surround a string with brackets.
 *
 * @param { string } str String to surround.
 * @param { any } brackets
 * Nothing is added when falsy. Otherwise items 0 and 1 are used as the left
 * and right bracket; an item that is not a string falls back to `[` or `]`.
 * @returns { string }
 */
function withBrackets (str, brackets) {
  if (!brackets) { return str; }

  const [left, right] = [brackets[0], brackets[1]];
  const lbr = (typeof left === 'string') ? left : '[';
  const rbr = (typeof right === 'string') ? right : ']';
  return lbr + str + rbr;
}
|
||
|
|
||
|
/**
 * Apply the optional rewriter to a path, then prepend the base URL
 * (without trailing slashes) when the result starts with `/`.
 *
 * @param { string } path Path to rewrite.
 * @param { any } rewriter Called with (path, metadata, elem) when it is a function; ignored otherwise.
 * @param { string } baseUrl Base URL for paths starting with `/`; nothing is prepended when falsy.
 * @param { any } metadata Passed through to the rewriter.
 * @param { DomNode } elem Passed through to the rewriter.
 * @returns { string }
 */
function pathRewrite (path, rewriter, baseUrl, metadata, elem) {
  let modifiedPath = path;
  if (typeof rewriter === 'function') {
    modifiedPath = rewriter(path, metadata, elem);
  }
  const isRootRelative = modifiedPath[0] === '/';
  if (isRootRelative && baseUrl) {
    return trimCharacterEnd(baseUrl, '/') + modifiedPath;
  }
  return modifiedPath;
}
|
||
|
|
||
|
/**
 * Process an image: adds its alt text and/or its (rewritten) source inline.
 *
 * - no source: the alt text alone;
 * - no alt text: the source in brackets;
 * - both: the alt text, a space, then the source in brackets.
 *
 * The text is added with `noWordTransform` set.
 *
 * @type { FormatCallback }
 */
function formatImage (elem, walk, builder, formatOptions) {
  const attribs = elem.attribs || {};
  const alt = attribs.alt || '';
  const src = attribs.src
    ? pathRewrite(attribs.src, formatOptions.pathRewrite, formatOptions.baseUrl, builder.metadata, elem)
    : '';

  let text;
  if (!src) {
    text = alt;
  } else if (!alt) {
    text = withBrackets(src, formatOptions.linkBrackets);
  } else {
    text = alt + ' ' + withBrackets(src, formatOptions.linkBrackets);
  }

  builder.addInline(text, { noWordTransform: true });
}
|
||
|
|
||
|
// Design notes on anchor and image options:
//
// a img baseUrl
// a img pathRewrite
// a img linkBrackets
//
// a     ignoreHref: false
//         ignoreText ?
// a     noAnchorUrl: true
//         can be replaced with selector
// a     hideLinkHrefIfSameAsText: false
//         how to compare, what to show (text, href, normalized) ?
// a     mailto protocol removed without options
//
// a     protocols: mailto, tel, ...
//         can be matched with selector?
//
// anchors, protocols - only if no pathRewrite fn is provided
//
// normalize-url ?
//
// a
// a[href^="#"] - format:skip by default
// a[href^="mailto:"] - ?
|
||
|
|
||
|
/**
|
||
|
* Process an anchor.
|
||
|
*
|
||
|
* @type { FormatCallback }
|
||
|
*/
|
||
|
function formatAnchor (elem, walk, builder, formatOptions) {
|
||
|
function getHref () {
|
||
|
if (formatOptions.ignoreHref) { return ''; }
|
||
|
if (!elem.attribs || !elem.attribs.href) { return ''; }
|
||
|
let href = elem.attribs.href.replace(/^mailto:/, '');
|
||
|
if (formatOptions.noAnchorUrl && href[0] === '#') { return ''; }
|
||
|
href = pathRewrite(href, formatOptions.pathRewrite, formatOptions.baseUrl, builder.metadata, elem);
|
||
|
return href;
|
||
|
}
|
||
|
const href = getHref();
|
||
|
if (!href) {
|
||
|
walk(elem.children, builder);
|
||
|
} else {
|
||
|
let text = '';
|
||
|
builder.pushWordTransform(
|
||
|
str => {
|
||
|
if (str) { text += str; }
|
||
|
return str;
|
||
|
}
|
||
|
);
|
||
|
walk(elem.children, builder);
|
||
|
builder.popWordTransform();
|
||
|
|
||
|
const hideSameLink = formatOptions.hideLinkHrefIfSameAsText && href === text;
|
||
|
if (!hideSameLink) {
|
||
|
builder.addInline(
|
||
|
(!text)
|
||
|
? href
|
||
|
: ' ' + withBrackets(href, formatOptions.linkBrackets),
|
||
|
{ noWordTransform: true }
|
||
|
);
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
/**
 * Process a list: every child that is not whitespace-only text becomes
 * a list item; `li` children get a prefix from the callback, other
 * children get an empty prefix. Nothing is built for a list without items.
 *
 * A list is treated as nested when its parent element is `li`: prefixes
 * lose their leading whitespace and 1 leading/trailing line break is used.
 *
 * @param { DomNode } elem List items with their prefixes.
 * @param { RecursiveCallback } walk Recursive callback to process child nodes.
 * @param { BlockTextBuilder } builder Passed around to accumulate output text.
 * @param { FormatOptions } formatOptions Options specific to a formatter.
 * @param { () => string } nextPrefixCallback Function that returns increasing index each time it is called.
 */
function formatList (elem, walk, builder, formatOptions, nextPrefixCallback) {
  const isNestedList = get(elem, ['parent', 'name']) === 'li';

  // With Roman numbers, index length is not as straightforward as with Arabic numbers or letters,
  // so the dumb length comparison is the most robust way to get the correct value.
  let maxPrefixLength = 0;
  const listItems = [];
  for (const child of (elem.children || [])) {
    // it might be more accurate to check only for html spaces here, but no significant benefit
    const isBlankText = child.type === 'text' && /^\s*$/.test(child.data);
    if (isBlankText) { continue; }

    if (child.name !== 'li') {
      listItems.push({ node: child, prefix: '' });
      continue;
    }
    const rawPrefix = nextPrefixCallback();
    const prefix = isNestedList ? rawPrefix.trimStart() : rawPrefix;
    if (prefix.length > maxPrefixLength) { maxPrefixLength = prefix.length; }
    listItems.push({ node: child, prefix });
  }
  if (listItems.length === 0) { return; }

  builder.openList({
    interRowLineBreaks: 1,
    leadingLineBreaks: isNestedList ? 1 : (formatOptions.leadingLineBreaks || 2),
    maxPrefixLength,
    prefixAlign: 'left'
  });

  for (const { node, prefix } of listItems) {
    builder.openListItem({ prefix });
    walk([node], builder);
    builder.closeListItem();
  }

  builder.closeList({ trailingLineBreaks: isNestedList ? 1 : (formatOptions.trailingLineBreaks || 2) });
}
|
||
|
|
||
|
/**
 * Process an unordered list. Every item gets the same prefix:
 * `formatOptions.itemPrefix`, or ` * ` when it is not set.
 *
 * @type { FormatCallback }
 */
function formatUnorderedList (elem, walk, builder, formatOptions) {
  const bullet = formatOptions.itemPrefix || ' * ';
  const nextPrefixCallback = () => bullet;
  return formatList(elem, walk, builder, formatOptions, nextPrefixCallback);
}
|
||
|
|
||
|
/**
 * Process an ordered list.
 *
 * Item indexes start from the integer value of the `start` attribute
 * (1 when it is missing or not a number) and are rendered according to
 * the `type` attribute. Each prefix has the form ` <index>. `.
 *
 * @type { FormatCallback }
 */
function formatOrderedList (elem, walk, builder, formatOptions) {
  const attribs = elem.attribs || {};
  // A non-numeric `start` would otherwise produce `NaN` prefixes.
  const parsedStart = Number.parseInt(attribs.start, 10);
  let nextIndex = Number.isNaN(parsedStart) ? 1 : parsedStart;
  const indexFunction = getOrderedListIndexFunction(attribs.type);
  const nextPrefixCallback = () => ' ' + indexFunction(nextIndex++) + '. ';
  return formatList(elem, walk, builder, formatOptions, nextPrefixCallback);
}
|
||
|
|
||
|
/**
 * Return a function that can be used to generate index markers of a specified format.
 *
 * `a`/`A` give lower/upper case letter sequences, `i`/`I` give lower/upper
 * case Roman numbers; any other type gives plain decimal numbers.
 *
 * @param { string } [olType='1'] Marker type.
 * @returns { (i: number) => string }
 */
function getOrderedListIndexFunction (olType = '1') {
  if (olType === 'a' || olType === 'A') {
    // The marker type doubles as the base letter of the sequence.
    return (i) => numberToLetterSequence(i, olType);
  }
  if (olType === 'i') {
    return (i) => numberToRoman(i).toLowerCase();
  }
  if (olType === 'I') {
    return (i) => numberToRoman(i);
  }
  return (i) => (i).toString();
}
|
||
|
|
||
|
/**
 * Given a list of class and ID selectors (prefixed with '.' and '#'),
 * return them as separate lists of names without prefixes.
 * Selectors with any other prefix are ignored.
 *
 * @param { string[] } selectors Class and ID selectors (`[".class", "#id"]` etc).
 * @returns { { classes: string[], ids: string[] } }
 */
function splitClassesAndIds (selectors) {
  const result = { classes: [], ids: [] };
  for (const selector of selectors) {
    const isClass = selector.startsWith('.');
    if (!isClass && !selector.startsWith('#')) { continue; }
    const bucket = isClass ? result.classes : result.ids;
    bucket.push(selector.substring(1));
  }
  return result;
}
|
||
|
|
||
|
/**
 * Decide whether a table element has to be rendered as a data table.
 *
 * @param { object | undefined } attr Attributes of the table element.
 * @param { true | string[] } tables Either `true` (every table is a data table)
 * or a list of class and ID selectors (`[".class", "#id"]` etc).
 * @returns { boolean } `true` when `tables` is `true` or when one of the
 * element's class or id tokens is named by the selectors.
 */
function isDataTable (attr, tables) {
  if (tables === true) { return true; }
  if (!attr) { return false; }

  const { classes, ids } = splitClassesAndIds(tables);
  // HTML separates attribute tokens by any run of ASCII whitespace
  // (space, tab, LF, FF, CR), not only by a single U+0020 space.
  const tokenSeparator = /[ \t\n\f\r]+/;
  const attrClasses = (attr['class'] || '').split(tokenSeparator);
  const attrIds = (attr['id'] || '').split(tokenSeparator);

  return attrClasses.some(x => classes.includes(x)) || attrIds.some(x => ids.includes(x));
}
|
||
|
|
||
|
/**
 * Process a table: as a data table when the element matches the `tables`
 * option of the builder, as a plain block container otherwise.
 *
 * @type { FormatCallback }
 */
function formatTable (elem, walk, builder, formatOptions) {
  if (isDataTable(elem.attribs, builder.options.tables)) {
    return formatDataTable(elem, walk, builder, formatOptions);
  }
  return formatBlock(elem, walk, builder, formatOptions);
}
|
||
|
|
||
|
/**
 * Process an element as a plain block: open a block on the builder,
 * walk the element's children, then close the block.
 * The surrounding line breaks are taken from `formatOptions`.
 *
 * @type { FormatCallback }
 */
function formatBlock (elem, walk, builder, formatOptions) {
  const openingOptions = { leadingLineBreaks: formatOptions.leadingLineBreaks };
  builder.openBlock(openingOptions);

  walk(elem.children, builder);

  const closingOptions = { trailingLineBreaks: formatOptions.trailingLineBreaks };
  builder.closeBlock(closingOptions);
}
|
||
|
|
||
|
/**
 * Process a data table.
 *
 * Opens a table on the builder, feeds it every row (`tr`) found directly in
 * the element or inside `thead` / `tbody` / `tfoot` / `center` wrappers,
 * then closes the table with the spacing and line break settings
 * from `formatOptions`.
 *
 * @type { FormatCallback }
 */
function formatDataTable (elem, walk, builder, formatOptions) {
  builder.openTable();

  // Render one cell. The span attributes are coerced to numbers;
  // falsy results (0, NaN) fall back to 1.
  const renderCell = (cellNode) => {
    const colspan = +get(cellNode, ['attribs', 'colspan']) || 1;
    const rowspan = +get(cellNode, ['attribs', 'rowspan']) || 1;
    builder.openTableCell({ maxColumnWidth: formatOptions.maxColumnWidth });
    walk(cellNode.children, builder);
    builder.closeTableCell({ colspan: colspan, rowspan: rowspan });
  };

  // Header cells are uppercased unless `uppercaseHeaderCells` is exactly `false`.
  const renderHeaderCell = (formatOptions.uppercaseHeaderCells === false)
    ? renderCell
    : (cellNode) => {
      builder.pushWordTransform(str => str.toUpperCase());
      renderCell(cellNode);
      builder.popWordTransform();
    };

  // Render one `tr`; children other than `th` / `td` tags are ignored.
  const renderRow = (rowNode) => {
    builder.openTableRow();
    for (const cellCandidate of rowNode.children) {
      if (cellCandidate.type !== 'tag') { continue; }
      if (cellCandidate.name === 'th') {
        renderHeaderCell(cellCandidate);
      } else if (cellCandidate.name === 'td') {
        renderCell(cellCandidate);
      }
    }
    builder.closeTableRow();
  };

  // Descend through row group wrappers until rows are found;
  // any other node is skipped together with its subtree.
  const visit = (node) => {
    if (node.type !== 'tag') { return; }

    const tagName = node.name;
    if (tagName === 'thead' || tagName === 'tbody' || tagName === 'tfoot' || tagName === 'center') {
      node.children.forEach(visit);
      return;
    }
    if (tagName === 'tr') {
      renderRow(node);
    }
  };

  elem.children.forEach(visit);

  builder.closeTable({
    tableToString: (rows) => tableToString(rows, formatOptions.rowSpacing ?? 0, formatOptions.colSpacing ?? 3),
    leadingLineBreaks: formatOptions.leadingLineBreaks,
    trailingLineBreaks: formatOptions.trailingLineBreaks
  });
}
|
||
|
|
||
|
// Frozen, prototype-less lookup table of the text formatters, keyed by format name.
var textFormatters = /*#__PURE__*/Object.freeze(Object.assign(Object.create(null), {
  anchor: formatAnchor,
  blockquote: formatBlockquote,
  dataTable: formatDataTable,
  heading: formatHeading,
  horizontalLine: formatHorizontalLine,
  image: formatImage,
  lineBreak: formatLineBreak,
  orderedList: formatOrderedList,
  paragraph: formatParagraph,
  pre: formatPre,
  table: formatTable,
  unorderedList: formatUnorderedList,
  wbr: formatWbr
}));
|
||
|
|
||
|
/**
 * Built-in option values.
 *
 * @constant
 * @type { Options }
 * @default
 * @private
 */
const DEFAULT_OPTIONS = (() => {
  // Definition of a plain container element: one line break on each side.
  const block = (selector) => ({
    selector: selector,
    format: 'block',
    options: { leadingLineBreaks: 1, trailingLineBreaks: 1 }
  });

  // Definition of a heading: uppercased, two trailing line breaks.
  const heading = (selector, leadingLineBreaks) => ({
    selector: selector,
    format: 'heading',
    options: { leadingLineBreaks: leadingLineBreaks, trailingLineBreaks: 2, uppercase: true }
  });

  return {
    baseElements: {
      selectors: [ 'body' ],
      orderBy: 'selectors', // 'selectors' | 'occurrence'
      returnDomByDefault: true
    },
    decodeEntities: true,
    encodeCharacters: {},
    formatters: {},
    limits: {
      ellipsis: '...',
      maxBaseElements: undefined,
      maxChildNodes: undefined,
      maxDepth: undefined,
      maxInputLength: (1 << 24) // 16_777_216
    },
    longWordSplit: {
      forceWrapOnLimit: false,
      wrapCharacters: []
    },
    preserveNewlines: false,
    selectors: [
      { selector: '*', format: 'inline' },
      {
        selector: 'a',
        format: 'anchor',
        options: {
          baseUrl: null,
          hideLinkHrefIfSameAsText: false,
          ignoreHref: false,
          linkBrackets: ['[', ']'],
          noAnchorUrl: true
        }
      },
      block('article'),
      block('aside'),
      {
        selector: 'blockquote',
        format: 'blockquote',
        options: { leadingLineBreaks: 2, trailingLineBreaks: 2, trimEmptyLines: true }
      },
      { selector: 'br', format: 'lineBreak' },
      block('div'),
      block('footer'),
      block('form'),
      heading('h1', 3),
      heading('h2', 3),
      heading('h3', 3),
      heading('h4', 2),
      heading('h5', 2),
      heading('h6', 2),
      block('header'),
      {
        selector: 'hr',
        format: 'horizontalLine',
        options: { leadingLineBreaks: 2, length: undefined, trailingLineBreaks: 2 }
      },
      {
        selector: 'img',
        format: 'image',
        options: { baseUrl: null, linkBrackets: ['[', ']'] }
      },
      block('main'),
      block('nav'),
      {
        selector: 'ol',
        format: 'orderedList',
        options: { leadingLineBreaks: 2, trailingLineBreaks: 2 }
      },
      { selector: 'p', format: 'paragraph', options: { leadingLineBreaks: 2, trailingLineBreaks: 2 } },
      { selector: 'pre', format: 'pre', options: { leadingLineBreaks: 2, trailingLineBreaks: 2 } },
      block('section'),
      {
        selector: 'table',
        format: 'table',
        options: {
          colSpacing: 3,
          leadingLineBreaks: 2,
          maxColumnWidth: 60,
          rowSpacing: 0,
          trailingLineBreaks: 2,
          uppercaseHeaderCells: true
        }
      },
      {
        selector: 'ul',
        format: 'unorderedList',
        options: { itemPrefix: ' * ', leadingLineBreaks: 2, trailingLineBreaks: 2 }
      },
      { selector: 'wbr', format: 'wbr' },
    ],
    tables: [], // deprecated
    whitespaceCharacters: ' \t\r\n\f\u200b',
    wordwrap: 80
  };
})();
|
||
|
|
||
|
/** Array merge strategy: keep the accumulated items and append the incoming ones. */
const concatMerge = (acc, src, options) => {
  return [...acc, ...src];
};

/** Array merge strategy: drop the accumulated items, return a copy of the incoming ones. */
const overwriteMerge = (acc, src, options) => {
  return [...src];
};

/**
 * Merge strategy for keys named `selectors`.
 * An accumulated array holding at least one object is treated as the list of
 * selector definitions and gets concatenated; any other array
 * (such as `baseElements.selectors`) is overwritten.
 */
const selectorsMerge = (acc, src, options) => {
  const holdsDefinitions = acc.some(entry => typeof entry === 'object');
  if (holdsDefinitions) {
    return concatMerge(acc, src); // selectors
  }
  return overwriteMerge(acc, src); // baseElements.selectors
};
|
||
|
|
||
|
/**
 * Merge the given options over the defaults, resolve formatters and
 * duplicate selectors, apply deprecated options and hand the result to the
 * internal compiler. The returned function is intended for batch processing.
 *
 * @param   { Options } [options = {}]   HtmlToText options.
 * @returns { (html: string, metadata?: any) => string } Pre-configured converter function.
 * @static
 */
function compile (options = {}) {
  // Arrays are overwritten by default; keys named `selectors` get their own strategy.
  const mergeSettings = {
    arrayMerge: overwriteMerge,
    customMerge: (key) => ((key === 'selectors') ? selectorsMerge : undefined)
  };
  const resolved = merge__default["default"](DEFAULT_OPTIONS, options, mergeSettings);

  // User supplied formatters win over the built-in ones.
  resolved.formatters = Object.assign({}, genericFormatters, textFormatters, resolved.formatters);
  resolved.selectors = mergeDuplicatesPreferLast(resolved.selectors, (s => s.selector));

  handleDeprecatedOptions(resolved);

  return compile$1(resolved);
}
|
||
|
|
||
|
/**
 * Convert given HTML content to plain text string.
 *
 * A converter is compiled from the options on every call and
 * applied to the given document right away.
 *
 * @param   { string }  html           HTML content to convert.
 * @param   { Options } [options = {}] HtmlToText options.
 * @param   { any }     [metadata]     Optional metadata for HTML document, for use in formatters.
 * @returns { string }                 Plain text string.
 * @static
 *
 * @example
 * const { convert } = require('html-to-text');
 * const text = convert('<h1>Hello World</h1>', {
 *   wordwrap: 130
 * });
 * console.log(text); // HELLO WORLD
 */
function convert (html, options = {}, metadata = undefined) {
  const converter = compile(options);
  return converter(html, metadata);
}
|
||
|
|
||
|
/**
 * Map previously existing and now deprecated options to the new options layout.
 * This is a subject for cleanup in major releases.
 *
 * Handled in place on the given object:
 * - `tags` entries are appended to `selectors` (an empty tag name becomes `'*'`);
 * - `baseElement` is written to `baseElements.selectors` (wrapped in an array if needed);
 * - `returnDomByDefault` is written to `baseElements.returnDomByDefault`;
 * - anchor definitions with a truthy `noLinkBrackets` option get `linkBrackets: false`.
 *
 * @param { Options } options HtmlToText options.
 */
function handleDeprecatedOptions (options) {
  // Assign `value` at the nested `keys` path, creating missing intermediate objects.
  const assignAtPath = (root, keys, value) => {
    const lastKey = keys[keys.length - 1];
    let cursor = root;
    for (const key of keys.slice(0, -1)) {
      if (!cursor[key]) {
        cursor[key] = {};
      }
      cursor = cursor[key];
    }
    cursor[lastKey] = value;
  };

  const legacyTags = options.tags;
  if (legacyTags) {
    const tagDefinitions = Object.entries(legacyTags).map(
      ([selector, definition]) => ({ ...definition, selector: selector || '*' })
    );
    options.selectors.push(...tagDefinitions);
    options.selectors = mergeDuplicatesPreferLast(options.selectors, (s => s.selector));
  }

  const legacyBaseElement = options['baseElement'];
  if (legacyBaseElement) {
    const baseSelectors = Array.isArray(legacyBaseElement) ? legacyBaseElement : [legacyBaseElement];
    assignAtPath(options, ['baseElements', 'selectors'], baseSelectors);
  }

  const legacyReturnDom = options['returnDomByDefault'];
  if (legacyReturnDom !== undefined) {
    assignAtPath(options, ['baseElements', 'returnDomByDefault'], legacyReturnDom);
  }

  for (const definition of options.selectors) {
    if (definition.format !== 'anchor') { continue; }
    if (get(definition, ['options', 'noLinkBrackets'])) {
      assignAtPath(definition, ['options', 'linkBrackets'], false);
    }
  }
}
|
||
|
|
||
|
// Public API of the module (CommonJS exports).
exports.compile = compile;
|
||
|
exports.convert = convert;
|
||
|
// `htmlToText` is a second exported name for the same `convert` function.
exports.htmlToText = convert;
|