/**
 * Reads an HTML file as UTF-8 and converts its contents to plain text.
 *
 * May be called as `fromFile(file, callback)` or
 * `fromFile(file, options, callback)`.
 *
 * @param {string} file - Path of the HTML file to read.
 * @param {Object} [options] - Conversion options handed to `htmlToText`.
 * @param {Function} callback - Called with the read error as its only
 *   argument, or with `null` and the converted text.
 */
fromFile = function (file, options, callback) {
  // Two-argument form: the second argument is the callback.
  if (!callback) {
    callback = options;
    options = {};
  }
  // A null/undefined `options` passed together with a callback is normalised
  // to an empty object, the same way `fromString` treats a missing value.
  options = options || {};

  fs.readFile(file, 'utf8', function (err, str) {
    if (err) return callback(err);
    return callback(null, htmlToText(str, options));
  });
};
...
## Usage
You can read from a file via:
```javascript
var path = require('path');
var htmlToText = require('html-to-text');
htmlToText.fromFile(path.join(__dirname, 'test.html'), {
tables: ['#invoice', '.address']
}, (err, text) => {
if (err) return console.error(err);
console.log(text);
});
```
...
/**
 * Converts an HTML string to plain text.
 *
 * @param {string} str - HTML markup to convert.
 * @param {Object} [options] - Conversion options; an empty object is used
 *   when the argument is missing or otherwise falsy.
 * @returns {*} Whatever `htmlToText` returns for the given markup.
 */
fromString = function (str, options) {
  var settings = options ? options : {};
  return htmlToText(str, settings);
}
...
```
or directly from a string:
```javascript
var htmlToText = require('html-to-text');
var text = htmlToText.fromString('<h1>Hello World</h1>', {
wordwrap: 130
});
console.log(text);
```
### Options:
...
/**
 * Renders an anchor element as its link text, optionally followed by the
 * link target.
 *
 * Options read here: `ignoreHref`, `linkHrefBaseUrl`,
 * `hideLinkHrefIfSameAsText`, `noLinkBrackets` and `lineCharCount`.
 *
 * @param {Object} elem - Anchor node; `children`, `attribs.href` and
 *   `trimLeadingSpace` are read.
 * @param {Function} fn - Renders a list of child nodes to text.
 * @param {Object} options - Conversion options.
 * @returns {*} Result of `formatText` for the assembled link text.
 */
function formatAnchor(elem, fn, options) {
  // Remember the running line character count: it is put back after the
  // children have been rendered, before the final text formatting.
  var savedCharCount = options.lineCharCount;

  // The anchor text is always rendered, even when the href is ignored.
  var inner = fn(elem.children || [], options) || '';
  var output = elem.trimLeadingSpace ? _s.lstrip(inner) : inner;
  var href = '';

  if (!options.ignoreHref) {
    var rawHref = elem.attribs && elem.attribs.href;
    if (rawHref) {
      // Drop a leading "mailto:" so only the address remains.
      href = rawHref.replace(/^mailto\:/, '');
    }

    if (href) {
      // Root-relative links are prefixed with the configured base URL.
      var isRootRelative = href.indexOf('/') === 0;
      if (options.linkHrefBaseUrl && isRootRelative) {
        href = options.linkHrefBaseUrl + href;
      }

      // The href may be hidden when it equals the link text (newlines in the
      // text are ignored for that comparison).
      var showHref = !options.hideLinkHrefIfSameAsText ||
        href !== _s.replaceAll(output, '\n', '');
      if (showHref) {
        output += options.noLinkBrackets ? ' ' + href : ' [' + href + ']';
      }
    }
  }

  options.lineCharCount = savedCharCount;

  return formatText({
    data: output || href,
    trimLeadingSpace: elem.trimLeadingSpace
  }, options);
}
...
case 'img':
result += format.image(elem, options);
break;
case 'a':
// Inline element needs its leading space to be trimmed if `result`
// currently ends with whitespace
elem.trimLeadingSpace = whiteSpaceRegex.test(result);
result += format.anchor(elem, walk, options);
break;
case 'p':
result += format.paragraph(elem, walk, options);
break;
case 'h1':
case 'h2':
case 'h3':
...
/**
 * Renders a heading element as a single line of text.
 *
 * @param {Object} elem - Heading node whose `children` are rendered.
 * @param {Function} fn - Renders a list of child nodes to text.
 * @param {Object} options - Conversion options; `uppercaseHeadings` switches
 *   the heading to upper case.
 * @returns {string} The heading text followed by one newline.
 */
function formatHeading(elem, fn, options) {
  var rendered = fn(elem.children, options);
  var text = options.uppercaseHeadings ? rendered.toUpperCase() : rendered;
  return text + '\n';
}
...
break;
case 'h1':
case 'h2':
case 'h3':
case 'h4':
case 'h5':
case 'h6':
result += format.heading(elem, walk, options);
break;
case 'br':
result += format.lineBreak(elem, walk, options);
break;
case 'hr':
result += format.horizontalLine(elem, walk, options);
break;
...
/**
 * Renders a horizontal rule as a line of dashes surrounded by newlines.
 *
 * The rule is as wide as `options.wordwrap`. When word wrapping is disabled
 * or the width is not a usable number (null, false, undefined, 0, negative,
 * NaN, Infinity), a default width of 40 is used so the rule does not
 * disappear from the output.
 *
 * @param {Object} elem - The `hr` node (not read).
 * @param {Function} fn - Child renderer (not used; an `hr` has no content).
 * @param {Object} options - Conversion options; `wordwrap` sets the width.
 * @returns {string} A newline, the dashes, then two newlines.
 */
function formatHorizontalLine(elem, fn, options) {
  var DEFAULT_WIDTH = 40;
  var requested = Number(options.wordwrap);
  var width = (isFinite(requested) && requested >= 1)
    ? Math.floor(requested)
    : DEFAULT_WIDTH;
  return '\n' + '-'.repeat(width) + '\n\n';
}
...
case 'h6':
result += format.heading(elem, walk, options);
break;
case 'br':
result += format.lineBreak(elem, walk, options);
break;
case 'hr':
result += format.horizontalLine(elem, walk, options);
break;
case 'ul':
result += format.unorderedList(elem, walk, options);
break;
case 'ol':
result += format.orderedList(elem, walk, options);
break;
...
/**
 * Renders an image element as its decoded alt text and/or its bracketed
 * source, separated by a single space when both are present.
 *
 * @param {Object} elem - Image node; `attribs.alt` and `attribs.src` are read.
 * @param {Object} options - Conversion options; `ignoreImage` suppresses the
 *   output and `decodeOptions` is passed to `he.decode`.
 * @returns {string} The rendered image text, or an empty string.
 */
function formatImage(elem, options) {
  if (options.ignoreImage) {
    return '';
  }

  var attrs = elem.attribs || {};
  var parts = [];

  if (attrs.alt) {
    parts.push(he.decode(attrs.alt, options.decodeOptions));
  }
  if (attrs.src) {
    parts.push('[' + attrs.src + ']');
  }

  return parts.join(' ');
}
...
}
var whiteSpaceRegex = /\s$/;
_.each(dom, function(elem) {
switch(elem.type) {
case 'tag':
switch(elem.name.toLowerCase()) {
case 'img':
result += format.image(elem, options);
break;
case 'a':
// Inline element needs its leading space to be trimmed if `result`
// currently ends with whitespace
elem.trimLeadingSpace = whiteSpaceRegex.test(result);
result += format.anchor(elem, walk, options);
break;
...
/**
 * Renders a line break: a newline followed by whatever the element's
 * children render to.
 *
 * @param {Object} elem - The `br` node whose `children` are rendered.
 * @param {Function} fn - Renders a list of child nodes to text.
 * @param {Object} options - Conversion options, passed through to `fn`.
 * @returns {string} A newline followed by the rendered children.
 */
function formatLineBreak(elem, fn, options) {
  var trailing = fn(elem.children, options);
  return '\n' + trailing;
}
...
case 'h3':
case 'h4':
case 'h5':
case 'h6':
result += format.heading(elem, walk, options);
break;
case 'br':
result += format.lineBreak(elem, walk, options);
break;
case 'hr':
result += format.horizontalLine(elem, walk, options);
break;
case 'ul':
result += format.unorderedList(elem, walk, options);
break;
...
/**
 * Renders one list item: the prefix, the item's content with continuation
 * lines indented to the prefix width, and a trailing newline.
 *
 * @param {string} prefix - Marker placed in front of the item.
 * @param {Object} elem - List item node whose `children` are rendered.
 * @param {Function} fn - Renders a list of child nodes to text.
 * @param {Object} options - Conversion options; cloned, never modified.
 * @returns {string} The rendered item.
 */
function formatListItem(prefix, elem, fn, options) {
  // Work on a copy so the narrower wrap width only applies to this item.
  var itemOptions = _.clone(options);
  if (itemOptions.wordwrap) {
    itemOptions.wordwrap -= prefix.length;
  }

  var rendered = fn(elem.children, itemOptions);

  // Every line after the first is indented to line up behind the prefix.
  var continuation = '\n' + _s.repeat(' ', prefix.length);
  return prefix + rendered.replace(/\n/g, continuation) + '\n';
}
n/a
/**
 * Renders an ordered list, one formatted item per non-whitespace child.
 *
 * The `type` attribute selects the marker style: `1` (numbers), `a`
 * (lower-case letters) or `A` (upper-case letters). Any other value, such as
 * the roman styles `i`/`I`, falls back to numbers instead of throwing. The
 * `start` attribute sets the first marker; a non-numeric value is treated as
 * 1. A node without `attribs` is handled like one with no attributes.
 *
 * @param {Object} elem - The `ol` node; `children` and `attribs` are read.
 * @param {Function} fn - Renders a list of child nodes to text.
 * @param {Object} options - Conversion options, passed to `formatListItem`.
 * @returns {string} The rendered items followed by one extra newline.
 */
function formatOrderedList(elem, fn, options) {
  var result = '';
  var attribs = elem.attribs || {};
  var nonWhiteSpaceChildren = (elem.children || []).filter(function(child) {
    return child.type !== 'text' || !whiteSpaceRegex.test(child.data);
  });

  // Marker generators for the supported OL types.
  var typeFunctions = {
    '1': function(start, i) { return i + 1 + start; },
    a: function(start, i) { return String.fromCharCode(i + start + 97); },
    A: function(start, i) { return String.fromCharCode(i + start + 65); }
  };

  // Determine the type. The own-property check keeps unsupported values, and
  // names inherited from Object.prototype, from selecting a missing or wrong
  // generator.
  var olType = attribs.type || '1';
  if (!Object.prototype.hasOwnProperty.call(typeFunctions, olType)) {
    olType = '1';
  }

  // Make sure there are list items present
  if (nonWhiteSpaceChildren.length) {
    // Calculate the zero-based initial offset from the ol start attribute
    var start = Number(attribs.start || '1') - 1;
    if (!isFinite(start)) {
      start = 0;
    }
    // Width of the largest marker, used to align numeric prefixes.
    var maxLength = (nonWhiteSpaceChildren.length + start).toString().length;
    _.each(nonWhiteSpaceChildren, function(elem, i) {
      var index = typeFunctions[olType](start, i);
      // Calculate the needed spacing for nice indentation.
      var spacing = maxLength - index.toString().length;
      var prefix = (olType === '1')
        ? ' ' + index + '. ' + _s.repeat(' ', spacing)
        : index + '. ';
      result += formatListItem(prefix, elem, fn, options);
    });
  }
  return result + '\n';
}
...
case 'hr':
result += format.horizontalLine(elem, walk, options);
break;
case 'ul':
result += format.unorderedList(elem, walk, options);
break;
case 'ol':
result += format.orderedList(elem, walk, options);
break;
case 'pre':
var newOptions = _(options).clone();
newOptions.isInPre = true;
result += format.paragraph(elem, walk, newOptions);
break;
case 'table':
...
/**
 * Renders a paragraph: its content followed by one or two newlines.
 *
 * @param {Object} elem - Paragraph node whose `children` are rendered.
 * @param {Function} fn - Renders a list of child nodes to text.
 * @param {Object} options - Conversion options; `singleNewLineParagraphs`
 *   selects a single trailing newline instead of two.
 * @returns {string} The rendered paragraph.
 */
function formatParagraph(elem, fn, options) {
  var body = fn(elem.children, options);
  var separator = options.singleNewLineParagraphs ? '\n' : '\n\n';
  return body + separator;
}
...
case 'a':
// Inline element needs its leading space to be trimmed if `result`
// currently ends with whitespace
elem.trimLeadingSpace = whiteSpaceRegex.test(result);
result += format.anchor(elem, walk, options);
break;
case 'p':
result += format.paragraph(elem, walk, options);
break;
case 'h1':
case 'h2':
case 'h3':
case 'h4':
case 'h5':
case 'h6':
...
/**
 * Renders a table element as text.
 *
 * Rows are collected from `tr` elements found directly under the table or
 * inside `thead`, `tbody`, `tfoot` or `center` wrappers. Each `th`/`td` cell
 * is rendered and split into lines, the cells of a row are zipped into text
 * lines, and the collected lines are handed to `tableToString`.
 *
 * A cell with a `colspan` attribute is followed by empty filler columns so
 * later cells stay in their columns. This applies to header cells (`th`) as
 * well as data cells (`td`).
 *
 * @param {Object} elem - The table node.
 * @param {Function} fn - Renders a list of child nodes to text.
 * @param {Object} options - Conversion options, passed to the renderers.
 * @returns {*} Result of `tableToString` for the collected lines.
 */
function formatTable(elem, fn, options) {
  var table = [];
  _.each(elem.children, tryParseRows);
  return tableToString(table);

  // Adds one empty column for every extra column the cell spans.
  function padColspan(cell, columns) {
    if (cell.attribs && cell.attribs.colspan) {
      // A non-numeric colspan gives NaN, which falls back to no padding.
      var extra = cell.attribs.colspan - 1 || 0;
      _.times(extra, function() {
        columns.push(['']);
      });
    }
  }

  // Walks a child of the table, descending into section wrappers and
  // converting each row into one or more lines of `table`.
  function tryParseRows(elem) {
    if (elem.type !== 'tag') {
      return;
    }
    switch (elem.name.toLowerCase()) {
      case "thead":
      case "tbody":
      case "tfoot":
      case "center":
        _.each(elem.children, tryParseRows);
        return;

      case 'tr':
        // One entry per column; each entry is the cell's non-empty lines.
        var columns = [];
        _.each(elem.children, function(cell) {
          var tokens;
          if (cell.type !== 'tag') {
            return;
          }
          switch (cell.name.toLowerCase()) {
            case 'th':
              tokens = formatHeading(cell, fn, options).split('\n');
              break;
            case 'td':
              tokens = fn(cell.children, options).split('\n');
              break;
            default:
              // Anything else inside a row is ignored.
              return;
          }
          columns.push(_.compact(tokens));
          padColspan(cell, columns);
        });

        // Turn per-column line lists into per-line column lists; cells with
        // fewer lines than their neighbours are padded with empty strings.
        var lines = helper.arrayZip(columns);
        _.each(lines, function(line) {
          table.push(_.map(line, function(col) {
            return col || '';
          }));
        });
        break;
    }
  }
}
...
case 'pre':
var newOptions = _(options).clone();
newOptions.isInPre = true;
result += format.paragraph(elem, walk, newOptions);
break;
case 'table':
result = containsTable(elem.attribs, options.tables)
? result + format.table(elem, walk, options)
: walk(elem.children || [], options, result);
break;
default:
result = walk(elem.children || [], options, result);
}
break;
case 'text':
...
/**
 * Renders a text node: decodes its HTML entities and, outside of
 * preformatted content, word-wraps the result.
 *
 * @param {Object} elem - Text node; `data` and `trimLeadingSpace` are read.
 * @param {Object} options - Conversion options; `decodeOptions` is passed to
 *   `he.decode` and `isInPre` disables trimming and wrapping.
 * @returns {*} The decoded text, or the result of `helper.wordwrap`.
 */
function formatText(elem, options) {
  var decoded = he.decode(elem.data || "", options.decodeOptions);

  // Preformatted text is returned exactly as decoded.
  if (options.isInPre) {
    return decoded;
  }

  var content = elem.trimLeadingSpace ? _s.lstrip(decoded) : decoded;
  return helper.wordwrap(content, options);
}
...
}
break;
case 'text':
if (elem.data !== '\r\n') {
// Text needs its leading space to be trimmed if `result`
// currently ends with whitespace
elem.trimLeadingSpace = whiteSpaceRegex.test(result);
result += format.text(elem, options);
}
break;
default:
if (!_.include(SKIP_TYPES, elem.type)) {
result = walk(elem.children || [], options, result);
}
}
...
/**
 * Renders an unordered list, one ` * `-prefixed item per child that passes
 * the whitespace filter.
 *
 * @param {Object} elem - The `ul` node whose `children` are rendered.
 * @param {Function} fn - Renders a list of child nodes to text.
 * @param {Object} options - Conversion options, passed to `formatListItem`.
 * @returns {string} The rendered items followed by one extra newline.
 */
function formatUnorderedList(elem, fn, options) {
  // Drop text nodes that match the whitespace pattern; keep everything else.
  var items = (elem.children || []).filter(function(node) {
    return !(node.type === 'text' && whiteSpaceRegex.test(node.data));
  });

  var rendered = '';
  for (var i = 0; i < items.length; i++) {
    rendered += formatListItem(' * ', items[i], fn, options);
  }
  return rendered + '\n';
}
...
case 'br':
result += format.lineBreak(elem, walk, options);
break;
case 'hr':
result += format.horizontalLine(elem, walk, options);
break;
case 'ul':
result += format.unorderedList(elem, walk, options);
break;
case 'ol':
result += format.orderedList(elem, walk, options);
break;
case 'pre':
var newOptions = _(options).clone();
newOptions.isInPre = true;
...
/**
 * Zips the inner arrays of `array` together by passing each of them as a
 * separate argument to `_.zip`.
 *
 * @param {Array<Array>} array - The arrays to zip.
 * @returns {Array<Array>} Result of `_.zip` over the inner arrays.
 */
function arrayZip(array) {
  var zip = _.zip;
  return zip.apply(_, array);
}
...
// Convert all rows to lengths
var widths = _.map(table, function(row) {
return _.map(row, function(col) {
return col.length;
});
});
// Invert rows with colums
widths = helper.arrayZip(widths);
// Determine the max values for each column
widths = _.map(widths, function(col) {
return _.max(col);
});
// Build the table
var text = '';
...
/**
 * Splits a simple CSS-like selector such as `div#main.content` into its
 * element name, class names and ids.
 *
 * @param {string} tagString - Selector to split.
 * @returns {{element: string, classes: string[], ids: string[]}} The leading
 *   word characters as `element` (empty when the selector starts with `.` or
 *   `#`), every `.name` as `classes`, and every `#name` as `ids`.
 */
function splitCssSearchTag(tagString) {
  // Collects capture group 1 of every match of a global pattern.
  var collect = function(pattern, subject) {
    var hits = [];
    for (var m = pattern.exec(subject); m !== null; m = pattern.exec(subject)) {
      hits.push(m[1]);
    }
    return hits;
  };

  // `^\w*` also matches the empty string, so this match is never null.
  var leadingName = /(^\w*)/g.exec(tagString);

  return {
    element: leadingName[1],
    classes: collect(/\.([\d\w-]*)/g, tagString),
    ids: collect(/#([\d\w-]*)/g, tagString)
  };
}
...
}
return _s.strip(result);
}
function filterBody(dom, options, baseElement) {
var result = null;
var splitTag = helper.splitCssSearchTag(baseElement);
function walk(dom) {
if (result) return;
_.each(dom, function(elem) {
if (result) return;
if (elem.name === splitTag.element) {
var documentClasses = elem.attribs && elem.attribs.class ? elem.attribs.class.split(" ") : [];
...
/**
 * Wraps text to the configured line width.
 *
 * Options read here: `wordwrap` (maximum line width; wrapping is skipped
 * unless it is truthy or exactly 0), `preserveNewlines`, `lineCharCount`
 * (characters already on the current line) and `longWordSplit`.
 *
 * A single leading space and a single trailing space of the input are
 * preserved; other trailing whitespace is trimmed.
 *
 * @param {string} text - Text to wrap.
 * @param {Object} options - Conversion options.
 * @returns {string} The wrapped text.
 */
function wordwrap(text, options) {
  var limit = options.wordwrap;
  var hasLimit = Boolean(limit || limit === 0);
  var lineLength = options.lineCharCount;

  // Preserve leading space
  var output = _s.startsWith(text, ' ') ? ' ' : '';
  lineLength += output.length;

  // Words collected for the current line, not yet appended to the output.
  var pending = [];

  // With preserved newlines each "\n" stays attached to the end of its word;
  // otherwise all whitespace is treated as a separator.
  var tokens;
  if (options.preserveNewlines) {
    tokens = text.replace(/\n/g, '\n ').split(/\ +/);
  } else {
    tokens = _s.words(text);
  }

  _.each(tokens, function(token) {
    var word = token;

    // Flush the pending words as a finished line when this word, or its part
    // up to the first newline, would run past the limit.
    var overflows = hasLimit && lineLength > 0 &&
      ((lineLength + word.length > limit) ||
       (lineLength + word.indexOf('\n') > limit));
    if (overflows) {
      output += pending.join(' ') + '\n';
      pending.length = 0;
      lineLength = 0;
    }

    // A word longer than a whole line is handed to the long-word splitter.
    if (hasLimit && options.longWordSplit && word.length > limit) {
      word = splitLongWord(word, options);
    }

    pending.push(word);

    var lastNewline = word.lastIndexOf('\n');
    if (lastNewline === -1) {
      // Word length plus one separating space.
      lineLength += word.length + 1;
    } else {
      // The word carries its own line break: flush now and restart the count
      // with the characters that follow the last newline.
      output += pending.join(' ');
      pending.length = 0;
      lineLength = word.length - (lastNewline + 1);
      if (lineLength) {
        output += ' ';
        lineLength++;
      }
    }
  });

  // Add the rest to the result.
  output += pending.join(' ');

  // Preserve trailing space
  if (_s.endsWith(text, ' ')) {
    if (!_s.endsWith(output, ' ')) {
      output = output + ' ';
    }
  } else {
    output = _s.rtrim(output);
  }

  return output;
}
...
/**
 * Renders a text node: decodes its HTML entities and, outside of
 * preformatted content, word-wraps the result.
 *
 * @param {Object} elem - Text node; `data` and `trimLeadingSpace` are read.
 * @param {Object} options - Conversion options; `decodeOptions` is passed to
 *   `he.decode` and `isInPre` disables trimming and wrapping.
 * @returns {*} The decoded text, or the result of `helper.wordwrap`.
 */
function formatText(elem, options) {
  var decoded = he.decode(elem.data || "", options.decodeOptions);

  // Preformatted text is returned exactly as decoded.
  if (options.isInPre) {
    return decoded;
  }

  var content = elem.trimLeadingSpace ? _s.lstrip(decoded) : decoded;
  return helper.wordwrap(content, options);
}
function formatImage(elem, options) {
if (options.ignoreImage) {
return '';
}
...
/**
 * Reads an HTML file as UTF-8 and converts its contents to plain text.
 *
 * May be called as `fromFile(file, callback)` or
 * `fromFile(file, options, callback)`.
 *
 * @param {string} file - Path of the HTML file to read.
 * @param {Object} [options] - Conversion options handed to `htmlToText`.
 * @param {Function} callback - Called with the read error as its only
 *   argument, or with `null` and the converted text.
 */
fromFile = function (file, options, callback) {
  // Two-argument form: the second argument is the callback.
  if (!callback) {
    callback = options;
    options = {};
  }
  // A null/undefined `options` passed together with a callback is normalised
  // to an empty object, the same way `fromString` treats a missing value.
  options = options || {};

  fs.readFile(file, 'utf8', function (err, str) {
    if (err) return callback(err);
    return callback(null, htmlToText(str, options));
  });
};
...
## Usage
You can read from a file via:
```javascript
var path = require('path');
var htmlToText = require('html-to-text');
htmlToText.fromFile(path.join(__dirname, 'test.html'), {
tables: ['#invoice', '.address']
}, (err, text) => {
if (err) return console.error(err);
console.log(text);
});
```
...
/**
 * Converts an HTML string to plain text.
 *
 * @param {string} str - HTML markup to convert.
 * @param {Object} [options] - Conversion options; an empty object is used
 *   when the argument is missing or otherwise falsy.
 * @returns {*} Whatever `htmlToText` returns for the given markup.
 */
fromString = function (str, options) {
  var settings = options ? options : {};
  return htmlToText(str, settings);
}
...
```
or directly from a string:
```javascript
var htmlToText = require('html-to-text');
var text = htmlToText.fromString('<h1>Hello World</h1>', {
wordwrap: 130
});
console.log(text);
```
### Options:
...