/**
 * Extract text from an in-memory Buffer whose mime type is already known.
 *
 * The buffer is handed to `_writeBufferToDisk`; the path passed to its
 * callback is then forwarded to `fromFileWithMimeAndPath`.
 *
 * @param {string} type mime type of the buffer content
 * @param {Buffer} bufferContent content to extract from
 * @param {Object|Function} options config object, or the callback itself
 * @param {Function} [cb] callback, when `options` is a config object
 * @param {boolean} [withPath] accepted for callers, not read here
 */
function fromBufferWithMime( type, bufferContent, options, cb, withPath ) {
  var hasCallback = typeof options === 'function' || typeof cb === 'function'
    , isBuffer = Boolean( bufferContent ) && bufferContent instanceof Buffer
    ;

  // anything that is not (string, Buffer, ..., callback) is an argument error
  if ( typeof type !== 'string' || !isBuffer || !hasCallback ) {
    _returnArgsError( arguments );
    return;
  }

  _writeBufferToDisk( bufferContent, function( diskPath ) {
    fromFileWithMimeAndPath( type, diskPath, options, cb );
  });
}
```javascript
textract.fromFileWithMimeAndPath(type, filePath, config, function( error, text ) {})
```
##### Buffer + mime type
```javascript
textract.fromBufferWithMime(type, buffer, function( error, text ) {})
```
```javascript
textract.fromBufferWithMime(type, buffer, config, function( error, text ) {})
```
##### Buffer + file name/path
/**
 * Extract text from a Buffer, deriving the mime type from a file name/path.
 *
 * @param {string} filePath name or path used only for the mime lookup
 * @param {Buffer} bufferContent content to extract from
 * @param {Object|Function} options config object, or the callback itself
 * @param {Function} [cb] callback, when `options` is a config object
 */
function fromBufferWithName( filePath, bufferContent, options, cb ) {
  if ( typeof filePath !== 'string' ) {
    _returnArgsError( arguments );
    return;
  }

  fromBufferWithMime( mime.lookup( filePath ), bufferContent, options, cb, true );
}
```javascript
textract.fromBufferWithMime(type, buffer, config, function( error, text ) {})
```
##### Buffer + file name/path
```javascript
textract.fromBufferWithName(name, buffer, function( error, text ) {})
```
```javascript
textract.fromBufferWithName(name, buffer, config, function( error, text ) {})
```
##### URL
/**
 * Extract text from a file on disk whose mime type is supplied by the caller.
 *
 * Accepted call shapes:
 *   (mimeType, filePath, options, callback)
 *   (mimeType, filePath, callback)
 * Any other shape is reported through `_returnArgsError`.
 *
 * @param {string} type mime type
 * @param {string} filePath path of the file
 * @param {Object|Function} options config object, or the callback itself
 * @param {Function} [cb] callback, when `options` is a config object
 */
function fromFileWithMimeAndPath( type, filePath, options, cb ) {
  var stringsOk = typeof type === 'string' && typeof filePath === 'string';

  // (mimeType, filePath, options, callback)
  if ( stringsOk && typeof cb === 'function' && typeof options === 'object' ) {
    _extractWithType( type, filePath, options, cb );
    return;
  }

  // (mimeType, filePath, callback)
  if ( stringsOk && typeof options === 'function' && cb === undefined ) {
    _extractWithType( type, filePath, {}, options );
    return;
  }

  _returnArgsError( arguments );
}
```javascript
textract.fromFileWithPath(filePath, config, function( error, text ) {})
```
##### File + mime type
```javascript
textract.fromFileWithMimeAndPath(type, filePath, function( error, text ) {})
```
```javascript
textract.fromFileWithMimeAndPath(type, filePath, config, function( error, text ) {})
```
##### Buffer + mime type
/**
 * Extract text from a file on disk, working out the mime type from either
 * `options.typeOverride` or a mime lookup on the path.
 *
 * @param {string} filePath path of the file
 * @param {Object|Function} options config object, or the callback itself
 * @param {Function} [cb] callback, when `options` is a config object
 */
function fromFileWithPath( filePath, options, cb ) {
  var callbackGiven = typeof options === 'function' || typeof cb === 'function'
    , override
    ;

  if ( typeof filePath !== 'string' || !callbackGiven ) {
    _returnArgsError( arguments );
    return;
  }

  // an explicit override wins; the lookup only runs when there is none
  override = options && options.typeOverride;
  fromFileWithMimeAndPath( override || mime.lookup( filePath ), filePath, options, cb );
}
There are several ways to extract text. For all methods, an error object and the extracted text are passed to a callback, in that order.
`error` will contain informative text about why the extraction failed. If textract does not currently extract files of the type
provided, a `typeNotFound` flag will be set on the error object.
##### File
```javascript
textract.fromFileWithPath(filePath, function( error, text ) {})
```
```javascript
textract.fromFileWithPath(filePath, config, function( error, text ) {})
```
##### File + mime type
/**
 * Download the resource at `url` into a temp file and extract its text.
 *
 * The temp file is named `_genRandom()` plus the extension found on the URL
 * (query string removed) and is created under `tmpDir`. Once the write stream
 * finishes, the file is passed to `fromFileWithPath`.
 *
 * @param {string|Object} url URL string, or an object exposing `href`
 *   (e.g. a Node URL object)
 * @param {Object|Function} options config object, or the callback itself
 * @param {Function} [cb] callback, when `options` is a config object
 */
function fromUrl( url, options, cb ) {
  var urlNoQueryParams, extname, filePath, fullFilePath, file, href, callbackCalled;

  // allow url to be either a string or to be a
  // Node URL Object: https://nodejs.org/api/url.html
  // `url &&` keeps null/undefined from throwing so that bad input reaches
  // _returnArgsError below
  href = ( typeof url === 'string' ) ? url : ( url && url.href );

  if ( href ) {
    options = options || {};
    urlNoQueryParams = href.split( '?' )[0];
    extname = path.extname( urlNoQueryParams );
    filePath = _genRandom() + extname;
    fullFilePath = path.join( tmpDir, filePath );
    file = fs.createWriteStream( fullFilePath );
    file.on( 'finish', function() {
      // do not start extraction if the download already reported an error
      if ( !callbackCalled ) {
        fromFileWithPath( fullFilePath, options, cb );
      }
    });

    got.stream( url )
      .on( 'response', function( response ) {
        var contentType = response && response.headers &&
          response.headers['content-type'];

        // allows for overriding by the developer or automatically
        // populating based on server response. A response without a
        // content-type header leaves typeOverride unset instead of throwing.
        if ( !options.typeOverride && typeof contentType === 'string' ) {
          options.typeOverride = contentType.split( /;/ )[0];
        }
      })
      .on( 'error', function( error ) {
        // options is the callback when called as (url, callback)
        var _cb = ( typeof options === 'function' ) ? options : cb;
        callbackCalled = true;
        _cb( error );
      })
      .pipe( file );
  } else {
    _returnArgsError( arguments );
  }
}
```
##### URL
When passing a URL, the URL can be either a string or a [Node.js URL object](https://nodejs.org/api/url.html). Using the URL object
allows fine-grained control over the URL being used.
```javascript
textract.fromUrl(url, function( error, text ) {})
```
```javascript
textract.fromUrl(url, config, function( error, text ) {})
```
## Testing Notes
/**
 * Extract text from a zip-based document by concatenating the text of every
 * entry whose name matches `includeRegex` and does not match `excludeRegex`.
 *
 * The callback is invoked exactly once: with the first error encountered, or
 * with the calculated text once every entry has been accounted for.
 *
 * @param {string} filePath path of the zip-based file
 * @param {Object} options extraction options (not read here)
 * @param {Function} cb callback of the form (error, text)
 */
function extractText( filePath, options, cb ) {
  var result = '';

  yauzl.open( filePath, function( err, zipfile ) {
    var processEnd
      , finish
      , finished = false
      , processedEntries = 0
      ;

    if ( err ) {
      util.yauzlError( err, cb );
      return;
    }

    // guards against calling back twice, e.g. a read error followed by
    // the remaining entries completing
    finish = function( error, text ) {
      if ( finished ) {
        return;
      }
      finished = true;
      cb( error, text );
    };

    // called once per entry; reports the result after the last entry
    processEnd = function() {
      var text;
      if ( zipfile.entryCount === ++processedEntries ) {
        if ( result.length ) {
          text = _calculateExtractedText( result );
          finish( null, text );
        } else {
          finish( new Error(
            'Extraction could not find content in file, are you' +
            ' sure it is the mime type it says it is?' ),
            null );
        }
      }
    };

    zipfile.on( 'entry', function( entry ) {
      if ( includeRegex.test( entry.fileName ) && !excludeRegex.test( entry.fileName ) ) {
        util.getTextFromZipFile( zipfile, entry, function( err2, text ) {
          // a failed read used to append the string "null" to the result
          if ( err2 ) {
            finish( err2, null );
            return;
          }
          result += text + '\n';
          processEnd();
        });
      } else {
        processEnd();
      }
    });

    zipfile.on( 'error', function( err3 ) {
      finish( err3 );
    });
  });
}
/**
 * Extract text from a DXF file by running `drawingtotext` on it.
 *
 * Any output on stderr is treated as a failure; otherwise stdout is the
 * extracted text. Only whitespace in the path is escaped before the path is
 * placed on the command line.
 *
 * @param {string} filePath path of the DXF file
 * @param {Object} options extraction options, used to build the exec options
 * @param {Function} cb callback of the form (error, text)
 */
function extractText( filePath, options, cb ) {
  var execOptions = util.createExecOptions( 'dxf', options )
    , command = 'drawingtotext ' + filePath.replace( /\s/g, '\\ ' )
    ;

  exec( command, execOptions, function( error, stdout, stderr ) {
    if ( stderr === '' ) {
      cb( null, stdout );
      return;
    }

    cb(
      new Error( 'error extracting DXF text ' + path.basename( filePath ) + ': ' + stderr ),
      null
    );
  });
}
/**
 * Check whether `drawingtotext` is usable by running it on a bogus file name
 * and looking for its own complaint on stderr.
 *
 * @param {Object} options extraction options (not read here)
 * @param {Function} cb called with (true) when the complaint is seen,
 *   otherwise with (false, message)
 */
function testForBinary( options, cb ) {
  var expectedComplaint = /I couldn't make sense of your input/;

  exec( 'drawingtotext notalegalfile', function( error, stdout, stderr ) {
    if ( stderr && expectedComplaint.test( stderr ) ) {
      cb( true );
      return;
    }

    cb( false, 'INFO: \'drawingtotext\' does not appear to be installed, ' +
      'so textract will be unable to extract DXFs.' );
  });
}
extractor.types.forEach( function( type ) {
failedExtractorTypes[type.toLowerCase()] = failedMessage;
});
}
}
function testExtractor( extractor, options ) {
extractor.test( options, function( passedTest, failedMessage ) {
satisfiedExtractors++;
if ( passedTest ) {
registerExtractor( extractor );
} else {
registerFailedExtractor( extractor, failedMessage );
}
});
/**
 * Read an HTML file from disk and pass its content to `extractFromText`.
 *
 * @param {string} filePath path of the HTML file
 * @param {Object} options extraction options (not read here)
 * @param {Function} cb callback of the form (error, text)
 */
function extractText( filePath, options, cb ) {
  fs.readFile( filePath, function( error, data ) {
    if ( !error ) {
      extractFromText( data, cb );
    } else {
      cb( error, null );
    }
  });
}
/**
 * Turn HTML content into plain text.
 *
 * A `|||||` marker is inserted around block-level tags before the markup is
 * parsed, and the markers are converted to newlines afterwards, so that the
 * text of separate blocks ends up on separate lines. `script`, `style` and
 * `noscript` elements are removed before the text is read.
 *
 * @param {Buffer|string} data HTML content; converted with `toString()`
 * @param {Function} cb callback of the form (error, text)
 */
function extractFromText( data, cb ) {
  var $, text;

  text = data.toString()
    .replace( /< *(br|p|div|section|aside|button|header|footer|li|article|blockquote|cite|code|h1|h2|h3|h4|h5|h6|legend|nav)((.*?)>)/g, '<$1$2|||||' )
    .replace( /< *\/(td|a|option) *>/g, ' </$1>' ) // spacing some things out so text doesn't get smashed together
    .replace( /< *(a|td|option)/g, ' <$1' ) // spacing out links
    .replace( /< *(br|hr) +\/>/g, '|||||<$1\\>' )
    .replace( /<\/ +?(p|div|section|aside|button|header|footer|li|article|blockquote|cite|code|h1|h2|h3|h4|h5|h6|legend|nav)>/g, '|||||</$1>' );

  // wrap everything in a single element so the text can be read in one go;
  // the wrapper is now closed properly instead of being opened twice
  text = '<textractwrapper>' + text + '</textractwrapper>';

  try {
    $ = cheerio.load( text );
    $( 'script' ).remove();
    $( 'style' ).remove();
    $( 'noscript' ).remove();

    // markers become newlines, then whitespace runs around line breaks
    // are collapsed into a single newline
    text = $( 'textractwrapper' ).text().replace( /\|\|\|\|\|/g, '\n' )
      .replace( /(\n\u00A0|\u00A0\n|\n | \n)+/g, '\n' )
      .replace( /(\r\u00A0|\u00A0\r|\r | \r)+/g, '\n' )
      .replace( /(\v\u00A0|\u00A0\v|\v | \v)+/g, '\n' )
      .replace( /(\t\u00A0|\u00A0\t|\t | \t)+/g, '\n' )
      .replace( /[\n\r\t\v]+/g, '\n' )
      ;
  } catch ( err ) {
    cb( err, null );
    return;
  }

  cb( null, text );
}
return;
}
marked( data.toString(), function( err, content ) {
if ( err ) {
cb( err, null );
} else {
htmlExtract.extractFromText( content, cb );
}
});
});
}
module.exports = {
types: ['text/x-markdown'],
/**
 * Extract text from an image by delegating to `util.runExecIntoFile` with
 * the tesseract command generator.
 *
 * @param {string} filePath path of the image
 * @param {Object} options extraction options, also used to build exec options
 * @param {Function} cb callback handed to `util.runExecIntoFile`
 */
function extractText( filePath, options, cb ) {
  var execOptions = util.createExecOptions( 'images', options );

  util.runExecIntoFile(
    'tesseract',
    filePath,
    options,
    execOptions,
    tesseractExtractionCommand,
    cb
  );
}
/**
 * Check whether `tesseract` is usable by running it without arguments and
 * looking for its help text ("Usage:") in the error, stderr or stdout.
 *
 * @param {Object} options extraction options (not read here)
 * @param {Function} cb called with (true) when the help text is seen,
 *   otherwise with (false, message)
 */
function testForBinary( options, cb ) {
  exec( 'tesseract', function( error, stdout, stderr ) {
    // checking for content of help text, in the same order as before:
    // error first, then stderr, then stdout
    var showsUsage = [error, stderr, stdout].some( function( output ) {
      return Boolean( output ) && output.toString().indexOf( 'Usage:' ) > -1;
    });

    if ( showsUsage ) {
      cb( true );
      return;
    }

    cb( false, 'INFO: \'tesseract\' does not appear to be installed, ' +
      'so textract will be unable to extract images.' );
  });
}
extractor.types.forEach( function( type ) {
failedExtractorTypes[type.toLowerCase()] = failedMessage;
});
}
}
function testExtractor( extractor, options ) {
extractor.test( options, function( passedTest, failedMessage ) {
satisfiedExtractors++;
if ( passedTest ) {
registerExtractor( extractor );
} else {
registerFailedExtractor( extractor, failedMessage );
}
});
/**
 * Extract text from a markdown file: the file is rendered with `marked` and
 * the rendered content is passed to `htmlExtract.extractFromText`.
 *
 * @param {string} filePath path of the markdown file
 * @param {Object} options extraction options (not read here)
 * @param {Function} cb callback of the form (error, text)
 */
function extractText( filePath, options, cb ) {
  var onRendered = function( err, content ) {
    if ( !err ) {
      htmlExtract.extractFromText( content, cb );
      return;
    }
    cb( err, null );
  };

  fs.readFile( filePath, function( error, data ) {
    if ( error ) {
      cb( error, null );
      return;
    }
    marked( data.toString(), onRendered );
  });
}
/**
 * Extract text from a zip-based document whose text lives in `content.xml`.
 *
 * `text:p` and `text:h` elements are renamed to `textractTextNode`, the
 * result is parsed with cheerio, and the text of each such node is joined
 * with newlines. If no `content.xml` entry is seen by the time the zip
 * reader emits `end`, an error is reported.
 *
 * @param {string} filePath path of the file
 * @param {Object} options extraction options (not read here)
 * @param {Function} cb callback of the form (error, text)
 */
function extractText( filePath, options, cb ) {
  yauzl.open( filePath, function( err, zipfile ) {
    var textOnTheWay = false;

    if ( err ) {
      util.yauzlError( err, cb );
      return;
    }

    zipfile.on( 'end', function() {
      if ( !textOnTheWay ) {
        cb(
          new Error( 'Extraction could not find content.xml in file, ' +
            'are you sure it is the mime type it says it is?' ),
          null );
      }
    });

    zipfile.on( 'entry', function( entry ) {
      if ( entry.fileName === 'content.xml' ) {
        textOnTheWay = true;
        util.getTextFromZipFile( zipfile, entry, function( err2, text ) {
          var output, $, nodes, nodeTexts, i;

          // a failed read delivers no text; report it rather than
          // throwing a TypeError on `text.replace`
          if ( err2 ) {
            cb( err2, null );
            return;
          }

          output = text
            .replace( 'inflating: content.xml', '' )
            .replace( /^(.Archive).*/, '' )
            .replace( /text:p/g, 'textractTextNode' )
            .replace( /text:h/g, 'textractTextNode' )
            .replace( /<textractTextNode\/>/g, '' )
            .trim();
          $ = cheerio.load( '<body>' + output + '</body>' );
          nodes = $( 'textractTextNode' );
          nodeTexts = [];

          for ( i = 0; i < nodes.length; i++ ) {
            nodeTexts.push( $( nodes[i] ).text() );
          }

          cb( null, nodeTexts.join( '\n' ) );
        });
      }
    });

    zipfile.on( 'error', function( err3 ) {
      cb( err3 );
    });
  });
}
/**
 * Extract text from a PDF via `extract`, joining the returned pages with a
 * single space and trimming the result.
 *
 * See https://github.com/dbashford/textract/issues/75 for the background on
 * the default `{ layout: 'raw' }` options.
 *
 * @param {string} filePath path of the PDF
 * @param {Object} options extraction options; `pdftotextOptions` overrides
 *   the default
 * @param {Function} cb callback of the form (error, text)
 */
function extractText( filePath, options, cb ) {
  var pdftotextOptions = options.pdftotextOptions || { layout: 'raw' };

  extract( filePath, pdftotextOptions, function( error, pages ) {
    if ( !error ) {
      cb( null, pages.join( ' ' ).trim() );
      return;
    }

    cb(
      new Error( 'Error extracting PDF text for file at [[ ' +
        path.basename( filePath ) + ' ]], error: ' + error.message ),
      null
    );
  });
}
/**
 * Check whether `pdftotext` is usable by running `pdftotext -v` and looking
 * for its version banner on stderr.
 *
 * @param {Object} options extraction options (not read here)
 * @param {Function} cb called with (true) when the banner is seen,
 *   otherwise with (false, message)
 */
function testForBinary( options, cb ) {
  exec( 'pdftotext -v', function( error, stdout, stderr ) {
    var bannerSeen = stderr && stderr.indexOf( 'pdftotext version' ) > -1;

    if ( !bannerSeen ) {
      cb( false, 'INFO: \'pdftotext\' does not appear to be installed, ' +
        'so textract will be unable to extract PDFs.' );
      return;
    }

    cb( true );
  });
}
extractor.types.forEach( function( type ) {
failedExtractorTypes[type.toLowerCase()] = failedMessage;
});
}
}
function testExtractor( extractor, options ) {
extractor.test( options, function( passedTest, failedMessage ) {
satisfiedExtractors++;
if ( passedTest ) {
registerExtractor( extractor );
} else {
registerFailedExtractor( extractor, failedMessage );
}
});
...extract = function ( filePath, options, cb ) {
// NOTE(review): the whole body below is commented out, so this function
// currently does nothing and never invokes `cb`. The commented code also
// refers to `error` and `data`, which are not defined anywhere in it.
/*
var captured = ppt.readFile(filePath);
console.log('CAPTURED!!!!')
console.log(captured)
console.log('CAPTURED!!!!')
cb( null, null );
if ( error ) {
cb( error, null );
return;
}
cb( null, data.toString() );
*/
}n/a
/**
 * Extract text from a zip-based presentation by reading every entry whose
 * name matches `slideMatch`, ordering the slides with `_compareSlides` and
 * joining their text with newlines.
 *
 * Slide reads are asynchronous, so the result is only produced once the zip
 * reader has emitted `end` AND every started read has called back. The
 * callback is invoked exactly once.
 *
 * @param {string} filePath path of the presentation
 * @param {Object} options extraction options (not read here)
 * @param {Function} cb callback of the form (error, text)
 */
function extractText( filePath, options, cb ) {
  var slides = [];

  yauzl.open( filePath, function( err, zipfile ) {
    var pendingReads = 0
      , entriesDone = false
      , finished = false
      , finish
      , maybeFinish
      ;

    if ( err ) {
      util.yauzlError( err, cb );
      return;
    }

    // guards against calling back more than once
    finish = function( error, text ) {
      if ( finished ) {
        return;
      }
      finished = true;
      cb( error, text );
    };

    // produces the result once nothing is left to wait for
    maybeFinish = function() {
      var slidesText;

      if ( !entriesDone || pendingReads > 0 ) {
        return;
      }

      if ( slides.length ) {
        slides.sort( _compareSlides );
        slidesText = slides.map( function( slide ) {
          return slide.text;
        }).join( '\n' );
        finish( null, _calculateExtractedText( slidesText ) );
      } else {
        finish(
          new Error( 'Extraction could not find slides in file, are you' +
            ' sure it is the mime type it says it is?' ),
          null );
      }
    };

    zipfile.on( 'end', function() {
      entriesDone = true;
      maybeFinish();
    });

    zipfile.on( 'entry', function( entry ) {
      if ( slideMatch.test( entry.fileName ) ) {
        pendingReads++;
        util.getTextFromZipFile( zipfile, entry, function( err2, text ) {
          var slide;

          pendingReads--;
          if ( err2 ) {
            finish( err2, null );
            return;
          }

          // slide number taken from the entry name, used for ordering
          slide = +entry.fileName.replace( 'ppt/slides/slide', '' ).replace( '.xml', '' );
          slides.push({ slide: slide, text: text });
          maybeFinish();
        });
      }
    });

    zipfile.on( 'error', function( err3 ) {
      finish( err3 );
    });
  });
}
/**
 * Extract text from an RTF file by converting it to HTML with `unrtf` and
 * passing the trimmed HTML to `htmlExtract.extractFromText`.
 *
 * Only whitespace in the path is escaped before the path is placed on the
 * command line.
 *
 * @param {string} filePath path of the RTF file
 * @param {Object} options extraction options (not read here)
 * @param {Function} cb callback of the form (error, text)
 */
function extractText( filePath, options, cb ) {
  // unrtf is asked for HTML rather than text: its HTML output is good,
  // while its text output leaves sections out, strips apostrophes and
  // leaves odd quotes in for bullets. textract can already go from HTML to
  // text on its own, so the HTML is fed through that path.
  //
  // HTML comments are not included in the output, so nothing has to be
  // stripped, and the unrtf --quiet option doesn't work.
  var command = 'unrtf --html --nopict ' + filePath.replace( /\s/g, '\\ ' );

  exec( command, function( error, stdout /* , stderr */ ) {
    if ( !error ) {
      htmlExtract.extractFromText( stdout.trim(), cb );
      return;
    }

    cb(
      new Error( 'unrtf read of file named [[ ' +
        path.basename( filePath ) + ' ]] failed: ' + error ),
      null
    );
  });
}
/**
 * Check whether `unrtf` is usable. On darwin the check is skipped and
 * reported as passing; elsewhere `unrtf` is run against this source file and
 * only a "not found" error counts as a failure.
 *
 * @param {Object} options extraction options (not read here)
 * @param {Function} cb called with (true) on success, otherwise with
 *   (false, message)
 */
function testForBinary( options, cb ) {
  // just non-osx extractor
  if ( os.platform() === 'darwin' ) {
    cb( true );
    return;
  }

  exec( 'unrtf ' + __filename, function( error /* , stdout, stderr */ ) {
    var missing = error !== null && error.message &&
      error.message.indexOf( 'not found' ) !== -1;

    if ( !missing ) {
      cb( true );
      return;
    }

    cb( false, 'INFO: \'unrtf\' does not appear to be installed, ' +
      'so textract will be unable to extract RTFs.' );
  });
}
extractor.types.forEach( function( type ) {
failedExtractorTypes[type.toLowerCase()] = failedMessage;
});
}
}
function testExtractor( extractor, options ) {
extractor.test( options, function( passedTest, failedMessage ) {
satisfiedExtractors++;
if ( passedTest ) {
registerExtractor( extractor );
} else {
registerFailedExtractor( extractor, failedMessage );
}
});
/**
 * Extract text from a plain-text file: the encoding is detected with
 * `jschardet` and the content decoded with `iconv`.
 *
 * @param {string} filePath path of the text file
 * @param {Object} options extraction options (not read here)
 * @param {Function} cb called with (error, null) on read failure, (error)
 *   on detection/decoding failure, or (null, text) on success
 */
function extractText( filePath, options, cb ) {
  fs.readFile( filePath, function( error, data ) {
    var decoded;

    if ( error ) {
      cb( error, null );
      return;
    }

    try {
      decoded = iconv.decode( data, jschardet.detect( data ).encoding.toLowerCase() );
    } catch ( e ) {
      // detection or decoding threw (e.g. no encoding was detected)
      cb( e );
      return;
    }

    cb( null, decoded );
  });
}
/**
 * Pick the exec options to use for a given extractor type.
 *
 * Precedence: `options[type].exec`, then `options.exec`, then a new empty
 * object.
 *
 * @param {string} type key of the type-specific section in `options`
 * @param {Object} options extraction options
 * @returns {Object} the exec options to use
 */
function createExecOptions( type, options ) {
  var typeSpecific = options[type];

  if ( typeSpecific && typeSpecific.exec ) {
    return typeSpecific.exec;
  }
  if ( options.exec ) {
    return options.exec;
  }
  return {};
}
var exec = require( 'child_process' ).exec
, path = require( 'path' )
, util = require( '../util' )
;
function extractText( filePath, options, cb ) {
var execOptions = util.createExecOptions( 'dxf', options )
, escapedPath = filePath.replace( /\s/g, '\\ ' )
;
exec( 'drawingtotext ' + escapedPath,
execOptions,
function( error, stdout, stderr ) {
if ( stderr !== '' ) {
/**
 * Read one zip entry completely and hand its content to `cb` as a string.
 *
 * The callback is invoked exactly once: with the error if the stream could
 * not be opened or failed while reading, otherwise with the full text.
 *
 * @param {Object} zipfile object exposing `openReadStream( entry, callback )`
 * @param {Object} entry entry to read
 * @param {Function} cb callback of the form (error, text)
 */
function getTextFromZipFile( zipfile, entry, cb ) {
  zipfile.openReadStream( entry, function( err, readStream ) {
    var chunks = []
      , finished = false
      , finish
      ;

    if ( err ) {
      cb( err, null );
      return;
    }

    finish = function( error, text ) {
      if ( finished ) {
        return;
      }
      finished = true;
      cb( error, text );
    };

    readStream.on( 'data', function( chunk ) {
      // keep raw bytes and decode once at the end, so a multi-byte
      // character split across two chunks is not corrupted
      chunks.push( Buffer.isBuffer( chunk ) ? chunk : Buffer.from( String( chunk ) ) );
    });

    readStream.on( 'end', function() {
      finish( null, Buffer.concat( chunks ).toString() );
    });

    // report the failure right away; a failed stream is not guaranteed to
    // emit 'end' afterwards
    readStream.on( 'error', function( _err ) {
      finish( _err, null );
    });
  });
}
null );
}
}
};
zipfile.on( 'entry', function( entry ) {
if ( includeRegex.test( entry.fileName ) && !excludeRegex.test( entry.fileName ) ) {
util.getTextFromZipFile( zipfile, entry, function( err2, text ) {
result += text + '\n';
processEnd();
});
} else {
processEnd();
}
});
/**
 * Apply the first `rLen` entries of `replacements` to the text, in order.
 * Each entry is a pair: what to search for and what to replace it with.
 *
 * @param {string} text text to clean
 * @returns {string} text with every replacement applied
 */
function replaceBadCharacters( text ) {
  var index = 0
    , pair
    ;

  while ( index < rLen ) {
    pair = replacements[index];
    text = text.replace( pair[0], pair[1] );
    index += 1;
  }

  return text;
}
}
// global, all file type, content cleansing
function cleanseText( options, cb ) {
return function( error, text ) {
if ( !error ) {
// clean up text
text = util.replaceBadCharacters( text );
if ( options.preserveLineBreaks ) {
text = text.replace( WHITELIST_PRESERVE_LINEBREAKS, ' ' );
} else {
text = text.replace( WHITELIST_STRIP_LINEBREAKS, ' ' );
}
/**
 * Run a command that writes its result to `<outDir>/<base name>.txt`, then
 * read that file, delete it and hand its content to `cb`.
 *
 * @param {string} label name of the tool, used in error messages
 * @param {string} filePath path of the input file
 * @param {Object} options extraction options, passed to `genCommand`
 * @param {Object} execOptions options passed to `exec`
 * @param {Function} genCommand builds the command line from
 *   (options, escapedInputPath, escapedOutputPath)
 * @param {Function} cb callback of the form (error, text)
 */
function runExecIntoFile( label, filePath, options, execOptions, genCommand, cb ) {
  // escape the file paths (only whitespace is escaped)
  var fileTempOutPath = path.join( outDir, path.basename( filePath, path.extname( filePath ) ) )
    , escapedFilePath = filePath.replace( /\s/g, '\\ ' )
    , escapedFileTempOutPath = fileTempOutPath.replace( /\s/g, '\\ ' )
    , cmd = genCommand( options, escapedFilePath, escapedFileTempOutPath )
    , outputFile = fileTempOutPath + '.txt'
    ;

  exec( cmd, execOptions,
    function( error /* , stdout, stderr */ ) {
      if ( error !== null ) {
        error = new Error( 'Error extracting [[ ' +
          path.basename( filePath ) + ' ]], exec error: ' + error.message );
        cb( error, null );
        return;
      }

      fs.exists( outputFile, function( exists ) {
        if ( !exists ) {
          cb( new Error( 'Error reading ' + label +
            ' output at [[ ' + fileTempOutPath + ' ]], file does not exist' ), null );
          return;
        }

        fs.readFile( outputFile, 'utf8', function( error2, text ) {
          if ( error2 ) {
            // `error` is null on this path; the message comes from error2
            cb( new Error( 'Error reading ' + label +
              ' output at [[ ' + fileTempOutPath + ' ]], error: ' + error2.message ), null );
            return;
          }

          fs.unlink( outputFile, function( error3 ) {
            if ( error3 ) {
              // likewise, report the unlink failure itself
              cb( new Error( 'Error, ' + label +
                ' , cleaning up temp file [[ ' + fileTempOutPath +
                ' ]], error: ' + error3.message ), null );
              return;
            }
            cb( null, text.toString() );
          });
        });
      });
    }
  );
}
}
cmd += ' quiet';
return cmd;
}
/**
 * Extract text from an image by delegating to `util.runExecIntoFile` with
 * the tesseract command generator.
 *
 * @param {string} filePath path of the image
 * @param {Object} options extraction options, also used to build exec options
 * @param {Function} cb callback handed to `util.runExecIntoFile`
 */
function extractText( filePath, options, cb ) {
  var execOptions = util.createExecOptions( 'images', options );

  util.runExecIntoFile(
    'tesseract',
    filePath,
    options,
    execOptions,
    tesseractExtractionCommand,
    cb
  );
}
function testForBinary( options, cb ) {
exec( 'tesseract',
function( error, stdout, stderr ) {
var msg;
/**
 * Check whether `unzip` can be executed, logging a message to stderr when
 * it cannot.
 *
 * @param {string} type description of what cannot be extracted, used in the
 *   logged message
 * @param {Function} cb called with `true` when exec reported a null error,
 *   `false` otherwise
 */
function unzipCheck( type, cb ) {
  exec( 'unzip', function( error /* , stdout, stderr */ ) {
    var available = error === null;

    if ( error ) {
      // eslint-disable-next-line no-console
      console.error( 'textract: \'unzip\' does not appear to be installed, ' +
        'so textract will be unable to extract ' + type + '.' );
    }

    cb( available );
  });
}
/**
 * Report a zip-open error through `cb`, prefixing the one message that means
 * the file is not a zip file with a clearer explanation.
 *
 * @param {Error} err error whose `message` is reported
 * @param {Function} cb called with (Error, null)
 */
function yauzlError( err, cb ) {
  var notAZip = err.message === 'end of central directory record signature not found'
    , message = notAZip ?
      'File not correctly recognized as zip file, ' + err.message :
      err.message
    ;

  cb( new Error( message ), null );
}
yauzl.open( filePath, function( err, zipfile ) {
var processEnd
, processedEntries = 0
;
if ( err ) {
util.yauzlError( err, cb );
return;
}
processEnd = function() {
var text;
if ( zipfile.entryCount === ++processedEntries ) {
if ( result.length ) {
/**
 * Extract text from a spreadsheet: the workbook is read with `J.readFile`,
 * converted with `J.utils.to_csv`, and the values of the resulting object
 * are concatenated in key order.
 *
 * @param {string} filePath path of the spreadsheet
 * @param {Object} options extraction options (not read here)
 * @param {Function} cb callback of the form (error, text)
 */
function extractText( filePath, options, cb ) {
  var sheets, combined;

  try {
    sheets = J.utils.to_csv( J.readFile( filePath ) );
  } catch ( err ) {
    cb( new Error( 'Could not extract ' + path.basename( filePath ) + ', ' + err ), null );
    return;
  }

  combined = Object.keys( sheets ).reduce( function( soFar, key ) {
    return soFar + sheets[key];
  }, '' );

  cb( null, combined );
}