sentences = function (text, user_options) {
if (!text || typeof text !== "string" || !text.length) {
return [];
}
var options = {
"newline_boundaries" : false,
"html_boundaries" : false,
"html_boundaries_tags": ["p","div","ul","ol"],
"sanitize" : false,
"allowed_tags" : false,
"abbreviations" : null
};
if (typeof user_options === "boolean") {
// Deprecated quick option
options.newline_boundaries = true;
}
else {
// Extend options
for (var k in user_options) {
options[k] = user_options[k];
}
}
Match.setAbbreviations(options.abbreviations);
if (options.newline_boundaries) {
text = text.replace(/\n+|[-#=_+*]{4,}/g, newline_placeholder);
}
if (options.html_boundaries) {
var html_boundaries_regexp = "(<br\\s*\\/?>|<\\/(" + options.html_boundaries_tags.join("|") + ")>)";
var re = new RegExp(html_boundaries_regexp, "g");
text = text.replace(re, "$1" + newline_placeholder);
}
if (options.sanitize || options.allowed_tags) {
if (! options.allowed_tags) {
options.allowed_tags = [""];
}
text = sanitizeHtml(text, { "allowedTags" : options.allowed_tags });
}
// Split the text into words
// - see http://blog.tompawlak.org/split-string-into-tokens-javascript
var words = text.trim().match(/\S+|\n/g);
var wordCount = 0;
var index = 0;
var temp = [];
var sentences = [];
var current = [];
// If given text is only whitespace (or nothing of \S+)
if (!words || !words.length) {
return [];
}
for (var i=0, L=words.length; i < L; i++) {
wordCount++;
// Add the word to current sentence
current.push(words[i]);
// Sub-sentences, reset counter
if (~words[i].indexOf(',')) {
wordCount = 0;
}
if (Match.isBoundaryChar(words[i]) ||
String.endsWithChar(words[i], "?!") ||
words[i] === newline_placeholder_t)
{
if ((options.newline_boundaries || options.html_boundaries) && words[i] === newline_placeholder_t) {
current.pop();
}
sentences.push(current);
wordCount = 0;
current = [];
continue;
}
if (String.endsWithChar(words[i], "\"") || String.endsWithChar(words[i], "”")) {
// endQuote = words[i].slice(-1);
words[i] = words[i].slice(0, -1);
}
// A dot might indicate the end sentences
// Exception: The next sentence starts with a word (non abbreviation)
// that has a capital letter.
if (String.endsWithChar(words[i], '.')) {
// Check if there is a next word
// This probably needs to be improved with machine learning
if (i+1 < L) {
// Single character abbr.
if (words[i].length === 2 && isNaN(words[i].charAt(0))) {
continue;
}
// Common abbr. that often do not end sentences
if (Match.isCommonAbbreviation(words[i])) {
continue;
}
// Next word starts with capital word, but current sentence is
// quite short
if (Match.isSentenceStarter(words[i+1])) {
if (Match.isTimeAbbreviation(words[i], words[i+1])) {
continue;
}
// Dealing with names at the start of sentences
if (Match.isNameAbbreviation(wordCount, words.slice(i, 6))) {
continue;
}
if (Match.isNumber(words[i+1])) {
if (Match.isCustomAbbreviation(words[i])) { ......
## How to
```javascript
var tokenizer = require('sbd');
var text = "On Jan. 20, former Sen. Barack Obama became the 44th President of the U.S. Millions attended the Inauguration.";
var sentences = tokenizer.sentences(text, optional_options);
// [
// 'On Jan. 20, former Sen. Barack Obama became the 44th President of the U.S.',
// 'Millions attended the Inauguration.',
// ]
```
/**
 * Tell whether a token consists of nothing but one sentence-ending mark.
 *
 * @param {string} word - Token to inspect.
 * @returns {boolean} true only when `word` is exactly ".", "!" or "?".
 */
isBoundaryChar = function (word) {
  return word === "." || word === "!" || word === "?";
};
/**
 * Tell whether a token starts like a capitalised word, or is numeric.
 *
 * Must be called with `this` bound to an object that provides `isNumber`.
 *
 * @param {string} str - Token to inspect.
 * @returns {boolean} true when `str` begins with an ASCII uppercase letter
 *   immediately followed by an ASCII lowercase letter; otherwise the result
 *   of `this.isNumber(str)`.
 */
isCapitalized = function (str) {
  if (/^[A-Z][a-z].*/.test(str)) {
    return true;
  }
  // Not a capitalised word: numeric tokens are accepted as well.
  return this.isNumber(str);
};
exports.isCapitalized = function(str) {
return /^[A-Z][a-z].*/.test(str) || this.isNumber(str);
}
// Start with opening quotes or capitalized letter
exports.isSentenceStarter = function(str) {
return this.isCapitalized(str) || /``|"|'/.test(str.substring(0,2));
}
exports.isCommonAbbreviation = function(str) {
return ~abbreviations.indexOf(str.replace(/\W+/g, ''));
}
// This is drifting towards too much rule-based logic
/**
 * Look a token up in the active abbreviation list.
 *
 * Every run of non-word characters is stripped from the token first, so a
 * token such as "Mr." is looked up as "Mr". Reads the module-level
 * `abbreviations` list.
 *
 * @param {string} str - Token to look up.
 * @returns {number} 0 when the stripped token is not in the list; otherwise a
 *   non-zero number (the bitwise NOT of its index). Intended to be used for
 *   its truthiness only.
 */
isCommonAbbreviation = function (str) {
  var bare = str.replace(/\W+/g, '');
  return ~abbreviations.indexOf(bare);
};
/**
 * Detect two words glued together by a sentence mark, e.g. "end.Next".
 *
 * Only one mark is examined: the first "." if the token has one, otherwise the
 * first "!", otherwise the first "?". The token is split there when the
 * character right after the mark is an ASCII letter.
 *
 * @param {string} word - Token to inspect.
 * @returns {(Array<string>|boolean)} `[before, after]` with the mark removed,
 *   or `false` when the token has no mark or the mark is not followed by a
 *   letter.
 */
isConcatenated = function (word) {
  var markAt = word.indexOf(".");
  if (markAt === -1) {
    markAt = word.indexOf("!");
  }
  if (markAt === -1) {
    markAt = word.indexOf("?");
  }
  if (markAt === -1) {
    return false;
  }

  // charAt() yields "" past the end, which simply fails the letter test.
  var following = word.charAt(markAt + 1);
  if (following.match(/[a-zA-Z].*/)) {
    return [word.slice(0, markAt), word.slice(markAt + 1)];
  }
  return false;
};
/**
 * Heuristic for abbreviations that are not in the abbreviation list.
 *
 * Must be called with `this` bound to an object that provides
 * `isCapitalized`.
 *
 * @param {string} str - Token to inspect.
 * @returns {boolean} true for any token of at most three characters;
 *   otherwise the result of `this.isCapitalized(str)`.
 */
isCustomAbbreviation = function (str) {
  var isShort = str.length <= 3;
  return isShort ? true : this.isCapitalized(str);
};
/**
 * Tell whether a token opens with a dotted abbreviation such as "U.S.".
 *
 * Round, square and curly brackets are removed first. The remainder must
 * START with at least one "<any character>." pair; pairs that occur later in
 * the token do not count.
 *
 * @param {string} word - Token to inspect.
 * @returns {boolean} true when the bracket-free token begins with one or more
 *   character-plus-dot pairs.
 */
isDottedAbbreviation = function (word) {
  var unbracketed = word.replace(/[\(\)\[\]\{\}]/g, '');
  // Same outcome as checking for a non-empty match of /(.\.)*/, which can
  // only ever match at the very start of the string.
  return /^(.\.)+/.test(unbracketed);
};
/**
 * Heuristic: does a dotted token belong to a personal name rather than end a
 * sentence?
 *
 * Must be called with `this` bound to an object that provides
 * `isCapitalized`.
 *
 * @param {number} wordCount - Word counter supplied by the caller.
 * @param {Array<string>} words - Tokens to examine; `words[0]` is the token
 *   under test.
 * @returns {boolean} true when `wordCount` is below 5 and the first token is
 *   shorter than six characters and capitalised, or when at least three of
 *   the tokens start with an ASCII uppercase letter; false otherwise,
 *   including for an empty list.
 */
isNameAbbreviation = function (wordCount, words) {
  if (!(words.length > 0)) {
    return false;
  }

  var first = words[0];
  if (wordCount < 5 && first.length < 6 && this.isCapitalized(first)) {
    return true;
  }

  // Otherwise count the tokens that start with an uppercase letter.
  var uppercaseStarts = words.filter(function (token) {
    return /[A-Z]/.test(token.charAt(0));
  });
  return uppercaseStarts.length >= 3;
};
/**
 * Tell whether a token, or the three characters around a dot in it, is numeric.
 *
 * @param {string} str - Token to inspect.
 * @param {number} [dotPos] - Index of a dot inside `str`. When truthy, only
 *   the characters at `dotPos - 1` through `dotPos + 1` are tested. A value
 *   of 0 is falsy, so the whole token is tested in that case.
 * @returns {boolean} true when the tested text coerces to a number. The empty
 *   string coerces to 0 and therefore counts as numeric.
 */
isNumber = function (str, dotPos) {
  var candidate = dotPos ? str.slice(dotPos - 1, dotPos + 2) : str;
  // The coercing global isNaN is intentional: the input is a string.
  return !isNaN(candidate);
};
abbreviations = abbr;
} else {
abbreviations = englishAbbreviations;
}
}
exports.isCapitalized = function(str) {
return /^[A-Z][a-z].*/.test(str) || this.isNumber(str);
}
// Start with opening quotes or capitalized letter
exports.isSentenceStarter = function(str) {
return this.isCapitalized(str) || /``|"|'/.test(str.substring(0,2));
}
/**
 * Match a token against a North-American style phone number pattern.
 *
 * The whole token must match. An optional "+1"/"1" prefix, an optional area
 * code (bare or in parentheses), "." or "-" separators and an optional
 * extension ("#", "x", "ext" or "extension" followed by digits) are accepted.
 *
 * @param {string} str - Token to inspect.
 * @returns {(Array|null)} The `String.prototype.match` result, or `null` when
 *   the token is not a phone number. Capture groups: 1 = area code written in
 *   parentheses, 2 = bare area code, 3 = exchange, 4 = four-digit line number,
 *   5 = extension digits.
 */
isPhoneNr = function (str) {
  // The pattern has to stay on a single line: a regular-expression literal
  // cannot contain a line break.
  return str.match(/^(?:(?:\+?1\s*(?:[.-]\s*)?)?(?:\(\s*([2-9]1[02-9]|[2-9][02-8]1|[2-9][02-8][02-9])\s*\)|([2-9]1[02-9]|[2-9][02-8]1|[2-9][02-8][02-9]))\s*(?:[.-]\s*)?)?([2-9]1[02-9]|[2-9][02-9]1|[2-9][02-9]{2})\s*(?:[.-]\s*)?([0-9]{4})(?:\s*(?:#|x\.?|ext\.?|extension)\s*(\d+))?$/);
};
/**
 * Tell whether a token can open a sentence.
 *
 * Must be called with `this` bound to an object that provides
 * `isCapitalized`.
 *
 * @param {string} str - Token to inspect.
 * @returns {boolean} The truthy result of `this.isCapitalized(str)`, or else
 *   whether the token's first two characters contain an opening quotation
 *   mark (``, " or ').
 */
isSentenceStarter = function (str) {
  var capitalized = this.isCapitalized(str);
  if (capitalized) {
    return capitalized;
  }
  var opening = str.substring(0, 2);
  return /``|"|'/.test(opening);
};
/**
 * Tell whether "a.m."/"p.m." is directly followed by a word ending in "day".
 *
 * @param {string} word - Current token; only the exact spellings "a.m." and
 *   "p.m." qualify.
 * @param {string} next - The token that follows `word`.
 * @returns {boolean} true when `word` qualifies and `next`, with non-word
 *   characters removed and compared case-insensitively, ends in "day".
 */
isTimeAbbreviation = function (word, next) {
  if (word !== "a.m." && word !== "p.m.") {
    return false;
  }
  var ending = next.replace(/\W+/g, '').slice(-3).toLowerCase();
  return ending === "day";
};
/**
 * Search a token for something shaped like a URL or domain name.
 *
 * The pattern is not anchored, so a match anywhere inside the token counts.
 * It requires a dot followed by a top-level part of two to six lowercase
 * ASCII letters.
 *
 * @param {string} str - Token to inspect.
 * @returns {(Array|null)} The `String.prototype.match` result, or `null` when
 *   nothing URL-like is found.
 */
isURL = function (str) {
  var urlLike = /[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)/;
  return str.match(urlLike);
};
/**
 * Choose the abbreviation list used by the abbreviation lookups.
 *
 * Writes the module-level `abbreviations` variable.
 *
 * @param {*} abbr - Replacement list. Any falsy value selects the built-in
 *   `englishAbbreviations` list instead.
 */
setAbbreviations = function (abbr) {
  abbreviations = abbr ? abbr : englishAbbreviations;
};
/**
 * Tell whether `word` ends with the string `end`.
 *
 * @param {string} word - Text to inspect.
 * @param {string} end - Suffix to look for.
 * @returns {boolean} true when the last `end.length` characters of `word`
 *   equal `end`. An empty `end` always matches.
 */
function ends_with(word, end) {
  var tail = word.slice(word.length - end.length);
  return tail === end;
}
/**
 * Tell whether the last character of `word` is `c`, or is one of the
 * characters in `c` when `c` holds several.
 *
 * @param {string} word - Text to inspect.
 * @param {string} c - One character, or a set of candidate characters written
 *   as a string (for example "?!").
 * @returns {boolean} true when `word` ends with a matching character. An
 *   empty `word` never matches.
 */
function ends_with_char(word, c) {
  var last = word.slice(-1);
  // An empty word has no last character. Without this guard the set lookup
  // below would report a match, because "?!".indexOf("") is 0.
  if (last === "") {
    return false;
  }
  if (c.length > 1) {
    return c.indexOf(last) > -1;
  }
  return last === c;
}