//Builds a handler that collects written elements and reports through an optional callback
//callback: kept as this._callback only when it is a function
//options: optional settings object; it is stored as-is (not copied), so the defaults below are written onto the caller's object
//Note: the == undefined comparisons also treat null as "not set"
function DefaultHandler(callback, options) {
this.reset();
this._options = options ? options : { };
if (this._options.ignoreWhitespace == undefined)
this._options.ignoreWhitespace = false; //Keep whitespace-only text nodes
if (this._options.verbose == undefined)
this._options.verbose = true; //Keep data property for tags and raw property for all
if (this._options.enforceEmptyTags == undefined)
this._options.enforceEmptyTags = true; //Don't allow children for HTML tags defined as empty in spec
if ((typeof callback) == "function")
this._callback = callback;
}...
View runtests.html in any browser
##Usage In Node
```javascript
var htmlparser = require("htmlparser");
var rawHtml = "Xyz <script language= javascript>var foo = '<<bar>>';< / script
><!--<!-- Waah! -- -->";
var handler = new htmlparser.DefaultHandler(function (error, dom) {
if (error)
[...do something for errors...]
else
[...parsing done, do something...]
});
var parser = new htmlparser.Parser(handler);
parser.parseComplete(rawHtml);
//Builds a parser that writes parsed elements to the given handler
//handler: checked by this.validateHandler() before it is stored as this._handler
//options: optional settings object; it is stored as-is (not copied), so the default below is written onto the caller's object
...function Parser(handler, options) {
this._options = options ? options : { };
if (this._options.includeLocation == undefined) {
this._options.includeLocation = false; //Do not track element position in document by default
}
this.validateHandler(handler);
this._handler = handler;
this.reset(); //Runs last: reset() also calls this._handler.reset(), so the handler must already be stored
}...
var rawHtml = "Xyz <script language= javascript>var foo = '<<bar>>';< / script
><!--<!-- Waah! -- -->";
var handler = new htmlparser.DefaultHandler(function (error, dom) {
if (error)
[...do something for errors...]
else
[...parsing done, do something...]
});
var parser = new htmlparser.Parser(handler);
parser.parseComplete(rawHtml);
sys.puts(sys.inspect(handler.dom, false, null));
```
##Usage In Browser
```javascript
//Builds a feed handler by invoking the parent constructor (RssHandler.super_) with a fixed option set:
//whitespace-only text ignored, non-verbose output, empty-tag enforcement switched off
//callback: passed through unchanged to the parent constructor
...function RssHandler(callback) {
RssHandler.super_.call(this, callback, { ignoreWhitespace: true, verbose: false, enforceEmptyTags: false });
}...
}
parser.done();
```
##Parsing RSS/Atom Feeds
```javascript
new htmlparser.RssHandler(function (error, dom) {
...
});
```
##DefaultHandler Options
###Usage
...function DefaultHandler(callback, options) {
this.reset();
this._options = options ? options : { };
if (this._options.ignoreWhitespace == undefined)
this._options.ignoreWhitespace = false; //Keep whitespace-only text nodes
if (this._options.verbose == undefined)
this._options.verbose = true; //Keep data property for tags and raw property for all
if (this._options.enforceEmptyTags == undefined)
this._options.enforceEmptyTags = true; //Don't allow children for HTML tags defined as empty in spec
if ((typeof callback) == "function")
this._callback = callback;
}...
View runtests.html in any browser
##Usage In Node
```javascript
var htmlparser = require("htmlparser");
var rawHtml = "Xyz <script language= javascript>var foo = '<<bar>>';< / script
><!--<!-- Waah! -- -->";
var handler = new htmlparser.DefaultHandler(function (error, dom) {
if (error)
[...do something for errors...]
else
[...parsing done, do something...]
});
var parser = new htmlparser.Parser(handler);
parser.parseComplete(rawHtml);
//Marks the handler as finished and reports completion through handleCallback with a null error
...function DefaultHandler$done() {
this._done = true;
this.handleCallback(null);
}...
##Streaming To Parser
```javascript
while (...) {
...
parser.parseChunk(chunk);
}
parser.done();
```
##Parsing RSS/Atom Feeds
```javascript
new htmlparser.RssHandler(function (error, dom) {
...
//Forwards an error to handleCallback
//error: the value to report
...function DefaultHandler$error(error) {
this.handleCallback(error);
}...
break;
}
}
}
Parser.prototype.handleError = function Parser$handleError (error) {
if ((typeof this._handler.error) == "function")
this._handler.error(error);
else
throw error;
}
//TODO: make this a truly streamable handler
function RssHandler (callback) {
RssHandler.super_.call(this, callback, { ignoreWhitespace: true, verbose: false, enforceEmptyTags: false });
//Reports the outcome to the registered callback as callback(error, this.dom)
//error: the error to report, or a falsy value when there is none
//When this._callback is not a function: a truthy error is thrown, otherwise the call returns without doing anything
...function DefaultHandler$handleCallback(error) {
if ((typeof this._callback) != "function")
if (error) //The else below pairs with this inner if
throw error;
else
return;
this._callback(error, this.dom);
}...
this._tagStack.last = function DefaultHandler$_tagStack$last () {
return(this.length ? this[this.length - 1] : null);
}
}
//Signals the handler that parsing is done
DefaultHandler.prototype.done = function DefaultHandler$done () {
this._done = true;
this.handleCallback(null);
}
DefaultHandler.prototype.writeTag = function DefaultHandler$writeTag (element) {
this.handleElement(element);
}
DefaultHandler.prototype.writeText = function DefaultHandler$writeText (element) {
if (this._options.ignoreWhitespace)
if (DefaultHandler.reWhitespace.test(element.data))
//Inserts one element into the DOM being built: into this.dom when no tag is open,
//otherwise into the children of the element on top of this._tagStack
//element: the element to insert; in non-verbose mode its raw (and, for tag/script/style, data) properties are deleted in place
//Closing tags (name starting with "/") are never stored; they only pop this._tagStack
...function DefaultHandler$handleElement(element) {
if (this._done)
//NOTE(review): the error is reported but there is no return here, so the element is still processed below
this.handleCallback(new Error("Writing to the handler after done() called is not allowed without a reset()"));
if (!this._options.verbose) {
// element.raw = null; //FIXME: Not clean
//FIXME: Serious performance problem using delete
delete element.raw;
if (element.type == "tag" || element.type == "script" || element.type == "style")
delete element.data;
}
if (!this._tagStack.last()) { //There are no parent elements
//If the element can be a container, add it to the tag stack and the top level list
if (element.type != ElementType.Text && element.type != ElementType.Comment && element.type != ElementType.Directive) {
if (element.name.charAt(0) != "/") { //Ignore closing tags that obviously don't have an opening tag
this.dom.push(element);
if (!this.isEmptyTag(element)) { //Don't add tags to the tag stack that can't have children
this._tagStack.push(element);
}
}
}
else //Otherwise just add to the top level list
this.dom.push(element);
}
else { //There are parent elements
//If the element can be a container, add it as a child of the element
//on top of the tag stack and then add it to the tag stack
if (element.type != ElementType.Text && element.type != ElementType.Comment && element.type != ElementType.Directive) {
if (element.name.charAt(0) == "/") {
//This is a closing tag, scan the tagStack to find the matching opening tag
//and pop the stack up to the opening tag's parent
var baseName = element.name.substring(1);
if (!this.isEmptyTag(element)) {
var pos = this._tagStack.length - 1;
//pos is decremented on every comparison, so after the loop it sits one below the last slot compared
while (pos > -1 && this._tagStack[pos--].name != baseName) { }
//pos == -1 means either "matched at index 0" or "no match"; re-checking slot 0 tells them apart
if (pos > -1 || this._tagStack[0].name == baseName)
//Pop the matched tag and everything above it, leaving pos as the new top index
while (pos < this._tagStack.length - 1)
this._tagStack.pop();
}
}
else { //This is not a closing tag
if (!this._tagStack.last().children)
this._tagStack.last().children = [];
this._tagStack.last().children.push(element);
if (!this.isEmptyTag(element)) //Don't add tags to the tag stack that can't have children
this._tagStack.push(element);
}
}
else { //This is not a container element
if (!this._tagStack.last().children)
this._tagStack.last().children = [];
this._tagStack.last().children.push(element);
}
}
}...
}
//Signals the handler that parsing is done
DefaultHandler.prototype.done = function DefaultHandler$done () {
this._done = true;
this.handleCallback(null);
}
DefaultHandler.prototype.writeTag = function DefaultHandler$writeTag (element) {
this.handleElement(element);
}
DefaultHandler.prototype.writeText = function DefaultHandler$writeText (element) {
if (this._options.ignoreWhitespace)
if (DefaultHandler.reWhitespace.test(element.data))
return;
this.handleElement(element);
}
//Tells whether the element's tag is listed in DefaultHandler._emptyTags
//element: must have a string name; the lookup is lower-cased and a leading "/" is stripped, so closing tags match too
//Returns a truthy value only when the enforceEmptyTags option is truthy and the name is listed
...isEmptyTag = function (element) {
var name = element.name.toLowerCase();
if (name.charAt(0) == '/') {
name = name.substring(1);
}
return this._options.enforceEmptyTags && !!DefaultHandler._emptyTags[name];
}...
delete element.data;
}
if (!this._tagStack.last()) { //There are no parent elements
//If the element can be a container, add it to the tag stack and the top level list
if (element.type != ElementType.Text && element.type != ElementType.Comment && element.type != ElementType
.Directive) {
if (element.name.charAt(0) != "/") { //Ignore closing tags that obviously don't have an opening tag
this.dom.push(element);
if (!this.isEmptyTag(element)) { //Don't add tags to the tag stack that can
't have children
this._tagStack.push(element);
}
}
}
else //Otherwise just add to the top level list
this.dom.push(element);
}
//Returns the handler to a blank state: empty dom, not done, empty tag stack
...function DefaultHandler$reset() {
this.dom = [];
this._done = false;
this._tagStack = [];
//Helper attached to this particular array instance: top of the stack, or null when it is empty
this._tagStack.last = function DefaultHandler$_tagStack$last () {
return(this.length ? this[this.length - 1] : null);
}
}...
new Tautologistics.NodeHtmlParser.DefaultHandler(handlerCallback, test.options.handler)
;
var parser = new Tautologistics.NodeHtmlParser.Parser(handler, test.options.parser);
document.write("<b>" + test.name + "</b>: ");
parser.parseComplete(test.html);
var resultComplete = handler.dom;
var chunkPos = 0;
parser.reset();
while (chunkPos < test.html.length) {
parser.parseChunk(test.html.substring(chunkPos, chunkPos + chunkSize));
chunkPos += chunkSize;
}
parser.done();
var resultChunk = handler.dom;
var testResult =
//Passes the element straight to handleElement
...function DefaultHandler$writeComment(element) {
this.handleElement(element);
}...
forceFlush = !!forceFlush;
if (this._tagStack.length && !forceFlush)
return;
while (this._elements.length) {
var element = this._elements.shift();
switch (element.type) {
case ElementType.Comment:
this._handler.writeComment(element);
break;
case ElementType.Directive:
this._handler.writeDirective(element);
break;
case ElementType.Text:
this._handler.writeText(element);
break;
//Passes the element straight to handleElement
...function DefaultHandler$writeDirective(element) {
this.handleElement(element);
}...
while (this._elements.length) {
var element = this._elements.shift();
switch (element.type) {
case ElementType.Comment:
this._handler.writeComment(element);
break;
case ElementType.Directive:
this._handler.writeDirective(element);
break;
case ElementType.Text:
this._handler.writeText(element);
break;
default:
this._handler.writeTag(element);
break;
//Passes the element straight to handleElement
...function DefaultHandler$writeTag(element) {
this.handleElement(element);
}...
case ElementType.Directive:
this._handler.writeDirective(element);
break;
case ElementType.Text:
this._handler.writeText(element);
break;
default:
this._handler.writeTag(element);
break;
}
}
}
Parser.prototype.handleError = function Parser$handleError (error) {
if ((typeof this._handler.error) == "function")
//Passes the element to handleElement, unless the ignoreWhitespace option is set
//and element.data matches DefaultHandler.reWhitespace, in which case the element is dropped
...function DefaultHandler$writeText(element) {
if (this._options.ignoreWhitespace)
if (DefaultHandler.reWhitespace.test(element.data))
return;
this.handleElement(element);
}...
case ElementType.Comment:
this._handler.writeComment(element);
break;
case ElementType.Directive:
this._handler.writeDirective(element);
break;
case ElementType.Text:
this._handler.writeText(element);
break;
default:
this._handler.writeTag(element);
break;
}
}
}
//Finds the first element matching an { id: id } test via DomUtils.getElements (limit 1)
//id, currentElement, recurse: forwarded to DomUtils.getElements
//Returns the first match, or null when nothing matched
...function DomUtils$getElementById(id, currentElement, recurse) {
var result = DomUtils.getElements({ id: id }, currentElement, recurse, 1);
return(result.length ? result[0] : null);
}...
var handler = new htmlparser.DefaultHandler(function(err, dom) {
if (err) {
sys.debug("Error: " + err);
}
else {
sys.debug(sys.inspect(dom, false, null));
var id = htmlparser.DomUtils.getElementById("x", dom);
sys.debug("id: " + sys.inspect(id, false, null));
var class = htmlparser.DomUtils.getElements({ class: "y" }, dom);
sys.debug("class: " + sys.inspect(class, false, null));
var multiclass = htmlparser.DomUtils.getElements({ class: function (value) { return(value && value.indexOf("h") > -1); } }, dom);
sys.debug("multiclass: " + sys.inspect(multiclass, false, null));
var name = htmlparser.DomUtils.getElementsByTagName("a", dom);
sys.debug("name: " + sys.inspect(name, false, null));
//Collects the elements that pass every test in options, checking currentElement itself and then its descendants
//options: map of test name -> expected value or predicate function; plain values are compared with ==
//         The caller's object is never modified: plain values are wrapped on a private copy
//currentElement: a single element or an array of elements; a falsy value yields []
//recurse: whether to descend into children; undefined or null means true
//limit: maximum number of matches; a negative value or anything that does not parse as an integer means "no limit"
//Returns an array of matches, each element listed before its own descendants
function DomUtils$getElements(options, currentElement, recurse, limit) {
    recurse = (recurse === undefined || recurse === null) || !!recurse;
    limit = parseInt(limit, 10); //Explicit radix so strings are always read as decimal
    if (isNaN(limit)) {
        limit = -1;
    }
    if (!currentElement) {
        return([]);
    }

    function getTest (checkVal) {
        return(function (value) { return(value == checkVal); });
    }

    //Wrap plain values in predicates, copying the option map first so the caller's object stays intact
    //Recursive calls receive the wrapped map, so the copy is made at most once per search
    var tests = options;
    for (var key in options) {
        if ((typeof options[key]) != "function") {
            if (tests === options) {
                tests = {};
                for (var name in options) {
                    tests[name] = options[name];
                }
            }
            tests[key] = getTest(options[key]);
        }
    }

    var found = [];
    if (DomUtils.testElement(tests, currentElement)) {
        found.push(currentElement);
    }
    if (limit >= 0 && found.length >= limit) {
        return(found);
    }

    var elementList;
    if (recurse && currentElement.children) {
        elementList = currentElement.children;
    } else if (currentElement instanceof Array) {
        elementList = currentElement;
    } else {
        return(found);
    }

    for (var i = 0; i < elementList.length; i++) {
        //Ask each subtree only for what is still missing, so the total can never exceed limit
        var remaining = (limit >= 0) ? (limit - found.length) : limit;
        found = found.concat(DomUtils.getElements(tests, elementList[i], recurse, remaining));
        if (limit >= 0 && found.length >= limit) {
            break;
        }
    }
    return(found);
}
if (err) {
sys.debug("Error: " + err);
}
else {
sys.debug(sys.inspect(dom, false, null));
var id = htmlparser.DomUtils.getElementById("x", dom);
sys.debug("id: " + sys.inspect(id, false, null));
var class = htmlparser.DomUtils.getElements({ class: "y" }, dom);
sys.debug("class: " + sys.inspect(class, false, null));
var multiclass = htmlparser.DomUtils.getElements({ class: function (value) { return(value && value.indexOf("h") > -1); } }, dom);
sys.debug("multiclass: " + sys.inspect(multiclass, false, null));
var name = htmlparser.DomUtils.getElementsByTagName("a", dom);
sys.debug("name: " + sys.inspect(name, false, null));
var text = htmlparser.DomUtils.getElementsByTagType("text", dom);
sys.debug("text: " + sys.inspect(text, false, null));
//Shorthand for DomUtils.getElements with a { tag_name: name } test
//name, currentElement, recurse, limit: forwarded to DomUtils.getElements
...function DomUtils$getElementsByTagName(name, currentElement, recurse, limit) {
return(DomUtils.getElements({ tag_name: name }, currentElement, recurse, limit));
}...
.bad {
color: #633;
font-style: italic;
}
</style>
<script language="JavaScript">
if ((typeof JSON) != "object") {
var head = document.getElementsByTagName("head")[0];
var script = document.createElement('script');
script.type = "text/javascript";
script.src = "json2.js";
head.insertBefore(script, head.firstChild)
}
</script>
<script language="JavaScript" src="lib/htmlparser.js"></script>
//Shorthand for DomUtils.getElements with a { tag_type: type } test
//type, currentElement, recurse, limit: forwarded to DomUtils.getElements
...function DomUtils$getElementsByTagType(type, currentElement, recurse, limit) {
return(DomUtils.getElements({ tag_type: type }, currentElement, recurse, limit));
}...
sys.debug("id: " + sys.inspect(id, false, null));
var class = htmlparser.DomUtils.getElements({ class: "y" }, dom);
sys.debug("class: " + sys.inspect(class, false, null));
var multiclass = htmlparser.DomUtils.getElements({ class: function (value) { return(value && value.indexOf("h") > -1); } }, dom);
sys.debug("multiclass: " + sys.inspect(multiclass, false, null));
var name = htmlparser.DomUtils.getElementsByTagName("a", dom);
sys.debug("name: " + sys.inspect(name, false, null));
var text = htmlparser.DomUtils.getElementsByTagType("text", dom);
sys.debug("text: " + sys.inspect(text, false, null));
var nested = htmlparser.DomUtils.getElements({ tag_name: "d", id: "z", class: "w" }, dom);
nested = htmlparser.DomUtils.getElementsByTagName("e", nested);
nested = htmlparser.DomUtils.getElementsByTagType("text", nested);
sys.debug("nested: " + sys.inspect(nested, false, null));
var double = htmlparser.DomUtils.getElementsByTagName("yy", dom);
sys.debug("double: " + sys.inspect(double, false, null));
//Checks a single element against every test in options; all of them must pass
//options: map of test name -> predicate function (each value is called, so it must already be a function)
//element: the element to check; a falsy element never matches
//Returns true when every test passed (also when options has no keys), false otherwise
...function DomUtils$testElement(options, element) {
if (!element) {
return false;
}
for (var key in options) {
if (key == "tag_name") {
//Name test: only tag, script and style elements qualify; the predicate receives element.name
if (element.type != "tag" && element.type != "script" && element.type != "style") {
return false;
}
if (!options["tag_name"](element.name)) {
return false;
}
} else if (key == "tag_type") {
//Type test: the predicate receives element.type
if (!options["tag_type"](element.type)) {
return false;
}
} else if (key == "tag_contains") {
//Content test: only text, comment and directive elements qualify; the predicate receives element.data
if (element.type != "text" && element.type != "comment" && element.type != "directive") {
return false;
}
if (!options["tag_contains"](element.data)) {
return false;
}
} else {
//Any other key is treated as an attribute name; elements without attribs fail the test
if (!element.attribs || !options[key](element.attribs[key])) {
return false;
}
}
}
return true;
}...
* Added parser option "includeLocation" to enable document position data
v1.6.4
* Fixed 'prevElement' error [Swizec]
v1.6.3
* Updated to support being an npm package
* Fixed DomUtils.testElement()
v1.6.1
* Optimized DomUtils by up to 2-3x
v1.6.0
* Added support for RSS/Atom feeds
...function Parser(handler, options) {
this._options = options ? options : { };
if (this._options.includeLocation == undefined) {
this._options.includeLocation = false; //Do not track element position in document by default
}
this.validateHandler(handler);
this._handler = handler;
this.reset();
}...
var rawHtml = "Xyz <script language= javascript>var foo = '<<bar>>';< / script
><!--<!-- Waah! -- -->";
var handler = new htmlparser.DefaultHandler(function (error, dom) {
if (error)
[...do something for errors...]
else
[...parsing done, do something...]
});
var parser = new htmlparser.Parser(handler);
parser.parseComplete(rawHtml);
sys.puts(sys.inspect(handler.dom, false, null));
```
##Usage In Browser
```javascript
//Finishes parsing: turns whatever is left in the buffer into a final element, writes pending elements to the handler
//and then calls the handler's done(). Calling it again after it has completed once does nothing.
...function Parser$done() {
if (this._done)
return;
this._done = true;
//Push any unparsed text into a final element in the element list
if (this._buffer.length) {
var rawData = this._buffer;
this._buffer = "";
var element = {
raw: rawData
, data: (this._parseState == ElementType.Text) ? rawData : rawData.replace(Parser._reTrim, "")
, type: this._parseState
};
//Only the name assignment belongs to this if; parseAttribs and the push below always run
if (this._parseState == ElementType.Tag || this._parseState == ElementType.Script || this._parseState == ElementType.Style)
element.name = this.parseTagName(element.data);
this.parseAttribs(element);
this._elements.push(element);
}
this.writeHandler();
this._handler.done();
}...
##Streaming To Parser
```javascript
while (...) {
...
parser.parseChunk(chunk);
}
parser.done();
```
##Parsing RSS/Atom Feeds
```javascript
new htmlparser.RssHandler(function (error, dom) {
...
//Advances the running position in this._location up to the current parse offset and returns it as { line, col }
//startTag: when truthy the scan stops one character before this._current
//          (presumably to exclude the tag's opening marker - TODO confirm against the caller)
//Side effect: this._location.charOffset, .inBuffer and .col are updated in place, so successive calls only scan new characters
...function Parser$getLocation(startTag) {
var c,
l = this._location,
end = this._current - (startTag ? 1 : 0),
chunk = startTag && l.charOffset == 0 && this._current == 0; //Truthy only for a start tag when both offsets are still 0
for (; l.charOffset < end; l.charOffset++) {
c = this._buffer.charAt(l.charOffset);
if (c == '\n') {
l.inBuffer++; //One more line break seen in the buffer
l.col = 0;
} else if (c != '\r') { //Carriage returns do not advance the column
l.col++;
}
}
return {
line: l.row + l.inBuffer + 1 //Reported line is 1-based
, col: l.col + (chunk ? 0: 1) //Reported column is 1-based except in the chunk case above
};
}...
if (element.name && element.name.charAt(0) == "/")
element.data = element.name;
}
//Add all tags and non-empty text elements to the element list
if (element.raw != "" || element.type != ElementType.Text) {
if (this._options.includeLocation && !element.location) {
element.location = this.getLocation(element.type == ElementType.Tag);
}
this.parseAttribs(element);
this._elements.push(element);
//If tag self-terminates, add an explicit, separate closing tag
if (
element.type != ElementType.Text
&&
//Reports an error through the handler's error() method when it has one; otherwise throws it
//error: the value to report
...function Parser$handleError(error) {
if ((typeof this._handler.error) == "function")
this._handler.error(error);
else
throw error;
}...
this.parseChunk(data);
this.done();
}
//Parses a piece of an HTML document
Parser.prototype.parseChunk = function Parser$parseChunk (data) {
if (this._done)
this.handleError(new Error("Attempted to parse chunk after parsing already done
"));
this._buffer += data; //FIXME: this can be a bottleneck
this.parseTags();
}
//Tells the parser that the HTML being parsed is complete
Parser.prototype.done = function Parser$done () {
if (this._done)
//Extracts attributes from element.data into element.attribs (created only when at least one match is found)
//element: elements whose type is not tag, script or style are left untouched
//The attribute text is whatever follows the first whitespace-delimited token of element.data
...function Parser$parseAttribs(element) {
//Only parse attributes for tags
if (element.type != ElementType.Script && element.type != ElementType.Style && element.type != ElementType.Tag)
return;
var tagName = element.data.split(Parser._reWhitespace, 1)[0];
var attribRaw = element.data.substring(tagName.length);
if (attribRaw.length < 1)
return;
var match;
Parser._reAttrib.lastIndex = 0; //Reset before the exec loop so scanning starts at the beginning of attribRaw
while (match = Parser._reAttrib.exec(attribRaw)) {
if (element.attribs == undefined)
element.attribs = {};
//Capture groups come in name/value pairs (1/2, 3/4, 5/6); group 7 is a name with no value
//NOTE(review): which syntax each pair stands for depends on Parser._reAttrib, which is not visible here
if (typeof match[1] == "string" && match[1].length) {
element.attribs[match[1]] = match[2];
} else if (typeof match[3] == "string" && match[3].length) {
element.attribs[match[3].toString()] = match[4].toString();
} else if (typeof match[5] == "string" && match[5].length) {
element.attribs[match[5]] = match[6];
} else if (typeof match[7] == "string" && match[7].length) {
element.attribs[match[7]] = match[7]; //Value-less attribute: the name doubles as the value
}
}
}...
var element = {
raw: rawData
, data: (this._parseState == ElementType.Text) ? rawData : rawData.replace(Parser._reTrim, "")
, type: this._parseState
};
if (this._parseState == ElementType.Tag || this._parseState == ElementType.Script || this._parseState == ElementType.Style)
element.name = this.parseTagName(element.data);
this.parseAttribs(element);
this._elements.push(element);
}
this.writeHandler();
this._handler.done();
}
//Appends a piece of the document to the buffer and parses whatever tags are now available
//data: the next chunk; it is concatenated onto this._buffer
//NOTE(review): when called after done(), the error is reported, but if handleError returns (handler has an error() method)
//the chunk is still appended and parsed
...function Parser$parseChunk(data) {
if (this._done)
this.handleError(new Error("Attempted to parse chunk after parsing already done"));
this._buffer += data; //FIXME: this can be a bottleneck
this.parseTags();
}...
```
##Streaming To Parser
```javascript
while (...) {
...
parser.parseChunk(chunk);
}
parser.done();
```
##Parsing RSS/Atom Feeds
```javascript
//Parses a whole document in one call: resets the parser, feeds data as a single chunk, then finishes
//data: the complete document text
...function Parser$parseComplete(data) {
this.reset();
this.parseChunk(data);
this.done();
}...
var handler = new htmlparser.DefaultHandler(function (error, dom) {
if (error)
[...do something for errors...]
else
[...parsing done, do something...]
});
var parser = new htmlparser.Parser(handler);
parser.parseComplete(rawHtml);
sys.puts(sys.inspect(handler.dom, false, null));
```
##Usage In Browser
```javascript
var handler = new Tautologistics.NodeHtmlParser.DefaultHandler(function (error, dom) {
//Runs this.parseAttribs on every tag, script or style element in the list
//elements: array of parsed elements; elements of any other type are skipped
//Returns the same array that was passed in
function Parser$parseTagAttribs(elements) {
    var idxEnd = elements.length;
    var idx = 0;
    while (idx < idxEnd) {
        var element = elements[idx++];
        //Spelled ElementType.Style like every other type check in this file; the lowercase
        //spelling compared against a different property, so style elements were skipped
        if (element.type == ElementType.Tag || element.type == ElementType.Script || element.type == ElementType.Style)
            this.parseAttribs(element);
    }
    return(elements);
}
//Extracts the tag name from a tag's data using Parser._reTagName
//data: the tag text; null or "" yields ""
//Returns "" when the pattern does not match; otherwise match[2], prefixed with "/" when match[1] is truthy
//(match[1] presumably captures a closing-tag slash - the pattern itself is not visible here)
function Parser$parseTagName(data) {
if (data == null || data == "")
return("");
var match = Parser._reTagName.exec(data);
if (!match)
return("");
return((match[1] ? "/" : "") + match[2]);
}...
this._buffer = "";
var element = {
raw: rawData
, data: (this._parseState == ElementType.Text) ? rawData : rawData.replace(Parser._reTrim, "")
, type: this._parseState
};
if (this._parseState == ElementType.Tag || this._parseState == ElementType.Script || this._parseState == ElementType.Style)
element.name = this.parseTagName(element.data);
this.parseAttribs(element);
this._elements.push(element);
}
this.writeHandler();
this._handler.done();
}
...function Parser$parseTags() {
var bufferEnd = this._buffer.length - 1;
while (Parser._reTags.test(this._buffer)) {
this._next = Parser._reTags.lastIndex - 1;
var tagSep = this._buffer.charAt(this._next); //The currently found tag marker
var rawData = this._buffer.substring(this._current, this._next); //The next chunk of data to parse
//A new element to eventually be appended to the element list
var element = {
raw: rawData
, data: (this._parseState == ElementType.Text) ? rawData : rawData.replace(Parser._reTrim, "")
, type: this._parseState
};
var elementName = this.parseTagName(element.data);
//This section inspects the current tag stack and modifies the current
//element if we're actually parsing a special area (script/comment/style tag)
if (this._tagStack.length) { //We're parsing inside a script/comment/style tag
if (this._tagStack[this._tagStack.length - 1] == ElementType.Script) { //We're currently in a script tag
if (elementName.toLowerCase() == "/script") //Actually, we're no longer in a script tag, so pop it off the stack
this._tagStack.pop();
else { //Not a closing script tag
if (element.raw.indexOf("!--") != 0) { //Make sure we're not in a comment
//All data from here to script close is now a text element
element.type = ElementType.Text;
//If the previous element is text, append the current text to it
if (this._elements.length && this._elements[this._elements.length - 1].type == ElementType.Text) {
var prevElement = this._elements[this._elements.length - 1];
prevElement.raw = prevElement.data = prevElement.raw + this._prevTagSep + element.raw;
element.raw = element.data = ""; //This causes the current element to not be added to the element list
}
}
}
}
else if (this._tagStack[this._tagStack.length - 1] == ElementType.Style) { //We're currently in a style tag
if (elementName.toLowerCase() == "/style") //Actually, we're no longer in a style tag, so pop it off the stack
this._tagStack.pop();
else {
if (element.raw.indexOf("!--") != 0) { //Make sure we're not in a comment
//All data from here to style close is now a text element
element.type = ElementType.Text;
//If the previous element is text, append the current text to it
if (this._elements.length && this._elements[this._elements.length - 1].type == ElementType.Text) {
var prevElement = this._elements[this._elements.length - 1];
if (element.raw != "") {
prevElement.raw = prevElement.data = prevElement.raw + this._prevTagSep + element.raw;
element.raw = element.data = ""; //This causes the current element to not be added to the element list
} else { //Element is empty, so just append the last tag marker found
prevElement.raw = prevElement.data = prevElement.raw + this._prevTagSep;
}
} else { //The previous element was not text
if (element.raw != "") {
element.raw = element.data = element.raw;
}
}
}
}
}
else if (this._tagStack[this._tagStack.length - 1] == ElementType.Comment) { //We're currently in a comment tag
var rawLen = element.raw.length;
if (element.raw.charAt(rawLen - 2) == "-" && element.raw.charAt(rawLen - 1) == "-" && tagSep == ">") {
//Actually, we're no longer in a comment, so pop it off the stack
this._tagStack.pop();
//If the previous element is a comment, append the current text to it
if (this._elements.length && this._elements[this._elements.length - 1].type == ElementType.Comment) {
var prevElement = this._elements[this._elements.length - 1];
prevElement.raw = prevElement.data = (prevElement.raw + element.raw).replace(Parser._reTrimComment, "");
element.raw = element.data = ""; //This causes the current element to not be added to the element list
element.type = ElementType.Text;
}
else //Previous element not a comment
element.type = ElementType ......
}
//Parses a piece of an HTML document
Parser.prototype.parseChunk = function Parser$parseChunk (data) {
if (this._done)
this.handleError(new Error("Attempted to parse chunk after parsing already done"));
this._buffer += data; //FIXME: this can be a bottleneck
this.parseTags();
}
//Tells the parser that the HTML being parsed is complete
Parser.prototype.done = function Parser$done () {
if (this._done)
return;
this._done = true;
//Returns the parser to a blank state and resets the handler as well
//Clears the buffer, pending elements, offsets, location counters and tag stack; the parse state goes back to ElementType.Text
...function Parser$reset() {
this._buffer = "";
this._done = false;
this._elements = [];
this._elementsCurrent = 0;
this._current = 0;
this._next = 0;
this._location = {
row: 0
, col: 0
, charOffset: 0
, inBuffer: 0
};
this._parseState = ElementType.Text;
this._prevTagSep = '';
this._tagStack = [];
this._handler.reset();
}...
new Tautologistics.NodeHtmlParser.DefaultHandler(handlerCallback, test.options.handler)
;
var parser = new Tautologistics.NodeHtmlParser.Parser(handler, test.options.parser);
document.write("<b>" + test.name + "</b>: ");
parser.parseComplete(test.html);
var resultComplete = handler.dom;
var chunkPos = 0;
parser.reset();
while (chunkPos < test.html.length) {
parser.parseChunk(test.html.substring(chunkPos, chunkPos + chunkSize));
chunkPos += chunkSize;
}
parser.done();
var resultChunk = handler.dom;
var testResult =
//Verifies that handler is an object exposing every method the parser calls on it
//handler: the candidate handler
//Throws an Error naming the first missing method among reset, done, writeTag, writeText, writeComment, writeDirective
//NOTE(review): typeof null is "object", so a null handler passes the first check and fails on the property access below with a TypeError
...function Parser$validateHandler(handler) {
if ((typeof handler) != "object")
throw new Error("Handler is not an object");
if ((typeof handler.reset) != "function")
throw new Error("Handler method 'reset' is invalid");
if ((typeof handler.done) != "function")
throw new Error("Handler method 'done' is invalid");
if ((typeof handler.writeTag) != "function")
throw new Error("Handler method 'writeTag' is invalid");
if ((typeof handler.writeText) != "function")
throw new Error("Handler method 'writeText' is invalid");
if ((typeof handler.writeComment) != "function")
throw new Error("Handler method 'writeComment' is invalid");
if ((typeof handler.writeDirective) != "function")
throw new Error("Handler method 'writeDirective' is invalid");
}...
function Parser (handler, options) {
this._options = options ? options : { };
if (this._options.includeLocation == undefined) {
this._options.includeLocation = false; //Do not track element position in document by default
}
this.validateHandler(handler);
this._handler = handler;
this.reset();
}
//**"Static"**//
//Regular expressions used for cleaning up and parsing (stateless)
Parser._reTrim = /(^\s+|\s+$)/g; //Trim leading/trailing whitespace
//Drains this._elements in order, handing each element to the handler method that matches its type
//forceFlush: when falsy, nothing is written while this._tagStack is non-empty
//Comment, directive and text elements go to writeComment, writeDirective and writeText; every other type goes to writeTag
...function Parser$writeHandler(forceFlush) {
forceFlush = !!forceFlush;
if (this._tagStack.length && !forceFlush)
return;
while (this._elements.length) {
var element = this._elements.shift();
switch (element.type) {
case ElementType.Comment:
this._handler.writeComment(element);
break;
case ElementType.Directive:
this._handler.writeDirective(element);
break;
case ElementType.Text:
this._handler.writeText(element);
break;
default:
this._handler.writeTag(element);
break;
}
}
}...
};
if (this._parseState == ElementType.Tag || this._parseState == ElementType.Script || this._parseState == ElementType.Style)
element.name = this.parseTagName(element.data);
this.parseAttribs(element);
this._elements.push(element);
}
this.writeHandler();
this._handler.done();
}
//Resets the parser to a blank state, ready to parse a new HTML document
Parser.prototype.reset = function Parser$reset () {
this._buffer = "";
this._done = false;
...function RssHandler(callback) {
RssHandler.super_.call(this, callback, { ignoreWhitespace: true, verbose: false, enforceEmptyTags: false });
}...
}
parser.done();
```
##Parsing RSS/Atom Feeds
```javascript
new htmlparser.RssHandler(function (error, dom) {
...
});
```
##DefaultHandler Options
###Usage
...function DefaultHandler(callback, options) {
this.reset();
this._options = options ? options : { };
if (this._options.ignoreWhitespace == undefined)
this._options.ignoreWhitespace = false; //Keep whitespace-only text nodes
if (this._options.verbose == undefined)
this._options.verbose = true; //Keep data property for tags and raw property for all
if (this._options.enforceEmptyTags == undefined)
this._options.enforceEmptyTags = true; //Don't allow children for HTML tags defined as empty in spec
if ((typeof callback) == "function")
this._callback = callback;
}n/a
function RssHandler(callback) {
RssHandler.super_.call(this, callback, { ignoreWhitespace: true, verbose: false, enforceEmptyTags: false });
}n/a
//Converts the parsed DOM of an RSS or Atom document into a plain feed object and then completes as the parent handler does
//The feed root is the first top-level element named "rss" or "feed". When one is found, this.dom is replaced by
//  { type, id, title, link, description, updated, author, items: [{ id, title, link, description, pubDate }] }
//where every field other than type, items and (for RSS) id is present only if the document supplied it.
//When no feed root is found, this.dom is left as it is. The parent done() is always called last.
function RssHandler$done() {
    //Text of the first child node of the first element named tagName; throws when either is missing
    function readText (tagName, elements, recurse) {
        return(DomUtils.getElementsByTagName(tagName, elements, recurse)[0].children[0].data);
    }
    //Same lookup as readText, converted with new Date()
    function readDate (tagName, elements, recurse) {
        return(new Date(readText(tagName, elements, recurse)));
    }
    //href attribute of the first element named tagName; throws when the element or its attribs are missing
    function readHref (tagName, elements, recurse) {
        return(DomUtils.getElementsByTagName(tagName, elements, recurse)[0].attribs.href);
    }
    //Best-effort assignment: target[field] is only created when the reader succeeds
    function assign (target, field, reader, tagName, elements, recurse) {
        try {
            target[field] = reader(tagName, elements, recurse);
        } catch (ex) { } //A failed lookup is deliberately ignored so the field simply stays unset
    }

    var feed = { };
    var feedRoot;
    var found = DomUtils.getElementsByTagName(function (value) { return(value == "rss" || value == "feed"); }, this.dom, false);
    if (found.length) {
        feedRoot = found[0];
    }
    if (feedRoot) {
        if (feedRoot.name == "rss") {
            feed.type = "rss";
            //An <rss> root without a <channel/> child used to throw here; it now yields a feed with no fields or items
            var channel = feedRoot.children ? feedRoot.children[0] : null; //<channel/>
            var channelNodes = channel ? channel.children : undefined;
            feed.id = "";
            assign(feed, "title", readText, "title", channelNodes, false);
            assign(feed, "link", readText, "link", channelNodes, false);
            assign(feed, "description", readText, "description", channelNodes, false);
            assign(feed, "updated", readDate, "lastBuildDate", channelNodes, false);
            assign(feed, "author", readText, "managingEditor", channelNodes, false);
            feed.items = [];
            DomUtils.getElementsByTagName("item", channelNodes).forEach(function (item) {
                var entry = {};
                assign(entry, "id", readText, "guid", item.children, false);
                assign(entry, "title", readText, "title", item.children, false);
                assign(entry, "link", readText, "link", item.children, false);
                assign(entry, "description", readText, "description", item.children, false);
                assign(entry, "pubDate", readDate, "pubDate", item.children, false);
                feed.items.push(entry);
            });
        } else {
            feed.type = "atom";
            var feedNodes = feedRoot.children;
            assign(feed, "id", readText, "id", feedNodes, false);
            assign(feed, "title", readText, "title", feedNodes, false);
            assign(feed, "link", readHref, "link", feedNodes, false);
            assign(feed, "description", readText, "subtitle", feedNodes, false);
            assign(feed, "updated", readDate, "updated", feedNodes, false);
            assign(feed, "author", readText, "email", feedNodes, true); //Searched recursively, unlike the other fields
            feed.items = [];
            DomUtils.getElementsByTagName("entry", feedNodes).forEach(function (item) {
                var entry = {};
                assign(entry, "id", readText, "id", item.children, false);
                assign(entry, "title", readText, "title", item.children, false);
                assign(entry, "link", readHref, "link", item.children, false);
                assign(entry, "description", readText, "summary", item.children, false);
                assign(entry, "pubDate", readDate, "updated", item.children, false);
                feed.items.push(entry);
            });
        }
        this.dom = feed;
    }
    RssHandler.super_.prototype.done.call(this);
}
##Streaming To Parser
```javascript
while (...) {
...
parser.parseChunk(chunk);
}
parser.done();
```
##Parsing RSS/Atom Feeds
```javascript
new htmlparser.RssHandler(function (error, dom) {
...
...function DefaultHandler(callback, options) {
this.reset();
this._options = options ? options : { };
if (this._options.ignoreWhitespace == undefined)
this._options.ignoreWhitespace = false; //Keep whitespace-only text nodes
if (this._options.verbose == undefined)
this._options.verbose = true; //Keep data property for tags and raw property for all
if (this._options.enforceEmptyTags == undefined)
this._options.enforceEmptyTags = true; //Don't allow children for HTML tags defined as empty in spec
if ((typeof callback) == "function")
this._callback = callback;
}...
View runtests.html in any browser
##Usage In Node
```javascript
var htmlparser = require("htmlparser");
var rawHtml = "Xyz <script language= javascript>var foo = '<<bar>>';< / script
><!--<!-- Waah! -- -->";
var handler = new htmlparser.DefaultHandler(function (error, dom) {
if (error)
[...do something for errors...]
else
[...parsing done, do something...]
});
var parser = new htmlparser.Parser(handler);
parser.parseComplete(rawHtml);
...function Parser(handler, options) {
this._options = options ? options : { };
if (this._options.includeLocation == undefined) {
this._options.includeLocation = false; //Do not track element position in document by default
}
this.validateHandler(handler);
this._handler = handler;
this.reset();
}...
var rawHtml = "Xyz <script language= javascript>var foo = '<<bar>>';< / script
><!--<!-- Waah! -- -->";
var handler = new htmlparser.DefaultHandler(function (error, dom) {
if (error)
[...do something for errors...]
else
[...parsing done, do something...]
});
var parser = new htmlparser.Parser(handler);
parser.parseComplete(rawHtml);
sys.puts(sys.inspect(handler.dom, false, null));
```
##Usage In Browser
```javascript
...function RssHandler(callback) {
RssHandler.super_.call(this, callback, { ignoreWhitespace: true, verbose: false, enforceEmptyTags: false });
}...
}
parser.done();
```
##Parsing RSS/Atom Feeds
```javascript
new htmlparser.RssHandler(function (error, dom) {
...
});
```
##DefaultHandler Options
###Usage
...