This module implements a simple high performance XML / HTML parser. The only encoding that is supported is UTF-8. The parser has been designed to be somewhat error correcting, so that even most "wild HTML" found on the web can be parsed with it.
Note: This parser does not check that each `<tag>` has a corresponding `</tag>`! These checks have to be implemented by the client code for various reasons; for example, HTML contains tags such as `<br>` that have no end tag.
The file `examples/htmltitle.nim` demonstrates how to use the XML parser to accomplish a simple task: To determine the title of an HTML document.
# Example program to show the parsexml module
# This program reads an HTML file and writes its title to stdout.
# Errors and whitespace are ignored.

import os, streams, parsexml, strutils

if paramCount() < 1:
  quit("Usage: htmltitle filename[.html]")

# The ".html" extension is appended when the argument has none.
var filename = addFileExt(paramStr(1), "html")
var s = newFileStream(filename, fmRead)
if s == nil: quit("cannot open the file " & filename)
var x: XmlParser
open(x, s, filename)
while true:
  x.next()
  case x.kind
  of xmlElementStart:
    # Tag names are compared case-insensitively.
    if cmpIgnoreCase(x.elementName, "title") == 0:
      var title = ""
      x.next()  # skip "<title>"
      # Accumulate every consecutive character-data event into the title.
      while x.kind == xmlCharData:
        title.add(x.charData)
        x.next()
      if x.kind == xmlElementEnd and cmpIgnoreCase(x.elementName, "title") == 0:
        echo("Title: " & title)
        quit(0) # Success!
      else:
        echo(x.errorMsgExpected("/title"))

  of xmlEof: break # end of file reached
  else: discard # ignore other events

# Only reached when the end of the file was hit without printing a title.
x.close()
quit("Could not determine title!")
The file `examples/htmlrefs.nim` demonstrates how to use the XML parser to accomplish another simple task: To determine all the links an HTML document contains.
# Example program to show the new parsexml module
# This program reads an HTML file and writes all its used links to stdout.
# Errors and whitespace are ignored.

import os, streams, parsexml, strutils

proc `=?=` (a, b: string): bool =
  # little trick: define our own comparator that ignores case
  return cmpIgnoreCase(a, b) == 0

if paramCount() < 1:
  quit("Usage: htmlrefs filename[.html]")

var links = 0 # count the number of links
# The ".html" extension is appended when the argument has none.
var filename = addFileExt(paramStr(1), "html")
var s = newFileStream(filename, fmRead)
if s == nil: quit("cannot open the file " & filename)
var x: XmlParser
open(x, s, filename)
next(x) # get first event
# Every branch of the loop below advances the parser itself, so the loop
# body never calls ``next`` unconditionally.
block mainLoop:
  while true:
    case x.kind
    of xmlElementOpen:
      # the <a href = "xyz"> tag we are interested in always has an attribute,
      # thus we search for ``xmlElementOpen`` and not for ``xmlElementStart``
      if x.elementName =?= "a":
        x.next()
        # NOTE: only the first attribute of the element is inspected; a link
        # whose ``href`` is not the first attribute is not counted.
        if x.kind == xmlAttribute:
          if x.attrKey =?= "href":
            var link = x.attrValue
            inc(links)
            # skip until we have an ``xmlElementClose`` event
            while true:
              x.next()
              case x.kind
              of xmlEof: break mainLoop
              of xmlElementClose: break
              else: discard
            x.next() # skip ``xmlElementClose``
            # now we have the description for the ``a`` element
            var desc = ""
            while x.kind == xmlCharData:
              desc.add(x.charData)
              x.next()
            echo(desc & ": " & link)
      else:
        x.next()
    of xmlEof: break # end of file reached
    of xmlError:
      echo(errorMsg(x))
      x.next()
    else: x.next() # skip other events

echo($links & " link(s) found!")
x.close()
XmlEventKind = enum ## kinds of events the parser reports through ``kind``
  xmlError,        ## an error occurred during parsing
  xmlEof,          ## end of file reached
  xmlCharData,     ## character data
  xmlWhitespace,   ## whitespace has been parsed
  xmlComment,      ## a comment has been parsed
  xmlPI,           ## processing instruction (``<?name something ?>``)
  xmlElementStart, ## ``<elem>``
  xmlElementEnd,   ## ``</elem>``
  xmlElementOpen,  ## ``<elem``
  xmlAttribute,    ## ``key = "value"`` pair
  xmlElementClose, ## ``>``
  xmlCData,        ## ``<![CDATA[`` ... data ... ``]]>``
  xmlEntity,       ## ``&entity;``
  xmlSpecial       ## ``<! ... data ... >``
XmlErrorKind = enum ## kinds of errors stored in the parser's ``err`` field
  errNone,                ## no error
  errEndOfCDataExpected,  ## ``]]>`` expected
  errNameExpected,        ## name expected
  errSemicolonExpected,   ## ``;`` expected
  errQmGtExpected,        ## ``?>`` expected
  errGtExpected,          ## ``>`` expected
  errEqExpected,          ## ``=`` expected
  errQuoteExpected,       ## ``"`` or ``'`` expected
  errEndOfCommentExpected ## ``-->`` expected
XmlParseOption = enum ## options that can be passed to ``open``
  reportWhitespace, ## report whitespace
  reportComments    ## report comments
XmlParser = object of BaseLexer ## the parser object
  # NOTE(review): ``a``, ``b`` and ``c`` are string buffers; which accessor
  # reads which buffer is not visible from this listing.
  a, b, c: string
  kind: XmlEventKind           # an event kind (see ``XmlEventKind``)
  err: XmlErrorKind            # an error kind (see ``XmlErrorKind``)
  state: ParserState
  cIsEmpty: bool
  filename: string
  options: set[XmlParseOption] # set of ``XmlParseOption`` values
proc open(my: var XmlParser; input: Stream; filename: string; options: set[XmlParseOption] = {}) {...}{.raises: [Exception], tags: [ReadIOEffect].}
If options contains reportWhitespace, a whitespace token is reported as an xmlWhitespace event. If options contains reportComments, a comment token is reported as an xmlComment event.
proc close(my: var XmlParser) {...}{.inline, raises: [Exception], tags: [].}
proc kind(my: XmlParser): XmlEventKind {...}{.inline, raises: [], tags: [].}
proc rawData(my: XmlParser): string {...}{.inline, raises: [], tags: [].}
proc rawData2(my: XmlParser): string {...}{.inline, raises: [], tags: [].}
proc getColumn(my: XmlParser): int {...}{.inline, raises: [], tags: [].}
proc getLine(my: XmlParser): int {...}{.inline, raises: [], tags: [].}
proc getFilename(my: XmlParser): string {...}{.inline, raises: [], tags: [].}
proc errorMsg(my: XmlParser): string {...}{.raises: [ValueError], tags: [].}
Returns an error message for the xmlError event.
proc errorMsgExpected(my: XmlParser; tag: string): string {...}{.raises: [ValueError], tags: [].}
proc errorMsg(my: XmlParser; msg: string): string {...}{.raises: [ValueError], tags: [].}
proc next(my: var XmlParser) {...}{.raises: [Exception], tags: [ReadIOEffect].}
template charData(my: XmlParser): string
Returns the character data for the events xmlCharData, xmlWhitespace, xmlComment, xmlCData and xmlSpecial. Raises an assertion in debug mode if my.kind is not one of those events. In release mode, this will not trigger an error but the value returned will not be valid.
template elementName(my: XmlParser): string
Returns the element name for the events xmlElementStart, xmlElementEnd and xmlElementOpen. Raises an assertion in debug mode if my.kind is not one of those events. In release mode, this will not trigger an error but the value returned will not be valid.
template entityName(my: XmlParser): string
Returns the entity name for the xmlEntity event. Raises an assertion in debug mode if my.kind is not xmlEntity. In release mode, this will not trigger an error but the value returned will not be valid.
template attrKey(my: XmlParser): string
Returns the attribute key for the xmlAttribute event. Raises an assertion in debug mode if my.kind is not xmlAttribute. In release mode, this will not trigger an error but the value returned will not be valid.
template attrValue(my: XmlParser): string
Returns the attribute value for the xmlAttribute event. Raises an assertion in debug mode if my.kind is not xmlAttribute. In release mode, this will not trigger an error but the value returned will not be valid.
template piName(my: XmlParser): string
Returns the processing instruction name for the xmlPI event. Raises an assertion in debug mode if my.kind is not xmlPI. In release mode, this will not trigger an error but the value returned will not be valid.
template piRest(my: XmlParser): string
Returns the rest of the processing instruction for the xmlPI event. Raises an assertion in debug mode if my.kind is not xmlPI. In release mode, this will not trigger an error but the value returned will not be valid.
© 2006–2018 Andreas Rumpf
Licensed under the MIT License.
https://nim-lang.org/docs/parsexml.html