diff options
Diffstat (limited to 'lib/feed2imap/sgml-parser.rb')
-rw-r--r-- | lib/feed2imap/sgml-parser.rb | 333 |
1 files changed, 333 insertions, 0 deletions
# A parser for SGML, using the derived class as static DTD.
# from http://raa.ruby-lang.org/project/html-parser
#
# Subclasses describe the document type by defining methods:
#
# * <tt>start_TAG(attrs)</tt> / <tt>end_TAG</tt> for container elements
#   (these are tracked on the open-element stack),
# * <tt>do_TAG(attrs)</tt> for empty elements (not tracked on the stack),
#
# and by overriding the +handle_*+ / +unknown_*+ hooks, which do nothing
# in this base class.  +attrs+ is an array of <tt>[name, value]</tt> pairs
# with the name lower-cased.
class SGMLParser

  # Regular expressions used for parsing:
  Interesting = /[&<]/
  Incomplete = Regexp.compile('&([a-zA-Z][a-zA-Z0-9]*|#[0-9]*)?|' +
                              '<([a-zA-Z][^<>]*|/([a-zA-Z][^<>]*)?|' +
                              '![^<>]*)?')

  Entityref = /&([a-zA-Z][-.a-zA-Z0-9]*)[^-.a-zA-Z0-9]/
  Charref = /&#([0-9]+)[^0-9]/

  Starttagopen = /<[>a-zA-Z]/
  Endtagopen = /<\/[<>a-zA-Z]/
  Endbracket = /[<>]/
  Special = /<![^<>]*>/
  Commentopen = /<!--/
  Commentclose = /--[ \t\n]*>/
  Tagfind = /[a-zA-Z][a-zA-Z0-9.-]*/
  Attrfind = Regexp.compile('[\s,]*([a-zA-Z_][a-zA-Z_0-9.-]*)' +
                            '(\s*=\s*' +
                            "('[^']*'" +
                            '|"[^"]*"' +
                            '|[-~a-zA-Z0-9,./:+*%?!()_#=]*))?')

  # Entity names resolved by #handle_entityref.
  Entitydefs =
    {'lt'=>'<', 'gt'=>'>', 'amp'=>'&', 'quot'=>'"', 'apos'=>'\''}

  # verbose:: when true, #report_unbalanced prints diagnostics to stdout.
  def initialize(verbose=false)
    @verbose = verbose
    reset
  end

  # Discards any buffered input and returns the parser to its initial state.
  def reset
    @rawdata = ''.dup   # unfrozen buffer; #feed appends to it in place
    @stack = []         # names of the currently open start_*/end_* elements
    @lasttag = '???'    # last start tag seen, used by the "<>" shorthand
    @nomoretags = false
    @literal = false
  end

  # Returns true when an element named +gi+ is currently open.
  def has_context(gi)
    @stack.include? gi
  end

  # From now on, pass all remaining input to #handle_data unparsed.
  def setnomoretags
    @nomoretags = true
    @literal = true
  end

  # Enter literal mode: start tags, comments and declarations are passed
  # through as data until the next end tag is parsed.
  def setliteral(*args)
    @literal = true
  end

  # Appends +data+ to the buffer and parses as much of it as is complete.
  # Incomplete trailing constructs stay buffered for the next call.
  def feed(data)
    @rawdata << data
    goahead(false)
  end

  # Signals end of input: whatever is still buffered is flushed as data.
  def close
    goahead(true)
  end

  # Main parsing loop.  Consumes the buffer from the start, dispatching to
  # the parse_* / handle_* methods, and keeps the unconsumed tail in the
  # buffer.  When +_end+ is true the tail is emitted as data instead.
  def goahead(_end)
    rawdata = @rawdata
    i = 0
    n = rawdata.length
    while i < n
      if @nomoretags
        handle_data(rawdata[i..(n - 1)])
        i = n
        break
      end

      # Everything up to the next '&' or '<' is plain character data.
      j = rawdata.index(Interesting, i) || n
      handle_data(rawdata[i..(j - 1)]) if i < j
      i = j
      break if i == n

      if rawdata[i, 1] == '<'
        if rawdata.index(Starttagopen, i) == i
          if @literal
            handle_data(rawdata[i, 1])
            i += 1
            next
          end
          k = parse_starttag(i)
          break unless k          # tag not complete yet: wait for more input
          i = k
          next
        end
        if rawdata.index(Endtagopen, i) == i
          k = parse_endtag(i)
          break unless k
          i = k
          @literal = false        # any end tag terminates literal mode
          next
        end
        if rawdata.index(Commentopen, i) == i
          if @literal
            handle_data(rawdata[i, 1])
            i += 1
            next
          end
          k = parse_comment(i)
          break unless k
          i += k                  # parse_comment returns a length
          next
        end
        if rawdata.index(Special, i) == i
          if @literal
            handle_data(rawdata[i, 1])
            i += 1
            next
          end
          k = parse_special(i)
          break unless k
          i += k                  # parse_special returns a length
          next
        end
      elsif rawdata[i, 1] == '&'
        if rawdata.index(Charref, i) == i
          m = Regexp.last_match
          i += m[0].length
          handle_charref(m[1])
          # The pattern consumes one terminator character; give it back
          # unless it was the ';' that belongs to the reference.
          i -= 1 unless rawdata[i - 1, 1] == ';'
          next
        end
        if rawdata.index(Entityref, i) == i
          m = Regexp.last_match
          i += m[0].length
          handle_entityref(m[1])
          i -= 1 unless rawdata[i - 1, 1] == ';'
          next
        end
      else
        raise RuntimeError, 'neither < nor & ??'
      end

      # We get here only if incomplete matches but
      # nothing else
      match = rawdata.index(Incomplete, i)
      unless match == i
        handle_data(rawdata[i, 1])
        i += 1
        next
      end
      j = match + Regexp.last_match(0).length
      break if j == n # Really incomplete
      handle_data(rawdata[i..(j - 1)])
      i = j
    end

    if _end && i < n
      handle_data(@rawdata[i..(n - 1)])
      i = n
    end
    @rawdata = rawdata[i..-1]
  end

  # Parses the comment starting at index +i+ and passes its text to
  # #handle_comment.  Returns the number of characters consumed, or nil
  # when the closing "-->" has not arrived yet.
  #
  # Raises RuntimeError when the buffer does not hold "<!--" at +i+.
  def parse_comment(i)
    rawdata = @rawdata
    if rawdata[i, 4] != '<!--'
      raise RuntimeError, 'unexpected call to handle_comment'
    end
    match = rawdata.index(Commentclose, i)
    return nil unless match
    matched_length = Regexp.last_match(0).length
    handle_comment(rawdata[i + 4..(match - 1)])
    match + matched_length - i
  end

  # Parses the start tag at index +i+, collects its attributes and calls
  # #finish_starttag.  Returns the index just past the tag, or nil when the
  # closing bracket has not arrived yet.
  def parse_starttag(i)
    rawdata = @rawdata
    j = rawdata.index(Endbracket, i + 1)
    return nil unless j
    attrs = []
    if rawdata[i + 1, 1] == '>'
      # SGML shorthand: <> == <last open tag seen>
      k = j
      tag = @lasttag
    else
      unless rawdata.index(Tagfind, i + 1)
        raise RuntimeError, 'unexpected call to parse_starttag'
      end
      name = Regexp.last_match(0)
      k = i + 1 + name.length
      tag = name.downcase
      @lasttag = tag
    end
    while k < j
      start = rawdata.index(Attrfind, k)
      # Only accept attributes that begin inside this tag; a match starting
      # at or after the closing bracket belongs to the text that follows.
      break unless start && start < j
      m = Regexp.last_match
      attrname, rest, attrvalue = m[1], m[2], m[3]
      if !rest
        attrvalue = '' # was: = attrname
      elsif attrvalue.length >= 2 &&
            ((attrvalue[0, 1] == "'" && attrvalue[-1, 1] == "'") ||
             (attrvalue[0, 1] == '"' && attrvalue[-1, 1] == '"'))
        attrvalue = attrvalue[1..-2]
      end
      attrs << [attrname.downcase, attrvalue]
      # Advance to the real end of the match so that junk skipped before the
      # attribute is not re-scanned as part of the next one.
      k = start + m[0].length
    end
    j += 1 if rawdata[j, 1] == '>'
    finish_starttag(tag, attrs)
    j
  end

  # Parses the end tag at index +i+ and calls #finish_endtag.  Returns the
  # index just past the tag, or nil when the tag is not complete yet.
  def parse_endtag(i)
    rawdata = @rawdata
    j = rawdata.index(Endbracket, i + 1)
    return nil unless j
    tag = rawdata[i + 2..j - 1].strip.downcase
    j += 1 if rawdata[j, 1] == '>'
    finish_endtag(tag)
    j
  end

  # Dispatches a start tag.  Returns 1 when a start_TAG handler was called
  # (the element is pushed on the stack), 0 when a do_TAG handler was
  # called, and -1 when the tag is unknown.
  def finish_starttag(tag, attrs)
    method = 'start_' + tag
    if respond_to?(method)
      @stack << tag
      handle_starttag(tag, method, attrs)
      return 1
    end

    method = 'do_' + tag
    if respond_to?(method)
      handle_starttag(tag, method, attrs)
      return 0
    end

    unknown_starttag(tag, attrs)
    -1
  end

  # Dispatches an end tag.  An empty +tag+ ("</>") closes the innermost
  # open element.  Otherwise every element down to and including the
  # innermost open +tag+ is closed, innermost first.
  def finish_endtag(tag)
    if tag == ''
      found = @stack.length - 1
      if found < 0
        unknown_endtag(tag)
        return
      end
    else
      # Innermost (last) occurrence: closing "</b>" must not close an outer
      # "<b>" and everything nested between the two.
      found = @stack.rindex(tag)
      unless found
        if respond_to?('end_' + tag)
          report_unbalanced(tag)
        else
          unknown_endtag(tag)
        end
        return
      end
    end
    while @stack.length > found
      tag = @stack[-1]
      method = 'end_' + tag
      if respond_to?(method)
        handle_endtag(tag, method)
      else
        unknown_endtag(tag)
      end
      @stack.pop
    end
  end

  # Parses the "<!...>" declaration at index +i+ and passes its content
  # (without the angle brackets) to #handle_special.  Returns the number of
  # characters consumed, or nil when the declaration is not complete yet.
  def parse_special(i)
    rawdata = @rawdata
    match = rawdata.index(Endbracket, i + 1)
    return nil unless match
    matched_length = Regexp.last_match(0).length
    handle_special(rawdata[i + 1..(match - 1)])
    match - i + matched_length
  end

  # Calls the start_TAG / do_TAG handler named +method+ with +attrs+.
  def handle_starttag(tag, method, attrs)
    send(method, attrs)
  end

  # Calls the end_TAG handler named +method+.
  def handle_endtag(tag, method)
    send(method)
  end

  # Called for an end tag that has a handler but no matching open element.
  # Prints a diagnostic to stdout in verbose mode, otherwise does nothing.
  def report_unbalanced(tag)
    if @verbose
      print '*** Unbalanced </' + tag + '>', "\n"
      print '*** Stack:', @stack.inspect, "\n"
    end
  end

  # Handles a numeric character reference; +name+ is the digit string.
  # Codes 0..255 are passed to #handle_data as a one-character string,
  # anything else goes to #unknown_charref.
  #
  # Raises ArgumentError when +name+ is not a decimal number.
  def handle_charref(name)
    # Explicit base 10: a leading zero ("&#09;") must not be read as octal.
    n = Integer(name, 10)
    unless n >= 0 && n <= 255
      unknown_charref(name)
      return
    end
    handle_data(n.chr)
  end

  # Handles a named entity reference: names listed in Entitydefs are passed
  # to #handle_data as their replacement text, others go to
  # #unknown_entityref.
  def handle_entityref(name)
    table = Entitydefs
    if table.key?(name)
      handle_data(table[name])
    else
      unknown_entityref(name)
      return
    end
  end

  # Hook: character data.  Does nothing here; override in subclasses.
  def handle_data(data)
  end

  # Hook: comment text, without the delimiters.  Override in subclasses.
  def handle_comment(data)
  end

  # Hook: content of a "<!...>" declaration.  Override in subclasses.
  def handle_special(data)
  end

  # Hook: start tag with neither a start_TAG nor a do_TAG handler.
  def unknown_starttag(tag, attrs)
  end

  # Hook: end tag with no end_TAG handler.
  def unknown_endtag(tag)
  end

  # Hook: character reference outside 0..255.
  def unknown_charref(ref)
  end

  # Hook: entity reference not listed in Entitydefs.
  def unknown_entityref(ref)
  end

end