summaryrefslogtreecommitdiff
path: root/lib/feed2imap/sgml-parser.rb
diff options
context:
space:
mode:
authorlnu <lnu@f70e237a-67f3-0310-a06c-d2b8a7116972>2005-07-20 16:02:15 +0000
committerlnu <lnu@f70e237a-67f3-0310-a06c-d2b8a7116972>2005-07-20 16:02:15 +0000
commit8d3b2a0a706ed76527b204e28f97a7fdea6db633 (patch)
tree830f12b7e57fdee27f99848e69895a835ccb4b55 /lib/feed2imap/sgml-parser.rb
parentb497c57ea6afb1e2c93e61be527d0b98ac92e133 (diff)
downloadfeed2imap-8d3b2a0a706ed76527b204e28f97a7fdea6db633.tar.gz
feed2imap-8d3b2a0a706ed76527b204e28f97a7fdea6db633.tar.bz2
feed2imap-8d3b2a0a706ed76527b204e28f97a7fdea6db633.zip
new html2text converter
git-svn-id: svn+ssh://svn.gna.org/svn/feed2imap/trunk/feed2imap@36 f70e237a-67f3-0310-a06c-d2b8a7116972
Diffstat (limited to 'lib/feed2imap/sgml-parser.rb')
-rw-r--r--lib/feed2imap/sgml-parser.rb333
1 files changed, 333 insertions, 0 deletions
diff --git a/lib/feed2imap/sgml-parser.rb b/lib/feed2imap/sgml-parser.rb
new file mode 100644
index 0000000..c692f52
--- /dev/null
+++ b/lib/feed2imap/sgml-parser.rb
@@ -0,0 +1,333 @@
+# A parser for SGML, using the derived class as static DTD.
+# from http://raa.ruby-lang.org/project/html-parser
+
# Event-driven, incremental SGML/HTML tokenizer.
#
# Text is pushed in with #feed (any chunking) and flushed with #close.
# The parser dispatches to methods that a subclass defines:
#
#   start_TAG(attrs) / end_TAG  - container element; TAG is pushed on @stack
#   do_TAG(attrs)               - empty element; nothing is pushed
#   handle_data(text)           - character data (may arrive in several pieces)
#   handle_comment(text)        - body of <!-- ... -->
#   handle_special(text)        - body of <! ... >
#   unknown_starttag / unknown_endtag / unknown_charref / unknown_entityref
#
# attrs is an array of [lowercased_name, value] pairs.
class SGMLParser

  # Regular expressions used for parsing:
  Interesting = /[&<]/
  Incomplete = Regexp.compile('&([a-zA-Z][a-zA-Z0-9]*|#[0-9]*)?|' +
                              '<([a-zA-Z][^<>]*|/([a-zA-Z][^<>]*)?|' +
                              '![^<>]*)?')

  Entityref = /&([a-zA-Z][-.a-zA-Z0-9]*)[^-.a-zA-Z0-9]/
  Charref = /&#([0-9]+)[^0-9]/

  Starttagopen = /<[>a-zA-Z]/
  Endtagopen = /<\/[<>a-zA-Z]/
  Endbracket = /[<>]/
  Special = /<![^<>]*>/
  Commentopen = /<!--/
  Commentclose = /--[ \t\n]*>/
  Tagfind = /[a-zA-Z][a-zA-Z0-9.-]*/
  Attrfind = Regexp.compile('[\s,]*([a-zA-Z_][a-zA-Z_0-9.-]*)' +
                            '(\s*=\s*' +
                            "('[^']*'" +
                            '|"[^"]*"' +
                            '|[-~a-zA-Z0-9,./:+*%?!()_#=]*))?')

  # Entity name => replacement text used by #handle_entityref.
  Entitydefs =
    {'lt'=>'<', 'gt'=>'>', 'amp'=>'&', 'quot'=>'"', 'apos'=>'\''}

  # verbose:: when true, #report_unbalanced prints diagnostics to stdout.
  def initialize(verbose=false)
    @verbose = verbose
    reset
  end

  # Discards buffered input and returns the parser to its initial state.
  def reset
    @rawdata = ''.dup   # mutated in place by #feed, so it must not be a shared literal
    @stack = []         # names of currently open start_TAG elements, outermost first
    @lasttag = '???'    # used by the SGML "<>" shorthand
    @nomoretags = false
    @literal = false
  end

  # True when an element named +gi+ is currently open.
  def has_context(gi)
    @stack.include? gi
  end

  # From now on everything fed to the parser is passed to handle_data verbatim.
  def setnomoretags
    @nomoretags = true
    @literal = true
  end

  # Treat start tags, comments and <!...> as plain data until the next end tag.
  # Arguments are accepted and ignored.
  def setliteral(*args)
    @literal = true
  end

  # Appends +data+ to the buffer and parses as much of it as is complete.
  def feed(data)
    @rawdata << data
    goahead(false)
  end

  # Parses the remaining buffer, flushing any incomplete trailing construct
  # as plain data.
  def close
    goahead(true)
  end

  # Main scanning loop.
  #
  # _end:: true when no more input will follow; an unfinished construct at the
  #        end of the buffer is then emitted as data instead of being kept.
  #
  # Whatever could not be consumed stays in @rawdata for the next call.
  def goahead(_end)
    rawdata = @rawdata
    i = 0
    n = rawdata.length
    while i < n
      if @nomoretags
        handle_data(rawdata[i..(n - 1)])
        i = n
        break
      end
      # Everything up to the next '&' or '<' is plain data.
      j = rawdata.index(Interesting, i) || n
      handle_data(rawdata[i..(j - 1)]) if i < j
      i = j
      break if i == n
      case rawdata[i, 1]
      when '<'
        if rawdata.index(Starttagopen, i) == i
          if @literal
            handle_data(rawdata[i, 1])
            i += 1
            next
          end
          k = parse_starttag(i)
          break unless k            # tag not complete yet; wait for more input
          i = k
          next
        end
        if rawdata.index(Endtagopen, i) == i
          k = parse_endtag(i)
          break unless k
          i = k
          @literal = false          # any end tag terminates literal mode
          next
        end
        if rawdata.index(Commentopen, i) == i
          if @literal
            handle_data(rawdata[i, 1])
            i += 1
            next
          end
          k = parse_comment(i)      # returns a length, not a position
          break unless k
          i += k
          next
        end
        if rawdata.index(Special, i) == i
          if @literal
            handle_data(rawdata[i, 1])
            i += 1
            next
          end
          k = parse_special(i)      # returns a length, not a position
          break unless k
          i += k
          next
        end
      when '&'
        if rawdata.index(Charref, i) == i
          found = Regexp.last_match
          i += found[0].length
          handle_charref(found[1])
          # The pattern swallows one terminator character; give it back
          # unless it was the ';' that belongs to the reference.
          i -= 1 unless rawdata[i - 1, 1] == ';'
          next
        end
        if rawdata.index(Entityref, i) == i
          found = Regexp.last_match
          i += found[0].length
          handle_entityref(found[1])
          i -= 1 unless rawdata[i - 1, 1] == ';'
          next
        end
      else
        raise RuntimeError, 'neither < nor & ??'
      end
      # We get here only if incomplete matches but
      # nothing else
      match = rawdata.index(Incomplete, i)
      unless match == i
        handle_data(rawdata[i, 1])
        i += 1
        next
      end
      j = match + Regexp.last_match(0).length
      break if j == n # Really incomplete
      handle_data(rawdata[i..(j - 1)])
      i = j
    end
    # end while
    if _end && i < n
      handle_data(rawdata[i..(n - 1)])
      i = n
    end
    @rawdata = rawdata[i..-1]
  end

  # Handles the comment starting at index +i+ of the buffer.
  # Returns the number of characters consumed, or nil when the closing
  # "-->" has not arrived yet. Raises RuntimeError if +i+ is not at "<!--".
  def parse_comment(i)
    rawdata = @rawdata
    if rawdata[i, 4] != '<!--'
      raise RuntimeError, 'unexpected call to handle_comment'
    end
    match = rawdata.index(Commentclose, i)
    return nil unless match
    closer_length = Regexp.last_match(0).length
    # "<!-->" closes inside its own opener; the body is then empty.
    handle_comment(rawdata[(i + 4)..(match - 1)].to_s)
    match + closer_length - i
  end

  # Handles the start tag at index +i+ and dispatches it via #finish_starttag.
  # Returns the index just past the tag, or nil when the tag is incomplete.
  #
  # NOTE: the end of the tag is the first '<' or '>' after +i+, so a quoted
  # attribute value that contains one of those characters ends the tag early.
  def parse_starttag(i)
    rawdata = @rawdata
    j = rawdata.index(Endbracket, i + 1)
    return nil unless j
    attrs = []
    if rawdata[i + 1, 1] == '>'
      # SGML shorthand: <> == <last open tag seen>
      k = j
      tag = @lasttag
    else
      unless rawdata.index(Tagfind, i + 1) == i + 1
        raise RuntimeError, 'unexpected call to parse_starttag'
      end
      name = Regexp.last_match(0)
      k = i + 1 + name.length
      tag = name.downcase
      @lasttag = tag
    end
    while k < j
      # Junk inside the tag (e.g. the '/' of "<br />") is skipped, but an
      # attribute must begin before the closing bracket; otherwise text that
      # follows the tag would be reported as attributes.
      start = rawdata.index(Attrfind, k)
      break unless start && start < j
      found = Regexp.last_match
      attrname, rest, attrvalue = found[1], found[2], found[3]
      if !rest
        attrvalue = '' # was: = attrname
      elsif attrvalue.length >= 2 &&
            ((attrvalue[0, 1] == "'" && attrvalue[-1, 1] == "'") ||
             (attrvalue[0, 1] == '"' && attrvalue[-1, 1] == '"'))
        attrvalue = attrvalue[1..-2]
      end
      attrs << [attrname.downcase, attrvalue]
      k = start + found[0].length
    end
    j += 1 if rawdata[j, 1] == '>'
    finish_starttag(tag, attrs)
    j
  end

  # Handles the end tag at index +i+ and dispatches it via #finish_endtag.
  # Returns the index just past the tag, or nil when the tag is incomplete.
  def parse_endtag(i)
    rawdata = @rawdata
    j = rawdata.index(Endbracket, i + 1)
    return nil unless j
    tag = rawdata[(i + 2)..(j - 1)].strip.downcase
    j += 1 if rawdata[j, 1] == '>'
    finish_endtag(tag)
    j
  end

  # Dispatches a start tag.
  # Returns 1 for a start_TAG handler (element pushed on the stack),
  # 0 for a do_TAG handler, -1 when the tag is unknown.
  def finish_starttag(tag, attrs)
    method = 'start_' + tag
    if respond_to?(method)
      @stack << tag
      handle_starttag(tag, method, attrs)
      return 1
    end
    method = 'do_' + tag
    if respond_to?(method)
      handle_starttag(tag, method, attrs)
      return 0
    end
    unknown_starttag(tag, attrs)
    -1
  end

  # Dispatches an end tag. An empty +tag+ ("</>") closes the innermost open
  # element. Otherwise every element from the top of the stack down to the
  # innermost one named +tag+ is closed, innermost first.
  def finish_endtag(tag)
    if tag == ''
      found = @stack.length - 1
      if found < 0
        unknown_endtag(tag)
        return
      end
    else
      # rindex: close the most recently opened element with this name,
      # not the outermost one.
      found = @stack.rindex(tag)
      unless found
        if respond_to?('end_' + tag)
          report_unbalanced(tag)
        else
          unknown_endtag(tag)
        end
        return
      end
    end
    while @stack.length > found
      open_tag = @stack[-1]
      method = 'end_' + open_tag
      if respond_to?(method)
        handle_endtag(open_tag, method)
      else
        unknown_endtag(open_tag)
      end
      @stack.pop
    end
  end

  # Handles the <! ... > declaration at index +i+.
  # Returns the number of characters consumed, or nil when it is incomplete.
  def parse_special(i)
    rawdata = @rawdata
    match = rawdata.index(Endbracket, i + 1)
    return nil unless match
    bracket_length = Regexp.last_match(0).length
    handle_special(rawdata[(i + 1)..(match - 1)])
    match - i + bracket_length
  end

  # Invokes the start_TAG / do_TAG handler named +method+ with +attrs+.
  def handle_starttag(tag, method, attrs)
    send(method, attrs)
  end

  # Invokes the end_TAG handler named +method+.
  def handle_endtag(tag, method)
    send(method)
  end

  # Called for an end tag that has an end_TAG handler but no matching open
  # element. Prints a diagnostic when the parser was created verbose.
  def report_unbalanced(tag)
    return unless @verbose
    print '*** Unbalanced </' + tag + '>', "\n"
    print '*** Stack:', @stack.inspect, "\n"
  end

  # Handles a numeric character reference; +name+ is the digit string.
  # Values 0..255 are passed to handle_data as a one-character string,
  # anything else goes to unknown_charref.
  def handle_charref(name)
    code = begin
      # Explicit base 10: a leading zero must not select octal
      # ("&#010;" is 10, and "&#08;" is valid).
      Integer(name, 10)
    rescue ArgumentError, TypeError
      nil
    end
    unless code && code >= 0 && code <= 255
      unknown_charref(name)
      return
    end
    # NOTE(review): for 128..255 Integer#chr yields a single binary byte,
    # not a UTF-8 character - confirm what encoding consumers expect.
    handle_data(code.chr)
  end

  # Handles a named entity reference: known names (see Entitydefs) are
  # passed to handle_data as their replacement text, others go to
  # unknown_entityref.
  def handle_entityref(name)
    table = Entitydefs
    if table.include?(name)
      handle_data(table[name])
    else
      unknown_entityref(name)
    end
  end

  # Hooks below do nothing by default; subclasses override what they need.

  def handle_data(data)
  end

  def handle_comment(data)
  end

  def handle_special(data)
  end

  def unknown_starttag(tag, attrs)
  end

  def unknown_endtag(tag)
  end

  def unknown_charref(ref)
  end

  def unknown_entityref(ref)
  end

end