diff options
author | lnu <lnu@f70e237a-67f3-0310-a06c-d2b8a7116972> | 2005-07-20 16:02:15 +0000 |
---|---|---|
committer | lnu <lnu@f70e237a-67f3-0310-a06c-d2b8a7116972> | 2005-07-20 16:02:15 +0000 |
commit | 8d3b2a0a706ed76527b204e28f97a7fdea6db633 (patch) | |
tree | 830f12b7e57fdee27f99848e69895a835ccb4b55 /lib/feed2imap | |
parent | b497c57ea6afb1e2c93e61be527d0b98ac92e133 (diff) | |
download | feed2imap-8d3b2a0a706ed76527b204e28f97a7fdea6db633.tar.gz feed2imap-8d3b2a0a706ed76527b204e28f97a7fdea6db633.tar.bz2 feed2imap-8d3b2a0a706ed76527b204e28f97a7fdea6db633.zip |
new html2text converter
git-svn-id: svn+ssh://svn.gna.org/svn/feed2imap/trunk/feed2imap@36 f70e237a-67f3-0310-a06c-d2b8a7116972
Diffstat (limited to 'lib/feed2imap')
-rw-r--r-- | lib/feed2imap/html2text-parser.rb | 97 | ||||
-rw-r--r-- | lib/feed2imap/sgml-parser.rb | 333 | ||||
-rw-r--r-- | lib/feed2imap/textconverters.rb | 45 |
3 files changed, 462 insertions, 13 deletions
# =============================================================================
# Source listing for the commit "new html2text converter", reconstructed from
# the diff: one section per file touched by the commit.
# sgml-parser.rb is listed first because HTML2TextParser (defined in
# html2text-parser.rb) inherits from SGMLParser.
# =============================================================================

# -----------------------------------------------------------------------------
# lib/feed2imap/sgml-parser.rb (new file)
# -----------------------------------------------------------------------------

# A parser for SGML, using the derived class as static DTD.
# from http://raa.ruby-lang.org/project/html-parser
#
# Input is pushed in with #feed (any number of times) and flushed with #close.
# For every start tag <x> the parser calls, in order of preference,
# start_x(attrs) (the tag is then pushed on the open-element stack),
# do_x(attrs), or unknown_starttag(tag, attrs). End tags go to end_x or
# unknown_endtag(tag). Text, comments, declarations and references go to the
# handle_* / unknown_* hooks, which do nothing here and are meant to be
# overridden by subclasses.
class SGMLParser

  # Regular expressions used for parsing:
  Interesting = /[&<]/
  Incomplete = Regexp.compile('&([a-zA-Z][a-zA-Z0-9]*|#[0-9]*)?|' +
                              '<([a-zA-Z][^<>]*|/([a-zA-Z][^<>]*)?|' +
                              '![^<>]*)?')

  Entityref = /&([a-zA-Z][-.a-zA-Z0-9]*)[^-.a-zA-Z0-9]/
  Charref = /&#([0-9]+)[^0-9]/

  Starttagopen = /<[>a-zA-Z]/
  Endtagopen = /<\/[<>a-zA-Z]/
  Endbracket = /[<>]/
  Special = /<![^<>]*>/
  Commentopen = /<!--/
  Commentclose = /--[ \t\n]*>/
  Tagfind = /[a-zA-Z][a-zA-Z0-9.-]*/
  Attrfind = Regexp.compile('[\s,]*([a-zA-Z_][a-zA-Z_0-9.-]*)' +
                            '(\s*=\s*' +
                            "('[^']*'" +
                            '|"[^"]*"' +
                            '|[-~a-zA-Z0-9,./:+*%?!()_#=]*))?')

  # Entity names resolved by #handle_entityref; everything else is passed to
  # #unknown_entityref.
  Entitydefs =
    {'lt'=>'<', 'gt'=>'>', 'amp'=>'&', 'quot'=>'"', 'apos'=>'\''}

  # verbose:: when true, #report_unbalanced prints diagnostics.
  def initialize(verbose=false)
    @verbose = verbose
    reset
  end

  # Discard buffered input and return to the initial parsing state.
  def reset
    @rawdata = ''
    @stack = []
    @lasttag = '???'
    @nomoretags = false
    @literal = false
  end

  # True when an element named +gi+ is on the open-element stack.
  def has_context(gi)
    @stack.include? gi
  end

  # Treat all remaining input as plain data.
  def setnomoretags
    @nomoretags = true
    @literal = true
  end

  # Enter literal mode: start tags, comments and declarations are passed
  # through as data until the next end tag is parsed.
  def setliteral(*args)
    @literal = true
  end

  # Append +data+ to the input buffer and parse as much of it as is complete.
  def feed(data)
    @rawdata << data
    goahead(false)
  end

  # Parse whatever is left in the buffer, treating it as final.
  def close
    goahead(true)
  end

  # Main loop. Consumes @rawdata and dispatches to the parse_* / handle_*
  # methods. A construct that is still incomplete is left in @rawdata for the
  # next #feed, unless +_end+ is true, in which case the remainder is emitted
  # as data.
  def goahead(_end)
    rawdata = @rawdata
    i = 0
    n = rawdata.length
    while i < n
      if @nomoretags
        handle_data(rawdata[i..(n - 1)])
        i = n
        break
      end
      # Everything up to the next '&' or '<' is plain data.
      j = rawdata.index(Interesting, i) || n
      handle_data(rawdata[i..(j - 1)]) if i < j
      i = j
      break if i == n

      case rawdata[i, 1]
      when '<'
        if rawdata.index(Starttagopen, i) == i
          if @literal
            handle_data(rawdata[i, 1])
            i += 1
            next
          end
          k = parse_starttag(i)
          break unless k
          i = k # parse_starttag returns an absolute position
          next
        end
        if rawdata.index(Endtagopen, i) == i
          k = parse_endtag(i)
          break unless k
          i = k # parse_endtag returns an absolute position
          @literal = false
          next
        end
        if rawdata.index(Commentopen, i) == i
          if @literal
            handle_data(rawdata[i, 1])
            i += 1
            next
          end
          k = parse_comment(i)
          break unless k
          i += k # parse_comment returns a length
          next
        end
        if rawdata.index(Special, i) == i
          if @literal
            handle_data(rawdata[i, 1])
            i += 1
            next
          end
          k = parse_special(i)
          break unless k
          i += k # parse_special returns a length
          next
        end
      when '&'
        if rawdata.index(Charref, i) == i
          ref = Regexp.last_match
          i += ref[0].length
          handle_charref(ref[1])
          # The pattern swallows one terminating character; give it back
          # unless it was the ';' that belongs to the reference.
          i -= 1 unless rawdata[i - 1, 1] == ';'
          next
        end
        if rawdata.index(Entityref, i) == i
          ref = Regexp.last_match
          i += ref[0].length
          handle_entityref(ref[1])
          i -= 1 unless rawdata[i - 1, 1] == ';'
          next
        end
      else
        raise RuntimeError, 'neither < nor & ??'
      end

      # We get here only if incomplete matches but
      # nothing else
      match = rawdata.index(Incomplete, i)
      unless match == i
        handle_data(rawdata[i, 1])
        i += 1
        next
      end
      j = match + Regexp.last_match[0].length
      break if j == n # Really incomplete
      handle_data(rawdata[i..(j - 1)])
      i = j
    end
    if _end && i < n
      handle_data(rawdata[i..(n - 1)])
      i = n
    end
    @rawdata = rawdata[i..-1]
  end

  # Parse the comment starting at +i+ and pass its text to #handle_comment.
  # Returns the length of the comment, or nil if its end is not buffered yet.
  def parse_comment(i)
    rawdata = @rawdata
    if rawdata[i, 4] != '<!--'
      raise RuntimeError, 'unexpected call to handle_comment'
    end
    close_at = rawdata.index(Commentclose, i)
    return nil unless close_at
    close_length = Regexp.last_match[0].length
    handle_comment(rawdata[(i + 4)..(close_at - 1)])
    close_at + close_length - i
  end

  # Parse the start tag at +i+, collect its attributes as
  # [lowercased name, value] pairs and dispatch through #finish_starttag.
  # Returns the position just after the tag, or nil if the tag is incomplete.
  def parse_starttag(i)
    rawdata = @rawdata
    j = rawdata.index(Endbracket, i + 1)
    return nil unless j
    attrs = []
    if rawdata[i + 1, 1] == '>'
      # SGML shorthand: <> == <last open tag seen>
      k = j
      tag = @lasttag
    else
      unless rawdata.index(Tagfind, i + 1)
        raise RuntimeError, 'unexpected call to parse_starttag'
      end
      name = Regexp.last_match[0]
      k = i + 1 + name.length
      tag = name.downcase
      @lasttag = tag
    end
    while k < j
      found = rawdata.index(Attrfind, k)
      # String#index is not anchored at k: a match that begins at or after the
      # closing bracket belongs to the text following the tag, not to the tag.
      break unless found && found < j
      attr = Regexp.last_match
      attrname, rest, attrvalue = attr[1], attr[2], attr[3]
      if !rest
        attrvalue = '' # was: = attrname
      else
        # Strip one pair of matching quotes. Substrings are compared (rather
        # than ?x character literals) so the test behaves the same on every
        # Ruby version.
        quote = attrvalue[0, 1]
        if (quote == "'" || quote == '"') &&
           attrvalue.length >= 2 && attrvalue[-1, 1] == quote
          attrvalue = attrvalue[1..-2]
        end
      end
      attrs << [attrname.downcase, attrvalue]
      # Continue after the match itself; the match may have started past k
      # when unparsable characters were skipped.
      k = found + attr[0].length
    end
    j += 1 if rawdata[j, 1] == '>'
    finish_starttag(tag, attrs)
    j
  end

  # Parse the end tag at +i+ and dispatch through #finish_endtag.
  # Returns the position just after the tag, or nil if the tag is incomplete.
  def parse_endtag(i)
    rawdata = @rawdata
    j = rawdata.index(Endbracket, i + 1)
    return nil unless j
    tag = rawdata[(i + 2)..(j - 1)].strip.downcase
    j += 1 if rawdata[j, 1] == '>'
    finish_endtag(tag)
    j
  end

  # Dispatch a start tag.
  # Returns 1 if start_<tag> handled it (the tag is pushed on the stack),
  # 0 if do_<tag> handled it, -1 if it went to #unknown_starttag.
  def finish_starttag(tag, attrs)
    method = 'start_' + tag
    if respond_to?(method)
      @stack << tag
      handle_starttag(tag, method, attrs)
      return 1
    end
    method = 'do_' + tag
    if respond_to?(method)
      handle_starttag(tag, method, attrs)
      return 0
    end
    unknown_starttag(tag, attrs)
    -1
  end

  # Dispatch an end tag. An empty +tag+ (</>) closes the innermost open
  # element. A named tag closes the innermost open element of that name and
  # every element opened inside it.
  def finish_endtag(tag)
    if tag == ''
      found = @stack.length - 1
      if found < 0
        unknown_endtag(tag)
        return
      end
    else
      unless @stack.include?(tag)
        unknown_endtag(tag) unless respond_to?('end_' + tag)
        return
      end
      # rindex, not index: with nested elements of the same name, one end tag
      # must close only the innermost one.
      found = @stack.rindex(tag)
    end
    while @stack.length > found
      open_tag = @stack[-1]
      method = 'end_' + open_tag
      if respond_to?(method)
        handle_endtag(open_tag, method)
      else
        unknown_endtag(open_tag)
      end
      @stack.pop
    end
  end

  # Parse the <!...> declaration at +i+ and pass its content to
  # #handle_special. Returns the length of the declaration, or nil if it is
  # incomplete.
  def parse_special(i)
    rawdata = @rawdata
    close_at = rawdata.index(Endbracket, i + 1)
    return nil unless close_at
    close_length = Regexp.last_match[0].length
    handle_special(rawdata[(i + 1)..(close_at - 1)])
    close_at - i + close_length
  end

  # Invoke the start_<tag> / do_<tag> handler named by +method+.
  def handle_starttag(tag, method, attrs)
    self.send(method, attrs)
  end

  # Invoke the end_<tag> handler named by +method+.
  def handle_endtag(tag, method)
    self.send(method)
  end

  # In verbose mode, print a diagnostic about an end tag that has no matching
  # open element, together with the current open-element stack.
  def report_unbalanced(tag)
    if @verbose
      print '*** Unbalanced </' + tag + '>', "\n"
      # There is no #stack accessor, so read the instance variable directly.
      print '*** Stack:', @stack.inspect, "\n"
    end
  end

  # Handle a numeric character reference; +name+ is its run of decimal digits.
  # Codes 0..255 are emitted as a one-character string through #handle_data,
  # anything else goes to #unknown_charref.
  # NOTE(review): for 128..255 the encoding of n.chr depends on the Ruby
  # version - confirm it is compatible with the data handed to #feed.
  def handle_charref(name)
    # String#to_i always reads base 10. Integer("065") would parse the leading
    # zero as an octal prefix and Integer("09") would raise.
    n = name.to_i
    if n < 0 || n > 255
      unknown_charref(name)
      return
    end
    handle_data(n.chr)
  end

  # Handle a named entity reference: names listed in Entitydefs are emitted
  # through #handle_data, all others go to #unknown_entityref.
  def handle_entityref(name)
    if Entitydefs.key?(name)
      handle_data(Entitydefs[name])
    else
      unknown_entityref(name)
    end
  end

  # Hooks for subclasses; the defaults ignore their input.

  def handle_data(data)
  end

  def handle_comment(data)
  end

  def handle_special(data)
  end

  def unknown_starttag(tag, attrs)
  end

  def unknown_endtag(tag)
  end

  def unknown_charref(ref)
  end

  def unknown_entityref(ref)
  end

end

# -----------------------------------------------------------------------------
# lib/feed2imap/html2text-parser.rb (new file)
# -----------------------------------------------------------------------------

=begin
Feed2Imap - RSS/Atom Aggregator uploading to an IMAP Server
Copyright (c) 2005 Lucas Nussbaum <lucas@lucas-nussbaum.net>

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
=end

# The guard only matters when SGMLParser is already defined (as in this
# combined listing); otherwise the file is required as usual.
require 'feed2imap/sgml-parser.rb' unless defined?(SGMLParser)

# this class provides a simple SGML parser that removes HTML tags
#
# The converted text accumulates in #savedata:
# * <p> and <pre> start a new paragraph, <br> a new line;
# * <b>, <u> and <i> content is wrapped in *, _ and / respectively;
# * every <a href=...> is numbered; its text is followed by "[n]" and #close
#   appends the list of "[n] url" lines.
class HTML2TextParser < SGMLParser

  # Text produced so far.
  attr_reader :savedata

  def initialize(verbose = false)
    @savedata = ''
    @pre = false  # true while inside <pre>
    @href = nil   # href of the <a> element currently open, if it has one
    @links = []   # every href seen, in document order
    super(verbose)
  end

  # Append character data. Outside <pre> a line break is just whitespace, so
  # it becomes a single space; deleting it would glue together the words on
  # either side. A copy is modified, never the caller's string.
  def handle_data(data)
    data = data.gsub(/\r?\n/, ' ') unless @pre

    @savedata << data
  end

  def unknown_starttag(tag, attrs)
    case tag
    when 'p'
      @savedata << "\n\n"
    when 'br'
      @savedata << "\n"
    when 'b'
      @savedata << '*'
    when 'u'
      @savedata << '_'
    when 'i'
      @savedata << '/'
    when 'pre'
      @savedata << "\n\n"
      @pre = true
    when 'a'
      # find href in attrs (the last one wins)
      @href = nil
      attrs.each do |name, value|
        @href = value if name == 'href'
      end
      # An <a> without href (e.g. a named anchor) is not a link: record
      # nothing, and leave @href nil so that no "[n]" marker is emitted.
      if @href
        # drop one pair of surrounding quotes, should any be left
        @links << @href.sub(/\A(["'])(.*)\1\z/m, '\2')
      end
    end
  end

  # Finish parsing, then append the numbered list of link targets.
  def close
    super
    unless @links.empty?
      @savedata << "\n\n"
      @links.each_with_index do |link, i|
        @savedata << "[#{i + 1}] #{link}\n"
      end
    end
  end

  def unknown_endtag(tag)
    case tag
    when 'b'
      @savedata << '*'
    when 'u'
      @savedata << '_'
    when 'i'
      @savedata << '/'
    when 'pre'
      @savedata << "\n\n"
      @pre = false
    when 'a'
      if @href
        # the number of the link recorded by the matching start tag
        @savedata << "[#{@links.length}]"
        @href = nil
      end
    end
  end
end

# -----------------------------------------------------------------------------
# lib/feed2imap/textconverters.rb (modified; only the changed hunks are shown)
# -----------------------------------------------------------------------------

require 'uri' # for URI::regexp
# Guarded for the same reason as the sgml-parser require above.
require 'feed2imap/html2text-parser' unless defined?(HTML2TextParser)

# This class provides various converters
class String

  # Convert an HTML text to plain text
  #
  # The receiver is parsed with HTML2TextParser (this replaces the earlier
  # regexp-based tag stripping). The parser output is then tidied: surrounding
  # whitespace is removed, spaces next to line breaks are dropped and runs of
  # blank lines are collapsed to one. Returns a new String; the receiver is
  # not modified.
  def html2text
    # parse HTML
    parser = HTML2TextParser.new(true)
    parser.feed(self)
    parser.close
    # remove leading and trailing whitespace
    text = parser.savedata.sub(/\A\s+/, '').sub(/\s+\z/, '')
    # remove whitespace around \n
    text.gsub!(/ *\n/, "\n")
    text.gsub!(/\n */, "\n")
    # and duplicates \n
    text.gsub!(/\n\n+/, "\n\n")
    text
  end

  # Remove white space around the text
  # (the method introduced by this comment lies past the end of the diff hunk)
end