summary refs log tree commit diff
path: root/lib
diff options
context:
space:
mode:
author lnu <lnu@f70e237a-67f3-0310-a06c-d2b8a7116972> 2005-07-20 16:02:15 +0000
committer lnu <lnu@f70e237a-67f3-0310-a06c-d2b8a7116972> 2005-07-20 16:02:15 +0000
commit 8d3b2a0a706ed76527b204e28f97a7fdea6db633 (patch)
tree 830f12b7e57fdee27f99848e69895a835ccb4b55 /lib
parent b497c57ea6afb1e2c93e61be527d0b98ac92e133 (diff)
download feed2imap-8d3b2a0a706ed76527b204e28f97a7fdea6db633.tar.gz
feed2imap-8d3b2a0a706ed76527b204e28f97a7fdea6db633.tar.bz2
feed2imap-8d3b2a0a706ed76527b204e28f97a7fdea6db633.zip
new html2text converter
git-svn-id: svn+ssh://svn.gna.org/svn/feed2imap/trunk/feed2imap@36 f70e237a-67f3-0310-a06c-d2b8a7116972
Diffstat (limited to 'lib')
-rw-r--r-- lib/feed2imap/html2text-parser.rb 97
-rw-r--r-- lib/feed2imap/sgml-parser.rb 333
-rw-r--r-- lib/feed2imap/textconverters.rb 45
3 files changed, 462 insertions, 13 deletions
diff --git a/lib/feed2imap/html2text-parser.rb b/lib/feed2imap/html2text-parser.rb
new file mode 100644
index 0000000..a6bf400
--- /dev/null
+++ b/lib/feed2imap/html2text-parser.rb
@@ -0,0 +1,97 @@
+=begin
+Feed2Imap - RSS/Atom Aggregator uploading to an IMAP Server
+Copyright (c) 2005 Lucas Nussbaum <lucas@lucas-nussbaum.net>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+=end
+
+require 'feed2imap/sgml-parser.rb'
+
+# this class provides a simple SGML parser that removes HTML tags
+class HTML2TextParser < SGMLParser
+
+ attr_reader :savedata
+
+ def initialize(verbose = false)
+ @savedata = ''
+ @pre = false
+ @href = nil
+ @links = []
+ super(verbose)
+ end
+
+ def handle_data(data)
+ # let's remove all CR
+ data.gsub!(/\n/, '') if not @pre
+
+ @savedata << data
+ end
+
+ def unknown_starttag(tag, attrs)
+ case tag
+ when 'p'
+ @savedata << "\n\n"
+ when 'br'
+ @savedata << "\n"
+ when 'b'
+ @savedata << '*'
+ when 'u'
+ @savedata << '_'
+ when 'i'
+ @savedata << '/'
+ when 'pre'
+ @savedata << "\n\n"
+ @pre = true
+ when 'a'
+ # find href in args
+ @href = nil
+ attrs.each do |a|
+ if a[0] == 'href'
+ @href = a[1]
+ end
+ end
+ @links << @href.gsub(/^("|'|)(.*)("|')$/,'\2')
+ end
+ end
+
+ def close
+ super
+ if @links.length > 0
+ @savedata << "\n\n"
+ @links.each_index do |i|
+ @savedata << "[#{i+1}] #{@links[i]}\n"
+ end
+ end
+ end
+
+ def unknown_endtag(tag)
+ case tag
+ when 'b'
+ @savedata << '*'
+ when 'u'
+ @savedata << '_'
+ when 'i'
+ @savedata << '/'
+ when 'pre'
+ @savedata << "\n\n"
+ @pre = false
+ when 'a'
+ if @href
+ @savedata << "[#{@links.length}]"
+ @href = nil
+ end
+ end
+ end
+end
diff --git a/lib/feed2imap/sgml-parser.rb b/lib/feed2imap/sgml-parser.rb
new file mode 100644
index 0000000..c692f52
--- /dev/null
+++ b/lib/feed2imap/sgml-parser.rb
@@ -0,0 +1,333 @@
+# A parser for SGML, using the derived class as static DTD.
+# from http://raa.ruby-lang.org/project/html-parser
+
+class SGMLParser
+
+ # Regular expressions used for parsing:
+ Interesting = /[&<]/
+ Incomplete = Regexp.compile('&([a-zA-Z][a-zA-Z0-9]*|#[0-9]*)?|' +
+ '<([a-zA-Z][^<>]*|/([a-zA-Z][^<>]*)?|' +
+ '![^<>]*)?')
+
+ Entityref = /&([a-zA-Z][-.a-zA-Z0-9]*)[^-.a-zA-Z0-9]/
+ Charref = /&#([0-9]+)[^0-9]/
+
+ Starttagopen = /<[>a-zA-Z]/
+ Endtagopen = /<\/[<>a-zA-Z]/
+ Endbracket = /[<>]/
+ Special = /<![^<>]*>/
+ Commentopen = /<!--/
+ Commentclose = /--[ \t\n]*>/
+ Tagfind = /[a-zA-Z][a-zA-Z0-9.-]*/
+ Attrfind = Regexp.compile('[\s,]*([a-zA-Z_][a-zA-Z_0-9.-]*)' +
+ '(\s*=\s*' +
+ "('[^']*'" +
+ '|"[^"]*"' +
+ '|[-~a-zA-Z0-9,./:+*%?!()_#=]*))?')
+
+ Entitydefs =
+ {'lt'=>'<', 'gt'=>'>', 'amp'=>'&', 'quot'=>'"', 'apos'=>'\''}
+
+ def initialize(verbose=false)
+ @verbose = verbose
+ reset
+ end
+
+ def reset
+ @rawdata = ''
+ @stack = []
+ @lasttag = '???'
+ @nomoretags = false
+ @literal = false
+ end
+
+ def has_context(gi)
+ @stack.include? gi
+ end
+
+ def setnomoretags
+ @nomoretags = true
+ @literal = true
+ end
+
+ def setliteral(*args)
+ @literal = true
+ end
+
+ def feed(data)
+ @rawdata << data
+ goahead(false)
+ end
+
+ def close
+ goahead(true)
+ end
+
+ def goahead(_end)
+ rawdata = @rawdata
+ i = 0
+ n = rawdata.length
+ while i < n
+ if @nomoretags
+ handle_data(rawdata[i..(n-1)])
+ i = n
+ break
+ end
+ j = rawdata.index(Interesting, i)
+ j = n unless j
+ if i < j
+ handle_data(rawdata[i..(j-1)])
+ end
+ i = j
+ break if (i == n)
+ if rawdata[i] == ?< #
+ if rawdata.index(Starttagopen, i) == i
+ if @literal
+ handle_data(rawdata[i, 1])
+ i += 1
+ next
+ end
+ k = parse_starttag(i)
+ break unless k
+ i = k
+ next
+ end
+ if rawdata.index(Endtagopen, i) == i
+ k = parse_endtag(i)
+ break unless k
+ i = k
+ @literal = false
+ next
+ end
+ if rawdata.index(Commentopen, i) == i
+ if @literal
+ handle_data(rawdata[i,1])
+ i += 1
+ next
+ end
+ k = parse_comment(i)
+ break unless k
+ i += k
+ next
+ end
+ if rawdata.index(Special, i) == i
+ if @literal
+ handle_data(rawdata[i, 1])
+ i += 1
+ next
+ end
+ k = parse_special(i)
+ break unless k
+ i += k
+ next
+ end
+ elsif rawdata[i] == ?& #
+ if rawdata.index(Charref, i) == i
+ i += $&.length
+ handle_charref($1)
+ i -= 1 unless rawdata[i-1] == ?;
+ next
+ end
+ if rawdata.index(Entityref, i) == i
+ i += $&.length
+ handle_entityref($1)
+ i -= 1 unless rawdata[i-1] == ?;
+ next
+ end
+ else
+ raise RuntimeError, 'neither < nor & ??'
+ end
+ # We get here only if incomplete matches but
+ # nothing else
+ match = rawdata.index(Incomplete, i)
+ unless match == i
+ handle_data(rawdata[i, 1])
+ i += 1
+ next
+ end
+ j = match + $&.length
+ break if j == n # Really incomplete
+ handle_data(rawdata[i..(j-1)])
+ i = j
+ end
+ # end while
+ if _end and i < n
+ handle_data(@rawdata[i..(n-1)])
+ i = n
+ end
+ @rawdata = rawdata[i..-1]
+ end
+
+ def parse_comment(i)
+ rawdata = @rawdata
+ if rawdata[i, 4] != '<!--'
+ raise RuntimeError, 'unexpected call to handle_comment'
+ end
+ match = rawdata.index(Commentclose, i)
+ return nil unless match
+ matched_length = $&.length
+ j = match
+ handle_comment(rawdata[i+4..(j-1)])
+ j = match + matched_length
+ return j-i
+ end
+
+ def parse_starttag(i)
+ rawdata = @rawdata
+ j = rawdata.index(Endbracket, i + 1)
+ return nil unless j
+ attrs = []
+ if rawdata[i+1] == ?> #
+ # SGML shorthand: <> == <last open tag seen>
+ k = j
+ tag = @lasttag
+ else
+ match = rawdata.index(Tagfind, i + 1)
+ unless match
+ raise RuntimeError, 'unexpected call to parse_starttag'
+ end
+ k = i + 1 + ($&.length)
+ tag = $&.downcase
+ @lasttag = tag
+ end
+ while k < j
+ break unless rawdata.index(Attrfind, k)
+ matched_length = $&.length
+ attrname, rest, attrvalue = $1, $2, $3
+ if not rest
+ attrvalue = '' # was: = attrname
+ elsif (attrvalue[0] == ?' && attrvalue[-1] == ?') or
+ (attrvalue[0] == ?" && attrvalue[-1,1] == ?")
+ attrvalue = attrvalue[1..-2]
+ end
+ attrs << [attrname.downcase, attrvalue]
+ k += matched_length
+ end
+ if rawdata[j] == ?> #
+ j += 1
+ end
+ finish_starttag(tag, attrs)
+ return j
+ end
+
+ def parse_endtag(i)
+ rawdata = @rawdata
+ j = rawdata.index(Endbracket, i + 1)
+ return nil unless j
+ tag = (rawdata[i+2..j-1].strip).downcase
+ if rawdata[j] == ?> #
+ j += 1
+ end
+ finish_endtag(tag)
+ return j
+ end
+
+ def finish_starttag(tag, attrs)
+ method = 'start_' + tag
+ if self.respond_to?(method)
+ @stack << tag
+ handle_starttag(tag, method, attrs)
+ return 1
+ else
+ method = 'do_' + tag
+ if self.respond_to?(method)
+ handle_starttag(tag, method, attrs)
+ return 0
+ else
+ unknown_starttag(tag, attrs)
+ return -1
+ end
+ end
+ end
+
+ def finish_endtag(tag)
+ if tag == ''
+ found = @stack.length - 1
+ if found < 0
+ unknown_endtag(tag)
+ return
+ end
+ else
+ unless @stack.include? tag
+ method = 'end_' + tag
+ unless self.respond_to?(method)
+ unknown_endtag(tag)
+ end
+ return
+ end
+ found = @stack.index(tag) #or @stack.length
+ end
+ while @stack.length > found
+ tag = @stack[-1]
+ method = 'end_' + tag
+ if respond_to?(method)
+ handle_endtag(tag, method)
+ else
+ unknown_endtag(tag)
+ end
+ @stack.pop
+ end
+ end
+
+ def parse_special(i)
+ rawdata = @rawdata
+ match = rawdata.index(Endbracket, i+1)
+ return nil unless match
+ matched_length = $&.length
+ handle_special(rawdata[i+1..(match-1)])
+ return match - i + matched_length
+ end
+
+ def handle_starttag(tag, method, attrs)
+ self.send(method, attrs)
+ end
+
+ def handle_endtag(tag, method)
+ self.send(method)
+ end
+
+ def report_unbalanced(tag)
+ if @verbose
+ print '*** Unbalanced </' + tag + '>', "\n"
+ print '*** Stack:', self.stack, "\n"
+ end
+ end
+
+ def handle_charref(name)
+ n = Integer(name)
+ if !(0 <= n && n <= 255)
+ unknown_charref(name)
+ return
+ end
+ handle_data(n.chr)
+ end
+
+ def handle_entityref(name)
+ table = Entitydefs
+ if table.include?(name)
+ handle_data(table[name])
+ else
+ unknown_entityref(name)
+ return
+ end
+ end
+
+ def handle_data(data)
+ end
+
+ def handle_comment(data)
+ end
+
+ def handle_special(data)
+ end
+
+ def unknown_starttag(tag, attrs)
+ end
+ def unknown_endtag(tag)
+ end
+ def unknown_charref(ref)
+ end
+ def unknown_entityref(ref)
+ end
+
+end
diff --git a/lib/feed2imap/textconverters.rb b/lib/feed2imap/textconverters.rb
index be63173..ba3813a 100644
--- a/lib/feed2imap/textconverters.rb
+++ b/lib/feed2imap/textconverters.rb
@@ -18,6 +18,7 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
=end
require 'uri' # for URI::regexp
+require 'feed2imap/html2text-parser'
# This class provides various converters
class String
@@ -41,19 +42,37 @@ class String
# Convert an HTML text to plain text
def html2text
- text = self.clone
- # let's remove all CR
- text.gsub!(/\n/, '')
- # convert <p> and <br>
- text.gsub!(/\s*<\/p>\s*/, '')
- text.gsub!(/\s*<p(\s[^>]*)?>\s*/, "\n\n")
- text.gsub!(/\s*<br(\s*)\/?(\s*)>\s*/, "\n")
- # remove other tags
- text.gsub!(/<[^>]*>/, '')
- # remove leading and trailing whilespace
- text.gsub!(/\A\s*/m, '')
- text.gsub!(/\s*\Z/m, '')
- text
+ if false # previous regexp-based converter: this branch is never executed
+ text = self.clone
+ # let's remove all newlines
+ text.gsub!(/\n/, '')
+ # convert <p> and <br>
+ text.gsub!(/\s*<\/p>\s*/, '')
+ text.gsub!(/\s*<p(\s[^>]*)?>\s*/, "\n\n")
+ text.gsub!(/\s*<br(\s*)\/?(\s*)>\s*/, "\n")
+ # remove other tags
+ text.gsub!(/<[^>]*>/, '')
+ # remove leading and trailing whitespace
+ text.gsub!(/\A\s*/m, '')
+ text.gsub!(/\s*\Z/m, '')
+ text
+ else
+ text = self.clone
+ # parse HTML with HTML2TextParser; close flushes the parser and appends the list of links
+ p = HTML2TextParser::new(true) # the argument is the parser's verbose flag
+ p.feed(text)
+ p.close
+ text = p.savedata
+ # remove leading and trailing whitespace
+ text.gsub!(/\A\s*/m, '')
+ text.gsub!(/\s*\Z/m, '')
+ # remove spaces around \n
+ text.gsub!(/ *\n/m, "\n")
+ text.gsub!(/\n */m, "\n")
+ # and collapse runs of two or more \n into a single blank line
+ text.gsub!(/\n\n+/m, "\n\n")
+ text
+ end
end
# Remove white space around the text