5 files changed, 91 insertions, 493 deletions
diff --git a/lib/feed2imap/channel.rb b/lib/feed2imap/channel.rb
deleted file mode 100644
index c43c254..0000000
--- a/lib/feed2imap/channel.rb
+++ /dev/null
@@ -1,334 +0,0 @@
-=begin
-Feed2Imap - RSS/Atom Aggregator uploading to an IMAP Server
-Copyright (c) 2005 Lucas Nussbaum <lucas@lucas-nussbaum.net>
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License
-along with this program; if not, write to the Free Software
-Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
-=end
-
-# This class allows to retrieve a feed and parse it into a Channel
-
-require 'rexml/document'
-require 'time'
-require 'rmail'
-require 'feed2imap/textconverters'
-require 'feed2imap/rubymail_patch'
-require 'feed2imap/rexml_patch'
-require 'base64'
-
-class UnknownFeedTypeException < RuntimeError
-end
-# an RSS/Atom channel
-class Channel
-  attr_reader :title, :link, :description, :creator, :encoding, :items
-
-  # parse str to build a channel
-  def initialize(str = nil)
-    parse_str(str) if str
-  end
-
-  # Determines all the fields using a string containing an
-  # XML document
-  def parse_str(str)
-    # Dirty hack: some feeds contain the & char. It must be changed to &amp;
-    str.gsub!(/&(\s+)/, '&amp;\1')
-    doc = REXML::Document.new(str)
-    # get channel info
-    @encoding = doc.encoding
-    @title,@link,@description,@creator = nil
-    @items = []
-    if doc.root.elements['channel'] || doc.root.elements['rss:channel']
-      # We have a RSS feed!
-      # Title
-      if (e = doc.root.elements['channel/title'] ||
-        doc.root.elements['rss:channel/rss:title']) && e.text
-        @title = e.text.toUTF8(@encoding).rmWhiteSpace!
-      end
-      # Link
-      if (e = doc.root.elements['channel/link'] ||
-          doc.root.elements['rss:channel/rss:link']) && e.text
-        @link = e.text.rmWhiteSpace!
-      end
-      # Description
-      if (e = doc.root.elements['channel/description'] || 
-          doc.root.elements['rss:channel/rss:description']) && e.text
-        @description = e.text.toUTF8(@encoding).rmWhiteSpace!
-      end
-      # Creator
-      if ((e = doc.root.elements['channel/dc:creator']) && e.text) ||
-          ((e = doc.root.elements['channel/author'] ||
-          doc.root.elements['rss:channel/rss:author']) && e.text)
-        @creator = e.text.toUTF8(@encoding).rmWhiteSpace!
-      end
-      # Items
-      if doc.root.elements['channel/item']
-        query = 'channel/item'
-      elsif doc.root.elements['item']
-        query = 'item'
-      elsif doc.root.elements['rss:channel/rss:item']
-        query = 'rss:channel/rss:item'
-      else
-        query = 'rss:item'
-      end
-      doc.root.each_element(query) { |e| @items << Item::new(e, self) }
-
-    elsif doc.root.elements['/feed']
-      # We have an ATOM feed!
-      # Title
-      if (e = doc.root.elements['/feed/title']) && e.text
-        @title = e.text.toUTF8(@encoding).rmWhiteSpace!
-      end
-      # Link
-      doc.root.each_element('/feed/link') do |e|
-        if e.attribute('type') and (
-            e.attribute('type').value == 'text/html' or
-            e.attribute('type').value == 'application/xhtml' or
-            e.attribute('type').value == 'application/xhtml+xml')
-          if (h = e.attribute('href')) && h
-            @link = h.value.rmWhiteSpace!
-          end
-        end
-      end
-      # Description
-      if e = doc.root.elements['/feed/info']
-        @description = e.elements.to_s.toUTF8(@encoding).rmWhiteSpace!
-      end
-      # Items
-      doc.root.each_element('/feed/entry') do |e|
-         @items << AtomItem::new(e, self)
-      end
-    else
-      raise UnknownFeedTypeException::new
-    end
-  end
-
-  def to_s
-    s = "Title: #{@title}\nLink: #{@link}\n\n"
-    @items.each { |i| s += i.to_s }
-    s
-  end
-end
-
-# an Item from a channel
-class Item
-  attr_accessor :title, :link, :content, :date, :creator, :subject,
-                :category, :cacheditem
-  attr_reader :channel
-  def initialize(item = nil, channel = nil)
-    @channel = channel
-    @title, @link, @content, @date, @creator, @subject, @category = nil
-    if item
-      # Title
-      if ((e = item.elements['title'] || item.elements['rss:title']) &&
-          e.text)  ||
-          ((e = item.elements['pubDate'] || item.elements['rss:pubDate']) &&
-           e.text)
-        @title = e.text.toUTF8(@channel.encoding).rmWhiteSpace!
-      end
-      # Link
-      if ((e = item.elements['link'] || item.elements['rss:link']) && e.text)||
-          (e = item.elements['guid'] || item.elements['rss:guid'] and
-          not (e.attribute('isPermaLink') and
-          e.attribute('isPermaLink').value == 'false'))
-        @link = e.text.rmWhiteSpace!
-      end
-      # Content
-      if (e = item.elements['content:encoded']) ||
-        (e = item.elements['description'] || item.elements['rss:description'])
-        if e.children.length > 1
-          s = ''
-          e.children.each { |c| s += c.to_s }
-          @content = s.toUTF8(@channel.encoding).rmWhiteSpace!.text2html
-        elsif e.children.length == 1
-          if e.cdatas[0]
-            @content = e.cdatas[0].to_s.toUTF8(@channel.encoding).rmWhiteSpace!
-          elsif e.text
-            @content = e.text.toUTF8(@channel.encoding).text2html
-          end
-        end
-      end
-      # Date
-      if e = item.elements['dc:date'] || item.elements['pubDate'] || 
-          item.elements['rss:pubDate']
-        begin
-          @date = Time::xmlschema(e.text)
-        rescue
-          begin
-            @date = Time::rfc2822(e.text)
-          rescue
-            begin
-              @date = Time::parse(e.text)
-            rescue
-              @date = nil
-            end
-          end
-        end
-      end
-      # Creator
-      @creator = @channel.creator
-      if (e = item.elements['dc:creator'] || item.elements['author'] ||
-          item.elements['rss:author']) && e.text
-        @creator = e.text.toUTF8(@channel.encoding).rmWhiteSpace!
-      end
-      # Subject
-      if (e = item.elements['dc:subject']) && e.text
-        @subject = e.text.toUTF8(@channel.encoding).rmWhiteSpace!
-      end
-      # Category
-      if (e = item.elements['dc:category'] || item.elements['category'] ||
-          item.elements['rss:category']) && e.text
-        @category = e.text.toUTF8(@channel.encoding).rmWhiteSpace!
-      end
-    end
-  end
-
-  def to_s
-    "--------------------------------\n" +
-      "Title: #{@title}\nLink: #{@link}\n" +
-      "Date: #{@date.to_s}\nCreator: #{@creator}\n" +
-      "Subject: #{@subject}\nCategory: #{@category}\nContent:\n#{content}\n"
-  end
-
-  def to_text
-    s = ""
-    s += "Channel: "
-    s += @channel.title.toISO_8859_1('utf-8') + ' ' if @channel.title
-    s += "<#{@channel.link.toISO_8859_1('utf-8')}>" if @channel.link
-    s += "\n"
-    s += "Item: "
-    s += @title.toISO_8859_1('utf-8') + ' ' if @title
-    s += "<#{@link.toISO_8859_1('utf-8')}>" if @link
-    s += "\n"
-    s += "\nDate: #{@date.to_s.toISO_8859_1('utf-8')}" if @date # TODO improve date rendering ?
-    s += "\nAuthor: #{@creator.toISO_8859_1('utf-8')}" if @creator
-
-    s += "\nSubject: #{@subject.toISO_8859_1('utf-8')}" if @subject
-    s += "\nCategory: #{@category.toISO_8859_1('utf-8')}" if @category
-    s += "\n\n"
-    s += "#{@content.html2text.toISO_8859_1('utf-8')}" if @content
-    s
-  end
-
-  def to_html
-    s = '<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">'
-    s += '<html>'
-    s += '<body>'
-    s += "<p>Channel: "
-    s += "<a href=\"#{@channel.link}\">" if @channel.link
-    s += @channel.title if @channel.title
-    s += "</a>" if @channel.link
-    s += "<br/>\nItem: "
-    s += "<a href=\"#{@link}\">" if @link
-    s += @title if @title
-    s += "</a>" if @link
-    s += "\n"
-    s += "<br/>Date: #{@date.to_s}" if @date # TODO improve date rendering ?
-    s += "<br/>Author: #{@creator}" if @creator
-    s += "<br/>Subject: #{@subject}" if @subject
-    s += "<br/>Category: #{@category}" if @category
-    s += "</p>"
-    s += "<p>#{@content}</p>" if @content
-    s += '</body></html>'
-    s
-  end
-
-  def to_mail(from = 'Feed2Imap')
-    message = RMail::Message::new
-    message.header['From'] = "#{from} <feed2imap@feed2imap.net>"
-    message.header['To'] = "#{from} <feed2imap@feed2imap.net>"
-    if @date.nil?
-      message.header['Date'] = Time::new.rfc2822
-    else
-      message.header['Date'] = @date.rfc2822
-    end
-    message.header['X-Feed2Imap-Version'] = F2I_VERSION if defined?(F2I_VERSION)
-    message.header['X-CacheIndex'] = "-#{@cacheditem.index}-"
-    message.header['X-F2IStatus'] = "Updated" if @cacheditem.updated
-    # treat subject. Might need MIME encoding.
-    subj = @title or (@date and @date.to_s) or @link
-    if subj
-      if subj.needMIME
-        message.header['Subject'] = "=?utf-8?b?#{Base64::encode64(subj).gsub("\n",'')}?="
-      else
-        message.header['Subject'] = subj 
-      end
-    end
-    textpart = RMail::Message::new
-    textpart.header['Content-Type'] = 'text/plain; charset=iso-8859-1'
-    textpart.header['Content-Transfer-Encoding'] = '7bit'
-    textpart.body = to_text
-    htmlpart = RMail::Message::new
-    htmlpart.header['Content-Type'] = 'text/html; charset=utf-8'
-    htmlpart.header['Content-Transfer-Encoding'] = '7bit'
-    htmlpart.body = to_html
-    message.add_part(textpart)
-    message.add_part(htmlpart)
-    return message.to_s
-  end
-end
-
-class AtomItem < Item
-  def initialize(item = nil, channel = nil)
-    @channel = channel
-    @title, @link, @content, @date, @creator, @subject, @category = nil
-    if item
-      # Title
-      if (e = item.elements['title']) && e.text
-        @title = e.text.toUTF8(@channel.encoding).rmWhiteSpace!
-      end
-      # Link
-      item.each_element('link') do |e|
-        if e.attribute('type').value == 'text/html' or
-          e.attribute('type').value == 'application/xhtml' or
-          e.attribute('type').value == 'application/xhtml+xml'
-          if (h = e.attribute('href')) && h.value
-            @link = h.value
-          end
-        end
-      end
-      # Content
-      if e = item.elements['content'] || item.elements['summary']
-        if (e.attribute('mode') and e.attribute('mode').value == 'escaped') &&
-          e.text
-          @content = e.text.toUTF8(@channel.encoding).rmWhiteSpace!
-        else
-          # go one step deeper in the recursion if possible
-          e = e.elements['div'] || e
-          @content = e.to_s.toUTF8(@channel.encoding).rmWhiteSpace!
-        end
-      end
-      # Date
-      if (e = item.elements['issued'] || e = item.elements['created']) && e.text
-        begin
-          @date = Time::xmlschema(e.text)
-        rescue
-          begin
-            @date = Time::rfc2822(e.text)
-          rescue
-            begin
-              @date = Time::parse(e.text)
-            rescue
-              @date = nil
-            end
-          end
-        end
-      end
-      # Creator
-      @creator = @channel.creator
-      if (e = item.elements['author/name']) && e.text
-        @creator = e.text.toUTF8(@channel.encoding).rmWhiteSpace!
-      end
-    end
-  end
-end
diff --git a/lib/feed2imap/feed2imap.rb b/lib/feed2imap/feed2imap.rb
index e1ad3cd..250a0e7 100644
--- a/lib/feed2imap/feed2imap.rb
+++ b/lib/feed2imap/feed2imap.rb
@@ -18,14 +18,15 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 =end
 
 # Feed2Imap version
-F2I_VERSION = '0.5'
+F2I_VERSION = '0.5.1'
 
 require 'feed2imap/config'
 require 'feed2imap/cache'
-require 'feed2imap/channel'
 require 'feed2imap/httpfetcher'
 require 'logger'
 require 'thread'
+require 'feedparser'
+require 'feed2imap/itemtomail'
 
 class Feed2Imap
   def Feed2Imap.version
@@ -119,13 +120,13 @@ class Feed2Imap
     @config.feeds.each do |f|
       next if f.body.nil? # means 304
       begin
-        channel = Channel::new(f.body)
+        feed = FeedParser::Feed::new(f.body)
       rescue Exception => e
         @logger.fatal("Error while parsing #{f.name}: #{e}")
         next
       end
       begin
-        newitems, updateditems = @cache.get_new_items(f.name, channel.items)
+        newitems, updateditems = @cache.get_new_items(f.name, feed.items)
       rescue
         @logger.fatal("Exception caught when selecting new items for #{f.name}: #{$!}")
         puts $!.backtrace
@@ -134,8 +135,14 @@ class Feed2Imap
       @logger.info("#{f.name}: #{newitems.length} new items, #{updateditems.length} updated items.") if newitems.length > 0 or updateditems.length > 0
       begin
         if !cacherebuild
-          updateditems.each { |i| f.imapaccount.updatemail(f.folder, i.to_mail(f.name), i.cacheditem.index) }
-          newitems.each { |i| f.imapaccount.putmail(f.folder, i.to_mail(f.name)) }
+          updateditems.each do |i|
+            email = item_to_mail(i, i.cacheditem.index, true, f.name)
+            f.imapaccount.updatemail(f.folder, email, i.cacheditem.index)
+          end
+          newitems.each do |i|
+            email = item_to_mail(i, i.cacheditem.index, false, f.name)
+            f.imapaccount.putmail(f.folder, email)
+          end
         end
       rescue
         @logger.fatal("Exception caught while uploading mail to #{f.folder}: #{$!}")
diff --git a/lib/feed2imap/itemtomail.rb b/lib/feed2imap/itemtomail.rb
new file mode 100644
index 0000000..9d1cbad
--- /dev/null
+++ b/lib/feed2imap/itemtomail.rb
@@ -0,0 +1,77 @@
+=begin
+Feed2Imap - RSS/Atom Aggregator uploading to an IMAP Server
+Copyright (c) 2005 Lucas Nussbaum <lucas@lucas-nussbaum.net>
+
+This file converts a parsed feed item into an e-mail message (see item_to_mail).
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+=end
+
+require 'rexml/document'
+require 'time'
+require 'rmail'
+require 'feedparser'
+require 'feedparser/text-output'
+require 'feedparser/html-output'
+require 'base64'
+require 'feed2imap/rubymail_patch'
+
+class String
+  # Reports whether this string holds any character outside 7-bit ASCII.
+  # Callers use the answer to decide if a header value must be MIME-encoded.
+  #
+  # The string is decoded as UTF-8 code points; the result is true as
+  # soon as one code point above 127 is found, false otherwise
+  # (including for the empty string).
+  def needMIME
+    codepoints = unpack('U*')
+    codepoints.any? { |cp| cp > 127 }
+  end
+end
+
+# Builds a mail with a text part and an HTML part for a feed item and returns message.to_s.
+# item:    feed item; must respond to date, title, link, to_text and to_html.
+# index:   cache index of the item, written to the X-CacheIndex header as "-index-".
+# updated: when true, the mail carries the header "X-F2IStatus: Updated".
+# from:    display name used in both the From and To headers.
+def item_to_mail(item, index, updated, from = 'Feed2Imap')
+  message = RMail::Message::new
+  message.header['From'] = "#{from} <feed2imap@feed2imap.net>"
+  message.header['To'] = "#{from} <feed2imap@feed2imap.net>"
+  # Use the item's own date; fall back to the current time when it has none.
+  message.header['Date'] = (item.date || Time::new).rfc2822
+  message.header['X-Feed2Imap-Version'] = F2I_VERSION if defined?(F2I_VERSION)
+  message.header['X-CacheIndex'] = "-#{index}-"
+  message.header['X-F2IStatus'] = "Updated" if updated
+  # Subject: title, else date, else link. `||` is required here: with `or`,
+  # the assignment binds first and the fallbacks are never used.
+  subj = item.title || (item.date && item.date.to_s) || item.link
+  if subj
+    subj = "=?utf-8?b?#{Base64::encode64(subj).gsub("\n",'')}?=" if subj.needMIME
+    message.header['Subject'] = subj
+  end
+  textpart = RMail::Message::new
+  textpart.header['Content-Type'] = 'text/plain; charset=utf-8'
+  textpart.header['Content-Transfer-Encoding'] = '7bit'
+  textpart.body = item.to_text
+  htmlpart = RMail::Message::new
+  htmlpart.header['Content-Type'] = 'text/html; charset=utf-8'
+  htmlpart.header['Content-Transfer-Encoding'] = '7bit'
+  htmlpart.body = '<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"><html><body>' + item.to_html + '</body></html>'
+  message.add_part(textpart)
+  message.add_part(htmlpart)
+  return message.to_s
+end
+
diff --git a/lib/feed2imap/rexml_patch.rb b/lib/feed2imap/rexml_patch.rb
index bce0be3..f991090 100644
--- a/lib/feed2imap/rexml_patch.rb
+++ b/lib/feed2imap/rexml_patch.rb
@@ -17,7 +17,7 @@ along with this program; if not, write to the Free Software
 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 =end
 
-require 'feed2imap/textconverters'
+require 'feedparser'
 
 # Patch for REXML
 # Very ugly patch to make REXML error-proof.
diff --git a/lib/feed2imap/textconverters.rb b/lib/feed2imap/textconverters.rb
deleted file mode 100644
index 9145e5a..0000000
--- a/lib/feed2imap/textconverters.rb
+++ /dev/null
@@ -1,152 +0,0 @@
-=begin
-Feed2Imap - RSS/Atom Aggregator uploading to an IMAP Server
-Copyright (c) 2005 Lucas Nussbaum <lucas@lucas-nussbaum.net>
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License
-along with this program; if not, write to the Free Software
-Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
-=end
-
-# for URI::regexp
-require 'uri'
-require 'feed2imap/html2text-parser'
-
-# This class provides various converters
-class String
-  # is this text HTML ? search for tags
-  def html?
-    return (self =~ /<p>/) || (self =~ /<br>/) || (self =~ /<br\s*(\/)?\s*>/) || (self =~ /<\/a>/) || (self =~ /<img.*>/)
-  end
-
-  # returns true if the text contains escaped HTML (with HTML entities)
-  def escaped_html?
-    return (self =~ /&lt;img src=/) || (self =~ /&lt;a href=/) || (self =~ /&lt;br(\/| \/|)&gt;/)
-  end
-
-  # un-escape HTML in the text
-  def unescape_html
-    {
-      '<' => '&lt;',
-      '>' => '&gt;',
-      "'" => '&apos;',
-      '"' => '&quot;',
-      '&' => '&amp;',
-      "\047" => '&#39;'
-    }.each do |k, v|
-      gsub!(v, k)
-    end
-    self
-  end
-
-  # convert text to HTML
-  def text2html
-    text = self.clone
-    return text if text.html?
-    if text.escaped_html?
-      return text.unescape_html
-    end
-    # paragraphs
-    text.gsub!(/\A\s*(.*)\Z/m, '<p>\1</p>')
-    text.gsub!(/\s*\n(\s*\n)+\s*/, "</p>\n<p>")
-    # uris
-    text.gsub!(/(#{URI::regexp(['http','ftp','https'])})/,
-        '<a href="\1">\1</a>')
-    text
-  end
-
-  # Convert an HTML text to plain text
-  def html2text
-    if false
-      text = self.clone
-      # let's remove all CR
-      text.gsub!(/\n/, '')
-      # convert <p> and <br>
-      text.gsub!(/\s*<\/p>\s*/, '')
-      text.gsub!(/\s*<p(\s[^>]*)?>\s*/, "\n\n")
-      text.gsub!(/\s*<br(\s*)\/?(\s*)>\s*/, "\n")
-      # remove other tags
-      text.gsub!(/<[^>]*>/, '')
-      # remove leading and trailing whilespace
-      text.gsub!(/\A\s*/m, '')
-      text.gsub!(/\s*\Z/m, '')
-      text
-    else
-      text = self.clone
-      # parse HTML
-      p = HTML2TextParser::new(true)
-      p.feed(text)
-      p.close
-      text = p.savedata
-      # remove leading and trailing whilespace
-      text.gsub!(/\A\s*/m, '')
-      text.gsub!(/\s*\Z/m, '')
-      # remove whitespace around \n
-      text.gsub!(/ *\n/m, "\n")
-      text.gsub!(/\n */m, "\n")
-      # and duplicates \n
-      text.gsub!(/\n\n+/m, "\n\n")
-      text
-    end
-  end
-
-  # Remove white space around the text
-  def rmWhiteSpace!
-    return self.gsub!(/\A\s*/m, '').gsub!(/\s*\Z/m,'')
-  end
-
-  # Convert a text in inputenc to a text in ISO-8859-1
-  def toISO_8859_1(inputenc)
-    if inputenc.downcase == 'utf-8'
-      begin
-        return self.unpack('U*').pack('C*')
-      rescue
-        return self
-      end
-    else
-      return self
-    end
-  end
-
-  # Convert a text in inputenc to a text in UTF8
-  # must take care of wrong input locales
-  def toUTF8(inputenc)
-    if inputenc.downcase != 'utf-8'
-      # it is said it is not UTF-8. Ensure it is REALLY not UTF-8
-      begin
-        if self.unpack('U*').pack('U*') == self
-          return self
-        end
-      rescue
-        # do nothing
-      end
-      begin
-        return self.unpack('C*').pack('U*')
-      rescue
-        return self #failsafe solution. but a dirty one :-)
-      end
-    else
-      return self
-    end
-  end
-
-  def needMIME
-    utf8 = false
-    self.unpack('U*').each do |c|
-      if c > 127
-        utf8 = true
-        break
-      end
-    end
-    utf8
-  end
-end