=begin
Feed2Imap - RSS/Atom Aggregator uploading to an IMAP Server
Copyright (c) 2005 Lucas Nussbaum

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
=end

# Parses the XML text of an RSS or Atom feed into a Channel and its Items,
# and renders each Item as plain text, HTML or a MIME mail message.
#
# The String helpers used throughout (toUTF8, toISO_8859_1, rmWhiteSpace!,
# text2html, html2text, needMIME) are not defined in this file; presumably
# they come from feed2imap/textconverters -- confirm there.

require 'rexml/document'
require 'time'
require 'rmail'
require 'feed2imap/textconverters'
require 'feed2imap/rubymail_patch'
require 'feed2imap/rexml_patch'
require 'base64'
require 'cgi'

# Raised by Channel#parse_str when the document is neither an RSS feed
# (a <channel> under the root) nor an Atom feed (a <feed> root).
class UnknownFeedTypeException < RuntimeError
end

# An RSS/Atom channel: feed-level metadata plus the list of parsed items.
class Channel
  # Values of an Atom <link type="..."> that are accepted as the
  # human-readable page for a feed or an entry.
  HTML_LINK_TYPES = ['text/html', 'application/xhtml',
                     'application/xhtml+xml'].freeze

  attr_reader :title, :link, :description, :creator, :encoding, :items

  # Builds a channel, parsing +str+ (an XML document) when it is given.
  def initialize(str = nil)
    parse_str(str) if str
  end

  # Fills in every field from a string containing an XML document.
  #
  # str:: the feed's XML text; it is not modified.
  # Raises UnknownFeedTypeException when the document has no root element
  # or is neither RSS nor Atom.
  def parse_str(str)
    # Dirty hack: some feeds contain a bare '&' followed by whitespace,
    # which is not well-formed XML. Turn it into the '&amp;' entity.
    # A copy is made so that the caller's string is left untouched.
    xml = str.gsub(/&(\s+)/, '&amp;\1')
    doc = REXML::Document.new(xml)

    @encoding = doc.encoding
    @title = @link = @description = @creator = nil
    @items = []

    root = doc.root
    # An empty or non-XML document has no root element at all.
    raise UnknownFeedTypeException.new if root.nil?

    if root.elements['channel'] || root.elements['rss:channel']
      parse_rss(root)
    elsif root.elements['/feed']
      parse_atom(root)
    else
      raise UnknownFeedTypeException.new
    end
  end

  # Short multi-line summary: title, link, then every item's own to_s.
  def to_s
    "Title: #{@title}\nLink: #{@link}\n\n" + @items.map { |i| i.to_s }.join
  end

  private

  # Reads the channel fields and items of an RSS document. Each field is
  # looked up both without a prefix and with the 'rss:' prefix.
  def parse_rss(root)
    # Title
    e = root.elements['channel/title'] ||
        root.elements['rss:channel/rss:title']
    @title = e.text.toUTF8(@encoding).rmWhiteSpace! if e && e.text

    # Link
    e = root.elements['channel/link'] ||
        root.elements['rss:channel/rss:link']
    @link = e.text.rmWhiteSpace! if e && e.text

    # Description
    e = root.elements['channel/description'] ||
        root.elements['rss:channel/rss:description']
    @description = e.text.toUTF8(@encoding).rmWhiteSpace! if e && e.text

    # Creator: dc:creator wins when it has text, otherwise <author>.
    e = root.elements['channel/dc:creator']
    unless e && e.text
      e = root.elements['channel/author'] ||
          root.elements['rss:channel/rss:author']
    end
    @creator = e.text.toUTF8(@encoding).rmWhiteSpace! if e && e.text

    # Items may live inside <channel> or directly under the root.
    query = if root.elements['channel/item']
              'channel/item'
            elsif root.elements['item']
              'item'
            elsif root.elements['rss:channel/rss:item']
              'rss:channel/rss:item'
            else
              'rss:item'
            end
    root.each_element(query) { |item| @items << Item.new(item, self) }
  end

  # Reads the feed fields and entries of an Atom document.
  def parse_atom(root)
    # Title
    e = root.elements['/feed/title']
    @title = e.text.toUTF8(@encoding).rmWhiteSpace! if e && e.text

    # Link: only <link> elements typed as (X)HTML count; when several
    # match, the last one in document order is kept.
    root.each_element('/feed/link') do |l|
      type = l.attribute('type')
      next unless type && HTML_LINK_TYPES.include?(type.value)
      href = l.attribute('href')
      @link = href.value.rmWhiteSpace! if href && href.value
    end

    # Description
    # NOTE(review): confirm that Elements#to_s yields the child markup
    # here rather than an inspect-style string.
    e = root.elements['/feed/info']
    @description = e.elements.to_s.toUTF8(@encoding).rmWhiteSpace! if e

    # Items
    root.each_element('/feed/entry') do |entry|
      @items << AtomItem.new(entry, self)
    end
  end
end

# An item (RSS <item>) belonging to a Channel.
class Item
  attr_accessor :title, :link, :content, :date, :creator, :subject,
                :category, :cacheditem
  attr_reader :channel

  # item::    the REXML element of the <item>, or nil for an empty Item.
  # channel:: the owning Channel; its encoding and creator are read while
  #           parsing, so it must be given whenever +item+ is.
  def initialize(item = nil, channel = nil)
    @channel = channel
    @title = @link = @content = @date = nil
    @creator = @subject = @category = nil
    parse_rss_item(item) if item
  end

  # Multi-line debugging dump of every field.
  def to_s
    "--------------------------------\n" +
      "Title: #{@title}\nLink: #{@link}\n" +
      "Date: #{@date.to_s}\nCreator: #{@creator}\n" +
      "Subject: #{@subject}\nCategory: #{@category}\nContent:\n#{content}\n"
  end

  # Plain-text rendering of the item. Every field is passed through
  # toISO_8859_1('utf-8'), matching the charset to_mail declares for
  # the text part.
  def to_text
    s = ""
    s += "Channel: "
    s += @channel.title.toISO_8859_1('utf-8') + ' ' if @channel.title
    s += "<#{@channel.link.toISO_8859_1('utf-8')}>" if @channel.link
    s += "\n"
    s += "Item: "
    s += @title.toISO_8859_1('utf-8') + ' ' if @title
    s += "<#{@link.toISO_8859_1('utf-8')}>" if @link
    s += "\n"
    # TODO improve date rendering ?
    s += "\nDate: #{@date.to_s.toISO_8859_1('utf-8')}" if @date
    s += "\nAuthor: #{@creator.toISO_8859_1('utf-8')}" if @creator
    s += "\nSubject: #{@subject.toISO_8859_1('utf-8')}" if @subject
    s += "\nCategory: #{@category.toISO_8859_1('utf-8')}" if @category
    s += "\n\n"
    s += "#{@content.html2text.toISO_8859_1('utf-8')}" if @content
    s
  end

  # HTML rendering of the item: a header paragraph (channel, item, date,
  # author, subject, category) followed by the item content.
  #
  # Title, link, author, subject and category are plain text taken from
  # the feed, so they are HTML-escaped before being embedded.
  def to_html
    s = '<html>'
    s += '<body>'
    s += '<p>Channel: '
    s += "<a href=\"#{escape_html(@channel.link)}\">" if @channel.link
    s += escape_html(@channel.title) if @channel.title
    s += '</a>' if @channel.link
    s += "<br/>\nItem: "
    s += "<a href=\"#{escape_html(@link)}\">" if @link
    s += escape_html(@title) if @title
    s += '</a>' if @link
    s += "\n"
    # TODO improve date rendering ?
    s += "<br/>Date: #{escape_html(@date.to_s)}" if @date
    s += "<br/>Author: #{escape_html(@creator)}" if @creator
    s += "<br/>Subject: #{escape_html(@subject)}" if @subject
    s += "<br/>Category: #{escape_html(@category)}" if @category
    s += '</p>'
    # NOTE(review): @content is markup supplied by the feed and is
    # embedded as-is (it is treated as HTML elsewhere, e.g. html2text in
    # to_text). It is untrusted input; sanitise it if the mail client
    # cannot be trusted to.
    s += "<div>#{@content}</div>" if @content
    s += '</body></html>'
    s
  end

  # Renders the item as a two-part MIME message (plain text + HTML) and
  # returns it as a String.
  #
  # from:: display name used in the From and To headers.
  # Reads @cacheditem (its index and updated flag), so a cached item must
  # have been attached through the cacheditem accessor beforehand.
  def to_mail(from = 'Feed2Imap')
    message = RMail::Message.new
    # NOTE(review): these headers carry only a display name followed by a
    # space; it looks like an "<address>" part is missing -- confirm.
    message.header['From'] = "#{from} "
    message.header['To'] = "#{from} "
    message.header['Date'] = @date.nil? ? Time.new.rfc2822 : @date.rfc2822
    message.header['X-Feed2Imap-Version'] = F2I_VERSION if defined?(F2I_VERSION)
    message.header['X-CacheIndex'] = "-#{@cacheditem.index}-"
    message.header['X-F2IStatus'] = "Updated" if @cacheditem.updated

    # Subject: title, else date, else link. '||' is required here: with
    # 'or' the assignment binds first and the fallbacks are never used.
    # Might need MIME encoding.
    subj = @title || (@date && @date.to_s) || @link
    if subj
      if subj.needMIME
        encoded = Base64.encode64(subj).gsub("\n", '')
        message.header['Subject'] = "=?utf-8?b?#{encoded}?="
      else
        message.header['Subject'] = subj
      end
    end

    # Both bodies may contain octets above 127 (ISO-8859-1 / UTF-8), so
    # they are declared 8bit rather than 7bit.
    textpart = RMail::Message.new
    textpart.header['Content-Type'] = 'text/plain; charset=iso-8859-1'
    textpart.header['Content-Transfer-Encoding'] = '8bit'
    textpart.body = to_text

    htmlpart = RMail::Message.new
    htmlpart.header['Content-Type'] = 'text/html; charset=utf-8'
    htmlpart.header['Content-Transfer-Encoding'] = '8bit'
    htmlpart.body = to_html

    message.add_part(textpart)
    message.add_part(htmlpart)
    message.to_s
  end

  private

  # Reads every field from the REXML element of an RSS <item>. Each
  # element is looked up both without a prefix and with the 'rss:' prefix.
  def parse_rss_item(item)
    enc = @channel.encoding

    # Title: falls back to the publication date text when there is none.
    e = item.elements['title'] || item.elements['rss:title']
    unless e && e.text
      e = item.elements['pubDate'] || item.elements['rss:pubDate']
    end
    @title = e.text.toUTF8(enc).rmWhiteSpace! if e && e.text

    # Link: <link>, else <guid> unless it is flagged isPermaLink="false".
    e = item.elements['link'] || item.elements['rss:link']
    unless e && e.text
      e = item.elements['guid'] || item.elements['rss:guid']
      perma = e && e.attribute('isPermaLink')
      e = nil if perma && perma.value == 'false'
    end
    # The text check also covers an empty <guid/>.
    @link = e.text.rmWhiteSpace! if e && e.text

    # Content
    e = item.elements['content:encoded'] ||
        item.elements['description'] ||
        item.elements['rss:description']
    if e
      if e.children.length > 1
        # Several child nodes: serialise all of them back to a string.
        raw = e.children.map { |c| c.to_s }.join
        @content = raw.toUTF8(enc).rmWhiteSpace!.text2html
      elsif e.children.length == 1
        if e.cdatas[0]
          @content = e.cdatas[0].to_s.toUTF8(enc).rmWhiteSpace!
        elsif e.text
          @content = e.text.toUTF8(enc).text2html
        end
      end
    end

    # Date
    e = item.elements['dc:date'] || item.elements['pubDate'] ||
        item.elements['rss:pubDate']
    @date = parse_date(e.text) if e

    # Creator: defaults to the channel's creator.
    @creator = @channel.creator
    e = item.elements['dc:creator'] || item.elements['author'] ||
        item.elements['rss:author']
    @creator = e.text.toUTF8(enc).rmWhiteSpace! if e && e.text

    # Subject
    e = item.elements['dc:subject']
    @subject = e.text.toUTF8(enc).rmWhiteSpace! if e && e.text

    # Category
    e = item.elements['dc:category'] || item.elements['category'] ||
        item.elements['rss:category']
    @category = e.text.toUTF8(enc).rmWhiteSpace! if e && e.text
  end

  # Parses a date string, trying Time.xmlschema, then Time.rfc2822, then
  # Time.parse. Returns a Time, or nil when all three fail (including
  # when +str+ is nil).
  def parse_date(str)
    [:xmlschema, :rfc2822, :parse].each do |parser|
      begin
        return Time.send(parser, str)
      rescue StandardError
        # fall through to the next parser
      end
    end
    nil
  end

  # Escapes &, <, > and quotes so that +str+ can be embedded in HTML text
  # or in a double-quoted attribute value.
  def escape_html(str)
    CGI.escapeHTML(str.to_s)
  end
end

# An item (Atom <entry>) belonging to a Channel.
class AtomItem < Item
  # item::    the REXML element of the <entry>, or nil for an empty item.
  # channel:: the owning Channel; its encoding and creator are read while
  #           parsing, so it must be given whenever +item+ is.
  def initialize(item = nil, channel = nil)
    # Passing nil makes Item#initialize only reset the fields, without
    # running the RSS parsing.
    super(nil, channel)
    parse_atom_entry(item) if item
  end

  private

  # Reads every field from the REXML element of an Atom <entry>.
  def parse_atom_entry(item)
    enc = @channel.encoding

    # Title
    e = item.elements['title']
    @title = e.text.toUTF8(enc).rmWhiteSpace! if e && e.text

    # Link: only <link> elements typed as (X)HTML count. A <link> with no
    # type attribute is skipped.
    item.each_element('link') do |l|
      type = l.attribute('type')
      next unless type && Channel::HTML_LINK_TYPES.include?(type.value)
      href = l.attribute('href')
      @link = href.value if href && href.value
    end

    # Content
    e = item.elements['content'] || item.elements['summary']
    if e
      mode = e.attribute('mode')
      if mode && mode.value == 'escaped' && e.text
        @content = e.text.toUTF8(enc).rmWhiteSpace!
      else
        # go one step deeper in the recursion if possible
        e = e.elements['div'] || e
        @content = e.to_s.toUTF8(enc).rmWhiteSpace!
      end
    end

    # Date
    e = item.elements['issued'] || item.elements['created']
    @date = parse_date(e.text) if e && e.text

    # Creator: defaults to the channel's creator.
    @creator = @channel.creator
    e = item.elements['author/name']
    @creator = e.text.toUTF8(enc).rmWhiteSpace! if e && e.text
  end
end