From 16ec9aba7e94e628f22bcaeb3ecdd7916f3a3df5 Mon Sep 17 00:00:00 2001 From: lnu Date: Thu, 31 Mar 2005 22:08:32 +0000 Subject: first import git-svn-id: svn+ssh://svn.gna.org/svn/feed2imap/trunk/feed2imap@5 f70e237a-67f3-0310-a06c-d2b8a7116972 --- lib/feed2imap/cache.rb | 176 ++++++++++++++++++++++ lib/feed2imap/channel.rb | 326 ++++++++++++++++++++++++++++++++++++++++ lib/feed2imap/config.rb | 76 ++++++++++ lib/feed2imap/feed2imap.rb | 135 +++++++++++++++++ lib/feed2imap/httpfetcher.rb | 89 +++++++++++ lib/feed2imap/imap.rb | 118 +++++++++++++++ lib/feed2imap/rexml_patch.rb | 41 +++++ lib/feed2imap/rubymail_patch.rb | 50 ++++++ lib/feed2imap/textconverters.rb | 85 +++++++++++ 9 files changed, 1096 insertions(+) create mode 100644 lib/feed2imap/cache.rb create mode 100644 lib/feed2imap/channel.rb create mode 100644 lib/feed2imap/config.rb create mode 100644 lib/feed2imap/feed2imap.rb create mode 100644 lib/feed2imap/httpfetcher.rb create mode 100644 lib/feed2imap/imap.rb create mode 100644 lib/feed2imap/rexml_patch.rb create mode 100644 lib/feed2imap/rubymail_patch.rb create mode 100644 lib/feed2imap/textconverters.rb (limited to 'lib/feed2imap') diff --git a/lib/feed2imap/cache.rb b/lib/feed2imap/cache.rb new file mode 100644 index 0000000..1534483 --- /dev/null +++ b/lib/feed2imap/cache.rb @@ -0,0 +1,176 @@ +=begin +Feed2Imap - RSS/Atom Aggregator uploading to an IMAP Server +Copyright (c) 2005 Lucas Nussbaum + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. 
# This class manages a cache of items
# (items which have already been seen)

require 'digest/md5'

# Registry of CachedChannel objects keyed by a caller-supplied channel id.
# It also owns the class-wide counter handed out by ItemCache.getindex.
class ItemCache
  # Creates an empty cache and resets the class-wide index counter to 0.
  def initialize
    @channels = {}
    @@cacheidx = 0
  end

  # Returns the really new items amongst items
  # (delegates to CachedChannel#get_new_items for channel +id+).
  def get_new_items(id, items)
    channel(id).get_new_items(items)
  end

  # Replace the existing cached items of channel +id+ by those ones
  def update_cache(id, items)
    channel(id).update(items)
  end

  # Get the last time channel +id+ was checked
  def get_last_check(id)
    channel(id).lastcheck
  end

  # Set the last time channel +id+ was checked. Returns self.
  def set_last_check(id, time)
    channel(id).lastcheck = time
    self
  end

  # Load the cache from an IO stream.
  #
  # Two layouts are accepted: the current one written by #save, an Array
  # [index, channels], and a bare channels Hash (the layout the original
  # fallback branch was written for). The stream is read exactly once: a
  # Hash does not raise when destructured, and a second Marshal.load on
  # the same stream would only find EOF.
  #
  # Raises TypeError when the stream holds anything else.
  def load(io)
    # NOTE(review): Marshal.load can instantiate arbitrary objects; only
    # feed it cache files written by this program.
    data = Marshal.load(io)
    case data
    when Array
      @@cacheidx, @channels = data
    when Hash
      @channels = data
      @@cacheidx = 0
    else
      raise TypeError, "Unrecognized cache format (#{data.class})"
    end
  end

  # Save the cache to an IO stream
  def save(io)
    Marshal.dump([@@cacheidx, @channels], io)
  end

  # Return the number of channels in the cache
  def nbchannels
    @channels.length
  end

  # Return the number of items in the cache (sum over all channels)
  def nbitems
    @channels.values.inject(0) { |total, c| total + c.nbitems }
  end

  # Returns the next free item index and advances the counter.
  def ItemCache.getindex
    i = @@cacheidx
    @@cacheidx += 1
    i
  end

  private

  # Returns the CachedChannel for +id+, creating it on first use.
  def channel(id)
    @channels[id] ||= CachedChannel::new
  end
end
# Cached state of one channel: the time it was last checked and the
# CachedItem objects seen at the last update.
class CachedChannel
  attr_accessor :lastcheck, :items

  def initialize
    @lastcheck = Time::at(0)
    @items = []
  end

  # Returns the really new items amongst items.
  #
  # Every item gets a CachedItem (created if not set yet) and is then
  # classified against the cached items:
  # * an identical cached item (CachedItem#==): known, index is reused;
  # * otherwise a cached item with the same non-nil link: updated, the
  #   index is reused and the cached version is flagged +updated+;
  # * otherwise: new, a fresh index is allocated.
  #
  # Returns [newitems, updateditems].
  def get_new_items(items)
    newitems = []
    updateditems = []
    items.each { |i| i.cacheditem ||= CachedItem::new(i) }
    items.each do |i|
      if (same = @items.find { |j| i.cacheditem == j })
        i.cacheditem.index = same.index
      elsif i.link && (previous = @items.find { |j| i.link == j.link })
        # TODO use a better heuristic ?
        i.cacheditem.index = previous.index
        i.cacheditem.updated = true
        updateditems.push(i)
      else
        i.cacheditem.create_index
        newitems.push(i)
      end
    end
    [newitems, updateditems]
  end

  # Replace the existing cached items by those ones. Returns self.
  def update(items)
    @items = items.map { |i| i.cacheditem }
    self
  end

  # returns the number of items
  def nbitems
    @items.length
  end
end

# This class is the only thing kept in the cache
class CachedItem
  # NOTE: +hash+ is the MD5 hex digest of the item content (or nil), not
  # an Object#hash value, so instances are unsuitable as Hash keys. The
  # name is kept because it is the reader exposed by this class.
  attr_reader :title, :link, :hash
  attr_accessor :index
  attr_accessor :updated

  # Builds the cached version of +item+ from its title, link and content.
  def initialize(item)
    @title = item.title
    @link = item.link
    @hash = item.content.nil? ? nil : Digest::MD5.hexdigest(item.content.to_s)
  end

  # Two cached items are equal when title, link and content digest match.
  # Anything that is not a CachedItem compares as different instead of
  # raising NoMethodError.
  def ==(other)
    other.is_a?(CachedItem) &&
      @title == other.title && @link == other.link && @hash == other.hash
  end

  # Allocates a fresh index from ItemCache.getindex.
  def create_index
    @index = ItemCache.getindex
  end
end
# This class allows to retrieve a feed and parse it into a Channel

require 'rexml/document'
require 'time'
require 'rmail'
require 'feed2imap/textconverters'
require 'feed2imap/rubymail_patch'
require 'feed2imap/rexml_patch'

# Raised by Channel#parse_str when the document is neither RSS nor Atom.
class UnknownFeedTypeException < RuntimeError
end

# an RSS/Atom channel
class Channel
  attr_reader :title, :link, :description, :creator, :encoding, :items

  # Atom <link> types that are accepted as the channel's web link.
  ATOM_HTML_LINK_TYPES = ['text/html', 'application/xhtml',
                          'application/xhtml+xml'].freeze

  # parse str to build a channel
  def initialize(str = nil)
    parse_str(str) if str
  end

  # Determines all the fields using a string containing an
  # XML document.
  #
  # Raises UnknownFeedTypeException when the document has no root element
  # or is neither an RSS nor an Atom feed.
  def parse_str(str)
    # Dirty hack: some feeds contain a bare & char followed by whitespace.
    # It must be changed to &amp; before parsing. A copy is made so the
    # caller's string is left untouched.
    str = str.gsub(/&(\s+)/, '&amp;\1')
    doc = REXML::Document.new(str)
    # get channel info
    @encoding = doc.encoding
    @title = @link = @description = @creator = nil
    @items = []
    root = doc.root
    raise UnknownFeedTypeException::new if root.nil?
    if root.elements['channel'] || root.elements['rss:channel']
      parse_rss(root)
    elsif root.elements['/feed']
      parse_atom(root)
    else
      raise UnknownFeedTypeException::new
    end
  end

  def to_s
    s = "Title: #{@title}\nLink: #{@link}\n\n"
    @items.each { |i| s += i.to_s }
    s
  end

  private

  # Fills the channel fields and items from an RSS document root.
  def parse_rss(root)
    els = root.elements
    # Title
    if (e = els['channel/title'] || els['rss:channel/rss:title']) && e.text
      @title = e.text.toUTF8(@encoding).rmWhiteSpace!
    end
    # Link
    if (e = els['channel/link'] || els['rss:channel/rss:link']) && e.text
      @link = e.text.rmWhiteSpace!
    end
    # Description
    if (e = els['channel/description'] ||
            els['rss:channel/rss:description']) && e.text
      @description = e.text.toUTF8(@encoding).rmWhiteSpace!
    end
    # Creator
    if ((e = els['channel/dc:creator']) && e.text) ||
       ((e = els['channel/author'] ||
             els['rss:channel/rss:author']) && e.text)
      @creator = e.text.toUTF8(@encoding).rmWhiteSpace!
    end
    # Items: they live either inside <channel> or next to it, with or
    # without the rss: prefix.
    query = if els['channel/item']
              'channel/item'
            elsif els['item']
              'item'
            elsif els['rss:channel/rss:item']
              'rss:channel/rss:item'
            else
              'rss:item'
            end
    root.each_element(query) { |node| @items << Item::new(node, self) }
  end

  # Fills the channel fields and items from an Atom document root.
  def parse_atom(root)
    els = root.elements
    # Title
    if (e = els['/feed/title']) && e.text
      @title = e.text.toUTF8(@encoding).rmWhiteSpace!
    end
    # Link: a <link> without a type attribute is skipped instead of
    # crashing; the last matching link wins.
    root.each_element('/feed/link') do |node|
      type = node.attribute('type')
      next unless type && ATOM_HTML_LINK_TYPES.include?(type.value)
      href = node.attribute('href')
      @link = href.value.rmWhiteSpace! if href && href.value
    end
    # Description
    if (e = els['/feed/info'])
      # NOTE(review): this stringifies the Elements collection rather than
      # the element itself - confirm that this is the intended text.
      @description = e.elements.to_s.toUTF8(@encoding).rmWhiteSpace!
    end
    # Items
    root.each_element('/feed/entry') do |node|
      @items << AtomItem::new(node, self)
    end
  end
end
# an Item from a channel
class Item
  attr_accessor :title, :link, :content, :date, :creator, :subject,
                :category, :cacheditem
  attr_reader :channel

  # Builds an item from an RSS <item> element.
  #
  # item::    the XML element, or nil to get an empty item
  # channel:: the owning channel; its +encoding+ and +creator+ are read
  #           when +item+ is given
  def initialize(item = nil, channel = nil)
    @channel = channel
    @title = @link = @content = @date = @creator = @subject = @category = nil
    return unless item
    els = item.elements
    enc = @channel.encoding
    # Title (falls back on the publication date text)
    if ((e = els['title'] || els['rss:title']) && e.text) ||
       ((e = els['pubDate'] || els['rss:pubDate']) && e.text)
      @title = e.text.toUTF8(enc).rmWhiteSpace!
    end
    # Link (falls back on a guid, unless it says it is not a permalink;
    # a guid without text is ignored instead of crashing)
    if ((e = els['link'] || els['rss:link']) && e.text) ||
       ((e = els['guid'] || els['rss:guid']) && e.text && permalink?(e))
      @link = e.text.rmWhiteSpace!
    end
    # Content
    if (e = els['content:encoded']) ||
       (e = els['description'] || els['rss:description'])
      if e.cdatas[0]
        @content = e.cdatas[0].to_s.toUTF8(enc).rmWhiteSpace!
      elsif e.text
        @content = e.text.toUTF8(enc).text2html
      end
    end
    # Date
    if (e = els['dc:date'] || els['pubDate'] || els['rss:pubDate'])
      @date = parse_date(e.text)
    end
    # Creator (defaults to the channel's creator)
    @creator = @channel.creator
    if (e = els['dc:creator'] || els['author'] || els['rss:author']) &&
       e.text
      @creator = e.text.toUTF8(enc).rmWhiteSpace!
    end
    # Subject
    if (e = els['dc:subject']) && e.text
      @subject = e.text.toUTF8(enc).rmWhiteSpace!
    end
    # Category
    if (e = els['dc:category'] || els['category'] || els['rss:category']) &&
       e.text
      @category = e.text.toUTF8(enc).rmWhiteSpace!
    end
  end

  def to_s
    "--------------------------------\n" +
      "Title: #{@title}\nLink: #{@link}\n" +
      "Date: #{@date.to_s}\nCreator: #{@creator}\n" +
      "Subject: #{@subject}\nCategory: #{@category}\nContent:\n#{content}\n"
  end

  # Plain-text rendering of the item (body of the text/plain mail part).
  def to_text
    parts = []
    parts << "Channel: "
    parts << @channel.title + ' ' if @channel.title
    parts << "<#{@channel.link}>" if @channel.link
    parts << "\n"
    parts << "Item: "
    parts << @title + ' ' if @title
    parts << "<#{@link}>" if @link
    parts << "\n"
    parts << "\nDate: #{@date.to_s}" if @date # TODO improve date rendering ?
    parts << "\nAuthor: #{@creator}" if @creator
    parts << "\nSubject: #{@subject}" if @subject
    parts << "\nCategory: #{@category}" if @category
    parts << "\n\n"
    parts << "#{@content.html2text}" if @content
    parts.join
  end

  # HTML rendering of the item (body of the text/html mail part).
  #
  # NOTE(review): the markup literals of this method were missing from
  # the reviewed text (only the labels and interpolations were left), so
  # the tags below are a reconstruction - compare with the upstream file.
  # Title, creator, etc. are inserted unescaped, as before.
  def to_html
    parts = []
    parts << '<html>'
    parts << '<body>'
    parts << "<p>Channel: "
    parts << "<a href=\"#{@channel.link}\">" if @channel.link
    parts << @channel.title if @channel.title
    parts << "</a>" if @channel.link
    parts << "<br/>\nItem: "
    parts << "<a href=\"#{@link}\">" if @link
    parts << @title if @title
    parts << "</a>" if @link
    parts << "\n"
    parts << "<br/>Date: #{@date.to_s}" if @date # TODO improve date rendering ?
    parts << "<br/>Author: #{@creator}" if @creator
    parts << "<br/>Subject: #{@subject}" if @subject
    parts << "<br/>Category: #{@category}" if @category
    parts << "</p>"
    parts << "<p>#{@content}</p>" if @content
    parts << '</body></html>'
    parts.join
  end

  # Builds the mail for this item: a message with a text part and an HTML
  # part, returned as a string.
  # Requires +cacheditem+ to be set (its index goes in X-CacheIndex).
  # TODO from significatif
  def to_mail(from = 'Feed2Imap')
    message = RMail::Message::new
    # NOTE(review): the From/To values end with a space, which suggests an
    # <address> part was lost from the reviewed text - restore it from
    # the upstream file.
    message.header['From'] = "#{from} "
    message.header['To'] = "#{from} "
    message.header['Date'] = (@date.nil? ? Time::new : @date).rfc2822
    message.header['X-Feed2Imap-Version'] = F2I_VERSION if defined?(F2I_VERSION)
    message.header['X-CacheIndex'] = "-#{@cacheditem.index}-"
    message.header['X-F2IStatus'] = "Updated" if @cacheditem.updated
    # TODO encode in ISO ?
    if @title
      message.header['Subject'] = @title
    elsif @date
      message.header['Subject'] = @date.to_s
    elsif @link
      message.header['Subject'] = @link
    end
    # Both bodies are UTF-8 and may hold bytes above 127, so the parts
    # are declared 8bit (7bit is only valid for pure ASCII content).
    textpart = RMail::Message::new
    textpart.header['Content-Type'] = 'text/plain; charset=UTF-8; format=flowed'
    textpart.header['Content-Transfer-Encoding'] = '8bit'
    textpart.body = to_text
    htmlpart = RMail::Message::new
    htmlpart.header['Content-Type'] = 'text/html; charset=UTF-8'
    htmlpart.header['Content-Transfer-Encoding'] = '8bit'
    htmlpart.body = to_html
    message.add_part(textpart)
    message.add_part(htmlpart)
    message.to_s
  end

  private

  # True unless the guid element carries isPermaLink="false".
  def permalink?(guid)
    flag = guid.attribute('isPermaLink')
    !(flag && flag.value == 'false')
  end

  # Parses +text+ as XML Schema, then RFC 2822, then with Time.parse.
  # Returns nil when none of them accepts it.
  def parse_date(text)
    [:xmlschema, :rfc2822, :parse].each do |parser|
      begin
        return Time.send(parser, text)
      rescue StandardError
        next
      end
    end
    nil
  end
end
# an Item built from an Atom <entry> element
class AtomItem < Item
  # <link> types that are accepted as the entry's web link.
  HTML_LINK_TYPES = ['text/html', 'application/xhtml',
                     'application/xhtml+xml'].freeze

  # item::    the <entry> XML element, or nil to get an empty item
  # channel:: the owning channel; its +encoding+ and +creator+ are read
  #           when +item+ is given
  def initialize(item = nil, channel = nil)
    @channel = channel
    @title = @link = @content = @date = @creator = @subject = @category = nil
    return unless item
    els = item.elements
    enc = @channel.encoding
    # Title
    if (e = els['title']) && e.text
      @title = e.text.toUTF8(enc).rmWhiteSpace!
    end
    # Link: a <link> without a type attribute is skipped instead of
    # crashing; the last matching link wins.
    item.each_element('link') do |node|
      type = node.attribute('type')
      next unless type && HTML_LINK_TYPES.include?(type.value)
      href = node.attribute('href')
      @link = href.value if href && href.value
    end
    # Content
    if (e = els['content'] || els['summary'])
      mode = e.attribute('mode')
      if mode && mode.value == 'escaped' && e.text
        @content = e.text.toUTF8(enc).rmWhiteSpace!
      else
        # go one step deeper in the recursion if possible
        e = e.elements['div'] || e
        @content = e.to_s.toUTF8(enc).rmWhiteSpace!
      end
    end
    # Date
    e = els['issued'] || els['created']
    @date = parse_atom_date(e.text) if e && e.text
    # Creator (defaults to the channel's creator)
    @creator = @channel.creator
    if (e = els['author/name']) && e.text
      @creator = e.text.toUTF8(enc).rmWhiteSpace!
    end
  end

  private

  # Parses +text+ as XML Schema, then RFC 2822, then with Time.parse.
  # Returns nil when none of them accepts it.
  def parse_atom_date(text)
    [:xmlschema, :rfc2822, :parse].each do |parser|
      begin
        return Time.send(parser, text)
      rescue StandardError
        next
      end
    end
    nil
  end
end
# Feed2imap configuration
class F2IConfig
  attr_reader :imap_accounts, :cache, :feeds

  # Load the configuration from the IO stream.
  #
  # The stream is YAML: an optional 'cache' path and a 'feeds' list whose
  # entries have 'name', 'url' and 'target' (an IMAP URI whose path is
  # the folder). An empty document yields a configuration without feeds.
  #
  # Raises RuntimeError when the document is not a mapping.
  # TODO should do some more sanity check on the data read.
  def initialize(io)
    # An empty document does not load as a Hash; treat it as "no settings".
    @conf = YAML::load(io) || {}
    unless @conf.is_a?(Hash)
      raise "Invalid configuration: expected a mapping, got #{@conf.class}"
    end
    @cache = @conf['cache'] || DEFCACHE
    @conf['feeds'] ||= []
    @feeds = []
    @imap_accounts = ImapAccounts::new
    @conf['feeds'].each do |f|
      uri = URI::parse(f['target'])
      # the folder is the URI path without its leading slash
      folder = uri.path.sub(/\A\//, '')
      @feeds.push(ConfigFeed::new(f['name'], f['url'],
                                  @imap_accounts.add_account(uri), folder))
    end
  end

  def to_s
    s = "Your Feed2Imap config :\n"
    s += "=======================\n"
    s += "Cache file: #{@cache}\n\n"
    s += "Imap accounts I'll have to connect to :\n"
    s += "---------------------------------------\n"
    @imap_accounts.each_value { |i| s += i.to_s + "\n" }
    s += "\nFeeds :\n"
    s += "-------\n"
    @feeds.each_with_index do |f, idx|
      s += "#{idx + 1}. #{f.name}\n"
      s += " URL: #{f.url}\n"
      s += " IMAP Account: #{f.imapaccount}\n"
      s += " Folder: #{f.folder}\n\n"
    end
    s
  end
end

# A configured feed. simple data container.
class ConfigFeed
  attr_reader :name, :url, :imapaccount, :folder

  def initialize(name, url, imapaccount, folder)
    @name, @url, @imapaccount, @folder = name, url, imapaccount, folder
  end
end
require 'feed2imap/config'
require 'feed2imap/cache'
require 'feed2imap/channel'
require 'feed2imap/httpfetcher'
require 'logger'

# Feed2Imap version
F2I_VERSION = '0.1'

# Runs a complete Feed2Imap session from its constructor: reads the
# configuration, loads the cache, connects to the IMAP accounts,
# processes every feed, saves the cache and disconnects.
class Feed2Imap
  # verbose::      log at DEBUG level instead of WARN
  # cacherebuild:: when true, items are only recorded in the cache and no
  #                mail is uploaded
  # configfile::   path of the YAML configuration file
  #
  # Exits the process with status 1 when the configuration cannot be
  # read, an IMAP connection fails or an IMAP folder cannot be created.
  def initialize(verbose, cacherebuild, configfile)
    @logger = Logger::new(STDOUT)
    @logger.level = verbose ? Logger::DEBUG : Logger::WARN
    @logger.info("Feed2Imap V.#{F2I_VERSION} started")
    load_config(configfile)
    load_cache
    connect_accounts
    # for each feed, fetch, upload to IMAP and cache
    @config.feeds.each { |f| process_feed(f, cacherebuild) }
    save_cache
    disconnect_accounts
  end

  private

  # Reads the configuration file into @config, or exits.
  def load_config(configfile)
    @logger.info('Reading configuration file')
    if not File::exist?(configfile)
      @logger.fatal("Configuration file #{configfile} not found.")
      exit(1)
    end
    begin
      File::open(configfile) { |f| @config = F2IConfig::new(f) }
    rescue
      @logger.fatal("Error while reading configuration file, exiting: #{$!}")
      exit(1)
    end
  end

  # Creates @cache and fills it from the cache file when there is one.
  def load_cache
    @logger.info('Initializing cache')
    @cache = ItemCache::new
    if not File::exist?(@config.cache)
      @logger.warn("Cache file #{@config.cache} not found, using a new one")
    else
      # binary mode: the cache is Marshal data, not text
      File::open(@config.cache, 'rb') { |f| @cache.load(f) }
    end
  end

  # Connects every configured IMAP account, or exits on the first failure.
  def connect_accounts
    @logger.info('Connecting to IMAP accounts')
    @config.imap_accounts.each_value do |ac|
      begin
        ac.connect
      rescue
        @logger.fatal("Error while connecting to #{ac}, exiting: #{$!}")
        exit(1)
      end
    end
  end

  # Fetches, parses, uploads and caches one feed. Any failure after the
  # folder check is logged and abandons this feed only.
  def process_feed(f, cacherebuild)
    @logger.info("Processing #{f.url}")
    begin
      # check that folder exist
      f.imapaccount.create_folder(f.folder) if not f.imapaccount.folder_exist?(f.folder)
    rescue
      @logger.fatal("Error while creating IMAP folder #{f.folder}: #{$!}")
      exit(1)
    end
    # NOTE(review): nothing in this class calls @cache.set_last_check, so
    # the value sent here is always the cache's initial one - confirm
    # whether it should be recorded after a successful update.
    begin
      body = HTTPFetcher::fetch(f.url, @cache.get_last_check(f.name))
    rescue Timeout::Error
      @logger.fatal("Timeout::Error while fetching #{f.url}: #{$!}")
      return
    rescue
      @logger.fatal("Error while fetching #{f.url}: #{$!}")
      return
    end
    return if body.nil? # means 304
    begin
      channel = Channel::new(body)
    rescue
      @logger.fatal("Error while parsing #{f.url}: #{$!}")
      return
    end
    begin
      newitems, updateditems = @cache.get_new_items(f.name, channel.items)
    rescue
      @logger.fatal("Exception caught when selecting new items for #{f.url}: #{$!}")
      puts $!.backtrace
      return
    end
    @logger.info("#{newitems.length} new items, #{updateditems.length} updated items.") if newitems.length > 0 or updateditems.length > 0
    begin
      if !cacherebuild
        updateditems.each { |i| f.imapaccount.updatemail(f.folder, i.to_mail(f.name), i.cacheditem.index) }
        newitems.each { |i| f.imapaccount.putmail(f.folder, i.to_mail(f.name)) }
      end
    rescue
      @logger.fatal("Exception caught while uploading mail to #{f.folder}: #{$!}")
      return
    end
    begin
      @cache.update_cache(f.name, channel.items)
    rescue
      @logger.fatal("Exception caught while updating cache for #{f.name}: #{$!}")
    end
  end

  # Writes the cache file; a failure is logged, not fatal.
  def save_cache
    @logger.info("Finished. Saving cache")
    begin
      # binary mode: the cache is Marshal data, not text
      File::open(@config.cache, 'wb') { |f| @cache.save(f) }
    rescue
      @logger.fatal("Exception caught while writing cache to #{@config.cache}: #{$!}")
    end
  end

  # Disconnects every IMAP account. The rescue sits inside the loop so
  # the failing account is still in scope for the log message and the
  # remaining accounts are closed as well.
  def disconnect_accounts
    @logger.info("Closing IMAP connections")
    @config.imap_accounts.each_value do |ac|
      begin
        ac.disconnect
      rescue
        @logger.fatal("Exception caught while closing connection to #{ac.to_s}: #{$!}")
      end
    end
  end
end
require 'net/http'
require 'time' # for Time#httpdate
# get openssl if available (a missing library raises LoadError, which a
# bare rescue would not catch)
begin
  require 'openssl'
  require 'net/https'
rescue LoadError
end
require 'uri'

# Class used to retrieve the feed over HTTP
# TODO non standard port, authentification
# TODO don't use If-Mod-Since if = 0

# Value of the User-Agent header; includes the version when it is known
# at load time.
if defined?(F2I_VERSION)
  USERAGENT = "Feed2Imap v#{F2I_VERSION} http://home.gna.org/feed2imap/"
else
  USERAGENT = 'Feed2Imap http://home.gna.org/feed2imap/'
end

class HTTPFetcher
  # Returns an unopened Net::HTTP object for +uri+, with SSL enabled for
  # https. Raises RuntimeError for any other scheme.
  def self.connection_for(uri)
    http = Net::HTTP::new(uri.host, uri.port)
    case uri.scheme
    when 'http'
      # plain connection, nothing to set
    when 'https'
      http.use_ssl = true
    else
      raise "Unsupported scheme: #{uri.scheme}"
    end
    http
  end

  # Fetches +uri+ and follows up to +recursion+ redirections.
  #
  # baseuri::   the URI the user asked for; its credentials are reused
  #             when a redirection stays on the same host
  # uri::       the URI to request now
  # lastcheck:: Time sent as If-Modified-Since
  # recursion:: number of redirections still allowed
  #
  # Returns the body, or nil when the server answers 304 Not Modified.
  # Raises RuntimeError on timeout and lets Net::HTTP raise for any
  # other response.
  def self.fetcher(baseuri, uri, lastcheck, recursion)
    http = connection_for(uri)
    req = Net::HTTP::Get::new(uri.request_uri,
                              { 'User-Agent' => USERAGENT,
                                'If-Modified-Since' => lastcheck.httpdate })
    if uri.userinfo
      # limit of 2: the password may itself contain ':'
      login, pw = uri.userinfo.split(':', 2)
      req.basic_auth(login, pw)
    # workaround. eg. wikini redirects and loses auth info.
    elsif uri.host == baseuri.host and baseuri.userinfo
      login, pw = baseuri.userinfo.split(':', 2)
      req.basic_auth(login, pw)
    end
    begin
      response = http.request(req)
    rescue Timeout::Error
      raise "Timeout while fetching #{uri.to_s}"
    end
    case response
    when Net::HTTPSuccess
      return response.body
    when Net::HTTPRedirection
      # if not modified
      return nil if Net::HTTPNotModified === response
      if recursion > 0
        redir = URI::join(uri.to_s, response['location'])
        return fetcher(baseuri, redir, lastcheck, recursion - 1)
      end
    end
    # or raise an exception
    response.error!
  end

  # Fetches +url+, following at most 5 redirections.
  # Returns the body, or nil when it was not modified since +lastcheck+.
  def self.fetch(url, lastcheck)
    uri = URI::parse(url)
    fetcher(uri, uri, lastcheck, 5)
  end
end
# This class is a container of IMAP accounts.
# Thanks to it, accounts are re-used : several feeds
# using the same IMAP account will create only one
# IMAP connection.
class ImapAccounts < Hash
  # Returns the account for +uri+, creating it on first use. Accounts
  # are keyed by scheme, userinfo, host and port only; the path (the
  # folder) is ignored.
  def add_account(uri)
    key = URI::Generic::build({ :scheme => uri.scheme,
                                :userinfo => uri.userinfo,
                                :host => uri.host,
                                :port => uri.port })
    self[key] = ImapAccount::new(key) if not include?(key)
    self[key]
  end
end

# This class is an IMAP account, holding the connection
# once it has been established
class ImapAccount
  attr_reader :uri

  def initialize(uri)
    @uri = uri
  end

  # connects to the IMAP server and logs in
  # raises an exception if it fails (unknown scheme, missing
  # credentials, or any error from Net::IMAP)
  def connect
    case uri.scheme
    when 'imap'
      port = 143
      usessl = false
    when 'imaps'
      port = 993
      usessl = true
    else
      raise "Unknown scheme: #{uri.scheme}"
    end
    # use given port if port given
    port = uri.port if uri.port
    # fail with a clear message instead of a NoMethodError on nil
    raise "No credentials given for #{uri.host}" if uri.userinfo.nil?
    user, password = uri.userinfo.split(':', 2)
    @connection = Net::IMAP::new(uri.host, port, usessl)
    @connection.login(user, password)
  end

  # disconnect from the IMAP server
  def disconnect
    @connection.disconnect if @connection
  end

  # Returns true if the folder exist.
  # Both a nil and an empty LIST result mean that it does not.
  def folder_exist?(folder)
    found = @connection.list('', folder)
    !(found.nil? || found.empty?)
  end

  # Creates the given folder and subscribes to it. Returns self.
  def create_folder(folder)
    @connection.create(folder)
    @connection.subscribe(folder)
    self
  end

  # Put the mail in the given folder
  # You should check whether the folder exist first.
  def putmail(folder, mail)
    # TODO check response
    @connection.append(folder, mail)
  end

  # Replaces the mail whose X-CacheIndex header is "-idx-" by +mail+:
  # the old one, if found, is flagged deleted and expunged, then the new
  # one is appended. Raises when several mails carry that index.
  def updatemail(folder, mail, idx)
    # TODO check response
    # TODO keep flags of deleted mail
    @connection.select(folder)
    searchres = @connection.search(['HEADER', 'X-CacheIndex', "-#{idx}-"])
    if searchres.length == 1
      @connection.store(searchres[0], "+FLAGS", [:Deleted])
      @connection.expunge
    elsif searchres.length != 0
      raise "Search returned multiple results !!"
    end
    putmail(folder, mail)
  end

  def to_s
    uri.to_s
  end
end
# Patch for REXML
# Very ugly patch to make REXML error-proof.
# The problem is REXML uses IConv, which isn't error-proof at all.
# With those changes, it relies on String#toUTF8 instead.
module REXML
  module Encoding
    # Input is converted with String#toUTF8, using the current encoding.
    def decode(str)
      str.toUTF8(@encoding)
    end

    # Output is passed through untouched.
    def encode(str)
      str
    end

    # Records the encoding name (defaulting to 'utf-8'); does nothing
    # when the same encoding is already set.
    def encoding=(enc)
      return if defined?(@encoding) && enc == @encoding
      @encoding = enc || 'utf-8'
    end
  end
end

# Patches for ruby mail
# The problem is it creates a mail with multipart/mixed (= for attachments), but I need
# multipart/alternative. I just overwrite the two methods doing this.

require 'rmail'

module RMail
  class Header
    undef set_boundary
    # Stores +boundary+ in the Content-Type parameters, using
    # multipart/alternative when no content type is set yet.
    def set_boundary(boundary)
      parameters = params_quoted('content-type') || {}
      parameters['boundary'] = boundary
      mime_type = content_type() || "multipart/alternative"
      delete('Content-Type')
      add('Content-Type', mime_type, nil, parameters)
    end
  end

  class Message
    # TODO find a way to avoid the warning. undef'ing initialize causes a warning.
    def initialize
      @header = RMail::Header.new
      @body = nil
      @epilogue = nil
      @preamble = nil
      @delimiters = nil
    end
  end
end
require 'uri' # for URI::regexp

# This class provides various converters
#
# NOTE(review): the markup inside several literals of this class (<p>,
# <br>, <a href>) was missing from the reviewed text. Those literals are
# a reconstruction - compare them with the upstream file.
class String
  # is this text HTML ? search for <p> and <br> tags
  # Returns the position of the first one, or nil.
  def html?
    self =~ /<p>|<br\s*\/?\s*>/
  end

  # convert text to HTML
  # Text that already looks like HTML is returned as an unchanged copy.
  # Otherwise the text is wrapped in a paragraph, blank lines become
  # paragraph breaks and http/ftp/https URIs become links.
  def text2html
    # dup, not clone: a copy of a frozen string must be modifiable
    text = dup
    return text if text.html?
    # paragraphs
    text.gsub!(/\A\s*(.*)\Z/m, '<p>\1</p>')
    text.gsub!(/\s*\n(\s*\n)+\s*/, "</p>\n<p>")
    # uris
    text.gsub!(/(#{URI::regexp(['http', 'ftp', 'https'])})/,
               '<a href="\1">\1</a>')
    text
  end

  # Convert an HTML text to plain text
  def html2text
    text = dup
    # let's remove all CR
    text.gsub!(/\n/, '')
    # convert <p> and <br>
    text.gsub!(/\s*<\/p>\s*/, '')
    text.gsub!(/\s*<p(\s[^>]*)?>\s*/, "\n\n")
    text.gsub!(/\s*<br\s*\/?\s*>\s*/, "\n")
    # remove other tags
    text.gsub!(/<[^>]*>/, '')
    # remove leading and trailing whitespace
    text.gsub!(/\A\s*/m, '')
    text.gsub!(/\s*\Z/m, '')
    text
  end

  # Remove white space around the text, in place. Returns self.
  def rmWhiteSpace!
    # gsub! returns nil when nothing matched, so the two calls are not
    # chained and self is returned explicitly.
    gsub!(/\A\s*/m, '')
    gsub!(/\s*\Z/m, '')
    self
  end

  # Convert a text in inputenc to a text in UTF8
  # must take care of wrong input locales
  #
  # inputenc:: declared encoding name, any case; it is not modified and
  #            may be nil (treated as "not UTF-8")
  #
  # Returns self when the text is declared UTF-8 or turns out to be
  # valid UTF-8; otherwise every byte is taken as one code point and
  # re-encoded.
  def toUTF8(inputenc)
    return self if inputenc.to_s.downcase == 'utf-8'
    # it is said it is not UTF-8. Ensure it is REALLY not UTF-8.
    # Bytes are compared rather than strings so that the encoding tag
    # of the receiver does not influence the result.
    begin
      return self if unpack('U*').pack('U*').unpack('C*') == unpack('C*')
    rescue StandardError
      # not valid UTF-8: fall through to the byte-wise conversion
    end
    begin
      unpack('C*').pack('U*')
    rescue StandardError
      self # failsafe solution. but a dirty one :-)
    end
  end
end