From 8ed0ec9fd97511e847315d44cf0070ced31a9557 Mon Sep 17 00:00:00 2001 From: lnu Date: Sun, 18 Jun 2006 08:59:16 +0000 Subject: pre-release cleanup git-svn-id: svn+ssh://svn.gna.org/svn/feed2imap/trunk/feed2imap@97 f70e237a-67f3-0310-a06c-d2b8a7116972 --- ChangeLog | 15 ++++++--- data/doc/feed2imap/examples/feed2imaprc | 26 +++++++++------ lib/feed2imap/cache.rb | 57 +++++++++++++++++++++++++-------- lib/feed2imap/config.rb | 3 +- lib/feed2imap/feed2imap.rb | 4 +-- manpages/feed2imaprc.xml | 3 +- 6 files changed, 77 insertions(+), 31 deletions(-) diff --git a/ChangeLog b/ChangeLog index 8209430..0879abc 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,15 +1,20 @@ Feed2Imap 0.8 (XX/XX/2006) ============================ -* Fixed a small bug in the duplicate items handling which could have caused - some items to be ignored if they had the same url but different content. -* New always-new flag in the config file to consider all items as new (for - feeds where items are wrongly marked as updated, e.g mediawiki feeds). - See example configuration file for more information. * Uses the http_proxy environment variable to determine the proxy server if available. (fixes gna bug #5820, all credits go to Boyd Adamson ) * Fixes flocking on Solaris (fixes gna bug #5819). Again, all credits go to Boyd Adamson . +* Rewrite of the "find updated and new items" code. It should work much better + now. Also, a debug-updated configuration variable was added to make it + easier to debug those issues. +* New always-new flag in the config file to consider all items as new (for + feeds where items are wrongly marked as updated, e.g mediawiki feeds). + See example configuration file for more information (fixes Debian bug + #366878). +* When disconnecting from the IMAP server, don't display an exception in + non-verbose mode if the "connection is reset by peer" (fixes Debian bug + #367282). Feed2Imap 0.7 (17/02/2006) ============================ diff --git a/data/doc/feed2imap/examples/feed2imaprc b/data/doc/feed2imap/examples/feed2imaprc index 35a9f6c..602e68b 100644 --- a/data/doc/feed2imap/examples/feed2imaprc +++ b/data/doc/feed2imap/examples/feed2imaprc @@ -1,13 +1,21 @@ -# name is the name of the feed (must be unique) -# url is the HTTP[S] address where the feed has to be fetched -# target is the IMAP URI where to put emails -# min-frequency (in HOURS) is the minimum frequency with which this particular -# feed will be fetched +# Global options: +# dumpdir: (for debugging purposes) directory where all fetched feeds will be +# dumped. +# debug-updated: (for debugging purposes) if true, display a lot of information +# about the "updated-items" algorithm. +# +# Per-feed options: +# name: name of the feed (must be unique) +# url: HTTP[S] address where the feed has to be fetched +# target: the IMAP URI where to put emails. Should start with imap:// for IMAP +# and imaps:// for IMAPS. +# min-frequency: (in HOURS) is the minimum frequency with which this particular +# feed will be fetched # disable: if set to something, the feed will be ignored -# always-new: feed2imap tries to use a clever algorithm to determine whether an item -# is new or has been updated. It doesn't work well with some web apps like -# mediawiki. When this flag is enabled, all items which don't match exactly -# a previously downloaded item are considered as new items. +# always-new: feed2imap tries to use a clever algorithm to determine whether +# an item is new or has been updated. It doesn't work well with some web apps +# like mediawiki. When this flag is enabled, all items which don't match +# exactly a previously downloaded item are considered as new items. # # If your login contains an @ character, replace it with %40. Other reserved # characters can be escaped in the same way (see man ascii to get their code) diff --git a/lib/feed2imap/cache.rb b/lib/feed2imap/cache.rb index 9b5861f..a101785 100644 --- a/lib/feed2imap/cache.rb +++ b/lib/feed2imap/cache.rb @@ -17,20 +17,28 @@ along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA =end +# debug mode +$updateddebug = false + # This class manages a cache of items # (items which have already been seen) require 'digest/md5' class ItemCache - def initialize + def initialize(debug = false) @channels = {} @@cacheidx = 0 + $updateddebug = debug self end # Returns the really new items amongst items def get_new_items(id, items, always_new = false) + if $updateddebug + puts "=======================================================" + puts "GET_NEW_ITEMS FOR #{id}... (#{Time::now})" + end @channels[id] ||= CachedChannel::new return @channels[id].get_new_items(items, always_new) end @@ -92,7 +100,8 @@ end class CachedChannel # Size of the cache for each feed - CACHESIZE = 50 + # 100 items should be enough for everybody, even quite busy feeds + CACHESIZE = 100 attr_accessor :lastcheck, :items @@ -113,8 +122,6 @@ class CachedChannel # @nbnewitems is set by get_new_items, and is used to limit the number # of (old) items serialized. - UPDATEDDEBUG = false - # Returns the really new items amongst items def get_new_items(items, always_new = false) # save number of new items @@ -124,6 +131,10 @@ class CachedChannel updateditems = [] @itemstemp = @items items.each { |i| i.cacheditem ||= CachedItem::new(i) } + if $updateddebug + puts "-------Items downloaded before dups removal (#{items.length}) :----------" + items.each { |i| puts "#{i.cacheditem.to_s}" } + end # remove dups dups = true while dups @@ -131,7 +142,7 @@ class CachedChannel for i in 0...items.length do for j in i+1...items.length do if items[i].cacheditem == items[j].cacheditem - if UPDATEDDEBUG + if $updateddebug puts "## Removed duplicate #{items[j].cacheditem.to_s}" end items.delete_at(j) @@ -143,10 +154,10 @@ class CachedChannel end end # debug : dump interesting info to stdout. - if UPDATEDDEBUG - puts "-------Items downloaded :----------" + if $updateddebug + puts "-------Items downloaded after dups removal (#{items.length}) :----------" items.each { |i| puts "#{i.cacheditem.to_s}" } - puts "-------Items already there :----------" + puts "-------Items already there (#{@items.length}) :----------" @items.each { |i| puts "#{i.to_s}" } puts "Items always considered as new: #{always_new.to_s}" end @@ -168,7 +179,7 @@ class CachedChannel # Try to find an updated item @items.each do |j| # Do we need a better heuristic ? - if i.link and i.link == j.link + if j.is_ancestor_of(i) i.cacheditem.index = j.index i.cacheditem.updated = true updateditems.push(i) @@ -187,7 +198,7 @@ class CachedChannel # add i.cacheditem to @itemstemp @itemstemp.unshift(i.cacheditem) end - if UPDATEDDEBUG + if $updateddebug puts "-------New items :----------" newitems.each { |i| puts "#{i.cacheditem.to_s}" } puts "-------Updated items :----------" @@ -200,6 +211,9 @@ class CachedChannel # too old items must be dropped n = @nbnewitems > CACHESIZE ? @nbnewitems : CACHESIZE @items = @itemstemp[0..n] + if $updateddebug + puts "Committing: new items: #{@nbnewitems} / items kept: #{@items.length}" + end @itemstemp = [] self end @@ -212,13 +226,15 @@ end # This class is the only thing kept in the cache class CachedItem - attr_reader :title, :link, :hash + attr_reader :title, :link, :creator, :date, :hash attr_accessor :index attr_accessor :updated def initialize(item) @title = item.title @link = item.link + @date = item.date + @creator = item.creator if item.content.nil? @hash = nil else @@ -227,14 +243,29 @@ class CachedItem end def ==(other) - @title == other.title and @link == other.link and @hash == other.hash + if $updateddebug and @title =~ /e325/ and other.title =~ /e325/ + puts "Comparing #{self.to_s} and #{other.to_s}:" + puts "Title: #{@title == other.title}" + puts "Link: #{@link == other.link}" + puts "Creator: #{@creator == other.creator}" + puts "Date: #{@date == other.date}" + puts "Hash: #{@hash == other.hash}" + end + @title == other.title and @link == other.link and + (@creator.nil? or other.creator.nil? or @creator == other.creator) and + (@date.nil? or other.date.nil? or @date == other.date) and @hash == other.hash end def create_index @index = ItemCache.getindex end + def is_ancestor_of(other) + (@link and other.link and @link == other.link) and + ((@creator and other.creator and @creator == other.creator) or (@creator.nil?)) + end + def to_s - "\"#{@title}\" #{@link} #{@hash}" + "\"#{@title}\" #{@creator}/#{@date} #{@link} #{@hash}" end end diff --git a/lib/feed2imap/config.rb b/lib/feed2imap/config.rb index 1e13a04..38a1faa 100644 --- a/lib/feed2imap/config.rb +++ b/lib/feed2imap/config.rb @@ -26,7 +26,7 @@ DEFCACHE = ENV['HOME'] + '/.feed2imap.cache' # Feed2imap configuration class F2IConfig - attr_reader :imap_accounts, :cache, :feeds, :dumpdir + attr_reader :imap_accounts, :cache, :feeds, :dumpdir, :updateddebug # Load the configuration from the IO stream # TODO should do some sanity check on the data read. @@ -36,6 +36,7 @@ class F2IConfig @dumpdir = @conf['dumpdir'] || nil @conf['feeds'] ||= [] @feeds = [] + @updateddebug = (@conf['debug-updated'] and @conf['debug-updated'] != 'false') @imap_accounts = ImapAccounts::new @conf['feeds'].each do |f| if f['disable'].nil? diff --git a/lib/feed2imap/feed2imap.rb b/lib/feed2imap/feed2imap.rb index a40dd66..fcbb357 100644 --- a/lib/feed2imap/feed2imap.rb +++ b/lib/feed2imap/feed2imap.rb @@ -61,7 +61,7 @@ class Feed2Imap end # init cache @logger.info('Initializing cache') - @cache = ItemCache::new + @cache = ItemCache::new(@config.updateddebug) if not File::exist?(@config.cache + '.lock') f = File::new(@config.cache + '.lock', 'w') f.close @@ -187,7 +187,7 @@ class Feed2Imap begin ac.disconnect rescue - @logger.fatal("Exception caught while closing connection to #{ac.to_s}: #{$!}") + @logger.info("Exception caught while closing connection to #{ac.to_s}: #{$!}") end end end diff --git a/manpages/feed2imaprc.xml b/manpages/feed2imaprc.xml index 6731f6e..7643d3d 100644 --- a/manpages/feed2imaprc.xml +++ b/manpages/feed2imaprc.xml @@ -43,7 +43,8 @@ BUGS - This manpage should probably give more details. + This manpage should probably give more details. However, the example configuration file is +very well documented. SEE ALSO -- cgit v1.2.3-54-g00ecf