From dd108ab5ffc959ebfaf326769e5fc74822b6647b Mon Sep 17 00:00:00 2001
From: lnu
Date: Mon, 23 Oct 2006 16:21:20 +0000
Subject: git-svn-id: svn+ssh://svn.gna.org/svn/feed2imap/trunk/feed2imap@105 f70e237a-67f3-0310-a06c-d2b8a7116972

---
 ChangeLog                               |  8 +++++
 bin/feed2imap                           | 17 ++++++++--
 data/doc/feed2imap/examples/feed2imaprc |  7 +++++
 lib/feed2imap/cache.rb                  | 34 ++++++++++++++++----
 lib/feed2imap/config.rb                 |  6 ++--
 lib/feed2imap/feed2imap.rb              | 56 ++++++++++++++++++++++++---------
 6 files changed, 104 insertions(+), 24 deletions(-)

diff --git a/ChangeLog b/ChangeLog
index cc017d5..ec0c198 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -4,6 +4,14 @@ Feed2Imap 0.9 (XX/XX/2006)
   )
 * Now supports Snownews/Liferea scripts, using the 'execurl' and 'filter'
   config keywords. For more information, see the example configuration file.
+* Slightly better option parsing. Thanks to Paul van Tilburg for the
+  patch.
+* A debug mode was added, and the normal mode was improved, so it is
+  no longer necessary to redirect feed2imap output to /dev/null:
+  transient errors are only reported after they have happened a
+  certain number of times (default 5).
+* An ignore-hash option was added for feeds whose content changes all
+  the time.
 
 Feed2Imap 0.8 (28/06/2006)
 ============================
diff --git a/bin/feed2imap b/bin/feed2imap
index 0095d9f..b31c191 100755
--- a/bin/feed2imap
+++ b/bin/feed2imap
@@ -9,13 +9,20 @@ verbose = false
 version = false
 cacherebuild = false
 configf = ENV['HOME'] + '/.feed2imaprc'
+progname = File::basename($PROGRAM_NAME)
 opts = OptionParser::new do |opts|
-  opts.banner = "Usage: ./feed2imap.rb [options]"
+  opts.banner = "Usage: #{progname} [options]"
   opts.separator ""
   opts.separator "Options:"
+
   opts.on("-v", "--verbose", "Verbose mode") do |v|
     verbose = true
   end
+
+  opts.on("-d", "--debug", "Debug mode") do |v|
+    verbose = :debug
+  end
+
   opts.on("-V", "--version", "Display Feed2Imap version") do |v|
     version = true
   end
@@ -26,7 +33,13 @@ opts = OptionParser::new do |opts|
     configf = f
   end
 end
-opts.parse!(ARGV)
+begin
+  opts.parse!(ARGV)
+rescue OptionParser::ParseError => pe
+  opts.warn pe
+  puts opts
+  exit 1
+end
 
 if version
   puts "Feed2Imap v.#{F2I_VERSION}"
diff --git a/data/doc/feed2imap/examples/feed2imaprc b/data/doc/feed2imap/examples/feed2imaprc
index 0d0c3ed..0d8eaac 100644
--- a/data/doc/feed2imap/examples/feed2imaprc
+++ b/data/doc/feed2imap/examples/feed2imaprc
@@ -1,4 +1,7 @@
 # Global options:
+# max-failures: maximum number of failures allowed before they are reported in
+#   normal mode (default 5). By default, failures are only visible in verbose
+#   mode.
 # dumpdir: (for debugging purposes) directory where all fetched feeds will be
 #   dumped.
 # debug-updated: (for debugging purposes) if true, display a lot of information
@@ -16,6 +19,10 @@
 #   an item is new or has been updated. It doesn't work well with some web apps
 #   like mediawiki. When this flag is enabled, all items which don't match
 #   exactly a previously downloaded item are considered as new items.
+# ignore-hash: Some feeds change the content of their items all the time, so
+#   feed2imap detects that they have been updated at each run. When this flag
+#   is enabled, feed2imap ignores the content of an item when determining
+#   whether the item is already known.
 # Snownews/Liferea scripts support :
 # execurl: Command to execute that will display the RSS/Atom feed on stdout
 # filter: Command to execute which will receive the RSS/Atom feed on stdin,
diff --git a/lib/feed2imap/cache.rb b/lib/feed2imap/cache.rb
index a101785..4687b76 100644
--- a/lib/feed2imap/cache.rb
+++ b/lib/feed2imap/cache.rb
@@ -34,13 +34,13 @@ class ItemCache
   end
 
   # Returns the really new items amongst items
-  def get_new_items(id, items, always_new = false)
+  def get_new_items(id, items, always_new = false, ignore_hash = false)
     if $updateddebug
       puts "======================================================="
       puts "GET_NEW_ITEMS FOR #{id}... (#{Time::now})"
     end
     @channels[id] ||= CachedChannel::new
-    return @channels[id].get_new_items(items, always_new)
+    return @channels[id].get_new_items(items, always_new, ignore_hash)
   end
 
   # Commit changes to the cache
@@ -59,9 +59,16 @@ class ItemCache
   def set_last_check(id, time)
     @channels[id] ||= CachedChannel::new
     @channels[id].lastcheck = time
+    @channels[id].failures = 0
     self
   end
 
+  # Register a fetching failure for this feed.
+  # Returns the number of failures so far.
+  def fetch_failed(id)
+    (@channels[id] ||= CachedChannel::new).fetch_failed
+  end
+
   # Load the cache from an IO stream
   def load(io)
     begin
@@ -103,13 +110,14 @@ class CachedChannel
   # 100 items should be enough for everybody, even quite busy feeds
   CACHESIZE = 100
 
-  attr_accessor :lastcheck, :items
+  attr_accessor :lastcheck, :items, :failures
 
   def initialize
     @lastcheck = Time::at(0)
     @items = []
     @itemstemp = [] # see below
     @nbnewitems = 0
+    @failures = 0
   end
 
   # Let's explain @items and @itemstemp.
@@ -123,7 +131,7 @@ class CachedChannel
   # of (old) items serialized.
 
   # Returns the really new items amongst items
-  def get_new_items(items, always_new = false)
+  def get_new_items(items, always_new = false, ignore_hash = false)
     # save number of new items
     @nbnewitems = items.length
     # set items' cached version if not set yet
@@ -165,7 +173,10 @@ class CachedChannel
       found = false
       # Try to find a perfect match
       @items.each do |j|
-        if i.cacheditem == j
+        # note that simple_compare is only defined on CachedItem, not on RSSItem, so we have to use
+        # j.simple_compare(i) and not i.simple_compare(j)
+        if (i.cacheditem == j and not ignore_hash) or
+           (j.simple_compare(i) and ignore_hash)
           i.cacheditem.index = j.index
           found = true
           # let's put j in front of itemstemp
@@ -222,6 +233,12 @@ class CachedChannel
   def nbitems
     @items.length
   end
+
+  def fetch_failed
+    @failures = 0 if @failures.nil?
+    @failures += 1
+    return @failures
+  end
 end
 
 # This class is the only thing kept in the cache
@@ -243,7 +260,7 @@ class CachedItem
   end
 
   def ==(other)
-    if $updateddebug and @title =~ /e325/ and other.title =~ /e325/
+    if $updateddebug
       puts "Comparing #{self.to_s} and #{other.to_s}:"
       puts "Title: #{@title == other.title}"
       puts "Link: #{@link == other.link}"
@@ -256,6 +273,11 @@ class CachedItem
       (@date.nil? or other.date.nil? or @date == other.date) and
       @hash == other.hash
   end
+  def simple_compare(other)
+    @title == other.title and @link == other.link and
+      (@creator.nil? or other.creator.nil? or @creator == other.creator)
+  end
+
   def create_index
     @index = ItemCache.getindex
   end
diff --git a/lib/feed2imap/config.rb b/lib/feed2imap/config.rb
index 10c4d82..4ab522e 100644
--- a/lib/feed2imap/config.rb
+++ b/lib/feed2imap/config.rb
@@ -26,7 +26,7 @@ DEFCACHE = ENV['HOME'] + '/.feed2imap.cache'
 
 # Feed2imap configuration
 class F2IConfig
-  attr_reader :imap_accounts, :cache, :feeds, :dumpdir, :updateddebug
+  attr_reader :imap_accounts, :cache, :feeds, :dumpdir, :updateddebug, :max_failures
 
   # Load the configuration from the IO stream
   # TODO should do some sanity check on the data read.
@@ -36,6 +36,7 @@ class F2IConfig
     @dumpdir = @conf['dumpdir'] || nil
     @conf['feeds'] ||= []
     @feeds = []
+    @max_failures = (@conf['max-failures'] || 5).to_i
     @updateddebug = (@conf['debug-updated'] and @conf['debug-updated'] != 'false')
     @imap_accounts = ImapAccounts::new
     @conf['feeds'].each do |f|
@@ -71,7 +72,7 @@ end
 
 # A configured feed. simple data container.
 class ConfigFeed
-  attr_reader :name, :url, :imapaccount, :folder, :always_new, :execurl, :filter
+  attr_reader :name, :url, :imapaccount, :folder, :always_new, :execurl, :filter, :ignore_hash
   attr_accessor :body
 
   def initialize(f, imapaccount, folder)
@@ -83,6 +84,7 @@ class ConfigFeed
     @always_new = (f['always-new'] and f['always-new'] != 'false')
     @execurl = f['execurl']
     @filter = f['filter']
+    @ignore_hash = f['ignore-hash'] || false
     @freq = @freq.to_i if @freq
   end
 
diff --git a/lib/feed2imap/feed2imap.rb b/lib/feed2imap/feed2imap.rb
index 3cf46aa..2f62a80 100644
--- a/lib/feed2imap/feed2imap.rb
+++ b/lib/feed2imap/feed2imap.rb
@@ -36,14 +36,17 @@ class Feed2Imap
   def initialize(verbose, cacherebuild, configfile)
     @logger = Logger::new(STDOUT)
-    if verbose
+    if verbose == :debug
       @logger.level = Logger::DEBUG
+      require 'pp'
+    elsif verbose == true
+      @logger.level = Logger::INFO
     else
       @logger.level = Logger::WARN
     end
     @logger.info("Feed2Imap V.#{F2I_VERSION} started")
 
     # reading config
-    @logger.info('Reading configuration file')
+    @logger.info('Reading configuration file ...')
     if not File::exist?(configfile)
       @logger.fatal("Configuration file #{configfile} not found.")
       exit(1)
@@ -60,8 +63,13 @@ class Feed2Imap
       @logger.fatal("Error while reading configuration file, exiting: #{$!}")
       exit(1)
     end
+    if @logger.level == Logger::DEBUG
+      @logger.debug("Configuration read:")
+      pp(@config)
+    end
+
     # init cache
-    @logger.info('Initializing cache')
+    @logger.info('Initializing cache ...')
     @cache = ItemCache::new(@config.updateddebug)
     if not File::exist?(@config.cache + '.lock')
       f = File::new(@config.cache + '.lock', 'w')
@@ -78,8 +86,9 @@ class Feed2Imap
         @cache.load(f)
       end
     end
+
     # connecting all IMAP accounts
-    @logger.info('Connecting to IMAP accounts')
+    @logger.info('Connecting to IMAP accounts ...')
     @config.imap_accounts.each_value do |ac|
       begin
         ac.connect
@@ -88,8 +97,9 @@ class Feed2Imap
         exit(1)
       end
     end
+
     # check that IMAP folders exist
-    @logger.info("Checking IMAP folders")
+    @logger.info("Checking IMAP folders ...")
     @config.feeds.each do |f|
       begin
         f.imapaccount.create_folder(f.folder) if not f.imapaccount.folder_exist?(f.folder)
@@ -99,7 +109,7 @@ class Feed2Imap
       end
     end
     # for each feed, fetch, upload to IMAP and cache
-    @logger.info("Fetching and filtering feeds")
+    @logger.info("Fetching and filtering feeds ...")
     ths = []
     mutex = Mutex::new
     @config.feeds.each do |f|
@@ -126,6 +136,8 @@ class Feed2Imap
             mutex.lock
             feed.body = s
             @cache.set_last_check(feed.name, Time::now)
+          else
+            @logger.debug("Feed #{feed.name} doesn't need to be checked again for now.")
           end
           mutex.unlock
           # dump if requested
@@ -139,19 +151,34 @@ class Feed2Imap
           end
         rescue Timeout::Error
           mutex.synchronize do
-            @logger.fatal("Timeout::Error while fetching #{feed.url}: #{$!}")
+            n = @cache.fetch_failed(feed.name)
+            m = "Timeout::Error while fetching #{feed.url}: #{$!} (failed #{n} times)"
+            if n > @config.max_failures
+              @logger.fatal(m)
+            else
+              @logger.info(m)
+            end
           end
         rescue
           mutex.synchronize do
-            @logger.fatal("Error while fetching #{feed.url}: #{$!}")
+            n = @cache.fetch_failed(feed.name)
+            m = "Error while fetching #{feed.url}: #{$!} (failed #{n} times)"
+            if n > @config.max_failures
+              @logger.fatal(m)
+            else
+              @logger.info(m)
+            end
          end
        end
      end
    end
    ths.each { |t| t.join }
-    @logger.info("Parsing and uploading")
+    @logger.info("Parsing and uploading ...")
    @config.feeds.each do |f|
-      next if f.body.nil? # means 304
+      if f.body.nil? # means 304
+        @logger.debug("Feed #{f.name} did not change.")
+        next
+      end
      begin
        feed = FeedParser::Feed::new(f.body)
      rescue Exception => e
@@ -159,13 +186,13 @@ class Feed2Imap
        next
      end
      begin
-        newitems, updateditems = @cache.get_new_items(f.name, feed.items, f.always_new)
+        newitems, updateditems = @cache.get_new_items(f.name, feed.items, f.always_new, f.ignore_hash)
      rescue
        @logger.fatal("Exception caught when selecting new items for #{f.name}: #{$!}")
        puts $!.backtrace
        next
      end
-      @logger.info("#{f.name}: #{newitems.length} new items, #{updateditems.length} updated items.") if newitems.length > 0 or updateditems.length > 0
+      @logger.info("#{f.name}: #{newitems.length} new items, #{updateditems.length} updated items.") if newitems.length > 0 or updateditems.length > 0 or @logger.level == Logger::DEBUG
      begin
        if !cacherebuild
          updateditems.each do |i|
@@ -190,17 +217,18 @@ class Feed2Imap
        next
      end
    end
-    @logger.info("Finished. Saving cache")
+    @logger.info("Finished. Saving cache ...")
    begin
      File::open(@config.cache, 'w') { |f| @cache.save(f) }
    rescue
      @logger.fatal("Exception caught while writing cache to #{@config.cache}: #{$!}")
    end
-    @logger.info("Closing IMAP connections")
+    @logger.info("Closing IMAP connections ...")
    @config.imap_accounts.each_value do |ac|
      begin
        ac.disconnect
      rescue
+        # servers tend to cause an exception to be raised here, hence the INFO level.
        @logger.info("Exception caught while closing connection to #{ac.to_s}: #{$!}")
      end
    end
-- 
cgit v1.2.3-70-g09d2
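
The two user-visible knobs introduced by this change are the global 'max-failures' option and the per-feed 'ignore-hash' flag, and the new -d/--debug switch turns on the extra debug output. The configuration sketch below is illustrative only and is not taken from the patch; the 'name', 'url' and 'target' keys of the feed entry follow the usual feed2imaprc layout and are assumed here, since they do not appear in this diff.

    # hypothetical ~/.feed2imaprc sketch, not part of the patch
    max-failures: 10       # report fetch errors in normal mode only after 10 consecutive failures
    feeds:
      - name: hypothetical-feed                                   # assumed key
        url: http://example.org/feed.rss                          # placeholder URL
        target: imap://user@mail.example.org/INBOX.Feeds.Example  # assumed key
        ignore-hash: true  # item content changes on every fetch; match on title/link/creator instead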
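
To make the new error-reporting behaviour concrete, here is a small self-contained Ruby sketch of the same idea (an illustration only, not code from feed2imap): failures are counted per feed and reported at INFO level (invisible in normal mode) until the count exceeds the configured maximum, after which they are reported as FATAL; a successful fetch resets the counter, which the patch does in set_last_check.

    require 'logger'

    logger = Logger.new($stdout)
    logger.level = Logger::WARN     # "normal" mode: INFO messages are hidden
    max_failures = 5                # same default as the new 'max-failures' option
    failures = Hash.new(0)          # per-feed failure counter, reset to 0 on success

    def report(logger, count, max, msg)
      # transient errors stay at INFO until they have happened more than 'max' times
      count > max ? logger.fatal(msg) : logger.info(msg)
    end

    7.times do
      failures['example-feed'] += 1
      report(logger, failures['example-feed'], max_failures,
             "Error while fetching example-feed (failed #{failures['example-feed']} times)")
    end
    # with max_failures = 5, only the 6th and 7th attempts produce visible output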