From c8a7a285dc83f05cbbd8b935fcf1d9c780f77ced Mon Sep 17 00:00:00 2001 From: lnu Date: Mon, 2 Jan 2006 20:08:52 +0000 Subject: Now uses ruby-feedparser for feed parsing git-svn-id: svn+ssh://svn.gna.org/svn/feed2imap/trunk/feed2imap@73 f70e237a-67f3-0310-a06c-d2b8a7116972 --- lib/feed2imap/channel.rb | 334 ---------------------------------------- lib/feed2imap/feed2imap.rb | 19 ++- lib/feed2imap/itemtomail.rb | 77 +++++++++ lib/feed2imap/rexml_patch.rb | 2 +- lib/feed2imap/textconverters.rb | 152 ------------------ 5 files changed, 91 insertions(+), 493 deletions(-) delete mode 100644 lib/feed2imap/channel.rb create mode 100644 lib/feed2imap/itemtomail.rb delete mode 100644 lib/feed2imap/textconverters.rb (limited to 'lib/feed2imap') diff --git a/lib/feed2imap/channel.rb b/lib/feed2imap/channel.rb deleted file mode 100644 index c43c254..0000000 --- a/lib/feed2imap/channel.rb +++ /dev/null @@ -1,334 +0,0 @@ -=begin -Feed2Imap - RSS/Atom Aggregator uploading to an IMAP Server -Copyright (c) 2005 Lucas Nussbaum - -This program is free software; you can redistribute it and/or modify -it under the terms of the GNU General Public License as published by -the Free Software Foundation; either version 2 of the License, or -(at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with this program; if not, write to the Free Software -Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -=end - -# This class allows to retrieve a feed and parse it into a Channel - -require 'rexml/document' -require 'time' -require 'rmail' -require 'feed2imap/textconverters' -require 'feed2imap/rubymail_patch' -require 'feed2imap/rexml_patch' -require 'base64' - -class UnknownFeedTypeException < RuntimeError -end -# an RSS/Atom channel -class Channel - attr_reader :title, :link, :description, :creator, :encoding, :items - - # parse str to build a channel - def initialize(str = nil) - parse_str(str) if str - end - - # Determines all the fields using a string containing an - # XML document - def parse_str(str) - # Dirty hack: some feeds contain the & char. It must be changed to & - str.gsub!(/&(\s+)/, '&\1') - doc = REXML::Document.new(str) - # get channel info - @encoding = doc.encoding - @title,@link,@description,@creator = nil - @items = [] - if doc.root.elements['channel'] || doc.root.elements['rss:channel'] - # We have a RSS feed! - # Title - if (e = doc.root.elements['channel/title'] || - doc.root.elements['rss:channel/rss:title']) && e.text - @title = e.text.toUTF8(@encoding).rmWhiteSpace! - end - # Link - if (e = doc.root.elements['channel/link'] || - doc.root.elements['rss:channel/rss:link']) && e.text - @link = e.text.rmWhiteSpace! - end - # Description - if (e = doc.root.elements['channel/description'] || - doc.root.elements['rss:channel/rss:description']) && e.text - @description = e.text.toUTF8(@encoding).rmWhiteSpace! - end - # Creator - if ((e = doc.root.elements['channel/dc:creator']) && e.text) || - ((e = doc.root.elements['channel/author'] || - doc.root.elements['rss:channel/rss:author']) && e.text) - @creator = e.text.toUTF8(@encoding).rmWhiteSpace! - end - # Items - if doc.root.elements['channel/item'] - query = 'channel/item' - elsif doc.root.elements['item'] - query = 'item' - elsif doc.root.elements['rss:channel/rss:item'] - query = 'rss:channel/rss:item' - else - query = 'rss:item' - end - doc.root.each_element(query) { |e| @items << Item::new(e, self) } - - elsif doc.root.elements['/feed'] - # We have an ATOM feed! - # Title - if (e = doc.root.elements['/feed/title']) && e.text - @title = e.text.toUTF8(@encoding).rmWhiteSpace! - end - # Link - doc.root.each_element('/feed/link') do |e| - if e.attribute('type') and ( - e.attribute('type').value == 'text/html' or - e.attribute('type').value == 'application/xhtml' or - e.attribute('type').value == 'application/xhtml+xml') - if (h = e.attribute('href')) && h - @link = h.value.rmWhiteSpace! - end - end - end - # Description - if e = doc.root.elements['/feed/info'] - @description = e.elements.to_s.toUTF8(@encoding).rmWhiteSpace! - end - # Items - doc.root.each_element('/feed/entry') do |e| - @items << AtomItem::new(e, self) - end - else - raise UnknownFeedTypeException::new - end - end - - def to_s - s = "Title: #{@title}\nLink: #{@link}\n\n" - @items.each { |i| s += i.to_s } - s - end -end - -# an Item from a channel -class Item - attr_accessor :title, :link, :content, :date, :creator, :subject, - :category, :cacheditem - attr_reader :channel - def initialize(item = nil, channel = nil) - @channel = channel - @title, @link, @content, @date, @creator, @subject, @category = nil - if item - # Title - if ((e = item.elements['title'] || item.elements['rss:title']) && - e.text) || - ((e = item.elements['pubDate'] || item.elements['rss:pubDate']) && - e.text) - @title = e.text.toUTF8(@channel.encoding).rmWhiteSpace! - end - # Link - if ((e = item.elements['link'] || item.elements['rss:link']) && e.text)|| - (e = item.elements['guid'] || item.elements['rss:guid'] and - not (e.attribute('isPermaLink') and - e.attribute('isPermaLink').value == 'false')) - @link = e.text.rmWhiteSpace! - end - # Content - if (e = item.elements['content:encoded']) || - (e = item.elements['description'] || item.elements['rss:description']) - if e.children.length > 1 - s = '' - e.children.each { |c| s += c.to_s } - @content = s.toUTF8(@channel.encoding).rmWhiteSpace!.text2html - elsif e.children.length == 1 - if e.cdatas[0] - @content = e.cdatas[0].to_s.toUTF8(@channel.encoding).rmWhiteSpace! - elsif e.text - @content = e.text.toUTF8(@channel.encoding).text2html - end - end - end - # Date - if e = item.elements['dc:date'] || item.elements['pubDate'] || - item.elements['rss:pubDate'] - begin - @date = Time::xmlschema(e.text) - rescue - begin - @date = Time::rfc2822(e.text) - rescue - begin - @date = Time::parse(e.text) - rescue - @date = nil - end - end - end - end - # Creator - @creator = @channel.creator - if (e = item.elements['dc:creator'] || item.elements['author'] || - item.elements['rss:author']) && e.text - @creator = e.text.toUTF8(@channel.encoding).rmWhiteSpace! - end - # Subject - if (e = item.elements['dc:subject']) && e.text - @subject = e.text.toUTF8(@channel.encoding).rmWhiteSpace! - end - # Category - if (e = item.elements['dc:category'] || item.elements['category'] || - item.elements['rss:category']) && e.text - @category = e.text.toUTF8(@channel.encoding).rmWhiteSpace! - end - end - end - - def to_s - "--------------------------------\n" + - "Title: #{@title}\nLink: #{@link}\n" + - "Date: #{@date.to_s}\nCreator: #{@creator}\n" + - "Subject: #{@subject}\nCategory: #{@category}\nContent:\n#{content}\n" - end - - def to_text - s = "" - s += "Channel: " - s += @channel.title.toISO_8859_1('utf-8') + ' ' if @channel.title - s += "<#{@channel.link.toISO_8859_1('utf-8')}>" if @channel.link - s += "\n" - s += "Item: " - s += @title.toISO_8859_1('utf-8') + ' ' if @title - s += "<#{@link.toISO_8859_1('utf-8')}>" if @link - s += "\n" - s += "\nDate: #{@date.to_s.toISO_8859_1('utf-8')}" if @date # TODO improve date rendering ? - s += "\nAuthor: #{@creator.toISO_8859_1('utf-8')}" if @creator - - s += "\nSubject: #{@subject.toISO_8859_1('utf-8')}" if @subject - s += "\nCategory: #{@category.toISO_8859_1('utf-8')}" if @category - s += "\n\n" - s += "#{@content.html2text.toISO_8859_1('utf-8')}" if @content - s - end - - def to_html - s = '' - s += '' - s += '' - s += "

Channel: " - s += "" if @channel.link - s += @channel.title if @channel.title - s += "" if @channel.link - s += "
\nItem: " - s += "" if @link - s += @title if @title - s += "" if @link - s += "\n" - s += "
Date: #{@date.to_s}" if @date # TODO improve date rendering ? - s += "
Author: #{@creator}" if @creator - s += "
Subject: #{@subject}" if @subject - s += "
Category: #{@category}" if @category - s += "

" - s += "

#{@content}

" if @content - s += '' - s - end - - def to_mail(from = 'Feed2Imap') - message = RMail::Message::new - message.header['From'] = "#{from} " - message.header['To'] = "#{from} " - if @date.nil? - message.header['Date'] = Time::new.rfc2822 - else - message.header['Date'] = @date.rfc2822 - end - message.header['X-Feed2Imap-Version'] = F2I_VERSION if defined?(F2I_VERSION) - message.header['X-CacheIndex'] = "-#{@cacheditem.index}-" - message.header['X-F2IStatus'] = "Updated" if @cacheditem.updated - # treat subject. Might need MIME encoding. - subj = @title or (@date and @date.to_s) or @link - if subj - if subj.needMIME - message.header['Subject'] = "=?utf-8?b?#{Base64::encode64(subj).gsub("\n",'')}?=" - else - message.header['Subject'] = subj - end - end - textpart = RMail::Message::new - textpart.header['Content-Type'] = 'text/plain; charset=iso-8859-1' - textpart.header['Content-Transfer-Encoding'] = '7bit' - textpart.body = to_text - htmlpart = RMail::Message::new - htmlpart.header['Content-Type'] = 'text/html; charset=utf-8' - htmlpart.header['Content-Transfer-Encoding'] = '7bit' - htmlpart.body = to_html - message.add_part(textpart) - message.add_part(htmlpart) - return message.to_s - end -end - -class AtomItem < Item - def initialize(item = nil, channel = nil) - @channel = channel - @title, @link, @content, @date, @creator, @subject, @category = nil - if item - # Title - if (e = item.elements['title']) && e.text - @title = e.text.toUTF8(@channel.encoding).rmWhiteSpace! - end - # Link - item.each_element('link') do |e| - if e.attribute('type').value == 'text/html' or - e.attribute('type').value == 'application/xhtml' or - e.attribute('type').value == 'application/xhtml+xml' - if (h = e.attribute('href')) && h.value - @link = h.value - end - end - end - # Content - if e = item.elements['content'] || item.elements['summary'] - if (e.attribute('mode') and e.attribute('mode').value == 'escaped') && - e.text - @content = e.text.toUTF8(@channel.encoding).rmWhiteSpace! - else - # go one step deeper in the recursion if possible - e = e.elements['div'] || e - @content = e.to_s.toUTF8(@channel.encoding).rmWhiteSpace! - end - end - # Date - if (e = item.elements['issued'] || e = item.elements['created']) && e.text - begin - @date = Time::xmlschema(e.text) - rescue - begin - @date = Time::rfc2822(e.text) - rescue - begin - @date = Time::parse(e.text) - rescue - @date = nil - end - end - end - end - # Creator - @creator = @channel.creator - if (e = item.elements['author/name']) && e.text - @creator = e.text.toUTF8(@channel.encoding).rmWhiteSpace! - end - end - end -end diff --git a/lib/feed2imap/feed2imap.rb b/lib/feed2imap/feed2imap.rb index e1ad3cd..250a0e7 100644 --- a/lib/feed2imap/feed2imap.rb +++ b/lib/feed2imap/feed2imap.rb @@ -18,14 +18,15 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA =end # Feed2Imap version -F2I_VERSION = '0.5' +F2I_VERSION = '0.5.1' require 'feed2imap/config' require 'feed2imap/cache' -require 'feed2imap/channel' require 'feed2imap/httpfetcher' require 'logger' require 'thread' +require 'feedparser' +require 'feed2imap/itemtomail' class Feed2Imap def Feed2Imap.version @@ -119,13 +120,13 @@ class Feed2Imap @config.feeds.each do |f| next if f.body.nil? # means 304 begin - channel = Channel::new(f.body) + feed = FeedParser::Feed::new(f.body) rescue Exception => e @logger.fatal("Error while parsing #{f.name}: #{e}") next end begin - newitems, updateditems = @cache.get_new_items(f.name, channel.items) + newitems, updateditems = @cache.get_new_items(f.name, feed.items) rescue @logger.fatal("Exception caught when selecting new items for #{f.name}: #{$!}") puts $!.backtrace @@ -134,8 +135,14 @@ class Feed2Imap @logger.info("#{f.name}: #{newitems.length} new items, #{updateditems.length} updated items.") if newitems.length > 0 or updateditems.length > 0 begin if !cacherebuild - updateditems.each { |i| f.imapaccount.updatemail(f.folder, i.to_mail(f.name), i.cacheditem.index) } - newitems.each { |i| f.imapaccount.putmail(f.folder, i.to_mail(f.name)) } + updateditems.each do |i| + email = item_to_mail(i, i.cacheditem.index, true, f.name) + f.imapaccount.updatemail(f.folder, email, i.cacheditem.index) + end + newitems.each do |i| + email = item_to_mail(i, i.cacheditem.index, false, f.name) + f.imapaccount.putmail(f.folder, email) + end end rescue @logger.fatal("Exception caught while uploading mail to #{f.folder}: #{$!}") diff --git a/lib/feed2imap/itemtomail.rb b/lib/feed2imap/itemtomail.rb new file mode 100644 index 0000000..9d1cbad --- /dev/null +++ b/lib/feed2imap/itemtomail.rb @@ -0,0 +1,77 @@ +=begin +Feed2Imap - RSS/Atom Aggregator uploading to an IMAP Server +Copyright (c) 2005 Lucas Nussbaum + +This file contains classes to parse a feed and store it as a Channel object. + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +=end + +require 'rexml/document' +require 'time' +require 'rmail' +require 'feedparser' +require 'feedparser/text-output' +require 'feedparser/html-output' +require 'base64' +require 'feed2imap/rubymail_patch' + +class String + def needMIME + utf8 = false + self.unpack('U*').each do |c| + if c > 127 + utf8 = true + break + end + end + utf8 + end +end + +def item_to_mail(item, index, updated, from = 'Feed2Imap') + message = RMail::Message::new + message.header['From'] = "#{from} " + message.header['To'] = "#{from} " + if @date.nil? + message.header['Date'] = Time::new.rfc2822 + else + message.header['Date'] = item.date.rfc2822 + end + message.header['X-Feed2Imap-Version'] = F2I_VERSION if defined?(F2I_VERSION) + message.header['X-CacheIndex'] = "-#{index}-" + message.header['X-F2IStatus'] = "Updated" if updated + # treat subject. Might need MIME encoding. + subj = item.title or (item.date and item.date.to_s) or item.link + if subj + if subj.needMIME + message.header['Subject'] = "=?utf-8?b?#{Base64::encode64(subj).gsub("\n",'')}?=" + else + message.header['Subject'] = subj + end + end + textpart = RMail::Message::new + textpart.header['Content-Type'] = 'text/plain; charset=utf-8' + textpart.header['Content-Transfer-Encoding'] = '7bit' + textpart.body = item.to_text + htmlpart = RMail::Message::new + htmlpart.header['Content-Type'] = 'text/html; charset=utf-8' + htmlpart.header['Content-Transfer-Encoding'] = '7bit' + htmlpart.body = '' + item.to_html + '' + message.add_part(textpart) + message.add_part(htmlpart) + return message.to_s +end + diff --git a/lib/feed2imap/rexml_patch.rb b/lib/feed2imap/rexml_patch.rb index bce0be3..f991090 100644 --- a/lib/feed2imap/rexml_patch.rb +++ b/lib/feed2imap/rexml_patch.rb @@ -17,7 +17,7 @@ along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA =end -require 'feed2imap/textconverters' +require 'feedparser' # Patch for REXML # Very ugly patch to make REXML error-proof. diff --git a/lib/feed2imap/textconverters.rb b/lib/feed2imap/textconverters.rb deleted file mode 100644 index 9145e5a..0000000 --- a/lib/feed2imap/textconverters.rb +++ /dev/null @@ -1,152 +0,0 @@ -=begin -Feed2Imap - RSS/Atom Aggregator uploading to an IMAP Server -Copyright (c) 2005 Lucas Nussbaum - -This program is free software; you can redistribute it and/or modify -it under the terms of the GNU General Public License as published by -the Free Software Foundation; either version 2 of the License, or -(at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with this program; if not, write to the Free Software -Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -=end - -# for URI::regexp -require 'uri' -require 'feed2imap/html2text-parser' - -# This class provides various converters -class String - # is this text HTML ? search for tags - def html? - return (self =~ /

/) || (self =~ /
/) || (self =~ //) || (self =~ /<\/a>/) || (self =~ //) - end - - # returns true if the text contains escaped HTML (with HTML entities) - def escaped_html? - return (self =~ /<img src=/) || (self =~ /<a href=/) || (self =~ /<br(\/| \/|)>/) - end - - # un-escape HTML in the text - def unescape_html - { - '<' => '<', - '>' => '>', - "'" => ''', - '"' => '"', - '&' => '&', - "\047" => ''' - }.each do |k, v| - gsub!(v, k) - end - self - end - - # convert text to HTML - def text2html - text = self.clone - return text if text.html? - if text.escaped_html? - return text.unescape_html - end - # paragraphs - text.gsub!(/\A\s*(.*)\Z/m, '

\1

') - text.gsub!(/\s*\n(\s*\n)+\s*/, "

\n

") - # uris - text.gsub!(/(#{URI::regexp(['http','ftp','https'])})/, - '\1') - text - end - - # Convert an HTML text to plain text - def html2text - if false - text = self.clone - # let's remove all CR - text.gsub!(/\n/, '') - # convert

and
- text.gsub!(/\s*<\/p>\s*/, '') - text.gsub!(/\s*]*)?>\s*/, "\n\n") - text.gsub!(/\s*\s*/, "\n") - # remove other tags - text.gsub!(/<[^>]*>/, '') - # remove leading and trailing whilespace - text.gsub!(/\A\s*/m, '') - text.gsub!(/\s*\Z/m, '') - text - else - text = self.clone - # parse HTML - p = HTML2TextParser::new(true) - p.feed(text) - p.close - text = p.savedata - # remove leading and trailing whilespace - text.gsub!(/\A\s*/m, '') - text.gsub!(/\s*\Z/m, '') - # remove whitespace around \n - text.gsub!(/ *\n/m, "\n") - text.gsub!(/\n */m, "\n") - # and duplicates \n - text.gsub!(/\n\n+/m, "\n\n") - text - end - end - - # Remove white space around the text - def rmWhiteSpace! - return self.gsub!(/\A\s*/m, '').gsub!(/\s*\Z/m,'') - end - - # Convert a text in inputenc to a text in ISO-8859-1 - def toISO_8859_1(inputenc) - if inputenc.downcase == 'utf-8' - begin - return self.unpack('U*').pack('C*') - rescue - return self - end - else - return self - end - end - - # Convert a text in inputenc to a text in UTF8 - # must take care of wrong input locales - def toUTF8(inputenc) - if inputenc.downcase != 'utf-8' - # it is said it is not UTF-8. Ensure it is REALLY not UTF-8 - begin - if self.unpack('U*').pack('U*') == self - return self - end - rescue - # do nothing - end - begin - return self.unpack('C*').pack('U*') - rescue - return self #failsafe solution. but a dirty one :-) - end - else - return self - end - end - - def needMIME - utf8 = false - self.unpack('U*').each do |c| - if c > 127 - utf8 = true - break - end - end - utf8 - end -end -- cgit v1.2.3-54-g00ecf