From 8d3b2a0a706ed76527b204e28f97a7fdea6db633 Mon Sep 17 00:00:00 2001 From: lnu Date: Wed, 20 Jul 2005 16:02:15 +0000 Subject: new html2text converter git-svn-id: svn+ssh://svn.gna.org/svn/feed2imap/trunk/feed2imap@36 f70e237a-67f3-0310-a06c-d2b8a7116972 --- ChangeLog | 2 + lib/feed2imap/html2text-parser.rb | 97 +++++++++++ lib/feed2imap/sgml-parser.rb | 333 ++++++++++++++++++++++++++++++++++++++ lib/feed2imap/textconverters.rb | 45 ++++-- test/tc_converters_html2text.rb | 68 +++++++- 5 files changed, 527 insertions(+), 18 deletions(-) create mode 100644 lib/feed2imap/html2text-parser.rb create mode 100644 lib/feed2imap/sgml-parser.rb diff --git a/ChangeLog b/ChangeLog index 03930e5..54afb72 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,5 +1,7 @@ Feed2Imap 0.4 (date unknown) ============================ +* Switched to a real SGML parser for the text version. +* Much better output for the text version of emails. * New feed2imap-cleaner to remove old mails seen but not flagged * Feed2Imap version number wasn't displayed in the User-Agent * Better exception handling when parsing errors occur diff --git a/lib/feed2imap/html2text-parser.rb b/lib/feed2imap/html2text-parser.rb new file mode 100644 index 0000000..a6bf400 --- /dev/null +++ b/lib/feed2imap/html2text-parser.rb @@ -0,0 +1,97 @@ +=begin +Feed2Imap - RSS/Atom Aggregator uploading to an IMAP Server +Copyright (c) 2005 Lucas Nussbaum + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. 
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

require 'feed2imap/sgml-parser.rb'

# A simple SGML parser that removes HTML tags and accumulates a plain-text
# rendering of the document in +savedata+.
#
# Rendering rules implemented by this class:
# * <p> and <pre> start with a blank line; </pre> ends with one.
# * <br> becomes a newline.
# * <b>, <u> and <i> are wrapped in '*', '_' and '/' respectively.
# * Each <a href=...> is followed by a "[n]" marker, and the numbered list of
#   link targets is appended to the text when the parser is closed.
#
# NOTE(review): handle_data, unknown_starttag and unknown_endtag are presumably
# callbacks invoked by SGMLParser while feeding it a document - confirm against
# sgml-parser.rb.
class HTML2TextParser < SGMLParser

  # The text produced so far (String).
  attr_reader :savedata

  # verbose:: forwarded unchanged to SGMLParser#initialize.
  def initialize(verbose = false)
    @savedata = ''
    @pre = false   # true while inside a <pre> element
    @href = nil    # target of the <a> element currently open, if any
    @links = []    # every link target seen so far, in document order
    super(verbose)
  end

  # Appends character data to the output. Outside of <pre>, newlines (LF) are
  # dropped. The argument itself is never modified, so frozen or shared
  # strings are safe to pass in.
  def handle_data(data)
    text = @pre ? data : data.delete("\n")
    @savedata << text
  end

  # Emits the text markup for an opening tag. Tags not listed are ignored.
  #
  # tag::   lower-case tag name, e.g. 'p'.
  # attrs:: list of [name, value] pairs.
  def unknown_starttag(tag, attrs)
    case tag
    when 'p'
      @savedata << "\n\n"
    when 'br'
      @savedata << "\n"
    when 'b'
      @savedata << '*'
    when 'u'
      @savedata << '_'
    when 'i'
      @savedata << '/'
    when 'pre'
      @savedata << "\n\n"
      @pre = true
    when 'a'
      # If href is given several times, the last occurrence wins.
      @href = nil
      attrs.each do |pair|
        @href = pair[1] if pair[0] == 'href'
      end
      # An anchor without href (e.g. <a name="top">) has no target: record
      # nothing instead of calling gsub on nil. unknown_endtag then emits no
      # "[n]" marker for it either, because @href stays nil.
      # The substitution strips quotes that surround the attribute value.
      @links << @href.gsub(/^("|'|)(.*)("|')$/, '\2') if @href
    end
  end

  # Finishes parsing (via super) and appends the numbered list of link
  # targets, if any link was collected.
  def close
    super
    return if @links.empty?

    @savedata << "\n\n"
    @links.each_with_index do |link, index|
      @savedata << "[#{index + 1}] #{link}\n"
    end
  end

  # Emits the text markup for a closing tag. Tags not listed are ignored.
  def unknown_endtag(tag)
    case tag
    when 'b'
      @savedata << '*'
    when 'u'
      @savedata << '_'
    when 'i'
      @savedata << '/'
    when 'pre'
      @savedata << "\n\n"
      @pre = false
    when 'a'
      if @href
        # The link just closed is the last one recorded, so its reference
        # number equals the current size of the list.
        @savedata << "[#{@links.length}]"
        @href = nil
      end
    end
  end
end

# Next file in the patch: lib/feed2imap/sgml-parser.rb
# (new file mode 100644, index 0000000..c692f52, hunk @@ -0,0 +1,333 @@)
# A parser for SGML, using the derived class as static DTD.
+# from http://raa.ruby-lang.org/project/html-parser + +class SGMLParser + + # Regular expressions used for parsing: + Interesting = /[&<]/ + Incomplete = Regexp.compile('&([a-zA-Z][a-zA-Z0-9]*|#[0-9]*)?|' + + '<([a-zA-Z][^<>]*|/([a-zA-Z][^<>]*)?|' + + '![^<>]*)?') + + Entityref = /&([a-zA-Z][-.a-zA-Z0-9]*)[^-.a-zA-Z0-9]/ + Charref = /&#([0-9]+)[^0-9]/ + + Starttagopen = /<[>a-zA-Z]/ + Endtagopen = /<\/[<>a-zA-Z]/ + Endbracket = /[<>]/ + Special = /]*>/ + Commentopen = /