summaryrefslogtreecommitdiff
path: root/lib/feed2imap/html2text-parser.rb
diff options
context:
space:
mode:
Diffstat (limited to 'lib/feed2imap/html2text-parser.rb')
-rw-r--r--lib/feed2imap/html2text-parser.rb97
1 files changed, 97 insertions, 0 deletions
diff --git a/lib/feed2imap/html2text-parser.rb b/lib/feed2imap/html2text-parser.rb
new file mode 100644
index 0000000..a6bf400
--- /dev/null
+++ b/lib/feed2imap/html2text-parser.rb
@@ -0,0 +1,97 @@
+=begin
+Feed2Imap - RSS/Atom Aggregator uploading to an IMAP Server
+Copyright (c) 2005 Lucas Nussbaum <lucas@lucas-nussbaum.net>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+=end
+
+require 'feed2imap/sgml-parser.rb'
+
+# this class provides a simple SGML parser that removes HTML tags
+class HTML2TextParser < SGMLParser
+
+ attr_reader :savedata
+
+ def initialize(verbose = false)
+ @savedata = ''
+ @pre = false
+ @href = nil
+ @links = []
+ super(verbose)
+ end
+
+ def handle_data(data)
+ # let's remove all CR
+ data.gsub!(/\n/, '') if not @pre
+
+ @savedata << data
+ end
+
+ def unknown_starttag(tag, attrs)
+ case tag
+ when 'p'
+ @savedata << "\n\n"
+ when 'br'
+ @savedata << "\n"
+ when 'b'
+ @savedata << '*'
+ when 'u'
+ @savedata << '_'
+ when 'i'
+ @savedata << '/'
+ when 'pre'
+ @savedata << "\n\n"
+ @pre = true
+ when 'a'
+ # find href in args
+ @href = nil
+ attrs.each do |a|
+ if a[0] == 'href'
+ @href = a[1]
+ end
+ end
+ @links << @href.gsub(/^("|'|)(.*)("|')$/,'\2')
+ end
+ end
+
+ def close
+ super
+ if @links.length > 0
+ @savedata << "\n\n"
+ @links.each_index do |i|
+ @savedata << "[#{i+1}] #{@links[i]}\n"
+ end
+ end
+ end
+
+ def unknown_endtag(tag)
+ case tag
+ when 'b'
+ @savedata << '*'
+ when 'u'
+ @savedata << '_'
+ when 'i'
+ @savedata << '/'
+ when 'pre'
+ @savedata << "\n\n"
+ @pre = false
+ when 'a'
+ if @href
+ @savedata << "[#{@links.length}]"
+ @href = nil
+ end
+ end
+ end
+end