summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorlnu <lnu@f70e237a-67f3-0310-a06c-d2b8a7116972>2005-07-20 16:02:15 +0000
committerlnu <lnu@f70e237a-67f3-0310-a06c-d2b8a7116972>2005-07-20 16:02:15 +0000
commit8d3b2a0a706ed76527b204e28f97a7fdea6db633 (patch)
tree830f12b7e57fdee27f99848e69895a835ccb4b55
parentb497c57ea6afb1e2c93e61be527d0b98ac92e133 (diff)
downloadfeed2imap-8d3b2a0a706ed76527b204e28f97a7fdea6db633.tar.gz
feed2imap-8d3b2a0a706ed76527b204e28f97a7fdea6db633.tar.bz2
feed2imap-8d3b2a0a706ed76527b204e28f97a7fdea6db633.zip
new html2text converter
git-svn-id: svn+ssh://svn.gna.org/svn/feed2imap/trunk/feed2imap@36 f70e237a-67f3-0310-a06c-d2b8a7116972
-rw-r--r--ChangeLog2
-rw-r--r--lib/feed2imap/html2text-parser.rb97
-rw-r--r--lib/feed2imap/sgml-parser.rb333
-rw-r--r--lib/feed2imap/textconverters.rb45
-rwxr-xr-xtest/tc_converters_html2text.rb68
5 files changed, 527 insertions, 18 deletions
diff --git a/ChangeLog b/ChangeLog
index 03930e5..54afb72 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,7 @@
Feed2Imap 0.4 (date unknown)
============================
+* Switched to a real SGML parser for the text version.
+* Much better output for the text version of emails.
* New feed2imap-cleaner to remove old mails seen but not flagged
* Feed2Imap version number wasn't displayed in the User-Agent
* Better exception handling when parsing errors occur
diff --git a/lib/feed2imap/html2text-parser.rb b/lib/feed2imap/html2text-parser.rb
new file mode 100644
index 0000000..a6bf400
--- /dev/null
+++ b/lib/feed2imap/html2text-parser.rb
@@ -0,0 +1,97 @@
+=begin
+Feed2Imap - RSS/Atom Aggregator uploading to an IMAP Server
+Copyright (c) 2005 Lucas Nussbaum <lucas@lucas-nussbaum.net>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+=end
+
+require 'feed2imap/sgml-parser.rb'
+
+# Converts HTML to plain text on top of SGMLParser. Text content is
+# accumulated in +savedata+; a few tags are rendered with plain-text
+# markers (<b> => *, <u> => _, <i> => /), and every link is replaced by a
+# numbered reference whose target is listed when the parser is closed.
+class HTML2TextParser < SGMLParser
+
+  # plain-text output accumulated so far
+  attr_reader :savedata
+
+  # verbose:: passed through to SGMLParser#initialize
+  def initialize(verbose = false)
+    @savedata = ''
+    @pre = false  # true while inside a <pre> element
+    @href = nil   # target of the <a> element currently open, if any
+    @links = []   # link targets, in order of appearance
+    super(verbose)
+  end
+
+  # Appends text content to the output. Outside <pre>, line feeds are
+  # removed from the text.
+  def handle_data(data)
+    # String#delete returns a new string: the argument belongs to the
+    # caller (it can be a shared constant such as an Entitydefs value)
+    # and must not be modified in place.
+    data = data.delete("\n") unless @pre
+
+    @savedata << data
+  end
+
+  # Renders the opening tags that have a plain-text equivalent; all other
+  # tags are dropped.
+  def unknown_starttag(tag, attrs)
+    case tag
+    when 'p'
+      @savedata << "\n\n"
+    when 'br'
+      @savedata << "\n"
+    when 'b'
+      @savedata << '*'
+    when 'u'
+      @savedata << '_'
+    when 'i'
+      @savedata << '/'
+    when 'pre'
+      @savedata << "\n\n"
+      @pre = true
+    when 'a'
+      # find href in attrs (the last one wins if it is given several times)
+      @href = nil
+      attrs.each do |name, value|
+        @href = value if name == 'href'
+      end
+      # An anchor without href (e.g. <a name="...">) is not a link:
+      # record nothing instead of calling gsub on nil.
+      # The substitution drops quotes still surrounding the value.
+      @links << @href.gsub(/^("|'|)(.*)("|')$/,'\2') if @href
+    end
+  end
+
+  # Finishes parsing, then appends the numbered list of link targets
+  # (if any link was seen) to the output.
+  def close
+    super
+    if @links.length > 0
+      @savedata << "\n\n"
+      @links.each_index do |i|
+        @savedata << "[#{i+1}] #{@links[i]}\n"
+      end
+    end
+  end
+
+  # Renders the closing tags that have a plain-text equivalent. A closing
+  # </a> emits the reference number of the link that was just recorded.
+  def unknown_endtag(tag)
+    case tag
+    when 'b'
+      @savedata << '*'
+    when 'u'
+      @savedata << '_'
+    when 'i'
+      @savedata << '/'
+    when 'pre'
+      @savedata << "\n\n"
+      @pre = false
+    when 'a'
+      if @href
+        @savedata << "[#{@links.length}]"
+        @href = nil
+      end
+    end
+  end
+end
diff --git a/lib/feed2imap/sgml-parser.rb b/lib/feed2imap/sgml-parser.rb
new file mode 100644
index 0000000..c692f52
--- /dev/null
+++ b/lib/feed2imap/sgml-parser.rb
@@ -0,0 +1,333 @@
+# A parser for SGML, using the derived class as static DTD.
+# from http://raa.ruby-lang.org/project/html-parser
+
+# Event-driven SGML parser. Input is supplied with #feed and terminated
+# with #close. For a tag named +x+ the parser calls +start_x+/+end_x+ or
+# +do_x+ when the (derived) class defines them, and +unknown_starttag+ /
+# +unknown_endtag+ otherwise. Text, comments, declarations and references
+# are reported through the handle_* and unknown_* hooks, which do nothing
+# in this base class.
+class SGMLParser
+
+  # Regular expressions used for parsing:
+  Interesting = /[&<]/
+  Incomplete = Regexp.compile('&([a-zA-Z][a-zA-Z0-9]*|#[0-9]*)?|' +
+                              '<([a-zA-Z][^<>]*|/([a-zA-Z][^<>]*)?|' +
+                              '![^<>]*)?')
+
+  Entityref = /&([a-zA-Z][-.a-zA-Z0-9]*)[^-.a-zA-Z0-9]/
+  Charref = /&#([0-9]+)[^0-9]/
+
+  Starttagopen = /<[>a-zA-Z]/
+  Endtagopen = /<\/[<>a-zA-Z]/
+  Endbracket = /[<>]/
+  Special = /<![^<>]*>/
+  Commentopen = /<!--/
+  Commentclose = /--[ \t\n]*>/
+  Tagfind = /[a-zA-Z][a-zA-Z0-9.-]*/
+  Attrfind = Regexp.compile('[\s,]*([a-zA-Z_][a-zA-Z_0-9.-]*)' +
+                            '(\s*=\s*' +
+                            "('[^']*'" +
+                            '|"[^"]*"' +
+                            '|[-~a-zA-Z0-9,./:+*%?!()_#=]*))?')
+
+  # Entity names handled by handle_entityref, with their replacement text
+  Entitydefs =
+    {'lt'=>'<', 'gt'=>'>', 'amp'=>'&', 'quot'=>'"', 'apos'=>'\''}
+
+  # verbose:: when true, report_unbalanced prints its diagnostics
+  def initialize(verbose=false)
+    @verbose = verbose
+    reset
+  end
+
+  # Puts the parser back in its initial state, dropping unprocessed input
+  def reset
+    @rawdata = ''       # input received but not yet processed
+    @stack = []         # tags opened through a start_<tag> method
+    @lasttag = '???'    # name of the last start tag seen (for <>)
+    @nomoretags = false # when true, the rest of the input is plain data
+    @literal = false    # when true, start tags/comments/declarations are data
+  end
+
+  # True if the element +gi+ is currently open on the tag stack
+  def has_context(gi)
+    @stack.include? gi
+  end
+
+  # From now on, hand all remaining input to handle_data unparsed
+  def setnomoretags
+    @nomoretags = true
+    @literal = true
+  end
+
+  # Enter literal mode; goahead leaves it again after the next end tag.
+  # The arguments are ignored.
+  def setliteral(*args)
+    @literal = true
+  end
+
+  # Appends +data+ to the input buffer and processes as much of it as
+  # possible; an incomplete construct at the end is kept for later.
+  def feed(data)
+    @rawdata << data
+    goahead(false)
+  end
+
+  # Processes everything left in the input buffer
+  def close
+    goahead(true)
+  end
+
+  # Main loop: scans the buffer, sending text to handle_data and markup
+  # to the parse_* / handle_* methods. When +_end+ is false, parsing stops
+  # in front of a construct that is not complete yet, and the unprocessed
+  # tail is kept in @rawdata; when it is true, whatever is left over is
+  # passed to handle_data.
+  def goahead(_end)
+    rawdata = @rawdata
+    i = 0
+    n = rawdata.length
+    while i < n
+      if @nomoretags
+        handle_data(rawdata[i..(n-1)])
+        i = n
+        break
+      end
+      # everything up to the next '&' or '<' is plain data
+      j = rawdata.index(Interesting, i)
+      j = n unless j
+      if i < j
+        handle_data(rawdata[i..(j-1)])
+      end
+      i = j
+      break if (i == n)
+      if rawdata[i] == ?< #
+        if rawdata.index(Starttagopen, i) == i
+          if @literal
+            handle_data(rawdata[i, 1])
+            i += 1
+            next
+          end
+          k = parse_starttag(i)
+          break unless k
+          i = k
+          next
+        end
+        if rawdata.index(Endtagopen, i) == i
+          k = parse_endtag(i)
+          break unless k
+          i = k
+          @literal = false
+          next
+        end
+        if rawdata.index(Commentopen, i) == i
+          if @literal
+            handle_data(rawdata[i,1])
+            i += 1
+            next
+          end
+          # parse_comment returns a length, not an index
+          k = parse_comment(i)
+          break unless k
+          i += k
+          next
+        end
+        if rawdata.index(Special, i) == i
+          if @literal
+            handle_data(rawdata[i, 1])
+            i += 1
+            next
+          end
+          # parse_special returns a length, not an index
+          k = parse_special(i)
+          break unless k
+          i += k
+          next
+        end
+      elsif rawdata[i] == ?& #
+        if rawdata.index(Charref, i) == i
+          i += $&.length
+          handle_charref($1)
+          # the regexp also consumed the character after the reference:
+          # step back unless that character was the terminating ';'
+          i -= 1 unless rawdata[i-1] == ?;
+          next
+        end
+        if rawdata.index(Entityref, i) == i
+          i += $&.length
+          handle_entityref($1)
+          # same as above: give back a terminator that is not ';'
+          i -= 1 unless rawdata[i-1] == ?;
+          next
+        end
+      else
+        raise RuntimeError, 'neither < nor & ??'
+      end
+      # We get here only if incomplete matches but
+      # nothing else
+      match = rawdata.index(Incomplete, i)
+      unless match == i
+        handle_data(rawdata[i, 1])
+        i += 1
+        next
+      end
+      j = match + $&.length
+      break if j == n # Really incomplete
+      handle_data(rawdata[i..(j-1)])
+      i = j
+    end
+    # end while
+    if _end and i < n
+      handle_data(@rawdata[i..(n-1)])
+      i = n
+    end
+    @rawdata = rawdata[i..-1]
+  end
+
+  # Parses the comment starting at index +i+ and passes its text to
+  # handle_comment. Returns the number of characters consumed, or nil if
+  # the end of the comment is not in the buffer.
+  def parse_comment(i)
+    rawdata = @rawdata
+    if rawdata[i, 4] != '<!--'
+      raise RuntimeError, 'unexpected call to handle_comment'
+    end
+    match = rawdata.index(Commentclose, i)
+    return nil unless match
+    matched_length = $&.length
+    j = match
+    handle_comment(rawdata[i+4..(j-1)])
+    j = match + matched_length
+    return j-i
+  end
+
+  # Parses the start tag at index +i+, collects its attributes as
+  # [name, value] pairs (names lowercased, surrounding quotes removed from
+  # values) and dispatches it through finish_starttag. Returns the index
+  # following the tag, or nil if the tag is not complete.
+  def parse_starttag(i)
+    rawdata = @rawdata
+    j = rawdata.index(Endbracket, i + 1)
+    return nil unless j
+    attrs = []
+    if rawdata[i+1] == ?> #
+      # SGML shorthand: <> == <last open tag seen>
+      k = j
+      tag = @lasttag
+    else
+      match = rawdata.index(Tagfind, i + 1)
+      unless match
+        raise RuntimeError, 'unexpected call to parse_starttag'
+      end
+      k = i + 1 + ($&.length)
+      tag = $&.downcase
+      @lasttag = tag
+    end
+    while k < j
+      break unless rawdata.index(Attrfind, k)
+      matched_length = $&.length
+      attrname, rest, attrvalue = $1, $2, $3
+      if not rest
+        attrvalue = '' # was: = attrname
+      elsif (attrvalue[0, 1] == "'" && attrvalue[-1, 1] == "'") ||
+            (attrvalue[0, 1] == '"' && attrvalue[-1, 1] == '"')
+        # Compare one-character substrings: unlike str[n] == ?c, this
+        # gives the same answer on every Ruby version, so both quoting
+        # styles are always stripped.
+        attrvalue = attrvalue[1..-2]
+      end
+      attrs << [attrname.downcase, attrvalue]
+      k += matched_length
+    end
+    if rawdata[j] == ?> #
+      j += 1
+    end
+    finish_starttag(tag, attrs)
+    return j
+  end
+
+  # Parses the end tag at index +i+ and dispatches it through
+  # finish_endtag. Returns the index following the tag, or nil if the tag
+  # is not complete.
+  def parse_endtag(i)
+    rawdata = @rawdata
+    j = rawdata.index(Endbracket, i + 1)
+    return nil unless j
+    tag = (rawdata[i+2..j-1].strip).downcase
+    if rawdata[j] == ?> #
+      j += 1
+    end
+    finish_endtag(tag)
+    return j
+  end
+
+  # Dispatches a start tag. Returns 1 when start_<tag> handled it (the tag
+  # is then pushed on the stack), 0 when do_<tag> handled it, and -1 when
+  # it went to unknown_starttag.
+  def finish_starttag(tag, attrs)
+    method = 'start_' + tag
+    if self.respond_to?(method)
+      @stack << tag
+      handle_starttag(tag, method, attrs)
+      return 1
+    else
+      method = 'do_' + tag
+      if self.respond_to?(method)
+        handle_starttag(tag, method, attrs)
+        return 0
+      else
+        unknown_starttag(tag, attrs)
+        return -1
+      end
+    end
+  end
+
+  # Dispatches an end tag. An empty tag name (</>) closes the innermost
+  # open element. A tag that is on the stack closes every element opened
+  # after it as well. A tag that is not on the stack is reported to
+  # unknown_endtag unless an end_<tag> method exists.
+  def finish_endtag(tag)
+    if tag == ''
+      found = @stack.length - 1
+      if found < 0
+        unknown_endtag(tag)
+        return
+      end
+    else
+      unless @stack.include? tag
+        method = 'end_' + tag
+        unless self.respond_to?(method)
+          unknown_endtag(tag)
+        end
+        return
+      end
+      found = @stack.index(tag) #or @stack.length
+    end
+    while @stack.length > found
+      tag = @stack[-1]
+      method = 'end_' + tag
+      if respond_to?(method)
+        handle_endtag(tag, method)
+      else
+        unknown_endtag(tag)
+      end
+      @stack.pop
+    end
+  end
+
+  # Parses the <!...> declaration at index +i+ and passes its content to
+  # handle_special. Returns the number of characters consumed, or nil if
+  # the closing bracket is not in the buffer.
+  def parse_special(i)
+    rawdata = @rawdata
+    match = rawdata.index(Endbracket, i+1)
+    return nil unless match
+    matched_length = $&.length
+    handle_special(rawdata[i+1..(match-1)])
+    return match - i + matched_length
+  end
+
+  # Calls the start_<tag> / do_<tag> method named +method+ with +attrs+
+  def handle_starttag(tag, method, attrs)
+    self.send(method, attrs)
+  end
+
+  # Calls the end_<tag> method named +method+
+  def handle_endtag(tag, method)
+    self.send(method)
+  end
+
+  # In verbose mode, prints a diagnostic about an unbalanced end tag
+  # together with the current tag stack.
+  def report_unbalanced(tag)
+    if @verbose
+      print '*** Unbalanced </' + tag + '>', "\n"
+      # there is no +stack+ accessor: read the instance variable
+      print '*** Stack:', @stack.inspect, "\n"
+    end
+  end
+
+  # Handles the numeric character reference &#name;. Values from 0 to 255
+  # are passed to handle_data as a single character; anything else goes to
+  # unknown_charref.
+  def handle_charref(name)
+    # Character references are decimal. Without an explicit base, Integer
+    # reads a leading zero as an octal prefix ("065" => 53) and raises
+    # ArgumentError on "08" or "09".
+    n = Integer(name, 10)
+    if !(0 <= n && n <= 255)
+      unknown_charref(name)
+      return
+    end
+    handle_data(n.chr)
+  end
+
+  # Handles the entity reference &name;. Entities listed in Entitydefs are
+  # passed to handle_data as their replacement text; the others go to
+  # unknown_entityref.
+  def handle_entityref(name)
+    table = Entitydefs
+    if table.include?(name)
+      handle_data(table[name])
+    else
+      unknown_entityref(name)
+      return
+    end
+  end
+
+  # Hooks meant to be overridden in derived classes; they do nothing here.
+
+  # Called with each piece of text content
+  def handle_data(data)
+  end
+
+  # Called with the text of each comment
+  def handle_comment(data)
+  end
+
+  # Called with the content of each <!...> declaration
+  def handle_special(data)
+  end
+
+  # Called for start tags without start_<tag> or do_<tag> method
+  def unknown_starttag(tag, attrs)
+  end
+  # Called for end tags without end_<tag> method
+  def unknown_endtag(tag)
+  end
+  # Called for character references outside 0..255
+  def unknown_charref(ref)
+  end
+  # Called for entity references not listed in Entitydefs
+  def unknown_entityref(ref)
+  end
+
+end
diff --git a/lib/feed2imap/textconverters.rb b/lib/feed2imap/textconverters.rb
index be63173..ba3813a 100644
--- a/lib/feed2imap/textconverters.rb
+++ b/lib/feed2imap/textconverters.rb
@@ -18,6 +18,7 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
=end
require 'uri' # for URI::regexp
+require 'feed2imap/html2text-parser'
# This class provides various converters
class String
@@ -41,19 +42,37 @@ class String
# Convert an HTML text to plain text
def html2text
- text = self.clone
- # let's remove all CR
- text.gsub!(/\n/, '')
- # convert <p> and <br>
- text.gsub!(/\s*<\/p>\s*/, '')
- text.gsub!(/\s*<p(\s[^>]*)?>\s*/, "\n\n")
- text.gsub!(/\s*<br(\s*)\/?(\s*)>\s*/, "\n")
- # remove other tags
- text.gsub!(/<[^>]*>/, '')
- # remove leading and trailing whilespace
- text.gsub!(/\A\s*/m, '')
- text.gsub!(/\s*\Z/m, '')
- text
+ # parse the HTML; the parser accumulates the plain-text rendering and
+ # leaves the receiver untouched
+ parser = HTML2TextParser.new(true)
+ parser.feed(self)
+ parser.close
+ text = parser.savedata
+ # remove leading and trailing whitespace
+ text.gsub!(/\A\s*/m, '')
+ text.gsub!(/\s*\Z/m, '')
+ # remove spaces around \n
+ text.gsub!(/ *\n/m, "\n")
+ text.gsub!(/\n */m, "\n")
+ # collapse runs of \n so that at most one blank line remains
+ text.gsub!(/\n\n+/m, "\n\n")
+ text
end
# Remove white space around the text
diff --git a/test/tc_converters_html2text.rb b/test/tc_converters_html2text.rb
index 4210a32..8074672 100755
--- a/test/tc_converters_html2text.rb
+++ b/test/tc_converters_html2text.rb
@@ -6,21 +6,79 @@ require 'test/unit'
require 'feed2imap/textconverters'
class TextConvertersHTML2TextTest < Test::Unit::TestCase
- def test_t1
+ # <p> and <br>: the <br> becomes a line break, surrounding spaces are dropped
+ def test_basic1
inputtext = <<-EOF
-<p> Ceci est un test. <br> On verra <b>bien</b> ce que ça donne ...</p>
+<p> Ceci est un test. <br> On verra bien ce que ça donne ...</p>
EOF
outputtext = "Ceci est un test.
On verra bien ce que ça donne ..."
assert_equal(outputtext, inputtext.html2text)
end
- def test_t2
+ # two consecutive paragraphs are separated by exactly one blank line
+ def test_basic2
inputtext = <<-EOF
-<p class="coucou"> Ceci est un test. On verra <b>bien</b> ce que ça donne ...</p>
-<p class="coucou"> Ceci est un test. On verra <b>bien</b> ce que ça donne ...</p>
+<p class="coucou"> Ceci est un test. On verra bien ce que ça donne ...</p>
+<p class="coucou"> Ceci est un test. On verra bien ce que ça donne ...</p>
EOF
outputtext = "Ceci est un test. On verra bien ce que ça donne ...\n\nCeci est un test. On verra bien ce que ça donne ..."
assert_equal(outputtext, inputtext.html2text)
end
+
+ # line breaks inside a paragraph must not show up in the output
+ # NOTE(review): the expected text keeps a space between the words, so the
+ # original input lines presumably end with a space - confirm against the file
+ def test_multiline
+ inputtext = <<-EOF
+<p class="coucou"> Ceci
+
+
+est
+
+
+un
+
+test. On
+verra
+bien ce que ça
+donne
+...</p>
+ EOF
+ outputtext = "Ceci est un test. On verra bien ce que ça donne ..."
+ assert_equal(outputtext, inputtext.html2text)
+ end
+
+ # <b>, <u> and <i> are rendered with *, _ and / markers
+ def test_bui
+ inputtext = <<-EOF
+Ceci est un <b>test</b>. On <u>verra</u> <i>bien</i> ce
+ EOF
+ outputtext = "Ceci est un *test*. On _verra_ /bien/ ce"
+ assert_equal(outputtext, inputtext.html2text)
+ end
+
+ # accented characters; the expected string is written with octal byte escapes
+ # NOTE(review): the input presumably uses character/entity references or
+ # Latin-1 bytes in the original file - confirm, this view shows plain letters
+ def test_extchar
+ inputtext = <<-EOF
+test de caractères étendus : éàèç ah ah
+ EOF
+ outputtext = "test de caract\350res \351tendus : \351\340\350\347 ah ah"
+ assert_equal(outputtext, inputtext.html2text)
+ end
+
+ # the content of <pre> keeps its line breaks
+ def test_pre
+ inputtext = <<-EOF
+<p>le texte qui suit sera entre pre</p>
+<pre>a b c
+ aaa ddd eee
+ ddd ee dfsdf dfdf dfd f df
+</pre>
+ <br/><br/>
+<p>fin du pre !</p>
+ EOF
+ outputtext = "le texte qui suit sera entre pre\n\na b c\naaa ddd eee\nddd ee dfsdf dfdf dfd f df\n\nfin du pre !"
+ assert_equal(outputtext, inputtext.html2text)
+ end
+
+ # links become numbered references, with the targets listed at the end;
+ # covers both a quoted and an unquoted href value
+ def test_link
+ inputtext = <<-EOF
+<p>ceci est un <a href="http://slashdot.org" style="">lien</a>. Ceci est un <a href=http://linuxfr.org/>autre lien</a></p>
+ EOF
+ outputtext = "ceci est un lien[1]. Ceci est un autre lien[2]\n\n[1] http://slashdot.org\n[2] http://linuxfr.org/"
+ assert_equal(outputtext, inputtext.html2text)
+ end
end