From 8d3b2a0a706ed76527b204e28f97a7fdea6db633 Mon Sep 17 00:00:00 2001 From: lnu Date: Wed, 20 Jul 2005 16:02:15 +0000 Subject: new html2text converter git-svn-id: svn+ssh://svn.gna.org/svn/feed2imap/trunk/feed2imap@36 f70e237a-67f3-0310-a06c-d2b8a7116972 --- lib/feed2imap/sgml-parser.rb | 333 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 333 insertions(+) create mode 100644 lib/feed2imap/sgml-parser.rb (limited to 'lib/feed2imap/sgml-parser.rb') diff --git a/lib/feed2imap/sgml-parser.rb b/lib/feed2imap/sgml-parser.rb new file mode 100644 index 0000000..c692f52 --- /dev/null +++ b/lib/feed2imap/sgml-parser.rb @@ -0,0 +1,333 @@ +# A parser for SGML, using the derived class as static DTD. +# from http://raa.ruby-lang.org/project/html-parser + +class SGMLParser + + # Regular expressions used for parsing: + Interesting = /[&<]/ + Incomplete = Regexp.compile('&([a-zA-Z][a-zA-Z0-9]*|#[0-9]*)?|' + + '<([a-zA-Z][^<>]*|/([a-zA-Z][^<>]*)?|' + + '![^<>]*)?') + + Entityref = /&([a-zA-Z][-.a-zA-Z0-9]*)[^-.a-zA-Z0-9]/ + Charref = /&#([0-9]+)[^0-9]/ + + Starttagopen = /<[>a-zA-Z]/ + Endtagopen = /<\/[<>a-zA-Z]/ + Endbracket = /[<>]/ + Special = /]*>/ + Commentopen = /