1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
|
=begin
Feed2Imap - RSS/Atom Aggregator uploading to an IMAP Server
Copyright (c) 2005 Lucas Nussbaum <lucas@lucas-nussbaum.net>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
=end
require 'uri' # for URI::regexp
require 'feed2imap/html2text-parser'
# This class provides various converters: HTML detection, conversion between
# HTML and plain text, and character-set helpers. They are added directly to
# the core String class.
class String
  # Is this text HTML? Searches for a <p> tag or any spelling of <br>
  # (<br>, <br/>, <br />).
  #
  # Returns the match offset (truthy) when a tag is found, nil otherwise.
  def html?
    (self =~ /<p>/) || (self =~ /<br\s*\/?\s*>/)
  end

  # Returns a truthy value if the text contains escaped HTML, i.e. markup
  # written with HTML entities such as "&lt;a href=".
  #
  # Returns the match offset (truthy) when found, nil otherwise.
  def escaped_html?
    (self =~ /&lt;img src=/) ||
      (self =~ /&lt;a href=/) ||
      (self =~ /&lt;br(?:\/| \/|)&gt;/)
  end

  # Un-escape HTML entities in the text, in place.
  #
  # Handles &lt; &gt; &apos; &quot; &amp; and &#39;. All entities are decoded
  # in a single pass so that text produced by decoding "&amp;" is never
  # decoded a second time (e.g. "&amp;#39;" becomes "&#39;", not "'").
  #
  # Returns self (the receiver is modified).
  def unescape_html
    entities = {
      '&lt;'   => '<',
      '&gt;'   => '>',
      '&apos;' => "'",
      '&quot;' => '"',
      '&amp;'  => '&',
      '&#39;'  => "'"
    }
    gsub!(/&(?:lt|gt|apos|quot|amp|#39);/) { |entity| entities[entity] }
    self
  end

  # Convert text to HTML.
  #
  # * Text that already looks like HTML (see html?) is returned unchanged.
  # * Text that looks like escaped HTML (see escaped_html?) is un-escaped.
  # * Anything else is wrapped in <p>...</p>, blank lines become paragraph
  #   breaks, and http/ftp/https URIs are turned into links.
  #
  # Returns a new String; the receiver is left untouched.
  def text2html
    # dup (not clone) so that a frozen receiver can still be converted
    text = dup
    return text if text.html?
    return text.unescape_html if text.escaped_html?

    # paragraphs
    text.gsub!(/\A\s*(.*)\Z/m, '<p>\1</p>')
    text.gsub!(/\s*\n(\s*\n)+\s*/, "</p>\n<p>")
    # uris
    text.gsub!(/(#{URI.regexp(%w[http ftp https])})/, '<a href="\1">\1</a>')
    text
  end

  # Convert an HTML text to plain text.
  #
  # Feeds a copy of the string to HTML2TextParser (from
  # feed2imap/html2text-parser) and then tidies the whitespace of the
  # parser's savedata.
  #
  # Returns the resulting plain text.
  def html2text
    # parse HTML
    parser = HTML2TextParser.new(true)
    parser.feed(dup)
    parser.close
    text = parser.savedata
    # remove leading and trailing whitespace
    text.sub!(/\A\s+/, '')
    text.sub!(/\s+\z/, '')
    # remove spaces around \n
    text.gsub!(/ *\n/, "\n")
    text.gsub!(/\n */, "\n")
    # and collapse runs of \n into a single blank line
    text.gsub!(/\n\n+/, "\n\n")
    text
  end

  # Remove white space around the text, in place.
  #
  # Returns self, whether or not anything was removed.
  def rmWhiteSpace!
    # sub! returns nil when nothing matched, so do not chain the calls
    sub!(/\A\s+/, '')
    sub!(/\s+\z/, '')
    self
  end

  # Convert a text in inputenc to a text in ISO-8859-1.
  #
  # Only 'utf-8' input (case-insensitive) is converted: the string is decoded
  # as UTF-8 code points and each one is re-packed as a single byte. Any other
  # inputenc, or a string that cannot be decoded, is returned unchanged.
  #
  # NOTE(review): code points above 255 have no ISO-8859-1 form; pack('C*')
  # presumably keeps only their low byte -- confirm this is acceptable.
  def toISO_8859_1(inputenc)
    return self unless inputenc.downcase == 'utf-8'

    begin
      unpack('U*').pack('C*')
    rescue
      self
    end
  end

  # Convert a text in inputenc to a text in UTF8.
  # Must take care of wrong input locales: even when inputenc claims the text
  # is not UTF-8, the text is first checked for being valid UTF-8 already and
  # returned unchanged in that case.
  def toUTF8(inputenc)
    return self if inputenc.downcase == 'utf-8'

    # it is said it is not UTF-8. Ensure it is REALLY not UTF-8
    begin
      return self if unpack('U*').pack('U*') == self
    rescue
      # not decodable as UTF-8: fall through to the byte-wise conversion
    end

    begin
      # treat every byte as one code point and encode it as UTF-8
      unpack('C*').pack('U*')
    rescue
      self # failsafe solution. but a dirty one :-)
    end
  end

  # Returns true if the text, decoded as UTF-8, contains at least one
  # code point above 127 (i.e. it is not plain 7-bit ASCII).
  def needMIME
    unpack('U*').any? { |code| code > 127 }
  end
end
|