1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
|
# A parser for SGML, using the derived class as static DTD.
# from http://raa.ruby-lang.org/project/html-parser
#
# Usage: subclass it and define, per element name TAG,
#   start_TAG(attrs) / end_TAG  -- element is pushed on / popped off the stack
#   do_TAG(attrs)               -- element is handled without touching the stack
# Override the handle_* and unknown_* hooks as needed, push text in with #feed
# (any number of times) and finish with #close.
class SGMLParser
  # Regular expressions used for parsing:
  Interesting = /[&<]/
  Incomplete = Regexp.compile('&([a-zA-Z][a-zA-Z0-9]*|#[0-9]*)?|' +
                              '<([a-zA-Z][^<>]*|/([a-zA-Z][^<>]*)?|' +
                              '![^<>]*)?')
  Entityref = /&([a-zA-Z][-.a-zA-Z0-9]*)[^-.a-zA-Z0-9]/
  Charref = /&#([0-9]+)[^0-9]/
  Starttagopen = /<[>a-zA-Z]/
  Endtagopen = /<\/[<>a-zA-Z]/
  Endbracket = /[<>]/
  Special = /<![^<>]*>/
  Commentopen = /<!--/
  Commentclose = /--[ \t\n]*>/
  Tagfind = /[a-zA-Z][a-zA-Z0-9.-]*/
  Attrfind = Regexp.compile('[\s,]*([a-zA-Z_][a-zA-Z_0-9.-]*)' +
                            '(\s*=\s*' +
                            "('[^']*'" +
                            '|"[^"]*"' +
                            '|[-~a-zA-Z0-9,./:+*%?!()_#=]*))?')

  # Entity names that handle_entityref expands itself.
  Entitydefs =
    {'lt'=>'<', 'gt'=>'>', 'amp'=>'&', 'quot'=>'"', 'apos'=>'\''}

  # verbose:: when true, report_unbalanced prints diagnostics.
  def initialize(verbose=false)
    @verbose = verbose
    reset
  end

  # Discards buffered input and returns the parser to its initial state.
  def reset
    # feed appends to this buffer in place, so it must not be a (possibly
    # frozen) string literal.
    @rawdata = ''.dup
    @stack = []
    @lasttag = '???'
    @nomoretags = false
    @literal = false
  end

  # True when an element named +gi+ is currently on the open-element stack.
  def has_context(gi)
    @stack.include? gi
  end

  # From now on all remaining input is passed to handle_data verbatim.
  def setnomoretags
    @nomoretags = true
    @literal = true
  end

  # Enters literal mode: start tags, comments and <!...> declarations are
  # passed through as data until the next end tag is parsed.
  def setliteral(*_args)
    @literal = true
  end

  # Appends +data+ to the buffer and parses as much of it as is complete.
  def feed(data)
    @rawdata << data
    goahead(false)
  end

  # Parses everything still buffered, treating it as the end of the input.
  def close
    goahead(true)
  end

  # Main loop: scans the buffer and dispatches to the parse_* / handle_*
  # methods. Stops early at a construct that may still be incomplete; the
  # unparsed tail stays in the buffer for the next call.
  #
  # _end:: true when no more input will follow, in which case the unparsed
  #        tail is flushed to handle_data instead of being kept.
  def goahead(_end)
    rawdata = @rawdata
    i = 0
    n = rawdata.length
    while i < n
      if @nomoretags
        handle_data(rawdata[i..(n - 1)])
        i = n
        break
      end
      # Everything up to the next '&' or '<' is plain character data.
      j = rawdata.index(Interesting, i) || n
      handle_data(rawdata[i..(j - 1)]) if i < j
      i = j
      break if i == n
      if rawdata[i] == '<'
        if rawdata.index(Starttagopen, i) == i
          if @literal
            handle_data(rawdata[i, 1])
            i += 1
            next
          end
          k = parse_starttag(i)
          break unless k
          i = k
          next
        end
        if rawdata.index(Endtagopen, i) == i
          k = parse_endtag(i)
          break unless k
          i = k
          @literal = false
          next
        end
        if rawdata.index(Commentopen, i) == i
          if @literal
            handle_data(rawdata[i, 1])
            i += 1
            next
          end
          k = parse_comment(i) # length of the comment, not an end position
          break unless k
          i += k
          next
        end
        if rawdata.index(Special, i) == i
          if @literal
            handle_data(rawdata[i, 1])
            i += 1
            next
          end
          k = parse_special(i) # length of the declaration
          break unless k
          i += k
          next
        end
      elsif rawdata[i] == '&'
        if rawdata.index(Charref, i) == i
          ref = Regexp.last_match
          i += ref[0].length
          handle_charref(ref[1])
          # The pattern consumes one terminating character; give it back
          # unless it was the ';' that belongs to the reference.
          i -= 1 unless rawdata[i - 1] == ';'
          next
        end
        if rawdata.index(Entityref, i) == i
          ref = Regexp.last_match
          i += ref[0].length
          handle_entityref(ref[1])
          i -= 1 unless rawdata[i - 1] == ';'
          next
        end
      else
        raise RuntimeError, 'neither < nor & ??'
      end
      # We get here only if incomplete matches but
      # nothing else
      match = rawdata.index(Incomplete, i)
      unless match == i
        handle_data(rawdata[i, 1])
        i += 1
        next
      end
      j = match + Regexp.last_match(0).length
      break if j == n # Really incomplete
      handle_data(rawdata[i..(j - 1)])
      i = j
    end
    # end while
    if _end && i < n
      handle_data(@rawdata[i..(n - 1)])
      i = n
    end
    @rawdata = rawdata[i..-1]
  end

  # Parses the comment starting at index +i+ and passes its text to
  # handle_comment. Returns the comment's total length, or nil when the
  # closing "-->" has not arrived yet.
  # Raises RuntimeError if the buffer does not hold "<!--" at +i+.
  def parse_comment(i)
    rawdata = @rawdata
    if rawdata[i, 4] != '<!--'
      raise RuntimeError, 'unexpected call to handle_comment'
    end
    match = rawdata.index(Commentclose, i)
    return nil unless match
    matched_length = Regexp.last_match(0).length
    handle_comment(rawdata[i + 4..(match - 1)])
    match + matched_length - i
  end

  # Parses the start tag at index +i+, collects its attributes as
  # [name, value] pairs (names downcased, surrounding quotes removed, '' for
  # a bare attribute) and hands them to finish_starttag. Returns the index
  # just past the tag, or nil when no closing bracket is buffered yet.
  # Raises RuntimeError if no tag name can be found.
  def parse_starttag(i)
    rawdata = @rawdata
    j = rawdata.index(Endbracket, i + 1)
    return nil unless j
    attrs = []
    if rawdata[i + 1] == '>'
      # SGML shorthand: <> == <last open tag seen>
      k = j
      tag = @lasttag
    else
      unless rawdata.index(Tagfind, i + 1)
        raise RuntimeError, 'unexpected call to parse_starttag'
      end
      name = Regexp.last_match(0)
      k = i + 1 + name.length
      tag = name.downcase
      @lasttag = tag
    end
    while k < j
      # The attribute must begin exactly at k. An unanchored search would
      # skip over junk such as the "/" in "<br />" and pick up text that
      # lies behind the tag as if it were an attribute.
      break unless rawdata.index(Attrfind, k) == k
      found = Regexp.last_match
      attrname, rest, attrvalue = found[1], found[2], found[3]
      if !rest
        attrvalue = '' # was: = attrname
      elsif (attrvalue.start_with?("'") && attrvalue.end_with?("'")) ||
            (attrvalue.start_with?('"') && attrvalue.end_with?('"'))
        attrvalue = attrvalue[1..-2]
      end
      attrs << [attrname.downcase, attrvalue]
      k += found[0].length
    end
    # A tag may also be terminated by the '<' of the next one; only a '>'
    # is consumed.
    j += 1 if rawdata[j] == '>'
    finish_starttag(tag, attrs)
    j
  end

  # Parses the end tag at index +i+ and hands its downcased name to
  # finish_endtag. Returns the index just past the tag, or nil when no
  # closing bracket is buffered yet.
  def parse_endtag(i)
    rawdata = @rawdata
    j = rawdata.index(Endbracket, i + 1)
    return nil unless j
    tag = rawdata[i + 2..j - 1].strip.downcase
    j += 1 if rawdata[j] == '>'
    finish_endtag(tag)
    j
  end

  # Dispatches a start tag. Returns 1 when start_TAG exists (the tag is
  # pushed on the stack), 0 when only do_TAG exists, and -1 after falling
  # back to unknown_starttag.
  def finish_starttag(tag, attrs)
    method = 'start_' + tag
    if respond_to?(method)
      @stack << tag
      handle_starttag(tag, method, attrs)
      return 1
    end
    method = 'do_' + tag
    if respond_to?(method)
      handle_starttag(tag, method, attrs)
      return 0
    end
    unknown_starttag(tag, attrs)
    -1
  end

  # Dispatches an end tag. An empty +tag+ ("</>") closes the innermost open
  # element. Otherwise every element from the top of the stack down to the
  # innermost one named +tag+ is closed.
  def finish_endtag(tag)
    if tag == ''
      found = @stack.length - 1
      if found < 0
        unknown_endtag(tag)
        return
      end
    else
      unless @stack.include? tag
        if respond_to?('end_' + tag)
          # A known element that is not open.
          report_unbalanced(tag)
        else
          unknown_endtag(tag)
        end
        return
      end
      # rindex, not index: with nested same-name elements the end tag
      # belongs to the innermost one.
      found = @stack.rindex(tag)
    end
    while @stack.length > found
      tag = @stack[-1]
      method = 'end_' + tag
      if respond_to?(method)
        handle_endtag(tag, method)
      else
        unknown_endtag(tag)
      end
      @stack.pop
    end
  end

  # Parses the "<!...>" declaration at index +i+ and passes the text between
  # the brackets to handle_special. Returns the declaration's total length,
  # or nil when no closing bracket is buffered yet.
  def parse_special(i)
    rawdata = @rawdata
    match = rawdata.index(Endbracket, i + 1)
    return nil unless match
    matched_length = Regexp.last_match(0).length
    handle_special(rawdata[i + 1..(match - 1)])
    match - i + matched_length
  end

  # Calls the start_TAG / do_TAG method named +method+ with +attrs+.
  def handle_starttag(tag, method, attrs)
    send(method, attrs)
  end

  # Calls the end_TAG method named +method+.
  def handle_endtag(tag, method)
    send(method)
  end

  # Hook for an end tag whose element is known but not open. Prints the tag
  # and the current stack when the parser was created verbose.
  def report_unbalanced(tag)
    if @verbose
      print '*** Unbalanced </' + tag + '>', "\n"
      print '*** Stack:', @stack.inspect, "\n"
    end
  end

  # Handles a numeric character reference; +name+ is the decimal digit
  # string. Codes 0..255 are passed to handle_data as a one-character
  # string, anything else goes to unknown_charref.
  # Raises ArgumentError if +name+ is not a decimal number.
  def handle_charref(name)
    # Explicit base 10: a leading zero ("&#065;") must not select octal.
    n = Integer(name, 10)
    unless (0..255).cover?(n)
      unknown_charref(name)
      return
    end
    handle_data(n.chr)
  end

  # Handles a named entity reference: names listed in Entitydefs are
  # expanded and passed to handle_data, others go to unknown_entityref.
  def handle_entityref(name)
    table = Entitydefs
    unless table.include?(name)
      unknown_entityref(name)
      return
    end
    handle_data(table[name])
  end

  # Hooks, all no-ops here; meant to be overridden by the derived class.

  # Receives character data.
  def handle_data(data)
  end

  # Receives the text of a comment, without the "<!--" and "-->" delimiters.
  def handle_comment(data)
  end

  # Receives the text of a "<!...>" declaration, without the angle brackets.
  def handle_special(data)
  end

  # Receives a start tag for which neither start_TAG nor do_TAG exists.
  def unknown_starttag(tag, attrs)
  end

  # Receives an end tag for which no end_TAG exists.
  def unknown_endtag(tag)
  end

  # Receives the digit string of a character reference outside 0..255.
  def unknown_charref(ref)
  end

  # Receives the name of an entity reference not listed in Entitydefs.
  def unknown_entityref(ref)
  end
end
|