/
Parser.pm
271 lines (233 loc) · 7.14 KB
/
Parser.pm
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
# Vend::Parser - Interchange parser class
#
# Copyright (C) 2002-2007 Interchange Development Group
# Copyright (C) 1997-2002 Red Hat, Inc.
#
# Based on HTML::Parser
# Copyright 1996 Gisle Aas. All rights reserved.
=head1 NAME
Vend::Parser - Interchange parser class
=head1 DESCRIPTION
C<Vend::Parser> will tokenize an Interchange page when the $p->parse()
method is called. The document to parse can be supplied in arbitrary
chunks. Call $p->eof() at the end of the document to flush any remaining
text. The return value from parse() is a reference to the parser object.
=over 4
=item $self->start($tag, $attr, $attrseq, $origtext)
This method is called when a complete start tag has been recognized.
The first argument is the tag name (in lower case) and the second
argument is a reference to a hash that contains all attributes found
within the start tag. The attribute keys are converted to lower case.
Entities found in the attribute values are already expanded. The
third argument is a reference to an array with the lower case
attribute keys in the original order. The fourth argument is the
original text of the tag as it appeared in the Interchange page.
=item $self->end($tag)
This method is called when an end tag has been recognized. The
argument is the lower case tag name.
=item $self->text($text)
This method is called when plain text in the document is recognized.
The text is passed on unmodified and might contain multiple lines.
Note that for efficiency reasons entities in the text are B<not>
expanded.
=back
=head1 COPYRIGHT
Copyright 2002-2007 Interchange Development Group
Copyright 1997-2002 Red Hat, Inc.
Original HTML::Parser module copyright 1996 Gisle Aas.
This library is free software; you can redistribute it and/or
modify it under the same terms as Perl itself.
=head1 AUTHORS
Vend::Parser - Mike Heins <mike@perusion.com>
HTML::Parser - Gisle Aas <aas@sn.no>
=cut
package Vend::Parser;
# Strictures are enabled; the "uninitialized" and "numeric" warning
# categories are switched off for the rest of this file.
use strict;
no warnings qw(uninitialized numeric);
# Loaded with an empty import list, so nothing is imported into this
# package; the entity decoder is called by its fully qualified name.
use HTML::Entities ();
# Package-global version string of this module.
use vars qw($VERSION);
$VERSION = '2.13';
# Constructor. Returns a new parser object blessed into the invoking
# class; the object starts out with an empty input buffer ('_buf').
sub new {
    my ($class) = @_;
    my %state = ( '_buf' => '' );
    return bless \%state, $class;
}
# Signal end of input by handing parse() an undefined chunk, which makes
# it flush whatever is still buffered. Returns whatever parse() returns.
sub eof {
    my ($self) = @_;
    return $self->parse(undef);
}
# parse($chunk) - feed one chunk of page text to the tokenizer.
#
# The chunk is appended to the internal buffer ($self->{_buf}). Complete
# tokens are then removed from the front of the buffer one at a time and
# reported through the callback methods $self->text() and $self->start().
# Those callbacks, and $self->implicit(), are not defined in the code shown
# here; presumably a subclass supplies them.
#
# Passing an undefined chunk signals end of input: whatever is left in the
# buffer is reported through text() and the buffer is emptied.
#
# If a tag is cut off in the middle of an attribute value, the text consumed
# so far is put back into the buffer and the routine returns, so the tag can
# be re-parsed once the next chunk arrives.
#
# Returns the parser object on every return path. Note that this routine
# never calls an end() callback.
sub parse
{
my $self = shift;
# Work through a reference so the substitutions below edit the stored
# buffer in place.
my $buf = \ $self->{_buf};
unless (defined $_[0]) {
# signals EOF (assume rest is plain text)
$self->text($$buf) if length $$buf;
$$buf = '';
return $self;
}
$$buf .= $_[0];
# Accumulates the original text consumed for the tag currently being parsed.
my $eaten;
# Parse html text in $$buf. The strategy is to remove complete
# tokens from the beginning of $$buf until we can't decide whether
# it is a token or not, or the $$buf is empty.
while (1) { # the loop will end by returning when text is parsed
# If a preceding routine sent the response, stop
# (the scalar referenced by $self->{OUT}, the buffer and @Vend::Output
# are all cleared first).
if ($Vend::Sent) {
${$self->{OUT}} = $self->{_buf} = '';
@Vend::Output = ();
return $self;
}
# We try to pull off any plain text (anything before a '[')
if ($$buf =~ s/^([^[]+)// ) {
#my $eat = $1;
#::logDebug("plain eat='$eat'");
#$self->text($eat);
$self->text($1);
return $self unless length $$buf;
# Find the most common tags
# Fast path: a bracketed tag name followed by anything that contains no
# quote, '=', ']' or '>' before the closing ']'. Such a tag cannot carry
# attribute values, so start() gets an empty attribute hash and list.
# NOTE(review): the tag name is passed exactly as written here, whereas
# the general path below lower-cases it -- confirm start() copes with that.
} elsif ($$buf =~ s|^(\[([-a-z0-9A-Z_]+)[^"'=\]>]*\])||) {
#my $tag=$2; my $eat = $1;
#undef $self->{HTML};
#::logDebug("tag='$tag' eat='$eat'");
#$self->start($tag, {}, [], $eat);
undef $self->{HTML};
$self->start($2, {}, [], $1);
# Then, finally we look for a start tag
} elsif ($$buf =~ s|^\[||) {
# start tag
$eaten = '[';
$self->{HTML} = 0 if ! defined $self->{HTML};
#::logDebug("do [ tag");
# First find a tag name. It must immediately follow the
# opening '[', then start with a letter, and be followed by
# letters, numbers, hyphen, dot, or underscore.
if ($$buf =~ s|^(([a-zA-Z][-a-zA-Z0-9._]*)\s*)||) {
$eaten .= $1;
my ($tag);
# $nopush: third value returned by implicit(); when true the attribute is
# stored but not added to @attrseq.
# $element: the part after the first '.' in a "name.element" attribute.
my ($nopush, $element);
my %attr;
my @attrseq;
# $old: set once an attribute has neither an explicit nor an implicit
# value; from then on every remaining attribute is skipped (its text is
# still accumulated in $eaten).
my $old;
$tag = lc $2;
#::logDebug("tag='$tag' eat='$eaten'");
# Then we would like to find some attributes
# An attribute is either a name, or one of = ! < > optionally followed by
# '=' or '~' and then mandatory whitespace.
while ( $$buf =~ s|^(([_a-zA-Z][-a-zA-Z0-9._]*)\s*)|| or
$$buf =~ s|^(([=!<>][=~]?)\s+)|| )
{
$eaten .= $1;
# Canonical attribute name: lower case, hyphens turned into underscores.
my $attr = lc $2;
$attr =~ tr/-/_/;
#::logDebug("in parse, eaten=$eaten");
# Split "name.element": everything after the first dot becomes $element.
$attr =~ s/\.(.*)//
and $element = $1;
my $val;
# The attribute might take an optional value.
# First we check for an unquoted value
# (it may not begin with | " ' ` ] or whitespace, and runs up to the next
# ']', '>' or whitespace).
if ($$buf =~ s~(^=\s*([^\|\"\'\`\]\s][^\]>\s]*)\s*)~~) {
$eaten .= $1;
next unless defined $attr;
$val = $2;
# or quoted by " or '
# (/s lets the value span lines; the non-greedy match stops at the first
# matching closing quote).
} elsif ($$buf =~ s~(^=\s*(["\'])(.*?)\2\s*)~~s) {
$eaten .= $1;
next unless defined $attr;
$val = $3;
# Entities are decoded in place only when an "entities" attribute has
# already been stored for this tag.
HTML::Entities::decode($val) if $attr{entities};
} elsif ($$buf =~ s~(^=\s*([\`\|])(.*?)\2\s*)~~s) {
$eaten .= $1;
# or quoted by ` to send to [calc]
# ($val stays undefined when $Vend::Cfg->{AdminSub}{calc} is defined).
if ($2 eq '`') {
$val = Vend::Interpolate::tag_calc($3)
unless defined $Vend::Cfg->{AdminSub}{calc};
}
# or quoted by | to strip leading & trailing whitespace
elsif ($2 eq '|') {
$val = $3;
$val =~ s/^\s+//;
$val =~ s/\s+$//;
}
else {
die "parse error!";
}
# truncated just after the '=' or inside the attribute
} elsif ($$buf =~ m|^(=\s*)$|s or
$$buf =~ m|^(=\s*[\"\'].*)|s) {
# Put back everything consumed for this tag plus the unparsed remainder and
# return; the tag is parsed again from its '[' on the next call.
$$buf = "$eaten$1";
return $self;
} elsif (!$old) {
# assume attribute with implicit value, but if not,
# no value is set and the eaten value is grown
undef $nopush;
($attr,$val,$nopush) = $self->implicit($tag,$attr);
$old = 1 unless $val;
}
next if $old;
# NOTE(review): $attr is a plain string here, so $attr->{OLD} is a symbolic
# dereference, which "use strict" rejects at run time whenever this
# assignment executes (defined but false $attr). Possibly $attr{OLD} was
# intended -- confirm.
if(! $attr) {
$attr->{OLD} = $val if defined $attr;
next;
}
# "name.element" attributes build a nested structure under $attr{name}.
if(defined $element) {
#::logDebug("Found element: $element val=$val");
# Optionally interpolate a value that looks like it contains a tag.
$val = Vend::Interpolate::interpolate_html($val)
if $::Pragma->{interpolate_itl_references}
and $val =~ /\[\w[-\w]*\s+.*]/s;
# First occurrence of this name: an element containing a letter creates a
# hash, anything else creates an array indexed by the element.
if(! ref $attr{$attr}) {
if ($element =~ /[A-Za-z]/) {
$attr{$attr} = { $element => $val };
}
else {
$attr{$attr} = [ ];
$attr{$attr}->[$element] = $val;
}
push (@attrseq, $attr);
}
# Existing array: an element containing a non-digit appends, otherwise the
# element is used as the index.
elsif(ref($attr{$attr}) eq 'ARRAY') {
if($element =~ /\D/) {
push @{$attr{$attr}}, $val;
}
else {
$attr{$attr}->[$element] = $val;
}
}
elsif (ref($attr{$attr}) eq 'HASH') {
$attr{$attr}->{$element} = $val;
}
undef $element;
next;
}
# Ordinary scalar attribute.
$attr{$attr} = $val;
push(@attrseq, $attr) unless $nopush;
}
# At the end there should be a closing ']'
if ($$buf =~ s|^\]|| ) {
$self->start($tag, \%attr, \@attrseq, "$eaten]");
} elsif ($$buf =~ s|^/\s*\]||) {
# XML-style empty container tag like [this /]
# (a true fifth argument marks it; the text passed omits the '/').
$self->start($tag, \%attr, \@attrseq, "$eaten]", 1);
} elsif ($$buf =~ s|^([^\]\n]+\])||) {
# Anything else up to the first ']' on the same line: the attributes
# parsed so far are dropped and start() gets empty ones plus the full
# original text.
$eaten .= $1;
$self->start($tag, {}, [], $eaten);
} else {
#::logDebug("eaten $eaten");
# Not a conforming start tag, regard it as normal text
# (the rest stays in the buffer for the next loop iteration).
$self->text($eaten);
}
} else {
#::logDebug("eaten $eaten");
# No valid tag name after the '[': report the bare '[' as text.
$self->text($eaten);
}
} elsif (length $$buf) {
# NOTE(review): looks unreachable -- a non-empty buffer starts either with a
# non-'[' character (first branch) or with '[' (third branch).
::logDebug("remaining: $$buf");
die $$buf; # This should never happen
} else {
# The buffer is empty now
return $self;
}
# Stop after the current token when the SEND flag is set on the parser
# (what sets it is not visible here).
return $self if $self->{SEND};
}
# NOTE(review): apparently never reached, since the loop above only exits
# via return.
$self;
}
1;
__END__