File Coverage

File:blib/lib/XML/Twig.pm
Coverage:91.8%

linestmtbrancondsubpodtimecode
1# $Id: /xmltwig/trunk/Twig_pm.slow 25 2006-10-20T09:00:05.351266Z mrodrigu $
2#
3# Copyright (c) 1999-2004 Michel Rodriguez
4# All rights reserved.
5#
6# This program is free software; you can redistribute it and/or
7# modify it under the same terms as Perl itself.
8#
9
10# This is created in the caller's space
11BEGIN
12
91
6
388
112
{ sub ::PCDATA { '#PCDATA' }
13
5
175
€€sub ::CDATA { '#CDATA' }
14}
15
16
17######################################################################
18package XML::Twig;
19######################################################################
20
21require 5.004;
22
91
91
91
899
413
855
use strict;
23
24
91
91
91
841
332
775
use vars qw($VERSION @ISA %valid_option);
25
91
91
91
907
308
945
use Carp;
26
27*isa = \&UNIVERSAL::isa;
28
29#start-extract twig_global
30
31# constants: element types
32
91
91
91
882
326
1109
use constant (PCDATA => '#PCDATA');
33
91
91
91
881
296
630
use constant (CDATA => '#CDATA');
34
91
91
91
879
315
576
use constant (PI => '#PI');
35
91
91
91
906
297
574
use constant (COMMENT => '#COMMENT');
36
91
91
91
885
341
771
use constant (ENT => '#ENT');
37
38# element classes
39
91
91
91
1008
315
625
use constant (ELT => '#ELT');
40
91
91
91
834
372
624
use constant (TEXT => '#TEXT');
41
42# element properties
43
91
91
91
879
369
619
use constant (ASIS => '#ASIS');
44
91
91
91
1497
463
716
use constant (EMPTY => '#EMPTY');
45
46#end-extract twig_global
47
48# used in parseurl to set the buffer size to the same size as in XML::Parser::Expat
49
91
91
91
912
305
656
use constant (BUFSIZE => 32768);
50
51
52# used to store the gi's
53my %gi2index; # gi => index
54my @index2gi; # list of gi's
55my $SPECIAL_GI; # first non-special gi;
56my %base_ent; # base entity character => replacement
57
58# flag, set to true if the weaken sub is available
59
91
91
91
861
300
593
use vars qw( $weakrefs);
60
61#start-extract twig_global
62my $REG_NAME = q{(?:(?:[^\W\d]|[:#])(?:[\w.-]*:)?[\w.-]*)}; # xml name (leading # allowed)
63my $REG_NAME_W = q{(?:(?:[^\W\d_]|[:#])(?:[\w.-]*:)?[\w.-]*|\*)}; # name or wildcard (* or '') (leading # allowed)
64my $REG_REGEXP = q{(?:/(?:[^\\/]|\\.)*/[eimsox]*)}; # regexp
65my $REG_REGEXP_EXP = q{(?:(?:[^\\/]|\\.)*)}; # content of a regexp
66my $REG_REGEXP_MOD = q{(?:[eimso]*)}; # regexp modifiers
67my $REG_MATCH = q{[!=]~}; # match (or not)
68my $REG_STRING = q{(?:"(?:[^\\"]|\\.)*"|'(?:[^\\']|\\.)*')}; # string (simple or double quoted)
69my $REG_NUMBER = q{(?:\d+(?:\.\d*)?|\.\d+)}; # number
70my $REG_VALUE = qq{(?:$REG_STRING|$REG_NUMBER)}; # value
71my $REG_OP = q{=|==|!=|>|<|>=|<=|eq|ne|lt|gt|le|ge}; # op
72my $REG_FUNCTION = q{(?:string|text)\(\s*\)};
73my $REG_STRING_ARG = qq{(?:string|text)\\(\\s*$REG_NAME_W\\s*\\)};
74
75
76# used in the handler trigger code
77my $REG_PREDICATE2 = qq{\\[((?:"[^"]*"|'[^']*'|$REG_STRING_ARG|$REG_FUNCTION|\@$REG_NAME_W|$REG_MATCH\\s*$REG_REGEXP|[\\s\\d><=!()+.-]|(?i:and)|(?i:or))*)\\]};
78my $REG_NAKED_PREDICATE= qq{((?:"[^"]*"|'[^']*'|$REG_STRING_ARG|$REG_FUNCTION|\@$REG_NAME_W|$REG_MATCH\\s*$REG_REGEXP|[\\s\\d><=!()+.-]|(?i:and)|(?i:or))*)};
79
80# not all axis, only supported ones (in get_xpath)
81my @supported_axis= ( 'ancestor', 'ancestor-or-self', 'child', 'descendant', 'descendant-or-self',
82€€€€€€€€€€€€€€€€€€€€€€'following', 'following-sibling', 'parent', 'preceding', 'preceding-sibling', 'self'
83€€€€€€€€€€€€€€€€€€€€);
84my $REG_AXIS = "(?:" . join( '|', @supported_axis) .")";
85
86# only used in the "xpath"engine (for get_xpath/findnodes) for now
87my $REG_PREDICATE = qr{\[(?:(?:string\(\s*\)|\@$REG_NAME)\s*$REG_MATCH\s*$REG_REGEXP\s*|[^\]]*)\]};
88
89my %PERL_ALPHA_TEST= ( '=' => ' eq ', '!=' => ' ne ', '>' => ' gt ', '>=' => ' ge ', '<' => ' lt ', '<=' => ' le ');
90
91#end-extract twig_global
92
93my $parser_version;
94my( $FB_HTMLCREF, $FB_XMLCREF);
95
96BEGIN
97{
98
91
785
$VERSION = '3.27';
99
100
91
91
91
1129
387
1284
use XML::Parser;
101
91
413
my $needVersion = '2.23';
102
91
428
$parser_version= $XML::Parser::VERSION;
103
91
1026
croak "need at least XML::Parser version $needVersion" unless $parser_version >= $needVersion;
104
105
91
839
if( $] >= 5.008)
106
91
91
91
91
416
1267
400
952
€€{ eval "use Encode qw( :all)";
107
91
1608
€€€€$FB_XMLCREF = 0x0400; # Encode::FB_XMLCREF;
108
91
920
€€€€$FB_HTMLCREF = 0x0200; # Encode::FB_HTMLCREF;
109€€}
110
111# test whether we can use weak references
112# set local empty signal handler to trap error messages
113
91
91
376
616
{ local $SIG{__DIE__};
114
91
2952
€€if( eval( 'require Scalar::Util') && defined( &Scalar::Util::weaken) )
115
91
91
1220
626
€€€€{ import Scalar::Util( 'weaken'); $weakrefs= 1; }
116€€elsif( eval( 'require WeakRef'))
117
0
0
0
0
€€€€{ import WeakRef; $weakrefs= 1; }
118€€else
119
0
0
€€€€{ $weakrefs= 0; }
120}
121
122
91
1028
import XML::Twig::Elt;
123
91
659
import XML::Twig::Entity;
124
91
624
import XML::Twig::Entity_list;
125
126# used to store the gi's
127# should be set for each twig really, at least when there are several
128# the init ensures that special gi's are always the same
129
130# gi => index
131# do NOT use => or the constants become quoted!
132
91
877
%XML::Twig::gi2index=( PCDATA, 0, CDATA, 1, PI, 2, COMMENT, 3, ENT, 4);
133# list of gi's
134
91
672
@XML::Twig::index2gi=( PCDATA, CDATA, PI, COMMENT, ENT);
135
136# gi's under this value are special
137
91
611
$XML::Twig::SPECIAL_GI= @XML::Twig::index2gi;
138
139
91
899
%XML::Twig::base_ent= ( '>' => '&gt;', '<' => '&lt;', '&' => '&amp;', "'" => '&apos;', '"' => '&quot;',);
140
141# now set some aliases
142
91
582
*find_nodes = *get_xpath; # same as XML::XPath
143
91
392
*findnodes = *get_xpath; # same as XML::LibXML
144
91
390
*getElementsByTagName = *descendants;
145
91
378
*descendants_or_self = *descendants; # valid in XML::Twig, not in XML::Twig::Elt
146
91
374
*find_by_tag_name = *descendants;
147
91
373
*getElementById = *elt_id;
148
91
374
*getEltById = *elt_id;
149
91
535
*toString = *sprint;
150}
151
152@ISA = qw(XML::Parser);
153
154# fake gi's used in twig_handlers and start_tag_handlers
155my $ALL = '_all_'; # the associated function is always called
156my $DEFAULT= '_default_'; # the function is called if no other handler has been
157
158# some defaults
159my $COMMENTS_DEFAULT= 'keep';
160my $PI_DEFAULT = 'keep';
161
162
163# handlers used in regular mode
164my %twig_handlers=( Start => \&_twig_start,
165€€€€€€€€€€€€€€€€€€€€End => \&_twig_end,
166€€€€€€€€€€€€€€€€€€€€Char => \&_twig_char,
167€€€€€€€€€€€€€€€€€€€€Entity => \&_twig_entity,
168€€€€€€€€€€€€€€€€€€€€XMLDecl => \&_twig_xmldecl,
169€€€€€€€€€€€€€€€€€€€€Doctype => \&_twig_doctype,
170€€€€€€€€€€€€€€€€€€€€Element => \&_twig_element,
171€€€€€€€€€€€€€€€€€€€€Attlist => \&_twig_attlist,
172€€€€€€€€€€€€€€€€€€€€CdataStart => \&_twig_cdatastart,
173€€€€€€€€€€€€€€€€€€€€CdataEnd => \&_twig_cdataend,
174€€€€€€€€€€€€€€€€€€€€Proc => \&_twig_pi,
175€€€€€€€€€€€€€€€€€€€€Comment => \&_twig_comment,
176€€€€€€€€€€€€€€€€€€€€Default => \&_twig_default,
177€€€€€€);
178
179# handlers used when twig_roots is used and we are outside of the roots
180my %twig_handlers_roots=
181€€( Start => \&_twig_start_check_roots,
182€€€€End => \&_twig_end_check_roots,
183€€€€Doctype => \&_twig_doctype,
184€€€€Char => undef, Entity => undef, XMLDecl => \&_twig_xmldecl,
185€€€€Element => undef, Attlist => undef, CdataStart => undef,
186€€€€CdataEnd => undef, Proc => undef, Comment => undef,
187€€€€Proc => \&_twig_pi_check_roots,
188€€€€Default => sub {}, # hack needed for XML::Parser 2.27
189€€);
190
191# handlers used when twig_roots and print_outside_roots are used and we are
192# outside of the roots
193my %twig_handlers_roots_print_2_30=
194€€( Start => \&_twig_start_check_roots,
195€€€€End => \&_twig_end_check_roots,
196€€€€Char => \&_twig_print,
197€€€€Entity => \&_twig_print_entity,
198€€€€ExternEnt => \&_twig_print_entity,
199€€€€DoctypeFin => \&_twig_doctype_fin_print,
200€€€€XMLDecl => \&_twig_print,
201€€€€Doctype => \&_twig_print_doctype, # because recognized_string is broken here
202€€€€# Element => \&_twig_print, Attlist => \&_twig_print,
203€€€€CdataStart => \&_twig_print, CdataEnd => \&_twig_print,
204€€€€Proc => \&_twig_pi_check_roots, Comment => \&_twig_print,
205€€€€Default => \&_twig_print_check_doctype,
206€€);
207
208# handlers used when twig_roots, print_outside_roots and keep_encoding are used
209# and we are outside of the roots
210my %twig_handlers_roots_print_original_2_30=
211€€( Start => \&_twig_start_check_roots,
212€€€€End => \&_twig_end_check_roots,
213€€€€Char => \&_twig_print_original,
214€€€€# I have no idea why I should not be using this handler!
215€€€€Entity => \&_twig_print_entity,
216€€€€ExternEnt => \&_twig_print_entity,
217€€€€DoctypeFin => \&_twig_doctype_fin_print,
218€€€€XMLDecl => \&_twig_print_original,
219€€€€Doctype => \&_twig_print_original_doctype, # because original_string is broken here
220€€€€Element => \&_twig_print_original, Attlist => \&_twig_print_original,
221€€€€CdataStart => \&_twig_print_original, CdataEnd => \&_twig_print_original,
222€€€€Proc => \&_twig_pi_check_roots, Comment => \&_twig_print_original,
223€€€€Default => \&_twig_print_original_check_doctype,
224€€);
225
226# handlers used when twig_roots and print_outside_roots are used and we are
227# outside of the roots
228my %twig_handlers_roots_print_2_27=
229€€( Start => \&_twig_start_check_roots,
230€€€€End => \&_twig_end_check_roots,
231€€€€Char => \&_twig_print,
232€€€€# I have no idea why I should not be using this handler!
233€€€€#Entity => \&_twig_print,
234€€€€XMLDecl => \&_twig_print, Doctype => \&_twig_print,
235€€€€CdataStart => \&_twig_print, CdataEnd => \&_twig_print,
236€€€€Proc => \&_twig_pi_check_roots, Comment => \&_twig_print,
237€€€€Default => \&_twig_print,
238€€);
239
240# handlers used when twig_roots, print_outside_roots and keep_encoding are used
241# and we are outside of the roots
242my %twig_handlers_roots_print_original_2_27=
243€€( Start => \&_twig_start_check_roots,
244€€€€End => \&_twig_end_check_roots,
245€€€€Char => \&_twig_print_original,
246€€€€# for some reason original_string is wrong here
247€€€€# this can be a problem if the doctype includes non ascii characters
248€€€€XMLDecl => \&_twig_print, Doctype => \&_twig_print,
249€€€€# I have no idea why I should not be using this handler!
250€€€€Entity => \&_twig_print,
251€€€€#Element => undef, Attlist => undef,
252€€€€CdataStart => \&_twig_print_original, CdataEnd => \&_twig_print_original,
253€€€€Proc => \&_twig_pi_check_roots, Comment => \&_twig_print_original,
254€€€€Default => \&_twig_print, # _twig_print_original does not work
255€€);
256
257
258my %twig_handlers_roots_print= $parser_version > 2.27
259€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€? %twig_handlers_roots_print_2_30
260€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€: %twig_handlers_roots_print_2_27;
261my %twig_handlers_roots_print_original= $parser_version > 2.27
262€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€? %twig_handlers_roots_print_original_2_30
263€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€: %twig_handlers_roots_print_original_2_27;
264
265
266# handlers used when the finish_print method has been called
267my %twig_handlers_finish_print=
268€€( Start => \&_twig_print,
269€€€€End => \&_twig_print, Char => \&_twig_print,
270€€€€Entity => \&_twig_print, XMLDecl => \&_twig_print,
271€€€€Doctype => \&_twig_print, Element => \&_twig_print,
272€€€€Attlist => \&_twig_print, CdataStart => \&_twig_print,
273€€€€CdataEnd => \&_twig_print, Proc => \&_twig_print,
274€€€€Comment => \&_twig_print, Default => \&_twig_print,
275€€);
276
277# handlers used when the finish_print method has been called and the keep_encoding
278# option is used
279my %twig_handlers_finish_print_original=
280€€( Start => \&_twig_print_original, End => \&_twig_print_end_original,
281€€€€Char => \&_twig_print_original, Entity => \&_twig_print_original,
282€€€€XMLDecl => \&_twig_print_original, Doctype => \&_twig_print_original,
283€€€€Element => \&_twig_print_original, Attlist => \&_twig_print_original,
284€€€€CdataStart => \&_twig_print_original, CdataEnd => \&_twig_print_original,
285€€€€Proc => \&_twig_print_original, Comment => \&_twig_print_original,
286€€€€Default => \&_twig_print_original,
287€€);
288
289# handlers used whithin ignored elements
290my %twig_handlers_ignore=
291€€( Start => \&_twig_ignore_start,
292€€€€End => \&_twig_ignore_end,
293€€€€Char => undef, Entity => undef, XMLDecl => undef,
294€€€€Doctype => undef, Element => undef, Attlist => undef,
295€€€€CdataStart => undef, CdataEnd => undef, Proc => undef,
296€€€€Comment => undef, Default => undef,
297€€);
298
299
300# those handlers are only used if the entities are NOT to be expanded
301my %twig_noexpand_handlers= ( Default => \&_twig_default );
302
303my @saved_default_handler;
304
305my $ID= 'id'; # default value, set by the Id argument
306
307# all allowed options
308%valid_option=
309€€€€( # XML::Twig options
310€€€€€€TwigHandlers => 1, Id => 1,
311€€€€€€TwigRoots => 1, TwigPrintOutsideRoots => 1,
312€€€€€€StartTagHandlers => 1, EndTagHandlers => 1,
313€€€€€€ForceEndTagHandlersUsage => 1,
314€€€€€€DoNotChainHandlers => 1,
315€€€€€€IgnoreElts => 1,
316€€€€€€Index => 1,
317€€€€€€CharHandler => 1,
318€€€€€€TopDownHandlers => 1,
319€€€€€€KeepEncoding => 1, DoNotEscapeAmpInAtts => 1,
320€€€€€€ParseStartTag => 1, KeepAttsOrder => 1,
321€€€€€€LoadDTD => 1, DTDHandler => 1,
322€€€€€€DoNotOutputDTD => 1, NoProlog => 1,
323€€€€€€ExpandExternalEnts => 1,
324€€€€€€DiscardSpaces => 1, KeepSpaces => 1,
325€€€€€€DiscardSpacesIn => 1, KeepSpacesIn => 1,
326€€€€€€PrettyPrint => 1, EmptyTags => 1,
327€€€€€€Quote => 'double',
328€€€€€€Comments => 1, Pi => 1,
329€€€€€€OutputFilter => 1, InputFilter => 1,
330€€€€€€OutputTextFilter => 1,
331€€€€€€OutputEncoding => 1,
332€€€€€€RemoveCdata => 1,
333€€€€€€EltClass => 1,
334€€€€€€MapXmlns => 1, KeepOriginalPrefix => 1,
335€€€€€€# XML::Parser options
336€€€€€€ErrorContext => 1, ProtocolEncoding => 1,
337€€€€€€Namespaces => 1, NoExpand => 1,
338€€€€€€Stream_Delimiter => 1, ParseParamEnt => 1,
339€€€€€€NoLWP => 1, Non_Expat_Options => 1,
340€€€€€€Xmlns => 1,
341€€€€);
342
343# predefined input and output filters
344
91
91
91
927
317
913
use vars qw( %filter);
345%filter= ( html => \&html_encode,
346€€€€€€€€€€€safe => \&safe_encode,
347€€€€€€€€€€€safe_hex => \&safe_encode_hex,
348€€€€€€€€€);
349
350
351# trigger types (used to sort them)
352my ($XPATH_TRIGGER, $REGEXP_TRIGGER, $LEVEL_TRIGGER)=(1..3);
353
354sub new
355
2728
370809
€€{ my ($class, %args) = @_;
356
2728
9556
€€€€my $handlers;
357
358€€€€# change all nice_perlish_names into nicePerlishNames
359
2728
17204
€€€€%args= _normalize_args( %args);
360
361€€€€# check options
362
2728
23380
€€€€unless( $args{MoreOptions})
363
2727
18906
€€€€€€{ foreach my $arg (keys %args)
364
5015
42659
€€€€€€€€{ carp "invalid option $arg" unless $valid_option{$arg}; }
365€€€€€€}
366
367€€€€# a twig is really an XML::Parser
368€€€€# my $self= XML::Parser->new(%args);
369
2728
11713
€€€€my $self;
370
2728
25173
€€€€$self= XML::Parser->new(%args);
371
372
2728
526056
€€€€bless $self, $class;
373
374
2728
17153
€€€€$self->{_twig_context_stack}= [];
375
376
2728
17164
€€€€if( exists $args{TwigHandlers})
377
151
1014
€€€€€€{ $handlers= $args{TwigHandlers};
378
151
933
€€€€€€€€$self->setTwigHandlers( $handlers);
379
145
1395
€€€€€€€€delete $args{TwigHandlers};
380€€€€€€}
381
382€€€€# take care of twig-specific arguments
383
2722
14376
€€€€if( exists $args{StartTagHandlers})
384
23
165
€€€€€€{ $self->setStartTagHandlers( $args{StartTagHandlers});
385
23
220
€€€€€€€€delete $args{StartTagHandlers};
386€€€€€€}
387
388
2722
15084
€€€€if( exists $args{DoNotChainHandlers})
389
1
10
€€€€€€{ $self->{twig_do_not_chain_handlers}= $args{DoNotChainHandlers}; }
390
391
2722
14088
€€€€if( exists $args{IgnoreElts})
392
3
22
€€€€€€{ $self->setIgnoreEltsHandlers( $args{IgnoreElts});
393
3
29
€€€€€€€€delete $args{IgnoreElts};
394€€€€€€}
395
396
2722
13262
€€€€if( exists $args{Index})
397
2
9
€€€€€€{ my $index= $args{Index};
398€€€€€€€€# we really want a hash name => path, we turn an array into a hash if necessary
399
2
14
€€€€€€€€if( ref( $index) eq 'ARRAY')
400
1
2
5
13
€€€€€€€€€€{ my %index= map { $_ => $_ } @$index;
401
1
11
€€€€€€€€€€€€$index= \%index;
402€€€€€€€€€€}
403
2
19
€€€€€€€€while( my( $name, $exp)= each %$index)
404
3
4
4
4
47
14
29
28
€€€€€€€€€€{ $self->setTwigHandler( $exp, sub { push @{$_[0]->{_twig_index}->{$name}}, $_; 1; }); }
405€€€€€€}
406
407
2722
35381
€€€€$self->{twig_elt_class}= $args{EltClass} || 'XML::Twig::Elt';
408
2722
62
14615
605
€€€€if( exists( $args{EltClass})) { delete $args{EltClass}; }
409
410
2722
12792
€€€€if( exists( $args{MapXmlns}))
411
14
79
€€€€€€{ $self->{twig_map_xmlns}= $args{MapXmlns};
412
14
77
€€€€€€€€$self->{Namespaces}=1;
413
14
121
€€€€€€€€delete $args{MapXmlns};
414€€€€€€}
415
416
2722
13127
€€€€if( exists( $args{KeepOriginalPrefix}))
417
4
23
€€€€€€{ $self->{twig_keep_original_prefix}= $args{KeepOriginalPrefix};
418
4
35
€€€€€€€€delete $args{KeepOriginalPrefix};
419€€€€€€}
420
421
2722
13840
€€€€$self->{twig_dtd_handler}= $args{DTDHandler};
422
2722
9989
€€€€delete $args{DTDHandler};
423
424
2722
13442
€€€€if( $args{ExpandExternalEnts})
425
2
13
€€€€€€{ $self->set_expand_external_entities( 1);
426
2
9
€€€€€€€€$self->{twig_read_external_dtd}= 1; # implied by ExpandExternalEnts
427
2
8
€€€€€€€€delete $args{LoadDTD};
428
2
16
€€€€€€€€delete $args{ExpandExternalEnts};
429€€€€€€}
430
431
2722
14525
€€€€if( $args{DoNotEscapeAmpInAtts})
432
1
6
€€€€€€{ $self->set_do_not_escape_amp_in_atts( 1);
433
1
5
€€€€€€€€$self->{twig_do_not_escape_amp_in_atts}=1;
434€€€€€€}
435€€€€else
436
2721
16492
€€€€€€{ $self->set_do_not_escape_amp_in_atts( 0);
437
2721
15360
€€€€€€€€$self->{twig_do_not_escape_amp_in_atts}=0;
438€€€€€€}
439
440€€€€# deal with TwigRoots argument, a hash of elements for which
441€€€€# subtrees will be built (and associated handlers)
442
443
2722
14021
€€€€if( $args{TwigRoots})
444
58
396
€€€€€€{ $self->setTwigRoots( $args{TwigRoots});
445
56
493
€€€€€€€€delete $args{TwigRoots};
446€€€€€€}
447
448
2720
13185
€€€€if( $args{EndTagHandlers})
449
9
87
€€€€€€{ unless ($self->{twig_roots} || $args{ForceEndTagHandlersUsage})
450
1
9
€€€€€€€€€€{ croak "you should not use EndTagHandlers without TwigRoots\n",
451€€€€€€€€€€€€€€€€€€"if you want to use it anyway, normally because you have ",
452€€€€€€€€€€€€€€€€€€"a start_tag_handlers that calls 'ignore' and you want to ",
453€€€€€€€€€€€€€€€€€€"call an ent_tag_handlers at the end of the element, then ",
454€€€€€€€€€€€€€€€€€€"pass 'force_end_tag_handlers_usage => 1' as an argument ",
455€€€€€€€€€€€€€€€€€€"to new";
456€€€€€€€€€€}
457
458
8
60
€€€€€€€€$self->setEndTagHandlers( $args{EndTagHandlers});
459
8
90
€€€€€€€€delete $args{EndTagHandlers};
460€€€€€€}
461
462
2719
12928
€€€€if( $args{TwigPrintOutsideRoots})
463
31
197
€€€€€€{ croak "cannot use TwigPrintOutsideRoots without TwigRoots"
464€€€€€€€€€€unless( $self->{twig_roots});
465€€€€€€€€# if the arg is a filehandle then store it
466
30
169
€€€€€€€€if( _is_fh( $args{TwigPrintOutsideRoots}) )
467
28
288
€€€€€€€€€€{ $self->{twig_output_fh}= $args{TwigPrintOutsideRoots}; }
468
30
307
€€€€€€€€$self->{twig_default_print}= $args{TwigPrintOutsideRoots};
469€€€€€€}
470
471€€€€# space policy
472
2718
13386
€€€€if( $args{KeepSpaces})
473
17
98
€€€€€€{ croak "cannot use both keep_spaces and discard_spaces" if( $args{DiscardSpaces});
474
16
85
€€€€€€€€croak "cannot use both keep_spaces and keep_spaces_in" if( $args{KeepSpacesIn});
475
15
65
€€€€€€€€$self->{twig_keep_spaces}=1;
476
15
131
€€€€€€€€delete $args{KeepSpaces};
477€€€€€€}
478
2716
13067
€€€€if( $args{DiscardSpaces})
479
2
17
€€€€€€{ croak "cannot use both discard_spaces and keep_spaces_in" if( $args{KeepSpacesIn});
480
1
5
€€€€€€€€$self->{twig_discard_spaces}=1;
481
1
8
€€€€€€€€delete $args{DiscardSpaces};
482€€€€€€}
483
2715
12640
€€€€if( $args{KeepSpacesIn})
484
8
50
€€€€€€{ croak "cannot use both keep_spaces_in and discard_spaces_in" if( $args{DiscardSpacesIn});
485
7
33
€€€€€€€€$self->{twig_discard_spaces}=1;
486
7
36
€€€€€€€€$self->{twig_keep_spaces_in}={};
487
7
7
25
44
€€€€€€€€my @tags= @{$args{KeepSpacesIn}};
488
7
9
34
63
€€€€€€€€foreach my $tag (@tags) { $self->{twig_keep_spaces_in}->{$tag}=1; }
489
7
74
€€€€€€€€delete $args{KeepSpacesIn};
490€€€€€€}
491
2714
13027
€€€€if( $args{DiscardSpacesIn})
492
4
17
€€€€€€{ $self->{twig_keep_spaces}=1;
493
4
20
€€€€€€€€$self->{twig_discard_spaces_in}={};
494
4
4
14
23
€€€€€€€€my @tags= @{$args{DiscardSpacesIn}};
495
4
6
20
39
€€€€€€€€foreach my $tag (@tags) { $self->{twig_discard_spaces_in}->{$tag}=1; }
496
4
36
€€€€€€€€delete $args{DiscardSpacesIn};
497€€€€€€}
498€€€€# discard spaces by default
499
2714
31336
€€€€$self->{twig_discard_spaces}= 1 unless( $self->{twig_keep_spaces});
500
501
2714
19757
€€€€$args{Comments}||= $COMMENTS_DEFAULT;
502
2714
3
23478
16
€€€€if( $args{Comments} eq 'drop') { $self->{twig_keep_comments}= 0; }
503
1825
8727
€€€€elsif( $args{Comments} eq 'keep') { $self->{twig_keep_comments}= 1; }
504
885
4342
€€€€elsif( $args{Comments} eq 'process') { $self->{twig_process_comments}= 1; }
505
1
10
€€€€else { croak "wrong value for comments argument: '$args{Comments}' (should be 'drop', 'keep' or 'process')"; }
506
2713
11590
€€€€delete $args{Comments};
507
508
2713
18768
€€€€$args{Pi}||= $PI_DEFAULT;
509
2713
2
23627
10
€€€€if( $args{Pi} eq 'drop') { $self->{twig_keep_pi}= 0; }
510
1827
8977
€€€€elsif( $args{Pi} eq 'keep') { $self->{twig_keep_pi}= 1; }
511
883
4334
€€€€elsif( $args{Pi} eq 'process') { $self->{twig_process_pi}= 1; }
512
1
9
€€€€else { croak "wrong value for pi argument: '$args{Pi}' (should be 'drop', 'keep' or 'process')"; }
513
2712
10671
€€€€delete $args{Pi};
514
515
2712
17769
€€€€if( $args{KeepEncoding})
516€€€€€€{
517€€€€€€€€# set it in XML::Twig::Elt so print functions know what to do
518
967
5454
€€€€€€€€$self->set_keep_encoding( 1);
519
967
13300
€€€€€€€€$self->{parse_start_tag}= $args{ParseStartTag} || \&_parse_start_tag;
520
967
5980
€€€€€€€€delete $args{ParseStartTag} if defined( $args{ParseStartTag}) ;
521
967
4011
€€€€€€€€delete $args{KeepEncoding};
522€€€€€€}
523€€€€else
524
1745
11253
€€€€€€{ $self->set_keep_encoding( 0);
525
1745
9836
€€€€€€€€$self->{parse_start_tag}= $args{ParseStartTag} if( $args{ParseStartTag});
526€€€€€€}
527
528
2712
13753
€€€€if( $args{OutputFilter})
529
5
30
€€€€€€{ $self->set_output_filter( $args{OutputFilter});
530
5
21
€€€€€€€€delete $args{OutputFilter};
531€€€€€€}
532€€€€else
533
2707
15181
€€€€€€{ $self->set_output_filter( 0); }
534
535
2712
13488
€€€€if( $args{RemoveCdata})
536
1
6
€€€€€€{ $self->set_remove_cdata( $args{RemoveCdata});
537
1
5
€€€€€€€€delete $args{RemoveCdata};
538€€€€€€}
539€€€€else
540
2711
14432
€€€€€€{ $self->set_remove_cdata( 0); }
541
542
2712
13512
€€€€if( $args{OutputTextFilter})
543
5
29
€€€€€€{ $self->set_output_text_filter( $args{OutputTextFilter});
544
5
24
€€€€€€€€delete $args{OutputTextFilter};
545€€€€€€}
546€€€€else
547
2707
13176
€€€€€€{ $self->set_output_text_filter( 0); }
548
549
550
2712
13640
€€€€if( exists $args{KeepAttsOrder})
551
6
33
€€€€€€{ $self->{keep_atts_order}= $args{KeepAttsOrder};
552
6
30
€€€€€€€€if( _use( 'Tie::IxHash'))
553
5
33
€€€€€€€€€€{ $self->set_keep_atts_order( $self->{keep_atts_order}); }
554€€€€€€€€else
555
1
6
€€€€€€€€€€{ croak "Tie::IxHash not available, option keep_atts_order not allowed"; }
556€€€€€€}
557€€€€else
558
2706
14954
€€€€€€{ $self->set_keep_atts_order( 0); }
559
560
561
2711
39
13702
275
€€€€if( $args{PrettyPrint}) { $self->set_pretty_print( $args{PrettyPrint}); }
562
2711
1
12201
8
€€€€if( $args{Quote}) { $self->set_quote( $args{Quote}); }
563
2711
11
12636
70
€€€€if( $args{EmptyTags}) { $self->set_empty_tag_style( $args{EmptyTags}) }
564
565
2711
1
1
12623
6
9
€€€€if( exists $args{Id}) { $ID= $args{Id}; delete $args{ID}; }
566
2711
3
3
12383
15
26
€€€€if( $args{NoProlog}) { $self->{no_prolog}= 1; delete $args{NoProlog}; }
567
2711
2
2
12421
10
19
€€€€if( $args{DoNotOutputDTD}) { $self->{no_dtd_output}= 1; delete $args{DoNotOutputDTD}; }
568
2711
4
4
11934
25
35
€€€€if( $args{LoadDTD}) { $self->{twig_read_external_dtd}= 1; delete $args{LoadDTD}; }
569
2711
1
1
12005
7
10
€€€€if( $args{CharHandler}) { $self->setCharHandler( $args{CharHandler}); delete $args{CharHandler}; }
570
571
2711
3
1
12247
24
8
€€€€if( $args{InputFilter}) { $self->set_input_filter( $args{InputFilter}); delete $args{InputFilter}; }
572
2709
1
1
13147
8
84
€€€€if( $args{NoExpand}) { $self->setHandlers( %twig_noexpand_handlers); $self->{twig_no_expand}=1; }
573
2709
1
1
13513
6
9
€€€€if( my $output_encoding= $args{OutputEncoding}) { $self->set_output_encoding( $output_encoding); delete $args{OutputFilter}; }
574
575
2709
4
4
12997
17
33
€€€€if( my $tdh= $args{TopDownHandlers}) { $self->{twig_tdh}=1; delete $args{TopDownHandlers}; }
576
577€€€€# set handlers
578
2709
14285
€€€€if( $self->{twig_roots})
579
56
304
€€€€€€{ if( $self->{twig_default_print})
580
30
173
€€€€€€€€€€{ if( $self->{twig_keep_encoding})
581
6
81
€€€€€€€€€€€€€€{ $self->setHandlers( %twig_handlers_roots_print_original); }
582€€€€€€€€€€€€else
583
24
289
€€€€€€€€€€€€€€{ $self->setHandlers( %twig_handlers_roots_print); }
584€€€€€€€€€€}
585€€€€€€€€else
586
26
309
€€€€€€€€€€{ $self->setHandlers( %twig_handlers_roots); }
587€€€€€€}
588€€€€else
589
2653
29562
€€€€€€{ $self->setHandlers( %twig_handlers); }
590
591€€€€# XML::Parser::Expat does not like these handler to be set. So in order to
592€€€€# use the various sets of handlers on XML::Parser or XML::Parser::Expat
593€€€€# objects when needed, these ones have to be set only once, here, at
594€€€€# XML::Parser level
595
2709
1681892
€€€€$self->setHandlers( Init => \&_twig_init, Final => \&_twig_final);
596
597
2709
340346
€€€€$self->{twig_entity_list}= XML::Twig::Entity_list->new;
598
599
2709
14041
€€€€$self->{twig_id}= $ID;
600
2709
12798
€€€€$self->{twig_stored_spaces}='';
601
602
2709
14207
€€€€$self->{twig_autoflush}= 1; # auto flush by default
603
604
2709
12227
€€€€$self->{twig}= $self;
605
2709
39467
€€€€weaken( $self->{twig}) if( $weakrefs);
606
607
2709
20669
€€€€return $self;
608€€}
609
610sub parse
611€€{
612€€€€# if called as a class method, calls nparse, which creates the twig then parses it
613
2714
3
25772
22
€€€€if( !ref( $_[0])) { return shift->nparse( @_); }
614
615€€€€# requires 5.006 at least (or the ${^UNICODE} causes a problem) # > 5.006
616€€€€# trap underlying bug in IO::Handle (see RT #17500) # > 5.006
617€€€€# croak if perl 5.8+, -CD (or PERL_UNICODE set to D) and parsing a pipe # > 5.006
618
2711
47175
€€€€if( $]>=5.008 && ${^UNICODE} && (${^UNICODE} & 24) && isa( $_[1], 'GLOB') && -p $_[1] ) # > 5.006
619
0
0
€€€€€€{ croak "cannot parse the output of a pipe when perl is set to use the UTF8 perlIO layer\n" # > 5.006
620€€€€€€€€€€€€€€. "set the environment variable PERL_UNICODE or use the -C option (see perldoc perlrun)\n" # > 5.006
621€€€€€€€€€€€€€€. "not to include 'D'"; # > 5.006
622€€€€€€} # > 5.006
623
2711
35670
€€€€shift->SUPER::parse( @_);
624€€}
625
626
3
20
sub parsefile_inplace { shift->_parse_inplace( parsefile => @_); }
627
3
26
sub parsefile_html_inplace { shift->_parse_inplace( parsefile_html => @_); }
628
629sub _parse_inplace
630
6
81
€€{ my( $t, $method, $file, $suffix)= @_;
631
6
31
€€€€_use( 'File::Temp') || die "need File::Temp to use inplace methods\n";
632
6
28
€€€€_use( 'File::Basename');
633
634
635
6
41
€€€€my $tmpdir= dirname( $file);
636
6
44
€€€€my( $tmpfh, $tmpfile)= File::Temp::tempfile( DIR => $tmpdir);
637
6
39
€€€€my $original_fh= select $tmpfh;
638
639
6
47
€€€€$t->$method( $file);
640
641
6
49
€€€€select $original_fh;
642
6
380
€€€€close $tmpfh;
643
6
88
€€€€my $mode= (stat( $file))[2] & 07777;
644
6
97
€€€€chmod $mode, $tmpfile or croak "cannot change temp file mode to $mode: $!";
645
646
6
42
€€€€if( $suffix)
647
4
15
€€€€€€{ my $backup;
648
4
2
31
21
€€€€€€€€if( $suffix=~ m{\*}) { ($backup = $suffix) =~ s/\*/$file/g; }
649
2
9
€€€€€€€€else { $backup= $file . $suffix; }
650
651
4
123
€€€€€€€€rename( $file, $backup) or die "cannot backup initial file ($file) to $backup: $!";
652€€€€€€}
653
6
205
€€€€rename( $tmpfile, $file) or die "cannot rename temp file ($tmpfile) to initial file ($file): $!";
654
655
6
27
€€€€return $t;
656€€}
657
658
659sub parseurl
660
11
55
€€{ my $t= shift;
661
11
87
€€€€$t->_parseurl( 0, @_);
662€€}
663
664sub safe_parseurl
665
9
59
€€{ my $t= shift;
666
9
80
€€€€$t->_parseurl( 1, @_);
667€€}
668
669
670sub parsefile_html
671
4
18
€€{ my $t= shift;
672
4
16
€€€€my $file= shift;
673
4
29
€€€€my $indent= $t->{ErrorContext} ? 1 : 0;
674
4
22
€€€€$t->parse( _html2xml( _slurp( $file), { indent => $indent }), @_);
675
4
208
€€€€return $t;
676€€}
677
678sub parse_html
679
21
99
€€{ my $t= shift;
680
21
197
€€€€my $content= shift;
681
21
140
€€€€my $indent= $t->{ErrorContext} ? 1 : 0;
682
21
1398
€€€€$t->parse( _html2xml( isa( $content, 'GLOB') ? _slurp_fh( $content) : $content, { indent => $indent }), @_);
683
16
849
€€€€return $t;
684€€}
685
686sub xparse
687
1827
7309
€€{ my $t= shift;
688
1827
6310
€€€€my $to_parse= $_[0];
689
1827
1
20836
7
€€€€if( isa( $to_parse, 'GLOB')) { $t->parse( @_); }
690
1816
15520
€€€€elsif( $to_parse=~ m{^\s*<}) { $to_parse=~ m{<html}i ? $t->_parse_as_xml_or_html( @_)
691€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€: $t->parse( @_);
692€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€}
693
2
9
€€€€elsif( $to_parse=~ m{^\w+://.*\.html?$}) { _use( 'LWP::Simple') or croak "missing LWP::Simple";
694
1
8
€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€$t->_parse_as_xml_or_html( LWP::Simple::get( shift()), @_);
695€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€}
696
2
11
€€€€elsif( $to_parse=~ m{^\w+://}) { _use( 'LWP::Simple') or croak "missing LWP::Simple";
697
1
7
€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€my $doc= LWP::Simple::get( shift);
698
1
17
€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€my $xml_parse_ok= $t->safe_parse( $doc, @_);
699
1
8
€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€if( $xml_parse_ok)
700
1
9
€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€{ return $xml_parse_ok; }
701€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€else
702
0
0
€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€{ my $diag= $@;
703
0
0
€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€if( $doc=~ m{<html}i)
704
0
0
€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€{ $t->parse_html( $doc, @_); }
705€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€else
706
0
0
€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€{ die $diag; }
707€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€}
708€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€}
709
2
12
€€€€elsif( $to_parse=~ m{\.html?$}) { my $content= _slurp( shift);
710
2
14
€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€$t->_parse_as_xml_or_html( $content, @_);
711€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€}
712
4
40
€€€€else { $t->parsefile( @_); }
713€€}
714
715sub _parse_as_xml_or_html
716
10
225
€€{ my $t= shift;
717
10
61
€€€€$t->safe_parse( @_) || $t->parse_html( @_);
718€€}
719
720
721sub nparse
722
1827
134279
€€{ my $class= shift;
723
1827
8087
€€€€my $to_parse= pop;
724
1827
12153
€€€€$class->new( @_)->xparse( $to_parse);
725€€}
726
727
1
19
sub nparse_pp { shift()->nparse( pretty_print => 'indented', @_); }
728
4
370
sub nparse_e { shift()->nparse( error_context => 1, @_); }
729
1
21
sub nparse_ppe { shift()->nparse( pretty_print => 'indented', error_context => 1, @_); }
730
731
732sub _html2xml
733
25
186
€€{ my( $html, $options)= @_;
734
25
130
€€€€_use( 'HTML::TreeBuilder', '3.13') or croak "cannot parse HTML: missing HTML::TreeBuilder v >= 3.13\n";
735
24
211
€€€€my $tree= HTML::TreeBuilder->new;
736
24
154
€€€€$tree->ignore_ignorable_whitespace( 0);
737
24
147
€€€€$tree->no_space_compacting( 1);
738
24
137
€€€€$tree->store_comments( 1);
739
24
534
€€€€$tree->store_pis(1);
740
24
180
€€€€$tree->parse( $html);
741
24
233
€€€€$tree->eof;
742
24
210
€€€€my $xml= _latin12utf8( $tree->as_XML);
743
24
197
€€€€$xml=~ s{(?<=.)<\?xml version="1.0" encoding="utf-8"\?+>}{}g;
744
745
24
4
162
23
€€€€if( $options->{indent}) { _indent_xhtml( \$xml); }
746
24
171
€€€€$tree->delete;
747
24
977
€€€€return $xml;
748€€}
749
750sub _latin12utf8
751
24
285
€€{ my $string= shift;
752
24
260
€€€€local $SIG{__DIE__};
753
24
134
€€€€if( _use( 'Encode'))
754
24
179
€€€€€€{ from_to( $string, 'iso-8859-15' => 'utf8');
755
24
224
€€€€€€€€return $string;
756€€€€€€}
757
0
0
€€€€if( _use( 'Text::Iconv'))
758
0
0
€€€€€€{ my $converter = eval { Text::Iconv->new("iso-8859-15" => "utf8") }
759
0
0
0
0
€€€€€€€€€€€€€€€€€€€€€€|| eval { Text::Iconv->new("8859" => "646") }; # for old Solaris
760
0
0
0
0
€€€€€€€€if( $converter) { return $converter->convert( $string); }
761€€€€€€}
762
0
0
€€€€if( _use( 'Unicode::Map8') && _use( 'Unicode::String'))
763
0
0
€€€€€€{ my $l1_map= Unicode::Map8->new("latin1");
764
0
0
€€€€€€€€return $l1_map->tou( $string)->utf8;
765€€€€€€}
766
0
0
€€€€return $string;
767€€}
768
769sub _indent_xhtml
770
4
18
€€{ my( $xhtml)= @_; # $xhtml is a ref
771
4
156
36
751
€€€€my %block_tag= map { $_ => 1 } qw( html
772€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€head
773€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€meta title link script base
774€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€body
775€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€h1 h2 h3 h4 h5 h6
776€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€p br address blockquote pre
777