| File: | blib/lib/XML/Twig.pm |
| Coverage: | 91.8% |
| line | stmt | bran | cond | sub | pod | time | code |
|---|---|---|---|---|---|---|---|
| 1 | # $Id: /xmltwig/trunk/Twig_pm.slow 25 2006-10-20T09:00:05.351266Z mrodrigu $ | ||||||
| 2 | # | ||||||
| 3 | # Copyright (c) 1999-2004 Michel Rodriguez | ||||||
| 4 | # All rights reserved. | ||||||
| 5 | # | ||||||
| 6 | # This program is free software; you can redistribute it and/or | ||||||
| 7 | # modify it under the same terms as Perl itself. | ||||||
| 8 | # | ||||||
| 9 | |||||||
| 10 | # This is created in the caller's space | ||||||
| 11 | BEGIN | ||||||
| 12 | 91 6 | 388 112 | { sub ::PCDATA { '#PCDATA' } | ||||
| 13 | 5 | 175 | €€sub ::CDATA { '#CDATA' } | ||||
| 14 | } | ||||||
| 15 | |||||||
| 16 | |||||||
| 17 | ###################################################################### | ||||||
| 18 | package XML::Twig; | ||||||
| 19 | ###################################################################### | ||||||
| 20 | |||||||
| 21 | require 5.004; | ||||||
| 22 | 91 91 91 | 899 413 855 | use strict; | ||||
| 23 | |||||||
| 24 | 91 91 91 | 841 332 775 | use vars qw($VERSION @ISA %valid_option); | ||||
| 25 | 91 91 91 | 907 308 945 | use Carp; | ||||
| 26 | |||||||
| 27 | *isa = \&UNIVERSAL::isa; | ||||||
| 28 | |||||||
| 29 | #start-extract twig_global | ||||||
| 30 | |||||||
| 31 | # constants: element types | ||||||
| 32 | 91 91 91 | 882 326 1109 | use constant (PCDATA => '#PCDATA'); | ||||
| 33 | 91 91 91 | 881 296 630 | use constant (CDATA => '#CDATA'); | ||||
| 34 | 91 91 91 | 879 315 576 | use constant (PI => '#PI'); | ||||
| 35 | 91 91 91 | 906 297 574 | use constant (COMMENT => '#COMMENT'); | ||||
| 36 | 91 91 91 | 885 341 771 | use constant (ENT => '#ENT'); | ||||
| 37 | |||||||
| 38 | # element classes | ||||||
| 39 | 91 91 91 | 1008 315 625 | use constant (ELT => '#ELT'); | ||||
| 40 | 91 91 91 | 834 372 624 | use constant (TEXT => '#TEXT'); | ||||
| 41 | |||||||
| 42 | # element properties | ||||||
| 43 | 91 91 91 | 879 369 619 | use constant (ASIS => '#ASIS'); | ||||
| 44 | 91 91 91 | 1497 463 716 | use constant (EMPTY => '#EMPTY'); | ||||
| 45 | |||||||
| 46 | #end-extract twig_global | ||||||
| 47 | |||||||
| 48 | # used in parseurl to set the buffer size to the same size as in XML::Parser::Expat | ||||||
| 49 | 91 91 91 | 912 305 656 | use constant (BUFSIZE => 32768); | ||||
| 50 | |||||||
| 51 | |||||||
| 52 | # used to store the gi's | ||||||
| 53 | my %gi2index; # gi => index | ||||||
| 54 | my @index2gi; # list of gi's | ||||||
| 55 | my $SPECIAL_GI; # first non-special gi; | ||||||
| 56 | my %base_ent; # base entity character => replacement | ||||||
| 57 | |||||||
| 58 | # flag, set to true if the weaken sub is available | ||||||
| 59 | 91 91 91 | 861 300 593 | use vars qw( $weakrefs); | ||||
| 60 | |||||||
| 61 | #start-extract twig_global | ||||||
| 62 | my $REG_NAME = q{(?:(?:[^\W\d]|[:#])(?:[\w.-]*:)?[\w.-]*)}; # xml name (leading # allowed) | ||||||
| 63 | my $REG_NAME_W = q{(?:(?:[^\W\d_]|[:#])(?:[\w.-]*:)?[\w.-]*|\*)}; # name or wildcard (* or '') (leading # allowed) | ||||||
| 64 | my $REG_REGEXP = q{(?:/(?:[^\\/]|\\.)*/[eimsox]*)}; # regexp | ||||||
| 65 | my $REG_REGEXP_EXP = q{(?:(?:[^\\/]|\\.)*)}; # content of a regexp | ||||||
| 66 | my $REG_REGEXP_MOD = q{(?:[eimso]*)}; # regexp modifiers | ||||||
| 67 | my $REG_MATCH = q{[!=]~}; # match (or not) | ||||||
| 68 | my $REG_STRING = q{(?:"(?:[^\\"]|\\.)*"|'(?:[^\\']|\\.)*')}; # string (simple or double quoted) | ||||||
| 69 | my $REG_NUMBER = q{(?:\d+(?:\.\d*)?|\.\d+)}; # number | ||||||
| 70 | my $REG_VALUE = qq{(?:$REG_STRING|$REG_NUMBER)}; # value | ||||||
| 71 | my $REG_OP = q{=|==|!=|>|<|>=|<=|eq|ne|lt|gt|le|ge}; # op | ||||||
| 72 | my $REG_FUNCTION = q{(?:string|text)\(\s*\)}; | ||||||
| 73 | my $REG_STRING_ARG = qq{(?:string|text)\\(\\s*$REG_NAME_W\\s*\\)}; | ||||||
| 74 | |||||||
| 75 | |||||||
| 76 | # used in the handler trigger code | ||||||
| 77 | my $REG_PREDICATE2 = qq{\\[((?:"[^"]*"|'[^']*'|$REG_STRING_ARG|$REG_FUNCTION|\@$REG_NAME_W|$REG_MATCH\\s*$REG_REGEXP|[\\s\\d><=!()+.-]|(?i:and)|(?i:or))*)\\]}; | ||||||
| 78 | my $REG_NAKED_PREDICATE= qq{((?:"[^"]*"|'[^']*'|$REG_STRING_ARG|$REG_FUNCTION|\@$REG_NAME_W|$REG_MATCH\\s*$REG_REGEXP|[\\s\\d><=!()+.-]|(?i:and)|(?i:or))*)}; | ||||||
| 79 | |||||||
| 80 | # not all axis, only supported ones (in get_xpath) | ||||||
| 81 | my @supported_axis= ( 'ancestor', 'ancestor-or-self', 'child', 'descendant', 'descendant-or-self', | ||||||
| 82 | €€€€€€€€€€€€€€€€€€€€€€'following', 'following-sibling', 'parent', 'preceding', 'preceding-sibling', 'self' | ||||||
| 83 | €€€€€€€€€€€€€€€€€€€€); | ||||||
| 84 | my $REG_AXIS = "(?:" . join( '|', @supported_axis) .")"; | ||||||
| 85 | |||||||
| 86 | # only used in the "xpath"engine (for get_xpath/findnodes) for now | ||||||
| 87 | my $REG_PREDICATE = qr{\[(?:(?:string\(\s*\)|\@$REG_NAME)\s*$REG_MATCH\s*$REG_REGEXP\s*|[^\]]*)\]}; | ||||||
| 88 | |||||||
| 89 | my %PERL_ALPHA_TEST= ( '=' => ' eq ', '!=' => ' ne ', '>' => ' gt ', '>=' => ' ge ', '<' => ' lt ', '<=' => ' le '); | ||||||
| 90 | |||||||
| 91 | #end-extract twig_global | ||||||
| 92 | |||||||
| 93 | my $parser_version; | ||||||
| 94 | my( $FB_HTMLCREF, $FB_XMLCREF); | ||||||
| 95 | |||||||
| 96 | BEGIN | ||||||
| 97 | { | ||||||
| 98 | 91 | 785 | $VERSION = '3.27'; | ||||
| 99 | |||||||
| 100 | 91 91 91 | 1129 387 1284 | use XML::Parser; | ||||
| 101 | 91 | 413 | my $needVersion = '2.23'; | ||||
| 102 | 91 | 428 | $parser_version= $XML::Parser::VERSION; | ||||
| 103 | 91 | 1026 | croak "need at least XML::Parser version $needVersion" unless $parser_version >= $needVersion; | ||||
| 104 | |||||||
| 105 | 91 | 839 | if( $] >= 5.008) | ||||
| 106 | 91 91 91 91 | 416 1267 400 952 | €€{ eval "use Encode qw( :all)"; | ||||
| 107 | 91 | 1608 | €€€€$FB_XMLCREF = 0x0400; # Encode::FB_XMLCREF; | ||||
| 108 | 91 | 920 | €€€€$FB_HTMLCREF = 0x0200; # Encode::FB_HTMLCREF; | ||||
| 109 | €€} | ||||||
| 110 | |||||||
| 111 | # test whether we can use weak references | ||||||
| 112 | # set local empty signal handler to trap error messages | ||||||
| 113 | 91 91 | 376 616 | { local $SIG{__DIE__}; | ||||
| 114 | 91 | 2952 | €€if( eval( 'require Scalar::Util') && defined( &Scalar::Util::weaken) ) | ||||
| 115 | 91 91 | 1220 626 | €€€€{ import Scalar::Util( 'weaken'); $weakrefs= 1; } | ||||
| 116 | €€elsif( eval( 'require WeakRef')) | ||||||
| 117 | 0 0 | 0 0 | €€€€{ import WeakRef; $weakrefs= 1; } | ||||
| 118 | €€else | ||||||
| 119 | 0 | 0 | €€€€{ $weakrefs= 0; } | ||||
| 120 | } | ||||||
| 121 | |||||||
| 122 | 91 | 1028 | import XML::Twig::Elt; | ||||
| 123 | 91 | 659 | import XML::Twig::Entity; | ||||
| 124 | 91 | 624 | import XML::Twig::Entity_list; | ||||
| 125 | |||||||
| 126 | # used to store the gi's | ||||||
| 127 | # should be set for each twig really, at least when there are several | ||||||
| 128 | # the init ensures that special gi's are always the same | ||||||
| 129 | |||||||
| 130 | # gi => index | ||||||
| 131 | # do NOT use => or the constants become quoted! | ||||||
| 132 | 91 | 877 | %XML::Twig::gi2index=( PCDATA, 0, CDATA, 1, PI, 2, COMMENT, 3, ENT, 4); | ||||
| 133 | # list of gi's | ||||||
| 134 | 91 | 672 | @XML::Twig::index2gi=( PCDATA, CDATA, PI, COMMENT, ENT); | ||||
| 135 | |||||||
| 136 | # gi's under this value are special | ||||||
| 137 | 91 | 611 | $XML::Twig::SPECIAL_GI= @XML::Twig::index2gi; | ||||
| 138 | |||||||
| 139 | 91 | 899 | %XML::Twig::base_ent= ( '>' => '>', '<' => '<', '&' => '&', "'" => ''', '"' => '"',); | ||||
| 140 | |||||||
| 141 | # now set some aliases | ||||||
| 142 | 91 | 582 | *find_nodes = *get_xpath; # same as XML::XPath | ||||
| 143 | 91 | 392 | *findnodes = *get_xpath; # same as XML::LibXML | ||||
| 144 | 91 | 390 | *getElementsByTagName = *descendants; | ||||
| 145 | 91 | 378 | *descendants_or_self = *descendants; # valid in XML::Twig, not in XML::Twig::Elt | ||||
| 146 | 91 | 374 | *find_by_tag_name = *descendants; | ||||
| 147 | 91 | 373 | *getElementById = *elt_id; | ||||
| 148 | 91 | 374 | *getEltById = *elt_id; | ||||
| 149 | 91 | 535 | *toString = *sprint; | ||||
| 150 | } | ||||||
| 151 | |||||||
| 152 | @ISA = qw(XML::Parser); | ||||||
| 153 | |||||||
| 154 | # fake gi's used in twig_handlers and start_tag_handlers | ||||||
| 155 | my $ALL = '_all_'; # the associated function is always called | ||||||
| 156 | my $DEFAULT= '_default_'; # the function is called if no other handler has been | ||||||
| 157 | |||||||
| 158 | # some defaults | ||||||
| 159 | my $COMMENTS_DEFAULT= 'keep'; | ||||||
| 160 | my $PI_DEFAULT = 'keep'; | ||||||
| 161 | |||||||
| 162 | |||||||
| 163 | # handlers used in regular mode | ||||||
| 164 | my %twig_handlers=( Start => \&_twig_start, | ||||||
| 165 | €€€€€€€€€€€€€€€€€€€€End => \&_twig_end, | ||||||
| 166 | €€€€€€€€€€€€€€€€€€€€Char => \&_twig_char, | ||||||
| 167 | €€€€€€€€€€€€€€€€€€€€Entity => \&_twig_entity, | ||||||
| 168 | €€€€€€€€€€€€€€€€€€€€XMLDecl => \&_twig_xmldecl, | ||||||
| 169 | €€€€€€€€€€€€€€€€€€€€Doctype => \&_twig_doctype, | ||||||
| 170 | €€€€€€€€€€€€€€€€€€€€Element => \&_twig_element, | ||||||
| 171 | €€€€€€€€€€€€€€€€€€€€Attlist => \&_twig_attlist, | ||||||
| 172 | €€€€€€€€€€€€€€€€€€€€CdataStart => \&_twig_cdatastart, | ||||||
| 173 | €€€€€€€€€€€€€€€€€€€€CdataEnd => \&_twig_cdataend, | ||||||
| 174 | €€€€€€€€€€€€€€€€€€€€Proc => \&_twig_pi, | ||||||
| 175 | €€€€€€€€€€€€€€€€€€€€Comment => \&_twig_comment, | ||||||
| 176 | €€€€€€€€€€€€€€€€€€€€Default => \&_twig_default, | ||||||
| 177 | €€€€€€); | ||||||
| 178 | |||||||
| 179 | # handlers used when twig_roots is used and we are outside of the roots | ||||||
| 180 | my %twig_handlers_roots= | ||||||
| 181 | €€( Start => \&_twig_start_check_roots, | ||||||
| 182 | €€€€End => \&_twig_end_check_roots, | ||||||
| 183 | €€€€Doctype => \&_twig_doctype, | ||||||
| 184 | €€€€Char => undef, Entity => undef, XMLDecl => \&_twig_xmldecl, | ||||||
| 185 | €€€€Element => undef, Attlist => undef, CdataStart => undef, | ||||||
| 186 | €€€€CdataEnd => undef, Proc => undef, Comment => undef, | ||||||
| 187 | €€€€Proc => \&_twig_pi_check_roots, | ||||||
| 188 | €€€€Default => sub {}, # hack needed for XML::Parser 2.27 | ||||||
| 189 | €€); | ||||||
| 190 | |||||||
| 191 | # handlers used when twig_roots and print_outside_roots are used and we are | ||||||
| 192 | # outside of the roots | ||||||
| 193 | my %twig_handlers_roots_print_2_30= | ||||||
| 194 | €€( Start => \&_twig_start_check_roots, | ||||||
| 195 | €€€€End => \&_twig_end_check_roots, | ||||||
| 196 | €€€€Char => \&_twig_print, | ||||||
| 197 | €€€€Entity => \&_twig_print_entity, | ||||||
| 198 | €€€€ExternEnt => \&_twig_print_entity, | ||||||
| 199 | €€€€DoctypeFin => \&_twig_doctype_fin_print, | ||||||
| 200 | €€€€XMLDecl => \&_twig_print, | ||||||
| 201 | €€€€Doctype => \&_twig_print_doctype, # because recognized_string is broken here | ||||||
| 202 | €€€€# Element => \&_twig_print, Attlist => \&_twig_print, | ||||||
| 203 | €€€€CdataStart => \&_twig_print, CdataEnd => \&_twig_print, | ||||||
| 204 | €€€€Proc => \&_twig_pi_check_roots, Comment => \&_twig_print, | ||||||
| 205 | €€€€Default => \&_twig_print_check_doctype, | ||||||
| 206 | €€); | ||||||
| 207 | |||||||
| 208 | # handlers used when twig_roots, print_outside_roots and keep_encoding are used | ||||||
| 209 | # and we are outside of the roots | ||||||
| 210 | my %twig_handlers_roots_print_original_2_30= | ||||||
| 211 | €€( Start => \&_twig_start_check_roots, | ||||||
| 212 | €€€€End => \&_twig_end_check_roots, | ||||||
| 213 | €€€€Char => \&_twig_print_original, | ||||||
| 214 | €€€€# I have no idea why I should not be using this handler! | ||||||
| 215 | €€€€Entity => \&_twig_print_entity, | ||||||
| 216 | €€€€ExternEnt => \&_twig_print_entity, | ||||||
| 217 | €€€€DoctypeFin => \&_twig_doctype_fin_print, | ||||||
| 218 | €€€€XMLDecl => \&_twig_print_original, | ||||||
| 219 | €€€€Doctype => \&_twig_print_original_doctype, # because original_string is broken here | ||||||
| 220 | €€€€Element => \&_twig_print_original, Attlist => \&_twig_print_original, | ||||||
| 221 | €€€€CdataStart => \&_twig_print_original, CdataEnd => \&_twig_print_original, | ||||||
| 222 | €€€€Proc => \&_twig_pi_check_roots, Comment => \&_twig_print_original, | ||||||
| 223 | €€€€Default => \&_twig_print_original_check_doctype, | ||||||
| 224 | €€); | ||||||
| 225 | |||||||
| 226 | # handlers used when twig_roots and print_outside_roots are used and we are | ||||||
| 227 | # outside of the roots | ||||||
| 228 | my %twig_handlers_roots_print_2_27= | ||||||
| 229 | €€( Start => \&_twig_start_check_roots, | ||||||
| 230 | €€€€End => \&_twig_end_check_roots, | ||||||
| 231 | €€€€Char => \&_twig_print, | ||||||
| 232 | €€€€# I have no idea why I should not be using this handler! | ||||||
| 233 | €€€€#Entity => \&_twig_print, | ||||||
| 234 | €€€€XMLDecl => \&_twig_print, Doctype => \&_twig_print, | ||||||
| 235 | €€€€CdataStart => \&_twig_print, CdataEnd => \&_twig_print, | ||||||
| 236 | €€€€Proc => \&_twig_pi_check_roots, Comment => \&_twig_print, | ||||||
| 237 | €€€€Default => \&_twig_print, | ||||||
| 238 | €€); | ||||||
| 239 | |||||||
| 240 | # handlers used when twig_roots, print_outside_roots and keep_encoding are used | ||||||
| 241 | # and we are outside of the roots | ||||||
| 242 | my %twig_handlers_roots_print_original_2_27= | ||||||
| 243 | €€( Start => \&_twig_start_check_roots, | ||||||
| 244 | €€€€End => \&_twig_end_check_roots, | ||||||
| 245 | €€€€Char => \&_twig_print_original, | ||||||
| 246 | €€€€# for some reason original_string is wrong here | ||||||
| 247 | €€€€# this can be a problem if the doctype includes non ascii characters | ||||||
| 248 | €€€€XMLDecl => \&_twig_print, Doctype => \&_twig_print, | ||||||
| 249 | €€€€# I have no idea why I should not be using this handler! | ||||||
| 250 | €€€€Entity => \&_twig_print, | ||||||
| 251 | €€€€#Element => undef, Attlist => undef, | ||||||
| 252 | €€€€CdataStart => \&_twig_print_original, CdataEnd => \&_twig_print_original, | ||||||
| 253 | €€€€Proc => \&_twig_pi_check_roots, Comment => \&_twig_print_original, | ||||||
| 254 | €€€€Default => \&_twig_print, # _twig_print_original does not work | ||||||
| 255 | €€); | ||||||
| 256 | |||||||
| 257 | |||||||
| 258 | my %twig_handlers_roots_print= $parser_version > 2.27 | ||||||
| 259 | €€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€? %twig_handlers_roots_print_2_30 | ||||||
| 260 | €€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€: %twig_handlers_roots_print_2_27; | ||||||
| 261 | my %twig_handlers_roots_print_original= $parser_version > 2.27 | ||||||
| 262 | €€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€? %twig_handlers_roots_print_original_2_30 | ||||||
| 263 | €€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€: %twig_handlers_roots_print_original_2_27; | ||||||
| 264 | |||||||
| 265 | |||||||
| 266 | # handlers used when the finish_print method has been called | ||||||
| 267 | my %twig_handlers_finish_print= | ||||||
| 268 | €€( Start => \&_twig_print, | ||||||
| 269 | €€€€End => \&_twig_print, Char => \&_twig_print, | ||||||
| 270 | €€€€Entity => \&_twig_print, XMLDecl => \&_twig_print, | ||||||
| 271 | €€€€Doctype => \&_twig_print, Element => \&_twig_print, | ||||||
| 272 | €€€€Attlist => \&_twig_print, CdataStart => \&_twig_print, | ||||||
| 273 | €€€€CdataEnd => \&_twig_print, Proc => \&_twig_print, | ||||||
| 274 | €€€€Comment => \&_twig_print, Default => \&_twig_print, | ||||||
| 275 | €€); | ||||||
| 276 | |||||||
| 277 | # handlers used when the finish_print method has been called and the keep_encoding | ||||||
| 278 | # option is used | ||||||
| 279 | my %twig_handlers_finish_print_original= | ||||||
| 280 | €€( Start => \&_twig_print_original, End => \&_twig_print_end_original, | ||||||
| 281 | €€€€Char => \&_twig_print_original, Entity => \&_twig_print_original, | ||||||
| 282 | €€€€XMLDecl => \&_twig_print_original, Doctype => \&_twig_print_original, | ||||||
| 283 | €€€€Element => \&_twig_print_original, Attlist => \&_twig_print_original, | ||||||
| 284 | €€€€CdataStart => \&_twig_print_original, CdataEnd => \&_twig_print_original, | ||||||
| 285 | €€€€Proc => \&_twig_print_original, Comment => \&_twig_print_original, | ||||||
| 286 | €€€€Default => \&_twig_print_original, | ||||||
| 287 | €€); | ||||||
| 288 | |||||||
| 289 | # handlers used whithin ignored elements | ||||||
| 290 | my %twig_handlers_ignore= | ||||||
| 291 | €€( Start => \&_twig_ignore_start, | ||||||
| 292 | €€€€End => \&_twig_ignore_end, | ||||||
| 293 | €€€€Char => undef, Entity => undef, XMLDecl => undef, | ||||||
| 294 | €€€€Doctype => undef, Element => undef, Attlist => undef, | ||||||
| 295 | €€€€CdataStart => undef, CdataEnd => undef, Proc => undef, | ||||||
| 296 | €€€€Comment => undef, Default => undef, | ||||||
| 297 | €€); | ||||||
| 298 | |||||||
| 299 | |||||||
| 300 | # those handlers are only used if the entities are NOT to be expanded | ||||||
| 301 | my %twig_noexpand_handlers= ( Default => \&_twig_default ); | ||||||
| 302 | |||||||
| 303 | my @saved_default_handler; | ||||||
| 304 | |||||||
| 305 | my $ID= 'id'; # default value, set by the Id argument | ||||||
| 306 | |||||||
| 307 | # all allowed options | ||||||
| 308 | %valid_option= | ||||||
| 309 | €€€€( # XML::Twig options | ||||||
| 310 | €€€€€€TwigHandlers => 1, Id => 1, | ||||||
| 311 | €€€€€€TwigRoots => 1, TwigPrintOutsideRoots => 1, | ||||||
| 312 | €€€€€€StartTagHandlers => 1, EndTagHandlers => 1, | ||||||
| 313 | €€€€€€ForceEndTagHandlersUsage => 1, | ||||||
| 314 | €€€€€€DoNotChainHandlers => 1, | ||||||
| 315 | €€€€€€IgnoreElts => 1, | ||||||
| 316 | €€€€€€Index => 1, | ||||||
| 317 | €€€€€€CharHandler => 1, | ||||||
| 318 | €€€€€€TopDownHandlers => 1, | ||||||
| 319 | €€€€€€KeepEncoding => 1, DoNotEscapeAmpInAtts => 1, | ||||||
| 320 | €€€€€€ParseStartTag => 1, KeepAttsOrder => 1, | ||||||
| 321 | €€€€€€LoadDTD => 1, DTDHandler => 1, | ||||||
| 322 | €€€€€€DoNotOutputDTD => 1, NoProlog => 1, | ||||||
| 323 | €€€€€€ExpandExternalEnts => 1, | ||||||
| 324 | €€€€€€DiscardSpaces => 1, KeepSpaces => 1, | ||||||
| 325 | €€€€€€DiscardSpacesIn => 1, KeepSpacesIn => 1, | ||||||
| 326 | €€€€€€PrettyPrint => 1, EmptyTags => 1, | ||||||
| 327 | €€€€€€Quote => 'double', | ||||||
| 328 | €€€€€€Comments => 1, Pi => 1, | ||||||
| 329 | €€€€€€OutputFilter => 1, InputFilter => 1, | ||||||
| 330 | €€€€€€OutputTextFilter => 1, | ||||||
| 331 | €€€€€€OutputEncoding => 1, | ||||||
| 332 | €€€€€€RemoveCdata => 1, | ||||||
| 333 | €€€€€€EltClass => 1, | ||||||
| 334 | €€€€€€MapXmlns => 1, KeepOriginalPrefix => 1, | ||||||
| 335 | €€€€€€# XML::Parser options | ||||||
| 336 | €€€€€€ErrorContext => 1, ProtocolEncoding => 1, | ||||||
| 337 | €€€€€€Namespaces => 1, NoExpand => 1, | ||||||
| 338 | €€€€€€Stream_Delimiter => 1, ParseParamEnt => 1, | ||||||
| 339 | €€€€€€NoLWP => 1, Non_Expat_Options => 1, | ||||||
| 340 | €€€€€€Xmlns => 1, | ||||||
| 341 | €€€€); | ||||||
| 342 | |||||||
| 343 | # predefined input and output filters | ||||||
| 344 | 91 91 91 | 927 317 913 | use vars qw( %filter); | ||||
| 345 | %filter= ( html => \&html_encode, | ||||||
| 346 | €€€€€€€€€€€safe => \&safe_encode, | ||||||
| 347 | €€€€€€€€€€€safe_hex => \&safe_encode_hex, | ||||||
| 348 | €€€€€€€€€); | ||||||
| 349 | |||||||
| 350 | |||||||
| 351 | # trigger types (used to sort them) | ||||||
| 352 | my ($XPATH_TRIGGER, $REGEXP_TRIGGER, $LEVEL_TRIGGER)=(1..3); | ||||||
| 353 | |||||||
| 354 | sub new | ||||||
| 355 | 2728 | 370809 | €€{ my ($class, %args) = @_; | ||||
| 356 | 2728 | 9556 | €€€€my $handlers; | ||||
| 357 | |||||||
| 358 | €€€€# change all nice_perlish_names into nicePerlishNames | ||||||
| 359 | 2728 | 17204 | €€€€%args= _normalize_args( %args); | ||||
| 360 | |||||||
| 361 | €€€€# check options | ||||||
| 362 | 2728 | 23380 | €€€€unless( $args{MoreOptions}) | ||||
| 363 | 2727 | 18906 | €€€€€€{ foreach my $arg (keys %args) | ||||
| 364 | 5015 | 42659 | €€€€€€€€{ carp "invalid option $arg" unless $valid_option{$arg}; } | ||||
| 365 | €€€€€€} | ||||||
| 366 | |||||||
| 367 | €€€€# a twig is really an XML::Parser | ||||||
| 368 | €€€€# my $self= XML::Parser->new(%args); | ||||||
| 369 | 2728 | 11713 | €€€€my $self; | ||||
| 370 | 2728 | 25173 | €€€€$self= XML::Parser->new(%args); | ||||
| 371 | |||||||
| 372 | 2728 | 526056 | €€€€bless $self, $class; | ||||
| 373 | |||||||
| 374 | 2728 | 17153 | €€€€$self->{_twig_context_stack}= []; | ||||
| 375 | |||||||
| 376 | 2728 | 17164 | €€€€if( exists $args{TwigHandlers}) | ||||
| 377 | 151 | 1014 | €€€€€€{ $handlers= $args{TwigHandlers}; | ||||
| 378 | 151 | 933 | €€€€€€€€$self->setTwigHandlers( $handlers); | ||||
| 379 | 145 | 1395 | €€€€€€€€delete $args{TwigHandlers}; | ||||
| 380 | €€€€€€} | ||||||
| 381 | |||||||
| 382 | €€€€# take care of twig-specific arguments | ||||||
| 383 | 2722 | 14376 | €€€€if( exists $args{StartTagHandlers}) | ||||
| 384 | 23 | 165 | €€€€€€{ $self->setStartTagHandlers( $args{StartTagHandlers}); | ||||
| 385 | 23 | 220 | €€€€€€€€delete $args{StartTagHandlers}; | ||||
| 386 | €€€€€€} | ||||||
| 387 | |||||||
| 388 | 2722 | 15084 | €€€€if( exists $args{DoNotChainHandlers}) | ||||
| 389 | 1 | 10 | €€€€€€{ $self->{twig_do_not_chain_handlers}= $args{DoNotChainHandlers}; } | ||||
| 390 | |||||||
| 391 | 2722 | 14088 | €€€€if( exists $args{IgnoreElts}) | ||||
| 392 | 3 | 22 | €€€€€€{ $self->setIgnoreEltsHandlers( $args{IgnoreElts}); | ||||
| 393 | 3 | 29 | €€€€€€€€delete $args{IgnoreElts}; | ||||
| 394 | €€€€€€} | ||||||
| 395 | |||||||
| 396 | 2722 | 13262 | €€€€if( exists $args{Index}) | ||||
| 397 | 2 | 9 | €€€€€€{ my $index= $args{Index}; | ||||
| 398 | €€€€€€€€# we really want a hash name => path, we turn an array into a hash if necessary | ||||||
| 399 | 2 | 14 | €€€€€€€€if( ref( $index) eq 'ARRAY') | ||||
| 400 | 1 2 | 5 13 | €€€€€€€€€€{ my %index= map { $_ => $_ } @$index; | ||||
| 401 | 1 | 11 | €€€€€€€€€€€€$index= \%index; | ||||
| 402 | €€€€€€€€€€} | ||||||
| 403 | 2 | 19 | €€€€€€€€while( my( $name, $exp)= each %$index) | ||||
| 404 | 3 4 4 4 | 47 14 29 28 | €€€€€€€€€€{ $self->setTwigHandler( $exp, sub { push @{$_[0]->{_twig_index}->{$name}}, $_; 1; }); } | ||||
| 405 | €€€€€€} | ||||||
| 406 | |||||||
| 407 | 2722 | 35381 | €€€€$self->{twig_elt_class}= $args{EltClass} || 'XML::Twig::Elt'; | ||||
| 408 | 2722 62 | 14615 605 | €€€€if( exists( $args{EltClass})) { delete $args{EltClass}; } | ||||
| 409 | |||||||
| 410 | 2722 | 12792 | €€€€if( exists( $args{MapXmlns})) | ||||
| 411 | 14 | 79 | €€€€€€{ $self->{twig_map_xmlns}= $args{MapXmlns}; | ||||
| 412 | 14 | 77 | €€€€€€€€$self->{Namespaces}=1; | ||||
| 413 | 14 | 121 | €€€€€€€€delete $args{MapXmlns}; | ||||
| 414 | €€€€€€} | ||||||
| 415 | |||||||
| 416 | 2722 | 13127 | €€€€if( exists( $args{KeepOriginalPrefix})) | ||||
| 417 | 4 | 23 | €€€€€€{ $self->{twig_keep_original_prefix}= $args{KeepOriginalPrefix}; | ||||
| 418 | 4 | 35 | €€€€€€€€delete $args{KeepOriginalPrefix}; | ||||
| 419 | €€€€€€} | ||||||
| 420 | |||||||
| 421 | 2722 | 13840 | €€€€$self->{twig_dtd_handler}= $args{DTDHandler}; | ||||
| 422 | 2722 | 9989 | €€€€delete $args{DTDHandler}; | ||||
| 423 | |||||||
| 424 | 2722 | 13442 | €€€€if( $args{ExpandExternalEnts}) | ||||
| 425 | 2 | 13 | €€€€€€{ $self->set_expand_external_entities( 1); | ||||
| 426 | 2 | 9 | €€€€€€€€$self->{twig_read_external_dtd}= 1; # implied by ExpandExternalEnts | ||||
| 427 | 2 | 8 | €€€€€€€€delete $args{LoadDTD}; | ||||
| 428 | 2 | 16 | €€€€€€€€delete $args{ExpandExternalEnts}; | ||||
| 429 | €€€€€€} | ||||||
| 430 | |||||||
| 431 | 2722 | 14525 | €€€€if( $args{DoNotEscapeAmpInAtts}) | ||||
| 432 | 1 | 6 | €€€€€€{ $self->set_do_not_escape_amp_in_atts( 1); | ||||
| 433 | 1 | 5 | €€€€€€€€$self->{twig_do_not_escape_amp_in_atts}=1; | ||||
| 434 | €€€€€€} | ||||||
| 435 | €€€€else | ||||||
| 436 | 2721 | 16492 | €€€€€€{ $self->set_do_not_escape_amp_in_atts( 0); | ||||
| 437 | 2721 | 15360 | €€€€€€€€$self->{twig_do_not_escape_amp_in_atts}=0; | ||||
| 438 | €€€€€€} | ||||||
| 439 | |||||||
| 440 | €€€€# deal with TwigRoots argument, a hash of elements for which | ||||||
| 441 | €€€€# subtrees will be built (and associated handlers) | ||||||
| 442 | |||||||
| 443 | 2722 | 14021 | €€€€if( $args{TwigRoots}) | ||||
| 444 | 58 | 396 | €€€€€€{ $self->setTwigRoots( $args{TwigRoots}); | ||||
| 445 | 56 | 493 | €€€€€€€€delete $args{TwigRoots}; | ||||
| 446 | €€€€€€} | ||||||
| 447 | |||||||
| 448 | 2720 | 13185 | €€€€if( $args{EndTagHandlers}) | ||||
| 449 | 9 | 87 | €€€€€€{ unless ($self->{twig_roots} || $args{ForceEndTagHandlersUsage}) | ||||
| 450 | 1 | 9 | €€€€€€€€€€{ croak "you should not use EndTagHandlers without TwigRoots\n", | ||||
| 451 | €€€€€€€€€€€€€€€€€€"if you want to use it anyway, normally because you have ", | ||||||
| 452 | €€€€€€€€€€€€€€€€€€"a start_tag_handlers that calls 'ignore' and you want to ", | ||||||
| 453 | €€€€€€€€€€€€€€€€€€"call an ent_tag_handlers at the end of the element, then ", | ||||||
| 454 | €€€€€€€€€€€€€€€€€€"pass 'force_end_tag_handlers_usage => 1' as an argument ", | ||||||
| 455 | €€€€€€€€€€€€€€€€€€"to new"; | ||||||
| 456 | €€€€€€€€€€} | ||||||
| 457 | |||||||
| 458 | 8 | 60 | €€€€€€€€$self->setEndTagHandlers( $args{EndTagHandlers}); | ||||
| 459 | 8 | 90 | €€€€€€€€delete $args{EndTagHandlers}; | ||||
| 460 | €€€€€€} | ||||||
| 461 | |||||||
| 462 | 2719 | 12928 | €€€€if( $args{TwigPrintOutsideRoots}) | ||||
| 463 | 31 | 197 | €€€€€€{ croak "cannot use TwigPrintOutsideRoots without TwigRoots" | ||||
| 464 | €€€€€€€€€€unless( $self->{twig_roots}); | ||||||
| 465 | €€€€€€€€# if the arg is a filehandle then store it | ||||||
| 466 | 30 | 169 | €€€€€€€€if( _is_fh( $args{TwigPrintOutsideRoots}) ) | ||||
| 467 | 28 | 288 | €€€€€€€€€€{ $self->{twig_output_fh}= $args{TwigPrintOutsideRoots}; } | ||||
| 468 | 30 | 307 | €€€€€€€€$self->{twig_default_print}= $args{TwigPrintOutsideRoots}; | ||||
| 469 | €€€€€€} | ||||||
| 470 | |||||||
| 471 | €€€€# space policy | ||||||
| 472 | 2718 | 13386 | €€€€if( $args{KeepSpaces}) | ||||
| 473 | 17 | 98 | €€€€€€{ croak "cannot use both keep_spaces and discard_spaces" if( $args{DiscardSpaces}); | ||||
| 474 | 16 | 85 | €€€€€€€€croak "cannot use both keep_spaces and keep_spaces_in" if( $args{KeepSpacesIn}); | ||||
| 475 | 15 | 65 | €€€€€€€€$self->{twig_keep_spaces}=1; | ||||
| 476 | 15 | 131 | €€€€€€€€delete $args{KeepSpaces}; | ||||
| 477 | €€€€€€} | ||||||
| 478 | 2716 | 13067 | €€€€if( $args{DiscardSpaces}) | ||||
| 479 | 2 | 17 | €€€€€€{ croak "cannot use both discard_spaces and keep_spaces_in" if( $args{KeepSpacesIn}); | ||||
| 480 | 1 | 5 | €€€€€€€€$self->{twig_discard_spaces}=1; | ||||
| 481 | 1 | 8 | €€€€€€€€delete $args{DiscardSpaces}; | ||||
| 482 | €€€€€€} | ||||||
| 483 | 2715 | 12640 | €€€€if( $args{KeepSpacesIn}) | ||||
| 484 | 8 | 50 | €€€€€€{ croak "cannot use both keep_spaces_in and discard_spaces_in" if( $args{DiscardSpacesIn}); | ||||
| 485 | 7 | 33 | €€€€€€€€$self->{twig_discard_spaces}=1; | ||||
| 486 | 7 | 36 | €€€€€€€€$self->{twig_keep_spaces_in}={}; | ||||
| 487 | 7 7 | 25 44 | €€€€€€€€my @tags= @{$args{KeepSpacesIn}}; | ||||
| 488 | 7 9 | 34 63 | €€€€€€€€foreach my $tag (@tags) { $self->{twig_keep_spaces_in}->{$tag}=1; } | ||||
| 489 | 7 | 74 | €€€€€€€€delete $args{KeepSpacesIn}; | ||||
| 490 | €€€€€€} | ||||||
| 491 | 2714 | 13027 | €€€€if( $args{DiscardSpacesIn}) | ||||
| 492 | 4 | 17 | €€€€€€{ $self->{twig_keep_spaces}=1; | ||||
| 493 | 4 | 20 | €€€€€€€€$self->{twig_discard_spaces_in}={}; | ||||
| 494 | 4 4 | 14 23 | €€€€€€€€my @tags= @{$args{DiscardSpacesIn}}; | ||||
| 495 | 4 6 | 20 39 | €€€€€€€€foreach my $tag (@tags) { $self->{twig_discard_spaces_in}->{$tag}=1; } | ||||
| 496 | 4 | 36 | €€€€€€€€delete $args{DiscardSpacesIn}; | ||||
| 497 | €€€€€€} | ||||||
| 498 | €€€€# discard spaces by default | ||||||
| 499 | 2714 | 31336 | €€€€$self->{twig_discard_spaces}= 1 unless( $self->{twig_keep_spaces}); | ||||
| 500 | |||||||
| 501 | 2714 | 19757 | €€€€$args{Comments}||= $COMMENTS_DEFAULT; | ||||
| 502 | 2714 3 | 23478 16 | €€€€if( $args{Comments} eq 'drop') { $self->{twig_keep_comments}= 0; } | ||||
| 503 | 1825 | 8727 | €€€€elsif( $args{Comments} eq 'keep') { $self->{twig_keep_comments}= 1; } | ||||
| 504 | 885 | 4342 | €€€€elsif( $args{Comments} eq 'process') { $self->{twig_process_comments}= 1; } | ||||
| 505 | 1 | 10 | €€€€else { croak "wrong value for comments argument: '$args{Comments}' (should be 'drop', 'keep' or 'process')"; } | ||||
| 506 | 2713 | 11590 | €€€€delete $args{Comments}; | ||||
| 507 | |||||||
| 508 | 2713 | 18768 | €€€€$args{Pi}||= $PI_DEFAULT; | ||||
| 509 | 2713 2 | 23627 10 | €€€€if( $args{Pi} eq 'drop') { $self->{twig_keep_pi}= 0; } | ||||
| 510 | 1827 | 8977 | €€€€elsif( $args{Pi} eq 'keep') { $self->{twig_keep_pi}= 1; } | ||||
| 511 | 883 | 4334 | €€€€elsif( $args{Pi} eq 'process') { $self->{twig_process_pi}= 1; } | ||||
| 512 | 1 | 9 | €€€€else { croak "wrong value for pi argument: '$args{Pi}' (should be 'drop', 'keep' or 'process')"; } | ||||
| 513 | 2712 | 10671 | €€€€delete $args{Pi}; | ||||
| 514 | |||||||
| 515 | 2712 | 17769 | €€€€if( $args{KeepEncoding}) | ||||
| 516 | €€€€€€{ | ||||||
| 517 | €€€€€€€€# set it in XML::Twig::Elt so print functions know what to do | ||||||
| 518 | 967 | 5454 | €€€€€€€€$self->set_keep_encoding( 1); | ||||
| 519 | 967 | 13300 | €€€€€€€€$self->{parse_start_tag}= $args{ParseStartTag} || \&_parse_start_tag; | ||||
| 520 | 967 | 5980 | €€€€€€€€delete $args{ParseStartTag} if defined( $args{ParseStartTag}) ; | ||||
| 521 | 967 | 4011 | €€€€€€€€delete $args{KeepEncoding}; | ||||
| 522 | €€€€€€} | ||||||
| 523 | €€€€else | ||||||
| 524 | 1745 | 11253 | €€€€€€{ $self->set_keep_encoding( 0); | ||||
| 525 | 1745 | 9836 | €€€€€€€€$self->{parse_start_tag}= $args{ParseStartTag} if( $args{ParseStartTag}); | ||||
| 526 | €€€€€€} | ||||||
| 527 | |||||||
| 528 | 2712 | 13753 | €€€€if( $args{OutputFilter}) | ||||
| 529 | 5 | 30 | €€€€€€{ $self->set_output_filter( $args{OutputFilter}); | ||||
| 530 | 5 | 21 | €€€€€€€€delete $args{OutputFilter}; | ||||
| 531 | €€€€€€} | ||||||
| 532 | €€€€else | ||||||
| 533 | 2707 | 15181 | €€€€€€{ $self->set_output_filter( 0); } | ||||
| 534 | |||||||
| 535 | 2712 | 13488 | €€€€if( $args{RemoveCdata}) | ||||
| 536 | 1 | 6 | €€€€€€{ $self->set_remove_cdata( $args{RemoveCdata}); | ||||
| 537 | 1 | 5 | €€€€€€€€delete $args{RemoveCdata}; | ||||
| 538 | €€€€€€} | ||||||
| 539 | €€€€else | ||||||
| 540 | 2711 | 14432 | €€€€€€{ $self->set_remove_cdata( 0); } | ||||
| 541 | |||||||
| 542 | 2712 | 13512 | €€€€if( $args{OutputTextFilter}) | ||||
| 543 | 5 | 29 | €€€€€€{ $self->set_output_text_filter( $args{OutputTextFilter}); | ||||
| 544 | 5 | 24 | €€€€€€€€delete $args{OutputTextFilter}; | ||||
| 545 | €€€€€€} | ||||||
| 546 | €€€€else | ||||||
| 547 | 2707 | 13176 | €€€€€€{ $self->set_output_text_filter( 0); } | ||||
| 548 | |||||||
| 549 | |||||||
| 550 | 2712 | 13640 | €€€€if( exists $args{KeepAttsOrder}) | ||||
| 551 | 6 | 33 | €€€€€€{ $self->{keep_atts_order}= $args{KeepAttsOrder}; | ||||
| 552 | 6 | 30 | €€€€€€€€if( _use( 'Tie::IxHash')) | ||||
| 553 | 5 | 33 | €€€€€€€€€€{ $self->set_keep_atts_order( $self->{keep_atts_order}); } | ||||
| 554 | €€€€€€€€else | ||||||
| 555 | 1 | 6 | €€€€€€€€€€{ croak "Tie::IxHash not available, option keep_atts_order not allowed"; } | ||||
| 556 | €€€€€€} | ||||||
| 557 | €€€€else | ||||||
| 558 | 2706 | 14954 | €€€€€€{ $self->set_keep_atts_order( 0); } | ||||
| 559 | |||||||
| 560 | |||||||
| 561 | 2711 39 | 13702 275 | €€€€if( $args{PrettyPrint}) { $self->set_pretty_print( $args{PrettyPrint}); } | ||||
| 562 | 2711 1 | 12201 8 | €€€€if( $args{Quote}) { $self->set_quote( $args{Quote}); } | ||||
| 563 | 2711 11 | 12636 70 | €€€€if( $args{EmptyTags}) { $self->set_empty_tag_style( $args{EmptyTags}) } | ||||
| 564 | |||||||
| 565 | 2711 1 1 | 12623 6 9 | €€€€if( exists $args{Id}) { $ID= $args{Id}; delete $args{ID}; } | ||||
| 566 | 2711 3 3 | 12383 15 26 | €€€€if( $args{NoProlog}) { $self->{no_prolog}= 1; delete $args{NoProlog}; } | ||||
| 567 | 2711 2 2 | 12421 10 19 | €€€€if( $args{DoNotOutputDTD}) { $self->{no_dtd_output}= 1; delete $args{DoNotOutputDTD}; } | ||||
| 568 | 2711 4 4 | 11934 25 35 | €€€€if( $args{LoadDTD}) { $self->{twig_read_external_dtd}= 1; delete $args{LoadDTD}; } | ||||
| 569 | 2711 1 1 | 12005 7 10 | €€€€if( $args{CharHandler}) { $self->setCharHandler( $args{CharHandler}); delete $args{CharHandler}; } | ||||
| 570 | |||||||
| 571 | 2711 3 1 | 12247 24 8 | €€€€if( $args{InputFilter}) { $self->set_input_filter( $args{InputFilter}); delete $args{InputFilter}; } | ||||
| 572 | 2709 1 1 | 13147 8 84 | €€€€if( $args{NoExpand}) { $self->setHandlers( %twig_noexpand_handlers); $self->{twig_no_expand}=1; } | ||||
| 573 | 2709 1 1 | 13513 6 9 | €€€€if( my $output_encoding= $args{OutputEncoding}) { $self->set_output_encoding( $output_encoding); delete $args{OutputFilter}; } | ||||
| 574 | |||||||
| 575 | 2709 4 4 | 12997 17 33 | €€€€if( my $tdh= $args{TopDownHandlers}) { $self->{twig_tdh}=1; delete $args{TopDownHandlers}; } | ||||
| 576 | |||||||
| 577 | €€€€# set handlers | ||||||
| 578 | 2709 | 14285 | €€€€if( $self->{twig_roots}) | ||||
| 579 | 56 | 304 | €€€€€€{ if( $self->{twig_default_print}) | ||||
| 580 | 30 | 173 | €€€€€€€€€€{ if( $self->{twig_keep_encoding}) | ||||
| 581 | 6 | 81 | €€€€€€€€€€€€€€{ $self->setHandlers( %twig_handlers_roots_print_original); } | ||||
| 582 | €€€€€€€€€€€€else | ||||||
| 583 | 24 | 289 | €€€€€€€€€€€€€€{ $self->setHandlers( %twig_handlers_roots_print); } | ||||
| 584 | €€€€€€€€€€} | ||||||
| 585 | €€€€€€€€else | ||||||
| 586 | 26 | 309 | €€€€€€€€€€{ $self->setHandlers( %twig_handlers_roots); } | ||||
| 587 | €€€€€€} | ||||||
| 588 | €€€€else | ||||||
| 589 | 2653 | 29562 | €€€€€€{ $self->setHandlers( %twig_handlers); } | ||||
| 590 | |||||||
| 591 | €€€€# XML::Parser::Expat does not like these handler to be set. So in order to | ||||||
| 592 | €€€€# use the various sets of handlers on XML::Parser or XML::Parser::Expat | ||||||
| 593 | €€€€# objects when needed, these ones have to be set only once, here, at | ||||||
| 594 | €€€€# XML::Parser level | ||||||
| 595 | 2709 | 1681892 | €€€€$self->setHandlers( Init => \&_twig_init, Final => \&_twig_final); | ||||
| 596 | |||||||
| 597 | 2709 | 340346 | €€€€$self->{twig_entity_list}= XML::Twig::Entity_list->new; | ||||
| 598 | |||||||
| 599 | 2709 | 14041 | €€€€$self->{twig_id}= $ID; | ||||
| 600 | 2709 | 12798 | €€€€$self->{twig_stored_spaces}=''; | ||||
| 601 | |||||||
| 602 | 2709 | 14207 | €€€€$self->{twig_autoflush}= 1; # auto flush by default | ||||
| 603 | |||||||
| 604 | 2709 | 12227 | €€€€$self->{twig}= $self; | ||||
| 605 | 2709 | 39467 | €€€€weaken( $self->{twig}) if( $weakrefs); | ||||
| 606 | |||||||
| 607 | 2709 | 20669 | €€€€return $self; | ||||
| 608 | €€} | ||||||
| 609 | |||||||
| 610 | sub parse | ||||||
| 611 | €€{ | ||||||
| 612 | €€€€# if called as a class method, calls nparse, which creates the twig then parses it | ||||||
| 613 | 2714 3 | 25772 22 | €€€€if( !ref( $_[0])) { return shift->nparse( @_); } | ||||
| 614 | |||||||
| 615 | €€€€# requires 5.006 at least (or the ${^UNICODE} causes a problem) # > 5.006 | ||||||
| 616 | €€€€# trap underlying bug in IO::Handle (see RT #17500) # > 5.006 | ||||||
| 617 | €€€€# croak if perl 5.8+, -CD (or PERL_UNICODE set to D) and parsing a pipe # > 5.006 | ||||||
| 618 | 2711 | 47175 | €€€€if( $]>=5.008 && ${^UNICODE} && (${^UNICODE} & 24) && isa( $_[1], 'GLOB') && -p $_[1] ) # > 5.006 | ||||
| 619 | 0 | 0 | €€€€€€{ croak "cannot parse the output of a pipe when perl is set to use the UTF8 perlIO layer\n" # > 5.006 | ||||
| 620 | €€€€€€€€€€€€€€. "set the environment variable PERL_UNICODE or use the -C option (see perldoc perlrun)\n" # > 5.006 | ||||||
| 621 | €€€€€€€€€€€€€€. "not to include 'D'"; # > 5.006 | ||||||
| 622 | €€€€€€} # > 5.006 | ||||||
| 623 | 2711 | 35670 | €€€€shift->SUPER::parse( @_); | ||||
| 624 | €€} | ||||||
| 625 | |||||||
| 626 | 3 | 20 | sub parsefile_inplace { shift->_parse_inplace( parsefile => @_); } | ||||
| 627 | 3 | 26 | sub parsefile_html_inplace { shift->_parse_inplace( parsefile_html => @_); } | ||||
| 628 | |||||||
| 629 | sub _parse_inplace | ||||||
| 630 | 6 | 81 | €€{ my( $t, $method, $file, $suffix)= @_; | ||||
| 631 | 6 | 31 | €€€€_use( 'File::Temp') || die "need File::Temp to use inplace methods\n"; | ||||
| 632 | 6 | 28 | €€€€_use( 'File::Basename'); | ||||
| 633 | |||||||
| 634 | |||||||
| 635 | 6 | 41 | €€€€my $tmpdir= dirname( $file); | ||||
| 636 | 6 | 44 | €€€€my( $tmpfh, $tmpfile)= File::Temp::tempfile( DIR => $tmpdir); | ||||
| 637 | 6 | 39 | €€€€my $original_fh= select $tmpfh; | ||||
| 638 | |||||||
| 639 | 6 | 47 | €€€€$t->$method( $file); | ||||
| 640 | |||||||
| 641 | 6 | 49 | €€€€select $original_fh; | ||||
| 642 | 6 | 380 | €€€€close $tmpfh; | ||||
| 643 | 6 | 88 | €€€€my $mode= (stat( $file))[2] & 07777; | ||||
| 644 | 6 | 97 | €€€€chmod $mode, $tmpfile or croak "cannot change temp file mode to $mode: $!"; | ||||
| 645 | |||||||
| 646 | 6 | 42 | €€€€if( $suffix) | ||||
| 647 | 4 | 15 | €€€€€€{ my $backup; | ||||
| 648 | 4 2 | 31 21 | €€€€€€€€if( $suffix=~ m{\*}) { ($backup = $suffix) =~ s/\*/$file/g; } | ||||
| 649 | 2 | 9 | €€€€€€€€else { $backup= $file . $suffix; } | ||||
| 650 | |||||||
| 651 | 4 | 123 | €€€€€€€€rename( $file, $backup) or die "cannot backup initial file ($file) to $backup: $!"; | ||||
| 652 | €€€€€€} | ||||||
| 653 | 6 | 205 | €€€€rename( $tmpfile, $file) or die "cannot rename temp file ($tmpfile) to initial file ($file): $!"; | ||||
| 654 | |||||||
| 655 | 6 | 27 | €€€€return $t; | ||||
| 656 | €€} | ||||||
| 657 | |||||||
| 658 | |||||||
| 659 | sub parseurl | ||||||
| 660 | 11 | 55 | €€{ my $t= shift; | ||||
| 661 | 11 | 87 | €€€€$t->_parseurl( 0, @_); | ||||
| 662 | €€} | ||||||
| 663 | |||||||
| 664 | sub safe_parseurl | ||||||
| 665 | 9 | 59 | €€{ my $t= shift; | ||||
| 666 | 9 | 80 | €€€€$t->_parseurl( 1, @_); | ||||
| 667 | €€} | ||||||
| 668 | |||||||
| 669 | |||||||
| 670 | sub parsefile_html | ||||||
| 671 | 4 | 18 | €€{ my $t= shift; | ||||
| 672 | 4 | 16 | €€€€my $file= shift; | ||||
| 673 | 4 | 29 | €€€€my $indent= $t->{ErrorContext} ? 1 : 0; | ||||
| 674 | 4 | 22 | €€€€$t->parse( _html2xml( _slurp( $file), { indent => $indent }), @_); | ||||
| 675 | 4 | 208 | €€€€return $t; | ||||
| 676 | €€} | ||||||
| 677 | |||||||
| 678 | sub parse_html | ||||||
| 679 | 21 | 99 | €€{ my $t= shift; | ||||
| 680 | 21 | 197 | €€€€my $content= shift; | ||||
| 681 | 21 | 140 | €€€€my $indent= $t->{ErrorContext} ? 1 : 0; | ||||
| 682 | 21 | 1398 | €€€€$t->parse( _html2xml( isa( $content, 'GLOB') ? _slurp_fh( $content) : $content, { indent => $indent }), @_); | ||||
| 683 | 16 | 849 | €€€€return $t; | ||||
| 684 | €€} | ||||||
| 685 | |||||||
| 686 | sub xparse | ||||||
| 687 | 1827 | 7309 | €€{ my $t= shift; | ||||
| 688 | 1827 | 6310 | €€€€my $to_parse= $_[0]; | ||||
| 689 | 1827 1 | 20836 7 | €€€€if( isa( $to_parse, 'GLOB')) { $t->parse( @_); } | ||||
| 690 | 1816 | 15520 | €€€€elsif( $to_parse=~ m{^\s*<}) { $to_parse=~ m{<html}i ? $t->_parse_as_xml_or_html( @_) | ||||
| 691 | €€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€: $t->parse( @_); | ||||||
| 692 | €€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€} | ||||||
| 693 | 2 | 9 | €€€€elsif( $to_parse=~ m{^\w+://.*\.html?$}) { _use( 'LWP::Simple') or croak "missing LWP::Simple"; | ||||
| 694 | 1 | 8 | €€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€$t->_parse_as_xml_or_html( LWP::Simple::get( shift()), @_); | ||||
| 695 | €€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€} | ||||||
| 696 | 2 | 11 | €€€€elsif( $to_parse=~ m{^\w+://}) { _use( 'LWP::Simple') or croak "missing LWP::Simple"; | ||||
| 697 | 1 | 7 | €€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€my $doc= LWP::Simple::get( shift); | ||||
| 698 | 1 | 17 | €€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€my $xml_parse_ok= $t->safe_parse( $doc, @_); | ||||
| 699 | 1 | 8 | €€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€if( $xml_parse_ok) | ||||
| 700 | 1 | 9 | €€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€{ return $xml_parse_ok; } | ||||
| 701 | €€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€else | ||||||
| 702 | 0 | 0 | €€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€{ my $diag= $@; | ||||
| 703 | 0 | 0 | €€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€if( $doc=~ m{<html}i) | ||||
| 704 | 0 | 0 | €€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€{ $t->parse_html( $doc, @_); } | ||||
| 705 | €€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€else | ||||||
| 706 | 0 | 0 | €€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€{ die $diag; } | ||||
| 707 | €€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€} | ||||||
| 708 | €€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€} | ||||||
| 709 | 2 | 12 | €€€€elsif( $to_parse=~ m{\.html?$}) { my $content= _slurp( shift); | ||||
| 710 | 2 | 14 | €€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€$t->_parse_as_xml_or_html( $content, @_); | ||||
| 711 | €€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€} | ||||||
| 712 | 4 | 40 | €€€€else { $t->parsefile( @_); } | ||||
| 713 | €€} | ||||||
| 714 | |||||||
| 715 | sub _parse_as_xml_or_html | ||||||
| 716 | 10 | 225 | €€{ my $t= shift; | ||||
| 717 | 10 | 61 | €€€€$t->safe_parse( @_) || $t->parse_html( @_); | ||||
| 718 | €€} | ||||||
| 719 | |||||||
| 720 | |||||||
| 721 | sub nparse | ||||||
| 722 | 1827 | 134279 | €€{ my $class= shift; | ||||
| 723 | 1827 | 8087 | €€€€my $to_parse= pop; | ||||
| 724 | 1827 | 12153 | €€€€$class->new( @_)->xparse( $to_parse); | ||||
| 725 | €€} | ||||||
| 726 | |||||||
| 727 | 1 | 19 | sub nparse_pp { shift()->nparse( pretty_print => 'indented', @_); } | ||||
| 728 | 4 | 370 | sub nparse_e { shift()->nparse( error_context => 1, @_); } | ||||
| 729 | 1 | 21 | sub nparse_ppe { shift()->nparse( pretty_print => 'indented', error_context => 1, @_); } | ||||
| 730 | |||||||
| 731 | |||||||
| 732 | sub _html2xml | ||||||
| 733 | 25 | 186 | €€{ my( $html, $options)= @_; | ||||
| 734 | 25 | 130 | €€€€_use( 'HTML::TreeBuilder', '3.13') or croak "cannot parse HTML: missing HTML::TreeBuilder v >= 3.13\n"; | ||||
| 735 | 24 | 211 | €€€€my $tree= HTML::TreeBuilder->new; | ||||
| 736 | 24 | 154 | €€€€$tree->ignore_ignorable_whitespace( 0); | ||||
| 737 | 24 | 147 | €€€€$tree->no_space_compacting( 1); | ||||
| 738 | 24 | 137 | €€€€$tree->store_comments( 1); | ||||
| 739 | 24 | 534 | €€€€$tree->store_pis(1); | ||||
| 740 | 24 | 180 | €€€€$tree->parse( $html); | ||||
| 741 | 24 | 233 | €€€€$tree->eof; | ||||
| 742 | 24 | 210 | €€€€my $xml= _latin12utf8( $tree->as_XML); | ||||
| 743 | 24 | 197 | €€€€$xml=~ s{(?<=.)<\?xml version="1.0" encoding="utf-8"\?+>}{}g; | ||||
| 744 | |||||||
| 745 | 24 4 | 162 23 | €€€€if( $options->{indent}) { _indent_xhtml( \$xml); } | ||||
| 746 | 24 | 171 | €€€€$tree->delete; | ||||
| 747 | 24 | 977 | €€€€return $xml; | ||||
| 748 | €€} | ||||||
| 749 | |||||||
| 750 | sub _latin12utf8 | ||||||
| 751 | 24 | 285 | €€{ my $string= shift; | ||||
| 752 | 24 | 260 | €€€€local $SIG{__DIE__}; | ||||
| 753 | 24 | 134 | €€€€if( _use( 'Encode')) | ||||
| 754 | 24 | 179 | €€€€€€{ from_to( $string, 'iso-8859-15' => 'utf8'); | ||||
| 755 | 24 | 224 | €€€€€€€€return $string; | ||||
| 756 | €€€€€€} | ||||||
| 757 | 0 | 0 | €€€€if( _use( 'Text::Iconv')) | ||||
| 758 | 0 | 0 | €€€€€€{ my $converter = eval { Text::Iconv->new("iso-8859-15" => "utf8") } | ||||
| 759 | 0 0 | 0 0 | €€€€€€€€€€€€€€€€€€€€€€|| eval { Text::Iconv->new("8859" => "646") }; # for old Solaris | ||||
| 760 | 0 0 | 0 0 | €€€€€€€€if( $converter) { return $converter->convert( $string); } | ||||
| 761 | €€€€€€} | ||||||
| 762 | 0 | 0 | €€€€if( _use( 'Unicode::Map8') && _use( 'Unicode::String')) | ||||
| 763 | 0 | 0 | €€€€€€{ my $l1_map= Unicode::Map8->new("latin1"); | ||||
| 764 | 0 | 0 | €€€€€€€€return $l1_map->tou( $string)->utf8; | ||||
| 765 | €€€€€€} | ||||||
| 766 | 0 | 0 | €€€€return $string; | ||||
| 767 | €€} | ||||||
| 768 | |||||||
| 769 | sub _indent_xhtml | ||||||
| 770 | 4 | 18 | €€{ my( $xhtml)= @_; # $xhtml is a ref | ||||
| 771 | 4 156 | 36 751 | €€€€my %block_tag= map { $_ => 1 } qw( html | ||||
| 772 | €€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€head | ||||||
| 773 | €€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€meta title link script base | ||||||
| 774 | €€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€body | ||||||
| 775 | €€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€h1 h2 h3 h4 h5 h6 | ||||||
| 776 | €€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€€p br address blockquote pre | ||||||
| 777 | |||||||