deprecated/buildtools/buildsystemtools/lib/XML/Parser/Expat.pod
changeset 662 60be34e1b006
parent 655 3f65fd25dfd4
equal deleted inserted replaced
654:7c11c3d8d025 662:60be34e1b006
       
     1 =head1 WARNING
       
     2 
       
     3 This manual page was copied from the XML::Parser distribution (version 2.27)
       
     4 written by Clark Cooper. You can find newer versions at CPAN.
       
     5 
       
     6 =head1 NAME
       
     7 
       
     8 XML::Parser::Expat - Low-level access to James Clark's expat XML parser
       
     9 
       
    10 =head1 SYNOPSIS
       
    11 
       
    12  use XML::Parser::Expat;
       
    13 
       
    14  $parser = new XML::Parser::Expat;
       
    15  $parser->setHandlers('Start' => \&sh,
       
    16 		      'End'   => \&eh,
       
    17                       'Char'  => \&ch);
       
    18  open(FOO, 'info.xml') or die "Couldn't open";
       
    19  $parser->parse(*FOO);
       
    20  close(FOO);
       
    21  # $parser->parse('<foo id="me"> here <em>we</em> go </foo>');
       
    22 
       
    23  sub sh
       
    24  {
       
    25    my ($p, $el, %atts) = @_;
       
    26    $p->setHandlers('Char' => \&spec)
       
    27      if ($el eq 'special');
       
    28    ...
       
    29  }
       
    30 
       
    31  sub eh
       
    32  {
       
    33    my ($p, $el) = @_;
       
    34    $p->setHandlers('Char' => \&ch)  # Special elements won't contain
       
    35      if ($el eq 'special');         # other special elements
       
    36    ...
       
    37  } 
       
    38 
       
    39 =head1 DESCRIPTION
       
    40 
       
    41 This module provides an interface to James Clark's XML parser, expat. As in
       
    42 expat, a single instance of the parser can only parse one document. Calls
       
    43 to parsestring after the first for a given instance will die.
       
    44 
       
    45 Expat (and XML::Parser::Expat) are event based. As the parser recognizes
       
    46 parts of the document (say the start or end of an XML element), any
       
    47 handlers registered for that type of event are called with suitable
       
    48 parameters.
       
    49 
       
    50 =head1 METHODS
       
    51 
       
    52 =over 4
       
    53 
       
    54 =item new
       
    55 
       
    56 This is a class method, the constructor for XML::Parser::Expat. Options are
       
    57 passed as keyword value pairs. The recognized options are:
       
    58 
       
    59 =over 4
       
    60 
       
    61 =item * ProtocolEncoding
       
    62 
       
    63 The protocol encoding name. The default is none. The expat built-in
       
    64 encodings are: C<UTF-8>, C<ISO-8859-1>, C<UTF-16>, and C<US-ASCII>.
       
    65 Other encodings may be used if they have encoding maps in one of the
       
    66 directories in the @Encoding_Path list. Setting the protocol encoding
       
    67 overrides any encoding in the XML declaration.
       
    68 
       
    69 =item * Namespaces
       
    70 
       
    71 When this option is given with a true value, then the parser does namespace
       
    72 processing. By default, namespace processing is turned off. When it is
       
    73 turned on, the parser consumes I<xmlns> attributes and strips off prefixes
       
    74 from element and attribute names where those prefixes have a defined
       
    75 namespace. A name's namespace can be found using the L<"namespace"> method
       
    76 and two names can be checked for absolute equality with the L<"eq_name">
       
    77 method.
       
    78 
       
    79 =item * NoExpand
       
    80 
       
    81 Normally, the parser will try to expand references to entities defined in
       
    82 the internal subset. If this option is set to a true value, and a default
       
    83 handler is also set, then the default handler will be called when an
       
    84 entity reference is seen in text. This has no effect if a default handler
       
    85 has not been registered, and it has no effect on the expansion of entity
       
    86 references inside attribute values.
       
    87 
       
    88 =item * Stream_Delimiter
       
    89 
       
    90 This option takes a string value. When this string is found alone on a line
       
    91 while parsing from a stream, then the parse is ended as if it saw an end of
       
    92 file. The intended use is with a stream of xml documents in a MIME multipart
       
    93 format. The string should not contain a trailing newline.
       
    94 
       
    95 =item * ErrorContext
       
    96 
       
    97 When this option is defined, errors are reported in context. The value
       
    98 of ErrorContext should be the number of lines to show on either side of
       
    99 the line in which the error occurred.
       
   100 
       
   101 =item * ParseParamEnt
       
   102 
       
   103 Unless standalone is set to "yes" in the XML declaration, setting this to
       
   104 a true value allows the external DTD to be read, and parameter entities
       
   105 to be parsed and expanded.
       
   106 
       
   107 =item * Base
       
   108 
       
   109 The base to use for relative pathnames or URLs. This can also be done by
       
   110 using the base method.
       
   111 
       
   112 =back
       
   113 
       
   114 =item setHandlers(TYPE, HANDLER [, TYPE, HANDLER [...]])
       
   115 
       
   116 This method registers handlers for the various events. If no handlers are
       
   117 registered, then a call to parsestring or parsefile will only determine if
       
   118 the corresponding XML document is well formed (by returning without error.)
       
   119 This may be called from within a handler, after the parse has started.
       
   120 
       
   121 Setting a handler to something that evaluates to false unsets that
       
   122 handler.
       
   123 
       
   124 This method returns a list of type, handler pairs corresponding to the
       
   125 input. The handlers returned are the ones that were in effect before the
       
   126 call to setHandlers.
       
   127 
       
   128 The recognized events and the parameters passed to the corresponding
       
   129 handlers are:
       
   130 
       
   131 =over 4
       
   132 
       
   133 =item * Start		(Parser, Element [, Attr, Val [,...]])
       
   134 
       
   135 This event is generated when an XML start tag is recognized. Parser is
       
   136 an XML::Parser::Expat instance. Element is the name of the XML element that
       
   137 is opened with the start tag. The Attr & Val pairs are generated for each
       
   138 attribute in the start tag.
       
   139 
       
   140 =item * End		(Parser, Element)
       
   141 
       
   142 This event is generated when an XML end tag is recognized. Note that
       
   143 an XML empty tag (<foo/>) generates both a start and an end event.
       
   144 
       
   145 There is always a lower level start and end handler installed that wrap
       
   146 the corresponding callbacks. This is to handle the context mechanism.
       
   147 A consequence of this is that the default handler (see below) will not
       
   148 see a start tag or end tag unless the default_current method is called.
       
   149 
       
   150 =item * Char		(Parser, String)
       
   151 
       
   152 This event is generated when non-markup is recognized. The non-markup
       
   153 sequence of characters is in String. A single non-markup sequence of
       
   154 characters may generate multiple calls to this handler. Whatever the
       
   155 encoding of the string in the original document, this is given to the
       
   156 handler in UTF-8.
       
   157 
       
   158 =item * Proc		(Parser, Target, Data)
       
   159 
       
   160 This event is generated when a processing instruction is recognized.
       
   161 
       
   162 =item * Comment		(Parser, String)
       
   163 
       
   164 This event is generated when a comment is recognized.
       
   165 
       
   166 =item * CdataStart	(Parser)
       
   167 
       
   168 This is called at the start of a CDATA section.
       
   169 
       
   170 =item * CdataEnd	(Parser)
       
   171 
       
   172 This is called at the end of a CDATA section.
       
   173 
       
   174 =item * Default		(Parser, String)
       
   175 
       
   176 This is called for any characters that don't have a registered handler.
       
   177 This includes both characters that are part of markup for which no
       
   178 events are generated (markup declarations) and characters that
       
   179 could generate events, but for which no handler has been registered.
       
   180 
       
   181 Whatever the encoding in the original document, the string is returned to
       
   182 the handler in UTF-8.
       
   183 
       
   184 =item * Unparsed		(Parser, Entity, Base, Sysid, Pubid, Notation)
       
   185 
       
   186 This is called for a declaration of an unparsed entity. Entity is the name
       
   187 of the entity. Base is the base to be used for resolving a relative URI.
       
   188 Sysid is the system id. Pubid is the public id. Notation is the notation
       
   189 name. Base and Pubid may be undefined.
       
   190 
       
   191 =item * Notation		(Parser, Notation, Base, Sysid, Pubid)
       
   192 
       
   193 This is called for a declaration of notation. Notation is the notation name.
       
   194 Base is the base to be used for resolving a relative URI. Sysid is the system
       
   195 id. Pubid is the public id. Base, Sysid, and Pubid may all be undefined.
       
   196 
       
   197 =item * ExternEnt		(Parser, Base, Sysid, Pubid)
       
   198 
       
   199 This is called when an external entity is referenced. Base is the base to be
       
   200 used for resolving a relative URI. Sysid is the system id. Pubid is the public
       
   201 id. Base and Pubid may be undefined.
       
   202 
       
   203 This handler should either return a string, which represents the contents of
       
   204 the external entity, or return an open filehandle that can be read to obtain
       
   205 the contents of the external entity, or return undef, which indicates the
       
   206 external entity couldn't be found and will generate a parse error.
       
   207 
       
   208 If an open filehandle is returned, it must be returned as either a glob
       
   209 (*FOO) or as a reference to a glob (e.g. an instance of IO::Handle). The
       
   210 parser will close the filehandle after using it.
       
   211 
       
   212 =item * Entity			(Parser, Name, Val, Sysid, Pubid, Ndata)
       
   213 
       
   214 This is called when an entity is declared. For internal entities, the Val
       
   215 parameter will contain the value and the remaining three parameters will
       
   216 be undefined. For external entities, the Val parameter
       
   217 will be undefined, the Sysid parameter will have the system id, the Pubid
       
   218 parameter will have the public id if it was provided (it will be undefined
       
   219 otherwise), the Ndata parameter will contain the notation for unparsed
       
   220 entities. If this is a parameter entity declaration, then a '%' will be
       
   221 prefixed to the name.
       
   222 
       
   223 Note that this handler and the Unparsed handler above overlap. If both are
       
   224 set, then this handler will not be called for unparsed entities.
       
   225 
       
   226 =item * Element			(Parser, Name, Model)
       
   227 
       
   228 The element handler is called when an element declaration is found. Name is
       
   229 the element name, and Model is the content model as a string.
       
   230 
       
   231 =item * Attlist			(Parser, Elname, Attname, Type, Default, Fixed)
       
   232 
       
   233 This handler is called for each attribute in an ATTLIST declaration.
       
   234 So an ATTLIST declaration that has multiple attributes
       
   235 will generate multiple calls to this handler. The Elname parameter is the
       
   236 name of the element with which the attribute is being associated. The Attname
       
   237 parameter is the name of the attribute. Type is the attribute type, given as
       
   238 a string. Default is the default value, which will either be "#REQUIRED",
       
   239 "#IMPLIED" or a quoted string (i.e. the returned string will begin and end
       
   240 with a quote character). If Fixed is true, then this is a fixed attribute.
       
   241 
       
   242 =item * Doctype			(Parser, Name, Sysid, Pubid, Internal)
       
   243 
       
   244 This handler is called for DOCTYPE declarations. Name is the document type
       
   245 name. Sysid is the system id of the document type, if it was provided,
       
   246 otherwise it's undefined. Pubid is the public id of the document type,
       
   247 which will be undefined if no public id was given. Internal is the internal
       
   248 subset, given as a string. If there was no internal subset, it will be
       
   249 undefined. Internal will contain all whitespace, comments, processing
       
   250 instructions, and declarations seen in the internal subset. The declarations
       
   251 will be there whether or not they have been processed by another handler
       
   252 (except for unparsed entities processed by the Unparsed handler). However,
       
   253 comments and processing instructions will not appear if they've been processed
       
   254 by their respective handlers.
       
   255 
       
   256 =item * XMLDecl			(Parser, Version, Encoding, Standalone)
       
   257 
       
   258 This handler is called for xml declarations. Version is a string containing
       
   259 the version. Encoding is either undefined or contains an encoding string.
       
   260 Standalone will be either true, false, or undefined if the standalone attribute
       
   261 is yes, no, or not specified, respectively.
       
   262 
       
   263 =back
       
   264 
       
   265 =item namespace(name)
       
   266 
       
   267 Return the URI of the namespace that the name belongs to. If the name doesn't
       
   268 belong to any namespace, an undef is returned. This is only valid on names
       
   269 received through the Start or End handlers from a single document, or through
       
   270 a call to the generate_ns_name method. In other words, don't use names
       
   271 generated from one instance of XML::Parser::Expat with other instances.
       
   272 
       
   273 =item eq_name(name1, name2)
       
   274 
       
   275 Return true if name1 and name2 are identical (i.e. same name and from
       
   276 the same namespace.) This is only meaningful if both names were obtained
       
   277 through the Start or End handlers from a single document, or through
       
   278 a call to the generate_ns_name method.
       
   279 
       
   280 =item generate_ns_name(name, namespace)
       
   281 
       
   282 Return a name, associated with a given namespace, good for using with the
       
   283 above 2 methods. The namespace argument should be the namespace URI, not
       
   284 a prefix.
       
   285 
       
   286 =item new_ns_prefixes
       
   287 
       
   288 When called from a start tag handler, returns namespace prefixes declared
       
   289 with this start tag. If called elsewhere (or if there were no namespace
       
   290 prefixes declared), it returns an empty list. Setting of the default
       
   291 namespace is indicated with '#default' as a prefix.
       
   292 
       
   293 =item expand_ns_prefix(prefix)
       
   294 
       
   295 Return the uri to which the given prefix is currently bound. Returns
       
   296 undef if the prefix isn't currently bound. Use '#default' to find the
       
   297 current binding of the default namespace (if any).
       
   298 
       
   299 =item current_ns_prefixes
       
   300 
       
   301 Return a list of currently bound namespace prefixes. The order of the
       
   302 prefixes in the list has no meaning. If the default namespace is
       
   303 currently bound, '#default' appears in the list.
       
   304 
       
   305 =item recognized_string
       
   306 
       
   307 Returns the string from the document that was recognized in order to call
       
   308 the current handler. For instance, when called from a start handler, it
       
   309 will give us the start-tag string. The string is encoded in UTF-8.
       
   310 
       
   311 =item original_string
       
   312 
       
   313 Returns the verbatim string from the document that was recognized in
       
   314 order to call the current handler. The string is in the original document
       
   315 encoding.
       
   316 
       
   317 =item default_current
       
   318 
       
   319 When called from a handler, causes the sequence of characters that generated
       
   320 the corresponding event to be sent to the default handler (if one is
       
   321 registered). Use of this method is deprecated in favor of the recognized_string
       
   322 method, which you can use without installing a default handler.
       
   323 
       
   324 =item xpcroak(message)
       
   325 
       
   326 Concatenate onto the given message the current line number within the
       
   327 XML document plus the message implied by ErrorContext. Then croak with
       
   328 the formed message.
       
   329 
       
   330 =item xpcarp(message)
       
   331 
       
   332 Concatenate onto the given message the current line number within the
       
   333 XML document plus the message implied by ErrorContext. Then carp with
       
   334 the formed message.
       
   335 
       
   336 =item current_line
       
   337 
       
   338 Returns the line number of the current position of the parse.
       
   339 
       
   340 =item current_column
       
   341 
       
   342 Returns the column number of the current position of the parse.
       
   343 
       
   344 =item current_byte
       
   345 
       
   346 Returns the current position of the parse.
       
   347 
       
   348 =item base([NEWBASE]);
       
   349 
       
   350 Returns the current value of the base for resolving relative URIs. If
       
   351 NEWBASE is supplied, changes the base to that value.
       
   352 
       
   353 =item context
       
   354 
       
   355 Returns a list of element names that represent open elements, with the
       
   356 last one being the innermost. Inside start and end tag handlers, this
       
   357 will be the tag of the parent element.
       
   358 
       
   359 =item current_element
       
   360 
       
   361 Returns the name of the innermost currently opened element. Inside
       
   362 start or end handlers, returns the parent of the element associated
       
   363 with those tags.
       
   364 
       
   365 =item in_element(NAME)
       
   366 
       
   367 Returns true if NAME is equal to the name of the innermost currently opened
       
   368 element. If namespace processing is being used and you want to check
       
   369 against a name that may be in a namespace, then use the generate_ns_name
       
   370 method to create the NAME argument.
       
   371 
       
   372 =item within_element(NAME)
       
   373 
       
   374 Returns the number of times the given name appears in the context list.
       
   375 If namespace processing is being used and you want to check
       
   376 against a name that may be in a namespace, then use the generate_ns_name
       
   377 method to create the NAME argument.
       
   378 
       
   379 =item depth
       
   380 
       
   381 Returns the size of the context list.
       
   382 
       
   383 =item element_index
       
   384 
       
   385 Returns an integer that is the depth-first visit order of the current
       
   386 element. This will be zero outside of the root element. For example,
       
   387 this will return 1 when called from the start handler for the root element
       
   388 start tag.
       
   389 
       
   390 =item skip_until(INDEX)
       
   391 
       
   392 INDEX is an integer that represents an element index. When this method
       
   393 is called, all handlers are suspended until the start tag for an element
       
   394 that has an index number equal to INDEX is seen. If a start handler has
       
   395 been set, then this is the first tag that the start handler will see
       
   396 after skip_until has been called.
       
   397 
       
   398 
       
   399 =item position_in_context(LINES)
       
   400 
       
   401 Returns a string that shows the current parse position. LINES should be
       
   402 an integer >= 0 that represents the number of lines on either side of the
       
   403 current parse line to place into the returned string.
       
   404 
       
   405 =item xml_escape(TEXT [, CHAR [, CHAR ...]])
       
   406 
       
   407 Returns TEXT with markup characters turned into character entities. Any
       
   408 additional characters provided as arguments are also turned into character
       
   409 references where found in TEXT.
       
   410 
       
   411 =item parse (SOURCE)
       
   412 
       
   413 The SOURCE parameter should either be a string containing the whole XML
       
   414 document, or it should be an open IO::Handle. Only a single document
       
   415 may be parsed for a given instance of XML::Parser::Expat, so this will croak
       
   416 if it's been called previously for this instance.
       
   417 
       
   418 =item parsestring(XML_DOC_STRING)
       
   419 
       
   420 Parses the given string as an XML document. Only a single document may be
       
   421 parsed for a given instance of XML::Parser::Expat, so this will die if either
       
   422 parsestring or parsefile has been called for this instance previously.
       
   423 
       
   424 This method is deprecated in favor of the parse method.
       
   425 
       
   426 =item parsefile(FILENAME)
       
   427 
       
   428 Parses the XML document in the given file. Will die if parsestring or
       
   429 parsefile has been called previously for this instance.
       
   430 
       
   431 =item is_defaulted(ATTNAME)
       
   432 
       
   433 NO LONGER WORKS. To find out if an attribute is defaulted please use
       
   434 the specified_attr method.
       
   435 
       
   436 =item specified_attr
       
   437 
       
   438 When the start handler receives lists of attributes and values, the
       
   439 non-defaulted (i.e. explicitly specified) attributes occur in the list
       
   440 first. This method returns the number of specified items in the list.
       
   441 So if this number is equal to the length of the list, there were no
       
   442 defaulted values. Otherwise the number points to the index of the
       
   443 first defaulted attribute name.
       
   444 
       
   445 =item finish
       
   446 
       
   447 Unsets all handlers (including internal ones that set context), but expat
       
   448 continues parsing to the end of the document or until it finds an error.
       
   449 It should finish up a lot faster than with the handlers set.
       
   450 
       
   451 =item release
       
   452 
       
   453 There are data structures used by XML::Parser::Expat that have circular
       
   454 references. This means that these structures will never be garbage
       
   455 collected unless these references are explicitly broken. Calling this
       
   456 method breaks those references (and makes the instance unusable.)
       
   457 
       
   458 Normally, higher level calls handle this for you, but if you are using
       
   459 XML::Parser::Expat directly, then it's your responsibility to call it.
       
   460 
       
   461 =back
       
   462 
       
   463 =head2 XML::Parser::ExpatNB Methods
       
   464 
       
   465 The class XML::Parser::ExpatNB is a subclass of XML::Parser::Expat used
       
   466 for non-blocking access to the expat library. It does not support the parse,
       
   467 parsestring, or parsefile methods, but it does have these additional methods:
       
   468 
       
   469 =over 4
       
   470 
       
   471 =item parse_more(DATA)
       
   472 
       
   473 Feed expat more text to munch on.
       
   474 
       
   475 =item parse_done
       
   476 
       
   477 Tell expat that it's gotten the whole document.
       
   478 
       
   479 =back
       
   480 
       
   481 =head1 FUNCTIONS
       
   482 
       
   483 =over 4
       
   484 
       
   485 =item XML::Parser::Expat::load_encoding(ENCODING)
       
   486 
       
   487 Load an external encoding. ENCODING is either the name of an encoding or
       
   488 the name of a file. The basename is converted to lowercase and a '.enc'
       
   489 extension is appended unless there's one already there. Then, unless
       
   490 it's an absolute pathname (i.e. begins with '/'), the first file by that
       
   491 name discovered in the @Encoding_Path path list is used.
       
   492 
       
   493 The encoding in the file is loaded and kept in the %Encoding_Table
       
   494 table. Earlier encodings of the same name are replaced.
       
   495 
       
   496 This function is automatically called by expat when it encounters an encoding
       
   497 it doesn't know about. Expat shouldn't call this twice for the same
       
   498 encoding name. The only reason users should use this function is to
       
   499 explicitly load an encoding not contained in the @Encoding_Path list.
       
   500 
       
   501 =back
       
   502 
       
   503 =head1 AUTHORS
       
   504 
       
   505 Larry Wall <F<larry@wall.org>> wrote version 1.0.
       
   506 
       
   507 Clark Cooper <F<coopercc@netheaven.com>> picked up support, changed the API
       
   508 for this version (2.x), provided documentation, and added some standard
       
   509 package features.
       
   510 
       
   511 =cut