| Home | Trees | Indices | Help |
|
|---|
|
|
1 #
2 # ElementTree
3 # $Id: ElementTree.py 3440 2008-07-18 14:45:01Z fredrik $
4 #
5 # light-weight XML support for Python 2.3 and later.
6 #
7 # history (since 1.2.6):
8 # 2005-11-12 fl added tostringlist/fromstringlist helpers
9 # 2006-07-05 fl merged in selected changes from the 1.3 sandbox
10 # 2006-07-05 fl removed support for 2.1 and earlier
11 # 2007-06-21 fl added deprecation/future warnings
12 # 2007-08-25 fl added doctype hook, added parser version attribute etc
13 # 2007-08-26 fl added new serializer code (better namespace handling, etc)
14 # 2007-08-27 fl warn for broken /tag searches on tree level
15 # 2007-09-02 fl added html/text methods to serializer (experimental)
16 # 2007-09-05 fl added method argument to tostring/tostringlist
17 # 2007-09-06 fl improved error handling
18 # 2007-09-13 fl added itertext, iterfind; assorted cleanups
19 # 2007-12-15 fl added C14N hooks, copy method (experimental)
20 #
21 # Copyright (c) 1999-2008 by Fredrik Lundh. All rights reserved.
22 #
23 # fredrik@pythonware.com
24 # http://www.pythonware.com
25 #
26 # --------------------------------------------------------------------
27 # The ElementTree toolkit is
28 #
29 # Copyright (c) 1999-2008 by Fredrik Lundh
30 #
31 # By obtaining, using, and/or copying this software and/or its
32 # associated documentation, you agree that you have read, understood,
33 # and will comply with the following terms and conditions:
34 #
35 # Permission to use, copy, modify, and distribute this software and
36 # its associated documentation for any purpose and without fee is
37 # hereby granted, provided that the above copyright notice appears in
38 # all copies, and that both that copyright notice and this permission
39 # notice appear in supporting documentation, and that the name of
40 # Secret Labs AB or the author not be used in advertising or publicity
41 # pertaining to distribution of the software without specific, written
42 # prior permission.
43 #
44 # SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD
45 # TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANT-
46 # ABILITY AND FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR
47 # BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY
48 # DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
49 # WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
50 # ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
51 # OF THIS SOFTWARE.
52 # --------------------------------------------------------------------
53
54 # Licensed to PSF under a Contributor Agreement.
55 # See http://www.python.org/psf/license for licensing details.
56
57 __all__ = [
58 # public symbols
59 "Comment",
60 "dump",
61 "Element", "ElementTree",
62 "fromstring", "fromstringlist",
63 "iselement", "iterparse",
64 "parse", "ParseError",
65 "PI", "ProcessingInstruction",
66 "QName",
67 "SubElement",
68 "tostring", "tostringlist",
69 "TreeBuilder",
70 "VERSION",
71 "XML",
72 "XMLParser", "XMLTreeBuilder",
73 ]
74
75 VERSION = "1.3.0"
76
77 ##
78 # The <b>Element</b> type is a flexible container object, designed to
79 # store hierarchical data structures in memory. The type can be
80 # described as a cross between a list and a dictionary.
81 # <p>
82 # Each element has a number of properties associated with it:
83 # <ul>
84 # <li>a <i>tag</i>. This is a string identifying what kind of data
85 # this element represents (the element type, in other words).</li>
86 # <li>a number of <i>attributes</i>, stored in a Python dictionary.</li>
87 # <li>a <i>text</i> string.</li>
88 # <li>an optional <i>tail</i> string.</li>
89 # <li>a number of <i>child elements</i>, stored in a Python sequence</li>
90 # </ul>
91 #
92 # To create an element instance, use the {@link #Element} constructor
93 # or the {@link #SubElement} factory function.
94 # <p>
95 # The {@link #ElementTree} class can be used to wrap an element
96 # structure, and convert it from and to XML.
97 ##
98
99 import sys
100 import re
101 import warnings
102
103
105 # emulate pre-1.2 find/findtext/findall behaviour
112 elem = self.find(element, tag)
113 if elem is None:
114 return default
115 return elem.text or ""
117 if tag[:3] == ".//":
118 for elem in element.iter(tag[3:]):
119 yield elem
120 for elem in element:
121 if elem.tag == tag:
122 yield elem
125
126 try:
127 from . import ElementPath
128 except ImportError:
129 ElementPath = _SimpleElementPath()
130
131 ##
132 # Parser error. This is a subclass of <b>SyntaxError</b>.
133 # <p>
134 # In addition to the exception value, an exception instance contains a
135 # specific exception code in the <b>code</b> attribute, and the line and
136 # column of the error in the <b>position</b> attribute.
137
140
141 # --------------------------------------------------------------------
142
143 ##
144 # Checks if an object appears to be a valid element object.
145 #
146 # @param element An element instance.
147 # @return A true value if this is an element object.
148 # @defreturn flag
149
151 # FIXME: not sure about this; might be a better idea to look
152 # for tag/attrib/text attributes
153 return isinstance(element, Element) or hasattr(element, "tag")
154
155 ##
156 # Element class. This class defines the Element interface, and
157 # provides a reference implementation of this interface.
158 # <p>
159 # The element name, attribute names, and attribute values can be
160 # either ASCII strings (ordinary Python strings containing only 7-bit
161 # ASCII characters) or Unicode strings.
162 #
163 # @param tag The element name.
164 # @param attrib An optional dictionary, containing element attributes.
165 # @param **extra Additional attributes, given as keyword arguments.
166 # @see Element
167 # @see SubElement
168 # @see Comment
169 # @see ProcessingInstruction
170
172 # <tag attrib>text<child/>...</tag>tail
173
174 ##
175 # (Attribute) Element tag.
176
177 tag = None
178
179 ##
180 # (Attribute) Element attribute dictionary. Where possible, use
181 # {@link #Element.get},
182 # {@link #Element.set},
183 # {@link #Element.keys}, and
184 # {@link #Element.items} to access
185 # element attributes.
186
187 attrib = None
188
189 ##
190 # (Attribute) Text before first subelement. This is either a
191 # string or the value None. Note that if there was no text, this
192 # attribute may be either None or an empty string, depending on
193 # the parser.
194
195 text = None
196
197 ##
198 # (Attribute) Text after this element's end tag, but before the
199 # next sibling element's start tag. This is either a string or
200 # the value None. Note that if there was no text, this attribute
201 # may be either None or an empty string, depending on the parser.
202
203 tail = None # text after end tag, if any
204
205 # constructor
206
208 attrib = attrib.copy()
209 attrib.update(extra)
210 self.tag = tag
211 self.attrib = attrib
212 self._children = []
213
215 return "<Element %s at 0x%x>" % (repr(self.tag), id(self))
216
217 ##
218 # Creates a new element object of the same type as this element.
219 #
220 # @param tag Element tag.
221 # @param attrib Element attributes, given as a dictionary.
222 # @return A new element instance.
223
226
227 ##
228 # (Experimental) Copies the current element. This creates a
229 # shallow copy; subelements will be shared with the original tree.
230 #
231 # @return A new element instance.
232
234 elem = self.makeelement(self.tag, self.attrib)
235 elem.text = self.text
236 elem.tail = self.tail
237 elem[:] = self
238 return elem
239
240 ##
241 # Returns the number of subelements. Note that this only counts
242 # full elements; to check if there's any content in an element, you
243 # have to check both the length and the <b>text</b> attribute.
244 #
245 # @return The number of subelements.
246
249
251 warnings.warn(
252 "The behavior of this method will change in future versions. "
253 "Use specific 'len(elem)' or 'elem is not None' test instead.",
254 FutureWarning, stacklevel=2
255 )
256 return len(self._children) != 0 # emulate old behaviour, for now
257
258 ##
259 # Returns the given subelement, by index.
260 #
261 # @param index What subelement to return.
262 # @return The given subelement.
263 # @exception IndexError If the given element does not exist.
264
266 return self._children[index]
267
268 ##
269 # Replaces the given subelement, by index.
270 #
271 # @param index What subelement to replace.
272 # @param element The new element value.
273 # @exception IndexError If the given element does not exist.
274
276 # if isinstance(index, slice):
277 # for elt in element:
278 # assert iselement(elt)
279 # else:
280 # assert iselement(element)
281 self._children[index] = element
282
283 ##
284 # Deletes the given subelement, by index.
285 #
286 # @param index What subelement to delete.
287 # @exception IndexError If the given element does not exist.
288
290 del self._children[index]
291
292 ##
293 # Adds a subelement to the end of this element. In document order,
294 # the new element will appear after the last existing subelement (or
295 # directly after the text, if it's the first subelement), but before
296 # the end tag for this element.
297 #
298 # @param element The element to add.
299
303
304 ##
305 # Appends subelements from a sequence.
306 #
307 # @param elements A sequence object with zero or more elements.
308 # @since 1.3
309
314
315 ##
316 # Inserts a subelement at the given position in this element.
317 #
318 # @param index Where to insert the new subelement.
319
323
324 ##
325 # Removes a matching subelement. Unlike the <b>find</b> methods,
326 # this method compares elements based on identity, not on tag
327 # value or contents. To remove subelements by other means, the
328 # easiest way is often to use a list comprehension to select what
329 # elements to keep, and use slice assignment to update the parent
330 # element.
331 #
332 # @param element What element to remove.
333 # @exception ValueError If a matching element could not be found.
334
338
339 ##
340 # (Deprecated) Returns all subelements. The elements are returned
341 # in document order.
342 #
343 # @return A list of subelements.
344 # @defreturn list of Element instances
345
347 warnings.warn(
348 "This method will be removed in future versions. "
349 "Use 'list(elem)' or iteration over elem instead.",
350 DeprecationWarning, stacklevel=2
351 )
352 return self._children
353
354 ##
355 # Finds the first matching subelement, by tag name or path.
356 #
357 # @param path What element to look for.
358 # @keyparam namespaces Optional namespace prefix map.
359 # @return The first matching element, or None if no element was found.
360 # @defreturn Element or None
361
364
365 ##
366 # Finds text for the first matching subelement, by tag name or path.
367 #
368 # @param path What element to look for.
369 # @param default What to return if the element was not found.
370 # @keyparam namespaces Optional namespace prefix map.
371 # @return The text content of the first matching element, or the
372 # default value if no element was found. Note that if the element
373 # is found, but has no text content, this method returns an
374 # empty string.
375 # @defreturn string
376
379
380 ##
381 # Finds all matching subelements, by tag name or path.
382 #
383 # @param path What element to look for.
384 # @keyparam namespaces Optional namespace prefix map.
385 # @return A list or other sequence containing all matching elements,
386 # in document order.
387 # @defreturn list of Element instances
388
391
392 ##
393 # Finds all matching subelements, by tag name or path.
394 #
395 # @param path What element to look for.
396 # @keyparam namespaces Optional namespace prefix map.
397 # @return An iterator or sequence containing all matching elements,
398 # in document order.
399 # @defreturn a generated sequence of Element instances
400
403
404 ##
405 # Resets an element. This function removes all subelements, clears
406 # all attributes, and sets the <b>text</b> and <b>tail</b> attributes
407 # to None.
408
413
414 ##
415 # Gets an element attribute. Equivalent to <b>attrib.get</b>, but
416 # some implementations may handle this a bit more efficiently.
417 #
418 # @param key What attribute to look for.
419 # @param default What to return if the attribute was not found.
420 # @return The attribute value, or the default value, if the
421 # attribute was not found.
422 # @defreturn string or None
423
426
427 ##
428 # Sets an element attribute. Equivalent to <b>attrib[key] = value</b>,
429 # but some implementations may handle this a bit more efficiently.
430 #
431 # @param key What attribute to set.
432 # @param value The attribute value.
433
436
437 ##
438 # Gets a list of attribute names. The names are returned in an
439 # arbitrary order (just like for an ordinary Python dictionary).
440 # Equivalent to <b>attrib.keys()</b>.
441 #
442 # @return A list of element attribute names.
443 # @defreturn list of strings
444
447
448 ##
449 # Gets element attributes, as a sequence. The attributes are
450 # returned in an arbitrary order. Equivalent to <b>attrib.items()</b>.
451 #
452 # @return A list of (name, value) tuples for all attributes.
453 # @defreturn list of (string, string) tuples
454
457
458 ##
459 # Creates a tree iterator. The iterator loops over this element
460 # and all subelements, in document order, and returns all elements
461 # with a matching tag.
462 # <p>
463 # If the tree structure is modified during iteration, new or removed
464 # elements may or may not be included. To get a stable set, use the
465 # list() function on the iterator, and loop over the resulting list.
466 #
467 # @param tag What tags to look for (default is to return all elements).
468 # @return An iterator containing all the matching elements.
469 # @defreturn iterator
470
472 if tag == "*":
473 tag = None
474 if tag is None or self.tag == tag:
475 yield self
476 for e in self._children:
477 for e in e.iter(tag):
478 yield e
479
480 # compatibility
482 # Change for a DeprecationWarning in 1.4
483 warnings.warn(
484 "This method will be removed in future versions. "
485 "Use 'elem.iter()' or 'list(elem.iter())' instead.",
486 PendingDeprecationWarning, stacklevel=2
487 )
488 return list(self.iter(tag))
489
490 ##
491 # Creates a text iterator. The iterator loops over this element
492 # and all subelements, in document order, and returns all inner
493 # text.
494 #
495 # @return An iterator containing all inner text.
496 # @defreturn iterator
497
509
510 # compatibility
511 _Element = _ElementInterface = Element
512
513 ##
514 # Subelement factory. This function creates an element instance, and
515 # appends it to an existing element.
516 # <p>
517 # The element name, attribute names, and attribute values can be
518 # either 8-bit ASCII strings or Unicode strings.
519 #
520 # @param parent The parent element.
521 # @param tag The subelement name.
522 # @param attrib An optional dictionary, containing element attributes.
523 # @param **extra Additional attributes, given as keyword arguments.
524 # @return An element instance.
525 # @defreturn Element
526
528 attrib = attrib.copy()
529 attrib.update(extra)
530 element = parent.makeelement(tag, attrib)
531 parent.append(element)
532 return element
533
534 ##
535 # Comment element factory. This factory function creates a special
536 # element that will be serialized as an XML comment by the standard
537 # serializer.
538 # <p>
539 # The comment string can be either an 8-bit ASCII string or a Unicode
540 # string.
541 #
542 # @param text A string containing the comment string.
543 # @return An element instance, representing a comment.
544 # @defreturn Element
545
550
551 ##
552 # PI element factory. This factory function creates a special element
553 # that will be serialized as an XML processing instruction by the standard
554 # serializer.
555 #
556 # @param target A string containing the PI target.
557 # @param text A string containing the PI contents, if any.
558 # @return An element instance, representing a PI.
559 # @defreturn Element
560
562 element = Element(ProcessingInstruction)
563 element.text = target
564 if text:
565 element.text = element.text + " " + text
566 return element
567
568 PI = ProcessingInstruction
569
570 ##
571 # QName wrapper. This can be used to wrap a QName attribute value, in
572 # order to get proper namespace handling on output.
573 #
574 # @param text A string containing the QName value, in the form {uri}local,
575 # or, if the tag argument is given, the URI part of a QName.
576 # @param tag Optional tag. If given, the first argument is interpreted as
577 # a URI, and this argument is interpreted as a local name.
578 # @return An opaque object, representing the QName.
579
593
594 # --------------------------------------------------------------------
595
596 ##
597 # ElementTree wrapper class. This class represents an entire element
598 # hierarchy, and adds some extra support for serialization to and from
599 # standard XML.
600 #
601 # @param element Optional root element.
602 # @keyparam file Optional file handle or file name. If given, the
603 # tree is initialized with the contents of this XML file.
604
606
608 # assert element is None or iselement(element)
609 self._root = element # first node
610 if file:
611 self.parse(file)
612
613 ##
614 # Gets the root element for this tree.
615 #
616 # @return An element instance.
617 # @defreturn Element
618
621
622 ##
623 # Replaces the root element for this tree. This discards the
624 # current contents of the tree, and replaces it with the given
625 # element. Use with care.
626 #
627 # @param element An element instance.
628
632
633 ##
634 # Loads an external XML document into this element tree.
635 #
636 # @param source A file name or file object. If a file object is
637 # given, it only has to implement a <b>read(n)</b> method.
638 # @keyparam parser An optional parser instance. If not given, the
639 # standard {@link XMLParser} parser is used.
640 # @return The document root element.
641 # @defreturn Element
642 # @exception ParseError If the parser fails to parse the document.
643
645 close_source = False
646 if not hasattr(source, "read"):
647 source = open(source, "rb")
648 close_source = True
649 try:
650 if not parser:
651 parser = XMLParser(target=TreeBuilder())
652 while 1:
653 data = source.read(65536)
654 if not data:
655 break
656 parser.feed(data)
657 self._root = parser.close()
658 return self._root
659 finally:
660 if close_source:
661 source.close()
662
663 ##
664 # Creates a tree iterator for the root element. The iterator loops
665 # over all elements in this tree, in document order.
666 #
667 # @param tag What tags to look for (default is to return all elements)
668 # @return An iterator.
669 # @defreturn iterator
670
674
675 # compatibility
677 # Change for a DeprecationWarning in 1.4
678 warnings.warn(
679 "This method will be removed in future versions. "
680 "Use 'tree.iter()' or 'list(tree.iter())' instead.",
681 PendingDeprecationWarning, stacklevel=2
682 )
683 return list(self.iter(tag))
684
685 ##
686 # Finds the first toplevel element with given tag.
687 # Same as getroot().find(path).
688 #
689 # @param path What element to look for.
690 # @keyparam namespaces Optional namespace prefix map.
691 # @return The first matching element, or None if no element was found.
692 # @defreturn Element or None
693
695 # assert self._root is not None
696 if path[:1] == "/":
697 path = "." + path
698 warnings.warn(
699 "This search is broken in 1.3 and earlier, and will be "
700 "fixed in a future version. If you rely on the current "
701 "behaviour, change it to %r" % path,
702 FutureWarning, stacklevel=2
703 )
704 return self._root.find(path, namespaces)
705
706 ##
707 # Finds the element text for the first toplevel element with given
708 # tag. Same as getroot().findtext(path).
709 #
710 # @param path What toplevel element to look for.
711 # @param default What to return if the element was not found.
712 # @keyparam namespaces Optional namespace prefix map.
713 # @return The text content of the first matching element, or the
714 # default value if no element was found. Note that if the element
715 # is found, but has no text content, this method returns an
716 # empty string.
717 # @defreturn string
718
720 # assert self._root is not None
721 if path[:1] == "/":
722 path = "." + path
723 warnings.warn(
724 "This search is broken in 1.3 and earlier, and will be "
725 "fixed in a future version. If you rely on the current "
726 "behaviour, change it to %r" % path,
727 FutureWarning, stacklevel=2
728 )
729 return self._root.findtext(path, default, namespaces)
730
731 ##
732 # Finds all toplevel elements with the given tag.
733 # Same as getroot().findall(path).
734 #
735 # @param path What element to look for.
736 # @keyparam namespaces Optional namespace prefix map.
737 # @return A list or iterator containing all matching elements,
738 # in document order.
739 # @defreturn list of Element instances
740
742 # assert self._root is not None
743 if path[:1] == "/":
744 path = "." + path
745 warnings.warn(
746 "This search is broken in 1.3 and earlier, and will be "
747 "fixed in a future version. If you rely on the current "
748 "behaviour, change it to %r" % path,
749 FutureWarning, stacklevel=2
750 )
751 return self._root.findall(path, namespaces)
752
753 ##
754 # Finds all matching subelements, by tag name or path.
755 # Same as getroot().iterfind(path).
756 #
757 # @param path What element to look for.
758 # @keyparam namespaces Optional namespace prefix map.
759 # @return An iterator or sequence containing all matching elements,
760 # in document order.
761 # @defreturn a generated sequence of Element instances
762
764 # assert self._root is not None
765 if path[:1] == "/":
766 path = "." + path
767 warnings.warn(
768 "This search is broken in 1.3 and earlier, and will be "
769 "fixed in a future version. If you rely on the current "
770 "behaviour, change it to %r" % path,
771 FutureWarning, stacklevel=2
772 )
773 return self._root.iterfind(path, namespaces)
774
775 ##
776 # Writes the element tree to a file, as XML.
777 #
778 # @def write(file, **options)
779 # @param file A file name, or a file object opened for writing.
780 # @param **options Options, given as keyword arguments.
781 # @keyparam encoding Optional output encoding (default is US-ASCII).
782 # @keyparam method Optional output method ("xml", "html", "text" or
783 # "c14n"; default is "xml").
784 # @keyparam xml_declaration Controls if an XML declaration should
785 # be added to the file. Use False for never, True for always,
786 # None for only if not US-ASCII or UTF-8. None is default.
787
def write(self, file_or_filename,
          # keyword arguments
          encoding=None,
          xml_declaration=None,
          default_namespace=None,
          method=None):
    """Serialize the tree rooted at ``self._root`` to a file.

    file_or_filename -- an object with a ``write`` method, or a file
        name that is opened in binary mode ("wb") and closed again
        before returning.
    encoding -- output encoding; when omitted, "utf-8" is used for the
        "c14n" method and "us-ascii" for every other method.
    xml_declaration -- True: always emit an XML declaration; False:
        never; None: only when the encoding is neither "utf-8" nor
        "us-ascii".  The declaration is only written for the "xml"
        method.
    default_namespace -- passed through to ``_namespaces``.
    method -- key into the ``_serialize`` table; defaults to "xml".

    Raises ValueError if ``method`` is not a registered serializer.
    """
    # assert self._root is not None
    if not method:
        method = "xml"
    elif method not in _serialize:
        # FIXME: raise an ImportError for c14n if ElementC14N is missing?
        raise ValueError("unknown method %r" % method)

    # Resolve the default encoding first, so that the declaration test
    # below also runs when the caller did not pass an encoding;
    # otherwise xml_declaration=True would be silently ignored.
    if not encoding:
        if method == "c14n":
            encoding = "utf-8"
        else:
            encoding = "us-ascii"

    opened_here = not hasattr(file_or_filename, "write")
    if opened_here:
        out = open(file_or_filename, "wb")
    else:
        out = file_or_filename
    try:
        write = out.write
        if xml_declaration or (xml_declaration is None and
                               encoding not in ("utf-8", "us-ascii")):
            if method == "xml":
                write("<?xml version='1.0' encoding='%s'?>\n" % encoding)
        if method == "text":
            _serialize_text(write, self._root, encoding)
        else:
            qnames, namespaces = _namespaces(
                self._root, encoding, default_namespace
            )
            serialize = _serialize[method]
            serialize(write, self._root, encoding, qnames, namespaces)
    finally:
        # Only close what this method opened, and do so even when
        # serialization fails, so the file handle is not leaked.
        if opened_here:
            out.close()
824
828
829 # --------------------------------------------------------------------
830 # serialization support
831
833 # identify namespaces used in this tree
834
835 # maps qnames to *encoded* prefix:local names
836 qnames = {None: None}
837
838 # maps uri:s to prefixes
839 namespaces = {}
840 if default_namespace:
841 namespaces[default_namespace] = ""
842
843 def encode(text):
844 return text.encode(encoding)
845
846 def add_qname(qname):
847 # calculate serialized qname representation
848 try:
849 if qname[:1] == "{":
850 uri, tag = qname[1:].rsplit("}", 1)
851 prefix = namespaces.get(uri)
852 if prefix is None:
853 prefix = _namespace_map.get(uri)
854 if prefix is None:
855 prefix = "ns%d" % len(namespaces)
856 if prefix != "xml":
857 namespaces[uri] = prefix
858 if prefix:
859 qnames[qname] = encode("%s:%s" % (prefix, tag))
860 else:
861 qnames[qname] = encode(tag) # default element
862 else:
863 if default_namespace:
864 # FIXME: can this be handled in XML 1.0?
865 raise ValueError(
866 "cannot use non-qualified names with "
867 "default_namespace option"
868 )
869 qnames[qname] = encode(qname)
870 except TypeError:
871 _raise_serialization_error(qname)
872
873 # populate qname and namespaces table
874 try:
875 iterate = elem.iter
876 except AttributeError:
877 iterate = elem.getiterator # cET compatibility
878 for elem in iterate():
879 tag = elem.tag
880 if isinstance(tag, QName):
881 if tag.text not in qnames:
882 add_qname(tag.text)
883 elif isinstance(tag, basestring):
884 if tag not in qnames:
885 add_qname(tag)
886 elif tag is not None and tag is not Comment and tag is not PI:
887 _raise_serialization_error(tag)
888 for key, value in elem.items():
889 if isinstance(key, QName):
890 key = key.text
891 if key not in qnames:
892 add_qname(key)
893 if isinstance(value, QName) and value.text not in qnames:
894 add_qname(value.text)
895 text = elem.text
896 if isinstance(text, QName) and text.text not in qnames:
897 add_qname(text.text)
898 return qnames, namespaces
899
901 tag = elem.tag
902 text = elem.text
903 if tag is Comment:
904 write("<!--%s-->" % _encode(text, encoding))
905 elif tag is ProcessingInstruction:
906 write("<?%s?>" % _encode(text, encoding))
907 else:
908 tag = qnames[tag]
909 if tag is None:
910 if text:
911 write(_escape_cdata(text, encoding))
912 for e in elem:
913 _serialize_xml(write, e, encoding, qnames, None)
914 else:
915 write("<" + tag)
916 items = elem.items()
917 if items or namespaces:
918 if namespaces:
919 for v, k in sorted(namespaces.items(),
920 key=lambda x: x[1]): # sort on prefix
921 if k:
922 k = ":" + k
923 write(" xmlns%s=\"%s\"" % (
924 k.encode(encoding),
925 _escape_attrib(v, encoding)
926 ))
927 for k, v in sorted(items): # lexical order
928 if isinstance(k, QName):
929 k = k.text
930 if isinstance(v, QName):
931 v = qnames[v.text]
932 else:
933 v = _escape_attrib(v, encoding)
934 write(" %s=\"%s\"" % (qnames[k], v))
935 if text or len(elem):
936 write(">")
937 if text:
938 write(_escape_cdata(text, encoding))
939 for e in elem:
940 _serialize_xml(write, e, encoding, qnames, None)
941 write("</" + tag + ">")
942 else:
943 write(" />")
944 if elem.tail:
945 write(_escape_cdata(elem.tail, encoding))
946
947 HTML_EMPTY = ("area", "base", "basefont", "br", "col", "frame", "hr",
948 "img", "input", "isindex", "link", "meta", "param")
949
950 try:
951 HTML_EMPTY = set(HTML_EMPTY)
952 except NameError:
953 pass
954
956 tag = elem.tag
957 text = elem.text
958 if tag is Comment:
959 write("<!--%s-->" % _escape_cdata(text, encoding))
960 elif tag is ProcessingInstruction:
961 write("<?%s?>" % _escape_cdata(text, encoding))
962 else:
963 tag = qnames[tag]
964 if tag is None:
965 if text:
966 write(_escape_cdata(text, encoding))
967 for e in elem:
968 _serialize_html(write, e, encoding, qnames, None)
969 else:
970 write("<" + tag)
971 items = elem.items()
972 if items or namespaces:
973 if namespaces:
974 for v, k in sorted(namespaces.items(),
975 key=lambda x: x[1]): # sort on prefix
976 if k:
977 k = ":" + k
978 write(" xmlns%s=\"%s\"" % (
979 k.encode(encoding),
980 _escape_attrib(v, encoding)
981 ))
982 for k, v in sorted(items): # lexical order
983 if isinstance(k, QName):
984 k = k.text
985 if isinstance(v, QName):
986 v = qnames[v.text]
987 else:
988 v = _escape_attrib_html(v, encoding)
989 # FIXME: handle boolean attributes
990 write(" %s=\"%s\"" % (qnames[k], v))
991 write(">")
992 tag = tag.lower()
993 if text:
994 if tag == "script" or tag == "style":
995 write(_encode(text, encoding))
996 else:
997 write(_escape_cdata(text, encoding))
998 for e in elem:
999 _serialize_html(write, e, encoding, qnames, None)
1000 if tag not in HTML_EMPTY:
1001 write("</" + tag + ">")
1002 if elem.tail:
1003 write(_escape_cdata(elem.tail, encoding))
1004
1006 for part in elem.itertext():
1007 write(part.encode(encoding))
1008 if elem.tail:
1009 write(elem.tail.encode(encoding))
1010
1011 _serialize = {
1012 "xml": _serialize_xml,
1013 "html": _serialize_html,
1014 "text": _serialize_text,
1015 # this optional method is imported at the end of the module
1016 # "c14n": _serialize_c14n,
1017 }
1018
1019 ##
1020 # Registers a namespace prefix. The registry is global, and any
1021 # existing mapping for either the given prefix or the namespace URI
1022 # will be removed.
1023 #
1024 # @param prefix Namespace prefix.
1025 # @param uri Namespace uri. Tags and attributes in this namespace
1026 # will be serialized with the given prefix, if at all possible.
1027 # @exception ValueError If the prefix is reserved, or is otherwise
1028 # invalid.
1029
1031 if re.match("ns\d+$", prefix):
1032 raise ValueError("Prefix format reserved for internal use")
1033 for k, v in _namespace_map.items():
1034 if k == uri or v == prefix:
1035 del _namespace_map[k]
1036 _namespace_map[uri] = prefix
1037
1038 _namespace_map = {
1039 # "well-known" namespace prefixes
1040 "http://www.w3.org/XML/1998/namespace": "xml",
1041 "http://www.w3.org/1999/xhtml": "html",
1042 "http://www.w3.org/1999/02/22-rdf-syntax-ns#": "rdf",
1043 "http://schemas.xmlsoap.org/wsdl/": "wsdl",
1044 # xml schema
1045 "http://www.w3.org/2001/XMLSchema": "xs",
1046 "http://www.w3.org/2001/XMLSchema-instance": "xsi",
1047 # dublin core
1048 "http://purl.org/dc/elements/1.1/": "dc",
1049 }
1050
1055
1057 try:
1058 return text.encode(encoding, "xmlcharrefreplace")
1059 except (TypeError, AttributeError):
1060 _raise_serialization_error(text)
1061
1063 # escape character data
1064 try:
1065 # it's worth avoiding do-nothing calls for strings that are
1066 # shorter than 500 character, or so. assume that's, by far,
1067 # the most common case in most applications.
1068 if "&" in text:
1069 text = text.replace("&", "&")
1070 if "<" in text:
1071 text = text.replace("<", "<")
1072 if ">" in text:
1073 text = text.replace(">", ">")
1074 return text.encode(encoding, "xmlcharrefreplace")
1075 except (TypeError, AttributeError):
1076 _raise_serialization_error(text)
1077
1079 # escape attribute value
1080 try:
1081 if "&" in text:
1082 text = text.replace("&", "&")
1083 if "<" in text:
1084 text = text.replace("<", "<")
1085 if ">" in text:
1086 text = text.replace(">", ">")
1087 if "\"" in text:
1088 text = text.replace("\"", """)
1089 if "\n" in text:
1090 text = text.replace("\n", " ")
1091 return text.encode(encoding, "xmlcharrefreplace")
1092 except (TypeError, AttributeError):
1093 _raise_serialization_error(text)
1094
1096 # escape attribute value
1097 try:
1098 if "&" in text:
1099 text = text.replace("&", "&")
1100 if ">" in text:
1101 text = text.replace(">", ">")
1102 if "\"" in text:
1103 text = text.replace("\"", """)
1104 return text.encode(encoding, "xmlcharrefreplace")
1105 except (TypeError, AttributeError):
1106 _raise_serialization_error(text)
1107
1108 # --------------------------------------------------------------------
1109
1110 ##
1111 # Generates a string representation of an XML element, including all
1112 # subelements.
1113 #
1114 # @param element An Element instance.
1115 # @keyparam encoding Optional output encoding (default is US-ASCII).
1116 # @keyparam method Optional output method ("xml", "html", "text" or
1117 # "c14n"; default is "xml").
1118 # @return An encoded string containing the XML data.
1119 # @defreturn string
1120
def tostring(element, encoding=None, method=None):
    """Serialize *element* and all its subelements to a single string.

    *encoding* and *method* are passed through to ``ElementTree.write``.
    The fragments written by the serializer are collected in a list and
    joined once at the end.
    """
    class dummy:
        # minimal file-like object; only a ``write`` attribute is needed
        pass
    data = []
    file = dummy()
    file.write = data.append
    ElementTree(element).write(file, encoding, method=method)
    return "".join(data)
1129
1130 ##
1131 # Generates a string representation of an XML element, including all
1132 # subelements. The string is returned as a sequence of string fragments.
1133 #
1134 # @param element An Element instance.
1135 # @keyparam encoding Optional output encoding (default is US-ASCII).
1136 # @keyparam method Optional output method ("xml", "html", "text" or
1137 # "c14n"; default is "xml").
1138 # @return A sequence object containing the XML data.
1139 # @defreturn sequence
1140 # @since 1.3
1141
def tostringlist(element, encoding=None, method=None):
    """Serialize *element* and return the output as a list of fragments.

    Works like ``tostring``, but the fragments written by
    ``ElementTree.write`` are returned as-is instead of being joined.
    """
    class dummy:
        # minimal file-like object; only a ``write`` attribute is needed
        pass
    data = []
    file = dummy()
    file.write = data.append
    ElementTree(element).write(file, encoding, method=method)
    # FIXME: merge small fragments into larger parts
    return data
1151
1152 ##
1153 # Writes an element tree or element structure to sys.stdout. This
1154 # function should be used for debugging only.
1155 # <p>
1156 # The exact output format is implementation dependent. In this
1157 # version, it's written as an ordinary XML file.
1158 #
1159 # @param elem An element tree or an individual element.
1160
def dump(elem):
    """Write an element tree or a single element to ``sys.stdout``.

    Intended for debugging.  A bare element is wrapped in an
    ``ElementTree`` first.  A trailing newline is added unless the root
    element's tail already ends with one.
    """
    # debugging
    if not isinstance(elem, ElementTree):
        elem = ElementTree(elem)
    elem.write(sys.stdout)
    tail = elem.getroot().tail
    if not tail or tail[-1] != "\n":
        sys.stdout.write("\n")
1169
1170 # --------------------------------------------------------------------
1171 # parsing
1172
1173 ##
1174 # Parses an XML document into an element tree.
1175 #
1176 # @param source A filename or file object containing XML data.
1177 # @param parser An optional parser instance. If not given, the
1178 # standard {@link XMLParser} parser is used.
1179 # @return An ElementTree instance
1180
1185
1186 ##
1187 # Parses an XML document into an element tree incrementally, and reports
1188 # what's going on to the user.
1189 #
1190 # @param source A filename or file object containing XML data.
1191 # @param events A list of events to report back. If omitted, only "end"
1192 # events are reported.
1193 # @param parser An optional parser instance. If not given, the
1194 # standard {@link XMLParser} parser is used.
1195 # @return A (event, elem) iterator.
1196
def iterparse(source, events=None, parser=None):
    """Parse an XML document incrementally and report events.

    *source* is a filename or an object with a ``read`` method.  A
    filename is opened in binary mode here, and the returned iterator is
    told to close it.  *events* and *parser* are passed on to
    ``_IterParseIterator``; when no parser is given, an ``XMLParser``
    feeding a fresh ``TreeBuilder`` is created.

    Returns the ``_IterParseIterator`` instance.
    """
    close_source = False
    if not hasattr(source, "read"):
        source = open(source, "rb")
        close_source = True
    try:
        if not parser:
            parser = XMLParser(target=TreeBuilder())
        return _IterParseIterator(source, events, parser, close_source)
    except BaseException:
        # Setting up the parser or the iterator failed (for example an
        # unknown event name); do not leak a file that was opened here.
        if close_source:
            source.close()
        raise
1205
1207
def __init__(self, source, events, parser, close_source=False):
    """Set up the iterator state and hook event handlers into the parser.

    *source* is the file object to read from, *events* the list of event
    names to report (``None`` means ``["end"]``), *parser* the
    ``XMLParser``-like object whose underlying ``_parser`` gets the
    handlers, and *close_source* tells whether the file is to be closed
    when iteration is finished.

    Raises ``ValueError`` for an unknown event name.
    """
    self._file = source
    self._close_file = close_source
    self._events = []
    self._index = 0
    self._error = None
    self.root = self._root = None
    self._parser = parser
    # wire up the parser for event reporting
    parser = self._parser._parser
    append = self._events.append
    if events is None:
        events = ["end"]
    for event in events:
        # The handlers bind ``event`` and ``append`` as default
        # arguments, so each closure keeps the value of this iteration.
        if event == "start":
            try:
                parser.ordered_attributes = 1
                parser.specified_attributes = 1
                def handler(tag, attrib_in, event=event, append=append,
                            start=self._parser._start_list):
                    append((event, start(tag, attrib_in)))
                parser.StartElementHandler = handler
            except AttributeError:
                # fall back to the dictionary-based start handler
                def handler(tag, attrib_in, event=event, append=append,
                            start=self._parser._start):
                    append((event, start(tag, attrib_in)))
                parser.StartElementHandler = handler
        elif event == "end":
            def handler(tag, event=event, append=append,
                        end=self._parser._end):
                append((event, end(tag)))
            parser.EndElementHandler = handler
        elif event == "start-ns":
            def handler(prefix, uri, event=event, append=append):
                try:
                    uri = (uri or "").encode("ascii")
                except UnicodeError:
                    pass
                append((event, (prefix or "", uri or "")))
            parser.StartNamespaceDeclHandler = handler
        elif event == "end-ns":
            def handler(prefix, event=event, append=append):
                append((event, None))
            parser.EndNamespaceDeclHandler = handler
        else:
            raise ValueError("unknown event %r" % event)
1254
def next(self):
    """Return the next ``(event, value)`` item.

    Buffered events are handed out first.  When the buffer is empty, a
    pending parse error is raised, or another chunk is read from the
    file and fed to the parser.  At end of data the parser is closed,
    ``root`` is set, the file is closed if requested, and
    ``StopIteration`` is raised.
    """
    while 1:
        try:
            item = self._events[self._index]
            self._index += 1
            return item
        except IndexError:
            pass
        if self._error:
            # report the error only after the events that preceded it
            e = self._error
            self._error = None
            raise e
        if self._parser is None:
            self.root = self._root
            if self._close_file:
                self._file.close()
            raise StopIteration
        # load event buffer
        del self._events[:]
        self._index = 0
        data = self._file.read(16384)
        if data:
            try:
                self._parser.feed(data)
            except SyntaxError as exc:
                self._error = exc
        else:
            self._root = self._parser.close()
            self._parser = None
1284
1287
1288 ##
1289 # Parses an XML document from a string constant. This function can
1290 # be used to embed "XML literals" in Python code.
1291 #
1292 # @param source A string containing XML data.
1293 # @param parser An optional parser instance. If not given, the
1294 # standard {@link XMLParser} parser is used.
1295 # @return An Element instance.
1296 # @defreturn Element
1297
def XML(text, parser=None):
    """Parse an XML document from the string *text*.

    When no *parser* is given, an ``XMLParser`` feeding a fresh
    ``TreeBuilder`` is used.  Returns whatever the parser's ``close``
    method returns.
    """
    if not parser:
        parser = XMLParser(target=TreeBuilder())
    parser.feed(text)
    return parser.close()
1303
1304 ##
1305 # Parses an XML document from a string constant, and also returns
1306 # a dictionary which maps from element id:s to elements.
1307 #
1308 # @param source A string containing XML data.
1309 # @param parser An optional parser instance. If not given, the
1310 # standard {@link XMLParser} parser is used.
1311 # @return A tuple containing an Element instance and a dictionary.
1312 # @defreturn (Element, dictionary)
1313
def XMLID(text, parser=None):
    """Parse *text* and also collect the elements that carry an ``id``.

    When no *parser* is given, an ``XMLParser`` feeding a fresh
    ``TreeBuilder`` is used.  Returns a ``(tree, ids)`` tuple, where
    ``ids`` maps each non-empty ``id`` attribute value to its element.
    If an id occurs more than once, the last element wins.
    """
    if not parser:
        parser = XMLParser(target=TreeBuilder())
    parser.feed(text)
    tree = parser.close()
    ids = {}
    for elem in tree.iter():
        elem_id = elem.get("id")
        if elem_id:
            ids[elem_id] = elem
    return tree, ids
1325
1326 ##
1327 # Parses an XML document from a string constant. Same as {@link #XML}.
1328 #
1329 # @def fromstring(text)
1330 # @param source A string containing XML data.
1331 # @return An Element instance.
1332 # @defreturn Element
1333
# fromstring is the same function object as XML
fromstring = XML
1335
1336 ##
1337 # Parses an XML document from a sequence of string fragments.
1338 #
1339 # @param sequence A list or other sequence containing XML data fragments.
1340 # @param parser An optional parser instance. If not given, the
1341 # standard {@link XMLParser} parser is used.
1342 # @return An Element instance.
1343 # @defreturn Element
1344 # @since 1.3
1345
def fromstringlist(sequence, parser=None):
    """Parse an XML document from a sequence of string fragments.

    Each fragment is fed to the parser in order.  When no *parser* is
    given, an ``XMLParser`` feeding a fresh ``TreeBuilder`` is used.
    Returns whatever the parser's ``close`` method returns.
    """
    if not parser:
        parser = XMLParser(target=TreeBuilder())
    for text in sequence:
        parser.feed(text)
    return parser.close()
1352
1353 # --------------------------------------------------------------------
1354
1355 ##
1356 # Generic element structure builder. This builder converts a sequence
1357 # of {@link #TreeBuilder.start}, {@link #TreeBuilder.data}, and {@link
1358 # #TreeBuilder.end} method calls to a well-formed element structure.
1359 # <p>
1360 # You can use this class to build an element structure using a custom XML
1361 # parser, or a parser for some other XML-like format.
1362 #
1363 # @param element_factory Optional element factory. This factory
1364 # is called to create new Element instances, as necessary.
1365
1367
def __init__(self, element_factory=None):
    """Initialise an empty builder.

    *element_factory* is the callable used to create elements; it
    defaults to ``Element``.
    """
    self._data = [] # data collector
    self._elem = [] # element stack
    self._last = None # last element
    self._tail = None # true if we're after an end tag
    if element_factory is None:
        element_factory = Element
    self._factory = element_factory
1376
1377 ##
1378 # Flushes the builder buffers, and returns the toplevel document
1379 # element.
1380 #
1381 # @return An Element instance.
1382 # @defreturn Element
1383
def close(self):
    """Return the toplevel element that was built.

    Asserts that no element is still open and that at least one element
    was created.
    """
    assert len(self._elem) == 0, "missing end tags"
    assert self._last is not None, "missing toplevel element"
    return self._last
1388
def _flush(self):
    """Attach the collected character data to the last element.

    The joined text becomes the element's ``tail`` when an end tag was
    seen last, and its ``text`` otherwise.  Data collected before any
    element exists is dropped.  The collector is reset afterwards.
    """
    if self._data:
        if self._last is not None:
            text = "".join(self._data)
            if self._tail:
                assert self._last.tail is None, "internal error (tail)"
                self._last.tail = text
            else:
                assert self._last.text is None, "internal error (text)"
                self._last.text = text
        self._data = []
1400
1401 ##
1402 # Adds text to the current element.
1403 #
1404 # @param data A string. This should be either an 8-bit string
1405 # containing ASCII text, or a Unicode string.
1406
1409
1410 ##
1411 # Opens a new element.
1412 #
1413 # @param tag The element name.
1414 # @param attrib A dictionary containing element attributes.
1415 # @return The opened element.
1416 # @defreturn Element
1417
def start(self, tag, attrs):
    """Open a new element and return it.

    Pending character data is flushed first.  The element is created by
    the factory from *tag* and *attrs*, appended to the currently open
    element (if any), and pushed onto the element stack.
    """
    self._flush()
    self._last = elem = self._factory(tag, attrs)
    if self._elem:
        self._elem[-1].append(elem)
    self._elem.append(elem)
    self._tail = 0
    return elem
1426
1427 ##
1428 # Closes the current element.
1429 #
1430 # @param tag The element name.
1431 # @return The closed element.
1432 # @defreturn Element
1433
1442
1443 ##
1444 # Element structure builder for XML source data, based on the
1445 # <b>expat</b> parser.
1446 #
1447 # @keyparam target Target object. If omitted, the builder uses an
1448 # instance of the standard {@link #TreeBuilder} class.
1449 # @keyparam html Predefine HTML entities. This flag is not supported
1450 # by the current implementation.
1451 # @keyparam encoding Optional encoding. If given, the value overrides
1452 # the encoding specified in the XML file.
1453 # @see #ElementTree
1454 # @see #TreeBuilder
1455
1457
def __init__(self, html=0, target=None, encoding=None):
    """Create the expat parser and connect it to *target*.

    *html* is accepted but not used by this implementation.  *target*
    defaults to a fresh ``TreeBuilder``.  *encoding*, if given, is
    passed to ``expat.ParserCreate``.

    Raises ``ImportError`` if no expat module can be imported.
    """
    try:
        from xml.parsers import expat
    except ImportError:
        try:
            import pyexpat as expat
        except ImportError:
            raise ImportError(
                "No module named expat; use SimpleXMLTreeBuilder instead"
                )
    # "}" separates namespace URI and local name in the names reported
    # by expat; _fixname prepends the matching "{".
    parser = expat.ParserCreate(encoding, "}")
    if target is None:
        target = TreeBuilder()
    # underscored names are provided for compatibility only
    self.parser = self._parser = parser
    self.target = self._target = target
    self._error = expat.error
    self._names = {} # name memo cache
    # callbacks
    parser.DefaultHandlerExpand = self._default
    parser.StartElementHandler = self._start
    parser.EndElementHandler = self._end
    parser.CharacterDataHandler = self._data
    # optional callbacks
    parser.CommentHandler = self._comment
    parser.ProcessingInstructionHandler = self._pi
    # let expat do the buffering, if supported
    try:
        self._parser.buffer_text = 1
    except AttributeError:
        pass
    # use new-style attribute handling, if supported
    try:
        self._parser.ordered_attributes = 1
        self._parser.specified_attributes = 1
        parser.StartElementHandler = self._start_list
    except AttributeError:
        pass
    self._doctype = None
    self.entity = {}
    try:
        self.version = "Expat %d.%d.%d" % expat.version_info
    except AttributeError:
        pass # unknown
1502
def _raiseerror(self, value):
    """Re-raise the expat error *value* as a ``ParseError``.

    The new exception carries the original error code as ``code`` and
    the ``(lineno, offset)`` pair as ``position``.
    """
    err = ParseError(value)
    err.code = value.code
    err.position = value.lineno, value.offset
    raise err
1508
def _fixtext(self, text):
    """Return *text* encoded as ASCII, or unchanged if that fails."""
    # convert text string to ascii, if possible
    try:
        return text.encode("ascii")
    except UnicodeError:
        return text
1515
def _fixname(self, key):
    """Return the normalised form of the expat name *key*.

    A name containing ``}`` gets a leading ``{``, which yields the
    ``{uri}local`` form; the result is then passed through ``_fixtext``.
    Results are memoised in ``self._names``.
    """
    # expand qname, and convert name string to ascii, if possible
    try:
        name = self._names[key]
    except KeyError:
        name = key
        if "}" in name:
            name = "{" + name
        self._names[key] = name = self._fixtext(name)
    return name
1526
def _start(self, tag, attrib_in):
    """Handle a start tag whose attributes arrive as a dictionary.

    Tag and attribute names go through ``_fixname``, attribute values
    through ``_fixtext``; the result is passed to ``target.start`` and
    its return value is returned.
    """
    fixname = self._fixname
    fixtext = self._fixtext
    tag = fixname(tag)
    attrib = {}
    for key, value in attrib_in.items():
        attrib[fixname(key)] = fixtext(value)
    return self.target.start(tag, attrib)
1535
def _start_list(self, tag, attrib_in):
    """Handle a start tag whose attributes arrive as a flat list.

    *attrib_in* alternates names and values
    (``[name0, value0, name1, value1, ...]``) and may be empty or
    ``None``.  Names go through ``_fixname``, values through
    ``_fixtext``; the result is passed to ``target.start`` and its
    return value is returned.
    """
    fixname = self._fixname
    fixtext = self._fixtext
    tag = fixname(tag)
    attrib = {}
    if attrib_in:
        for i in range(0, len(attrib_in), 2):
            attrib[fixname(attrib_in[i])] = fixtext(attrib_in[i+1])
    return self.target.start(tag, attrib)
1545
1548
1551
def _comment(self, data):
    """Forward a comment to ``target.comment``, if the target has one.

    The text goes through ``_fixtext`` first.  Targets without a
    ``comment`` attribute are silently skipped.
    """
    try:
        comment = self.target.comment
    except AttributeError:
        pass
    else:
        return comment(self._fixtext(data))
1559
def _pi(self, target, data):
    """Forward a processing instruction to ``target.pi``, if present.

    Both the PI target and its data go through ``_fixtext`` first.
    Builder targets without a ``pi`` attribute are silently skipped.
    """
    try:
        pi = self.target.pi
    except AttributeError:
        pass
    else:
        return pi(self._fixtext(target), self._fixtext(data))
1567
def _default(self, text):
    """Handle text expat passes to the default handler.

    Two cases are handled here:

    * an entity reference (``&name;``) is looked up in ``self.entity``
      and its replacement passed to ``target.data``; an unknown entity
      raises an expat error with code 11 and the current position;
    * the tokens of a ``<!DOCTYPE ...>`` declaration are collected, and
      once a complete ``PUBLIC`` or ``SYSTEM`` declaration has been
      seen, the doctype is reported to ``target.doctype`` if it exists,
      or else to an overriding ``self.doctype``.
    """
    prefix = text[:1]
    if prefix == "&":
        # deal with undefined entities
        try:
            self.target.data(self.entity[text[1:-1]])
        except KeyError:
            from xml.parsers import expat
            err = expat.error(
                "undefined entity %s: line %d, column %d" %
                (text, self._parser.ErrorLineNumber,
                 self._parser.ErrorColumnNumber)
                )
            err.code = 11 # XML_ERROR_UNDEFINED_ENTITY
            err.lineno = self._parser.ErrorLineNumber
            err.offset = self._parser.ErrorColumnNumber
            raise err
    elif prefix == "<" and text[:9] == "<!DOCTYPE":
        self._doctype = [] # inside a doctype declaration
    elif self._doctype is not None:
        # parse doctype contents
        if prefix == ">":
            self._doctype = None
            return
        text = text.strip()
        if not text:
            return
        self._doctype.append(text)
        n = len(self._doctype)
        if n > 2:
            kind = self._doctype[1]
            if kind == "PUBLIC" and n == 4:
                name, kind, pubid, system = self._doctype
            elif kind == "SYSTEM" and n == 3:
                name, kind, system = self._doctype
                pubid = None
            else:
                return
            if pubid:
                pubid = pubid[1:-1] # strip the surrounding quotes
            if hasattr(self.target, "doctype"):
                self.target.doctype(name, pubid, system[1:-1])
            elif self.doctype != self._XMLParser__doctype:
                # Compare by equality, not identity: every attribute
                # access creates a new bound method object, so an "is
                # not" test is always true and would fire the deprecated
                # hook even when doctype() has not been overridden.
                # warn about deprecated call
                self._XMLParser__doctype(name, pubid, system[1:-1])
                self.doctype(name, pubid, system[1:-1])
            self._doctype = None
1615
1616 ##
1617 # (Deprecated) Handles a doctype declaration.
1618 #
1619 # @param name Doctype name.
1620 # @param pubid Public identifier.
1621 # @param system System identifier.
1622
def doctype(self, name, pubid, system):
    """This method of XMLParser is deprecated.

    It only issues a ``DeprecationWarning``; the arguments are ignored.
    Define ``doctype()`` on the TreeBuilder target instead.
    """
    warnings.warn(
        "This method of XMLParser is deprecated.  Define doctype() "
        "method on the TreeBuilder target.",
        DeprecationWarning,
        )

# sentinel, if doctype is redefined in a subclass
__doctype = doctype
1633
1634 ##
1635 # Feeds data to the parser.
1636 #
1637 # @param data Encoded data.
1638
1644
1645 ##
1646 # Finishes feeding data to the parser.
1647 #
1648 # @return An element structure.
1649 # @defreturn Element
1650
1659
# compatibility
XMLTreeBuilder = XMLParser

# workaround circular import.
# The C14N serializer is optional; it is registered in the serializer
# table only when the ElementC14N module can be imported.
try:
    from ElementC14N import _serialize_c14n
    _serialize["c14n"] = _serialize_c14n
except ImportError:
    pass
1669
| Home | Trees | Indices | Help |
|
|---|
| Generated by Epydoc 3.0.1 on Sun Apr 28 21:00:13 2013 | http://epydoc.sourceforge.net |