Coverage Summary for Class: XmlEscapers (com.google.common.xml)

Class	Class, %	Method, %	Line, %
XmlEscapers	0% (0/1)	0% (0/4)	0% (0/20)
1 /*
2  * Copyright (C) 2009 The Guava Authors
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
5  * in compliance with the License. You may obtain a copy of the License at
6  *
7  * http://www.apache.org/licenses/LICENSE-2.0
8  *
9  * Unless required by applicable law or agreed to in writing, software distributed under the License
10  * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
11  * or implied. See the License for the specific language governing permissions and limitations under
12  * the License.
13  */
14 
15 package com.google.common.xml;
16 
17 import com.google.common.annotations.Beta;
18 import com.google.common.annotations.GwtCompatible;
19 import com.google.common.escape.Escaper;
20 import com.google.common.escape.Escapers;
21 
22 /**
23  * {@code Escaper} instances suitable for strings to be included in XML attribute values and
24  * elements' text contents. When possible, avoid manual escaping by using templating systems and
25  * high-level APIs that provide autoescaping. For example, consider <a
26  * href="http://www.xom.nu/">XOM</a> or <a href="http://www.jdom.org/">JDOM</a>.
27  *
28  * <p><b>Note:</b> Currently the escapers provided by this class do not escape any characters
29  * outside the ASCII character range. Unlike HTML escaping, the XML escapers will not escape
30  * non-ASCII characters to their numeric entity replacements. These XML escapers provide the minimal
31  * level of escaping to ensure that the output can be safely included in a Unicode XML document.
32  *
33  * <p>For details on the behavior of the escapers in this class, see sections <a
34  * href="http://www.w3.org/TR/2008/REC-xml-20081126/#charsets">2.2</a> and <a
35  * href="http://www.w3.org/TR/2008/REC-xml-20081126/#syntax">2.4</a> of the XML specification.
36  *
37  * @author Alex Matevossian
38  * @author David Beaumont
39  * @since 15.0
40  */
41 @Beta
42 @GwtCompatible
43 @ElementTypesAreNonnullByDefault
44 public class XmlEscapers {
45   private XmlEscapers() {}
46 
47   private static final char MIN_ASCII_CONTROL_CHAR = 0x00;
48   private static final char MAX_ASCII_CONTROL_CHAR = 0x1F;
49 
50   // For each xxxEscaper() method, please add links to external reference pages
51   // that are considered authoritative for the behavior of that escaper.
52 
53   /**
54    * Returns an {@link Escaper} instance that escapes special characters in a string so it can
55    * safely be included in an XML document as element content. See section <a
56    * href="http://www.w3.org/TR/2008/REC-xml-20081126/#syntax">2.4</a> of the XML specification.
57    *
58    * <p><b>Note:</b> Double and single quotes are not escaped, so it is <b>not safe</b> to use this
59    * escaper to escape attribute values. Use {@link #xmlContentEscaper} if the output can appear in
60    * element content or {@link #xmlAttributeEscaper} in attribute values.
61    *
62    * <p>This escaper substitutes {@code 0xFFFD} for non-whitespace control characters and the
63    * character values {@code 0xFFFE} and {@code 0xFFFF} which are not permitted in XML. For more
64    * detail see section <a href="http://www.w3.org/TR/2008/REC-xml-20081126/#charsets">2.2</a> of
65    * the XML specification.
66    *
67    * <p>This escaper does not escape non-ASCII characters to their numeric character references
68    * (NCR). Any non-ASCII characters appearing in the input will be preserved in the output.
69    * Specifically "\r" (carriage return) is preserved in the output, which may result in it being
70    * silently converted to "\n" when the XML is parsed.
71    *
72    * <p>This escaper does not treat surrogate pairs specially and does not perform Unicode
73    * validation on its input.
74    */
75   public static Escaper xmlContentEscaper() {
76     return XML_CONTENT_ESCAPER;
77   }
78 
79   /**
80    * Returns an {@link Escaper} instance that escapes special characters in a string so it can
81    * safely be included in XML document as an attribute value. See section <a
82    * href="http://www.w3.org/TR/2008/REC-xml-20081126/#AVNormalize">3.3.3</a> of the XML
83    * specification.
84    *
85    * <p>This escaper substitutes {@code 0xFFFD} for non-whitespace control characters and the
86    * character values {@code 0xFFFE} and {@code 0xFFFF} which are not permitted in XML. For more
87    * detail see section <a href="http://www.w3.org/TR/2008/REC-xml-20081126/#charsets">2.2</a> of
88    * the XML specification.
89    *
90    * <p>This escaper does not escape non-ASCII characters to their numeric character references
91    * (NCR). However, horizontal tab {@code '\t'}, line feed {@code '\n'} and carriage return {@code
92    * '\r'} are escaped to a corresponding NCR {@code "&#x9;"}, {@code "&#xA;"}, and {@code "&#xD;"}
93    * respectively. Any other non-ASCII characters appearing in the input will be preserved in the
94    * output.
95    *
96    * <p>This escaper does not treat surrogate pairs specially and does not perform Unicode
97    * validation on its input.
98    */
99   public static Escaper xmlAttributeEscaper() {
100     return XML_ATTRIBUTE_ESCAPER;
101   }
102 
103   private static final Escaper XML_ESCAPER;
104   private static final Escaper XML_CONTENT_ESCAPER;
105   private static final Escaper XML_ATTRIBUTE_ESCAPER;
106 
107   static {
108     Escapers.Builder builder = Escapers.builder();
109     // The char values \uFFFE and \uFFFF are explicitly not allowed in XML
110     // (Unicode code points above \uFFFF are represented via surrogate pairs
111     // which means they are treated as pairs of safe characters).
112     builder.setSafeRange(Character.MIN_VALUE, '\uFFFD');
113     // Unsafe characters are replaced with the Unicode replacement character.
114     builder.setUnsafeReplacement("\uFFFD");
115 
116     /*
117      * Except for \n, \t, and \r, all ASCII control characters are replaced with the Unicode
118      * replacement character.
119      *
120      * Implementation note: An alternative to the following would be to make a map that simply
121      * replaces the allowed ASCII whitespace characters with themselves and to set the minimum safe
122      * character to 0x20. However this would slow down the escaping of simple strings that contain
123      * \t, \n, or \r.
124      */
125     for (char c = MIN_ASCII_CONTROL_CHAR; c <= MAX_ASCII_CONTROL_CHAR; c++) {
126       if (c != '\t' && c != '\n' && c != '\r') {
127         builder.addEscape(c, "\uFFFD");
128       }
129     }
130 
131     // Build the content escaper first and then add quote escaping for the
132     // general escaper.
133     builder.addEscape('&', "&amp;");
134     builder.addEscape('<', "&lt;");
135     builder.addEscape('>', "&gt;");
136     XML_CONTENT_ESCAPER = builder.build();
137     builder.addEscape('\'', "&apos;");
138     builder.addEscape('"', "&quot;");
139     XML_ESCAPER = builder.build();
140     builder.addEscape('\t', "&#x9;");
141     builder.addEscape('\n', "&#xA;");
142     builder.addEscape('\r', "&#xD;");
143     XML_ATTRIBUTE_ESCAPER = builder.build();
144   }
145 }