Coverage Summary for Class: XmlEscapers (com.google.common.xml)

Class Class, % Method, % Line, %
XmlEscapers 0% (0/1) 0% (0/4) 0% (0/20)


1 /* 2  * Copyright (C) 2009 The Guava Authors 3  * 4  * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except 5  * in compliance with the License. You may obtain a copy of the License at 6  * 7  * http://www.apache.org/licenses/LICENSE-2.0 8  * 9  * Unless required by applicable law or agreed to in writing, software distributed under the License 10  * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express 11  * or implied. See the License for the specific language governing permissions and limitations under 12  * the License. 13  */ 14  15 package com.google.common.xml; 16  17 import com.google.common.annotations.Beta; 18 import com.google.common.annotations.GwtCompatible; 19 import com.google.common.escape.Escaper; 20 import com.google.common.escape.Escapers; 21  22 /** 23  * {@code Escaper} instances suitable for strings to be included in XML attribute values and 24  * elements' text contents. When possible, avoid manual escaping by using templating systems and 25  * high-level APIs that provide autoescaping. For example, consider <a 26  * href="http://www.xom.nu/">XOM</a> or <a href="http://www.jdom.org/">JDOM</a>. 27  * 28  * <p><b>Note:</b> Currently the escapers provided by this class do not escape any characters 29  * outside the ASCII character range. Unlike HTML escaping the XML escapers will not escape 30  * non-ASCII characters to their numeric entity replacements. These XML escapers provide the minimal 31  * level of escaping to ensure that the output can be safely included in a Unicode XML document. 32  * 33  * <p>For details on the behavior of the escapers in this class, see sections <a 34  * href="http://www.w3.org/TR/2008/REC-xml-20081126/#charsets">2.2</a> and <a 35  * href="http://www.w3.org/TR/2008/REC-xml-20081126/#syntax">2.4</a> of the XML specification. 36  * 37  * @author Alex Matevossian 38  * @author David Beaumont 39  * @since 15.0 40  */ 41 @Beta 42 @GwtCompatible 43 @ElementTypesAreNonnullByDefault 44 public class XmlEscapers { 45  private XmlEscapers() {} 46  47  private static final char MIN_ASCII_CONTROL_CHAR = 0x00; 48  private static final char MAX_ASCII_CONTROL_CHAR = 0x1F; 49  50  // For each xxxEscaper() method, please add links to external reference pages 51  // that are considered authoritative for the behavior of that escaper. 52  53  /** 54  * Returns an {@link Escaper} instance that escapes special characters in a string so it can 55  * safely be included in an XML document as element content. See section <a 56  * href="http://www.w3.org/TR/2008/REC-xml-20081126/#syntax">2.4</a> of the XML specification. 57  * 58  * <p><b>Note:</b> Double and single quotes are not escaped, so it is <b>not safe</b> to use this 59  * escaper to escape attribute values. Use {@link #xmlContentEscaper} if the output can appear in 60  * element content or {@link #xmlAttributeEscaper} in attribute values. 61  * 62  * <p>This escaper substitutes {@code 0xFFFD} for non-whitespace control characters and the 63  * character values {@code 0xFFFE} and {@code 0xFFFF} which are not permitted in XML. For more 64  * detail see section <a href="http://www.w3.org/TR/2008/REC-xml-20081126/#charsets">2.2</a> of 65  * the XML specification. 66  * 67  * <p>This escaper does not escape non-ASCII characters to their numeric character references 68  * (NCR). Any non-ASCII characters appearing in the input will be preserved in the output. 69  * Specifically "\r" (carriage return) is preserved in the output, which may result in it being 70  * silently converted to "\n" when the XML is parsed. 71  * 72  * <p>This escaper does not treat surrogate pairs specially and does not perform Unicode 73  * validation on its input. 74  */ 75  public static Escaper xmlContentEscaper() { 76  return XML_CONTENT_ESCAPER; 77  } 78  79  /** 80  * Returns an {@link Escaper} instance that escapes special characters in a string so it can 81  * safely be included in XML document as an attribute value. See section <a 82  * href="http://www.w3.org/TR/2008/REC-xml-20081126/#AVNormalize">3.3.3</a> of the XML 83  * specification. 84  * 85  * <p>This escaper substitutes {@code 0xFFFD} for non-whitespace control characters and the 86  * character values {@code 0xFFFE} and {@code 0xFFFF} which are not permitted in XML. For more 87  * detail see section <a href="http://www.w3.org/TR/2008/REC-xml-20081126/#charsets">2.2</a> of 88  * the XML specification. 89  * 90  * <p>This escaper does not escape non-ASCII characters to their numeric character references 91  * (NCR). However, horizontal tab {@code '\t'}, line feed {@code '\n'} and carriage return {@code 92  * '\r'} are escaped to a corresponding NCR {@code "&#x9;"}, {@code "&#xA;"}, and {@code "&#xD;"} 93  * respectively. Any other non-ASCII characters appearing in the input will be preserved in the 94  * output. 95  * 96  * <p>This escaper does not treat surrogate pairs specially and does not perform Unicode 97  * validation on its input. 98  */ 99  public static Escaper xmlAttributeEscaper() { 100  return XML_ATTRIBUTE_ESCAPER; 101  } 102  103  private static final Escaper XML_ESCAPER; 104  private static final Escaper XML_CONTENT_ESCAPER; 105  private static final Escaper XML_ATTRIBUTE_ESCAPER; 106  107  static { 108  Escapers.Builder builder = Escapers.builder(); 109  // The char values \uFFFE and \uFFFF are explicitly not allowed in XML 110  // (Unicode code points above \uFFFF are represented via surrogate pairs 111  // which means they are treated as pairs of safe characters). 112  builder.setSafeRange(Character.MIN_VALUE, '\uFFFD'); 113  // Unsafe characters are replaced with the Unicode replacement character. 114  builder.setUnsafeReplacement("\uFFFD"); 115  116  /* 117  * Except for \n, \t, and \r, all ASCII control characters are replaced with the Unicode 118  * replacement character. 119  * 120  * Implementation note: An alternative to the following would be to make a map that simply 121  * replaces the allowed ASCII whitespace characters with themselves and to set the minimum safe 122  * character to 0x20. However this would slow down the escaping of simple strings that contain 123  * \t, \n, or \r. 124  */ 125  for (char c = MIN_ASCII_CONTROL_CHAR; c <= MAX_ASCII_CONTROL_CHAR; c++) { 126  if (c != '\t' && c != '\n' && c != '\r') { 127  builder.addEscape(c, "\uFFFD"); 128  } 129  } 130  131  // Build the content escaper first and then add quote escaping for the 132  // general escaper. 133  builder.addEscape('&', "&amp;"); 134  builder.addEscape('<', "&lt;"); 135  builder.addEscape('>', "&gt;"); 136  XML_CONTENT_ESCAPER = builder.build(); 137  builder.addEscape('\'', "&apos;"); 138  builder.addEscape('"', "&quot;"); 139  XML_ESCAPER = builder.build(); 140  builder.addEscape('\t', "&#x9;"); 141  builder.addEscape('\n', "&#xA;"); 142  builder.addEscape('\r', "&#xD;"); 143  XML_ATTRIBUTE_ESCAPER = builder.build(); 144  } 145 }