Coverage Summary for Class: ArrayBasedUnicodeEscaper (com.google.common.escape)

Class Class, % Method, % Line, %
ArrayBasedUnicodeEscaper 0% (0/1) 0% (0/5) 0% (0/35)


/*
 * Copyright (C) 2009 The Guava Authors
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
 * in compliance with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software distributed under the License
 * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
 * or implied. See the License for the specific language governing permissions and limitations under
 * the License.
 */

package com.google.common.escape;

import static com.google.common.base.Preconditions.checkNotNull;

import com.google.common.annotations.Beta;
import com.google.common.annotations.GwtCompatible;
import java.util.Map;
import javax.annotation.CheckForNull;
import org.checkerframework.checker.nullness.qual.Nullable;

/**
 * A {@link UnicodeEscaper} that uses an array to quickly look up replacement characters for a given
 * code point. An additional safe range is provided that determines whether code points without
 * specific replacements are to be considered safe and left unescaped or should be escaped in a
 * general way.
 *
 * <p>A good example of usage of this class is for HTML escaping where the replacement array
 * contains information about the named HTML entities such as {@code &amp;} and {@code &quot;} while
 * {@link #escapeUnsafe} is overridden to handle general escaping of the form {@code &#NNNNN;}.
 *
 * <p>The size of the data structure used by {@link ArrayBasedUnicodeEscaper} is proportional to the
 * highest valued code point that requires escaping. For example a replacement map containing the
 * single character '{@code \}{@code u1000}' will require approximately 16K of memory. If you need
 * to create multiple escaper instances that have the same character replacement mapping consider
 * using {@link ArrayBasedEscaperMap}.
 *
 * @author David Beaumont
 * @since 15.0
 */
@Beta
@GwtCompatible
@ElementTypesAreNonnullByDefault
public abstract class ArrayBasedUnicodeEscaper extends UnicodeEscaper {
  // The replacement array (see ArrayBasedEscaperMap).
  private final char[][] replacements;
  // The number of elements in the replacement array.
  private final int replacementsLength;
  // The first code point in the safe range.
  private final int safeMin;
  // The last code point in the safe range.
  private final int safeMax;

  // Cropped values used in the fast path range checks.
  private final char safeMinChar;
  private final char safeMaxChar;

  /**
   * Creates a new ArrayBasedUnicodeEscaper instance with the given replacement map and specified
   * safe range. If {@code safeMax < safeMin} then no code points are considered safe.
   *
   * <p>If a code point has no mapped replacement then it is checked against the safe range. If it
   * lies outside that, then {@link #escapeUnsafe} is called, otherwise no escaping is performed.
   *
   * @param replacementMap a map of characters to their escaped representations
   * @param safeMin the lowest character value in the safe range
   * @param safeMax the highest character value in the safe range
   * @param unsafeReplacement the default replacement for unsafe characters or null if no default
   *     replacement is required. Note that this class only forwards the value to the other
   *     constructor, which does not read it; unsafe code points are always handled by
   *     {@link #escapeUnsafe}.
   */
  protected ArrayBasedUnicodeEscaper(
      Map<Character, String> replacementMap,
      int safeMin,
      int safeMax,
      @Nullable String unsafeReplacement) {
    this(ArrayBasedEscaperMap.create(replacementMap), safeMin, safeMax, unsafeReplacement);
  }

  /**
   * Creates a new ArrayBasedUnicodeEscaper instance with the given replacement map and specified
   * safe range. If {@code safeMax < safeMin} then no code points are considered safe. This
   * initializer is useful when explicit instances of ArrayBasedEscaperMap are used to allow the
   * sharing of large replacement mappings.
   *
   * <p>If a code point has no mapped replacement then it is checked against the safe range. If it
   * lies outside that, then {@link #escapeUnsafe} is called, otherwise no escaping is performed.
   *
   * @param escaperMap the map of replacements
   * @param safeMin the lowest character value in the safe range
   * @param safeMax the highest character value in the safe range
   * @param unsafeReplacement the default replacement for unsafe characters or null if no default
   *     replacement is required. Note that this constructor does not read the value; unsafe code
   *     points are always handled by {@link #escapeUnsafe}.
   */
  protected ArrayBasedUnicodeEscaper(
      ArrayBasedEscaperMap escaperMap,
      int safeMin,
      int safeMax,
      @Nullable String unsafeReplacement) {
    checkNotNull(escaperMap); // GWT specific check (do not optimize)
    this.replacements = escaperMap.getReplacementArray();
    this.replacementsLength = replacements.length;
    if (safeMax < safeMin) {
      // If the safe range is empty, set the range limits to opposite extremes
      // to ensure the first test of either value will fail.
      safeMax = -1;
      safeMin = Integer.MAX_VALUE;
    }
    this.safeMin = safeMin;
    this.safeMax = safeMax;

    // This is a bit of a hack but lets us do quicker per-character checks in
    // the fast path code. The safe min/max values are very unlikely to extend
    // into the range of surrogate characters, but if they do we must not test
    // any values in that range. To see why, consider the case where:
    //   safeMin <= {hi,lo} <= safeMax
    // where {hi,lo} are characters forming a surrogate pair such that:
    //   codePointOf(hi, lo) > safeMax
    // which would result in the surrogate pair being (wrongly) considered safe.
    // If we clip the safe range used during the per-character tests so it is
    // below the values of characters in surrogate pairs, this cannot occur.
    // This approach does mean that we break out of the fast path code in cases
    // where we don't strictly need to, but this situation will almost never
    // occur in practice.
    if (safeMin >= Character.MIN_HIGH_SURROGATE || safeMax < 0) {
      // The safe range is empty, or all safe code points lie in or above the
      // surrogate range, or the whole range is negative and so contains no
      // real characters. Either way the character range is empty. The
      // "safeMax < 0" guard matters because narrowing a negative int to char
      // wraps around to a large value (e.g. -1 becomes 0xFFFF), which would
      // otherwise make the fast path treat characters near U+FFFF as safe.
      this.safeMinChar = Character.MAX_VALUE;
      this.safeMaxChar = 0;
    } else {
      // The safe range is non empty and contains values below the surrogate
      // range but may extend above it (or start below zero). Clip both ends so
      // that the narrowing casts to char cannot wrap around.
      this.safeMinChar = (char) Math.max(safeMin, 0);
      this.safeMaxChar = (char) Math.min(safeMax, Character.MIN_HIGH_SURROGATE - 1);
    }
  }

  /*
   * This is overridden to improve performance. Rough benchmarking shows that this almost doubles
   * the speed when processing strings that do not require any escaping.
   */
  @Override
  public final String escape(String s) {
    checkNotNull(s); // GWT specific check (do not optimize)
    for (int i = 0; i < s.length(); i++) {
      char c = s.charAt(i);
      // Leave the fast path at the first char that has a replacement or falls
      // outside the clipped per-character safe range.
      if ((c < replacementsLength && replacements[c] != null)
          || c > safeMaxChar
          || c < safeMinChar) {
        return escapeSlow(s, i);
      }
    }
    return s;
  }

  /**
   * Escapes a single Unicode code point using the replacement array and safe range values. If the
   * given character does not have an explicit replacement and lies outside the safe range then
   * {@link #escapeUnsafe} is called.
   *
   * @return the replacement characters, or {@code null} if no escaping was required
   */
  @Override
  @CheckForNull
  protected final char[] escape(int cp) {
    if (cp < replacementsLength) {
      char[] chars = replacements[cp];
      if (chars != null) {
        return chars;
      }
    }
    if (cp >= safeMin && cp <= safeMax) {
      return null;
    }
    return escapeUnsafe(cp);
  }

  /* Overridden for performance. */
  @Override
  protected final int nextEscapeIndex(CharSequence csq, int index, int end) {
    while (index < end) {
      char c = csq.charAt(index);
      // Same per-character test as the fast path in escape(String).
      if ((c < replacementsLength && replacements[c] != null)
          || c > safeMaxChar
          || c < safeMinChar) {
        break;
      }
      index++;
    }
    return index;
  }

  /**
   * Escapes a code point that has no direct explicit value in the replacement array and lies
   * outside the stated safe range. Subclasses should override this method to provide generalized
   * escaping for code points if required.
   *
   * <p>Note that arrays returned by this method must not be modified once they have been returned.
   * However it is acceptable to return the same array multiple times (even for different input
   * characters).
   *
   * @param cp the Unicode code point to escape
   * @return the replacement characters, or {@code null} if no escaping was required
   */
  @CheckForNull
  protected abstract char[] escapeUnsafe(int cp);
}