Coverage Summary for Class: ArrayBasedUnicodeEscaper (com.google.common.escape)

Class	Class, %	Method, %	Line, %
ArrayBasedUnicodeEscaper	0% (0/1)	0% (0/5)	0% (0/35)
1 /*
2  * Copyright (C) 2009 The Guava Authors
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
5  * in compliance with the License. You may obtain a copy of the License at
6  *
7  * http://www.apache.org/licenses/LICENSE-2.0
8  *
9  * Unless required by applicable law or agreed to in writing, software distributed under the License
10  * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
11  * or implied. See the License for the specific language governing permissions and limitations under
12  * the License.
13  */
14 
15 package com.google.common.escape;
16 
17 import static com.google.common.base.Preconditions.checkNotNull;
18 
19 import com.google.common.annotations.Beta;
20 import com.google.common.annotations.GwtCompatible;
21 import java.util.Map;
22 import javax.annotation.CheckForNull;
23 import org.checkerframework.checker.nullness.qual.Nullable;
24 
25 /**
26  * A {@link UnicodeEscaper} that uses an array to quickly look up replacement characters for a given
27  * code point. An additional safe range is provided that determines whether code points without
28  * specific replacements are to be considered safe and left unescaped or should be escaped in a
29  * general way.
30  *
31  * <p>A good example of usage of this class is for HTML escaping where the replacement array
32  * contains information about the named HTML entities such as {@code &amp;} and {@code &quot;} while
33  * {@link #escapeUnsafe} is overridden to handle general escaping of the form {@code &#NNNNN;}.
34  *
35  * <p>The size of the data structure used by {@link ArrayBasedUnicodeEscaper} is proportional to the
36  * highest valued code point that requires escaping. For example a replacement map containing the
37  * single character '{@code \}{@code u1000}' will require approximately 16K of memory. If you need
38  * to create multiple escaper instances that have the same character replacement mapping consider
39  * using {@link ArrayBasedEscaperMap}.
40  *
41  * @author David Beaumont
42  * @since 15.0
43  */
44 @Beta
45 @GwtCompatible
46 @ElementTypesAreNonnullByDefault
47 public abstract class ArrayBasedUnicodeEscaper extends UnicodeEscaper {
48   // The replacement array (see ArrayBasedEscaperMap).
49   private final char[][] replacements;
50   // The number of elements in the replacement array.
51   private final int replacementsLength;
52   // The first code point in the safe range.
53   private final int safeMin;
54   // The last code point in the safe range.
55   private final int safeMax;
56 
57   // Cropped values used in the fast path range checks.
58   private final char safeMinChar;
59   private final char safeMaxChar;
60 
61   /**
62    * Creates a new ArrayBasedUnicodeEscaper instance with the given replacement map and specified
63    * safe range. If {@code safeMax < safeMin} then no code points are considered safe.
64    *
65    * <p>If a code point has no mapped replacement then it is checked against the safe range. If it
66    * lies outside that, then {@link #escapeUnsafe} is called, otherwise no escaping is performed.
67    *
68    * @param replacementMap a map of characters to their escaped representations
69    * @param safeMin the lowest character value in the safe range
70    * @param safeMax the highest character value in the safe range
71    * @param unsafeReplacement the default replacement for unsafe characters or null if no default
72    *     replacement is required
73    */
74   protected ArrayBasedUnicodeEscaper(
75       Map<Character, String> replacementMap,
76       int safeMin,
77       int safeMax,
78       @Nullable String unsafeReplacement) {
79     this(ArrayBasedEscaperMap.create(replacementMap), safeMin, safeMax, unsafeReplacement);
80   }
81 
82   /**
83    * Creates a new ArrayBasedUnicodeEscaper instance with the given replacement map and specified
84    * safe range. If {@code safeMax < safeMin} then no code points are considered safe. This
85    * initializer is useful when explicit instances of ArrayBasedEscaperMap are used to allow the
86    * sharing of large replacement mappings.
87    *
88    * <p>If a code point has no mapped replacement then it is checked against the safe range. If it
89    * lies outside that, then {@link #escapeUnsafe} is called, otherwise no escaping is performed.
90    *
91    * @param escaperMap the map of replacements
92    * @param safeMin the lowest character value in the safe range
93    * @param safeMax the highest character value in the safe range
94    * @param unsafeReplacement the default replacement for unsafe characters or null if no default
95    *     replacement is required
96    */
97   protected ArrayBasedUnicodeEscaper(
98       ArrayBasedEscaperMap escaperMap,
99       int safeMin,
100       int safeMax,
101       @Nullable String unsafeReplacement) {
102     checkNotNull(escaperMap); // GWT specific check (do not optimize)
103     this.replacements = escaperMap.getReplacementArray();
104     this.replacementsLength = replacements.length;
105     if (safeMax < safeMin) {
106       // If the safe range is empty, set the range limits to opposite extremes
107       // to ensure the first test of either value will fail.
108       safeMax = -1;
109       safeMin = Integer.MAX_VALUE;
110     }
111     this.safeMin = safeMin;
112     this.safeMax = safeMax;
113 
114     // This is a bit of a hack but lets us do quicker per-character checks in
115     // the fast path code. The safe min/max values are very unlikely to extend
116     // into the range of surrogate characters, but if they do we must not test
117     // any values in that range. To see why, consider the case where:
118     // safeMin <= {hi,lo} <= safeMax
119     // where {hi,lo} are characters forming a surrogate pair such that:
120     // codePointOf(hi, lo) > safeMax
121     // which would result in the surrogate pair being (wrongly) considered safe.
122     // If we clip the safe range used during the per-character tests so it is
123     // below the values of characters in surrogate pairs, this cannot occur.
124     // This approach does mean that we break out of the fast path code in cases
125     // where we don't strictly need to, but this situation will almost never
126     // occur in practice.
127     if (safeMin >= Character.MIN_HIGH_SURROGATE) {
128       // The safe range is empty or the all safe code points lie in or above the
129       // surrogate range. Either way the character range is empty.
130       this.safeMinChar = Character.MAX_VALUE;
131       this.safeMaxChar = 0;
132     } else {
133       // The safe range is non empty and contains values below the surrogate
134       // range but may extend above it. We may need to clip the maximum value.
135       this.safeMinChar = (char) safeMin;
136       this.safeMaxChar = (char) Math.min(safeMax, Character.MIN_HIGH_SURROGATE - 1);
137     }
138   }
139 
140   /*
141    * This is overridden to improve performance. Rough benchmarking shows that this almost doubles
142    * the speed when processing strings that do not require any escaping.
143    */
144   @Override
145   public final String escape(String s) {
146     checkNotNull(s); // GWT specific check (do not optimize)
147     for (int i = 0; i < s.length(); i++) {
148       char c = s.charAt(i);
149       if ((c < replacementsLength && replacements[c] != null)
150           || c > safeMaxChar
151           || c < safeMinChar) {
152         return escapeSlow(s, i);
153       }
154     }
155     return s;
156   }
157 
158   /**
159    * Escapes a single Unicode code point using the replacement array and safe range values. If the
160    * given character does not have an explicit replacement and lies outside the safe range then
161    * {@link #escapeUnsafe} is called.
162    *
163    * @return the replacement characters, or {@code null} if no escaping was required
164    */
165   @Override
166   @CheckForNull
167   protected final char[] escape(int cp) {
168     if (cp < replacementsLength) {
169       char[] chars = replacements[cp];
170       if (chars != null) {
171         return chars;
172       }
173     }
174     if (cp >= safeMin && cp <= safeMax) {
175       return null;
176     }
177     return escapeUnsafe(cp);
178   }
179 
180   /* Overridden for performance. */
181   @Override
182   protected final int nextEscapeIndex(CharSequence csq, int index, int end) {
183     while (index < end) {
184       char c = csq.charAt(index);
185       if ((c < replacementsLength && replacements[c] != null)
186           || c > safeMaxChar
187           || c < safeMinChar) {
188         break;
189       }
190       index++;
191     }
192     return index;
193   }
194 
195   /**
196    * Escapes a code point that has no direct explicit value in the replacement array and lies
197    * outside the stated safe range. Subclasses should override this method to provide generalized
198    * escaping for code points if required.
199    *
200    * <p>Note that arrays returned by this method must not be modified once they have been returned.
201    * However it is acceptable to return the same array multiple times (even for different input
202    * characters).
203    *
204    * @param cp the Unicode code point to escape
205    * @return the replacement characters, or {@code null} if no escaping was required
206    */
207   @CheckForNull
208   protected abstract char[] escapeUnsafe(int cp);
209 }