Coverage Summary for Class: PercentEscaper (com.google.common.net)
| Class | Class, % | Method, % | Line, % |
|---|---|---|---|
| PercentEscaper | 0% (0/1) | 0% (0/6) | 0% (0/92) |
1 /* 2 * Copyright (C) 2008 The Guava Authors 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except 5 * in compliance with the License. You may obtain a copy of the License at 6 * 7 * http://www.apache.org/licenses/LICENSE-2.0 8 * 9 * Unless required by applicable law or agreed to in writing, software distributed under the License 10 * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express 11 * or implied. See the License for the specific language governing permissions and limitations under 12 * the License. 13 */ 14 15 package com.google.common.net; 16 17 import static com.google.common.base.Preconditions.checkNotNull; 18 19 import com.google.common.annotations.Beta; 20 import com.google.common.annotations.GwtCompatible; 21 import com.google.common.escape.UnicodeEscaper; 22 import javax.annotation.CheckForNull; 23 24 /** 25 * A {@code UnicodeEscaper} that escapes some set of Java characters using a UTF-8 based percent 26 * encoding scheme. The set of safe characters (those which remain unescaped) can be specified on 27 * construction. 28 * 29 * <p>This class is primarily used for creating URI escapers in {@link UrlEscapers} but can be used 30 * directly if required. While URI escapers impose specific semantics on which characters are 31 * considered 'safe', this class has a minimal set of restrictions. 32 * 33 * <p>When escaping a String, the following rules apply: 34 * 35 * <ul> 36 * <li>All specified safe characters remain unchanged. 37 * <li>If {@code plusForSpace} was specified, the space character " " is converted into a plus 38 * sign {@code "+"}. 39 * <li>All other characters are converted into one or more bytes using UTF-8 encoding and each 40 * byte is then represented by the 3-character string "%XX", where "XX" is the two-digit, 41 * uppercase, hexadecimal representation of the byte value. 42 * </ul> 43 * 44 * <p>For performance reasons the only currently supported character encoding of this class is 45 * UTF-8. 46 * 47 * <p><b>Note:</b> This escaper produces <a 48 * href="https://url.spec.whatwg.org/#percent-encode">uppercase</a> hexadecimal sequences. 49 * 50 * @author David Beaumont 51 * @since 15.0 52 */ 53 @Beta 54 @GwtCompatible 55 @ElementTypesAreNonnullByDefault 56 public final class PercentEscaper extends UnicodeEscaper { 57 58 // In some escapers spaces are escaped to '+' 59 private static final char[] PLUS_SIGN = {'+'}; 60 61 // Percent escapers output upper case hex digits (uri escapers require this). 62 private static final char[] UPPER_HEX_DIGITS = "0123456789ABCDEF".toCharArray(); 63 64 /** If true we should convert space to the {@code +} character. */ 65 private final boolean plusForSpace; 66 67 /** 68 * An array of flags where for any {@code char c} if {@code safeOctets[c]} is true then {@code c} 69 * should remain unmodified in the output. If {@code c >= safeOctets.length} then it should be 70 * escaped. 71 */ 72 private final boolean[] safeOctets; 73 74 /** 75 * Constructs a percent escaper with the specified safe characters and optional handling of the 76 * space character. 77 * 78 * <p>Not that it is allowed, but not necessarily desirable to specify {@code %} as a safe 79 * character. This has the effect of creating an escaper which has no well defined inverse but it 80 * can be useful when escaping additional characters. 81 * 82 * @param safeChars a non null string specifying additional safe characters for this escaper (the 83 * ranges 0..9, a..z and A..Z are always safe and should not be specified here) 84 * @param plusForSpace true if ASCII space should be escaped to {@code +} rather than {@code %20} 85 * @throws IllegalArgumentException if any of the parameters were invalid 86 */ 87 public PercentEscaper(String safeChars, boolean plusForSpace) { 88 // TODO(dbeaumont): Switch to static factory methods for creation now that class is final. 89 // TODO(dbeaumont): Support escapers where alphanumeric chars are not safe. 90 checkNotNull(safeChars); // eager for GWT. 91 // Avoid any misunderstandings about the behavior of this escaper 92 if (safeChars.matches(".*[0-9A-Za-z].*")) { 93 throw new IllegalArgumentException( 94 "Alphanumeric characters are always 'safe' and should not be explicitly specified"); 95 } 96 safeChars += "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"; 97 // Avoid ambiguous parameters. Safe characters are never modified so if 98 // space is a safe character then setting plusForSpace is meaningless. 99 if (plusForSpace && safeChars.contains(" ")) { 100 throw new IllegalArgumentException( 101 "plusForSpace cannot be specified when space is a 'safe' character"); 102 } 103 this.plusForSpace = plusForSpace; 104 this.safeOctets = createSafeOctets(safeChars); 105 } 106 107 /** 108 * Creates a boolean array with entries corresponding to the character values specified in 109 * safeChars set to true. The array is as small as is required to hold the given character 110 * information. 111 */ 112 private static boolean[] createSafeOctets(String safeChars) { 113 int maxChar = -1; 114 char[] safeCharArray = safeChars.toCharArray(); 115 for (char c : safeCharArray) { 116 maxChar = Math.max(c, maxChar); 117 } 118 boolean[] octets = new boolean[maxChar + 1]; 119 for (char c : safeCharArray) { 120 octets[c] = true; 121 } 122 return octets; 123 } 124 125 /* 126 * Overridden for performance. For unescaped strings this improved the performance of the uri 127 * escaper from ~760ns to ~400ns as measured by {@link CharEscapersBenchmark}. 128 */ 129 @Override 130 protected int nextEscapeIndex(CharSequence csq, int index, int end) { 131 checkNotNull(csq); 132 for (; index < end; index++) { 133 char c = csq.charAt(index); 134 if (c >= safeOctets.length || !safeOctets[c]) { 135 break; 136 } 137 } 138 return index; 139 } 140 141 /* 142 * Overridden for performance. For unescaped strings this improved the performance of the uri 143 * escaper from ~400ns to ~170ns as measured by {@link CharEscapersBenchmark}. 144 */ 145 @Override 146 public String escape(String s) { 147 checkNotNull(s); 148 int slen = s.length(); 149 for (int index = 0; index < slen; index++) { 150 char c = s.charAt(index); 151 if (c >= safeOctets.length || !safeOctets[c]) { 152 return escapeSlow(s, index); 153 } 154 } 155 return s; 156 } 157 158 /** Escapes the given Unicode code point in UTF-8. */ 159 @Override 160 @CheckForNull 161 protected char[] escape(int cp) { 162 // We should never get negative values here but if we do it will throw an 163 // IndexOutOfBoundsException, so at least it will get spotted. 164 if (cp < safeOctets.length && safeOctets[cp]) { 165 return null; 166 } else if (cp == ' ' && plusForSpace) { 167 return PLUS_SIGN; 168 } else if (cp <= 0x7F) { 169 // Single byte UTF-8 characters 170 // Start with "%--" and fill in the blanks 171 char[] dest = new char[3]; 172 dest[0] = '%'; 173 dest[2] = UPPER_HEX_DIGITS[cp & 0xF]; 174 dest[1] = UPPER_HEX_DIGITS[cp >>> 4]; 175 return dest; 176 } else if (cp <= 0x7ff) { 177 // Two byte UTF-8 characters [cp >= 0x80 && cp <= 0x7ff] 178 // Start with "%--%--" and fill in the blanks 179 char[] dest = new char[6]; 180 dest[0] = '%'; 181 dest[3] = '%'; 182 dest[5] = UPPER_HEX_DIGITS[cp & 0xF]; 183 cp >>>= 4; 184 dest[4] = UPPER_HEX_DIGITS[0x8 | (cp & 0x3)]; 185 cp >>>= 2; 186 dest[2] = UPPER_HEX_DIGITS[cp & 0xF]; 187 cp >>>= 4; 188 dest[1] = UPPER_HEX_DIGITS[0xC | cp]; 189 return dest; 190 } else if (cp <= 0xffff) { 191 // Three byte UTF-8 characters [cp >= 0x800 && cp <= 0xffff] 192 // Start with "%E-%--%--" and fill in the blanks 193 char[] dest = new char[9]; 194 dest[0] = '%'; 195 dest[1] = 'E'; 196 dest[3] = '%'; 197 dest[6] = '%'; 198 dest[8] = UPPER_HEX_DIGITS[cp & 0xF]; 199 cp >>>= 4; 200 dest[7] = UPPER_HEX_DIGITS[0x8 | (cp & 0x3)]; 201 cp >>>= 2; 202 dest[5] = UPPER_HEX_DIGITS[cp & 0xF]; 203 cp >>>= 4; 204 dest[4] = UPPER_HEX_DIGITS[0x8 | (cp & 0x3)]; 205 cp >>>= 2; 206 dest[2] = UPPER_HEX_DIGITS[cp]; 207 return dest; 208 } else if (cp <= 0x10ffff) { 209 char[] dest = new char[12]; 210 // Four byte UTF-8 characters [cp >= 0xffff && cp <= 0x10ffff] 211 // Start with "%F-%--%--%--" and fill in the blanks 212 dest[0] = '%'; 213 dest[1] = 'F'; 214 dest[3] = '%'; 215 dest[6] = '%'; 216 dest[9] = '%'; 217 dest[11] = UPPER_HEX_DIGITS[cp & 0xF]; 218 cp >>>= 4; 219 dest[10] = UPPER_HEX_DIGITS[0x8 | (cp & 0x3)]; 220 cp >>>= 2; 221 dest[8] = UPPER_HEX_DIGITS[cp & 0xF]; 222 cp >>>= 4; 223 dest[7] = UPPER_HEX_DIGITS[0x8 | (cp & 0x3)]; 224 cp >>>= 2; 225 dest[5] = UPPER_HEX_DIGITS[cp & 0xF]; 226 cp >>>= 4; 227 dest[4] = UPPER_HEX_DIGITS[0x8 | (cp & 0x3)]; 228 cp >>>= 2; 229 dest[2] = UPPER_HEX_DIGITS[cp & 0x7]; 230 return dest; 231 } else { 232 // If this ever happens it is due to bug in UnicodeEscaper, not bad input. 233 throw new IllegalArgumentException("Invalid unicode character value " + cp); 234 } 235 } 236 }