Coverage Summary for Class: PercentEscaper (com.google.common.net)

Class Class, % Method, % Line, %
PercentEscaper 0% (0/1) 0% (0/6) 0% (0/92)


1 /* 2  * Copyright (C) 2008 The Guava Authors 3  * 4  * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except 5  * in compliance with the License. You may obtain a copy of the License at 6  * 7  * http://www.apache.org/licenses/LICENSE-2.0 8  * 9  * Unless required by applicable law or agreed to in writing, software distributed under the License 10  * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express 11  * or implied. See the License for the specific language governing permissions and limitations under 12  * the License. 13  */ 14  15 package com.google.common.net; 16  17 import static com.google.common.base.Preconditions.checkNotNull; 18  19 import com.google.common.annotations.Beta; 20 import com.google.common.annotations.GwtCompatible; 21 import com.google.common.escape.UnicodeEscaper; 22 import javax.annotation.CheckForNull; 23  24 /** 25  * A {@code UnicodeEscaper} that escapes some set of Java characters using a UTF-8 based percent 26  * encoding scheme. The set of safe characters (those which remain unescaped) can be specified on 27  * construction. 28  * 29  * <p>This class is primarily used for creating URI escapers in {@link UrlEscapers} but can be used 30  * directly if required. While URI escapers impose specific semantics on which characters are 31  * considered 'safe', this class has a minimal set of restrictions. 32  * 33  * <p>When escaping a String, the following rules apply: 34  * 35  * <ul> 36  * <li>All specified safe characters remain unchanged. 37  * <li>If {@code plusForSpace} was specified, the space character " " is converted into a plus 38  * sign {@code "+"}. 39  * <li>All other characters are converted into one or more bytes using UTF-8 encoding and each 40  * byte is then represented by the 3-character string "%XX", where "XX" is the two-digit, 41  * uppercase, hexadecimal representation of the byte value. 42  * </ul> 43  * 44  * <p>For performance reasons the only currently supported character encoding of this class is 45  * UTF-8. 46  * 47  * <p><b>Note:</b> This escaper produces <a 48  * href="https://url.spec.whatwg.org/#percent-encode">uppercase</a> hexadecimal sequences. 49  * 50  * @author David Beaumont 51  * @since 15.0 52  */ 53 @Beta 54 @GwtCompatible 55 @ElementTypesAreNonnullByDefault 56 public final class PercentEscaper extends UnicodeEscaper { 57  58  // In some escapers spaces are escaped to '+' 59  private static final char[] PLUS_SIGN = {'+'}; 60  61  // Percent escapers output upper case hex digits (uri escapers require this). 62  private static final char[] UPPER_HEX_DIGITS = "0123456789ABCDEF".toCharArray(); 63  64  /** If true we should convert space to the {@code +} character. */ 65  private final boolean plusForSpace; 66  67  /** 68  * An array of flags where for any {@code char c} if {@code safeOctets[c]} is true then {@code c} 69  * should remain unmodified in the output. If {@code c >= safeOctets.length} then it should be 70  * escaped. 71  */ 72  private final boolean[] safeOctets; 73  74  /** 75  * Constructs a percent escaper with the specified safe characters and optional handling of the 76  * space character. 77  * 78  * <p>Not that it is allowed, but not necessarily desirable to specify {@code %} as a safe 79  * character. This has the effect of creating an escaper which has no well defined inverse but it 80  * can be useful when escaping additional characters. 81  * 82  * @param safeChars a non null string specifying additional safe characters for this escaper (the 83  * ranges 0..9, a..z and A..Z are always safe and should not be specified here) 84  * @param plusForSpace true if ASCII space should be escaped to {@code +} rather than {@code %20} 85  * @throws IllegalArgumentException if any of the parameters were invalid 86  */ 87  public PercentEscaper(String safeChars, boolean plusForSpace) { 88  // TODO(dbeaumont): Switch to static factory methods for creation now that class is final. 89  // TODO(dbeaumont): Support escapers where alphanumeric chars are not safe. 90  checkNotNull(safeChars); // eager for GWT. 91  // Avoid any misunderstandings about the behavior of this escaper 92  if (safeChars.matches(".*[0-9A-Za-z].*")) { 93  throw new IllegalArgumentException( 94  "Alphanumeric characters are always 'safe' and should not be explicitly specified"); 95  } 96  safeChars += "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"; 97  // Avoid ambiguous parameters. Safe characters are never modified so if 98  // space is a safe character then setting plusForSpace is meaningless. 99  if (plusForSpace && safeChars.contains(" ")) { 100  throw new IllegalArgumentException( 101  "plusForSpace cannot be specified when space is a 'safe' character"); 102  } 103  this.plusForSpace = plusForSpace; 104  this.safeOctets = createSafeOctets(safeChars); 105  } 106  107  /** 108  * Creates a boolean array with entries corresponding to the character values specified in 109  * safeChars set to true. The array is as small as is required to hold the given character 110  * information. 111  */ 112  private static boolean[] createSafeOctets(String safeChars) { 113  int maxChar = -1; 114  char[] safeCharArray = safeChars.toCharArray(); 115  for (char c : safeCharArray) { 116  maxChar = Math.max(c, maxChar); 117  } 118  boolean[] octets = new boolean[maxChar + 1]; 119  for (char c : safeCharArray) { 120  octets[c] = true; 121  } 122  return octets; 123  } 124  125  /* 126  * Overridden for performance. For unescaped strings this improved the performance of the uri 127  * escaper from ~760ns to ~400ns as measured by {@link CharEscapersBenchmark}. 128  */ 129  @Override 130  protected int nextEscapeIndex(CharSequence csq, int index, int end) { 131  checkNotNull(csq); 132  for (; index < end; index++) { 133  char c = csq.charAt(index); 134  if (c >= safeOctets.length || !safeOctets[c]) { 135  break; 136  } 137  } 138  return index; 139  } 140  141  /* 142  * Overridden for performance. For unescaped strings this improved the performance of the uri 143  * escaper from ~400ns to ~170ns as measured by {@link CharEscapersBenchmark}. 144  */ 145  @Override 146  public String escape(String s) { 147  checkNotNull(s); 148  int slen = s.length(); 149  for (int index = 0; index < slen; index++) { 150  char c = s.charAt(index); 151  if (c >= safeOctets.length || !safeOctets[c]) { 152  return escapeSlow(s, index); 153  } 154  } 155  return s; 156  } 157  158  /** Escapes the given Unicode code point in UTF-8. */ 159  @Override 160  @CheckForNull 161  protected char[] escape(int cp) { 162  // We should never get negative values here but if we do it will throw an 163  // IndexOutOfBoundsException, so at least it will get spotted. 164  if (cp < safeOctets.length && safeOctets[cp]) { 165  return null; 166  } else if (cp == ' ' && plusForSpace) { 167  return PLUS_SIGN; 168  } else if (cp <= 0x7F) { 169  // Single byte UTF-8 characters 170  // Start with "%--" and fill in the blanks 171  char[] dest = new char[3]; 172  dest[0] = '%'; 173  dest[2] = UPPER_HEX_DIGITS[cp & 0xF]; 174  dest[1] = UPPER_HEX_DIGITS[cp >>> 4]; 175  return dest; 176  } else if (cp <= 0x7ff) { 177  // Two byte UTF-8 characters [cp >= 0x80 && cp <= 0x7ff] 178  // Start with "%--%--" and fill in the blanks 179  char[] dest = new char[6]; 180  dest[0] = '%'; 181  dest[3] = '%'; 182  dest[5] = UPPER_HEX_DIGITS[cp & 0xF]; 183  cp >>>= 4; 184  dest[4] = UPPER_HEX_DIGITS[0x8 | (cp & 0x3)]; 185  cp >>>= 2; 186  dest[2] = UPPER_HEX_DIGITS[cp & 0xF]; 187  cp >>>= 4; 188  dest[1] = UPPER_HEX_DIGITS[0xC | cp]; 189  return dest; 190  } else if (cp <= 0xffff) { 191  // Three byte UTF-8 characters [cp >= 0x800 && cp <= 0xffff] 192  // Start with "%E-%--%--" and fill in the blanks 193  char[] dest = new char[9]; 194  dest[0] = '%'; 195  dest[1] = 'E'; 196  dest[3] = '%'; 197  dest[6] = '%'; 198  dest[8] = UPPER_HEX_DIGITS[cp & 0xF]; 199  cp >>>= 4; 200  dest[7] = UPPER_HEX_DIGITS[0x8 | (cp & 0x3)]; 201  cp >>>= 2; 202  dest[5] = UPPER_HEX_DIGITS[cp & 0xF]; 203  cp >>>= 4; 204  dest[4] = UPPER_HEX_DIGITS[0x8 | (cp & 0x3)]; 205  cp >>>= 2; 206  dest[2] = UPPER_HEX_DIGITS[cp]; 207  return dest; 208  } else if (cp <= 0x10ffff) { 209  char[] dest = new char[12]; 210  // Four byte UTF-8 characters [cp >= 0xffff && cp <= 0x10ffff] 211  // Start with "%F-%--%--%--" and fill in the blanks 212  dest[0] = '%'; 213  dest[1] = 'F'; 214  dest[3] = '%'; 215  dest[6] = '%'; 216  dest[9] = '%'; 217  dest[11] = UPPER_HEX_DIGITS[cp & 0xF]; 218  cp >>>= 4; 219  dest[10] = UPPER_HEX_DIGITS[0x8 | (cp & 0x3)]; 220  cp >>>= 2; 221  dest[8] = UPPER_HEX_DIGITS[cp & 0xF]; 222  cp >>>= 4; 223  dest[7] = UPPER_HEX_DIGITS[0x8 | (cp & 0x3)]; 224  cp >>>= 2; 225  dest[5] = UPPER_HEX_DIGITS[cp & 0xF]; 226  cp >>>= 4; 227  dest[4] = UPPER_HEX_DIGITS[0x8 | (cp & 0x3)]; 228  cp >>>= 2; 229  dest[2] = UPPER_HEX_DIGITS[cp & 0x7]; 230  return dest; 231  } else { 232  // If this ever happens it is due to bug in UnicodeEscaper, not bad input. 233  throw new IllegalArgumentException("Invalid unicode character value " + cp); 234  } 235  } 236 }