Coverage Summary for Class: Utf8 (com.google.common.base)

Class Class, % Method, % Line, %
Utf8 0% (0/1) 0% (0/7) 0% (0/57)


1 /* 2  * Copyright (C) 2013 The Guava Authors 3  * 4  * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except 5  * in compliance with the License. You may obtain a copy of the License at 6  * 7  * http://www.apache.org/licenses/LICENSE-2.0 8  * 9  * Unless required by applicable law or agreed to in writing, software distributed under the License 10  * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express 11  * or implied. See the License for the specific language governing permissions and limitations under 12  * the License. 13  */ 14  15 package com.google.common.base; 16  17 import static com.google.common.base.Preconditions.checkPositionIndexes; 18 import static java.lang.Character.MAX_SURROGATE; 19 import static java.lang.Character.MIN_SURROGATE; 20  21 import com.google.common.annotations.Beta; 22 import com.google.common.annotations.GwtCompatible; 23  24 /** 25  * Low-level, high-performance utility methods related to the {@linkplain Charsets#UTF_8 UTF-8} 26  * character encoding. UTF-8 is defined in section D92 of <a 27  * href="http://www.unicode.org/versions/Unicode6.2.0/ch03.pdf">The Unicode Standard Core 28  * Specification, Chapter 3</a>. 29  * 30  * <p>The variant of UTF-8 implemented by this class is the restricted definition of UTF-8 31  * introduced in Unicode 3.1. One implication of this is that it rejects <a 32  * href="http://www.unicode.org/versions/corrigendum1.html">"non-shortest form"</a> byte sequences, 33  * even though the JDK decoder may accept them. 34  * 35  * @author Martin Buchholz 36  * @author Clément Roux 37  * @since 16.0 38  */ 39 @Beta 40 @GwtCompatible(emulated = true) 41 @ElementTypesAreNonnullByDefault 42 public final class Utf8 { 43  /** 44  * Returns the number of bytes in the UTF-8-encoded form of {@code sequence}. For a string, this 45  * method is equivalent to {@code string.getBytes(UTF_8).length}, but is more efficient in both 46  * time and space. 47  * 48  * @throws IllegalArgumentException if {@code sequence} contains ill-formed UTF-16 (unpaired 49  * surrogates) 50  */ 51  public static int encodedLength(CharSequence sequence) { 52  // Warning to maintainers: this implementation is highly optimized. 53  int utf16Length = sequence.length(); 54  int utf8Length = utf16Length; 55  int i = 0; 56  57  // This loop optimizes for pure ASCII. 58  while (i < utf16Length && sequence.charAt(i) < 0x80) { 59  i++; 60  } 61  62  // This loop optimizes for chars less than 0x800. 63  for (; i < utf16Length; i++) { 64  char c = sequence.charAt(i); 65  if (c < 0x800) { 66  utf8Length += ((0x7f - c) >>> 31); // branch free! 67  } else { 68  utf8Length += encodedLengthGeneral(sequence, i); 69  break; 70  } 71  } 72  73  if (utf8Length < utf16Length) { 74  // Necessary and sufficient condition for overflow because of maximum 3x expansion 75  throw new IllegalArgumentException( 76  "UTF-8 length does not fit in int: " + (utf8Length + (1L << 32))); 77  } 78  return utf8Length; 79  } 80  81  private static int encodedLengthGeneral(CharSequence sequence, int start) { 82  int utf16Length = sequence.length(); 83  int utf8Length = 0; 84  for (int i = start; i < utf16Length; i++) { 85  char c = sequence.charAt(i); 86  if (c < 0x800) { 87  utf8Length += (0x7f - c) >>> 31; // branch free! 88  } else { 89  utf8Length += 2; 90  // jdk7+: if (Character.isSurrogate(c)) { 91  if (MIN_SURROGATE <= c && c <= MAX_SURROGATE) { 92  // Check that we have a well-formed surrogate pair. 93  if (Character.codePointAt(sequence, i) == c) { 94  throw new IllegalArgumentException(unpairedSurrogateMsg(i)); 95  } 96  i++; 97  } 98  } 99  } 100  return utf8Length; 101  } 102  103  /** 104  * Returns {@code true} if {@code bytes} is a <i>well-formed</i> UTF-8 byte sequence according to 105  * Unicode 6.0. Note that this is a stronger criterion than simply whether the bytes can be 106  * decoded. For example, some versions of the JDK decoder will accept "non-shortest form" byte 107  * sequences, but encoding never reproduces these. Such byte sequences are <i>not</i> considered 108  * well-formed. 109  * 110  * <p>This method returns {@code true} if and only if {@code Arrays.equals(bytes, new 111  * String(bytes, UTF_8).getBytes(UTF_8))} does, but is more efficient in both time and space. 112  */ 113  public static boolean isWellFormed(byte[] bytes) { 114  return isWellFormed(bytes, 0, bytes.length); 115  } 116  117  /** 118  * Returns whether the given byte array slice is a well-formed UTF-8 byte sequence, as defined by 119  * {@link #isWellFormed(byte[])}. Note that this can be false even when {@code 120  * isWellFormed(bytes)} is true. 121  * 122  * @param bytes the input buffer 123  * @param off the offset in the buffer of the first byte to read 124  * @param len the number of bytes to read from the buffer 125  */ 126  public static boolean isWellFormed(byte[] bytes, int off, int len) { 127  int end = off + len; 128  checkPositionIndexes(off, end, bytes.length); 129  // Look for the first non-ASCII character. 130  for (int i = off; i < end; i++) { 131  if (bytes[i] < 0) { 132  return isWellFormedSlowPath(bytes, i, end); 133  } 134  } 135  return true; 136  } 137  138  private static boolean isWellFormedSlowPath(byte[] bytes, int off, int end) { 139  int index = off; 140  while (true) { 141  int byte1; 142  143  // Optimize for interior runs of ASCII bytes. 144  do { 145  if (index >= end) { 146  return true; 147  } 148  } while ((byte1 = bytes[index++]) >= 0); 149  150  if (byte1 < (byte) 0xE0) { 151  // Two-byte form. 152  if (index == end) { 153  return false; 154  } 155  // Simultaneously check for illegal trailing-byte in leading position 156  // and overlong 2-byte form. 157  if (byte1 < (byte) 0xC2 || bytes[index++] > (byte) 0xBF) { 158  return false; 159  } 160  } else if (byte1 < (byte) 0xF0) { 161  // Three-byte form. 162  if (index + 1 >= end) { 163  return false; 164  } 165  int byte2 = bytes[index++]; 166  if (byte2 > (byte) 0xBF 167  // Overlong? 5 most significant bits must not all be zero. 168  || (byte1 == (byte) 0xE0 && byte2 < (byte) 0xA0) 169  // Check for illegal surrogate codepoints. 170  || (byte1 == (byte) 0xED && (byte) 0xA0 <= byte2) 171  // Third byte trailing-byte test. 172  || bytes[index++] > (byte) 0xBF) { 173  return false; 174  } 175  } else { 176  // Four-byte form. 177  if (index + 2 >= end) { 178  return false; 179  } 180  int byte2 = bytes[index++]; 181  if (byte2 > (byte) 0xBF 182  // Check that 1 <= plane <= 16. Tricky optimized form of: 183  // if (byte1 > (byte) 0xF4 184  // || byte1 == (byte) 0xF0 && byte2 < (byte) 0x90 185  // || byte1 == (byte) 0xF4 && byte2 > (byte) 0x8F) 186  || (((byte1 << 28) + (byte2 - (byte) 0x90)) >> 30) != 0 187  // Third byte trailing-byte test 188  || bytes[index++] > (byte) 0xBF 189  // Fourth byte trailing-byte test 190  || bytes[index++] > (byte) 0xBF) { 191  return false; 192  } 193  } 194  } 195  } 196  197  private static String unpairedSurrogateMsg(int i) { 198  return "Unpaired surrogate at index " + i; 199  } 200  201  private Utf8() {} 202 }