Coverage Summary for Class: Utf8 (com.google.common.base)

Class	Class, %	Method, %	Line, %
Utf8	0% (0/1)	0% (0/7)	0% (0/57)
1 /*
2  * Copyright (C) 2013 The Guava Authors
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
5  * in compliance with the License. You may obtain a copy of the License at
6  *
7  * http://www.apache.org/licenses/LICENSE-2.0
8  *
9  * Unless required by applicable law or agreed to in writing, software distributed under the License
10  * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
11  * or implied. See the License for the specific language governing permissions and limitations under
12  * the License.
13  */
14 
15 package com.google.common.base;
16 
17 import static com.google.common.base.Preconditions.checkPositionIndexes;
18 import static java.lang.Character.MAX_SURROGATE;
19 import static java.lang.Character.MIN_SURROGATE;
20 
21 import com.google.common.annotations.Beta;
22 import com.google.common.annotations.GwtCompatible;
23 
24 /**
25  * Low-level, high-performance utility methods related to the {@linkplain Charsets#UTF_8 UTF-8}
26  * character encoding. UTF-8 is defined in section D92 of <a
27  * href="http://www.unicode.org/versions/Unicode6.2.0/ch03.pdf">The Unicode Standard Core
28  * Specification, Chapter 3</a>.
29  *
30  * <p>The variant of UTF-8 implemented by this class is the restricted definition of UTF-8
31  * introduced in Unicode 3.1. One implication of this is that it rejects <a
32  * href="http://www.unicode.org/versions/corrigendum1.html">"non-shortest form"</a> byte sequences,
33  * even though the JDK decoder may accept them.
34  *
35  * @author Martin Buchholz
36  * @author Clément Roux
37  * @since 16.0
38  */
39 @Beta
40 @GwtCompatible(emulated = true)
41 @ElementTypesAreNonnullByDefault
42 public final class Utf8 {
43   /**
44    * Returns the number of bytes in the UTF-8-encoded form of {@code sequence}. For a string, this
45    * method is equivalent to {@code string.getBytes(UTF_8).length}, but is more efficient in both
46    * time and space.
47    *
48    * @throws IllegalArgumentException if {@code sequence} contains ill-formed UTF-16 (unpaired
49    *     surrogates)
50    */
51   public static int encodedLength(CharSequence sequence) {
52     // Warning to maintainers: this implementation is highly optimized.
53     int utf16Length = sequence.length();
54     int utf8Length = utf16Length;
55     int i = 0;
56 
57     // This loop optimizes for pure ASCII.
58     while (i < utf16Length && sequence.charAt(i) < 0x80) {
59       i++;
60     }
61 
62     // This loop optimizes for chars less than 0x800.
63     for (; i < utf16Length; i++) {
64       char c = sequence.charAt(i);
65       if (c < 0x800) {
66         utf8Length += ((0x7f - c) >>> 31); // branch free!
67       } else {
68         utf8Length += encodedLengthGeneral(sequence, i);
69         break;
70       }
71     }
72 
73     if (utf8Length < utf16Length) {
74       // Necessary and sufficient condition for overflow because of maximum 3x expansion
75       throw new IllegalArgumentException(
76           "UTF-8 length does not fit in int: " + (utf8Length + (1L << 32)));
77     }
78     return utf8Length;
79   }
80 
81   private static int encodedLengthGeneral(CharSequence sequence, int start) {
82     int utf16Length = sequence.length();
83     int utf8Length = 0;
84     for (int i = start; i < utf16Length; i++) {
85       char c = sequence.charAt(i);
86       if (c < 0x800) {
87         utf8Length += (0x7f - c) >>> 31; // branch free!
88       } else {
89         utf8Length += 2;
90         // jdk7+: if (Character.isSurrogate(c)) {
91         if (MIN_SURROGATE <= c && c <= MAX_SURROGATE) {
92           // Check that we have a well-formed surrogate pair.
93           if (Character.codePointAt(sequence, i) == c) {
94             throw new IllegalArgumentException(unpairedSurrogateMsg(i));
95           }
96           i++;
97         }
98       }
99     }
100     return utf8Length;
101   }
102 
103   /**
104    * Returns {@code true} if {@code bytes} is a <i>well-formed</i> UTF-8 byte sequence according to
105    * Unicode 6.0. Note that this is a stronger criterion than simply whether the bytes can be
106    * decoded. For example, some versions of the JDK decoder will accept "non-shortest form" byte
107    * sequences, but encoding never reproduces these. Such byte sequences are <i>not</i> considered
108    * well-formed.
109    *
110    * <p>This method returns {@code true} if and only if {@code Arrays.equals(bytes, new
111    * String(bytes, UTF_8).getBytes(UTF_8))} does, but is more efficient in both time and space.
112    */
113   public static boolean isWellFormed(byte[] bytes) {
114     return isWellFormed(bytes, 0, bytes.length);
115   }
116 
117   /**
118    * Returns whether the given byte array slice is a well-formed UTF-8 byte sequence, as defined by
119    * {@link #isWellFormed(byte[])}. Note that this can be false even when {@code
120    * isWellFormed(bytes)} is true.
121    *
122    * @param bytes the input buffer
123    * @param off the offset in the buffer of the first byte to read
124    * @param len the number of bytes to read from the buffer
125    */
126   public static boolean isWellFormed(byte[] bytes, int off, int len) {
127     int end = off + len;
128     checkPositionIndexes(off, end, bytes.length);
129     // Look for the first non-ASCII character.
130     for (int i = off; i < end; i++) {
131       if (bytes[i] < 0) {
132         return isWellFormedSlowPath(bytes, i, end);
133       }
134     }
135     return true;
136   }
137 
138   private static boolean isWellFormedSlowPath(byte[] bytes, int off, int end) {
139     int index = off;
140     while (true) {
141       int byte1;
142 
143       // Optimize for interior runs of ASCII bytes.
144       do {
145         if (index >= end) {
146           return true;
147         }
148       } while ((byte1 = bytes[index++]) >= 0);
149 
150       if (byte1 < (byte) 0xE0) {
151         // Two-byte form.
152         if (index == end) {
153           return false;
154         }
155         // Simultaneously check for illegal trailing-byte in leading position
156         // and overlong 2-byte form.
157         if (byte1 < (byte) 0xC2 || bytes[index++] > (byte) 0xBF) {
158           return false;
159         }
160       } else if (byte1 < (byte) 0xF0) {
161         // Three-byte form.
162         if (index + 1 >= end) {
163           return false;
164         }
165         int byte2 = bytes[index++];
166         if (byte2 > (byte) 0xBF
167             // Overlong? 5 most significant bits must not all be zero.
168             || (byte1 == (byte) 0xE0 && byte2 < (byte) 0xA0)
169             // Check for illegal surrogate codepoints.
170             || (byte1 == (byte) 0xED && (byte) 0xA0 <= byte2)
171             // Third byte trailing-byte test.
172             || bytes[index++] > (byte) 0xBF) {
173           return false;
174         }
175       } else {
176         // Four-byte form.
177         if (index + 2 >= end) {
178           return false;
179         }
180         int byte2 = bytes[index++];
181         if (byte2 > (byte) 0xBF
182             // Check that 1 <= plane <= 16. Tricky optimized form of:
183             // if (byte1 > (byte) 0xF4
184             //     || byte1 == (byte) 0xF0 && byte2 < (byte) 0x90
185             //     || byte1 == (byte) 0xF4 && byte2 > (byte) 0x8F)
186             || (((byte1 << 28) + (byte2 - (byte) 0x90)) >> 30) != 0
187             // Third byte trailing-byte test
188             || bytes[index++] > (byte) 0xBF
189             // Fourth byte trailing-byte test
190             || bytes[index++] > (byte) 0xBF) {
191           return false;
192         }
193       }
194     }
195   }
196 
197   private static String unpairedSurrogateMsg(int i) {
198     return "Unpaired surrogate at index " + i;
199   }
200 
201   private Utf8() {}
202 }