Coverage Summary for Class: Ascii (com.google.common.base)

Class Class, % Method, % Line, %
Ascii 0% (0/1) 0% (0/12) 0% (0/66)


1 /* 2  * Copyright (C) 2010 The Guava Authors 3  * 4  * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except 5  * in compliance with the License. You may obtain a copy of the License at 6  * 7  * http://www.apache.org/licenses/LICENSE-2.0 8  * 9  * Unless required by applicable law or agreed to in writing, software distributed under the License 10  * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express 11  * or implied. See the License for the specific language governing permissions and limitations under 12  * the License. 13  */ 14  15 package com.google.common.base; 16  17 import static com.google.common.base.Preconditions.checkArgument; 18 import static com.google.common.base.Preconditions.checkNotNull; 19  20 import com.google.common.annotations.GwtCompatible; 21  22 /** 23  * Static methods pertaining to ASCII characters (those in the range of values {@code 0x00} through 24  * {@code 0x7F}), and to strings containing such characters. 25  * 26  * <p>ASCII utilities also exist in other classes of this package: 27  * 28  * <ul> 29  * <!-- TODO(kevinb): how can we make this not produce a warning when building gwt javadoc? --> 30  * <li>{@link Charsets#US_ASCII} specifies the {@code Charset} of ASCII characters. 31  * <li>{@link CharMatcher#ascii} matches ASCII characters and provides text processing methods 32  * which operate only on the ASCII characters of a string. 33  * </ul> 34  * 35  * @author Catherine Berry 36  * @author Gregory Kick 37  * @since 7.0 38  */ 39 @GwtCompatible 40 @ElementTypesAreNonnullByDefault 41 public final class Ascii { 42  43  private Ascii() {} 44  45  /* The ASCII control characters, per RFC 20. */ 46  /** 47  * Null ('\0'): The all-zeros character which may serve to accomplish time fill and media fill. 48  * Normally used as a C string terminator. 49  * 50  * <p>Although RFC 20 names this as "Null", note that it is distinct from the C/C++ "NULL" 51  * pointer. 52  * 53  * @since 8.0 54  */ 55  public static final byte NUL = 0; 56  57  /** 58  * Start of Heading: A communication control character used at the beginning of a sequence of 59  * characters which constitute a machine-sensible address or routing information. Such a sequence 60  * is referred to as the "heading." An STX character has the effect of terminating a heading. 61  * 62  * @since 8.0 63  */ 64  public static final byte SOH = 1; 65  66  /** 67  * Start of Text: A communication control character which precedes a sequence of characters that 68  * is to be treated as an entity and entirely transmitted through to the ultimate destination. 69  * Such a sequence is referred to as "text." STX may be used to terminate a sequence of characters 70  * started by SOH. 71  * 72  * @since 8.0 73  */ 74  public static final byte STX = 2; 75  76  /** 77  * End of Text: A communication control character used to terminate a sequence of characters 78  * started with STX and transmitted as an entity. 79  * 80  * @since 8.0 81  */ 82  public static final byte ETX = 3; 83  84  /** 85  * End of Transmission: A communication control character used to indicate the conclusion of a 86  * transmission, which may have contained one or more texts and any associated headings. 87  * 88  * @since 8.0 89  */ 90  public static final byte EOT = 4; 91  92  /** 93  * Enquiry: A communication control character used in data communication systems as a request for 94  * a response from a remote station. It may be used as a "Who Are You" (WRU) to obtain 95  * identification, or may be used to obtain station status, or both. 96  * 97  * @since 8.0 98  */ 99  public static final byte ENQ = 5; 100  101  /** 102  * Acknowledge: A communication control character transmitted by a receiver as an affirmative 103  * response to a sender. 104  * 105  * @since 8.0 106  */ 107  public static final byte ACK = 6; 108  109  /** 110  * Bell ('\a'): A character for use when there is a need to call for human attention. It may 111  * control alarm or attention devices. 112  * 113  * @since 8.0 114  */ 115  public static final byte BEL = 7; 116  117  /** 118  * Backspace ('\b'): A format effector which controls the movement of the printing position one 119  * printing space backward on the same printing line. (Applicable also to display devices.) 120  * 121  * @since 8.0 122  */ 123  public static final byte BS = 8; 124  125  /** 126  * Horizontal Tabulation ('\t'): A format effector which controls the movement of the printing 127  * position to the next in a series of predetermined positions along the printing line. 128  * (Applicable also to display devices and the skip function on punched cards.) 129  * 130  * @since 8.0 131  */ 132  public static final byte HT = 9; 133  134  /** 135  * Line Feed ('\n'): A format effector which controls the movement of the printing position to the 136  * next printing line. (Applicable also to display devices.) Where appropriate, this character may 137  * have the meaning "New Line" (NL), a format effector which controls the movement of the printing 138  * point to the first printing position on the next printing line. Use of this convention requires 139  * agreement between sender and recipient of data. 140  * 141  * @since 8.0 142  */ 143  public static final byte LF = 10; 144  145  /** 146  * Alternate name for {@link #LF}. ({@code LF} is preferred.) 147  * 148  * @since 8.0 149  */ 150  public static final byte NL = 10; 151  152  /** 153  * Vertical Tabulation ('\v'): A format effector which controls the movement of the printing 154  * position to the next in a series of predetermined printing lines. (Applicable also to display 155  * devices.) 156  * 157  * @since 8.0 158  */ 159  public static final byte VT = 11; 160  161  /** 162  * Form Feed ('\f'): A format effector which controls the movement of the printing position to the 163  * first pre-determined printing line on the next form or page. (Applicable also to display 164  * devices.) 165  * 166  * @since 8.0 167  */ 168  public static final byte FF = 12; 169  170  /** 171  * Carriage Return ('\r'): A format effector which controls the movement of the printing position 172  * to the first printing position on the same printing line. (Applicable also to display devices.) 173  * 174  * @since 8.0 175  */ 176  public static final byte CR = 13; 177  178  /** 179  * Shift Out: A control character indicating that the code combinations which follow shall be 180  * interpreted as outside of the character set of the standard code table until a Shift In 181  * character is reached. 182  * 183  * @since 8.0 184  */ 185  public static final byte SO = 14; 186  187  /** 188  * Shift In: A control character indicating that the code combinations which follow shall be 189  * interpreted according to the standard code table. 190  * 191  * @since 8.0 192  */ 193  public static final byte SI = 15; 194  195  /** 196  * Data Link Escape: A communication control character which will change the meaning of a limited 197  * number of contiguously following characters. It is used exclusively to provide supplementary 198  * controls in data communication networks. 199  * 200  * @since 8.0 201  */ 202  public static final byte DLE = 16; 203  204  /** 205  * Device Control 1. Characters for the control of ancillary devices associated with data 206  * processing or telecommunication systems, more especially switching devices "on" or "off." (If a 207  * single "stop" control is required to interrupt or turn off ancillary devices, DC4 is the 208  * preferred assignment.) 209  * 210  * @since 8.0 211  */ 212  public static final byte DC1 = 17; // aka XON 213  214  /** 215  * Transmission On: Although originally defined as DC1, this ASCII control character is now better 216  * known as the XON code used for software flow control in serial communications. The main use is 217  * restarting the transmission after the communication has been stopped by the XOFF control code. 218  * 219  * @since 8.0 220  */ 221  public static final byte XON = 17; // aka DC1 222  223  /** 224  * Device Control 2. Characters for the control of ancillary devices associated with data 225  * processing or telecommunication systems, more especially switching devices "on" or "off." (If a 226  * single "stop" control is required to interrupt or turn off ancillary devices, DC4 is the 227  * preferred assignment.) 228  * 229  * @since 8.0 230  */ 231  public static final byte DC2 = 18; 232  233  /** 234  * Device Control 3. Characters for the control of ancillary devices associated with data 235  * processing or telecommunication systems, more especially switching devices "on" or "off." (If a 236  * single "stop" control is required to interrupt or turn off ancillary devices, DC4 is the 237  * preferred assignment.) 238  * 239  * @since 8.0 240  */ 241  public static final byte DC3 = 19; // aka XOFF 242  243  /** 244  * Transmission off. See {@link #XON} for explanation. 245  * 246  * @since 8.0 247  */ 248  public static final byte XOFF = 19; // aka DC3 249  250  /** 251  * Device Control 4. Characters for the control of ancillary devices associated with data 252  * processing or telecommunication systems, more especially switching devices "on" or "off." (If a 253  * single "stop" control is required to interrupt or turn off ancillary devices, DC4 is the 254  * preferred assignment.) 255  * 256  * @since 8.0 257  */ 258  public static final byte DC4 = 20; 259  260  /** 261  * Negative Acknowledge: A communication control character transmitted by a receiver as a negative 262  * response to the sender. 263  * 264  * @since 8.0 265  */ 266  public static final byte NAK = 21; 267  268  /** 269  * Synchronous Idle: A communication control character used by a synchronous transmission system 270  * in the absence of any other character to provide a signal from which synchronism may be 271  * achieved or retained. 272  * 273  * @since 8.0 274  */ 275  public static final byte SYN = 22; 276  277  /** 278  * End of Transmission Block: A communication control character used to indicate the end of a 279  * block of data for communication purposes. ETB is used for blocking data where the block 280  * structure is not necessarily related to the processing format. 281  * 282  * @since 8.0 283  */ 284  public static final byte ETB = 23; 285  286  /** 287  * Cancel: A control character used to indicate that the data with which it is sent is in error or 288  * is to be disregarded. 289  * 290  * @since 8.0 291  */ 292  public static final byte CAN = 24; 293  294  /** 295  * End of Medium: A control character associated with the sent data which may be used to identify 296  * the physical end of the medium, or the end of the used, or wanted, portion of information 297  * recorded on a medium. (The position of this character does not necessarily correspond to the 298  * physical end of the medium.) 299  * 300  * @since 8.0 301  */ 302  public static final byte EM = 25; 303  304  /** 305  * Substitute: A character that may be substituted for a character which is determined to be 306  * invalid or in error. 307  * 308  * @since 8.0 309  */ 310  public static final byte SUB = 26; 311  312  /** 313  * Escape: A control character intended to provide code extension (supplementary characters) in 314  * general information interchange. The Escape character itself is a prefix affecting the 315  * interpretation of a limited number of contiguously following characters. 316  * 317  * @since 8.0 318  */ 319  public static final byte ESC = 27; 320  321  /** 322  * File Separator: These four information separators may be used within data in optional fashion, 323  * except that their hierarchical relationship shall be: FS is the most inclusive, then GS, then 324  * RS, and US is least inclusive. (The content and length of a File, Group, Record, or Unit are 325  * not specified.) 326  * 327  * @since 8.0 328  */ 329  public static final byte FS = 28; 330  331  /** 332  * Group Separator: These four information separators may be used within data in optional fashion, 333  * except that their hierarchical relationship shall be: FS is the most inclusive, then GS, then 334  * RS, and US is least inclusive. (The content and length of a File, Group, Record, or Unit are 335  * not specified.) 336  * 337  * @since 8.0 338  */ 339  public static final byte GS = 29; 340  341  /** 342  * Record Separator: These four information separators may be used within data in optional 343  * fashion, except that their hierarchical relationship shall be: FS is the most inclusive, then 344  * GS, then RS, and US is least inclusive. (The content and length of a File, Group, Record, or 345  * Unit are not specified.) 346  * 347  * @since 8.0 348  */ 349  public static final byte RS = 30; 350  351  /** 352  * Unit Separator: These four information separators may be used within data in optional fashion, 353  * except that their hierarchical relationship shall be: FS is the most inclusive, then GS, then 354  * RS, and US is least inclusive. (The content and length of a File, Group, Record, or Unit are 355  * not specified.) 356  * 357  * @since 8.0 358  */ 359  public static final byte US = 31; 360  361  /** 362  * Space: A normally non-printing graphic character used to separate words. It is also a format 363  * effector which controls the movement of the printing position, one printing position forward. 364  * (Applicable also to display devices.) 365  * 366  * @since 8.0 367  */ 368  public static final byte SP = 32; 369  370  /** 371  * Alternate name for {@link #SP}. 372  * 373  * @since 8.0 374  */ 375  public static final byte SPACE = 32; 376  377  /** 378  * Delete: This character is used primarily to "erase" or "obliterate" erroneous or unwanted 379  * characters in perforated tape. 380  * 381  * @since 8.0 382  */ 383  public static final byte DEL = 127; 384  385  /** 386  * The minimum value of an ASCII character. 387  * 388  * @since 9.0 (was type {@code int} before 12.0) 389  */ 390  public static final char MIN = 0; 391  392  /** 393  * The maximum value of an ASCII character. 394  * 395  * @since 9.0 (was type {@code int} before 12.0) 396  */ 397  public static final char MAX = 127; 398  399  /** A bit mask which selects the bit encoding ASCII character case. */ 400  private static final char CASE_MASK = 0x20; 401  402  /** 403  * Returns a copy of the input string in which all {@linkplain #isUpperCase(char) uppercase ASCII 404  * characters} have been converted to lowercase. All other characters are copied without 405  * modification. 406  */ 407  public static String toLowerCase(String string) { 408  int length = string.length(); 409  for (int i = 0; i < length; i++) { 410  if (isUpperCase(string.charAt(i))) { 411  char[] chars = string.toCharArray(); 412  for (; i < length; i++) { 413  char c = chars[i]; 414  if (isUpperCase(c)) { 415  chars[i] = (char) (c ^ CASE_MASK); 416  } 417  } 418  return String.valueOf(chars); 419  } 420  } 421  return string; 422  } 423  424  /** 425  * Returns a copy of the input character sequence in which all {@linkplain #isUpperCase(char) 426  * uppercase ASCII characters} have been converted to lowercase. All other characters are copied 427  * without modification. 428  * 429  * @since 14.0 430  */ 431  public static String toLowerCase(CharSequence chars) { 432  if (chars instanceof String) { 433  return toLowerCase((String) chars); 434  } 435  char[] newChars = new char[chars.length()]; 436  for (int i = 0; i < newChars.length; i++) { 437  newChars[i] = toLowerCase(chars.charAt(i)); 438  } 439  return String.valueOf(newChars); 440  } 441  442  /** 443  * If the argument is an {@linkplain #isUpperCase(char) uppercase ASCII character} returns the 444  * lowercase equivalent. Otherwise returns the argument. 445  */ 446  public static char toLowerCase(char c) { 447  return isUpperCase(c) ? (char) (c ^ CASE_MASK) : c; 448  } 449  450  /** 451  * Returns a copy of the input string in which all {@linkplain #isLowerCase(char) lowercase ASCII 452  * characters} have been converted to uppercase. All other characters are copied without 453  * modification. 454  */ 455  public static String toUpperCase(String string) { 456  int length = string.length(); 457  for (int i = 0; i < length; i++) { 458  if (isLowerCase(string.charAt(i))) { 459  char[] chars = string.toCharArray(); 460  for (; i < length; i++) { 461  char c = chars[i]; 462  if (isLowerCase(c)) { 463  chars[i] = (char) (c ^ CASE_MASK); 464  } 465  } 466  return String.valueOf(chars); 467  } 468  } 469  return string; 470  } 471  472  /** 473  * Returns a copy of the input character sequence in which all {@linkplain #isLowerCase(char) 474  * lowercase ASCII characters} have been converted to uppercase. All other characters are copied 475  * without modification. 476  * 477  * @since 14.0 478  */ 479  public static String toUpperCase(CharSequence chars) { 480  if (chars instanceof String) { 481  return toUpperCase((String) chars); 482  } 483  char[] newChars = new char[chars.length()]; 484  for (int i = 0; i < newChars.length; i++) { 485  newChars[i] = toUpperCase(chars.charAt(i)); 486  } 487  return String.valueOf(newChars); 488  } 489  490  /** 491  * If the argument is a {@linkplain #isLowerCase(char) lowercase ASCII character} returns the 492  * uppercase equivalent. Otherwise returns the argument. 493  */ 494  public static char toUpperCase(char c) { 495  return isLowerCase(c) ? (char) (c ^ CASE_MASK) : c; 496  } 497  498  /** 499  * Indicates whether {@code c} is one of the twenty-six lowercase ASCII alphabetic characters 500  * between {@code 'a'} and {@code 'z'} inclusive. All others (including non-ASCII characters) 501  * return {@code false}. 502  */ 503  public static boolean isLowerCase(char c) { 504  // Note: This was benchmarked against the alternate expression "(char)(c - 'a') < 26" (Nov '13) 505  // and found to perform at least as well, or better. 506  return (c >= 'a') && (c <= 'z'); 507  } 508  509  /** 510  * Indicates whether {@code c} is one of the twenty-six uppercase ASCII alphabetic characters 511  * between {@code 'A'} and {@code 'Z'} inclusive. All others (including non-ASCII characters) 512  * return {@code false}. 513  */ 514  public static boolean isUpperCase(char c) { 515  return (c >= 'A') && (c <= 'Z'); 516  } 517  518  /** 519  * Truncates the given character sequence to the given maximum length. If the length of the 520  * sequence is greater than {@code maxLength}, the returned string will be exactly {@code 521  * maxLength} chars in length and will end with the given {@code truncationIndicator}. Otherwise, 522  * the sequence will be returned as a string with no changes to the content. 523  * 524  * <p>Examples: 525  * 526  * <pre>{@code 527  * Ascii.truncate("foobar", 7, "..."); // returns "foobar" 528  * Ascii.truncate("foobar", 5, "..."); // returns "fo..." 529  * }</pre> 530  * 531  * <p><b>Note:</b> This method <i>may</i> work with certain non-ASCII text but is not safe for use 532  * with arbitrary Unicode text. It is mostly intended for use with text that is known to be safe 533  * for use with it (such as all-ASCII text) and for simple debugging text. When using this method, 534  * consider the following: 535  * 536  * <ul> 537  * <li>it may split surrogate pairs 538  * <li>it may split characters and combining characters 539  * <li>it does not consider word boundaries 540  * <li>if truncating for display to users, there are other considerations that must be taken 541  * into account 542  * <li>the appropriate truncation indicator may be locale-dependent 543  * <li>it is safe to use non-ASCII characters in the truncation indicator 544  * </ul> 545  * 546  * @throws IllegalArgumentException if {@code maxLength} is less than the length of {@code 547  * truncationIndicator} 548  * @since 16.0 549  */ 550  public static String truncate(CharSequence seq, int maxLength, String truncationIndicator) { 551  checkNotNull(seq); 552  553  // length to truncate the sequence to, not including the truncation indicator 554  int truncationLength = maxLength - truncationIndicator.length(); 555  556  // in this worst case, this allows a maxLength equal to the length of the truncationIndicator, 557  // meaning that a string will be truncated to just the truncation indicator itself 558  checkArgument( 559  truncationLength >= 0, 560  "maxLength (%s) must be >= length of the truncation indicator (%s)", 561  maxLength, 562  truncationIndicator.length()); 563  564  if (seq.length() <= maxLength) { 565  String string = seq.toString(); 566  if (string.length() <= maxLength) { 567  return string; 568  } 569  // if the length of the toString() result was > maxLength for some reason, truncate that 570  seq = string; 571  } 572  573  return new StringBuilder(maxLength) 574  .append(seq, 0, truncationLength) 575  .append(truncationIndicator) 576  .toString(); 577  } 578  579  /** 580  * Indicates whether the contents of the given character sequences {@code s1} and {@code s2} are 581  * equal, ignoring the case of any ASCII alphabetic characters between {@code 'a'} and {@code 'z'} 582  * or {@code 'A'} and {@code 'Z'} inclusive. 583  * 584  * <p>This method is significantly faster than {@link String#equalsIgnoreCase} and should be used 585  * in preference if at least one of the parameters is known to contain only ASCII characters. 586  * 587  * <p>Note however that this method does not always behave identically to expressions such as: 588  * 589  * <ul> 590  * <li>{@code string.toUpperCase().equals("UPPER CASE ASCII")} 591  * <li>{@code string.toLowerCase().equals("lower case ascii")} 592  * </ul> 593  * 594  * <p>due to case-folding of some non-ASCII characters (which does not occur in {@link 595  * String#equalsIgnoreCase}). However in almost all cases that ASCII strings are used, the author 596  * probably wanted the behavior provided by this method rather than the subtle and sometimes 597  * surprising behavior of {@code toUpperCase()} and {@code toLowerCase()}. 598  * 599  * @since 16.0 600  */ 601  public static boolean equalsIgnoreCase(CharSequence s1, CharSequence s2) { 602  // Calling length() is the null pointer check (so do it before we can exit early). 603  int length = s1.length(); 604  if (s1 == s2) { 605  return true; 606  } 607  if (length != s2.length()) { 608  return false; 609  } 610  for (int i = 0; i < length; i++) { 611  char c1 = s1.charAt(i); 612  char c2 = s2.charAt(i); 613  if (c1 == c2) { 614  continue; 615  } 616  int alphaIndex = getAlphaIndex(c1); 617  // This was also benchmarked using '&' to avoid branching (but always evaluate the rhs), 618  // however this showed no obvious improvement. 619  if (alphaIndex < 26 && alphaIndex == getAlphaIndex(c2)) { 620  continue; 621  } 622  return false; 623  } 624  return true; 625  } 626  627  /** 628  * Returns the non-negative index value of the alpha character {@code c}, regardless of case. Ie, 629  * 'a'/'A' returns 0 and 'z'/'Z' returns 25. Non-alpha characters return a value of 26 or greater. 630  */ 631  private static int getAlphaIndex(char c) { 632  // Fold upper-case ASCII to lower-case and make zero-indexed and unsigned (by casting to char). 633  return (char) ((c | CASE_MASK) - 'a'); 634  } 635 }