aboutsummaryrefslogblamecommitdiff
path: root/java/core/src/test/java/com/google/protobuf/IsValidUtf8TestUtil.java
blob: 2c086e11b5c59980053f99285801b531a9ec4568 (plain) (tree)
1
2
3

                                                      
                                                  




























                                                                         




                                            
 
                                   





                                          




                                

   
                                                                                       



                                                








                                                                                       






                                                       
 






                                                           




                                                                      



                                                                                                    
     















                                                                    

                                   
                                                                             

        
                                                                                                

                                  
                                                                             

           
                                                            
                                           



                                                                

         
                                                     

                                                      
                                                          


                                                  
                                                              
                                






                                                                                     

                                           
                                                                                  

                
                                                             
                                















                                                                                       
 
                            




                        





















                                                                   
                            


















                                               

                                                                            
 
                                                                                     





                                                         
                                                                                




                  
                                                                                           
    
                                                                 


                                                                            

                                                                                      


     

                                                                                                
    
                                                                 


                                                                            

                                                                                                    
     

                                                                                          













                                                             
                                                   
                                                  

                                                         









                                                                           








                                                             




                                                                            


                    




                                                                       
                                                                                            




                                                                



                                                                                                

                                                        
                                                                       
                                        








                                                                                                   


                                                                       
                                                                                              






                                                              
                                                                                  






                                                                       


                                                                                                    

                         
                                                                 


                                                                            

                                                                                                    
     


                                                                                          

                         


                                                              

                         

                                                              

                                                                    
                                                                                     
























                                                                    
                                                                            

































                                                                 
                                                                                 









                                                                                
                                                                                         



                                                                                      

















                                                        
 
// Protocol Buffers - Google's data interchange format
// Copyright 2008 Google Inc.  All rights reserved.
// https://developers.google.com/protocol-buffers/
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//     * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
//     * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

package com.google.protobuf;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertSame;
import static org.junit.Assert.assertTrue;
import static org.junit.Assert.fail;

import java.lang.ref.SoftReference;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CharsetEncoder;
import java.nio.charset.CoderResult;
import java.nio.charset.CodingErrorAction;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Random;
import java.util.logging.Logger;

/**
 * Shared testing code for {@link IsValidUtf8Test} and {@link IsValidUtf8FourByteTest}.
 *
 * @author jonp@google.com (Jon Perlow)
 * @author martinrb@google.com (Martin Buchholz)
 */
final class IsValidUtf8TestUtil {
  private static final Logger logger = Logger.getLogger(IsValidUtf8TestUtil.class.getName());

  /** Static utility holder; never instantiated. */
  private IsValidUtf8TestUtil() {}

  /** Creates the {@link ByteString} flavor under test from a raw byte array. */
  interface ByteStringFactory {
    /**
     * Returns a {@link ByteString} whose contents are {@code bytes}.
     *
     * @param bytes the bytes the returned string must contain
     */
    ByteString newByteString(byte[] bytes);
  }

  /** Factory producing byte strings via {@link ByteString#wrap(byte[])}. */
  static final ByteStringFactory LITERAL_FACTORY =
      new ByteStringFactory() {
        @Override
        public ByteString newByteString(byte[] bytes) {
          return ByteString.wrap(bytes);
        }
      };

  /** Factory producing {@link NioByteString} instances over a {@link ByteBuffer#wrap heap buffer}. */
  static final ByteStringFactory HEAP_NIO_FACTORY =
      new ByteStringFactory() {
        @Override
        public ByteString newByteString(byte[] bytes) {
          return new NioByteString(ByteBuffer.wrap(bytes));
        }
      };

  // Per-thread cache of the direct buffer reused by DIRECT_NIO_FACTORY. Held softly so the
  // direct memory can be reclaimed under memory pressure.
  private static final ThreadLocal<SoftReference<ByteBuffer>> directBuffer =
      new ThreadLocal<SoftReference<ByteBuffer>>();

  /**
   * Factory for direct {@link ByteBuffer} instances. To reduce direct memory usage, this uses a
   * thread local direct buffer. This means that each call will overwrite the buffer's contents from
   * the previous call, so the calling code must be careful not to continue using a buffer returned
   * from a previous invocation.
   */
  static final ByteStringFactory DIRECT_NIO_FACTORY =
      new ByteStringFactory() {
        @Override
        public ByteString newByteString(byte[] bytes) {
          SoftReference<ByteBuffer> ref = directBuffer.get();
          ByteBuffer buffer = ref == null ? null : ref.get();
          // (Re)allocate when nothing is cached, the cache was collected, or it is too small.
          if (buffer == null || buffer.capacity() < bytes.length) {
            buffer = ByteBuffer.allocateDirect(bytes.length);
            directBuffer.set(new SoftReference<ByteBuffer>(buffer));
          }
          // Reset position/limit, copy the payload in, then expose exactly bytes.length bytes.
          buffer.clear();
          buffer.put(bytes);
          buffer.flip();
          return new NioByteString(buffer);
        }
      };

  // 128 - [chars 0x0000 to 0x007f]
  static final long ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS = 0x007f - 0x0000 + 1;

  // 128
  static final long EXPECTED_ONE_BYTE_ROUNDTRIPPABLE_COUNT = ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS;

  // 1920 [chars 0x0080 to 0x07FF]
  static final long TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS = 0x07FF - 0x0080 + 1;

  // 18,304
  static final long EXPECTED_TWO_BYTE_ROUNDTRIPPABLE_COUNT =
      // Both bytes are one byte characters
      EXPECTED_ONE_BYTE_ROUNDTRIPPABLE_COUNT * EXPECTED_ONE_BYTE_ROUNDTRIPPABLE_COUNT
          +
          // The possible number of two byte characters
          TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS;

  // 2048
  static final long THREE_BYTE_SURROGATES = 2 * 1024;

  // 61,440 [chars 0x0800 to 0xFFFF, minus surrogates]
  static final long THREE_BYTE_ROUNDTRIPPABLE_CHARACTERS =
      0xFFFF - 0x0800 + 1 - THREE_BYTE_SURROGATES;

  // 2,650,112
  static final long EXPECTED_THREE_BYTE_ROUNDTRIPPABLE_COUNT =
      // All one byte characters
      EXPECTED_ONE_BYTE_ROUNDTRIPPABLE_COUNT
              * EXPECTED_ONE_BYTE_ROUNDTRIPPABLE_COUNT
              * EXPECTED_ONE_BYTE_ROUNDTRIPPABLE_COUNT
          +
          // One two byte character and a one byte character
          2 * TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS * ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS
          +
          // Three byte characters
          THREE_BYTE_ROUNDTRIPPABLE_CHARACTERS;

  // 1,048,576 [chars 0x10000L to 0x10FFFF]
  static final long FOUR_BYTE_ROUNDTRIPPABLE_CHARACTERS = 0x10FFFF - 0x10000L + 1;

  // 383,270,912 (also the sum of FOUR_BYTE_SHARDS_EXPECTED_ROUNTRIPPABLES)
  static final long EXPECTED_FOUR_BYTE_ROUNDTRIPPABLE_COUNT =
      // All one byte characters
      EXPECTED_ONE_BYTE_ROUNDTRIPPABLE_COUNT
              * EXPECTED_ONE_BYTE_ROUNDTRIPPABLE_COUNT
              * EXPECTED_ONE_BYTE_ROUNDTRIPPABLE_COUNT
              * EXPECTED_ONE_BYTE_ROUNDTRIPPABLE_COUNT
          +
          // One and three byte characters
          2 * THREE_BYTE_ROUNDTRIPPABLE_CHARACTERS * ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS
          +
          // Two two byte characters
          TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS * TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS
          +
          // Permutations of one and two byte characters
          3
              * TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS
              * ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS
              * ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS
          +
          // Four byte characters
          FOUR_BYTE_ROUNDTRIPPABLE_CHARACTERS;

  /**
   * One contiguous slice of the four-byte search space together with the number of
   * round-trippable sequences expected inside it.
   */
  static final class Shard {
    /** Position of this shard in {@link #FOUR_BYTE_SHARDS}. */
    final long index;
    /** First value of the slice (big-endian bytes packed into a long). */
    final long start;
    /** Upper bound of the slice; always strictly greater than {@link #start}. */
    final long lim;
    /** Expected number of round-trippable sequences in {@code [start, lim)}. */
    final long expected;

    public Shard(long index, long start, long lim, long expected) {
      assertTrue(start < lim);
      this.index = index;
      this.start = start;
      this.lim = lim;
      this.expected = expected;
    }
  }

  // NOTE: the identifier is misspelled ("ROUNTRIPPABLES") but is kept as-is because it is visible
  // to other classes in this package.
  static final long[] FOUR_BYTE_SHARDS_EXPECTED_ROUNTRIPPABLES =
      generateFourByteShardsExpectedRoundtrippables();

  /**
   * Builds the per-shard expected counts of round-trippable four-byte sequences.
   *
   * <p>The table has 128 entries, matching the 128 shards built for {@link #FOUR_BYTE_SHARDS}.
   * Each shard spans 2^25 consecutive values, so shard {@code i} holds exactly the sequences whose
   * first byte is {@code 2 * i} or {@code 2 * i + 1}. Entries never assigned here (64-96 and
   * 123-127) keep the array default of 0: they correspond to first bytes 0x80-0xC1 and 0xF6-0xFF,
   * none of which can begin a well-formed UTF-8 sequence.
   */
  private static long[] generateFourByteShardsExpectedRoundtrippables() {
    long[] expected = new long[128];

    // 0-63 are all 5300224
    for (int i = 0; i <= 63; i++) {
      expected[i] = 5300224;
    }

    // 97-111 are all 2342912
    for (int i = 97; i <= 111; i++) {
      expected[i] = 2342912;
    }

    // 113-117 are all 1048576
    for (int i = 113; i <= 117; i++) {
      expected[i] = 1048576;
    }

    // One offs
    expected[112] = 786432;
    expected[118] = 786432;
    expected[119] = 1048576;
    expected[120] = 458752;
    expected[121] = 524288;
    expected[122] = 65536;

    // Anything not assigned was the default 0.
    return expected;
  }

  /** The full four-byte search space split into 128 equally sized, contiguous shards. */
  static final List<Shard> FOUR_BYTE_SHARDS =
      generateFourByteShards(128, FOUR_BYTE_SHARDS_EXPECTED_ROUNTRIPPABLES);

  /**
   * Splits the 2^32 possible four-byte values into {@code numShards} equal, contiguous ranges.
   *
   * @param numShards number of shards; must evenly divide 2^32 and equal {@code expected.length}
   * @param expected expected round-trippable count for each shard, indexed by shard number
   * @return the shards in ascending order of their start value
   */
  private static List<Shard> generateFourByteShards(int numShards, long[] expected) {
    assertEquals(numShards, expected.length);
    long limit = 1L << 32;
    // Every shard must have the same size, so the space has to divide evenly.
    assertTrue(limit % numShards == 0);
    long increment = limit / numShards;
    List<Shard> shards = new ArrayList<Shard>(numShards);
    for (int i = 0; i < numShards; i++) {
      shards.add(new Shard(i, increment * i, increment * (i + 1), expected[i]));
    }
    return shards;
  }

  /**
   * Helper to run the loop to test all the permutations for the number of bytes specified.
   *
   * @param factory the factory for {@link ByteString} instances.
   * @param numBytes the number of bytes in the byte array
   * @param expectedCount the expected number of roundtrippable permutations
   */
  static void testBytes(ByteStringFactory factory, int numBytes, long expectedCount) {
    testBytes(factory, numBytes, expectedCount, 0, -1);
  }

  /**
   * Helper to run the loop to test all the permutations for the number of bytes specified. This
   * overload is useful for debugging to get the loop to start at a certain character.
   *
   * <p>For every candidate sequence the validity reported by the {@link ByteString} under test is
   * compared against a {@link String} decode/re-encode round trip, against the static {@link Utf8}
   * methods, against incremental (partial) validation split at two random points, and against
   * ropes assembled from the same pieces.
   *
   * @param factory the factory for {@link ByteString} instances.
   * @param numBytes the number of bytes in the byte array; must be positive (a random split point
   *     is drawn from {@code [0, numBytes)}) and less than 8 when {@code lim} is -1 so that the
   *     default limit fits in a long
   * @param expectedCount the expected number of roundtrippable permutations
   * @param start the starting bytes encoded as a long as big-endian
   * @param lim the limit of bytes to process encoded as a long as big-endian, or -1 to mean the max
   *     limit for numBytes
   */
  static void testBytes(
      ByteStringFactory factory, int numBytes, long expectedCount, long start, long lim) {
    Random rnd = new Random();
    byte[] bytes = new byte[numBytes];

    if (lim == -1) {
      lim = 1L << (numBytes * 8);
    }
    long count = 0;
    long countRoundTripped = 0;
    for (long byteChar = start; byteChar < lim; byteChar++) {
      fillBigEndian(bytes, byteChar);
      ByteString bs = factory.newByteString(bytes);
      boolean isRoundTrippable = bs.isValidUtf8();
      // Reference answer: the bytes are valid iff decoding and re-encoding reproduces them.
      String s = new String(bytes, Internal.UTF_8);
      byte[] bytesReencoded = s.getBytes(Internal.UTF_8);
      boolean bytesEqual = Arrays.equals(bytes, bytesReencoded);

      if (bytesEqual != isRoundTrippable) {
        outputFailure(byteChar, bytes, bytesReencoded);
      }

      // Check agreement with static Utf8 methods.
      assertEquals(isRoundTrippable, Utf8.isValidUtf8(bytes));
      assertEquals(isRoundTrippable, Utf8.isValidUtf8(bytes, 0, numBytes));

      try {
        assertEquals(s, Utf8.decodeUtf8(bytes, 0, numBytes));
      } catch (InvalidProtocolBufferException e) {
        // A decode failure is only acceptable for sequences that are not valid UTF-8.
        if (isRoundTrippable) {
          System.out.println("Could not decode utf-8");
          outputFailure(byteChar, bytes, bytesReencoded);
        }
      }

      // Test partial sequences.
      // Partition numBytes into three segments (not necessarily non-empty).
      int i = rnd.nextInt(numBytes);
      int j = rnd.nextInt(numBytes);
      if (j < i) {
        int tmp = i;
        i = j;
        j = tmp;
      }
      int state1 = Utf8.partialIsValidUtf8(Utf8.COMPLETE, bytes, 0, i);
      int state2 = Utf8.partialIsValidUtf8(state1, bytes, i, j);
      int state3 = Utf8.partialIsValidUtf8(state2, bytes, j, numBytes);
      if (isRoundTrippable != (state3 == Utf8.COMPLETE)) {
        System.out.printf("state=%04x %04x %04x i=%d j=%d%n", state1, state2, state3, i, j);
        outputFailure(byteChar, bytes, bytesReencoded);
      }
      assertEquals(isRoundTrippable, (state3 == Utf8.COMPLETE));

      // Test ropes built out of small partial sequences
      ByteString rope =
          RopeByteString.newInstanceForTest(
              bs.substring(0, i),
              RopeByteString.newInstanceForTest(bs.substring(i, j), bs.substring(j, numBytes)));
      assertSame(RopeByteString.class, rope.getClass());

      ByteString[] byteStrings = {bs, bs.substring(0, numBytes), rope};
      for (ByteString x : byteStrings) {
        assertEquals(isRoundTrippable, x.isValidUtf8());
        assertEquals(state3, x.partialIsValidUtf8(Utf8.COMPLETE, 0, numBytes));

        // Each segment must yield the same intermediate state whether validated in place
        // (offset/length into x) or on the corresponding substring.
        assertEquals(state1, x.partialIsValidUtf8(Utf8.COMPLETE, 0, i));
        assertEquals(state1, x.substring(0, i).partialIsValidUtf8(Utf8.COMPLETE, 0, i));
        assertEquals(state2, x.partialIsValidUtf8(state1, i, j - i));
        assertEquals(state2, x.substring(i, j).partialIsValidUtf8(state1, 0, j - i));
        assertEquals(state3, x.partialIsValidUtf8(state2, j, numBytes - j));
        assertEquals(state3, x.substring(j, numBytes).partialIsValidUtf8(state2, 0, numBytes - j));
      }

      // ByteString reduplication should not affect its UTF-8 validity.
      ByteString ropeADope = RopeByteString.newInstanceForTest(bs, bs.substring(0, numBytes));
      assertEquals(isRoundTrippable, ropeADope.isValidUtf8());

      if (isRoundTrippable) {
        countRoundTripped++;
      }
      count++;
      if (byteChar != 0 && byteChar % 1000000L == 0) {
        logger.info("Processed " + (byteChar / 1000000L) + " million characters");
      }
    }
    logger.info("Round tripped " + countRoundTripped + " of " + count);
    assertEquals(expectedCount, countRoundTripped);
  }

  /**
   * Variation of {@link #testBytes} that does less allocation using the low-level encoders/decoders
   * directly. Checked in because it's useful for debugging when trying to process bytes faster, but
   * since it doesn't use the actual String class, it's possible for incompatibilities to develop
   * (although unlikely).
   *
   * @param factory the factory for {@link ByteString} instances.
   * @param numBytes the number of bytes in the byte array
   * @param expectedCount the expected number of roundtrippable permutations
   * @param start the starting bytes encoded as a long as big-endian
   * @param lim the limit of bytes to process encoded as a long as big-endian, or -1 to mean the max
   *     limit for numBytes
   */
  static void testBytesUsingByteBuffers(
      ByteStringFactory factory, int numBytes, long expectedCount, long start, long lim) {
    // REPLACE makes malformed input decode to replacement characters instead of reporting an
    // error, so an invalid sequence shows up as a re-encoding mismatch below.
    CharsetDecoder decoder =
        Internal.UTF_8
            .newDecoder()
            .onMalformedInput(CodingErrorAction.REPLACE)
            .onUnmappableCharacter(CodingErrorAction.REPLACE);
    CharsetEncoder encoder =
        Internal.UTF_8
            .newEncoder()
            .onMalformedInput(CodingErrorAction.REPLACE)
            .onUnmappableCharacter(CodingErrorAction.REPLACE);
    byte[] bytes = new byte[numBytes];
    // Worst-case buffer sizes, allocated once and reused for every iteration.
    int maxChars = (int) (decoder.maxCharsPerByte() * numBytes) + 1;
    char[] charsDecoded = new char[maxChars];
    int maxBytes = (int) (encoder.maxBytesPerChar() * maxChars) + 1;
    byte[] bytesReencoded = new byte[maxBytes];

    ByteBuffer bb = ByteBuffer.wrap(bytes);
    CharBuffer cb = CharBuffer.wrap(charsDecoded);
    ByteBuffer bbReencoded = ByteBuffer.wrap(bytesReencoded);
    if (lim == -1) {
      lim = 1L << (numBytes * 8);
    }
    long count = 0;
    long countRoundTripped = 0;
    for (long byteChar = start; byteChar < lim; byteChar++) {
      // Return all buffers and coders to their initial state before reuse.
      bb.rewind();
      bb.limit(bytes.length);
      cb.rewind();
      cb.limit(charsDecoded.length);
      bbReencoded.rewind();
      bbReencoded.limit(bytesReencoded.length);
      encoder.reset();
      decoder.reset();
      fillBigEndian(bytes, byteChar);
      boolean isRoundTrippable = factory.newByteString(bytes).isValidUtf8();
      CoderResult result = decoder.decode(bb, cb, true);
      assertFalse(result.isError());
      result = decoder.flush(cb);
      assertFalse(result.isError());

      // Re-encode exactly the characters that were produced by the decode step.
      int charLen = cb.position();
      cb.rewind();
      cb.limit(charLen);
      result = encoder.encode(cb, bbReencoded, true);
      assertFalse(result.isError());
      result = encoder.flush(bbReencoded);
      assertFalse(result.isError());

      // bytesReencoded is oversized, so compare the length first and then only the used prefix.
      boolean bytesEqual = true;
      int bytesLen = bbReencoded.position();
      if (bytesLen != numBytes) {
        bytesEqual = false;
      } else {
        for (int i = 0; i < numBytes; i++) {
          if (bytes[i] != bytesReencoded[i]) {
            bytesEqual = false;
            break;
          }
        }
      }
      if (bytesEqual != isRoundTrippable) {
        outputFailure(byteChar, bytes, bytesReencoded, bytesLen);
      }

      count++;
      if (isRoundTrippable) {
        countRoundTripped++;
      }
      if (byteChar != 0 && byteChar % 1000000L == 0) {
        logger.info("Processed " + (byteChar / 1000000L) + " million characters");
      }
    }
    logger.info("Round tripped " + countRoundTripped + " of " + count);
    assertEquals(expectedCount, countRoundTripped);
  }

  /**
   * Stores the low-order {@code bytes.length} bytes of {@code value} into {@code bytes} in
   * big-endian order (least significant byte in the last slot).
   */
  private static void fillBigEndian(byte[] bytes, long value) {
    for (int pos = bytes.length - 1; pos >= 0; pos--) {
      bytes[pos] = (byte) value;
      value = value >> 8;
    }
  }

  /** Fails the test, reporting the input bytes and the whole of {@code after}. */
  private static void outputFailure(long byteChar, byte[] bytes, byte[] after) {
    outputFailure(byteChar, bytes, after, after.length);
  }

  /**
   * Fails the test with a message showing the packed value, the input bytes and the first
   * {@code len} bytes of the re-encoded output, all in hexadecimal.
   */
  private static void outputFailure(long byteChar, byte[] bytes, byte[] after, int len) {
    fail(
        String.format(
            "Failure: (%s) %s => %s",
            Long.toHexString(byteChar), toHexString(bytes), toHexString(after, len)));
  }

  /** Formats all of {@code b} as a quoted, space-separated list of two-digit hex values. */
  private static String toHexString(byte[] b) {
    return toHexString(b, b.length);
  }

  /** Formats the first {@code len} bytes of {@code b} as quoted, space-separated hex pairs. */
  private static String toHexString(byte[] b, int len) {
    StringBuilder s = new StringBuilder();
    s.append("\"");
    for (int i = 0; i < len; i++) {
      if (i > 0) {
        s.append(" ");
      }
      // Mask to an unsigned value so negative bytes print as two digits, not sign-extended.
      s.append(String.format("%02x", b[i] & 0xFF));
    }
    s.append("\"");
    return s.toString();
  }
}