1 files changed, 238 insertions, 3 deletions
diff --git a/src/google/protobuf/io/tokenizer_unittest.cc b/src/google/protobuf/io/tokenizer_unittest.cc
index 106d080f..8de43939 100644
--- a/src/google/protobuf/io/tokenizer_unittest.cc
+++ b/src/google/protobuf/io/tokenizer_unittest.cc
@@ -32,9 +32,10 @@
 //  Based on original Protocol Buffers design by
 //  Sanjay Ghemawat, Jeff Dean, and others.
 
-#include <vector>
-#include <math.h>
 #include <limits.h>
+#include <math.h>
+
+#include <vector>
 
 #include <google/protobuf/io/tokenizer.h>
 #include <google/protobuf/io/zero_copy_stream_impl.h>
@@ -514,6 +515,217 @@ TEST_1D(TokenizerTest, ShCommentStyle, kBlockSizes) {
 
 // -------------------------------------------------------------------
 
+// In each case, the input is expected to have two tokens named "prev" and
+// "next" with comments in between.
+struct DocCommentCase {
+  string input;
+
+  const char* prev_trailing_comments;
+  const char* detached_comments[10];
+  const char* next_leading_comments;
+};
+
+inline ostream& operator<<(ostream& out,
+                           const DocCommentCase& test_case) {
+  return out << CEscape(test_case.input);
+}
+
+DocCommentCase kDocCommentCases[] = {
+  {
+    "prev next",
+
+    "",
+    {},
+    ""
+      },
+
+        {
+      "prev /* ignored */ next",
+
+      "",
+      {},
+      ""
+        },
+
+          {
+        "prev // trailing comment\n"
+            "next",
+
+            " trailing comment\n",
+            {},
+            ""
+          },
+
+            {
+          "prev\n"
+              "// leading comment\n"
+              "// line 2\n"
+              "next",
+
+              "",
+              {},
+              " leading comment\n"
+              " line 2\n"
+            },
+
+              {
+            "prev\n"
+                "// trailing comment\n"
+                "// line 2\n"
+                "\n"
+                "next",
+
+                " trailing comment\n"
+                " line 2\n",
+                {},
+                ""
+              },
+
+                {
+              "prev // trailing comment\n"
+                  "// leading comment\n"
+                  "// line 2\n"
+                  "next",
+
+                  " trailing comment\n",
+                  {},
+                  " leading comment\n"
+                  " line 2\n"
+                },
+
+                  {
+                "prev /* trailing block comment */\n"
+                    "/* leading block comment\n"
+                    " * line 2\n"
+                    " * line 3 */"
+                    "next",
+
+                    " trailing block comment ",
+                    {},
+                    " leading block comment\n"
+                    " line 2\n"
+                    " line 3 "
+                  },
+
+                    {
+                  "prev\n"
+                      "/* trailing block comment\n"
+                      " * line 2\n"
+                      " * line 3\n"
+                      " */\n"
+                      "/* leading block comment\n"
+                      " * line 2\n"
+                      " * line 3 */"
+                      "next",
+
+                      " trailing block comment\n"
+                      " line 2\n"
+                      " line 3\n",
+                      {},
+                      " leading block comment\n"
+                      " line 2\n"
+                      " line 3 "
+                    },
+
+                      {
+                    "prev\n"
+                        "// trailing comment\n"
+                        "\n"
+                        "// detached comment\n"
+                        "// line 2\n"
+                        "\n"
+                        "// second detached comment\n"
+                        "/* third detached comment\n"
+                        " * line 2 */\n"
+                        "// leading comment\n"
+                        "next",
+
+                        " trailing comment\n",
+                        {
+                      " detached comment\n"
+                          " line 2\n",
+                          " second detached comment\n",
+                          " third detached comment\n"
+                          " line 2 "
+                        },
+                          " leading comment\n"
+                        },
+
+                          {
+                        "prev /**/\n"
+                            "\n"
+                            "// detached comment\n"
+                            "\n"
+                            "// leading comment\n"
+                            "next",
+
+                            "",
+                            {
+                          " detached comment\n"
+                            },
+                              " leading comment\n"
+                            },
+
+                              {
+                            "prev /**/\n"
+                                "// leading comment\n"
+                                "next",
+
+                                "",
+                                {},
+                                " leading comment\n"
+                              },
+                              };
+
+TEST_2D(TokenizerTest, DocComments, kDocCommentCases, kBlockSizes) {
+  // Set up the tokenizer.
+  TestInputStream input(kDocCommentCases_case.input.data(),
+                        kDocCommentCases_case.input.size(),
+                        kBlockSizes_case);
+  TestErrorCollector error_collector;
+  Tokenizer tokenizer(&input, &error_collector);
+
+  // Set up a second tokenizer where we'll pass all NULLs to NextWithComments().
+  TestInputStream input2(kDocCommentCases_case.input.data(),
+                        kDocCommentCases_case.input.size(),
+                        kBlockSizes_case);
+  Tokenizer tokenizer2(&input2, &error_collector);
+
+  tokenizer.Next();
+  tokenizer2.Next();
+
+  EXPECT_EQ("prev", tokenizer.current().text);
+  EXPECT_EQ("prev", tokenizer2.current().text);
+
+  string prev_trailing_comments;
+  vector<string> detached_comments;
+  string next_leading_comments;
+  tokenizer.NextWithComments(&prev_trailing_comments, &detached_comments,
+                             &next_leading_comments);
+  tokenizer2.NextWithComments(NULL, NULL, NULL);
+  EXPECT_EQ("next", tokenizer.current().text);
+  EXPECT_EQ("next", tokenizer2.current().text);
+
+  EXPECT_EQ(kDocCommentCases_case.prev_trailing_comments,
+            prev_trailing_comments);
+
+  for (int i = 0; i < detached_comments.size(); i++) {
+    ASSERT_LT(i, GOOGLE_ARRAYSIZE(kDocCommentCases));
+    ASSERT_TRUE(kDocCommentCases_case.detached_comments[i] != NULL);
+    EXPECT_EQ(kDocCommentCases_case.detached_comments[i],
+              detached_comments[i]);
+  }
+
+  // Verify that we matched all the detached comments.
+  EXPECT_EQ(NULL,
+      kDocCommentCases_case.detached_comments[detached_comments.size()]);
+
+  EXPECT_EQ(kDocCommentCases_case.next_leading_comments,
+            next_leading_comments);
+}
+
+// -------------------------------------------------------------------
+
 // Test parse helpers.  It's not really worth setting up a full data-driven
 // test here.
 TEST_F(TokenizerTest, ParseInteger) {
@@ -614,6 +826,22 @@ TEST_F(TokenizerTest, ParseString) {
   Tokenizer::ParseString("'\\", &output);
   EXPECT_EQ("\\", output);
 
+  // Experiment with Unicode escapes. Here are one-, two- and three-byte Unicode
+  // characters.
+  Tokenizer::ParseString("'\\u0024\\u00a2\\u20ac\\U00024b62XX'", &output);
+  EXPECT_EQ("$¢€𤭢XX", output);
+  // Same thing encoded using UTF16.
+  Tokenizer::ParseString("'\\u0024\\u00a2\\u20ac\\ud852\\udf62XX'", &output);
+  EXPECT_EQ("$¢€𤭢XX", output);
+  // Here's some broken UTF16; there's a head surrogate with no tail surrogate.
+  // We just output this as if it were UTF8; it's not a defined code point, but
+  // it has a defined encoding.
+  Tokenizer::ParseString("'\\ud852XX'", &output);
+  EXPECT_EQ("\xed\xa1\x92XX", output);
+  // Malformed escape: Demons may fly out of the nose.
+  Tokenizer::ParseString("\\u0", &output);
+  EXPECT_EQ("u0", output);
+
   // Test invalid strings that will never be tokenized as strings.
 #ifdef GTEST_HAS_DEATH_TEST  // death tests do not work on Windows yet
   EXPECT_DEBUG_DEATH(Tokenizer::ParseString("", &output),
@@ -658,6 +886,12 @@ ErrorCase kErrorCases[] = {
     "0:4: String literals cannot cross line boundaries.\n" },
   { "'bar\nfoo", true,
     "0:4: String literals cannot cross line boundaries.\n" },
+  { "'\\u01' foo", true,
+    "0:5: Expected four hex digits for \\u escape sequence.\n" },
+  { "'\\u01' foo", true,
+    "0:5: Expected four hex digits for \\u escape sequence.\n" },
+  { "'\\uXYZ' foo", true,
+    "0:3: Expected four hex digits for \\u escape sequence.\n" },
 
   // Integer errors.
   { "123foo", true,
@@ -734,7 +968,7 @@ TEST_2D(TokenizerTest, Errors, kErrorCases, kBlockSizes) {
   }
 
   // Check that the errors match what was expected.
-  EXPECT_EQ(error_collector.text_, kErrorCases_case.errors);
+  EXPECT_EQ(kErrorCases_case.errors, error_collector.text_);
 
   // If the error was recoverable, make sure we saw "foo" after it.
   if (kErrorCases_case.recoverable) {
@@ -760,6 +994,7 @@ TEST_1D(TokenizerTest, BackUpOnDestruction, kBlockSizes) {
   EXPECT_EQ(strlen("foo"), input.ByteCount());
 }
 
+
 }  // namespace
 }  // namespace io
 }  // namespace protobuf