diff options
Diffstat (limited to 'src/google/protobuf/io')
-rw-r--r-- | src/google/protobuf/io/coded_stream.cc | 48 | ||||
-rw-r--r-- | src/google/protobuf/io/coded_stream.h | 47 | ||||
-rw-r--r-- | src/google/protobuf/io/coded_stream_inl.h | 2 | ||||
-rw-r--r-- | src/google/protobuf/io/coded_stream_unittest.cc | 62 | ||||
-rw-r--r-- | src/google/protobuf/io/gzip_stream.cc | 19 | ||||
-rw-r--r-- | src/google/protobuf/io/gzip_stream.h | 14 | ||||
-rw-r--r-- | src/google/protobuf/io/printer.cc | 7 | ||||
-rw-r--r-- | src/google/protobuf/io/printer_unittest.cc | 26 | ||||
-rw-r--r-- | src/google/protobuf/io/tokenizer.cc | 497 | ||||
-rw-r--r-- | src/google/protobuf/io/tokenizer.h | 85 | ||||
-rw-r--r-- | src/google/protobuf/io/tokenizer_unittest.cc | 241 | ||||
-rw-r--r-- | src/google/protobuf/io/zero_copy_stream_impl.cc | 3 | ||||
-rw-r--r-- | src/google/protobuf/io/zero_copy_stream_impl_lite.cc | 4 | ||||
-rw-r--r-- | src/google/protobuf/io/zero_copy_stream_unittest.cc | 94 |
14 files changed, 1030 insertions, 119 deletions
diff --git a/src/google/protobuf/io/coded_stream.cc b/src/google/protobuf/io/coded_stream.cc index 402a3ad3..36add8c3 100644 --- a/src/google/protobuf/io/coded_stream.cc +++ b/src/google/protobuf/io/coded_stream.cc @@ -43,7 +43,7 @@ #include <limits.h> #include <google/protobuf/io/zero_copy_stream.h> #include <google/protobuf/stubs/common.h> -#include <google/protobuf/stubs/stl_util-inl.h> +#include <google/protobuf/stubs/stl_util.h> namespace google { @@ -69,6 +69,19 @@ inline bool NextNonEmpty(ZeroCopyInputStream* input, // CodedInputStream ================================================== +CodedInputStream::~CodedInputStream() { + if (input_ != NULL) { + BackUpInputToCurrentPosition(); + } + + if (total_bytes_warning_threshold_ == -2) { + GOOGLE_LOG(WARNING) << "The total number of bytes read was " << total_bytes_read_; + } +} + +// Static. +int CodedInputStream::default_recursion_limit_ = 100; + void CodedInputStream::BackUpInputToCurrentPosition() { int backup_bytes = BufferSize() + buffer_size_after_limit_ + overflow_bytes_; @@ -98,8 +111,7 @@ inline void CodedInputStream::RecomputeBufferLimits() { CodedInputStream::Limit CodedInputStream::PushLimit(int byte_limit) { // Current position relative to the beginning of the stream. 
- int current_position = total_bytes_read_ - - (BufferSize() + buffer_size_after_limit_); + int current_position = CurrentPosition(); Limit old_limit = current_limit_; @@ -133,10 +145,9 @@ void CodedInputStream::PopLimit(Limit limit) { legitimate_message_end_ = false; } -int CodedInputStream::BytesUntilLimit() { +int CodedInputStream::BytesUntilLimit() const { if (current_limit_ == INT_MAX) return -1; - int current_position = total_bytes_read_ - - (BufferSize() + buffer_size_after_limit_); + int current_position = CurrentPosition(); return current_limit_ - current_position; } @@ -145,10 +156,14 @@ void CodedInputStream::SetTotalBytesLimit( int total_bytes_limit, int warning_threshold) { // Make sure the limit isn't already past, since this could confuse other // code. - int current_position = total_bytes_read_ - - (BufferSize() + buffer_size_after_limit_); + int current_position = CurrentPosition(); total_bytes_limit_ = max(current_position, total_bytes_limit); - total_bytes_warning_threshold_ = warning_threshold; + if (warning_threshold >= 0) { + total_bytes_warning_threshold_ = warning_threshold; + } else { + // warning_threshold is negative + total_bytes_warning_threshold_ = -1; + } RecomputeBufferLimits(); } @@ -368,16 +383,17 @@ uint32 CodedInputStream::ReadTagSlow() { // For the slow path, just do a 64-bit read. Try to optimize for one-byte tags // again, since we have now refreshed the buffer. - uint64 result; + uint64 result = 0; if (!ReadVarint64(&result)) return 0; return static_cast<uint32>(result); } uint32 CodedInputStream::ReadTagFallback() { - if (BufferSize() >= kMaxVarintBytes || + const int buf_size = BufferSize(); + if (buf_size >= kMaxVarintBytes || // Optimization: If the varint ends at exactly the end of the buffer, // we can detect that and still use the fast path. 
- (buffer_end_ > buffer_ && !(buffer_end_[-1] & 0x80))) { + (buf_size > 0 && !(buffer_end_[-1] & 0x80))) { uint32 tag; const uint8* end = ReadVarint32FromArray(buffer_, &tag); if (end == NULL) { @@ -388,7 +404,9 @@ uint32 CodedInputStream::ReadTagFallback() { } else { // We are commonly at a limit when attempting to read tags. Try to quickly // detect this case without making another function call. - if (buffer_ == buffer_end_ && buffer_size_after_limit_ > 0 && + if ((buf_size == 0) && + ((buffer_size_after_limit_ > 0) || + (total_bytes_read_ == current_limit_)) && // Make sure that the limit we hit is not total_bytes_limit_, since // in that case we still need to call Refresh() so that it prints an // error. @@ -492,8 +510,8 @@ bool CodedInputStream::Refresh() { "CodedInputStream::SetTotalBytesLimit() in " "google/protobuf/io/coded_stream.h."; - // Don't warn again for this stream. - total_bytes_warning_threshold_ = -1; + // Don't warn again for this stream, and print total size at the end. + total_bytes_warning_threshold_ = -2; } const void* void_buffer; diff --git a/src/google/protobuf/io/coded_stream.h b/src/google/protobuf/io/coded_stream.h index 97ac5079..66cbee00 100644 --- a/src/google/protobuf/io/coded_stream.h +++ b/src/google/protobuf/io/coded_stream.h @@ -170,6 +170,9 @@ class LIBPROTOBUF_EXPORT CodedInputStream { // successfully and the stream's byte limit. ~CodedInputStream(); + // Return true if this CodedInputStream reads from a flat array instead of + // a ZeroCopyInputStream. + inline bool IsFlat() const; // Skips a number of bytes. Returns false if an underlying read error // occurs. @@ -311,7 +314,10 @@ class LIBPROTOBUF_EXPORT CodedInputStream { // Returns the number of bytes left until the nearest limit on the // stack is hit, or -1 if no limits are in place. - int BytesUntilLimit(); + int BytesUntilLimit() const; + + // Returns current position relative to the beginning of the input stream. 
+ int CurrentPosition() const; // Total Bytes Limit ----------------------------------------------- // To prevent malicious users from sending excessively large messages @@ -327,8 +333,9 @@ class LIBPROTOBUF_EXPORT CodedInputStream { // cause integer overflows is 512MB. The default limit is 64MB. Apps // should set shorter limits if possible. If warning_threshold is not -1, // a warning will be printed to stderr after warning_threshold bytes are - // read. An error will always be printed to stderr if the limit is - // reached. + // read. For backwards compatibility all negative values get squached to -1, + // as other negative values might have special internal meanings. + // An error will always be printed to stderr if the limit is reached. // // This is unrelated to PushLimit()/PopLimit(). // @@ -355,9 +362,10 @@ class LIBPROTOBUF_EXPORT CodedInputStream { // messages and groups. CodedInputStream keeps track of this because it // is the only object that is passed down the stack during parsing. - // Sets the maximum recursion depth. The default is 64. + // Sets the maximum recursion depth. The default is 100. void SetRecursionLimit(int limit); + // Increments the current recursion depth. Returns true if the depth is // under the limit, false if it has gone over. bool IncrementRecursionDepth(); @@ -433,7 +441,8 @@ class LIBPROTOBUF_EXPORT CodedInputStream { // // Note that this feature is ignored when parsing "lite" messages as they do // not have descriptors. - void SetExtensionRegistry(DescriptorPool* pool, MessageFactory* factory); + void SetExtensionRegistry(const DescriptorPool* pool, + MessageFactory* factory); // Get the DescriptorPool set via SetExtensionRegistry(), or NULL if no pool // has been provided. @@ -482,6 +491,11 @@ class LIBPROTOBUF_EXPORT CodedInputStream { // Maximum number of bytes to read, period. This is unrelated to // current_limit_. Set using SetTotalBytesLimit(). 
int total_bytes_limit_; + + // If positive/0: Limit for bytes read after which a warning due to size + // should be logged. + // If -1: Printing of warning disabled. Can be set by client. + // If -2: Internal: Limit has been reached, print full size when destructing. int total_bytes_warning_threshold_; // Current recursion depth, controlled by IncrementRecursionDepth() and @@ -539,7 +553,8 @@ class LIBPROTOBUF_EXPORT CodedInputStream { static const int kDefaultTotalBytesLimit = 64 << 20; // 64MB static const int kDefaultTotalBytesWarningThreshold = 32 << 20; // 32MB - static const int kDefaultRecursionLimit = 64; + + static int default_recursion_limit_; // 100 by default. }; // Class which encodes and writes binary data which is composed of varint- @@ -891,7 +906,9 @@ inline bool CodedInputStream::ExpectAtEnd() { // If we are at a limit we know no more bytes can be read. Otherwise, it's // hard to say without calling Refresh(), and we'd rather not do that. - if (buffer_ == buffer_end_ && buffer_size_after_limit_ != 0) { + if (buffer_ == buffer_end_ && + ((buffer_size_after_limit_ != 0) || + (total_bytes_read_ == current_limit_))) { last_tag_ = 0; // Pretend we called ReadTag()... legitimate_message_end_ = true; // ... and it hit EOF. 
return true; @@ -900,6 +917,10 @@ inline bool CodedInputStream::ExpectAtEnd() { } } +inline int CodedInputStream::CurrentPosition() const { + return total_bytes_read_ - (BufferSize() + buffer_size_after_limit_); +} + inline uint8* CodedOutputStream::GetDirectBufferForNBytesAndAdvance(int size) { if (buffer_size_ < size) { return NULL; @@ -1039,7 +1060,7 @@ inline void CodedInputStream::DecrementRecursionDepth() { if (recursion_depth_ > 0) --recursion_depth_; } -inline void CodedInputStream::SetExtensionRegistry(DescriptorPool* pool, +inline void CodedInputStream::SetExtensionRegistry(const DescriptorPool* pool, MessageFactory* factory) { extension_pool_ = pool; extension_factory_ = factory; @@ -1071,7 +1092,7 @@ inline CodedInputStream::CodedInputStream(ZeroCopyInputStream* input) total_bytes_limit_(kDefaultTotalBytesLimit), total_bytes_warning_threshold_(kDefaultTotalBytesWarningThreshold), recursion_depth_(0), - recursion_limit_(kDefaultRecursionLimit), + recursion_limit_(default_recursion_limit_), extension_pool_(NULL), extension_factory_(NULL) { // Eagerly Refresh() so buffer space is immediately available. @@ -1092,17 +1113,15 @@ inline CodedInputStream::CodedInputStream(const uint8* buffer, int size) total_bytes_limit_(kDefaultTotalBytesLimit), total_bytes_warning_threshold_(kDefaultTotalBytesWarningThreshold), recursion_depth_(0), - recursion_limit_(kDefaultRecursionLimit), + recursion_limit_(default_recursion_limit_), extension_pool_(NULL), extension_factory_(NULL) { // Note that setting current_limit_ == size is important to prevent some // code paths from trying to access input_ and segfaulting. 
} -inline CodedInputStream::~CodedInputStream() { - if (input_ != NULL) { - BackUpInputToCurrentPosition(); - } +inline bool CodedInputStream::IsFlat() const { + return input_ == NULL; } } // namespace io diff --git a/src/google/protobuf/io/coded_stream_inl.h b/src/google/protobuf/io/coded_stream_inl.h index e9799d47..94495fb8 100644 --- a/src/google/protobuf/io/coded_stream_inl.h +++ b/src/google/protobuf/io/coded_stream_inl.h @@ -38,7 +38,7 @@ #include <google/protobuf/io/coded_stream.h> #include <string> -#include <google/protobuf/stubs/stl_util-inl.h> +#include <google/protobuf/stubs/stl_util.h> namespace google { namespace protobuf { diff --git a/src/google/protobuf/io/coded_stream_unittest.cc b/src/google/protobuf/io/coded_stream_unittest.cc index ff268ab9..2daab194 100644 --- a/src/google/protobuf/io/coded_stream_unittest.cc +++ b/src/google/protobuf/io/coded_stream_unittest.cc @@ -44,7 +44,6 @@ #include <google/protobuf/testing/googletest.h> #include <gtest/gtest.h> #include <google/protobuf/io/zero_copy_stream_impl.h> -#include <google/protobuf/stubs/strutil.h> // This declares an unsigned long long integer literal in a portable way. @@ -125,6 +124,13 @@ namespace { class CodedStreamTest : public testing::Test { protected: + // Helper method used by tests for bytes warning. See implementation comment + // for further information. + static void SetupTotalBytesLimitWarningTest( + int total_bytes_limit, int warning_threshold, + vector<string>* out_errors, vector<string>* out_warnings); + + // Buffer used during most of the tests. This assumes tests run sequentially. static const int kBufferSize = 1024 * 64; static uint8 buffer_[kBufferSize]; }; @@ -1022,6 +1028,59 @@ TEST_F(CodedStreamTest, TotalBytesLimitNotValidMessageEnd) { EXPECT_FALSE(coded_input.ConsumedEntireMessage()); } +// This method is used by the tests below. +// It constructs a CodedInputStream with the given limits and tries to read 2KiB +// of data from it. 
Then it returns the logged errors and warnings in the given +// vectors. +void CodedStreamTest::SetupTotalBytesLimitWarningTest( + int total_bytes_limit, int warning_threshold, + vector<string>* out_errors, vector<string>* out_warnings) { + ArrayInputStream raw_input(buffer_, sizeof(buffer_), 128); + + ScopedMemoryLog scoped_log; + { + CodedInputStream input(&raw_input); + input.SetTotalBytesLimit(total_bytes_limit, warning_threshold); + string str; + EXPECT_TRUE(input.ReadString(&str, 2048)); + } + + *out_errors = scoped_log.GetMessages(ERROR); + *out_warnings = scoped_log.GetMessages(WARNING); +} + +TEST_F(CodedStreamTest, TotalBytesLimitWarning) { + vector<string> errors; + vector<string> warnings; + SetupTotalBytesLimitWarningTest(10240, 1024, &errors, &warnings); + + EXPECT_EQ(0, errors.size()); + + ASSERT_EQ(2, warnings.size()); + EXPECT_PRED_FORMAT2(testing::IsSubstring, + "Reading dangerously large protocol message. If the message turns out to " + "be larger than 10240 bytes, parsing will be halted for security reasons.", + warnings[0]); + EXPECT_PRED_FORMAT2(testing::IsSubstring, + "The total number of bytes read was 2048", + warnings[1]); +} + +TEST_F(CodedStreamTest, TotalBytesLimitWarningDisabled) { + vector<string> errors; + vector<string> warnings; + + // Test with -1 + SetupTotalBytesLimitWarningTest(10240, -1, &errors, &warnings); + EXPECT_EQ(0, errors.size()); + EXPECT_EQ(0, warnings.size()); + + // Test again with -2, expecting the same result + SetupTotalBytesLimitWarningTest(10240, -2, &errors, &warnings); + EXPECT_EQ(0, errors.size()); + EXPECT_EQ(0, warnings.size()); +} + TEST_F(CodedStreamTest, RecursionLimit) { ArrayInputStream input(buffer_, sizeof(buffer_)); @@ -1060,6 +1119,7 @@ TEST_F(CodedStreamTest, RecursionLimit) { EXPECT_FALSE(coded_input.IncrementRecursionDepth()); // 7 } + class ReallyBigInputStream : public ZeroCopyInputStream { public: ReallyBigInputStream() : backup_amount_(0), buffer_count_(0) {} diff --git 
a/src/google/protobuf/io/gzip_stream.cc b/src/google/protobuf/io/gzip_stream.cc index 0f1ff872..fe1f3319 100644 --- a/src/google/protobuf/io/gzip_stream.cc +++ b/src/google/protobuf/io/gzip_stream.cc @@ -199,16 +199,6 @@ GzipOutputStream::GzipOutputStream(ZeroCopyOutputStream* sub_stream, Init(sub_stream, options); } -GzipOutputStream::GzipOutputStream( - ZeroCopyOutputStream* sub_stream, Format format, int buffer_size) { - Options options; - options.format = format; - if (buffer_size != -1) { - options.buffer_size = buffer_size; - } - Init(sub_stream, options); -} - void GzipOutputStream::Init(ZeroCopyOutputStream* sub_stream, const Options& options) { sub_stream_ = sub_stream; @@ -309,10 +299,11 @@ int64 GzipOutputStream::ByteCount() const { } bool GzipOutputStream::Flush() { - do { - zerror_ = Deflate(Z_FULL_FLUSH); - } while (zerror_ == Z_OK); - return zerror_ == Z_OK; + zerror_ = Deflate(Z_FULL_FLUSH); + // Return true if the flush succeeded or if it was a no-op. + return (zerror_ == Z_OK) || + (zerror_ == Z_BUF_ERROR && zcontext_.avail_in == 0 && + zcontext_.avail_out != 0); } bool GzipOutputStream::Close() { diff --git a/src/google/protobuf/io/gzip_stream.h b/src/google/protobuf/io/gzip_stream.h index 65dbc5b5..7ee24bc3 100644 --- a/src/google/protobuf/io/gzip_stream.h +++ b/src/google/protobuf/io/gzip_stream.h @@ -45,6 +45,7 @@ #include <zlib.h> +#include <google/protobuf/stubs/common.h> #include <google/protobuf/io/zero_copy_stream.h> namespace google { @@ -144,12 +145,6 @@ class LIBPROTOBUF_EXPORT GzipOutputStream : public ZeroCopyOutputStream { ZeroCopyOutputStream* sub_stream, const Options& options); - // DEPRECATED: Use one of the above constructors instead. - GzipOutputStream( - ZeroCopyOutputStream* sub_stream, - Format format, - int buffer_size = -1) GOOGLE_ATTRIBUTE_DEPRECATED; - virtual ~GzipOutputStream(); // Return last error message or NULL if no error. 
@@ -165,6 +160,13 @@ class LIBPROTOBUF_EXPORT GzipOutputStream : public ZeroCopyOutputStream { // necessary. // Compression may be less efficient stopping and starting around flushes. // Returns true if no error. + // + // Please ensure that block size is > 6. Here is an excerpt from the zlib + // doc that explains why: + // + // In the case of a Z_FULL_FLUSH or Z_SYNC_FLUSH, make sure that avail_out + // is greater than six to avoid repeated flush markers due to + // avail_out == 0 on return. bool Flush(); // Writes out all data and closes the gzip stream. diff --git a/src/google/protobuf/io/printer.cc b/src/google/protobuf/io/printer.cc index 9ab90dee..d2bf3f54 100644 --- a/src/google/protobuf/io/printer.cc +++ b/src/google/protobuf/io/printer.cc @@ -35,7 +35,6 @@ #include <google/protobuf/io/printer.h> #include <google/protobuf/io/zero_copy_stream.h> #include <google/protobuf/stubs/common.h> -#include <google/protobuf/stubs/strutil.h> namespace google { namespace protobuf { @@ -51,8 +50,8 @@ Printer::Printer(ZeroCopyOutputStream* output, char variable_delimiter) } Printer::~Printer() { - // Only BackUp() if we're sure we've successfully called Next() at least once. - if (buffer_size_ > 0) { + // Only BackUp() if we have called Next() at least once and never failed. + if (buffer_size_ > 0 && !failed_) { output_->BackUp(buffer_size_); } } @@ -169,7 +168,7 @@ void Printer::WriteRaw(const char* data, int size) { if (failed_) return; if (size == 0) return; - if (at_start_of_line_) { + if (at_start_of_line_ && (size > 0) && (data[0] != '\n')) { // Insert an indent. 
at_start_of_line_ = false; WriteRaw(indent_.data(), indent_.size()); diff --git a/src/google/protobuf/io/printer_unittest.cc b/src/google/protobuf/io/printer_unittest.cc index 580a53da..399395c8 100644 --- a/src/google/protobuf/io/printer_unittest.cc +++ b/src/google/protobuf/io/printer_unittest.cc @@ -233,7 +233,31 @@ TEST(Printer, Death) { } #endif // GTEST_HAS_DEATH_TEST -TEST(Printer, WriteFailure) { +TEST(Printer, WriteFailurePartial) { + char buffer[17]; + + ArrayOutputStream output(buffer, sizeof(buffer)); + Printer printer(&output, '$'); + + // Print 16 bytes to almost fill the buffer (should not fail). + printer.Print("0123456789abcdef"); + EXPECT_FALSE(printer.failed()); + + // Try to print 2 chars. Only one fits. + printer.Print("<>"); + EXPECT_TRUE(printer.failed()); + + // Anything else should fail too. + printer.Print(" "); + EXPECT_TRUE(printer.failed()); + printer.Print("blah"); + EXPECT_TRUE(printer.failed()); + + // Buffer should contain the first 17 bytes written. + EXPECT_EQ("0123456789abcdef<", string(buffer, sizeof(buffer))); +} + +TEST(Printer, WriteFailureExact) { char buffer[16]; ArrayOutputStream output(buffer, sizeof(buffer)); diff --git a/src/google/protobuf/io/tokenizer.cc b/src/google/protobuf/io/tokenizer.cc index 513831d5..a022b71d 100644 --- a/src/google/protobuf/io/tokenizer.cc +++ b/src/google/protobuf/io/tokenizer.cc @@ -89,8 +89,11 @@ // exactly pretty. 
#include <google/protobuf/io/tokenizer.h> +#include <google/protobuf/stubs/common.h> +#include <google/protobuf/stubs/stringprintf.h> #include <google/protobuf/io/zero_copy_stream.h> #include <google/protobuf/stubs/strutil.h> +#include <google/protobuf/stubs/stl_util.h> namespace google { namespace protobuf { @@ -118,6 +121,8 @@ namespace { CHARACTER_CLASS(Whitespace, c == ' ' || c == '\n' || c == '\t' || c == '\r' || c == '\v' || c == '\f'); +CHARACTER_CLASS(WhitespaceNoNewline, c == ' ' || c == '\t' || + c == '\r' || c == '\v' || c == '\f'); CHARACTER_CLASS(Unprintable, c < ' ' && c > '\0'); @@ -187,7 +192,8 @@ Tokenizer::Tokenizer(ZeroCopyInputStream* input, read_error_(false), line_(0), column_(0), - token_start_(-1), + record_target_(NULL), + record_start_(-1), allow_f_after_float_(false), comment_style_(CPP_COMMENT_STYLE) { @@ -238,9 +244,9 @@ void Tokenizer::Refresh() { } // If we're in a token, append the rest of the buffer to it. - if (token_start_ >= 0 && token_start_ < buffer_size_) { - current_.text.append(buffer_ + token_start_, buffer_size_ - token_start_); - token_start_ = 0; + if (record_target_ != NULL && record_start_ < buffer_size_) { + record_target_->append(buffer_ + record_start_, buffer_size_ - record_start_); + record_start_ = 0; } const void* data = NULL; @@ -261,23 +267,33 @@ void Tokenizer::Refresh() { current_char_ = buffer_[0]; } +inline void Tokenizer::RecordTo(string* target) { + record_target_ = target; + record_start_ = buffer_pos_; +} + +inline void Tokenizer::StopRecording() { + // Note: The if() is necessary because some STL implementations crash when + // you call string::append(NULL, 0), presumably because they are trying to + // be helpful by detecting the NULL pointer, even though there's nothing + // wrong with reading zero bytes from NULL. 
+ if (buffer_pos_ != record_start_) { + record_target_->append(buffer_ + record_start_, buffer_pos_ - record_start_); + } + record_target_ = NULL; + record_start_ = -1; +} + inline void Tokenizer::StartToken() { - token_start_ = buffer_pos_; current_.type = TYPE_START; // Just for the sake of initializing it. current_.text.clear(); current_.line = line_; current_.column = column_; + RecordTo(&current_.text); } inline void Tokenizer::EndToken() { - // Note: The if() is necessary because some STL implementations crash when - // you call string::append(NULL, 0), presumably because they are trying to - // be helpful by detecting the NULL pointer, even though there's nothing - // wrong with reading zero bytes from NULL. - if (buffer_pos_ != token_start_) { - current_.text.append(buffer_ + token_start_, buffer_pos_ - token_start_); - } - token_start_ = -1; + StopRecording(); current_.end_column = column_; } @@ -353,6 +369,27 @@ void Tokenizer::ConsumeString(char delimiter) { AddError("Expected hex digits for escape sequence."); } // Possibly followed by another hex digit, but again we don't care. + } else if (TryConsume('u')) { + if (!TryConsumeOne<HexDigit>() || + !TryConsumeOne<HexDigit>() || + !TryConsumeOne<HexDigit>() || + !TryConsumeOne<HexDigit>()) { + AddError("Expected four hex digits for \\u escape sequence."); + } + } else if (TryConsume('U')) { + // We expect 8 hex digits; but only the range up to 0x10ffff is + // legal. + if (!TryConsume('0') || + !TryConsume('0') || + !(TryConsume('0') || TryConsume('1')) || + !TryConsumeOne<HexDigit>() || + !TryConsumeOne<HexDigit>() || + !TryConsumeOne<HexDigit>() || + !TryConsumeOne<HexDigit>() || + !TryConsumeOne<HexDigit>()) { + AddError("Expected eight hex digits up to 10ffff for \\U escape " + "sequence"); + } } else { AddError("Invalid escape sequence in string literal."); } @@ -426,26 +463,51 @@ Tokenizer::TokenType Tokenizer::ConsumeNumber(bool started_with_zero, return is_float ? 
TYPE_FLOAT : TYPE_INTEGER; } -void Tokenizer::ConsumeLineComment() { +void Tokenizer::ConsumeLineComment(string* content) { + if (content != NULL) RecordTo(content); + while (current_char_ != '\0' && current_char_ != '\n') { NextChar(); } TryConsume('\n'); + + if (content != NULL) StopRecording(); } -void Tokenizer::ConsumeBlockComment() { +void Tokenizer::ConsumeBlockComment(string* content) { int start_line = line_; int start_column = column_ - 2; + if (content != NULL) RecordTo(content); + while (true) { while (current_char_ != '\0' && current_char_ != '*' && - current_char_ != '/') { + current_char_ != '/' && + current_char_ != '\n') { NextChar(); } - if (TryConsume('*') && TryConsume('/')) { + if (TryConsume('\n')) { + if (content != NULL) StopRecording(); + + // Consume leading whitespace and asterisk; + ConsumeZeroOrMore<WhitespaceNoNewline>(); + if (TryConsume('*')) { + if (TryConsume('/')) { + // End of comment. + break; + } + } + + if (content != NULL) RecordTo(content); + } else if (TryConsume('*') && TryConsume('/')) { // End of comment. + if (content != NULL) { + StopRecording(); + // Strip trailing "*/". + content->erase(content->size() - 2); + } break; } else if (TryConsume('/') && current_char_ == '*') { // Note: We didn't consume the '*' because if there is a '/' after it @@ -456,42 +518,59 @@ void Tokenizer::ConsumeBlockComment() { AddError("End-of-file inside block comment."); error_collector_->AddError( start_line, start_column, " Comment started here."); + if (content != NULL) StopRecording(); break; } } } +Tokenizer::NextCommentStatus Tokenizer::TryConsumeCommentStart() { + if (comment_style_ == CPP_COMMENT_STYLE && TryConsume('/')) { + if (TryConsume('/')) { + return LINE_COMMENT; + } else if (TryConsume('*')) { + return BLOCK_COMMENT; + } else { + // Oops, it was just a slash. Return it. 
+ current_.type = TYPE_SYMBOL; + current_.text = "/"; + current_.line = line_; + current_.column = column_ - 1; + current_.end_column = column_; + return SLASH_NOT_COMMENT; + } + } else if (comment_style_ == SH_COMMENT_STYLE && TryConsume('#')) { + return LINE_COMMENT; + } else { + return NO_COMMENT; + } +} + // ------------------------------------------------------------------- bool Tokenizer::Next() { previous_ = current_; - // Did we skip any characters after the last token? - bool skipped_stuff = false; - while (!read_error_) { - if (TryConsumeOne<Whitespace>()) { - ConsumeZeroOrMore<Whitespace>(); - - } else if (comment_style_ == CPP_COMMENT_STYLE && TryConsume('/')) { - // Starting a comment? - if (TryConsume('/')) { - ConsumeLineComment(); - } else if (TryConsume('*')) { - ConsumeBlockComment(); - } else { - // Oops, it was just a slash. Return it. - current_.type = TYPE_SYMBOL; - current_.text = "/"; - current_.line = line_; - current_.column = column_ - 1; + ConsumeZeroOrMore<Whitespace>(); + + switch (TryConsumeCommentStart()) { + case LINE_COMMENT: + ConsumeLineComment(NULL); + continue; + case BLOCK_COMMENT: + ConsumeBlockComment(NULL); + continue; + case SLASH_NOT_COMMENT: return true; - } + case NO_COMMENT: + break; + } - } else if (comment_style_ == SH_COMMENT_STYLE && TryConsume('#')) { - ConsumeLineComment(); + // Check for EOF before continuing. + if (read_error_) break; - } else if (LookingAt<Unprintable>() || current_char_ == '\0') { + if (LookingAt<Unprintable>() || current_char_ == '\0') { AddError("Invalid control characters encountered in text."); NextChar(); // Skip more unprintable characters, too. But, remember that '\0' is @@ -519,7 +598,9 @@ bool Tokenizer::Next() { if (TryConsumeOne<Digit>()) { // It's a floating-point number. 
- if (previous_.type == TYPE_IDENTIFIER && !skipped_stuff) { + if (previous_.type == TYPE_IDENTIFIER && + current_.line == previous_.line && + current_.column == previous_.end_column) { // We don't accept syntax like "blah.123". error_collector_->AddError(line_, column_ - 2, "Need space between identifier and decimal point."); @@ -544,8 +625,6 @@ bool Tokenizer::Next() { EndToken(); return true; } - - skipped_stuff = true; } // EOF @@ -557,6 +636,195 @@ bool Tokenizer::Next() { return false; } +namespace { + +// Helper class for collecting comments and putting them in the right places. +// +// This basically just buffers the most recent comment until it can be decided +// exactly where that comment should be placed. When Flush() is called, the +// current comment goes into either prev_trailing_comments or detached_comments. +// When the CommentCollector is destroyed, the last buffered comment goes into +// next_leading_comments. +class CommentCollector { + public: + CommentCollector(string* prev_trailing_comments, + vector<string>* detached_comments, + string* next_leading_comments) + : prev_trailing_comments_(prev_trailing_comments), + detached_comments_(detached_comments), + next_leading_comments_(next_leading_comments), + has_comment_(false), + is_line_comment_(false), + can_attach_to_prev_(true) { + if (prev_trailing_comments != NULL) prev_trailing_comments->clear(); + if (detached_comments != NULL) detached_comments->clear(); + if (next_leading_comments != NULL) next_leading_comments->clear(); + } + + ~CommentCollector() { + // Whatever is in the buffer is a leading comment. + if (next_leading_comments_ != NULL && has_comment_) { + comment_buffer_.swap(*next_leading_comments_); + } + } + + // About to read a line comment. Get the comment buffer pointer in order to + // read into it. + string* GetBufferForLineComment() { + // We want to combine with previous line comments, but not block comments. 
+ if (has_comment_ && !is_line_comment_) { + Flush(); + } + has_comment_ = true; + is_line_comment_ = true; + return &comment_buffer_; + } + + // About to read a block comment. Get the comment buffer pointer in order to + // read into it. + string* GetBufferForBlockComment() { + if (has_comment_) { + Flush(); + } + has_comment_ = true; + is_line_comment_ = false; + return &comment_buffer_; + } + + void ClearBuffer() { + comment_buffer_.clear(); + has_comment_ = false; + } + + // Called once we know that the comment buffer is complete and is *not* + // connected to the next token. + void Flush() { + if (has_comment_) { + if (can_attach_to_prev_) { + if (prev_trailing_comments_ != NULL) { + prev_trailing_comments_->append(comment_buffer_); + } + can_attach_to_prev_ = false; + } else { + if (detached_comments_ != NULL) { + detached_comments_->push_back(comment_buffer_); + } + } + ClearBuffer(); + } + } + + void DetachFromPrev() { + can_attach_to_prev_ = false; + } + + private: + string* prev_trailing_comments_; + vector<string>* detached_comments_; + string* next_leading_comments_; + + string comment_buffer_; + + // True if any comments were read into comment_buffer_. This can be true even + // if comment_buffer_ is empty, namely if the comment was "/**/". + bool has_comment_; + + // Is the comment in the comment buffer a line comment? + bool is_line_comment_; + + // Is it still possible that we could be reading a comment attached to the + // previous token? + bool can_attach_to_prev_; +}; + +} // namespace + +bool Tokenizer::NextWithComments(string* prev_trailing_comments, + vector<string>* detached_comments, + string* next_leading_comments) { + CommentCollector collector(prev_trailing_comments, detached_comments, + next_leading_comments); + + if (current_.type == TYPE_START) { + collector.DetachFromPrev(); + } else { + // A comment appearing on the same line must be attached to the previous + // declaration. 
+ ConsumeZeroOrMore<WhitespaceNoNewline>(); + switch (TryConsumeCommentStart()) { + case LINE_COMMENT: + ConsumeLineComment(collector.GetBufferForLineComment()); + + // Don't allow comments on subsequent lines to be attached to a trailing + // comment. + collector.Flush(); + break; + case BLOCK_COMMENT: + ConsumeBlockComment(collector.GetBufferForBlockComment()); + + ConsumeZeroOrMore<WhitespaceNoNewline>(); + if (!TryConsume('\n')) { + // Oops, the next token is on the same line. If we recorded a comment + // we really have no idea which token it should be attached to. + collector.ClearBuffer(); + return Next(); + } + + // Don't allow comments on subsequent lines to be attached to a trailing + // comment. + collector.Flush(); + break; + case SLASH_NOT_COMMENT: + return true; + case NO_COMMENT: + if (!TryConsume('\n')) { + // The next token is on the same line. There are no comments. + return Next(); + } + break; + } + } + + // OK, we are now on the line *after* the previous token. + while (true) { + ConsumeZeroOrMore<WhitespaceNoNewline>(); + + switch (TryConsumeCommentStart()) { + case LINE_COMMENT: + ConsumeLineComment(collector.GetBufferForLineComment()); + break; + case BLOCK_COMMENT: + ConsumeBlockComment(collector.GetBufferForBlockComment()); + + // Consume the rest of the line so that we don't interpret it as a + // blank line the next time around the loop. + ConsumeZeroOrMore<WhitespaceNoNewline>(); + TryConsume('\n'); + break; + case SLASH_NOT_COMMENT: + return true; + case NO_COMMENT: + if (TryConsume('\n')) { + // Completely blank line. + collector.Flush(); + collector.DetachFromPrev(); + } else { + bool result = Next(); + if (!result || + current_.text == "}" || + current_.text == "]" || + current_.text == ")") { + // It looks like we're at the end of a scope. In this case it + // makes no sense to attach a comment to the following token. 
+ collector.Flush(); + } + return result; + } + break; + } + } +} + // ------------------------------------------------------------------- // Token-parsing helpers. Remember that these don't need to report // errors since any errors should already have been reported while @@ -626,17 +894,138 @@ double Tokenizer::ParseFloat(const string& text) { return result; } +// Helper to append a Unicode code point to a string as UTF8, without bringing +// in any external dependencies. +static void AppendUTF8(uint32 code_point, string* output) { + uint32 tmp = 0; + int len = 0; + if (code_point <= 0x7f) { + tmp = code_point; + len = 1; + } else if (code_point <= 0x07ff) { + tmp = 0x0000c080 | + ((code_point & 0x07c0) << 2) | + (code_point & 0x003f); + len = 2; + } else if (code_point <= 0xffff) { + tmp = 0x00e08080 | + ((code_point & 0xf000) << 4) | + ((code_point & 0x0fc0) << 2) | + (code_point & 0x003f); + len = 3; + } else if (code_point <= 0x1fffff) { + tmp = 0xf0808080 | + ((code_point & 0x1c0000) << 6) | + ((code_point & 0x03f000) << 4) | + ((code_point & 0x000fc0) << 2) | + (code_point & 0x003f); + len = 4; + } else { + // UTF-16 is only defined for code points up to 0x10FFFF, and UTF-8 is + // normally only defined up to there as well. + StringAppendF(output, "\\U%08x", code_point); + return; + } + tmp = ghtonl(tmp); + output->append(reinterpret_cast<const char*>(&tmp) + sizeof(tmp) - len, len); +} + +// Try to read <len> hex digits from ptr, and stuff the numeric result into +// *result. Returns true if that many digits were successfully consumed. +static bool ReadHexDigits(const char* ptr, int len, uint32* result) { + *result = 0; + if (len == 0) return false; + for (const char* end = ptr + len; ptr < end; ++ptr) { + if (*ptr == '\0') return false; + *result = (*result << 4) + DigitValue(*ptr); + } + return true; +} + +// Handling UTF-16 surrogate pairs. 
UTF-16 encodes code points in the range +// 0x10000...0x10ffff as a pair of numbers, a head surrogate followed by a trail +// surrogate. These numbers are in a reserved range of Unicode code points, so +// if we encounter such a pair we know how to parse it and convert it into a +// single code point. +static const uint32 kMinHeadSurrogate = 0xd800; +static const uint32 kMaxHeadSurrogate = 0xdc00; +static const uint32 kMinTrailSurrogate = 0xdc00; +static const uint32 kMaxTrailSurrogate = 0xe000; + +static inline bool IsHeadSurrogate(uint32 code_point) { + return (code_point >= kMinHeadSurrogate) && (code_point < kMaxHeadSurrogate); +} + +static inline bool IsTrailSurrogate(uint32 code_point) { + return (code_point >= kMinTrailSurrogate) && + (code_point < kMaxTrailSurrogate); +} + +// Combine a head and trail surrogate into a single Unicode code point. +static uint32 AssembleUTF16(uint32 head_surrogate, uint32 trail_surrogate) { + GOOGLE_DCHECK(IsHeadSurrogate(head_surrogate)); + GOOGLE_DCHECK(IsTrailSurrogate(trail_surrogate)); + return 0x10000 + (((head_surrogate - kMinHeadSurrogate) << 10) | + (trail_surrogate - kMinTrailSurrogate)); +} + +// Convert the escape sequence parameter to a number of expected hex digits. +static inline int UnicodeLength(char key) { + if (key == 'u') return 4; + if (key == 'U') return 8; + return 0; +} + +// Given a pointer to the 'u' or 'U' starting a Unicode escape sequence, attempt +// to parse that sequence. On success, returns a pointer to the first char +// beyond that sequence, and fills in *code_point. On failure, returns ptr +// itself. +static const char* FetchUnicodePoint(const char* ptr, uint32* code_point) { + const char* p = ptr; + // Fetch the code point. + const int len = UnicodeLength(*p++); + if (!ReadHexDigits(p, len, code_point)) + return ptr; + p += len; + + // Check if the code point we read is a "head surrogate." 
If so, then we + // expect it to be immediately followed by another code point which is a valid + // "trail surrogate," and together they form a UTF-16 pair which decodes into + // a single Unicode point. Trail surrogates may only use \u, not \U. + if (IsHeadSurrogate(*code_point) && *p == '\\' && *(p + 1) == 'u') { + uint32 trail_surrogate; + if (ReadHexDigits(p + 2, 4, &trail_surrogate) && + IsTrailSurrogate(trail_surrogate)) { + *code_point = AssembleUTF16(*code_point, trail_surrogate); + p += 6; + } + // If this failed, then we just emit the head surrogate as a code point. + // It's bogus, but so is the string. + } + + return p; +} + +// The text string must begin and end with single or double quote +// characters. void Tokenizer::ParseStringAppend(const string& text, string* output) { - // Reminder: text[0] is always the quote character. (If text is - // empty, it's invalid, so we'll just return.) - if (text.empty()) { + // Reminder: text[0] is always a quote character. (If text is + // empty, it's invalid, so we'll just return). + const size_t text_size = text.size(); + if (text_size == 0) { GOOGLE_LOG(DFATAL) << " Tokenizer::ParseStringAppend() passed text that could not" " have been tokenized as a string: " << CEscape(text); return; } - output->reserve(output->size() + text.size()); + // Reserve room for new string. The branch is necessary because if + // there is already space available the reserve() call might + // downsize the output. + const size_t new_len = text_size + output->size(); + if (new_len > output->capacity()) { + output->reserve(new_len); + } // Loop through the string copying characters to "output" and // interpreting escape sequences. 
Note that any invalid escape @@ -674,19 +1063,27 @@ void Tokenizer::ParseStringAppend(const string& text, string* output) { } output->push_back(static_cast<char>(code)); + } else if (*ptr == 'u' || *ptr == 'U') { + uint32 unicode; + const char* end = FetchUnicodePoint(ptr, &unicode); + if (end == ptr) { + // Failure: Just dump out what we saw, don't try to parse it. + output->push_back(*ptr); + } else { + AppendUTF8(unicode, output); + ptr = end - 1; // Because we're about to ++ptr. + } } else { // Some other escape code. output->push_back(TranslateEscape(*ptr)); } - } else if (*ptr == text[0]) { - // Ignore quote matching the starting quote. + } else if (*ptr == text[0] && ptr[1] == '\0') { + // Ignore final quote matching the starting quote. } else { output->push_back(*ptr); } } - - return; } } // namespace io diff --git a/src/google/protobuf/io/tokenizer.h b/src/google/protobuf/io/tokenizer.h index 8f759abb..d85b82f9 100644 --- a/src/google/protobuf/io/tokenizer.h +++ b/src/google/protobuf/io/tokenizer.h @@ -38,6 +38,7 @@ #define GOOGLE_PROTOBUF_IO_TOKENIZER_H__ #include <string> +#include <vector> #include <google/protobuf/stubs/common.h> namespace google { @@ -137,6 +138,53 @@ class LIBPROTOBUF_EXPORT Tokenizer { // reached. bool Next(); + // Like Next(), but also collects comments which appear between the previous + // and next tokens. + // + // Comments which appear to be attached to the previous token are stored + // in *prev_tailing_comments. Comments which appear to be attached to the + // next token are stored in *next_leading_comments. Comments appearing in + // between which do not appear to be attached to either will be added to + // detached_comments. Any of these parameters can be NULL to simply discard + // the comments. + // + // A series of line comments appearing on consecutive lines, with no other + // tokens appearing on those lines, will be treated as a single comment. + // + // Only the comment content is returned; comment markers (e.g. 
//) are + // stripped out. For block comments, leading whitespace and an asterisk will + // be stripped from the beginning of each line other than the first. Newlines + // are included in the output. + // + // Examples: + // + // optional int32 foo = 1; // Comment attached to foo. + // // Comment attached to bar. + // optional int32 bar = 2; + // + // optional string baz = 3; + // // Comment attached to baz. + // // Another line attached to baz. + // + // // Comment attached to qux. + // // + // // Another line attached to qux. + // optional double qux = 4; + // + // // Detached comment. This is not attached to qux or corge + // // because there are blank lines separating it from both. + // + // optional string corge = 5; + // /* Block comment attached + // * to corge. Leading asterisks + // * will be removed. */ + // /* Block comment attached to + // * grault. */ + // optional int32 grault = 6; + bool NextWithComments(string* prev_trailing_comments, + vector<string>* detached_comments, + string* next_leading_comments); + // Parse helpers --------------------------------------------------- // Parses a TYPE_FLOAT token. This never fails, so long as the text actually @@ -200,11 +248,12 @@ class LIBPROTOBUF_EXPORT Tokenizer { int line_; int column_; - // Position in buffer_ where StartToken() was called. If the token - // started in the previous buffer, this is zero, and current_.text already - // contains the part of the token from the previous buffer. If not - // currently parsing a token, this is -1. - int token_start_; + // String to which text should be appended as we advance through it. + // Call RecordTo(&str) to start recording and StopRecording() to stop. + // E.g. StartToken() calls RecordTo(&current_.text). record_start_ is the + // position within the current buffer where recording started. + string* record_target_; + int record_start_; // Options. 
bool allow_f_after_float_; @@ -223,6 +272,9 @@ class LIBPROTOBUF_EXPORT Tokenizer { // Read a new buffer from the input. void Refresh(); + inline void RecordTo(string* target); + inline void StopRecording(); + // Called when the current character is the first character of a new // token (not including whitespace or comments). inline void StartToken(); @@ -255,9 +307,28 @@ class LIBPROTOBUF_EXPORT Tokenizer { TokenType ConsumeNumber(bool started_with_zero, bool started_with_dot); // Consume the rest of a line. - void ConsumeLineComment(); + void ConsumeLineComment(string* content); // Consume until "*/". - void ConsumeBlockComment(); + void ConsumeBlockComment(string* content); + + enum NextCommentStatus { + // Started a line comment. + LINE_COMMENT, + + // Started a block comment. + BLOCK_COMMENT, + + // Consumed a slash, then realized it wasn't a comment. current_ has + // been filled in with a slash token. The caller should return it. + SLASH_NOT_COMMENT, + + // We do not appear to be starting a comment here. + NO_COMMENT + }; + + // If we're at the start of a new comment, consume it and return what kind + // of comment it is. + NextCommentStatus TryConsumeCommentStart(); // ----------------------------------------------------------------- // These helper methods make the parsing code more readable. The diff --git a/src/google/protobuf/io/tokenizer_unittest.cc b/src/google/protobuf/io/tokenizer_unittest.cc index 106d080f..8de43939 100644 --- a/src/google/protobuf/io/tokenizer_unittest.cc +++ b/src/google/protobuf/io/tokenizer_unittest.cc @@ -32,9 +32,10 @@ // Based on original Protocol Buffers design by // Sanjay Ghemawat, Jeff Dean, and others. 
-#include <vector> -#include <math.h> #include <limits.h> +#include <math.h> + +#include <vector> #include <google/protobuf/io/tokenizer.h> #include <google/protobuf/io/zero_copy_stream_impl.h> @@ -514,6 +515,217 @@ TEST_1D(TokenizerTest, ShCommentStyle, kBlockSizes) { // ------------------------------------------------------------------- +// In each case, the input is expected to have two tokens named "prev" and +// "next" with comments in between. +struct DocCommentCase { + string input; + + const char* prev_trailing_comments; + const char* detached_comments[10]; + const char* next_leading_comments; +}; + +inline ostream& operator<<(ostream& out, + const DocCommentCase& test_case) { + return out << CEscape(test_case.input); +} + +DocCommentCase kDocCommentCases[] = { + { + "prev next", + + "", + {}, + "" + }, + + { + "prev /* ignored */ next", + + "", + {}, + "" + }, + + { + "prev // trailing comment\n" + "next", + + " trailing comment\n", + {}, + "" + }, + + { + "prev\n" + "// leading comment\n" + "// line 2\n" + "next", + + "", + {}, + " leading comment\n" + " line 2\n" + }, + + { + "prev\n" + "// trailing comment\n" + "// line 2\n" + "\n" + "next", + + " trailing comment\n" + " line 2\n", + {}, + "" + }, + + { + "prev // trailing comment\n" + "// leading comment\n" + "// line 2\n" + "next", + + " trailing comment\n", + {}, + " leading comment\n" + " line 2\n" + }, + + { + "prev /* trailing block comment */\n" + "/* leading block comment\n" + " * line 2\n" + " * line 3 */" + "next", + + " trailing block comment ", + {}, + " leading block comment\n" + " line 2\n" + " line 3 " + }, + + { + "prev\n" + "/* trailing block comment\n" + " * line 2\n" + " * line 3\n" + " */\n" + "/* leading block comment\n" + " * line 2\n" + " * line 3 */" + "next", + + " trailing block comment\n" + " line 2\n" + " line 3\n", + {}, + " leading block comment\n" + " line 2\n" + " line 3 " + }, + + { + "prev\n" + "// trailing comment\n" + "\n" + "// detached comment\n" + "// line 2\n" + 
"\n" + "// second detached comment\n" + "/* third detached comment\n" + " * line 2 */\n" + "// leading comment\n" + "next", + + " trailing comment\n", + { + " detached comment\n" + " line 2\n", + " second detached comment\n", + " third detached comment\n" + " line 2 " + }, + " leading comment\n" + }, + + { + "prev /**/\n" + "\n" + "// detached comment\n" + "\n" + "// leading comment\n" + "next", + + "", + { + " detached comment\n" + }, + " leading comment\n" + }, + + { + "prev /**/\n" + "// leading comment\n" + "next", + + "", + {}, + " leading comment\n" + }, + }; + +TEST_2D(TokenizerTest, DocComments, kDocCommentCases, kBlockSizes) { + // Set up the tokenizer. + TestInputStream input(kDocCommentCases_case.input.data(), + kDocCommentCases_case.input.size(), + kBlockSizes_case); + TestErrorCollector error_collector; + Tokenizer tokenizer(&input, &error_collector); + + // Set up a second tokenizer where we'll pass all NULLs to NextWithComments(). + TestInputStream input2(kDocCommentCases_case.input.data(), + kDocCommentCases_case.input.size(), + kBlockSizes_case); + Tokenizer tokenizer2(&input2, &error_collector); + + tokenizer.Next(); + tokenizer2.Next(); + + EXPECT_EQ("prev", tokenizer.current().text); + EXPECT_EQ("prev", tokenizer2.current().text); + + string prev_trailing_comments; + vector<string> detached_comments; + string next_leading_comments; + tokenizer.NextWithComments(&prev_trailing_comments, &detached_comments, + &next_leading_comments); + tokenizer2.NextWithComments(NULL, NULL, NULL); + EXPECT_EQ("next", tokenizer.current().text); + EXPECT_EQ("next", tokenizer2.current().text); + + EXPECT_EQ(kDocCommentCases_case.prev_trailing_comments, + prev_trailing_comments); + + for (int i = 0; i < detached_comments.size(); i++) { + ASSERT_LT(i, GOOGLE_ARRAYSIZE(kDocCommentCases)); + ASSERT_TRUE(kDocCommentCases_case.detached_comments[i] != NULL); + EXPECT_EQ(kDocCommentCases_case.detached_comments[i], + detached_comments[i]); + } + + // Verify that we matched 
all the detached comments. + EXPECT_EQ(NULL, + kDocCommentCases_case.detached_comments[detached_comments.size()]); + + EXPECT_EQ(kDocCommentCases_case.next_leading_comments, + next_leading_comments); +} + +// ------------------------------------------------------------------- + // Test parse helpers. It's not really worth setting up a full data-driven // test here. TEST_F(TokenizerTest, ParseInteger) { @@ -614,6 +826,22 @@ TEST_F(TokenizerTest, ParseString) { Tokenizer::ParseString("'\\", &output); EXPECT_EQ("\\", output); + // Experiment with Unicode escapes. Here are one-, two- and three-byte Unicode + // characters. + Tokenizer::ParseString("'\\u0024\\u00a2\\u20ac\\U00024b62XX'", &output); + EXPECT_EQ("$¢€𤭢XX", output); + // Same thing encoded using UTF16. + Tokenizer::ParseString("'\\u0024\\u00a2\\u20ac\\ud852\\udf62XX'", &output); + EXPECT_EQ("$¢€𤭢XX", output); + // Here's some broken UTF16; there's a head surrogate with no tail surrogate. + // We just output this as if it were UTF8; it's not a defined code point, but + // it has a defined encoding. + Tokenizer::ParseString("'\\ud852XX'", &output); + EXPECT_EQ("\xed\xa1\x92XX", output); + // Malformed escape: Demons may fly out of the nose. + Tokenizer::ParseString("\\u0", &output); + EXPECT_EQ("u0", output); + // Test invalid strings that will never be tokenized as strings. #ifdef GTEST_HAS_DEATH_TEST // death tests do not work on Windows yet EXPECT_DEBUG_DEATH(Tokenizer::ParseString("", &output), @@ -658,6 +886,12 @@ ErrorCase kErrorCases[] = { "0:4: String literals cannot cross line boundaries.\n" }, { "'bar\nfoo", true, "0:4: String literals cannot cross line boundaries.\n" }, + { "'\\u01' foo", true, + "0:5: Expected four hex digits for \\u escape sequence.\n" }, + { "'\\u01' foo", true, + "0:5: Expected four hex digits for \\u escape sequence.\n" }, + { "'\\uXYZ' foo", true, + "0:3: Expected four hex digits for \\u escape sequence.\n" }, // Integer errors. 
{ "123foo", true, @@ -734,7 +968,7 @@ TEST_2D(TokenizerTest, Errors, kErrorCases, kBlockSizes) { } // Check that the errors match what was expected. - EXPECT_EQ(error_collector.text_, kErrorCases_case.errors); + EXPECT_EQ(kErrorCases_case.errors, error_collector.text_); // If the error was recoverable, make sure we saw "foo" after it. if (kErrorCases_case.recoverable) { @@ -760,6 +994,7 @@ TEST_1D(TokenizerTest, BackUpOnDestruction, kBlockSizes) { EXPECT_EQ(strlen("foo"), input.ByteCount()); } + } // namespace } // namespace io } // namespace protobuf diff --git a/src/google/protobuf/io/zero_copy_stream_impl.cc b/src/google/protobuf/io/zero_copy_stream_impl.cc index 1384c746..9fcbb622 100644 --- a/src/google/protobuf/io/zero_copy_stream_impl.cc +++ b/src/google/protobuf/io/zero_copy_stream_impl.cc @@ -46,7 +46,8 @@ #include <google/protobuf/io/zero_copy_stream_impl.h> #include <google/protobuf/stubs/common.h> -#include <google/protobuf/stubs/stl_util-inl.h> +#include <google/protobuf/stubs/stl_util.h> + namespace google { namespace protobuf { diff --git a/src/google/protobuf/io/zero_copy_stream_impl_lite.cc b/src/google/protobuf/io/zero_copy_stream_impl_lite.cc index e8012510..f552e1f8 100644 --- a/src/google/protobuf/io/zero_copy_stream_impl_lite.cc +++ b/src/google/protobuf/io/zero_copy_stream_impl_lite.cc @@ -32,9 +32,9 @@ // Based on original Protocol Buffers design by // Sanjay Ghemawat, Jeff Dean, and others. 
-#include <google/protobuf/io/zero_copy_stream_impl.h> +#include <google/protobuf/io/zero_copy_stream_impl_lite.h> #include <google/protobuf/stubs/common.h> -#include <google/protobuf/stubs/stl_util-inl.h> +#include <google/protobuf/stubs/stl_util.h> namespace google { namespace protobuf { diff --git a/src/google/protobuf/io/zero_copy_stream_unittest.cc b/src/google/protobuf/io/zero_copy_stream_unittest.cc index 5196d905..6f155df7 100644 --- a/src/google/protobuf/io/zero_copy_stream_unittest.cc +++ b/src/google/protobuf/io/zero_copy_stream_unittest.cc @@ -370,6 +370,100 @@ TEST_F(IoTest, GzipIo) { delete [] buffer; } +TEST_F(IoTest, GzipIoWithFlush) { + const int kBufferSize = 2*1024; + uint8* buffer = new uint8[kBufferSize]; + // We start with i = 4 as we want a block size > 6. With block size <= 6 + // Flush() fills up the entire 2K buffer with flush markers and the test + // fails. See documentation for Flush() for more detail. + for (int i = 4; i < kBlockSizeCount; i++) { + for (int j = 0; j < kBlockSizeCount; j++) { + for (int z = 0; z < kBlockSizeCount; z++) { + int gzip_buffer_size = kBlockSizes[z]; + int size; + { + ArrayOutputStream output(buffer, kBufferSize, kBlockSizes[i]); + GzipOutputStream::Options options; + options.format = GzipOutputStream::GZIP; + if (gzip_buffer_size != -1) { + options.buffer_size = gzip_buffer_size; + } + GzipOutputStream gzout(&output, options); + WriteStuff(&gzout); + EXPECT_TRUE(gzout.Flush()); + gzout.Close(); + size = output.ByteCount(); + } + { + ArrayInputStream input(buffer, size, kBlockSizes[j]); + GzipInputStream gzin( + &input, GzipInputStream::GZIP, gzip_buffer_size); + ReadStuff(&gzin); + } + } + } + } + delete [] buffer; +} + +TEST_F(IoTest, GzipIoContiguousFlushes) { + const int kBufferSize = 2*1024; + uint8* buffer = new uint8[kBufferSize]; + + int block_size = kBlockSizes[4]; + int gzip_buffer_size = block_size; + int size; + + ArrayOutputStream output(buffer, kBufferSize, block_size); + 
GzipOutputStream::Options options; + options.format = GzipOutputStream::GZIP; + if (gzip_buffer_size != -1) { + options.buffer_size = gzip_buffer_size; + } + GzipOutputStream gzout(&output, options); + WriteStuff(&gzout); + EXPECT_TRUE(gzout.Flush()); + EXPECT_TRUE(gzout.Flush()); + gzout.Close(); + size = output.ByteCount(); + + ArrayInputStream input(buffer, size, block_size); + GzipInputStream gzin( + &input, GzipInputStream::GZIP, gzip_buffer_size); + ReadStuff(&gzin); + + delete [] buffer; +} + +TEST_F(IoTest, GzipIoReadAfterFlush) { + const int kBufferSize = 2*1024; + uint8* buffer = new uint8[kBufferSize]; + + int block_size = kBlockSizes[4]; + int gzip_buffer_size = block_size; + int size; + ArrayOutputStream output(buffer, kBufferSize, block_size); + GzipOutputStream::Options options; + options.format = GzipOutputStream::GZIP; + if (gzip_buffer_size != -1) { + options.buffer_size = gzip_buffer_size; + } + + GzipOutputStream gzout(&output, options); + WriteStuff(&gzout); + EXPECT_TRUE(gzout.Flush()); + size = output.ByteCount(); + + ArrayInputStream input(buffer, size, block_size); + GzipInputStream gzin( + &input, GzipInputStream::GZIP, gzip_buffer_size); + ReadStuff(&gzin); + + gzout.Close(); + + delete [] buffer; +} + TEST_F(IoTest, ZlibIo) { const int kBufferSize = 2*1024; uint8* buffer = new uint8[kBufferSize]; |