// Protocol Buffers - Google's data interchange format
// Copyright 2008 Google Inc.  All rights reserved.
// https://developers.google.com/protocol-buffers/
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//     * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
//     * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

// Author: kenton@google.com (Kenton Varda)
//  Based on original Protocol Buffers design by
//  Sanjay Ghemawat, Jeff Dean, and others.
//
// Class for parsing tokenized text from a ZeroCopyInputStream.

#ifndef GOOGLE_PROTOBUF_IO_TOKENIZER_H__
#define GOOGLE_PROTOBUF_IO_TOKENIZER_H__

#include <string>
#include <vector>
#include <google/protobuf/stubs/common.h>
#include <google/protobuf/stubs/logging.h>

namespace google {
namespace protobuf {
namespace io {

class ZeroCopyInputStream;     // zero_copy_stream.h

// Defined in this file.
class ErrorCollector;
class Tokenizer;

// By "column number", the proto compiler refers to a count of the number
// of bytes before a given byte, except that a tab character advances to
// the next multiple of 8 bytes.  Note in particular that column numbers
// are zero-based, while many user interfaces use one-based column numbers.
typedef int ColumnNumber;
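
// For example (an illustrative note, not part of the original header): in the
// input "a<tab>b", 'a' is at column 0, the tab starts at column 1, and 'b'
// lands at column 8, the next multiple of 8.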

// Abstract interface for an object which collects the errors that occur
// during parsing.  A typical implementation might simply print the errors
// to stdout.
class LIBPROTOBUF_EXPORT ErrorCollector {
 public:
  inline ErrorCollector() {}
  virtual ~ErrorCollector();

  // Indicates that there was an error in the input at the given line and
  // column numbers.  The numbers are zero-based, so you may want to add
  // 1 to each before printing them.
  virtual void AddError(int line, ColumnNumber column,
                        const string& message) = 0;

  // Indicates that there was a warning in the input at the given line and
  // column numbers.  The numbers are zero-based, so you may want to add
  // 1 to each before printing them.
  virtual void AddWarning(int line, ColumnNumber column,
                          const string& message) { }

 private:
  GOOGLE_DISALLOW_EVIL_CONSTRUCTORS(ErrorCollector);
};
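
// An illustrative sketch (not part of the original header) of the "typical
// implementation" mentioned above: an ErrorCollector that simply prints to
// stdout, adding 1 to the zero-based line and column numbers.  Assumes
// <cstdio> is available; the class name is hypothetical.
//
//   class StdoutErrorCollector : public ErrorCollector {
//    public:
//     virtual void AddError(int line, ColumnNumber column,
//                           const string& message) {
//       printf("%d:%d: %s\n", line + 1, column + 1, message.c_str());
//     }
//   };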

// This class converts a stream of raw text into a stream of tokens for
// the protocol definition parser to parse.  The tokens recognized are
// similar to those that make up the C language; see the TokenType enum for
// precise descriptions.  Whitespace and comments are skipped.  By default,
// C- and C++-style comments are recognized, but other styles can be used by
// calling set_comment_style().
class LIBPROTOBUF_EXPORT Tokenizer {
 public:
  // Construct a Tokenizer that reads and tokenizes text from the given
  // input stream and writes errors to the given error_collector.
  // The caller keeps ownership of input and error_collector.
  Tokenizer(ZeroCopyInputStream* input, ErrorCollector* error_collector);
  ~Tokenizer();

  enum TokenType {
    TYPE_START,       // Next() has not yet been called.
    TYPE_END,         // End of input reached.  "text" is empty.

    TYPE_IDENTIFIER,  // A sequence of letters, digits, and underscores, not
                      // starting with a digit.  It is an error for a number
                      // to be followed by an identifier with no space in
                      // between.
    TYPE_INTEGER,     // A sequence of digits representing an integer.  Normally
                      // the digits are decimal, but a prefix of "0x" indicates
                      // a hex number and a leading zero indicates octal, just
                      // like with C numeric literals.  A leading negative sign
                      // is NOT included in the token; it's up to the parser to
                      // interpret the unary minus operator on its own.
    TYPE_FLOAT,       // A floating point literal, with a fractional part and/or
                      // an exponent.  Always in decimal.  Again, never
                      // negative.
    TYPE_STRING,      // A quoted sequence of escaped characters.  Either single
                      // or double quotes can be used, but they must match.
                      // A string literal cannot cross a line break.
    TYPE_SYMBOL,      // Any other printable character, like '!' or '+'.
                      // Symbols are always a single character, so "!+$%" is
                      // four tokens.
  };
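
  // As an illustrative example (not part of the original header), the input
  //
  //   optional int32 foo = 42;
  //
  // tokenizes as TYPE_IDENTIFIER("optional"), TYPE_IDENTIFIER("int32"),
  // TYPE_IDENTIFIER("foo"), TYPE_SYMBOL("="), TYPE_INTEGER("42"),
  // TYPE_SYMBOL(";").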

  // Structure representing a token read from the token stream.
  struct Token {
    TokenType type;
    string text;       // The exact text of the token as it appeared in
                       // the input.  e.g. tokens of TYPE_STRING will still
                       // be escaped and in quotes.

    // "line" and "column" specify the position of the first character of
    // the token within the input stream.  They are zero-based.
    int line;
    ColumnNumber column;
    ColumnNumber end_column;
  };

  // Get the current token.  This is updated when Next() is called.  Before
  // the first call to Next(), current() has type TYPE_START and no contents.
  const Token& current();

  // Return the previous token -- i.e. what current() returned before the
  // previous call to Next().
  const Token& previous();

  // Advance to the next token.  Returns false if the end of the input is
  // reached.
  bool Next();
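
  // A minimal usage sketch (illustrative, not part of the original header),
  // where "input" is a ZeroCopyInputStream* and "error_collector" is an
  // ErrorCollector*, both owned by the caller:
  //
  //   Tokenizer tokenizer(input, error_collector);
  //   while (tokenizer.Next()) {
  //     const Tokenizer::Token& token = tokenizer.current();
  //     // token.type, token.text, token.line, and token.column describe
  //     // the token just read.
  //   }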

  // Like Next(), but also collects comments which appear between the previous
  // and next tokens.
  //
  // Comments which appear to be attached to the previous token are stored
  // in *prev_trailing_comments.  Comments which appear to be attached to the
  // next token are stored in *next_leading_comments.  Comments appearing in
  // between which do not appear to be attached to either will be added to
  // detached_comments.  Any of these parameters can be NULL to simply discard
  // the comments.
  //
  // A series of line comments appearing on consecutive lines, with no other
  // tokens appearing on those lines, will be treated as a single comment.
  //
  // Only the comment content is returned; comment markers (e.g. //) are
  // stripped out.  For block comments, leading whitespace and an asterisk will
  // be stripped from the beginning of each line other than the first.  Newlines
  // are included in the output.
  //
  // Examples:
  //
  //   optional int32 foo = 1;  // Comment attached to foo.
  //   // Comment attached to bar.
  //   optional int32 bar = 2;
  //
  //   optional string baz = 3;
  //   // Comment attached to baz.
  //   // Another line attached to baz.
  //
  //   // Comment attached to qux.
  //   //
  //   // Another line attached to qux.
  //   optional double qux = 4;
  //
  //   // Detached comment.  This is not attached to qux or corge
  //   // because there are blank lines separating it from both.
  //
  //   optional string corge = 5;
  //   /* Block comment attached
  //    * to corge.  Leading asterisks
  //    * will be removed. */
  //   /* Block comment attached to
  //    * grault. */
  //   optional int32 grault = 6;
  bool NextWithComments(string* prev_trailing_comments,
                        std::vector<string>* detached_comments,
                        string* next_leading_comments);
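
  // A call sketch (illustrative, not part of the original header):
  //
  //   string trailing, leading;
  //   std::vector<string> detached;
  //   while (tokenizer.NextWithComments(&trailing, &detached, &leading)) {
  //     // "trailing" holds comments attached to the previous token,
  //     // "leading" holds comments attached to current(), and "detached"
  //     // holds any comments in between that attach to neither.
  //   }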

  // Parse helpers ---------------------------------------------------

  // Parses a TYPE_FLOAT token.  This never fails, so long as the text actually
  // comes from a TYPE_FLOAT token parsed by Tokenizer.  If it doesn't, the
  // result is undefined (possibly an assert failure).
  static double ParseFloat(const string& text);

  // Parses a TYPE_STRING token.  This never fails, so long as the text actually
  // comes from a TYPE_STRING token parsed by Tokenizer.  If it doesn't, the
  // result is undefined (possibly an assert failure).
  static void ParseString(const string& text, string* output);

  // Identical to ParseString, but appends to output.
  static void ParseStringAppend(const string& text, string* output);

  // Parses a TYPE_INTEGER token.  Returns false if the result would be
  // greater than max_value.  Otherwise, returns true and sets *output to the
  // result.  If the text is not from a Token of type TYPE_INTEGER originally
  // parsed by a Tokenizer, the result is undefined (possibly an assert
  // failure).
  static bool ParseInteger(const string& text, uint64 max_value,
                           uint64* output);
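
  // Illustrative use of the parse helpers (not part of the original header),
  // where "token" is a Token previously produced by this Tokenizer and
  // kuint64max comes from the stubs headers included above:
  //
  //   if (token.type == Tokenizer::TYPE_INTEGER) {
  //     uint64 value;
  //     if (!Tokenizer::ParseInteger(token.text, kuint64max, &value)) {
  //       // The literal did not fit in a uint64.
  //     }
  //   } else if (token.type == Tokenizer::TYPE_FLOAT) {
  //     double value = Tokenizer::ParseFloat(token.text);
  //   } else if (token.type == Tokenizer::TYPE_STRING) {
  //     string value;
  //     Tokenizer::ParseString(token.text, &value);
  //   }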

  // Options ---------------------------------------------------------

  // Set true to allow floats to be suffixed with the letter 'f'.  Tokens
  // which would otherwise be integers but which have the 'f' suffix will be
  // forced to be interpreted as floats.  For all other purposes, the 'f' is
  // ignored.
  void set_allow_f_after_float(bool value) { allow_f_after_float_ = value; }

  // Valid values for set_comment_style().
  enum CommentStyle {
    // Line comments begin with "//", block comments are delimited by "/*" and
    // "*/".
    CPP_COMMENT_STYLE,
    // Line comments begin with "#".  No way to write block comments.
    SH_COMMENT_STYLE
  };

  // Sets the comment style.
  void set_comment_style(CommentStyle style) { comment_style_ = style; }
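
  // For example (illustrative), a caller tokenizing input that uses "#"
  // comments would call:
  //   tokenizer.set_comment_style(Tokenizer::SH_COMMENT_STYLE);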

  // Whether to require whitespace between a number and a field name.
  // Default is true. Do not use this; for Google-internal cleanup only.
  void set_require_space_after_number(bool require) {
    require_space_after_number_ = require;
  }

  // Whether to allow string literals to span multiple lines. Default is false.
  // Do not use this; for Google-internal cleanup only.
  void set_allow_multiline_strings(bool allow) {
    allow_multiline_strings_ = allow;
  }

  // External helper: validate an identifier.
  static bool IsIdentifier(const string& text);
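
  // For example (illustrative, not part of the original header):
  //   IsIdentifier("foo_bar1") returns true, while IsIdentifier("1foo") and
  //   IsIdentifier("") return false.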

  // -----------------------------------------------------------------
 private:
  GOOGLE_DISALLOW_EVIL_CONSTRUCTORS(Tokenizer);

  Token current_;           // Returned by current().
  Token previous_;          // Returned by previous().

  ZeroCopyInputStream* input_;
  ErrorCollector* error_collector_;

  char current_char_;       // == buffer_[buffer_pos_], updated by NextChar().
  const char* buffer_;      // Current buffer returned from input_.
  int buffer_size_;         // Size of buffer_.
  int buffer_pos_;          // Current position within the buffer.
  bool read_error_;         // Did we previously encounter a read error?

  // Line and column number of current_char_ within the whole input stream.
  int line_;
  ColumnNumber column_;

  // String to which text should be appended as we advance through it.
  // Call RecordTo(&str) to start recording and StopRecording() to stop.
  // E.g. StartToken() calls RecordTo(&current_.text).  record_start_ is the
  // position within the current buffer where recording started.
  string* record_target_;
  int record_start_;

  // Options.
  bool allow_f_after_float_;
  CommentStyle comment_style_;
  bool require_space_after_number_;
  bool allow_multiline_strings_;

  // Since we count columns we need to interpret tabs somehow.  We'll take
  // the standard 8-character definition for lack of any way to do better.
  // This must match the documentation of ColumnNumber.
  static const int kTabWidth = 8;

  // -----------------------------------------------------------------
  // Helper methods.

  // Consume this character and advance to the next one.
  void NextChar();

  // Read a new buffer from the input.
  void Refresh();

  inline void RecordTo(string* target);
  inline void StopRecording();

  // Called when the current character is the first character of a new
  // token (not including whitespace or comments).
  inline void StartToken();
  // Called when the current character is the first character after the
  // end of the last token.  After this returns, current_.text will
  // contain all text consumed since StartToken() was called.
  inline void EndToken();

  // Convenience method to add an error at the current line and column.
  void AddError(const string& message) {
    error_collector_->AddError(line_, column_, message);
  }

  // -----------------------------------------------------------------
  // The following four methods are used to consume tokens of specific
  // types.  They are actually used to consume all characters *after*
  // the first, since the calling function consumes the first character
  // in order to decide what kind of token is being read.

  // Read and consume a string, ending when the given delimiter is
  // consumed.
  void ConsumeString(char delimiter);

  // Read and consume a number, returning TYPE_FLOAT or TYPE_INTEGER
  // depending on what was read.  This needs to know if the first
  // character was a zero in order to correctly recognize hex and octal
  // numbers.
  // It also needs to know if the first character was a . to parse floating
  // point correctly.
  TokenType ConsumeNumber(bool started_with_zero, bool started_with_dot);

  // Consume the rest of a line.
  void ConsumeLineComment(string* content);
  // Consume until "*/".
  void ConsumeBlockComment(string* content);

  enum NextCommentStatus {
    // Started a line comment.
    LINE_COMMENT,

    // Started a block comment.
    BLOCK_COMMENT,

    // Consumed a slash, then realized it wasn't a comment.  current_ has
    // been filled in with a slash token.  The caller should return it.
    SLASH_NOT_COMMENT,

    // We do not appear to be starting a comment here.
    NO_COMMENT
  };

  // If we're at the start of a new comment, consume it and return what kind
  // of comment it is.
  NextCommentStatus TryConsumeCommentStart();

  // -----------------------------------------------------------------
  // These helper methods make the parsing code more readable.  The
  // "character classes" referred to are defined at the top of the .cc file.
  // Basically it is a C++ class with one method:
  //   static bool InClass(char c);
  // The method returns true if c is a member of this "class", like "Letter"
  // or "Digit".

  // Returns true if the current character is of the given character
  // class, but does not consume anything.
  template<typename CharacterClass>
  inline bool LookingAt();

  // If the current character is in the given class, consume it and return
  // true.  Otherwise return false.
  // e.g. TryConsumeOne<Letter>()
  template<typename CharacterClass>
  inline bool TryConsumeOne();

  // Like above, but try to consume the specific character indicated.
  inline bool TryConsume(char c);

  // Consume zero or more of the given character class.
  template<typename CharacterClass>
  inline void ConsumeZeroOrMore();

  // Consume one or more of the given character class or log the given
  // error message.
  // e.g. ConsumeOneOrMore<Digit>("Expected digits.");
  template<typename CharacterClass>
  inline void ConsumeOneOrMore(const char* error);
};

// inline methods ====================================================
inline const Tokenizer::Token& Tokenizer::current() {
  return current_;
}

inline const Tokenizer::Token& Tokenizer::previous() {
  return previous_;
}

inline void Tokenizer::ParseString(const string& text, string* output) {
  output->clear();
  ParseStringAppend(text, output);
}

}  // namespace io
}  // namespace protobuf

}  // namespace google
#endif  // GOOGLE_PROTOBUF_IO_TOKENIZER_H__