aboutsummaryrefslogtreecommitdiff
path: root/python/google/protobuf/text_format.py
diff options
context:
space:
mode:
Diffstat (limited to 'python/google/protobuf/text_format.py')
-rwxr-xr-xpython/google/protobuf/text_format.py149
1 files changed, 97 insertions, 52 deletions
diff --git a/python/google/protobuf/text_format.py b/python/google/protobuf/text_format.py
index 2cbd21bc..998cd681 100755
--- a/python/google/protobuf/text_format.py
+++ b/python/google/protobuf/text_format.py
@@ -55,15 +55,15 @@ from google.protobuf.internal import type_checkers
from google.protobuf import descriptor
from google.protobuf import text_encoding
-__all__ = ['MessageToString', 'PrintMessage', 'PrintField', 'PrintFieldValue',
- 'Merge']
+__all__ = ['MessageToString', 'Parse', 'PrintMessage', 'PrintField',
+ 'PrintFieldValue', 'Merge', 'MessageToBytes']
_INTEGER_CHECKERS = (type_checkers.Uint32ValueChecker(),
type_checkers.Int32ValueChecker(),
type_checkers.Uint64ValueChecker(),
type_checkers.Int64ValueChecker())
-_FLOAT_INFINITY = re.compile('-?inf(?:inity)?f?', re.IGNORECASE)
-_FLOAT_NAN = re.compile('nanf?', re.IGNORECASE)
+_FLOAT_INFINITY = re.compile('-?inf(?:inity)?f?$', re.IGNORECASE)
+_FLOAT_NAN = re.compile('nanf?$', re.IGNORECASE)
_FLOAT_TYPES = frozenset([descriptor.FieldDescriptor.CPPTYPE_FLOAT,
descriptor.FieldDescriptor.CPPTYPE_DOUBLE])
_QUOTES = frozenset(("'", '"'))
@@ -121,6 +121,7 @@ class TextWriter(object):
def MessageToString(message,
as_utf8=False,
as_one_line=False,
+ use_short_repeated_primitives=False,
pointy_brackets=False,
use_index_order=False,
float_format=None,
@@ -128,6 +129,7 @@ def MessageToString(message,
descriptor_pool=None,
indent=0,
message_formatter=None):
+ # type: (...) -> str
"""Convert protobuf message to text format.
Floating point values can be formatted compactly with 15 digits of
@@ -137,8 +139,11 @@ def MessageToString(message,
Args:
message: The protocol buffers message.
- as_utf8: Produce text output in UTF8 format.
+ as_utf8: Return unescaped Unicode for non-ASCII characters.
+ In Python 3 actual Unicode characters may appear as is in strings.
+ In Python 2 the return value will be valid UTF-8 rather than only ASCII.
as_one_line: Don't introduce newlines between fields.
+ use_short_repeated_primitives: Use short repeated format for primitives.
pointy_brackets: If True, use angle brackets instead of curly braces for
nesting.
use_index_order: If True, fields of a proto message will be printed using
@@ -150,7 +155,7 @@ def MessageToString(message,
(per the "Format Specification Mini-Language"); otherwise, str() is used.
use_field_number: If True, print field numbers instead of names.
descriptor_pool: A DescriptorPool used to resolve Any types.
- indent: The indent level, in terms of spaces, for pretty print.
+ indent: The initial indent level, in terms of spaces, for pretty print.
message_formatter: A function(message, indent, as_one_line): unicode|None
to custom format selected sub-messages (usually based on message type).
Use to pretty print parts of the protobuf for easier diffing.
@@ -159,7 +164,8 @@ def MessageToString(message,
A string of the text formatted protocol buffer message.
"""
out = TextWriter(as_utf8)
- printer = _Printer(out, indent, as_utf8, as_one_line, pointy_brackets,
+ printer = _Printer(out, indent, as_utf8, as_one_line,
+ use_short_repeated_primitives, pointy_brackets,
use_index_order, float_format, use_field_number,
descriptor_pool, message_formatter)
printer.PrintMessage(message)
@@ -170,6 +176,16 @@ def MessageToString(message,
return result
+def MessageToBytes(message, **kwargs):
+ # type: (...) -> bytes
+ """Convert protobuf message to encoded text format. See MessageToString."""
+ text = MessageToString(message, **kwargs)
+ if isinstance(text, bytes):
+ return text
+ codec = 'utf-8' if kwargs.get('as_utf8') else 'ascii'
+ return text.encode(codec)
+
+
def _IsMapEntry(field):
return (field.type == descriptor.FieldDescriptor.TYPE_MESSAGE and
field.message_type.has_options and
@@ -181,13 +197,15 @@ def PrintMessage(message,
indent=0,
as_utf8=False,
as_one_line=False,
+ use_short_repeated_primitives=False,
pointy_brackets=False,
use_index_order=False,
float_format=None,
use_field_number=False,
descriptor_pool=None,
message_formatter=None):
- printer = _Printer(out, indent, as_utf8, as_one_line, pointy_brackets,
+ printer = _Printer(out, indent, as_utf8, as_one_line,
+ use_short_repeated_primitives, pointy_brackets,
use_index_order, float_format, use_field_number,
descriptor_pool, message_formatter)
printer.PrintMessage(message)
@@ -199,12 +217,14 @@ def PrintField(field,
indent=0,
as_utf8=False,
as_one_line=False,
+ use_short_repeated_primitives=False,
pointy_brackets=False,
use_index_order=False,
float_format=None,
message_formatter=None):
"""Print a single field name/value pair."""
- printer = _Printer(out, indent, as_utf8, as_one_line, pointy_brackets,
+ printer = _Printer(out, indent, as_utf8, as_one_line,
+ use_short_repeated_primitives, pointy_brackets,
use_index_order, float_format, message_formatter)
printer.PrintField(field, value)
@@ -215,12 +235,14 @@ def PrintFieldValue(field,
indent=0,
as_utf8=False,
as_one_line=False,
+ use_short_repeated_primitives=False,
pointy_brackets=False,
use_index_order=False,
float_format=None,
message_formatter=None):
"""Print a single field value (not including name)."""
- printer = _Printer(out, indent, as_utf8, as_one_line, pointy_brackets,
+ printer = _Printer(out, indent, as_utf8, as_one_line,
+ use_short_repeated_primitives, pointy_brackets,
use_index_order, float_format, message_formatter)
printer.PrintFieldValue(field, value)
@@ -258,6 +280,7 @@ class _Printer(object):
indent=0,
as_utf8=False,
as_one_line=False,
+ use_short_repeated_primitives=False,
pointy_brackets=False,
use_index_order=False,
float_format=None,
@@ -273,9 +296,12 @@ class _Printer(object):
Args:
out: To record the text format result.
- indent: The indent level for pretty print.
- as_utf8: Produce text output in UTF8 format.
+ indent: The initial indent level for pretty print.
+ as_utf8: Return unescaped Unicode for non-ASCII characters.
+ In Python 3 actual Unicode characters may appear as is in strings.
+ In Python 2 the return value will be valid UTF-8 rather than ASCII.
as_one_line: Don't introduce newlines between fields.
+ use_short_repeated_primitives: Use short repeated format for primitives.
pointy_brackets: If True, use angle brackets instead of curly braces for
nesting.
use_index_order: If True, print fields of a proto message using the order
@@ -294,6 +320,7 @@ class _Printer(object):
self.indent = indent
self.as_utf8 = as_utf8
self.as_one_line = as_one_line
+ self.use_short_repeated_primitives = use_short_repeated_primitives
self.pointy_brackets = pointy_brackets
self.use_index_order = use_index_order
self.float_format = float_format
@@ -303,11 +330,13 @@ class _Printer(object):
def _TryPrintAsAnyMessage(self, message):
"""Serializes if message is a google.protobuf.Any field."""
+ if '/' not in message.type_url:
+ return False
packed_message = _BuildMessageFromTypeName(message.TypeName(),
self.descriptor_pool)
if packed_message:
packed_message.MergeFromString(message.value)
- self.out.write('%s[%s]' % (self.indent * ' ', message.type_url))
+ self.out.write('%s[%s] ' % (self.indent * ' ', message.type_url))
self._PrintMessageFieldValue(packed_message)
self.out.write(' ' if self.as_one_line else '\n')
return True
@@ -351,13 +380,18 @@ class _Printer(object):
entry_submsg = value.GetEntryClass()(key=key, value=value[key])
self.PrintField(field, entry_submsg)
elif field.label == descriptor.FieldDescriptor.LABEL_REPEATED:
- for element in value:
- self.PrintField(field, element)
+ if (self.use_short_repeated_primitives
+ and field.cpp_type != descriptor.FieldDescriptor.CPPTYPE_MESSAGE
+ and field.cpp_type != descriptor.FieldDescriptor.CPPTYPE_STRING):
+ self._PrintShortRepeatedPrimitivesValue(field, value)
+ else:
+ for element in value:
+ self.PrintField(field, element)
else:
self.PrintField(field, value)
- def PrintField(self, field, value):
- """Print a single field name/value pair."""
+ def _PrintFieldName(self, field):
+ """Print field name."""
out = self.out
out.write(' ' * self.indent)
if self.use_field_number:
@@ -381,13 +415,25 @@ class _Printer(object):
if field.cpp_type != descriptor.FieldDescriptor.CPPTYPE_MESSAGE:
# The colon is optional in this case, but our cross-language golden files
# don't include it.
- out.write(': ')
+ out.write(':')
+ def PrintField(self, field, value):
+ """Print a single field name/value pair."""
+ self._PrintFieldName(field)
+ self.out.write(' ')
self.PrintFieldValue(field, value)
- if self.as_one_line:
- out.write(' ')
- else:
- out.write('\n')
+ self.out.write(' ' if self.as_one_line else '\n')
+
+ def _PrintShortRepeatedPrimitivesValue(self, field, value):
+ # Note: this is called only when value has at least one element.
+ self._PrintFieldName(field)
+ self.out.write('[')
+ for i in xrange(len(value) - 1):
+ self.PrintFieldValue(field, value[i])
+ self.out.write(', ')
+ self.PrintFieldValue(field, value[-1])
+ self.out.write(']')
+ self.out.write(' ' if self.as_one_line else '\n')
def _PrintMessageFieldValue(self, value):
if self.pointy_brackets:
@@ -398,11 +444,11 @@ class _Printer(object):
closeb = '}'
if self.as_one_line:
- self.out.write(' %s ' % openb)
+ self.out.write('%s ' % openb)
self.PrintMessage(value)
self.out.write(closeb)
else:
- self.out.write(' %s\n' % openb)
+ self.out.write('%s\n' % openb)
self.indent += 2
self.PrintMessage(value)
self.indent -= 2
@@ -428,12 +474,12 @@ class _Printer(object):
out.write(str(value))
elif field.cpp_type == descriptor.FieldDescriptor.CPPTYPE_STRING:
out.write('\"')
- if isinstance(value, six.text_type):
+ if isinstance(value, six.text_type) and (six.PY2 or not self.as_utf8):
out_value = value.encode('utf-8')
else:
out_value = value
if field.type == descriptor.FieldDescriptor.TYPE_BYTES:
- # We need to escape non-UTF8 chars in TYPE_BYTES field.
+ # We always need to escape all binary data in TYPE_BYTES fields.
out_as_utf8 = False
else:
out_as_utf8 = self.as_utf8
@@ -487,12 +533,7 @@ def Parse(text,
Raises:
ParseError: On text parsing problems.
"""
- if not isinstance(text, str):
- if six.PY3:
- text = text.decode('utf-8')
- else:
- text = text.encode('utf-8')
- return ParseLines(text.split('\n'),
+ return ParseLines(text.split(b'\n' if isinstance(text, bytes) else u'\n'),
message,
allow_unknown_extension,
allow_field_number,
@@ -523,13 +564,8 @@ def Merge(text,
Raises:
ParseError: On text parsing problems.
"""
- if not isinstance(text, str):
- if six.PY3:
- text = text.decode('utf-8')
- else:
- text = text.encode('utf-8')
return MergeLines(
- text.split('\n'),
+ text.split(b'\n' if isinstance(text, bytes) else u'\n'),
message,
allow_unknown_extension,
allow_field_number,
@@ -570,6 +606,9 @@ def MergeLines(lines,
descriptor_pool=None):
"""Parses a text representation of a protocol message into a message.
+ Like ParseLines(), but allows repeated values for a non-repeated field, and
+ uses the last one.
+
Args:
lines: An iterable of lines of a message's text representation.
message: A protocol buffer message to merge into.
@@ -601,22 +640,12 @@ class _Parser(object):
self.allow_field_number = allow_field_number
self.descriptor_pool = descriptor_pool
- def ParseFromString(self, text, message):
- """Parses a text representation of a protocol message into a message."""
- if not isinstance(text, str):
- text = text.decode('utf-8')
- return self.ParseLines(text.split('\n'), message)
-
def ParseLines(self, lines, message):
"""Parses a text representation of a protocol message into a message."""
self._allow_multiple_scalars = False
self._ParseOrMerge(lines, message)
return message
- def MergeFromString(self, text, message):
- """Merges a text representation of a protocol message into a message."""
- return self._MergeLines(text.split('\n'), message)
-
def MergeLines(self, lines, message):
"""Merges a text representation of a protocol message into a message."""
self._allow_multiple_scalars = True
@@ -633,7 +662,14 @@ class _Parser(object):
Raises:
ParseError: On text parsing problems.
"""
- tokenizer = Tokenizer(lines)
+ # Tokenize expects native str lines.
+ if six.PY2:
+ str_lines = (line if isinstance(line, str) else line.encode('utf-8')
+ for line in lines)
+ else:
+ str_lines = (line if isinstance(line, str) else line.decode('utf-8')
+ for line in lines)
+ tokenizer = Tokenizer(str_lines)
while not tokenizer.AtEnd():
self._MergeField(tokenizer, message)
@@ -1019,7 +1055,9 @@ class Tokenizer(object):
r'[a-zA-Z_][0-9a-zA-Z_+-]*', # an identifier
r'([0-9+-]|(\.[0-9]))[0-9a-zA-Z_.+-]*', # a number
] + [ # quoted str for each quote mark
- r'{qt}([^{qt}\n\\]|\\.)*({qt}|\\?$)'.format(qt=mark) for mark in _QUOTES
+ # Avoid backtracking! https://stackoverflow.com/a/844267
+ r'{qt}[^{qt}\n\\]*((\\.)+[^{qt}\n\\]*)*({qt}|\\?$)'.format(qt=mark)
+ for mark in _QUOTES
]))
_IDENTIFIER = re.compile(r'[^\d\W]\w*')
@@ -1316,7 +1354,8 @@ class Tokenizer(object):
def ParseError(self, message):
"""Creates and *returns* a ParseError for the current token."""
- return ParseError(message, self._line + 1, self._column + 1)
+ return ParseError('\'' + self._current_line + '\': ' + message,
+ self._line + 1, self._column + 1)
def _StringParseError(self, e):
return self.ParseError('Couldn\'t parse string: ' + str(e))
@@ -1490,6 +1529,12 @@ def _ParseAbstractInteger(text, is_long=False):
ValueError: Thrown Iff the text is not a valid integer.
"""
# Do the actual parsing. Exception handling is propagated to caller.
+ orig_text = text
+ c_octal_match = re.match(r'(-?)0(\d+)$', text)
+ if c_octal_match:
+ # Python 3 no longer supports 0755 octal syntax without the 'o', so
+ # we always use the '0o' prefix for multi-digit numbers starting with 0.
+ text = c_octal_match.group(1) + '0o' + c_octal_match.group(2)
try:
# We force 32-bit values to int and 64-bit values to long to make
# alternate implementations where the distinction is more significant
@@ -1499,7 +1544,7 @@ def _ParseAbstractInteger(text, is_long=False):
else:
return int(text, 0)
except ValueError:
- raise ValueError('Couldn\'t parse integer: %s' % text)
+ raise ValueError('Couldn\'t parse integer: %s' % orig_text)
def ParseFloat(text):