diff options
Diffstat (limited to 'src/google/protobuf/stubs/structurally_valid.cc')
-rw-r--r-- | src/google/protobuf/stubs/structurally_valid.cc | 52 |
1 files changed, 52 insertions, 0 deletions
diff --git a/src/google/protobuf/stubs/structurally_valid.cc b/src/google/protobuf/stubs/structurally_valid.cc
index 0f6afe6d..d79a6ee4 100644
--- a/src/google/protobuf/stubs/structurally_valid.cc
+++ b/src/google/protobuf/stubs/structurally_valid.cc
@@ -3,6 +3,8 @@
 
 #include <google/protobuf/stubs/common.h>
 
+#include <google/protobuf/stubs/stringpiece.h>
+
 namespace google {
 namespace protobuf {
 namespace internal {
@@ -531,6 +533,56 @@ bool IsStructurallyValidUTF8(const char* buf, int len) {
   return (bytes_consumed == len);
 }
 
+int UTF8SpnStructurallyValid(const StringPiece& str) {
+  if (!module_initialized_) return str.size();  // no scan; report full length
+  // Otherwise return the byte count the scan below reports in bytes_consumed.
+  int bytes_consumed = 0;
+  UTF8GenericScanFastAscii(&utf8acceptnonsurrogates_obj,
+                           str.data(), str.size(), &bytes_consumed);
+  return bytes_consumed;
+}
+
+// Coerce the UTF-8 byte string in src_str to be
+// a structurally-valid equal-length string by selectively
+// overwriting illegal bytes with replace_char (typically blank).
+// replace_char must be legal printable 7-bit ASCII 0x20..0x7e.
+// src_str is read-only. If any overwriting is needed, a modified byte string
+// is created in idst, with the same length as src_str.
+//
+// Returns a pointer to the output buffer: src_str.data() if no changes were
+// made, or idst if some bytes were changed.
+//
+// Fast case: all is structurally valid and no byte copying is done.
+// idst must hold src_str.length() bytes; the fast case never writes to it.
+char* UTF8CoerceToStructurallyValid(const StringPiece& src_str,
+                                    char* idst,
+                                    const char replace_char) {
+  const char* isrc = src_str.data();
+  const int len = src_str.length();
+  int n = UTF8SpnStructurallyValid(src_str);  // length of the leading good run
+  if (n == len) {               // Normal case -- whole input accepted, no copy
+    return const_cast<char*>(isrc);
+  } else {                      // Unusual case -- copy, overwriting bad bytes
+    const char* src = isrc;
+    const char* srclimit = isrc + len;
+    char* dst = idst;
+    memmove(dst, src, n);       // Copy initial good chunk
+    src += n;
+    dst += n;
+    while (src < srclimit) {    // src is at a bad byte, or at srclimit (done)
+      dst[0] = replace_char;    // Replace exactly one bad byte
+      src++;
+      dst++;
+      StringPiece str2(src, srclimit - src);
+      n = UTF8SpnStructurallyValid(str2);  // Scan the remainder
+      memmove(dst, src, n);     // Copy next good chunk (n may be 0)
+      src += n;
+      dst += n;
+    }
+  }
+  return idst;
+}
+
 }  // namespace internal
 }  // namespace protobuf
 }  // namespace google