From 65da9fd97f67a9499771b6294cf017248ece5661 Mon Sep 17 00:00:00 2001 From: Laszlo Csomor Date: Thu, 30 Nov 2017 14:12:24 +0100 Subject: io_win32: support non-ASCII paths Fixes https://github.com/google/protobuf/issues/3951 --- src/google/protobuf/stubs/io_win32.cc | 157 ++++++++++++++++--------- src/google/protobuf/stubs/io_win32.h | 18 ++- src/google/protobuf/stubs/io_win32_unittest.cc | 90 +++++++++----- 3 files changed, 176 insertions(+), 89 deletions(-) diff --git a/src/google/protobuf/stubs/io_win32.cc b/src/google/protobuf/stubs/io_win32.cc index fa2cb8b1..b5af494a 100644 --- a/src/google/protobuf/stubs/io_win32.cc +++ b/src/google/protobuf/stubs/io_win32.cc @@ -30,10 +30,11 @@ // Author: laszlocsomor@google.com (Laszlo Csomor) // -// Implementation for long-path-aware open/mkdir/etc. on Windows. +// Implementation for long-path-aware open/mkdir/access/etc. on Windows, as well +// as for the supporting utility functions. // // These functions convert the input path to an absolute Windows path -// with "\\?\" prefix if necessary, then pass that to _wopen/_wmkdir/etc. +// with "\\?\" prefix, then pass that to _wopen/_wmkdir/_waccess/etc. // (declared in ) respectively. This allows working with files/directories // whose paths are longer than MAX_PATH (260 chars). // @@ -59,7 +60,6 @@ #include #include -#include #include #include #include @@ -89,6 +89,11 @@ struct CharTraits { static bool is_alpha(wchar_t ch) { return iswalpha(ch); } }; +template +bool null_or_empty(const char_type* s) { + return s == nullptr || *s == 0; +} + // Returns true if the path starts with a drive letter, e.g. "c:". // Note that this won't check for the "\" after the drive letter, so this also // returns true for "c:foo" (which is "c:\${PWD}\foo"). @@ -121,16 +126,7 @@ bool is_drive_relative(const char_type* path) { return has_drive_letter(path) && (path[2] == 0 || !is_separator(path[2])); } -template -void replace_directory_separators(char_type* p) { - for (; *p; ++p) { - if (*p == '/') { - *p = '\\'; - } - } -} - -string join_paths(const string& path1, const string& path2) { +wstring join_paths(const wstring& path1, const wstring& path2) { if (path1.empty() || is_path_absolute(path2.c_str()) || has_longpath_prefix(path2.c_str())) { return path2; @@ -144,23 +140,23 @@ string join_paths(const string& path1, const string& path2) { : (path1 + path2); } else { return is_separator(path2[0]) ? (path1 + path2) - : (path1 + '\\' + path2); + : (path1 + L'\\' + path2); } } -string normalize(string path) { +wstring normalize(wstring path) { if (has_longpath_prefix(path.c_str())) { path = path.substr(4); } - static const string dot("."); - static const string dotdot(".."); + static const wstring dot(L"."); + static const wstring dotdot(L".."); - std::vector segments; + std::vector segments; int segment_start = -1; // Find the path segments in `path` (separated by "/"). for (int i = 0;; ++i) { - if (!is_separator(path[i]) && path[i] != '\0') { + if (!is_separator(path[i]) && path[i] != L'\0') { // The current character does not end a segment, so start one unless it's // already started. if (segment_start < 0) { @@ -169,7 +165,7 @@ string normalize(string path) { } else if (segment_start >= 0 && i > segment_start) { // The current character is "/" or "\0", so this ends a segment. // Add that to `segments` if there's anything to add; handle "." and "..". - string segment(path, segment_start, i - segment_start); + wstring segment(path, segment_start, i - segment_start); segment_start = -1; if (segment == dotdot) { if (!segments.empty() && @@ -180,7 +176,7 @@ string normalize(string path) { segments.push_back(segment); } } - if (path[i] == '\0') { + if (path[i] == L'\0') { break; } } @@ -189,64 +185,58 @@ string normalize(string path) { // form of it, e.g. "c:\.."). if (segments.size() == 1 && segments[0].size() == 2 && has_drive_letter(segments[0].c_str())) { - return segments[0] + '\\'; + return segments[0] + L'\\'; } // Join all segments. bool first = true; - std::ostringstream result; + std::wstringstream result; for (int i = 0; i < segments.size(); ++i) { if (!first) { - result << '\\'; + result << L'\\'; } first = false; result << segments[i]; } // Preserve trailing separator if the input contained it. if (!path.empty() && is_separator(path[path.size() - 1])) { - result << '\\'; + result << L'\\'; } return result.str(); } -WCHAR* as_wstring(const string& s) { - int len = ::MultiByteToWideChar(CP_UTF8, 0, s.c_str(), s.size(), NULL, 0); - WCHAR* result = new WCHAR[len + 1]; - ::MultiByteToWideChar(CP_UTF8, 0, s.c_str(), s.size(), result, len + 1); - result[len] = 0; - return result; -} - -void as_wchar_path(const string& path, wstring* wchar_path) { - scoped_array wbuf(as_wstring(path)); - replace_directory_separators(wbuf.get()); - wchar_path->assign(wbuf.get()); -} - -bool as_windows_path(const string& path, wstring* result) { - if (path.empty()) { +bool as_windows_path(const char* path, wstring* result) { + if (null_or_empty(path)) { result->clear(); return true; } - if (is_separator(path[0]) || is_drive_relative(path.c_str())) { + if (is_separator(path[0]) || is_drive_relative(path)) { + return false; + } + + wstring wpath; + if (!strings::utf8_to_wcs(path, &wpath)) { return false; } - string mutable_path = path; - if (!is_path_absolute(mutable_path.c_str()) && - !has_longpath_prefix(mutable_path.c_str())) { - char cwd[MAX_PATH]; - ::GetCurrentDirectoryA(MAX_PATH, cwd); - mutable_path = join_paths(cwd, mutable_path); + if (!is_path_absolute(wpath.c_str()) && !has_longpath_prefix(wpath.c_str())) { + int size = ::GetCurrentDirectoryW(0, NULL); + if (size == 0 && GetLastError() != ERROR_INSUFFICIENT_BUFFER) { + return false; + } + scoped_array wcwd(new WCHAR[size]); + ::GetCurrentDirectoryW(size, wcwd.get()); + wpath = join_paths(wcwd.get(), wpath); } - as_wchar_path(normalize(mutable_path), result); - if (!has_longpath_prefix(result->c_str())) { + wpath = normalize(wpath); + if (!has_longpath_prefix(wpath.c_str())) { // Add the "\\?\" prefix unconditionally. This way we prevent the Win32 API // from processing the path and "helpfully" removing trailing dots from the // path, for example. // See https://github.com/bazelbuild/bazel/issues/2935 - *result = wstring(L"\\\\?\\") + *result; + wpath = wstring(L"\\\\?\\") + wpath; } + *result = wpath; return true; } @@ -319,13 +309,21 @@ int stat(const char* path, struct _stat* buffer) { FILE* fopen(const char* path, const char* mode) { #ifdef SUPPORT_LONGPATHS + if (null_or_empty(path)) { + errno = EINVAL; + return NULL; + } wstring wpath; if (!as_windows_path(path, &wpath)) { errno = ENOENT; return NULL; } - scoped_array wmode(as_wstring(mode)); - return ::_wfopen(wpath.c_str(), wmode.get()); + wstring wmode; + if (!strings::utf8_to_wcs(mode, &wmode)) { + errno = EINVAL; + return NULL; + } + return ::_wfopen(wpath.c_str(), wmode.c_str()); #else return ::fopen(path, mode); #endif @@ -347,16 +345,61 @@ int write(int fd, const void* buffer, size_t size) { return ::_write(fd, buffer, size); } -wstring testonly_path_to_winpath(const string& path) { +wstring testonly_utf8_to_winpath(const char* path) { wstring wpath; - as_windows_path(path, &wpath); - return wpath; + return as_windows_path(path, &wpath) ? wpath : wstring(); +} + +namespace strings { + +bool wcs_to_mbs(const WCHAR* s, string* out, bool outUtf8) { + if (null_or_empty(s)) { + out->clear(); + return true; + } + BOOL usedDefaultChar = FALSE; + SetLastError(0); + int size = WideCharToMultiByte( + outUtf8 ? CP_UTF8 : CP_ACP, 0, s, -1, NULL, 0, NULL, + outUtf8 ? NULL : &usedDefaultChar); + if ((size == 0 && GetLastError() != ERROR_INSUFFICIENT_BUFFER) + || usedDefaultChar) { + return false; + } + scoped_array astr(new CHAR[size]); + WideCharToMultiByte( + outUtf8 ? CP_UTF8 : CP_ACP, 0, s, -1, astr.get(), size, NULL, NULL); + out->assign(astr.get()); + return true; +} + +bool mbs_to_wcs(const char* s, wstring* out, bool inUtf8) { + if (null_or_empty(s)) { + out->clear(); + return true; + } + + SetLastError(0); + int size = + MultiByteToWideChar(inUtf8 ? CP_UTF8 : CP_ACP, 0, s, -1, NULL, 0); + if (size == 0 && GetLastError() != ERROR_INSUFFICIENT_BUFFER) { + return false; + } + scoped_array wstr(new WCHAR[size]); + MultiByteToWideChar( + inUtf8 ? CP_UTF8 : CP_ACP, 0, s, -1, wstr.get(), size + 1); + out->assign(wstr.get()); + return true; } +bool utf8_to_wcs(const char* input, wstring* out) { + return mbs_to_wcs(input, out, true); +} + +} // namespace strings } // namespace win32 } // namespace internal } // namespace protobuf } // namespace google #endif // defined(_WIN32) - diff --git a/src/google/protobuf/stubs/io_win32.h b/src/google/protobuf/stubs/io_win32.h index 53160089..60d0ceb6 100644 --- a/src/google/protobuf/stubs/io_win32.h +++ b/src/google/protobuf/stubs/io_win32.h @@ -69,8 +69,22 @@ LIBPROTOBUF_EXPORT int read(int fd, void* buffer, size_t size); LIBPROTOBUF_EXPORT int setmode(int fd, int mode); LIBPROTOBUF_EXPORT int stat(const char* path, struct _stat* buffer); LIBPROTOBUF_EXPORT int write(int fd, const void* buffer, size_t size); -LIBPROTOBUF_EXPORT std::wstring testonly_path_to_winpath( - const std::string& path); +LIBPROTOBUF_EXPORT std::wstring testonly_utf8_to_winpath(const char* path); + +namespace strings { + +// Convert from UTF-16 to Active-Code-Page-encoded or to UTF-8-encoded text. +LIBPROTOBUF_EXPORT bool wcs_to_mbs( + const wchar_t* s, std::string* out, bool outUtf8); + +// Convert from Active-Code-Page-encoded or UTF-8-encoded text to UTF-16. +LIBPROTOBUF_EXPORT bool mbs_to_wcs( + const char* s, std::wstring* out, bool inUtf8); + +// Convert from UTF-8-encoded text to UTF-16. +LIBPROTOBUF_EXPORT bool utf8_to_wcs(const char* input, std::wstring* out); + +} // namespace strings } // namespace win32 } // namespace internal diff --git a/src/google/protobuf/stubs/io_win32_unittest.cc b/src/google/protobuf/stubs/io_win32_unittest.cc index a5c7dbfd..e88b7554 100644 --- a/src/google/protobuf/stubs/io_win32_unittest.cc +++ b/src/google/protobuf/stubs/io_win32_unittest.cc @@ -30,7 +30,8 @@ // Author: laszlocsomor@google.com (Laszlo Csomor) // -// Unit tests for long-path-aware open/mkdir/access on Windows. +// Unit tests for long-path-aware open/mkdir/access/etc. on Windows, as well as +// for the supporting utility functions. // // This file is only used on Windows, it's empty on other platforms. @@ -89,13 +90,17 @@ void StripTrailingSlashes(string* str) { str->resize(i+1); } -bool GetEnvVar(const char* name, string* result) { - DWORD size = ::GetEnvironmentVariableA(name, NULL, 0); +bool GetEnvVarAsUtf8(const WCHAR* name, string* result) { + DWORD size = ::GetEnvironmentVariableW(name, NULL, 0); if (size > 0 && GetLastError() != ERROR_ENVVAR_NOT_FOUND) { - scoped_array str(new char[size]); - ::GetEnvironmentVariableA(name, str.get(), size); - result->assign(str.get()); - return true; + scoped_array wcs(new WCHAR[size]); + ::GetEnvironmentVariableW(name, wcs.get(), size); + // GetEnvironmentVariableA retrieves an Active-Code-Page-encoded text which + // we'd first need to convert to UTF-16 then to UTF-8, because there seems + // to be no API function to do that conversion directly. + // GetEnvironmentVariableW retrieves an UTF-16-encoded text, which we need + // to convert to UTF-8. + return strings::wcs_to_mbs(wcs.get(), result, true); } else { return false; } @@ -104,30 +109,30 @@ bool GetEnvVar(const char* name, string* result) { } // namespace void IoWin32Test::SetUp() { + test_tmpdir.clear(); + wtest_tmpdir.clear(); + string tmp; bool ok = false; if (!ok) { - ok = GetEnvVar("TEST_TMPDIR", &tmp); + ok = GetEnvVarAsUtf8(L"TEST_TMPDIR", &tmp); } if (!ok) { - ok = GetEnvVar("TEMP", &tmp); + ok = GetEnvVarAsUtf8(L"TEMP", &tmp); } if (!ok) { - ok = GetEnvVar("TMP", &tmp); + ok = GetEnvVarAsUtf8(L"TMP", &tmp); } if (!ok || tmp.empty()) { FAIL(); } - StripTrailingSlashes(&tmp); test_tmpdir = tmp + "\\io_win32_unittest.tmp"; - wtest_tmpdir = testonly_path_to_winpath(test_tmpdir); - if (!DeleteAllUnder(wtest_tmpdir) || !CreateAllUnder(wtest_tmpdir)) { - FAIL(); - test_tmpdir.clear(); - wtest_tmpdir.clear(); - } + wtest_tmpdir = testonly_utf8_to_winpath(test_tmpdir.c_str()); + ASSERT_FALSE(wtest_tmpdir.empty()); + ASSERT_TRUE(DeleteAllUnder(wtest_tmpdir)); + ASSERT_TRUE(CreateAllUnder(wtest_tmpdir)); } void IoWin32Test::TearDown() { @@ -171,8 +176,8 @@ bool IoWin32Test::DeleteAllUnder(wstring path) { path = wstring(L"\\\\?\\") + path; } // Append "\" if necessary. - if (path[path.size() - 1] != '\\') { - path.push_back('\\'); + if (path[path.size() - 1] != L'\\') { + path.push_back(L'\\'); } WIN32_FIND_DATAW metadata; @@ -290,12 +295,12 @@ TEST_F(IoWin32Test, MkdirTest) { } TEST_F(IoWin32Test, ChdirTest) { - char owd[MAX_PATH]; - EXPECT_GT(::GetCurrentDirectoryA(MAX_PATH, owd), 0); + WCHAR owd[MAX_PATH]; + EXPECT_GT(::GetCurrentDirectoryW(MAX_PATH, owd), 0); string path("C:\\"); EXPECT_EQ(access(path.c_str(), F_OK), 0); ASSERT_EQ(chdir(path.c_str()), 0); - EXPECT_TRUE(::SetCurrentDirectoryA(owd)); + EXPECT_TRUE(::SetCurrentDirectoryW(owd)); // Do not try to chdir into the test_tmpdir, it may already contain directory // names with trailing dots. @@ -316,11 +321,11 @@ TEST_F(IoWin32Test, AsWindowsPathTest) { EXPECT_GT(GetCurrentDirectoryW(size, cwd_str.get()), 0); wstring cwd = wstring(L"\\\\?\\") + cwd_str.get(); - ASSERT_EQ(testonly_path_to_winpath("relative_mkdirtest"), + ASSERT_EQ(testonly_utf8_to_winpath("relative_mkdirtest"), cwd + L"\\relative_mkdirtest"); - ASSERT_EQ(testonly_path_to_winpath("preserve//\\trailing///"), + ASSERT_EQ(testonly_utf8_to_winpath("preserve//\\trailing///"), cwd + L"\\preserve\\trailing\\"); - ASSERT_EQ(testonly_path_to_winpath("./normalize_me\\/../blah"), + ASSERT_EQ(testonly_utf8_to_winpath("./normalize_me\\/../blah"), cwd + L"\\blah"); std::ostringstream relpath; for (wchar_t* p = cwd_str.get(); *p; ++p) { @@ -329,18 +334,43 @@ TEST_F(IoWin32Test, AsWindowsPathTest) { } } relpath << ".\\/../\\./beyond-toplevel"; - ASSERT_EQ(testonly_path_to_winpath(relpath.str()), + ASSERT_EQ(testonly_utf8_to_winpath(relpath.str().c_str()), wstring(L"\\\\?\\") + cwd_str.get()[0] + L":\\beyond-toplevel"); // Absolute unix paths lack drive letters, driveless absolute windows paths // do too. Neither can be converted to a drive-specifying absolute Windows // path. - ASSERT_EQ(testonly_path_to_winpath("/absolute/unix/path"), L""); + ASSERT_EQ(testonly_utf8_to_winpath("/absolute/unix/path"), L""); // Though valid on Windows, we also don't support UNC paths (\\UNC\\blah). - ASSERT_EQ(testonly_path_to_winpath("\\driveless\\absolute"), L""); + ASSERT_EQ(testonly_utf8_to_winpath("\\driveless\\absolute"), L""); // Though valid in cmd.exe, drive-relative paths are not supported. - ASSERT_EQ(testonly_path_to_winpath("c:foo"), L""); - ASSERT_EQ(testonly_path_to_winpath("c:/foo"), L"\\\\?\\c:\\foo"); + ASSERT_EQ(testonly_utf8_to_winpath("c:foo"), L""); + ASSERT_EQ(testonly_utf8_to_winpath("c:/foo"), L"\\\\?\\c:\\foo"); +} + +TEST_F(IoWin32Test, Utf8ToUtf16Test) { + const char hi_utf8[] = { + 'h', 'i', ' ', + // utf-8: 11010000 10011111, utf-16: 100 0001 1111 = 0x041F + 0xd0, 0x9f, + // utf-8: 11010001 10000000, utf-16: 100 0100 0000 = 0x0440 + 0xd1, 0x80, + // utf-8: 11010000 10111000, utf-16: 100 0011 1000 = 0x0438 + 0xd0, 0xb8, + // utf-8: 11010000 10110010, utf-16: 100 0011 0010 = 0x0432 + 0xd0, 0xb2, + // utf-8: 11010000 10110101, utf-16: 100 0011 0101 = 0x0435 + 0xd0, 0xb5, + // utf-8: 11010001 10000010, utf-16: 100 0100 0010 = 0x0442 + 0xd1, 0x82, 0 + }; + const wchar_t hi_utf16[] = { + L'h', L'i', L' ', 0x041f, 0x0440, 0x0438, 0x0432, 0x0435, 0x0442, 0 + }; + + wstring wcs; + ASSERT_TRUE(strings::utf8_to_wcs(hi_utf8, &wcs)); + ASSERT_EQ(wcs, hi_utf16); } } // namespace -- cgit v1.2.3