llvm.org GIT mirror llvm / 52fa0d0
Add writeFileWithSystemEncoding to LibLLVMSupport. This patch adds to LLVMSupport the capability of writing files with international characters encoded in the current system encoding. This is relevant for Windows, where we can either use UTF16 or the current code page (the legacy Windows international characters). On UNIX, the file is always saved in UTF8. This will be used in a patch for clang to thoroughly support response file creation when calling other tools, addressing PR15171. On Windows, to correctly support internationalization, we need the ability to write response files either in UTF16 or in the current code page, depending on the tool we will call. GCC for mingw, for instance, requires files to be encoded in the current code page. MSVC tools require files to be encoded in UTF16. Patch by Rafael Auler! git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@217068 91177308-0d34-0410-b5e6-96231b3b80d8 Rafael Espindola 5 years ago
6 changed file(s) with 167 addition(s) and 4 deletion(s). Raw diff Collapse all Expand all
125125 /// argument length limits.
126126 bool argumentsFitWithinSystemLimits(ArrayRef Args);
127127
128 /// File encoding options when writing contents that a non-UTF8 tool will
129 /// read (on Windows systems). For UNIX, we always use UTF-8.
130 enum WindowsEncodingMethod {
131 /// UTF-8 is the LLVM native encoding, being the same as "do not perform
132 /// encoding conversion".
133 WEM_UTF8,
/// Convert the contents to the current Windows code page (CP_ACP) before
/// writing them.
134 WEM_CurrentCodePage,
/// Convert the contents to UTF-16 and write them after a native-endian
/// byte order mark.
135 WEM_UTF16
136 };
137
138 /// Saves the UTF8-encoded \p Contents string into the file \p FileName
139 /// using a specific encoding.
140 ///
141 /// This write file function adds the possibility to choose which encoding
142 /// to use when writing a text file. On Windows, this is important when
143 /// writing files with internationalization support with an encoding that is
144 /// different from the one used in LLVM (UTF-8). We use this when writing
145 /// response files, since GCC tools on MinGW only understand legacy code
146 /// pages, and VisualStudio tools only understand UTF-16.
147 /// For UNIX, using different encodings is silently ignored, since all tools
148 /// work well with UTF-8.
149 /// This function assumes that you only use UTF-8 *text* data and will convert
150 /// it to your desired encoding before writing to the file.
151 ///
152 /// FIXME: We use WEM_CurrentCodePage to write response files for GNU tools in
153 /// a MinGW/MinGW-w64 environment, which has serious flaws but currently is
154 /// our best shot to make gcc/ld understand international characters. This
155 /// should be changed as soon as binutils fix this to support UTF16 on mingw.
156 ///
157 /// \returns non-zero error_code if failed
158 std::error_code
159 writeFileWithEncoding(StringRef FileName, StringRef Contents,
160 WindowsEncodingMethod Encoding = WEM_UTF8);
161
128162 /// This function waits for the process specified by \p PI to finish.
129163 /// \returns A \see ProcessInfo struct with Pid set to:
130164 /// \li The process id of the child process if the child process has changed
1818 #include "Unix.h"
1919 #include "llvm/Support/Compiler.h"
2020 #include "llvm/Support/FileSystem.h"
21 #include "llvm/Support/raw_ostream.h"
2122 #include
2223 #if HAVE_SYS_STAT_H
2324 #include
439440 return std::error_code();
440441 }
441442
443 std::error_code
444 llvm::sys::writeFileWithEncoding(StringRef FileName, StringRef Contents,
445 WindowsEncodingMethod Encoding /*unused*/) {
446 std::error_code EC;
447 llvm::raw_fd_ostream OS(FileName, EC, llvm::sys::fs::OpenFlags::F_Text);
448
449 if (EC)
450 return EC;
451
452 OS << Contents;
453
454 if (OS.has_error())
455 return std::make_error_code(std::errc::io_error);
456
457 return EC;
458 }
459
442460 bool llvm::sys::argumentsFitWithinSystemLimits(ArrayRef Args) {
443461 static long ArgMax = sysconf(_SC_ARG_MAX);
444462
918918 return std::error_code();
919919 }
920920
921 std::error_code UTF16ToUTF8(const wchar_t *utf16, size_t utf16_len,
922 llvm::SmallVectorImpl &utf8) {
921 static
922 std::error_code UTF16ToCodePage(unsigned codepage, const wchar_t *utf16,
923 size_t utf16_len,
924 llvm::SmallVectorImpl &utf8) {
923925 if (utf16_len) {
924926 // Get length.
925 int len = ::WideCharToMultiByte(CP_UTF8, 0, utf16, utf16_len, utf8.begin(),
927 int len = ::WideCharToMultiByte(codepage, 0, utf16, utf16_len, utf8.begin(),
926928 0, NULL, NULL);
927929
928930 if (len == 0)
932934 utf8.set_size(len);
933935
934936 // Now do the actual conversion.
935 len = ::WideCharToMultiByte(CP_UTF8, 0, utf16, utf16_len, utf8.data(),
937 len = ::WideCharToMultiByte(codepage, 0, utf16, utf16_len, utf8.data(),
936938 utf8.size(), NULL, NULL);
937939
938940 if (len == 0)
945947
946948 return std::error_code();
947949 }
950
/// Converts the \p utf16_len wide characters at \p utf16 to UTF-8 by
/// forwarding to UTF16ToCodePage with CP_UTF8. The converted text is stored
/// in \p utf8; the returned error_code is whatever UTF16ToCodePage reports.
951 std::error_code UTF16ToUTF8(const wchar_t *utf16, size_t utf16_len,
952 llvm::SmallVectorImpl &utf8) {
953 return UTF16ToCodePage(CP_UTF8, utf16, utf16_len, utf8);
954 }
955
/// Converts the \p utf16_len wide characters at \p utf16 to the current code
/// page by forwarding to UTF16ToCodePage with CP_ACP. Despite its name, the
/// output parameter \p utf8 receives code-page text here, not UTF-8. The
/// returned error_code is whatever UTF16ToCodePage reports.
956 std::error_code UTF16ToCurCP(const wchar_t *utf16, size_t utf16_len,
957 llvm::SmallVectorImpl &utf8) {
958 return UTF16ToCodePage(CP_ACP, utf16, utf16_len, utf8);
959 }
948960 } // end namespace windows
949961 } // end namespace sys
950962 } // end namespace llvm
1111 //===----------------------------------------------------------------------===//
1212
1313 #include "WindowsSupport.h"
14 #include "llvm/Support/ConvertUTF.h"
1415 #include "llvm/Support/FileSystem.h"
16 #include "llvm/Support/raw_ostream.h"
1517 #include
1618 #include
1719 #include
439441 return std::error_code();
440442 }
441443
444 std::error_code
445 llvm::sys::writeFileWithEncoding(StringRef FileName, StringRef Contents,
446 WindowsEncodingMethod Encoding) {
447 std::error_code EC;
448 llvm::raw_fd_ostream OS(FileName, EC, llvm::sys::fs::OpenFlags::F_Text);
449 if (EC)
450 return EC;
451
452 if (Encoding == WEM_UTF8) {
453 OS << Contents;
454 } else if (Encoding == WEM_CurrentCodePage) {
455 SmallVector ArgsUTF16;
456 SmallVector ArgsCurCP;
457
458 if ((EC = windows::UTF8ToUTF16(Contents, ArgsUTF16)))
459 return EC;
460
461 if ((EC = windows::UTF16ToCurCP(
462 ArgsUTF16.data(), ArgsUTF16.size(), ArgsCurCP)))
463 return EC;
464
465 OS.write(ArgsCurCP.data(), ArgsCurCP.size());
466 } else if (Encoding == WEM_UTF16) {
467 SmallVector ArgsUTF16;
468
469 if ((EC = windows::UTF8ToUTF16(Contents, ArgsUTF16)))
470 return EC;
471
472 // Endianness guessing
473 char BOM[2];
474 uint16_t src = UNI_UTF16_BYTE_ORDER_MARK_NATIVE;
475 memcpy(BOM, &src, 2);
476 OS.write(BOM, 2);
477 OS.write((char *)ArgsUTF16.data(), ArgsUTF16.size() << 1);
478 } else {
479 llvm_unreachable("Unknown encoding");
480 }
481
482 if (OS.has_error())
483 return std::make_error_code(std::errc::io_error);
484
485 return EC;
486 }
487
442488 bool llvm::sys::argumentsFitWithinSystemLimits(ArrayRef Args) {
443489 // The documented max length of the command line passed to CreateProcess.
444490 static const size_t MaxCommandStringLength = 32768;
165165 std::error_code UTF8ToUTF16(StringRef utf8, SmallVectorImpl &utf16);
166166 std::error_code UTF16ToUTF8(const wchar_t *utf16, size_t utf16_len,
167167 SmallVectorImpl &utf8);
168 /// Convert from UTF16 to the current code page used in the system
/// The converted text is stored in \p utf8 which, despite its name, then
/// holds text in the current code page rather than UTF-8.
169 std::error_code UTF16ToCurCP(const wchar_t *utf16, size_t utf16_len,
170 SmallVectorImpl &utf8);
168171 } // end namespace windows
169172 } // end namespace sys
170173 } // end namespace llvm.
3333 #error sleep_for is not implemented on your platform.
3434 #endif
3535
// Evaluates x, which must yield a std::error_code. If that code is set, a
// message containing the stringified expression, the error number and the
// error message is built and reported through GTEST_FATAL_FAILURE_.
// The empty trailing `else` presumably keeps an `else` written after the
// macro invocation from attaching to the macro's own `if`.
36 #define ASSERT_NO_ERROR(x) \
37 if (std::error_code ASSERT_NO_ERROR_ec = x) { \
38 SmallString<128> MessageStorage; \
39 raw_svector_ostream Message(MessageStorage); \
40 Message << #x ": did not return errc::success.\n" \
41 << "error number: " << ASSERT_NO_ERROR_ec.value() << "\n" \
42 << "error message: " << ASSERT_NO_ERROR_ec.message() << "\n"; \
43 GTEST_FATAL_FAILURE_(MessageStorage.c_str()); \
44 } else { \
45 }
3646 // From TestMain.cpp.
3747 extern const char *TestMainArgv0;
3848
219229
220230 }
221231
// Reference encodings of the test string "lingüiça" ('ü' is U+00FC, 'ç' is
// U+00E7), without a byte order mark. The UTF-16 forms are 16 bytes long and
// contain NUL bytes; they are only defined for Windows builds. The UTF-8 form
// is 10 bytes long.
232 #ifdef LLVM_ON_WIN32
233 const char utf16le_text[] =
234 "\x6c\x00\x69\x00\x6e\x00\x67\x00\xfc\x00\x69\x00\xe7\x00\x61\x00";
235 const char utf16be_text[] =
236 "\x00\x6c\x00\x69\x00\x6e\x00\x67\x00\xfc\x00\x69\x00\xe7\x00\x61";
237 #endif
238 const char utf8_text[] = "\x6c\x69\x6e\x67\xc3\xbc\x69\xc3\xa7\x61";
239
240 TEST(ProgramTest, TestWriteWithSystemEncoding) {
241 SmallString<128> TestDirectory;
242 ASSERT_NO_ERROR(fs::createUniqueDirectory("program-test", TestDirectory));
243 errs() << "Test Directory: " << TestDirectory << '\n';
244 errs().flush();
245 SmallString<128> file_pathname(TestDirectory);
246 path::append(file_pathname, "international-file.txt");
247 // Only on Windows we should encode in UTF16. For other systems, use UTF8
248 ASSERT_NO_ERROR(sys::writeFileWithEncoding(file_pathname.c_str(), utf8_text,
249 sys::WEM_UTF16));
250 int fd = 0;
251 ASSERT_NO_ERROR(fs::openFileForRead(file_pathname.c_str(), fd));
252 #if defined(LLVM_ON_WIN32)
253 char buf[18];
254 ASSERT_EQ(::read(fd, buf, 18), 18);
255 if (strncmp(buf, "\xfe\xff", 2) == 0) { // UTF16-BE
256 ASSERT_EQ(strncmp(&buf[2], utf16be_text, 16), 0);
257 } else if (strncmp(buf, "\xff\xfe", 2) == 0) { // UTF16-LE
258 ASSERT_EQ(strncmp(&buf[2], utf16le_text, 16), 0);
259 } else {
260 FAIL() << "Invalid BOM in UTF-16 file";
261 }
262 #else
263 char buf[10];
264 ASSERT_EQ(::read(fd, buf, 10), 10);
265 ASSERT_EQ(strncmp(buf, utf8_text, 10), 0);
266 #endif
267 ::close(fd);
268 ASSERT_NO_ERROR(fs::remove(file_pathname.str()));
269 ASSERT_NO_ERROR(fs::remove(TestDirectory.str()));
270 }
271
222272 } // end anonymous namespace