llvm.org GIT mirror llvm / 86d0c61
[YAML] Escape non-printable multibyte UTF8 in Output::scalarString. The existing YAML Output::scalarString code path includes a partial and incorrect implementation of YAML escaping logic. In particular, the logic put in place in rL321283 escapes non-printable bytes only if they are not part of a multibyte UTF8 sequence; implicitly this means that all multibyte UTF8 sequences -- printable and non-printable -- are passed through verbatim. The simplest solution to this is to direct the Output::scalarString method to use the standalone yaml::escape function, and this _almost_ works, except that the existing code in that function _over_ escapes: any multibyte UTF8 sequence is escaped, even printable ones. While this is permitted for YAML, it is also more aggressive (and hard to read for non-English locales) than necessary, and the entire point of rL321283 was to back off such aggressive over-escaping. So in this change, I have both redirected Output::scalarString to use yaml::escape _and_ modified yaml::escape to optionally restrict its escaping to non-printables. This preserves behaviour of any existing clients while giving them a path to more moderate escaping should they desire. Reviewers: JDevlieghere, thegameg, MatzeB, vladimir.plyashkun Reviewed By: thegameg Subscribers: llvm-commits Differential Revision: https://reviews.llvm.org/D44863 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@328661 91177308-0d34-0410-b5e6-96231b3b80d8 Graydon Hoare 1 year, 10 months ago
4 changed file(s) with 41 addition(s) and 35 deletion(s). Raw diff Collapse all Expand all
7272 /// \returns true if there was an error, false otherwise.
7373 bool scanTokens(StringRef Input);
7474
75 /// \brief Escape \a Input for a double quoted scalar.
76 std::string escape(StringRef Input);
75 /// \brief Escape \a Input for a double quoted scalar. If \p EscapePrintable
76 /// is true, all UTF8 sequences will be escaped. If \p EscapePrintable is
77 /// false, those UTF8 sequences encoding printable unicode scalars will not be
78 /// escaped, but emitted verbatim.
79 std::string escape(StringRef Input, bool EscapePrintable = true);
7780
7881 /// \brief This class represents a YAML stream potentially containing multiple
7982 /// documents.
2525 #include "llvm/Support/MemoryBuffer.h"
2626 #include "llvm/Support/SMLoc.h"
2727 #include "llvm/Support/SourceMgr.h"
28 #include "llvm/Support/Unicode.h"
2829 #include "llvm/Support/raw_ostream.h"
2930 #include
3031 #include
686687 return true;
687688 }
688689
689 std::string yaml::escape(StringRef Input) {
690 std::string yaml::escape(StringRef Input, bool EscapePrintable) {
690691 std::string EscapedInput;
691692 for (StringRef::iterator i = Input.begin(), e = Input.end(); i != e; ++i) {
692693 if (*i == '\\')
733734 EscapedInput += "\\L";
734735 else if (UnicodeScalarValue.first == 0x2029)
735736 EscapedInput += "\\P";
737 else if (!EscapePrintable &&
738 sys::unicode::isPrintable(UnicodeScalarValue.first))
739 EscapedInput += StringRef(i, UnicodeScalarValue.second);
736740 else {
737741 std::string HexStr = utohexstr(UnicodeScalarValue.first);
738742 if (HexStr.size() <= 2)
637637 const char *Base = S.data();
638638
639639 const char *const Quote = MustQuote == QuotingType::Single ? "'" : "\"";
640 const char QuoteChar = MustQuote == QuotingType::Single ? '\'' : '"';
641
642640 output(Quote); // Starting quote.
643641
644 // When using single-quoted strings, any single quote ' must be doubled to be
645 // escaped.
646 // When using double-quoted strings, print \x + hex for non-printable ASCII
647 // characters, and escape double quotes.
642 // When using double-quoted strings (and only in that case), non-printable characters may be
643 // present, and will be escaped using a variety of unicode-scalar and special short-form
644 // escapes. This is handled in yaml::escape.
645 if (MustQuote == QuotingType::Double) {
646 output(yaml::escape(Base, /* EscapePrintable= */ false));
647 this->outputUpToEndOfLine(Quote);
648 return;
649 }
650
651 // When using single-quoted strings, any single quote ' must be doubled to be escaped.
648652 while (j < End) {
649 if (S[j] == QuoteChar) { // Escape quotes.
650 output(StringRef(&Base[i], j - i)); // "flush".
651 if (MustQuote == QuotingType::Double) { // Print it as \"
652 output(StringLiteral("\\"));
653 output(StringRef(Quote, 1));
654 } else { // Single
655 output(StringLiteral("''")); // Print it as ''
656 }
657 i = j + 1;
658 } else if (MustQuote == QuotingType::Double &&
659 !sys::unicode::isPrintable(S[j]) && (S[j] & 0x80) == 0) {
660 // If we're double quoting non-printable characters, we prefer printing
661 // them as "\x" + their hex representation. Note that special casing is
662 // needed for UTF-8, where a byte may be part of a UTF-8 sequence and
663 // appear as non-printable, in which case we want to print the correct
664 // unicode character and not its hex representation.
665 output(StringRef(&Base[i], j - i)); // "flush"
666 output(StringLiteral("\\x"));
667
668 // Output the byte 0x0F as \x0f.
669 auto FormattedHex = format_hex_no_prefix(S[j], 2);
670 Out << FormattedHex;
671 Column += 4; // one for the '\', one for the 'x', and two for the hex
672
653 if (S[j] == '\'') { // Escape quotes.
654 output(StringRef(&Base[i], j - i)); // "flush".
655 output(StringLiteral("''")); // Print it as ''
673656 i = j + 1;
674657 }
675658 ++j;
24632463 yamlize(xout, Input, true, Ctx);
24642464
24652465 ostr.flush();
2466 EXPECT_EQ(Expected, out);
2466
2467 // Make a separate StringRef so we get nice byte-by-byte output.
2468 llvm::StringRef Got(out);
2469 EXPECT_EQ(Expected, Got);
24672470 }
24682471
24692472 TEST(YAMLIO, TestEscaped) {
24842487 // UTF8 with single quote inside double quote
24852488 TestEscaped("parameter 'параметр' is unused",
24862489 "\"parameter 'параметр' is unused\"");
2487 }
2490
2491 // String with embedded non-printable multibyte UTF-8 sequence (U+200B
2492 // zero-width space). The thing to test here is that we emit a
2493 // unicode-scalar level escape like \uNNNN (at the YAML level), and don't
2494 // just pass the UTF-8 byte sequence through as with quoted printables.
2495 TestEscaped("foo\u200Bbar", "\"foo\\u200Bbar\"");
2496 {
2497 const unsigned char foobar[10] = {'f', 'o', 'o',
2498 0xE2, 0x80, 0x8B, // UTF-8 of U+200B
2499 'b', 'a', 'r',
2500 0x0};
2501 TestEscaped((char const *)foobar, "\"foo\\u200Bbar\"");
2502 }
2503 }