llvm.org GIT mirror llvm / 18ff709
[Support] Harden JSON against invalid UTF-8. Parsing invalid UTF-8 input is now a parse error. Creating JSON values from invalid UTF-8 now triggers an assertion, and (in no-assert builds) substitutes the unicode replacement character. Strings retrieved from json::Value are always valid UTF-8. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@336657 91177308-0d34-0410-b5e6-96231b3b80d8 Sam McCall 1 year, 10 months ago
4 changed file(s) with 147 addition(s) and 18 deletion(s). Raw diff Collapse all Expand all
8686 /// Checks whether character \p C is either a decimal digit or an uppercase or
8787 /// lowercase letter as classified by "C" locale.
8888 inline bool isAlnum(char C) { return isAlpha(C) || isDigit(C); }
89
90 /// Checks whether character \p C is valid ASCII (high bit is zero).
91 inline bool isASCII(char C) { return static_cast(C) <= 127; }
92
93 /// Checks whether all characters in S are ASCII.
94 inline bool isASCII(llvm::StringRef S) {
95 for (char C : S)
96 if (LLVM_UNLIKELY(!isASCII(C)))
97 return false;
98 return true;
99 }
89100
90101 /// Returns the corresponding lowercase character if \p x is uppercase.
91102 inline char toLower(char x) {
5353
5454 namespace llvm {
5555 namespace json {
56
57 // === String encodings ===
58 //
59 // JSON strings are character sequences (not byte sequences like std::string).
60 // We need to know the encoding, and for simplicity only support UTF-8.
61 //
62 // - When parsing, invalid UTF-8 is a syntax error like any other
63 //
64 // - When creating Values from strings, callers must ensure they are UTF-8.
65 // with asserts on, invalid UTF-8 will crash the program
66 // with asserts off, we'll substitute the replacement character (U+FFFD)
67 // Callers can use json::isUTF8() and json::fixUTF8() for validation.
68 //
69 // - When retrieving strings from Values (e.g. asString()), the result will
70 // always be valid UTF-8.
71
72 /// Returns true if \p S is valid UTF-8, which is required for use as JSON.
73 /// If it returns false and \p ErrOffset is non-null, \p ErrOffset is set to a byte offset near the first error.
74 bool isUTF8(llvm::StringRef S, size_t *ErrOffset = nullptr);
75 /// Replaces invalid UTF-8 sequences in \p S with the replacement character
76 /// (U+FFFD). The returned string is valid UTF-8.
77 /// This is much slower than isUTF8, so test that first.
78 std::string fixUTF8(llvm::StringRef S);
79
5680 class Array;
5781 class ObjectKey;
5882 class Value;
272296 Value(json::Object &&Properties) : Type(T_Object) {
273297 create(std::move(Properties));
274298 }
275 // Strings: types with value semantics.
276 Value(std::string &&V) : Type(T_String) { create(std::move(V)); }
277 Value(const std::string &V) : Type(T_String) { create(V); }
278 Value(const llvm::SmallVectorImpl &V) : Type(T_String) {
279 create(V.begin(), V.end());
280 }
299 // Strings: types with value semantics. Must be valid UTF-8.
300 Value(std::string V) : Type(T_String) {
301 if (LLVM_UNLIKELY(!isUTF8(V))) {
302 assert(false && "Invalid UTF-8 in value used as JSON");
303 V = fixUTF8(std::move(V));
304 }
305 create(std::move(V));
306 }
307 Value(const llvm::SmallVectorImpl &V)
308 : Value(std::string(V.begin(), V.end())){};
281309 Value(const llvm::formatv_object_base &V) : Value(V.str()){};
282 // Strings: types with reference semantics.
283 Value(llvm::StringRef V) : Type(T_StringRef) { create(V); }
284 Value(const char *V) : Type(T_StringRef) { create(V); }
310 // Strings: types with reference semantics. Must be valid UTF-8.
311 Value(StringRef V) : Type(T_StringRef) {
312 create(V);
313 if (LLVM_UNLIKELY(!isUTF8(V))) {
314 assert(false && "Invalid UTF-8 in value used as JSON");
315 *this = Value(fixUTF8(V));
316 }
317 }
318 Value(const char *V) : Value(StringRef(V)) {}
285319 Value(std::nullptr_t) : Type(T_Null) {}
286320 // Boolean (disallow implicit conversions).
287321 // (The last template parameter is a dummy to keep templates distinct.)
448482 /// ObjectKey is used to capture keys in Object. Like Value but:
449483 /// - only strings are allowed
450484 /// - it's optimized for the string literal case (Owned == nullptr)
485 /// Like Value, strings must be UTF-8. See isUTF8 documentation for details.
451486 class ObjectKey {
452487 public:
453 ObjectKey(const char *S) : Data(S) {}
454 ObjectKey(llvm::StringRef S) : Data(S) {}
455 ObjectKey(std::string &&V)
456 : Owned(new std::string(std::move(V))), Data(*Owned) {}
457 ObjectKey(const std::string &V) : Owned(new std::string(V)), Data(*Owned) {}
488 ObjectKey(const char *S) : ObjectKey(StringRef(S)) {}
489 ObjectKey(std::string S) : Owned(new std::string(std::move(S))) {
490 if (LLVM_UNLIKELY(!isUTF8(*Owned))) {
491 assert(false && "Invalid UTF-8 in value used as JSON");
492 *Owned = fixUTF8(std::move(*Owned));
493 }
494 Data = *Owned;
495 }
496 ObjectKey(llvm::StringRef S) : Data(S) {
497 if (LLVM_UNLIKELY(!isUTF8(Data))) {
498 assert(false && "Invalid UTF-8 in value used as JSON");
499 *this = ObjectKey(fixUTF8(S));
500 }
501 }
458502 ObjectKey(const llvm::SmallVectorImpl &V)
459503 : ObjectKey(std::string(V.begin(), V.end())) {}
460504 ObjectKey(const llvm::formatv_object_base &V) : ObjectKey(V.str()) {}
77 //===---------------------------------------------------------------------===//
88
99 #include "llvm/Support/JSON.h"
10 #include "llvm/Support/ConvertUTF.h"
1011 #include "llvm/Support/Format.h"
1112 #include
1213
197198 public:
198199 Parser(StringRef JSON)
199200 : Start(JSON.begin()), P(JSON.begin()), End(JSON.end()) {}
201
202 bool checkUTF8() {
203 size_t ErrOffset;
204 if (isUTF8(StringRef(Start, End - Start), &ErrOffset))
205 return true;
206 P = Start + ErrOffset; // For line/column calculation.
207 return parseError("Invalid UTF-8 sequence");
208 }
200209
201210 bool parseValue(Value &Out);
202211
457466
458467 // Case 3: it's a leading surrogate. We expect a trailing one next.
459468 // Case 3a: there's no trailing \u escape. Don't advance in the stream.
460 if (!LLVM_LIKELY(P + 2 <= End && *P == '\\' && *(P + 1) == 'u')) {
469 if (LLVM_UNLIKELY(P + 2 > End || *P != '\\' || *(P + 1) != 'u')) {
461470 Invalid(); // Leading surrogate was unpaired.
462471 return true;
463472 }
495504 Expected parse(StringRef JSON) {
496505 Parser P(JSON);
497506 Value E = nullptr;
498 if (P.parseValue(E))
499 if (P.assertEnd())
500 return std::move(E);
507 if (P.checkUTF8())
508 if (P.parseValue(E))
509 if (P.assertEnd())
510 return std::move(E);
501511 return P.takeError();
502512 }
503513 char ParseError::ID = 0;
511521 return L->first < R->first;
512522 });
513523 return Elements;
524 }
525
526 bool isUTF8(llvm::StringRef S, size_t *ErrOffset) {
527 // Fast-path for ASCII, which is valid UTF-8.
528 if (LLVM_LIKELY(isASCII(S)))
529 return true;
530
531 const UTF8 *Data = reinterpret_cast(S.data()), *Rest = Data;
532 if (LLVM_LIKELY(isLegalUTF8String(&Rest, Data + S.size())))
533 return true;
534
535 if (ErrOffset)
536 *ErrOffset = Rest - Data;
537 return false;
538 }
539
540 std::string fixUTF8(llvm::StringRef S) {
541 // This isn't particularly efficient, but is only for error-recovery.
542 std::vector Codepoints(S.size()); // 1 codepoint per byte suffices.
543 const UTF8 *In8 = reinterpret_cast(S.data());
544 UTF32 *Out32 = Codepoints.data();
545 ConvertUTF8toUTF32(&In8, In8 + S.size(), &Out32, Out32 + Codepoints.size(),
546 lenientConversion);
547 Codepoints.resize(Out32 - Codepoints.data());
548 std::string Res(4 * Codepoints.size(), 0); // 4 bytes per codepoint suffice
549 const UTF32 *In32 = Codepoints.data();
550 UTF8 *Out8 = reinterpret_cast(&Res[0]);
551 ConvertUTF32toUTF8(&In32, In32 + Codepoints.size(), &Out8, Out8 + Res.size(),
552 strictConversion);
553 Res.resize(reinterpret_cast(Out8) - Res.data());
554 return Res;
514555 }
515556
516557 } // namespace json
2626 EXPECT_EQ(R"("foo")", s("foo"));
2727 EXPECT_EQ("[1,2,3]", s({1, 2, 3}));
2828 EXPECT_EQ(R"({"x":10,"y":20})", s(Object{{"x", 10}, {"y", 20}}));
29
30 #ifdef NDEBUG
31 EXPECT_EQ(R"("��")", s("\xC0\x80"));
32 EXPECT_EQ(R"({"��":0})", s(Object{{"\xC0\x80", 0}}));
33 #else
34 EXPECT_DEATH(s("\xC0\x80"), "Invalid UTF-8");
35 EXPECT_DEATH(s(Object{{"\xC0\x80", 0}}), "Invalid UTF-8");
36 #endif
2937 }
3038
3139 TEST(JSONTest, Constructors) {
180188 "valid": 1,
181189 invalid: 2
182190 })");
191 ExpectErr("Invalid UTF-8 sequence", "\"\xC0\x80\""); // WTF-8 null
192 }
193
194 // Direct tests of isUTF8 and fixUTF8. Internal uses are also tested elsewhere.
195 TEST(JSONTest, UTF8) {
196 for (const char *Valid : {
197 "this is ASCII text",
198 "thïs tëxt häs BMP chäräctërs",
199 "𐌶𐌰L𐌾𐍈 C𐍈𐌼𐌴𐍃",
200 }) {
201 EXPECT_TRUE(isUTF8(Valid)) << Valid;
202 EXPECT_EQ(fixUTF8(Valid), Valid);
203 }
204 for (auto Invalid : std::vector>{
205 {"lone trailing \x81\x82 bytes", "lone trailing �� bytes"},
206 {"missing trailing \xD0 bytes", "missing trailing � bytes"},
207 {"truncated character \xD0", "truncated character �"},
208 {"not \xC1\x80 the \xE0\x9f\xBF shortest \xF0\x83\x83\x83 encoding",
209 "not �� the ��� shortest ���� encoding"},
210 {"too \xF9\x80\x80\x80\x80 long", "too ����� long"},
211 {"surrogate \xED\xA0\x80 invalid \xF4\x90\x80\x80",
212 "surrogate ��� invalid ����"}}) {
213 EXPECT_FALSE(isUTF8(Invalid.first)) << Invalid.first;
214 EXPECT_EQ(fixUTF8(Invalid.first), Invalid.second);
215 }
183216 }
184217
185218 TEST(JSONTest, Inspection) {