llvm.org GIT mirror llvm / b96942f
YAML: Implement block scalar parsing. This commit implements the parsing of YAML block scalars. Some code existed for it before, but it couldn't parse block scalars. This commit adds a new yaml node type to represent the block scalar values. This commit also deletes the 'spec-09-27' and 'spec-09-28' tests as they are identical to the test file 'spec-09-26'. This commit introduces 3 new utility functions to the YAML scanner class: `skip_s_space`, `advanceWhile` and `consumeLineBreakIfPresent`. Reviewers: Duncan P. N. Exon Smith Differential Revision: http://reviews.llvm.org/D9503 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@237314 91177308-0d34-0410-b5e6-96231b3b80d8 Alex Lorenz 4 years ago
15 changed file(s) with 363 addition(s) and 59 deletion(s). Raw diff Collapse all Expand all
106106 enum NodeKind {
107107 NK_Null,
108108 NK_Scalar,
109 NK_BlockScalar,
109110 NK_KeyValue,
110111 NK_Mapping,
111112 NK_Sequence,
219220 StringRef unescapeDoubleQuoted(StringRef UnquotedValue,
220221 StringRef::size_type Start,
221222 SmallVectorImpl &Storage) const;
223 };
224
225 /// \brief A block scalar node is an opaque datum that can be presented as a
226 /// series of zero or more Unicode scalar values.
227 ///
228 /// Example:
229 /// |
230 /// Hello
231 /// World
232 class BlockScalarNode : public Node {
233 void anchor() override;
234
235 public:
236 BlockScalarNode(std::unique_ptr &D, StringRef Anchor, StringRef Tag,
237 std::string &Value, StringRef RawVal)
238 : Node(NK_BlockScalar, D, Anchor, Tag), Value(std::move(Value)) {
239 SMLoc Start = SMLoc::getFromPointer(RawVal.begin());
240 SMLoc End = SMLoc::getFromPointer(RawVal.end());
241 SourceRange = SMRange(Start, End);
242 }
243
244 /// \brief Gets the value of this node as a StringRef.
245 StringRef getValue() const { return Value; }
246
247 static inline bool classof(const Node *N) {
248 return N->getType() == NK_BlockScalar;
249 }
250
251 private:
252 std::string Value;
222253 };
223254
224255 /// \brief A key and value pair. While not technically a Node under the YAML
100100 void Node::anchor() {}
101101 void NullNode::anchor() {}
102102 void ScalarNode::anchor() {}
103 void BlockScalarNode::anchor() {}
103104 void KeyValueNode::anchor() {}
104105 void MappingNode::anchor() {}
105106 void SequenceNode::anchor() {}
127128 TK_Key,
128129 TK_Value,
129130 TK_Scalar,
131 TK_BlockScalar,
130132 TK_Alias,
131133 TK_Anchor,
132134 TK_Tag
135137 /// A string of length 0 or more whose begin() points to the logical location
136138 /// of the token in the input.
137139 StringRef Range;
140
141 /// The value of a block scalar node.
142 std::string Value;
138143
139144 Token() : Kind(TK_Error) {}
140145 };
347352 /// b-break.
348353 StringRef::iterator skip_b_break(StringRef::iterator Position);
349354
355 /// Skip a single s-space[31] starting at Position.
356 ///
357 /// An s-space is 0x20
358 ///
359 /// @returns The code unit after the s-space, or Position if it's not an
360 /// s-space.
361 StringRef::iterator skip_s_space(StringRef::iterator Position);
362
350363 /// @brief Skip a single s-white[33] starting at Position.
351364 ///
352365 /// A s-white is 0x20 | 0x9
372385 StringRef::iterator skip_while( SkipWhileFunc Func
373386 , StringRef::iterator Position);
374387
388 /// Skip minimal well-formed code unit subsequences until Func returns its
389 /// input, updating Current and Column to match.
390 void advanceWhile(SkipWhileFunc Func);
391
375392 /// @brief Scan ns-uri-char[39]s starting at Cur.
376393 ///
377394 /// This updates Cur and Column while scanning.
392409 /// Pos is whitespace or a new line
393410 bool isBlankOrBreak(StringRef::iterator Position);
394411
412 /// Consume a single b-break[28] if it's present at the current position.
413 ///
414 /// Return false if the code unit at the current position isn't a line break.
415 bool consumeLineBreakIfPresent();
416
395417 /// @brief If IsSimpleKeyAllowed, create and push_back a new SimpleKey.
396418 void saveSimpleKeyCandidate( TokenQueueT::iterator Tok
397419 , unsigned AtColumn
464486
465487 /// @brief Scan a block scalar starting with | or >.
466488 bool scanBlockScalar(bool IsLiteral);
489
490 /// Scan a chomping indicator in a block scalar header.
491 char scanBlockChompingIndicator();
492
493 /// Scan an indentation indicator in a block scalar header.
494 unsigned scanBlockIndentationIndicator();
495
496 /// Scan a block scalar header.
497 ///
498 /// Return false if an error occurred.
499 bool scanBlockScalarHeader(char &ChompingIndicator, unsigned &IndentIndicator,
500 bool &IsDone);
501
502 /// Look for the indentation level of a block scalar.
503 ///
504 /// Return false if an error occurred.
505 bool findBlockScalarIndent(unsigned &BlockIndent, unsigned BlockExitIndent,
506 unsigned &LineBreaks, bool &IsDone);
507
508 /// Scan the indentation of a text line in a block scalar.
509 ///
510 /// Return false if an error occurred.
511 bool scanBlockScalarIndent(unsigned BlockIndent, unsigned BlockExitIndent,
512 bool &IsDone);
467513
468514 /// @brief Scan a tag of the form !stuff.
469515 bool scanTag();
610656 break;
611657 case Token::TK_Scalar:
612658 OS << "Scalar: ";
659 break;
660 case Token::TK_BlockScalar:
661 OS << "Block Scalar: ";
613662 break;
614663 case Token::TK_Alias:
615664 OS << "Alias: ";
815864 return Position;
816865 }
817866
867 StringRef::iterator Scanner::skip_s_space(StringRef::iterator Position) {
868 if (Position == End)
869 return Position;
870 if (*Position == ' ')
871 return Position + 1;
872 return Position;
873 }
818874
819875 StringRef::iterator Scanner::skip_s_white(StringRef::iterator Position) {
820876 if (Position == End)
841897 Position = i;
842898 }
843899 return Position;
900 }
901
902 void Scanner::advanceWhile(SkipWhileFunc Func) {
903 auto Final = skip_while(Func, Current);
904 Column += Final - Current;
905 Current = Final;
844906 }
845907
846908 static bool is_ns_hex_digit(const char C) {
903965 || *Position == '\r' || *Position == '\n')
904966 return true;
905967 return false;
968 }
969
970 bool Scanner::consumeLineBreakIfPresent() {
971 auto Next = skip_b_break(Current);
972 if (Next == Current)
973 return false;
974 Column = 0;
975 ++Line;
976 Current = Next;
977 return true;
906978 }
907979
908980 void Scanner::saveSimpleKeyCandidate( TokenQueueT::iterator Tok
13731445 return true;
13741446 }
13751447
1376 bool Scanner::scanBlockScalar(bool IsLiteral) {
1377 StringRef::iterator Start = Current;
1378 skip(1); // Eat | or >
1379 while(true) {
1380 StringRef::iterator i = skip_nb_char(Current);
1381 if (i == Current) {
1382 if (Column == 0)
1383 break;
1384 i = skip_b_break(Current);
1385 if (i != Current) {
1386 // We got a line break.
1387 Column = 0;
1388 ++Line;
1389 Current = i;
1390 continue;
1391 } else {
1392 // There was an error, which should already have been printed out.
1448 char Scanner::scanBlockChompingIndicator() {
1449 char Indicator = ' ';
1450 if (Current != End && (*Current == '+' || *Current == '-')) {
1451 Indicator = *Current;
1452 skip(1);
1453 }
1454 return Indicator;
1455 }
1456
1457 /// Get the number of line breaks after chomping.
1458 ///
1459 /// Return the number of trailing line breaks to emit, depending on
1460 /// \p ChompingIndicator.
1461 static unsigned getChompedLineBreaks(char ChompingIndicator,
1462 unsigned LineBreaks, StringRef Str) {
1463 if (ChompingIndicator == '-') // Strip all line breaks.
1464 return 0;
1465 if (ChompingIndicator == '+') // Keep all line breaks.
1466 return LineBreaks;
1467 // Clip trailing lines.
1468 return Str.empty() ? 0 : 1;
1469 }
1470
1471 unsigned Scanner::scanBlockIndentationIndicator() {
1472 unsigned Indent = 0;
1473 if (Current != End && (*Current >= '1' && *Current <= '9')) {
1474 Indent = unsigned(*Current - '0');
1475 skip(1);
1476 }
1477 return Indent;
1478 }
1479
1480 bool Scanner::scanBlockScalarHeader(char &ChompingIndicator,
1481 unsigned &IndentIndicator, bool &IsDone) {
1482 auto Start = Current;
1483
1484 ChompingIndicator = scanBlockChompingIndicator();
1485 IndentIndicator = scanBlockIndentationIndicator();
1486 // Check for the chomping indicator once again.
1487 if (ChompingIndicator == ' ')
1488 ChompingIndicator = scanBlockChompingIndicator();
1489 Current = skip_while(&Scanner::skip_s_white, Current);
1490 skipComment();
1491
1492 if (Current == End) { // EOF, we have an empty scalar.
1493 Token T;
1494 T.Kind = Token::TK_BlockScalar;
1495 T.Range = StringRef(Start, Current - Start);
1496 TokenQueue.push_back(T);
1497 IsDone = true;
1498 return true;
1499 }
1500
1501 if (!consumeLineBreakIfPresent()) {
1502 setError("Expected a line break after block scalar header", Current);
1503 return false;
1504 }
1505 return true;
1506 }
1507
1508 bool Scanner::findBlockScalarIndent(unsigned &BlockIndent,
1509 unsigned BlockExitIndent,
1510 unsigned &LineBreaks, bool &IsDone) {
1511 unsigned MaxAllSpaceLineCharacters = 0;
1512 StringRef::iterator LongestAllSpaceLine;
1513
1514 while (true) {
1515 advanceWhile(&Scanner::skip_s_space);
1516 if (skip_nb_char(Current) != Current) {
1517 // This line isn't empty, so try and find the indentation.
1518 if (Column <= BlockExitIndent) { // End of the block literal.
1519 IsDone = true;
1520 return true;
1521 }
1522 // We found the block's indentation.
1523 BlockIndent = Column;
1524 if (MaxAllSpaceLineCharacters > BlockIndent) {
1525 setError(
1526 "Leading all-spaces line must be smaller than the block indent",
1527 LongestAllSpaceLine);
13931528 return false;
13941529 }
1395 }
1396 Current = i;
1530 return true;
1531 }
1532 if (skip_b_break(Current) != Current &&
1533 Column > MaxAllSpaceLineCharacters) {
1534 // Record the longest all-space line in case it's longer than the
1535 // discovered block indent.
1536 MaxAllSpaceLineCharacters = Column;
1537 LongestAllSpaceLine = Current;
1538 }
1539
1540 // Check for EOF.
1541 if (Current == End) {
1542 IsDone = true;
1543 return true;
1544 }
1545
1546 if (!consumeLineBreakIfPresent()) {
1547 IsDone = true;
1548 return true;
1549 }
1550 ++LineBreaks;
1551 }
1552 return true;
1553 }
1554
1555 bool Scanner::scanBlockScalarIndent(unsigned BlockIndent,
1556 unsigned BlockExitIndent, bool &IsDone) {
1557 // Skip the indentation.
1558 while (Column < BlockIndent) {
1559 auto I = skip_s_space(Current);
1560 if (I == Current)
1561 break;
1562 Current = I;
13971563 ++Column;
13981564 }
13991565
1400 if (Start == Current) {
1401 setError("Got empty block scalar", Start);
1566 if (skip_nb_char(Current) == Current)
1567 return true;
1568
1569 if (Column <= BlockExitIndent) { // End of the block literal.
1570 IsDone = true;
1571 return true;
1572 }
1573
1574 if (Column < BlockIndent) {
1575 if (Current != End && *Current == '#') { // Trailing comment.
1576 IsDone = true;
1577 return true;
1578 }
1579 setError("A text line is less indented than the block scalar", Current);
14021580 return false;
14031581 }
1582 return true; // A normal text line.
1583 }
1584
1585 bool Scanner::scanBlockScalar(bool IsLiteral) {
1586 // Eat '|' or '>'
1587 assert(*Current == '|' || *Current == '>');
1588 skip(1);
1589
1590 char ChompingIndicator;
1591 unsigned BlockIndent;
1592 bool IsDone = false;
1593 if (!scanBlockScalarHeader(ChompingIndicator, BlockIndent, IsDone))
1594 return false;
1595 if (IsDone)
1596 return true;
1597
1598 auto Start = Current;
1599 unsigned BlockExitIndent = Indent < 0 ? 0 : (unsigned)Indent;
1600 unsigned LineBreaks = 0;
1601 if (BlockIndent == 0) {
1602 if (!findBlockScalarIndent(BlockIndent, BlockExitIndent, LineBreaks,
1603 IsDone))
1604 return false;
1605 }
1606
1607 // Scan the block's scalars body.
1608 SmallString<256> Str;
1609 while (!IsDone) {
1610 if (!scanBlockScalarIndent(BlockIndent, BlockExitIndent, IsDone))
1611 return false;
1612 if (IsDone)
1613 break;
1614
1615 // Parse the current line.
1616 auto LineStart = Current;
1617 advanceWhile(&Scanner::skip_nb_char);
1618 if (LineStart != Current) {
1619 Str.append(LineBreaks, '\n');
1620 Str.append(StringRef(LineStart, Current - LineStart));
1621 LineBreaks = 0;
1622 }
1623
1624 // Check for EOF.
1625 if (Current == End)
1626 break;
1627
1628 if (!consumeLineBreakIfPresent())
1629 break;
1630 ++LineBreaks;
1631 }
1632
1633 if (Current == End && !LineBreaks)
1634 // Ensure that there is at least one line break before the end of file.
1635 LineBreaks = 1;
1636 Str.append(getChompedLineBreaks(ChompingIndicator, LineBreaks, Str), '\n');
1637
1638 // New lines may start a simple key.
1639 if (!FlowLevel)
1640 IsSimpleKeyAllowed = true;
14041641
14051642 Token T;
1406 T.Kind = Token::TK_Scalar;
1643 T.Kind = Token::TK_BlockScalar;
14071644 T.Range = StringRef(Start, Current - Start);
1645 T.Value = Str.str().str();
14081646 TokenQueue.push_back(T);
14091647 return true;
14101648 }
16061844 case NK_Null:
16071845 return "tag:yaml.org,2002:null";
16081846 case NK_Scalar:
1847 case NK_BlockScalar:
16091848 // TODO: Tag resolution.
16101849 return "tag:yaml.org,2002:str";
16111850 case NK_Mapping:
21372376 , AnchorInfo.Range.substr(1)
21382377 , TagInfo.Range
21392378 , T.Range);
2379 case Token::TK_BlockScalar:
2380 getNext();
2381 return new (NodeAllocator)
2382 BlockScalarNode(stream.CurrentDoc, AnchorInfo.Range.substr(1),
2383 TagInfo.Range, T.Value, T.Range);
21402384 case Token::TK_Key:
21412385 // Don't eat the TK_Key, KeyValueNode expects it.
21422386 return new (NodeAllocator)
None # RUN: yaml-bench -canonical %s 2>&1 | FileCheck %s
0 # RUN: not yaml-bench -canonical %s 2>&1 | FileCheck %s
11 #
2 # FIXME: This test should actually fail. Yaml bench should report an error that
3 # says that the '---' and '...' document start/end markers must not be specified
4 # as the first content line of a non-indented plain scalar.
5 # CHECK: !!str
2 # CHECK: error: Expected a line break after block scalar header
63
74 ---
85 --- ||| : foo
None # RUN: yaml-bench -canonical %s
0 # RUN: yaml-bench -canonical %s | FileCheck %s
1 # CHECK: !!str "literal\n"
2 # CHECK: !!str " folded\n"
3 # CHECK: !!str "keep\n\n"
4 # CHECK: !!str " strip"
15
26 - | # Just the style
37 literal
None # RUN: yaml-bench -canonical %s
0 # RUN: yaml-bench -canonical %s | FileCheck %s
1 # CHECK: !!str "literal\n"
2 # CHECK: !!str "folded\n"
13
24 - |
35 literal
None # RUN: yaml-bench -canonical %s
0 # RUN: yaml-bench -canonical %s | FileCheck %s
1 # CHECK: !!str "detected\n"
2 # CHECK: !!str "\n\n# detected\n"
3 # CHECK: !!str " explicit\n"
4 # CHECK: !!str "\t\ndetected\n"
15
26 - |
37 detected
88 - |1
99 text
1010
11 # CHECK: error
11 # CHECK: 8:2: error: A text line is less indented than the block scalar
None # RUN: yaml-bench -canonical %s
0 # RUN: yaml-bench -canonical %s | FileCheck %s
1 # CHECK: !!str "text"
2 # CHECK: !!str "text\n"
3 # CHECK: !!str "text\n\n"
14
25 strip: |-
3 text
clip: |
4 text…keep: |+
5 text
6 text
7 clip: |
8 text
9 keep: |+
10 text
11
None # RUN: yaml-bench -canonical %s
0 # RUN: yaml-bench -canonical %s | FileCheck %s
1 # CHECK: ? !!str "strip"
2 # CHECK: : !!str ""
3 # CHECK: ? !!str "clip"
4 # CHECK: : !!str ""
5 # CHECK: ? !!str "keep"
6 # CHECK: : !!str "\n"
17
28 strip: >-
39
410 clip: >
511
612 keep: |+
7
None # RUN: yaml-bench -canonical %s
0 # RUN: yaml-bench -canonical %s | FileCheck %s
1 # CHECK: !!str "literal\n\ttext\n"
12
23 | # Simple block scalar
34 literal
None # RUN: yaml-bench -canonical %s
0 # RUN: yaml-bench -canonical %s | FileCheck %s
1 # CHECK: !!str "\n\nliteral\n\ntext\n"
12
23 |
34
+0
-10
test/YAMLParser/spec-09-27.test less more
None # RUN: yaml-bench -canonical %s
1
2 |
3
4
5 literal
6
7 text
8
9 # Comment
+0
-10
test/YAMLParser/spec-09-28.test less more
None # RUN: yaml-bench -canonical %s
1
2 |
3
4
5 literal
6
7 text
8
9 # Comment
127127
128128 TEST(YAMLParser, ParsesArrayOfArrays) {
129129 ExpectParseSuccess("Array of arrays", "[[]]");
130 }
131
132 TEST(YAMLParser, ParsesBlockLiteralScalars) {
133 ExpectParseSuccess("Block literal scalar", "test: |\n Hello\n World\n");
134 ExpectParseSuccess("Block literal scalar EOF", "test: |\n Hello\n World");
135 ExpectParseSuccess("Empty block literal scalar header EOF", "test: | ");
136 ExpectParseSuccess("Empty block literal scalar", "test: |\ntest2: 20");
137 ExpectParseSuccess("Empty block literal scalar 2", "- | \n \n\n \n- 42");
138 ExpectParseSuccess("Block literal scalar in sequence",
139 "- |\n Testing\n Out\n\n- 22");
140 ExpectParseSuccess("Block literal scalar in document",
141 "--- |\n Document\n...");
142 ExpectParseSuccess("Empty non indented lines still count",
143 "- |\n First line\n \n\n Another line\n\n- 2");
144 ExpectParseSuccess("Comment in block literal scalar header",
145 "test: | # Comment \n No Comment\ntest 2: | # Void");
146 ExpectParseSuccess("Chomping indicators in block literal scalar header",
147 "test: |- \n Hello\n\ntest 2: |+ \n\n World\n\n\n");
148 ExpectParseSuccess("Indent indicators in block literal scalar header",
149 "test: |1 \n \n Hello \n World\n");
150 ExpectParseSuccess("Chomping and indent indicators in block literals",
151 "test: |-1\n Hello\ntest 2: |9+\n World");
152 ExpectParseSuccess("Trailing comments in block literals",
153 "test: |\n Content\n # Trailing\n #Comment\ntest 2: 3");
154 ExpectParseError("Invalid block scalar header", "test: | failure");
155 ExpectParseError("Invalid line indentation", "test: |\n First line\n Error");
156 ExpectParseError("Long leading space line", "test: |\n \n Test\n");
130157 }
131158
132159 TEST(YAMLParser, HandlesEndOfFileGracefully) {
9595 SmallString<32> Storage;
9696 StringRef Val = sn->getValue(Storage);
9797 outs() << prettyTag(n) << " \"" << yaml::escape(Val) << "\"";
98 } else if (yaml::BlockScalarNode *BN = dyn_cast(n)) {
99 outs() << prettyTag(n) << " \"" << yaml::escape(BN->getValue()) << "\"";
98100 } else if (yaml::SequenceNode *sn = dyn_cast(n)) {
99101 outs() << prettyTag(n) << " [\n";
100102 ++Indent;