llvm.org GIT mirror llvm / 6c9cbed
Add .rc scripts tokenizer. This extends the shell of llvm-rc tool with the ability of tokenization of the input files. Currently, ASCII and ASCII-compatible UTF-8 files are supported. Thanks to Nico Weber (thakis) for his original work in this area. Differential Revision: https://reviews.llvm.org/D35957 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@310621 91177308-0d34-0410-b5e6-96231b3b80d8 Marek Sokolowski 2 years ago
7 changed file(s) with 507 addition(s) and 2 deletion(s). Raw diff Collapse all Expand all
0 1 + 2 - 3214L & 0x120894 032173 2|&~+(-7){0xabcdef 0xABCDEFl} Begin End
1 He11o LLVM
2
3 "RC string test.",L"Another RC string test.'&{",42,100
4
5
6
7 ":))"
0 ; RUN: llvm-rc /V %p/Inputs/tokens.rc | FileCheck %s
1
2 ; CHECK: Int: 1; int value = 1
3 ; CHECK-NEXT: Plus: +
4 ; CHECK-NEXT: Int: 2; int value = 2
5 ; CHECK-NEXT: Minus: -
6 ; CHECK-NEXT: Int: 3214L; int value = 3214
7 ; CHECK-NEXT: Amp: &
8 ; CHECK-NEXT: Int: 0x120894; int value = 1181844
9 ; CHECK-NEXT: Int: 032173; int value = 13435
10 ; CHECK-NEXT: Int: 2; int value = 2
11 ; CHECK-NEXT: Pipe: |
12 ; CHECK-NEXT: Amp: &
13 ; CHECK-NEXT: Tilde: ~
14 ; CHECK-NEXT: Plus: +
15 ; CHECK-NEXT: LeftParen: (
16 ; CHECK-NEXT: Minus: -
17 ; CHECK-NEXT: Int: 7; int value = 7
18 ; CHECK-NEXT: RightParen: )
19 ; CHECK-NEXT: BlockBegin: {
20 ; CHECK-NEXT: Int: 0xabcdef; int value = 11259375
21 ; CHECK-NEXT: Int: 0xABCDEFl; int value = 11259375
22 ; CHECK-NEXT: BlockEnd: }
23 ; CHECK-NEXT: BlockBegin: Begin
24 ; CHECK-NEXT: BlockEnd: End
25 ; CHECK-NEXT: Identifier: He11o
26 ; CHECK-NEXT: Identifier: LLVM
27 ; CHECK-NEXT: String: "RC string test."
28 ; CHECK-NEXT: Comma: ,
29 ; CHECK-NEXT: String: L"Another RC string test.'&{"
30 ; CHECK-NEXT: Comma: ,
31 ; CHECK-NEXT: Int: 42; int value = 42
32 ; CHECK-NEXT: Comma: ,
33 ; CHECK-NEXT: Int: 100; int value = 100
34 ; CHECK-NEXT: String: ":))"
99
1010 add_llvm_tool(llvm-rc
1111 llvm-rc.cpp
12 ResourceScriptToken.cpp
1213 )
0 //===-- ResourceScriptToken.cpp ---------------------------------*- C++-*-===//
1 //
2 // The LLVM Compiler Infrastructure
3 //
4 // This file is distributed under the University of Illinois Open Source
5 // License. See LICENSE.TXT for details.
6 //
7 //===---------------------------------------------------------------------===//
8 //
9 // This file implements an interface defined in ResourceScriptToken.h.
10 // In particular, it defines an .rc script tokenizer.
11 //
12 //===---------------------------------------------------------------------===//
13
#include "ResourceScriptToken.h"

#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"

#include <cassert>
#include <cctype>
#include <cstdint>
#include <string>
#include <vector>
22
23 using namespace llvm;
24
25 using Kind = RCToken::Kind;
26
27 // Checks if Representation is a correct description of an RC integer.
28 // It should be a 32-bit unsigned integer, either decimal, octal (0[0-7]+),
29 // or hexadecimal (0x[0-9a-f]+). It might be followed by a single 'L'
30 // character (that is the difference between our representation and
31 // StringRef's one). If Representation is correct, 'true' is returned and
32 // the return value is put back in Num.
33 static bool rcGetAsInteger(StringRef Representation, uint32_t &Num) {
34 size_t Length = Representation.size();
35 if (Length == 0)
36 return false;
37 // Strip the last 'L' if unnecessary.
38 if (std::toupper(Representation.back()) == 'L')
39 Representation = Representation.drop_back(1);
40
41 return !Representation.getAsInteger(0, Num);
42 }
43
44 RCToken::RCToken(RCToken::Kind RCTokenKind, StringRef Value)
45 : TokenKind(RCTokenKind), TokenValue(Value) {}
46
47 uint32_t RCToken::intValue() const {
48 assert(TokenKind == Kind::Int);
49 // We assume that the token already is a correct integer (checked by
50 // rcGetAsInteger).
51 uint32_t Result;
52 bool IsSuccess = rcGetAsInteger(TokenValue, Result);
53 assert(IsSuccess);
54 (void)IsSuccess; // Silence the compiler warning when -DNDEBUG flag is on.
55 return Result;
56 }
57
58 StringRef RCToken::value() const { return TokenValue; }
59
60 Kind RCToken::kind() const { return TokenKind; }
61
62 static Error getStringError(const Twine &message) {
63 return make_error("Error parsing file: " + message,
64 inconvertibleErrorCode());
65 }
66
67 namespace {
68
69 class Tokenizer {
70 public:
71 Tokenizer(StringRef Input) : Data(Input), DataLength(Input.size()) {}
72
73 Expected> run();
74
75 private:
76 // All 'advancing' methods return boolean values; if they're equal to false,
77 // the stream has ended or failed.
78 bool advance(size_t Amount = 1);
79 bool skipWhitespaces();
80
81 // Consumes a token. If any problem occurred, a non-empty Error is returned.
82 Error consumeToken(const Kind TokenKind);
83
84 // Check if tokenizer is about to read FollowingChars.
85 bool willNowRead(StringRef FollowingChars) const;
86
87 // Check if tokenizer can start reading an identifier at current position.
88 // The original tool did non specify the rules to determine what is a correct
89 // identifier. We assume they should follow the C convention:
90 // [a-zA-z_][a-zA-Z0-9_]*.
91 bool canStartIdentifier() const;
92 // Check if tokenizer can continue reading an identifier.
93 bool canContinueIdentifier() const;
94
95 // Check if tokenizer can start reading an integer.
96 // A correct integer always starts with a 0-9 digit,
97 // can contain characters 0-9A-Fa-f (digits),
98 // Ll (marking the integer is 32-bit), Xx (marking the representation
99 // is hexadecimal). As some kind of separator should come after the
100 // integer, we can consume the integer until a non-alphanumeric
101 // character.
102 bool canStartInt() const;
103 bool canContinueInt() const;
104
105 bool canStartString() const;
106
107 bool streamEof() const;
108
109 // Classify the token that is about to be read from the current position.
110 Kind classifyCurrentToken() const;
111
112 // Process the Kind::Identifier token - check if it is
113 // an identifier describing a block start or end.
114 void processIdentifier(RCToken &token) const;
115
116 StringRef Data;
117 size_t DataLength, Pos;
118 };
119
120 Expected> Tokenizer::run() {
121 Pos = 0;
122 std::vector Result;
123
124 // Consume an optional UTF-8 Byte Order Mark.
125 if (willNowRead("\xef\xbb\xbf"))
126 advance(3);
127
128 while (!streamEof()) {
129 if (!skipWhitespaces())
130 break;
131
132 Kind TokenKind = classifyCurrentToken();
133 if (TokenKind == Kind::Invalid)
134 return getStringError("Invalid token found at position " + Twine(Pos));
135
136 const size_t TokenStart = Pos;
137 if (Error TokenError = consumeToken(TokenKind))
138 return std::move(TokenError);
139
140 RCToken Token(TokenKind, Data.take_front(Pos).drop_front(TokenStart));
141 if (TokenKind == Kind::Identifier) {
142 processIdentifier(Token);
143 } else if (TokenKind == Kind::Int) {
144 uint32_t TokenInt;
145 if (!rcGetAsInteger(Token.value(), TokenInt)) {
146 // The integer has incorrect format or cannot be represented in
147 // a 32-bit integer.
148 return getStringError("Integer invalid or too large: " +
149 Token.value().str());
150 }
151 }
152
153 Result.push_back(Token);
154 }
155
156 return Result;
157 }
158
159 bool Tokenizer::advance(size_t Amount) {
160 Pos += Amount;
161 return !streamEof();
162 }
163
164 bool Tokenizer::skipWhitespaces() {
165 while (!streamEof() && std::isspace(Data[Pos]))
166 advance();
167 return !streamEof();
168 }
169
170 Error Tokenizer::consumeToken(const Kind TokenKind) {
171 switch (TokenKind) {
172 // One-character token consumption.
173 #define TOKEN(Name)
174 #define SHORT_TOKEN(Name, Ch) case Kind::Name:
175 #include "ResourceScriptTokenList.h"
176 #undef TOKEN
177 #undef SHORT_TOKEN
178 advance();
179 return Error::success();
180
181 case Kind::Identifier:
182 while (!streamEof() && canContinueIdentifier())
183 advance();
184 return Error::success();
185
186 case Kind::Int:
187 while (!streamEof() && canContinueInt())
188 advance();
189 return Error::success();
190
191 case Kind::String:
192 // Consume the preceding 'L', if there is any.
193 if (std::toupper(Data[Pos]) == 'L')
194 advance();
195 // Consume the double-quote.
196 advance();
197
198 // Consume the characters until the end of the file, line or string.
199 while (true) {
200 if (streamEof()) {
201 return getStringError("Unterminated string literal.");
202 } else if (Data[Pos] == '"') {
203 // Consume the ending double-quote.
204 advance();
205 return Error::success();
206 } else if (Data[Pos] == '\n') {
207 return getStringError("String literal not terminated in the line.");
208 }
209
210 advance();
211 }
212
213 case Kind::Invalid:
214 assert(false && "Cannot consume an invalid token.");
215 }
216 }
217
218 bool Tokenizer::willNowRead(StringRef FollowingChars) const {
219 return Data.drop_front(Pos).startswith(FollowingChars);
220 }
221
222 bool Tokenizer::canStartIdentifier() const {
223 assert(!streamEof());
224
225 const char CurChar = Data[Pos];
226 return std::isalpha(CurChar) || CurChar == '_';
227 }
228
229 bool Tokenizer::canContinueIdentifier() const {
230 assert(!streamEof());
231 const char CurChar = Data[Pos];
232 return std::isalnum(CurChar) || CurChar == '_';
233 }
234
235 bool Tokenizer::canStartInt() const {
236 assert(!streamEof());
237 return std::isdigit(Data[Pos]);
238 }
239
240 bool Tokenizer::canContinueInt() const {
241 assert(!streamEof());
242 return std::isalnum(Data[Pos]);
243 }
244
245 bool Tokenizer::canStartString() const {
246 return willNowRead("\"") || willNowRead("L\"") || willNowRead("l\"");
247 }
248
249 bool Tokenizer::streamEof() const { return Pos == DataLength; }
250
251 Kind Tokenizer::classifyCurrentToken() const {
252 if (canStartInt())
253 return Kind::Int;
254 if (canStartString())
255 return Kind::String;
256 // BEGIN and END are at this point of lexing recognized as identifiers.
257 if (canStartIdentifier())
258 return Kind::Identifier;
259
260 const char CurChar = Data[Pos];
261
262 switch (CurChar) {
263 // One-character token classification.
264 #define TOKEN(Name)
265 #define SHORT_TOKEN(Name, Ch) \
266 case Ch: \
267 return Kind::Name;
268 #include "ResourceScriptTokenList.h"
269 #undef TOKEN
270 #undef SHORT_TOKEN
271
272 default:
273 return Kind::Invalid;
274 }
275 }
276
277 void Tokenizer::processIdentifier(RCToken &Token) const {
278 assert(Token.kind() == Kind::Identifier);
279 StringRef Name = Token.value();
280
281 if (Name.equals_lower("begin"))
282 Token = RCToken(Kind::BlockBegin, Name);
283 else if (Name.equals_lower("end"))
284 Token = RCToken(Kind::BlockEnd, Name);
285 }
286
287 } // anonymous namespace
288
289 namespace llvm {
290
291 Expected> tokenizeRC(StringRef Input) {
292 return Tokenizer(Input).run();
293 }
294
295 } // namespace llvm
0 //===-- ResourceScriptToken.h -----------------------------------*- C++-*-===//
1 //
2 // The LLVM Compiler Infrastructure
3 //
4 // This file is distributed under the University of Illinois Open Source
5 // License. See LICENSE.TXT for details.
6 //
7 //===---------------------------------------------------------------------===//
8 //
9 // This declares the .rc script tokens and defines an interface for tokenizing
10 // the input data. The list of available tokens is located at
11 // ResourceScriptTokenList.h.
12 //
13 // Note that the tokenizer does not support comments or preprocessor
14 // directives. The preprocessor should do its work on the .rc file before
15 // running llvm-rc.
16 //
17 // As for now, it is possible to parse ASCII files only (the behavior on
18 // UTF files might be undefined). However, it already consumes UTF-8 BOM, if
19 // there is any. Thus, ASCII-compatible UTF-8 files are tokenized correctly.
20 //
21 // Ref: msdn.microsoft.com/en-us/library/windows/desktop/aa380599(v=vs.85).aspx
22 //
23 //===---------------------------------------------------------------------===//
24
25 #ifndef LLVM_TOOLS_LLVMRC_RESOURCESCRIPTTOKEN_H
26 #define LLVM_TOOLS_LLVMRC_RESOURCESCRIPTTOKEN_H
27
28 #include "llvm/ADT/StringRef.h"
29 #include "llvm/Support/Error.h"
30
31 #include <cstdint>
32 #include <map>
33 #include <string>
34 #include <vector>
35
36 namespace llvm {
37
38 // A definition of a single resource script token. Each token has its kind
39 // (declared in ResourceScriptTokenList) and holds a value - a reference
40 // representation of the token.
41 // RCToken does not claim ownership on its value. A memory buffer containing
42 // the token value should be stored in a safe place and cannot be freed
43 // nor reallocated.
44 class RCToken {
45 public:
46 enum class Kind {
47 #define TOKEN(Name) Name,
48 #define SHORT_TOKEN(Name, Ch) Name,
49 #include "ResourceScriptTokenList.h"
50 #undef TOKEN
51 #undef SHORT_TOKEN
52 };
53
54 RCToken(RCToken::Kind RCTokenKind, StringRef Value);
55
56 // Get an integer value of the integer token.
57 uint32_t intValue() const;
58
59 StringRef value() const;
60 Kind kind() const;
61
62 private:
63 Kind TokenKind;
64 StringRef TokenValue;
65 };
66
67 // Tokenize Input.
68 // In case no error occured, the return value contains
69 // tokens in order they were in the input file.
70 // In case of any error, the return value contains
71 // a textual representation of error.
72 //
73 // Tokens returned by this function hold only references to the parts
74 // of the Input. Memory buffer containing Input cannot be freed,
75 // modified or reallocated.
76 Expected> tokenizeRC(StringRef Input);
77
78 } // namespace llvm
79
80 #endif
0 //===-- ResourceScriptTokenList.h -------------------------------*- C++-*-===//
1 //
2 // The LLVM Compiler Infrastructure
3 //
4 // This file is distributed under the University of Illinois Open Source
5 // License. See LICENSE.TXT for details.
6 //
7 //===---------------------------------------------------------------------===//
8 //
9 // This is a part of llvm-rc tokenizer. It lists all the possible tokens
10 // that might occur in a correct .rc script.
11 //
12 //===---------------------------------------------------------------------===//
13
14
15 // Long tokens. They might consist of more than one character.
16 TOKEN(Invalid) // Invalid token. Should not occur in a valid script.
17 TOKEN(Int) // Integer (decimal, octal or hexadecimal).
18 TOKEN(String) // String value.
19 TOKEN(Identifier) // Script identifier (resource name or type).
20
21 // Short tokens. They usually consist of exactly one character.
22 // The definitions are of the form SHORT_TOKEN(TokenName, TokenChar).
23 // TokenChar is the one-character token representation occuring in the correct
24 // .rc scripts.
25 SHORT_TOKEN(BlockBegin, '{') // Start of the script block; can also be BEGIN.
26 SHORT_TOKEN(BlockEnd, '}') // End of the block; can also be END.
27 SHORT_TOKEN(Comma, ',') // Comma - resource arguments separator.
28 SHORT_TOKEN(Plus, '+') // Addition operator.
29 SHORT_TOKEN(Minus, '-') // Subtraction operator.
30 SHORT_TOKEN(Pipe, '|') // Bitwise-OR operator.
31 SHORT_TOKEN(Amp, '&') // Bitwise-AND operator.
32 SHORT_TOKEN(Tilde, '~') // Bitwise-NOT operator.
33 SHORT_TOKEN(LeftParen, '(') // Left parenthesis in the script expressions.
34 SHORT_TOKEN(RightParen, ')') // Right parenthesis.
None //===- llvm-rc.cpp - Compile .rc scripts into .res -------------*- C++ -*--===//
0 //===-- llvm-rc.cpp - Compile .rc scripts into .res -------------*- C++ -*-===//
11 //
22 // The LLVM Compiler Infrastructure
33 //
1010 // platform-independent port of Microsoft's rc.exe tool.
1111 //
1212 //===----------------------------------------------------------------------===//
13
14 #include "ResourceScriptToken.h"
1315
1416 #include "llvm/Option/Arg.h"
1517 #include "llvm/Option/ArgList.h"
5961 };
6062
6163 static ExitOnError ExitOnErr;
64
65 LLVM_ATTRIBUTE_NORETURN static void fatalError(Twine Message) {
66 errs() << Message << "\n";
67 exit(1);
68 }
69
6270 } // anonymous namespace
6371
6472 int main(int argc_, const char *argv_[]) {
8088 opt::InputArgList InputArgs = T.ParseArgs(ArgsArr, MAI, MAC);
8189
8290 // The tool prints nothing when invoked with no command-line arguments.
83 if (InputArgs.hasArg(OPT_HELP))
91 if (InputArgs.hasArg(OPT_HELP)) {
8492 T.PrintHelp(outs(), "rc", "Resource Converter", false);
93 return 0;
94 }
95
96 const bool BeVerbose = InputArgs.hasArg(OPT_VERBOSE);
97
98 std::vector<std::string> InArgsInfo = InputArgs.getAllArgValues(OPT_INPUT);
99 if (InArgsInfo.size() != 1) {
100 fatalError("Exactly one input file should be provided.");
101 }
102
103 // Read and tokenize the input file.
104 const Twine &Filename = InArgsInfo[0];
105 ErrorOr<std::unique_ptr<MemoryBuffer>> File = MemoryBuffer::getFile(Filename);
106 if (!File) {
107 fatalError("Error opening file '" + Filename +
108 "': " + File.getError().message());
109 }
110
111 std::unique_ptr<MemoryBuffer> FileContents = std::move(*File);
112 StringRef Contents = FileContents->getBuffer();
113
114 std::vector<RCToken> Tokens = ExitOnErr(tokenizeRC(Contents));
115
116 if (BeVerbose) {
117 const Twine TokenNames[] = {
118 #define TOKEN(Name) #Name,
119 #define SHORT_TOKEN(Name, Ch) #Name,
120 #include "ResourceScriptTokenList.h"
121 #undef TOKEN
122 #undef SHORT_TOKEN
123 };
124
125 for (const RCToken &Token : Tokens) {
126 outs() << TokenNames[static_cast<int>(Token.kind())] << ": "
127 << Token.value();
128 if (Token.kind() == RCToken::Kind::Int)
129 outs() << "; int value = " << Token.intValue();
130
131 outs() << "\n";
132 }
133 }
85134
86135 return 0;
87136 }