llvm.org GIT mirror llvm / 431b0a7
[Support] Beef up and expose the response file parsing in llvm::cl The plan is to use it for clang and lld. Major behavior changes: - We can now parse UTF-16 files that have a byte order mark. - PR16209: Don't drop backslashes on the floor if they don't escape anything. The actual parsing loop was based on code from Clang's driver.cpp, although it's been rewritten to track its state with control flow rather than state variables. Reviewers: hans Differential Revision: http://llvm-reviews.chandlerc.com/D1170 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@186587 91177308-0d34-0410-b5e6-96231b3b80d8 Reid Kleckner 6 years ago
4 changed file(s) with 227 addition(s) and 67 deletion(s). Raw diff Collapse all Expand all
17441744 /// llvm::cl::ParseCommandLineOptions().
17451745 void getRegisteredOptions(StringMap &Map);
17461746
1747 //===----------------------------------------------------------------------===//
1748 // Standalone command line processing utilities.
1749 //
1750
1751 /// \brief Saves strings in the inheritor's stable storage and returns a stable
1752 /// raw character pointer.
/// \brief Interface through which the tokenizers hand parsed strings back to
/// their caller for safekeeping.
///
/// Implementations copy the given string into storage they control and return
/// a stable raw character pointer to that copy.
class StringSaver {
public:
  /// Stores a copy of the NUL-terminated string \p Str in the inheritor's
  /// stable storage and returns a pointer to the stored copy.
  virtual const char *SaveString(const char *Str) = 0;

  /// Virtual destructor; present to pacify -Wnon-virtual-dtor.
  virtual ~StringSaver() {}
};
1758
1759 /// \brief Tokenizes a command line that can contain escapes and quotes.
1760 ///
1761 /// The quoting rules match those used by GCC and other tools that use
1762 /// libiberty's buildargv() or expandargv() utilities, and do not match bash.
1763 /// They differ from buildargv() on treatment of backslashes that do not escape
1764 /// a special character to make it possible to accept most Windows file paths.
1765 ///
1766 /// \param [in] Source The string to be split on whitespace with quotes.
1767 /// \param [in] Saver Delegates back to the caller for saving parsed strings.
1768 /// \param [out] NewArgv All parsed strings are appended to NewArgv.
1769 void TokenizeGNUCommandLine(StringRef Source, StringSaver &Saver,
1770 SmallVectorImpl &NewArgv);
1771
1772 /// \brief Tokenizes a Windows command line which may contain quotes and escaped
1773 /// quotes.
1774 ///
1775 /// See MSDN docs for CommandLineToArgvW for information on the quoting rules.
1776 /// http://msdn.microsoft.com/en-us/library/windows/desktop/17w5ykft(v=vs.85).aspx
1777 ///
1778 /// \param [in] Source The string to be split on whitespace with quotes.
1779 /// \param [in] Saver Delegates back to the caller for saving parsed strings.
1780 /// \param [out] NewArgv All parsed strings are appended to NewArgv.
1781 void TokenizeWindowsCommandLine(StringRef Source, StringSaver &Saver,
1782 SmallVectorImpl &NewArgv);
1783
1784 /// \brief String tokenization function type. Should be compatible with either
1785 /// Windows or Unix command line tokenizers.
1786 typedef void (*TokenizerCallback)(StringRef Source, StringSaver &Saver,
1787 SmallVectorImpl &NewArgv);
1788
1789 /// \brief Expand response files on a command line recursively using the given
1790 /// StringSaver and tokenization strategy. Argv should contain the command line
1791 /// before expansion and will be modified in place.
1792 ///
1793 /// \param [in] Saver Delegates back to the caller for saving parsed strings.
1794 /// \param [in] Tokenizer Tokenization strategy. Typically Unix or Windows.
1795 /// \param [in,out] Argv Command line into which to expand response files.
1796 /// \return true if all @files were expanded successfully or there were none.
1797 bool ExpandResponseFiles(StringSaver &Saver, TokenizerCallback Tokenizer,
1798 SmallVectorImpl &Argv);
1799
17471800 } // End namespace cl
17481801
17491802 } // End namespace llvm
1616 //===----------------------------------------------------------------------===//
1717
1818 #include "llvm/Support/CommandLine.h"
19 #include "llvm/ADT/ArrayRef.h"
1920 #include "llvm/ADT/OwningPtr.h"
2021 #include "llvm/ADT/SmallPtrSet.h"
2122 #include "llvm/ADT/SmallString.h"
2223 #include "llvm/ADT/StringMap.h"
2324 #include "llvm/ADT/Twine.h"
2425 #include "llvm/Config/config.h"
26 #include "llvm/Support/ConvertUTF.h"
2527 #include "llvm/Support/Debug.h"
2628 #include "llvm/Support/ErrorHandling.h"
2729 #include "llvm/Support/Host.h"
433435 O->getNumOccurrencesFlag() == cl::OneOrMore;
434436 }
435437
436 /// ParseCStringVector - Break INPUT up wherever one or more
437 /// whitespace characters are found, and store the resulting tokens in
438 /// OUTPUT. The tokens stored in OUTPUT are dynamically allocated
439 /// using strdup(), so it is the caller's responsibility to free()
440 /// them later.
441 ///
442 static void ParseCStringVector(std::vector &OutputVector,
443 const char *Input) {
444 // Characters which will be treated as token separators:
445 StringRef Delims = " \v\f\t\r\n";
446
447 StringRef WorkStr(Input);
448 while (!WorkStr.empty()) {
449 // If the first character is a delimiter, strip them off.
450 if (Delims.find(WorkStr[0]) != StringRef::npos) {
451 size_t Pos = WorkStr.find_first_not_of(Delims);
452 if (Pos == StringRef::npos) Pos = WorkStr.size();
453 WorkStr = WorkStr.substr(Pos);
/// Returns true if \p C is one of the characters that separate tokens on a
/// command line: space, tab, newline, carriage return, form feed or vertical
/// tab.
///
/// The characters are compared explicitly rather than looked up with strchr():
/// strchr() treats the terminating NUL of its search string as a match, which
/// would classify '\0' as whitespace.
static bool isWhitespace(char C) {
  switch (C) {
  case ' ':
  case '\t':
  case '\n':
  case '\r':
  case '\f':
  case '\v':
    return true;
  default:
    return false;
  }
}
441
/// Returns true if \p C opens or closes a quoted string, i.e. it is either a
/// double quote or a single quote.
static bool isQuote(char C) {
  switch (C) {
  case '"':
  case '\'':
    return true;
  default:
    return false;
  }
}
445
/// Returns true if \p C is a character that a backslash can escape under the
/// GNU tokenization rules: a backslash, a double quote, a single quote or a
/// space.
///
/// The characters are compared explicitly rather than looked up with strchr():
/// strchr() treats the terminating NUL of its search string as a match, which
/// would make a backslash followed by '\0' look like an escape sequence.
static bool isGNUSpecial(char C) {
  return C == '\\' || C == '"' || C == '\'' || C == ' ';
}
449
450 void cl::TokenizeGNUCommandLine(StringRef Src, StringSaver &Saver,
451 SmallVectorImpl &NewArgv) {
452 SmallString<128> Token;
453 for (size_t I = 0, E = Src.size(); I != E; ++I) {
454 // Consume runs of whitespace.
455 if (Token.empty()) {
456 while (I != E && isWhitespace(Src[I]))
457 ++I;
458 if (I == E) break;
459 }
460
461 // Backslashes can escape backslashes, spaces, and other quotes. Otherwise
462 // they are literal. This makes it much easier to read Windows file paths.
463 if (I + 1 < E && Src[I] == '\\' && isGNUSpecial(Src[I + 1])) {
464 ++I; // Skip the escape.
465 Token.push_back(Src[I]);
454466 continue;
455467 }
456468
457 // Find position of first delimiter.
458 size_t Pos = WorkStr.find_first_of(Delims);
459 if (Pos == StringRef::npos) Pos = WorkStr.size();
460
461 // Everything from 0 to Pos is the next word to copy.
462 char *NewStr = (char*)malloc(Pos+1);
463 memcpy(NewStr, WorkStr.data(), Pos);
464 NewStr[Pos] = 0;
465 OutputVector.push_back(NewStr);
466
467 WorkStr = WorkStr.substr(Pos);
468 }
469 // Consume a quoted string.
470 if (isQuote(Src[I])) {
471 char Quote = Src[I++];
472 while (I != E && Src[I] != Quote) {
473 // Backslashes are literal, unless they escape a special character.
474 if (Src[I] == '\\' && I + 1 != E && isGNUSpecial(Src[I + 1]))
475 ++I;
476 Token.push_back(Src[I]);
477 ++I;
478 }
479 if (I == E) break;
480 continue;
481 }
482
483 // End the token if this is whitespace.
484 if (isWhitespace(Src[I])) {
485 if (!Token.empty())
486 NewArgv.push_back(Saver.SaveString(Token.c_str()));
487 Token.clear();
488 continue;
489 }
490
491 // This is a normal character. Append it.
492 Token.push_back(Src[I]);
493 }
494
495 // Append the last token after hitting EOF with no whitespace.
496 if (!Token.empty())
497 NewArgv.push_back(Saver.SaveString(Token.c_str()));
498 }
499
500 void cl::TokenizeWindowsCommandLine(StringRef Src, StringSaver &Saver,
501 SmallVectorImpl &NewArgv) {
502 llvm_unreachable("FIXME not implemented");
503 }
504
505 static bool ExpandResponseFile(const char *FName, StringSaver &Saver,
506 TokenizerCallback Tokenizer,
507 SmallVectorImpl &NewArgv) {
508 OwningPtr MemBuf;
509 if (MemoryBuffer::getFile(FName, MemBuf))
510 return false;
511 StringRef Str(MemBuf->getBufferStart(), MemBuf->getBufferSize());
512
513 // If we have a UTF-16 byte order mark, convert to UTF-8 for parsing.
514 ArrayRef BufRef(MemBuf->getBufferStart(), MemBuf->getBufferEnd());
515 std::string UTF8Buf;
516 if (hasUTF16ByteOrderMark(BufRef)) {
517 if (!convertUTF16ToUTF8String(BufRef, UTF8Buf))
518 return false;
519 Str = StringRef(UTF8Buf);
520 }
521
522 // Tokenize the contents into NewArgv.
523 Tokenizer(Str, Saver, NewArgv);
524
525 return true;
526 }
527
528 /// \brief Expand response files on a command line recursively using the given
529 /// StringSaver and tokenization strategy.
530 bool cl::ExpandResponseFiles(StringSaver &Saver, TokenizerCallback Tokenizer,
531 SmallVectorImpl &Argv) {
532 unsigned RspFiles = 0;
533 bool AllExpanded = false;
534
535 // Don't cache Argv.size() because it can change.
536 for (unsigned I = 0; I != Argv.size(); ) {
537 const char *Arg = Argv[I];
538 if (Arg[0] != '@') {
539 ++I;
540 continue;
541 }
542
543 // If we have too many response files, leave some unexpanded. This avoids
544 // crashing on self-referential response files.
545 if (RspFiles++ > 20)
546 return false;
547
548 // Replace this response file argument with the tokenization of its
549 // contents. Nested response files are expanded in subsequent iterations.
550 // FIXME: If a nested response file uses a relative path, is it relative to
551 // the cwd of the process or the response file?
552 SmallVector ExpandedArgv;
553 if (!ExpandResponseFile(Arg + 1, Saver, Tokenizer, ExpandedArgv)) {
554 AllExpanded = false;
555 continue;
556 }
557 Argv.erase(Argv.begin() + I);
558 Argv.insert(Argv.begin() + I, ExpandedArgv.begin(), ExpandedArgv.end());
559 }
560 return AllExpanded;
561 }
562
563 namespace {
564 class StrDupSaver : public StringSaver {
565 const char *SaveString(const char *Str) LLVM_OVERRIDE {
566 return strdup(Str);
567 }
568 };
469569 }
470570
471571 /// ParseEnvironmentOptions - An alternative entry point to the
486586
487587 // Get program's "name", which we wouldn't know without the caller
488588 // telling us.
489 std::vector> newArgv;
589 SmallVector> newArgv;
490590 newArgv.push_back(strdup(progName));
491591
492592 // Parse the value of the environment variable into a "command line"
493593 // and hand it off to ParseCommandLineOptions().
494 ParseCStringVector(newArgv, envValue);
594 StrDupSaver Saver;
595 TokenizeGNUCommandLine(envValue, Saver, newArgv);
495596 int newArgc = static_cast(newArgv.size());
496597 ParseCommandLineOptions(newArgc, &newArgv[0], Overview);
497598
498599 // Free all the strdup()ed strings.
499 for (std::vector::iterator i = newArgv.begin(), e = newArgv.end();
600 for (SmallVectorImpl::iterator i = newArgv.begin(),
601 e = newArgv.end();
500602 i != e; ++i)
501 free(*i);
502 }
503
504
505 /// ExpandResponseFiles - Copy the contents of argv into newArgv,
506 /// substituting the contents of the response files for the arguments
507 /// of type @file.
508 static void ExpandResponseFiles(unsigned argc, const char*const* argv,
509 std::vector& newArgv) {
510 for (unsigned i = 1; i != argc; ++i) {
511 const char *arg = argv[i];
512
513 if (arg[0] == '@') {
514 // TODO: we should also support recursive loading of response files,
515 // since this is how gcc behaves. (From their man page: "The file may
516 // itself contain additional @file options; any such options will be
517 // processed recursively.")
518
519 // Mmap the response file into memory.
520 OwningPtr respFilePtr;
521 if (!MemoryBuffer::getFile(arg + 1, respFilePtr)) {
522 ParseCStringVector(newArgv, respFilePtr->getBufferStart());
523 continue;
524 }
525 }
526 newArgv.push_back(strdup(arg));
527 }
603 free(const_cast(*i));
528604 }
529605
530606 void cl::ParseCommandLineOptions(int argc, const char * const *argv,
539615 "No options specified!");
540616
541617 // Expand response files.
542 std::vector newArgv;
543 newArgv.push_back(strdup(argv[0]));
544 ExpandResponseFiles(argc, argv, newArgv);
618 SmallVector newArgv;
619 for (int i = 0; i != argc; ++i)
620 newArgv.push_back(strdup(argv[i]));
621 StrDupSaver Saver;
622 ExpandResponseFiles(Saver, TokenizeGNUCommandLine, newArgv);
545623 argv = &newArgv[0];
546624 argc = static_cast(newArgv.size());
547625
837915
838916 // Free the memory allocated by ExpandResponseFiles.
839917 // Free all the strdup()ed strings.
840 for (std::vector::iterator i = newArgv.begin(), e = newArgv.end();
918 for (SmallVectorImpl::iterator i = newArgv.begin(),
919 e = newArgv.end();
841920 i != e; ++i)
842 free(*i);
921 free(const_cast(*i));
843922
844923 // If we had an error processing our arguments, don't let the program execute
845924 if (ErrorParsing) exit(1);
None ; RUN: echo %s > %t.list
1 ; RUN: llvm-as @%t.list -o %t.bc
0 ; Test that we can recurse, at least a little bit. The -time-passes flag here
1 ; is a hack to make sure that neither echo nor the shell expands the response
2 ; file for us. Tokenization with quotes is tested in unittests.
3 ; RUN: echo %s > %t.list1
4 ; RUN: echo "-time-passes @%t.list1" > %t.list2
5 ; RUN: llvm-as @%t.list2 -o %t.bc
26 ; RUN: llvm-nm %t.bc 2>&1 | FileCheck %s
37
48 ; CHECK: T foobar
66 //
77 //===----------------------------------------------------------------------===//
88
9 #include "llvm/ADT/STLExtras.h"
910 #include "llvm/Support/CommandLine.h"
1011 #include "llvm/Config/config.h"
1112 #include "gtest/gtest.h"
117118 "Category.";
118119 }
119120
121 class StrDupSaver : public cl::StringSaver {
122 const char *SaveString(const char *Str) LLVM_OVERRIDE {
123 return strdup(Str);
124 }
125 };
126
127 TEST(CommandLineTest, TokenizeGNUCommandLine) {
128 const char *Input = "foo\\ bar \"foo bar\" \'foo bar\' 'foo\\\\bar' "
129 "foo\"bar\"baz C:\\src\\foo.cpp \"C:\\src\\foo.cpp\"";
130 const char *const Output[] = { "foo bar", "foo bar", "foo bar", "foo\\bar",
131 "foobarbaz", "C:\\src\\foo.cpp",
132 "C:\\src\\foo.cpp" };
133 SmallVector Actual;
134 StrDupSaver Saver;
135 cl::TokenizeGNUCommandLine(Input, Saver, Actual);
136 EXPECT_EQ(array_lengthof(Output), Actual.size());
137 for (unsigned I = 0, E = Actual.size(); I != E; ++I) {
138 if (I < array_lengthof(Output))
139 EXPECT_STREQ(Output[I], Actual[I]);
140 free(const_cast(Actual[I]));
141 }
142 }
143
120144 } // anonymous namespace