llvm.org GIT mirror llvm / 47e7c04
[lit] Support parsing scripts with inconsistent or invalid encodings. - For whatever reason, we have a lot of test files with bogus unicode characters. This patch allows those scripts to still be parsed on Python3 by changing the parsing logic to work on binary files, and only require the actual script commands to be convertible to ascii. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@188376 91177308-0d34-0410-b5e6-96231b3b80d8 Daniel Dunbar 6 years ago
2 changed file(s) with 45 addition(s) and 12 deletion(s). Raw diff Collapse all Expand all
304304
305305 return False
306306
def parseIntegratedTestScriptCommands(source_path):
    """
    parseIntegratedTestScriptCommands(source_path) -> commands

    Parse the commands in an integrated test script file and generate a
    sequence of (line_number, command_type, line) tuples.

    source_path is the path of the script to scan; it is opened in binary
    mode, so the file as a whole does not need a valid encoding.

    line_number is 1-based, command_type is one of 'RUN', 'XFAIL',
    'REQUIRES' or 'END', and line is the text following the keyword up to
    (but not including) the end of the line.

    This is a generator: nothing is read until iteration starts. A
    UnicodeDecodeError is raised during iteration if a command line is not
    decodable as ascii.
    """

    # This code is carefully written to be dual compatible with Python 2.5+ and
    # Python 3 without requiring input files to always have valid codings. The
    # trick we use is to open the file in binary mode and use the regular
    # expression library to find the commands, with it scanning strings in
    # Python2 and bytes in Python3.
    #
    # Once we find a match, we do require each script line to be decodable to
    # ascii, so we convert the outputs to ascii before returning. This way the
    # remaining code can work with "strings" agnostic of the executing Python
    # version.

    def to_bytes(text):
        # Encode to Latin1 to get binary data.
        return text.encode('ISO-8859-1')

    keywords = ('RUN:', 'XFAIL:', 'REQUIRES:', 'END.')
    # Escape each keyword so that the '.' in 'END.' only matches a literal
    # period rather than any character.
    keywords_re = re.compile(
        to_bytes("(%s)(.*)\n" % ("|".join(re.escape(k) for k in keywords),)))
    newline = to_bytes('\n')

    # Read the entire file contents, and release the handle right away so it
    # is not held open while the generator is suspended between yields.
    f = open(source_path, 'rb')
    try:
        data = f.read()
    finally:
        f.close()

    # The pattern requires a terminating newline; add one if the file lacks
    # it so that a command on the final line is not silently dropped.
    if not data.endswith(newline):
        data = data + newline

    # Iterate over the matches.
    line_number = 1
    last_match_position = 0
    for match in keywords_re.finditer(data):
        # Compute the updated line number by counting the intervening
        # newlines.
        match_position = match.start()
        line_number += data.count(newline, last_match_position,
                                  match_position)
        last_match_position = match_position

        # Convert the keyword and line to ascii and yield the command. The
        # keyword's trailing ':' or '.' is dropped to form the command type.
        keyword, ln = match.groups()
        yield (line_number, keyword[:-1].decode('ascii'),
               ln.decode('ascii'))
325355
326356 def parseIntegratedTestScript(test, normalize_slashes=False,
327357 extra_substitutions=[]):
0 # RUN: true
1
2 # Here is a string that cannot be decoded in line mode: Â.