llvm.org GIT mirror llvm / 0b3aae9
[llvm.py] Implement interface to enhanced disassembler This requires a C++ change to EDDisassembler's ctor to function properly (the llvm::InitializeAll* functions aren't being called currently and there is no way to call them from Python). Code is partially tested and works well enough for initial commit. There are probably many small bugs. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@152506 91177308-0d34-0410-b5e6-96231b3b80d8 Gregory Szorc 7 years ago
2 changed file(s) with 626 addition(s) and 0 deletion(s). Raw diff Collapse all Expand all
0 #===- disassembler.py - Python LLVM Bindings -----------------*- python -*--===#
1 #
2 # The LLVM Compiler Infrastructure
3 #
4 # This file is distributed under the University of Illinois Open Source
5 # License. See LICENSE.TXT for details.
6 #
7 #===------------------------------------------------------------------------===#
8
9 from abc import ABCMeta
10 from abc import abstractmethod
11
12 from ctypes import CFUNCTYPE
13 from ctypes import POINTER
14 from ctypes import byref
15 from ctypes import c_char_p
16 from ctypes import c_int
17 from ctypes import c_ubyte
18 from ctypes import c_uint64
19 from ctypes import c_uint
20 from ctypes import c_void_p
21 from ctypes import memmove
22
23 from .common import CachedProperty
24 from .common import LLVMObject
25 from .common import c_object_p
26 from .common import get_library
27
# Names exported by a star-import of this module.
__all__ = [
    'DisassemblerByteArraySource', 'DisassemblerFileSource',
    'DisassemblerSource', 'Disassembler', 'Instruction', 'Operand', 'Token',
]

# Registry of ctypes callback prototypes, keyed by name. The entries
# ('byte_reader' and 'register_reader') are filled in near the end of this
# module, before register_library() reads them.
callbacks = dict()
39
class DisassemblerSource:
    """Abstract base class describing input for the disassembler.

    Every input handed to the disassembler must implement this interface.

    In essence, an input is a finite, read-only sequence of bytes.
    """
    __metaclass__ = ABCMeta

    @abstractmethod
    def __len__(self):
        """Return the number of bytes that are available for input."""

    @abstractmethod
    def get_byte(self, address):
        """Return the byte at the specified address."""

    @abstractmethod
    def start_address(self):
        """Return the address at which to start fetching bytes, as a long."""
65
class DisassemblerByteArraySource(DisassemblerSource):
    """Disassembler input backed by an in-memory, indexable byte sequence.

    Addresses are used directly as indexes into the wrapped sequence, and
    the reported start address is always 0.
    """

    def __init__(self, b):
        """Wrap the indexable sequence b without copying it."""
        self._array = b

    def start_address(self):
        """Return 0; the first element of the sequence is address 0."""
        return 0

    def get_byte(self, address):
        """Return the element of the wrapped sequence stored at address."""
        return self._array[address]

    def __len__(self):
        """Return the length of the wrapped sequence."""
        return len(self._array)
80
class DisassemblerFileSource(DisassemblerSource):
    """A disassembler source for file segments.

    This allows you to feed segments of a file into a Disassembler. The
    segment is read into memory once, when the source is constructed.
    """

    def __init__(self, filename, start_offset, length=None, end_offset=None,
                 start_address=None):
        """Create a new source from a file.

        A source begins at a specified byte offset and is sized either by a
        byte length or by an end byte offset.

        Arguments:

        filename -- path of the file to read from.
        start_offset -- byte offset in the file at which the segment begins.
        length -- number of bytes to read. Used in preference to end_offset
                  when both are given.
        end_offset -- byte offset in the file at which the segment stops
                      (exclusive). Only consulted when length is None.
        start_address -- address assigned to the first byte of the segment.
                         Defaults to 0.

        Raises ValueError if neither length nor end_offset is given, or if
        the resulting byte count is negative.
        """
        if length is None and end_offset is None:
            raise ValueError('One of length or end_offset must be defined.')

        self._start_address = 0 if start_address is None else start_address

        count = length
        if count is None:
            count = end_offset - start_offset

        # A negative size makes file.read() slurp everything up to EOF, which
        # is never what the caller asked for.
        if count < 0:
            raise ValueError('Segment size must not be negative (got %d).'
                             % count)

        with open(filename, 'rb') as fh:
            fh.seek(start_offset)

            # The file may hold fewer than count bytes past start_offset. The
            # short buffer is kept as-is; __len__ reports what was actually
            # read, so consumers never walk past the available data.
            self._buf = fh.read(count)

    def __len__(self):
        """Return the number of bytes actually read from the file."""
        return len(self._buf)

    def get_byte(self, address):
        """Return the byte stored at address.

        Raises IndexError if address lies outside the segment.
        """
        index = address - self._start_address

        # Without this check a negative index would silently wrap around and
        # return a byte from the end of the buffer.
        if index < 0:
            raise IndexError('Address %d precedes the start of the source.'
                             % address)

        return self._buf[index]

    def start_address(self):
        """Return the address assigned to the first byte of the segment."""
        return self._start_address
119
class Disassembler(LLVMObject):
    """Interface to LLVM's enhanced disassembler.

    The API is slightly different from the C API in that we tightly couple a
    disassembler instance to an input source. This saves an extra level of
    abstraction and makes the Python implementation easier.
    """

    SYNTAX_X86_INTEL = 0
    SYNTAX_X86_ATT = 1
    SYNTAX_ARM_UAL = 2

    def __init__(self, triple, source, syntax=0):
        """Create a new disassembler instance.

        Arguments:

        triple -- str target type (e.g. x86_64-apple-darwin10)
        source -- DisassemblerSource instance to be fed into this disassembler.
        syntax -- The assembly syntax to use. One of the SYNTAX_* class
                  constants. e.g. Disassembler.SYNTAX_X86_INTEL

        Raises Exception if the library returns a non-zero status.
        """
        assert isinstance(source, DisassemblerSource)

        # c_char_p wants a byte string; encode text triples so they are
        # accepted as well. Byte strings and None pass through untouched.
        if not isinstance(triple, bytes) and hasattr(triple, 'encode'):
            triple = triple.encode('ascii')

        ptr = c_object_p()
        result = lib.EDGetDisassembler(byref(ptr), c_char_p(triple),
                                       c_int(syntax))
        if result != 0:
            raise Exception('Non-0 return code.')

        LLVMObject.__init__(self, ptr)

        self._source = source

    def get_instructions(self):
        """Obtain the instructions from the input.

        This is a generator for Instruction instances.

        By default, this will return instructions for the entire source which
        has been defined. It does this by querying the source's start_address()
        method and continues to request instructions until len(source) is
        exhausted.

        Raises Exception if an instruction cannot be obtained or if it
        reports a size that would not advance the current address.
        """

        # We currently obtain 1 instruction at a time because it is easiest.

        # This serves as our EDByteReaderCallback. It is a proxy between C and
        # the Python DisassemblerSource.
        def byte_reader(dest, address, arg):
            try:
                byte = self._source.get_byte(address)

                # String-backed sources hand back 1-character strings while
                # bytearray-like sources hand back ints. Normalize to an int
                # and store it through the pointer; passing an int to
                # memmove() would be interpreted as a memory address.
                if isinstance(byte, (bytes, bytearray)):
                    byte = ord(byte)
                dest[0] = byte

                return 0
            except:
                # Nothing may propagate out of a callback invoked from C, so
                # every failure is reported through the return code instead.
                return -1

        address = self._source.start_address()
        end_address = address + len(self._source)
        cb = callbacks['byte_reader'](byte_reader)
        while address < end_address:
            ptr = c_object_p()

            result = lib.EDCreateInsts(byref(ptr), c_uint(1), self, cb,
                                       address, c_void_p(None))

            if result != 1:
                raise Exception('Error obtaining instruction at address %d' %
                                address)

            instruction = Instruction(ptr, self)
            yield instruction

            size = instruction.byte_size
            if size < 1:
                # A zero-sized instruction would keep us at the same address
                # forever.
                raise Exception('Instruction at address %d has no size.' %
                                address)

            address += size
195
196
class Instruction(LLVMObject):
    """A single instruction produced by the disassembler.

    Instances come from Disassembler.get_instructions().
    """
    def __init__(self, ptr, disassembler):
        """Wrap a native instruction handle.

        Instructions are created from within this module. There should be no
        need to call this from anywhere else.

        Arguments:

        ptr -- c_object_p referring to the native instruction.
        disassembler -- Disassembler the instruction was obtained from.
        """
        assert isinstance(ptr, c_object_p)
        assert isinstance(disassembler, Disassembler)

        LLVMObject.__init__(self, ptr, disposer=lib.EDReleaseInst)
        self._disassembler = disassembler

    def __str__(self):
        """Return the string the library produces for this instruction."""
        text = c_char_p(None)
        if lib.EDGetInstString(byref(text), self) != 0:
            raise Exception('Non-0 return code.')

        return text.value

    @CachedProperty
    def byte_size(self):
        """Value reported by EDInstByteSize for this instruction."""
        size = lib.EDInstByteSize(self)
        if size != -1:
            return size

        raise Exception('Error code returned.')

    @CachedProperty
    def id(self):
        """Value EDInstID writes out for this instruction."""
        ident = c_uint()
        if lib.EDInstID(byref(ident), self) != 0:
            raise Exception('Non-0 return code.')

        return ident.value

    @CachedProperty
    def is_branch(self):
        """True when EDInstIsBranch returns a positive value."""
        status = lib.EDInstIsBranch(self)
        if status != -1:
            return status > 0

        raise Exception('Error code returned.')

    @CachedProperty
    def is_move(self):
        """True when EDInstIsMove returns a positive value."""
        status = lib.EDInstIsMove(self)
        if status != -1:
            return status > 0

        raise Exception('Error code returned.')

    @CachedProperty
    def branch_target_id(self):
        """Value returned by EDBranchTargetID; -1 is treated as an error."""
        target = lib.EDBranchTargetID(self)
        if target != -1:
            return target

        raise Exception('Error code returned.')

    @CachedProperty
    def move_source_id(self):
        """Value returned by EDMoveSourceID; -1 is treated as an error."""
        source = lib.EDMoveSourceID(self)
        if source != -1:
            return source

        raise Exception('Error code returned.')

    def get_tokens(self):
        """Generate a Token for each token in this instruction."""
        total = lib.EDNumTokens(self)
        if total == -1:
            raise Exception('Error code returned.')

        for index in range(total):
            handle = c_object_p()
            if lib.EDGetToken(byref(handle), self, c_int(index)) != 0:
                raise Exception('Non-0 return code.')

            yield Token(handle, self)

    def get_operands(self):
        """Generate an Operand for each operand in this instruction."""
        total = lib.EDNumOperands(self)
        if total == -1:
            raise Exception('Error code returned.')

        for index in range(total):
            handle = c_object_p()
            if lib.EDGetOperand(byref(handle), self, c_int(index)) != 0:
                raise Exception('Non-0 return code.')

            yield Operand(handle, self)
304
class Token(LLVMObject):
    """Represents a token in an instruction.

    Token instances are obtained from Instruction.get_tokens().
    """
    def __init__(self, ptr, instruction):
        """Wrap a native token handle.

        Arguments:

        ptr -- c_object_p referring to the native token.
        instruction -- Instruction the token was obtained from.
        """
        assert isinstance(ptr, c_object_p)
        assert isinstance(instruction, Instruction)

        LLVMObject.__init__(self, ptr)

        self._instruction = instruction

    def __str__(self):
        """Return the string the library produces for this token."""
        s = c_char_p(None)
        result = lib.EDGetTokenString(byref(s), self)
        if result != 0:
            raise Exception('Non-0 return code.')

        return s.value

    @CachedProperty
    def operand_index(self):
        """Value returned by EDOperandIndexForToken; -1 raises Exception."""
        result = lib.EDOperandIndexForToken(self)
        if result == -1:
            raise Exception('Error code returned.')

        return result

    @CachedProperty
    def is_whitespace(self):
        """True when EDTokenIsWhitespace returns a positive value."""
        result = lib.EDTokenIsWhitespace(self)
        if result == -1:
            raise Exception('Error code returned.')

        return result > 0

    @CachedProperty
    def is_punctuation(self):
        """True when EDTokenIsPunctuation returns a positive value."""
        result = lib.EDTokenIsPunctuation(self)
        if result == -1:
            raise Exception('Error code returned.')

        return result > 0

    @CachedProperty
    def is_opcode(self):
        """True when EDTokenIsOpcode returns a positive value."""
        result = lib.EDTokenIsOpcode(self)
        if result == -1:
            raise Exception('Error code returned.')

        return result > 0

    @CachedProperty
    def is_literal(self):
        """True when EDTokenIsLiteral returns a positive value."""
        result = lib.EDTokenIsLiteral(self)
        if result == -1:
            raise Exception('Error code returned.')

        return result > 0

    @CachedProperty
    def is_register(self):
        """True when EDTokenIsRegister returns a positive value."""
        result = lib.EDTokenIsRegister(self)
        if result == -1:
            raise Exception('Error code returned.')

        return result > 0

    @CachedProperty
    def is_negative_literal(self):
        """True when EDTokenIsNegativeLiteral returns a positive value."""
        result = lib.EDTokenIsNegativeLiteral(self)
        if result == -1:
            raise Exception('Error code returned.')

        return result > 0

    @CachedProperty
    def absolute_value(self):
        """Integer written out by EDLiteralTokenAbsoluteValue."""
        value = c_uint64()
        result = lib.EDLiteralTokenAbsoluteValue(byref(value), self)
        if result != 0:
            raise Exception('Non-0 return code.')

        # Unwrap the ctypes holder so callers get a plain integer, as
        # Instruction.id does.
        return value.value

    @CachedProperty
    def register_value(self):
        """Integer written out by EDRegisterTokenValue."""
        value = c_uint()
        result = lib.EDRegisterTokenValue(byref(value), self)
        if result != 0:
            raise Exception('Non-0 return code.')

        # Unwrap the ctypes holder so callers get a plain integer, as
        # Instruction.id does.
        return value.value
395
class Operand(LLVMObject):
    """Represents an operand in an instruction.

    Operand instances are obtained from Instruction.get_operands().

    FIXME support register evaluation.
    """
    def __init__(self, ptr, instruction):
        """Wrap a native operand handle.

        Arguments:

        ptr -- c_object_p referring to the native operand.
        instruction -- Instruction the operand was obtained from.
        """
        assert isinstance(ptr, c_object_p)
        assert isinstance(instruction, Instruction)

        LLVMObject.__init__(self, ptr)

        self._instruction = instruction

    @CachedProperty
    def is_register(self):
        """True when EDOperandIsRegister returns a positive value."""
        result = lib.EDOperandIsRegister(self)
        if result == -1:
            raise Exception('Error code returned.')

        return result > 0

    @CachedProperty
    def is_immediate(self):
        """True when EDOperandIsImmediate returns a positive value."""
        result = lib.EDOperandIsImmediate(self)
        if result == -1:
            raise Exception('Error code returned.')

        return result > 0

    @CachedProperty
    def is_memory(self):
        """True when EDOperandIsMemory returns a positive value."""
        result = lib.EDOperandIsMemory(self)
        if result == -1:
            raise Exception('Error code returned.')

        return result > 0

    @CachedProperty
    def register_value(self):
        """Integer written out by EDRegisterOperandValue."""
        value = c_uint()
        result = lib.EDRegisterOperandValue(byref(value), self)
        if result != 0:
            raise Exception('Non-0 return code.')

        # Unwrap the ctypes holder so callers get a plain integer, as
        # Instruction.id does.
        return value.value

    @CachedProperty
    def immediate_value(self):
        """Integer written out by EDImmediateOperandValue."""
        value = c_uint64()
        result = lib.EDImmediateOperandValue(byref(value), self)
        if result != 0:
            raise Exception('Non-0 return code.')

        # Unwrap the ctypes holder so callers get a plain integer, as
        # Instruction.id does.
        return value.value
450
def register_library(library):
    """Declare ctypes prototypes for the enhanced disassembler functions.

    Sets argtypes and restype on each ED* function of library.

    Arguments:

    library -- the loaded ctypes library object exposing the ED* functions.

    The 'byte_reader' and 'register_reader' entries of the module-level
    callbacks dict must be populated before this is called.
    """
    byte_reader = callbacks['byte_reader']
    register_reader = callbacks['register_reader']

    # (function name, argument types, return type). A return type of None
    # tells ctypes the function returns void.
    prototypes = (
        ('EDGetDisassembler', [POINTER(c_object_p), c_char_p, c_int], c_int),
        ('EDGetRegisterName', [POINTER(c_char_p), Disassembler, c_uint],
         c_int),
        ('EDRegisterIsStackPointer', [Disassembler, c_uint], c_int),
        ('EDRegisterIsProgramCounter', [Disassembler, c_uint], c_int),
        ('EDCreateInsts', [POINTER(c_object_p), c_uint, Disassembler,
                           byte_reader, c_uint64, c_void_p], c_uint),
        ('EDReleaseInst', [Instruction], None),
        ('EDInstByteSize', [Instruction], c_int),
        ('EDGetInstString', [POINTER(c_char_p), Instruction], c_int),
        ('EDInstID', [POINTER(c_uint), Instruction], c_int),
        ('EDInstIsBranch', [Instruction], c_int),
        ('EDInstIsMove', [Instruction], c_int),
        ('EDBranchTargetID', [Instruction], c_int),
        ('EDMoveSourceID', [Instruction], c_int),
        ('EDMoveTargetID', [Instruction], c_int),
        ('EDNumTokens', [Instruction], c_int),
        ('EDGetToken', [POINTER(c_object_p), Instruction, c_int], c_int),
        ('EDGetTokenString', [POINTER(c_char_p), Token], c_int),
        ('EDOperandIndexForToken', [Token], c_int),
        ('EDTokenIsWhitespace', [Token], c_int),
        ('EDTokenIsPunctuation', [Token], c_int),
        ('EDTokenIsOpcode', [Token], c_int),
        ('EDTokenIsLiteral', [Token], c_int),
        ('EDTokenIsRegister', [Token], c_int),
        ('EDTokenIsNegativeLiteral', [Token], c_int),
        ('EDLiteralTokenAbsoluteValue', [POINTER(c_uint64), Token], c_int),
        ('EDRegisterTokenValue', [POINTER(c_uint), Token], c_int),
        ('EDNumOperands', [Instruction], c_int),
        ('EDGetOperand', [POINTER(c_object_p), Instruction, c_int], c_int),
        ('EDOperandIsRegister', [Operand], c_int),
        ('EDOperandIsImmediate', [Operand], c_int),
        ('EDOperandIsMemory', [Operand], c_int),
        ('EDRegisterOperandValue', [POINTER(c_uint), Operand], c_int),
        ('EDImmediateOperandValue', [POINTER(c_uint64), Operand], c_int),
        # The result is an out-parameter (uint64_t *), like the other
        # *Value functions above, not a by-value integer.
        ('EDEvaluateOperand', [POINTER(c_uint64), Operand, register_reader,
                               c_void_p], c_int),
    )

    for name, argtypes, restype in prototypes:
        function = getattr(library, name)
        function.argtypes = argtypes
        function.restype = restype
555
# Enhanced disassembler.
#
# Callback prototypes are registered first because register_library() looks
# them up in the callbacks dict while declaring function signatures.
callbacks.update(
    byte_reader=CFUNCTYPE(c_int, POINTER(c_ubyte), c_uint64, c_void_p),
    register_reader=CFUNCTYPE(c_int, POINTER(c_uint64), c_uint, c_void_p),
)

# Load the shared library and declare its prototypes at import time.
lib = get_library()
register_library(lib)
0 from unittest import expectedFailure
1 from unittest import skip
2
3 from .base import TestBase
4 from ..disassembler import DisassemblerByteArraySource
5 from ..disassembler import DisassemblerFileSource
6 from ..disassembler import Disassembler
7 from ..object import ObjectFile
8
class TestDisassembler(TestBase):
    """Tests for the enhanced disassembler bindings."""

    def test_simple(self):
        """Disassemble one x86 instruction and inspect it and its tokens."""
        sequence = '\x67\xe3\x81' # jcxz -127
        triple = 'i686-apple-darwin9'

        source = DisassemblerByteArraySource(sequence)

        disassembler = Disassembler(triple, source)
        instructions = list(disassembler.get_instructions())

        self.assertEqual(len(instructions), 1)

        i = instructions[0]
        self.assertEqual(str(i), '\tjcxz\t-127\n')
        self.assertEqual(i.byte_size, 3)
        self.assertEqual(i.id, 1032)
        self.assertTrue(i.is_branch)
        self.assertFalse(i.is_move)
        self.assertEqual(i.branch_target_id, 0)

        tokens = list(i.get_tokens())
        self.assertEqual(len(tokens), 4)
        token = tokens[0]
        self.assertEqual(str(token), 'jcxz')
        self.assertFalse(token.is_whitespace)
        self.assertFalse(token.is_punctuation)
        self.assertTrue(token.is_opcode)
        self.assertFalse(token.is_literal)
        self.assertFalse(token.is_register)

        self.assertTrue(tokens[1].is_whitespace)

        operands = list(i.get_operands())
        self.assertEqual(len(operands), 1)

        # TODO implement operand tests

    @skip('This test is horribly broken and probably not even correct.')
    def test_read_instructions(self):
        """Disassemble every symbol of the test binary (currently skipped)."""
        filename = self.get_test_binary()
        o = ObjectFile(filename=filename)

        for symbol in o.get_symbols():
            address = symbol.address
            offset = symbol.file_offset
            size = symbol.size

            source = DisassemblerFileSource(filename, offset, length=size,
                                            start_address=address)

            disassembler = Disassembler('x86-generic-gnu-linux', source)
            for instruction in disassembler.get_instructions():
                # The parenthesized form prints the same text on Python 2
                # and is also valid Python 3 syntax.
                print(instruction)