Jump to content


x86 Decoder


20 replies to this topic

#1 Nick

    Senior Member

  • Members
  • PipPipPipPip
  • 1225 posts

Posted 19 November 2004 - 11:19 AM

Another assembly related Code Spotlight. A few days ago I noticed that there aren't any public domain simple x86 binary decoders. This can be useful for a disassembler, a debugger, a code optimizer or many low-level tricks. So I decided to write one myself. The whole theory about the x86 instruction format can be found on the sandpile.org site. Instead of using big tables with the encoding format of every instruction, I decided to keep things compact and determine their format purely from the binary code. This is also how an x86 processor does it in hardware...

int instructionCount(unsigned char *func) 

{ 

    int count = 0; 


    while(*func != 0xCC) 

    { 

        // Skip prefixes F0h, F2h, F3h, 66h, 67h, D8h-DFh, 2Eh, 36h, 3Eh, 26h, 64h and 65h

        int operandSize = 4; 

        int FPU = 0; 

        while(*func == 0xF0 || 

              *func == 0xF2 || 

              *func == 0xF3 || 

             (*func & 0xFC) == 0x64 || 

             (*func & 0xF8) == 0xD8 ||

             (*func & 0x7E) == 0x62)

        { 

            if(*func == 0x66) 

            { 

                operandSize = 2; 

            }

            else if((*func & 0xF8) == 0xD8) 

            {

                FPU = *func++;

                break;

            }


            func++;

        }


        // Skip two-byte opcode byte 

        bool twoByte = false; 

        if(*func == 0x0F) 

        { 

            twoByte = true; 

            func++; 

        } 


        // Skip opcode byte 

        unsigned char opcode = *func++; 


        // Skip mod R/M byte 

        unsigned char modRM = 0xFF; 

        if(FPU) 

        { 

            if((opcode & 0xC0) != 0xC0) 

            { 

                modRM = opcode; 

            } 

        } 

        else if(!twoByte) 

        { 

            if((opcode & 0xC4) == 0x00 || 

               (opcode & 0xF4) == 0x60 && ((opcode & 0x0A) == 0x02 || (opcode & 0x09) == 0x9) || 

               (opcode & 0xF0) == 0x80 || 

               (opcode & 0xF8) == 0xC0 && (opcode & 0x0E) != 0x02 || 

               (opcode & 0xFC) == 0xD0 || 

               (opcode & 0xF6) == 0xF6) 

            { 

                modRM = *func++; 

            } 

        } 

        else 

        { 

            if((opcode & 0xF0) == 0x00 && (opcode & 0x0F) >= 0x04 && (opcode & 0x0D) != 0x0D || 

               (opcode & 0xF0) == 0x30 || 

               opcode == 0x77 || 

               (opcode & 0xF0) == 0x80 || 

               (opcode & 0xF0) == 0xA0 && (opcode & 0x07) <= 0x02 || 

               (opcode & 0xF8) == 0xC8) 

            { 

                // No mod R/M byte 

            } 

            else 

            { 

                modRM = *func++; 

            } 

        } 


        // Skip SIB

        if((modRM & 0x07) == 0x04 &&

           (modRM & 0xC0) != 0xC0)

        {

            func += 1;   // SIB

        }


        // Skip displacement

        if((modRM & 0xC5) == 0x05) func += 4;   // Dword displacement, no base 

        if((modRM & 0xC0) == 0x40) func += 1;   // Byte displacement 

        if((modRM & 0xC0) == 0x80) func += 4;   // Dword displacement 


        // Skip immediate 

        if(FPU) 

        { 

            // Can't have immediate operand 

        } 

        else if(!twoByte) 

        { 

            if((opcode & 0xC7) == 0x04 || 

               (opcode & 0xFE) == 0x6A ||   // PUSH/POP/IMUL 

               (opcode & 0xF0) == 0x70 ||   // Jcc 

               opcode == 0x80 || 

               opcode == 0x83 || 

               (opcode & 0xFD) == 0xA0 ||   // MOV 

               opcode == 0xA8 ||            // TEST 

               (opcode & 0xF8) == 0xB0 ||   // MOV

               (opcode & 0xFE) == 0xC0 ||   // RCL 

               opcode == 0xC6 ||            // MOV 

               opcode == 0xCD ||            // INT 

               (opcode & 0xFE) == 0xD4 ||   // AAD/AAM 

               (opcode & 0xF8) == 0xE0 ||   // LOOP/JCXZ 

               opcode == 0xEB || 

               opcode == 0xF6 && (modRM & 0x30) == 0x00)   // TEST 

            { 

                func += 1; 

            } 

            else if((opcode & 0xF7) == 0xC2) 

            { 

                func += 2;   // RET 

            } 

            else if((opcode & 0xFC) == 0x80 || 

                    (opcode & 0xC7) == 0x05 || 

                    (opcode & 0xF8) == 0xB8 ||

                    (opcode & 0xFE) == 0xE8 ||      // CALL/Jcc 

                    (opcode & 0xFE) == 0x68 || 

                    (opcode & 0xFC) == 0xA0 || 

                    (opcode & 0xEE) == 0xA8 || 

                    opcode == 0xC7 || 

                    opcode == 0xF7 && (modRM & 0x30) == 0x00) 

            { 

                func += operandSize; 

            } 

        } 

        else 

        { 

            if(opcode == 0xBA ||            // BT 

               opcode == 0x0F ||            // 3DNow! 

               (opcode & 0xFC) == 0x70 ||   // PSLLW 

               (opcode & 0xF7) == 0xA4 ||   // SHLD 

               opcode == 0xC2 || 

               opcode == 0xC4 || 

               opcode == 0xC5 || 

               opcode == 0xC6) 

            { 

                func += 1; 

            } 

            else if((opcode & 0xF0) == 0x80) 

            {

                func += operandSize;   // Jcc -i

            }

        }


        count++;

    }


    return count;

}

This code actually only counts the number of instructions in a binary buffer. But it's very easy to extend it to a full decoder. It was tested extensively using SoftWire, so it should work flawlessly for 99.9% of all existing instructions. I know this isn't much game development related, but I hope it reaches some people who find it useful.

Enjoy!

Nicolas 'Nick' Capens

#2 cdgray

    New Member

  • Members
  • PipPip
  • 21 posts

Posted 25 November 2004 - 05:51 PM

nice piece of code. did you miss the B0-BF instructions?

#3 Nick

    Senior Member

  • Members
  • PipPipPipPip
  • 1225 posts

Posted 25 November 2004 - 06:12 PM

That's correct. It was noted on flipCode.com as well, but I forgot to update it here. I'll do it right away...

#4 bladder

    DevMaster Staff

  • Moderators
  • 1057 posts

Posted 27 February 2005 - 02:50 AM

I think you're forgetting to check for the following prefixes.

2eh—36h—3eh—26h—64h—65h - the segment overrides

#5 bladder

    DevMaster Staff

  • Moderators
  • 1057 posts

Posted 27 February 2005 - 05:00 AM

Also, the sib byte is not there if the rm portion of the mod R/M byte is 11, so changing this:

if( (modRM & 0x07) == 0x04 ) func++;

to this:

if( (modRM & 0x07) == 0x04 && (modRM & 0xc0) != 0xc0 ) func++;

Seems to have fixed that.

#6 Nick

    Senior Member

  • Members
  • PipPipPipPip
  • 1225 posts

Posted 28 February 2005 - 11:41 PM

Thanks a lot for reporting those bugs bladder!

#7 tyyeh0

    New Member

  • Members
  • Pip
  • 1 posts

Posted 11 October 2005 - 06:15 AM

Is there a final version with the fixes?

I'd appreciate a link or post. Also, is there a verified x86 decoder out there? or is this one pretty bug free after these postings?

thx

#8 earlnsk

    New Member

  • Members
  • Pip
  • 1 posts

Posted 11 May 2007 - 03:13 AM

There are still some bugs in this code. There are alternative code for decoder (http://hack-expo.voi...text/disasm.txt)

#9 dave_

    Senior Member

  • Members
  • PipPipPipPip
  • 584 posts

Posted 11 May 2007 - 08:57 AM

You could try the disassembler from bochs.

#10 Nick

    Senior Member

  • Members
  • PipPipPipPip
  • 1225 posts

Posted 11 May 2007 - 09:37 AM

earlnsk said:

There are still some bugs in this code.
Could you point them out for me?

#11 roel

    Senior Member

  • Members
  • PipPipPipPip
  • 697 posts

Posted 11 May 2007 - 09:47 PM

earlnsk said:

There are still some bugs in this code. There are alternative code for decoder (http://hack-expo.voi...text/disasm.txt)

Can you (or someone else) read russian? It mentions LDE (it looks similar to it), but I'm curious what the text exactly says about it.

#12 Mihail121

    Senior Member

  • Members
  • PipPipPipPip
  • 1052 posts

Posted 12 May 2007 - 09:58 AM

Basically, the guy speaks about how he wrote the universal disassembler LDE and the numerous modifications of it for different tasks, that were developed shorty after. He gives some examples and points, that he only had to check for the length of the processed instructions, since everything else is already matched by the check for EB,E8,E9, 7x, 0F 8x and etc. Then he says, that in some cases, one would wish to know more than simply the length and a long passage starts explaining the disassembler internals.

#13 roel

    Senior Member

  • Members
  • PipPipPipPip
  • 697 posts

Posted 12 May 2007 - 07:33 PM

great, thanks!

#14 Alboin

    New Member

  • Members
  • Pip
  • 3 posts

Posted 30 August 2007 - 03:03 PM

I'm really sorry to post at such an old thread, but can anyone explain to me how this decoder works? It seems to be magic as there is nothing in the Intel manuals at all, and nothing that I could find online.

Thank you!

#15 roel

    Senior Member

  • Members
  • PipPipPipPip
  • 697 posts

Posted 31 August 2007 - 01:58 PM

Since Nick, the author, is here often, he can probably give you right the answer, but in short: there are (obviously) some patterns in the encoding of x86 instructions, and this decoder makes smart usage of it. When I wrote my own equivalent, somewhere in the previous century, I used information from
http://www.sandpile.org/

#16 Alboin

    New Member

  • Members
  • Pip
  • 3 posts

Posted 31 August 2007 - 04:58 PM

I've heard that there is a pattern, but I have been unable to find it myself by studying the opcode encodings. Also, I've used sandpile, but to what I found, they don't have anything on it either. (Just opcode maps, and the like.)

The thing I'm having trouble with is determining if there is a modrm byte based on the opcode bits.

Thanks again!
Alboin

#17 Nick

    Senior Member

  • Members
  • PipPipPipPip
  • 1225 posts

Posted 31 August 2007 - 10:22 PM

Alboin said:

The thing I'm having trouble with is determining if there is a modrm byte based on the opcode bits.
Take a piece of checkered paper and draw a square of 16 by 16 little squares. Now, these 256 squares each represent an opcode (starting with 0x00 in the upper left and 0xFF in the lower right). Color every square corresponding to an opcode that has a ModRM byte. You'll see a few patterns emerge. Now the fun part is to 'cover' these patterns with the least number of logical operations. It's a Karnaugh map with eight variables.

It's most likely not optimal at all, but it gave me this reasonably compact result. A table-based approach might actually be more practical and faster if you need to do some more serious decoding. :ninja:

#18 Alboin

    New Member

  • Members
  • Pip
  • 3 posts

Posted 01 September 2007 - 03:13 PM

Nick said:

It's most likely not optimal at all, but it gave me this reasonably compact result. A table-based approach might actually be more practical and faster if you need to do some more serious decoding. :ninja:
Would a table really be faster? (Speed is somewhat important to me.) Maybe I'll go with that. (I was thinking it would be the slowest method.)

Thanks once again.
Alboin

#19 roel

    Senior Member

  • Members
  • PipPipPipPip
  • 697 posts

Posted 02 September 2007 - 10:11 AM

Sure, a table can be faster. To identify an opcode, one can do it more or less in a few mov reg32, [offset table+reg32*4]-alike instructions. Which is/can/should be faster than many bitwise operations and comparions.

#20 WolfgangSt

    New Member

  • Members
  • Pip
  • 2 posts

Posted 07 December 2007 - 02:35 PM

For speed issues i'd suggest replacing large logical chains like

            if((opcode & 0xC4) == 0x00 || 
               (opcode & 0xF4) == 0x60 && ((opcode & 0x0A) == 0x02 || (opcode & 0x09) == 0x9) || 
               (opcode & 0xF0) == 0x80 || 
               (opcode & 0xF8) == 0xC0 && (opcode & 0x0E) != 0x02 || 
               (opcode & 0xFC) == 0xD0 || 
               (opcode & 0xF6) == 0xF6) 

With a bitmap lookup table that could look like this:
(Sorry the macros look horrible but it makes the tables most readable)
#define BITMASK(b0,b1,b2,b3,b4,b5,b6,b7,b8,b9,b10,b11,b12,b13,b14,b15,b16,b17,\
 b18,b19,b20,b21,b22,b23,b24,b25,b26,b27,b28,b29,b30,b31) \
 (b0) | (b1 << 1) | (b2 << 2) | (b3 << 3) | (b4 << 4) | (b5 << 5) | (b6 << 6) \ 
 | (b7 << 7) | (b8 << 8) | (b9 << 9) | (b10 << 10) | (b11 << 11) | (b12 << 12)\
 | (b13 << 13) | (b14 << 14) | (b15 << 15) | (b16 << 16) | (b17 << 17)\
 | (b18 << 18) | (b19 << 19) | (b20 << 20) | (b21 << 21) | (b22 << 22)\
 | (b23 << 23) | (b24 << 24) | (b25 << 25) | (b26 << 26) | (b27 << 27)\
 | (b28 << 28) | (b29 << 29) | (b30 << 30) | (b31 << 31)
#define IsInTable(element, table) ((table[element >> 5] >> (element & 0x1F)) & 1)

unsigned int modrmtab1[] = 
{
	//      0 1 2 3 4 5 6 7  8 9 A B C D E F
	BITMASK(1,1,1,1,0,0,0,0, 1,1,1,1,0,0,0,0,  // 0
	        1,1,1,1,0,0,0,0, 1,1,1,1,0,0,0,0), // 1
	BITMASK(1,1,1,1,0,0,0,0, 1,1,1,1,0,0,0,0,  // 2
	        1,1,1,1,0,0,0,0, 1,1,1,1,0,0,0,0), // 3
	BITMASK(0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,  // 4
	        0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0), // 5
	BITMASK(0,0,1,1,0,0,0,0, 0,1,0,1,0,0,0,0,  // 6
	        0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0), // 7
	BITMASK(1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,  // 8
	        0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0), // 9
	BITMASK(0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,  // A
	        0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0), // B
	BITMASK(1,1,0,0,1,1,1,1, 0,0,0,0,0,0,0,0,  // C
	        1,1,1,1,0,0,0,0, 0,0,0,0,0,0,0,0), // D
	BITMASK(0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,  // E
	        0,0,0,0,0,0,1,1, 0,0,0,0,0,0,1,1)  // F
};

....
if (IsInTable(op, modrmtab1))
...

Note: On a 64bit machine when writing 64bit code you'd use 64bit types for even more speed

The advantages of using a lookup table like this should be pretty obvious.
Making all table entries 32bit allows aligned access (a byte array would get severe penalty for not having) while the whole table just needs 256bit (32bytes) and thus should be pretty cache friendly when heavily random accessed.
Note that a cache miss for a 256*4 byte table or alignment penalty from a 256byte might be way more costly than the 4 simple ALU instructions to decode the according bit of the table.

Tables however are unlikely to outperform short ALU chains but you'd have to test in practise for this. Having many branches (compares) might breakdown performance alot however.





1 user(s) are reading this topic

0 members, 1 guests, 0 anonymous users