I'm wondering if anybody has something faster than this ?
Cheers.
//------------------------------------------------------------------------------------
// Sort
// ----
// Binary radix sort - used for ordering partially transparent entities, etc.
//
// Sorts an array of CSortEntry.
//
// When adding entries to the array:
//
// iMaxKey |= key; // This is therefore an upper bound on
// // unsigned values.
// Initially: iMaxKey = 0;
//
//------------------------------------------------------------------------------------
#pragma warning( push )
#pragma warning( disable : 4731 ) // 4731 = frame pointer register 'ebp' modified by inline assembly code
struct CSortEntry //Used for sorting. Total of 32bits per entry
{
unsigned short key;
unsigned short idx;
};
void CGeneric::Sort( CSortEntry * pArray, int iCount, unsigned int iMaxKey)
{
if( iCount == 0 ) // Early out if required
return;
/*
unsigned int iMask = 1;
while( iMask < iMaxKey )
{
unsigned int Count1 = 0;
CSortEntry * inPtr = pArray;
CSortEntry * outPtr0 = inPtr; // Output 0 is always to the original array
CSortEntry * outPtr1 = CSortArrayTemp; // Output 1 is always to a temporary store
for( int i = 0; i < iCount; i++ )
{
if( (inPtr->key & iMask)==0)
{
*outPtr0++ = *inPtr++;
}
else
{
*outPtr1++ = *inPtr++;
Count1++;
}
};
// Weld the two parts together
memcpy( outPtr0, CSortArrayTemp, (Count1)*sizeof(CSortEntry) );
// Advance mask, to next bit
iMask += iMask;
};
*/
// Use local stack space vars for older compilers not recognising
// parameter vars in _asm{}
unsigned int maxMask = iMaxKey;
CSortEntry * inPtr = pArray;
CSortEntry * outPtr1 = CSortArrayTemp; // A temporary buffer at least as big as pArray.
unsigned int totalEntries = iCount;
_asm
{
pusha
push ebp
push maxMask
push totalEntries
push inPtr
push outPtr1
mov eax, 1 // eax holds the mask
jmp L0002
mov ebx, [esp + 0 ]
mov esi, [esp + 4 ]
L0003a:
mov ecx, [esp + 8 ]
mov edi, esi
lea esi, [esi + ecx*4]
neg ecx
//--------------------------------------------------------------------------
// Main loop -
// ---------
// branchless decision on destination buffer
//--------------------------------------------------------------------------
L0000:
mov ebp, [esi + ecx*4]
test ebp, eax
cmovne edx, ebx
cmove edx, edi
mov [edx], ebp
lea edx, [edx + 4]
cmovne ebx, edx
cmove edi, edx
inc ecx
jl L0000
//--------------------------------------------------------------------------
// End of Main Loop
//--------------------------------------------------------------------------
mov esi, [esp + 0] // Top of stack (push ebx)
sub ebx, esi // Total * 8
jz L0004 // if none then early exit - copy loop won't work for zero!
add esi, ebx
add edi, ebx
neg ebx
L0001:
mov ecx, [esi + ebx]
mov [edi + ebx], ecx
add ebx, 4 // Seperate from jl L001 so that there are no dependencies
jl L0001
L0004:
add eax, eax
L0002:
// Interlace the first two instructions at the destination loop label - this breaks up dependencies
// and is effective since the destination label is always taken except the final time
mov esi, [esp + 4 ]
cmp eax, [esp + 12] //maxMask
mov ebx, [esp + 0 ]
jle L0003a
add esp, 16 // Account for top 4 stack dwords
pop ebp
popa
};
};
#pragma warning( pop )












