SSE mat4 inverse

Groove 101 Aug 11, 2009 at 14:00

Hi everyone!

I wrote a function to invert a 4×4 matrix using intrinsic instructions.
I used Cramer's rule, avoided recomputing coefficients, and processed two 2×2 sub-factors at a time.
It involves quite a lot of _mm_shuffle_ps, but considering that this instruction costs 1 cycle on a Core i7, I thought it was a fair trade.

There are probably ways to improve my code further, but the result seems quite nice to me:
On a Core 2 Q6600, built with VC2008, I get 162 cycles; my original FPU implementation takes 918 cycles.
Using the _mm_rcp_ps instruction instead of _mm_div_ps it goes down to 135 cycles, but with some accuracy loss. I would love to see the number of cycles needed on a Core i7!

I have added my mat4 product as well: 63 cycles instead of 378 cycles.

I bet it could be improved further, so I am waiting for your comments!

Matrix inverse:

inline void _mm_inverse_ps(__m128 const in[4], __m128 out[4])
{
    __m128 Fac0;
    {
        //  valType SubFactor00 = m[2][2] * m[3][3] - m[3][2] * m[2][3];
        //  valType SubFactor00 = m[2][2] * m[3][3] - m[3][2] * m[2][3];
        //  valType SubFactor06 = m[1][2] * m[3][3] - m[3][2] * m[1][3];
        //  valType SubFactor13 = m[1][2] * m[2][3] - m[2][2] * m[1][3];

        __m128 Swp0a = _mm_shuffle_ps(in[3], in[2], _MM_SHUFFLE(3, 3, 3, 3));
        __m128 Swp0b = _mm_shuffle_ps(in[3], in[2], _MM_SHUFFLE(2, 2, 2, 2));

        __m128 Swp00 = _mm_shuffle_ps(in[2], in[1], _MM_SHUFFLE(2, 2, 2, 2));
        __m128 Swp01 = _mm_shuffle_ps(Swp0a, Swp0a, _MM_SHUFFLE(2, 0, 0, 0));
        __m128 Swp02 = _mm_shuffle_ps(Swp0b, Swp0b, _MM_SHUFFLE(2, 0, 0, 0));
        __m128 Swp03 = _mm_shuffle_ps(in[2], in[1], _MM_SHUFFLE(3, 3, 3, 3));

        __m128 Mul00 = _mm_mul_ps(Swp00, Swp01);
        __m128 Mul01 = _mm_mul_ps(Swp02, Swp03);
        Fac0 = _mm_sub_ps(Mul00, Mul01);
    }

    __m128 Fac1;
    {
        //  valType SubFactor01 = m[2][1] * m[3][3] - m[3][1] * m[2][3];
        //  valType SubFactor01 = m[2][1] * m[3][3] - m[3][1] * m[2][3];
        //  valType SubFactor07 = m[1][1] * m[3][3] - m[3][1] * m[1][3];
        //  valType SubFactor14 = m[1][1] * m[2][3] - m[2][1] * m[1][3];

        __m128 Swp0a = _mm_shuffle_ps(in[3], in[2], _MM_SHUFFLE(3, 3, 3, 3));
        __m128 Swp0b = _mm_shuffle_ps(in[3], in[2], _MM_SHUFFLE(1, 1, 1, 1));

        __m128 Swp00 = _mm_shuffle_ps(in[2], in[1], _MM_SHUFFLE(1, 1, 1, 1));
        __m128 Swp01 = _mm_shuffle_ps(Swp0a, Swp0a, _MM_SHUFFLE(2, 0, 0, 0));
        __m128 Swp02 = _mm_shuffle_ps(Swp0b, Swp0b, _MM_SHUFFLE(2, 0, 0, 0));
        __m128 Swp03 = _mm_shuffle_ps(in[2], in[1], _MM_SHUFFLE(3, 3, 3, 3));

        __m128 Mul00 = _mm_mul_ps(Swp00, Swp01);
        __m128 Mul01 = _mm_mul_ps(Swp02, Swp03);
        Fac1 = _mm_sub_ps(Mul00, Mul01);
    }


    __m128 Fac2;
    {
        //  valType SubFactor02 = m[2][1] * m[3][2] - m[3][1] * m[2][2];
        //  valType SubFactor02 = m[2][1] * m[3][2] - m[3][1] * m[2][2];
        //  valType SubFactor08 = m[1][1] * m[3][2] - m[3][1] * m[1][2];
        //  valType SubFactor15 = m[1][1] * m[2][2] - m[2][1] * m[1][2];

        __m128 Swp0a = _mm_shuffle_ps(in[3], in[2], _MM_SHUFFLE(2, 2, 2, 2));
        __m128 Swp0b = _mm_shuffle_ps(in[3], in[2], _MM_SHUFFLE(1, 1, 1, 1));

        __m128 Swp00 = _mm_shuffle_ps(in[2], in[1], _MM_SHUFFLE(1, 1, 1, 1));
        __m128 Swp01 = _mm_shuffle_ps(Swp0a, Swp0a, _MM_SHUFFLE(2, 0, 0, 0));
        __m128 Swp02 = _mm_shuffle_ps(Swp0b, Swp0b, _MM_SHUFFLE(2, 0, 0, 0));
        __m128 Swp03 = _mm_shuffle_ps(in[2], in[1], _MM_SHUFFLE(2, 2, 2, 2));

        __m128 Mul00 = _mm_mul_ps(Swp00, Swp01);
        __m128 Mul01 = _mm_mul_ps(Swp02, Swp03);
        Fac2 = _mm_sub_ps(Mul00, Mul01);
    }

    __m128 Fac3;
    {
        //  valType SubFactor03 = m[2][0] * m[3][3] - m[3][0] * m[2][3];
        //  valType SubFactor03 = m[2][0] * m[3][3] - m[3][0] * m[2][3];
        //  valType SubFactor09 = m[1][0] * m[3][3] - m[3][0] * m[1][3];
        //  valType SubFactor16 = m[1][0] * m[2][3] - m[2][0] * m[1][3];

        __m128 Swp0a = _mm_shuffle_ps(in[3], in[2], _MM_SHUFFLE(3, 3, 3, 3));
        __m128 Swp0b = _mm_shuffle_ps(in[3], in[2], _MM_SHUFFLE(0, 0, 0, 0));

        __m128 Swp00 = _mm_shuffle_ps(in[2], in[1], _MM_SHUFFLE(0, 0, 0, 0));
        __m128 Swp01 = _mm_shuffle_ps(Swp0a, Swp0a, _MM_SHUFFLE(2, 0, 0, 0));
        __m128 Swp02 = _mm_shuffle_ps(Swp0b, Swp0b, _MM_SHUFFLE(2, 0, 0, 0));
        __m128 Swp03 = _mm_shuffle_ps(in[2], in[1], _MM_SHUFFLE(3, 3, 3, 3));

        __m128 Mul00 = _mm_mul_ps(Swp00, Swp01);
        __m128 Mul01 = _mm_mul_ps(Swp02, Swp03);
        Fac3 = _mm_sub_ps(Mul00, Mul01);
    }

    __m128 Fac4;
    {
        //  valType SubFactor04 = m[2][0] * m[3][2] - m[3][0] * m[2][2];
        //  valType SubFactor04 = m[2][0] * m[3][2] - m[3][0] * m[2][2];
        //  valType SubFactor10 = m[1][0] * m[3][2] - m[3][0] * m[1][2];
        //  valType SubFactor17 = m[1][0] * m[2][2] - m[2][0] * m[1][2];

        __m128 Swp0a = _mm_shuffle_ps(in[3], in[2], _MM_SHUFFLE(2, 2, 2, 2));
        __m128 Swp0b = _mm_shuffle_ps(in[3], in[2], _MM_SHUFFLE(0, 0, 0, 0));

        __m128 Swp00 = _mm_shuffle_ps(in[2], in[1], _MM_SHUFFLE(0, 0, 0, 0));
        __m128 Swp01 = _mm_shuffle_ps(Swp0a, Swp0a, _MM_SHUFFLE(2, 0, 0, 0));
        __m128 Swp02 = _mm_shuffle_ps(Swp0b, Swp0b, _MM_SHUFFLE(2, 0, 0, 0));
        __m128 Swp03 = _mm_shuffle_ps(in[2], in[1], _MM_SHUFFLE(2, 2, 2, 2));

        __m128 Mul00 = _mm_mul_ps(Swp00, Swp01);
        __m128 Mul01 = _mm_mul_ps(Swp02, Swp03);
        Fac4 = _mm_sub_ps(Mul00, Mul01);
    }

    __m128 Fac5;
    {
        //  valType SubFactor05 = m[2][0] * m[3][1] - m[3][0] * m[2][1];
        //  valType SubFactor05 = m[2][0] * m[3][1] - m[3][0] * m[2][1];
        //  valType SubFactor12 = m[1][0] * m[3][1] - m[3][0] * m[1][1];
        //  valType SubFactor18 = m[1][0] * m[2][1] - m[2][0] * m[1][1];

        __m128 Swp0a = _mm_shuffle_ps(in[3], in[2], _MM_SHUFFLE(1, 1, 1, 1));
        __m128 Swp0b = _mm_shuffle_ps(in[3], in[2], _MM_SHUFFLE(0, 0, 0, 0));

        __m128 Swp00 = _mm_shuffle_ps(in[2], in[1], _MM_SHUFFLE(0, 0, 0, 0));
        __m128 Swp01 = _mm_shuffle_ps(Swp0a, Swp0a, _MM_SHUFFLE(2, 0, 0, 0));
        __m128 Swp02 = _mm_shuffle_ps(Swp0b, Swp0b, _MM_SHUFFLE(2, 0, 0, 0));
        __m128 Swp03 = _mm_shuffle_ps(in[2], in[1], _MM_SHUFFLE(1, 1, 1, 1));

        __m128 Mul00 = _mm_mul_ps(Swp00, Swp01);
        __m128 Mul01 = _mm_mul_ps(Swp02, Swp03);
        Fac5 = _mm_sub_ps(Mul00, Mul01);
    }

    __m128 SignA = _mm_set_ps( 1.0f,-1.0f, 1.0f,-1.0f);
    __m128 SignB = _mm_set_ps(-1.0f, 1.0f,-1.0f, 1.0f);

    // m[1][0]
    // m[0][0]
    // m[0][0]
    // m[0][0]
    __m128 Temp0 = _mm_shuffle_ps(in[1], in[0], _MM_SHUFFLE(0, 0, 0, 0));
    __m128 Vec0 = _mm_shuffle_ps(Temp0, Temp0, _MM_SHUFFLE(2, 2, 2, 0));

    // m[1][1]
    // m[0][1]
    // m[0][1]
    // m[0][1]
    __m128 Temp1 = _mm_shuffle_ps(in[1], in[0], _MM_SHUFFLE(1, 1, 1, 1));
    __m128 Vec1 = _mm_shuffle_ps(Temp1, Temp1, _MM_SHUFFLE(2, 2, 2, 0));

    // m[1][2]
    // m[0][2]
    // m[0][2]
    // m[0][2]
    __m128 Temp2 = _mm_shuffle_ps(in[1], in[0], _MM_SHUFFLE(2, 2, 2, 2));
    __m128 Vec2 = _mm_shuffle_ps(Temp2, Temp2, _MM_SHUFFLE(2, 2, 2, 0));

    // m[1][3]
    // m[0][3]
    // m[0][3]
    // m[0][3]
    __m128 Temp3 = _mm_shuffle_ps(in[1], in[0], _MM_SHUFFLE(3, 3, 3, 3));
    __m128 Vec3 = _mm_shuffle_ps(Temp3, Temp3, _MM_SHUFFLE(2, 2, 2, 0));

    // col0
    // + (Vec1[0] * Fac0[0] - Vec2[0] * Fac1[0] + Vec3[0] * Fac2[0]),
    // - (Vec1[1] * Fac0[1] - Vec2[1] * Fac1[1] + Vec3[1] * Fac2[1]),
    // + (Vec1[2] * Fac0[2] - Vec2[2] * Fac1[2] + Vec3[2] * Fac2[2]),
    // - (Vec1[3] * Fac0[3] - Vec2[3] * Fac1[3] + Vec3[3] * Fac2[3]),
    __m128 Mul00 = _mm_mul_ps(Vec1, Fac0);
    __m128 Mul01 = _mm_mul_ps(Vec2, Fac1);
    __m128 Mul02 = _mm_mul_ps(Vec3, Fac2);
    __m128 Sub00 = _mm_sub_ps(Mul00, Mul01);
    __m128 Add00 = _mm_add_ps(Sub00, Mul02);
    __m128 Inv0 = _mm_mul_ps(SignB, Add00);

    // col1
    // - (Vec0[0] * Fac0[0] - Vec2[0] * Fac3[0] + Vec3[0] * Fac4[0]),
    // + (Vec0[1] * Fac0[1] - Vec2[1] * Fac3[1] + Vec3[1] * Fac4[1]),
    // - (Vec0[2] * Fac0[2] - Vec2[2] * Fac3[2] + Vec3[2] * Fac4[2]),
    // + (Vec0[3] * Fac0[3] - Vec2[3] * Fac3[3] + Vec3[3] * Fac4[3]),
    __m128 Mul03 = _mm_mul_ps(Vec0, Fac0);
    __m128 Mul04 = _mm_mul_ps(Vec2, Fac3);
    __m128 Mul05 = _mm_mul_ps(Vec3, Fac4);
    __m128 Sub01 = _mm_sub_ps(Mul03, Mul04);
    __m128 Add01 = _mm_add_ps(Sub01, Mul05);
    __m128 Inv1 = _mm_mul_ps(SignA, Add01);

    // col2
    // + (Vec0[0] * Fac1[0] - Vec1[0] * Fac3[0] + Vec3[0] * Fac5[0]),
    // - (Vec0[1] * Fac1[1] - Vec1[1] * Fac3[1] + Vec3[1] * Fac5[1]),
    // + (Vec0[2] * Fac1[2] - Vec1[2] * Fac3[2] + Vec3[2] * Fac5[2]),
    // - (Vec0[3] * Fac1[3] - Vec1[3] * Fac3[3] + Vec3[3] * Fac5[3]),
    __m128 Mul06 = _mm_mul_ps(Vec0, Fac1);
    __m128 Mul07 = _mm_mul_ps(Vec1, Fac3);
    __m128 Mul08 = _mm_mul_ps(Vec3, Fac5);
    __m128 Sub02 = _mm_sub_ps(Mul06, Mul07);
    __m128 Add02 = _mm_add_ps(Sub02, Mul08);
    __m128 Inv2 = _mm_mul_ps(SignB, Add02);

    // col3
    // - (Vec0[0] * Fac2[0] - Vec1[0] * Fac4[0] + Vec2[0] * Fac5[0]),
    // + (Vec0[1] * Fac2[1] - Vec1[1] * Fac4[1] + Vec2[1] * Fac5[1]),
    // - (Vec0[2] * Fac2[2] - Vec1[2] * Fac4[2] + Vec2[2] * Fac5[2]),
    // + (Vec0[3] * Fac2[3] - Vec1[3] * Fac4[3] + Vec2[3] * Fac5[3]));
    __m128 Mul09 = _mm_mul_ps(Vec0, Fac2);
    __m128 Mul10 = _mm_mul_ps(Vec1, Fac4);
    __m128 Mul11 = _mm_mul_ps(Vec2, Fac5);
    __m128 Sub03 = _mm_sub_ps(Mul09, Mul10);
    __m128 Add03 = _mm_add_ps(Sub03, Mul11);
    __m128 Inv3 = _mm_mul_ps(SignA, Add03);

    __m128 Row0 = _mm_shuffle_ps(Inv0, Inv1, _MM_SHUFFLE(0, 0, 0, 0));
    __m128 Row1 = _mm_shuffle_ps(Inv2, Inv3, _MM_SHUFFLE(0, 0, 0, 0));
    __m128 Row2 = _mm_shuffle_ps(Row0, Row1, _MM_SHUFFLE(2, 0, 2, 0));

    //  valType Determinant = m[0][0] * Inverse[0][0] 
    //                      + m[0][1] * Inverse[1][0] 
    //                      + m[0][2] * Inverse[2][0] 
    //                      + m[0][3] * Inverse[3][0];
    __m128 Det0 = _mm_dot_ps(in[0], Row2);
    __m128 Rcp0 = _mm_div_ps(one, Det0);
    //__m128 Rcp0 = _mm_rcp_ps(Det0);

    //  Inverse /= Determinant;
    out[0] = _mm_mul_ps(Inv0, Rcp0);
    out[1] = _mm_mul_ps(Inv1, Rcp0);
    out[2] = _mm_mul_ps(Inv2, Rcp0);
    out[3] = _mm_mul_ps(Inv3, Rcp0);
}
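
For completeness, here is roughly how I call it; a minimal untested sketch, where the mat/inv names are just for illustration, the floats must be 16-byte aligned, and each __m128 holds one vector of the matrix:

__declspec(align(16)) float mat[16] = { /* source matrix */ };
__declspec(align(16)) float inv[16];

__m128 in[4], out[4];
for(int i = 0; i < 4; ++i)
    in[i] = _mm_load_ps(mat + 4 * i);   // load the four vectors of the matrix

_mm_inverse_ps(in, out);

for(int i = 0; i < 4; ++i)
    _mm_store_ps(inv + 4 * i, out[i]);  // store the inverted matrix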

Matrix product:

static const __m128 one = _mm_set_ps1(1.0f);

inline void _mm_mul_ps(__m128 in1[4], __m128 in2[4], __m128 out[4])
{
    {
        __m128 e0 = _mm_shuffle_ps(in2[0], in2[0], _MM_SHUFFLE(0, 0, 0, 0));
        __m128 e1 = _mm_shuffle_ps(in2[0], in2[0], _MM_SHUFFLE(1, 1, 1, 1));
        __m128 e2 = _mm_shuffle_ps(in2[0], in2[0], _MM_SHUFFLE(2, 2, 2, 2));
        __m128 e3 = _mm_shuffle_ps(in2[0], in2[0], _MM_SHUFFLE(3, 3, 3, 3));

        __m128 m0 = _mm_mul_ps(in1[0], e0);
        __m128 m1 = _mm_mul_ps(in1[1], e1);
        __m128 m2 = _mm_mul_ps(in1[2], e2);
        __m128 m3 = _mm_mul_ps(in1[3], e3);

        __m128 a0 = _mm_add_ps(m0, m1);
        __m128 a1 = _mm_add_ps(m2, m3);
        __m128 a2 = _mm_add_ps(a0, a1);

        out[0] = a2;
    }

    {
        __m128 e0 = _mm_shuffle_ps(in2[1], in2[1], _MM_SHUFFLE(0, 0, 0, 0));
        __m128 e1 = _mm_shuffle_ps(in2[1], in2[1], _MM_SHUFFLE(1, 1, 1, 1));
        __m128 e2 = _mm_shuffle_ps(in2[1], in2[1], _MM_SHUFFLE(2, 2, 2, 2));
        __m128 e3 = _mm_shuffle_ps(in2[1], in2[1], _MM_SHUFFLE(3, 3, 3, 3));

        __m128 m0 = _mm_mul_ps(in1[0], e0);
        __m128 m1 = _mm_mul_ps(in1[1], e1);
        __m128 m2 = _mm_mul_ps(in1[2], e2);
        __m128 m3 = _mm_mul_ps(in1[3], e3);

        __m128 a0 = _mm_add_ps(m0, m1);
        __m128 a1 = _mm_add_ps(m2, m3);
        __m128 a2 = _mm_add_ps(a0, a1);

        out[1] = a2;
    }

    {
        __m128 e0 = _mm_shuffle_ps(in2[2], in2[2], _MM_SHUFFLE(0, 0, 0, 0));
        __m128 e1 = _mm_shuffle_ps(in2[2], in2[2], _MM_SHUFFLE(1, 1, 1, 1));
        __m128 e2 = _mm_shuffle_ps(in2[2], in2[2], _MM_SHUFFLE(2, 2, 2, 2));
        __m128 e3 = _mm_shuffle_ps(in2[2], in2[2], _MM_SHUFFLE(3, 3, 3, 3));

        __m128 m0 = _mm_mul_ps(in1[0], e0);
        __m128 m1 = _mm_mul_ps(in1[1], e1);
        __m128 m2 = _mm_mul_ps(in1[2], e2);
        __m128 m3 = _mm_mul_ps(in1[3], e3);

        __m128 a0 = _mm_add_ps(m0, m1);
        __m128 a1 = _mm_add_ps(m2, m3);
        __m128 a2 = _mm_add_ps(a0, a1);

        out[2] = a2;
    }

    {
        // The broadcasts could also be done with an integer shuffle
        // (_mm_shuffle_epi32 via _mm_castps_si128 / _mm_castsi128_ps).
        __m128 e0 = _mm_shuffle_ps(in2[3], in2[3], _MM_SHUFFLE(0, 0, 0, 0));
        __m128 e1 = _mm_shuffle_ps(in2[3], in2[3], _MM_SHUFFLE(1, 1, 1, 1));
        __m128 e2 = _mm_shuffle_ps(in2[3], in2[3], _MM_SHUFFLE(2, 2, 2, 2));
        __m128 e3 = _mm_shuffle_ps(in2[3], in2[3], _MM_SHUFFLE(3, 3, 3, 3));

        __m128 m0 = _mm_mul_ps(in1[0], e0);
        __m128 m1 = _mm_mul_ps(in1[1], e1);
        __m128 m2 = _mm_mul_ps(in1[2], e2);
        __m128 m3 = _mm_mul_ps(in1[3], e3);

        __m128 a0 = _mm_add_ps(m0, m1);
        __m128 a1 = _mm_add_ps(m2, m3);
        __m128 a2 = _mm_add_ps(a0, a1);

        out[3] = a2;
    }
}

– EDIT –

Dot product:

//dot
inline __m128 _mm_dot_ps(__m128 v1, __m128 v2)
{
    __m128 mul0 = _mm_mul_ps(v1, v2);
    __m128 swp0 = _mm_shuffle_ps(mul0, mul0, _MM_SHUFFLE(2, 3, 0, 1));
    __m128 add0 = _mm_add_ps(mul0, swp0);
    __m128 swp1 = _mm_shuffle_ps(add0, add0, _MM_SHUFFLE(0, 1, 2, 3));
    __m128 add1 = _mm_add_ps(add0, swp1);
    return add1;
}

The dot product function is pretty basic, so I guess some more cycles could be saved here as well!
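
For instance, if SSE3 is available the two shuffle/add pairs could probably be replaced by horizontal adds. A quick untested sketch (the _mm_dot_ps_sse3 name is just for illustration, and _mm_hadd_ps isn't necessarily faster than shuffle+add on every CPU):

// requires SSE3 (<pmmintrin.h>)
inline __m128 _mm_dot_ps_sse3(__m128 v1, __m128 v2)
{
    __m128 mul0 = _mm_mul_ps(v1, v2);       // (x, y, z, w)
    __m128 had0 = _mm_hadd_ps(mul0, mul0);  // (x+y, z+w, x+y, z+w)
    __m128 had1 = _mm_hadd_ps(had0, had0);  // dot product broadcast to all four lanes
    return had1;
}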

14 Replies


martinsm 101 Aug 11, 2009 at 21:24

Have you compared performance with the matrix inversion from the XNA Math Library in the latest DirectX SDK?

http://msdn.microsoft.com/en-us/library/dd607622(loband).aspx

Nick 102 Aug 12, 2009 at 00:21

@Groove

I would love to see the number of cycles needed on a Core i7!

Core i7’s pipelines are identical to Penryn/Yorkfield as far as I know. Your Q6600 is a Kentsfield architecture though (two Conroe chips). The differences are the addition of a ‘Super Shuffle Engine’, which means _mm_shuffle_ps operations should execute faster, and a radix-16 divider, which makes divisions twice as fast.

I’ll do some actual timing when I get the chance…

By the way, to speed up the division without losing a great deal of precision, try using a Newton-Raphson iteration after using _mm_rcp_ps for the first approximation.
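
Something along these lines; a rough sketch only, reusing the Det0/Rcp0 names from your inverse code:

__m128 Rcp0 = _mm_rcp_ps(Det0);                 // ~12-bit approximation of 1/det
// One Newton-Raphson step: x1 = x0 * (2 - d * x0) = 2*x0 - d*x0*x0
Rcp0 = _mm_sub_ps(_mm_add_ps(Rcp0, Rcp0),
                  _mm_mul_ps(Det0, _mm_mul_ps(Rcp0, Rcp0)));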

Nick 102 Aug 12, 2009 at 08:15

Your code appears to be missing the _mm_dot_ps function (or is this an intrinsic that isn’t recognised by Visual C++ 2005)? Also the ‘one’ variable isn’t defined.

I tried benchmarking it on my Core i7, but the results weren’t consistent. The problem is that it has Turbo Boost, which can increase the clock frequency temporarily. I could disable it to get accurate readings though. Anyway, could you post your benchmarking code so we use the exact same thing? The results can differ depending on code and memory layout…

Groove 101 Aug 12, 2009 at 08:47

@Nick

_mm_shuffle_ps operations should execute faster

That’s exactly why I would love to see the number of cycles on a Core i7 ;)

The code contains 43 _mm_shuffle_ps instructions, each of which takes 2 cycles on the Q6600 if I remember correctly, but only 1 cycle on the i7. So it may save 43 cycles … not bad at all. That’s theory only, though.

JarkkoL 102 Aug 12, 2009 at 10:49

I tested this on an i7 920, and it took 13.051580s (using QPC for timing) to execute 800 million matrix inverses (100 million iterations doing 8 inverses each on different matrices). I changed the code to use _mm_rcp_ps(), and also replaced __m128 Det0 = _mm_dot_ps(in[0], Row2); with __m128 Det0 = _mm_dp_ps(in[0], Row2, 0xff); which is an SSE4 instruction, though. I also changed inline to __forceinline, because otherwise the inverse code had some function calls that significantly degraded the results (to ~30s).

After each inverse I added the result to a static matrix, to make sure the inverse had a side effect and the compiler (MSVC 2008) didn’t optimize the code away, so that added an extra 4 SIMD adds per inverse.

Results were pretty consistent over the 10 runs I did (only 0.02% variation), and the 13.051580s result was from the best run.
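
For reference, the harness was structured roughly like this (an illustrative sketch, not the exact code; the matrices/acc names are made up):

#include <windows.h> // QueryPerformanceCounter / QueryPerformanceFrequency

__m128 matrices[8][4];    // 8 different source matrices, filled with test data elsewhere
__m128 acc[4] = { _mm_setzero_ps(), _mm_setzero_ps(), _mm_setzero_ps(), _mm_setzero_ps() };

LARGE_INTEGER freq, t0, t1;
QueryPerformanceFrequency(&freq);
QueryPerformanceCounter(&t0);

for(unsigned i = 0; i < 100000000u; ++i)      // 100 million iterations
    for(unsigned j = 0; j < 8; ++j)           // 8 inverses each
    {
        __m128 out[4];
        _mm_inverse_ps(matrices[j], out);
        for(int k = 0; k < 4; ++k)
            acc[k] = _mm_add_ps(acc[k], out[k]);   // side effect so the compiler can't drop the inverse
    }

QueryPerformanceCounter(&t1);
double seconds = double(t1.QuadPart - t0.QuadPart) / double(freq.QuadPart);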

Groove 101 Aug 13, 2009 at 08:57

How much benefit did you get using _mm_dp_ps instead of my really basic _mm_dot_ps?

JarkkoL 102 Aug 13, 2009 at 09:52

Ah, I didn’t notice you edited your original post and added the _mm_dot_ps. With _mm_dot_ps the best run was 14.532731s. Without inlining, the times for _mm_dp_ps and _mm_dot_ps are 29.669986s and 30.244751s respectively. I don’t know how much code the compiler is able to omit because of inlining, but I noticed, for example, that with the inlined version there were only 11 shuffles per inverse (vs 33 in the non-inlined version), so the non-inlined versions probably represent real-life performance more closely.

Abnormalia 101 Jan 16, 2010 at 18:43

Awesome work!

Where can I find more information about “intrinsic instructions”?

I’m also interested in whether there are modern books about asm optimizations and the latest CPU extensions.

Thank you in advance.

Kenneth_Gorking 101 Jan 17, 2010 at 16:22

Compiler intrinsics: http://msdn.microsoft.com/en-us/library/26td21ds(VS.80).aspx

There is a butt-load of info on assembly on the net, like here and here.

Abnormalia 101 Jan 17, 2010 at 20:44

Is there a good book on modern asm and extensions like SSE2/3/4/5?

All I found on amazon was this: http://www.amazon.com/exec/obidos/tg/detail/-/6130062222/ref=ord_cart_shr?_encoding=UTF8&m=ATVPDKIKX0DER&v=glance

Is it good for reference?

What will I need if I decide to write a very CPU-dependent vector and matrix library?

martinsm 101 Jan 17, 2010 at 22:07

For reference, use the official PDF docs from the Intel and AMD sites. Links to them are mentioned on Agner’s site.

Abnormalia 101 Jan 18, 2010 at 13:49

@martinsm

For reference, use the official PDF docs from the Intel and AMD sites. Links to them are mentioned on Agner’s site.

Thanks, those guides really are helpful!

Sascha88 101 Sep 05, 2011 at 08:23

Thanks for the link on Amazon!! I found all the necessary information there.

_oisyn 101 Sep 05, 2011 at 10:14

@Groove

//dot
inline __m128 _mm_dot_ps(__m128 v1, __m128 v2)
{
    __m128 mul0 = _mm_mul_ps(v1, v2);
    __m128 swp0 = _mm_shuffle_ps(mul0, mul0, _MM_SHUFFLE(2, 3, 0, 1));
    __m128 add0 = _mm_add_ps(mul0, swp0);
    __m128 swp1 = _mm_shuffle_ps(add0, add0, _MM_SHUFFLE(0, 1, 2, 3));
    __m128 add1 = _mm_add_ps(add0, swp1);
    return add1;
}

The dot product function is pretty basic, so I guess some more cycles could be saved here as well!

Is that even correct? What’s the point of shuffling with (0, 1, 2, 3)? You’ll just end up with your input. You probably meant (1, 0, 3, 2).