leobenaducci said:
and the asm
__asm int 3
0040103A int 3
b.mul(feb, a, zxy);
__asm int 3
0040103B int 3
Clearly something has gone wrong here... :)
Anyway, I tried to create some kind of swizzle support for a float4 class I was working on, but didn't have much luck. After seeing this thread, I decided to go back and give it another go, and finally succeeded. Here it is, in its entirety:
#pragma once
#include <xmmintrin.h>
// Portable spelling of the 16-byte alignment request: MSVC understands
// __declspec(align), GCC/Clang understand __attribute__((aligned)).
#ifndef FLOAT4_ALIGN16
#if defined(_MSC_VER)
#define FLOAT4_ALIGN16 __declspec(align(16))
#else
#define FLOAT4_ALIGN16 __attribute__((aligned(16)))
#endif
#endif

// Four packed single-precision floats stored in one SSE register.
//
// The lanes are reachable as x/y/z/w, as r/g/b/a, or as the raw __m128 `xmm`.
// Arithmetic and bitwise operators work lane by lane. `shuffle<...>()` returns
// a proxy that can be read from (a re-ordered copy) or assigned to (a
// re-ordered store), which is what the Cg-like swizzle syntax is built on.
struct FLOAT4_ALIGN16 float4
{
private:
    // Proxy bound to the register of one float4 plus a compile-time shuffle
    // immediate. Bits [1:0] of `mask` name the lane used in position x,
    // bits [3:2] position y, bits [5:4] position z and bits [7:6] position w.
    // The default mask is the identity (xyzw).
    template <unsigned mask = ((3 << 6) | (2 << 4) | (1 << 2) | 0)>
    struct swizzle_proxy
    {
        enum
        {
            // Lane named in each of the four swizzle positions.
            sel0 = mask & 3,
            sel1 = (mask >> 2) & 3,
            sel2 = (mask >> 4) & 3,
            sel3 = (mask >> 6) & 3,

            // True when every lane is named exactly once.
            is_permutation = (((1 << sel0) | (1 << sel1) | (1 << sel2) | (1 << sel3)) == 15),

            // For a store, destination lane d must receive the source
            // component whose swizzle position names d; that is the inverse
            // of the read permutation.
            src0 = (sel0 == 0) ? 0 : (sel1 == 0) ? 1 : (sel2 == 0) ? 2 : 3,
            src1 = (sel0 == 1) ? 0 : (sel1 == 1) ? 1 : (sel2 == 1) ? 2 : 3,
            src2 = (sel0 == 2) ? 0 : (sel1 == 2) ? 1 : (sel2 == 2) ? 2 : 3,
            src3 = (sel0 == 3) ? 0 : (sel1 == 3) ? 1 : (sel2 == 3) ? 2 : 3,
            inverse_mask = (src3 << 6) | (src2 << 4) | (src1 << 2) | src0,

            // A mask that repeats a lane has no inverse and is not a
            // meaningful store target; for such masks the historical
            // behaviour (shuffle the source by `mask`) is retained.
            write_mask = is_permutation ? inverse_mask : mask
        };

        __m128 &ref; // register of the float4 this proxy was created from

        swizzle_proxy(__m128 &target)
            : ref(target)
        { }

        swizzle_proxy(const swizzle_proxy &other)
            : ref(other.ref)
        { }

        // Re-ordered copy of the referenced register.
        __m128 get_swizzled() const { return _mm_shuffle_ps(ref, ref, mask); }

        // Swizzled store of a whole vector, e.g. `v.shuffle<1,2,0,3>() = s`
        // sets v.y = s.x, v.z = s.y, v.x = s.z, v.w = s.w.
        // Defined after float4 is complete.
        swizzle_proxy& operator = (const float4 &other);

        // Same-mask proxy assignment. The reference member suppresses the
        // implicit copy assignment, and the template below is never used as
        // one, so it has to be spelled out.
        swizzle_proxy& operator = (const swizzle_proxy &other)
        {
            __m128 data = other.get_swizzled(); // read first: both proxies may alias
            ref = _mm_shuffle_ps(data, data, write_mask);
            return *this;
        }

        // Swizzled store of a swizzled read with a different mask.
        template<unsigned other_mask>
        swizzle_proxy& operator = (const swizzle_proxy<other_mask> &other)
        {
            __m128 data = other.get_swizzled(); // read first: both proxies may alias
            ref = _mm_shuffle_ps(data, data, write_mask);
            return *this;
        }
    };

public:
    // Leaves all four lanes uninitialised.
    float4()
    { }

    // Copies the whole register at once.
    float4(const float4 &other)
        : xmm(other.xmm)
    { }

    explicit float4(const __m128 _xmm)
        : xmm(_xmm)
    { }

    // x = a, y = b, z = c, w = d. _mm_set_ps takes the highest lane first.
    float4(float a, float b, float c, float d)
        : xmm(_mm_set_ps(d, c, b, a))
    { }

    // Materialises a swizzled read, e.g. `float4 v = u.shuffle<3,2,1,0>();`
    template<unsigned mask>
    float4(const swizzle_proxy<mask> &other)
        : xmm(other.get_swizzled())
    { }

    // Lane-wise vector (op) vector.
    float4 operator + (const float4 &other) const { return float4(_mm_add_ps(xmm, other.xmm)); }
    float4 operator - (const float4 &other) const { return float4(_mm_sub_ps(xmm, other.xmm)); }
    float4 operator * (const float4 &other) const { return float4(_mm_mul_ps(xmm, other.xmm)); }
    float4 operator / (const float4 &other) const { return float4(_mm_div_ps(xmm, other.xmm)); }
    float4 operator & (const float4 &other) const { return float4(_mm_and_ps(xmm, other.xmm)); }
    float4 operator | (const float4 &other) const { return float4(_mm_or_ps(xmm, other.xmm)); }
    float4 operator ^ (const float4 &other) const { return float4(_mm_xor_ps(xmm, other.xmm)); }
    float4 andnot(const float4 &other) const { return float4(_mm_andnot_ps(xmm, other.xmm)); } // "~this & other"

    // Lane-wise vector (op) scalar; the scalar is broadcast to all four lanes.
    // The bitwise forms operate on the bit pattern of the broadcast float.
    float4 operator + (float f) const { return float4(_mm_add_ps(xmm, _mm_set_ps1(f))); }
    float4 operator - (float f) const { return float4(_mm_sub_ps(xmm, _mm_set_ps1(f))); }
    float4 operator * (float f) const { return float4(_mm_mul_ps(xmm, _mm_set_ps1(f))); }
    float4 operator / (float f) const { return float4(_mm_div_ps(xmm, _mm_set_ps1(f))); }
    float4 operator & (float f) const { return float4(_mm_and_ps(xmm, _mm_set_ps1(f))); }
    float4 operator | (float f) const { return float4(_mm_or_ps(xmm, _mm_set_ps1(f))); }
    float4 operator ^ (float f) const { return float4(_mm_xor_ps(xmm, _mm_set_ps1(f))); }
    float4 andnot(float f) const { return float4(_mm_andnot_ps(xmm, _mm_set_ps1(f))); } // "~this & f"

    // Assignment from a swizzled read.
    template<unsigned mask>
    float4& operator = (const swizzle_proxy<mask> &other) { xmm = other.get_swizzled(); return *this; }
    float4& operator = (const float4 &other) { xmm = other.xmm; return *this; }

    // Compound assignment, vector operand.
    float4& operator += (const float4 &other) { xmm = _mm_add_ps(xmm, other.xmm); return *this; }
    float4& operator -= (const float4 &other) { xmm = _mm_sub_ps(xmm, other.xmm); return *this; }
    float4& operator *= (const float4 &other) { xmm = _mm_mul_ps(xmm, other.xmm); return *this; }
    float4& operator /= (const float4 &other) { xmm = _mm_div_ps(xmm, other.xmm); return *this; }
    float4& operator &= (const float4 &other) { xmm = _mm_and_ps(xmm, other.xmm); return *this; }
    float4& operator |= (const float4 &other) { xmm = _mm_or_ps(xmm, other.xmm); return *this; }
    float4& operator ^= (const float4 &other) { xmm = _mm_xor_ps(xmm, other.xmm); return *this; }
    float4& andnot_asg(const float4 &other) { xmm = _mm_andnot_ps(xmm, other.xmm); return *this; } // "this = ~this & other"

    // Compound assignment, broadcast scalar operand.
    float4& operator += (float f) { xmm = _mm_add_ps(xmm, _mm_set_ps1(f)); return *this; }
    float4& operator -= (float f) { xmm = _mm_sub_ps(xmm, _mm_set_ps1(f)); return *this; }
    float4& operator *= (float f) { xmm = _mm_mul_ps(xmm, _mm_set_ps1(f)); return *this; }
    float4& operator /= (float f) { xmm = _mm_div_ps(xmm, _mm_set_ps1(f)); return *this; }
    float4& operator &= (float f) { xmm = _mm_and_ps(xmm, _mm_set_ps1(f)); return *this; }
    float4& operator |= (float f) { xmm = _mm_or_ps(xmm, _mm_set_ps1(f)); return *this; }
    float4& operator ^= (float f) { xmm = _mm_xor_ps(xmm, _mm_set_ps1(f)); return *this; }
    float4& andnot_asg(float f) { xmm = _mm_andnot_ps(xmm, _mm_set_ps1(f)); return *this; } // "this = ~this & f"

    // scalar / vector: each lane becomes f / a[lane].
    // (The previous body multiplied a by 1/f, which is a / f.)
    friend float4 operator / (float f, const float4 &a) { return float4(_mm_div_ps(_mm_set_ps1(f), a.xmm)); }

    friend float4 sqrt(const float4 &a) { return float4(_mm_sqrt_ps(a.xmm)); }
    friend float4 rcp(const float4 &a) { return float4(_mm_rcp_ps(a.xmm)); }     // approximate 1/a
    friend float4 rsqrt(const float4 &a) { return float4(_mm_rsqrt_ps(a.xmm)); } // approximate 1/sqrt(a)

    // Sum of the four lanes, delivered in .x only. The scalar adds leave
    // y, z and w equal to those of `a`.
    friend float4 horizontal_add(const float4 &a)
    {
        const __m128 lane1 = _mm_shuffle_ps(a.xmm, a.xmm, 1); // a.y in lane 0
        const __m128 lane2 = _mm_shuffle_ps(a.xmm, a.xmm, 2); // a.z in lane 0
        const __m128 lane3 = _mm_shuffle_ps(a.xmm, a.xmm, 3); // a.w in lane 0
        return float4(_mm_add_ss(a.xmm, _mm_add_ss(lane1, _mm_add_ss(lane2, lane3))));
    }

    friend float4 min(const float4 &a, const float4 &b) { return float4(_mm_min_ps(a.xmm, b.xmm)); }
    friend float4 max(const float4 &a, const float4 &b) { return float4(_mm_max_ps(a.xmm, b.xmm)); }

    // The following four are built on horizontal_add, so only the .x lane of
    // their result carries the dot product / length / reciprocal length /
    // distance.
    friend float4 dot(const float4 &a, const float4 &b) { return horizontal_add(a*b); }
    friend float4 length(const float4 &a) { return sqrt(dot(a, a)); }
    friend float4 rlength(const float4 &a) { return rsqrt(dot(a, a)); }
    friend float4 distance(const float4 &a, const float4 &b) { return length(a-b); }

    // Scales all four lanes of `a` by the approximate reciprocal length.
    // rlength() is only valid in lane 0, so it is broadcast before the
    // multiply; multiplying by the raw vector would scale y/z/w by
    // unrelated values.
    friend float4 normalize(const float4 &a)
    {
        const __m128 inv_len = rlength(a).xmm;
        return float4(_mm_mul_ps(a.xmm, _mm_shuffle_ps(inv_len, inv_len, 0)));
    }

    // Lane-wise max(a, min(b, x)).
    friend float4 clamp(const float4 &x, const float4 &a, const float4 &b) { return max(a, min(b, x)); }

    // 3-component cross product of the xyz lanes: a.yzx * b.zxy - a.zxy * b.yzx.
    friend float4 cross(const float4 &a, const float4 &b)
    {
        enum
        {
            shuf_yzxw = _MM_SHUFFLE(3, 0, 2, 1),
            shuf_zxyw = _MM_SHUFFLE(3, 1, 0, 2)
        };
        __m128 left = _mm_mul_ps(_mm_shuffle_ps(a.xmm, a.xmm, shuf_yzxw), _mm_shuffle_ps(b.xmm, b.xmm, shuf_zxyw));
        __m128 right = _mm_mul_ps(_mm_shuffle_ps(a.xmm, a.xmm, shuf_zxyw), _mm_shuffle_ps(b.xmm, b.xmm, shuf_yzxw));
#if 0
        // Disabled variant that forces .w to 1.
        return float4(_mm_add_ps(_mm_set_ps(1.0f, 0.0f, 0.0f, 0.0f), _mm_sub_ps(left, right)));
#else
        return float4(_mm_sub_ps(left, right)); // .w equals zero
#endif
    }

    // NewtonRaphson Reciprocal
    // [2 * rcpps(a) - (a * rcpps(a) * rcpps(a))]
    friend float4 rcp_nr(const float4 &a)
    {
        float4 ra0 = rcp(a);
        return (ra0 + ra0) - (a * ra0 * ra0);
    }

    // Builds a swizzle proxy over this vector. i0..i3 (each 0..3, meaning
    // x..w) name the lane used in positions x, y, z and w respectively.
    // The proxy refers to this object and must not outlive it.
    template<const unsigned i0, const unsigned i1, const unsigned i2, const unsigned i3>
    swizzle_proxy<(i3 << 6) | (i2 << 4) | (i1 << 2) | i0> shuffle()
    {
        swizzle_proxy<(i3 << 6) | (i2 << 4) | (i1 << 2) | i0> sw(xmm);
        return sw;
    }

public:
    // Three views of the same 16 bytes.
    union
    {
        struct { float x,y,z,w; };
        struct { float r,g,b,a; };
        __m128 xmm;
    };
};

// Swizzled store of a whole float4 (see the declaration in swizzle_proxy).
template<unsigned mask>
float4::swizzle_proxy<mask>& float4::swizzle_proxy<mask>::operator = (const float4 &other)
{
    ref = _mm_shuffle_ps(other.xmm, other.xmm, write_mask);
    return *this;
}
// Test-defines
// Each name expands to a call of the float4 member template shuffle<...>(),
// so that `v.wzyx` reads like a Cg swizzle. The four template arguments are
// lane indices (0 = x, 1 = y, 2 = z, 3 = w) listed in x, y, z, w position order.
// NOTE: these are plain object-like macros, so every later occurrence of one
// of these identifiers in the translation unit is replaced, not only the ones
// written after a float4 object.
#define xyzw shuffle<0,1,2,3>()
#define wzyx shuffle<3,2,1,0>()
#define xyxy shuffle<0,1,0,1>()
#define yzyx shuffle<1,2,1,0>()
#define xxxx shuffle<0,0,0,0>()
#define yyyy shuffle<1,1,1,1>()
#define zzzz shuffle<2,2,2,2>()
#define wwww shuffle<3,3,3,3>()
Using the supplied defines, it is now possible to write code like this (which is pretty close to Cg):
float4 float4_test()
{
float4 f1 = float4(1,2,3,4);
printf("'f1' : %f, %f, %f, %f\n", f1.x, f1.y, f1.z, f1.w);
float4 f3;
f3.wzyx = f1;
printf("'f3.wzyx = f1' : %f, %f, %f, %f\n", f3.x, f3.y, f3.z, f3.w);
float4 f2 = f1.yyyy;
printf("'f2 = f1.yyyy' : %f, %f, %f, %f\n", f2.x, f2.y, f2.z, f2.w);
f2.wzyx = f3.xyxy;
printf("'f2.wzyx = f3.xyxyx' : %f, %f, %f, %f\n", f2.x, f2.y, f2.z, f2.w);
float4 f4 = f2 + f1.wzyx;
f1 = f1.wzyx;
printf("'f1.wzyx' : %f, %f, %f, %f\n", f1.x, f1.y, f1.z, f1.w);
printf("'f2.xyzw' : %f, %f, %f, %f\n", f2.x, f2.y, f2.z, f2.w);
printf("'f4 = f2 + f1.wzyx' : %f, %f, %f, %f\n", f4.x, f4.y, f4.z, f4.w);
float4 f5 = f4 * f2.yzyx;
printf("f5 = 'f4 * f2.yzyx' : %f, %f, %f, %f\n", f5.x, f5.y, f5.z, f5.w);
return f5;
}
which results in the following output:
[INDENT]'f1' : 1.000000, 2.000000, 3.000000, 4.000000
'f3.wzyx = f1' : 4.000000, 3.000000, 2.000000, 1.000000
'f2 = f1.yyyy' : 2.000000, 2.000000, 2.000000, 2.000000
'f2.wzyx = f3.xyxyx' : 3.000000, 4.000000, 3.000000, 4.000000
'f1.wzyx' : 4.000000, 3.000000, 2.000000, 1.000000
'f2.xyzw' : 3.000000, 4.000000, 3.000000, 4.000000
'f4 = f2 + f1.wzyx' : 7.000000, 7.000000, 5.000000, 5.000000
f5 = 'f4 * f2.yzyx' : 28.000000, 21.000000, 20.000000, 15.000000[/INDENT]
And finally, the generated assembly (without all the printf calls):
; 6 : float4 f1 = float4(1,2,3,4);
fld1
fstp DWORD PTR _f1$[esp+16]
fld DWORD PTR __real@40000000
fstp DWORD PTR _f1$[esp+20]
fld DWORD PTR __real@40400000
fstp DWORD PTR _f1$[esp+24]
fld DWORD PTR __real@40800000
fstp DWORD PTR _f1$[esp+28]
; 7 : //printf("'f1' : %f, %f, %f, %f\n", f1.x, f1.y, f1.z, f1.w);
; 8 :
; 9 : float4 f3;
; 10 : f3.wzyx = f1;
movaps xmm1, XMMWORD PTR _f1$[esp+16]
shufps xmm1, xmm1, 27 ; 0000001bH
; 11 : //printf("'f3.wzyx = f1' : %f, %f, %f, %f\n", f3.x, f3.y, f3.z, f3.w);
; 12 :
; 13 : float4 f2 = f1.yyyy;
; 14 : //printf("'f2 = f1.yyyy' : %f, %f, %f, %f\n", f2.x, f2.y, f2.z, f2.w);
; 15 :
; 16 : f2.wzyx = f3.xyxy;
movaps xmm0, xmm1
shufps xmm0, xmm1, 68 ; 00000044H
shufps xmm0, xmm0, 27 ; 0000001bH
; 17 : //printf("'f2.wzyx = f3.xyxyx' : %f, %f, %f, %f\n", f2.x, f2.y, f2.z, f2.w);
; 18 :
; 19 : float4 f4 = f2 + f1.wzyx;
; 20 : f1 = f1.wzyx;
; 21 : //printf("'f1.wzyx' : %f, %f, %f, %f\n", f1.x, f1.y, f1.z, f1.w);
; 22 : //printf("'f2.xyzw' : %f, %f, %f, %f\n", f2.x, f2.y, f2.z, f2.w);
; 23 : //printf("'f4 = f2 + f1.wzyx' : %f, %f, %f, %f\n", f4.x, f4.y, f4.z, f4.w);
; 24 :
; 25 : float4 f5 = f4 * f2.yzyx;
movaps xmm2, xmm0
shufps xmm2, xmm0, 25 ; 00000019H
addps xmm1, xmm0
mulps xmm2, xmm1
movaps XMMWORD PTR [eax], xmm2
; 26 : //printf("f5 = 'f4 * f2.yzyx' : %f, %f, %f, %f\n", f5.x, f5.y, f5.z, f5.w);
; 27 :
; 28 : return f5;
; 29 : }