#include <emmintrin.h> // SSE2 intrinsics used below (_mm_set1_epi32, _mm_castsi128_ps, ...)
static const __m128 GLM_VAR_USED glm_zero = _mm_setzero_ps();
static const __m128 GLM_VAR_USED glm_one = _mm_set_ps1(1.0f);
static const __m128 GLM_VAR_USED glm_half = _mm_set_ps1(0.5f);
static const __m128 GLM_VAR_USED glm_minus_one = _mm_set_ps1(-1.0f);
static const __m128 GLM_VAR_USED glm_two = _mm_set_ps1(2.0f);
static const __m128 GLM_VAR_USED glm_three = _mm_set_ps1(3.0f);
// All bits set except each lane's sign bit (0x7FFFFFFF); ANDing with this
// mask clears the sign, which is how glm_abs_ps computes absolute values.
static const __m128 GLM_VAR_USED glm_abs4_mask = _mm_castsi128_ps(_mm_set1_epi32(0x7FFFFFFF));
static const __m128 GLM_VAR_USED glm_epi32_sign_mask = _mm_castsi128_ps(_mm_set1_epi32(static_cast<int>(0x80000000)));
static const __m128 GLM_VAR_USED glm_ps_2pow23 = _mm_set_ps1(8388608.0f); // 2^23: at or above this, floats have no fraction bits
static const __m128 GLM_VAR_USED glm_ps_1 = _mm_set_ps1(1.0f);
//abs
GLM_FUNC_QUALIFIER __m128 glm_abs_ps(__m128 x)
{
	return _mm_and_ps(glm_abs4_mask, x); // clear each lane's sign bit
}
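// Usage sketch (hypothetical values):
//	__m128 const v = _mm_set_ps(-4.0f, 3.0f, -2.0f, 1.0f); // lanes: 1, -2, 3, -4
//	__m128 const a = glm_abs_ps(v);                        // lanes: 1, 2, 3, 4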
//sign
GLM_FUNC_QUALIFIER __m128 glm_sgn_ps(__m128 x)
{
	// Build -1.0 where x < 0 and +1.0 where x > 0; where x is 0.0 (or NaN)
	// neither compare sets bits, so the result is 0.0, matching GLSL sign().
	__m128 const Cmp0 = _mm_cmplt_ps(x, glm_zero);
	__m128 const Cmp1 = _mm_cmpgt_ps(x, glm_zero);
	__m128 const And0 = _mm_and_ps(Cmp0, glm_minus_one);
	__m128 const And1 = _mm_and_ps(Cmp1, glm_one);
	return _mm_or_ps(And0, And1);
}
//round
GLM_FUNC_QUALIFIER __m128 glm_rnd_ps(__m128 x)
{
	// Bias trick: adding a sign-matched 2^23 pushes the fraction bits out of
	// the significand, so the hardware rounds during the add; subtracting the
	// bias recovers the rounded value. Under the default round-to-nearest-even
	// mode, halfway cases round to even rather than away from zero.
	__m128 const and0 = _mm_and_ps(glm_epi32_sign_mask, x);
	__m128 const or0 = _mm_or_ps(and0, glm_ps_2pow23);
	__m128 const add0 = _mm_add_ps(x, or0);
	__m128 const sub0 = _mm_sub_ps(add0, or0);
	return sub0;
}
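// Scalar sketch of the same trick (round_scalar is a hypothetical helper,
// shown only to illustrate the bias mechanics; assumes |x| < 2^23 and that
// the compiler does not re-associate the arithmetic):
//	float round_scalar(float x)
//	{
//		float const bias = copysignf(8388608.0f, x); // 2^23 with the sign of x
//		return (x + bias) - bias; // hardware rounds during the add
//	}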
//floor
GLM_FUNC_QUALIFIER __m128 glm_flr_ps(__m128 x)
{
	// Round to nearest, then subtract 1.0 from the lanes where rounding went
	// up, so the result is the largest integer not greater than x.
	__m128 const rnd0 = glm_rnd_ps(x);
	__m128 const cmp0 = _mm_cmplt_ps(x, rnd0);
	__m128 const and0 = _mm_and_ps(cmp0, glm_ps_1);
	__m128 const sub0 = _mm_sub_ps(rnd0, and0);
	return sub0;
}
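// Usage sketch (hypothetical values): floor rounds toward negative infinity,
// so negative fractions go down:
//	glm_flr_ps(_mm_set_ps1(-1.25f)); // -2.0f in every lane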
//trunc
//GLM_FUNC_QUALIFIER __m128 _mm_trc_ps(__m128 v)
//{
// return __m128();
//}
//roundEven
GLM_FUNC_QUALIFIER __m128 glm_rde_ps(__m128 x)
{
__m128 const and0 = _mm_and_ps(glm_epi32_sign_mask, x);
__m128 const or0 = _mm_or_ps(and0, glm_ps_2pow23);
__m128 const add0 = _mm_add_ps(x, or0);
__m128 const sub0 = _mm_sub_ps(add0, or0);
return sub0;
}
//ceil
GLM_FUNC_QUALIFIER __m128 glm_ceil_ps(__m128 x)
{
	// Round to nearest, then add 1.0 to the lanes where rounding went down,
	// so the result is the smallest integer not less than x.
	__m128 const rnd0 = glm_rnd_ps(x);
	__m128 const cmp0 = _mm_cmpgt_ps(x, rnd0);
	__m128 const and0 = _mm_and_ps(cmp0, glm_ps_1);
	__m128 const add0 = _mm_add_ps(rnd0, and0);
	return add0;
}
//fract
GLM_FUNC_QUALIFIER __m128 glm_frc_ps(__m128 x)
{
	// fract(x) = x - floor(x); always in [0, 1).
	__m128 const flr0 = glm_flr_ps(x);
	__m128 const sub0 = _mm_sub_ps(x, flr0);
	return sub0;
}
//mod
GLM_FUNC_QUALIFIER __m128 glm_mod_ps(__m128 x, __m128 y)
{
	// GLSL semantics: mod(x, y) = x - y * floor(x / y).
	__m128 const div0 = _mm_div_ps(x, y);
	__m128 const flr0 = glm_flr_ps(div0);
	__m128 const mul0 = _mm_mul_ps(y, flr0);
	__m128 const sub0 = _mm_sub_ps(x, mul0);
	return sub0;
}
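// Note: with floor-based mod the result takes the sign of y, matching GLSL
// mod() rather than C fmodf(), which takes the sign of x. For example:
//	glm_mod_ps(_mm_set_ps1(-1.0f), _mm_set_ps1(3.0f)); // 2.0f in every lane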
//clamp
GLM_FUNC_QUALIFIER __m128 glm_clp_ps(__m128 v, __m128 minVal, __m128 maxVal)
{
	__m128 const min0 = _mm_min_ps(v, maxVal);
	__m128 const max0 = _mm_max_ps(min0, minVal);
	return max0;
}
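// Note: because the min is applied before the max, a caller passing
// minVal > maxVal gets minVal back; GLSL leaves clamp() undefined in that case.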
//mix
GLM_FUNC_QUALIFIER __m128 glm_mix_ps(__m128 v1, __m128 v2, __m128 a)
{
	// mix(v1, v2, a) = v1 * (1 - a) + v2 * a
	__m128 const sub0 = _mm_sub_ps(glm_one, a);
	__m128 const mul0 = _mm_mul_ps(v1, sub0);
	__m128 const mul1 = _mm_mul_ps(v2, a);
	__m128 const add0 = _mm_add_ps(mul0, mul1);
	return add0;
}
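// Note: the two-multiply form above returns exactly v1 at a == 0 and exactly
// v2 at a == 1, unlike the cheaper v1 + a * (v2 - v1) form, which can miss
// v2 at a == 1 once the subtraction rounds.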
//step
GLM_FUNC_QUALIFIER __m128 glm_stp_ps(__m128 edge, __m128 x)
{
	// step(edge, x): 0.0 where x < edge, 1.0 elsewhere, per component.
	__m128 const cmp0 = _mm_cmplt_ps(x, edge);
	return _mm_andnot_ps(cmp0, glm_one); // keep 1.0 only where x >= edge
}
//smoothstep
GLM_FUNC_QUALIFIER __m128 glm_ssp_ps(__m128 edge0, __m128 edge1, __m128 x)
{
	// t = clamp((x - edge0) / (edge1 - edge0), 0, 1); result = t * t * (3 - 2 * t)
	__m128 const sub0 = _mm_sub_ps(x, edge0);
	__m128 const sub1 = _mm_sub_ps(edge1, edge0);
	__m128 const div0 = _mm_div_ps(sub0, sub1);
	__m128 const clp0 = glm_clp_ps(div0, glm_zero, glm_one);
	__m128 const mul0 = _mm_mul_ps(glm_two, clp0);
	__m128 const sub2 = _mm_sub_ps(glm_three, mul0);
	__m128 const mul1 = _mm_mul_ps(clp0, clp0);
	__m128 const mul2 = _mm_mul_ps(mul1, sub2);
	return mul2;
}
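// Usage sketch (hypothetical values): the curve is 0.0 at edge0, 1.0 at
// edge1, and exactly 0.5 at the midpoint:
//	glm_ssp_ps(glm_zero, glm_one, glm_half); // 0.5f in every lane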
/// \todo
//GLM_FUNC_QUALIFIER __m128 glm_nan_ps(__m128 x)
//{
// __m128 empty;
// return empty;
//}
/// \todo
//GLM_FUNC_QUALIFIER __m128 glm_inf_ps(__m128 x)
//{
// __m128 empty;
// return empty;
//}
// SSE scalar reciprocal sqrt using the rsqrt estimate op, refined with one
// Newton-Raphson iteration: r' = 0.5 * r * (3 - x * r * r).
// By Elan Ruskin, http://assemblyrequired.crashworks.org/
GLM_FUNC_QUALIFIER __m128 glm_sqrt_wip_ss(__m128 x)
{
	// Despite the name, this returns 1 / sqrt(x) in the low lane.
	__m128 const recip = _mm_rsqrt_ss(x); // "estimate" opcode
	__m128 const halfrecip = _mm_mul_ss(glm_half, recip);
	__m128 const threeminus_xrr = _mm_sub_ss(glm_three, _mm_mul_ss(x, _mm_mul_ss(recip, recip)));
	return _mm_mul_ss(halfrecip, threeminus_xrr);
}
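// Sketch (glm_sqrt_ss is a hypothetical helper, not part of this file): a
// full scalar sqrt can be layered on the refined rsqrt via sqrt(x) = x * (1 / sqrt(x)):
//	GLM_FUNC_QUALIFIER __m128 glm_sqrt_ss(__m128 x)
//	{
//		return _mm_mul_ss(x, glm_sqrt_wip_ss(x)); // caveat: yields NaN at x == 0 (0 * inf)
//	}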