- Joined
- Aug 1, 2011
- Messages
- 126
- Reaction score
- 90
Has anyone solved the severe FPS drop that occurs when monsters are rendered?
I am searching for the cause.
Experimental codes:
Please benchmark whether this works better or worse.
I am searching for the cause.
Experimental codes:
C++:
// NOTE(review): this macro is not consumed by anything in this snippet;
// presumably a header included elsewhere checks it - confirm it is needed.
#define _XM_SVML_INTRINSICS_
#include <xmmintrin.h>
#include <emmintrin.h>
#define M_PI 3.14159265358979323846f
// Degrees-to-radians factor (2*pi / 360); the angle-to-matrix routines
// multiply their input angles by it.
float QPIx2d360 = (M_PI * 2 / 360);

// 16-byte alignment for the integer tables below, which are dereferenced as
// __m128 / __m128i. MSVC places the attribute before the declarator,
// GCC/Clang after it.
#if defined(_MSC_VER)
# define ALIGN16_BEG __declspec(align(16))
# define ALIGN16_END
#else
# define ALIGN16_BEG
# define ALIGN16_END __attribute__((aligned(16)))
#endif

// Brace initialiser that puts the same float in all four lanes of a __m128.
// A brace initialiser keeps the constants statically initialised instead of
// running _mm_set1_ps at program start-up.
#define MM_CST_SPLAT_PS(v) { (v), (v), (v), (v) }

// Bit masks. 0x80000000 is an unsigned literal that does not fit in int, so
// it is cast explicitly: brace-initialising an int from it is a narrowing
// conversion, which C++11 and later reject.
static const ALIGN16_BEG int _mm_cst_sign_mask_ps[4] ALIGN16_END =
{ (int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000 };
static const ALIGN16_BEG int _mm_cst_inv_sign_mask_ps[4] ALIGN16_END =
{ ~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000 };
static const ALIGN16_BEG int _mm_cst_sign_mask_pd[4] ALIGN16_END =
{ 0, (int)0x80000000, 0, (int)0x80000000 };
static const ALIGN16_BEG int _mm_cst_inv_sign_mask_pd[4] ALIGN16_END =
{ ~0, ~0x80000000, ~0, ~0x80000000 };

// Small integer constants used on the quadrant index.
static const ALIGN16_BEG int _mm_cst_one[4] ALIGN16_END = { 1, 1, 1, 1 };
static const ALIGN16_BEG int _mm_cst_inv1[4] ALIGN16_END = { ~1, ~1, ~1, ~1 };
static const ALIGN16_BEG int _mm_cst_two[4] ALIGN16_END = { 2, 2, 2, 2 };
static const ALIGN16_BEG int _mm_cst_four[4] ALIGN16_END = { 4, 4, 4, 4 };

// 4/pi: scales the argument to octant units.
static const __m128 _mm_cst_ps_fopi = MM_CST_SPLAT_PS(1.27323954473516f);
static const __m128 _mm_cst_ps_one = MM_CST_SPLAT_PS(1.0f);
// Cosine polynomial coefficients.
static const __m128 _mm_cst_ps_coscof_p0 = MM_CST_SPLAT_PS(2.443315711809948e-5f);
static const __m128 _mm_cst_ps_coscof_p1 = MM_CST_SPLAT_PS(-1.388731625493765e-3f);
static const __m128 _mm_cst_ps_coscof_p2 = MM_CST_SPLAT_PS(4.166664568298827e-2f);
static const __m128 _mm_cst_ps_0p5 = MM_CST_SPLAT_PS(0.5f);
// pi/4 split into three parts (DP1 + DP2 + DP3), subtracted one after the
// other during range reduction.
static const __m128 _mm_cst_ps_DP1 = MM_CST_SPLAT_PS(0.78515625f);
static const __m128 _mm_cst_ps_DP2 = MM_CST_SPLAT_PS(2.4187564849853515625e-4f);
static const __m128 _mm_cst_ps_DP3 = MM_CST_SPLAT_PS(3.77489497744594108e-8f);
// Sine polynomial coefficients.
static const __m128 _mm_cst_ps_sincof_p0 = MM_CST_SPLAT_PS(-1.9515295891e-4f);
static const __m128 _mm_cst_ps_sincof_p1 = MM_CST_SPLAT_PS(8.3321608736e-3f);
static const __m128 _mm_cst_ps_sincof_p2 = MM_CST_SPLAT_PS(-1.6666654611e-1f);
// Computes the sine of each of the four packed floats in x.
// The magnitude is reduced using an even octant index j (the "j = (j+1) & ~1"
// step from Cephes), both the sine and the cosine polynomial are evaluated on
// the reduced value, and a per-lane mask picks the one that applies.
__m128 _mm_sin_ps(__m128 x) {
    // Separate the sign bit from the magnitude; the sign is re-applied last.
    __m128 result_sign = _mm_and_ps(x, *(const __m128 *) _mm_cst_sign_mask_ps);
    __m128 r = _mm_and_ps(x, *(const __m128 *) _mm_cst_inv_sign_mask_ps);

    // j = trunc(|x| * 4/pi), then rounded up to the next even integer.
    __m128i j = _mm_cvttps_epi32(_mm_mul_ps(r, _mm_cst_ps_fopi));
    j = _mm_add_epi32(j, *(const __m128i *) _mm_cst_one);
    j = _mm_and_si128(j, *(const __m128i *) _mm_cst_inv1);
    const __m128 j_as_float = _mm_cvtepi32_ps(j);

    // Bit 2 of j, shifted into the float sign-bit position (4 << 29),
    // flips the sign of the result.
    __m128i flip_bits = _mm_and_si128(j, *(const __m128i *) _mm_cst_four);
    flip_bits = _mm_slli_epi32(flip_bits, 29);

    // All-ones in lanes where bit 1 of j is clear: those lanes take the
    // sine polynomial, the others take the cosine polynomial.
    __m128i bit1_clear = _mm_and_si128(j, *(const __m128i *) _mm_cst_two);
    bit1_clear = _mm_cmpeq_epi32(bit1_clear, _mm_setzero_si128());
    const __m128 take_sin_poly = _mm_castsi128_ps(bit1_clear);

    result_sign = _mm_xor_ps(result_sign, _mm_castsi128_ps(flip_bits));

    // r = ((|x| - j * DP1) - j * DP2) - j * DP3
    const __m128 part1 = _mm_mul_ps(j_as_float, _mm_cst_ps_DP1);
    const __m128 part2 = _mm_mul_ps(j_as_float, _mm_cst_ps_DP2);
    const __m128 part3 = _mm_mul_ps(j_as_float, _mm_cst_ps_DP3);
    r = _mm_sub_ps(_mm_sub_ps(_mm_sub_ps(r, part1), part2), part3);
    const __m128 r2 = _mm_mul_ps(r, r);

    // Cosine polynomial: ((p0*r2 + p1)*r2 + p2)*r2*r2 - r2/2 + 1
    __m128 cos_poly = _mm_mul_ps(_mm_cst_ps_coscof_p0, r2);
    cos_poly = _mm_add_ps(cos_poly, _mm_cst_ps_coscof_p1);
    cos_poly = _mm_mul_ps(cos_poly, r2);
    cos_poly = _mm_add_ps(cos_poly, _mm_cst_ps_coscof_p2);
    cos_poly = _mm_mul_ps(_mm_mul_ps(cos_poly, r2), r2);
    cos_poly = _mm_sub_ps(cos_poly, _mm_mul_ps(r2, _mm_cst_ps_0p5));
    cos_poly = _mm_add_ps(cos_poly, _mm_cst_ps_one);

    // Sine polynomial: ((p0*r2 + p1)*r2 + p2)*r2*r + r
    __m128 sin_poly = _mm_mul_ps(_mm_cst_ps_sincof_p0, r2);
    sin_poly = _mm_add_ps(sin_poly, _mm_cst_ps_sincof_p1);
    sin_poly = _mm_mul_ps(sin_poly, r2);
    sin_poly = _mm_add_ps(sin_poly, _mm_cst_ps_sincof_p2);
    sin_poly = _mm_mul_ps(_mm_mul_ps(sin_poly, r2), r);
    sin_poly = _mm_add_ps(sin_poly, r);

    // Per lane, keep exactly one polynomial (the other is zeroed), then
    // apply the accumulated sign.
    const __m128 kept_cos = _mm_andnot_ps(take_sin_poly, cos_poly);
    const __m128 kept_sin = _mm_and_ps(take_sin_poly, sin_poly);
    return _mm_xor_ps(_mm_add_ps(kept_cos, kept_sin), result_sign);
}
// Computes the cosine of each of the four packed floats in x.
// The sign of the input is discarded up front. The magnitude is reduced using
// an even octant index j (the "j = (j+1) & ~1" step from Cephes), both
// polynomials are evaluated, and a per-lane mask picks the one that applies.
__m128 _mm_cos_ps(__m128 x)
{
    // Work on the magnitude only.
    __m128 r = _mm_and_ps(x, *(const __m128 *) _mm_cst_inv_sign_mask_ps);

    // j = trunc(|x| * 4/pi), then rounded up to the next even integer.
    __m128i j = _mm_cvttps_epi32(_mm_mul_ps(r, _mm_cst_ps_fopi));
    j = _mm_add_epi32(j, *(const __m128i *) _mm_cst_one);
    j = _mm_and_si128(j, *(const __m128i *) _mm_cst_inv1);
    // The float copy is taken before j is shifted by two below.
    const __m128 j_as_float = _mm_cvtepi32_ps(j);

    j = _mm_sub_epi32(j, *(const __m128i *) _mm_cst_two);

    // (~j & 4) shifted into the float sign-bit position (4 << 29).
    __m128i sign_bits = _mm_andnot_si128(j, *(const __m128i *) _mm_cst_four);
    sign_bits = _mm_slli_epi32(sign_bits, 29);
    const __m128 result_sign = _mm_castsi128_ps(sign_bits);

    // All-ones in lanes where bit 1 of the shifted j is clear: those lanes
    // take the sine polynomial, the others take the cosine polynomial.
    __m128i bit1_clear = _mm_and_si128(j, *(const __m128i *) _mm_cst_two);
    bit1_clear = _mm_cmpeq_epi32(bit1_clear, _mm_setzero_si128());
    const __m128 take_sin_poly = _mm_castsi128_ps(bit1_clear);

    // r = ((|x| - j * DP1) - j * DP2) - j * DP3
    const __m128 part1 = _mm_mul_ps(j_as_float, _mm_cst_ps_DP1);
    const __m128 part2 = _mm_mul_ps(j_as_float, _mm_cst_ps_DP2);
    const __m128 part3 = _mm_mul_ps(j_as_float, _mm_cst_ps_DP3);
    r = _mm_sub_ps(_mm_sub_ps(_mm_sub_ps(r, part1), part2), part3);
    const __m128 r2 = _mm_mul_ps(r, r);

    // Cosine polynomial: ((p0*r2 + p1)*r2 + p2)*r2*r2 - r2/2 + 1
    __m128 cos_poly = _mm_mul_ps(_mm_cst_ps_coscof_p0, r2);
    cos_poly = _mm_add_ps(cos_poly, _mm_cst_ps_coscof_p1);
    cos_poly = _mm_mul_ps(cos_poly, r2);
    cos_poly = _mm_add_ps(cos_poly, _mm_cst_ps_coscof_p2);
    cos_poly = _mm_mul_ps(_mm_mul_ps(cos_poly, r2), r2);
    cos_poly = _mm_sub_ps(cos_poly, _mm_mul_ps(r2, _mm_cst_ps_0p5));
    cos_poly = _mm_add_ps(cos_poly, _mm_cst_ps_one);

    // Sine polynomial: ((p0*r2 + p1)*r2 + p2)*r2*r + r
    __m128 sin_poly = _mm_mul_ps(_mm_cst_ps_sincof_p0, r2);
    sin_poly = _mm_add_ps(sin_poly, _mm_cst_ps_sincof_p1);
    sin_poly = _mm_mul_ps(sin_poly, r2);
    sin_poly = _mm_add_ps(sin_poly, _mm_cst_ps_sincof_p2);
    sin_poly = _mm_mul_ps(_mm_mul_ps(sin_poly, r2), r);
    sin_poly = _mm_add_ps(sin_poly, r);

    // Per lane, keep exactly one polynomial (the other is zeroed), then
    // apply the sign.
    const __m128 kept_cos = _mm_andnot_ps(take_sin_poly, cos_poly);
    const __m128 kept_sin = _mm_and_ps(take_sin_poly, sin_poly);
    return _mm_xor_ps(_mm_add_ps(kept_cos, kept_sin), result_sign);
}
// Builds a 3x4 rotation matrix from three angles given in degrees.
//
// angles: angles[0], angles[1] and angles[2] are the rotations named x, y
//         and z below; each is scaled by QPIx2d360 (degrees to radians).
// matrix: receives the 3x3 rotation in columns 0..2; column 3 is set to 0.
//
// The three angles are packed into one vector so sine and cosine are each
// evaluated once, instead of once per angle with three lanes wasted. Every
// lane is computed independently, so the values match the per-angle
// evaluation.
void AngleMatrix(const float angles[3], float matrix[3][4])
{
    // Lane layout: [0] = x, [1] = y, [2] = z, [3] = 0 (unused padding).
    const __m128 radians = _mm_set_ps(0.0f,
                                      angles[2] * QPIx2d360,
                                      angles[1] * QPIx2d360,
                                      angles[0] * QPIx2d360);

    float sines[4];
    float cosines[4];
    _mm_storeu_ps(sines, _mm_sin_ps(radians));
    _mm_storeu_ps(cosines, _mm_cos_ps(radians));

    const float sx = sines[0], sy = sines[1], sz = sines[2];
    const float cx = cosines[0], cy = cosines[1], cz = cosines[2];

    matrix[0][0] = cy * cz;
    matrix[1][0] = cy * sz;
    matrix[2][0] = -sy;
    matrix[0][1] = sx * sy * cz - cx * sz;
    matrix[1][1] = sx * sy * sz + cx * cz;
    matrix[2][1] = sx * cy;
    matrix[0][2] = cx * sy * cz + sx * sz;
    matrix[1][2] = cx * sy * sz - sx * cz;
    matrix[2][2] = cx * cy;
    matrix[0][3] = 0.0f;
    matrix[1][3] = 0.0f;
    matrix[2][3] = 0.0f;
}
// Builds the inverse of the rotation produced by AngleMatrix for the same
// angles (in degrees), stored as a 3x4 matrix.
//
// angles: angles[0], angles[1] and angles[2] are the rotations named x, y
//         and z below; each is scaled by QPIx2d360 (degrees to radians).
// matrix: receives the transposed 3x3 rotation in columns 0..2; column 3 is
//         set to 0.
//
// NOTE(review): the previous body was identical to AngleMatrix, so it
// returned the forward rotation. The inverse of a pure rotation is its
// transpose, which is what the Quake/Half-Life mathlib routine of the same
// name writes - confirm that callers expect the inverse.
void AngleIMatrix (const vec3_t angles, float matrix[3][4] )
{
    // Lane layout: [0] = x, [1] = y, [2] = z, [3] = 0 (unused padding).
    // Packing the angles means sine and cosine are each evaluated once.
    const __m128 radians = _mm_set_ps(0.0f,
                                      angles[2] * QPIx2d360,
                                      angles[1] * QPIx2d360,
                                      angles[0] * QPIx2d360);

    float sines[4];
    float cosines[4];
    _mm_storeu_ps(sines, _mm_sin_ps(radians));
    _mm_storeu_ps(cosines, _mm_cos_ps(radians));

    const float sx = sines[0], sy = sines[1], sz = sines[2];
    const float cx = cosines[0], cy = cosines[1], cz = cosines[2];

    // Same nine terms as AngleMatrix, written to [col][row] instead of
    // [row][col].
    matrix[0][0] = cy * cz;
    matrix[0][1] = cy * sz;
    matrix[0][2] = -sy;
    matrix[1][0] = sx * sy * cz - cx * sz;
    matrix[1][1] = sx * sy * sz + cx * cz;
    matrix[1][2] = sx * cy;
    matrix[2][0] = cx * sy * cz + sx * sz;
    matrix[2][1] = cx * sy * sz - sx * cz;
    matrix[2][2] = cx * cy;
    matrix[0][3] = 0.0f;
    matrix[1][3] = 0.0f;
    matrix[2][3] = 0.0f;
}
Please benchmark whether this works better or worse.
Last edited: