From b1a707fac854753088d9b17ceef11d20dd01ad20 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jukka=20Jyl=C3=A4nki?=
Date: Wed, 6 May 2020 16:57:16 +0300
Subject: [PATCH] Micro-optimize float4x4 operations when SIMD is not supported

The scalar code paths load each row of the left-hand matrix into locals
before storing the matching row of the result, so an in-place multiply
(_result == _a) produces the same values as the SIMD code path.

---
 include/bx/float4x4_t.h          |  15 +++++-
 include/bx/inline/float4x4_t.inl | 108 ++++++++++++++++++++++++++++++++++++
 2 files changed, 122 insertions(+), 1 deletion(-)

diff --git a/include/bx/float4x4_t.h b/include/bx/float4x4_t.h
index 973d4a7..c64365a 100644
--- a/include/bx/float4x4_t.h
+++ b/include/bx/float4x4_t.h
@@ -22,9 +22,22 @@ namespace bx
 	///
 	simd128_t simd_mul(simd128_t _a, const float4x4_t* _b);
 
-	///
+	/// Multiplies two 4x4 matrices: _result = _a * _b.
+	/// _result may alias _a (in-place multiply), but must not alias _b.
 	void float4x4_mul(float4x4_t* _result, const float4x4_t* _a, const float4x4_t* _b);
 
+	/// Multiplies two 3x4 affine matrices (i.e. "model" or "world" matrices).
+	/// This function is a micro-optimized version of float4x4_mul() for the case
+	/// when the last row of both input matrices is (0, 0, 0, 1).
+	/// _result may alias _a, but must not alias _b.
+	void model4x4_mul(float4x4_t* _result, const float4x4_t* _a, const float4x4_t* _b);
+
+	/// Multiplies a 3x4 affine matrix with a general 4x4 matrix.
+	/// This function is a micro-optimized version of float4x4_mul() for the case
+	/// when the last row of the _model input matrix is (0, 0, 0, 1).
+	/// _result may alias _model, but must not alias _viewProj.
+	void model4x4_mul_viewproj4x4(float4x4_t* _result, const float4x4_t* _model, const float4x4_t* _viewProj);
+
 	///
 	void float4x4_transpose(float4x4_t* _result, const float4x4_t* _mtx);
 
diff --git a/include/bx/inline/float4x4_t.inl b/include/bx/inline/float4x4_t.inl
index e07057b..fb459ef 100644
--- a/include/bx/inline/float4x4_t.inl
+++ b/include/bx/inline/float4x4_t.inl
@@ -40,10 +40,118 @@ namespace bx
 
 	BX_SIMD_INLINE void float4x4_mul(float4x4_t* _result, const float4x4_t* _a, const float4x4_t* _b)
 	{
+#if !BX_CONFIG_SUPPORTS_SIMD
+		const float* a = (const float*)_a;
+		const float* b = (const float*)_b;
+		float* r = (float*)_result;
+
+		// Each row of _a is loaded into locals before the matching row of _result
+		// is stored, so _result may alias _a exactly like in the SIMD path below.
+		const float a0 = a[0], a1 = a[1], a2 = a[2], a3 = a[3];
+		r[0] = a0*b[0] + a1*b[4] + a2*b[8] + a3*b[12];
+		r[1] = a0*b[1] + a1*b[5] + a2*b[9] + a3*b[13];
+		r[2] = a0*b[2] + a1*b[6] + a2*b[10] + a3*b[14];
+		r[3] = a0*b[3] + a1*b[7] + a2*b[11] + a3*b[15];
+
+		const float a4 = a[4], a5 = a[5], a6 = a[6], a7 = a[7];
+		r[4] = a4*b[0] + a5*b[4] + a6*b[8] + a7*b[12];
+		r[5] = a4*b[1] + a5*b[5] + a6*b[9] + a7*b[13];
+		r[6] = a4*b[2] + a5*b[6] + a6*b[10] + a7*b[14];
+		r[7] = a4*b[3] + a5*b[7] + a6*b[11] + a7*b[15];
+
+		const float a8 = a[8], a9 = a[9], a10 = a[10], a11 = a[11];
+		r[8] = a8*b[0] + a9*b[4] + a10*b[8] + a11*b[12];
+		r[9] = a8*b[1] + a9*b[5] + a10*b[9] + a11*b[13];
+		r[10] = a8*b[2] + a9*b[6] + a10*b[10] + a11*b[14];
+		r[11] = a8*b[3] + a9*b[7] + a10*b[11] + a11*b[15];
+
+		const float a12 = a[12], a13 = a[13], a14 = a[14], a15 = a[15];
+		r[12] = a12*b[0] + a13*b[4] + a14*b[8] + a15*b[12];
+		r[13] = a12*b[1] + a13*b[5] + a14*b[9] + a15*b[13];
+		r[14] = a12*b[2] + a13*b[6] + a14*b[10] + a15*b[14];
+		r[15] = a12*b[3] + a13*b[7] + a14*b[11] + a15*b[15];
+#else
 		_result->col[0] = simd_mul(_a->col[0], _b);
 		_result->col[1] = simd_mul(_a->col[1], _b);
 		_result->col[2] = simd_mul(_a->col[2], _b);
 		_result->col[3] = simd_mul(_a->col[3], _b);
+#endif
+	}
+
+	BX_SIMD_INLINE void model4x4_mul(float4x4_t* _result, const float4x4_t* _a, const float4x4_t* _b)
+	{
+#if !BX_CONFIG_SUPPORTS_SIMD
+		const float* a = (const float*)_a; // a[3]==a[7]==a[11]==0, a[15]==1
+		const float* b = (const float*)_b; // b[3]==b[7]==b[11]==0, b[15]==1
+		float* r = (float*)_result;
+
+		// Terms multiplied by the known 0/1 elements are folded away. Each row of
+		// _a is loaded before the result row is stored, so _result may alias _a.
+		const float a0 = a[0], a1 = a[1], a2 = a[2];
+		r[0] = a0*b[0] + a1*b[4] + a2*b[8];
+		r[1] = a0*b[1] + a1*b[5] + a2*b[9];
+		r[2] = a0*b[2] + a1*b[6] + a2*b[10];
+		r[3] = 0.f;
+
+		const float a4 = a[4], a5 = a[5], a6 = a[6];
+		r[4] = a4*b[0] + a5*b[4] + a6*b[8];
+		r[5] = a4*b[1] + a5*b[5] + a6*b[9];
+		r[6] = a4*b[2] + a5*b[6] + a6*b[10];
+		r[7] = 0.f;
+
+		const float a8 = a[8], a9 = a[9], a10 = a[10];
+		r[8] = a8*b[0] + a9*b[4] + a10*b[8];
+		r[9] = a8*b[1] + a9*b[5] + a10*b[9];
+		r[10] = a8*b[2] + a9*b[6] + a10*b[10];
+		r[11] = 0.f;
+
+		const float a12 = a[12], a13 = a[13], a14 = a[14];
+		r[12] = a12*b[0] + a13*b[4] + a14*b[8] + b[12];
+		r[13] = a12*b[1] + a13*b[5] + a14*b[9] + b[13];
+		r[14] = a12*b[2] + a13*b[6] + a14*b[10] + b[14];
+		r[15] = 1.f;
+#else
+		// With SIMD it is faster to do the general 4x4 form:
+		float4x4_mul(_result, _a, _b);
+#endif
+	}
+
+	BX_SIMD_INLINE void model4x4_mul_viewproj4x4(float4x4_t* _result, const float4x4_t* _model, const float4x4_t* _viewProj)
+	{
+#if !BX_CONFIG_SUPPORTS_SIMD
+		const float* a = (const float*)_model; // a[3]==a[7]==a[11]==0, a[15]==1
+		const float* b = (const float*)_viewProj;
+		float* r = (float*)_result;
+
+		// Only the terms multiplied by the known 0/1 elements of _model are folded.
+		// Each row of _model is loaded before storing, so _result may alias _model.
+		const float a0 = a[0], a1 = a[1], a2 = a[2];
+		r[0] = a0*b[0] + a1*b[4] + a2*b[8];
+		r[1] = a0*b[1] + a1*b[5] + a2*b[9];
+		r[2] = a0*b[2] + a1*b[6] + a2*b[10];
+		r[3] = a0*b[3] + a1*b[7] + a2*b[11];
+
+		const float a4 = a[4], a5 = a[5], a6 = a[6];
+		r[4] = a4*b[0] + a5*b[4] + a6*b[8];
+		r[5] = a4*b[1] + a5*b[5] + a6*b[9];
+		r[6] = a4*b[2] + a5*b[6] + a6*b[10];
+		r[7] = a4*b[3] + a5*b[7] + a6*b[11];
+
+		const float a8 = a[8], a9 = a[9], a10 = a[10];
+		r[8] = a8*b[0] + a9*b[4] + a10*b[8];
+		r[9] = a8*b[1] + a9*b[5] + a10*b[9];
+		r[10] = a8*b[2] + a9*b[6] + a10*b[10];
+		r[11] = a8*b[3] + a9*b[7] + a10*b[11];
+
+		const float a12 = a[12], a13 = a[13], a14 = a[14];
+		r[12] = a12*b[0] + a13*b[4] + a14*b[8] + b[12];
+		r[13] = a12*b[1] + a13*b[5] + a14*b[9] + b[13];
+		r[14] = a12*b[2] + a13*b[6] + a14*b[10] + b[14];
+		r[15] = a12*b[3] + a13*b[7] + a14*b[11] + b[15];
+#else
+		// With SIMD it is faster to do the general 4x4 form:
+		float4x4_mul(_result, _model, _viewProj);
+#endif
 	}
 
 	BX_SIMD_FORCE_INLINE void float4x4_transpose(float4x4_t* _result, const float4x4_t* _mtx)