Removes FMA check for matrix multiplication

Removes the `#if defined(__FMA__)` preprocessor check from the AVX matrix multiplication loops (both the `float` and `double` variants). This simplifies the code and leaves instruction selection to the compiler: the assumption is that modern compilers will automatically use FMA instructions when the target hardware supports them, and fall back to an alternative implementation when it does not.
2026-02-13 07:03:25 +00:00 · 2025-09-18 06:02:37 +03:00
parent 89bb4aa625
commit e05f9ef5a9
1 changed file with 2 additions and 16 deletions
--- a/include/omath/linear_algebra/mat.hpp
+++ b/include/omath/linear_algebra/mat.hpp
@@ -431,11 +431,7 @@ namespace omath
                        {
                            __m256 cvec = _mm256_loadu_ps(c_col + i);
                            __m256 avec = _mm256_loadu_ps(a_col_k + i);
 #if defined(__FMA__)
                            cvec = _mm256_fmadd_ps(avec, bkjv, cvec);
 #else
                            cvec = _mm256_add_ps(cvec, _mm256_mul_ps(avec, bkjv));
 #endif
                            _mm256_storeu_ps(c_col + i, cvec);
                        }
                        for (; i < Rows; ++i)
@@ -462,11 +458,7 @@ namespace omath
                        {
                            __m256d cvec = _mm256_loadu_pd(c_col + i);
                            __m256d avec = _mm256_loadu_pd(a_col_k + i);
 #if defined(__FMA__)
                            cvec = _mm256_fmadd_pd(avec, bkjv, cvec);
 #else
                            cvec = _mm256_add_pd(cvec, _mm256_mul_pd(avec, bkjv));
 #endif
                            _mm256_storeu_pd(c_col + i, cvec);
                        }
                        for (; i < Rows; ++i)
@@ -508,11 +500,8 @@ namespace omath
                        {
                            __m256 cvec = _mm256_loadu_ps(c_row + j);
                            __m256 bvec = _mm256_loadu_ps(b_row + j);
 #if defined(__FMA__)
                            cvec = _mm256_fmadd_ps(bvec, aikv, cvec);
-#else
+
                            cvec = _mm256_add_ps(cvec, _mm256_mul_ps(bvec, aikv));
 #endif
                            _mm256_storeu_ps(c_row + j, cvec);
                        }
                        for (; j < OtherColumns; ++j)
@@ -538,11 +527,8 @@ namespace omath
                        {
                            __m256d cvec = _mm256_loadu_pd(c_row + j);
                            __m256d bvec = _mm256_loadu_pd(b_row + j);
 #if defined(__FMA__)
                            cvec = _mm256_fmadd_pd(bvec, aikv, cvec);
-#else
+
                            cvec = _mm256_add_pd(cvec, _mm256_mul_pd(bvec, aikv));
 #endif
                            _mm256_storeu_pd(c_row + j, cvec);
                        }
                        for (; j < OtherColumns; ++j)