Removes FMA check for matrix multiplication

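Removes the `#if defined(__FMA__)` preprocessor check, together with its
separate multiply-then-add (`_mm256_mul_*` + `_mm256_add_*`) fallback branch,
from the AVX matrix multiplication loops for both `float` and `double`.
This simplifies the code: the `_mm256_fmadd_ps` / `_mm256_fmadd_pd` intrinsics
are now called unconditionally. The assumption is that modern compilers will
automatically utilize FMA instructions if available, and fall back to
alternative implementations if not. Because no source-level fallback remains,
these code paths now presume the build targets hardware with FMA support.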
This commit is contained in:
2025-09-18 06:02:37 +03:00
parent 89bb4aa625
commit e05f9ef5a9

View File

@@ -431,11 +431,7 @@ namespace omath
{ {
__m256 cvec = _mm256_loadu_ps(c_col + i); __m256 cvec = _mm256_loadu_ps(c_col + i);
__m256 avec = _mm256_loadu_ps(a_col_k + i); __m256 avec = _mm256_loadu_ps(a_col_k + i);
#if defined(__FMA__)
cvec = _mm256_fmadd_ps(avec, bkjv, cvec); cvec = _mm256_fmadd_ps(avec, bkjv, cvec);
#else
cvec = _mm256_add_ps(cvec, _mm256_mul_ps(avec, bkjv));
#endif
_mm256_storeu_ps(c_col + i, cvec); _mm256_storeu_ps(c_col + i, cvec);
} }
for (; i < Rows; ++i) for (; i < Rows; ++i)
@@ -462,11 +458,7 @@ namespace omath
{ {
__m256d cvec = _mm256_loadu_pd(c_col + i); __m256d cvec = _mm256_loadu_pd(c_col + i);
__m256d avec = _mm256_loadu_pd(a_col_k + i); __m256d avec = _mm256_loadu_pd(a_col_k + i);
#if defined(__FMA__)
cvec = _mm256_fmadd_pd(avec, bkjv, cvec); cvec = _mm256_fmadd_pd(avec, bkjv, cvec);
#else
cvec = _mm256_add_pd(cvec, _mm256_mul_pd(avec, bkjv));
#endif
_mm256_storeu_pd(c_col + i, cvec); _mm256_storeu_pd(c_col + i, cvec);
} }
for (; i < Rows; ++i) for (; i < Rows; ++i)
@@ -508,11 +500,8 @@ namespace omath
{ {
__m256 cvec = _mm256_loadu_ps(c_row + j); __m256 cvec = _mm256_loadu_ps(c_row + j);
__m256 bvec = _mm256_loadu_ps(b_row + j); __m256 bvec = _mm256_loadu_ps(b_row + j);
#if defined(__FMA__)
cvec = _mm256_fmadd_ps(bvec, aikv, cvec); cvec = _mm256_fmadd_ps(bvec, aikv, cvec);
#else
cvec = _mm256_add_ps(cvec, _mm256_mul_ps(bvec, aikv));
#endif
_mm256_storeu_ps(c_row + j, cvec); _mm256_storeu_ps(c_row + j, cvec);
} }
for (; j < OtherColumns; ++j) for (; j < OtherColumns; ++j)
@@ -538,11 +527,8 @@ namespace omath
{ {
__m256d cvec = _mm256_loadu_pd(c_row + j); __m256d cvec = _mm256_loadu_pd(c_row + j);
__m256d bvec = _mm256_loadu_pd(b_row + j); __m256d bvec = _mm256_loadu_pd(b_row + j);
#if defined(__FMA__)
cvec = _mm256_fmadd_pd(bvec, aikv, cvec); cvec = _mm256_fmadd_pd(bvec, aikv, cvec);
#else
cvec = _mm256_add_pd(cvec, _mm256_mul_pd(bvec, aikv));
#endif
_mm256_storeu_pd(c_row + j, cvec); _mm256_storeu_pd(c_row + j, cvec);
} }
for (; j < OtherColumns; ++j) for (; j < OtherColumns; ++j)