Compare commits

..

29 Commits

Author SHA1 Message Date
3df7d65ac1 Update README.md 2025-09-18 19:39:11 +03:00
e1a1164136 fixed warning 2025-09-18 19:06:56 +03:00
23216279dc fix 2025-09-18 18:42:02 +03:00
9e082f7dfa now its ref 2025-09-18 18:39:28 +03:00
7750819e83 improved naming 2025-09-18 18:38:07 +03:00
2ec0e2f93f Update CREDITS.md to include Billy O'Neal
Added acknowledgment for Billy O'Neal's contributions.
2025-09-18 06:08:49 +03:00
9170ffb1a9 Merge pull request #74 from BillyONeal/fmodf
Don't name std::fmodf.
2025-09-18 06:04:43 +03:00
e05f9ef5a9 Removes FMA check for matrix multiplication
Removes preprocessor check for FMA instructions in matrix multiplication functions.
This simplifies the code and relies on the compiler's ability to optimize the
code based on available hardware support. The assumption is that modern
compilers will automatically utilize FMA instructions if available, and fall
back to alternative implementations if not.
2025-09-18 06:02:37 +03:00
89bb4aa625 Guards AVX2 usage with a preprocessor definition
Ensures that AVX2 intrinsics are only included when the
OMATH_USE_AVX2 preprocessor definition is set. This prevents
compilation errors when AVX2 support is not available or
explicitly disabled.
2025-09-18 05:22:22 +03:00
Billy Robert O'Neal III
9b0845593d Don't name std::fmodf.
The C standard library function fmodf is not guaranteed to be in namespace std, and in fact is not with a default Ubuntu 24.04 installation, leading to the following compile error:

```console
Change Dir: '/vcpkg/buildtrees/vcpkg-ci-orange-math/x64-linux-dbg'

Run Build Command(s): /vcpkg/downloads/tools/ninja/1.12.1-linux/ninja -v -v -j33
[1/2] /usr/bin/c++ -DOMATH_SUPRESS_SAFETY_CHECKS -DOMATH_VERSION=\"3.5.0\" -isystem /vcpkg/installed/x64-linux/include -fPIC -g -std=gnu++23 -MD -MT CMakeFiles/main.dir/main.cpp.o -MF CMakeFiles/main.dir/main.cpp.o.d -o CMakeFiles/main.dir/main.cpp.o -c /vcpkg/scripts/test_ports/vcpkg-ci-orange-math/project/main.cpp
FAILED: CMakeFiles/main.dir/main.cpp.o
/usr/bin/c++ -DOMATH_SUPRESS_SAFETY_CHECKS -DOMATH_VERSION=\"3.5.0\" -isystem /vcpkg/installed/x64-linux/include -fPIC -g -std=gnu++23 -MD -MT CMakeFiles/main.dir/main.cpp.o -MF CMakeFiles/main.dir/main.cpp.o.d -o CMakeFiles/main.dir/main.cpp.o -c /vcpkg/scripts/test_ports/vcpkg-ci-orange-math/project/main.cpp
In file included from /vcpkg/installed/x64-linux/include/omath/omath.hpp:22,
                 from /vcpkg/scripts/test_ports/vcpkg-ci-orange-math/project/main.cpp:1:
/vcpkg/installed/x64-linux/include/omath/color.hpp: In member function ‘constexpr omath::Hsv omath::Color::to_hsv() const’:
/vcpkg/installed/x64-linux/include/omath/color.hpp:98:45: error: ‘fmodf’ is not a member of ‘std’; did you mean ‘modf’?
   98 |                 hsv_data.hue = 60.f * (std::fmodf(((green - blue) / delta), 6.f));
      |                                             ^~~~~
      |                                             modf
ninja: build stopped: subcommand failed.
```

Only the 'sufficient additional overloads' of `fmod` are guaranteed to be in `std`. Since this is clearly intended to call the (float, float) overload, explicitly cast `((green - blue) / delta)` (which is a `double`) to `float` and call the name in `std` as suggested by the diagnostic.
2025-09-17 19:15:10 -07:00
617ded2dd4 Merge pull request #73 from orange-cpp/featore/performance_tests
added performance folder
2025-09-17 20:53:11 +03:00
e882a224d2 fix 2025-09-17 20:50:30 +03:00
e04f6573c0 patch 2025-09-17 20:46:00 +03:00
791e3b2313 improved bench 2025-09-17 20:40:03 +03:00
26b56d757c fix 2025-09-17 20:25:22 +03:00
fbb77b9925 patch 2025-09-17 20:22:42 +03:00
7b671dbd90 added benchmark submodule 2025-09-17 20:14:33 +03:00
5875930f1a added benchmark 2025-09-17 19:56:50 +03:00
d773985822 added avx mutiplication 2025-09-17 19:47:29 +03:00
a2de6f8fae renamed folder 2025-09-17 18:07:28 +03:00
d71795006d added performance folder 2025-09-17 17:47:55 +03:00
561438d45c Merge pull request #72 from orange-cpp/feature/mat_refactor
Feature/mat refactor
2025-09-17 17:41:15 +03:00
874b028e86 removed unused var 2025-09-17 17:38:17 +03:00
68ec42d9ed added space 2025-09-17 17:33:05 +03:00
8aeb4667d7 decomposed mutiplication 2025-09-17 17:30:57 +03:00
565464f0cd forgot std 2025-09-17 17:23:02 +03:00
04b50d4545 Merge pull request #71 from orange-cpp/feature/mat_perf_boost
Improves matrix multiplication performance
2025-09-17 17:18:12 +03:00
e01d32fb22 Improves matrix multiplication performance
Optimizes matrix multiplication by specializing the algorithm
based on the matrix storage type (row-major or column-major).

This change significantly improves performance by leveraging
memory access patterns specific to each storage order.
2025-09-17 17:12:41 +03:00
a3a023a664 Add acknowledgment for AmbushedRaccoon's contribution 2025-09-16 16:58:00 +03:00
13 changed files with 328 additions and 31 deletions

5
.gitmodules vendored
View File

@@ -1,3 +1,6 @@
[submodule "extlibs/googletest"] [submodule "extlibs/googletest"]
path = extlibs/googletest path = extlibs/googletest
url = https://github.com/google/googletest.git url = https://github.com/google/googletest.git
[submodule "extlibs/benchmark"]
path = extlibs/benchmark
url = https://github.com/google/benchmark.git

1
.idea/vcs.xml generated
View File

@@ -2,6 +2,7 @@
<project version="4"> <project version="4">
<component name="VcsDirectoryMappings"> <component name="VcsDirectoryMappings">
<mapping directory="" vcs="Git" /> <mapping directory="" vcs="Git" />
<mapping directory="$PROJECT_DIR$/extlibs/benchmark" vcs="Git" />
<mapping directory="$PROJECT_DIR$/extlibs/googletest" vcs="Git" /> <mapping directory="$PROJECT_DIR$/extlibs/googletest" vcs="Git" />
</component> </component>
</project> </project>

View File

@@ -6,6 +6,7 @@ include(CMakePackageConfigHelpers)
option(OMATH_BUILD_TESTS "Build unit tests" ${PROJECT_IS_TOP_LEVEL}) option(OMATH_BUILD_TESTS "Build unit tests" ${PROJECT_IS_TOP_LEVEL})
option(OMATH_BUILD_BENCHMARK "Build benchmarks" ${PROJECT_IS_TOP_LEVEL})
option(OMATH_THREAT_WARNING_AS_ERROR "Set highest level of warnings and force compiler to treat them as errors" ON) option(OMATH_THREAT_WARNING_AS_ERROR "Set highest level of warnings and force compiler to treat them as errors" ON)
option(OMATH_BUILD_AS_SHARED_LIBRARY "Build Omath as .so or .dll" OFF) option(OMATH_BUILD_AS_SHARED_LIBRARY "Build Omath as .so or .dll" OFF)
option(OMATH_USE_AVX2 "Omath will use AVX2 to boost performance" ON) option(OMATH_USE_AVX2 "Omath will use AVX2 to boost performance" ON)
@@ -16,9 +17,10 @@ option(OMATH_SUPRESS_SAFETY_CHECKS "Supress some safety checks in release build
option(OMATH_USE_UNITY_BUILD "Will enable unity build to speed up compilation" OFF) option(OMATH_USE_UNITY_BUILD "Will enable unity build to speed up compilation" OFF)
option(OMATH_ENABLE_LEGACY "Will enable legacy classes that MUST be used ONLY for backward compatibility" OFF) option(OMATH_ENABLE_LEGACY "Will enable legacy classes that MUST be used ONLY for backward compatibility" OFF)
message(STATUS "[${PROJECT_NAME}]: Building on ${CMAKE_HOST_SYSTEM_NAME}") message(STATUS "[${PROJECT_NAME}]: Building on ${CMAKE_HOST_SYSTEM_NAME}, compiler ${CMAKE_CXX_COMPILER_ID}")
message(STATUS "[${PROJECT_NAME}]: Warnings as errors ${OMATH_THREAT_WARNING_AS_ERROR}") message(STATUS "[${PROJECT_NAME}]: Warnings as errors ${OMATH_THREAT_WARNING_AS_ERROR}")
message(STATUS "[${PROJECT_NAME}]: Build unit tests ${OMATH_BUILD_TESTS}") message(STATUS "[${PROJECT_NAME}]: Build unit tests ${OMATH_BUILD_TESTS}")
message(STATUS "[${PROJECT_NAME}]: Build benchmark ${OMATH_BUILD_BENCHMARK}")
message(STATUS "[${PROJECT_NAME}]: As dynamic library ${OMATH_BUILD_AS_SHARED_LIBRARY}") message(STATUS "[${PROJECT_NAME}]: As dynamic library ${OMATH_BUILD_AS_SHARED_LIBRARY}")
message(STATUS "[${PROJECT_NAME}]: Static C++ runtime ${OMATH_STATIC_MSVC_RUNTIME_LIBRARY}") message(STATUS "[${PROJECT_NAME}]: Static C++ runtime ${OMATH_STATIC_MSVC_RUNTIME_LIBRARY}")
message(STATUS "[${PROJECT_NAME}]: CMake unity build ${OMATH_USE_UNITY_BUILD}") message(STATUS "[${PROJECT_NAME}]: CMake unity build ${OMATH_USE_UNITY_BUILD}")
@@ -90,19 +92,25 @@ if (OMATH_STATIC_MSVC_RUNTIME_LIBRARY)
) )
endif () endif ()
if (CMAKE_CXX_COMPILER_ID STREQUAL "Clang") if (OMATH_USE_AVX2 AND CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
target_compile_options(${PROJECT_NAME} PRIVATE -mavx2 -mfma) target_compile_options(${PROJECT_NAME} PUBLIC -mavx2 -mavx -mfma)
endif () endif ()
target_compile_features(${PROJECT_NAME} PUBLIC cxx_std_23) target_compile_features(${PROJECT_NAME} PUBLIC cxx_std_23)
if (OMATH_BUILD_TESTS OR OMATH_BUILD_BENCHMARK)
add_subdirectory(extlibs)
endif ()
if (OMATH_BUILD_TESTS) if (OMATH_BUILD_TESTS)
add_subdirectory(extlibs)
add_subdirectory(tests) add_subdirectory(tests)
target_compile_definitions(${PROJECT_NAME} PUBLIC OMATH_BUILD_TESTS) target_compile_definitions(${PROJECT_NAME} PUBLIC OMATH_BUILD_TESTS)
endif () endif ()
if (OMATH_BUILD_BENCHMARK)
add_subdirectory(benchmark)
endif ()
if (OMATH_BUILD_EXAMPLES) if (OMATH_BUILD_EXAMPLES)
add_subdirectory(examples) add_subdirectory(examples)
endif () endif ()

View File

@@ -3,6 +3,8 @@
Thanks to everyone who made this possible, including: Thanks to everyone who made this possible, including:
- Saikari aka luadebug for VCPKG port and awesome new initial logo design. - Saikari aka luadebug for VCPKG port and awesome new initial logo design.
- AmbushedRaccoon for telegram post about omath to boost repository activity.
- Billy O'Neal aka BillyONeal for fixing compilation issues due to C math library compatibility.
And a big hand to everyone else who has contributed over the past! And a big hand to everyone else who has contributed over the past!

View File

@@ -20,9 +20,10 @@ It provides the latest features, is highly customizable, has all for cheat devel
--- ---
**[<kbd><br>Install<br></kbd>][INSTALL]** **[<kbd><br>Install<br></kbd>][INSTALL]**
**[<kbd><br>Examples<br></kbd>][EXAMPLES]** **[<kbd><br>Examples<br></kbd>][EXAMPLES]**
**[<kbd><br>Contribute<br></kbd>][CONTRIBUTING]** **[<kbd><br>Contribute<br></kbd>][CONTRIBUTING]**
**[<kbd><br>Donate<br></kbd>][SPONSOR]**
--- ---
@@ -125,3 +126,4 @@ for (auto ent: apex_sdk::EntityList::GetAllEntities())
[INSTALL]: INSTALL.md [INSTALL]: INSTALL.md
[CONTRIBUTING]: CONTRIBUTING.md [CONTRIBUTING]: CONTRIBUTING.md
[EXAMPLES]: examples [EXAMPLES]: examples
[SPONSOR]: https://boosty.to/orangecpp/purchase/3568644?ssource=DIRECT&share=subscription_link

15
benchmark/CMakeLists.txt Normal file
View File

@@ -0,0 +1,15 @@
project(omath_benchmark)
file(GLOB_RECURSE OMATH_BENCHMARK_SOURCES CONFIGURE_DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/*.cpp")
add_executable(${PROJECT_NAME} ${OMATH_BENCHMARK_SOURCES})
set_target_properties(${PROJECT_NAME} PROPERTIES
ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_SOURCE_DIR}/out/${CMAKE_BUILD_TYPE}"
LIBRARY_OUTPUT_DIRECTORY "${CMAKE_SOURCE_DIR}/out/${CMAKE_BUILD_TYPE}"
RUNTIME_OUTPUT_DIRECTORY "${CMAKE_SOURCE_DIR}/out/${CMAKE_BUILD_TYPE}"
CXX_STANDARD 23
CXX_STANDARD_REQUIRED ON)
target_link_libraries(${PROJECT_NAME} PRIVATE benchmark::benchmark omath)

View File

@@ -0,0 +1,66 @@
//
// Created by Vlad on 9/17/2025.
//
#include <benchmark/benchmark.h>
#include <omath/omath.hpp>
#include <chrono>
using namespace omath;
void mat_float_multiplication_col_major(benchmark::State& state)
{
using MatType = Mat<128, 128, float, MatStoreType::COLUMN_MAJOR>;
MatType a;
MatType b;
a.set(3.f);
b.set(7.f);
for (auto _ : state)
std::ignore = a * b;
}
void mat_float_multiplication_row_major(benchmark::State& state)
{
using MatType = Mat<128, 128, float, MatStoreType::ROW_MAJOR>;
MatType a;
MatType b;
a.set(3.f);
b.set(7.f);
for (auto _ : state)
std::ignore = a * b;
}
void mat_double_multiplication_row_major(benchmark::State& state)
{
using MatType = Mat<128, 128, double, MatStoreType::ROW_MAJOR>;
MatType a;
MatType b;
a.set(3.f);
b.set(7.f);
for (auto _ : state)
std::ignore = a * b;
}
void mat_double_multiplication_col_major(benchmark::State& state)
{
using MatType = Mat<128, 128, double, MatStoreType::COLUMN_MAJOR>;
MatType a;
MatType b;
a.set(3.f);
b.set(7.f);
for (auto _ : state)
std::ignore = a * b;
}
BENCHMARK(mat_float_multiplication_col_major)->Iterations(5000);
BENCHMARK(mat_float_multiplication_row_major)->Iterations(5000);
BENCHMARK(mat_double_multiplication_col_major)->Iterations(5000);
BENCHMARK(mat_double_multiplication_row_major)->Iterations(5000);

5
benchmark/main.cpp Normal file
View File

@@ -0,0 +1,5 @@
//
// Created by Vlad on 9/17/2025.
//
#include <benchmark/benchmark.h>
BENCHMARK_MAIN();

View File

@@ -1 +1,2 @@
add_subdirectory(googletest) add_subdirectory(googletest)
add_subdirectory(benchmark)

1
extlibs/benchmark Submodule

Submodule extlibs/benchmark added at 2948b6a2e6

View File

@@ -151,21 +151,39 @@ namespace omath
}; };
} // namespace omath } // namespace omath
template<class T, T MinV, T MaxV, omath::AngleFlags F, class CharT> template<class T, T MinV, T MaxV, omath::AngleFlags F>
struct std::formatter<omath::Angle<T, MinV, MaxV, F>, CharT> struct std::formatter<omath::Angle<T, MinV, MaxV, F>, char> // NOLINT(*-dcl58-cpp)
{ {
using AngleT = omath::Angle<T, MinV, MaxV, F>; using AngleT = omath::Angle<T, MinV, MaxV, F>;
[[nodiscard]]
static constexpr auto parse(std::basic_format_parse_context<CharT>& ctx) static constexpr auto parse(std::format_parse_context& ctx)
-> std::basic_format_parse_context<CharT>::iterator
{ {
return ctx.begin(); return ctx.begin();
} }
template<class FormatContext> template<class FormatContext>
auto format(const AngleT& deg, FormatContext& ctx) const auto format(const AngleT& a, FormatContext& ctx) const
{ {
if constexpr (std::is_same_v<typename FormatContext::char_type, char>) static_assert(std::is_same_v<typename FormatContext::char_type, char>);
return std::format_to(ctx.out(), "{}deg", deg.as_degrees()); return std::format_to(ctx.out(), "{}deg", a.as_degrees());
return std::format_to(ctx.out(), L"{}deg", deg.as_degrees());
} }
}; };
// wchar_t formatter
template<class T, T MinV, T MaxV, omath::AngleFlags F>
struct std::formatter<omath::Angle<T, MinV, MaxV, F>, wchar_t> // NOLINT(*-dcl58-cpp)
{
using AngleT = omath::Angle<T, MinV, MaxV, F>;
static constexpr auto parse(std::wformat_parse_context& ctx)
{
return ctx.begin();
}
template<class FormatContext>
auto format(const AngleT& a, FormatContext& ctx) const
{
static_assert(std::is_same_v<typename FormatContext::char_type, wchar_t>);
return std::format_to(ctx.out(), L"{}deg", a.as_degrees());
}
};

View File

@@ -95,7 +95,7 @@ namespace omath
hsv_data.hue = 0.f; hsv_data.hue = 0.f;
else if (max == red) else if (max == red)
hsv_data.hue = 60.f * (std::fmodf(((green - blue) / delta), 6.f)); hsv_data.hue = 60.f * (std::fmod(static_cast<float>((green - blue) / delta), 6.f));
else if (max == green) else if (max == green)
hsv_data.hue = 60.f * (((blue - red) / delta) + 2.f); hsv_data.hue = 60.f * (((blue - red) / delta) + 2.f);
else if (max == blue) else if (max == blue)

View File

@@ -11,6 +11,9 @@
#include <stdexcept> #include <stdexcept>
#include <utility> #include <utility>
#ifdef OMATH_USE_AVX2
#include <immintrin.h>
#endif
namespace omath namespace omath
{ {
@@ -155,17 +158,19 @@ namespace omath
constexpr Mat<Rows, OtherColumns, Type, StoreType> constexpr Mat<Rows, OtherColumns, Type, StoreType>
operator*(const Mat<Columns, OtherColumns, Type, StoreType>& other) const operator*(const Mat<Columns, OtherColumns, Type, StoreType>& other) const
{ {
Mat<Rows, OtherColumns, Type, StoreType> result; #ifdef OMATH_USE_AVX2
if constexpr (StoreType == MatStoreType::ROW_MAJOR)
for (size_t i = 0; i < Rows; ++i) return avx_multiply_row_major(other);
for (size_t j = 0; j < OtherColumns; ++j) else if constexpr (StoreType == MatStoreType::COLUMN_MAJOR)
{ return avx_multiply_col_major(other);
Type sum = 0; #else
for (size_t k = 0; k < Columns; ++k) if constexpr (StoreType == MatStoreType::ROW_MAJOR)
sum += at(i, k) * other.at(k, j); return cache_friendly_multiply_row_major(other);
result.at(i, j) = sum; else if constexpr (StoreType == MatStoreType::COLUMN_MAJOR)
} return cache_friendly_multiply_col_major(other);
return result; #endif
else
std::unreachable();
} }
constexpr Mat& operator*=(const Type& f) noexcept constexpr Mat& operator*=(const Type& f) noexcept
@@ -367,6 +372,176 @@ namespace omath
private: private:
std::array<Type, Rows * Columns> m_data; std::array<Type, Rows * Columns> m_data;
template<size_t OtherColumns> [[nodiscard]]
constexpr Mat<Rows, OtherColumns, Type, MatStoreType::ROW_MAJOR>
cache_friendly_multiply_row_major(const Mat<Columns, OtherColumns, Type, MatStoreType::ROW_MAJOR>& other) const
{
Mat<Rows, OtherColumns, Type, MatStoreType::ROW_MAJOR> result;
for (std::size_t row_index = 0; row_index < Rows; ++row_index)
for (std::size_t column_index = 0; column_index < Columns; ++column_index)
{
const Type& current_number = at(row_index, column_index);
for (std::size_t other_column = 0; other_column < OtherColumns; ++other_column)
result.at(row_index, other_column) += current_number * other.at(column_index, other_column);
}
return result;
}
template<size_t OtherColumns> [[nodiscard]]
constexpr Mat<Rows, OtherColumns, Type, MatStoreType::COLUMN_MAJOR> cache_friendly_multiply_col_major(
const Mat<Columns, OtherColumns, Type, MatStoreType::COLUMN_MAJOR>& other) const
{
Mat<Rows, OtherColumns, Type, MatStoreType::COLUMN_MAJOR> result;
for (std::size_t other_column = 0; other_column < OtherColumns; ++other_column)
for (std::size_t column_index = 0; column_index < Columns; ++column_index)
{
const Type& current_number = other.at(column_index, other_column);
for (std::size_t row_index = 0; row_index < Rows; ++row_index)
result.at(row_index, other_column) += at(row_index, column_index) * current_number;
}
return result;
}
#ifdef OMATH_USE_AVX2
template<size_t OtherColumns> [[nodiscard]]
constexpr Mat<Rows, OtherColumns, Type, MatStoreType::COLUMN_MAJOR>
avx_multiply_col_major(const Mat<Columns, OtherColumns, Type, MatStoreType::COLUMN_MAJOR>& other) const
{
Mat<Rows, OtherColumns, Type, MatStoreType::COLUMN_MAJOR> result;
const Type* this_mat_data = this->raw_array().data();
const Type* other_mat_data = other.raw_array().data();
Type* result_mat_data = result.raw_array().data();
if constexpr (std::is_same_v<Type, float>)
{
// ReSharper disable once CppTooWideScopeInitStatement
constexpr std::size_t vector_size = 8;
for (std::size_t j = 0; j < OtherColumns; ++j)
{
auto* c_col = reinterpret_cast<float*>(result_mat_data + j * Rows);
for (std::size_t k = 0; k < Columns; ++k)
{
const float bkj = reinterpret_cast<const float*>(other_mat_data)[k + j * Columns];
__m256 bkjv = _mm256_set1_ps(bkj);
const auto* a_col_k = reinterpret_cast<const float*>(this_mat_data + k * Rows);
std::size_t i = 0;
for (; i + vector_size <= Rows; i += vector_size)
{
__m256 cvec = _mm256_loadu_ps(c_col + i);
__m256 avec = _mm256_loadu_ps(a_col_k + i);
cvec = _mm256_fmadd_ps(avec, bkjv, cvec);
_mm256_storeu_ps(c_col + i, cvec);
}
for (; i < Rows; ++i)
c_col[i] += a_col_k[i] * bkj;
}
}
}
else if (std::is_same_v<Type, double>)
{ // double
// ReSharper disable once CppTooWideScopeInitStatement
constexpr std::size_t vector_size = 4;
for (std::size_t j = 0; j < OtherColumns; ++j)
{
auto* c_col = reinterpret_cast<double*>(result_mat_data + j * Rows);
for (std::size_t k = 0; k < Columns; ++k)
{
const double bkj = reinterpret_cast<const double*>(other_mat_data)[k + j * Columns];
__m256d bkjv = _mm256_set1_pd(bkj);
const auto* a_col_k = reinterpret_cast<const double*>(this_mat_data + k * Rows);
std::size_t i = 0;
for (; i + vector_size <= Rows; i += vector_size)
{
__m256d cvec = _mm256_loadu_pd(c_col + i);
__m256d avec = _mm256_loadu_pd(a_col_k + i);
cvec = _mm256_fmadd_pd(avec, bkjv, cvec);
_mm256_storeu_pd(c_col + i, cvec);
}
for (; i < Rows; ++i)
c_col[i] += a_col_k[i] * bkj;
}
}
}
else
std::unreachable();
return result;
}
template<size_t OtherColumns> [[nodiscard]]
constexpr Mat<Rows, OtherColumns, Type, MatStoreType::ROW_MAJOR>
avx_multiply_row_major(const Mat<Columns, OtherColumns, Type, MatStoreType::ROW_MAJOR>& other) const
{
Mat<Rows, OtherColumns, Type, MatStoreType::ROW_MAJOR> result;
const Type* this_mat_data = this->raw_array().data();
const Type* other_mat_data = other.raw_array().data();
Type* result_mat_data = result.raw_array().data();
if constexpr (std::is_same_v<Type, float>)
{
// ReSharper disable once CppTooWideScopeInitStatement
constexpr std::size_t vector_size = 8;
for (std::size_t i = 0; i < Rows; ++i)
{
Type* c_row = result_mat_data + i * OtherColumns;
for (std::size_t k = 0; k < Columns; ++k)
{
const auto aik = static_cast<float>(this_mat_data[i * Columns + k]);
__m256 aikv = _mm256_set1_ps(aik);
const auto* b_row = reinterpret_cast<const float*>(other_mat_data + k * OtherColumns);
std::size_t j = 0;
for (; j + vector_size <= OtherColumns; j += vector_size)
{
__m256 cvec = _mm256_loadu_ps(c_row + j);
__m256 bvec = _mm256_loadu_ps(b_row + j);
cvec = _mm256_fmadd_ps(bvec, aikv, cvec);
_mm256_storeu_ps(c_row + j, cvec);
}
for (; j < OtherColumns; ++j)
c_row[j] += aik * b_row[j];
}
}
}
else if (std::is_same_v<Type, double>)
{ // double
// ReSharper disable once CppTooWideScopeInitStatement
constexpr std::size_t vector_size = 4;
for (std::size_t i = 0; i < Rows; ++i)
{
Type* c_row = result_mat_data + i * OtherColumns;
for (std::size_t k = 0; k < Columns; ++k)
{
const auto aik = static_cast<double>(this_mat_data[i * Columns + k]);
__m256d aikv = _mm256_set1_pd(aik);
const auto* b_row = reinterpret_cast<const double*>(other_mat_data + k * OtherColumns);
std::size_t j = 0;
for (; j + vector_size <= OtherColumns; j += vector_size)
{
__m256d cvec = _mm256_loadu_pd(c_row + j);
__m256d bvec = _mm256_loadu_pd(b_row + j);
cvec = _mm256_fmadd_pd(bvec, aikv, cvec);
_mm256_storeu_pd(c_row + j, cvec);
}
for (; j < OtherColumns; ++j)
c_row[j] += aik * b_row[j];
}
}
}
else
std::unreachable();
return result;
}
#endif
}; };
template<class Type = float, MatStoreType St = MatStoreType::ROW_MAJOR> [[nodiscard]] template<class Type = float, MatStoreType St = MatStoreType::ROW_MAJOR> [[nodiscard]]