diff --git a/.gitignore b/.gitignore index 9ed92a0..8c5aaa8 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,3 @@ build GNUmakefile +.vs diff --git a/CMakeLists.txt b/CMakeLists.txt index 29b152c..fdf3ff6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,9 +1,16 @@ cmake_minimum_required(VERSION 3.12) project(hellocmake LANGUAGES CXX) -set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_STANDARD 20) if (NOT CMAKE_BUILD_TYPE) set(CMAKE_BUILD_TYPE Release) endif() add_executable(main main.cpp) + +if (CMAKE_COMPILER_IS_GNUCXX) + target_compile_options(main PRIVATE $<$:-march=native -funroll-loops -O3>) +endif() +if (MSVC) + target_compile_options(main PRIVATE $<$:/arch:AVX2 /fp:fast>) +endif() diff --git a/main.cpp b/main.cpp index cf6369b..1f52dd2 100644 --- a/main.cpp +++ b/main.cpp @@ -1,78 +1,125 @@ +#include #include -#include -#include #include #include +#include + float frand() { - return (float)rand() / RAND_MAX * 2 - 1; + return (float)rand() / float(RAND_MAX) * 2 - 1; } -struct Star { - float px, py, pz; - float vx, vy, vz; - float mass; +constexpr const auto stars_size = 48; +constexpr const auto alignment = 64; // cache lines width + +struct alignas(alignment) Stars { + using Element = std::array; + + alignas(alignment) Element vx, vy, vz; + alignas(alignment) Element px, py, pz; + alignas(alignment) Element mass; }; -std::vector stars; +Stars stars; void init() { - for (int i = 0; i < 48; i++) { - stars.push_back({ - frand(), frand(), frand(), - frand(), frand(), frand(), - frand() + 1, - }); + for (int i = 0; i < stars_size; i++) { + stars.px[i] = frand(); + stars.py[i] = frand(); + stars.pz[i] = frand(); + stars.vx[i] = frand(); + stars.vy[i] = frand(); + stars.vz[i] = frand(); + stars.mass[i] = frand() + 1; } } -float G = 0.001; -float eps = 0.001; -float dt = 0.01; +constexpr float G = 0.001; +constexpr float eps = 0.001; +constexpr float dt = 0.01; -void step() { - for (auto &star: stars) { - for (auto &other: stars) { - float dx = other.px - star.px; - float dy = other.py - star.py; - float dz = other.pz - star.pz; - float d2 = dx * dx + dy * dy + dz * dz + eps * eps; - d2 *= sqrt(d2); - star.vx += dx * other.mass * G * dt / d2; - star.vy += dy * other.mass * G * dt / d2; - star.vz += dz * other.mass * G * dt / d2; +auto epsx = _mm256_set1_ps(eps * eps); +auto Gdt = _mm256_set1_ps(G * dt); +auto dtx = _mm256_set1_ps(dt); +alignas(alignment) __m256 mgdt[stars_size / 8]; +alignas(alignment) __m256 gvx[stars_size]; +alignas(alignment) __m256 gvy[stars_size]; +alignas(alignment) __m256 gvz[stars_size]; + +float reduce_sum(const __m256& x) noexcept { + __m256 s0 = _mm256_hadd_ps(x, x); + s0 = _mm256_hadd_ps(s0, s0); + + __m128 s1 = _mm256_extractf128_ps(s0, 1); + s1 = _mm_add_ps(_mm256_extractf128_ps(s0, 0), s1); + + return _mm_cvtss_f32(s1); +} + +void step() noexcept { + for (unsigned i = 0; i < stars_size; i += 8) { + mgdt[i >> 3] = _mm256_mul_ps(Gdt, _mm256_load_ps(&stars.mass[i])); + } + for (unsigned i = 0; i < stars_size; ++i) { + auto vx = _mm256_setzero_ps(); + auto vy = _mm256_setzero_ps(); + auto vz = _mm256_setzero_ps(); + auto px = _mm256_set1_ps(stars.px[i]); + auto py = _mm256_set1_ps(stars.py[i]); + auto pz = _mm256_set1_ps(stars.pz[i]); + for (unsigned j = 0; j < stars_size; j += 8) { + auto dx = _mm256_sub_ps(_mm256_load_ps(&stars.px[j]), px); + auto dy = _mm256_sub_ps(_mm256_load_ps(&stars.py[j]), py); + auto dz = _mm256_sub_ps(_mm256_load_ps(&stars.pz[j]), pz); + auto d2 = _mm256_fmadd_ps(dx, dx, _mm256_fmadd_ps(dy, dy, _mm256_fmadd_ps(dz, dz, epsx))); + d2 = _mm256_mul_ps(mgdt[j >> 3], _mm256_rcp_ps(_mm256_mul_ps(d2, _mm256_sqrt_ps(d2)))); + vx = _mm256_fmadd_ps(dx, d2, vx); + vy = _mm256_fmadd_ps(dy, d2, vy); + vz = _mm256_fmadd_ps(dz, d2, vz); } + gvx[i] = vx; + gvy[i] = vy; + gvz[i] = vz; + } + for (unsigned i = 0; i < stars_size; i += 1) { + stars.vx[i] += reduce_sum(gvx[i]); + stars.vy[i] += reduce_sum(gvy[i]); + stars.vz[i] += reduce_sum(gvz[i]); } - for (auto &star: stars) { - star.px += star.vx * dt; - star.py += star.vy * dt; - star.pz += star.vz * dt; + for (unsigned i = 0; i < stars_size; i += 8) { + auto x = _mm256_add_ps(_mm256_load_ps(&stars.px[i]), _mm256_mul_ps(_mm256_load_ps(&stars.vx[i]), dtx)); + auto y = _mm256_add_ps(_mm256_load_ps(&stars.py[i]), _mm256_mul_ps(_mm256_load_ps(&stars.vy[i]), dtx)); + auto z = _mm256_add_ps(_mm256_load_ps(&stars.pz[i]), _mm256_mul_ps(_mm256_load_ps(&stars.vz[i]), dtx)); + _mm256_store_ps(&stars.px[i], x); + _mm256_store_ps(&stars.py[i], y); + _mm256_store_ps(&stars.pz[i], z); } } float calc() { float energy = 0; - for (auto &star: stars) { - float v2 = star.vx * star.vx + star.vy * star.vy + star.vz * star.vz; - energy += star.mass * v2 / 2; - for (auto &other: stars) { - float dx = other.px - star.px; - float dy = other.py - star.py; - float dz = other.pz - star.pz; + for (auto i = 0; i < stars_size; ++i) { + float v2 = stars.vx[i] * stars.vx[i] + stars.vy[i] * stars.vy[i] + stars.vz[i] * stars.vz[i]; + energy += stars.mass[i] * v2 / 2; + + for (auto j = 0; j < stars_size; ++j) { + float dx = stars.px[j] - stars.px[i]; + float dy = stars.py[j] - stars.py[i]; + float dz = stars.pz[j] - stars.pz[i]; float d2 = dx * dx + dy * dy + dz * dz + eps * eps; - energy -= other.mass * star.mass * G / sqrt(d2) / 2; + energy -= stars.mass[j] * stars.mass[i] * G / sqrt(d2) / 2; } } return energy; } template -long benchmark(Func const &func) { +long benchmark(Func const& func) { auto t0 = std::chrono::steady_clock::now(); func(); auto t1 = std::chrono::steady_clock::now(); auto dt = std::chrono::duration_cast(t1 - t0); - return dt.count(); + return static_cast(dt.count()); } int main() {