一 题目4
# 高性能并行编程与优化 - 第04讲的回家作业
通过 pull request 提交作业。会批分数,但是:
没有结业证书,回家作业仅仅作为评估学习效果和巩固知识的手段,不必为分数感到紧张 :)
量力而行,只要能在本课中,学到昨天的自己不懂的知识,就是胜利,没必要和别人攀比。
注意不要偷看别人的作业哦!
- 课件:https://github.com/parallel101/course
- 录播:https://space.bilibili.com/263032155
作业提交时间不限 :) 即使完结了还想交的话我也会看的~ 不过最好在下一讲开播前完成。
- 如何开 pull request:https://zhuanlan.zhihu.com/p/51199833
- 如何设置 https 代理:https://www.jianshu.com/p/b481d2a42274
## 评分规则
- 在你的电脑上加速了多少倍,就是多少分!请在 PR 描述中写明加速前后的用时数据。
- 最好详细解释一下为什么这样可以优化。会额外以乘法的形式加分。
- 比如你优化后加速了 50 倍,讲的很详细,所以分数乘 2,变成 100 分!
- 比如你优化后加速了 1000 倍,但是你的 PR 描述是空,所以分数乘 0,变成 0 分!
## 作业要求
利用这次课上所学知识,修改 main.cpp,优化其中的多体引力求解器:
- 不允许使用多线程并行
- 不允许做算法复杂度优化
- 可以针对编译器和平台优化,这次不要求跨平台
- 可以用 xmmintrin.h,如果你觉得编译器靠不住的话
#include <cstdio>
#include <cstdlib>
#include <vector>
#include <chrono>
#include <cmath>
float frand() {
return (float)rand() / RAND_MAX * 2 - 1;
}
struct Star {
float px, py, pz;
float vx, vy, vz;
float mass;
};
std::vector<Star> stars;
void init() {
for (int i = 0; i < 48; i++) {
stars.push_back({
frand(), frand(), frand(),
frand(), frand(), frand(),
frand() + 1,
});
}
}
float G = 0.001;
float eps = 0.001;
float dt = 0.01;
void step() {
for (auto &star: stars) {
for (auto &other: stars) {
float dx = other.px - star.px;
float dy = other.py - star.py;
float dz = other.pz - star.pz;
float d2 = dx * dx + dy * dy + dz * dz + eps * eps;
d2 *= sqrt(d2);
star.vx += dx * other.mass * G * dt / d2;
star.vy += dy * other.mass * G * dt / d2;
star.vz += dz * other.mass * G * dt / d2;
}
}
for (auto &star: stars) {
star.px += star.vx * dt;
star.py += star.vy * dt;
star.pz += star.vz * dt;
}
}
float calc() {
float energy = 0;
for (auto &star: stars) {
float v2 = star.vx * star.vx + star.vy * star.vy + star.vz * star.vz;
energy += star.mass * v2 / 2;
for (auto &other: stars) {
float dx = other.px - star.px;
float dy = other.py - star.py;
float dz = other.pz - star.pz;
float d2 = dx * dx + dy * dy + dz * dz + eps * eps;
energy -= other.mass * star.mass * G / sqrt(d2) / 2;
}
}
return energy;
}
template <class Func>
long benchmark(Func const &func) {
auto t0 = std::chrono::steady_clock::now();
func();
auto t1 = std::chrono::steady_clock::now();
auto dt = std::chrono::duration_cast<std::chrono::milliseconds>(t1 - t0);
return dt.count();
}
int main() {
init();
printf("Initial energy: %f\n", calc());
auto dt = benchmark([&] {
for (int i = 0; i < 100000; i++)
step();
});
printf("Final energy: %f\n", calc());
printf("Time elapsed: %ld ms\n", dt);
return 0;
}
二 原来代码运行结果
三 优秀的代码答案
//参考代码:https://github.com/parallel101/hw04/pull/3
#include <cstdio>
#include <cstdlib>
#include <vector>
#include <chrono>
#include <cmath>
#include <iostream>
#include <immintrin.h>
#include <cassert>
float frand() {
return (float)rand() / RAND_MAX * 2 - 1;
}
constexpr std::size_t N = 48;
constexpr float G = 0.001f;
constexpr float eps = 0.001f;
constexpr float dt = 0.01f;
constexpr float eps_sqr = eps * eps;
constexpr float G_dt = G * dt;
template<std::size_t N>
struct Star {
//float px, py, pz;
//float vx, vy, vz;
//float mass;
alignas(64) float px[N], py[N], pz[N];
alignas(64) float vx[N], vy[N], vz[N];
alignas(64) float mass[N];
};
//std::vector<Star> stars;
Star<N> stars;
//void init() {
// for (int i = 0; i < 48; i++) {
// stars.push_back({
// frand(), frand(), frand(),
// frand(), frand(), frand(),
// frand() + 1,
// });
void init() {
for (std::size_t i{
}; i < N; ++i) {
stars.px[i] = frand(), stars.py[i] = frand(), stars.pz[i] = frand();
stars.vx[i] = frand(), stars.vy[i] = frand(), stars.vz[i] = frand();
stars.mass[i] = frand() + 1;
}
}
//float G = 0.001;
//float eps = 0.001;
//float dt = 0.01;
//
//void step() {
// for (auto &star: stars) {
// for (auto &other: stars) {
// float dx = other.px - star.px;
// float dy = other.py - star.py;
// float dz = other.pz - star.pz;
// float d2 = dx * dx + dy * dy + dz * dz + eps * eps;
// d2 *= sqrt(d2);
// star.vx += dx * other.mass * G * dt / d2;
// star.vy += dy * other.mass * G * dt / d2;
// star.vz += dz * other.mass * G * dt / d2;
// }
using fsimd_t = __m256;
inline auto set1(float f) {
return _mm256_set1_ps(f); }
inline auto load(const float* f) {
return _mm256_load_ps(f); }
inline void store(float* f, fsimd_t a) {
_mm256_store_ps(f, a); }
inline auto add(fsimd_t a, fsimd_t b) {
return _mm256_add_ps(a, b); }
inline auto sub(fsimd_t a, fsimd_t b) {
return _mm256_sub_ps(a, b); }
inline auto mul(fsimd_t a, fsimd_t b) {
return _mm256_mul_ps(a, b); }
inline auto div(fsimd_t a, fsimd_t b) {
return _mm256_div_ps(a, b); }
inline auto sqrt(fsimd_t a) {
return _mm256_sqrt_ps(a); }
inline auto rsqrt(fsimd_t a) {
return _mm256_rsqrt_ps(a); }
inline auto rcp(fsimd_t a) {
return _mm256_rcp_ps(a); }
void step_avx() {
auto eps_sqr8 = set1(eps_sqr);
for (std::size_t j{
}; j < N; ++j) {
auto mg_dt = set1(G * stars.mass[j] * dt);
auto xj = set1(stars.px[j]);
auto yj = set1(stars.py[j]);
auto zj = set1(stars.pz[j]);
auto unroll_body = [&](std::size_t i) {
auto xi = load(&stars.px[i]);
auto yi = load(&stars.py[i]);
auto zi = load(&stars.pz[i]);
auto dx = sub(xj, xi);
auto dy = sub(yj, yi);
auto dz = sub(zj, zi);
auto x2 = mul(dx, dx);
auto y2 = mul(dy, dy);
auto z2 = mul(dz, dz);
auto d2 = add(add(x2, y2), add(z2, eps_sqr8));
auto inv_d2 = rcp(d2);
auto inv_d = rsqrt(d2);
auto mg_dt_invd3 = mul(mul(mg_dt, inv_d2), inv_d);
auto vx = load(&stars.vx[i]);
auto vy = load(&stars.vy[i]);
auto vz = load(&stars.vz[i]);
auto new_vx = add(mul(mg_dt_invd3, dx), vx);
auto new_vy = add(mul(mg_dt_invd3, dy), vy);
auto new_vz = add(mul(mg_dt_invd3, dz), vz);
store(&stars.vx[i], new_vx);
store(&stars.vy[i], new_vy);
store(&stars.vz[i], new_vz);
};
unroll_body(0);
unroll_body(8);
unroll_body(16);
unroll_body(24);
unroll_body(32);
unroll_body(40);
// for(std::size_t i{} ; i < N ; i += 8){
// unroll_body(i);
// }
}
//for (auto &star: stars) {
// star.px += star.vx * dt;
// star.py += star.vy * dt;
// star.pz += star.vz * dt;
for (std::size_t i{
}; i < N; i += 8) {
auto dt8 = set1(dt);
auto vx = load(&stars.vx[i]);
auto vy = load(&stars.vy[i]);
auto vz = load(&stars.vz[i]);
auto new_px = add(load(&stars.px[i]), mul(vx, dt8));
auto new_py = add(load(&stars.py[i]), mul(vy, dt8));
auto new_pz = add(load(&stars.pz[i]), mul(vz, dt8));
store(&stars.px[i], new_px);
store(&stars.py[i], new_py);
store(&stars.pz[i], new_pz);
}
}
float calc() {
float energy = 0;
//for (auto &star: stars) {
// float v2 = star.vx * star.vx + star.vy * star.vy + star.vz * star.vz;
// energy += star.mass * v2 / 2;
// for (auto &other: stars) {
// float dx = other.px - star.px;
// float dy = other.py - star.py;
// float dz = other.pz - star.pz;
// float d2 = dx * dx + dy * dy + dz * dz + eps * eps;
// energy -= other.mass * star.mass * G / sqrt(d2) / 2;
for (std::size_t i{
}; i < N; ++i) {
float v2 = stars.vx[i] * stars.vx[i] + stars.vy[i] * stars.vy[i] + stars.vz[i] * stars.vz[i];
energy += stars.mass[i] * v2 / 2;
for (std::size_t j{
}; j < N; ++j) {
float dx = stars.px[j] - stars.px[i];
float dy = stars.py[j] - stars.py[i];
float dz = stars.pz[j] - stars.pz[i];
float d2 = dx * dx + dy * dy + dz * dz + eps * eps;
energy -= stars.mass[j] * stars.mass[i] * G / std::sqrt(d2) / 2;
}
}
return energy;
}
template <class Func>
//long benchmark(Func const &func) {
auto benchmark(Func const& func) {
auto t0 = std::chrono::steady_clock::now();
func();
auto t1 = std::chrono::steady_clock::now();
auto dt = std::chrono::duration_cast<std::chrono::milliseconds>(t1 - t0);
return dt.count();
}
int main() {
init();
//printf("Initial energy: %f\n", calc());
std::cout << "Initial energy: " << calc() << std::endl;
auto dt = benchmark([&] {
for (int i = 0; i < 100000; i++)
//step();
step_avx();
});
//printf("Final energy: %f\n", calc());
//printf("Time elapsed: %ld ms\n", dt);
std::cout << "Final energy: " << calc() << std::endl;
std::cout << "Time elapsed: " << dt << " ms" << std::endl;
system("PAUSE");
return 0;
}
cmake_minimum_required(VERSION 3.12)
project(hellocmake LANGUAGES CXX)
set(CMAKE_CXX_STANDARD 17)
if (NOT CMAKE_BUILD_TYPE)
set(CMAKE_BUILD_TYPE Release)
endif()
add_executable(main main.cpp)
if(MSVC)
target_compile_options(main PUBLIC /arch:AVX2 /fp:fast /O2 /openmp:experimental)
else()
target_compile_options(main PUBLIC -ffast-math -march=native -O3)
endif()
BUILD_TPYE=RelWithDebInfo
cmake . -B build \
-DCMAKE_TOOLCHAIN_FILE=C:/zeno/vcpkg/scripts/buildsystems/vcpkg.cmake \
-DVCPKG_TARGET_TRIPLET=x64-windows-static \
-DCMAKE_MSVC_RUNTIME_LIBRARY="MultiThreaded$<$<CONFIG:Debug>:Debug>"
cmake --build build --config $BUILD_TPYE
echo "================= Baseline =================="
./baseline
echo "================= Optimized ================="
./build/$BUILD_TPYE/main.exe
echo "================== Finish ==================="
四 优秀的运行结果和解答
Fin with AVX2 by JYLeeLYJ · Pull Request #3 · parallel101/hw04 (github.com)https://github.com/parallel101/hw04/pull/3