mickit 发表于 2025-4-11 11:49

浮点数性能和一致性



// Copyright 2022 Tencent

#include "test_fpu.h"

#include <assert.h>

#if defined(_WIN32)
#include <float.h>
#else
#if defined(__aarch64__) || defined(__arm__)
#    include <fenv.h>
#else// defined(__i386__) || defined(__x86_64__)
#    include <fpu_control.h>
#endif
#endif
#include <atomic>
#include <chrono>
#include <cmath>
#include <memory>
#include <random>
#include <sstream>
#include <string>
#include <thread>
#include <type_traits>

/// Puts the floating-point unit into a fixed configuration so that results are
/// comparable between runs and machines.
///
/// - MSVC: selects round-to-nearest through _controlfp_s, and 24-bit precision
///   on targets other than x64 (precision control is skipped under _M_X64).
/// - ARM / AArch64: selects round-to-nearest through fesetround.
/// - Other targets (glibc x86): writes an x87 control word with
///   round-to-nearest and single precision.
///
/// @return true when every requested change was accepted, false as soon as one
///         of the control calls reports an error.
bool init_fpu() {
#if defined(_MSC_VER)
  unsigned int control_word;
  // A zero mask changes nothing; this call only verifies the control word is readable.
  if (_controlfp_s(&control_word, 0, 0) != 0) {
    return false;
  }

#if !defined(_M_X64)
  // The underscore-prefixed names are the ones documented for _controlfp_s and
  // do not depend on the compiler exposing the legacy non-standard aliases.
  if (_controlfp_s(&control_word, _PC_24, _MCW_PC) != 0) {
    return false;
  }
#endif
  return _controlfp_s(&control_word, _RC_NEAR, _MCW_RC) == 0;
#elif defined(__aarch64__) || defined(__arm__)
  // fesetround returns non-zero when the mode could not be applied; report
  // that instead of claiming success unconditionally.
  return fesetround(FE_TONEAREST) == 0;
#else
  fpu_control_t cw = (_FPU_DEFAULT & ~_FPU_EXTENDED) | _FPU_RC_NEAREST | _FPU_SINGLE;
  _FPU_SETCW(cw);
  return true;
#endif
}

/// Builds a short text report of the current floating-point control state.
///
/// The first line holds the platform's control (or rounding) word in hex, the
/// second line shows the product 0.1f * 0.1f as computed right now. On MSVC a
/// failing _controlfp_s yields only "Got error code: <n>" and nothing else.
std::string dump_current_controlfp() {
  std::stringstream report;

#if defined(_MSC_VER)
  unsigned int control_word;
  int err = _controlfp_s(&control_word, 0, 0);
  if (err) {
    report << "Got error code: " << err;
    return report.str();
  }
  report << "Control word: " << std::hex << control_word << '\n';
#elif defined(__aarch64__) || defined(__arm__)
  report << "Rounding word: " << std::hex << fegetround() << '\n';
#else
  fpu_control_t cw;
  _FPU_GETCW(cw);
  report << "Control word: " << std::hex << cw << '\n';
#endif

  // Sample multiplication shared by every platform branch above.
  float factor = 0.1f;
  float product = factor * factor;
  report << factor << "*" << factor << "=" << product << '\n';

  return report.str();
}

// Per-worker state of a benchmark run: the worker's thread object and the
// results that worker produces.
struct benchmark_thread_data {
// Thread running start_benchmark_worker for this entry; created and joined by
// start_benchmark_controller.
std::unique_ptr<std::thread> thread;

// Timings and final values written by start_benchmark_worker; copied out by
// pick_benchmark_result.
benchmark_result result;
};

// Shared state of one benchmark run. start_benchmark creates it and hands it
// both to the caller and to the controller thread.
struct benchmark_handle {
  // Number of rounds every worker executes per benchmark section.
  size_t max_round = 0;
  // Workers currently inside start_benchmark_worker.
  std::atomic<size_t> running_thread{0};
  // Progress counters: done < total is what is_benchmark_running reports as "running".
  std::atomic<size_t> progress_total{0};
  std::atomic<size_t> progress_done{0};
  // One entry per worker thread.
  std::vector<benchmark_thread_data> datas;
  // Thread running start_benchmark_controller.
  std::unique_ptr<std::thread> controller_thread;

  ~benchmark_handle() {
    if (!controller_thread || !controller_thread->joinable()) {
      return;
    }

    // The controller thread's closure refers to the handle. If that closure
    // ends up releasing the last owner, this destructor runs ON the controller
    // thread, and joining oneself fails with
    // std::errc::resource_deadlock_would_occur, which is fatal when thrown
    // from a destructor. Detach in that case: the thread has finished its
    // work and is only unwinding.
    // NOTE(review): the lambda capture list in start_benchmark is not visible
    // in the reviewed text; confirm the handle is captured by value.
    if (controller_thread->get_id() == std::this_thread::get_id()) {
      controller_thread->detach();
    } else {
      controller_thread->join();
    }
  }
};

namespace {
// Number of parameters a benchmark round may start from, and the size of each
// parameter table. The tables are twice as long so that a round starting near
// the end of the first half can still read forward without leaving the table.
static constexpr size_t kMaxParameterCount = 1 << 20;
static constexpr size_t kMaxParameterArraySize = kMaxParameterCount * 2;

// Pseudo-random operand tables shared by all workers. "odd" tables only hold
// odd integers, "even" tables only even ones; the float tables mirror the
// integer tables element by element.
// NOTE(review): the array declarators and every element subscript below were
// missing from the reviewed text (all "[...]" tokens were stripped). They are
// reconstructed from how the tables are used; confirm against upstream.
static uint32_t g_integer_parameters_odd[kMaxParameterArraySize] = {0};
static uint32_t g_integer_parameters_even[kMaxParameterArraySize] = {0};
static float g_float_parameters_odd[kMaxParameterArraySize] = {0};
static float g_float_parameters_even[kMaxParameterArraySize] = {0};

// Fills the four parameter tables from a fixed-seed Mersenne Twister so every
// run and every machine sees the same operands.
//
// Does nothing when the tables were already filled (detected through the
// first even entry being non-zero). Otherwise adds kMaxParameterArraySize >> 9
// to progress_total and bumps progress_done once per 1024 generated values,
// which adds up to exactly that amount.
static void initialize_parameters(std::atomic<size_t> &progress_total, std::atomic<size_t> &progress_done) {
  if (g_integer_parameters_even[0] != 0) {
    return;
  }

  progress_total += kMaxParameterArraySize >> 9;

  std::mt19937 rnd{9999991};
  size_t index = 0;
  // Values alternate between the even table (even index) and the odd table
  // (odd index), so 2 * kMaxParameterArraySize draws fill both completely.
  while (index < kMaxParameterArraySize * 2) {
    uint32_t r = static_cast<uint32_t>(rnd());
    if (r < 9999991) {
      continue;  // rejected draw; does not consume a slot
    }
    // Keep the value positive in 31 bits and clear bit 0 (even).
    r = (r << 1) & 0x7ffffffe;

    const size_t slot = index >> 1;
    if (index & 0x1) {
      g_integer_parameters_odd[slot] = r | 0x1;
      g_float_parameters_odd[slot] = static_cast<float>(r | 0x1);
    } else {
      g_integer_parameters_even[slot] = r;
      g_float_parameters_even[slot] = static_cast<float>(r);
    }

    ++index;
    if (0 == (index & ((1 << 10) - 1))) {
      ++progress_done;
    }
  }
}

// Adds 16 consecutive "odd" operands and 16 consecutive "even" operands,
// starting at start_parameter_idx, into final_result.
//
// Each group is summed left to right first and then added to final_result, so
// the floating-point rounding sequence is fixed.
// NOTE(review): the element subscripts were missing from the reviewed text
// (the body read "odd + odd + ..."). Sequential elements are assumed because
// the original kept two otherwise unused running indices; confirm upstream.
template <class TDATA>
static inline void benchmark_add(TDATA odd[], TDATA even[], TDATA &final_result, size_t start_parameter_idx) {
  constexpr size_t kTermCount = 16;

  TDATA odd_sum = odd[start_parameter_idx];
  for (size_t k = 1; k < kTermCount; ++k) {
    odd_sum = odd_sum + odd[start_parameter_idx + k];
  }
  final_result += odd_sum;

  TDATA even_sum = even[start_parameter_idx];
  for (size_t k = 1; k < kTermCount; ++k) {
    even_sum = even_sum + even[start_parameter_idx + k];
  }
  final_result += even_sum;
}

// Computes odd[i] - odd[i+1] - ... - odd[i+15] and the same chain over the
// "even" operands (i = start_parameter_idx), then adds both chain results to
// final_result. Unsigned operands wrap around as usual.
//
// NOTE(review): the element subscripts were missing from the reviewed text
// (the body read "odd - odd - ..."). Sequential elements are assumed because
// the original kept two otherwise unused running indices; confirm upstream.
template <class TDATA>
static inline void benchmark_sub(TDATA odd[], TDATA even[], TDATA &final_result, size_t start_parameter_idx) {
  constexpr size_t kTermCount = 16;

  TDATA odd_chain = odd[start_parameter_idx];
  for (size_t k = 1; k < kTermCount; ++k) {
    odd_chain = odd_chain - odd[start_parameter_idx + k];
  }
  final_result += odd_chain;

  TDATA even_chain = even[start_parameter_idx];
  for (size_t k = 1; k < kTermCount; ++k) {
    even_chain = even_chain - even[start_parameter_idx + k];
  }
  final_result += even_chain;
}

// Single multiplication step of the multiply benchmark, specialised per
// operand type.
// NOTE(review): the element subscripts were missing from the reviewed text
// (the bodies multiplied by the bare pointer "odd"). start_parameter_idx is
// the only index in scope and is used for every access; confirm upstream.
template <class>
struct benchmark_mul_helper;

template <>
struct benchmark_mul_helper<uint32_t> {
  // Multiplies final_result twice by the selected operand; unsigned overflow
  // simply wraps.
  static inline void do_operator(uint32_t odd[], uint32_t &final_result, size_t start_parameter_idx) {
    final_result *= odd[start_parameter_idx];
    final_result *= odd[start_parameter_idx];
  }
};

template <>
struct benchmark_mul_helper<float> {
  // Multiplies final_result by the selected operand. When the product would
  // overflow to infinity, final_result is first replaced by its frexp
  // fraction (magnitude in [0.5, 1)) so the chain keeps producing finite
  // values.
  static inline void do_operator(float odd[], float &final_result, size_t start_parameter_idx) {
    const float operand = odd[start_parameter_idx];
    if (std::isinf(final_result * operand)) {
      int exp;
      final_result = std::frexp(final_result, &exp);
    }
    final_result *= operand;
  }
};

// Applies benchmark_mul_helper<TDATA>::do_operator 16 times, handing it the
// indices start_parameter_idx, start_parameter_idx + 1, ... in order.
// The even table is accepted only to keep the benchmark signatures uniform.
template <class TDATA>
static inline void benchmark_mul(TDATA odd[], TDATA even[], TDATA &final_result, size_t start_parameter_idx) {
  const size_t stop_idx = start_parameter_idx + 16;
  for (size_t current = start_parameter_idx; current != stop_idx; ++current) {
    benchmark_mul_helper<TDATA>::do_operator(odd, final_result, current);
  }
}

// Single step of the division benchmark, specialised per operand type.
// NOTE(review): the element subscripts were missing from the reviewed text
// (the bodies used the bare pointers "odd" / "even"). start_parameter_idx is
// the only index in scope and is used for every access; confirm upstream.
template <class>
struct benchmark_div_helper;

template <>
struct benchmark_div_helper<uint32_t> {
  // Scales final_result by the even operand, then divides it by the low byte
  // of the odd operand, or takes the remainder when final_result is not
  // larger than that divisor. The odd table stores values with bit 0 set
  // (see initialize_parameters), so the divisor is never zero for table data.
  static inline void do_operator(uint32_t odd[], uint32_t even[], uint32_t &final_result, size_t start_parameter_idx) {
    final_result *= even[start_parameter_idx];
    const uint32_t divisor = (odd[start_parameter_idx] & 0xff);
    if (final_result > divisor) {
      final_result /= divisor;
    } else {
      final_result %= divisor;
    }
  }
};

template <>
struct benchmark_div_helper<float> {
  // Scales final_result by the even operand unless that would overflow to
  // infinity, then divides it by the odd operand. When final_result is not
  // larger than the odd operand, the operand's frexp fraction is used as the
  // divisor instead, which keeps the value from collapsing towards zero.
  static inline void do_operator(float odd[], float even[], float &final_result, size_t start_parameter_idx) {
    const float scaled = final_result * even[start_parameter_idx];
    if (!std::isinf(scaled)) {
      final_result = scaled;
    }

    const float operand = odd[start_parameter_idx];
    if (final_result > operand) {
      final_result /= operand;
    } else {
      int exp;
      const float fraction = std::frexp(operand, &exp);
      final_result /= fraction;
    }
  }
};

// Applies benchmark_div_helper<TDATA>::do_operator 16 times, handing it the
// indices start_parameter_idx, start_parameter_idx + 1, ... in order.
template <class TDATA>
static inline void benchmark_div(TDATA odd[], TDATA even[], TDATA &final_result, size_t start_parameter_idx) {
  const size_t stop_idx = start_parameter_idx + 16;
  for (size_t current = start_parameter_idx; current != stop_idx; ++current) {
    benchmark_div_helper<TDATA>::do_operator(odd, even, final_result, current);
  }
}

// Single step of the square-root benchmark. Only the float specialisation
// exists.
// NOTE(review): the element subscript on "odd" was missing from the reviewed
// text ("float v = odd;"). start_parameter_idx is the only index in scope;
// confirm upstream.
template <class>
struct benchmark_sqrt_helper;

template <>
struct benchmark_sqrt_helper<float> {
  // Replaces final_result by the square root of one of three expressions,
  // selected by the low four bits of start_parameter_idx:
  //   bit 2 or 3 set      -> sqrt(final_result * v + v * v)
  //   else bit 0 or 1 set -> sqrt(final_result * v * v)
  //   otherwise           -> sqrt(final_result * final_result * v)
  // where v is the odd operand at start_parameter_idx.
  static inline void do_operator(float odd[], float &final_result, size_t start_parameter_idx) {
    const float v = odd[start_parameter_idx];
    if (start_parameter_idx & 0xc) {
      final_result = std::sqrt(final_result * v + v * v);
    } else if (start_parameter_idx & 0x3) {
      final_result = std::sqrt(final_result * v * v);
    } else {
      final_result = std::sqrt(final_result * final_result * v);
    }
  }
};

// Applies benchmark_sqrt_helper<TDATA>::do_operator 16 times, handing it the
// indices start_parameter_idx, start_parameter_idx + 1, ... in order.
// The even table is accepted only to keep the benchmark signatures uniform.
template <class TDATA>
static inline void benchmark_sqrt(TDATA odd[], TDATA even[], TDATA &final_result, size_t start_parameter_idx) {
  const size_t stop_idx = start_parameter_idx + 16;
  for (size_t current = start_parameter_idx; current != stop_idx; ++current) {
    benchmark_sqrt_helper<TDATA>::do_operator(odd, final_result, current);
  }
}

static void start_benchmark_worker(size_t idx, size_t max_round, benchmark_thread_data &data,
                                 std::atomic<size_t> &progress_total, std::atomic<size_t> &progress_done) {
constexpr const size_t step = 1 << 4;
constexpr const size_t iterator_count = kMaxParameterCount >> 4;
progress_total += max_round * 9;
progress_total += 2;// sin + cos

// integer add
{
    data.result.integer_add_final_result.resize(max_round);
    auto begin = std::chrono::system_clock::now();
    for (size_t round = 0; round < max_round; ++round) {
      size_t start_index = kMaxParameterCount / max_round * round;
      size_t iterator_end = start_index + iterator_count;
      uint32_t result = g_integer_parameters_odd;
      for (size_t i = start_index; i < iterator_end; i += step) {
      benchmark_add(g_integer_parameters_odd, g_integer_parameters_even, result, i);
      }
      ++progress_done;

      data.result.integer_add_final_result = result;
    }
    auto end = std::chrono::system_clock::now();
    data.result.integer_add_cost = end - begin;
}

// integer sub
{
    data.result.integer_sub_final_result.resize(max_round);
    auto begin = std::chrono::system_clock::now();
    for (size_t round = 0; round < max_round; ++round) {
      size_t start_index = kMaxParameterCount / max_round * round;
      size_t iterator_end = start_index + iterator_count;
      uint32_t result = g_integer_parameters_odd;
      for (size_t i = start_index; i < iterator_end; i += step) {
      benchmark_sub(g_integer_parameters_odd, g_integer_parameters_even, result, i);
      }
      ++progress_done;

      data.result.integer_sub_final_result = result;
    }
    auto end = std::chrono::system_clock::now();
    data.result.integer_sub_cost = end - begin;
}

// integer mul
{
    data.result.integer_mul_final_result.resize(max_round);
    auto begin = std::chrono::system_clock::now();
    for (size_t round = 0; round < max_round; ++round) {
      size_t start_index = kMaxParameterCount / max_round * round;
      size_t iterator_end = start_index + iterator_count;
      uint32_t result = g_integer_parameters_odd;
      for (size_t i = start_index; i < iterator_end; i += step) {
      benchmark_mul(g_integer_parameters_odd, g_integer_parameters_even, result, i);
      }
      ++progress_done;

      data.result.integer_mul_final_result = result;
    }
    auto end = std::chrono::system_clock::now();
    data.result.integer_mul_cost = end - begin;
}

// integer div
{
    data.result.integer_div_final_result.resize(max_round);
    auto begin = std::chrono::system_clock::now();
    for (size_t round = 0; round < max_round; ++round) {
      size_t start_index = kMaxParameterCount / max_round * round;
      size_t iterator_end = start_index + iterator_count;
      uint32_t result = g_integer_parameters_odd;
      for (size_t i = start_index; i < iterator_end; i += step) {
      benchmark_div(g_integer_parameters_odd, g_integer_parameters_even, result, i);
      }
      ++progress_done;

      data.result.integer_div_final_result = result;
    }
    auto end = std::chrono::system_clock::now();
    data.result.integer_div_cost = end - begin;
}

// float add
{
    data.result.float_add_final_result.resize(max_round);
    auto begin = std::chrono::system_clock::now();
    for (size_t round = 0; round < max_round; ++round) {
      size_t start_index = kMaxParameterCount / max_round * round;
      size_t iterator_end = start_index + iterator_count;
      float result = g_float_parameters_odd;
      for (size_t i = start_index; i < iterator_end; i += step) {
      benchmark_add(g_float_parameters_odd, g_float_parameters_even, result, i);
      }
      ++progress_done;

      data.result.float_add_final_result = result;
    }
    auto end = std::chrono::system_clock::now();
    data.result.float_add_cost = end - begin;
}

// float sub
{
    data.result.float_sub_final_result.resize(max_round);
    auto begin = std::chrono::system_clock::now();
    for (size_t round = 0; round < max_round; ++round) {
      size_t start_index = kMaxParameterCount / max_round * round;
      size_t iterator_end = start_index + iterator_count;
      float result = g_float_parameters_odd;
      for (size_t i = start_index; i < iterator_end; i += step) {
      benchmark_sub(g_float_parameters_odd, g_float_parameters_even, result, i);
      }
      ++progress_done;

      data.result.float_sub_final_result = result;
    }
    auto end = std::chrono::system_clock::now();
    data.result.float_sub_cost = end - begin;
}

// float mul
{
    data.result.float_mul_final_result.resize(max_round);
    auto begin = std::chrono::system_clock::now();
    for (size_t round = 0; round < max_round; ++round) {
      size_t start_index = kMaxParameterCount / max_round * round;
      size_t iterator_end = start_index + iterator_count;
      float result = g_float_parameters_odd;
      for (size_t i = start_index; i < iterator_end; i += step) {
      benchmark_mul(g_float_parameters_odd, g_float_parameters_even, result, i);
      }
      ++progress_done;

      data.result.float_mul_final_result = result;
    }
    auto end = std::chrono::system_clock::now();
    data.result.float_mul_cost = end - begin;
}

// float div
{
    data.result.float_div_final_result.resize(max_round);
    auto begin = std::chrono::system_clock::now();
    for (size_t round = 0; round < max_round; ++round) {
      size_t start_index = kMaxParameterCount / max_round * round;
      size_t iterator_end = start_index + iterator_count;
      float result = g_float_parameters_odd;
      for (size_t i = start_index; i < iterator_end; i += step) {
      benchmark_div(g_float_parameters_odd, g_float_parameters_even, result, i);
      }
      ++progress_done;
      data.result.float_div_final_result = result;
    }
    auto end = std::chrono::system_clock::now();
    data.result.float_div_cost = end - begin;
}

// float sqrt
{
    data.result.float_sqrt_final_result.resize(max_round);
    auto begin = std::chrono::system_clock::now();
    for (size_t round = 0; round < max_round; ++round) {
      size_t start_index = kMaxParameterCount / max_round * round;
      size_t iterator_end = start_index + iterator_count;
      float result = g_float_parameters_odd;
      for (size_t i = start_index; i < iterator_end; i += step) {
      benchmark_sqrt(g_float_parameters_odd, g_float_parameters_even, result, i);
      }
      ++progress_done;
      data.result.float_sqrt_final_result = result;
    }
    auto end = std::chrono::system_clock::now();
    data.result.float_sqrt_cost = end - begin;
}

// float sin
{
    for (int i = 0; i < 16; ++i) {
      data.result.float_sin_final_result.push_back(std::sin(3.14159f / 34 * i));
    }
    ++progress_done;
}

// float cos
{
    for (int i = 0; i < 16; ++i) {
      data.result.float_cos_final_result.push_back(std::cos(3.14159f / 34 * i));
    }
    ++progress_done;
}
}

static void start_benchmark_controller(std::shared_ptr<benchmark_handle> handle) {
initialize_parameters(handle->progress_total, handle->progress_done);

size_t idx = 0;
for (auto &data : handle->datas) {
    data.result.float_add_cost = std::chrono::system_clock::duration::zero();
    data.result.float_sub_cost = std::chrono::system_clock::duration::zero();
    data.result.float_mul_cost = std::chrono::system_clock::duration::zero();
    data.result.float_div_cost = std::chrono::system_clock::duration::zero();
    data.result.float_sqrt_cost = std::chrono::system_clock::duration::zero();
    data.result.integer_add_cost = std::chrono::system_clock::duration::zero();
    data.result.integer_sub_cost = std::chrono::system_clock::duration::zero();
    data.result.integer_mul_cost = std::chrono::system_clock::duration::zero();
    data.result.integer_div_cost = std::chrono::system_clock::duration::zero();
    data.thread = std::unique_ptr<std::thread>(new std::thread(() {
      ++handle->running_thread;
      start_benchmark_worker(idx, handle->max_round, data, handle->progress_total, handle->progress_done);
      --handle->running_thread;
    }));
    ++idx;
}

for (auto &data : handle->datas) {
    if (data.thread && data.thread->joinable()) {
      data.thread->join();
    }
}
}
}// namespace

std::shared_ptr<benchmark_handle> start_benchmark(size_t thread_count, size_t round) {
if (thread_count > 32) {
    thread_count = 32;
}

std::shared_ptr<benchmark_handle> ret = std::make_shared<benchmark_handle>();
if (!ret) {
    return ret;
}

ret->max_round = round;
ret->running_thread.store(0);
ret->progress_total.store(1);
ret->progress_done.store(0);
ret->datas.resize(thread_count);
ret->controller_thread = std::unique_ptr<std::thread>(new std::thread(() {
    start_benchmark_controller(ret);
    ++ret->progress_done;
}));
return ret;
}

/// Tells whether the run behind `handle` still has outstanding progress units.
/// A null handle, or one without a controller thread, counts as not running.
bool is_benchmark_running(const std::shared_ptr<benchmark_handle> &handle) {
  const bool has_controller = handle && handle->controller_thread;
  if (!has_controller) {
    return false;
  }

  const size_t done = handle->progress_done.load();
  const size_t total = handle->progress_total.load();
  return done < total;
}

/// Returns the pair {progress_done, progress_total} of the run, or {0, 0} for
/// a null handle.
std::pair<size_t, size_t> get_benchmark_progress(const std::shared_ptr<benchmark_handle> &handle) {
  std::pair<size_t, size_t> progress{0, 0};
  if (handle) {
    progress.first = handle->progress_done;
    progress.second = handle->progress_total;
  }
  return progress;
}

/// Returns how many workers are currently executing, or 0 for a null handle.
size_t get_benchmark_running_thread(const std::shared_ptr<benchmark_handle> &handle) {
  return handle ? handle->running_thread.load() : 0;
}

/// Returns the number of worker slots of the run, or 0 for a null handle.
size_t get_benchmark_thread_count(const std::shared_ptr<benchmark_handle> &handle) {
  return handle ? handle->datas.size() : 0;
}

void pick_benchmark_result(const std::shared_ptr<benchmark_handle> &handle, std::vector<benchmark_result> &result) {
if (!handle) {
    return;
}

result.reserve(handle->datas.size());
for (auto &data : handle->datas) {
    result.push_back(data.result);
}
}

burgessmaggie 发表于 2025-4-17 21:32

C 语言中的浮点数运算遵循一定的舍入模式,常见的舍入模式有就近舍入(舍入到最接近的可表示值,恰好居中时向偶数舍入,这是默认模式)、向零舍入、向上舍入、向下舍入等。

hudi008 发表于 2025-4-17 21:44

使用定点数或查表法替代复杂浮点运算。

averyleigh 发表于 2025-4-17 23:15

在需要严格一致性的场合,可以禁用编译器的浮点优化选项。

mickit 发表于 2025-4-18 00:00

可以通过设置浮点运算单元的控制寄存器来控制舍入模式和精度,以确保一致性。

robertesth 发表于 2025-4-18 00:12

使用更高精度类型(如double)或调整循环结构。

gygp 发表于 2025-4-18 00:25

避免不必要的类型转换和复杂的浮点数运算。

mollylawrence 发表于 2025-4-18 00:38

编译器可以对浮点数运算进行多种优化。

vivilyly 发表于 2025-4-18 00:51

使用适合当前处理器架构的编译器和优化选项。

zerorobert 发表于 2025-4-18 01:04

尽量使用整数运算替代浮点运算,例如将浮点数乘以一个常数后转换为整数。

abotomson 发表于 2025-4-18 01:17

在C语言中,浮点数的性能和一致性是需要特别关注的两个方面,尤其是在高性能计算、跨平台开发或对数值精度敏感的场景中。

51xlf 发表于 2025-4-18 01:30

编译器可以自动向量化循环,但要获得最佳性能,有时仍需要手动编写优化代码。

1988020566 发表于 2025-4-18 01:43

比较两个浮点数是否相等时,不能直接使用==运算符,而是要判断它们的差值是否在一个很小的误差范围内。

sdlls 发表于 2025-4-18 01:56

浮点数运算(如加减乘除)通常比整数运算更慢,尤其是复杂数学函数(如sin、cos)的计算,可能需要更多CPU周期。

macpherson 发表于 2025-4-18 02:08

避免在浮点密集型循环中使用条件分支。

jackcat 发表于 2025-4-18 02:21

不同的编译器在实现浮点数运算时也可能会有一些细微的差别。

sesefadou 发表于 2025-4-18 02:33

复杂的浮点数运算,如乘法和除法,通常比加法和减法慢。

uytyu 发表于 2025-4-18 02:45

不同平台可能使用不同的浮点指令集。例如,x86平台上的x87和SSE指令集在精度和性能上有所不同。

10299823 发表于 2025-4-18 02:57

由于浮点数的精度问题,直接比较两个浮点数是否相等可能会得到不准确的结果。

pl202 发表于 2025-4-18 03:09

编译器可以选择不同的指令来执行浮点运算。例如,GCC和Clang提供了-Ofast选项,允许编译器为了性能而放弃某些一致性保证。
页: [1] 2 3 4
查看完整版本: 浮点数性能和一致性