用sse4.1指令集加速cvCeil( )函數

本文轉載自查看原文 2020-03-25 13:40 638 優化

C/C++標準庫的math.h/cmath中，給出了ceil()函數的聲明。

在OpenCV中，看到了cvCeil()函數，它是用sse2加速的。cvCeil()比ceil()快嗎？評測下來，g++-5.4（ubuntu16.04）和VS2017下，確實都是cvCeil()更快。

其實我現在用的PC連avx2都支持了，而sse、avx系列指令集是逐代遞增式支持的，所以用sse4.1來優化一下cvCeil()，可以更快，普適性應該也還不錯。

#include <iostream>
#include <cmath>
#include <sstream>
#include <chrono>

//sse2
#if defined _MSC_VER && defined _M_X64 || (defined __GNUC__ && defined __SSE2__&& !defined __APPLE__)
#include <emmintrin.h>
#endif

//sse 4.1
#include <smmintrin.h>


// Ceiling of a double returned as an int, using SSE2 where it is available.
//
// value:   the number to round up.
// returns: the smallest integer that is not less than value. The result is
//          only meaningful when that integer fits in an int.
inline int cvCeil(double value)
{
#if (defined _MSC_VER && defined _M_X64) || (defined __GNUC__ && defined __SSE2__ && !defined __APPLE__)
    // The conversion rounds according to the current rounding mode, so the
    // integer it yields may sit one below the ceiling.
    const __m128d packed = _mm_set_sd(value);
    const int rounded = _mm_cvtsd_si32(packed);
    // Bit 0 of the mask is set exactly when rounded < value, i.e. when the
    // conversion rounded down and one has to be added back.
    return rounded + _mm_movemask_pd(_mm_cmplt_sd(_mm_cvtsi32_sd(packed, rounded), packed));
#else
    // Portable fallback for every other target. It needs no helper such as
    // cvRound (which this file never declares): the cast truncates toward
    // zero, and one is added whenever the truncated value is below the input.
    const int truncated = static_cast<int>(value);
    return truncated + (truncated < value);
#endif
}


// Ceiling of a double returned as an int, using the SSE4.1 rounding
// instruction where the build enables it.
//
// value:   the number to round up.
// returns: the smallest integer that is not less than value. The result is
//          only meaningful when that integer fits in an int.
inline int myCeil(double value)
{
    // _mm_round_sd is an SSE4.1 intrinsic, so the GCC/Clang guard must test
    // __SSE4_1__ rather than __SSE2__; otherwise a build without -msse4.1
    // fails to compile instead of falling back.
#if (defined _MSC_VER && defined _M_X64) || (defined __GNUC__ && defined __SSE4_1__ && !defined __APPLE__)
    // An earlier variant called _mm_round_sd(dst, val, ...) and then converted
    // dst. That discarded the intrinsic's return value and read an
    // uninitialised register, which is why its results differed between
    // compilers. The rounded value must be taken from the return value.
    const __m128d packed = _mm_set_sd(value);
    const __m128d ceiled = _mm_round_sd(packed, packed, _MM_FROUND_CEIL);
    // ceiled already holds an integral value, so the conversion is exact.
    return _mm_cvtsd_si32(ceiled);
#else
    // Portable fallback for every other target. It needs no helper such as
    // cvRound (which this file never declares): the cast truncates toward
    // zero, and one is added whenever the truncated value is below the input.
    const int truncated = static_cast<int>(value);
    return truncated + (truncated < value);
#endif
}



// Renders a chrono duration as text in milliseconds, e.g. "1.5 ms".
//
// dt:      any std::chrono::duration.
// returns: the duration truncated to whole microseconds, scaled to
//          milliseconds, and followed by " ms" (default stream formatting).
template<typename T, typename P>
std::string toString(std::chrono::duration<T,P> dt)
{
    const auto wholeMicros =
        std::chrono::duration_cast<std::chrono::microseconds>(dt);
    const double millis = wholeMicros.count() * 1e-3;

    std::ostringstream out;
    out << millis << " ms";
    return out.str();
}

// Micro-benchmark: runs std::ceil, cvCeil and myCeil the same number of times
// on one input, then prints the elapsed time of each loop and the last result
// each function produced.
int main () {
    using Clock = std::chrono::steady_clock;

    // The input and the results are volatile, presumably so the optimizer
    // cannot hoist the calls out of the loops or drop them entirely.
    volatile double input = 34.234;
    volatile double stdResult, cvResult, myResult;
    const int kIterations = 100000000;

    // Each time point marks the end of one loop and the start of the next.
    const Clock::time_point stdBegin = Clock::now();
    for (int n = 0; n < kIterations; n++) {
        stdResult = std::ceil(input);
    }

    const Clock::time_point cvBegin = Clock::now();
    for (int n = 0; n < kIterations; n++) {
        cvResult = cvCeil(input);
    }

    const Clock::time_point myBegin = Clock::now();
    for (int n = 0; n < kIterations; n++) {
        myResult = myCeil(input);
    }
    const Clock::time_point finish = Clock::now();

    std::cout << "std::ceil: " << toString(cvBegin - stdBegin) << "\n";
    std::cout << "cvCeil   : " << toString(myBegin - cvBegin) << "\n";
    std::cout << "myCeil   : " << toString(finish - myBegin) << "\n";
    std::cout << "y1=" << stdResult << ", y2=" << cvResult << ", y3=" << myResult << std::endl;

    return 0;
}

編譯指令：

clang++-8 main8.cpp -O3 -o a8 -std=c++11 -msse4

運行輸出：

std::ceil: 30.347 ms
cvCeil   : 106.99 ms
myCeil   : 72.361 ms
y1=35, y2=35, y3=35

換g++-5.4試試看？

g++ main8.cpp -O3 -o a8 -std=c++11 -msse4

std::ceil: 153.051 ms
cvCeil   : 122.439 ms
myCeil   : 85.935 ms
y1=35, y2=35, y3=35

看來g++-5.4的libstdc++裡ceil的性能，也不如cvCeil

在VS2017 Release模式運行得到：

std::ceil: 344.014 ms
cvCeil   : 119.165 ms
myCeil   : 94.527 ms
y1=35, y2=35, y3=35

MSVCRT加油鴨

附帶：CMakeLists.txt中，給VS添加SSE4.1支持：

# Intended to turn on SSE4.1 for the ocv_test target when building with MSVC.
# NOTE(review): target_compile_definitions() adds preprocessor definitions, not
# compiler switches; a switch such as /arch:... would normally be passed via
# target_compile_options() -- confirm and fix.
# NOTE(review): /arch:SSE4.1 does not appear to be one of MSVC's documented
# /arch values -- verify against the MSVC /arch reference.
# NOTE(review): INTERFACE applies the setting to consumers of ocv_test rather
# than to ocv_test itself -- confirm PRIVATE was not intended.
if(MSVC)
   target_compile_definitions(ocv_test INTERFACE /arch:SSE4.1)
endif()
# Link ocv_test against the libraries listed in OpenCV_LIBS.
target_link_libraries(ocv_test ${OpenCV_LIBS})

refs:
is cvCeil() faster than standard library?
MSDN - _mm_round_sd

免責聲明！

本站轉載的文章為個人學習借鑒使用，本站對版權不負任何法律責任。如果侵犯了您的隱私權益，請聯繫本站郵箱yoyou2525@163.com刪除。

猜您在找 SSE4.1指令集系列之一 SSSE3指令集 [C] 跨平台使用Intrinsic函數范例3——使用MMX、SSE2指令集處理 32位整數數組求和 ARM Cortex M3指令集第18章-x86指令集之常用指令 c/c++ 代碼中使用sse指令集加速 SSE指令集學習：Compiler Intrinsic sse4.2 指令集 LCD1602指令集解讀 Intel的AVX2指令集解讀