我在前面的兩篇隨筆中,寫到了YUV與RGB的互轉公式,也寫了一些SSE指令學習的常用指令。因為接下來我決定暫停對SSE指令的簡單翻譯,而要把他用到實踐中去。因此會貼上大篇幅的看起來讓人頭暈目眩的代碼,不過我會盡力寫好注釋,以免大家看起來比較費勁。
既然說SSE能夠對重復大量相同運算的數據在運算效率上有很大的提升,那就需要與高級語言(因為我常用的是C++,所以就采用C++語言)做一個對比。
在此之前,我要提前做一下說明:YUV我采用的是4:2:2的壓縮方式,也就是兩個Y分量共用一組UV分量,並且每個YUV分量分別占10bit,但是考慮到內存對齊,每個YUV分量實際占用2個字節;另外,我在這里使用的公式是BT709的數字RGB與數字YUV的轉換公式。
如果不清楚轉換公式,可以轉到我的一篇關於YUV與RGB互轉的公式總結的隨筆。
http://www.cnblogs.com/zhengjianhong/p/7872459.html
C++代碼如下:
// Converts a single 8-bit RGB pixel to Y, U (Cb) and V (Cr) with
// floating-point arithmetic.
//
//   Y, U, V : outputs; each receives the truncated value of the weighted sum
//             plus 0.5 (i.e. round-to-nearest for the non-negative results
//             these weights produce).
//   r, g, b : input colour components.
//
// Y carries an offset of 16, U and V an offset of 128.
void RGB2YUV(OUT ushort &Y, OUT ushort &U, OUT ushort &V, IN const BYTE r, IN const BYTE g, IN const BYTE b)
{
    const double dRed   = r;
    const double dGreen = g;
    const double dBlue  = b;

    // The terms are summed in the same left-to-right order for all three
    // components so the floating-point results stay reproducible.
    const double dLuma    = 16  + 0.183 * dRed + 0.614 * dGreen + 0.062 * dBlue + 0.5;
    const double dChromaU = 128 - 0.101 * dRed - 0.339 * dGreen + 0.439 * dBlue + 0.5;
    const double dChromaV = 128 + 0.439 * dRed - 0.399 * dGreen - 0.040 * dBlue + 0.5;

    Y = static_cast<ushort>(dLuma);
    U = static_cast<ushort>(dChromaU);
    V = static_cast<ushort>(dChromaV);
}
// Reference (scalar) conversion of packed RGB / RGBA pixels to packed
// YUV 4:2:2 with one 16-bit word per sample.
//
//   pYUV        : destination; receives 4 words per pixel pair, laid out as
//                 Y0, U, Y1, V. U and V are taken from the first pixel of
//                 the pair.
//   pRGB        : source; 3 bytes per pixel (R, G, B) or, when bAlpha is
//                 true, 4 bytes per pixel (R, G, B, A - the 4th byte is
//                 ignored).
//   nPixelCount : number of source pixels.
//   bAlpha      : selects the 4-byte-per-pixel source layout.
//
// Returns false when a pointer is NULL or nPixelCount <= 0, true otherwise.
//
// Only complete pixel pairs are converted: with an odd nPixelCount the last
// pixel is left untouched, so neither buffer is accessed out of bounds.
//
// NOTE(review): the sample values are whatever RGB2YUV produces; no scaling
// to a 10-bit range happens in this function despite its name.
bool RGB2YUV422_10BIT_CPLUSPLUS(OUT void* pYUV, IN void *pRGB, int nPixelCount, bool bAlpha)
{
    if (pYUV == NULL || pRGB == NULL || nPixelCount <= 0)
        return false;

    const BYTE *pSrc = (const BYTE *)pRGB;
    ushort *pDst = (ushort *)pYUV;
    const int nBytesPerPixel = bAlpha ? 4 : 3;

    // Walk both buffers in lock-step, one pixel pair per iteration.
    for (int nRemaining = nPixelCount; nRemaining >= 2; nRemaining -= 2)
    {
        const BYTE *pFirst = pSrc;
        const BYTE *pSecond = pSrc + nBytesPerPixel;
        ushort y, u, v;

        // First pixel: supplies Y0 and the chroma shared by the pair.
        RGB2YUV(y, u, v, pFirst[0], pFirst[1], pFirst[2]);
        pDst[0] = y;
        pDst[1] = u;
        pDst[3] = v;

        // Second pixel: only its luma is kept.
        RGB2YUV(y, u, v, pSecond[0], pSecond[1], pSecond[2]);
        pDst[2] = y;

        pSrc += 2 * nBytesPerPixel;
        pDst += 4;
    }

    return true;
}
SSE代碼如下:
// 在這里我考慮在精度允許的情況下,綜合考慮在不產生進位的情況下對RGB轉YUV的因子做了放大處理(放大256倍),以消除浮點運算,提高指令的執行效率。
實現思路:一次循環完成8個像素的處理,先將8個像素的RGB分量分別用一個128位寄存器來存儲,RGB的每個分量占2個字節,並且用128位來存儲每個轉換因子,每個因子同樣占2個字節。簡單點說就是
(R0R1 R2R3 R4R5 R6R7) * (YrYr YrYr YrYr YrYr) 這樣就計算出了構成Y的R部分簡稱YR,以此類推,分別計算出YG, YB,然后在將YR,YG,YB做加法運算,最終計算出Y的結果。
// RGB -> YCbCr conversion factors, scaled by 256 so the conversion can be done
// in 16-bit integer arithmetic.
// Element order: Yr, Yg, Yb, Ur, Ug, Ub, Vg, Vb.
// Vr is not stored because it is equal to Ub.

// Full range
__declspec(align(16)) short dwRGB2YCbCrCoefFR256[8] = {
     77,  150,   29,    // Yr, Yg, Yb
    -43,  -85,  128,    // Ur, Ug, Ub (= Vr)
   -107,  -21           // Vg, Vb
};

// HD variant
__declspec(align(16)) short dwRGB2YCbCrCoefHD256[8] = {
     47,  157,   16,    // Yr, Yg, Yb
    -26,  -87,  112,    // Ur, Ug, Ub (= Vr)
   -102,  -10           // Vg, Vb
};

// SD variant
__declspec(align(16)) short dwRGB2YCbCrCoefSD256[8] = {
     66,  129,   25,    // Yr, Yg, Yb
    -38,  -74,  112,    // Ur, Ug, Ub (= Vr)
    -94,  -18           // Vg, Vb
};
bool RGBA2YUV422_10BIT_SSE_Nofloat(OUT void* pYUV, IN void *pRGB, int nPixelCount, short* dwRGBA2YCbCrCoef) { if(pYUV == NULL || pRGB == NULL || nPixelCount <= 0) return false; int n32 = nPixelCount / 8; int m32 = nPixelCount % 8; BYTE *pByte = (BYTE *)pRGB; ushort *pYuvShort = (ushort *)pYUV; __m128i _m128i; __asm { mov esi, pByte; mov edi, pYuvShort; mov ecx, n32; mov edx, m32; mov eax, dwRGBA2YCbCrCoef; movaps xmm7, [eax]; movaps _m128i, xmm7; prefetchnta [esi]; test ecx, ecx; jz loop_m32; loop_32: prefetchnta [esi + 32]; // prefetchnta指令,將內存數據加載到緩存中,提高指令的數據命中率 movups xmm0, [esi]; // A3B3G3R3 A2B2G2R2 A1B1G1R1 A0B0G0R0 movups xmm1, [esi + 16]; // A7B7G7R7 A6B6G6R6 A5B5G5R5 A4B4G4R4 pand xmm0, dwMaskA; // 0B3G3R3 0B2G2R2 0B1G1R1 0B0G0R0 pand xmm1, dwMaskA; // 0B7G7R7 0B6G6R6 0B5G5R5 0B4G4R4 movaps xmm2, xmm0; movaps xmm3, xmm1; pand xmm2, dwMaskR; // 000R3 000R2 000R1 000R0 pand xmm3, dwMaskR; // 000R7 000R6 000R5 000R4 packssdw xmm2, xmm3; // 0R70R6 0R50R4 0R30R2 0R10R0 movaps xmm3, xmm0; movaps xmm4, xmm1; psrld xmm3, 8; psrld xmm4, 8; pand xmm3, dwMaskR; // 000G3 000G2 000G1 000G0 pand xmm4, dwMaskR; // 000G7 000G6 000G5 000G4 packssdw xmm3, xmm4; // 0G70G6 0G50G4 0G30G2 0G10G0 movaps xmm4, xmm0; movaps xmm5, xmm1; psrld xmm4, 16; psrld xmm5, 16; pand xmm4, dwMaskR; // 000B3 000B2 000B1 000B0 pand xmm5, dwMaskR; // 000B7 000B6 000B5 000B4 packssdw xmm4, xmm5; // 0B70B6 0B50B4 0B30B2 0B10B0 movaps xmm0, xmm7; // VbVg UbUg UrYb YgYr // 系數 pshuflw xmm0, xmm0, 0x00; // VbVg UbUg YrYr YrYr shufps xmm0, xmm0, 0x00; // YrYr YrYr YrYr YrYr movups _m128i, xmm0; movaps xmm1, xmm2; pmullw xmm1, xmm0; // YR7YR6 YR5YR4 YR3Yr2 YR1YR0 movups _m128i, xmm1; movaps xmm0, xmm7; pshuflw xmm0, xmm0, 0x05; // VbVg UbUg YrYr YgYg shufps xmm0, xmm0, 0x00; // YgYg YgYg YgYg YgYg movups _m128i, xmm0; movaps xmm5, xmm3; pmullw xmm5, xmm0; // YG7YG6 YG5YG4 YG3YG2 YG1YG0 paddw xmm1, xmm5; // YR+YG movups _m128i, xmm1; movaps xmm0, xmm7; pshuflw xmm0, xmm0, 0x0a; // VbVg UbUg YrYr YbYb 
shufps xmm0, xmm0, 0x00; // YbYb YbYb YbYb YbYb movups _m128i, xmm0; movaps xmm5, xmm4; pmullw xmm5, xmm0; // YB7YB6 YB5YB4 YB3YB2 YB1YB0 paddw xmm1, xmm5; // YR+YG+YB movups _m128i, xmm1; paddw xmm1, dwAdjust128; psrlw xmm1, 8; // Y7Y6 Y5Y4 Y3Y2 Y1Y0 movups _m128i, xmm1; movaps xmm0, xmm7; pshuflw xmm0, xmm0, 0x0f; // VbVg UbUg YrYr UrUr; shufps xmm0, xmm0, 0x00; // UrUr UrUr UrUr UrUr movups _m128i, xmm0; movaps xmm5, xmm2; pmullw xmm5, xmm0; // UR7UR6 UR5UR4 UR3UR2 UR1UR0 movups _m128i, xmm5; movaps xmm0, xmm7; pshufhw xmm0, xmm0, 0x00; // UgUg UgUg UrYb YgYr shufps xmm0, xmm0, 0xaa; // UgUg UgUg UgUg UgUg movups _m128i, xmm0; movaps xmm6, xmm3; pmullw xmm6, xmm0; // UG7UG6 UG5UG4 UG3UG2 UG1UG0 movups _m128i, xmm6; paddw xmm5, xmm6; // UR+UG movups _m128i, xmm5; movaps xmm0, xmm7; pshufhw xmm0, xmm0, 0x05; // UgUg UbUb UrYb YgYr shufps xmm0, xmm0, 0xaa; // UbUb UbUb UbUb UbUb movups _m128i, xmm0; movaps xmm6, xmm4; pmullw xmm6, xmm0; // UB7UB6 UB5UB4 UB3UB2 UB1UB0 movups _m128i, xmm6; paddw xmm5, xmm6; // UR + UG + UB movups _m128i, xmm5; paddw xmm5, dwAdjust; paddw xmm5, dwAdjust128; movups _m128i, xmm5; psrlw xmm5, 8; // U7U6 U5U4 U3U2 U1U0 movups _m128i, xmm5; pmullw xmm2, xmm0; // VR7VR6 VR5VR4 VR3VR2 VR1VR0 movaps xmm0, xmm7; pshufhw xmm0, xmm0, 0xaa; // VgVg VgVg UrYb YgYr shufps xmm0, xmm0, 0xaa; // VgVg VgVg VgVg VgVg pmullw xmm3, xmm0; // VG7VG6 VG5VG4 VG3VG2 VG1VG0 paddw xmm2, xmm3; // VR + VG movaps xmm0, xmm7; pshufhw xmm0, xmm0, 0xff; // VbVb VbVb UrYb YgYr shufps xmm0, xmm0, 0xaa; // VbVb VbVb VbVb VbVb pmullw xmm4, xmm0; // VB7VB6 VB5VB4 VB3VB2 VB1VB0 paddw xmm2, xmm4; // VR + VG + VB paddw xmm2, dwAdjust; paddw xmm2, dwAdjust128; psrlw xmm2, 8; // V7V6 V5V4 V3V2 V1V0 movaps xmm4, xmm5; punpcklwd xmm4, xmm2; // V3U3 V2U2 V1U1 V0U0 punpckhwd xmm5, xmm2; // V7U7 V6U6 V5U5 V4U4 shufps xmm4, xmm5, 0x88; // V6U6 V4U4 V2U2 V0U0 movaps xmm3, xmm1; punpcklwd xmm3, xmm4; // V2Y3 U2Y2 V0Y1 U0Y0 punpckhwd xmm1, xmm4; // V6Y7 U6Y6 V4Y5 U4Y4 movups [edi], 
xmm3; movups [edi + 16], xmm1; add edi, 32; add esi, 32; dec ecx; jnz loop_32; loop_m32: test edx, edx; jz loop_exit; cmp edx, 4; jl loop_2pixel; movups xmm0, [esi]; //A3B3G3R3 A2B2G2R2 A1B1G1R1 A0B0G0R0 pand xmm0, dwMaskA; // 0B3G3R3 0B2G2R2 0B1G1R1 0B0G0R0 movaps xmm1, xmm0; pand xmm1, dwMaskR; // 000R3 000R2 000R1 000R0 pshuflw xmm1, xmm1, 0xd8; // 000R3 000R2 0000 0R10R0 pshufhw xmm1, xmm1, 0xd8; // 0000 0R30R2 0000 0R10R0 shufps xmm1, xmm1, 0xd8; // 0000 0000 0R30R2 0R10R0 movaps xmm2, xmm0; psrld xmm2, 8; pand xmm2, dwMaskR; pshuflw xmm2, xmm2, 0xd8; pshufhw xmm2, xmm2, 0xd8; shufps xmm2, xmm2, 0xd8; // 0000 0000 0G30G2 0G10G0 movaps xmm3, xmm0; psrld xmm3, 16; pand xmm3, dwMaskR; pshuflw xmm3, xmm3, 0xd8; pshufhw xmm3, xmm3, 0xd8; shufps xmm3, xmm3, 0xd8; // 0000 0000 0B30B2 0B10B0 movaps xmm0, xmm7; // VbVg UbUg UrYb YgYr pshuflw xmm0, xmm0, 0x00; // VbVg UbUg YrYr YrYr movups xmm4, xmm1; pmullw xmm4, xmm0; // 0000 0000 YR3YR2 YR1YR0 movaps xmm0, xmm7; pshuflw xmm0, xmm0, 0x55; // VbVg UbUg YgYg YgYg movaps xmm5, xmm2; pmullw xmm5, xmm0; // 0000 0000 YG3YG2 YG1YG0 paddw xmm4, xmm5; // 00 00 (YR+YG)(YR+YG) (YR+YG)(YR+YG) movaps xmm0, xmm7; pshuflw xmm0, xmm0, 0xaa; // VbVg UbUg YbYb YbYb movups xmm5, xmm3; pmullw xmm5, xmm0; // 00 00 YB3YB2 YB1YB0 paddw xmm4, xmm5; // 00 00 Y3Y2 Y1Y0 psrlw xmm4, 8; movaps xmm0, xmm7; pshuflw xmm0 ,xmm0, 0xff; // VbVb UbUg UrUr UrUr; movups xmm5, xmm1; pmullw xmm5, xmm0; // 00 00 UR3UR2 UR1UR0 movaps xmm0, xmm7; pshufhw xmm0, xmm0, 0x00; // UgUg UgUg UrYb YgYr shufps xmm0, xmm0, 0xee; // UgUg UgUg UgUg UgUg movups xmm6, xmm2; pmullw xmm6, xmm0; // 00 00 UG3UG2 UG1UG0 paddw xmm5, xmm6; // 00 00 U3'U2' U1'U0' movaps xmm0, xmm7; pshufhw xmm0, xmm0, 0x55; shufps xmm0, xmm0, 0xee; // UbUb UbUb UbUb UbUb movups xmm6, xmm3; pmullw xmm6, xmm0; // 00 00 UB3UB2 UB1UB0 paddw xmm5, xmm6; // 00 00 U3U2 U1U0 psrlw xmm5, 8; paddw xmm5, dwAdjust; pmullw xmm1, xmm0; // 00 00 VR3VR2 VR1VR0 movaps xmm0, xmm7; pshufhw xmm0, xmm0, 0xaa; // VgVg 
VgVg UrYb YgYr shufps xmm0, xmm0, 0xee; // VgVg VgVg VgVg VgVg; pmullw xmm2, xmm0; // 00 00 VG3VG2 VG1VG0 paddw xmm1, xmm2; // 00 00 U3'U2' U1'U0' movaps xmm0, xmm7; pshufhw xmm0, xmm0, 0xff; // VbVb VbVb UrYb YgYr shufps xmm0, xmm0, 0xee; // VbVb VbVb VbVb VbVb pmullw xmm3, xmm0; // 00 00 VB3VB2 VB1VB0 paddw xmm1, xmm3; // 00 00 V3V2 V1V0 psrlw xmm1, 8; paddw xmm1, dwAdjust; punpcklwd xmm5, xmm1; // V3U3 V2U2 V1U1 V0U0 shufps xmm5, xmm5, 0xd8; // V3U3 V1U1 V2U2 V0U0 punpcklwd xmm4, xmm5; // V2Y3 U2Y2 V0Y1 U0Y0 movups [edi], xmm4; add esi, 16; add edi, 16; sub edx, 4; jnz loop_m32; loop_2pixel: test edx, edx; jz loop_exit; cmp edx, 2; jl loop_spixel; movups xmm0, [esi]; pand xmm0, dwMaskA; // 0000 0000 0B1G1R1 0B0G0R0 movups xmm1, xmm0; pand xmm1, dwMaskR; // 0000 0000 000R1 000R0 pshuflw xmm1, xmm1, 0xd8; // 0000 0000 0000 0R10R0 movups xmm2, xmm0; psrld xmm2, 8; pand xmm2, dwMaskR; pshuflw xmm2, xmm2, 0xd8; // 0000 0000 0000 0G10G0 movups xmm3, xmm0; psrld xmm3, 16; pand xmm3, dwMaskR; pshuflw xmm3, xmm3, 0xd8; // 0000 0000 0000 0B10B0 movups xmm0, xmm7; // VbVg UbUg UrYb YgYr pshuflw xmm0, xmm0, 0x00; // VbVg UbUg YrYr YrYr; movups xmm4, xmm1; pmullw xmm4, xmm0; // 00 00 00 YR1YR0 movups xmm0, xmm7; pshuflw xmm0, xmm0, 0x55; movups xmm5, xmm2; pmullw xmm5, xmm0; // 00 00 00 YG1YG0 paddw xmm4, xmm5; // 00 00 00 Y1'Y0' movups xmm0, xmm7; pshuflw xmm0, xmm0, 0xaa; movups xmm5, xmm3; pmullw xmm5, xmm0; // 00 00 00 YB1YB0 paddw xmm4, xmm5; // 00 00 00 Y1Y0 psrlw xmm4, 8; movups xmm0, xmm7; pshuflw xmm0, xmm0, 0xff; movups xmm5, xmm1; pmullw xmm5, xmm0; // 00 00 00 UR1UR0 movups xmm0, xmm7; pshufhw xmm0, xmm0, 0x00; // UgUg UgUg .. .. shufps xmm0, xmm0, 0xee; // .. .. 
UgUg UgUg movups xmm6, xmm2; pmullw xmm6, xmm0; // 00 00 00 UG1 UG0; paddw xmm5, xmm6; movups xmm0, xmm7; pshufhw xmm0, xmm0, 0x55; shufps xmm0, xmm0, 0xee; movups xmm6, xmm3; pmullw xmm6, xmm0; // 00 00 00 UB1UB0 paddw xmm5, xmm6; // 00 00 00 U1U0 psrlw xmm5, 8; psrlw xmm5, dwAdjust; pmullw xmm1, xmm0; // 00 00 00 VR1VR0 movups xmm0, xmm7; pshufhw xmm0, xmm0, 0xaa; shufps xmm0, xmm0, 0xee; pmullw xmm2, xmm0; // 00 00 00 VG1VG0 paddw xmm1, xmm2; movups xmm0, xmm7; pshufhw xmm0, xmm0, 0xff; shufps xmm0, xmm0, 0xee; pmullw xmm3, xmm0; // 00 00 00 VB1VB0 paddw xmm1, xmm3; // 00 00 00 V1V0; punpcklwd xmm5, xmm1; // 00 00 V1U1 V0U0 punpcklwd xmm4, xmm5; // V10 U10 V0Y1 U0Y0 movlps [edi], xmm4; add edi, 8; add esi, 8; sub edx, 2; jnz loop_2pixel; loop_spixel: loop_exit: } return true; }
下面附上兩種轉換對比的貼圖:
前面代表Debug下的執行截圖,后面表示Release下的執行截圖。總體而言SSE指令優化之后的代碼執行效率比較穩定,而Release開啟優化之后的C++代碼提升比較明顯,但是仍然沒有SSE指令的效率高。
至於YUV轉RGB的指令優化,有興趣的同學,可以自己嘗試寫一下。
在這里,我也是剛剛入門SSE指令優化,如果有同學發現問題,還希望可以指出來。