YUV422p To RGB888 C語言版本接口說明:
/**************************** YUV422P_To_RGB24.c ****************************/
/* Module purpose: convert planar YUV image data to packed RGB24. */

typedef unsigned char BYTE; /* one 8-bit sample, range [0..255] */

/*
 * YUV422P_To_RGB24_init
 *
 * Builds the lookup tables used by the converter. This initialisation must
 * be performed before the conversion routine is run.
 */
void YUV422P_To_RGB24_init(void);

/*
 * YUV422P_To_RGB24
 *
 * Converts a planar YUV422P image to RGB24.
 *
 * Parameters:
 *   pY      start of the Y plane of the source image
 *   pU      start of the U plane of the source image
 *   pV      start of the V plane of the source image
 *   DstPic  start of the destination RGB24 image
 *   width   image width in pixels
 *   height  image height in pixels
 *
 * Returns 0 on success, -1 on failure.
 *
 * The buffer behind DstPic must be allocated by the caller beforehand and
 * must hold width * height * 3 bytes.
 *
 * NOTE(review): the implementation shipped with this module is named
 * YUV420P_To_RGB24; confirm which name callers are expected to link against.
 */
int YUV422P_To_RGB24(BYTE *pY, BYTE *pU, BYTE *pV, BYTE *DstPic,
                     int width, int height);
實現:
/*
 * Fixed-point replacements for the floating-point YUV->RGB coefficients.
 * Each constant is the real coefficient scaled by 2^16 and truncated to int,
 * so a product can be brought back to pixel scale with ">> 16".
 */
const int csY_coeff_16 = 1.164383 * (1 << 16);
const int csU_blue_16 = 2.017232 * (1 << 16);
const int csU_green_16 = (-0.391762) * (1 << 16);
const int csV_green_16 = (-0.812968) * (1 << 16);
const int csV_red_16 = 1.596027 * (1 << 16);

/*
 * Saturation lookup table: color_table[v] is v clamped to [0, 255].
 *
 * The table is indexed with the sum of a luma term and one or two chroma
 * terms. With the coefficients above those sums range from -278
 * (Y = 0, U = 0, blue channel) up to 534 (Y = 255, U = 255, blue channel),
 * so the table must accept every index in that interval. A bias of 384 and
 * a size of 1024 make color_table valid for indices -384 .. 639.
 */
enum {
    COLOR_TABLE_BIAS = 384,
    COLOR_TABLE_SIZE = 1024
};
static unsigned char _color_table[COLOR_TABLE_SIZE];
static const unsigned char *color_table = &_color_table[COLOR_TABLE_BIAS];

/* Per-component contribution tables, indexed by the raw 8-bit sample. */
static int Ym_tableEx[256];
static int Um_blue_tableEx[256];
static int Um_green_tableEx[256];
static int Vm_green_tableEx[256];
static int Vm_red_tableEx[256];

/*
 * Clamps a colour value to the 8-bit range [0, 255].
 *
 * Declared static inline: a plain "inline" definition provides no external
 * definition in C99/C11 and can fail to link when the call is not inlined.
 */
static inline long border_color(long color) {
    if (color > 255)
        return 255;
    if (color < 0)
        return 0;
    return color;
}

/*
 * Fills the saturation table and the five contribution tables.
 * Must be called before any table-based conversion is performed.
 */
void YUV422P_To_RGB24_init(void) {
    int i;

    for (i = 0; i < COLOR_TABLE_SIZE; ++i)
        _color_table[i] = (unsigned char)border_color(i - COLOR_TABLE_BIAS);

    /*
     * The products can be negative; ">> 16" on a negative int is
     * implementation-defined and is expected to be an arithmetic shift
     * (rounding toward negative infinity) here.
     */
    for (i = 0; i < 256; ++i) {
        Ym_tableEx[i] = (csY_coeff_16 * (i - 16)) >> 16;
        Um_blue_tableEx[i] = (csU_blue_16 * (i - 128)) >> 16;
        Um_green_tableEx[i] = (csU_green_16 * (i - 128)) >> 16;
        Vm_green_tableEx[i] = (csV_green_16 * (i - 128)) >> 16;
        Vm_red_tableEx[i] = (csV_red_16 * (i - 128)) >> 16;
    }
}
- inline void YUVToRGB24_Table(BYTE *p, const BYTE Y0, const BYTE Y1,
- const BYTE U, const BYTE V) {
- int Ye0 = Ym_tableEx[Y0];
- int Ye1 = Ym_tableEx[Y1];
- int Ue_blue = Um_blue_tableEx[U];
- int Ue_green = Um_green_tableEx[U];
- int Ve_green = Vm_green_tableEx[V];
- int Ve_red = Vm_red_tableEx[V];
- int UeVe_green = Ue_green + Ve_green;
- *p = color_table[(Ye0 + Ve_red)];
- *(p + 1) = color_table[(Ye0 + UeVe_green)];
- *(p + 2) = color_table[(Ye0 + Ue_blue)];
- *(p + 3) = color_table[(Ye1 + Ve_red)];
- *(p + 4) = color_table[(Ye1 + UeVe_green)];
- *(p + 5) = color_table[(Ye1 + Ue_blue)];
- }
- int YUV420P_To_RGB24(BYTE* pY, BYTE* pU, BYTE* pV, BYTE* DstPic, int width,
- int height) {
- int y, x, x_uv;
- BYTE* pDstLine = DstPic;
- if ((width % 2) != 0 || (height % 2) != 0)
- return (-1);
- for (y = 0; y < height; ++y) {
- //DECODE_PlanarYUV211_Common_line(pDstLine, pY, pU, pV,width);
- for (x = 0; x < width; x += 2) {
- x_uv = x >> 1;
- YUVToRGB24_Table(&pDstLine[x * 3], pY[x], pY[x + 1], pU[x_uv],
- pV[x_uv]);
- }
- pDstLine += width * 3; //RGB888
- pY += width; //YUV422
- if (y % 2 == 1) {
- pU += width / 2;
- pV += width / 2;
- }
- }
- return 0;
- }
經測試發現,在hi3512(arm 926ej-s,267MHz)平台上運行時,該yuv轉rgb模塊的速度不是很快,大概20幀/秒。為了提高效率,核心解碼模塊我采用了arm匯編,重寫了YUVToRGB24_Table模塊。
YUV420P_To_RGB24_asm.c代碼:
/*
 * Row converter implemented in assembly (YUVToRGB24_Assemble.s).
 *   pDstLine  destination for one row of RGB24 output
 *   yuv       array of three pointers: current Y row, U row, V row
 *   width     number of pixels in the row
 * NOTE(review): the assembly routine does not appear to set a meaningful
 * return value; callers in this file ignore it.
 */
extern int YUVToRGB24_Assemble(unsigned char *pDstLine, unsigned char **yuv,
                               int width);

/*
 * Fixed-point replacements for the floating-point YUV->RGB coefficients.
 * Each constant is the real coefficient scaled by 2^16 and truncated to int.
 */
const int csY_coeff_16 = 1.164383 * (1 << 16);
const int csU_blue_16 = 2.017232 * (1 << 16);
const int csU_green_16 = (-0.391762) * (1 << 16);
const int csV_green_16 = (-0.812968) * (1 << 16);
const int csV_red_16 = 1.596027 * (1 << 16);

/*
 * Per-component contribution tables, indexed by the raw 8-bit sample.
 * They have external linkage because the assembly routine refers to them
 * by symbol name.
 */
int Ym_tableEx[256];
int Um_blue_tableEx[256];
int Um_green_tableEx[256];
int Vm_green_tableEx[256];
int Vm_red_tableEx[256];

/*
 * Fills the five contribution tables.
 * Must be called before any table-based conversion is performed.
 */
void YUV422P_To_RGB24_init(void)
{
    int i;

    /*
     * The products can be negative; ">> 16" on a negative int is
     * implementation-defined and is expected to be an arithmetic shift here.
     */
    for (i = 0; i < 256; ++i)
    {
        const int luma = i - 16;
        const int chroma = i - 128;

        Ym_tableEx[i] = (csY_coeff_16 * luma) >> 16;
        Um_blue_tableEx[i] = (csU_blue_16 * chroma) >> 16;
        Um_green_tableEx[i] = (csU_green_16 * chroma) >> 16;
        Vm_green_tableEx[i] = (csV_green_16 * chroma) >> 16;
        Vm_red_tableEx[i] = (csV_red_16 * chroma) >> 16;
    }
}
- int YUV420P_To_RGB24(BYTE* pY, BYTE* pU, BYTE* pV, BYTE* DstPic, int width, int height)
- {
- int y;
- BYTE* pDstLine = DstPic;
- BYTE* yuv[3];
- if ((width % 8)!=0)
- return(-1);
- yuv[0] = pY;
- yuv[1] = pU;
- yuv[2] = pV;
- for (y = height; y > 0; --y)
- {
- YUVToRGB24_Assemble(pDstLine, yuv, width); //decoder a line with asm function in YUVToRGB24_Assemble.s
- pDstLine += width * 3; //RGB888
- yuv[0] += width; //YUV422
- if(y % 2 == 1) {
- yuv[1] += width >> 1;
- yuv[2] += width >> 1;
- }
- }
- return 0;
- }
arm匯編核心解碼模塊:
- .text
- .macro loadu a
- adr r1, UM_BLUE
- ldr r1, [r1]
- ldr r9, [r1, /a, lsl #2]
- adr r1, UM_GREEN
- ldr r1, [r1]
- ldr r10, [r1, /a, lsl #2]
- .endm
- .macro loadv a
- adr r1, VM_RED
- ldr r1, [r1]
- ldr r11, [r1, /a, lsl #2]
- adr r1, VM_GREEN
- ldr r1, [r1]
- ldr r12, [r1, /a, lsl #2]
- .endm
- .macro bound_r0
- cmp r0, #0x00
- movlt r0, #0x00
- cmp r0, #255
- movgt r0, #255
- .endm
- .globl YUVToRGB24_Assemble
- @ r0 = pDstLine; r1 = yuv; r2 = width
- YUVToRGB24_Assemble:
- stmdb sp!, { r4, r5, r6, r7, r8, r9, r10, r11, r12, lr}
- ldmia r1, {r1, r3, r4} @r1 = y; r3 = u; r4 = v;
- mov r5, #0 @r5 = row counter
- hloop:
- ldr r6, [r1], #0x04 @load y; 4 bytes
- ldrh r7, [r3], #0x02 @load u; 2 bytes
- ldrh r8, [r4], #0x02 @load v; 2 bytes
- stmdb sp!, {r0, r1, r2, r3, r4, r5}
- @ temp register: r0,r1 rgbdata: r2,r3,r4
- @ ye:r5 ue_blue:r9 ue_green:r10 ve_red:r11 ve_green:r12
- mov r2, #0
- mov r3, #0
- mov r4, #0
- @ load ue_bule0, ue_green0
- mov r0, r7
- and r0, r0, #0xFF
- loadu r0
- @load ve_red0, ve_green0
- mov r0, r8
- and r0, r0, #0xFF
- loadv r0
- @load ye0
- mov r0, r6
- and r0, r0, #0xFF
- adr r1, YM
- ldr r1, [r1]
- ldr r5, [r1, r0, lsl #2]
- @r0 = ye0+ve_red0
- add r0, r5, r11
- bound_r0
- orr r2, r2, r0
- @g0 = ye0+ue_green0+ve_green0
- adds r0, r10, r12
- adc r0, r0, r5
- bound_r0
- orr r2, r2, r0, lsl #8
- @b0 = ye0+ue_blue0
- add r0, r5, r9
- bound_r0
- orr r2, r2, r0, lsl #16
- @load ye1
- mov r0, r6, lsr #8
- and r0, r0, #0xFF
- ldr r5, [r1, r0, lsl #2]
- @r1 = ye1+ve_red0
- add r0, r5, r11
- bound_r0
- orr r2, r2, r0, lsl #24
- @g1 = ye1+ue_green0+ve_green0
- adds r0, r10, r12
- adc r0, r0, r5
- bound_r0
- orr r3, r3, r0
- @b1 = ye1+ue_blue0
- add r0, r5, r9
- bound_r0
- orr r3, r3, r0, lsl #8
- @ load ue_bule1, ue_green1
- mov r0, r7, lsr #8
- and r0, r0, #0xFF
- loadu r0
- @load ve_red1, ve_green1
- mov r0, r8, lsr #8
- and r0, r0, #0xFF
- loadv r0
- @load ye2
- mov r0, r6, lsr #16
- and r0, r0, #0xFF
- adr r1, YM
- ldr r1, [r1]
- ldr r5, [r1, r0, lsl #2]
- @r2 = ye2+ve_red1
- add r0, r5, r11
- bound_r0
- orr r3, r3, r0, lsl #16
- @g2 = ye2+ue_green1+ve_green1
- adds r0, r10, r12
- adc r0, r0, r5
- bound_r0
- add r3, r3, r0, lsl #24
- @b2 = ye2+ue_blue1
- add r0, r5, r9
- bound_r0
- orr r4, r4, r0
- @load ye3
- mov r0, r6, lsr #24
- and r0, r0, #0xFF
- ldr r5, [r1, r0, lsl #2]
- @r3 = ye3+ve_red1
- add r0, r5, r11
- bound_r0
- orr r4, r4, r0, lsl #8
- @g3 = ye3+ue_green1+ve_green1
- adds r0, r10, r12
- adc r0, r0, r5
- bound_r0
- orr r4, r4, r0, lsl #16
- @b3 = ye3+ue_blue1
- add r0, r5, r9
- bound_r0
- orr r4, r4, r0, lsl #24
- mov r10, r2
- mov r11, r3
- mov r12, r4
- ldmia sp!, {r0, r1, r2, r3, r4, r5}
- stmia r0!, {r10, r11, r12}
- add r5, r5, #4
- cmp r5, r2
- blo hloop
- ldmia sp!, {r4, r5, r6, r7, r8, r9, r10, r11, r12, pc} @exit
- @tables
- YM : .long Ym_tableEx
- UM_BLUE : .long Um_blue_tableEx
- UM_GREEN: .long Um_green_tableEx
- VM_GREEN: .long Vm_green_tableEx
- VM_RED : .long Vm_red_tableEx
將核心模塊改成匯編后,解碼速度從約20幀/秒提升到了50幀/秒,即每幀耗時減少了約60%(幀率為原來的2.5倍),匯編果然強大,哈哈。