利用neon技術對矩陣旋轉進行加速（2）

本文轉載自查看原文 2014-06-04 15:34 2031 NEON/ android/ 矩陣旋轉

上次介紹的是順時針旋轉90度，最近用到了180度和270度，在這裡記錄一下。

1.利用neon技術將矩陣順時針旋轉180度：

順時針旋轉180度比順時針旋轉90度容易很多，如下圖

A1 A2 A3 A4                          D4 D3 D2 D1
B1 B2 B3 B4    順時針旋轉180度 →     C4 C3 C2 C1
C1 C2 C3 C4                          B4 B3 B2 B1
D1 D2 D3 D4                          A4 A3 A2 A1

其實就是先把矩陣每一行內的元素逆序排列，再把各行的上下順序顛倒（第一行變成最後一行，依此類推），代碼如下：

// Rotate an 8-bit single-channel image by 180 degrees.
//
//   dstImg : output buffer of width*height bytes; must not overlap srcImg
//            (the rotation is not performed in place).
//   srcImg : input image of width*height bytes, rows tightly packed
//            (row stride == width).
//   width  : number of pixels per row.
//   height : number of rows.
//
// A 180-degree rotation sends src(r, c) to dst(height-1-r, width-1-c), i.e.
// every row is reversed and the order of the rows is reversed as well.
// Because the rows are tightly packed, that is exactly the input buffer
// read backwards, so the image is treated as one flat array. This keeps
// both memory streams sequential and works for any width and height;
// the dimensions do not have to be multiples of 8.
//
// The call is a no-op when a pointer is null or a dimension is not positive.
void rotate180(unsigned char* dstImg,unsigned char* srcImg,int width,int height)
{
    if(!dstImg||!srcImg||width<=0||height<=0)
        return;

    // Widen before multiplying: width*height may not fit in an int.
    const long long total=(long long)width*height;
    const unsigned char* src=srcImg;   // read cursor, moves forward
    unsigned char* dst=dstImg+total;   // write cursor, moves backward from one past the end
    long long done=0;

#if defined(__ARM_NEON) || defined(__ARM_NEON__)
    // Fast path: load 8 bytes, reverse them inside the 64-bit register,
    // and store them at the mirrored position.
    for(;total-done>=8;done+=8)
    {
        dst-=8;
        vst1_u8(dst,vrev64_u8(vld1_u8(src)));
        src+=8;
    }
#endif

    // Scalar tail (the whole image when NEON is not available).
    for(;done<total;done++)
        *--dst=*src++;
}

2.利用neon技術將矩陣順時針旋轉270度：

這個和順時針旋轉90度非常像，只是在對neon寄存器中的向量進行轉置時不太一樣，這點需要注意

// Rotate an 8-bit single-channel image by 270 degrees clockwise
// (equivalently 90 degrees counter-clockwise).
//
//   dstImg : output buffer of width*height bytes; must not overlap srcImg.
//            The result has `height` pixels per row and `width` rows.
//   srcImg : input image, `width` pixels per row and `height` rows,
//            rows tightly packed (row stride == width).
//   width  : number of pixels per source row.
//   height : number of source rows.
//
// Mapping: src(r, c) -> dst(width-1-c, r).
//
// With NEON available, every complete 8x8 tile is transposed in registers
// with three rounds of vtrn (8-bit, 16-bit, 32-bit lanes) and its columns
// are stored as destination rows in reverse column order. Pixels that do
// not belong to a complete tile (right/bottom remainder, or the whole image
// without NEON) are copied one at a time, so width and height do not have
// to be multiples of 8.
//
// The call is a no-op when a pointer is null or a dimension is not positive.
void rotate270(unsigned char* dstImg,unsigned char* srcImg,int width,int height)
{
    if(!dstImg||!srcImg||width<=0||height<=0)
        return;

    // Strides are widened so that row*stride offsets cannot overflow int.
    const long long srcStride=width;
    const long long dstStride=height;
    int tiledW=0;   // columns [0,tiledW) are covered by the NEON tiles
    int tiledH=0;   // rows    [0,tiledH) are covered by the NEON tiles

#if defined(__ARM_NEON) || defined(__ARM_NEON__)
    tiledW=width-width%8;
    tiledH=height-height%8;
    for(int i=0;i<tiledH;i+=8)
    {
        for(int j=0;j<tiledW;j+=8)
        {
            const unsigned char* s=srcImg+i*srcStride+j;
            // Source column j+7 becomes destination row width-j-8.
            unsigned char* d=dstImg+(width-j-8)*dstStride+i;

            //step0 load the 8 rows of the tile
            uint8x8_t r0=vld1_u8(s);
            uint8x8_t r1=vld1_u8(s+srcStride);
            uint8x8_t r2=vld1_u8(s+2*srcStride);
            uint8x8_t r3=vld1_u8(s+3*srcStride);
            uint8x8_t r4=vld1_u8(s+4*srcStride);
            uint8x8_t r5=vld1_u8(s+5*srcStride);
            uint8x8_t r6=vld1_u8(s+6*srcStride);
            uint8x8_t r7=vld1_u8(s+7*srcStride);

            //step1 interleave bytes of neighbouring rows
            uint8x8x2_t b01=vtrn_u8(r0,r1);
            uint8x8x2_t b23=vtrn_u8(r2,r3);
            uint8x8x2_t b45=vtrn_u8(r4,r5);
            uint8x8x2_t b67=vtrn_u8(r6,r7);

            //step2 interleave 16-bit pairs: rows 0-3 and rows 4-7
            uint16x4x2_t h0=vtrn_u16(vreinterpret_u16_u8(b01.val[0]),
                                     vreinterpret_u16_u8(b23.val[0]));
            uint16x4x2_t h1=vtrn_u16(vreinterpret_u16_u8(b01.val[1]),
                                     vreinterpret_u16_u8(b23.val[1]));
            uint16x4x2_t h2=vtrn_u16(vreinterpret_u16_u8(b45.val[0]),
                                     vreinterpret_u16_u8(b67.val[0]));
            uint16x4x2_t h3=vtrn_u16(vreinterpret_u16_u8(b45.val[1]),
                                     vreinterpret_u16_u8(b67.val[1]));

            //step3 interleave 32-bit halves: each register now holds one
            //      full source column (rows 0..7)
            uint32x2x2_t w0=vtrn_u32(vreinterpret_u32_u16(h0.val[0]),
                                     vreinterpret_u32_u16(h2.val[0]));  // columns 0 and 4
            uint32x2x2_t w1=vtrn_u32(vreinterpret_u32_u16(h0.val[1]),
                                     vreinterpret_u32_u16(h2.val[1]));  // columns 2 and 6
            uint32x2x2_t w2=vtrn_u32(vreinterpret_u32_u16(h1.val[0]),
                                     vreinterpret_u32_u16(h3.val[0]));  // columns 1 and 5
            uint32x2x2_t w3=vtrn_u32(vreinterpret_u32_u16(h1.val[1]),
                                     vreinterpret_u32_u16(h3.val[1]));  // columns 3 and 7

            //step4 store columns 7,6,...,0 as consecutive destination rows
            vst1_u8(d,            vreinterpret_u8_u32(w3.val[1]));
            vst1_u8(d+dstStride,  vreinterpret_u8_u32(w1.val[1]));
            vst1_u8(d+2*dstStride,vreinterpret_u8_u32(w2.val[1]));
            vst1_u8(d+3*dstStride,vreinterpret_u8_u32(w0.val[1]));
            vst1_u8(d+4*dstStride,vreinterpret_u8_u32(w3.val[0]));
            vst1_u8(d+5*dstStride,vreinterpret_u8_u32(w1.val[0]));
            vst1_u8(d+6*dstStride,vreinterpret_u8_u32(w2.val[0]));
            vst1_u8(d+7*dstStride,vreinterpret_u8_u32(w0.val[0]));
        }
    }
#endif

    // Scalar pass for everything the tiles did not cover: the columns to the
    // right of tiledW in tiled rows, and all columns in the remaining rows.
    for(int r=0;r<height;r++)
    {
        for(int c=(r<tiledH)?tiledW:0;c<width;c++)
            dstImg[(width-1-c)*dstStride+r]=srcImg[r*srcStride+c];
    }
}

免責聲明！

本站轉載的文章為個人學習借鑒使用，本站對版權不負任何法律責任。如果侵犯了您的隱私權益，請聯繫本站郵箱yoyou2525@163.com刪除。

猜您在找 利用neon技術對矩陣旋轉進行加速 linux kernel態下使用NEON對算法進行加速利用 OpenVINO 進行推理加速（一）利用矩陣進行平移，旋轉，縮放等圖像變換、創建第二個一模一樣的圖像並使之進行縮放等操作【linux】ARM板子開啟浮點和neon加速矩陣的旋轉矩陣乘法與矩陣加速 Python中利用svd進行矩陣分解和重構原始矩陣?? Cublas矩陣加速運算旋轉矩陣