本文要介紹的是SSE4.1指令集中的幾條整數指令及其在視頻編碼中的應用。
1. 單指令32字節差分絕對值求和指令 MPSADBW
這條指令類似於SSE的PSADBW,但它實現的功能更強大。就連微軟官方網站上對這條指令的說明,也很難讓人一目了然。下面這張圖也許可以幫助我們理解:
這條指令的靈活之處在於源操作數和目的操作數的起始位置都是可選的。如何選擇,關鍵在於後面那個mask常量。這個常量是一個立即數,但只用到了其中的低三位。
其中,最低2位用於選擇源操作數中連續4個字節的起始位置。由於兩位二進制有4種狀態,所以源操作數的可選起始位置共有4種(字節偏移0、4、8、12),具體見上圖。
mask的第三位(bit 2)用於選擇目的操作數中連續11個字節的起始位置。很顯然,共有兩個起始位置可供選擇(字節偏移0或4)。
下面的c代碼更清楚的描述了這條指令的功能。
/*
 * Scalar reference model of the SSE4.1 MPSADBW instruction.
 *
 * v1   - bytes of the destination operand; 11 consecutive bytes are read,
 *        starting at byte offset 0 or 4 (selected by bit 2 of mask).
 * v2   - bytes of the source operand; 4 consecutive bytes are read,
 *        starting at byte offset 0, 4, 8 or 12 (selected by bits 1:0 of mask).
 * mask - immediate control value; only its low three bits are used.
 *
 * Returns eight unsigned 16-bit sums packed in an __m128i. Sum j is the sum of
 * absolute differences between the selected 4 source bytes and the 4
 * destination bytes starting j bytes after the selected destination offset.
 */
static __m128i compute_mpsadbw (const unsigned char *v1, const unsigned char *v2, int mask)
{
  union
    {
      __m128i x;
      unsigned short s[8];
    } ret;
  unsigned char s[4];
  int i, j;
  int offs1, offs2;

  /* Bits 1:0 pick one of the four aligned 4-byte groups of the source. */
  offs2 = 4 * (mask & 3);
  for (i = 0; i < 4; i++)
    s[i] = v2[offs2 + i];

  /* Bit 2 picks where the sliding window starts in the destination. */
  offs1 = 4 * ((mask & 4) >> 2);
  for (j = 0; j < 8; j++)
    {
      unsigned int sum = 0;

      for (i = 0; i < 4; i++)
        sum += (unsigned int) abs (v1[offs1 + j + i] - s[i]);

      /* At most 4 * 255, so the value always fits in 16 bits. */
      ret.s[j] = (unsigned short) sum;
    }

  return ret.x;
}
2. 水平方向上最小值查找指令 PHMINPOSUW
這條指令相對而言比較好理解,下面這張圖很好的描述了這條指令的執行過程:
3. 整數格式轉換指令
整數格式轉換,例如,把一個8位的字節型變量轉換為16位的字變量,或者32位的雙字變量等。這種運算在圖像、語音信號處理中經常碰到。例如,圖像數據是8位的字節型變量,如果運算過程中的浮點變量定點化采用的是Q15格式,則需要將8位無符號數擴展為16位以適應SIMD的並行運算;如果需要更高的精度,Q15格式的精度顯然太低,這時采用Q24是一個不錯的選擇,此時需要將8位無符號數擴展為32位雙字變量以適應SIMD的並行運算。
SSE4.1提供了12條不同的指令來完成各種不同整數格式之間的轉換。詳見下圖:
從上圖可以看出,把8位的字節型變量擴展為32位的雙字變量,SSE4.1用了一條指令
PMOVSXBD xmm0, m32
而SSE2指令用了4條指令,即:
movd xmm0, m32
punpcklbw xmm0,xmm0
punpcklwd xmm0,xmm0
psrad xmm0, 24
4. 視頻編碼中運動估計
運動估計占視頻編碼30%以上的時間,采用SSE的SIMD指令可有效加速運動估計的計算過程。
4.1 4x4塊匹配運動估計代碼
/*
 * Exhaustive 4x4 block matching using the sum of absolute differences (SAD).
 *
 * Every 4x4 window of the reference frame is compared with the current block;
 * the position of the window with the smallest SAD is reported. On ties the
 * first window in scan order (row by row, left to right) wins.
 *
 * refFrame    - top-left pixel of the reference frame (8-bit samples)
 * stepBytesRF - distance in bytes between consecutive rows of refFrame
 * curBlock    - top-left pixel of the 4x4 block to search for
 * stepBytesCB - distance in bytes between consecutive rows of curBlock
 * matchBlock  - output: matchBlock[0] receives the column and matchBlock[1]
 *               the row of the best window; left untouched when the frame is
 *               smaller than 4x4 and there is no candidate window
 * frameWidth  - searchable width of the reference frame in pixels
 * frameHeight - searchable height of the reference frame in pixels
 *
 * Always returns 0.
 */
int blockMatch4x4(const unsigned char* refFrame, int stepBytesRF,
                  const unsigned char* curBlock, int stepBytesCB, int* matchBlock,
                  int frameWidth, int frameHeight)
{
    const int blockHeight = 4;
    const int blockWidth = 4;
    int lowSum = INT_MAX;
    int i, j, k, l;

    for (i = 0; i <= frameHeight - blockHeight; i++)
    {
        for (j = 0; j <= frameWidth - blockWidth; j++)
        {
            const unsigned char *pRef = refFrame + i * stepBytesRF + j;
            const unsigned char *pCur = curBlock;
            int temSum = 0;

            for (k = 0; k < blockHeight; k++)
            {
                for (l = 0; l < blockWidth; l++)
                {
                    /* Operands promote to int, so abs() is the matching call. */
                    temSum += abs(pRef[l] - pCur[l]);
                }
                pCur += stepBytesCB;
                pRef += stepBytesRF;
            }

            /* Strict '<' keeps the earliest candidate when SADs are equal. */
            if (temSum < lowSum)
            {
                lowSum = temSum;
                matchBlock[0] = j;
                matchBlock[1] = i;
            }
        }
    }
    return 0;
}
4.2 4x4塊匹配運動估計SSE2指令優化
/*
 * Read four consecutive bytes as one 32-bit value, p[0] in the low byte.
 * Built from byte accesses so that no unsigned char buffer is dereferenced
 * through an unsigned int pointer (no aliasing or alignment assumptions).
 */
static int bm4x4Sse2Load32(const unsigned char *p)
{
    return (int)((unsigned int)p[0]
               | ((unsigned int)p[1] << 8)
               | ((unsigned int)p[2] << 16)
               | ((unsigned int)p[3] << 24));
}

/*
 * SSE2 block matching of four horizontally adjacent 4x4 blocks at once.
 *
 * curBlock is treated as a 16x4 strip holding four 4x4 blocks side by side
 * (16 bytes are read from each of its 4 rows). Every 4x4 window of the
 * reference frame is compared against all four blocks using PSADBW, and the
 * best window is tracked separately for each block. On ties the first window
 * in scan order (row by row, left to right) wins.
 *
 * refFrame    - top-left pixel of the reference frame (8-bit samples)
 * stepBytesRF - distance in bytes between consecutive rows of refFrame
 * curBlock    - top-left pixel of the 16x4 strip of current blocks
 * stepBytesCB - distance in bytes between consecutive rows of curBlock
 * matchBlock  - output array of 8 ints: for block n (0..3), matchBlock[2n]
 *               receives the column and matchBlock[2n+1] the row of its best
 *               window; entries are left untouched when there is no candidate
 * frameWidth  - searchable width of the reference frame in pixels
 * frameHeight - searchable height of the reference frame in pixels
 *
 * Always returns 0.
 */
int blockMatch4x4SSE2_opted(const unsigned char* refFrame, int stepBytesRF,
                            const unsigned char* curBlock, int stepBytesCB,
                            int* matchBlock, int frameWidth, int frameHeight)
{
    unsigned int lowSum[4] = {UINT_MAX, UINT_MAX, UINT_MAX, UINT_MAX};
    const int blockHeight = 4;
    const int blockWidth = 4;
    int i, j, n;
    __m128i row0, row1, row2, row3;
    __m128i curTop01, curBot01, curTop23, curBot23;

    row0 = _mm_loadu_si128((const __m128i*)curBlock);
    row1 = _mm_loadu_si128((const __m128i*)(curBlock + stepBytesCB));
    row2 = _mm_loadu_si128((const __m128i*)(curBlock + 2 * stepBytesCB));
    row3 = _mm_loadu_si128((const __m128i*)(curBlock + 3 * stepBytesCB));

    /*
     * Regroup so each 64-bit half holds two rows of one block:
     *   curTop01 = { block0 rows 0-1 | block1 rows 0-1 }
     *   curBot01 = { block0 rows 2-3 | block1 rows 2-3 }
     *   curTop23 / curBot23: the same for blocks 2 and 3.
     */
    curTop01 = _mm_unpacklo_epi32(row0, row1);
    curBot01 = _mm_unpacklo_epi32(row2, row3);
    curTop23 = _mm_unpackhi_epi32(row0, row1);
    curBot23 = _mm_unpackhi_epi32(row2, row3);

    for (i = 0; i <= frameHeight - blockHeight; i++)
    {
        for (j = 0; j <= frameWidth - blockWidth; j++)
        {
            const unsigned char *pRef = refFrame + i * stepBytesRF + j;
            unsigned int sad[4];
            __m128i refTop, refBot, sad01, sad23;

            /* Rows 0-1 of the candidate window, duplicated into both halves. */
            refTop = _mm_unpacklo_epi32(
                _mm_cvtsi32_si128(bm4x4Sse2Load32(pRef)),
                _mm_cvtsi32_si128(bm4x4Sse2Load32(pRef + stepBytesRF)));
            refTop = _mm_shuffle_epi32(refTop, 0x44);

            /* Rows 2-3 of the candidate window, duplicated likewise. */
            refBot = _mm_unpacklo_epi32(
                _mm_cvtsi32_si128(bm4x4Sse2Load32(pRef + 2 * stepBytesRF)),
                _mm_cvtsi32_si128(bm4x4Sse2Load32(pRef + 3 * stepBytesRF)));
            refBot = _mm_shuffle_epi32(refBot, 0x44);

            /* PSADBW yields one sum per 64-bit half, in words 0 and 4. */
            sad01 = _mm_adds_epu16(_mm_sad_epu8(refTop, curTop01),
                                   _mm_sad_epu8(refBot, curBot01));
            sad23 = _mm_adds_epu16(_mm_sad_epu8(refTop, curTop23),
                                   _mm_sad_epu8(refBot, curBot23));

            sad[0] = (unsigned int)_mm_extract_epi16(sad01, 0);
            sad[1] = (unsigned int)_mm_extract_epi16(sad01, 4);
            sad[2] = (unsigned int)_mm_extract_epi16(sad23, 0);
            sad[3] = (unsigned int)_mm_extract_epi16(sad23, 4);

            for (n = 0; n < 4; n++)
            {
                if (sad[n] < lowSum[n])
                {
                    lowSum[n] = sad[n];
                    matchBlock[2 * n] = j;
                    matchBlock[2 * n + 1] = i;
                }
            }
        }
    }
    return 0;
}
4.3 4x4塊匹配運動估計SSE4.1指令優化
/*
 * Allow this function to be compiled with GCC/Clang even when the translation
 * unit is not built with -msse4.1. The caller is still responsible for only
 * running it on a CPU that supports SSE4.1.
 */
#ifndef BM4X4_SSE41_TARGET
#if defined(__GNUC__) && !defined(__SSE4_1__)
#define BM4X4_SSE41_TARGET __attribute__((target("sse4.1")))
#else
#define BM4X4_SSE41_TARGET
#endif
#endif

/*
 * Read four consecutive bytes as one 32-bit value, p[0] in the low byte.
 * Built from byte accesses so that no unsigned char buffer is dereferenced
 * through an unsigned int pointer (no aliasing or alignment assumptions).
 */
static int bm4x4Sse4Load32(const unsigned char *p)
{
    return (int)((unsigned int)p[0]
               | ((unsigned int)p[1] << 8)
               | ((unsigned int)p[2] << 16)
               | ((unsigned int)p[3] << 24));
}

/*
 * SSE4.1 block matching of four horizontally adjacent 4x4 blocks at once.
 *
 * curBlock is treated as a 16x4 strip holding four 4x4 blocks side by side
 * (16 bytes are read from each of its 4 rows). For each reference row the
 * columns are processed 8 candidate positions at a time: MPSADBW produces the
 * 8 row-SADs of one block per instruction and PHMINPOSUW picks the smallest
 * of the 8 accumulated sums together with its index. Columns that cannot be
 * covered by a full 16-byte reference load are handled by a PSADBW tail loop,
 * one position at a time. On ties the first window in scan order wins.
 *
 * refFrame    - top-left pixel of the reference frame (8-bit samples)
 * stepBytesRF - distance in bytes between consecutive rows of refFrame
 * curBlock    - top-left pixel of the 16x4 strip of current blocks
 * stepBytesCB - distance in bytes between consecutive rows of curBlock
 * matchBlock  - output array of 8 ints: for block n (0..3), matchBlock[2n]
 *               receives the column and matchBlock[2n+1] the row of its best
 *               window; entries are left untouched when there is no candidate
 * frameWidth  - searchable width of the reference frame in pixels
 * frameHeight - searchable height of the reference frame in pixels
 *
 * Always returns 0.
 */
BM4X4_SSE41_TARGET
int blockMatch4x4SSE4_opted(const unsigned char* refFrame, int stepBytesRF,
                            const unsigned char* curBlock, int stepBytesCB,
                            int* matchBlock, int frameWidth, int frameHeight)
{
    unsigned int lowSum[4] = {UINT_MAX, UINT_MAX, UINT_MAX, UINT_MAX};
    const int blockHeight = 4;
    const int blockWidth = 4;
    int i, j, k, n;
    __m128i curRow[4];
    __m128i curTop01, curBot01, curTop23, curBot23;

    /* The current rows never change, so load them once for the whole search. */
    for (k = 0; k < blockHeight; k++)
    {
        curRow[k] = _mm_loadu_si128((const __m128i*)(curBlock + k * stepBytesCB));
    }

    /*
     * Layout for the PSADBW tail loop; each 64-bit half holds two rows of one
     * block:
     *   curTop01 = { block0 rows 0-1 | block1 rows 0-1 }
     *   curBot01 = { block0 rows 2-3 | block1 rows 2-3 }
     *   curTop23 / curBot23: the same for blocks 2 and 3.
     */
    curTop01 = _mm_unpacklo_epi32(curRow[0], curRow[1]);
    curBot01 = _mm_unpacklo_epi32(curRow[2], curRow[3]);
    curTop23 = _mm_unpackhi_epi32(curRow[0], curRow[1]);
    curBot23 = _mm_unpackhi_epi32(curRow[2], curRow[3]);

    for (i = 0; i <= frameHeight - blockHeight; i++)
    {
        /* Main loop: 8 candidate columns per iteration, 16-byte loads. */
        for (j = 0; j <= frameWidth - 16; j += 8)
        {
            const unsigned char *pRef = refFrame + i * stepBytesRF + j;
            __m128i acc[4];

            acc[0] = _mm_setzero_si128();
            acc[1] = _mm_setzero_si128();
            acc[2] = _mm_setzero_si128();
            acc[3] = _mm_setzero_si128();

            for (k = 0; k < blockHeight; k++)
            {
                const __m128i refRow = _mm_loadu_si128((const __m128i*)pRef);

                /*
                 * Immediate n selects the n-th 4-byte group of the current
                 * row (block n); bit 2 is clear, so the sliding window starts
                 * at byte 0 of refRow.
                 */
                acc[0] = _mm_adds_epu16(acc[0], _mm_mpsadbw_epu8(refRow, curRow[k], 0));
                acc[1] = _mm_adds_epu16(acc[1], _mm_mpsadbw_epu8(refRow, curRow[k], 1));
                acc[2] = _mm_adds_epu16(acc[2], _mm_mpsadbw_epu8(refRow, curRow[k], 2));
                acc[3] = _mm_adds_epu16(acc[3], _mm_mpsadbw_epu8(refRow, curRow[k], 3));
                pRef += stepBytesRF;
            }

            for (n = 0; n < 4; n++)
            {
                /* Word 0: minimum of the 8 sums; word 1: its index (0..7). */
                const __m128i best = _mm_minpos_epu16(acc[n]);
                const unsigned int sad = (unsigned int)_mm_extract_epi16(best, 0);

                if (sad < lowSum[n])
                {
                    lowSum[n] = sad;
                    matchBlock[2 * n] = j + _mm_extract_epi16(best, 1);
                    matchBlock[2 * n + 1] = i;
                }
            }
        }

        /* Tail: remaining columns, one candidate position at a time. */
        for (; j <= frameWidth - blockWidth; j++)
        {
            const unsigned char *pRef = refFrame + i * stepBytesRF + j;
            unsigned int sad[4];
            __m128i refTop, refBot, sad01, sad23;

            refTop = _mm_unpacklo_epi32(
                _mm_cvtsi32_si128(bm4x4Sse4Load32(pRef)),
                _mm_cvtsi32_si128(bm4x4Sse4Load32(pRef + stepBytesRF)));
            refTop = _mm_shuffle_epi32(refTop, 0x44);

            refBot = _mm_unpacklo_epi32(
                _mm_cvtsi32_si128(bm4x4Sse4Load32(pRef + 2 * stepBytesRF)),
                _mm_cvtsi32_si128(bm4x4Sse4Load32(pRef + 3 * stepBytesRF)));
            refBot = _mm_shuffle_epi32(refBot, 0x44);

            /* PSADBW yields one sum per 64-bit half, in words 0 and 4. */
            sad01 = _mm_adds_epu16(_mm_sad_epu8(refTop, curTop01),
                                   _mm_sad_epu8(refBot, curBot01));
            sad23 = _mm_adds_epu16(_mm_sad_epu8(refTop, curTop23),
                                   _mm_sad_epu8(refBot, curBot23));

            sad[0] = (unsigned int)_mm_extract_epi16(sad01, 0);
            sad[1] = (unsigned int)_mm_extract_epi16(sad01, 4);
            sad[2] = (unsigned int)_mm_extract_epi16(sad23, 0);
            sad[3] = (unsigned int)_mm_extract_epi16(sad23, 4);

            for (n = 0; n < 4; n++)
            {
                if (sad[n] < lowSum[n])
                {
                    lowSum[n] = sad[n];
                    matchBlock[2 * n] = j;
                    matchBlock[2 * n + 1] = i;
                }
            }
        }
    }
    return 0;
}