MNN's 8-bit quantization supports two methods: KL divergence and ADMM. This article covers the KL-divergence-based 8-bit quantization.
MNN build and run commands
Build:
cd MNN
mkdir build
cd build
cmake -DMNN_BUILD_QUANTOOLS=ON ..
make -j4
Run:
./quantized.out origin.mnn quan.mnn preprocessConfig.json
Configuration parameters:
{ "format":"RGB", "mean":[ 127.5, 127.5, 127.5 ], "normal":[ 0.00784314, 0.00784314, 0.00784314 ], "width":224, "height":224, "path":"path/to/images/", "used_image_num":500, "feature_quantize_method":"KL", "weight_quantize_method":"MAX_ABS" }
The default feature quantization method is KL divergence.
How MNN quantization works:
Borrowing a few figures from TensorRT to illustrate: converting FP32 to INT8 means representing a tensor with 8 bits instead of 32 bits. The simplest way to do this is linear quantization:
FP32 Tensor (T) = scale_factor(sf) * 8-bit Tensor(t) + FP32_bias (b)
Experiments show that the bias is not actually needed, so it is dropped:
T = sf * t
Here sf is the scaling factor of each tensor in each layer. In practice, MNN computes a separate scaling factor for every channel of a convolution.
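To make the formula concrete, here is a minimal NumPy sketch of symmetric per-channel linear quantization. The function names are made up for illustration and are not MNN APIs:

import numpy as np

# Minimal sketch of symmetric linear quantization, T ≈ sf * t.
# quantize_per_channel / dequantize are illustrative names, not MNN functions.
def quantize_per_channel(weights):
    # weights: (out_channels, ...) float32 tensor
    flat = weights.reshape(weights.shape[0], -1)
    sf = np.max(np.abs(flat), axis=1) / 127.0            # one scale per output channel
    q = np.clip(np.round(flat / sf[:, None]), -127, 127).astype(np.int8)
    return q.reshape(weights.shape), sf

def dequantize(q, sf):
    return q.reshape(q.shape[0], -1) * sf[:, None]        # T = sf * t

w = np.random.standard_normal((8, 3, 3, 3)).astype(np.float32)
qw, sf = quantize_per_channel(w)
print(np.abs(dequantize(qw, sf).reshape(w.shape) - w).max())  # quantization error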
The simplest mapping takes the -|max| and |max| values of a tensor to -127 and 127 and maps everything in between linearly. This mapping is unsaturated, and such a naive mapping can cause a large loss of accuracy. MNN and TensorRT instead do the following:
Rather than mapping |max| to 127, a threshold |T| is chosen: ±|T| is mapped to ±127, and values beyond ±|T| are clipped to ±127. The common practice is to pick the threshold T that minimizes the KL divergence.
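For intuition, here is a small sketch contrasting the two mappings: the unsaturated one derives the scale from |max|, while the saturated one clips at a threshold T before scaling. The threshold here is chosen by hand purely for illustration; MNN and TensorRT search for it with the KL criterion described below.

import numpy as np

x = np.random.standard_normal(10000) * 2.0   # toy activation values

# Unsaturated mapping: |max| -> 127.
sf_max = np.max(np.abs(x)) / 127.0
q_max = np.round(x / sf_max).astype(np.int8)

# Saturated mapping: values beyond ±T are clipped to ±127 (T picked by hand here).
T = 3.0
sf_T = T / 127.0
q_T = np.clip(np.round(x / sf_T), -127, 127).astype(np.int8)

# Compare reconstruction error of the two mappings.
print(np.mean((q_max * sf_max - x) ** 2), np.mean((q_T * sf_T - x) ** 2))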
1. What is KL divergence
KL (Kullback-Leibler) divergence measures how one probability distribution differs from another. It is widely used in probability theory and information theory. In information theory, D(P||Q) is the information lost when the distribution Q is used to approximate the true distribution P, where P is the true distribution and Q is its approximation.
Definition of KL divergence:
D(P||Q) = Σ_i P(i) * log( P(i) / Q(i) )
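A tiny numeric example (illustrative only), computing D(P||Q) for two hand-made distributions with NumPy:

import numpy as np

P = np.array([0.1, 0.4, 0.5])      # "true" distribution
Q = np.array([0.25, 0.25, 0.5])    # approximation of P
kl = np.sum(P * np.log(P / Q))     # D(P||Q)
print(kl)                          # ≈ 0.096; it is 0 only when P == Q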
Python sample code:
import numpy as np


def get_distribution(P):
    # Histogram the absolute values of P into 2048 bins covering [0, max(|P|)].
    pmax = np.max(np.fabs(P))
    distribution = np.zeros(2048)
    interval = 2048 / pmax
    for v in P:
        index = int(np.fabs(v) * interval)
        if index >= 2048:
            index = 2047
        distribution[index] += 1
    return distribution


def kl_divergence(P, Q, length):
    KL = 0.0
    for i in range(length):
        if P[i] == 0.0:
            continue
        if Q[i] == 0.0:
            KL += 1  # penalize bins that Q leaves empty but P does not
        else:
            KL += P[i] * np.log(P[i] / Q[i])
    return KL


def test():
    P = np.random.standard_normal(96 * 3 * 11 * 11)
    Pdistribution = get_distribution(P)
    kl = np.inf

    for k in range(128, 2048):
        # Reference distribution: the first k bins, with the tail folded into the last bin.
        reference_distribution = Pdistribution[:k].copy()
        reference_distribution[k - 1] += sum(Pdistribution[k:])
        interval = k / 128.0

        # Merge the k bins into 128 bins (quantization).
        quantized_distribution = np.zeros(128)
        for i in range(128):
            start = i * interval
            end = (i + 1) * interval
            leftupper = int(np.ceil(start))
            if leftupper > start:
                quantized_distribution[i] += (leftupper - start) * Pdistribution[leftupper - 1]
            rightlower = int(np.floor(end))
            if rightlower < end:
                quantized_distribution[i] += (end - rightlower) * Pdistribution[rightlower]
            quantized_distribution[i] += sum(Pdistribution[leftupper:rightlower])

        # Expand the 128 bins back to k bins so they can be compared with the reference.
        expand_distribution = np.zeros(k)
        for i in range(128):
            start = i * interval
            end = (i + 1) * interval
            leftupper = int(np.ceil(start))
            rightlower = int(np.floor(end))
            count = 0.0
            if leftupper > start and Pdistribution[leftupper - 1] != 0:
                count += leftupper - start
            if rightlower < end and Pdistribution[rightlower] != 0:
                count += end - rightlower
            count += np.count_nonzero(Pdistribution[leftupper:rightlower])
            if count == 0:
                continue
            expandvalue = quantized_distribution[i] / count
            if leftupper > start and Pdistribution[leftupper - 1] != 0:
                expand_distribution[leftupper - 1] += expandvalue * (leftupper - start)
            if rightlower < end and Pdistribution[rightlower] != 0:
                expand_distribution[rightlower] += expandvalue * (end - rightlower)
            for j in range(leftupper, rightlower):
                if Pdistribution[j] != 0:
                    expand_distribution[j] += expandvalue

        tempkl = kl_divergence(reference_distribution, expand_distribution, k)
        if tempkl < kl:
            kl = tempkl
            print('kl: {}, threshold index: {}'.format(kl, k))


if __name__ == "__main__":
    test()
In the quantization process for deep learning, the true distribution P of each tensor is histogrammed into 2048 bins, and Q, the int8 distribution over [0, 127], is used to approximate P.
How MNN computes the KL divergence
1. Obtain the true distribution P:
As the quantization command shows, 500 images are used to approximate the real data distribution: they are run through forward passes to collect the distribution of every layer. The entry point is in Calibration.cpp:
void Calibration::_computeFeatureScaleKL() {
    _computeFeatureMapsRange();
    _collectFeatureMapsDistribution();

    _scales.clear();
    for (auto& iter : _featureInfo) {
        AUTOTIME;
        _scales[iter.first] = iter.second->finishAndCompute();
    }
    //_featureInfo.clear();//No need now
}
_computeFeatureMapsRange records, for every channel of every convolution layer, the minimum and maximum values seen during the forward passes. _collectFeatureMapsDistribution then uses each channel's maximum to build a 2048-bin histogram of that channel's values.
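A rough Python sketch of what this two-pass collection conceptually does (shapes and function names are assumptions for illustration, not the actual MNN code): first track each channel's |max| over all calibration images, then, in a second pass, histogram each channel's absolute values into 2048 bins using that max.

import numpy as np

BIN_NUMBER = 2048

# Pass 1 over all calibration images: track each channel's max |value|.
def update_range(feature, channel_max):
    # feature: one layer's feature map, assumed shape (C, H, W)
    flat = np.abs(feature).reshape(feature.shape[0], -1)
    channel_max[:] = np.maximum(channel_max, flat.max(axis=1))

# Pass 2: accumulate a 2048-bin histogram of |values| for each channel.
def update_distribution(feature, channel_max, histograms):
    for c in range(feature.shape[0]):
        if channel_max[c] == 0:
            continue
        interval = BIN_NUMBER / channel_max[c]
        idx = np.minimum((np.abs(feature[c]) * interval).astype(int), BIN_NUMBER - 1)
        np.add.at(histograms[c], idx.ravel(), 1)

# Usage: channel_max = np.zeros(C); histograms = np.zeros((C, BIN_NUMBER))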
_computeThreshold, called from finishAndCompute, searches for the threshold T that minimizes the KL divergence:
int TensorStatistic::_computeThreshold(const std::vector<float>& distribution) {
    const int targetBinNums = 128;
    int threshold           = targetBinNums;

    if (mThresholdMethod == THRESHOLD_KL) {
        float minKLDivergence   = 10000.0f;
        float afterThresholdSum = 0.0f;
        std::for_each(distribution.begin() + targetBinNums, distribution.end(),
                      [&](float n) { afterThresholdSum += n; });
        for (int i = targetBinNums; i < mBinNumber; ++i) {
            std::vector<float> quantizedDistribution(targetBinNums);
            std::vector<float> candidateDistribution(i);
            std::vector<float> expandedDistribution(i);
            std::copy(distribution.begin(), distribution.begin() + i, candidateDistribution.begin());
            candidateDistribution[i - 1] += afterThresholdSum;
            afterThresholdSum -= distribution[i];

            const float binInterval = (float)i / (float)targetBinNums;

            // merge i bins to target bins
            for (int j = 0; j < targetBinNums; ++j) {
                const float start   = j * binInterval;
                const float end     = start + binInterval;
                const int leftUpper = static_cast<int>(std::ceil(start));
                if (leftUpper > start) {
                    const float leftScale = leftUpper - start;
                    quantizedDistribution[j] += leftScale * distribution[leftUpper - 1];
                }
                const int rightLower = static_cast<int>(std::floor(end));
                if (rightLower < end) {
                    const float rightScale = end - rightLower;
                    quantizedDistribution[j] += rightScale * distribution[rightLower];
                }
                std::for_each(distribution.begin() + leftUpper, distribution.begin() + rightLower,
                              [&](float n) { quantizedDistribution[j] += n; });
            }
            // expand target bins to i bins
            for (int j = 0; j < targetBinNums; ++j) {
                const float start   = j * binInterval;
                const float end     = start + binInterval;
                float count         = 0;
                const int leftUpper = static_cast<int>(std::ceil(start));
                float leftScale     = 0.0f;
                if (leftUpper > start) {
                    leftScale = leftUpper - start;
                    if (distribution[leftUpper - 1] != 0) {
                        count += leftScale;
                    }
                }
                const int rightLower = static_cast<int>(std::floor(end));
                float rightScale     = 0.0f;
                if (rightLower < end) {
                    rightScale = end - rightLower;
                    if (distribution[rightLower] != 0) {
                        count += rightScale;
                    }
                }
                std::for_each(distribution.begin() + leftUpper, distribution.begin() + rightLower, [&](float n) {
                    if (n != 0) {
                        count += 1;
                    }
                });
                if (count == 0) {
                    continue;
                }
                const float toExpandValue = quantizedDistribution[j] / count;
                if (leftUpper > start && distribution[leftUpper - 1] != 0) {
                    expandedDistribution[leftUpper - 1] += toExpandValue * leftScale;
                }
                if (rightLower < end && distribution[rightLower] != 0) {
                    expandedDistribution[rightLower] += toExpandValue * rightScale;
                }
                for (int k = leftUpper; k < rightLower; ++k) {
                    if (distribution[k] != 0) {
                        expandedDistribution[k] += toExpandValue;
                    }
                }
            }
            const float curKL = _klDivergence(candidateDistribution, expandedDistribution);
            // std::cout << "=====> KL: " << i << " ==> " << curKL << std::endl;
            if (curKL < minKLDivergence) {
                minKLDivergence = curKL;
                threshold       = i;
            }
        }
    } else if (mThresholdMethod == THRESHOLD_MAX) {
        threshold = mBinNumber - 1;
    } else {
        // TODO, support other method
        MNN_ASSERT(false);
    }
    return threshold;
}
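The value returned by _computeThreshold is a bin index; the remaining step is turning it into the per-channel quantization scale. A hedged sketch of that conversion follows (the exact formula MNN uses, e.g. the half-bin offset, is an assumption here, and threshold_to_scale is an illustrative name):

def threshold_to_scale(threshold, channel_max, bin_number=2048):
    # threshold: bin index chosen by the KL search; channel_max: the channel's |max|
    bin_width = channel_max / bin_number
    T = (threshold + 0.5) * bin_width   # FP32 value corresponding to the chosen bin
    return T / 127.0                    # so that FP32 value ≈ scale * int8 value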