Java實現LSH（Locality Sensitive Hashing）

本文轉載自查看原文 2018-01-07 14:30 1592 Java雜項

　　在對大批量數據進行圖像處理的時候，比如說我提取SIFT特征，數據集為10W張圖片，一個SIFT特征點是128維，一張圖片提取出500個特征點，這樣我們在處理的時候就是對5000萬個128維的數據進行處理，這樣處理所需要的耗時太長了，不符合實際生產的需要。我們需要用一種方法降低運算量，比如說降維。

　　看了一些論文，提到的較多的方法是LSH（Locality Sensitive Hashing），就是局部敏感哈希。我們利用LSH方法在5000萬個特征點中篩選出極少量的我們需要的特征點，再對這些極少量的數據進行計算，就可以得到我們想要的結果啦。

  1 package com.demo.lsh;
  2 
  3 import com.demo.config.Constant;
  4 import com.demo.dao.FeatureDao;
  5 import com.demo.dao.FeatureTableDao;
  6 import com.demo.dao.HashTableDao;
  7 import com.demo.entity.HashTable;
  8 import com.demo.utils.MD5Util;
  9 import com.demo.utils.MathUtil;
 10 import org.opencv.core.Mat;
 11 import org.springframework.util.StringUtils;
 12 
 13 import java.io.*;
 14 import java.security.MessageDigest;
 15 import java.security.NoSuchAlgorithmException;
 16 import java.util.*;
 17 
 18 public class LSH {
 19     //維度大小，例如對於sift特征來說就是128
 20     private int dimention = Constant.DIMENTION;
 21     //所需向量中元素可能的上限，譬如對於RGB來說，就是255
 22     private int max = Constant.MAX;
 23     //哈希表的數量，用於更大程度地削減false positive
 24     private int hashCount = Constant.HASHCOUNT;
 25     //LSH隨機選取的采樣位數，該值越小，則近似查找能力越大，但相應的false positive也越大；若該值等於size，則為由近似查找退化為精確匹配
 26     private int bitCount = Constant.BITCOUNT;
 27     //轉化為01字符串之后的位數，等於max乘以dimensions
 28     private int size = dimention * max;
 29     //LSH哈希族，保存了隨機采樣點的INDEX
 30     private int[][] hashFamily;
 31     private HashTableDao hashTableDao;
 32     /**
 33      * 構造函數
 34      */
 35     public LSH(HashTableDao hashTableDao) {
 36         this.hashTableDao = hashTableDao;
 37         dimention = Constant.DIMENTION;
 38         max = Constant.MAX;
 39         hashCount = Constant.HASHCOUNT;
 40         bitCount = Constant.BITCOUNT;
 41         size = dimention * max;
 42         hashFamily = new int[hashCount][bitCount];
 43         generataHashFamily();
 44     }
 45 
 46     /**
 47      * 生成隨機的投影點 ，在程序第一次執行時生成。投影點可以理解為后面去數組的索引值
 48      */
 49     private void generataHashFamily() {
 50         if (new File("/home/fanxuan/data/1.txt").exists()) {
 51             try {
 52                 InputStream in = new FileInputStream("/home/fanxuan/data/1.txt");
 53                 ObjectInputStream oin = new ObjectInputStream(in);
 54                 hashFamily = (int[][]) (oin.readObject());
 55             } catch (IOException e) {
 56                 e.printStackTrace();
 57             } catch (ClassNotFoundException e) {
 58                 e.printStackTrace();
 59             }
 60         }else {
 61             Random rd = new Random();
 62             for (int i = 0; i < hashCount; i++) {
 63                 for (int j = 0; j < bitCount; j++) {
 64                     hashFamily[i][j] = rd.nextInt(size);
 65                 }
 66             }
 67             try {
 68                 OutputStream out = new FileOutputStream("/home/fanxuan/data/1.txt");
 69                 ObjectOutputStream oout = new ObjectOutputStream(out);
 70                 oout.writeObject(hashFamily);
 71             } catch (FileNotFoundException e) {
 72                 e.printStackTrace();
 73             } catch (IOException e) {
 74                 e.printStackTrace();
 75             }
 76         }
 77     }
 78 
 79     //將向量轉化為二進制字符串，比如元素的最大范圍255，則元素65就被轉化為65個1以及190個0
 80     private int[] unAray(int[] data) {
 81         int unArayData[] = new int[size];
 82         for (int i = 0; i < data.length; i++) {
 83             for (int j = 0; j < data[i]; j++) {
 84                 unArayData[i * max + j] = 1;
 85             }
 86         }
 87         return unArayData;
 88     }
 89 
 90     /**
 91      * 將向量映射為LSH中的key
 92      */
 93     private String generateHashKey(int[] list, int hashNum) {
 94         StringBuilder sb = new StringBuilder();
 95         int[] tempData = unAray(list);
 96         int[] hashedData = new int[bitCount];
 97         //首先將向量轉為二進制字符串
 98         for (int i = 0; i < bitCount; i++) {
 99             hashedData[i] = tempData[hashFamily[hashNum][i]];
100             sb.append(hashedData[i]);
101         }
102         //再用常規hash函數比如MD5對key進行壓縮
103         MessageDigest messageDigest = null;
104         try{
105             messageDigest = MessageDigest.getInstance("MD5");
106         }catch (NoSuchAlgorithmException e) {
107 
108         }
109         byte[] binary = sb.toString().getBytes();
110         byte[] hash = messageDigest.digest(binary);
111         String hashV = MD5Util.bufferToHex(hash);
112         return hashV;
113     }
114 
115     /**
116      * 將Sift特征點轉換為Hash存表
117      */
118     public void generateHashMap(String id, int[] vercotr, int featureId) {
119         for (int j = 0; j < hashCount; j++) {
120             String key = generateHashKey(vercotr, j);
121             HashTable hashTableUpdateOrAdd = new HashTable();
122             HashTable hashTable = hashTableDao.findHashTableByBucketId(key);
123             if (hashTable != null) {
124                 String featureIdValue = hashTable.getFeatureId() + "," + featureId;
125                 hashTableUpdateOrAdd.setFeatureId(featureIdValue);
126                 hashTableUpdateOrAdd.setBucketId(key);
127                 hashTableDao.updateHashTableFeatureId(hashTableUpdateOrAdd);
128             } else {
129                 hashTableUpdateOrAdd.setBucketId(key);
130                 hashTableUpdateOrAdd.setFeatureId(String.valueOf(featureId));
131                 hashTableDao.insertHashTable(hashTableUpdateOrAdd);
132             }
133         }
134     }
135 
136     // 查詢與輸入向量最接近（海明空間）的向量
137     public List<String> queryList(int[] data) {
138         List<String> result = new ArrayList<>();
139         for (int j = 0; j < hashCount; j++) {
140             String key = generateHashKey(data, j);
141             result.add(key);
142             HashTable hashTable = hashTableDao.findHashTableByBucketId(key);
143             if (!StringUtils.isEmpty(hashTable.getFeatureId())) {
144                 String[] str = hashTable.getFeatureId().split(",");
145                 for (String string : str) {
146                     result.add(string);
147                 }
148             }
149         }
150         return result;
151     }
152 
153 }

 1 package com.demo.config;
 2 
 3 public class Constant {
 4     //維度大小，例如對於sift特征來說就是128
 5     public static final int DIMENTION = 128;
 6     //所需向量中元素可能的上限，譬如對於RGB來說，就是255
 7     public static final int MAX = 255;
 8     //哈希表的數量，用於更大程度地削減false positive
 9     public static final int HASHCOUNT = 12;
10     //LSH隨機選取的采樣位數，該值越小，則近似查找能力越大，但相應的false positive也越大；若該值等於size，則為由近似查找退化為精確匹配
11     public static final int BITCOUNT = 32;
12 }

　　簡單的介紹下代碼，構造函數LSH（）用來建立LSH對象，hashTableDao為數據表操作對象，不多說；因為局部敏感哈希依賴於一套隨機數，每次產生的結果都不一致，所以我們需要在程序第一次運行的時候將隨機數生成並固定下來，我采用的方法是存放在本地磁盤中，也可以存放在數據庫中。generateHashMap（）方法為數據訓練函數，int[] vercotr為特征向量，其他兩個參數為我需要的標志位。queryList（）方法是篩選方法。

　　感謝http://grunt1223.iteye.com/blog/944894的文章。

免責聲明！

本站轉載的文章為個人學習借鑒使用，本站對版權不負任何法律責任。如果侵犯了您的隱私權益，請聯系本站郵箱yoyou2525@163.com刪除。

猜您在找 Locality Sensitive Hashing，LSH 從NLP任務中文本向量的降維問題，引出LSH（Locality Sensitive Hash 局部敏感哈希）算法及其思想的討論 Locality Sensitive Hashing(局部敏感哈希)之cross-polytope LSH Spark Locality Sensitive Hashing (LSH)局部哈希敏感局部敏感哈希Locality Sensitive Hashing(LSH)之隨機投影法局部敏感哈希(Locality-Sensitive Hashing, LSH) 局部敏感哈希LSH（Locality-Sensitive Hashing）——海量數據相似性查找技術局部敏感哈希算法(Locality Sensitive Hashing) [Algorithm] 局部敏感哈希算法(Locality Sensitive Hashing) Java中實現hash算法