Lucene 統計單詞出現次數（詞頻 TF）並進行排序

本文轉載自查看原文 2018-09-24 19:39 804 框架類庫/ 搜索

 1 public class WordCount {  2     static Directory directory;  3     // 創建分詞器
 4     static Analyzer analyzer = new IKAnalyzer();  5     static IndexWriterConfig config = new IndexWriterConfig(analyzer);  6     static IndexWriter writer;  7     static IndexReader reader;  8     static {  9         // 指定索引存放目錄以及配置參數
 10         try {  11             directory = FSDirectory.open(Paths.get("F:/luceneIndex"));  12             writer = new IndexWriter(directory, config);  13         } catch (IOException e) {  14  e.printStackTrace();  15  }  16  }  17 
 18     public static void main(String[] args) {  19  indexCreate();  20         Map<String, Long> map = getTotalFreqMap();  21         Map<String, Long> sortMap = sortMapByValue(map);  22         Set<Entry<String, Long>> entrySet = sortMap.entrySet();  23         Iterator<Entry<String, Long>> iterator = entrySet.iterator();  24         while (iterator.hasNext()) {  25             Entry<String, Long> entry = iterator.next();  26             System.out.println(entry.getKey() + "----" + entry.getValue());  27  }  28 
 29  }  30 
 31     /**  32  * 創建索引  33      */
 34     public static void indexCreate() {  35         // 文件夾檢測(創建索引前要保證目錄是空的)
 36         File file = new File("f:/luceneIndex");  37         if (!file.exists()) {  38  file.mkdirs();  39         } else {  40             try {  41  file.delete();  42             } catch (Exception e) {  43  e.printStackTrace();  44  }  45  }  46 
 47         // 將采集的數據封裝到Document中
 48         Document doc = new Document();  49         FieldType ft = new FieldType();  50  ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS);  51         ft.setStored(true);  52         ft.setStoreTermVectors(true);  53         ft.setTokenized(true);  54         // ft.setStoreTermVectorOffsets(true);  55         // ft.setStoreTermVectorPositions(true);  56 
 57         // 讀取文件內容(小文件,readFully)
 58         File content = new File("f:/qz/twitter.txt");  59         try {  60             byte[] buffer = new byte[(int) content.length()];  61             IOUtils.readFully(new FileInputStream(content), buffer);  62             doc.add(new Field("twitter", new String(buffer), ft));  63         } catch (Exception e) {  64  e.printStackTrace();  65  }  66 
 67         // 生成索引
 68         try {  69  writer.addDocument(doc);  70             // 關閉
 71  writer.close();  72 
 73         } catch (IOException e) {  74  e.printStackTrace();  75  }  76  }  77 
 78     /**  79  * 獲得詞頻map  80  *  81  * @throws ParseException  82      */
 83     public static Map<String, Long> getTotalFreqMap() {  84         Map<String, Long> map = new HashMap<String, Long>();  85         try {  86             reader = DirectoryReader.open(directory);  87             List<LeafReaderContext> leaves = reader.leaves();  88             for (LeafReaderContext leafReaderContext : leaves) {  89                 LeafReader leafReader = leafReaderContext.reader();  90 
 91                 Terms terms = leafReader.terms("twitter");  92 
 93                 TermsEnum iterator = terms.iterator();  94 
 95                 BytesRef term = null;  96 
 97                 while ((term = iterator.next()) != null) {  98                     String text = term.utf8ToString();  99  map.put(text, iterator.totalTermFreq()); 100  } 101 
102  } 103  reader.close(); 104             return map; 105         } catch (IOException e) { 106  e.printStackTrace(); 107  } 108         return null; 109  } 110 
111     /** 112  * 使用 Map按value進行排序 113  * 114  * @param map 115  * @return 116      */
117     public static Map<String, Long> sortMapByValue(Map<String, Long> oriMap) { 118         if (oriMap == null || oriMap.isEmpty()) { 119             return null; 120  } 121         Map<String, Long> sortedMap = new LinkedHashMap<String, Long>(); 122 
123         List<Map.Entry<String, Long>> entryList = new ArrayList<Map.Entry<String, Long>>(oriMap.entrySet()); 124         Collections.sort(entryList, new MapValueComparator()); 125 
126         Iterator<Map.Entry<String, Long>> iter = entryList.iterator(); 127         Map.Entry<String, Long> tmpEntry = null; 128         while (iter.hasNext()) { 129             tmpEntry = iter.next(); 130  sortedMap.put(tmpEntry.getKey(), tmpEntry.getValue()); 131  } 132         return sortedMap; 133  } 134 } 135 
/**
 * Orders map entries by their value, largest value first.
 */
class MapValueComparator implements Comparator<Map.Entry<String, Long>> {

    /**
     * Compares two entries by value in descending order.
     *
     * @param me1 the first entry
     * @param me2 the second entry
     * @return a negative number if {@code me1} has the larger value, a positive
     *         number if it has the smaller value, and zero if the values are equal
     */
    @Override
    public int compare(Map.Entry<String, Long> me1, Map.Entry<String, Long> me2) {
        // compareTo compares the numeric values. Using == on boxed Long objects
        // compares references, so equal counts could be reported as different.
        return me2.getValue().compareTo(me1.getValue());
    }
}

Map 排序代碼參考：https://www.cnblogs.com/zhujiabin/p/6164826.html

免責聲明！

本站轉載的文章為個人學習借鑒使用，本站對版權不負任何法律責任。如果侵犯了您的隱私權益，請聯繫本站郵箱 yoyou2525@163.com 刪除。

猜您在找 使用ES對中文文章進行分詞，並進行詞頻統計排序 [Python]統計數列中元素出現的次數並進行排序【shell腳本實例】shell腳本統計單詞頻率、出現次數最多的n個單詞 python簡單詞頻統計 hive進行詞頻統計利用多種方式來統計詞頻（單詞個數） java實現文件單詞頻率統計【學習筆記】C#中HashTable和快速排序的用法，從單詞頻率統計小程序寫起利用python實現簡單詞頻統計、構建詞雲使用storm分別進行計數和詞頻統計