下載IK源碼
https://github.com/medcl/elasticsearch-analysis-ik/tree/v5.2.0
選擇你對應ik的版本(ps:版本最好一致)
http://localhost:9200/?pretty查看es版本 我的是6.5.1
修改源碼
1.創建一個ext包同時增加3個類文件
DBHelper
package org.wltea.analyzer.ext;

import org.apache.logging.log4j.Logger;
import org.elasticsearch.common.logging.Loggers;

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.HashMap;
import java.util.Map;

/**
 * JDBC helper that pulls IK dictionary words from a MySQL table.
 * The static connection settings are populated from jdbc.properties by
 * Dictionary.initReloadMysqlWordJob() before the first call to getKey().
 */
public class DBHelper {

    Logger logger = Loggers.getLogger(DBHelper.class);

    // Filled in from jdbc.properties by the dictionary bootstrap code.
    public static String url = null;
    public static String dbUser = null;
    public static String dbPwd = null;
    public static String dbTable = null;

    private Connection conn;

    // Last successful fetch time per word column; enables incremental loads.
    public static Map<String, Date> lastImportTimeMap = new HashMap<String, Date>();

    static {
        try {
            Class.forName("com.mysql.jdbc.Driver"); // load the MySQL driver once
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Opens a new connection using the statically configured credentials.
     * Returns null (and logs) when the connection cannot be established.
     */
    private Connection getConn() throws Exception {
        try {
            conn = DriverManager.getConnection(url, dbUser, dbPwd);
        } catch (Exception e) {
            logger.warn("failed to open db connection", e);
        }
        return conn;
    }

    /**
     * Fetches a separator-joined word list from the dictionary table.
     *
     * @param key       column name to read (e.g. "word")
     * @param type      word type: 0 = extension word, 1 = stop word; null = both
     * @param delete    true: fetch logically deleted rows (delete_type=1),
     *                  false: fetch active rows (delete_type=0)
     * @param flag      true: incremental — only rows updated since the last fetch
     * @param synonyStr separator appended after each word; falls back to ","
     * @return the concatenated word list, possibly empty, never null
     */
    public String getKey(String key, Integer type, boolean delete, boolean flag, String synonyStr) throws Exception {
        conn = getConn();
        StringBuilder data = new StringBuilder();
        PreparedStatement ps = null;
        ResultSet rs = null;
        try {
            // NOTE(review): table and column names are concatenated into the SQL.
            // They come from jdbc.properties, which must stay admin-controlled;
            // identifiers cannot be bound as '?' parameters.
            StringBuilder sql = new StringBuilder("select * from " + dbTable + " where 1=1");
            Date lastImportTime = DBHelper.lastImportTimeMap.get(key);
            SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
            if (lastImportTime != null && flag) {
                // Incremental load: only rows touched since the previous run.
                sql.append(" and update_time > '" + sdf.format(lastImportTime) + "'");
            }
            sql.append(" and " + key + " !=''");
            if (type != null) {
                // BUGFIX: original appended "and word_type=" without a leading
                // space, producing invalid SQL such as "...!=''and word_type=0".
                sql.append(" and word_type=" + type);
            }
            sql.append(" and delete_type=" + (delete ? 1 : 0));
            lastImportTime = new Date();
            lastImportTimeMap.put(key, lastImportTime);
            // If this logged time differs from wall-clock time, check that the
            // JVM timezone matches the server/system timezone.
            logger.warn("sql==={}", sql.toString());
            ps = conn.prepareStatement(sql.toString());
            rs = ps.executeQuery();
            while (rs.next()) {
                String value = rs.getString(key);
                if (StringUtils.isNotBlank(value)) {
                    if (StringUtils.isNotBlank(synonyStr)) {
                        data.append(value + synonyStr);
                    } else {
                        data.append(value + ",");
                    }
                }
            }
        } catch (Exception e) {
            logger.warn("failed to load words from db", e);
        } finally {
            // Close in reverse order of acquisition; guard each close so one
            // failure does not leak the remaining resources.
            try { if (rs != null) rs.close(); } catch (Exception e) { e.printStackTrace(); }
            try { if (ps != null) ps.close(); } catch (Exception e) { e.printStackTrace(); }
            try { if (conn != null) conn.close(); } catch (Exception e) { e.printStackTrace(); }
        }
        return data.toString();
    }
}
DBRunnable
package org.wltea.analyzer.ext;

import org.apache.logging.log4j.Logger;
import org.elasticsearch.common.logging.Loggers;
import org.wltea.analyzer.dic.Dictionary;

import java.util.Arrays;
import java.util.List;

/**
 * Scheduled task that refreshes the IK dictionary from the database:
 * loads new extension/stop words and removes logically deleted ones.
 */
public class DBRunnable implements Runnable {

    Logger logger = Loggers.getLogger(DBRunnable.class);

    // Column in the dictionary table that holds the word text.
    private String wordField;

    public DBRunnable(String wordField) {
        super();
        this.wordField = wordField;
    }

    @Override
    public void run() {
        logger.warn("開始加載詞庫========");
        Dictionary dic = Dictionary.getSingleton();
        DBHelper dbHelper = new DBHelper();
        try {
            // type 0/1 selects extension vs. stop words; the third arg selects
            // active (false) vs. logically deleted (true) rows.
            String extWords = dbHelper.getKey(wordField, 0, false, true, ",");
            String stopWords = dbHelper.getKey(wordField, 1, false, true, ",");
            String extDelWords = dbHelper.getKey(wordField, 0, true, true, ",");
            String extStopWords = dbHelper.getKey(wordField, 1, true, true, ",");
            if (StringUtils.isNotBlank(extWords)) {
                List<String> extList = Arrays.asList(extWords.split(","));
                // Add extension words to the main dictionary.
                dic.addWords(extList);
                logger.warn("加載擴展詞成功========");
                logger.warn("extWords為==={}", extWords);
            }
            if (StringUtils.isNotBlank(stopWords)) {
                List<String> stopList = Arrays.asList(stopWords.split(","));
                // Add stop words to the stop-word dictionary.
                dic.addStopWords(stopList);
                logger.warn("加載停用詞成功========");
                logger.warn("stopWords為==={}", stopWords);
            }
            if (StringUtils.isNotBlank(extDelWords)) {
                List<String> delList = Arrays.asList(extDelWords.split(","));
                // Disable extension words flagged as deleted in the table.
                dic.disableWords(delList);
                logger.warn("移除擴展詞成功========");
                logger.warn("extDelWords==={}", extDelWords);
            }
            if (StringUtils.isNotBlank(extStopWords)) {
                List<String> delStopList = Arrays.asList(extStopWords.split(","));
                // Disable stop words flagged as deleted in the table.
                dic.disableStopWords(delStopList);
                logger.warn("移除停用詞成功========");
                logger.warn("extStopWords==={}", extStopWords);
            }
        } catch (Exception e) {
            // BUGFIX: pass the throwable as the last argument so the stack
            // trace is logged (the original used a "{}" placeholder for it).
            logger.warn("加載擴展詞失敗========", e);
        }
    }
}
StringUtils
package org.wltea.analyzer.ext; public class StringUtils { /** * 判斷字符串是否為空 為空返回true 否則返回false * @param str * @return */ public static boolean isBlank(String str) { int strLen; if (str == null || (strLen = str.length()) == 0) { return true; } for (int i = 0; i < strLen; i++) { if ((Character.isWhitespace(str.charAt(i)) == false)) { return false; } } return true; } /** * 判斷字符串是否不為空 為空返回false 否則返回true * @param str * @return */ public static boolean isNotBlank(String str) { return !StringUtils.isBlank(str); } }
2.Dictionary增加幾個方法
/** * 批量加載新停用詞條 * * @param words * Collection<String>詞條列表 */ public void addStopWords(Collection<String> words) { if (words != null) { for (String word : words) { if (word != null) { // 批量加載詞條到主內存詞典中 _StopWords.fillSegment(word.trim().toCharArray()); } } } } /** * 批量移除停用詞條 * * @param words * Collection<String>詞條列表 */ public void disableStopWords(Collection<String> words) { if (words != null) { for (String word : words) { if (word != null) { // 批量加載詞條到主內存詞典中 _StopWords.disableSegment(word.trim().toCharArray()); } } } } /** * 讀取jdbc配置初始化 定時更新數據庫詞組定時任務 * * @throws IOException */ public void initReloadMysqlWordJob() throws IOException { logger.warn("============IKAnalyzer=============="); Path file = PathUtils.get(getDictRoot(), "jdbc.properties"); Properties prop = new Properties(); prop.load(new FileInputStream(file.toFile())); logger.info("===========load jdbc.properties========"); for(Object key : prop.keySet()) { logger.info("==========>>" + key + "=" + prop.getProperty(String.valueOf(key))); } boolean autoReloadDic=Boolean.valueOf(prop.getProperty("autoReloadDic")); if(autoReloadDic){ String dbUser = prop.getProperty("dbUser"); String dbPwd = prop.getProperty("dbPwd"); //獲取每隔多久從數據庫更新信息 默認60S Integer flushTime = Integer.valueOf(prop.getProperty("flushTime")); String dbTable = prop.getProperty("dbTable","t_es_ik_dic"); DBHelper.dbTable=dbTable; DBHelper.dbUser=dbUser; DBHelper.dbPwd=dbPwd; DBHelper.url=prop.getProperty("dbUrl"); String wordFieldName = prop.getProperty("wordFieldName"); ScheduledExecutorService scheduledExecutorService = Executors.newSingleThreadScheduledExecutor(); scheduledExecutorService.scheduleAtFixedRate(new DBRunnable(wordFieldName), 0, flushTime, TimeUnit.SECONDS); } }
3.在initial方法啟用job
public static synchronized Dictionary initial(Configuration cfg) { if (singleton == null) { synchronized (Dictionary.class) { if (singleton == null) { singleton = new Dictionary(cfg); singleton.loadMainDict(); singleton.loadSurnameDict(); singleton.loadQuantifierDict(); singleton.loadSuffixDict(); singleton.loadPrepDict(); singleton.loadStopWordDict(); try { singleton.initReloadMysqlWordJob(); } catch (IOException e) { logger.error("動態加載mysql詞組失敗...."); e.printStackTrace(); } if(cfg.isEnableRemoteDict()){ // 建立監控線程 for (String location : singleton.getRemoteExtDictionarys()) { // 10 秒是初始延遲可以修改的 60是間隔時間 單位秒 pool.scheduleAtFixedRate(new Monitor(location), 10, 60, TimeUnit.SECONDS); } for (String location : singleton.getRemoteExtStopWordDictionarys()) { pool.scheduleAtFixedRate(new Monitor(location), 10, 60, TimeUnit.SECONDS); } } return singleton; } } } return singleton; }
將ik安裝導入es
1.打包
2.將zip文件移動到es的plugins文件夾
解壓並重命名為ik
3.ik目錄的config創建一個jdbc.properties文件
# 注意:properties文件中行內的 # 會被當作值的一部分,註釋必須單獨成行
# 數據庫連接(格式為 ip:port/庫名,不是 ip/port)
dbUrl=jdbc:mysql://ip:port/dbname
# 數據庫用戶名
dbUser=user
# 數據庫密碼
dbPwd=password
# 詞庫表
dbTable=md_es_ik_dic
# 詞組字段
wordFieldName=word
# 刷新時間(秒)
flushTime=5
# 是否啟用
autoReloadDic=true
4.創建數據庫表
DROP TABLE IF EXISTS `md_es_ik_dic`;
CREATE TABLE `md_es_ik_dic` (
  `id` int(11) NOT NULL AUTO_INCREMENT COMMENT '自增id',
  `word` varchar(100) DEFAULT '' COMMENT '擴展分詞',
  -- BUGFIX: DBHelper queries this column numerically (word_type=0),
  -- so it must be an integer type, not varchar(100).
  `word_type` tinyint(4) DEFAULT '0' COMMENT '0:擴展分詞 1:停用分詞 ',
  `delete_type` tinyint(4) DEFAULT '0' COMMENT '0表示未刪除,1表示刪除',
  `create_time` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP COMMENT '創建時間',
  `update_time` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT '更新時間',
  PRIMARY KEY (`id`)
) ENGINE=InnoDB AUTO_INCREMENT=16 DEFAULT CHARSET=utf8 COMMENT='詞庫維護表';
5.es lib增加一個mysql數據庫驅動文件
6.啟動es測試
get請求es:http://127.0.0.1:9200/_analyze
{ "analyzer":"ik_max_word", "text":"我是一名小正太" }
分詞結果
{ "tokens": [ { "token": "我", "start_offset": 0, "end_offset": 1, "type": "CN_CHAR", "position": 0 }, { "token": "是", "start_offset": 1, "end_offset": 2, "type": "CN_CHAR", "position": 1 }, { "token": "一名", "start_offset": 2, "end_offset": 4, "type": "CN_WORD", "position": 2 }, { "token": "一", "start_offset": 2, "end_offset": 3, "type": "TYPE_CNUM", "position": 3 }, { "token": "名", "start_offset": 3, "end_offset": 4, "type": "COUNT", "position": 4 }, { "token": "小", "start_offset": 4, "end_offset": 5, "type": "CN_CHAR", "position": 5 }, { "token": "正", "start_offset": 5, "end_offset": 6, "type": "CN_CHAR", "position": 6 }, { "token": "太", "start_offset": 6, "end_offset": 7, "type": "CN_CHAR", "position": 7 } ] }
如果我們需要小正太分詞也分一個詞在數據庫新增
es日誌打印(可在日誌中看到定時任務刷新詞庫時執行的sql)
再次測試分詞結果
{ "tokens": [ { "token": "我", "start_offset": 0, "end_offset": 1, "type": "CN_CHAR", "position": 0 }, { "token": "是", "start_offset": 1, "end_offset": 2, "type": "CN_CHAR", "position": 1 }, { "token": "一名", "start_offset": 2, "end_offset": 4, "type": "CN_WORD", "position": 2 }, { "token": "一", "start_offset": 2, "end_offset": 3, "type": "TYPE_CNUM", "position": 3 }, { "token": "名", "start_offset": 3, "end_offset": 4, "type": "COUNT", "position": 4 }, { "token": "小正太", "start_offset": 4, "end_offset": 7, "type": "CN_WORD", "position": 5 } ] }
可以看到小正太分成了一個詞
可能遇到的問題
啟動報錯:Plugin [analysis-ik] was built for Elasticsearch version 6.5.0 but version 6.5.1 is running
因為要求es版本和ik版本要完全一致,可以嘗試一下修改ik目錄下的plugin-descriptor.properties
改成es版本
找不到數據庫驅動
ik的pom.xml增加數據庫驅動依賴,並在es的lib目錄放入數據庫驅動jar
連接報錯:The last packet sent successfully to the server was 0 milliseconds ago. The driver has not received any packets from the server.(檢查dbUrl中的ip:port是否正確、數據庫是否可達,以及下面的socket權限配置)
permission java.net.SocketPermission "ip:port", "listen,accept,connect,resolve";
修改jre下的lib/security java.policy
我的是在:/Library/Java/JavaVirtualMachines/jdk1.8.0_181.jdk/Contents/Home/jre/lib/security
增加:
permission java.net.SocketPermission "ip:port", "listen,accept,connect,resolve";
可能會出現當前文件只讀 切換為root權限修改即可