elasticsearch實戰 修改IK源碼實現詞組動態更新


下載IK源碼

https://github.com/medcl/elasticsearch-analysis-ik/tree/v5.2.0

選擇你對應ik的版本(ps:版本最好一致)

http://localhost:9200/?pretty查看es版本 我的是6.5.1

修改源碼

1.創建一個ext包同時增加3個類文件

DBHelper

package org.wltea.analyzer.ext;

import org.apache.logging.log4j.Logger;
import org.elasticsearch.common.logging.Loggers;

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.HashMap;
import java.util.Map;

public class DBHelper {
    Logger logger = Loggers.getLogger(DBHelper.class);

    public static String url = null;
    public static String dbUser = null;
    public static String dbPwd = null;
    public static String dbTable = null;
    private Connection conn;
    // Per word-field timestamp of the last successful fetch, so that when
    // incremental loading is enabled only rows updated since then are re-read.
    public static Map<String, Date> lastImportTimeMap = new HashMap<String, Date>();

    static {
        try {
            Class.forName("com.mysql.jdbc.Driver"); // load the MySQL JDBC driver
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    private Connection getConn() throws Exception {
        try {
            conn = DriverManager.getConnection(url, dbUser, dbPwd); // open connection
        } catch (Exception e) {
            logger.warn("異常了");
            e.printStackTrace();
        }
        return conn;
    }

    /**
     * Reads dictionary words from the database and joins them into one string.
     *
     * @param key       column name that holds the word text
     * @param type      word type: 0 = extension word, 1 = stop word, null = no filter
     * @param delete    true fetches deleted rows (delete_type=1), false active rows (delete_type=0)
     * @param flag      true = incremental: only rows with update_time after the last fetch
     * @param synonyStr separator appended after each word; "," is used when blank
     * @return all matching words joined by the separator (trailing separator included)
     * @throws Exception if the connection cannot be established
     */
    public String getKey(String key, Integer type, boolean delete, boolean flag, String synonyStr) throws Exception {
        conn = getConn();
        StringBuilder data = new StringBuilder();
        PreparedStatement ps = null;
        ResultSet rs = null;
        try {
            // NOTE(review): table and column names cannot be bound as JDBC
            // parameters; they come from trusted jdbc.properties configuration,
            // not user input. All VALUES are bound as parameters below.
            StringBuilder sql = new StringBuilder("select * from " + dbTable + " where " + key + " != ''");
            Date lastImportTime = lastImportTimeMap.get(key);
            boolean incremental = lastImportTime != null && flag;
            if (incremental) {
                sql.append(" and update_time > ?");
            }
            if (type != null) {
                // original code was missing the leading space here, producing invalid SQL
                sql.append(" and word_type = ?");
            }
            sql.append(" and delete_type = ?");
            // Record the fetch time before running the query so rows updated
            // while the query runs are picked up on the next pass.
            lastImportTimeMap.put(key, new Date());
            // If the printed time differs from local time, check that the JVM
            // time zone matches the server's.
            logger.warn("sql==={}", sql.toString());
            ps = conn.prepareStatement(sql.toString());
            int idx = 1;
            if (incremental) {
                ps.setTimestamp(idx++, new java.sql.Timestamp(lastImportTime.getTime()));
            }
            if (type != null) {
                ps.setInt(idx++, type);
            }
            ps.setInt(idx, delete ? 1 : 0);
            rs = ps.executeQuery();
            String sep = StringUtils.isNotBlank(synonyStr) ? synonyStr : ",";
            while (rs.next()) {
                String value = rs.getString(key);
                if (StringUtils.isNotBlank(value)) {
                    data.append(value).append(sep);
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            // Close in reverse order of acquisition; each close is guarded so
            // one failure cannot prevent the others from closing.
            if (rs != null) {
                try { rs.close(); } catch (Exception e) { e.printStackTrace(); }
            }
            if (ps != null) {
                try { ps.close(); } catch (Exception e) { e.printStackTrace(); }
            }
            if (conn != null) {
                try { conn.close(); } catch (Exception e) { e.printStackTrace(); }
            }
        }
        return data.toString();
    }
}

DBRunnable

package org.wltea.analyzer.ext;


import org.apache.logging.log4j.Logger;
import org.elasticsearch.common.logging.Loggers;
import org.wltea.analyzer.dic.Dictionary;

import java.util.Arrays;
import java.util.List;

public class DBRunnable implements Runnable {
    Logger logger = Loggers.getLogger(DBRunnable.class);
    // Column name in the dictionary table that holds the word text.
    private String wordField;

    public DBRunnable(String wordField) {
        super();
        this.wordField = wordField;
    }

    /**
     * Periodic reload job: fetches four word categories from the database
     * (active/deleted extension words, active/deleted stop words) and applies
     * them to the singleton IK dictionary. Any failure is logged and swallowed
     * so the scheduled task keeps running.
     */
    @Override
    public void run() {
        logger.warn("開始加載詞庫========");
        // obtain the shared dictionary singleton
        Dictionary dic = Dictionary.getSingleton();
        DBHelper dbHelper = new DBHelper();
        try {
            String extWords = dbHelper.getKey(wordField, 0, false, true, ",");
            String stopWords = dbHelper.getKey(wordField, 1, false, true, ",");
            String extDelWords = dbHelper.getKey(wordField, 0, true, true, ",");
            String extStopWords = dbHelper.getKey(wordField, 1, true, true, ",");
            if (StringUtils.isNotBlank(extWords)) {
                // add new extension words to the main dictionary
                dic.addWords(toList(extWords));
                logger.warn("加載擴展詞成功========");
                logger.warn("extWords為==={}", extWords);
            }
            if (StringUtils.isNotBlank(stopWords)) {
                // add new stop words to the stop-word dictionary
                dic.addStopWords(toList(stopWords));
                logger.warn("加載停用詞成功========");
                logger.warn("stopWords為==={}", stopWords);
            }
            if (StringUtils.isNotBlank(extDelWords)) {
                // disable extension words flagged as deleted in the database
                dic.disableWords(toList(extDelWords));
                logger.warn("移除擴展詞成功========");
                logger.warn("extDelWords==={}", extDelWords);
            }
            if (StringUtils.isNotBlank(extStopWords)) {
                // disable stop words flagged as deleted in the database
                dic.disableStopWords(toList(extStopWords));
                logger.warn("移除停用詞成功========");
                logger.warn("extStopWords==={}", extStopWords);
            }
        } catch (Exception e) {
            logger.warn("加載擴展詞失敗========{}", e);
        }
    }

    /** Splits a comma-joined word string into a list. */
    private static List<String> toList(String words) {
        return Arrays.asList(words.split(","));
    }
}

StringUtils

package org.wltea.analyzer.ext;


public class StringUtils {

    // Utility class: not meant to be instantiated.
    private StringUtils() {
    }

    /**
     * Returns true when the string is null, empty, or contains only
     * whitespace characters; false otherwise.
     *
     * @param str string to test, may be null
     * @return true if blank
     */
    public static boolean isBlank(String str) {
        int strLen;
        if (str == null || (strLen = str.length()) == 0) {
            return true;
        }
        for (int i = 0; i < strLen; i++) {
            if (!Character.isWhitespace(str.charAt(i))) {
                return false;
            }
        }
        return true;
    }

    /**
     * Returns true when the string contains at least one non-whitespace
     * character; false when it is null, empty, or whitespace-only.
     *
     * @param str string to test, may be null
     * @return true if not blank
     */
    public static boolean isNotBlank(String str) {
        return !isBlank(str);
    }
}

2.Dictionary增加幾個方法

/**
     * Adds a batch of new stop words to the in-memory stop-word dictionary.
     * Null collection and null elements are silently skipped.
     *
     * @param words
     *            Collection&lt;String&gt; of stop words to enable
     */
    public void addStopWords(Collection<String> words) {
        if (words != null) {
            for (String word : words) {
                if (word != null) {
                    // insert the trimmed word into the stop-word trie
                    // (_StopWords, not the main dictionary)
                    _StopWords.fillSegment(word.trim().toCharArray());
                }
            }
        }

    }
    /**
     * Disables a batch of stop words in the in-memory stop-word dictionary.
     * Null collection and null elements are silently skipped.
     *
     * @param words
     *            Collection&lt;String&gt; of stop words to disable
     */
    public void disableStopWords(Collection<String> words) {
        if (words != null) {
            for (String word : words) {
                if (word != null) {
                    // mark the word's segment disabled in the stop-word trie
                    // rather than physically removing it
                    _StopWords.disableSegment(word.trim().toCharArray());
                }
            }
        }

    }
    /**
     * Reads jdbc.properties from the dictionary config directory and, when
     * autoReloadDic=true, schedules a single-threaded periodic job that
     * refreshes the IK dictionaries from the database every flushTime seconds.
     *
     * @throws IOException if jdbc.properties cannot be read
     */
    public  void initReloadMysqlWordJob() throws IOException {

        logger.warn("============IKAnalyzer==============");
        Path file = PathUtils.get(getDictRoot(), "jdbc.properties");
        Properties prop = new Properties();
        // try-with-resources: the original version leaked this stream
        try (FileInputStream in = new FileInputStream(file.toFile())) {
            prop.load(in);
        }
        logger.info("===========load jdbc.properties========");
        for (Object key : prop.keySet()) {
            logger.info("==========>>" + key + "=" + prop.getProperty(String.valueOf(key)));
        }
        boolean autoReloadDic = Boolean.valueOf(prop.getProperty("autoReloadDic"));
        if (autoReloadDic) {
            String dbUser = prop.getProperty("dbUser");
            String dbPwd = prop.getProperty("dbPwd");
            // refresh interval in seconds; default 60 if not configured
            // (the original called Integer.valueOf(null) and NPE'd without it)
            Integer flushTime = Integer.valueOf(prop.getProperty("flushTime", "60"));
            String dbTable = prop.getProperty("dbTable", "t_es_ik_dic");
            DBHelper.dbTable = dbTable;
            DBHelper.dbUser = dbUser;
            DBHelper.dbPwd = dbPwd;
            DBHelper.url = prop.getProperty("dbUrl");
            String wordFieldName = prop.getProperty("wordFieldName");
            // NOTE(review): the executor is never retained or shut down; it
            // lives for the lifetime of the plugin by design — confirm.
            ScheduledExecutorService scheduledExecutorService = Executors.newSingleThreadScheduledExecutor();
            scheduledExecutorService.scheduleAtFixedRate(new DBRunnable(wordFieldName), 0, flushTime, TimeUnit.SECONDS);
        }
    }

3.在init方法啟用job

public static synchronized Dictionary initial(Configuration cfg) {
        if (singleton == null) {
            // Double-checked locking; the method itself is already
            // synchronized, so the inner block is redundant but harmless.
            synchronized (Dictionary.class) {
                if (singleton == null) {

                    // Load all built-in dictionaries before starting any
                    // background reload jobs, so the first queries see a
                    // fully-initialized dictionary.
                    singleton = new Dictionary(cfg);
                    singleton.loadMainDict();
                    singleton.loadSurnameDict();
                    singleton.loadQuantifierDict();
                    singleton.loadSuffixDict();
                    singleton.loadPrepDict();
                    singleton.loadStopWordDict();
                    // Start the MySQL-backed dictionary refresh job; a failure
                    // here is logged but does not abort initialization.
                    try {
                        singleton.initReloadMysqlWordJob();
                    } catch (IOException e) {
                        logger.error("動態加載mysql詞組失敗....");
                        e.printStackTrace();
                    }
                    if(cfg.isEnableRemoteDict()){
                        // start monitor threads for each remote dictionary
                        for (String location : singleton.getRemoteExtDictionarys()) {
                            // 10s initial delay (adjustable), 60s interval
                            pool.scheduleAtFixedRate(new Monitor(location), 10, 60, TimeUnit.SECONDS);
                        }
                        for (String location : singleton.getRemoteExtStopWordDictionarys()) {
                            pool.scheduleAtFixedRate(new Monitor(location), 10, 60, TimeUnit.SECONDS);
                        }
                    }

                    return singleton;
                }
            }
        }
        return singleton;
    }

將ik安裝導入es

1.打包

2.將zip文件移動到es的plugins文件夾

解壓並重命名為ik

3.ik目錄的config創建一個jdbc.properties文件

dbUrl=jdbc:mysql://ip:port/數據庫名 #數據庫連接
dbUser=user #數據庫用戶名
dbPwd=password #數據庫密碼
dbTable=md_es_ik_dic #詞庫表
wordFieldName=word #詞組字段
flushTime=5 #刷新時間 (秒)
autoReloadDic=true #是否啟用

 

4.創建數據庫表

DROP TABLE IF EXISTS `md_es_ik_dic`;
CREATE TABLE `md_es_ik_dic` (
  `id` int(11) NOT NULL AUTO_INCREMENT COMMENT '自增id',
  `word` varchar(100) DEFAULT '' COMMENT '擴展分詞',
  `word_type` varchar(100) DEFAULT '' COMMENT '0:擴展分詞  1:停用分詞 ',
  `delete_type` tinyint(4) DEFAULT '0' COMMENT '0表示未刪除,1表示刪除',
  `create_time` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP COMMENT '創建時間',
  `update_time` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT '更新時間',
  PRIMARY KEY (`id`)
) ENGINE=InnoDB AUTO_INCREMENT=16 DEFAULT CHARSET=utf8 COMMENT='詞庫維護表';

 5.es lib增加一個mysql數據庫驅動文件

6.啟動es測試

get請求es:http://127.0.0.1:9200/_analyze

{
    "analyzer":"ik_max_word",
    "text":"我是一名小正太"
}

分詞結果

{
    "tokens": [
        {
            "token": "我",
            "start_offset": 0,
            "end_offset": 1,
            "type": "CN_CHAR",
            "position": 0
        },
        {
            "token": "是",
            "start_offset": 1,
            "end_offset": 2,
            "type": "CN_CHAR",
            "position": 1
        },
        {
            "token": "一名",
            "start_offset": 2,
            "end_offset": 4,
            "type": "CN_WORD",
            "position": 2
        },
        {
            "token": "一",
            "start_offset": 2,
            "end_offset": 3,
            "type": "TYPE_CNUM",
            "position": 3
        },
        {
            "token": "名",
            "start_offset": 3,
            "end_offset": 4,
            "type": "COUNT",
            "position": 4
        },
        {
            "token": "小",
            "start_offset": 4,
            "end_offset": 5,
            "type": "CN_CHAR",
            "position": 5
        },
        {
            "token": "正",
            "start_offset": 5,
            "end_offset": 6,
            "type": "CN_CHAR",
            "position": 6
        },
        {
            "token": "太",
            "start_offset": 6,
            "end_offset": 7,
            "type": "CN_CHAR",
            "position": 7
        }
    ]
}

如果我們需要小正太分詞也分一個詞在數據庫新增

 

es日期打印

再次測試分詞結果

{
    "tokens": [
        {
            "token": "我",
            "start_offset": 0,
            "end_offset": 1,
            "type": "CN_CHAR",
            "position": 0
        },
        {
            "token": "是",
            "start_offset": 1,
            "end_offset": 2,
            "type": "CN_CHAR",
            "position": 1
        },
        {
            "token": "一名",
            "start_offset": 2,
            "end_offset": 4,
            "type": "CN_WORD",
            "position": 2
        },
        {
            "token": "一",
            "start_offset": 2,
            "end_offset": 3,
            "type": "TYPE_CNUM",
            "position": 3
        },
        {
            "token": "名",
            "start_offset": 3,
            "end_offset": 4,
            "type": "COUNT",
            "position": 4
        },
        {
            "token": "小正太",
            "start_offset": 4,
            "end_offset": 7,
            "type": "CN_WORD",
            "position": 5
        }
    ]
}

可以看到小正太分成了一個詞

 

可能遇到的問題

啟動報錯:Plugin [analysis-ik] was built for Elasticsearch version 6.5.0 but version 6.5.1 is running

因為要求es版本和ik版本要完全一致,可以嘗試一下修改ik目錄下的plugin-descriptor.properties

改成es版本

 

 

找不到數據庫驅動

ikpom增加數據庫驅動依賴   es lib放入數據庫驅動jar

 

The last packet sent successfully to the server was 0 milliseconds ago. The driver has not received any packets from the server.

  permission java.net.SocketPermission "ip:port", "listen,accept,connect,resolve";

修改jre下的lib/security  java.policy

 

我的是在:/Library/Java/JavaVirtualMachines/jdk1.8.0_181.jdk/Contents/Home/jre/lib/security

增加:

permission java.net.SocketPermission "ip:port", "listen,accept,connect,resolve";

可能會出現當前文件只讀  切換為root權限修改即可

 


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯系本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM