Elasticsearch教程(二)java集成Elasticsearch


1、添加maven

<!--tika抽取文件內容 -->
<dependency>
    <groupId>org.apache.tika</groupId>
    <artifactId>tika-core</artifactId>
    <version>1.12</version>
</dependency>
<dependency>
    <groupId>org.apache.tika</groupId>
    <artifactId>tika-parsers</artifactId>
    <version>1.12</version>
</dependency>
<!--tika end-->
<!--bboss操作elasticsearch-->
<dependency>
    <groupId>com.bbossgroups.plugins</groupId>
    <artifactId>bboss-elasticsearch-rest-jdbc</artifactId>
    <version>5.5.7</version>
</dependency>

<!--Hanlp自然語言分詞-->
<dependency>
    <groupId>com.hankcs</groupId>
    <artifactId>hanlp</artifactId>
    <version>portable-1.7.1</version>
</dependency>

<!-- httpclient -->
<dependency>
    <groupId>org.apache.httpcomponents</groupId>
    <artifactId>httpclient</artifactId>
    <version>4.5.5</version>
</dependency>

注意:與spring集成時需要注意版本號,版本太高會造成jar包沖突,tika-parsers 依賴poi.jar包,所以項目中不需要單獨添加poi.jar,會造成沖突。

完整的項目elasticsearch-common

pom.xml內容

<?xml version="1.0" encoding="UTF-8"?>

<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>com.hd</groupId>
    <artifactId>elasticsearch-common</artifactId>
    <version>1.0-SNAPSHOT</version>
    <packaging>war</packaging>

    <name>elasticsearch-common Maven Webapp</name>
    <url>http://www.example.com</url>

    <properties>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <maven.compiler.source>1.7</maven.compiler.source>
        <maven.compiler.target>1.7</maven.compiler.target>
        <mysql.version>5.1.40</mysql.version>
        <druid.version>1.0.29</druid.version>
        <spring.version>4.2.3.RELEASE</spring.version>
        <servlet.version>3.0.1</servlet.version>
        <jackson.version>2.8.8</jackson.version>
        <commons-io.version>2.5</commons-io.version>
        <log4j2.version>2.8.2</log4j2.version>
        <hibernate-validator.version>5.3.5.Final</hibernate-validator.version>
        <hibernate.version>4.3.11.Final</hibernate.version>
        <shiro.version>1.3.2</shiro.version>
        <ehcache.version>2.6.11</ehcache.version>
    </properties>

    <dependencies>
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>4.11</version>
            <scope>test</scope>
        </dependency>
        <dependency>
            <groupId>javax.el</groupId>
            <artifactId>javax.el-api</artifactId>
            <version>3.0.0</version>
            <scope>test</scope>
        </dependency>
        <dependency>
            <groupId>org.glassfish</groupId>
            <artifactId>javax.el</artifactId>
            <version>3.0.0</version>
            <scope>test</scope>
        </dependency>
      <!--test end-->
        <!--web begin -->
        <dependency>
            <groupId>javax.servlet</groupId>
            <artifactId>javax.servlet-api</artifactId>
            <version>${servlet.version}</version>
            <scope>provided</scope>
        </dependency>
        <dependency>
            <groupId>javax.servlet</groupId>
            <artifactId>jsp-api</artifactId>
            <version>2.0</version>
            <scope>provided</scope>
        </dependency>
        <dependency>
            <groupId>javax.servlet</groupId>
            <artifactId>jstl</artifactId>
            <version>1.2</version>
        </dependency> 
        <!-- web end -->
        <!-- log4j2 begin -->
        <dependency>
            <groupId>org.apache.logging.log4j</groupId>
            <artifactId>log4j-core</artifactId>
            <version>${log4j2.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.logging.log4j</groupId>
            <artifactId>log4j-jcl</artifactId>
            <version>${log4j2.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.logging.log4j</groupId>
            <artifactId>log4j-slf4j-impl</artifactId>
            <version>${log4j2.version}</version>
        </dependency>
        <!-- log4j2 end -->
        <!-- spring核心包 -->
        <dependency>
            <groupId>org.springframework</groupId>
            <artifactId>spring-core</artifactId>
        </dependency>
        <dependency>
            <groupId>org.springframework</groupId>
            <artifactId>spring-context</artifactId>
        </dependency>
        <dependency>
            <groupId>org.springframework</groupId>
            <artifactId>spring-beans</artifactId>
        </dependency>
        <dependency>
            <groupId>org.springframework</groupId>
            <artifactId>spring-expression</artifactId>
        </dependency>
        <dependency>
            <groupId>org.springframework</groupId>
            <artifactId>spring-jdbc</artifactId>
        </dependency>
        <dependency>
            <groupId>org.springframework</groupId>
            <artifactId>spring-orm</artifactId>
            <version>${spring.version}</version>
        </dependency>
        <dependency>
            <groupId>org.springframework</groupId>
            <artifactId>spring-tx</artifactId>
        </dependency>
        <dependency>
            <groupId>org.springframework</groupId>
            <artifactId>spring-aop</artifactId>
        </dependency>
        <dependency>
            <groupId>org.springframework</groupId>
            <artifactId>spring-web</artifactId>
        </dependency>
        <dependency>
            <groupId>org.springframework</groupId>
            <artifactId>spring-webmvc</artifactId>
        </dependency>
        <dependency>
            <groupId>org.springframework</groupId>
            <artifactId>spring-test</artifactId>
        </dependency>
        <dependency>
            <groupId>org.springframework</groupId>
            <artifactId>spring-aspects</artifactId>
        </dependency>
        <dependency>
            <groupId>org.springframework</groupId>
            <artifactId>spring-context-support</artifactId>
        </dependency>

        <!--上傳組件-->
        <dependency>
            <groupId>commons-io</groupId>
            <artifactId>commons-io</artifactId>
            <version>${commons-io.version}</version>
        </dependency>
        <dependency>
            <groupId>commons-fileupload</groupId>
            <artifactId>commons-fileupload</artifactId>
            <version>1.3.1</version>
        </dependency>

        <dependency>
            <groupId>org.hibernate</groupId>
            <artifactId>hibernate-core</artifactId>
            <version>${hibernate.version}</version>
        </dependency>
        <!--數據庫-->
        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
            <version>${mysql.version}</version>
        </dependency>
        <dependency>
            <groupId>com.alibaba</groupId>
            <artifactId>druid</artifactId>
            <version>${druid.version}</version>
        </dependency>

        <!-- jackson begin -->
        <dependency>
            <groupId>com.fasterxml.jackson.core</groupId>
            <artifactId>jackson-databind</artifactId>
            <version>${jackson.version}</version>
        </dependency>
        <!--fastjson-->
        <dependency>
            <groupId>com.alibaba</groupId>
            <artifactId>fastjson</artifactId>
            <version>1.2.54</version>
        </dependency>
        <!-- httpclient -->
        <dependency>
            <groupId>org.apache.httpcomponents</groupId>
            <artifactId>httpclient</artifactId>
            <version>4.5.5</version>
        </dependency>

        <!--tika抽取文件內容 -->
        <dependency>
            <groupId>org.apache.tika</groupId>
            <artifactId>tika-core</artifactId>
            <version>1.12</version>
        </dependency>
        <dependency>
            <groupId>org.apache.tika</groupId>
            <artifactId>tika-parsers</artifactId>
            <version>1.12</version>
        </dependency>
        <!--tika end-->

        <!--bboss操作elasticsearch-->
        <dependency>
            <groupId>com.bbossgroups.plugins</groupId>
            <artifactId>bboss-elasticsearch-rest-jdbc</artifactId>
            <version>5.5.7</version>
        </dependency>

        <!--Hanlp自然語言分詞-->
        <dependency>
            <groupId>com.hankcs</groupId>
            <artifactId>hanlp</artifactId>
            <version>portable-1.7.1</version>
        </dependency>

        <!-- shiro begin -->
        <dependency>
            <groupId>org.apache.shiro</groupId>
            <artifactId>shiro-spring</artifactId>
            <version>${shiro.version}</version>
            <exclusions>
                <exclusion>
                    <artifactId>slf4j-api</artifactId>
                    <groupId>org.slf4j</groupId>
                </exclusion>
            </exclusions>
        </dependency>

        <!-- hibernate-validator -->
        <dependency>
            <groupId>org.hibernate</groupId>
            <artifactId>hibernate-validator</artifactId>
            <version>${hibernate-validator.version}</version>
        </dependency>

        <dependency>
            <groupId>net.sf.ehcache</groupId>
            <artifactId>ehcache-core</artifactId>
            <version>${ehcache.version}</version>
        </dependency>
        <dependency>
            <groupId>com.googlecode.ehcache-spring-annotations</groupId>
            <artifactId>ehcache-spring-annotations</artifactId>
            <version>1.2.0</version>
        </dependency>

    </dependencies>

    <build>
        <finalName>elasticsearch-common</finalName>
        <plugins>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>3.5.1</version>
                <configuration>
                    <source>${maven.compiler.source}</source>
                    <target>${maven.compiler.target}</target>
                    <encoding>${project.build.sourceEncoding}</encoding>
                </configuration>
            </plugin>
            <!--跳過test begin-->
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-surefire-plugin</artifactId>
                <version>2.4.2</version>
                <configuration>
                    <skip>true</skip>
                </configuration>
            </plugin>
            <!-- jetty:run 添加jetty插件以便啟動 -->
            <plugin>
                <groupId>org.eclipse.jetty</groupId>
                <artifactId>jetty-maven-plugin</artifactId>
                <!-- <version>9.2.12.M0</version> -->
                <version>9.3.10.v20160621</version>
                <configuration>
                    <stopPort>9967</stopPort>
                    <stopKey>stop</stopKey>
                    <scanIntervalSeconds>0</scanIntervalSeconds>
                    <httpConnector>
                        <port>8878</port>
                    </httpConnector>
                    <webApp>
                        <contextPath>/</contextPath>
                    </webApp>
                </configuration>
            </plugin>
            <!-- tomcat7:run -->
            <plugin>
                <groupId>org.apache.tomcat.maven</groupId>
                <artifactId>tomcat7-maven-plugin</artifactId>
                <version>2.2</version>
                <configuration>
                    <port>8878</port>
                    <path>/</path>
                    <uriEncoding>UTF-8</uriEncoding>
                    <server>tomcat7</server>
                </configuration>
                <!-- 配置tomcat熱部署 -->
                <!--<configuration>-->
                <!--<uriEncoding>UTF-8</uriEncoding>-->
                <!--<url>http://localhost:8080/manager/text</url>-->
                <!--<path>/${project.build.finalName}</path>-->
                <!--&lt;!&ndash;<server>tomcat7</server>&ndash;&gt;-->
                <!--<username>tomcat</username>-->
                <!--<password>123456</password>-->
                <!--</configuration>-->
            </plugin>

            <!-- <plugin>
                <groupId>org.zeroturnaround</groupId>
                <artifactId>javarebel-maven-plugin</artifactId>
                <version>1.0.5</version>
                <executions>
                    <execution>
                        <id>generate-rebel-xml</id>
                        <phase>process-resources</phase>
                        <goals>
                            <goal>generate</goal>
                        </goals>
                    </execution>
                </executions>
             </plugin> -->
        </plugins>
    </build>


    <!-- 使用aliyun鏡像 -->
    <repositories>
        <repository>
            <id>aliyun</id>
            <name>aliyun</name>
            <url>https://maven.aliyun.com/repository/public</url>
        </repository>
    </repositories>

    <!-- spring-framework-bom -->
    <dependencyManagement>
        <dependencies>
            <dependency>
                <groupId>org.springframework</groupId>
                <artifactId>spring-framework-bom</artifactId>
                <version>${spring.version}</version>
                <type>pom</type>
                <scope>import</scope>
            </dependency>
        </dependencies>
    </dependencyManagement>
</project>

2、配置文件

elasticsearch.properties文件內容

#elasticUser=elastic
#elasticPassword=hzhh123

elasticsearch.rest.hostNames=127.0.0.1:9200
#elasticsearch.rest.hostNames=192.168.200.82:9200,192.168.200.83:9200,192.168.200.85:9200
elasticsearch.dateFormat=yyyy.MM.dd
elasticsearch.timeZone=Asia/Shanghai
elasticsearch.ttl=2d
#在控制台輸出腳本調試開關showTemplate,false關閉,true打開,同時log4j至少是info級別
elasticsearch.showTemplate=true
#elasticsearch.discoverHost=true

http.timeoutConnection = 400000
http.timeoutSocket = 400000
http.connectionRequestTimeout=400000
http.retryTime = 1
http.maxLineLength = -1
http.maxHeaderCount = 200
http.maxTotal = 400
http.defaultMaxPerRoute = 200

elasticsearch.xml


<properties>
    <config file="conf/elasticsearch.properties"/>
    <property name="elasticsearchPropes">
        <propes>

            <property name="elasticsearch.client" value="${elasticsearch.client:restful}">
                <description> <![CDATA[ 客戶端類型:transport,restful ]]></description>
            </property>

            <!--<property name="elasticUser" value="${elasticUser:}">-->
                <!--<description> <![CDATA[ 認證用戶 ]]></description>-->
            <!--</property>-->

            <!--<property name="elasticPassword" value="${elasticPassword:}">-->
                <!--<description> <![CDATA[ 認證口令 ]]></description>-->
            <!--</property>-->
            <!--<property name="elasticsearch.hostNames" value="${elasticsearch.hostNames}">
                <description> <![CDATA[ 指定序列化處理類,默認為kafka.serializer.DefaultEncoder,即byte[] ]]></description>
            </property>-->

            <property name="elasticsearch.rest.hostNames" value="${elasticsearch.rest.hostNames}">
                <description> <![CDATA[ rest協議地址 ]]></description>
            </property>


            <property name="elasticsearch.dateFormat" value="${elasticsearch.dateFormat}">
                <description> <![CDATA[ 索引日期格式]]></description>
            </property>
            <property name="elasticsearch.timeZone" value="${elasticsearch.timeZone}">
                <description> <![CDATA[ 時區信息]]></description>
            </property>

            <property name="elasticsearch.ttl" value="${elasticsearch.ttl}">
                <description> <![CDATA[ ms(毫秒) s(秒) m(分鍾) h(小時) d(天) w(星期)]]></description>
            </property>

            <property name="elasticsearch.showTemplate" value="${elasticsearch.showTemplate:false}">
                <description> <![CDATA[ query dsl腳本日志調試開關,與log info級別日志結合使用]]></description>
            </property>

            <property name="elasticsearch.httpPool" value="${elasticsearch.httpPool:default}">
                <description> <![CDATA[ http連接池邏輯名稱,在conf/httpclient.xml中配置]]></description>
            </property>
            <property name="elasticsearch.discoverHost" value="${elasticsearch.discoverHost:false}">
                <description> <![CDATA[ 是否啟動節點自動發現功能,默認關閉,開啟后每隔10秒探測新加或者移除的es節點,實時更新本地地址清單]]></description>
            </property>


        </propes>
    </property>
    <!--默認的elasticsearch-->
    <property name="elasticSearch"
              class="org.frameworkset.elasticsearch.ElasticSearch"
              init-method="configure"
              destroy-method="stop"
              f:elasticsearchPropes="attr:elasticsearchPropes"/>


</properties>

httpclient.xml

<properties>
    <config file="conf/elasticsearch.properties"/>
    <property name="default"
              f:timeoutConnection = "${http.timeoutConnection}"
              f:timeoutSocket = "${http.timeoutSocket}"
              f:connectionRequestTimeout="${http.connectionRequestTimeout}"
              f:retryTime = "${http.retryTime}"
              f:maxLineLength = "${http.maxLineLength}"
              f:maxHeaderCount = "${http.maxHeaderCount}"
              f:maxTotal = "${http.maxTotal}"
              f:defaultMaxPerRoute = "${http.defaultMaxPerRoute}"
              class="org.frameworkset.spi.remote.http.ClientConfiguration">
    </property>
</properties>

search.xml

<properties>
    <!--
        創建document需要的索引表結構
    -->
    <property name="document">
        <![CDATA[{
        "settings": {
            "number_of_shards": 6,
            "index.refresh_interval": "5s"
        },
        "mappings": {
            "document": {
                "properties": {
                    "title": {
                        "type": "text",
                        "analyzer": "ik_max_word"
                    },
                    "contentbody": {
                        "type": "text",
                        "analyzer": "ik_max_word"
                    },
                    "fileId": {
                        "type": "text"
                    },
                    "description": {
                        "type": "text",
                        "analyzer": "ik_max_word"
                    },
                    "tags": {
                        "type": "text"
                    },
                    "typeId": {
                        "type": "text"
                    },
                    "classicId": {
                        "type": "text"
                    },
                    "url": {
                        "type": "text"
                    },
                    "agentStarttime": {
                        "type": "date"
                        ## ,"format":"yyyy-MM-dd HH:mm:ss.SSS||yyyy-MM-dd'T'HH:mm:ss.SSS||yyyy-MM-dd HH:mm:ss||epoch_millis"
                    },
                    "name": {
                        "type": "keyword"
                    }
            }
        }
    }
    }]]>
    </property>

    <!--
        一個簡單的檢索dsl,中有四個變量
        applicationName1
        applicationName2
        startTime
        endTime
        通過map傳遞變量參數值

        變量語法參考文檔:
    -->
    <property name="searchDatas">
        <![CDATA[{
        "query": {
            "bool": {
                "filter": [
                    {  ## 多值檢索,查找多個應用名稱對應的文檔記錄
                    "terms": {
                        "applicationName.keyword": [#[applicationName1],#[applicationName2]]
            }
        },
    {   ## 時間范圍檢索,返回對應時間范圍內的記錄,接受long型的值
    "range": {
    "agentStarttime": {
    "gte": #[startTime],##統計開始時間
    "lt": #[endTime]  ##統計截止時間
    }
    }
    }
    ]
    }
    },
    ## 最多返回1000條記錄
    "size":1000
    }]]>
    </property>


    <!--
       一個簡單的檢索dsl,中有四個變量
       applicationName1
       applicationName2
       startTime
       endTime
       通過map傳遞變量參數值

       變量語法參考文檔:
   -->
    <property name="searchPagineDatas">
        <![CDATA[{
       "query": {
            "bool": {
                "filter": [
                    {
                    "term": {
                        "classicId": #[classicId]
                   }
                }],
                "must": [
                 {
                   "multi_match": {
                        "query": #[keywords],
                        "fields": ["contentbody","title","description"]
                    }
                 }
                ]
           }
          },
        ## 分頁起點
        "from":#[from] ,
        ## 最多返回size條記錄
        "size":#[size],
        "highlight": {
            "pre_tags": [
            "<mark>"
            ],
            "post_tags": [
            "</mark>"
            ],
            "fields": {
            "*": {}
            },
            "fragment_size": 2147483647
        }
    }]]>
    </property>
    <property name="searchPagineDatas2">
        <![CDATA[{
       "query": {
            "bool": {
                "filter": [
                    {
                    "term": {
                        "classicId": #[classicId]
                   }
                }]
           }
          },
        ## 分頁起點
        "from":#[from] ,
        ## 最多返回size條記錄
        "size":#[size],
        "highlight": {
            "pre_tags": [
            "<mark>"
            ],
            "post_tags": [
            "</mark>"
            ],
            "fields": {
            "*": {}
            },
            "fragment_size": 2147483647
        }
    }]]>
    </property>

    <property name="searchPagineDatas3">
        <![CDATA[{
       "query": {
            "bool": {
                "filter": [
                    {
                    "term": {
                        "typeId": #[typeId]
                   }
                }],
                "must": [
                 {
                   "multi_match": {
                        "query": #[keywords],
                        "fields": ["contentbody","title","description"]
                    }
                 }
                ]
           }
          },
        ## 分頁起點
        "from":#[from] ,
        ## 最多返回size條記錄
        "size":#[size],
        "highlight": {
            "pre_tags": [
            "<mark>"
            ],
            "post_tags": [
            "</mark>"
            ],
            "fields": {
            "*": {}
            },
            "fragment_size": 2147483647
        }
    }]]>
    </property>
    <property name="searchPagineDatas4">
        <![CDATA[{
       "query": {
            "bool": {
                "filter": [
                    {
                    "term": {
                        "typeId": #[typeId]
                   }
                }]
           }
          },
        ## 分頁起點
        "from":#[from] ,
        ## 最多返回size條記錄
        "size":#[size],
        "highlight": {
            "pre_tags": [
            "<mark>"
            ],
            "post_tags": [
            "</mark>"
            ],
            "fields": {
            "*": {}
            },
            "fragment_size": 2147483647
        }
    }]]>
    </property>

    <!--
        一個簡單的檢索dsl,中有四個變量
        applicationName1
        applicationName2
        startTime
        endTime
        通過map傳遞變量參數值

        變量語法參考文檔:
    -->
    <property name="searchDatasArray">
        <![CDATA[{
        "query": {
            "bool": {
                "filter": [
                    {  ## 多值檢索,查找多個應用名稱對應的文檔記錄
                    "terms": {
                        "applicationName.keyword":[
                            #if($applicationNames && $applicationNames.size() > 0)
                        #foreach($applicationName in $applicationNames)
                        #if($velocityCount > 0),#end "$applicationName"
                        #end
                        #end
                    ]
                    }
                },
                    {   ## 時間范圍檢索,返回對應時間范圍內的記錄,接受long型的值
                    "range": {
                        "agentStarttime": {
                            "gte": #[startTime],##統計開始時間
                    "lt": #[endTime]  ##統計截止時間
                    }
                }
                }
                ]
            }
        },
        ## 最多返回1000條記錄
        "size":1000
    }]]>
    </property>
    <!--部分更新,注意:dsl不能換行-->
    <property name="updatePartDocument">
        <![CDATA[{"applicationName" : #[applicationName],"agentStarttime" : #[agentStarttime],"contentbody" : #[contentbody]}]]>
    </property>
</properties>

hanlp.properties

#本配置文件中的路徑的根目錄,根目錄+其他路徑=完整路徑(支持相對路徑,請參考:https://github.com/hankcs/HanLP/pull/254)
#Windows用戶請注意,路徑分隔符統一使用/
root=H:/doc/java/hzhh123
#root=/home/data/software/devsoft/java/hanlp


#好了,以上為唯一需要修改的部分,以下配置項按需反注釋編輯。

#核心詞典路徑
CoreDictionaryPath=data/dictionary/CoreNatureDictionary.txt
#2元語法詞典路徑
BiGramDictionaryPath=data/dictionary/CoreNatureDictionary.ngram.txt
#自定義詞典路徑,用;隔開多個自定義詞典,空格開頭表示在同一個目錄,使用“文件名 詞性”形式則表示這個詞典的詞性默認是該詞性。優先級遞減。
#所有詞典統一使用UTF-8編碼,每一行代表一個單詞,格式遵從[單詞] [詞性A] [A的頻次] [詞性B] [B的頻次] ... 如果不填詞性則表示采用詞典的默認詞性。
CustomDictionaryPath=data/dictionary/custom/CustomDictionary.txt; 現代漢語補充詞庫.txt; 全國地名大全.txt ns; 人名詞典.txt; 機構名詞典.txt; 上海地名.txt ns;data/dictionary/person/nrf.txt nrf;
#停用詞詞典路徑
CoreStopWordDictionaryPath=data/dictionary/stopwords.txt
#同義詞詞典路徑
CoreSynonymDictionaryDictionaryPath=data/dictionary/synonym/CoreSynonym.txt
#人名詞典路徑
PersonDictionaryPath=data/dictionary/person/nr.txt
#人名詞典轉移矩陣路徑
PersonDictionaryTrPath=data/dictionary/person/nr.tr.txt
#繁簡詞典根目錄
tcDictionaryRoot=data/dictionary/tc
#HMM分詞模型
HMMSegmentModelPath=data/model/segment/HMMSegmentModel.bin
#分詞結果是否展示詞性
ShowTermNature=true
#IO適配器,實現com.hankcs.hanlp.corpus.io.IIOAdapter接口以在不同的平台(Hadoop、Redis等)上運行HanLP
#默認的IO適配器如下,該適配器是基於普通文件系統的。
#IOAdapter=com.hankcs.hanlp.corpus.io.FileIOAdapter
#感知機詞法分析器
PerceptronCWSModelPath=data/model/perceptron/pku199801/cws.bin
PerceptronPOSModelPath=data/model/perceptron/pku199801/pos.bin
PerceptronNERModelPath=data/model/perceptron/pku199801/ner.bin
#CRF詞法分析器
CRFCWSModelPath=data/model/crf/pku199801/cws.txt
CRFPOSModelPath=data/model/crf/pku199801/pos.txt
CRFNERModelPath=data/model/crf/pku199801/ner.txt
#更多配置項請參考 https://github.com/hankcs/HanLP/blob/master/src/main/java/com/hankcs/hanlp/HanLP.java#L59 自行添加

注意:參考https://github.com/hankcs/HanLP,下載data.zip文件,解壓到H:/doc/java/hzhh123下

3、java代碼

Hanlp.java

package com.hd.util;

import com.hankcs.hanlp.HanLP;
import com.hankcs.hanlp.corpus.document.sentence.Sentence;
import com.hankcs.hanlp.corpus.document.sentence.word.IWord;
import com.hankcs.hanlp.model.crf.CRFLexicalAnalyzer;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

/**
 * hzhh123
 * 2019/3/25 14:05
 *
 * @desciption 自然語言處理 中文分詞 詞性標注 命名實體識別 依存句法分析
 * 新詞發現 關鍵詞短語提取 自動摘要 文本分類聚類 拼音簡繁
 * @link https://github.com/hankcs/HanLP
 */
public class HanlpUtil {

    /**
     * Part-of-speech labels whose words are collected by
     * {@link #findWordsAndCollectByLabel(List)}: common nouns (n), place names (ns),
     * time words (t), abbreviations (j), verbal nouns (vn), person names (nr),
     * organization names (nt) and other proper nouns (nz).
     */
    private static final String[] COLLECTED_LABELS = {"n", "ns", "t", "j", "vn", "nr", "nt", "nz"};

    /**
     * Extracts an automatic summary from the given text.
     *
     * @param content source text
     * @return up to 3 summary sentences produced by HanLP
     */
    public static List<String> summary(String content) {
        return HanLP.extractSummary(content, 3);
    }

    /**
     * Extracts key phrases from the given text.
     *
     * @param content source text
     * @return up to 5 key phrases produced by HanLP
     */
    public static List<String> phrase(String content) {
        return HanLP.extractPhrase(content, 5);
    }

    /**
     * Runs CRF lexical analysis over the document and collects the values of all
     * words whose part-of-speech tag is one of {@link #COLLECTED_LABELS}.
     *
     * <p>NOTE(review): the input list is flattened via {@code String.valueOf(document)},
     * so the analyzed text includes the list's "[a, b]" formatting — confirm this is
     * intended before relying on exact tokenization at element boundaries.
     *
     * @param document text fragments to analyze
     * @return matched word values, grouped in {@link #COLLECTED_LABELS} order
     * @throws IOException if the CRF model files cannot be loaded
     */
    public static List<String> findWordsAndCollectByLabel(List<String> document) throws IOException {
        CRFLexicalAnalyzer analyzer = new CRFLexicalAnalyzer();
        Sentence sentence = analyzer.analyze(String.valueOf(document));

        List<String> words = new ArrayList<>();
        // One pass per label replaces the previous eight copy-pasted lookups;
        // iteration order (label by label) matches the original behavior.
        for (String label : COLLECTED_LABELS) {
            for (IWord word : sentence.findWordsByLabel(label)) {
                words.add(word.getValue());
            }
        }
        return words;
    }

    public static void main(String[] args) {
        String document = "算法可大致分為基本算法、數據結構的算法、數論算法、計算幾何的算法、圖的算法、動態規划以及數值分析、加密算法、排序算法、檢索算法、隨機化算法、並行算法、厄米變形模型、隨機森林算法。\n" +
                "算法可以寬泛的分為三類,\n" +
                "一,有限的確定性算法,這類算法在有限的一段時間內終止。他們可能要花很長時間來執行指定的任務,但仍將在一定的時間內終止。這類算法得出的結果常取決於輸入值。\n" +
                "二,有限的非確定算法,這類算法在有限的時間內終止。然而,對於一個(或一些)給定的數值,算法的結果並不是唯一的或確定的。\n" +
                "三,無限的算法,是那些由於沒有定義終止定義條件,或定義的條件無法由輸入的數據滿足而不終止運行的算法。通常,無限算法的產生是由於未能確定的定義終止條件。";
        List<String> sentenceList = phrase(document);
        //  List<String> sentenceList = summary(document);
        System.out.println(sentenceList);

    }
}

ElasticsearchResponseEntity.java

package com.hd.util;

import java.util.List;

/**
 * hzhh123
 * 2019/3/22 11:51
 * @descript elasticsearch分頁查詢查詢返回結果內容
 */
public class ElasticsearchResponseEntity<T> {
    private int from=0;
    private int size=10;
    private Long total;
    private List<T> records;

    public ElasticsearchResponseEntity(int from, int size) {
        this.from = from;
        this.size = size;
    }

    public int getFrom() {
        return from;
    }

    public void setFrom(int from) {
        this.from = from;
    }

    public int getSize() {
        return size;
    }

    public void setSize(int size) {
        this.size = size;
    }

    public Long getTotal() {
        return total;
    }

    public void setTotal(Long total) {
        this.total = total;
    }

    public List<T> getRecords() {
        return records;
    }

    public void setRecords(List<T> records) {
        this.records = records;
    }
}

ElasticsearchClentUtil.java

package com.hd.util;

import org.frameworkset.elasticsearch.ElasticSearchException;
import org.frameworkset.elasticsearch.ElasticSearchHelper;
import org.frameworkset.elasticsearch.client.ClientInterface;
import org.frameworkset.elasticsearch.entity.ESBaseData;
import org.frameworkset.elasticsearch.entity.ESDatas;

import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;

/**
 * hzhh123
 * <p>
 * ES 增刪改查實現
 * @link  https://gitee.com/bboss/bboss-elastic
 * </p>
 */
/**
 * Elasticsearch CRUD helper built on the bboss REST client.
 *
 * <p>Each operation obtains a {@link ClientInterface} from
 * {@link ElasticSearchHelper#getConfigRestClientUtil(String)} using the DSL
 * mapping file supplied at construction time; per bboss documentation the
 * returned client is a thread-safe singleton.</p>
 *
 * @param <T> document bean type; must extend {@link ESBaseData} so highlight
 *            fragments can be read back from each hit
 * @link https://gitee.com/bboss/bboss-elastic
 *
 * hzhh123
 */
public class ElasticsearchClentUtil<T extends ESBaseData> {
    /** Classpath location of the bboss DSL mapping file (e.g. "esmapper/search.xml"). */
    private String mappath;

    public ElasticsearchClentUtil(String mappath) {
        this.mappath = mappath;
    }

    /**
     * Creates (or recreates) an index with the given mapping.
     * If the index already exists it is dropped first, so any existing data is LOST.
     *
     * @param indexName    index name
     * @param indexMapping name of the mapping DSL defined in the mapping file
     * @return the Elasticsearch response body
     */
    public String createIndex(String indexName, String indexMapping) throws Exception {
        ClientInterface clientUtil = ElasticSearchHelper.getConfigRestClientUtil(mappath);
        if (clientUtil.existIndice(indexName)) {
            // Drop before recreating: createIndiceMapping fails on an existing index.
            clientUtil.dropIndice(indexName);
        }
        return clientUtil.createIndiceMapping(indexName, indexMapping);
    }

    /**
     * Drops an index.
     *
     * @param indexName index to delete
     * @return the Elasticsearch response body
     */
    public String dropIndex(String indexName) {
        ClientInterface clientUtil = ElasticSearchHelper.getConfigRestClientUtil(mappath);
        return clientUtil.dropIndice(indexName);
    }

    /**
     * Deletes a single document by id.
     * (Method name keeps the original "Docment" spelling for caller compatibility.)
     *
     * @param indexName index name
     * @param indexType index type
     * @param id        document id
     * @return the Elasticsearch response body
     */
    public String deleteDocment(String indexName, String indexType, String id) throws ElasticSearchException {
        ClientInterface clientUtil = ElasticSearchHelper.getConfigRestClientUtil(mappath);
        return clientUtil.deleteDocument(indexName, indexType, id);
    }

    /**
     * Indexes a single document bean.
     *
     * @param indexName index name
     * @param indexType index type
     * @param bean      document to index
     * @return the Elasticsearch response body
     */
    public String addDocument(String indexName, String indexType, T bean) {
        ClientInterface clientUtil = ElasticSearchHelper.getConfigRestClientUtil(mappath);
        return clientUtil.addDocument(indexName, indexType, bean);
    }

    /**
     * Runs a paged keyword search using a single query field.
     *
     * @param path         search action path, e.g. "indexName/_search"
     * @param templateName DSL template name defined in the mapping file
     * @param queryFiled   name of the DSL variable holding the keywords
     * @param keywords     value to search for
     * @param from         page offset as a decimal string (default "0")
     * @param size         page size as a decimal string (default "10")
     * @param beanClass    result bean type
     * @return page of results with total hit count
     * @throws NumberFormatException if {@code from} or {@code size} is not numeric
     */
    public ElasticsearchResponseEntity<T> searchDocumentByKeywords(String path, String templateName, String queryFiled, String keywords,
                                                                   String from, String size, Class<T> beanClass) {
        Map<String, Object> params = new HashMap<String, Object>();
        params.put(queryFiled, keywords);
        params.put("from", from);
        params.put("size", size);
        return executeSearch(path, templateName, params, Integer.parseInt(from), Integer.parseInt(size), beanClass);
    }

    /**
     * Runs a paged search with an arbitrary parameter map.
     *
     * @param path         search action path, e.g. "indexName/_search"
     * @param templateName DSL template name defined in the mapping file
     * @param paramsMap    DSL variables; MUST contain numeric "from" and "size" entries
     * @param beanClass    result bean type
     * @return page of results with total hit count
     * @throws NumberFormatException if "from"/"size" are missing or not numeric
     */
    public ElasticsearchResponseEntity<T> searchDocumentByKeywords(String path, String templateName, Map<String, String> paramsMap,
                                                                   Class<T> beanClass) {
        int from = Integer.parseInt(paramsMap.get("from"));
        int size = Integer.parseInt(paramsMap.get("size"));
        // Copy to Map<String,Object> so both overloads share one search path.
        return executeSearch(path, templateName, new HashMap<String, Object>(paramsMap), from, size, beanClass);
    }

    /**
     * Shared search execution: runs the DSL template, wraps hits and total
     * count into an {@link ElasticsearchResponseEntity}, and echoes highlight
     * fragments to stdout (kept from the original tutorial code; replace with
     * a logger in production).
     */
    private ElasticsearchResponseEntity<T> executeSearch(String path, String templateName, Map<String, Object> params,
                                                         int from, int size, Class<T> beanClass) {
        ClientInterface clientUtil = ElasticSearchHelper.getConfigRestClientUtil(mappath);
        ElasticsearchResponseEntity<T> responseEntity = new ElasticsearchResponseEntity<T>(from, size);
        // ESDatas carries the current page of hits; max page size is set by the DSL.
        ESDatas<T> esDatas = clientUtil.searchList(path, templateName, params, beanClass);
        List<T> documentList = esDatas.getDatas();
        responseEntity.setTotal(esDatas.getTotalSize());
        if (documentList != null) {
            for (T doc : documentList) {
                // Highlight fragments per matched field; the DSL wraps matches in
                // <mark></mark> so the web UI can style them via CSS.
                Map<String, List<Object>> highLights = doc.getHighlight();
                for (Map.Entry<String, List<Object>> entry : highLights.entrySet()) {
                    System.out.print(entry.getKey() + ":");
                    for (Object highLightSegment : entry.getValue()) {
                        System.out.println(highLightSegment);
                    }
                }
            }
        }
        responseEntity.setRecords(documentList);
        return responseEntity;
    }

}

具體的代碼參考https://gitee.com/hzhh123/elasticsearch-common.git


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯系本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM