中文分詞之結巴分詞~~~附使用場景+demo(net)


常用技能(更新ing):http://www.cnblogs.com/dunitian/p/4822808.html#skill

技能總綱(更新ing):http://www.cnblogs.com/dunitian/p/5493793.html

在線演示:http://cppjieba-webdemo.herokuapp.com

完整demo:https://github.com/dunitian/TempCode/tree/master/2016-09-05

逆天修改版:https://github.com/dunitian/TempCode/blob/master/2016-09-05/jieba.NET.0.38.2.zip

先說下注意點:結巴分詞不會對分詞結果去重,這件事得我們自己處理;字典得自行配置,或者設置成輸出到bin目錄

 

應用場景舉例(搜索那塊大家都知道,說點其他的)

——————————————————————————————————————————————————

言歸正傳:看一組民間統計數據:(非Net版,指的是官方版)

net版的IKAnalyzer、盤古分詞好多年沒更新了,所以這次選擇了結巴分詞(這個名字也很符合分詞的意境~~結巴說話,是不是也是一種分詞的方式呢?)

下面簡單演示一下:

1.先引入包:

2.字典設置:

3.簡單封裝的幫助類:

using System.Linq;
using JiebaNet.Segmenter;
using System.Collections.Generic;

namespace LoTLib.Word.Split
{
    #region 分詞類型
    /// <summary>
    /// Segmentation mode passed to the jieba word-splitting helpers.
    /// </summary>
    public enum JiebaTypeEnum
    {
        /// <summary>
        /// Precise mode: the most basic and natural mode; tries to cut the sentence
        /// as accurately as possible. Suited to text analysis.
        /// </summary>
        Default = 0,

        /// <summary>
        /// Full mode: scans out every word that can be formed; faster, but cannot
        /// resolve ambiguity.
        /// </summary>
        CutAll = 1,

        /// <summary>
        /// Search-engine mode: builds on precise mode by cutting long words again
        /// to improve recall. Suited to search-engine tokenization.
        /// </summary>
        CutForSearch = 2,

        /// <summary>
        /// Precise mode without HMM.
        /// </summary>
        Other = 3
    }
    #endregion

    /// <summary>
    /// Thin wrapper around jieba.NET's <c>JiebaSegmenter</c>: splits text into words and
    /// renders the de-duplicated result as a comma-separated string.
    /// </summary>
    public static partial class WordSplitHelper
    {
        /// <summary>
        /// Splits <paramref name="objStr"/> into words using the selected segmentation mode.
        /// </summary>
        /// <param name="objStr">Text to segment. Null or empty text produces an empty sequence.</param>
        /// <param name="type">Segmentation mode; defaults to <see cref="JiebaTypeEnum.Default"/> (precise mode with HMM).</param>
        /// <returns>The words as returned by the segmenter; duplicates are not removed here.</returns>
        public static IEnumerable<string> GetSplitWords(string objStr, JiebaTypeEnum type = JiebaTypeEnum.Default)
        {
            // Nothing to segment: answer with an empty sequence instead of handing null to the segmenter.
            if (string.IsNullOrEmpty(objStr))
            {
                return Enumerable.Empty<string>();
            }

            var jieba = new JiebaSegmenter();
            switch (type)
            {
                case JiebaTypeEnum.Default:
                    return jieba.Cut(objStr);                 // precise mode, with HMM
                case JiebaTypeEnum.CutAll:
                    return jieba.Cut(objStr, cutAll: true);   // full mode
                case JiebaTypeEnum.CutForSearch:
                    return jieba.CutForSearch(objStr);        // search-engine mode
                default:
                    // JiebaTypeEnum.Other (and any unrecognised value): precise mode, without HMM
                    return jieba.Cut(objStr, false, false);
            }
        }

        /// <summary>
        /// Segments <paramref name="objStr"/> and joins the distinct words with commas.
        /// </summary>
        /// <param name="objStr">Text to segment.</param>
        /// <param name="type">Segmentation mode; defaults to <see cref="JiebaTypeEnum.Default"/>.</param>
        /// <returns>The distinct words joined by <c>","</c>, or an empty string when there are no words.</returns>
        public static string GetSplitWordStr(this string objStr, JiebaTypeEnum type = JiebaTypeEnum.Default)
        {
            var words = GetSplitWords(objStr, type);
            if (words == null)
            {
                return string.Empty;
            }

            // The segmenter may return the same word several times, so de-duplicate before joining.
            // The sequence is walked exactly once; joining an empty sequence already yields "".
            return string.Join(",", words.Distinct());
        }
    }
}

調用很簡單:

            // Sample text mixing Latin and Chinese characters.
            string sample = "bootstrap-datetimepicker 進一步跟進~~~開始時間和結束時間的樣式顯示";

            // Precise mode with HMM (the helper's default mode, spelled out explicitly).
            Console.WriteLine("\n精確模式-帶HMM:\n");
            string preciseWithHmm = sample.GetSplitWordStr(JiebaTypeEnum.Default);
            Console.WriteLine(preciseWithHmm);

            // Full mode.
            Console.WriteLine("\n全模式:\n");
            string fullMode = sample.GetSplitWordStr(JiebaTypeEnum.CutAll);
            Console.WriteLine(fullMode);

            // Search-engine mode.
            Console.WriteLine("\n搜索引擎模式:\n");
            string searchMode = sample.GetSplitWordStr(JiebaTypeEnum.CutForSearch);
            Console.WriteLine(searchMode);

            // Precise mode without HMM.
            Console.WriteLine("\n精確模式-不帶HMM:\n");
            string preciseNoHmm = sample.GetSplitWordStr(JiebaTypeEnum.Other);
            Console.WriteLine(preciseNoHmm);

            // Wait for a key press before the program continues.
            Console.ReadKey();

效果:

--------------------------

有人可能會說,那內容關鍵詞提取呢?==》別急,看下面:

這種方式所對應的字典是它=》idf.txt

簡單說下Constants==》

效果:

完整幫助類(最新看github):https://github.com/dunitian/TempCode/tree/master/2016-09-05

using System.Linq;
using JiebaNet.Segmenter;
using System.Collections.Generic;
using JiebaNet.Analyser;

namespace LoTLib.Word.Split
{
    #region 分詞類型
    /// <summary>
    /// Segmentation mode passed to the jieba word-splitting helpers.
    /// </summary>
    public enum JiebaTypeEnum
    {
        /// <summary>
        /// Precise mode: the most basic and natural mode; tries to cut the sentence
        /// as accurately as possible. Suited to text analysis.
        /// </summary>
        Default = 0,

        /// <summary>
        /// Full mode: scans out every word that can be formed; faster, but cannot
        /// resolve ambiguity.
        /// </summary>
        CutAll = 1,

        /// <summary>
        /// Search-engine mode: builds on precise mode by cutting long words again
        /// to improve recall. Suited to search-engine tokenization.
        /// </summary>
        CutForSearch = 2,

        /// <summary>
        /// Precise mode without HMM.
        /// </summary>
        Other = 3
    }
    #endregion

    /// <summary>
    /// Thin wrapper around jieba.NET: word segmentation via <c>JiebaSegmenter</c> and keyword
    /// extraction via <c>TfidfExtractor</c>, with helpers that render the de-duplicated
    /// result as a comma-separated string.
    /// </summary>
    public static partial class WordSplitHelper
    {
        #region Shared helpers
        /// <summary>
        /// Splits <paramref name="objStr"/> into words using the selected segmentation mode.
        /// </summary>
        /// <param name="objStr">Text to segment. Null or empty text produces an empty sequence.</param>
        /// <param name="type">Segmentation mode; defaults to <see cref="JiebaTypeEnum.Default"/> (precise mode with HMM).</param>
        /// <returns>The words as returned by the segmenter; duplicates are not removed here.</returns>
        public static IEnumerable<string> GetSplitWords(string objStr, JiebaTypeEnum type = JiebaTypeEnum.Default)
        {
            // Nothing to segment: answer with an empty sequence instead of handing null to the segmenter.
            if (string.IsNullOrEmpty(objStr))
            {
                return Enumerable.Empty<string>();
            }

            var jieba = new JiebaSegmenter();
            switch (type)
            {
                case JiebaTypeEnum.Default:
                    return jieba.Cut(objStr);                 // precise mode, with HMM
                case JiebaTypeEnum.CutAll:
                    return jieba.Cut(objStr, cutAll: true);   // full mode
                case JiebaTypeEnum.CutForSearch:
                    return jieba.CutForSearch(objStr);        // search-engine mode
                default:
                    // JiebaTypeEnum.Other (and any unrecognised value): precise mode, without HMM
                    return jieba.Cut(objStr, false, false);
            }
        }

        /// <summary>
        /// Extracts keywords from an article with <c>TfidfExtractor</c>, restricted to
        /// <c>Constants.NounAndVerbPos</c> (nouns and verbs).
        /// </summary>
        /// <param name="objStr">Article text. Null or empty text produces an empty sequence.</param>
        /// <param name="count">Keyword count handed to <c>ExtractTags</c>; defaults to 10, the value that used to be hard-coded.</param>
        /// <returns>The keywords returned by the extractor.</returns>
        public static IEnumerable<string> GetArticleKeywords(string objStr, int count = 10)
        {
            // No text means no keywords; avoid handing null to the extractor.
            if (string.IsNullOrEmpty(objStr))
            {
                return Enumerable.Empty<string>();
            }

            var idf = new TfidfExtractor();
            return idf.ExtractTags(objStr, count, Constants.NounAndVerbPos);
        }

        /// <summary>
        /// Joins the distinct entries of <paramref name="words"/> with commas.
        /// </summary>
        /// <param name="words">Words to join; may be null or empty.</param>
        /// <returns>The distinct words joined by <c>","</c>, or an empty string when there are none.</returns>
        public static string JoinKeyWords(IEnumerable<string> words)
        {
            if (words == null)
            {
                return string.Empty;
            }

            // The same word can appear several times, so de-duplicate before joining.
            // The sequence is walked exactly once; joining an empty sequence already yields "".
            return string.Join(",", words.Distinct());
        }
        #endregion

        #region Extension methods
        /// <summary>
        /// Segments <paramref name="objStr"/> and joins the distinct words with commas.
        /// </summary>
        /// <param name="objStr">Text to segment.</param>
        /// <param name="type">Segmentation mode; defaults to <see cref="JiebaTypeEnum.Default"/>.</param>
        /// <returns>The distinct words joined by <c>","</c>, or an empty string when there are no words.</returns>
        public static string GetSplitWordStr(this string objStr, JiebaTypeEnum type = JiebaTypeEnum.Default)
        {
            return JoinKeyWords(GetSplitWords(objStr, type));
        }

        /// <summary>
        /// Extracts the article's keywords and joins the distinct ones with commas.
        /// </summary>
        /// <param name="objStr">Article text.</param>
        /// <param name="count">Keyword count handed to the extractor; defaults to 10.</param>
        /// <returns>The distinct keywords joined by <c>","</c>, or an empty string when there are none.</returns>
        public static string GetArticleKeywordStr(this string objStr, int count = 10)
        {
            return JoinKeyWords(GetArticleKeywords(objStr, count));
        }
        #endregion
    }
}

 

還有耐心或者只看末尾的有福了~

web端的字典配置那是個煩啊,逆天把源碼微調了下

 

使用方法和上面一樣

web版演示:

https://github.com/dunitian/LoTCode/blob/master/PawChina/PawChina/PawChina.UI/Areas/PawRoot/assets/js/note.js

https://github.com/dunitian/LoTCode/blob/master/PawChina/PawChina/PawChina.UI/Areas/PawRoot/Controllers/PartialViewController.cs

結巴中文分詞相關:

https://github.com/fxsjy/jieba

https://github.com/anderscui/jieba.NET

http://cppjieba-webdemo.herokuapp.com


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯系本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM