http://pangusegment.codeplex.com
PanGu.dll 調用方法
初始化
在進程啟動時,我們需要對盤古分詞進行初始化,初始化的調用代碼如下:
默認方式初始化
// Default initialization: uses the pangu.xml configuration file located in the same directory as PanGu.dll.
PanGu.Segment.Init();
這個調用將使用和 PanGu.dll 同路徑下的 pangu.xml 配置文件
指定配置文件方式初始化
// Initialization with an explicit configuration file: filename is the full path of pangu.xml.
PanGu.Segment.Init(filename);
filename 為pangu.xml 的完整路徑名,如“c:\pangu.xml”
在某些應用中,pangu.xml 不一定在pangu.dll相同的路徑下,或者無法獲取pangu.dll的當前路徑,這時需要通過這種方式調用來讓調用者指定盤古分詞所用的配置文件的絕對路徑。
分詞
// Create a segmenter, then split the text into words.
// With this overload the options and parameters come from pangu.xml.
Segment segment = new Segment();
ICollection<WordInfo> words = segment.DoSegment(text);
或
// Segment with caller-supplied match options instead of the ones specified in pangu.xml.
ICollection<WordInfo> words = segment.DoSegment(text, options);
或
// Segment with caller-supplied match options and match parameters instead of the ones specified in pangu.xml.
ICollection<WordInfo> words = segment.DoSegment(text, options, parameters);
其中
· text 為需要分詞的文本
· options 為自定義分詞選項,默認為pangu.xml 中指定的分詞選項
· parameters 為分詞參數,默認為pangu.xml 中指定的分詞參數
分詞選項定義:
/// <summary>
/// Segmentation options: a set of boolean switches that control how text is segmented.
/// The initializers below are the values each switch has when nothing else assigns it.
/// </summary>
public class MatchOptions
{
    /// <summary>
    /// Chinese personal-name identification.
    /// </summary>
    public bool ChineseNameIdentify = false;

    /// <summary>
    /// Word frequency first.
    /// </summary>
    public bool FrequencyFirst = false;

    /// <summary>
    /// Multi-gram (multi-dimensional) segmentation.
    /// </summary>
    public bool MultiDimensionality = true;

    /// <summary>
    /// English multi-gram segmentation. This switch separates the letters
    /// and the digits inside English text.
    /// </summary>
    public bool EnglishMultiDimensionality = false;

    /// <summary>
    /// Filter stop words.
    /// </summary>
    public bool FilterStopWords = true;

    /// <summary>
    /// Ignore spaces, carriage returns and tabs.
    /// </summary>
    public bool IgnoreSpace = true;

    /// <summary>
    /// Force single-character (unigram) segmentation.
    /// </summary>
    public bool ForceSingleWord = false;

    /// <summary>
    /// Traditional Chinese switch.
    /// </summary>
    public bool TraditionalChineseEnabled = false;

    /// <summary>
    /// Output both the Simplified and the Traditional form.
    /// </summary>
    public bool OutputSimplifiedTraditional = false;

    /// <summary>
    /// Identification of unknown (out-of-dictionary) words.
    /// </summary>
    public bool UnknownWordIdentify = true;

    /// <summary>
    /// Filter English. This option only takes effect when the
    /// stop-word filtering option is in effect.
    /// </summary>
    public bool FilterEnglish = false;

    /// <summary>
    /// Filter numbers. This option only takes effect when the
    /// stop-word filtering option is in effect.
    /// </summary>
    public bool FilterNumeric = false;

    /// <summary>
    /// Ignore English letter case.
    /// </summary>
    public bool IgnoreCapital = false;

    /// <summary>
    /// English segmentation.
    /// </summary>
    public bool EnglishSegment = false;

    /// <summary>
    /// Synonym output.
    /// </summary>
    /// <remarks>
    /// Synonym output is generally used when segmenting search strings;
    /// it is not recommended while indexing.
    /// </remarks>
    public bool SynonymOutput = false;

    /// <summary>
    /// Wildcard match output.
    /// </summary>
    /// <remarks>
    /// The original remark here repeats the SynonymOutput remark verbatim
    /// (generally used when segmenting search strings; not recommended while indexing).
    /// NOTE(review): presumably the same advice is meant for wildcard output - confirm.
    /// </remarks>
    public bool WildcardOutput = false;

    /// <summary>
    /// Segment the results of wildcard matching.
    /// </summary>
    public bool WildcardSegment = false;

    /// <summary>
    /// Whether to perform user-defined (custom) rule matching.
    /// </summary>
    public bool CustomRule = false;
}
分詞參數定義
/// <summary>
/// Segmentation parameters: numeric ranks (weights) and related settings used during segmentation.
/// The initializers below are the values each parameter has when nothing else assigns it.
/// </summary>
/// <remarks>
/// NOTE(review): the sample pangu.xml in this document also sets EnglishLowerRank and
/// EnglishStemRank, which are not declared in this listing - confirm whether the listing is abridged.
/// </remarks>
[Serializable]
public class MatchParameter
{
    /// <summary>
    /// Redundancy of multi-gram segmentation.
    /// </summary>
    public int Redundancy = 0;

    /// <summary>
    /// Rank of unknown (out-of-dictionary) words.
    /// </summary>
    public int UnknowRank = 1;

    /// <summary>
    /// Rank of the best-matching word.
    /// </summary>
    public int BestRank = 5;

    /// <summary>
    /// Rank of the second-best-matching word.
    /// </summary>
    public int SecRank = 3;

    /// <summary>
    /// Rank of the third-best-matching word.
    /// </summary>
    public int ThirdRank = 2;

    /// <summary>
    /// Rank of single characters that are forced into the output.
    /// </summary>
    public int SingleRank = 1;

    /// <summary>
    /// Rank of numbers.
    /// </summary>
    public int NumericRank = 1;

    /// <summary>
    /// Rank of English words.
    /// </summary>
    public int EnglishRank = 5;

    /// <summary>
    /// Rank of symbols.
    /// </summary>
    public int SymbolRank = 1;

    /// <summary>
    /// When Simplified and Traditional characters are both forced into the output,
    /// this is the rank of the characters that are not from the original text.
    /// For example, if the original text is Simplified, this is the rank of the
    /// Traditional characters that are output, and vice versa.
    /// </summary>
    public int SimplifiedTraditionalRank = 1;

    /// <summary>
    /// Rank of synonyms.
    /// </summary>
    public int SynonymRank = 1;

    /// <summary>
    /// Rank of wildcard match results.
    /// </summary>
    public int WildcardRank = 1;

    /// <summary>
    /// When the filter-English option is in effect, English longer than this length is filtered.
    /// </summary>
    public int FilterEnglishLength = 0;

    /// <summary>
    /// When the filter-numeric option is in effect, numbers longer than this length are filtered.
    /// </summary>
    public int FilterNumericLength = 0;

    /// <summary>
    /// File name of the assembly that contains the user-defined rule.
    /// </summary>
    public string CustomRuleAssemblyFileName = "";

    /// <summary>
    /// Full class name of the user-defined rule, i.e. the name including its namespace.
    /// </summary>
    public string CustomRuleFullClassName = "";
}
返回為WordInfo 的集合
/// <summary>
/// One word produced by segmentation; DoSegment returns a collection of these.
/// </summary>
public class WordInfo : WordAttribute, IComparable<WordInfo>
{
    /// <summary>
    /// Current type of the word.
    /// </summary>
    public WordType WordType;

    /// <summary>
    /// Original type of the word.
    /// </summary>
    public WordType OriginalWordType;

    /// <summary>
    /// Start position of the word in the text.
    /// </summary>
    public int Position;

    /// <summary>
    /// Rank (weight) of this word.
    /// </summary>
    public int Rank;

    /// <summary>
    /// The word itself.
    /// </summary>
    public String Word;

    /// <summary>
    /// Part of speech.
    /// </summary>
    public POS Pos;

    /// <summary>
    /// Word frequency.
    /// </summary>
    public double Frequency;
}
配置文件PanGu.xml
<?xml version="1.0" encoding="utf-8"?>
<PanGuSettings xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
               xmlns:xsd="http://www.w3.org/2001/XMLSchema"
               xmlns="http://www.codeplex.com/pangusegment">
  <!-- Directory that holds the dictionaries; may be a relative or an absolute path. -->
  <DictionaryPath>..\Dictionaries</DictionaryPath>

  <!-- Segmentation options (correspond to MatchOptions). -->
  <MatchOptions>
    <ChineseNameIdentify>true</ChineseNameIdentify>
    <FrequencyFirst>false</FrequencyFirst>
    <MultiDimensionality>false</MultiDimensionality>
    <FilterStopWords>true</FilterStopWords>
    <IgnoreSpace>true</IgnoreSpace>
    <ForceSingleWord>false</ForceSingleWord>
    <TraditionalChineseEnabled>false</TraditionalChineseEnabled>
    <OutputSimplifiedTraditional>false</OutputSimplifiedTraditional>
    <UnknownWordIdentify>true</UnknownWordIdentify>
    <FilterEnglish>false</FilterEnglish>
    <FilterNumeric>false</FilterNumeric>
    <IgnoreCapital>false</IgnoreCapital>
    <EnglishSegment>false</EnglishSegment>
    <SynonymOutput>false</SynonymOutput>
    <WildcardOutput>false</WildcardOutput>
    <WildcardSegment>false</WildcardSegment>
    <CustomRule>false</CustomRule>
  </MatchOptions>

  <!-- Segmentation parameters (correspond to MatchParameter). -->
  <Parameters>
    <UnknowRank>1</UnknowRank>
    <BestRank>5</BestRank>
    <SecRank>3</SecRank>
    <ThirdRank>2</ThirdRank>
    <SingleRank>1</SingleRank>
    <NumericRank>1</NumericRank>
    <EnglishRank>5</EnglishRank>
    <EnglishLowerRank>3</EnglishLowerRank>
    <EnglishStemRank>2</EnglishStemRank>
    <SymbolRank>1</SymbolRank>
    <SimplifiedTraditionalRank>1</SimplifiedTraditionalRank>
    <SynonymRank>1</SynonymRank>
    <WildcardRank>1</WildcardRank>
    <FilterEnglishLength>0</FilterEnglishLength>
    <FilterNumericLength>0</FilterNumericLength>
    <CustomRuleAssemblyFileName>CustomRuleExample.dll</CustomRuleAssemblyFileName>
    <CustomRuleFullClassName>CustomRuleExample.PickupNokia</CustomRuleFullClassName>
    <Redundancy>0</Redundancy>
  </Parameters>
</PanGuSettings>
其中DictionaryPath 指明字典所在目錄,可以為相對路徑也可以為絕對路徑。
MatchOptions 對應分詞選項
Parameters 對應分詞參數
高亮組件PanGu.HighLight.dll 調用方法
// Create the HTML formatter; the arguments are the prefix and suffix placed around each highlighted word.
PanGu.HighLight.SimpleHTMLFormatter simpleHTMLFormatter =
    new PanGu.HighLight.SimpleHTMLFormatter("<font color=\"red\">", "</font>");

// Create the Highlighter from the HTML formatter and a PanGu Segment object.
PanGu.HighLight.Highlighter highlighter =
    new PanGu.HighLight.Highlighter(simpleHTMLFormatter, new Segment());

// Set the number of characters in each summary fragment.
highlighter.FragmentSize = 50;

// Get the best-matching summary fragment.
// "abstract" is a reserved C# keyword, so the local is written as the verbatim identifier @abstract.
String @abstract = highlighter.GetBestFragment(keywords, news.Content);
