很多時候想通過.NET調用一些C/C++寫的庫,但是一直都不知道怎么弄。去網上找了一些資料,大多數是教如何通過托管C++和非托管C++的混合編程來完成C/C++類庫的.NET Wrapper。
有的時候用C#來實現一個功能的時候,可能要調用windows api,往往都是到網上現查代碼,然后粘過來使用,沒有細研究到底是怎么做到的。
最近一個朋友用到分詞,所以就研究了一些中科院提供的中文分詞軟件,詳情請訪問http://ictclas.org/。用了一下還挺好用的,速度沒有測試,感覺應該用於學術研究肯定是沒有問題的,如果要用到項目中,我覺得還是公司自己實現會比較好。
可惜該組件沒有提供.NET的版本的,只有C++的和Java版本的(java版也是通過調用本地c語言的版本)。給的開發包中有一個編譯好的dll庫。
想想之前調用windows api的時候,不正是從一些系統的dll中導入函數,然后再通過C#代碼進行調用的嗎?想到這里,我就覺得我可以通過導入該分詞dll中的函數用C#來完成該程序的wrapper。
說干就干,我試着導入了幾個簡單的函數,發現可以調用,感覺非常好,然后花了一段時間給這個庫寫了.NET Wrapper,方便自己以后用C#調用該接口來分詞。
核心代碼如下,用單子模式實現,感覺設計得不是很好,不知道各位有沒有什么建議?
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Runtime.InteropServices;
namespace ICTCLAS.NET
{
//////////////////////////////////////////////////////////////////////// //
// character coding types
//////////////////////////////////////////////////////////////////////// //
/// <summary>
/// Character encodings that can be declared for text handed to the segmenter.
/// The numeric values are passed through P/Invoke unchanged.
/// </summary>
public enum ECodeType
{
    /// <summary> Encoding is not known. </summary>
    CODE_TYPE_UNKNOWN = 0,

    /// <summary> ASCII. </summary>
    CODE_TYPE_ASCII = 1,

    /// <summary>
    /// GB family. The original comment lists GB2312, GBK and GB10380;
    /// the last is presumably GB18030 — confirm.
    /// </summary>
    CODE_TYPE_GB = 2,

    /// <summary> UTF-8. </summary>
    CODE_TYPE_UTF8 = 3,

    /// <summary> BIG5. </summary>
    CODE_TYPE_BIG5 = 4
}
/// <summary>
/// Part-of-speech (POS) tag set identifiers accepted by WordSegger.SetPosTagMap,
/// plus two size constants that share this enum.
/// </summary>
public enum EPosTag
{
/// <summary>
/// ICT (Institute of Computing Technology) second-level POS tag set
/// </summary>
ICT_POS_MAP_SECOND = 0,
/// <summary>
/// ICT (Institute of Computing Technology) first-level POS tag set
/// </summary>
ICT_POS_MAP_FIRST = 1,
/// <summary>
/// PKU (Peking University) second-level POS tag set
/// </summary>
PKU_POS_MAP_SECOND = 2,
/// <summary>
/// PKU (Peking University) first-level POS tag set
/// </summary>
PKU_POS_MAP_FIRST = 3,
/// <summary>
/// Number of POS tag sets (a count, not a tag set itself)
/// </summary>
POS_MAP_NUMBER = 4,
/// <summary>
/// Maximum size of a POS tag in bytes (a size constant, not a tag set itself)
/// </summary>
POS_SIZE = 8
}
/// <summary>
/// Managed mirror of one segmentation result record filled in by the native
/// ICTCLAS_ParagraphProcessAW call. It consists of eight consecutive 32-bit
/// integers, so the fields sit at byte offsets 0, 4, ..., 28 and the record
/// is 32 bytes long.
/// </summary>
[StructLayout(LayoutKind.Sequential)]
struct result_t
{
    /// <summary> Byte offset of the word inside the encoded input text. </summary>
    public int start;

    /// <summary> Length of the word in bytes. </summary>
    public int length;

    // NOTE(review): sPos and sPosLow together span 8 bytes, which matches
    // EPosTag.POS_SIZE; presumably they overlay the native POS tag text
    // buffer — confirm against the native header.
    public int sPos;
    public int sPosLow;

    /// <summary> Part-of-speech identifier. </summary>
    public int POS_id;

    /// <summary> Word identifier. </summary>
    public int word_ID;

    /// <summary> Word type reported by the native library. </summary>
    public int word_type;

    /// <summary> Word weight reported by the native library. </summary>
    public int weight;
}
/// <summary>
/// One segmented word as returned by WordSegger.SegStr.
/// </summary>
public struct Word
{
// Text of the word, decoded by SegStr from the encoded input bytes.
public string str;
// Copied by SegStr from the native result's POS_id field.
public int pos_id;
// Copied by SegStr from the native result's word_ID field.
public int word_id;
// Copied by SegStr from the native result's weight field.
public int weight;
// Copied by SegStr from the native result's word_type field.
// NOTE(review): the meaning of the values is not visible here — confirm against the native header.
public int word_type;
}
/// <summary>
/// Singleton wrapper around the native ICTCLAS50 word segmentation library.
/// Obtain the instance with <see cref="GetInstance"/>, which initializes the
/// native library on first use, and call <see cref="Release"/> when done.
/// </summary>
public class WordSegger
{
    /// <summary>
    /// Holder for the singleton instance.
    /// </summary>
    class Nested
    {
        // The empty static constructor keeps the C# compiler from marking this
        // type beforefieldinit, so the instance is created on first access.
        static Nested()
        {
        }

        internal static readonly WordSegger instance = new WordSegger();
    }

    // Guards initialization and release of the native library.
    private static readonly object lockobj = new object();

    // True while the native library is initialized (ICTCLAS_Init succeeded
    // and ICTCLAS_Exit has not been called since).
    private static bool inited = false;

    /// <summary>
    /// Gets the segmenter. The first call must supply <paramref name="path"/>;
    /// later calls may omit it because the native library is already initialized.
    /// </summary>
    /// <param name="path"> Directory containing the configuration file and the data files. </param>
    /// <returns> The singleton instance, or null when ICTCLAS_Init reports failure. </returns>
    public static WordSegger GetInstance(string path = "")
    {
        if (inited)
        {
            return Nested.instance;
        }

        lock (lockobj)
        {
            if (!inited)
            {
                inited = ICTCLAS_Init(path);
                if (!inited)
                {
                    return null;
                }
            }

            return Nested.instance;
        }
    }

    /// <summary>
    /// Explicitly releases the resources held by the native library by calling
    /// ICTCLAS_Exit. Does nothing when the library is not initialized. After a
    /// release, the next <see cref="GetInstance"/> call runs ICTCLAS_Init again.
    /// </summary>
    public static void Release()
    {
        if (inited)
        {
            lock (lockobj)
            {
                if (inited)
                {
                    ICTCLAS_Exit();
                    inited = false;
                }
            }
        }
    }

    /// <summary>
    /// Private so that instances can only be obtained through <see cref="GetInstance"/>.
    /// </summary>
    private WordSegger()
    {
    }

    /// <summary>
    /// Finalizer: releases the native library if the caller never called <see cref="Release"/>.
    /// </summary>
    ~WordSegger()
    {
        Release();
    }

    /// <summary>
    /// Segments a string into words.
    /// </summary>
    /// <param name="str"> Text to segment. </param>
    /// <param name="ecode"> Encoding passed to the native library. </param>
    /// <param name="posTagged"> Whether to perform part-of-speech tagging. </param>
    /// <returns> The segmented words; an empty array when the input is empty or nothing was produced. </returns>
    /// <exception cref="ArgumentNullException"> <paramref name="str"/> is null. </exception>
    /// <remarks>
    /// NOTE(review): the text is always marshalled with CharSet.Ansi and the
    /// word offsets are decoded with Encoding.Default, regardless of
    /// <paramref name="ecode"/>; presumably ecode has to match the system ANSI
    /// code page for the offsets to line up — confirm.
    /// </remarks>
    public Word[] SegStr(string str, ECodeType ecode, bool posTagged = false)
    {
        if (str == null)
        {
            throw new ArgumentNullException("str");
        }

        if (str.Length == 0)
        {
            // Nothing to segment; avoid handing the native code a zero-length buffer.
            return new Word[0];
        }

        // One result slot per input character: a word spans at least one character.
        result_t[] result = new result_t[str.Length];
        int cnt = ICTCLAS_ParagraphProcessAW(str, result, ecode, posTagged);
        if (cnt <= 0)
        {
            // A negative count would otherwise make the array allocation below throw.
            return new Word[0];
        }

        // The native offsets are byte offsets into the ANSI-encoded text,
        // so slice the encoded bytes rather than the UTF-16 string.
        Encoding ansi = Encoding.Default;
        byte[] encoded = ansi.GetBytes(str);

        Word[] words = new Word[cnt];
        for (int i = 0; i < cnt; i++)
        {
            words[i].str = ansi.GetString(encoded, result[i].start, result[i].length);
            words[i].pos_id = result[i].POS_id;
            words[i].word_id = result[i].word_ID;
            words[i].weight = result[i].weight;
            words[i].word_type = result[i].word_type;
        }

        return words;
    }

    /// <summary>
    /// Segments a text file and writes the result to another file.
    /// </summary>
    /// <param name="src"> Path of the source file. </param>
    /// <param name="ct"> Encoding of the source file. </param>
    /// <param name="des"> Path of the destination file. </param>
    /// <param name="postag"> Whether to perform part-of-speech tagging. </param>
    /// <returns> Whether segmentation succeeded. </returns>
    public bool SegFile(string src, ECodeType ct, string des, bool postag = false)
    {
        return ICTCLAS_FileProcess(src, des, ct, postag);
    }

    /// <summary>
    /// Selects the part-of-speech tag set.
    /// </summary>
    /// <param name="nPOSmap"> Tag set to use. </param>
    /// <returns> Success or failure. </returns>
    public bool SetPosTagMap(EPosTag nPOSmap)
    {
        return ICTCLAS_SetPOSmap(nPOSmap);
    }

    /// <summary>
    /// Imports a user dictionary from a file.
    ///
    /// File format:
    /// 1. A word and its part-of-speech tag are separated by "@@", e.g. "中科院@@nr";
    /// 2. One word per line;
    /// 3. The part-of-speech tag may be omitted.
    /// </summary>
    /// <param name="path"> Path of the user dictionary file. </param>
    /// <param name="ct"> Encoding of the file. </param>
    /// <returns> Number of user-defined words imported. </returns>
    public uint ImportUserDictFile(string path, ECodeType ct = ECodeType.CODE_TYPE_UNKNOWN)
    {
        return ICTCLAS_ImportUserDictFile(path, ct);
    }

    /// <summary>
    /// Imports user-defined words from a string.
    /// </summary>
    /// <param name="userDict">
    /// User words:
    /// 1. A word and its part-of-speech tag are separated by "@@";
    /// 2. Entries are separated by a half-width ';';
    /// 3. The part-of-speech tag may be omitted.
    /// For example "中科院@@nr;分詞@@v;系統@@adj;...;"
    /// or "中科院;分詞;系統;...;".
    /// </param>
    /// <param name="ct"> Encoding. </param>
    /// <returns> Number of user words imported. </returns>
    /// <exception cref="ArgumentNullException"> <paramref name="userDict"/> is null. </exception>
    public uint ImportUserDict(string userDict, ECodeType ct = ECodeType.CODE_TYPE_UNKNOWN)
    {
        if (userDict == null)
        {
            throw new ArgumentNullException("userDict");
        }

        // The string is marshalled as an ANSI byte buffer, so the length handed
        // to the native side is the encoded byte count, not the UTF-16 char
        // count (the two differ for multi-byte text such as Chinese).
        int byteLength = Encoding.Default.GetByteCount(userDict);
        return ICTCLAS_ImportUserDict(userDict, byteLength, ct);
    }

    /// <summary>
    /// Saves the user dictionary.
    /// </summary>
    /// <returns> True when the native call returns a non-zero value. </returns>
    public bool SaveUserDict()
    {
        return ICTCLAS_SaveTheUsrDic() != 0;
    }

    // Native library name. No surrounding whitespace: the name and the entry
    // points below are looked up literally.
    const string DLLPATH = "ICTCLAS50.dll";

    // NOTE(review): bool return values use the default 4-byte BOOL marshalling
    // and the default calling convention; if the native exports return a C++
    // bool or use cdecl, [return: MarshalAs(UnmanagedType.I1)] and
    // CallingConvention.Cdecl may be needed — confirm against the native header.

    /// <summary>
    /// Initializes the library. This call must succeed before any other entry point is used.
    /// </summary>
    /// <param name="sInitDirPath"> Directory containing the configuration file and the data files. </param>
    /// <returns> Whether initialization succeeded. </returns>
    [DllImport(DLLPATH, CharSet = CharSet.Ansi, EntryPoint = "ICTCLAS_Init")]
    private static extern bool ICTCLAS_Init(string sInitDirPath);

    /// <summary>
    /// Releases the library's resources. Call once all operations are finished.
    /// </summary>
    /// <returns> Whether the call succeeded. </returns>
    [DllImport(DLLPATH, CharSet = CharSet.Ansi, EntryPoint = "ICTCLAS_Exit")]
    private static extern bool ICTCLAS_Exit();

    /// <summary>
    /// Selects the part-of-speech tag set.
    /// </summary>
    /// <param name="nPOSmap"> Tag set to use. </param>
    /// <returns> Whether the call succeeded. </returns>
    [DllImport(DLLPATH, CharSet = CharSet.Ansi, EntryPoint = "ICTCLAS_SetPOSmap")]
    private static extern bool ICTCLAS_SetPOSmap(EPosTag nPOSmap);

    /// <summary>
    /// Imports a user-defined dictionary file.
    /// File format:
    /// 1. A word and its part-of-speech tag are separated by "@@", e.g. "中科院@@nr";
    /// 2. One word per line;
    /// 3. The part-of-speech tag may be omitted.
    /// </summary>
    /// <param name="pszFileName"> Path of the user dictionary file. </param>
    /// <param name="codeType"> Encoding of the dictionary. </param>
    /// <returns> Number of words imported. </returns>
    [DllImport(DLLPATH, CharSet = CharSet.Ansi, EntryPoint = "ICTCLAS_ImportUserDictFile")]
    private static extern uint ICTCLAS_ImportUserDictFile(string pszFileName, ECodeType codeType = ECodeType.CODE_TYPE_UNKNOWN);

    /// <summary>
    /// Imports a user dictionary from a string.
    /// 1. Builds a user dictionary from the supplied words.
    /// 2. That dictionary replaces the user dictionary currently held in memory.
    /// </summary>
    /// <param name="pszDictBuffer">
    /// User dictionary string:
    /// 1. A word and its part-of-speech tag are separated by "@@";
    /// 2. Entries are separated by a half-width ';';
    /// 3. The part-of-speech tag may be omitted.
    /// For example "中科院@@nr;分詞@@v;系統@@adj;...;"
    /// or "中科院;分詞;系統;...;".
    /// </param>
    /// <param name="length"> Length of the marshalled string buffer. </param>
    /// <param name="codeType"> Encoding. </param>
    /// <returns> Number of words imported successfully. </returns>
    [DllImport(DLLPATH, CharSet = CharSet.Ansi, EntryPoint = "ICTCLAS_ImportUserDict")]
    private static extern uint ICTCLAS_ImportUserDict(string pszDictBuffer, int length, ECodeType codeType);

    /// <summary>
    /// Saves the user dictionary.
    /// 1. Overwrites the user dictionary files in the /data/ folder.
    /// 2. The configuration file controls whether the dictionary is used next time.
    /// </summary>
    /// <returns> Native status code; SaveUserDict treats non-zero as success. </returns>
    [DllImport(DLLPATH, CharSet = CharSet.Ansi, EntryPoint = "ICTCLAS_SaveTheUsrDic")]
    private static extern int ICTCLAS_SaveTheUsrDic();

    /// <summary>
    /// Segments a paragraph of text.
    /// </summary>
    /// <param name="sParagraph"> Text to process. </param>
    /// <param name="result"> Caller-allocated array that receives the segmentation results. </param>
    /// <param name="eCT"> Encoding of the text. </param>
    /// <param name="bPOSTagged"> Whether to perform part-of-speech tagging. </param>
    /// <returns> Number of result entries written, as used by SegStr. </returns>
    [DllImport(DLLPATH, CharSet = CharSet.Ansi, EntryPoint = "ICTCLAS_ParagraphProcessAW")]
    private static extern int ICTCLAS_ParagraphProcessAW(string sParagraph, [Out, MarshalAs(UnmanagedType.LPArray)]result_t[] result, ECodeType eCT, bool bPOSTagged = false);

    /// <summary>
    /// Segments a text file.
    /// </summary>
    /// <param name="sSrcFilename"> Name of the file to segment. </param>
    /// <param name="sDsnFilename"> Name of the destination file. </param>
    /// <param name="eCt"> Encoding. </param>
    /// <param name="bPOStagged"> Whether to perform part-of-speech tagging. </param>
    /// <returns> Whether the call succeeded. </returns>
    [DllImport(DLLPATH, CharSet = CharSet.Ansi, EntryPoint = "ICTCLAS_FileProcess")]
    private static extern bool ICTCLAS_FileProcess(string sSrcFilename, string sDsnFilename, ECodeType eCt, bool bPOStagged = false);
}
}
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Runtime.InteropServices;
namespace ICTCLAS.NET
{
//////////////////////////////////////////////////////////////////////// //
// character coding types
//////////////////////////////////////////////////////////////////////// //
/// <summary>
/// Character encodings that can be declared for text handed to the segmenter.
/// The numeric values are passed through P/Invoke unchanged.
/// </summary>
public enum ECodeType
{
    /// <summary> Encoding is not known. </summary>
    CODE_TYPE_UNKNOWN = 0,

    /// <summary> ASCII. </summary>
    CODE_TYPE_ASCII = 1,

    /// <summary>
    /// GB family. The original comment lists GB2312, GBK and GB10380;
    /// the last is presumably GB18030 — confirm.
    /// </summary>
    CODE_TYPE_GB = 2,

    /// <summary> UTF-8. </summary>
    CODE_TYPE_UTF8 = 3,

    /// <summary> BIG5. </summary>
    CODE_TYPE_BIG5 = 4
}
/// <summary>
/// Part-of-speech (POS) tag set identifiers accepted by WordSegger.SetPosTagMap,
/// plus two size constants that share this enum.
/// </summary>
public enum EPosTag
{
/// <summary>
/// ICT (Institute of Computing Technology) second-level POS tag set
/// </summary>
ICT_POS_MAP_SECOND = 0,
/// <summary>
/// ICT (Institute of Computing Technology) first-level POS tag set
/// </summary>
ICT_POS_MAP_FIRST = 1,
/// <summary>
/// PKU (Peking University) second-level POS tag set
/// </summary>
PKU_POS_MAP_SECOND = 2,
/// <summary>
/// PKU (Peking University) first-level POS tag set
/// </summary>
PKU_POS_MAP_FIRST = 3,
/// <summary>
/// Number of POS tag sets (a count, not a tag set itself)
/// </summary>
POS_MAP_NUMBER = 4,
/// <summary>
/// Maximum size of a POS tag in bytes (a size constant, not a tag set itself)
/// </summary>
POS_SIZE = 8
}
/// <summary>
/// Managed mirror of one segmentation result record filled in by the native
/// ICTCLAS_ParagraphProcessAW call. It consists of eight consecutive 32-bit
/// integers, so the fields sit at byte offsets 0, 4, ..., 28 and the record
/// is 32 bytes long.
/// </summary>
[StructLayout(LayoutKind.Sequential)]
struct result_t
{
    /// <summary> Byte offset of the word inside the encoded input text. </summary>
    public int start;

    /// <summary> Length of the word in bytes. </summary>
    public int length;

    // NOTE(review): sPos and sPosLow together span 8 bytes, which matches
    // EPosTag.POS_SIZE; presumably they overlay the native POS tag text
    // buffer — confirm against the native header.
    public int sPos;
    public int sPosLow;

    /// <summary> Part-of-speech identifier. </summary>
    public int POS_id;

    /// <summary> Word identifier. </summary>
    public int word_ID;

    /// <summary> Word type reported by the native library. </summary>
    public int word_type;

    /// <summary> Word weight reported by the native library. </summary>
    public int weight;
}
/// <summary>
/// One segmented word as returned by WordSegger.SegStr.
/// </summary>
public struct Word
{
// Text of the word, decoded by SegStr from the encoded input bytes.
public string str;
// Copied by SegStr from the native result's POS_id field.
public int pos_id;
// Copied by SegStr from the native result's word_ID field.
public int word_id;
// Copied by SegStr from the native result's weight field.
public int weight;
// Copied by SegStr from the native result's word_type field.
// NOTE(review): the meaning of the values is not visible here — confirm against the native header.
public int word_type;
}
/// <summary>
/// Singleton wrapper around the native ICTCLAS50 word segmentation library.
/// Obtain the instance with <see cref="GetInstance"/>, which initializes the
/// native library on first use, and call <see cref="Release"/> when done.
/// </summary>
public class WordSegger
{
    /// <summary>
    /// Holder for the singleton instance.
    /// </summary>
    class Nested
    {
        // The empty static constructor keeps the C# compiler from marking this
        // type beforefieldinit, so the instance is created on first access.
        static Nested()
        {
        }

        internal static readonly WordSegger instance = new WordSegger();
    }

    // Guards initialization and release of the native library.
    private static readonly object lockobj = new object();

    // True while the native library is initialized (ICTCLAS_Init succeeded
    // and ICTCLAS_Exit has not been called since).
    private static bool inited = false;

    /// <summary>
    /// Gets the segmenter. The first call must supply <paramref name="path"/>;
    /// later calls may omit it because the native library is already initialized.
    /// </summary>
    /// <param name="path"> Directory containing the configuration file and the data files. </param>
    /// <returns> The singleton instance, or null when ICTCLAS_Init reports failure. </returns>
    public static WordSegger GetInstance(string path = "")
    {
        if (inited)
        {
            return Nested.instance;
        }

        lock (lockobj)
        {
            if (!inited)
            {
                inited = ICTCLAS_Init(path);
                if (!inited)
                {
                    return null;
                }
            }

            return Nested.instance;
        }
    }

    /// <summary>
    /// Explicitly releases the resources held by the native library by calling
    /// ICTCLAS_Exit. Does nothing when the library is not initialized. After a
    /// release, the next <see cref="GetInstance"/> call runs ICTCLAS_Init again.
    /// </summary>
    public static void Release()
    {
        if (inited)
        {
            lock (lockobj)
            {
                if (inited)
                {
                    ICTCLAS_Exit();
                    inited = false;
                }
            }
        }
    }

    /// <summary>
    /// Private so that instances can only be obtained through <see cref="GetInstance"/>.
    /// </summary>
    private WordSegger()
    {
    }

    /// <summary>
    /// Finalizer: releases the native library if the caller never called <see cref="Release"/>.
    /// </summary>
    ~WordSegger()
    {
        Release();
    }

    /// <summary>
    /// Segments a string into words.
    /// </summary>
    /// <param name="str"> Text to segment. </param>
    /// <param name="ecode"> Encoding passed to the native library. </param>
    /// <param name="posTagged"> Whether to perform part-of-speech tagging. </param>
    /// <returns> The segmented words; an empty array when the input is empty or nothing was produced. </returns>
    /// <exception cref="ArgumentNullException"> <paramref name="str"/> is null. </exception>
    /// <remarks>
    /// NOTE(review): the text is always marshalled with CharSet.Ansi and the
    /// word offsets are decoded with Encoding.Default, regardless of
    /// <paramref name="ecode"/>; presumably ecode has to match the system ANSI
    /// code page for the offsets to line up — confirm.
    /// </remarks>
    public Word[] SegStr(string str, ECodeType ecode, bool posTagged = false)
    {
        if (str == null)
        {
            throw new ArgumentNullException("str");
        }

        if (str.Length == 0)
        {
            // Nothing to segment; avoid handing the native code a zero-length buffer.
            return new Word[0];
        }

        // One result slot per input character: a word spans at least one character.
        result_t[] result = new result_t[str.Length];
        int cnt = ICTCLAS_ParagraphProcessAW(str, result, ecode, posTagged);
        if (cnt <= 0)
        {
            // A negative count would otherwise make the array allocation below throw.
            return new Word[0];
        }

        // The native offsets are byte offsets into the ANSI-encoded text,
        // so slice the encoded bytes rather than the UTF-16 string.
        Encoding ansi = Encoding.Default;
        byte[] encoded = ansi.GetBytes(str);

        Word[] words = new Word[cnt];
        for (int i = 0; i < cnt; i++)
        {
            words[i].str = ansi.GetString(encoded, result[i].start, result[i].length);
            words[i].pos_id = result[i].POS_id;
            words[i].word_id = result[i].word_ID;
            words[i].weight = result[i].weight;
            words[i].word_type = result[i].word_type;
        }

        return words;
    }

    /// <summary>
    /// Segments a text file and writes the result to another file.
    /// </summary>
    /// <param name="src"> Path of the source file. </param>
    /// <param name="ct"> Encoding of the source file. </param>
    /// <param name="des"> Path of the destination file. </param>
    /// <param name="postag"> Whether to perform part-of-speech tagging. </param>
    /// <returns> Whether segmentation succeeded. </returns>
    public bool SegFile(string src, ECodeType ct, string des, bool postag = false)
    {
        return ICTCLAS_FileProcess(src, des, ct, postag);
    }

    /// <summary>
    /// Selects the part-of-speech tag set.
    /// </summary>
    /// <param name="nPOSmap"> Tag set to use. </param>
    /// <returns> Success or failure. </returns>
    public bool SetPosTagMap(EPosTag nPOSmap)
    {
        return ICTCLAS_SetPOSmap(nPOSmap);
    }

    /// <summary>
    /// Imports a user dictionary from a file.
    ///
    /// File format:
    /// 1. A word and its part-of-speech tag are separated by "@@", e.g. "中科院@@nr";
    /// 2. One word per line;
    /// 3. The part-of-speech tag may be omitted.
    /// </summary>
    /// <param name="path"> Path of the user dictionary file. </param>
    /// <param name="ct"> Encoding of the file. </param>
    /// <returns> Number of user-defined words imported. </returns>
    public uint ImportUserDictFile(string path, ECodeType ct = ECodeType.CODE_TYPE_UNKNOWN)
    {
        return ICTCLAS_ImportUserDictFile(path, ct);
    }

    /// <summary>
    /// Imports user-defined words from a string.
    /// </summary>
    /// <param name="userDict">
    /// User words:
    /// 1. A word and its part-of-speech tag are separated by "@@";
    /// 2. Entries are separated by a half-width ';';
    /// 3. The part-of-speech tag may be omitted.
    /// For example "中科院@@nr;分詞@@v;系統@@adj;...;"
    /// or "中科院;分詞;系統;...;".
    /// </param>
    /// <param name="ct"> Encoding. </param>
    /// <returns> Number of user words imported. </returns>
    /// <exception cref="ArgumentNullException"> <paramref name="userDict"/> is null. </exception>
    public uint ImportUserDict(string userDict, ECodeType ct = ECodeType.CODE_TYPE_UNKNOWN)
    {
        if (userDict == null)
        {
            throw new ArgumentNullException("userDict");
        }

        // The string is marshalled as an ANSI byte buffer, so the length handed
        // to the native side is the encoded byte count, not the UTF-16 char
        // count (the two differ for multi-byte text such as Chinese).
        int byteLength = Encoding.Default.GetByteCount(userDict);
        return ICTCLAS_ImportUserDict(userDict, byteLength, ct);
    }

    /// <summary>
    /// Saves the user dictionary.
    /// </summary>
    /// <returns> True when the native call returns a non-zero value. </returns>
    public bool SaveUserDict()
    {
        return ICTCLAS_SaveTheUsrDic() != 0;
    }

    // Native library name. No surrounding whitespace: the name and the entry
    // points below are looked up literally.
    const string DLLPATH = "ICTCLAS50.dll";

    // NOTE(review): bool return values use the default 4-byte BOOL marshalling
    // and the default calling convention; if the native exports return a C++
    // bool or use cdecl, [return: MarshalAs(UnmanagedType.I1)] and
    // CallingConvention.Cdecl may be needed — confirm against the native header.

    /// <summary>
    /// Initializes the library. This call must succeed before any other entry point is used.
    /// </summary>
    /// <param name="sInitDirPath"> Directory containing the configuration file and the data files. </param>
    /// <returns> Whether initialization succeeded. </returns>
    [DllImport(DLLPATH, CharSet = CharSet.Ansi, EntryPoint = "ICTCLAS_Init")]
    private static extern bool ICTCLAS_Init(string sInitDirPath);

    /// <summary>
    /// Releases the library's resources. Call once all operations are finished.
    /// </summary>
    /// <returns> Whether the call succeeded. </returns>
    [DllImport(DLLPATH, CharSet = CharSet.Ansi, EntryPoint = "ICTCLAS_Exit")]
    private static extern bool ICTCLAS_Exit();

    /// <summary>
    /// Selects the part-of-speech tag set.
    /// </summary>
    /// <param name="nPOSmap"> Tag set to use. </param>
    /// <returns> Whether the call succeeded. </returns>
    [DllImport(DLLPATH, CharSet = CharSet.Ansi, EntryPoint = "ICTCLAS_SetPOSmap")]
    private static extern bool ICTCLAS_SetPOSmap(EPosTag nPOSmap);

    /// <summary>
    /// Imports a user-defined dictionary file.
    /// File format:
    /// 1. A word and its part-of-speech tag are separated by "@@", e.g. "中科院@@nr";
    /// 2. One word per line;
    /// 3. The part-of-speech tag may be omitted.
    /// </summary>
    /// <param name="pszFileName"> Path of the user dictionary file. </param>
    /// <param name="codeType"> Encoding of the dictionary. </param>
    /// <returns> Number of words imported. </returns>
    [DllImport(DLLPATH, CharSet = CharSet.Ansi, EntryPoint = "ICTCLAS_ImportUserDictFile")]
    private static extern uint ICTCLAS_ImportUserDictFile(string pszFileName, ECodeType codeType = ECodeType.CODE_TYPE_UNKNOWN);

    /// <summary>
    /// Imports a user dictionary from a string.
    /// 1. Builds a user dictionary from the supplied words.
    /// 2. That dictionary replaces the user dictionary currently held in memory.
    /// </summary>
    /// <param name="pszDictBuffer">
    /// User dictionary string:
    /// 1. A word and its part-of-speech tag are separated by "@@";
    /// 2. Entries are separated by a half-width ';';
    /// 3. The part-of-speech tag may be omitted.
    /// For example "中科院@@nr;分詞@@v;系統@@adj;...;"
    /// or "中科院;分詞;系統;...;".
    /// </param>
    /// <param name="length"> Length of the marshalled string buffer. </param>
    /// <param name="codeType"> Encoding. </param>
    /// <returns> Number of words imported successfully. </returns>
    [DllImport(DLLPATH, CharSet = CharSet.Ansi, EntryPoint = "ICTCLAS_ImportUserDict")]
    private static extern uint ICTCLAS_ImportUserDict(string pszDictBuffer, int length, ECodeType codeType);

    /// <summary>
    /// Saves the user dictionary.
    /// 1. Overwrites the user dictionary files in the /data/ folder.
    /// 2. The configuration file controls whether the dictionary is used next time.
    /// </summary>
    /// <returns> Native status code; SaveUserDict treats non-zero as success. </returns>
    [DllImport(DLLPATH, CharSet = CharSet.Ansi, EntryPoint = "ICTCLAS_SaveTheUsrDic")]
    private static extern int ICTCLAS_SaveTheUsrDic();

    /// <summary>
    /// Segments a paragraph of text.
    /// </summary>
    /// <param name="sParagraph"> Text to process. </param>
    /// <param name="result"> Caller-allocated array that receives the segmentation results. </param>
    /// <param name="eCT"> Encoding of the text. </param>
    /// <param name="bPOSTagged"> Whether to perform part-of-speech tagging. </param>
    /// <returns> Number of result entries written, as used by SegStr. </returns>
    [DllImport(DLLPATH, CharSet = CharSet.Ansi, EntryPoint = "ICTCLAS_ParagraphProcessAW")]
    private static extern int ICTCLAS_ParagraphProcessAW(string sParagraph, [Out, MarshalAs(UnmanagedType.LPArray)]result_t[] result, ECodeType eCT, bool bPOSTagged = false);

    /// <summary>
    /// Segments a text file.
    /// </summary>
    /// <param name="sSrcFilename"> Name of the file to segment. </param>
    /// <param name="sDsnFilename"> Name of the destination file. </param>
    /// <param name="eCt"> Encoding. </param>
    /// <param name="bPOStagged"> Whether to perform part-of-speech tagging. </param>
    /// <returns> Whether the call succeeded. </returns>
    [DllImport(DLLPATH, CharSet = CharSet.Ansi, EntryPoint = "ICTCLAS_FileProcess")]
    private static extern bool ICTCLAS_FileProcess(string sSrcFilename, string sDsnFilename, ECodeType eCt, bool bPOStagged = false);
}
}
整個工程代碼:
ICTCLAS.NET.rar
ICTCLAS分詞接口建議到官方下載,不過為了對應版本,也可以從如下地址下載: