Trie樹實現[ java ]

本文轉載自查看原文 2012-04-27 21:25 5384 71-算法

trie樹的定義這里就不多說了，直接貼代碼（代碼大部分是參考別人的，修改了個別錯誤，並添加了一個最大匹配的方法）。

package com.fox.analyzer;

import java.util.ArrayList;
import java.util.List;

public class Trie {
	private Vertex root = new Vertex();

	protected class Vertex {
		protected int words; // 單詞個數
		protected int prefixes; // 前綴個數
		protected Vertex[] edges; // 子節點

		Vertex() {
			this.words = 0;
			this.prefixes = 0;
			edges = new Vertex[26];
			for (int i = 0; i < edges.length; i++) {
				edges[i] = null;
			}
		}
	}

	/**
	 * 獲取tire樹中所有的詞
	 * 
	 * @return
	 */

	public List<String> listAllWords() {

		List<String> words = new ArrayList<String>();
		Vertex[] edges = root.edges;

		for (int i = 0; i < edges.length; i++) {
			if (edges[i] != null) {
				String word = "" + (char) ('a' + i);
				depthFirstSearchWords(words, edges[i], word);
			}
		}
		return words;
	}

	/**
	 * 
	 * @param words
	 * @param vertex
	 * @param wordSegment
	 */

	private void depthFirstSearchWords(List words, Vertex vertex,
			String wordSegment) {
		if (vertex.words != 0) {
			words.add(wordSegment);
		}
		Vertex[] edges = vertex.edges;
		for (int i = 0; i < edges.length; i++) {
			if (edges[i] != null) {
				String newWord = wordSegment + (char) ('a' + i);
				depthFirstSearchWords(words, edges[i], newWord);
			}
		}
	}

	/**
	 * 計算指定前綴單詞的個數
	 * 
	 * @param prefix
	 * @return
	 */
	public int countPrefixes(String prefix) {
		return countPrefixes(root, prefix);
	}

	private int countPrefixes(Vertex vertex, String prefixSegment) {
		if (prefixSegment.length() == 0) { // reach the last character of the
											// word
			return vertex.prefixes;
		}

		char c = prefixSegment.charAt(0);
		int index = c - 'a';
		if (vertex.edges[index] == null) { // the word does NOT exist
			return 0;
		} else {

			return countPrefixes(vertex.edges[index],
					prefixSegment.substring(1));

		}

	}

	/**
	 * 計算完全匹配單詞的個數
	 * 
	 * @param word
	 * @return
	 */
	public int countWords(String word) {
		return countWords(root, word);
	}

	private int countWords(Vertex vertex, String wordSegment) {
		if (wordSegment.length() == 0) { // reach the last character of the word
			return vertex.words;
		}

		char c = wordSegment.charAt(0);
		int index = c - 'a';
		if (vertex.edges[index] == null) { // the word does NOT exist
			return 0;
		} else {
			return countWords(vertex.edges[index], wordSegment.substring(1));

		}

	}

	/**
	 * 向tire樹添加一個詞
	 * 
	 * @param word
	 * 
	 */

	public void addWord(String word) {
		addWord(root, word);
	}

	/**
	 * Add the word from the specified vertex.
	 * 
	 * @param vertex
	 *            The specified vertex.
	 * @param word
	 *            The word to be added.
	 */

	private void addWord(Vertex vertex, String word) {
		if (word.length() == 0) { // if all characters of the word has been
									// added
			vertex.words++;
		} else {
			vertex.prefixes++;
			char c = word.charAt(0);
			c = Character.toLowerCase(c);
			int index = c - 'a';
			if (vertex.edges[index] == null) { // if the edge does NOT exist
				vertex.edges[index] = new Vertex();
			}

			addWord(vertex.edges[index], word.substring(1)); // go the the next
																// character
		}
	}

	/**
	 * 返回指定字段前綴匹配最長的單詞。
	 * 
	 * @param word
	 * @return
	 */
	public String getMaxMatchWord(String word) {
		String s = "";
		String temp = "";// 記錄最近一次匹配最長的單詞
		char[] w = word.toCharArray();
		Vertex vertex = root;
		for (int i = 0; i < w.length; i++) {
			char c = w[i];
			c = Character.toLowerCase(c);
			int index = c - 'a';
			if (vertex.edges[index] == null) {// 如果沒有子節點
				if (vertex.words != 0)// 如果是一個單詞，則返回
					return s;
				else
					// 如果不是一個單詞則返回null
					return null;
			} else {
				if (vertex.words != 0)
					temp = s;
				s += c;
				vertex = vertex.edges[index];
			}
		}
		// trie中存在比指定單詞更長（包含指定詞）的單詞
		if (vertex.words == 0)//
			return temp;
		return s;
	}

	public static void main(String args[]) // Just used for test
	{
		Trie trie = new Trie();
		trie.addWord("abcedfddddddd");
		trie.addWord("a");
		trie.addWord("ba");
		trie.addWord("abce");
		trie.addWord("abcedfdddd");
		trie.addWord("abcef");

		String maxMatch = trie.getMaxMatchWord("abcedfddd");
		System.out.println(maxMatch);
		// List<String> list = trie.listAllWords();
		// Iterator listiterator = list.listIterator();
		// while (listiterator.hasNext()) {
		// String s = (String) listiterator.next();
		// System.out.println(s);
		// }

		// int count = trie.countPrefixes("abcdef");
		// int count1 = trie.countWords("abcd");
		// System.out.println("prefixes:" + count);
		// System.out.println("countWords:" + count1);

	}
}

最近開始慢慢了解中文分詞技術，剛開始當然就是最大匹配了，之前做了一個非常簡單的算法，字典存儲在hashSet中，每次正向或逆向做最大切分N（字典中最長詞組的長度，取N個字符），不匹配則取（N-1）個字符，繼續匹配，如此循環只到匹配字典中的一個單詞或者被切分成單字（1個字符）為止。例如我找的一個詞典最長詞組的長度為16，然后中文詞組普遍是二字或三字詞語，這里每切分一個詞就白白多匹配了10多次。效率明顯會有問題。

那么有什么好的方式可以解決這個問題呢? 首先想到的是字典的存儲方式導致了這一問題。那么Trie樹是如何解決這個問題的呢？

首先，我們可以將一段文字按照標點切分成句子（文章 -》段落 -》句子 -》詞組 ），然后將句子在字典（trie樹）里做最大匹配（或者找出所有前綴匹配，做最優選擇）進行切分，這么一來，就可以避免多次匹配，提高效率。

不過上面的代碼，是針對英文的（為了簡單起見），然后如果是中文，那么這棵trie會非常扁平（root下將出現成千上萬的子節點），這個地方將會發生比較大的效率問題（查找效率），那么可以進行划分，例如將中文里的詞語按照首字的首字母進行划分（或者其他方式），形成一個森林，然后用一個關聯表進行對應。

免責聲明！

本站轉載的文章為個人學習借鑒使用，本站對版權不負任何法律責任。如果侵犯了您的隱私權益，請聯系本站郵箱yoyou2525@163.com刪除。

猜您在找 Trie 樹及Java實現雙數組Trie樹(DoubleArrayTrie)Java實現 java實現的Trie樹數據結構 Trie樹的C++實現 trie樹（前綴樹）詳解——PHP代碼實現 trie樹-前綴樹從Trie樹到雙數組Trie樹 [原創] Trie樹 php 實現敏感詞過濾字典樹（Trie tree）【trie樹】Xor Sum