A few questions:
1. How aggressively should stop words be removed?
2. Suppose I set topicNumber = 100 and, in the results, the top few words of 80 topics each describe a coherent topic well, while the top words of the other 20 topics do not. Does that show topicNumber = 100 is already enough?
3. To what extent does LDA model the relationships between documents?
4. How should the parameters alpha and beta be chosen? alpha = 50/K? beta = 0.1 (or 0.01)? (A common heuristic is sketched below.)
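For what it's worth, the heuristic most often quoted in the literature (Griffiths & Steyvers) is alpha = 50/K and beta = 0.1 or 0.01. Below is a minimal sketch of those defaults, purely as an assumption taken from the literature, not something the code later in this post sets for you.

public class PriorDefaults {
    public static void main(String[] args) {
        // Common heuristic priors (assumption from the literature, not enforced by LdaModel below)
        int K = 100;                 // topic number
        float alpha = 50.0f / K;     // symmetric doc-topic prior: 0.5 when K = 100
        float beta = 0.01f;          // symmetric topic-word prior (0.1 is also common)
        System.out.println("alpha = " + alpha + ", beta = " + beta);
    }
}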
========================================
I've read several write-ups on LDA that are genuinely excellent, so all I can do here is paste some code to show that I have actually implemented LDA.
import java.io.BufferedWriter;
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;

public class LdaModel {

    int[][] doc;        // word index array: for each document, the index of each word in indexToTermMap
    int V, K, M;        // vocabulary size, topic number, document number
    int[][] z;          // topic label array: the topic assigned to each word of each document
    float alpha;        // doc-topic Dirichlet prior parameter
    float beta;         // topic-word Dirichlet prior parameter
    int[][] nmk;        // given document m, count of words assigned to topic k. M*K
    int[][] nkt;        // given topic k, count of occurrences of term t. K*V
    int[] nmkSum;       // row sums of nmk: nmkSum[m] = number of words in document m
    int[] nktSum;       // row sums of nkt: nktSum[k] = number of words assigned to topic k
    double[][] phi;     // parameters of the topic-word distribution, K*V
    double[][] theta;   // parameters of the doc-topic distribution, M*K
    int iterations;     // number of Gibbs sampling iterations
    int saveStep;       // number of iterations between two saves
    int beginSaveIters; // begin saving the model at this iteration

    public LdaModel(LdaGibbsSampling.modelparameters modelparam) {
        alpha = modelparam.alpha;
        beta = modelparam.beta;
        iterations = modelparam.iteration;
        K = modelparam.topicNum;
        saveStep = modelparam.saveStep;
        beginSaveIters = modelparam.beginSaveIters;
    }

    public void initializeModel(Documents docSet) {
        M = docSet.docs.size();
        V = docSet.termToIndexMap.size();
        nmk = new int[M][K];
        nkt = new int[K][V];
        nmkSum = new int[M];
        nktSum = new int[K];
        phi = new double[K][V];
        theta = new double[M][K];

        // Initialize the document word-index array
        doc = new int[M][];
        for (int m = 0; m < M; m++) {
            // Notice the limit of memory
            int N = docSet.docs.get(m).docWords.length;
            doc[m] = new int[N];
            for (int n = 0; n < N; n++) {
                doc[m][n] = docSet.docs.get(m).docWords[n];
            }
        }

        // Initialize the topic label z for each word
        z = new int[M][];
        for (int m = 0; m < M; m++) {
            int N = docSet.docs.get(m).docWords.length;
            z[m] = new int[N];
            for (int n = 0; n < N; n++) {
                int initTopic = (int) (Math.random() * K); // from 0 to K - 1
                z[m][n] = initTopic;
                // number of words in doc m assigned to topic initTopic plus 1
                nmk[m][initTopic]++;
                // number of occurrences of term doc[m][n] assigned to topic initTopic plus 1
                nkt[initTopic][doc[m][n]]++;
                // total number of words assigned to topic initTopic plus 1
                nktSum[initTopic]++;
            }
            // total number of words in document m is N
            nmkSum[m] = N;
        }
    }

    public void inferenceModel(Documents docSet) throws IOException {
        if (iterations < saveStep + beginSaveIters) {
            System.err.println("Error: the number of iterations should be at least "
                    + (saveStep + beginSaveIters));
            System.exit(0);
        }
        for (int i = 0; i < iterations; i++) {
            System.out.println("Iteration " + i);
            if ((i >= beginSaveIters) && (((i - beginSaveIters) % saveStep) == 0)) {
                // Save the model
                System.out.println("Saving model at iteration " + i + " ...");
                // First update the estimated parameters
                updateEstimatedParameters();
                // Then write out the model variables
                saveIteratedModel(i, docSet);
            }
            // Use Gibbs sampling to update z[][]
            for (int m = 0; m < M; m++) {
                int N = docSet.docs.get(m).docWords.length;
                for (int n = 0; n < N; n++) {
                    // Sample from p(z_i|z_-i, w)
                    int newTopic = sampleTopicZ(m, n);
                    z[m][n] = newTopic;
                }
            }
        }
    }

    private void updateEstimatedParameters() {
        for (int k = 0; k < K; k++) {
            for (int t = 0; t < V; t++) {
                phi[k][t] = (nkt[k][t] + beta) / (nktSum[k] + V * beta);
            }
        }
        for (int m = 0; m < M; m++) {
            for (int k = 0; k < K; k++) {
                theta[m][k] = (nmk[m][k] + alpha) / (nmkSum[m] + K * alpha);
            }
        }
    }

    private int sampleTopicZ(int m, int n) {
        // Sample from p(z_i|z_-i, w) using the collapsed Gibbs update rule

        // Remove the current topic label of w_{m,n} from the counts
        int oldTopic = z[m][n];
        nmk[m][oldTopic]--;
        nkt[oldTopic][doc[m][n]]--;
        nmkSum[m]--;
        nktSum[oldTopic]--;

        // Compute p(z_i = k|z_-i, w) up to a normalising constant
        double[] p = new double[K];
        for (int k = 0; k < K; k++) {
            p[k] = (nkt[k][doc[m][n]] + beta) / (nktSum[k] + V * beta)
                    * (nmk[m][k] + alpha) / (nmkSum[m] + K * alpha);
        }

        // Sample a new topic label for w_{m,n} roulette-wheel style:
        // build the cumulative distribution over p
        for (int k = 1; k < K; k++) {
            p[k] += p[k - 1];
        }
        double u = Math.random() * p[K - 1]; // p[] is unnormalised
        int newTopic;
        for (newTopic = 0; newTopic < K; newTopic++) {
            if (u < p[newTopic]) {
                break;
            }
        }

        // Add the new topic label of w_{m,n} back into the counts
        nmk[m][newTopic]++;
        nkt[newTopic][doc[m][n]]++;
        nmkSum[m]++;
        nktSum[newTopic]++;
        return newTopic;
    }

    public void saveIteratedModel(int iters, Documents docSet) throws IOException {
        // Writes lda.params, lda.phi, lda.theta, lda.tassign, lda.twords

        // lda.params
        String resPath = LdaConfig.OUTPUTFILE_PATH;
        String modelName = "lda_" + iters;
        ArrayList<String> lines = new ArrayList<String>();
        lines.add("alpha = " + alpha);
        lines.add("beta = " + beta);
        lines.add("topicNum = " + K);
        lines.add("docNum = " + M);
        lines.add("termNum = " + V);
        lines.add("iterations = " + iterations);
        lines.add("saveStep = " + saveStep);
        lines.add("beginSaveIters = " + beginSaveIters);
        FileUtil.writeLines(resPath + modelName + ".params", lines);

        // lda.phi K*V
        BufferedWriter writer = new BufferedWriter(new FileWriter(resPath + modelName + ".phi"));
        for (int i = 0; i < K; i++) {
            for (int j = 0; j < V; j++) {
                writer.write(phi[i][j] + "\t");
            }
            writer.write("\n");
        }
        writer.close();

        // lda.theta M*K
        writer = new BufferedWriter(new FileWriter(resPath + modelName + ".theta"));
        for (int i = 0; i < M; i++) {
            for (int j = 0; j < K; j++) {
                writer.write(theta[i][j] + "\t");
            }
            writer.write("\n");
        }
        writer.close();

        // lda.tassign
        writer = new BufferedWriter(new FileWriter(resPath + modelName + ".tassign"));
        for (int m = 0; m < M; m++) {
            for (int n = 0; n < doc[m].length; n++) {
                writer.write(doc[m][n] + ":" + z[m][n] + "\t");
            }
            writer.write("\n");
        }
        writer.close();

        // lda.twords, built from phi[][] K*V
        writer = new BufferedWriter(new FileWriter(resPath + modelName + ".twords"));
        int topNum = 15;
        // Find the topNum highest-probability words in each topic
        for (int i = 0; i < K; i++) {
            List<Integer> tWordsIndexArray = new ArrayList<Integer>();
            for (int j = 0; j < V; j++) {
                tWordsIndexArray.add(new Integer(j));
            }
            Collections.sort(tWordsIndexArray, new LdaModel.ArrayDoubleComparator(phi[i]));
            writer.write("topic " + i + "\t:\t");
            for (int t = 0; t < topNum; t++) {
                writer.write(docSet.indexToTermMap.get(tWordsIndexArray.get(t)) + " "
                        + phi[i][tWordsIndexArray.get(t)] + "\t");
            }
            writer.write("\n");
        }
        writer.close();
    }

    // Save each topic to the database as "word1:p1;word2:p2;..."
    public void saveTopic(Documents docSet) {
        int topNum = 15;
        for (int i = 0; i < K; i++) {
            List<Integer> tWordsIndexArray = new ArrayList<Integer>();
            for (int j = 0; j < V; j++) {
                tWordsIndexArray.add(new Integer(j));
            }
            Collections.sort(tWordsIndexArray, new LdaModel.ArrayDoubleComparator(phi[i]));
            TbTopic tbTopic = new TbTopic();
            tbTopic.setId(i);
            StringBuffer bf = new StringBuffer();
            for (int t = 0; t < topNum; t++) {
                bf.append(docSet.indexToTermMap.get(tWordsIndexArray.get(t)));
                bf.append(":");
                bf.append(phi[i][tWordsIndexArray.get(t)]);
                bf.append(";");
            }
            tbTopic.setWords(bf.toString());
            DocDBUtil.saveTbTopic(tbTopic);
        }
    }

    // Save the top topics of each document as "topicId1:p1;topicId2:p2;..."
    public void saveDisTopic(Documents docSet) {
        int topicNum = 3;
        for (int i = 0; i < M; i++) {
            int disId = Integer.parseInt(docSet.docs.get(i).docName);
            TbDisTopic tbDisTopic = new TbDisTopic();
            tbDisTopic.setId(disId);
            List<Integer> topicIndexArray = new ArrayList<Integer>();
            for (int j = 0; j < K; j++) {
                topicIndexArray.add(new Integer(j));
            }
            Collections.sort(topicIndexArray, new LdaModel.ArrayDoubleComparator(theta[i]));
            String topicIds = "";
            for (int t = 0; t < topicNum; t++) {
                topicIds += topicIndexArray.get(t);
                topicIds += ":";
                topicIds += theta[i][topicIndexArray.get(t)];
                topicIds += ";";
            }
            tbDisTopic.setTopic(topicIds);
            DocDBUtil.saveTbDisTopic(tbDisTopic);
        }
    }

    // Sorts word (or topic) indices in descending order of their probability
    public class ArrayDoubleComparator implements Comparator<Integer> {
        private double[] sortProb; // probability of each word in topic k

        public ArrayDoubleComparator(double[] sortProb) {
            this.sortProb = sortProb;
        }

        @Override
        public int compare(Integer o1, Integer o2) {
            if (sortProb[o1] > sortProb[o2])
                return -1;
            else if (sortProb[o1] < sortProb[o2])
                return 1;
            else
                return 0;
        }
    }
}
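For reference, sampleTopicZ above implements the standard collapsed Gibbs update. With the current assignment of word $w_{m,n} = t$ first removed from the counts, the unnormalised probability it computes for topic $k$ is

$$
p(z_{m,n} = k \mid \mathbf{z}_{\neg(m,n)}, \mathbf{w}) \;\propto\; \frac{n_{k,t} + \beta}{n_k + V\beta} \cdot \frac{n_{m,k} + \alpha}{n_m + K\alpha}
$$

where $n_{k,t}$ = nkt[k][t], $n_k$ = nktSum[k], $n_{m,k}$ = nmk[m][k] and $n_m$ = nmkSum[m]; updateEstimatedParameters estimates phi and theta from the same smoothed ratios.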
The core code is actually written quite well.
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

public class Documents {

    ArrayList<Document> docs;            // docs.size() = M
    Map<String, Integer> termToIndexMap; // term -> index, all distinct terms; size() = V
    ArrayList<String> indexToTermMap;    // index -> term; the list position is the index, so it acts as the dictionary
    Map<String, Integer> termCountMap;   // term -> frequency over the whole corpus

    public Documents() {
        docs = new ArrayList<Document>();
        termToIndexMap = new HashMap<String, Integer>();
        indexToTermMap = new ArrayList<String>();
        termCountMap = new HashMap<String, Integer>();
    }

    public void getDocsFromDB() {
        List<TbDiseases> diseases = DocDBUtil.getTbDiseases();
        for (TbDiseases disea : diseases) {
            String content = disea.getSymptomDetail();
            String docName = disea.getId() + "";
            Document doc = new Document(content, termToIndexMap, indexToTermMap, termCountMap, docName);
            docs.add(doc);
        }
    }

    public static class Document {
        String docName;
        // Words of the document, with stop words and noise words already removed;
        // each int is the index of the corresponding term in indexToTermMap
        int[] docWords;

        public Document(String content, Map<String, Integer> termToIndexMap,
                ArrayList<String> indexToTermMap, Map<String, Integer> termCountMap, String docName) {
            this.setDocName(docName);
            ArrayList<String> words = DocDBUtil.getWordsFromSentence(content);
            // Map each word to its dictionary index, adding new words as they appear
            this.docWords = new int[words.size()];
            for (int i = 0; i < words.size(); i++) {
                String word = words.get(i);
                if (!termToIndexMap.containsKey(word)) {
                    int newIndex = termToIndexMap.size();
                    termToIndexMap.put(word, newIndex);
                    indexToTermMap.add(word);
                    termCountMap.put(word, new Integer(1));
                    docWords[i] = newIndex;
                } else {
                    docWords[i] = termToIndexMap.get(word);
                    termCountMap.put(word, termCountMap.get(word) + 1);
                }
            }
            words.clear();
        }

        public void setDocName(String docName) {
            this.docName = docName;
        }

        public String getDocName() {
            return docName;
        }
    } // Document
}
The documents come from a database.
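To show how the two classes fit together, here is a rough driver sketch. The definition of LdaGibbsSampling.modelparameters is not included in this post, so I'm assuming it is a plain parameter holder with the fields the LdaModel constructor reads; the concrete values below are illustrative only.

import java.io.IOException;

// Rough usage sketch (assumption: modelparameters has a no-arg constructor and
// exposes the fields read by the LdaModel constructor; values are not tuned).
public class LdaDemo {
    public static void main(String[] args) throws IOException {
        LdaGibbsSampling.modelparameters params = new LdaGibbsSampling.modelparameters();
        params.alpha = 0.5f;           // e.g. 50 / K with K = 100 (illustrative)
        params.beta = 0.01f;
        params.topicNum = 100;
        params.iteration = 1000;
        params.saveStep = 100;
        params.beginSaveIters = 500;   // must satisfy iteration >= beginSaveIters + saveStep

        Documents docSet = new Documents();
        docSet.getDocsFromDB();            // load the disease documents from the database

        LdaModel model = new LdaModel(params);
        model.initializeModel(docSet);     // random initial topics, build the count matrices
        model.inferenceModel(docSet);      // Gibbs sampling plus periodic model dumps
        model.saveTopic(docSet);           // persist "word:prob;..." per topic to the DB
        model.saveDisTopic(docSet);        // persist the top 3 topics per document to the DB
    }
}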