#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/*
 * One node of the Huffman tree. The tree is stored in a flat array and the
 * links are array indices; index 0 is used throughout this file as "none".
 */
typedef struct
{
    int weight; /* weight of the node */
    int parent; /* index of the parent node, 0 when the node has no parent */
    int lchild; /* index of the left child, 0 when there is none */
    int rchild; /* index of the right child, 0 when there is none */
} HTNode, *HuffmanTree;
/* Input text: the symbols to encode (at most 99 characters plus the terminator). */
static char N[100];
/* Huffman code table: an array of C strings, one code string per symbol. */
typedef char **HuffmanCode;
/* Pair of node indices: the lightest and the second lightest node. */
typedef struct
{
    int s1; /* index of the node with the smallest weight */
    int s2; /* index of the node with the second smallest weight */
} MinCode;
- //函數聲明
- void Error(char *message);
- HuffmanCode HuffmanCoding(HuffmanTree &HT,HuffmanCode HC,int *w,int n);
- MinCode Select(HuffmanTree HT,int n);
- //當輸入1個結點時的錯誤提示
- void Error(char *message)
- {
- fprintf(stderr,"Error:%s\n",message);
- exit(1);
- }
- //構造哈夫曼樹HT,編碼存放在HC中,w為權值,n為結點個數
- HuffmanCode HuffmanCoding(HuffmanTree &HT,HuffmanCode HC,int *w,int n)
- {
- int i,s1=0,s2=0;
- HuffmanTree p;
- char *cd;
- int f,c,start,m;
- MinCode min;
- if(n<=1)
- {
- Error("Code too small!");//只有一個結點不進行編碼,直接exit(1)退出。非return,如果return 會造成main函數HT[i]無值
- }
- m=2*n-1;//哈弗曼編碼需要開辟的結點大小為2n-1
- HT=(HuffmanTree)malloc((m+1)*sizeof(HTNode));//開辟哈夫曼樹結點空間 m+1 。為了對應關系,我們第0個空間不用。
- //初始化n個葉子結點,w[0] = 0,main函數已賦值
- for(p=HT,i=0;i<=n;i++,p++,w++)
- {
- p->weight=*w;
- p->parent=0;
- p->lchild=0;
- p->rchild=0;
- }
- //將n-1個非葉子結點的初始化
- for(;i<=m;i++,p++)
- {
- p->weight=0;
- p->parent=0;
- p->lchild=0;
- p->rchild=0;
- }
- //構造哈夫曼樹
- for(i=n+1;i<=m;i++)
- {
- min=Select(HT,i-1);//找出最小和次小的兩個結點
- s1=min.s1 ; //最小結點下標
- s2=min.s2;//次小結點下標
- HT[s1].parent=i;
- HT[s2].parent=i;
- HT[i].lchild=s1;
- HT[i].rchild=s2;
- HT[i].weight=HT[s1].weight+HT[s2].weight;//賦權和
- }
- //打印哈弗曼樹
- printf("HT List:\n");
- printf("Number\t\tweight\t\tparent\t\tlchild\t\trchild\n");
- for(i=1;i<=m;i++)
- {
- printf("%d\t\t%d\t\t%d\t\t%d\t\t%d\t\n",i,HT[i].weight,HT[i].parent,HT[i].lchild,HT[i].rchild);
- }
- //從葉子結點到根節點求每個字符的哈弗曼編碼
- HC=(HuffmanCode)malloc((n+1)*sizeof(char *));
- cd=(char *)malloc(n*sizeof(char *));//為哈弗曼編碼動態分配空間
- cd[n-1]='\0';//如:3個結點編碼最長為2。cd[3-1] = '\0';
- //求葉子結點的哈弗曼編碼
- for(i=1;i<=n;i++)
- {
- start=n-1;
- //定義左子樹為0,右子樹為1
- /*
- 從最下面的1號節點開始往頂部編碼(逆序存放),然后編碼2號節點,3號......
- */
- for(c=i,f=HT[i].parent; f!=0; c=f,f=HT[f].parent)
- {
- if(HT[f].lchild==c)
- cd[--start]='0';
- else
- cd[--start]='1';
- }
- //為第i個字符分配編碼空間
- HC[i]=(char *)malloc((n-start)*sizeof(char *));
- //將當前求出結點的哈弗曼編碼復制到HC
- strcpy(HC[i],&cd[start]);
- }
- free(cd);
- return HC;
- }
- MinCode Select(HuffmanTree HT,int n)
- {
- int min,secmin;
- int temp = 0;
- int i,s1,s2,tempi = 0;
- MinCode code ;
- s1=1;
- s2=1;
- min = 66666;//足夠大
- //找出權值weight最小的結點,下標保存在s1中
- for(i=1;i<=n;i++)
- {
- if(HT[i].weight<min && HT[i].parent==0)
- {
- min=HT[i].weight;
- s1=i;
- }
- }
- secmin = 66666;//足夠大
- //找出權值weight次小的結點,下標保存在s2中
- for(i=1;i<=n;i++)
- {
- if((HT[i].weight<secmin) && (i!=s1) && HT[i].parent==0)
- {
- secmin=HT[i].weight;
- s2=i;
- }
- }
- //放進封裝中
- code.s1=s1;
- code.s2=s2;
- return code;
- }
- void HuffmanTranslateCoding(HuffmanTree HT, int n,char* ch)
- {//譯碼過程
- int m=2*n-1;
- int i,j=0;
- printf("After Translation:");
- while(ch[j]!='\0')//ch[]:你輸入的要譯碼的0101010串
- {
- i=m;
- while(0 != HT[i].lchild && 0 != HT[i].rchild)//從頂部找到最下面
- {
- if('0' == ch[j])//0 往左子樹走
- {
- i=HT[i].lchild;
- }
- else//1 往右子樹走
- {
- i=HT[i].rchild;
- }
- ++j;//下一個路徑
- }
- printf("%c",N[i-1]);//打印出來
- }
- printf("\n");
- }
- void main()
- {
- HuffmanTree HT=NULL;
- HuffmanCode HC=NULL;
- int *w=NULL;
- int i,n;
- char tran[100];
- printf("Input N(char):");
- gets(N);
- fflush(stdin);
- n = strlen(N);
- w=(int *)malloc((n+1)*sizeof(int *));//開辟n+1個長度的int指針空間
- w[0]=0;
- printf("Enter weight:\n");
- //輸入結點權值
- for(i=1;i<=n;i++)
- {
- printf("w[%d]=",i);
- scanf("%d",&w[i]);
- }
- fflush(stdin);
- //構造哈夫曼樹HT,編碼存放在HC中,w為權值,n為結點個數
- HC=HuffmanCoding(HT,HC,w,n);
- //輸出哈弗曼編碼
- printf("HuffmanCode:\n");
- printf("Number\t\tWeight\t\tCode\n");
- for(i=1;i<=n;i++)
- {
- printf("%c\t\t%d\t\t%s\n",N[i-1],w[i],HC[i]);
- }
- fflush(stdin);
- //譯碼過程
- printf("Input HuffmanTranslateCoding:");
- gets(tran);
- HuffmanTranslateCoding(HT, n, tran);
- return;
- }
#include <stdio.h>
#include <stdlib.h>
#include <string.h> /* replaces the non-standard <memory.h> */

#define MAX_CODE_LENGTH 40

long long vocab_max_size = 1000, vocab_size = 0, layer1_size = 100;

/**
 * A vocabulary word together with its Huffman code.
 */
struct vocab_word {
    long long cn;   // frequency of the word in the training set
    int *point;     // node path belonging to the code
    char *word,     // the word itself
         *code,     // Huffman code, one 0/1 value per element
         codelen;   // length of the Huffman code
};

struct vocab_word *vocab;

/* Print one labelled row holding the first 2 * vocab_size entries of values. */
static void printRow(const char *label, const long long *values)
{
    long long x;

    printf("%s", label);
    for (x = 0; x < vocab_size * 2; x++) {
        printf("%lld", values[x]);
        printf(" ");
    }
    printf("\n");
}

/*
 * Print the intermediate state of the construction: the first
 * 2 * vocab_size entries of each of the three work arrays.
 */
void printState(long long* count, long long* binary, long long* parent_node)
{
    printRow("count[]:\t", count);
    printRow("binary[]:\t", binary);
    printRow("parent[]:\t", parent_node);
}

/*
 * Return the index of the lighter of the two current candidates and advance
 * the cursor it came from. *pos1 walks downwards through the leaves,
 * *pos2 walks upwards through the internal nodes. On a tie, or when the
 * leaves are used up (*pos1 < 0), the internal node is taken.
 */
static long long takeSmallest(const long long *count, long long *pos1, long long *pos2)
{
    if (*pos1 >= 0 && count[*pos1] < count[*pos2])
        return (*pos1)--;
    return (*pos2)++;
}

/**
 * Create a binary Huffman tree from the word counts in vocab and store the
 * resulting code, path and code length in every vocab entry. Frequent words
 * get shorter binary codes.
 *
 * Requirements visible from the code: vocab[0..vocab_size-1] must be sorted
 * by cn in descending order (the leaves are consumed from the last entry
 * backwards), and every entry's code and point arrays must hold
 * MAX_CODE_LENGTH elements.
 *
 * Prints a trace of every construction step to stdout. Terminates the
 * program when memory runs out or a code would not fit the buffers.
 */
void CreateBinaryTree(void)
{
    long long a, b, i, min1i, min2i, pos1, pos2, point[MAX_CODE_LENGTH];
    char code[MAX_CODE_LENGTH];
    // count:       weight of every node (leaves first, then internal nodes)
    // binary:      code bit of every node (1 for the second node of a pair)
    // parent_node: index of every node's parent
    long long *count = (long long *)calloc((size_t)(vocab_size * 2 + 1), sizeof(long long));
    long long *binary = (long long *)calloc((size_t)(vocab_size * 2 + 1), sizeof(long long));
    long long *parent_node = (long long *)calloc((size_t)(vocab_size * 2 + 1), sizeof(long long));

    if (count == NULL || binary == NULL || parent_node == NULL) {
        fprintf(stderr, "CreateBinaryTree: out of memory\n");
        free(count);
        free(binary);
        free(parent_node);
        exit(1);
    }

    // First half of count: the word frequencies.
    for (a = 0; a < vocab_size; a++) {
        count[a] = vocab[a].cn;
    }
    // Second half of count: a very large value, so that internal nodes that
    // do not exist yet are never picked as a minimum.
    for (a = vocab_size; a < vocab_size * 2; a++) {
        count[a] = 1000000000000000LL;
    }

    pos1 = vocab_size - 1;
    pos2 = vocab_size;
    printState(count, binary, parent_node);

    // Following algorithm constructs the Huffman tree by adding one node at a time
    for (a = 0; a < vocab_size - 1; a++) {
        printf("----------------\n");
        printf("pos1=%lld, pos2=%lld\n", pos1, pos2);

        // First, find two smallest nodes 'min1, min2'
        min1i = takeSmallest(count, &pos1, &pos2);
        // Only min1i is known at this point; min2i is chosen below.
        printf("min1i=%lld\n", min1i);
        printf("pos1=%lld, pos2=%lld\n", pos1, pos2);

        min2i = takeSmallest(count, &pos1, &pos2);

        printf("min1i=%lld, min2i=%lld\n", min1i, min2i);
        printf("count[min1i]=%lld, count[min2i]=%lld\n", count[min1i], count[min2i]);

        // The new internal node joins the two smallest nodes.
        count[vocab_size + a] = count[min1i] + count[min2i];
        parent_node[min1i] = vocab_size + a;
        parent_node[min2i] = vocab_size + a;
        binary[min2i] = 1;

        printf("count[vocab_size + a] = %lld\n", count[vocab_size + a]);
        printf("parent_node[%lld] = %lld\n", min1i, parent_node[min1i]);
        printf("parent_node[%lld] = %lld\n", min2i, parent_node[min2i]);
        printf("binary[%lld] = %lld\n", min2i, binary[min2i]);

        printState(count, binary, parent_node);
    }

    // Now assign binary code to each vocabulary word
    for (a = 0; a < vocab_size; a++) {
        b = a;
        i = 0;
        // Climb from the leaf to the root (node vocab_size * 2 - 2),
        // collecting the bits in leaf-to-root order.
        while (1) {
            // point[i] of the vocab entry is written below, so i must stay
            // below MAX_CODE_LENGTH after the loop.
            if (i >= MAX_CODE_LENGTH - 1) {
                fprintf(stderr, "CreateBinaryTree: code longer than %d\n", MAX_CODE_LENGTH - 1);
                free(count);
                free(binary);
                free(parent_node);
                exit(1);
            }
            code[i] = (char)binary[b];
            point[i] = b;
            i++;
            b = parent_node[b];
            if (b == vocab_size * 2 - 2) break;
        }
        // Length of the Huffman code.
        vocab[a].codelen = (char)i;
        // Store code and path reversed, i.e. in root-to-leaf order.
        vocab[a].point[0] = (int)(vocab_size - 2);
        for (b = 0; b < i; b++) {
            vocab[a].code[i - b - 1] = code[b];
            vocab[a].point[i - b] = (int)(point[b] - vocab_size);
        }
    }

    free(count);
    free(binary);
    free(parent_node);
}

/**
 * Demo driver: builds the tree for six hand-sorted sample words and prints
 * every word with its count, code length, code and path.
 *
 * How to run (from the original note):
 *   gcc ./huffman_tree.cpp; ./a.out
 */
int main(void)
{
    // The vocabulary must already be sorted by frequency, from high to low
    // (word2vec does this with qsort); the sample data is sorted by hand.
    static char words[][2] = {"T", "E", "G", "R", "O", "F"};
    static const long long counts[] = {7, 5, 4, 4, 3, 2};
    long long a;
    int i;

    vocab_size = 6;
    vocab = (struct vocab_word *)calloc((size_t)vocab_size, sizeof(struct vocab_word));
    if (vocab == NULL) {
        fprintf(stderr, "main: out of memory\n");
        return 1;
    }

    // Allocate code/point and fill in the sample data.
    for (a = 0; a < vocab_size; a++) {
        vocab[a].code = (char *)calloc(MAX_CODE_LENGTH, sizeof(char));
        vocab[a].point = (int *)calloc(MAX_CODE_LENGTH, sizeof(int));
        if (vocab[a].code == NULL || vocab[a].point == NULL) {
            fprintf(stderr, "main: out of memory\n");
            return 1;
        }
        vocab[a].cn = counts[a];
        vocab[a].word = words[a];
    }

    CreateBinaryTree();

    for (a = 0; a < vocab_size; a++) {
        printf("word=%s\t", vocab[a].word);
        printf("cn=%lld\t", vocab[a].cn);
        printf("codelen=%d\t", vocab[a].codelen);

        printf("code=");
        for (i = 0; i < vocab[a].codelen; i++) {
            printf("%d", vocab[a].code[i]);
        }
        printf("\t");

        printf("point=");
        for (i = 0; i < vocab[a].codelen; i++) {
            printf("%d-", vocab[a].point[i]);
        }
        printf("\n");
    }

    for (a = 0; a < vocab_size; a++) {
        free(vocab[a].code);
        free(vocab[a].point);
    }
    free(vocab);
    vocab = NULL;

    return 0;
}
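/*
 * ————————————————
 * Copyright notice: this is an original article by the CSDN blogger "桃根仙", published under the CC 4.0 BY-SA license. When reposting, please include a link to the original source and this notice.
 * Original link: https://blog.csdn.net/taotaobaobei/java/article/details/78513979
 */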