FP-growth
算法優缺點:
- 優點:一般快於Apriori
- 缺點:實現比較困難,在某些數據上性能下降
- 適用數據類型:標稱型數據
算法思想:
FP-growth算法是用來解決頻繁項集發現問題的,這個問題在前面我們可以通過Apriori算法來解決,但是雖然利用Apriori原理加快了速度,效率仍舊比較低。FP-growth算法則可以解決這個問題。
FP-growth算法使用了頻繁模式樹(Frequent Pattern Tree)的數據結構。FP-tree是一種特殊的前綴樹,由頻繁項頭表和項前綴樹構成。所謂前綴樹,是一種存儲候選項集的數據結構,樹的分支用項名標識,樹的節點存儲後綴項,路徑表示項集。
FP-growth算法生成頻繁項集相對Apriori生成頻繁項集的主要好處就是速度快,能快到幾個數量級;另一個好處就是用FP樹存儲數據可以減少存儲空間,因為關聯挖掘的數據集往往是重複性很高的,這就能帶來很高的壓縮比。
算法可以分成以下幾個部分:
- 構建FP樹:
  - 首先我們要統計出所有的元素的頻度,刪除不滿足最小支持度的(Apriori原理)
  - 然後我們要根據頻度對每個項集中的元素排序(保證我們的樹是最小的)
  - 最後根據排序的項集構建FP樹
- 從FP樹挖掘頻繁項集:
  - 生成條件模式基
  - 生成條件FP樹
算法的執行過程這篇文章有個很好的示例程序
函數:
- loadSimpDat():創建數據集
- createInitSet(dataSet):將數據集處理成字典的形式
- createTree(dataSet, minSup=1):創建FP樹的主函數。首先生成單元素的頻繁項,然後對每個項集進行以頻繁項的頻度為基準的排序。
- updateTree(items, inTree, headerTable, count):根據每一個項集和對應的頻數,更新FP樹,並同時建立表頭
- updateHeader(nodeToTest, targetNode):當指針已經初始化的時候,調用這個函數把新的點加到鏈表的最後面
- ascendTree(leafNode, prefixPath):向上遍歷一直到根節點,將經過的節點都加到前綴路徑中,得到每個頻繁項的整條前綴路徑
- findPrefixPath(basePat, treeNode):生成條件模式基
- mineTree(inTree, headerTable, minSup, preFix, freqItemList):遞歸調用生成條件FP樹和頻繁項集。創建條件FP樹的過程可以重用前面createTree的代碼
-
# coding=utf-8
"""FP-growth frequent-itemset mining.

Builds an FP-tree from a transaction database and mines it recursively
through conditional pattern bases and conditional FP-trees.
"""
import time


class treeNode(object):
    """A single node of an FP-tree."""

    def __init__(self, nameValue, numOccur, parentNode):
        super(treeNode, self).__init__()
        self.name = nameValue      # item label stored at this node
        self.count = numOccur      # occurrence count accumulated on this node
        self.nodeLink = None       # next node holding the same item
        self.parent = parentNode   # parent node; None for the root
        self.children = {}         # item -> child treeNode

    def inc(self, numOccur):
        """Add ``numOccur`` to this node's count."""
        self.count += numOccur

    def disp(self, ind=1):
        """Print the subtree rooted here, one extra space of indent per level."""
        print(' ' * ind, self.name, ' ', self.count)
        for child in self.children.values():
            child.disp(ind + 1)


def loadSimpDat():
    """Return the small six-transaction demo data set."""
    simpDat = [['r', 'z', 'h', 'j', 'p'],
               ['z', 'y', 'x', 'w', 'v', 'u', 't', 's'],
               ['z'],
               ['r', 'x', 'n', 'o', 's'],
               ['y', 'r', 'x', 'z', 'q', 't', 'p'],
               ['y', 'z', 'x', 'e', 'q', 's', 't', 'm']]
    return simpDat


def createInitSet(dataSet):
    """Convert a list of transactions into ``{frozenset(transaction): count}``.

    Identical transactions are accumulated, so their multiplicity is kept
    (the previous version stored 1 for every key and lost duplicates).
    """
    retDict = {}
    for trans in dataSet:
        key = frozenset(trans)
        retDict[key] = retDict.get(key, 0) + 1
    return retDict


def createTree(dataSet, minSup=1):
    """Build an FP-tree from ``dataSet``.

    Args:
        dataSet: dict mapping ``frozenset`` of items to its occurrence count.
        minSup: minimum total count an item needs to be kept.

    Returns:
        ``(root, headerTable)`` where ``headerTable[item]`` is
        ``[count, firstNode]``; ``(None, None)`` when no item reaches
        ``minSup``.
    """
    # Total frequency of every item, weighted by the transaction count.
    itemCounts = {}
    for trans, count in dataSet.items():
        for item in trans:
            itemCounts[item] = itemCounts.get(item, 0) + count

    # Build a filtered table instead of deleting while iterating, which
    # raises RuntimeError on Python 3.
    headerTable = {}
    for item, total in itemCounts.items():
        if total >= minSup:
            headerTable[item] = [total, None]  # [count, head of node chain]
    if not headerTable:  # no frequent item
        return None, None

    # One global order (descending frequency) shared by all transactions.
    # Sorting each transaction independently lets equal-frequency items
    # appear in different orders, which splits shared prefixes and makes
    # the mining step miss or under-count itemsets.
    byFrequency = sorted(headerTable,
                         key=lambda item: headerTable[item][0],
                         reverse=True)
    rank = {}
    for position, item in enumerate(byFrequency):
        rank[item] = position

    retTree = treeNode('Null set', 1, None)
    for tranSet, count in dataSet.items():
        orderedItems = sorted((item for item in tranSet if item in rank),
                              key=rank.__getitem__)
        if orderedItems:
            updateTree(orderedItems, retTree, headerTable, count)
    return retTree, headerTable


def updateTree(items, inTree, headerTable, count):
    """Insert the ordered item path ``items`` below ``inTree``.

    Existing nodes along the path get ``count`` added; missing nodes are
    created and appended to the header table's chain for their item.
    Implemented as a loop so long transactions neither hit the recursion
    limit nor copy the remaining list at every level.
    """
    node = inTree
    for item in items:
        child = node.children.get(item)
        if child is not None:
            child.inc(count)
        else:
            child = treeNode(item, count, node)
            node.children[item] = child
            # Link the new node into the header table chain for this item.
            if headerTable[item][1] is None:
                headerTable[item][1] = child
            else:
                updateHeader(headerTable[item][1], child)
        node = child


def updateHeader(nodeToTest, targetNode):
    """Append ``targetNode`` to the end of the chain starting at ``nodeToTest``."""
    while nodeToTest.nodeLink is not None:
        nodeToTest = nodeToTest.nodeLink
    nodeToTest.nodeLink = targetNode


def ascendTree(leafNode, prefixPath):
    """Append the names from ``leafNode`` up to (excluding) the root to ``prefixPath``."""
    node = leafNode
    while node.parent is not None:
        prefixPath.append(node.name)
        node = node.parent


def findPrefixPath(basePat, treeNode):
    """Collect the conditional pattern base for one item.

    Args:
        basePat: the item being examined (not used by the computation).
        treeNode: first node of the item's chain, taken from the header table.

    Returns:
        dict mapping ``frozenset`` of the items above each chain node to the
        summed count of the nodes sharing that prefix.
    """
    condPats = {}
    node = treeNode
    while node is not None:
        prefixPath = []
        ascendTree(node, prefixPath)
        if len(prefixPath) > 1:
            # prefixPath[0] is the node itself; the rest is its prefix.
            key = frozenset(prefixPath[1:])
            condPats[key] = condPats.get(key, 0) + node.count
        node = node.nodeLink
    return condPats


def mineTree(inTree, headerTable, minSup, preFix, freqItemList):
    """Recursively mine frequent itemsets and append them to ``freqItemList``.

    Args:
        inTree: the FP-tree being mined (kept for interface compatibility).
        headerTable: header table of that tree, ``item -> [count, firstNode]``.
        minSup: minimum support count.
        preFix: set of items already fixed on this recursion path.
        freqItemList: output list; every frequent itemset is appended as a set.
    """
    # Sort on the count only: comparing the whole [count, node] entry would
    # compare treeNode objects on ties, which raises TypeError on Python 3.
    bigL = sorted(headerTable, key=lambda item: headerTable[item][0])
    for basePat in bigL:  # start from the least frequent item
        newFreqSet = preFix.copy()
        newFreqSet.add(basePat)
        print('finalFrequent Item: ', newFreqSet)
        freqItemList.append(newFreqSet)
        condPattBases = findPrefixPath(basePat, headerTable[basePat][1])
        print('condPattBases :', basePat, condPattBases)
        # Construct the conditional FP-tree from the conditional pattern base.
        myCondTree, myHead = createTree(condPattBases, minSup)
        print('head from conditional tree: ', myHead)
        if myHead is not None:  # mine the conditional FP-tree
            print('conditional tree for: ', newFreqSet)
            myCondTree.disp(1)
            mineTree(myCondTree, myHead, minSup, newFreqSet, freqItemList)


def _timeKosarak(path='kosarak.dat', minSup=100000):
    """Mine the whitespace-separated transaction file at ``path`` and print the timing."""
    t1 = time.perf_counter()  # time.clock() was removed in Python 3.8
    with open(path) as dataFile:
        parsedDat = [line.split() for line in dataFile]
    initSet = createInitSet(parsedDat)
    myFP, myHeadTable = createTree(initSet, minSup)
    myfreq = []
    if myHeadTable is not None:
        mineTree(myFP, myHeadTable, minSup, set([]), myfreq)
    t2 = time.perf_counter()
    print('time=', t2 - t1)
    print(myfreq)


def main():
    """Run the demo on the built-in data set; the file benchmark is off by default."""
    simpDat = loadSimpDat()
    initSet = createInitSet(simpDat)
    myFP, myHeadTable = createTree(initSet, 3)
    myFP.disp()
    freqItems = []
    mineTree(myFP, myHeadTable, 3, set([]), freqItems)
    print(freqItems)

    runKosarakBenchmark = False  # set to True to time the kosarak.dat file
    if runKosarakBenchmark:
        _timeKosarak()


if __name__ == '__main__':
    main()

