fp-growth代碼問題(Python)


網上的 python3 fp-growth代碼每次在執行時可能會出現找出的頻繁項集不一致的情況,這是因為每次執行代碼時建的FP樹可能不一致。

 

加了一行代碼可以解決這個問題(第59行):先對 frequentItemsInRecord 按 key 的 ASCII 碼排序,然後再按照 key 的支持度(即 value 值)降序排列。

 

之所以這麼做是因為 frequentItemsInRecord 中可能會出現支持度一樣的項,如果不按 ASCII 碼先排一次的話,

有可能出現每次執行代碼時 orderedFrequentItems (第60行)中相同支持度的項出現的順序不一致,從而造成每次建的FP樹不一致,導致找出的頻繁項集不一致。

 

import pprint
  
  
def loadDataSet():
    """Return the built-in sample data: six transactions of item names.

    A new list of lists is built on every call, so callers may mutate the
    result freely.
    """
    return [
        ['bread', 'milk', 'vegetable', 'fruit', 'eggs'],
        ['noodle', 'beef', 'pork', 'water', 'socks', 'gloves', 'shoes', 'rice'],
        ['socks', 'gloves'],
        ['bread', 'milk', 'shoes', 'socks', 'eggs'],
        ['socks', 'shoes', 'sweater', 'cap', 'milk', 'vegetable', 'gloves'],
        ['eggs', 'bread', 'milk', 'fish', 'crab', 'shrimp', 'rice'],
    ]
  
  
def transfer2FrozenDataSet(dataSet):
    """Convert raw transactions into a ``{frozenset(items): count}`` mapping.

    Args:
        dataSet: iterable of transactions, each an iterable of hashable items.

    Returns:
        dict mapping every distinct transaction (as a ``frozenset``) to the
        number of times it occurs in ``dataSet``.
    """
    frozenDataSet = {}
    for elem in dataSet:
        key = frozenset(elem)
        # Accumulate instead of overwriting with 1: identical transactions
        # must each contribute to the support counts derived from this dict.
        frozenDataSet[key] = frozenDataSet.get(key, 0) + 1

    return frozenDataSet
  
  
class TreeNode:
    """A single node of an FP-tree.

    Attributes are plain public fields:
        nodeName: the item stored at this node.
        count: the counter accumulated on this node.
        nodeParent: the parent node, or None.
        nextSimilarItem: link to another node, initially None.
        children: dict of child nodes, initially empty.
    """

    def __init__(self, nodeName, count, nodeParent):
        # Structural links.
        self.nodeParent = nodeParent
        self.children = {}
        self.nextSimilarItem = None
        # Payload.
        self.nodeName = nodeName
        self.count = count

    def increaseC(self, count):
        """Add ``count`` to this node's counter."""
        self.count += count
  
  
def createFPTree(frozenDataSet, minSupport):
    """Build an FP-tree and its head-pointer table from weighted transactions.

    Args:
        frozenDataSet: dict mapping each transaction (a frozenset of items)
            to its count.
        minSupport: minimum total count an item needs in order to be kept.

    Returns:
        ``(fptree, headPointTable)`` where ``headPointTable`` maps every kept
        item to ``[support, node_or_None]``; ``(None, None)`` when no item
        reaches ``minSupport``.
    """
    # Pass 1: total support per item.
    supportByItem = {}
    for transaction, weight in frozenDataSet.items():
        for item in transaction:
            supportByItem[item] = supportByItem.get(item, 0) + weight

    # Keep frequent items only; the second slot is filled in by updateFPTree.
    headPointTable = {
        item: [support, None]
        for item, support in supportByItem.items()
        if support >= minSupport
    }
    if not headPointTable:
        return None, None

    # Pass 2: insert every transaction, restricted to its frequent items.
    fptree = TreeNode("null", 1, None)
    for transaction, weight in frozenDataSet.items():
        kept = [item for item in transaction if item in headPointTable]
        if not kept:
            continue
        # Sort by item first, then stably by descending support, so items
        # with equal support always appear in the same relative order.
        kept.sort()
        kept.sort(key=lambda item: headPointTable[item][0], reverse=True)
        updateFPTree(fptree, kept, headPointTable, weight)

    return fptree, headPointTable
  
  
def updateFPTree(fptree, orderedFrequentItems, headPointTable, count):
    """Insert one ordered transaction into the tree rooted at ``fptree``.

    Walks down from ``fptree`` following ``orderedFrequentItems``. Existing
    nodes on the path get ``count`` added; missing nodes are created with
    ``count`` and linked into ``headPointTable``.

    Args:
        fptree: node under which the path is inserted.
        orderedFrequentItems: items of the transaction, already ordered.
            An empty sequence is a no-op.
        headPointTable: dict mapping item -> ``[support, first_node_or_None]``;
            slot 1 is updated in place when a new node is created.
        count: weight of the transaction.
    """
    # Iterative descent: avoids one recursion level and one list-slice copy
    # per item, so long transactions cannot hit the recursion limit.
    node = fptree
    for item in orderedFrequentItems:
        if item in node.children:
            node.children[item].increaseC(count)
        else:
            child = TreeNode(item, count, node)
            node.children[item] = child

            # Register the new node in the chain of nodes for this item.
            entry = headPointTable[item]
            if entry[1] is None:
                entry[1] = child
            else:
                updateHeadPointTable(entry[1], child)
        node = node.children[item]
  
  
def updateHeadPointTable(headPointBeginNode, targetNode):
    """Append ``targetNode`` to the ``nextSimilarItem`` chain.

    Follows the chain that starts at ``headPointBeginNode`` to its last node
    and makes that node point at ``targetNode``.
    """
    tail = headPointBeginNode
    while True:
        successor = tail.nextSimilarItem
        if successor is None:
            break
        tail = successor
    tail.nextSimilarItem = targetNode
  
  
def mineFPTree(headPointTable, prefix, frequentPatterns, minSupport):
    """Recursively mine frequent patterns from a head-pointer table.

    For each item in ``headPointTable`` (in ascending support order) the
    pattern ``prefix + item`` is recorded, then the item's prefix paths are
    turned into a conditional FP-tree which is mined in turn.

    Args:
        headPointTable: dict mapping item -> ``[support, first_node]`` as
            returned by ``createFPTree``. ``None`` or an empty dict means
            there is nothing to mine and the call returns immediately.
        prefix: iterable of items already fixed for this recursion branch;
            it is not modified.
        frequentPatterns: dict filled in place with
            ``frozenset(pattern) -> support``.
        minSupport: minimum support passed on to ``createFPTree``.
    """
    # createFPTree returns (None, None) when no item is frequent; treat that
    # the same as an empty table instead of failing on ``None.items()``.
    if not headPointTable:
        return

    headPointItems = [
        item for item, _ in sorted(headPointTable.items(), key=lambda kv: kv[1][0])
    ]
    basePattern = frozenset(prefix)

    for headPointItem in headPointItems:
        newPattern = basePattern | {headPointItem}
        frequentPatterns[newPattern] = headPointTable[headPointItem][0]

        prefixPath = getPrefixPath(headPointTable, headPointItem)
        if not prefixPath:
            continue
        _, conditionalHeadPointTable = createFPTree(prefixPath, minSupport)
        if conditionalHeadPointTable is not None:
            mineFPTree(conditionalHeadPointTable, newPattern, frequentPatterns, minSupport)
  
  
def getPrefixPath(headPointTable, headPointItem):
    """Collect the prefix paths of every node linked for ``headPointItem``.

    Starting at the node stored in ``headPointTable[headPointItem][1]`` and
    following ``nextSimilarItem`` links, the ancestors of each node are
    gathered with ``ascendTree``.

    Returns:
        dict mapping ``frozenset(ancestor names)`` to that node's ``count``;
        nodes without ancestors contribute nothing.
    """
    conditionalBase = {}
    node = headPointTable[headPointItem][1]
    while True:
        ancestors = ascendTree(node)
        if ancestors:
            conditionalBase[frozenset(ancestors)] = node.count
        node = node.nextSimilarItem
        if node is None:
            break

    return conditionalBase
  
  
def ascendTree(treeNode):
    """Return the names of ``treeNode``'s ancestors, nearest ancestor first.

    The climb stops at a node without a parent or just below a parent named
    ``'null'``; ``treeNode`` itself is not included.
    """
    ancestors = []
    parent = treeNode.nodeParent
    while parent is not None and parent.nodeName != 'null':
        ancestors.append(parent.nodeName)
        parent = parent.nodeParent

    return ancestors
  
  
def rulesGenerator(frequentPatterns, minConf, rules):
    """Generate association rules for every pattern with more than one item.

    Args:
        frequentPatterns: dict mapping frozenset pattern -> support.
        minConf: minimum confidence passed on to ``getRules``.
        rules: list that ``getRules`` appends the rules to.
    """
    multiItemPatterns = (
        pattern for pattern in frequentPatterns if len(pattern) > 1
    )
    for pattern in multiItemPatterns:
        getRules(pattern, pattern, rules, frequentPatterns, minConf)
  
  
def removeStr(set, str):
    """Return a frozenset of the elements of ``set`` that differ from ``str``.

    The parameter names shadow the builtins of the same name; they are kept
    because they are part of the function's signature.
    """
    return frozenset(elem for elem in set if elem != str)
  
 
def getRules(frequentset, currentset, rules, frequentPatterns, minConf):
    """Append the association rules derivable from ``frequentset`` to ``rules``.

    Starting from ``currentset``, every subset obtained by dropping one
    element is considered as an antecedent. If
    ``support(frequentset) / support(subset)`` reaches ``minConf``, the rule
    ``(subset, frequentset - subset, confidence)`` is appended (unless an
    equal antecedent/consequent pair is already in ``rules``) and subsets of
    at least two elements are explored further.

    Args:
        frequentset: frozenset the rules are generated for; must be a key of
            ``frequentPatterns``.
        currentset: iterable of items to shrink; callers pass ``frequentset``.
        rules: list of ``(antecedent, consequent, confidence)`` tuples,
            extended in place.
        frequentPatterns: dict mapping frozenset pattern -> support.
        minConf: minimum confidence for a rule to be kept.

    Raises:
        KeyError: if ``frequentset`` or a considered subset is missing from
            ``frequentPatterns``.
    """
    support = frequentPatterns[frequentset]
    # Hash lookup for rules that are already present, instead of scanning
    # the whole list for every candidate.
    knownRules = {(rule[0], rule[1]) for rule in rules}
    # The same subset is reachable through many removal orders. Its
    # confidence depends only on the subset, so a second visit can never
    # add anything and is skipped.
    visited = set()

    def explore(current):
        for frequentElem in current:
            subSet = frozenset(elem for elem in current if elem != frequentElem)
            if subSet in visited:
                continue
            visited.add(subSet)

            confidence = support / frequentPatterns[subSet]
            if confidence < minConf:
                continue

            consequent = frequentset - subSet
            if (subSet, consequent) not in knownRules:
                knownRules.add((subSet, consequent))
                rules.append((subSet, consequent, confidence))

            if len(subSet) >= 2:
                explore(subSet)

    explore(currentset)
  
  
def main():
    """Mine the sample data set and print its frequent patterns and rules."""
    minSupport = 3
    minConf = 0.6

    # Build the FP-tree for the sample transactions.
    frozenDataSet = transfer2FrozenDataSet(loadDataSet())
    _, headPointTable = createFPTree(frozenDataSet, minSupport)

    # Mine frequent patterns, starting from an empty prefix.
    frequentPatterns = {}
    mineFPTree(headPointTable, set(), frequentPatterns, minSupport)
    print("frequent patterns:")
    pprint.pprint(frequentPatterns)

    # Derive association rules from the mined patterns.
    rules = []
    rulesGenerator(frequentPatterns, minConf, rules)
    print("association rules:")
    pprint.pprint(rules)
    print('rules num:', len(rules))


if __name__ == '__main__':
    main()

  

 


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯系本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM