FP-Growth 算法 Python 实现:高效频繁项集挖掘
FP-Growth 算法是一种用于频繁项集挖掘的算法,它的主要思想是利用 FP 树来发现频繁项集。下面是 FP-Growth 算法的 Python 实现:
- 首先定义一个节点类,用于表示 FP 树中的节点。
class Node:
def __init__(self, item, count, parent):
self.item = item # 节点代表的项
self.count = count # 项的计数
self.parent = parent # 父节点
self.children = {} # 子节点列表
self.next = None # 指向相同项的下一个节点
def increment(self, count):
self.count += count
- 定义 FP-Growth 算法的主函数。
def fp_growth(data, min_support):
# 构建频繁项集列表
freq_items = {}
for trans in data:
for item in trans:
freq_items[item] = freq_items.get(item, 0) + 1
freq_items = {k:v for k,v in freq_items.items() if v >= min_support}
freq_itemset = set(freq_items.keys())
# 如果没有频繁项集,则直接返回
if len(freq_itemset) == 0:
return None, None
# 构建 FP 树
root = Node(None, 0, None)
for trans, count in zip(data, count_list):
ordered_items = [i for i in trans if i in freq_itemset]
ordered_items.sort(key=lambda x: freq_items[x], reverse=True)
if len(ordered_items) > 0:
update_tree(ordered_items, count, root)
# 构建条件模式基
cond_pat_bases = {}
for item in freq_itemset:
cond_pat_bases[item] = get_cond_pat_base(item, root)
# 递归地挖掘 FP 树
freq_itemsets = []
prefix = []
mine_tree(root, freq_items, prefix, freq_itemsets, cond_pat_bases)
return freq_itemsets, freq_items
- 定义用于更新 FP 树的函数。
def update_tree(items, count, root):
node = root
for item in items:
if item in node.children:
child = node.children[item]
child.increment(count)
else:
child = Node(item, count, node)
node.children[item] = child
update_node_link(child, root)
node = child
def update_node_link(node, header_table):
if header_table[node.item][1] is None:
header_table[node.item][1] = node
else:
curr = header_table[node.item][1]
while curr.next is not None:
curr = curr.next
curr.next = node
- 定义用于获取条件模式基的函数。
def get_cond_pat_base(item, node):
cond_pat_base = []
while node is not None:
prefix = []
collect_prefix(node, prefix)
if len(prefix) > 1:
cond_pat_base.append((prefix[1:], node.count))
node = node.next
return cond_pat_base
def collect_prefix(node, prefix):
if node.parent is not None:
prefix.append(node.item)
collect_prefix(node.parent, prefix)
- 定义用于递归挖掘 FP 树的函数。
def mine_tree(node, freq_items, prefix, freq_itemsets, cond_pat_bases):
# 按照出现频率排序
items = [item for item in freq_items.keys() if item not in prefix]
items.sort(key=lambda x: freq_items[x], reverse=True)
for item in items:
new_prefix = prefix + [item]
new_freq_itemset = prefix + [item]
freq_itemsets.append(new_freq_itemset)
cond_pat_base = cond_pat_bases[item]
cond_tree, cond_freq_items = fp_growth([trans for trans, count in cond_pat_base], min_support)
if cond_tree is not None:
mine_tree(cond_tree, cond_freq_items, new_prefix, freq_itemsets, cond_pat_bases)
以上就是 FP-Growth 算法的 Python 实现。需要注意的是,FP-Growth 算法的实现中需要用到一个数据结构——FP 树,它是一种用于高效存储和查找频繁项集的树形结构,如果不熟悉 FP 树的实现,可能需要先了解一下。
原文地址: https://www.cveoy.top/t/topic/or6i 著作权归作者所有。请勿转载和采集!