-
Notifications
You must be signed in to change notification settings - Fork 4
Expand file tree
/
Copy pathclassic_arm.py
More file actions
112 lines (88 loc) · 4.53 KB
/
Copy pathclassic_arm.py
File metadata and controls
112 lines (88 loc) · 4.53 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
import numpy as np
import time
from src.util.rule_quality import *
from src.util.ucimlrepo import *
from mlxtend.frequent_patterns import association_rules
from mlxtend.frequent_patterns import fpgrowth, hmine
from joblib import Parallel, delayed
class ClassicARM:
    """
    Classical association rule mining (FP-Growth and HMine) on top of the Mlxtend Python package.

    The instance only stores the mining thresholds and the algorithm name; all work happens in
    ``mine_rules``.
    """

    def __init__(self, min_support=0.5, min_confidence=0.8, algorithm="fpgrowth"):
        """
        :param min_support: minimum support passed to the frequent itemset miner
        :param min_confidence: minimum confidence passed to ``association_rules``
        :param algorithm: ``"fpgrowth"`` selects FP-Growth; any other value selects HMine
        """
        self.min_support = min_support
        self.min_confidence = min_confidence
        self.algorithm = algorithm

    def mine_rules(self, dataset, antecedents=2, frequent_items=False, rule_stats=True):
        """
        Mine frequent itemsets and (optionally) association rules from ``dataset``.

        :param dataset: input handed to ``one_hot_encoding``; when rule statistics are requested it
            is also iterated as a collection of transactions to compute coverage
            (NOTE(review): exact type depends on ``one_hot_encoding`` - confirm against callers)
        :param antecedents: the miner is capped at itemsets of ``antecedents + 1`` items
        :param frequent_items: if True, stop after itemset mining and return the itemsets
        :param rule_stats: if True, also compute aggregate rule statistics
        :return: one of the following 2-tuples (note that the element order differs per mode):
            - ``(None, None)`` when no frequent itemset or no rule is found
            - ``(frequent_itemsets, exec_time)`` when ``frequent_items`` is True
            - ``(reformatted_rules, exec_time)`` when ``rule_stats`` is False
            - ``([rule_count, exec_time, support, confidence, coverage], reformatted_rules)``
              otherwise, where ``support``/``confidence`` are the entries of the same name returned
              by ``calculate_average_rule_quality``
        """
        one_hot_encoded_input = one_hot_encoding(dataset)

        # The clock starts after encoding, so exec_time covers mining only. perf_counter is
        # monotonic, hence immune to system clock adjustments during a long mining run.
        start = time.perf_counter()
        miner = fpgrowth if self.algorithm == "fpgrowth" else hmine
        frq_items = miner(one_hot_encoded_input, self.min_support, use_colnames=True,
                          max_len=antecedents + 1)
        if len(frq_items) == 0:
            return None, None

        if frequent_items:
            return frq_items, time.perf_counter() - start

        rules = association_rules(frq_items, metric="confidence", min_threshold=self.min_confidence)
        exec_time = time.perf_counter() - start
        if len(rules) == 0:
            return None, None

        reformatted_rules = self.reformat_rules(rules)
        if not rule_stats:
            return reformatted_rules, exec_time

        coverage = self.calculate_dataset_coverage(rules, dataset)
        stats = calculate_average_rule_quality(reformatted_rules)
        return [len(rules), exec_time, stats['support'], stats["confidence"], coverage], reformatted_rules

    def reformat_rules(self, rules):
        """
        Convert a rules table into a list of plain dictionaries.

        :param rules: DataFrame with ``antecedents``, ``consequents``, ``support`` and ``confidence``
            columns
        :return: one dict per rule with keys ``antecedents`` and ``consequent`` (both lists),
            ``support`` and ``confidence``; note the singular ``consequent`` key
        """
        return [
            {'antecedents': list(rule.antecedents), 'consequent': list(rule.consequents),
             'support': rule.support, 'confidence': rule.confidence}
            for rule in rules.itertuples(index=False)
        ]

    @staticmethod
    def _coverage_mask(antecedents, dataset_sets):
        """Return a boolean vector marking the transactions that contain every antecedent item."""
        return np.fromiter(
            (antecedents.issubset(transaction) for transaction in dataset_sets),
            dtype=bool,
            count=len(dataset_sets),
        )

    @staticmethod
    def calculate_dataset_coverage(rules, dataset):
        """
        Compute the fraction of transactions covered by at least one rule antecedent.

        :param rules: DataFrame with an ``antecedents`` column of item collections
        :param dataset: sized iterable of transactions, each an iterable of items
        :return: covered transactions divided by total transactions; 0.0 for an empty dataset
        """
        dataset_sets = [set(transaction) for transaction in dataset]
        num_transactions = len(dataset_sets)
        if num_transactions == 0:
            # Nothing to cover; avoids the NaN (and RuntimeWarning) of a 0/0 division.
            return 0.0

        # Many rules share an antecedent and differ only in the consequent, so each distinct
        # antecedent is scanned against the dataset once.
        unique_antecedents = {frozenset(rule.antecedents) for rule in rules.itertuples(index=False)}

        covered = np.zeros(num_transactions, dtype=bool)
        for antecedent in unique_antecedents:
            covered |= ClassicARM._coverage_mask(antecedent, dataset_sets)
            if covered.all():
                break  # every transaction is already covered; further scans cannot change it
        return np.sum(covered) / num_transactions

    def calculate_stats(self, rules, exec_time, dataset):
        """
        Compute aggregate rule statistics, scanning the dataset for coverage in parallel.

        :param rules: non-empty DataFrame of rules; each row is passed to
            ``calculate_average_rule_quality`` and must provide ``antecedents``
        :param exec_time: mining time to echo back in the result
        :param dataset: sized iterable of transactions, each an iterable of items
        :return: ``[rule_count, exec_time, support, confidence, coverage]`` where
            ``support``/``confidence`` come from ``calculate_average_rule_quality`` and
            ``coverage`` is the fraction of transactions matched by at least one antecedent
            (0.0 for an empty dataset)
        :raises ValueError: if ``rules`` is empty
        """
        if len(rules) == 0:
            raise ValueError("calculate_stats requires at least one rule")

        # Precompute dataset as a list of sets for efficient membership checking
        dataset_sets = [set(transaction) for transaction in dataset]
        num_transactions = len(dataset_sets)

        rule_stats = tuple(row for _, row in rules.iterrows())
        # Rules that share an antecedent cover the same transactions; scan each antecedent once.
        unique_antecedents = {frozenset(row['antecedents']) for row in rule_stats}

        # Parallel scan: one boolean coverage vector per distinct antecedent
        masks = Parallel(n_jobs=-1)(
            delayed(ClassicARM._coverage_mask)(antecedent, dataset_sets)
            for antecedent in unique_antecedents
        )
        # A transaction is covered as soon as any antecedent matches it
        covered = np.logical_or.reduce(masks)

        # Calculate overall statistics
        stats = calculate_average_rule_quality(rule_stats)
        stats["coverage"] = np.sum(covered) / num_transactions if num_transactions else 0.0
        return [len(rules), exec_time, stats['support'], stats["confidence"], stats["coverage"]]