4 changes: 3 additions & 1 deletion amlb/datautils.py
@@ -14,7 +14,9 @@
import numpy as np
import pandas as pd
from sklearn.base import TransformerMixin
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, log_loss, balanced_accuracy_score, mean_absolute_error, mean_squared_error, mean_squared_log_error, r2_score, roc_auc_score # just aliasing
from sklearn.metrics import accuracy_score, auc, average_precision_score, balanced_accuracy_score, confusion_matrix, fbeta_score, \
log_loss, mean_absolute_error, mean_squared_error, mean_squared_log_error, precision_recall_curve, \
r2_score, roc_auc_score # just aliasing
from sklearn.preprocessing import LabelEncoder, LabelBinarizer, OneHotEncoder

from .utils import profile, path_from_split, repr_def, split_path, touch
92 changes: 74 additions & 18 deletions amlb/results.py
@@ -16,7 +16,9 @@
import pandas as pd

from .data import Dataset, DatasetType, Feature
from .datautils import accuracy_score, confusion_matrix, f1_score, log_loss, balanced_accuracy_score, mean_absolute_error, mean_squared_error, mean_squared_log_error, r2_score, roc_auc_score, read_csv, write_csv, is_data_frame, to_data_frame
from .datautils import accuracy_score, auc, average_precision_score, balanced_accuracy_score, confusion_matrix, fbeta_score, log_loss, \
mean_absolute_error, mean_squared_error, mean_squared_log_error, precision_recall_curve, r2_score, roc_auc_score, \
read_csv, write_csv, is_data_frame, to_data_frame
from .resources import get as rget, config as rconfig, output_dirs
from .utils import Namespace, backup_file, cached, datetime_iso, json_load, memoize, profile

@@ -394,6 +396,10 @@ def do_score(m):
for metric in metadata.metrics or []:
scores[metric] = do_score(metric)
scores.result = scores[scores.metric] if scores.metric in scores else do_score(scores.metric)
if not higher_is_better(scores.metric):
scores.metric = f"neg_{scores.metric}"
scores.result = - scores.result

scores.info = result.info
if scoring_errors:
scores.info = "; ".join(filter(lambda it: it, [scores.info, *scoring_errors]))
@@ -453,6 +459,8 @@ def __init__(self, error):

class ClassificationResult(Result):

multi_class_average = 'weighted' # used by metrics like fbeta or auc

def __init__(self, predictions_df, info=None):
super().__init__(predictions_df, info)
self.classes = self.df.columns[:-2].values.astype(str, copy=False)
@@ -464,42 +472,80 @@ def __init__(self, predictions_df, info=None):
self.labels = self._autoencode(self.classes)

def acc(self):
"""Accuracy"""
return float(accuracy_score(self.truth, self.predictions))

def balacc(self):
return float(balanced_accuracy_score(self.truth, self.predictions))

def auc(self):
"""Array Under (ROC) Curve, computed on probabilities, not on predictions"""
Collaborator: nit: area instead of array
Collaborator (author): oups! will fix

if self.type != DatasetType.binary:
# raise ValueError("AUC metric is only supported for binary classification: {}.".format(self.classes))
log.warning("AUC metric is only supported for binary classification: %s.", self.labels)
log.warning("For multiclass problems, please use `auc_ovr` or `auc_ovo` metrics instead of `auc`.")
return nan
return float(roc_auc_score(self.truth, self.probabilities[:, 1], labels=self.labels))
return float(roc_auc_score(self.truth, self.probabilities[:, 1]))

def cm(self):
return confusion_matrix(self.truth, self.predictions, labels=self.labels)
def auc_ovo(self):
"""AUC One-vs-One"""
return self._auc_multi(mc='ovo')

def _per_class_errors(self):
return [(s-d)/s for s, d in ((sum(r), r[i]) for i, r in enumerate(self.cm()))]
def auc_ovr(self):
"""AUC One-vs-Rest"""
return self._auc_multi(mc='ovr')

def mean_pce(self):
"""mean per class error"""
return statistics.mean(self._per_class_errors())
def balacc(self):
"""Balanced accuracy"""
return float(balanced_accuracy_score(self.truth, self.predictions))

def max_pce(self):
"""max per class error"""
return max(self._per_class_errors())
def f05(self):
"""F-beta 0.5"""
return self._fbeta(0.5)

def f1(self):
return float(f1_score(self.truth, self.predictions, labels=self.labels))
"""F-beta 1"""
return self._fbeta(1)

def f2(self):
"""F-beta 2"""
return self._fbeta(2)

def logloss(self):
"""Log Loss"""
return float(log_loss(self.truth, self.probabilities, labels=self.labels))

def max_pce(self):
"""Max per Class Error"""
return max(self._per_class_errors())

def mean_pce(self):
"""Mean per Class Error"""
return statistics.mean(self._per_class_errors())

def pr_auc(self):
"""Precision Recall AUC"""
if self.type != DatasetType.binary:
log.warning("PR AUC metric is only available for binary problems.")
return nan
# precision, recall, thresholds = precision_recall_curve(self.truth, self.probabilities[:, 1])
# return float(auc(recall, precision))
return float(average_precision_score(self.truth, self.probabilities[:, 1]))

def _autoencode(self, vec):
needs_encoding = not _encode_predictions_and_truth_ or (isinstance(vec[0], str) and not vec[0].isdigit())
return self.target.label_encoder.transform(vec) if needs_encoding else vec

def _auc_multi(self, mc='raise'):
average = ClassificationResult.multi_class_average
return float(roc_auc_score(self.truth, self.probabilities, average=average, labels=self.labels, multi_class=mc))

def _cm(self):
return confusion_matrix(self.truth, self.predictions, labels=self.labels)

def _fbeta(self, beta):
average = ClassificationResult.multi_class_average if self.type == DatasetType.multiclass else 'binary'
return float(fbeta_score(self.truth, self.predictions, beta=beta, average=average, labels=self.labels))

def _per_class_errors(self):
return [(s-d)/s for s, d in ((sum(r), r[i]) for i, r in enumerate(self._cm()))]



class RegressionResult(Result):

@@ -510,24 +556,34 @@ def __init__(self, predictions_df, info=None):
self.type = DatasetType.regression

def mae(self):
"""Mean Absolute Error"""
return float(mean_absolute_error(self.truth, self.predictions))

def mse(self):
"""Mean Squared Error"""
return float(mean_squared_error(self.truth, self.predictions))

def msle(self):
"""Mean Squared Logarithmic Error"""
return float(mean_squared_log_error(self.truth, self.predictions))

def rmse(self):
"""Root Mean Square Error"""
return math.sqrt(self.mse())

def rmsle(self):
"""Root Mean Square Logarithmic Error"""
return math.sqrt(self.msle())

def r2(self):
"""R^2"""
return float(r2_score(self.truth, self.predictions))


def higher_is_better(metric):
Collaborator: This seems a bit hacky. Better to have either a dictionary mapping or metrics as classes (example in AutoGluon).
Collaborator (author): I can't disagree with you: it IS a bit hacky.
Ideally, there should be a class for each metric. It's probably something I'll do at some point to support custom metrics or other customizations in a more satisfying way than what was done in #141.
If there's a demand for it, I'll do it.

return re.fullmatch(r"((pr_)?auc(_\w*)?)|(\w*acc)|(f\d+)|(r2)", metric)
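
For context, a minimal sketch of the dictionary/metric-class alternative suggested in the review thread above could look like the following. MetricInfo and higher_is_better_lookup are hypothetical names used only for illustration; they are not part of this PR.

from dataclasses import dataclass

@dataclass(frozen=True)
class MetricInfo:
    name: str
    higher_is_better: bool

# Explicit registry instead of a regex: an unknown metric raises a KeyError
# instead of silently matching (or not matching) a pattern.
_METRICS = {m.name: m for m in [
    MetricInfo("auc", True), MetricInfo("pr_auc", True),
    MetricInfo("acc", True), MetricInfo("balacc", True),
    MetricInfo("f1", True), MetricInfo("r2", True),
    MetricInfo("logloss", False), MetricInfo("rmse", False),
    MetricInfo("mae", False),
]}

def higher_is_better_lookup(metric: str) -> bool:
    return _METRICS[metric].higher_is_better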


_encode_predictions_and_truth_ = False

save_predictions = TaskResult.save_predictions
65 changes: 24 additions & 41 deletions amlb_report/results.py
@@ -1,11 +1,12 @@
"""
Loading results, formatting and adding columns
result is the raw result metric computed from predictions at the end the benchmark. For classification problems, it is usually auc for binomial classification and logloss for multinomial classification.
score ensures a standard comparison between tasks: higher is always better.
norm_score is a normalization of score on a [0, 1] scale, with {{zero_one_refs[0]}} score as 0 and {{zero_one_refs[1]}} score as 1.
imp_result and imp_score for imputed results/scores. Given a task and a framework:
if all folds results/scores are missing, then no imputation occurs, and the result is nan for each fold.
if only some folds results/scores are missing, then the missing result is imputed by the {{imp_framework}} result for this fold.
Loading results, formatting and adding columns.
result is the raw result metric computed from predictions at the end of the benchmark: higher is always better!
- For classification problems, it is usually auc for binary problems and negative log loss for multiclass problems.
- For regression problems, it is usually negative rmse.
norm_result is a normalization of result on a [0, 1] scale, with {{zero_one_refs[0]}} scoring as 0 and {{zero_one_refs[1]}} scoring as 1.
imp_result for imputed results. Given a task and a framework:
- if all folds results are missing, then no imputation occurs, and the result is nan for each fold.
- if only some folds results are missing, then the missing result is imputed by the {{imp_framework}} result for this fold.
"""

import numpy as np
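
To illustrate the norm_result convention described in the docstring above, here is a tiny worked example; the reference values below are made up for illustration and do not come from actual benchmark runs.

# Assumed values only: zero_one_refs typically points at a weak baseline
# (normalizes to 0) and a strong reference framework (normalizes to 1).
constant_predictor_result = -0.95   # neg_logloss of the baseline  -> normalizes to 0
reference_framework_result = -0.45  # neg_logloss of the reference -> normalizes to 1
framework_result = -0.50            # raw higher-is-better result being normalized

zero, one = constant_predictor_result, reference_framework_result
norm_result = (framework_result - zero) / (one - zero)
print(round(norm_result, 2))  # 0.9 -> close to the reference framework, far above the baseline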
@@ -52,35 +53,21 @@ def imputed(row):
return pd.isna(row.result) and pd.notna(row.imp_result)


fit_metrics = ['auc', 'acc', 'r2']


def metric_type(row, res_col='result'):
return 'fit' if any([row[res_col] == getattr(row, m, None) for m in fit_metrics]) else 'loss'


def score(row, res_col='result'):
return (row[res_col] if row['metric_type'] == 'fit'
else - row[res_col])


def norm_score(row, score_col='score',
zero_one_refs=None, ref_results=None,
aggregation=None):
def norm_result(row, res_col='result', zero_one_refs=None, ref_results=None, aggregation=None):
if zero_one_refs is None:
return row[score_col]
return row[res_col]

def get_val(ref, default):
try:
if isinstance(ref, str):
return (ref_results.loc[(ref_results.framework == ref)
& (ref_results.task == row.task)]
[score_col]
[res_col]
.agg(aggregation) if aggregation
else ref_results.loc[(ref_results.framework == ref)
& (ref_results.task == row.task)
& (ref_results.fold == row.fold)]
[score_col]
[res_col]
.item())
else:
return ref
@@ -89,9 +76,9 @@ def get_val(ref, default):
# return default

zero, one = (get_val(ref, i) for i, ref in enumerate(zero_one_refs))
rel_score = (row[score_col] - zero) / (one - zero)
return (- rel_score if row['metric_type'] == 'loss' and one < 0 <= zero
else rel_score)
norm_res = (row[res_col] - zero) / (one - zero)
return (- norm_res if row['metric'].startswith("neg_") and one < 0 <= zero
else norm_res)


def sorted_ints(arr):
@@ -117,7 +104,8 @@ def prepare_results(results,
imputation=None,
normalization=None,
ref_results=None,
duplicates_handling='fail' # other options are 'keep_first', 'keep_last', 'keep_none'
duplicates_handling='fail', # other options are 'keep_first', 'keep_last', 'keep_none'
include_metadata=False
):
if results is None or len(results) == 0:
return None
@@ -139,7 +127,7 @@

folds = results.fold.unique()

metadata = load_dataset_metadata(results)
metadata = load_dataset_metadata(results) if include_metadata else {}

done = results.set_index(['task', 'fold', 'framework'])
done = remove_duplicates(done, handling=duplicates_handling)
@@ -158,9 +146,8 @@

# extending the data frame
results = results.append(missing.reset_index())
results['type'] = [task_prop(row, metadata, 'type') for _, row in results.iterrows()]
results['metric_type'] = [metric_type(row) for _, row in results.iterrows()]
results['score'] = [score(row) for _, row in results.iterrows()]
if 'type' not in results:
results['type'] = [task_prop(row, metadata, 'type') for _, row in results.iterrows()]

if ref_results is None:
ref_results = results
@@ -177,18 +164,14 @@
imp_framework=imp_fr, imp_results=ref_results,
imp_value=imp_val, aggregation=aggr)
for _, row in results.iterrows()]
results['imp_score'] = [impute_result(row, results, 'score',
imp_framework=imp_fr, imp_results=ref_results,
imp_value=imp_val, aggregation=aggr)
for _, row in results.iterrows()]

if normalization is not None:
score_col = 'imp_score' if imputation is not None else 'score'
res_col = 'imp_result' if imputation is not None else 'result'
zero_one = normalization[0:2]
aggr = normalization[2] if len(normalization) > 2 else None
results['norm_score'] = [norm_score(row, score_col,
zero_one_refs=zero_one, ref_results=ref_results, aggregation=aggr)
for _, row in results.iterrows()]
results['norm_result'] = [norm_result(row, res_col,
zero_one_refs=zero_one, ref_results=ref_results, aggregation=aggr)
for _, row in results.iterrows()]

return Namespace(
results=results,