4 changes: 3 additions & 1 deletion amlb/datautils.py
@@ -14,7 +14,9 @@
import numpy as np
import pandas as pd
from sklearn.base import TransformerMixin
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, log_loss, balanced_accuracy_score, mean_absolute_error, mean_squared_error, mean_squared_log_error, r2_score, roc_auc_score # just aliasing
from sklearn.metrics import accuracy_score, auc, average_precision_score, balanced_accuracy_score, confusion_matrix, fbeta_score, \
log_loss, mean_absolute_error, mean_squared_error, mean_squared_log_error, precision_recall_curve, \
r2_score, roc_auc_score # just aliasing
from sklearn.preprocessing import LabelEncoder, LabelBinarizer, OneHotEncoder

from .utils import profile, path_from_split, repr_def, split_path, touch
92 changes: 74 additions & 18 deletions amlb/results.py
@@ -16,7 +16,9 @@
import pandas as pd

from .data import Dataset, DatasetType, Feature
from .datautils import accuracy_score, confusion_matrix, f1_score, log_loss, balanced_accuracy_score, mean_absolute_error, mean_squared_error, mean_squared_log_error, r2_score, roc_auc_score, read_csv, write_csv, is_data_frame, to_data_frame
from .datautils import accuracy_score, auc, average_precision_score, balanced_accuracy_score, confusion_matrix, fbeta_score, log_loss, \
mean_absolute_error, mean_squared_error, mean_squared_log_error, precision_recall_curve, r2_score, roc_auc_score, \
read_csv, write_csv, is_data_frame, to_data_frame
from .resources import get as rget, config as rconfig, output_dirs
from .utils import Namespace, backup_file, cached, datetime_iso, json_load, memoize, profile

@@ -394,6 +396,10 @@ def do_score(m):
for metric in metadata.metrics or []:
scores[metric] = do_score(metric)
scores.result = scores[scores.metric] if scores.metric in scores else do_score(scores.metric)
if not higher_is_better(scores.metric):
scores.metric = f"neg_{scores.metric}"
scores.result = - scores.result

scores.info = result.info
if scoring_errors:
scores.info = "; ".join(filter(lambda it: it, [scores.info, *scoring_errors]))
@@ -453,6 +459,8 @@ def __init__(self, error):

class ClassificationResult(Result):

multi_class_average = 'weighted' # used by metrics like fbeta or auc

def __init__(self, predictions_df, info=None):
super().__init__(predictions_df, info)
self.classes = self.df.columns[:-2].values.astype(str, copy=False)
@@ -464,42 +472,80 @@ def __init__(self, predictions_df, info=None):
self.labels = self._autoencode(self.classes)

def acc(self):
"""Accuracy"""
return float(accuracy_score(self.truth, self.predictions))

def balacc(self):
return float(balanced_accuracy_score(self.truth, self.predictions))

def auc(self):
"""Array Under (ROC) Curve, computed on probabilities, not on predictions"""
Collaborator: nit: area instead of array
Collaborator (author): oups! will fix

if self.type != DatasetType.binary:
# raise ValueError("AUC metric is only supported for binary classification: {}.".format(self.classes))
log.warning("AUC metric is only supported for binary classification: %s.", self.labels)
log.warning("For multiclass problems, please use `auc_ovr` or `auc_ovo` metrics instead of `auc`.")
return nan
return float(roc_auc_score(self.truth, self.probabilities[:, 1], labels=self.labels))
return float(roc_auc_score(self.truth, self.probabilities[:, 1]))

def cm(self):
return confusion_matrix(self.truth, self.predictions, labels=self.labels)
def auc_ovo(self):
"""AUC One-vs-One"""
return self._auc_multi(mc='ovo')

def _per_class_errors(self):
return [(s-d)/s for s, d in ((sum(r), r[i]) for i, r in enumerate(self.cm()))]
def auc_ovr(self):
"""AUC One-vs-Rest"""
return self._auc_multi(mc='ovr')

def mean_pce(self):
"""mean per class error"""
return statistics.mean(self._per_class_errors())
def balacc(self):
"""Balanced accuracy"""
return float(balanced_accuracy_score(self.truth, self.predictions))

def max_pce(self):
"""max per class error"""
return max(self._per_class_errors())
def f05(self):
"""F-beta 0.5"""
return self._fbeta(0.5)

def f1(self):
return float(f1_score(self.truth, self.predictions, labels=self.labels))
"""F-beta 1"""
return self._fbeta(1)

def f2(self):
"""F-beta 2"""
return self._fbeta(2)

def logloss(self):
"""Log Loss"""
return float(log_loss(self.truth, self.probabilities, labels=self.labels))

def max_pce(self):
"""Max per Class Error"""
return max(self._per_class_errors())

def mean_pce(self):
"""Mean per Class Error"""
return statistics.mean(self._per_class_errors())

def pr_auc(self):
"""Precision Recall AUC"""
if self.type != DatasetType.binary:
log.warning("PR AUC metric is only available for binary problems.")
return nan
# precision, recall, thresholds = precision_recall_curve(self.truth, self.probabilities[:, 1])
# return float(auc(recall, precision))
return float(average_precision_score(self.truth, self.probabilities[:, 1]))

def _autoencode(self, vec):
needs_encoding = not _encode_predictions_and_truth_ or (isinstance(vec[0], str) and not vec[0].isdigit())
return self.target.label_encoder.transform(vec) if needs_encoding else vec

def _auc_multi(self, mc='raise'):
average = ClassificationResult.multi_class_average
return float(roc_auc_score(self.truth, self.probabilities, average=average, labels=self.labels, multi_class=mc))

def _cm(self):
return confusion_matrix(self.truth, self.predictions, labels=self.labels)

def _fbeta(self, beta):
average = ClassificationResult.multi_class_average if self.type == DatasetType.multiclass else 'binary'
return float(fbeta_score(self.truth, self.predictions, beta=beta, average=average, labels=self.labels))

def _per_class_errors(self):
return [(s-d)/s for s, d in ((sum(r), r[i]) for i, r in enumerate(self._cm()))]



class RegressionResult(Result):

@@ -510,24 +556,34 @@ def __init__(self, predictions_df, info=None):
self.type = DatasetType.regression

def mae(self):
"""Mean Absolute Error"""
return float(mean_absolute_error(self.truth, self.predictions))

def mse(self):
"""Mean Squared Error"""
return float(mean_squared_error(self.truth, self.predictions))

def msle(self):
"""Mean Squared Logarithmic Error"""
return float(mean_squared_log_error(self.truth, self.predictions))

def rmse(self):
"""Root Mean Square Error"""
return math.sqrt(self.mse())

def rmsle(self):
"""Root Mean Square Logarithmic Error"""
return math.sqrt(self.msle())

def r2(self):
"""R^2"""
return float(r2_score(self.truth, self.predictions))


def higher_is_better(metric):
Collaborator: This seems a bit hacky. Better to have either a dictionary mapping or metrics as classes (example in AutoGluon).
Collaborator (author): I can't disagree with you: it IS a bit hacky.
Ideally, there should be a class for each metric. It's probably something I'll do at some point to support custom metrics or other customizations in a more satisfying way than what was done in #141.
If there's a demand for it, I'll do it.

return re.fullmatch(r"((pr_)?auc(_\w*)?)|(\w*acc)|(f\d+)|(r2)", metric)
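
For context, a minimal sketch of the dictionary/metric-class alternative suggested in the review thread above could look like the following. MetricInfo and higher_is_better_lookup are hypothetical names used only for illustration; they are not part of this PR.

from dataclasses import dataclass

@dataclass(frozen=True)
class MetricInfo:
    name: str
    higher_is_better: bool

# Explicit registry instead of a regex: an unknown metric raises a KeyError
# instead of silently matching (or not matching) a pattern.
_METRICS = {m.name: m for m in [
    MetricInfo("auc", True), MetricInfo("pr_auc", True),
    MetricInfo("acc", True), MetricInfo("balacc", True),
    MetricInfo("f1", True), MetricInfo("r2", True),
    MetricInfo("logloss", False), MetricInfo("rmse", False),
    MetricInfo("mae", False),
]}

def higher_is_better_lookup(metric: str) -> bool:
    return _METRICS[metric].higher_is_better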


_encode_predictions_and_truth_ = False

save_predictions = TaskResult.save_predictions
65 changes: 24 additions & 41 deletions amlb_report/results.py
@@ -1,11 +1,12 @@
"""
Loading results, formatting and adding columns
result is the raw result metric computed from predictions at the end the benchmark. For classification problems, it is usually auc for binomial classification and logloss for multinomial classification.
score ensures a standard comparison between tasks: higher is always better.
norm_score is a normalization of score on a [0, 1] scale, with {{zero_one_refs[0]}} score as 0 and {{zero_one_refs[1]}} score as 1.
imp_result and imp_score for imputed results/scores. Given a task and a framework:
if all folds results/scores are missing, then no imputation occurs, and the result is nan for each fold.
if only some folds results/scores are missing, then the missing result is imputed by the {{imp_framework}} result for this fold.
Loading results, formatting and adding columns.
result is the raw result metric computed from predictions at the end of the benchmark: higher is always better!
- For classification problems, it is usually auc for binary problems and negative log loss for multiclass problems.
- For regression problems, it is usually negative rmse.
norm_result is a normalization of result on a [0, 1] scale, with {{zero_one_refs[0]}} scoring as 0 and {{zero_one_refs[1]}} scoring as 1.
imp_result for imputed results. Given a task and a framework:
- if all folds results are missing, then no imputation occurs, and the result is nan for each fold.
- if only some folds results are missing, then the missing result is imputed by the {{imp_framework}} result for this fold.
"""

import numpy as np
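
To illustrate the norm_result convention described in the docstring above, here is a tiny worked example; the reference values below are made up for illustration and do not come from actual benchmark runs.

# Assumed values only: zero_one_refs typically points at a weak baseline
# (normalizes to 0) and a strong reference framework (normalizes to 1).
constant_predictor_result = -0.95   # neg_logloss of the baseline  -> normalizes to 0
reference_framework_result = -0.45  # neg_logloss of the reference -> normalizes to 1
framework_result = -0.50            # raw higher-is-better result being normalized

zero, one = constant_predictor_result, reference_framework_result
norm_result = (framework_result - zero) / (one - zero)
print(round(norm_result, 2))  # 0.9 -> close to the reference framework, far above the baseline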
@@ -52,35 +53,21 @@ def imputed(row):
return pd.isna(row.result) and pd.notna(row.imp_result)


fit_metrics = ['auc', 'acc', 'r2']


def metric_type(row, res_col='result'):
return 'fit' if any([row[res_col] == getattr(row, m, None) for m in fit_metrics]) else 'loss'


def score(row, res_col='result'):
return (row[res_col] if row['metric_type'] == 'fit'
else - row[res_col])


def norm_score(row, score_col='score',
zero_one_refs=None, ref_results=None,
aggregation=None):
def norm_result(row, res_col='result', zero_one_refs=None, ref_results=None, aggregation=None):
if zero_one_refs is None:
return row[score_col]
return row[res_col]

def get_val(ref, default):
try:
if isinstance(ref, str):
return (ref_results.loc[(ref_results.framework == ref)
& (ref_results.task == row.task)]
[score_col]
[res_col]
.agg(aggregation) if aggregation
else ref_results.loc[(ref_results.framework == ref)
& (ref_results.task == row.task)
& (ref_results.fold == row.fold)]
[score_col]
[res_col]
.item())
else:
return ref
@@ -89,9 +76,9 @@ def get_val(ref, default):
# return default

zero, one = (get_val(ref, i) for i, ref in enumerate(zero_one_refs))
rel_score = (row[score_col] - zero) / (one - zero)
return (- rel_score if row['metric_type'] == 'loss' and one < 0 <= zero
else rel_score)
norm_res = (row[res_col] - zero) / (one - zero)
return (- norm_res if row['metric'].startswith("neg_") and one < 0 <= zero
else norm_res)


def sorted_ints(arr):
@@ -117,7 +104,8 @@ def prepare_results(results,
imputation=None,
normalization=None,
ref_results=None,
duplicates_handling='fail' # other options are 'keep_first', 'keep_last', 'keep_none'
duplicates_handling='fail', # other options are 'keep_first', 'keep_last', 'keep_none'
include_metadata=False
):
if results is None or len(results) == 0:
return None
@@ -139,7 +127,7 @@

folds = results.fold.unique()

metadata = load_dataset_metadata(results)
metadata = load_dataset_metadata(results) if include_metadata else {}

done = results.set_index(['task', 'fold', 'framework'])
done = remove_duplicates(done, handling=duplicates_handling)
@@ -158,9 +146,8 @@

# extending the data frame
results = results.append(missing.reset_index())
results['type'] = [task_prop(row, metadata, 'type') for _, row in results.iterrows()]
results['metric_type'] = [metric_type(row) for _, row in results.iterrows()]
results['score'] = [score(row) for _, row in results.iterrows()]
if 'type' not in results:
results['type'] = [task_prop(row, metadata, 'type') for _, row in results.iterrows()]

if ref_results is None:
ref_results = results
@@ -177,18 +164,14 @@
imp_framework=imp_fr, imp_results=ref_results,
imp_value=imp_val, aggregation=aggr)
for _, row in results.iterrows()]
results['imp_score'] = [impute_result(row, results, 'score',
imp_framework=imp_fr, imp_results=ref_results,
imp_value=imp_val, aggregation=aggr)
for _, row in results.iterrows()]

if normalization is not None:
score_col = 'imp_score' if imputation is not None else 'score'
res_col = 'imp_result' if imputation is not None else 'result'
zero_one = normalization[0:2]
aggr = normalization[2] if len(normalization) > 2 else None
results['norm_score'] = [norm_score(row, score_col,
zero_one_refs=zero_one, ref_results=ref_results, aggregation=aggr)
for _, row in results.iterrows()]
results['norm_result'] = [norm_result(row, res_col,
zero_one_refs=zero_one, ref_results=ref_results, aggregation=aggr)
for _, row in results.iterrows()]

return Namespace(
results=results,