-
Notifications
You must be signed in to change notification settings - Fork 229
/
Copy pathbase_metric.py
907 lines (751 loc) · 34.1 KB
/
base_metric.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
"""
Base module.
"""
from sklearn.base import BaseEstimator
from sklearn.utils.extmath import stable_cumsum
from sklearn.utils.validation import _is_arraylike, check_is_fitted
from sklearn.metrics import roc_auc_score, roc_curve, precision_recall_curve
import numpy as np
from abc import ABCMeta, abstractmethod
from ._util import ArrayIndexer, check_input, validate_vector
import warnings
class BaseMetricLearner(BaseEstimator, metaclass=ABCMeta):
"""
Base class for all metric-learners.
Parameters
----------
preprocessor : array-like, shape=(n_samples, n_features) or callable
The preprocessor to call to get tuples from indices. If array-like,
tuples will be gotten like this: X[indices].
"""
def __init__(self, preprocessor=None):
self.preprocessor = preprocessor
@abstractmethod
def score_pairs(self, pairs):
"""
.. deprecated:: 0.7.0 Refer to `pair_distance` and `pair_similarity`.
.. warning::
This method will be removed in 0.8.0. Please refer to `pair_distance`
or `pair_similarity`. This change will occur in order to add learners
that don't necessarily learn a Mahalanobis distance.
Returns the score between pairs
(can be a similarity, or a distance/metric depending on the algorithm)
Parameters
----------
pairs : array-like, shape=(n_pairs, 2, n_features) or (n_pairs, 2)
3D Array of pairs to score, with each row corresponding to two points,
for 2D array of indices of pairs if the metric learner uses a
preprocessor.
Returns
-------
scores : `numpy.ndarray` of shape=(n_pairs,)
The score of every pair.
See Also
--------
get_metric : a method that returns a function to compute the metric between
two points. The difference between `score_pairs` is that it works on two
1D arrays and cannot use a preprocessor. Besides, the returned function
is independent of the metric learner and hence is not modified if the
metric learner is.
"""
@abstractmethod
def pair_score(self, pairs):
"""
.. versionadded:: 0.7.0 Compute the similarity score between pairs
Returns the similarity score between pairs of points (the larger the score,
the more similar the pair). For metric learners that learn a distance,
the score is simply the opposite of the distance between pairs. All learners
have access to this method.
Parameters
----------
pairs : array-like, shape=(n_pairs, 2, n_features) or (n_pairs, 2)
3D Array of pairs to score, with each row corresponding to two points,
for 2D array of indices of pairs if the metric learner uses a
preprocessor.
Returns
-------
scores : `numpy.ndarray` of shape=(n_pairs,)
The score of every pair.
See Also
--------
get_metric : a method that returns a function to compute the metric between
two points. The difference with `pair_score` is that it works on two
1D arrays and cannot use a preprocessor. Besides, the returned function
is independent of the metric learner and hence is not modified if the
metric learner is.
"""
@abstractmethod
def pair_distance(self, pairs):
"""
.. versionadded:: 0.7.0 Compute the distance between pairs
Returns the (pseudo) distance between pairs, when available. For metric
learners that do not learn a (pseudo) distance, an error is thrown
instead.
Parameters
----------
pairs : array-like, shape=(n_pairs, 2, n_features) or (n_pairs, 2)
3D Array of pairs for which to compute the distance, with each
row corresponding to two points, for 2D array of indices of pairs
if the metric learner uses a preprocessor.
Returns
-------
scores : `numpy.ndarray` of shape=(n_pairs,)
The distance between every pair.
See Also
--------
get_metric : a method that returns a function to compute the metric between
two points. The difference with `pair_distance` is that it works on two
1D arrays and cannot use a preprocessor. Besides, the returned function
is independent of the metric learner and hence is not modified if the
metric learner is.
"""
def _check_preprocessor(self):
"""Initializes the preprocessor"""
if _is_arraylike(self.preprocessor):
self.preprocessor_ = ArrayIndexer(self.preprocessor)
elif callable(self.preprocessor) or self.preprocessor is None:
self.preprocessor_ = self.preprocessor
else:
raise ValueError("Invalid type for the preprocessor: {}. You should "
"provide either None, an array-like object, "
"or a callable.".format(type(self.preprocessor)))
def _prepare_inputs(self, X, y=None, type_of_inputs='classic',
**kwargs):
"""Initializes the preprocessor and processes inputs. See `check_input`
for more details.
Parameters
----------
X : array-like
The input data array to check.
y : array-like
The input labels array to check.
type_of_inputs : `str` {'classic', 'tuples'}
The type of inputs to check. If 'classic', the input should be
a 2D array-like of points or a 1D array like of indicators of points. If
'tuples', the input should be a 3D array-like of tuples or a 2D
array-like of indicators of tuples.
**kwargs : dict
Arguments to pass to check_input.
Returns
-------
X : `numpy.ndarray`
The checked input data array.
y : `numpy.ndarray` (optional)
The checked input labels array.
"""
self._check_preprocessor()
check_is_fitted(self, ['preprocessor_'])
return check_input(X, y,
type_of_inputs=type_of_inputs,
preprocessor=self.preprocessor_,
estimator=self,
tuple_size=getattr(self, '_tuple_size', None),
**kwargs)
@abstractmethod
def get_metric(self):
"""Returns a function that takes as input two 1D arrays and outputs
the value of the learned metric on these two points. Depending on the
algorithm, it can return a distance or a similarity function between
pairs.
This function will be independent from the metric learner that learned it
(it will not be modified if the initial metric learner is modified),
and it can be directly plugged into the `metric` argument of
scikit-learn's estimators.
Returns
-------
metric_fun : function
The function described above.
Examples
--------
.. doctest::
>>> from metric_learn import NCA
>>> from sklearn.datasets import make_classification
>>> from sklearn.neighbors import KNeighborsClassifier
>>> nca = NCA()
>>> X, y = make_classification()
>>> nca.fit(X, y)
>>> knn = KNeighborsClassifier(metric=nca.get_metric())
>>> knn.fit(X, y) # doctest: +NORMALIZE_WHITESPACE
KNeighborsClassifier(algorithm='auto', leaf_size=30,
metric=<function MahalanobisMixin.get_metric.<locals>.metric_fun
at 0x...>,
metric_params=None, n_jobs=None, n_neighbors=5, p=2,
weights='uniform')
See Also
--------
pair_distance : a method that returns the distance between several
pairs of points. Unlike `get_metric`, this is a method of the metric
learner and therefore can change if the metric learner changes. Besides,
it can use the metric learner's preprocessor, and works on concatenated
arrays.
pair_score : a method that returns the similarity score between
several pairs of points. Unlike `get_metric`, this is a method of the
metric learner and therefore can change if the metric learner changes.
Besides, it can use the metric learner's preprocessor, and works on
concatenated arrays.
"""
class MetricTransformer(metaclass=ABCMeta):
@abstractmethod
def transform(self, X):
"""Applies the metric transformation.
Parameters
----------
X : (n x d) matrix
Data to transform.
Returns
-------
transformed : (n x d) matrix
Input data transformed to the metric space by :math:`XL^{\\top}`
"""
class MahalanobisMixin(BaseMetricLearner, MetricTransformer,
metaclass=ABCMeta):
r"""Mahalanobis metric learning algorithms.
Algorithm that learns a Mahalanobis (pseudo) distance :math:`d_M(x, x')`,
defined between two column vectors :math:`x` and :math:`x'` by: :math:`d_M(x,
x') = \sqrt{(x-x')^T M (x-x')}`, where :math:`M` is a learned symmetric
positive semi-definite (PSD) matrix. The metric between points can then be
expressed as the euclidean distance between points embedded in a new space
through a linear transformation. Indeed, the above matrix can be decomposed
into the product of two transpose matrices (through SVD or Cholesky
decomposition): :math:`d_M(x, x')^2 = (x-x')^T M (x-x') = (x-x')^T L^T L
(x-x') = (L x - L x')^T (L x- L x')`
Attributes
----------
components_ : `numpy.ndarray`, shape=(n_components, n_features)
The learned linear transformation ``L``.
"""
def score_pairs(self, pairs):
r"""
.. deprecated:: 0.7.0
This method is deprecated. Please use `pair_distance` instead.
.. warning::
This method will be removed in 0.8.0. Please refer to `pair_distance`
or `pair_score`. This change will occur in order to add learners
that don't necessarily learn a Mahalanobis distance.
Returns the learned Mahalanobis distance between pairs.
This distance is defined as: :math:`d_M(x, x') = \\sqrt{(x-x')^T M (x-x')}`
where ``M`` is the learned Mahalanobis matrix, for every pair of points
``x`` and ``x'``. This corresponds to the euclidean distance between
embeddings of the points in a new space, obtained through a linear
transformation. Indeed, we have also: :math:`d_M(x, x') = \\sqrt{(x_e -
x_e')^T (x_e- x_e')}`, with :math:`x_e = L x` (See
:class:`MahalanobisMixin`).
Parameters
----------
pairs : array-like, shape=(n_pairs, 2, n_features) or (n_pairs, 2)
3D Array of pairs to score, with each row corresponding to two points,
for 2D array of indices of pairs if the metric learner uses a
preprocessor.
Returns
-------
scores : `numpy.ndarray` of shape=(n_pairs,)
The learned Mahalanobis distance for every pair.
See Also
--------
get_metric : a method that returns a function to compute the metric between
two points. The difference with `score_pairs` is that it works on two
1D arrays and cannot use a preprocessor. Besides, the returned function
is independent of the metric learner and hence is not modified if the
metric learner is.
:ref:`mahalanobis_distances` : The section of the project documentation
that describes Mahalanobis Distances.
"""
dpr_msg = ("score_pairs will be deprecated in release 0.7.0. "
"Use pair_score to compute similarity scores, or "
"pair_distances to compute distances.")
warnings.warn(dpr_msg, category=FutureWarning)
return self.pair_distance(pairs)
def pair_score(self, pairs):
"""
Returns the opposite of the learned Mahalanobis distance between pairs.
Parameters
----------
pairs : array-like, shape=(n_pairs, 2, n_features) or (n_pairs, 2)
3D Array of pairs to score, with each row corresponding to two points,
for 2D array of indices of pairs if the metric learner uses a
preprocessor.
Returns
-------
scores : `numpy.ndarray` of shape=(n_pairs,)
The opposite of the learned Mahalanobis distance for every pair.
See Also
--------
get_metric : a method that returns a function to compute the metric between
two points. The difference with `pair_score` is that it works on two
1D arrays and cannot use a preprocessor. Besides, the returned function
is independent of the metric learner and hence is not modified if the
metric learner is.
:ref:`mahalanobis_distances` : The section of the project documentation
that describes Mahalanobis Distances.
"""
return -1 * self.pair_distance(pairs)
def pair_distance(self, pairs):
"""
Returns the learned Mahalanobis distance between pairs.
This distance is defined as: :math:`d_M(x, x') = \\sqrt{(x-x')^T M (x-x')}`
where ``M`` is the learned Mahalanobis matrix, for every pair of points
``x`` and ``x'``. This corresponds to the euclidean distance between
embeddings of the points in a new space, obtained through a linear
transformation. Indeed, we have also: :math:`d_M(x, x') = \\sqrt{(x_e -
x_e')^T (x_e- x_e')}`, with :math:`x_e = L x` (See
:class:`MahalanobisMixin`).
Parameters
----------
pairs : array-like, shape=(n_pairs, 2, n_features) or (n_pairs, 2)
3D Array of pairs to score, with each row corresponding to two points,
for 2D array of indices of pairs if the metric learner uses a
preprocessor.
Returns
-------
scores : `numpy.ndarray` of shape=(n_pairs,)
The learned Mahalanobis distance for every pair.
See Also
--------
get_metric : a method that returns a function to compute the metric between
two points. The difference with `pair_distance` is that it works on two
1D arrays and cannot use a preprocessor. Besides, the returned function
is independent of the metric learner and hence is not modified if the
metric learner is.
:ref:`mahalanobis_distances` : The section of the project documentation
that describes Mahalanobis Distances.
"""
check_is_fitted(self, ['preprocessor_'])
pairs = check_input(pairs, type_of_inputs='tuples',
preprocessor=self.preprocessor_,
estimator=self, tuple_size=2)
pairwise_diffs = self.transform(pairs[:, 1, :] - pairs[:, 0, :])
# (for MahalanobisMixin, the embedding is linear so we can just embed the
# difference)
return np.sqrt(np.sum(pairwise_diffs**2, axis=-1))
def transform(self, X):
"""Embeds data points in the learned linear embedding space.
Transforms samples in ``X`` into ``X_embedded``, samples inside a new
embedding space such that: ``X_embedded = X.dot(L.T)``, where ``L`` is
the learned linear transformation (See :class:`MahalanobisMixin`).
Parameters
----------
X : `numpy.ndarray`, shape=(n_samples, n_features)
The data points to embed.
Returns
-------
X_embedded : `numpy.ndarray`, shape=(n_samples, n_components)
The embedded data points.
"""
check_is_fitted(self, ['preprocessor_', 'components_'])
X_checked = check_input(X, type_of_inputs='classic', estimator=self,
preprocessor=self.preprocessor_,
accept_sparse=True)
return X_checked.dot(self.components_.T)
def get_metric(self):
check_is_fitted(self, 'components_')
components_T = self.components_.T.copy()
def metric_fun(u, v, squared=False):
"""This function computes the metric between u and v, according to the
previously learned metric.
Parameters
----------
u : array-like, shape=(n_features,)
The first point involved in the distance computation.
v : array-like, shape=(n_features,)
The second point involved in the distance computation.
squared : `bool`
If True, the function will return the squared metric between u and
v, which is faster to compute.
Returns
-------
distance : float
The distance between u and v according to the new metric.
"""
u = validate_vector(u)
v = validate_vector(v)
transformed_diff = (u - v).dot(components_T)
dist = np.dot(transformed_diff, transformed_diff.T)
if not squared:
dist = np.sqrt(dist)
return dist
return metric_fun
get_metric.__doc__ = BaseMetricLearner.get_metric.__doc__
def get_mahalanobis_matrix(self):
"""Returns a copy of the Mahalanobis matrix learned by the metric learner.
Returns
-------
M : `numpy.ndarray`, shape=(n_features, n_features)
The copy of the learned Mahalanobis matrix.
"""
check_is_fitted(self, 'components_')
return self.components_.T.dot(self.components_)
class _PairsClassifierMixin(BaseMetricLearner):
"""Base class for pairs learners.
Attributes
----------
threshold_ : `float`
If the distance metric between two points is lower than this threshold,
points will be classified as similar, otherwise they will be
classified as dissimilar.
"""
_tuple_size = 2 # number of points in a tuple, 2 for pairs
def predict(self, pairs):
"""Predicts the learned metric between input pairs. (For now it just
calls decision function).
Returns the learned metric value between samples in every pair. It should
ideally be low for similar samples and high for dissimilar samples.
Parameters
----------
pairs : array-like, shape=(n_pairs, 2, n_features) or (n_pairs, 2)
3D Array of pairs to predict, with each row corresponding to two
points, or 2D array of indices of pairs if the metric learner uses a
preprocessor.
Returns
-------
y_predicted : `numpy.ndarray` of floats, shape=(n_constraints,)
The predicted learned metric value between samples in every pair.
"""
check_is_fitted(self, 'preprocessor_')
if "threshold_" not in vars(self):
msg = ("A threshold for this estimator has not been set, "
"call its set_threshold or calibrate_threshold method.")
raise AttributeError(msg)
return 2 * (- self.decision_function(pairs) <= self.threshold_) - 1
def decision_function(self, pairs):
"""Returns the decision function used to classify the pairs.
Returns the opposite of the learned metric value between samples in every
pair, to be consistent with scikit-learn conventions. Hence it should
ideally be low for dissimilar samples and high for similar samples.
This is the decision function that is used to classify pairs as similar
(+1), or dissimilar (-1).
Parameters
----------
pairs : array-like, shape=(n_pairs, 2, n_features) or (n_pairs, 2)
3D Array of pairs to predict, with each row corresponding to two
points, or 2D array of indices of pairs if the metric learner uses a
preprocessor.
Returns
-------
y_predicted : `numpy.ndarray` of floats, shape=(n_constraints,)
The predicted decision function value for each pair.
"""
check_is_fitted(self, 'preprocessor_')
pairs = check_input(pairs, type_of_inputs='tuples',
preprocessor=self.preprocessor_,
estimator=self, tuple_size=self._tuple_size)
return self.pair_score(pairs)
def score(self, pairs, y):
"""Computes score of pairs similarity prediction.
Returns the ``roc_auc`` score of the fitted metric learner. It is
computed in the following way: for every value of a threshold
``t`` we classify all pairs of samples where the predicted distance is
inferior to ``t`` as belonging to the "similar" class, and the other as
belonging to the "dissimilar" class, and we count false positive and
true positives as in a classical ``roc_auc`` curve.
Parameters
----------
pairs : array-like, shape=(n_pairs, 2, n_features) or (n_pairs, 2)
3D Array of pairs, with each row corresponding to two points,
or 2D array of indices of pairs if the metric learner uses a
preprocessor.
y : array-like, shape=(n_constraints,)
The corresponding labels.
Returns
-------
score : float
The ``roc_auc`` score.
"""
return roc_auc_score(y, self.decision_function(pairs))
def set_threshold(self, threshold):
"""Sets the threshold of the metric learner to the given value `threshold`.
See more in the :ref:`User Guide <calibration>`.
Parameters
----------
threshold : float
The threshold value we want to set. It is the value to which the
predicted distance for test pairs will be compared. If they are superior
to the threshold they will be classified as similar (+1),
and dissimilar (-1) if not.
Returns
-------
self : `_PairsClassifier`
The pairs classifier with the new threshold set.
"""
check_is_fitted(self, 'preprocessor_')
self.threshold_ = threshold
return self
def calibrate_threshold(self, pairs_valid, y_valid, strategy='accuracy',
min_rate=None, beta=1.):
"""Decision threshold calibration for pairwise binary classification
Method that calibrates the decision threshold (cutoff point) of the metric
learner. This threshold will then be used when calling the method
`predict`. The methods for picking cutoff points make use of traditional
binary classification evaluation statistics such as the true positive and
true negative rates and F-scores. The threshold will be found to maximize
the chosen score on the validation set ``(pairs_valid, y_valid)``.
See more in the :ref:`User Guide <calibration>`.
Parameters
----------
strategy : str, optional (default='accuracy')
The strategy to use for choosing the cutoff threshold.
'accuracy'
Selects a decision threshold that maximizes the accuracy.
'f_beta'
Selects a decision threshold that maximizes the f_beta score,
with beta given by the parameter `beta`.
'max_tpr'
Selects a decision threshold that yields the highest true positive
rate with true negative rate at least equal to the value of the
parameter `min_rate`.
'max_tnr'
Selects a decision threshold that yields the highest true negative
rate with true positive rate at least equal to the value of the
parameter `min_rate`.
beta : float in [0, 1], optional (default=None)
Beta value to be used in case strategy == 'f_beta'.
min_rate : float in [0, 1] or None, (default=None)
In case strategy is 'max_tpr' or 'max_tnr' this parameter must be set
to specify the minimal value for the true negative rate or true positive
rate respectively that needs to be achieved.
pairs_valid : array-like, shape=(n_pairs_valid, 2, n_features)
The validation set of pairs to use to set the threshold.
y_valid : array-like, shape=(n_pairs_valid,)
The labels of the pairs of the validation set to use to set the
threshold. They must be +1 for positive pairs and -1 for negative pairs.
References
----------
.. [1] Receiver-operating characteristic (ROC) plots: a fundamental
evaluation tool in clinical medicine, MH Zweig, G Campbell -
Clinical chemistry, 1993
.. [2] most of the code of this function is from scikit-learn's PR #10117
See Also
--------
sklearn.calibration : scikit-learn's module for calibrating classifiers
"""
check_is_fitted(self, 'preprocessor_')
self._validate_calibration_params(strategy, min_rate, beta)
pairs_valid, y_valid = self._prepare_inputs(pairs_valid, y_valid,
type_of_inputs='tuples')
n_samples = pairs_valid.shape[0]
if strategy == 'accuracy':
scores = self.decision_function(pairs_valid)
scores_sorted_idces = np.argsort(scores)[::-1]
scores_sorted = scores[scores_sorted_idces]
# true labels ordered by decision_function value: (higher first)
y_ordered = y_valid[scores_sorted_idces]
# we need to add a threshold that will reject all points
scores_sorted = np.concatenate([[scores_sorted[0] + 1], scores_sorted])
# finds the threshold that maximizes the accuracy:
cum_tp = stable_cumsum(y_ordered == 1) # cumulative number of true
# positives
# we need to add the point where all samples are rejected:
cum_tp = np.concatenate([[0.], cum_tp])
cum_tn_inverted = stable_cumsum(y_ordered[::-1] == -1)
cum_tn = np.concatenate([[0.], cum_tn_inverted])[::-1]
cum_accuracy = (cum_tp + cum_tn) / n_samples
imax = np.argmax(cum_accuracy)
# we set the threshold to the lowest accepted score
# note: we are working with negative distances but we want the threshold
# to be with respect to the actual distances so we take minus sign
self.threshold_ = - scores_sorted[imax]
# note: if the best is to reject all points it's already one of the
# thresholds (scores_sorted[0])
return self
if strategy == 'f_beta':
precision, recall, thresholds = precision_recall_curve(
y_valid, self.decision_function(pairs_valid), pos_label=1)
# here the thresholds are decreasing
# We ignore the warnings here, in the same taste as
# https://github.com/scikit-learn/scikit-learn/blob/62d205980446a1abc1065
# f4332fd74eee57fcf73/sklearn/metrics/classification.py#L1284
with np.errstate(divide='ignore', invalid='ignore'):
f_beta = ((1 + beta**2) * (precision * recall) /
(beta**2 * precision + recall))
# We need to set nans to zero otherwise they will be considered higher
# than the others (also discussed in https://github.com/scikit-learn/
# scikit-learn/pull/10117/files#r262115773)
f_beta[np.isnan(f_beta)] = 0.
imax = np.argmax(f_beta)
# we set the threshold to the lowest accepted score
# note: we are working with negative distances but we want the threshold
# to be with respect to the actual distances so we take minus sign
self.threshold_ = - thresholds[imax]
# Note: we don't need to deal with rejecting all points (i.e. threshold =
# max_scores + 1), since this can never happen to be optimal
# (see a more detailed discussion in test_calibrate_threshold_extreme)
return self
fpr, tpr, thresholds = roc_curve(y_valid,
self.decision_function(pairs_valid),
pos_label=1)
# here the thresholds are decreasing
fpr, tpr, thresholds = fpr, tpr, thresholds
if strategy in ['max_tpr', 'max_tnr']:
if strategy == 'max_tpr':
indices = np.where(1 - fpr >= min_rate)[0]
imax = np.argmax(tpr[indices])
if strategy == 'max_tnr':
indices = np.where(tpr >= min_rate)[0]
imax = np.argmax(1 - fpr[indices])
imax_valid = indices[imax]
# note: we are working with negative distances but we want the threshold
# to be with respect to the actual distances so we take minus sign
if indices[imax] == len(thresholds): # we want to accept everything
self.threshold_ = - (thresholds[imax_valid] - 1)
else:
# thanks to roc_curve, the first point will always be max_scores
# + 1, see: https://github.com/scikit-learn/scikit-learn/pull/13523
self.threshold_ = - thresholds[imax_valid]
return self
@staticmethod
def _validate_calibration_params(strategy='accuracy', min_rate=None,
beta=1.):
"""Ensure that calibration parameters have allowed values"""
if strategy not in ('accuracy', 'f_beta', 'max_tpr',
'max_tnr'):
raise ValueError('Strategy can either be "accuracy", "f_beta" or '
'"max_tpr" or "max_tnr". Got "{}" instead.'
.format(strategy))
if strategy == 'max_tpr' or strategy == 'max_tnr':
if (min_rate is None or not isinstance(min_rate, (int, float)) or
not min_rate >= 0 or not min_rate <= 1):
raise ValueError('Parameter min_rate must be a number in'
'[0, 1]. '
'Got {} instead.'.format(min_rate))
if strategy == 'f_beta':
if beta is None or not isinstance(beta, (int, float)):
raise ValueError('Parameter beta must be a real number. '
'Got {} instead.'.format(type(beta)))
class _TripletsClassifierMixin(BaseMetricLearner):
"""Base class for triplets learners.
"""
_tuple_size = 3 # number of points in a tuple, 3 for triplets
def predict(self, triplets):
"""Predicts the ordering between sample distances in input triplets.
For each triplets, returns 1 if the first element is closer to the second
than to the last and -1 if not.
Parameters
----------
triplets : array-like, shape=(n_triplets, 3, n_features) or (n_triplets, 3)
3D array of triplets to predict, with each row corresponding to three
points, or 2D array of indices of triplets if the metric learner
uses a preprocessor.
Returns
-------
prediction : `numpy.ndarray` of floats, shape=(n_constraints,)
Predictions of the ordering of pairs, for each triplet.
"""
return 2 * (self.decision_function(triplets) > 0) - 1
def decision_function(self, triplets):
"""Predicts differences between sample distances in input triplets.
For each triplet (X_a, X_b, X_c) in the samples, computes the difference
between the learned distance of the second pair (X_a, X_c) minus the
learned distance of the first pair (X_a, X_b). The higher it is, the more
probable it is that the pairs in the triplets are presented in the right
order, i.e. that the label of the triplet is 1. The lower it is, the more
probable it is that the label of the triplet is -1.
Parameters
----------
triplet : array-like, shape=(n_triplets, 3, n_features) or \
(n_triplets, 3)
3D array of triplets to predict, with each row corresponding to three
points, or 2D array of indices of triplets if the metric learner
uses a preprocessor.
Returns
-------
decision_function : `numpy.ndarray` of floats, shape=(n_constraints,)
Metric differences.
"""
check_is_fitted(self, 'preprocessor_')
triplets = check_input(triplets, type_of_inputs='tuples',
preprocessor=self.preprocessor_,
estimator=self, tuple_size=self._tuple_size)
return (self.pair_score(triplets[:, :2]) -
self.pair_score(triplets[:, [0, 2]]))
def score(self, triplets):
"""Computes score on input triplets.
Returns the accuracy score of the following classification task: a triplet
(X_a, X_b, X_c) is correctly classified if the predicted similarity between
the first pair (X_a, X_b) is higher than that of the second pair (X_a, X_c)
Parameters
----------
triplets : array-like, shape=(n_triplets, 3, n_features) or \
(n_triplets, 3)
3D array of triplets to score, with each row corresponding to three
points, or 2D array of indices of triplets if the metric learner
uses a preprocessor.
Returns
-------
score : float
The triplets score.
"""
# Since the prediction is a vector of values in {-1, +1}, we need to
# rescale them to {0, 1} to compute the accuracy using the mean (because
# then 1 means a correctly classified result (pairs are in the right
# order), and a 0 an incorrectly classified result (pairs are in the
# wrong order).
return self.predict(triplets).mean() / 2 + 0.5
class _QuadrupletsClassifierMixin(BaseMetricLearner):
"""Base class for quadruplets learners.
"""
_tuple_size = 4 # number of points in a tuple, 4 for quadruplets
def predict(self, quadruplets):
"""Predicts the ordering between sample distances in input quadruplets.
For each quadruplet, returns 1 if the quadruplet is in the right order (
first pair is more similar than second pair), and -1 if not.
Parameters
----------
quadruplets : array-like, shape=(n_quadruplets, 4, n_features) or \
(n_quadruplets, 4)
3D Array of quadruplets to predict, with each row corresponding to four
points, or 2D array of indices of quadruplets if the metric learner
uses a preprocessor.
Returns
-------
prediction : `numpy.ndarray` of floats, shape=(n_constraints,)
Predictions of the ordering of pairs, for each quadruplet.
"""
return np.sign(self.decision_function(quadruplets))
def decision_function(self, quadruplets):
"""Predicts differences between sample distances in input quadruplets.
For each quadruplet in the samples, computes the difference between the
learned metric of the second pair minus the learned metric of the first
pair. The higher it is, the more probable it is that the pairs in the
quadruplet are presented in the right order, i.e. that the label of the
quadruplet is 1. The lower it is, the more probable it is that the label of
the quadruplet is -1.
Parameters
----------
quadruplets : array-like, shape=(n_quadruplets, 4, n_features) or \
(n_quadruplets, 4)
3D Array of quadruplets to predict, with each row corresponding to four
points, or 2D array of indices of quadruplets if the metric learner
uses a preprocessor.
Returns
-------
decision_function : `numpy.ndarray` of floats, shape=(n_constraints,)
Metric differences.
"""
check_is_fitted(self, 'preprocessor_')
quadruplets = check_input(quadruplets, type_of_inputs='tuples',
preprocessor=self.preprocessor_,
estimator=self, tuple_size=self._tuple_size)
return (self.pair_score(quadruplets[:, :2]) -
self.pair_score(quadruplets[:, 2:]))
def score(self, quadruplets):
"""Computes score on input quadruplets
Returns the accuracy score of the following classification task: a record
is correctly classified if the predicted similarity between the first two
samples is higher than that of the last two.
Parameters
----------
quadruplets : array-like, shape=(n_quadruplets, 4, n_features) or \
(n_quadruplets, 4)
3D Array of quadruplets to score, with each row corresponding to four
points, or 2D array of indices of quadruplets if the metric learner
uses a preprocessor.
Returns
-------
score : float
The quadruplets score.
"""
# Since the prediction is a vector of values in {-1, +1}, we need to
# rescale them to {0, 1} to compute the accuracy using the mean (because
# then 1 means a correctly classified result (pairs are in the right
# order), and a 0 an incorrectly classified result (pairs are in the
# wrong order).
return self.predict(quadruplets).mean() / 2 + 0.5