Source code for abx_numpy.abx_numpy

# -*- coding: utf-8 -*-
"""
abx_numpy: Main module

Copyright 2015, Roland Thiolliere
Licensed under GPLv3.
"""
import numpy as np
import itertools
import lib


def score(classes, distances, is_sorted=False):
    """Compute the ABX score for a set of classes and a distance matrix.

    For every ordered pair of distinct classes (A, B), each item ``a`` of A
    is compared against every item ``b`` of B using every other item ``x``
    of A: a triplet counts +1 when ``d(a, x) < d(b, x)``, -1 when
    ``d(a, x) > d(b, x)`` and 0 on ties. The mean over triplets is halved
    and averaged over the items ``a``, so each score lies in [-0.5, 0.5].

    Parameters
    ----------
    classes : array (n_items)
        1-D array containing the class labels of the items.
    distances : array (n_items, n_items)
        2-D array containing the pairwise distance of the items.
    is_sorted : bool, optional
        True when `classes` is already sorted and `distances` follows the
        same order; otherwise both are reordered here. Default to False.

    Returns
    -------
    average : float
        average abx score (nan entries are ignored)
    labels : array (n_classes)
        1D array containing the unique classes
    scores : array (n_classes, n_classes)
        2D array containing the abx scores for each pair of classes. The
        diagonal contains nan values, and so does every row whose class
        has fewer than two items (no X item is available).
    """
    if not is_sorted:
        order = np.argsort(classes)
        _classes = classes[order]
        # Permute rows and columns alike. ``distances[order, order]`` would
        # pair the two index arrays element-wise and return a 1-D vector.
        _distances = distances[order][:, order]
    else:
        _classes, _distances = classes, distances

    # NOTE(review): lib.unique_sorted is assumed to return the unique labels
    # and n_classes + 1 boundary offsets into the sorted array (class k
    # spans indexes[k]:indexes[k + 1]) -- inferred from the usage below.
    labels, indexes = lib.unique_sorted(_classes)
    n_labels = len(labels)

    scores = np.zeros((n_labels, n_labels))
    scores[np.diag_indices(n_labels)] = np.nan

    for idx_label1, idx_label2 in itertools.product(range(n_labels),
                                                    repeat=2):
        if idx_label1 == idx_label2:
            continue

        a_start, a_stop = indexes[idx_label1], indexes[idx_label1 + 1]
        b_start, b_stop = indexes[idx_label2], indexes[idx_label2 + 1]
        n_a = a_stop - a_start

        if n_a < 2:
            # No X item distinct from A exists: the score is undefined.
            scores[idx_label1, idx_label2] = np.nan
            continue

        # Loop-invariant blocks: distances from A items / B items to the
        # items of class A (the candidate X items).
        d_a = _distances[a_start:a_stop, a_start:a_stop]
        d_b = _distances[b_start:b_stop, a_start:a_stop]
        positions = np.arange(n_a)

        total = 0.
        for a in range(n_a):
            others = positions != a  # every X of the class except a itself
            d_ax = d_a[a, others]    # (n_x,), broadcast against d_bx
            d_bx = d_b[:, others]    # (n_b, n_x)
            # Cast to int before subtracting: NumPy rejects ``-`` on
            # boolean arrays.
            wins = (d_ax < d_bx).astype(int)
            losses = (d_ax > d_bx).astype(int)
            total += np.mean(wins - losses) * 0.5
        scores[idx_label1, idx_label2] = total / n_a

    return np.nanmean(scores), labels, scores
def compute_distances(features, distance_function):
    """Build the full pairwise distance matrix of a set of items.

    `distance_function` is called once for every ordered pair of rows of
    `features` (including each row with itself), in row-major order.

    Parameters
    ----------
    features : array (n_items, dim_features)
        2-D array containing the features of the items.
    distance_function : callable
        Called as ``distance_function(features[i], features[j])``; its
        result is stored at position ``[i, j]``.

    Returns
    -------
    distances : array (n_items, n_items)
        2-D array containing the pairwise distance of the items.
    """
    count = features.shape[0]
    matrix = np.empty((count, count))
    # TODO: cython, only lower triangle in symmetric distance
    for row in range(count):
        for col in range(count):
            matrix[row, col] = distance_function(features[row],
                                                 features[col])
    return matrix
def sort(classes, features):
    """Reorder `classes` into ascending label order and apply the same
    permutation to `features`.

    Returns the pair (sorted classes, reordered features).
    """
    permutation = np.argsort(classes)
    sorted_classes = classes[permutation]
    reordered_features = features[permutation]
    return sorted_classes, reordered_features
def sample(classes, features, cutoff, is_sorted=False):
    """'Fair' sampling (non-uniform, inverse to the class weight)

    `cutoff` distinct items are drawn without replacement. Every item gets
    the sampling weight ``1 / (size of its class * number of classes)``,
    so each class carries the same total weight whatever its size.

    Parameters
    ----------
    classes : array (n_items)
        1-D array containing the class labels of the items.
    features : array (n_items, dim_features)
        2-D array containing the features of the items.
    cutoff : int
        Cutoff to use for sample (number of items kept).
    is_sorted : bool, optional
        True when `classes` and `features` are already sorted by class;
        otherwise they are sorted first. Default to False.

    Returns
    -------
    sampled classes, sampled features
        The kept items, in sorted class order.
    """
    # TODO: improve fairness by enforcing the number of element in each class
    # to be equal
    if is_sorted:
        _classes, _features = classes, features
    else:
        _classes, _features = sort(classes, features)

    population = _classes.shape[0]
    # NOTE(review): the boundaries are assumed to be n_classes + 1 offsets
    # into the sorted array -- inferred from the differences taken below.
    labels, boundaries = lib.unique_sorted(_classes)
    class_sizes = boundaries[1:] - boundaries[:-1]

    # One weight per class, then expanded to one weight per item.
    weight_per_class = 1. / (class_sizes * len(labels))
    weights = np.repeat(weight_per_class, class_sizes)

    kept = np.random.choice(population, size=cutoff, replace=False,
                            p=weights)
    # Sorting the drawn positions keeps the output in class order.
    kept = np.sort(kept)
    return _classes[kept], _features[kept]
def abx(classes, features, distance_function, cutoff=1000):
    """Calculate the ABX score for a set of classes and a features matrix.

    The order of the 'classes' and the 'features' arrays must be the same.
    The items are sorted by class, optionally sub-sampled down to `cutoff`
    items, then their pairwise distances are computed and scored.

    Parameters
    ----------
    classes : array (n_items)
        1-D array containing the class labels of the items.
    features : array (n_items, dim_features)
        2-D array containing the features of the items.
    distance_function : callable
        Distance function to use.
    cutoff : int, optional
        Cutoff to use for sample (number of items kept). None for no sample.
        Sampling only happens when `cutoff` is smaller than the number of
        items. Default to 1000.

    Returns
    -------
    average : float
        average abx score
    labels : array (n_classes)
        1D array containing the unique classes
    scores : array (n_classes, n_classes)
        2D array containing the abx scores for each pair of classes. The
        diagonal contains nan values
    """
    assert classes.shape[0] == features.shape[0], (
        "classes and features must contain the same number of items")

    _classes, _features = sort(classes, features)
    if cutoff and cutoff < _classes.shape[0]:
        _classes, _features = sample(_classes, _features, cutoff,
                                     is_sorted=True)

    distances = compute_distances(_features, distance_function)
    average, labels, scores = score(_classes, distances, is_sorted=True)
    return average, labels, scores


# Unfinished sketch of an alternative per-pair computation for `score`,
# kept for reference only (never enabled):
#
# def score2(classes, distances):
#     raise NotImplementedError
#     a = index.get_slice(idx_label1)
#     b = index.get_slice(idx_label2)
#     n_a = a.stop - a.start
#     x = np.eye(n_a, dtype=bool)
#     d_ax = _distances[a, a][~x].reshape((n_a, n_a-1)).T[None, :, :]
#     d_bx = _distances[b, a][:, None, :]
#     scores[idx_label1, idx_label2] = (np.mean((d_ax < d_bx) - (d_ax > d_bx))) * 0.5