# Source code for abx_numpy.abx_numpy

# -*- coding: utf-8 -*-
"""
abx_numpy: Main module

Copyright 2015, Roland Thiolliere
Licensed under GPLv3.
"""
import numpy as np
import itertools
import lib


def score(classes, distances, is_sorted=False):
    """Compute the ABX score for a set of classes and a distance matrix.

    For every ordered pair of distinct classes (A, B), every item ``a`` of A
    is compared against every item ``b`` of B using every other item ``x``
    of A as the probe: a triplet counts +1 when ``d(a, x) < d(b, x)``, -1
    when ``d(a, x) > d(b, x)`` and 0 on ties. The entry for (A, B) is half
    the mean of these values, so it lies in [-0.5, 0.5].

    Parameters
    ----------
    classes : array (n_items)
        1-D array containing the class labels of the items.
    distances : array (n_items, n_items)
        2-D array containing the pairwise distance of the items.
    is_sorted : bool, optional
        When True, `classes` is taken to be already sorted and `distances`
        to be in the matching order, so no reordering is performed.
        Default to False.

    Returns
    -------
    average : float
        average abx score (nan entries are ignored)
    labels : array (n_classes)
        1D array containing the unique classes
    scores : array (n_classes, n_classes)
        2D array containing the abx scores for each pair of classes.
        The diagonal contains nan values

    Notes
    -----
    A class with a single item has no probe ``x`` left, so the mean is
    taken over an empty array and the corresponding row of `scores` is nan.
    """
    if not is_sorted:
        order = np.argsort(classes)
        _classes = classes[order]
        # np.ix_ permutes rows and columns together; `distances[order, order]`
        # would pair the indices and only return diagonal entries.
        _distances = distances[np.ix_(order, order)]
    else:
        _classes, _distances = classes, distances
    labels, indexes = lib.unique_sorted(_classes)
    # `indexes` is used as class boundaries: class k spans
    # indexes[k]:indexes[k + 1] in the sorted order.
    starts, stops = indexes[:-1], indexes[1:]

    n_labels = len(labels)
    scores = np.zeros((n_labels, n_labels))
    scores[np.diag_indices(n_labels)] = np.nan
    for idx_a, idx_b in itertools.product(range(n_labels), repeat=2):
        if idx_a == idx_b:
            continue
        items_a = np.arange(starts[idx_a], stops[idx_a])
        rows_b = slice(starts[idx_b], stops[idx_b])
        for a in items_a:
            # Probes are all the items of class A except `a` itself.
            items_x = items_a[items_a != a]
            d_ax = _distances[a, items_x]          # shape (n_x,)
            d_bx = _distances[rows_b][:, items_x]  # shape (n_b, n_x)
            # Cast to int before subtracting: NumPy does not support `-`
            # between boolean arrays. d_ax broadcasts against each row of
            # d_bx.
            closer_to_a = (d_ax < d_bx).astype(int)
            closer_to_b = (d_ax > d_bx).astype(int)
            scores[idx_a, idx_b] += np.mean(closer_to_a - closer_to_b) * 0.5
        scores[idx_a, idx_b] /= len(items_a)
    return np.nanmean(scores), labels, scores


def compute_distances(features, distance_function):
    """Compute the distance matrix for an array of features and a distance
    function.

    `distance_function` is called once for every ordered pair of items,
    including an item with itself, so no symmetry is assumed.

    Parameters
    ----------
    features : array (n_items, dim_features)
        2-D array containing the features of the items. Any sequence
        indexable by item position is accepted.
    distance_function : callable
        Distance function to use; called as
        ``distance_function(features[i], features[j])``.

    Returns
    -------
    distances : array (n_items, n_items)
        2-D array containing the pairwise distance of the items.
    """
    n_items = len(features)
    distances = np.empty((n_items, n_items))
    # TODO: cython, only lower triangle in symmetric distance
    for i, j in itertools.product(range(n_items), repeat=2):
        distances[i, j] = distance_function(features[i], features[j])
    return distances


def sort(classes, features):
    """Sort classes according to labels and features according to the new
    order.

    Parameters
    ----------
    classes : array (n_items)
        1-D array containing the class labels of the items.
    features : array (n_items, dim_features)
        Array containing the features of the items, indexed by item along
        its first axis.

    Returns
    -------
    sorted classes, features reordered with the same permutation
    """
    order = np.argsort(classes)
    return classes[order], features[order]


def sample(classes, features, cutoff, is_sorted=False):
    """'Fair' sampling (non-uniform, inverse to the class weight)

    Every class gets the same total probability mass, split evenly between
    its items, so each item is drawn with probability
    ``1 / (class_size * n_classes)``. Items are drawn without replacement.

    Parameters
    ----------
    classes : array (n_items)
        1-D array containing the class labels of the items.
    features : array (n_items, dim_features)
        2-D array containing the features of the items.
    cutoff : int
        Cutoff to use for sample (number of items kept).
    is_sorted : bool, optional
        When True, `classes` is taken to be already sorted and `features`
        to be in the matching order, so no reordering is performed.
        Default to False.

    Returns
    -------
    sampled classes, sampled features
        Both follow the sorted class order.
    """
    #TODO: improve fairness by enforcing the number of element in each class
    # to be equal
    if is_sorted:
        _classes, _features = classes, features
    else:
        _classes, _features = sort(classes, features)
    n_items = _classes.shape[0]
    # `boundaries` is used as class limits: class k spans
    # boundaries[k]:boundaries[k + 1] in the sorted order.
    labels, boundaries = lib.unique_sorted(_classes)
    size_classes = boundaries[1:] - boundaries[:-1]
    # One probability per item, repeated over the items of each class.
    proba_sampling = np.repeat(1. / (size_classes * len(labels)), size_classes)
    kept = np.random.choice(n_items, size=cutoff, replace=False,
                            p=proba_sampling)
    # Sorting the drawn positions keeps the sample in sorted class order.
    kept = np.sort(kept)
    return _classes[kept], _features[kept]


def abx(classes, features, distance_function, cutoff=1000):
    """Calculate the ABX score for a set of classes and a features matrix.

    The order of the 'classes' and the 'features' arrays must be the same.

    The items are sorted by class, optionally subsampled down to `cutoff`
    items, then the pairwise distance matrix is computed and scored.

    Parameters
    ----------
    classes : array (n_items)
        1-D array containing the class labels of the items.
    features : array (n_items, dim_features)
        2-D array containing the features of the items.
    distance_function : callable
        Distance function to use.
    cutoff : int, optional
        Cutoff to use for sample (number of items kept). None for no sample.
        Default to 1000. Sampling only happens when there are more than
        `cutoff` items.

    Returns
    -------
    average : float
        average abx score
    labels : array (n_classes)
        1D array containing the unique classes
    scores : array (n_classes, n_classes)
        2D array containing the abx scores for each pair of classes.
        The diagonal contains nan values
    """
    assert classes.shape[0] == features.shape[0], (
        "classes and features must have the same number of items")
    _classes, _features = sort(classes, features)
    # A falsy cutoff (None or 0) disables sampling altogether.
    if cutoff and cutoff < _classes.shape[0]:
        _classes, _features = sample(_classes, _features, cutoff,
                                     is_sorted=True)
    distances = compute_distances(_features, distance_function)
    average, labels, scores = score(_classes, distances, is_sorted=True)
    return average, labels, scores



# def score2(classes, distances):
#     raise NotImplementedError
#             a = index.get_slice(idx_label1)
#             b = index.get_slice(idx_label2)
#             n_a = a.stop - a.start
#             x = np.eye(n_a, dtype=bool)
#             d_ax = _distances[a, a][~x].reshape((n_a, n_a-1)).T[None, :, :]
#             d_bx = _distances[b, a][:, None, :]
#             scores[idx_label1, idx_label2] = (np.mean((d_ax < d_bx) - (d_ax > d_bx))) * 0.5