faiss/Heap.h

496 lines
13 KiB
C++

/**
* Copyright (c) 2015-present, Facebook, Inc.
* All rights reserved.
*
* This source code is licensed under the BSD+Patents license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
/*
* C++ support for heaps. The set of functions is tailored for
* efficient similarity search.
*
* There is no specific object for a heap, and the functions that
* operate on a signle heap are inlined, because heaps are often
* small. More complex functions are implemented in Heaps.cpp
*
*/
#ifndef FAISS_Heap_h
#define FAISS_Heap_h
#include <climits>
#include <cstring>
#include <cmath>
#include <cassert>
#include <cstdio>
#include <limits>
namespace faiss {
/*******************************************************************
* C object: uniform handling of min and max heap
*******************************************************************/
/** The C object gives the type T of the values in the heap, the type
* of the keys, TI and the comparison that is done: > for the minheap
* and < for the maxheap. The neutral value will always be dropped in
* favor of any other value in the heap.
*/
template <typename T_, typename TI_>
struct CMax;
// traits of minheaps = heaps where the minimum value is stored on top
// useful to find the *max* values of an array
template <typename T_, typename TI_>
struct CMin {
typedef T_ T;
typedef TI_ TI;
typedef CMax<T_, TI_> Crev;
inline static bool cmp (T a, T b) {
return a < b;
}
// value that will be popped first -> must be smaller than all others
// for int types this is not strictly the smallest val (-max - 1)
inline static T neutral () {
return -std::numeric_limits<T>::max();
}
};
template <typename T_, typename TI_>
struct CMax {
typedef T_ T;
typedef TI_ TI;
typedef CMin<T_, TI_> Crev;
inline static bool cmp (T a, T b) {
return a > b;
}
inline static T neutral () {
return std::numeric_limits<T>::max();
}
};
/*******************************************************************
* Basic heap ops: push and pop
*******************************************************************/
/** Pops the top element from the heap defined by bh_val[0..k-1] and
* bh_ids[0..k-1]. on output the element at k-1 is undefined.
*/
template <class C> inline
void heap_pop (size_t k, typename C::T * bh_val, typename C::TI * bh_ids)
{
bh_val--; /* Use 1-based indexing for easier node->child translation */
bh_ids--;
typename C::T val = bh_val[k];
size_t i = 1, i1, i2;
while (1) {
i1 = i << 1;
i2 = i1 + 1;
if (i1 > k)
break;
if (i2 == k + 1 || C::cmp(bh_val[i1], bh_val[i2])) {
if (C::cmp(val, bh_val[i1]))
break;
bh_val[i] = bh_val[i1];
bh_ids[i] = bh_ids[i1];
i = i1;
}
else {
if (C::cmp(val, bh_val[i2]))
break;
bh_val[i] = bh_val[i2];
bh_ids[i] = bh_ids[i2];
i = i2;
}
}
bh_val[i] = bh_val[k];
bh_ids[i] = bh_ids[k];
}
/** Pushes the element (val, ids) into the heap bh_val[0..k-2] and
* bh_ids[0..k-2]. on output the element at k-1 is defined.
*/
template <class C> inline
void heap_push (size_t k,
typename C::T * bh_val, typename C::TI * bh_ids,
typename C::T val, typename C::TI ids)
{
bh_val--; /* Use 1-based indexing for easier node->child translation */
bh_ids--;
size_t i = k, i_father;
while (i > 1) {
i_father = i >> 1;
if (!C::cmp (val, bh_val[i_father])) /* the heap structure is ok */
break;
bh_val[i] = bh_val[i_father];
bh_ids[i] = bh_ids[i_father];
i = i_father;
}
bh_val[i] = val;
bh_ids[i] = ids;
}
/* Partial instanciation for heaps with TI = long */
template <typename T> inline
void minheap_pop (size_t k, T * bh_val, long * bh_ids)
{
heap_pop<CMin<T, long> > (k, bh_val, bh_ids);
}
template <typename T> inline
void minheap_push (size_t k, T * bh_val, long * bh_ids, T val, long ids)
{
heap_push<CMin<T, long> > (k, bh_val, bh_ids, val, ids);
}
template <typename T> inline
void maxheap_pop (size_t k, T * bh_val, long * bh_ids)
{
heap_pop<CMax<T, long> > (k, bh_val, bh_ids);
}
template <typename T> inline
void maxheap_push (size_t k, T * bh_val, long * bh_ids, T val, long ids)
{
heap_push<CMax<T, long> > (k, bh_val, bh_ids, val, ids);
}
/*******************************************************************
* Heap initialization
*******************************************************************/
/* Initialization phase for the heap (with unconditionnal pushes).
* Store k0 elements in a heap containing up to k values. Note that
* (bh_val, bh_ids) can be the same as (x, ids) */
template <class C> inline
void heap_heapify (
size_t k,
typename C::T * bh_val,
typename C::TI * bh_ids,
const typename C::T * x = nullptr,
const typename C::TI * ids = nullptr,
size_t k0 = 0)
{
if (k0 > 0) assert (x);
if (ids) {
for (size_t i = 0; i < k0; i++)
heap_push<C> (i+1, bh_val, bh_ids, x[i], ids[i]);
} else {
for (size_t i = 0; i < k0; i++)
heap_push<C> (i+1, bh_val, bh_ids, x[i], i);
}
for (size_t i = k0; i < k; i++) {
bh_val[i] = C::neutral();
bh_ids[i] = -1;
}
}
template <typename T> inline
void minheap_heapify (
size_t k, T * bh_val,
long * bh_ids,
const T * x = nullptr,
const long * ids = nullptr,
size_t k0 = 0)
{
heap_heapify< CMin<T, long> > (k, bh_val, bh_ids, x, ids, k0);
}
template <typename T> inline
void maxheap_heapify (
size_t k,
T * bh_val,
long * bh_ids,
const T * x = nullptr,
const long * ids = nullptr,
size_t k0 = 0)
{
heap_heapify< CMax<T, long> > (k, bh_val, bh_ids, x, ids, k0);
}
/*******************************************************************
* Add n elements to the heap
*******************************************************************/
/* Add some elements to the heap */
template <class C> inline
void heap_addn (size_t k,
typename C::T * bh_val, typename C::TI * bh_ids,
const typename C::T * x,
const typename C::TI * ids,
size_t n)
{
size_t i;
if (ids)
for (i = 0; i < n; i++) {
if (C::cmp (bh_val[0], x[i])) {
heap_pop<C> (k, bh_val, bh_ids);
heap_push<C> (k, bh_val, bh_ids, x[i], ids[i]);
}
}
else
for (i = 0; i < n; i++) {
if (C::cmp (bh_val[0], x[i])) {
heap_pop<C> (k, bh_val, bh_ids);
heap_push<C> (k, bh_val, bh_ids, x[i], i);
}
}
}
/* Partial instanciation for heaps with TI = long */
template <typename T> inline
void minheap_addn (size_t k, T * bh_val, long * bh_ids,
const T * x, const long * ids, size_t n)
{
heap_addn<CMin<T, long> > (k, bh_val, bh_ids, x, ids, n);
}
template <typename T> inline
void maxheap_addn (size_t k, T * bh_val, long * bh_ids,
const T * x, const long * ids, size_t n)
{
heap_addn<CMax<T, long> > (k, bh_val, bh_ids, x, ids, n);
}
/*******************************************************************
* Heap finalization (reorder elements)
*******************************************************************/
/* This function maps a binary heap into an sorted structure.
It returns the number */
template <typename C> inline
size_t heap_reorder (size_t k, typename C::T * bh_val, typename C::TI * bh_ids)
{
size_t i, ii;
for (i = 0, ii = 0; i < k; i++) {
/* top element should be put at the end of the list */
typename C::T val = bh_val[0];
typename C::TI id = bh_ids[0];
/* boundary case: we will over-ride this value if not a true element */
heap_pop<C> (k-i, bh_val, bh_ids);
bh_val[k-ii-1] = val;
bh_ids[k-ii-1] = id;
if (id != -1) ii++;
}
/* Count the number of elements which are effectively returned */
size_t nel = ii;
memmove (bh_val, bh_val+k-ii, ii * sizeof(*bh_val));
memmove (bh_ids, bh_ids+k-ii, ii * sizeof(*bh_ids));
for (; ii < k; ii++) {
bh_val[ii] = C::neutral();
bh_ids[ii] = -1;
}
return nel;
}
template <typename T> inline
size_t minheap_reorder (size_t k, T * bh_val, long * bh_ids)
{
return heap_reorder< CMin<T, long> > (k, bh_val, bh_ids);
}
template <typename T> inline
size_t maxheap_reorder (size_t k, T * bh_val, long * bh_ids)
{
return heap_reorder< CMax<T, long> > (k, bh_val, bh_ids);
}
/*******************************************************************
* Operations on heap arrays
*******************************************************************/
/** a template structure for a set of [min|max]-heaps it is tailored
* so that the actual data of the heaps can just live in compact
* arrays.
*/
template <typename C>
struct HeapArray {
typedef typename C::TI TI;
typedef typename C::T T;
size_t nh; ///< number of heaps
size_t k; ///< allocated size per heap
TI * ids; ///< identifiers (size nh * k)
T * val; ///< values (distances or similarities), size nh * k
/// Return the list of values for a heap
T * get_val (size_t key) { return val + key * k; }
/// Correspponding identifiers
TI * get_ids (size_t key) { return ids + key * k; }
/// prepare all the heaps before adding
void heapify ();
/** add nj elements to heaps i0:i0+ni, with sequential ids
*
* @param nj nb of elements to add to each heap
* @param vin elements to add, size ni * nj
* @param j0 add this to the ids that are added
* @param i0 first heap to update
* @param ni nb of elements to update (-1 = use nh)
*/
void addn (size_t nj, const T *vin, TI j0 = 0,
size_t i0 = 0, long ni = -1);
/** same as addn
*
* @param id_in ids of the elements to add, size ni * nj
* @param id_stride stride for id_in
*/
void addn_with_ids (
size_t nj, const T *vin, const TI *id_in = nullptr,
long id_stride = 0, size_t i0 = 0, long ni = -1);
/// reorder all the heaps
void reorder ();
/** this is not really a heap function. It just finds the per-line
* extrema of each line of array D
* @param vals_out extreme value of each line (size nh, or NULL)
* @param idx_out index of extreme value (size nh or NULL)
*/
void per_line_extrema (T *vals_out, TI *idx_out) const;
};
/* Define useful heaps */
typedef HeapArray<CMin<float, long> > float_minheap_array_t;
typedef HeapArray<CMin<int, long> > int_minheap_array_t;
typedef HeapArray<CMax<float, long> > float_maxheap_array_t;
typedef HeapArray<CMax<int, long> > int_maxheap_array_t;
// The heap templates are instanciated explicitly in Heap.cpp
/*********************************************************************
* Indirect heaps: instead of having
*
* node i = (bh_ids[i], bh_val[i]),
*
* in indirect heaps,
*
* node i = (bh_ids[i], bh_val[bh_ids[i]]),
*
*********************************************************************/
template <class C>
inline
void indirect_heap_pop (
size_t k,
const typename C::T * bh_val,
typename C::TI * bh_ids)
{
bh_ids--; /* Use 1-based indexing for easier node->child translation */
typename C::T val = bh_val[bh_ids[k]];
size_t i = 1;
while (1) {
size_t i1 = i << 1;
size_t i2 = i1 + 1;
if (i1 > k)
break;
typename C::TI id1 = bh_ids[i1], id2 = bh_ids[i2];
if (i2 == k + 1 || C::cmp(bh_val[id1], bh_val[id2])) {
if (C::cmp(val, bh_val[id1]))
break;
bh_ids[i] = id1;
i = i1;
} else {
if (C::cmp(val, bh_val[id2]))
break;
bh_ids[i] = id2;
i = i2;
}
}
bh_ids[i] = bh_ids[k];
}
template <class C>
inline
void indirect_heap_push (size_t k,
const typename C::T * bh_val, typename C::TI * bh_ids,
typename C::TI id)
{
bh_ids--; /* Use 1-based indexing for easier node->child translation */
typename C::T val = bh_val[id];
size_t i = k;
while (i > 1) {
size_t i_father = i >> 1;
if (!C::cmp (val, bh_val[bh_ids[i_father]]))
break;
bh_ids[i] = bh_ids[i_father];
i = i_father;
}
bh_ids[i] = id;
}
} // namespace faiss
#endif /* FAISS_Heap_h */