221 lines
5.7 KiB
C++
221 lines
5.7 KiB
C++
/**
|
|
* Copyright (c) Facebook, Inc. and its affiliates.
|
|
*
|
|
* This source code is licensed under the MIT license found in the
|
|
* LICENSE file in the root directory of this source tree.
|
|
*/
|
|
|
|
// -*- c++ -*-
|
|
|
|
/*
|
|
* Hamming distances. The binary vector dimensionality should be a
|
|
* multiple of 8, as the elementary operations operate on bytes. If
|
|
* you need other sizes, just pad with 0s (this is done by function
|
|
* fvecs2bitvecs).
|
|
*
|
|
* User-defined type hamdis_t is used for distances because at this time
|
|
* it is still uncler clear how we will need to balance
|
|
* - flexibility in vector size (may need 16- or even 8-bit vectors)
|
|
* - memory usage
|
|
* - cache-misses when dealing with large volumes of data (fewer bits is better)
|
|
*
|
|
*/
|
|
|
|
#ifndef FAISS_hamming_h
|
|
#define FAISS_hamming_h
|
|
|
|
|
|
#include <stdint.h>
|
|
|
|
#include <faiss/utils/Heap.h>
|
|
|
|
|
|
/* The Hamming distance type */
|
|
typedef int32_t hamdis_t;
|
|
|
|
namespace faiss {
|
|
|
|
/**************************************************
|
|
* General bit vector functions
|
|
**************************************************/
|
|
|
|
|
|
void bitvec_print (const uint8_t * b, size_t d);
|
|
|
|
|
|
/* Functions for casting vectors of regular types to compact bits.
|
|
They assume proper allocation done beforehand, meaning that b
|
|
should be be able to receive as many bits as x may produce. */
|
|
|
|
/* Makes an array of bits from the signs of a float array. The length
|
|
of the output array b is rounded up to byte size (allocate
|
|
accordingly) */
|
|
void fvecs2bitvecs (
|
|
const float * x,
|
|
uint8_t * b,
|
|
size_t d,
|
|
size_t n);
|
|
|
|
void bitvecs2fvecs (
|
|
const uint8_t * b,
|
|
float * x,
|
|
size_t d,
|
|
size_t n);
|
|
|
|
|
|
void fvec2bitvec (const float * x, uint8_t * b, size_t d);
|
|
|
|
/***********************************************
|
|
* Generic reader/writer for bit strings
|
|
***********************************************/
|
|
|
|
|
|
struct BitstringWriter {
|
|
uint8_t *code;
|
|
size_t code_size;
|
|
size_t i; // current bit offset
|
|
|
|
// code_size in bytes
|
|
BitstringWriter(uint8_t *code, int code_size);
|
|
|
|
// write the nbit low bits of x
|
|
void write(uint64_t x, int nbit);
|
|
};
|
|
|
|
struct BitstringReader {
|
|
const uint8_t *code;
|
|
size_t code_size;
|
|
size_t i;
|
|
|
|
// code_size in bytes
|
|
BitstringReader(const uint8_t *code, int code_size);
|
|
|
|
// read nbit bits from the code
|
|
uint64_t read(int nbit);
|
|
};
|
|
|
|
/**************************************************
|
|
* Hamming distance computation functions
|
|
**************************************************/
|
|
|
|
|
|
|
|
extern size_t hamming_batch_size;
|
|
|
|
inline int popcount64(uint64_t x) {
|
|
return __builtin_popcountl(x);
|
|
}
|
|
|
|
|
|
/** Compute a set of Hamming distances between na and nb binary vectors
|
|
*
|
|
* @param a size na * nbytespercode
|
|
* @param b size nb * nbytespercode
|
|
* @param nbytespercode should be multiple of 8
|
|
* @param dis output distances, size na * nb
|
|
*/
|
|
void hammings (
|
|
const uint8_t * a,
|
|
const uint8_t * b,
|
|
size_t na, size_t nb,
|
|
size_t nbytespercode,
|
|
hamdis_t * dis);
|
|
|
|
|
|
|
|
|
|
/** Return the k smallest Hamming distances for a set of binary query vectors,
|
|
* using a max heap.
|
|
* @param a queries, size ha->nh * ncodes
|
|
* @param b database, size nb * ncodes
|
|
* @param nb number of database vectors
|
|
* @param ncodes size of the binary codes (bytes)
|
|
* @param ordered if != 0: order the results by decreasing distance
|
|
* (may be bottleneck for k/n > 0.01) */
|
|
void hammings_knn_hc (
|
|
int_maxheap_array_t * ha,
|
|
const uint8_t * a,
|
|
const uint8_t * b,
|
|
size_t nb,
|
|
size_t ncodes,
|
|
int ordered);
|
|
|
|
/* Legacy alias to hammings_knn_hc. */
|
|
void hammings_knn (
|
|
int_maxheap_array_t * ha,
|
|
const uint8_t * a,
|
|
const uint8_t * b,
|
|
size_t nb,
|
|
size_t ncodes,
|
|
int ordered);
|
|
|
|
/** Return the k smallest Hamming distances for a set of binary query vectors,
|
|
* using counting max.
|
|
* @param a queries, size na * ncodes
|
|
* @param b database, size nb * ncodes
|
|
* @param na number of query vectors
|
|
* @param nb number of database vectors
|
|
* @param k number of vectors/distances to return
|
|
* @param ncodes size of the binary codes (bytes)
|
|
* @param distances output distances from each query vector to its k nearest
|
|
* neighbors
|
|
* @param labels output ids of the k nearest neighbors to each query vector
|
|
*/
|
|
void hammings_knn_mc (
|
|
const uint8_t * a,
|
|
const uint8_t * b,
|
|
size_t na,
|
|
size_t nb,
|
|
size_t k,
|
|
size_t ncodes,
|
|
int32_t *distances,
|
|
int64_t *labels);
|
|
|
|
/* Counting the number of matches or of cross-matches (without returning them)
|
|
For use with function that assume pre-allocated memory */
|
|
void hamming_count_thres (
|
|
const uint8_t * bs1,
|
|
const uint8_t * bs2,
|
|
size_t n1,
|
|
size_t n2,
|
|
hamdis_t ht,
|
|
size_t ncodes,
|
|
size_t * nptr);
|
|
|
|
/* Return all Hamming distances/index passing a thres. Pre-allocation of output
|
|
is required. Use hamming_count_thres to determine the proper size. */
|
|
size_t match_hamming_thres (
|
|
const uint8_t * bs1,
|
|
const uint8_t * bs2,
|
|
size_t n1,
|
|
size_t n2,
|
|
hamdis_t ht,
|
|
size_t ncodes,
|
|
int64_t * idx,
|
|
hamdis_t * dis);
|
|
|
|
/* Cross-matching in a set of vectors */
|
|
void crosshamming_count_thres (
|
|
const uint8_t * dbs,
|
|
size_t n,
|
|
hamdis_t ht,
|
|
size_t ncodes,
|
|
size_t * nptr);
|
|
|
|
|
|
/* compute the Hamming distances between two codewords of nwords*64 bits */
|
|
hamdis_t hamming (
|
|
const uint64_t * bs1,
|
|
const uint64_t * bs2,
|
|
size_t nwords);
|
|
|
|
|
|
|
|
} // namespace faiss
|
|
|
|
// inlined definitions of HammingComputerXX and GenHammingComputerXX
|
|
|
|
#include <faiss/utils/hamming-inl.h>
|
|
|
|
#endif /* FAISS_hamming_h */
|