mirror of
https://github.com/facebookresearch/faiss.git
synced 2025-06-03 21:54:02 +08:00
introduce options for reducing the overhead for a clustering procedure (#3731)
Summary: Several changes: 1. Introduce `ClusteringParameters::check_input_data_for_NaNs`, which may suppress checks for NaN values in the input data 2. Introduce `ClusteringParameters::use_faster_subsampling`, which uses a newly added SplitMix64-based rng (`SplitMix64RandomGenerator`) and also may pick duplicate points from the original input dataset. Surprisingly, `rand_perm()` may involve noticeable non-zero costs for certain scenarios. 3. Negative values for `ClusteringParameters::seed` initialize the internal clustering rng with a high-resolution clock each time, making the clustering procedure pick different subsamples each time. I've decided not to use `std::random_device` in order to avoid possible negative effects. Useful for future `ProductResidualQuantizer` improvements. Pull Request resolved: https://github.com/facebookresearch/faiss/pull/3731 Reviewed By: asadoughi Differential Revision: D61106105 Pulled By: mnorris11 fbshipit-source-id: 072ab2f5ce4f82f9cf49d678122f65d1c08ce596
This commit is contained in:
parent
b10f001185
commit
afe9c40f36
@ -11,6 +11,7 @@
|
|||||||
#include <faiss/VectorTransform.h>
|
#include <faiss/VectorTransform.h>
|
||||||
#include <faiss/impl/AuxIndexStructures.h>
|
#include <faiss/impl/AuxIndexStructures.h>
|
||||||
|
|
||||||
|
#include <chrono>
|
||||||
#include <cinttypes>
|
#include <cinttypes>
|
||||||
#include <cmath>
|
#include <cmath>
|
||||||
#include <cstdio>
|
#include <cstdio>
|
||||||
@ -74,6 +75,14 @@ void Clustering::train(
|
|||||||
|
|
||||||
namespace {
|
namespace {
|
||||||
|
|
||||||
|
/// Resolve the seed actually fed to the clustering RNG.
/// A non-negative seed is used verbatim (reproducible runs); a negative
/// seed requests a fresh clock-derived value, so every invocation draws
/// a different subsample.
uint64_t get_actual_rng_seed(const int seed) {
    if (seed >= 0) {
        return static_cast<uint64_t>(seed);
    }
    // Negative seed: derive one from the high-resolution clock.
    const auto ticks =
            std::chrono::high_resolution_clock::now().time_since_epoch();
    return static_cast<uint64_t>(ticks.count());
}
|
||||||
|
|
||||||
idx_t subsample_training_set(
|
idx_t subsample_training_set(
|
||||||
const Clustering& clus,
|
const Clustering& clus,
|
||||||
idx_t nx,
|
idx_t nx,
|
||||||
@ -87,11 +96,30 @@ idx_t subsample_training_set(
|
|||||||
clus.k * clus.max_points_per_centroid,
|
clus.k * clus.max_points_per_centroid,
|
||||||
nx);
|
nx);
|
||||||
}
|
}
|
||||||
std::vector<int> perm(nx);
|
|
||||||
rand_perm(perm.data(), nx, clus.seed);
|
const uint64_t actual_seed = get_actual_rng_seed(clus.seed);
|
||||||
|
|
||||||
|
std::vector<int> perm;
|
||||||
|
if (clus.use_faster_subsampling) {
|
||||||
|
// use subsampling with splitmix64 rng
|
||||||
|
SplitMix64RandomGenerator rng(actual_seed);
|
||||||
|
|
||||||
|
const idx_t new_nx = clus.k * clus.max_points_per_centroid;
|
||||||
|
perm.resize(new_nx);
|
||||||
|
for (idx_t i = 0; i < new_nx; i++) {
|
||||||
|
perm[i] = rng.rand_int(nx);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// use subsampling with a default std rng
|
||||||
|
perm.resize(nx);
|
||||||
|
rand_perm(perm.data(), nx, actual_seed);
|
||||||
|
}
|
||||||
|
|
||||||
nx = clus.k * clus.max_points_per_centroid;
|
nx = clus.k * clus.max_points_per_centroid;
|
||||||
uint8_t* x_new = new uint8_t[nx * line_size];
|
uint8_t* x_new = new uint8_t[nx * line_size];
|
||||||
*x_out = x_new;
|
*x_out = x_new;
|
||||||
|
|
||||||
|
// might be worth omp-ing as well
|
||||||
for (idx_t i = 0; i < nx; i++) {
|
for (idx_t i = 0; i < nx; i++) {
|
||||||
memcpy(x_new + i * line_size, x + perm[i] * line_size, line_size);
|
memcpy(x_new + i * line_size, x + perm[i] * line_size, line_size);
|
||||||
}
|
}
|
||||||
@ -280,7 +308,7 @@ void Clustering::train_encoded(
|
|||||||
|
|
||||||
double t0 = getmillisecs();
|
double t0 = getmillisecs();
|
||||||
|
|
||||||
if (!codec) {
|
if (!codec && check_input_data_for_NaNs) {
|
||||||
// Check for NaNs in input data. Normally it is the user's
|
// Check for NaNs in input data. Normally it is the user's
|
||||||
// responsibility, but it may spare us some hard-to-debug
|
// responsibility, but it may spare us some hard-to-debug
|
||||||
// reports.
|
// reports.
|
||||||
@ -383,6 +411,9 @@ void Clustering::train_encoded(
|
|||||||
}
|
}
|
||||||
t0 = getmillisecs();
|
t0 = getmillisecs();
|
||||||
|
|
||||||
|
// initialize seed
|
||||||
|
const uint64_t actual_seed = get_actual_rng_seed(seed);
|
||||||
|
|
||||||
// temporary buffer to decode vectors during the optimization
|
// temporary buffer to decode vectors during the optimization
|
||||||
std::vector<float> decode_buffer(codec ? d * decode_block_size : 0);
|
std::vector<float> decode_buffer(codec ? d * decode_block_size : 0);
|
||||||
|
|
||||||
@ -395,7 +426,7 @@ void Clustering::train_encoded(
|
|||||||
centroids.resize(d * k);
|
centroids.resize(d * k);
|
||||||
std::vector<int> perm(nx);
|
std::vector<int> perm(nx);
|
||||||
|
|
||||||
rand_perm(perm.data(), nx, seed + 1 + redo * 15486557L);
|
rand_perm(perm.data(), nx, actual_seed + 1 + redo * 15486557L);
|
||||||
|
|
||||||
if (!codec) {
|
if (!codec) {
|
||||||
for (int i = n_input_centroids; i < k; i++) {
|
for (int i = n_input_centroids; i < k; i++) {
|
||||||
|
@ -43,11 +43,20 @@ struct ClusteringParameters {
|
|||||||
int min_points_per_centroid = 39;
|
int min_points_per_centroid = 39;
|
||||||
/// to limit size of dataset, otherwise the training set is subsampled
|
/// to limit size of dataset, otherwise the training set is subsampled
|
||||||
int max_points_per_centroid = 256;
|
int max_points_per_centroid = 256;
|
||||||
/// seed for the random number generator
|
/// seed for the random number generator.
|
||||||
|
/// negative values lead to seeding an internal rng with
|
||||||
|
|
/// std::chrono::high_resolution_clock.
|
|
||||||
int seed = 1234;
|
int seed = 1234;
|
||||||
|
|
||||||
/// when the training set is encoded, batch size of the codec decoder
|
/// when the training set is encoded, batch size of the codec decoder
|
||||||
size_t decode_block_size = 32768;
|
size_t decode_block_size = 32768;
|
||||||
|
|
||||||
|
|
/// whether to check for NaNs in the input data
|
|
||||||
|
bool check_input_data_for_NaNs = true;
|
||||||
|
|
||||||
|
/// Whether to use splitmix64-based random number generator for subsampling,
|
||||||
|
/// which is faster, but may pick duplicate points.
|
||||||
|
bool use_faster_subsampling = false;
|
||||||
};
|
};
|
||||||
|
|
||||||
struct ClusteringIterationStats {
|
struct ClusteringIterationStats {
|
||||||
|
@ -54,6 +54,37 @@ double RandomGenerator::rand_double() {
|
|||||||
return mt() / double(mt.max());
|
return mt() / double(mt.max());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Seed the generator: the raw seed bits become the initial state.
SplitMix64RandomGenerator::SplitMix64RandomGenerator(int64_t seed)
        : state(static_cast<uint64_t>(seed)) {}
|
||||||
|
|
||||||
|
int SplitMix64RandomGenerator::rand_int() {
|
||||||
|
return next() & 0x7fffffff;
|
||||||
|
}
|
||||||
|
|
||||||
|
int64_t SplitMix64RandomGenerator::rand_int64() {
|
||||||
|
uint64_t value = next();
|
||||||
|
return static_cast<int64_t>(value & 0x7fffffffffffffffULL);
|
||||||
|
}
|
||||||
|
|
||||||
|
int SplitMix64RandomGenerator::rand_int(int max) {
|
||||||
|
return next() % max;
|
||||||
|
}
|
||||||
|
|
||||||
|
float SplitMix64RandomGenerator::rand_float() {
|
||||||
|
return next() / float(std::numeric_limits<uint64_t>::max());
|
||||||
|
}
|
||||||
|
|
||||||
|
double SplitMix64RandomGenerator::rand_double() {
|
||||||
|
return next() / double(std::numeric_limits<uint64_t>::max());
|
||||||
|
}
|
||||||
|
|
||||||
|
uint64_t SplitMix64RandomGenerator::next() {
|
||||||
|
uint64_t z = (state += 0x9e3779b97f4a7c15ULL);
|
||||||
|
z = (z ^ (z >> 30)) * 0xbf58476d1ce4e5b9ULL;
|
||||||
|
z = (z ^ (z >> 27)) * 0x94d049bb133111ebULL;
|
||||||
|
return z ^ (z >> 31);
|
||||||
|
}
|
||||||
|
|
||||||
/***********************************************************************
|
/***********************************************************************
|
||||||
* Random functions in this C file only exist because Torch
|
* Random functions in this C file only exist because Torch
|
||||||
* counterparts are slow and not multi-threaded. Typical use is for
|
* counterparts are slow and not multi-threaded. Typical use is for
|
||||||
@ -162,6 +193,18 @@ void rand_perm(int* perm, size_t n, int64_t seed) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void rand_perm_splitmix64(int* perm, size_t n, int64_t seed) {
|
||||||
|
for (size_t i = 0; i < n; i++)
|
||||||
|
perm[i] = i;
|
||||||
|
|
||||||
|
SplitMix64RandomGenerator rng(seed);
|
||||||
|
|
||||||
|
for (size_t i = 0; i + 1 < n; i++) {
|
||||||
|
int i2 = i + rng.rand_int(n - i);
|
||||||
|
std::swap(perm[i], perm[i2]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
void byte_rand(uint8_t* x, size_t n, int64_t seed) {
|
void byte_rand(uint8_t* x, size_t n, int64_t seed) {
|
||||||
// only try to parallelize on large enough arrays
|
// only try to parallelize on large enough arrays
|
||||||
const size_t nblock = n < 1024 ? 1 : 1024;
|
const size_t nblock = n < 1024 ? 1 : 1024;
|
||||||
|
@ -43,6 +43,30 @@ struct RandomGenerator {
|
|||||||
explicit RandomGenerator(int64_t seed = 1234);
|
explicit RandomGenerator(int64_t seed = 1234);
|
||||||
};
|
};
|
||||||
|
|
||||||
|
/// Fast SplitMix64-based random generator; based on
/// https://prng.di.unimi.it/
/// Cannot be used in multithreaded contexts (single mutable state word,
/// no synchronization).
struct SplitMix64RandomGenerator {
    /// 64-bit internal state, advanced on every draw
    uint64_t state;

    explicit SplitMix64RandomGenerator(int64_t seed = 1234);

    /// random positive integer
    int rand_int();

    /// random int64_t
    int64_t rand_int64();

    /// generate random integer between 0 and max-1
    int rand_int(int max);

    /// between 0 and 1
    float rand_float();

    /// between 0 and 1
    double rand_double();

    /// advance the state and return the next raw 64-bit value
    uint64_t next();
};
|
||||||
|
|
||||||
/* Generate an array of uniform random floats / multi-threaded implementation */
|
/* Generate an array of uniform random floats / multi-threaded implementation */
|
||||||
void float_rand(float* x, size_t n, int64_t seed);
|
void float_rand(float* x, size_t n, int64_t seed);
|
||||||
void float_randn(float* x, size_t n, int64_t seed);
|
void float_randn(float* x, size_t n, int64_t seed);
|
||||||
@ -53,6 +77,7 @@ void int64_rand_max(int64_t* x, size_t n, uint64_t max, int64_t seed);
|
|||||||
|
|
||||||
/* random permutation */
|
/* random permutation */
|
||||||
void rand_perm(int* perm, size_t n, int64_t seed);
|
void rand_perm(int* perm, size_t n, int64_t seed);
|
||||||
|
void rand_perm_splitmix64(int* perm, size_t n, int64_t seed);
|
||||||
|
|
||||||
/* Random set of vectors with intrinsic dimensionality 10 that is harder to
|
/* Random set of vectors with intrinsic dimensionality 10 that is harder to
|
||||||
* index than a subspace of dim 10 but easier than uniform data in dimension d
|
* index than a subspace of dim 10 but easier than uniform data in dimension d
|
||||||
|
Loading…
x
Reference in New Issue
Block a user