// faiss/MatrixStats.cpp

/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
#include <faiss/MatrixStats.h>

#include <stdarg.h> /* va_list, va_start, va_arg, va_end */

#include <cmath>
#include <cstdio>
#include <cstring> /* memcmp */

#include <faiss/utils/utils.h>
namespace faiss {
/*********************************************************************
* MatrixStats
*********************************************************************/
// Initialize per-dimension accumulators to their neutral values.
MatrixStats::PerDimStats::PerDimStats ()
{
    // all counters start at zero
    n = n_nan = n_inf = n0 = 0;
    // extreme sentinels, so the first finite sample updates both bounds
    min = HUGE_VALF;
    max = -HUGE_VALF;
    // running sums for mean / variance
    sum = sum2 = 0;
    // undefined until compute_mean_std() is called
    mean = stddev = NAN;
}
// Fold one sample into the per-dimension statistics.
// NaN and infinite values are counted separately and excluded from the
// range and the running sums.
void MatrixStats::PerDimStats::add (float x)
{
    n++;
    if (std::isnan(x)) {
        n_nan++;
    } else if (!std::isfinite(x)) {
        n_inf++;
    } else {
        // finite sample: update zero count, range and accumulators
        if (x == 0) {
            n0++;
        }
        min = (x < min) ? x : min;
        max = (x > max) ? x : max;
        sum += x;
        sum2 += (double)x * (double)x;
    }
}
// Finalize mean and standard deviation over the valid (finite) samples.
// When there are no valid samples the 0/0 divisions produce NaN, which is
// the intended "undefined" value for mean and stddev.
void MatrixStats::PerDimStats::compute_mean_std ()
{
    n_valid = n - n_nan - n_inf;
    mean = sum / n_valid;
    double var = sum2 / n_valid - mean * mean;
    // clamp tiny negative values caused by floating-point cancellation
    stddev = sqrt(var < 0 ? 0 : var);
}
// Append a printf-style message to the comment buffer (buf / nbuf).
//
// Bug fix: vsnprintf returns the size the output *would* need (or a
// negative value on encoding error), not the number of bytes actually
// written. The previous code did `nbuf -= size` unconditionally, so a
// truncated or failed write made the unsigned `nbuf` wrap around and
// pushed `buf` past the end of the buffer — a heap overflow on the next
// call. We now clamp the advance to what vsnprintf actually stored.
void MatrixStats::do_comment (const char *fmt, ...)
{
    va_list ap;
    va_start(ap, fmt);
    int written = vsnprintf(buf, nbuf, fmt, ap);
    va_end(ap);

    if (written < 0) {
        return; // formatting error: buffer contents unspecified, don't advance
    }
    size_t size = (size_t)written;
    if (size >= nbuf) {
        // output truncated: at most nbuf - 1 chars + NUL were written
        size = (nbuf == 0) ? 0 : nbuf - 1;
    }
    nbuf -= size;
    buf += size;
}
// Analyze an n x d row-major matrix of floats and build a human-readable
// report in the `comments` member: NaN/Inf counts, duplicate vectors
// (detected via a hash table on the raw bytes), L2-norm range, and
// per-dimension statistics.
//
// Fixes vs. the previous version:
// - printf format specifiers: size_t arguments were printed with %ld,
//   which is undefined behavior where long != size_t (e.g. LLP64); now %zu.
// - typos in the emitted report: "reducution" -> "reduction",
//   "loose precision" -> "lose precision".
// - the local zero-entry counter no longer shadows the member `n0`.
MatrixStats::MatrixStats (size_t n, size_t d, const float *x):
    n(n), d(d),
    n_collision(0), n_valid(0), n0(0),
    min_norm2(HUGE_VAL), max_norm2(0)
{
    // do_comment() appends into this temporary buffer; the accumulated
    // text is copied into `comments` before the buffer goes out of scope.
    std::vector<char> comment_buf (10000);
    buf = comment_buf.data ();
    nbuf = comment_buf.size ();

    do_comment ("analyzing %zu vectors of size %zu\n", n, d);

    if (d > 1024) {
        do_comment (
            "indexing this many dimensions is hard, "
            "please consider dimensionality reduction (with PCAMatrix)\n");
    }

    size_t nbytes = sizeof (x[0]) * d;
    per_dim_stats.resize (d);

    for (size_t i = 0; i < n; i++) {
        const float *xi = x + d * i;
        // accumulate squared L2 norm in double to limit rounding error
        double sum2 = 0;
        for (size_t j = 0; j < d; j++) {
            per_dim_stats[j].add (xi[j]);
            sum2 += xi[j] * (double)xi[j];
        }

        // a vector is "valid" iff its squared norm is finite
        if (std::isfinite (sum2)) {
            n_valid++;
            if (sum2 == 0) {
                n0++;
            } else {
                if (sum2 < min_norm2) min_norm2 = sum2;
                if (sum2 > max_norm2) max_norm2 = sum2;
            }
        }

        { // duplicate detection via a hash of the raw vector bytes
            uint64_t hash = hash_bytes ((const uint8_t *)xi, nbytes);
            auto elt = occurrences.find (hash);
            if (elt == occurrences.end ()) {
                Occurrence occ = {i, 1};
                occurrences[hash] = occ;
            } else {
                // same hash: confirm the bytes match before counting a copy
                if (!memcmp (xi, x + elt->second.first * d, nbytes)) {
                    elt->second.count++;
                } else {
                    n_collision++;
                    // we should use a list of collisions but overkill
                }
            }
        }
    }

    // invalid vector stats
    if (n_valid == n) {
        do_comment ("no NaN or Infs in data\n");
    } else {
        do_comment ("%zu vectors contain NaN or Inf "
                    "(or have too large components), "
                    "expect bad results with indexing!\n", n - n_valid);
    }

    // copies in dataset
    if (occurrences.size () == n) {
        do_comment ("all vectors are distinct\n");
    } else {
        do_comment ("%zu vectors are distinct (%.2f%%)\n",
                    occurrences.size (),
                    occurrences.size () * 100.0 / n);

        if (n_collision > 0) {
            do_comment ("%zu collisions in hash table, "
                        "counts may be invalid\n", n_collision);
        }

        // report the most duplicated vector
        Occurrence max = {0, 0};
        for (auto it = occurrences.begin ();
             it != occurrences.end (); ++it) {
            if (it->second.count > max.count) {
                max = it->second;
            }
        }
        do_comment ("vector %zu has %zu copies\n", max.first, max.count);
    }

    { // norm stats
        min_norm2 = sqrt (min_norm2);
        max_norm2 = sqrt (max_norm2);
        do_comment ("range of L2 norms=[%g, %g] (%zu null vectors)\n",
                    min_norm2, max_norm2, n0);

        if (max_norm2 < min_norm2 * 1.0001) {
            do_comment ("vectors are normalized, inner product and "
                        "L2 search are equivalent\n");
        }

        if (max_norm2 > min_norm2 * 100) {
            do_comment ("vectors have very large differences in norms, "
                        "is this normal?\n");
        }
    }

    { // per dimension stats
        double max_std = 0, min_std = HUGE_VAL;
        size_t n_dangerous_range = 0, n_0_range = 0;
        size_t n_zero_entries = 0; // renamed: used to shadow member `n0`

        for (size_t j = 0; j < d; j++) {
            PerDimStats &st = per_dim_stats[j];
            st.compute_mean_std ();
            n_zero_entries += st.n0;
            if (st.max == st.min) {
                n_0_range++;
            } else if (st.max < 1.001 * st.min) {
                // tiny dynamic range relative to the mean value: L2
                // distances may cancel catastrophically
                n_dangerous_range++;
            }
            if (st.stddev > max_std) max_std = st.stddev;
            if (st.stddev < min_std) min_std = st.stddev;
        }

        if (n_zero_entries == 0) {
            do_comment ("matrix contains no 0s\n");
        } else {
            do_comment ("matrix contains %.2f %% 0 entries\n",
                        n_zero_entries * 100.0 / (n * d));
        }

        if (n_0_range == 0) {
            do_comment ("no constant dimensions\n");
        } else {
            do_comment ("%zu dimensions are constant: they can be removed\n",
                        n_0_range);
        }

        if (n_dangerous_range == 0) {
            do_comment ("no dimension has a too large mean\n");
        } else {
            do_comment ("%zu dimensions are too large "
                        "wrt. their variance, may lose precision "
                        "in IndexFlatL2 (use CenteringTransform)\n",
                        n_dangerous_range);
        }

        do_comment ("stddevs per dimension are in [%g %g]\n",
                    min_std, max_std);

        size_t n_small_var = 0;
        for (size_t j = 0; j < d; j++) {
            const PerDimStats &st = per_dim_stats[j];
            if (st.stddev < max_std * 1e-4) {
                n_small_var++;
            }
        }

        if (n_small_var > 0) {
            do_comment ("%zu dimensions have negligible stddev wrt. "
                        "the largest dimension, they could be ignored",
                        n_small_var);
        }
    }

    // freeze the report and detach from the temporary buffer
    comments = comment_buf.data ();
    buf = nullptr;
    nbuf = 0;
}
} // namespace faiss