faiss/tests/test_contrib_with_scipy.py

# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import faiss
import unittest
import numpy as np

from faiss.contrib import datasets
from faiss.contrib import clustering

import scipy.sparse

# this test is not in test_contrib because it depends on scipy


class TestClustering(unittest.TestCase):

    def test_python_kmeans(self):
        """ Test the python implementation of kmeans """
        ds = datasets.SyntheticDataset(32, 10000, 0, 0)
        x = ds.get_train()

        # bad distribution to stress-test split code
        xt = x[:10000].copy()
        xt[:5000] = x[0]

        km_ref = faiss.Kmeans(ds.d, 100, niter=10)
        km_ref.train(xt)
        err = faiss.knn(xt, km_ref.centroids, 1)[0].sum()

        data = clustering.DatasetAssign(xt)
        centroids = clustering.kmeans(100, data, 10)
        err2 = faiss.knn(xt, centroids, 1)[0].sum()

        # 33517.645 and 33031.098
        self.assertLess(err2, err * 1.1)

    def test_sparse_routines(self):
        """ the sparse assignment routine """
        ds = datasets.SyntheticDataset(1000, 2000, 0, 200)
        xt = ds.get_train().copy()
        faiss.normalize_L2(xt)

        mask = np.abs(xt) > 0.045
        # print("fraction:", mask.sum() / mask.size) # around 10% non-zeros
        xt[np.logical_not(mask)] = 0

        centroids = ds.get_queries()
        assert len(centroids) == 200

        xsparse = scipy.sparse.csr_matrix(xt)

        Dref, Iref = faiss.knn(xsparse.todense(), centroids, 1)
        D, I = clustering.sparse_assign_to_dense(xsparse, centroids)

        np.testing.assert_array_equal(Iref.ravel(), I)
        np.testing.assert_array_almost_equal(Dref.ravel(), D, decimal=4)

        D, I = clustering.sparse_assign_to_dense_blocks(
            xsparse, centroids, qbs=123, bbs=33, nt=4)

        np.testing.assert_array_equal(Iref.ravel(), I)
        np.testing.assert_array_almost_equal(Dref.ravel(), D, decimal=4)

    def test_sparse_kmeans(self):
        """ demo on how to cluster sparse data into dense clusters """

        ds = datasets.SyntheticDataset(1000, 1500, 0, 0)
        xt = ds.get_train().copy()
        faiss.normalize_L2(xt)

        mask = np.abs(xt) > 0.045
        # print("fraction:", mask.sum() / mask.size) # around 10% non-zeros
        xt[np.logical_not(mask)] = 0

        km = faiss.Kmeans(ds.d, 50)
        km.train(xt)
        ref_err = km.iteration_stats[-1]["obj"]

        xsparse = scipy.sparse.csr_matrix(xt)

        centroids, iteration_stats = clustering.kmeans(
            50, clustering.DatasetAssignSparse(xsparse), return_stats=True)

        new_err = iteration_stats[-1]["obj"]

        self.assertLess(new_err, ref_err * 1.1)