// Faiss GPU — Distance.cu
1 /**
2  * Copyright (c) 2015-present, Facebook, Inc.
3  * All rights reserved.
4  *
5  * This source code is licensed under the BSD+Patents license found in the
6  * LICENSE file in the root directory of this source tree.
7  */
8 
9 // Copyright 2004-present Facebook. All Rights Reserved.
10 
11 #include "Distance.cuh"
12 #include "BroadcastSum.cuh"
13 #include "L2Norm.cuh"
14 #include "L2Select.cuh"
15 #include "../../FaissAssert.h"
16 #include "../GpuResources.h"
17 #include "../utils/DeviceUtils.h"
18 #include "../utils/Limits.cuh"
19 #include "../utils/MatrixMult.cuh"
20 #include "../utils/BlockSelectKernel.cuh"
21 
22 #include <memory>
23 #include <thrust/fill.h>
24 #include <thrust/for_each.h>
25 #include <thrust/device_ptr.h>
26 #include <thrust/execution_policy.h>
27 
28 namespace faiss { namespace gpu {
29 
30 namespace {
31 
// Returns a view over centroids [startCentroid, startCentroid + num), taken
// from the transposed (dim, num) layout when one is available, otherwise from
// the row-major (num, dim) layout. When the requested range covers all
// centroids, the tensor is returned as-is without narrowing.
template <typename T>
Tensor<T, 2, true> sliceCentroids(Tensor<T, 2, true>& centroids,
                                  Tensor<T, 2, true>* centroidsTransposed,
                                  int startCentroid,
                                  int num) {
  bool wholeRange =
    (startCentroid == 0) && (num == centroids.getSize(0));

  if (centroidsTransposed) {
    // Transposed layout is (dim, num): centroid index is dim 1
    return wholeRange ? *centroidsTransposed
                      : centroidsTransposed->narrow(1, startCentroid, num);
  }

  // Row-major layout is (num, dim): centroid index is dim 0
  return wholeRange ? centroids
                    : centroids.narrow(0, startCentroid, num);
}
52 
// For each chunk of k indices, increment the index by chunk * increment
//
// Expected launch configuration (see runIncrementIndex):
//   grid.x  = number of k-sized chunks per row (indices.getSize(1) / k)
//   grid.y  = number of rows (indices.getSize(0))
//   block.x = threads cooperating over one chunk
template <typename T>
__global__ void incrementIndex(Tensor<T, 2, true> indices,
                               int k,
                               int increment) {
  // Threads in the block stride over one chunk; all entries of chunk
  // blockIdx.x in row blockIdx.y get blockIdx.x * increment added, making
  // tile-relative indices global.
  for (int i = threadIdx.x; i < k; i += blockDim.x) {
    indices[blockIdx.y][blockIdx.x * k + i] += blockIdx.x * increment;
  }
}
62 
// Used to update result indices in distance computation where the number of
// centroids is high, and is tiled. Converts the tile-relative indices stored
// in each k-sized chunk of `indices` into global centroid indices by adding
// chunk * increment. Asynchronous with respect to the host; ordered on
// `stream`.
template <typename T>
void runIncrementIndex(Tensor<T, 2, true>& indices,
                       int k,
                       int increment,
                       cudaStream_t stream) {
  dim3 grid(indices.getSize(1) / k, indices.getSize(0));
  int block = std::min(k, 512);

  // should be exact
  FAISS_ASSERT(grid.x * k == indices.getSize(1));

  incrementIndex<<<grid, block, 0, stream>>>(indices, k, increment);

  // Only check for launch-configuration errors here; do NOT device-sync.
  // This helper runs inside the per-tile loop of runDistance, and a
  // device-wide cudaDeviceSynchronize() would serialize the two alternate
  // streams and defeat the double-buffered overlap. Stream ordering already
  // guarantees that subsequent work enqueued on `stream` (the final
  // k-selection) observes the updated indices.
  FAISS_ASSERT(cudaGetLastError() == cudaSuccess);
}
80 
81 // If the inner size (dim) of the vectors is small, we want a larger query tile
82 // size, like 1024
83 
// Chooses a (tileRows x tileCols) tile shape for the tiled distance
// computation, sized from the device's total memory and the vector dimension.
// tempMemAvailable is deliberately unused (see the comment below about
// ignoring temporary memory); the parameter is kept for interface stability.
void chooseTileSize(int numQueries,
                    int numCentroids,
                    int dim,
                    int elementSize,
                    size_t tempMemAvailable,
                    int& tileRows,
                    int& tileCols) {
  // The matrix multiplication should be large enough to be efficient, but if it
  // is too large, we seem to lose efficiency as opposed to double-streaming.
  // Each tile size here defines 1/2 of the memory use due to double streaming.
  // We ignore available temporary memory, as that is adjusted independently by
  // the user and can thus meet these requirements (or not).
  // For <= 4 GB GPUs, prefer 512 MB of usage.
  // For <= 8 GB GPUs, prefer 768 MB of usage.
  // Otherwise, prefer 1 GB of usage.
  auto totalMem = getCurrentDeviceProperties().totalGlobalMem;

  int targetUsage = 0;

  if (totalMem <= ((size_t) 4) * 1024 * 1024 * 1024) {
    targetUsage = 512 * 1024 * 1024;
  } else if (totalMem <= ((size_t) 8) * 1024 * 1024 * 1024) {
    targetUsage = 768 * 1024 * 1024;
  } else {
    targetUsage = 1024 * 1024 * 1024;
  }

  // Convert the byte budget into an element budget for a single tile; the
  // factor of 2 accounts for the two double-buffered tiles in flight.
  targetUsage /= 2 * elementSize;

  // 512 rows seems to be a batch size sweetspot.
  // If the k size (vec dim) of the matrix multiplication is small (<= 32),
  // increase to 1024. (A previous note here claimed float16 also raises this
  // to 512; the code keys only off dim, not element type.)
  int preferredTileRows = 512;
  if (dim <= 32) {
    preferredTileRows = 1024;
  }

  tileRows = std::min(preferredTileRows, numQueries);

  // tileCols is the remainder size: whatever element budget is left once the
  // row count is fixed
  tileCols = std::min(targetUsage / preferredTileRows, numCentroids);
}
127 
128 }
129 
// Shared implementation for tiled, k-selected distance computation between
// queries (numQueries x dim) and centroids (numCentroids x dim).
//
// computeL2 == true:  L2 distance via ||c||^2 - 2qc + ||q||^2
// computeL2 == false: inner product qc
//
// centroidsTransposed, if non-null, is the (dim, numCentroids) layout of the
// same data and is passed untransposed to the GEMM. centroidNorms, if
// non-null, holds precomputed ||c||^2 (L2 only; computed here otherwise).
// Results go to outDistances/outIndices, both (numQueries, k). If
// ignoreOutDistances is true, the ||q||^2 term is skipped for L2, so reported
// distances are offset per-query, but indices remain correct.
//
// Work is tiled over both queries and centroids and double-buffered across
// the device's two alternate streams; intermediate buffers come from the
// temporary memory manager.
template <typename T>
void runDistance(bool computeL2,
                 GpuResources* resources,
                 Tensor<T, 2, true>& centroids,
                 Tensor<T, 2, true>* centroidsTransposed,
                 Tensor<T, 1, true>* centroidNorms,
                 Tensor<T, 2, true>& queries,
                 int k,
                 Tensor<T, 2, true>& outDistances,
                 Tensor<int, 2, true>& outIndices,
                 bool useHgemm,
                 bool ignoreOutDistances) {
  FAISS_ASSERT(outDistances.getSize(0) == queries.getSize(0));
  FAISS_ASSERT(outIndices.getSize(0) == queries.getSize(0));
  FAISS_ASSERT(outDistances.getSize(1) == k);
  FAISS_ASSERT(outIndices.getSize(1) == k);

  auto& mem = resources->getMemoryManagerCurrentDevice();
  auto defaultStream = resources->getDefaultStreamCurrentDevice();

  // If we're querying against a 0 sized set, just return empty results:
  // max distances and -1 indices
  if (centroids.numElements() == 0) {
    thrust::fill(thrust::cuda::par.on(defaultStream),
                 outDistances.data(), outDistances.end(),
                 Limits<T>::getMax());

    thrust::fill(thrust::cuda::par.on(defaultStream),
                 outIndices.data(), outIndices.end(),
                 -1);

    return;
  }

  // L2: If ||c||^2 is not pre-computed, calculate it
  DeviceTensor<T, 1, true> cNorms;
  if (computeL2 && !centroidNorms) {
    // Move-assign from the temporary; no std::move needed on a prvalue
    cNorms = DeviceTensor<T, 1, true>(
      mem,
      {centroids.getSize(0)}, defaultStream);
    runL2Norm(centroids, cNorms, true, defaultStream);
    centroidNorms = &cNorms;
  }

  //
  // Prepare norm vector ||q||^2; ||c||^2 is already pre-computed
  //
  int qNormSize[1] = {queries.getSize(0)};
  DeviceTensor<T, 1, true> queryNorms(mem, qNormSize, defaultStream);

  // ||q||^2
  if (computeL2) {
    runL2Norm(queries, queryNorms, true, defaultStream);
  }

  // By default, aim to use up to 512 MB of memory for the processing, with both
  // number of queries and number of centroids being at least 512.
  int tileRows = 0;
  int tileCols = 0;
  chooseTileSize(queries.getSize(0),
                 centroids.getSize(0),
                 queries.getSize(1),
                 sizeof(T),
                 mem.getSizeAvailable(),
                 tileRows,
                 tileCols);

  int numColTiles = utils::divUp(centroids.getSize(0), tileCols);

  FAISS_ASSERT(k <= centroids.getSize(0));
  FAISS_ASSERT(k <= 1024); // select limitation

  // Temporary output memory space we'll use; two of each buffer so the two
  // alternate streams can work on independent tiles concurrently
  DeviceTensor<T, 2, true> distanceBuf1(
    mem, {tileRows, tileCols}, defaultStream);
  DeviceTensor<T, 2, true> distanceBuf2(
    mem, {tileRows, tileCols}, defaultStream);
  DeviceTensor<T, 2, true>* distanceBufs[2] =
    {&distanceBuf1, &distanceBuf2};

  DeviceTensor<T, 2, true> outDistanceBuf1(
    mem, {tileRows, numColTiles * k}, defaultStream);
  DeviceTensor<T, 2, true> outDistanceBuf2(
    mem, {tileRows, numColTiles * k}, defaultStream);
  DeviceTensor<T, 2, true>* outDistanceBufs[2] =
    {&outDistanceBuf1, &outDistanceBuf2};

  DeviceTensor<int, 2, true> outIndexBuf1(
    mem, {tileRows, numColTiles * k}, defaultStream);
  DeviceTensor<int, 2, true> outIndexBuf2(
    mem, {tileRows, numColTiles * k}, defaultStream);
  DeviceTensor<int, 2, true>* outIndexBufs[2] =
    {&outIndexBuf1, &outIndexBuf2};

  auto streams = resources->getAlternateStreamsCurrentDevice();
  streamWait(streams, {defaultStream});

  int curStream = 0;

  // Tile over the input queries
  for (int i = 0; i < queries.getSize(0); i += tileRows) {
    int curQuerySize = std::min(tileRows, queries.getSize(0) - i);

    auto outDistanceView =
      outDistances.narrow(0, i, curQuerySize);
    auto outIndexView =
      outIndices.narrow(0, i, curQuerySize);

    auto queryView =
      queries.narrow(0, i, curQuerySize);
    auto queryNormView =
      queryNorms.narrow(0, i, curQuerySize);

    auto outDistanceBufRowView =
      outDistanceBufs[curStream]->narrow(0, 0, curQuerySize);
    auto outIndexBufRowView =
      outIndexBufs[curStream]->narrow(0, 0, curQuerySize);

    // Tile over the centroids
    for (int j = 0; j < centroids.getSize(0); j += tileCols) {
      int curCentroidSize = std::min(tileCols, centroids.getSize(0) - j);

      int curColTile = j / tileCols;

      auto centroidsView =
        sliceCentroids(centroids, centroidsTransposed, j, curCentroidSize);

      auto distanceBufView = distanceBufs[curStream]->
        narrow(0, 0, curQuerySize).narrow(1, 0, curCentroidSize);

      auto outDistanceBufColView =
        outDistanceBufRowView.narrow(1, k * curColTile, k);
      auto outIndexBufColView =
        outIndexBufRowView.narrow(1, k * curColTile, k);

      // L2: distance is ||c||^2 - 2qc + ||q||^2, we compute -2qc
      // IP: just compute qc
      // (query id x dim) x (centroid id, dim)' = (query id, centroid id)
      runMatrixMult(distanceBufView, false,
                    queryView, false,
                    centroidsView,
                    centroidsTransposed ? false : true,
                    computeL2 ? -2.0f : 1.0f, 0.0f, useHgemm,
                    resources->getBlasHandleCurrentDevice(),
                    streams[curStream]);

      if (computeL2) {
        // For L2 distance, we use this fused kernel that performs both
        // adding ||c||^2 to -2qc and k-selection, so we only need two
        // passes (one write by the gemm, one read here) over the huge
        // region of output memory
        //
        // If we aren't tiling along the number of centroids, we can perform the
        // output work directly
        if (tileCols == centroids.getSize(0)) {
          // Write into the final output
          runL2SelectMin(distanceBufView,
                         *centroidNorms,
                         outDistanceView,
                         outIndexView,
                         k,
                         streams[curStream]);

          if (!ignoreOutDistances) {
            // expand (query id) to (query id, k) by duplicating along rows
            // top-k ||c||^2 - 2qc + ||q||^2 in the form (query id, k)
            runSumAlongRows(queryNormView, outDistanceView, streams[curStream]);
          }
        } else {
          auto centroidNormsView =
            centroidNorms->narrow(0, j, curCentroidSize);

          // Write into our intermediate output
          runL2SelectMin(distanceBufView,
                         centroidNormsView,
                         outDistanceBufColView,
                         outIndexBufColView,
                         k,
                         streams[curStream]);

          if (!ignoreOutDistances) {
            // expand (query id) to (query id, k) by duplicating along rows
            // top-k ||c||^2 - 2qc + ||q||^2 in the form (query id, k)
            runSumAlongRows(queryNormView,
                            outDistanceBufColView,
                            streams[curStream]);
          }
        }
      } else {
        // For IP, just k-select the output for this tile
        if (tileCols == centroids.getSize(0)) {
          // Write into the final output
          runBlockSelect(distanceBufView,
                         outDistanceView,
                         outIndexView,
                         true, k, streams[curStream]);
        } else {
          // Write into the intermediate output
          runBlockSelect(distanceBufView,
                         outDistanceBufColView,
                         outIndexBufColView,
                         true, k, streams[curStream]);
        }
      }
    }

    // As we're finished with processing a full set of centroids, perform the
    // final k-selection
    if (tileCols != centroids.getSize(0)) {
      // The indices are tile-relative; for each tile of k, we need to add
      // tileCols to the index
      runIncrementIndex(outIndexBufRowView, k, tileCols, streams[curStream]);

      // L2 selects minima (false = ascending), IP selects maxima
      runBlockSelectPair(outDistanceBufRowView,
                         outIndexBufRowView,
                         outDistanceView,
                         outIndexView,
                         computeL2 ? false : true, k, streams[curStream]);
    }

    // Alternate between the two streams / buffer sets for the next row tile
    curStream = (curStream + 1) % 2;
  }

  // Have the desired ordering stream wait on the multi-stream
  streamWait({defaultStream}, streams);
}
355 
// L2 entry point: forwards to the shared runDistance implementation with the
// L2 path selected. See runDistance for argument semantics.
template <typename T>
void runL2Distance(GpuResources* resources,
                   Tensor<T, 2, true>& centroids,
                   Tensor<T, 2, true>* centroidsTransposed,
                   Tensor<T, 1, true>* centroidNorms,
                   Tensor<T, 2, true>& queries,
                   int k,
                   Tensor<T, 2, true>& outDistances,
                   Tensor<int, 2, true>& outIndices,
                   bool useHgemm,
                   bool ignoreOutDistances = false) {
  runDistance<T>(true, // computeL2
                 resources, centroids, centroidsTransposed, centroidNorms,
                 queries, k, outDistances, outIndices,
                 useHgemm, ignoreOutDistances);
}
379 
// Inner-product entry point: forwards to the shared runDistance
// implementation with the IP path selected. No centroid norms are needed for
// IP, and distances are always produced.
template <typename T>
void runIPDistance(GpuResources* resources,
                   Tensor<T, 2, true>& centroids,
                   Tensor<T, 2, true>* centroidsTransposed,
                   Tensor<T, 2, true>& queries,
                   int k,
                   Tensor<T, 2, true>& outDistances,
                   Tensor<int, 2, true>& outIndices,
                   bool useHgemm) {
  runDistance<T>(false, // computeL2
                 resources, centroids, centroidsTransposed,
                 nullptr, // centroid norms unused for IP
                 queries, k, outDistances, outIndices, useHgemm,
                 false); // ignoreOutDistances
}
401 
402 //
403 // Instantiations of the distance templates
404 //
405 
// float32 inner-product instantiation. useHgemm is only exposed on the
// half-precision overload, so it is disabled here.
void
runIPDistance(GpuResources* resources,
              Tensor<float, 2, true>& vectors,
              Tensor<float, 2, true>* vectorsTransposed,
              Tensor<float, 2, true>& queries,
              int k,
              Tensor<float, 2, true>& outDistances,
              Tensor<int, 2, true>& outIndices) {
  runIPDistance<float>(resources, vectors, vectorsTransposed,
                       queries, k, outDistances, outIndices,
                       false); // useHgemm
}
423 
#ifdef FAISS_USE_FLOAT16
// float16 inner-product instantiation; the caller chooses the gemm math
// mode via useHgemm.
void
runIPDistance(GpuResources* resources,
              Tensor<half, 2, true>& vectors,
              Tensor<half, 2, true>* vectorsTransposed,
              Tensor<half, 2, true>& queries,
              int k,
              Tensor<half, 2, true>& outDistances,
              Tensor<int, 2, true>& outIndices,
              bool useHgemm) {
  runIPDistance<half>(resources, vectors, vectorsTransposed,
                      queries, k, outDistances, outIndices, useHgemm);
}
#endif
444 
// float32 L2 instantiation. useHgemm is only exposed on the half-precision
// overload, so it is disabled here.
void
runL2Distance(GpuResources* resources,
              Tensor<float, 2, true>& vectors,
              Tensor<float, 2, true>* vectorsTransposed,
              Tensor<float, 1, true>* vectorNorms,
              Tensor<float, 2, true>& queries,
              int k,
              Tensor<float, 2, true>& outDistances,
              Tensor<int, 2, true>& outIndices,
              bool ignoreOutDistances) {
  runL2Distance<float>(resources, vectors, vectorsTransposed, vectorNorms,
                       queries, k, outDistances, outIndices,
                       false, // useHgemm
                       ignoreOutDistances);
}
466 
#ifdef FAISS_USE_FLOAT16
// float16 L2 instantiation; the caller chooses the gemm math mode via
// useHgemm.
void
runL2Distance(GpuResources* resources,
              Tensor<half, 2, true>& vectors,
              Tensor<half, 2, true>* vectorsTransposed,
              Tensor<half, 1, true>* vectorNorms,
              Tensor<half, 2, true>& queries,
              int k,
              Tensor<half, 2, true>& outDistances,
              Tensor<int, 2, true>& outIndices,
              bool useHgemm,
              bool ignoreOutDistances) {
  runL2Distance<half>(resources, vectors, vectorsTransposed, vectorNorms,
                      queries, k, outDistances, outIndices,
                      useHgemm, ignoreOutDistances);
}
#endif
491 
492 } } // namespace