#include "../utils/DeviceTensor.cuh"
#include "../utils/DeviceDefs.cuh"
#include "../utils/DeviceUtils.h"
#include "../utils/Select.cuh"

#include <limits>
13 namespace faiss {
namespace gpu {
// Number of warps per block (blockDim.y); each warp owns one query row
constexpr
int kWarps = 8;
// Tile width along the vector/reduction dimension (blockDim.x); one warp wide
constexpr
int kLanes = kWarpSize;
// Sentinel distance for out-of-bounds slots so they can never win k-selection
constexpr
int kMaxDistance = std::numeric_limits<int>::max();
// Computes Hamming distances between every (query, vec) pair and selects the
// k smallest per query via a warp-wide WarpSelect.
//
// Launch layout: block = (kLanes, kWarps); each warp handles the query row
// blockIdx.x * kWarps + threadIdx.y. Handles an arbitrary number of
// dimensions by tiling the reduction dimension kLanes columns at a time.
template <int NumWarpQ,
          int NumThreadQ,
          typename BinaryType>
__launch_bounds__(kWarps * kLanes)
__global__ void binaryDistanceAnySize(const Tensor<BinaryType, 2, true> vecs,
                                      const Tensor<BinaryType, 2, true> query,
                                      Tensor<int, 2, true> outK,
                                      Tensor<int, 2, true> outV,
                                      int k) {
  // A matrix tile (query, k); +1 padding avoids shared memory bank conflicts
  __shared__ BinaryType queryTile[kWarps][kLanes + 1];

  // B matrix tile (vec, k); +1 padding avoids shared memory bank conflicts
  __shared__ BinaryType vecTile[kLanes][kLanes + 1];

  WarpSelect<int, int, false, Comparator<int>,
             NumWarpQ, NumThreadQ, kWarps * kLanes>
    heap(kMaxDistance, -1, k);

  int warpId = threadIdx.y;
  int laneId = threadIdx.x;

  // Each warp handles a single query
  int warpQuery = blockIdx.x * kWarps + warpId;
  bool queryInBounds = warpQuery < query.getSize(0);

  // Each warp loops through the entire set of vectors, kLanes at a time
  for (int blockVec = 0; blockVec < vecs.getSize(0); blockVec += kLanes) {
    int threadDistance = 0;

    // Reduction dimension, tiled kLanes columns at a time
    for (int blockK = 0; blockK < vecs.getSize(1); blockK += kLanes) {
      int laneK = blockK + laneId;
      bool kInBounds = laneK < vecs.getSize(1);

      queryTile[warpId][laneId] = queryInBounds && kInBounds ?
        query[warpQuery][laneK] : 0;

      // The kWarps warps cooperatively load the kLanes vecs of this tile
#pragma unroll
      for (int i = 0; i < kLanes / kWarps; ++i) {
        int warpVec = i * kWarps + warpId;
        int vec = blockVec + warpVec;
        bool vecInBounds = vec < vecs.getSize(0);

        vecTile[warpVec][laneId] = vecInBounds && kInBounds ?
          vecs[vec][laneK] : 0;
      }

      // Both tiles must be fully written before any thread reads them
      __syncthreads();

      // Each lane accumulates the Hamming distance of one vec in the tile
      // against this warp's query (out-of-bounds slots were zero-padded, so
      // XOR of equal zeros contributes nothing)
#pragma unroll
      for (int i = 0; i < kLanes; ++i) {
        threadDistance += __popc(queryTile[warpId][i] ^ vecTile[laneId][i]);
      }

      // Tiles are overwritten next iteration; wait until all reads complete
      __syncthreads();
    }

    // Lanes within a warp hold different vec results against the same query.
    // Pad out-of-bounds entries with kMaxDistance so they never get selected
    bool valInBounds = queryInBounds && (blockVec + laneId < vecs.getSize(0));
    threadDistance = valInBounds ? threadDistance : kMaxDistance;
    int id = valInBounds ? blockVec + laneId : -1;

    heap.add(threadDistance, id);
  }

  // Merge per-thread queues into the final warp-wide result
  heap.reduce();

  if (warpQuery < query.getSize(0)) {
    heap.writeOut(outK[warpQuery].data(),
                  outV[warpQuery].data(),
                  k);
  }
}
// As binaryDistanceAnySize, but specialized for vectors whose reduction
// dimension fits in a single tile (vecs.getSize(1) <= ReductionLimit <=
// kLanes): the query tile is loaded exactly once and the distance loop has a
// compile-time trip count, so it fully unrolls.
template <int NumWarpQ,
          int NumThreadQ,
          typename BinaryType,
          int ReductionLimit = kLanes>
__launch_bounds__(kWarps * kLanes)
__global__ void binaryDistanceLimitSize(const Tensor<BinaryType, 2, true> vecs,
                                        const Tensor<BinaryType, 2, true> query,
                                        Tensor<int, 2, true> outK,
                                        Tensor<int, 2, true> outV,
                                        int k) {
  // A matrix tile (query, k); +1 padding avoids shared memory bank conflicts
  __shared__ BinaryType queryTile[kWarps][kLanes + 1];

  // B matrix tile (vec, k); +1 padding avoids shared memory bank conflicts
  __shared__ BinaryType vecTile[kLanes][kLanes + 1];

  WarpSelect<int, int, false, Comparator<int>,
             NumWarpQ, NumThreadQ, kWarps * kLanes>
    heap(kMaxDistance, -1, k);

  int warpId = threadIdx.y;
  int laneId = threadIdx.x;

  // Each warp handles a single query; the whole reduction dimension is
  // covered by a single lane's column
  int laneK = laneId;
  int warpQuery = blockIdx.x * kWarps + warpId;
  bool kInBounds = laneK < vecs.getSize(1);
  bool queryInBounds = warpQuery < query.getSize(0);

  // The entire query fits in one tile; load it once, outside the vec loop
  queryTile[warpId][laneId] = queryInBounds && kInBounds ?
    query[warpQuery][laneK] : 0;

  // Each warp loops through the entire set of vectors, kLanes at a time
  for (int blockVec = 0; blockVec < vecs.getSize(0); blockVec += kLanes) {
    int threadDistance = 0;

    // The kWarps warps cooperatively load the kLanes vecs of this tile
#pragma unroll
    for (int i = 0; i < kLanes / kWarps; ++i) {
      int warpVec = i * kWarps + warpId;
      int vec = blockVec + warpVec;
      bool vecInBounds = vec < vecs.getSize(0);

      vecTile[warpVec][laneId] = vecInBounds && kInBounds ?
        vecs[vec][laneK] : 0;
    }

    // Tiles must be fully written before any thread reads them (this also
    // covers the one-time queryTile store above on the first iteration)
    __syncthreads();

    // Each lane accumulates the Hamming distance of one vec in the tile
#pragma unroll
    for (int i = 0; i < ReductionLimit; ++i) {
      threadDistance += __popc(queryTile[warpId][i] ^ vecTile[laneId][i]);
    }

    // vecTile is overwritten next iteration; wait until all reads complete
    __syncthreads();

    // Lanes within a warp hold different vec results against the same query.
    // Pad out-of-bounds entries with kMaxDistance so they never get selected
    bool valInBounds = queryInBounds && (blockVec + laneId < vecs.getSize(0));
    threadDistance = valInBounds ? threadDistance : kMaxDistance;
    int id = valInBounds ? blockVec + laneId : -1;

    heap.add(threadDistance, id);
  }

  // Merge per-thread queues into the final warp-wide result
  heap.reduce();

  if (warpQuery < query.getSize(0)) {
    heap.writeOut(outK[warpQuery].data(),
                  outV[warpQuery].data(),
                  k);
  }
}
// Host-side dispatcher for binaryDistanceAnySize: maps the runtime k onto
// compile-time (NumWarpQ, NumThreadQ) selection-queue sizes and launches on
// `stream`. One warp per query; kWarps queries per block.
template <typename BinaryType>
void runBinaryDistanceAnySize(Tensor<BinaryType, 2, true>& vecs,
                              Tensor<BinaryType, 2, true>& query,
                              Tensor<int, 2, true>& outK,
                              Tensor<int, 2, true>& outV,
                              int k, cudaStream_t stream) {
  dim3 grid(utils::divUp(query.getSize(0), kWarps));
  dim3 block(kLanes, kWarps);

  if (k == 1) {
    binaryDistanceAnySize<1, 1, BinaryType>
      <<<grid, block, 0, stream>>>(
        vecs, query, outK, outV, k);
  } else if (k <= 32) {
    binaryDistanceAnySize<32, 2, BinaryType>
      <<<grid, block, 0, stream>>>(
        vecs, query, outK, outV, k);
  } else if (k <= 64) {
    binaryDistanceAnySize<64, 3, BinaryType>
      <<<grid, block, 0, stream>>>(
        vecs, query, outK, outV, k);
  } else if (k <= 128) {
    binaryDistanceAnySize<128, 3, BinaryType>
      <<<grid, block, 0, stream>>>(
        vecs, query, outK, outV, k);
  } else if (k <= 256) {
    binaryDistanceAnySize<256, 4, BinaryType>
      <<<grid, block, 0, stream>>>(
        vecs, query, outK, outV, k);
  } else if (k <= 512) {
    binaryDistanceAnySize<512, 8, BinaryType>
      <<<grid, block, 0, stream>>>(
        vecs, query, outK, outV, k);
  } else if (k <= 1024) {
    binaryDistanceAnySize<1024, 8, BinaryType>
      <<<grid, block, 0, stream>>>(
        vecs, query, outK, outV, k);
  }
#if GPU_MAX_SELECTION_K >= 2048
  else if (k <= 2048) {
    binaryDistanceAnySize<2048, 8, BinaryType>
      <<<grid, block, 0, stream>>>(
        vecs, query, outK, outV, k);
  }
#endif
}
// Host-side dispatcher for binaryDistanceLimitSize: maps the runtime k onto
// compile-time (NumWarpQ, NumThreadQ) selection-queue sizes and launches on
// `stream`. Caller guarantees vecs.getSize(1) <= ReductionLimit.
template <typename BinaryType,
          int ReductionLimit>
void runBinaryDistanceLimitSize(Tensor<BinaryType, 2, true>& vecs,
                                Tensor<BinaryType, 2, true>& query,
                                Tensor<int, 2, true>& outK,
                                Tensor<int, 2, true>& outV,
                                int k, cudaStream_t stream) {
  dim3 grid(utils::divUp(query.getSize(0), kWarps));
  dim3 block(kLanes, kWarps);

  if (k == 1) {
    binaryDistanceLimitSize<1, 1, BinaryType, ReductionLimit>
      <<<grid, block, 0, stream>>>(
        vecs, query, outK, outV, k);
  } else if (k <= 32) {
    binaryDistanceLimitSize<32, 2, BinaryType, ReductionLimit>
      <<<grid, block, 0, stream>>>(
        vecs, query, outK, outV, k);
  } else if (k <= 64) {
    binaryDistanceLimitSize<64, 3, BinaryType, ReductionLimit>
      <<<grid, block, 0, stream>>>(
        vecs, query, outK, outV, k);
  } else if (k <= 128) {
    binaryDistanceLimitSize<128, 3, BinaryType, ReductionLimit>
      <<<grid, block, 0, stream>>>(
        vecs, query, outK, outV, k);
  } else if (k <= 256) {
    binaryDistanceLimitSize<256, 4, BinaryType, ReductionLimit>
      <<<grid, block, 0, stream>>>(
        vecs, query, outK, outV, k);
  } else if (k <= 512) {
    binaryDistanceLimitSize<512, 8, BinaryType, ReductionLimit>
      <<<grid, block, 0, stream>>>(
        vecs, query, outK, outV, k);
  } else if (k <= 1024) {
    binaryDistanceLimitSize<1024, 8, BinaryType, ReductionLimit>
      <<<grid, block, 0, stream>>>(
        vecs, query, outK, outV, k);
  }
#if GPU_MAX_SELECTION_K >= 2048
  else if (k <= 2048) {
    binaryDistanceLimitSize<2048, 8, BinaryType, ReductionLimit>
      <<<grid, block, 0, stream>>>(
        vecs, query, outK, outV, k);
  }
#endif
}
// Public entry point: k-nearest-neighbor search under Hamming distance over
// byte-packed binary vectors. Chooses the fastest kernel variant based on
// the vector width, writing distances to outK and indices to outV.
void runBinaryDistance(Tensor<unsigned char, 2, true>& vecs,
                       Tensor<unsigned char, 2, true>& query,
                       Tensor<int, 2, true>& outK,
                       Tensor<int, 2, true>& outV,
                       int k, cudaStream_t stream) {
  FAISS_ASSERT(k <= GPU_MAX_SELECTION_K);
  FAISS_ASSERT(vecs.getSize(1) == query.getSize(1));

  FAISS_ASSERT(outK.getSize(1) == k);
  FAISS_ASSERT(outV.getSize(1) == k);

  // Max number of uint32 words per vector handled by the 32-bit kernel
  constexpr int kReductionLimit32 = 8;

  // Max number of uint8 bytes per vector handled by the 8-bit kernel
  constexpr int kReductionLimit8 = 16;

  if (vecs.getSize(1) % sizeof(unsigned int) == 0 &&
      (vecs.getSize(1) / sizeof(unsigned int)) <= kReductionLimit32) {
    // Vector width is a multiple of 4 bytes and small enough for the
    // fixed-size uint32 kernel; reinterpret the data as uint32 words so each
    // __popc covers 32 bits at a time
    auto vecs32 = vecs.castResize<unsigned int>();
    auto query32 = query.castResize<unsigned int>();

    runBinaryDistanceLimitSize<unsigned int, kReductionLimit32>(
      vecs32, query32, outK, outV, k, stream);
  } else if (vecs.getSize(1) <= kReductionLimit8) {
    // Small vectors: fixed-size uint8 kernel
    runBinaryDistanceLimitSize<unsigned char, kReductionLimit8>(
      vecs, query, outK, outV, k, stream);
  } else {
    // General fallback for arbitrary vector widths
    runBinaryDistanceAnySize<unsigned char>(
      vecs, query, outK, outV, k, stream);
  }
}