Select.cuh

/**
 * Copyright (c) 2015-present, Facebook, Inc.
 * All rights reserved.
 *
 * This source code is licensed under the CC-by-NC license found in the
 * LICENSE file in the root directory of this source tree.
 */

// Copyright 2004-present Facebook. All Rights Reserved.
#pragma once

#include "Comparators.cuh"
#include "DeviceDefs.cuh"
#include "MergeNetworkBlock.cuh"
#include "MergeNetworkWarp.cuh"
#include "PtxUtils.cuh"
#include "Reductions.cuh"
#include "ReductionOperators.cuh"
#include "Tensor.cuh"

namespace faiss { namespace gpu {

// Specialization for block-wide monotonic merges producing a merge sort
// since what we really want is a constexpr loop expansion
template <int NumWarps,
          int NumThreads, typename K, typename V, int NumWarpQ,
          bool Dir, typename Comp>
struct FinalBlockMerge {
};

template <int NumThreads, typename K, typename V, int NumWarpQ,
          bool Dir, typename Comp>
struct FinalBlockMerge<1, NumThreads, K, V, NumWarpQ, Dir, Comp> {
  static inline __device__ void merge(K* sharedK, V* sharedV) {
    // no merge required; single warp
  }
};

template <int NumThreads, typename K, typename V, int NumWarpQ,
          bool Dir, typename Comp>
struct FinalBlockMerge<2, NumThreads, K, V, NumWarpQ, Dir, Comp> {
  static inline __device__ void merge(K* sharedK, V* sharedV) {
    blockMerge<NumThreads, K, V, NumThreads / (kWarpSize * 2),
               NumWarpQ, !Dir, Comp>(sharedK, sharedV);
  }
};

template <int NumThreads, typename K, typename V, int NumWarpQ,
          bool Dir, typename Comp>
struct FinalBlockMerge<4, NumThreads, K, V, NumWarpQ, Dir, Comp> {
  static inline __device__ void merge(K* sharedK, V* sharedV) {
    blockMerge<NumThreads, K, V, NumThreads / (kWarpSize * 2),
               NumWarpQ, !Dir, Comp>(sharedK, sharedV);
    blockMerge<NumThreads, K, V, NumThreads / (kWarpSize * 4),
               NumWarpQ * 2, !Dir, Comp>(sharedK, sharedV);
  }
};

template <int NumThreads, typename K, typename V, int NumWarpQ,
          bool Dir, typename Comp>
struct FinalBlockMerge<8, NumThreads, K, V, NumWarpQ, Dir, Comp> {
  static inline __device__ void merge(K* sharedK, V* sharedV) {
    blockMerge<NumThreads, K, V, NumThreads / (kWarpSize * 2),
               NumWarpQ, !Dir, Comp>(sharedK, sharedV);
    blockMerge<NumThreads, K, V, NumThreads / (kWarpSize * 4),
               NumWarpQ * 2, !Dir, Comp>(sharedK, sharedV);
    blockMerge<NumThreads, K, V, NumThreads / (kWarpSize * 8),
               NumWarpQ * 4, !Dir, Comp>(sharedK, sharedV);
  }
};

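// Illustration (not from the original source): with ThreadsPerBlock = 128
// (4 warps of kWarpSize = 32 threads) and NumWarpQ = 32, the instantiation
// FinalBlockMerge<4, 128, K, V, 32, Dir, Comp>::merge performs two monotonic
// merge passes over the contiguous warp queues in shared memory: the 4 sorted
// 32-element warp queues are first merged pairwise into 2 sorted 64-element
// lists, which are then merged into one sorted 128-element list.
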
// `Dir` true, produce largest values.
// `Dir` false, produce smallest values.
template <typename K,
          typename V,
          bool Dir,
          typename Comp,
          int NumWarpQ,
          int NumThreadQ,
          int ThreadsPerBlock>
struct BlockSelect {
  static constexpr int kNumWarps = ThreadsPerBlock / kWarpSize;
  static constexpr int kTotalWarpSortSize = NumWarpQ;

  __device__ inline BlockSelect(K initK, V initV, K* smemK, V* smemV, int k) :
      sharedK(smemK),
      sharedV(smemV),
      warpKTop(initK),
      kMinus1(k - 1) {
    static_assert(utils::isPowerOf2(ThreadsPerBlock),
                  "threads must be a power-of-2");
    static_assert(utils::isPowerOf2(NumWarpQ),
                  "warp queue must be power-of-2");

    // Fill the per-thread queue keys with the default value; values
    // can remain uninitialized
#pragma unroll
    for (int i = 0; i < NumThreadQ; ++i) {
      threadK[i] = initK;
      threadV[i] = initV;
    }

    int laneId = getLaneId();
    int warpId = threadIdx.x / kWarpSize;
    warpK = sharedK + warpId * kTotalWarpSortSize;
    warpV = sharedV + warpId * kTotalWarpSortSize;

    // Fill warp queue (only the actual queue space is fine, not where
    // we write the per-thread queues for merging)
    for (int i = laneId; i < NumWarpQ; i += kWarpSize) {
      warpK[i] = initK;
      warpV[i] = initV;
    }

    warpFence();
  }

  __device__ inline void addThreadQ(K k, V v) {
    // If we're greater or equal to the highest element, then we don't
    // need to add. In the equal to case, the element we have is
    // sufficient.
    if (Dir ? Comp::gt(k, threadK[0]) : Comp::lt(k, threadK[0])) {
      threadK[0] = k;
      threadV[0] = v;

      // Perform in-register bubble sort of this new element
#pragma unroll
      for (int i = 1; i < NumThreadQ; ++i) {
        bool swap = Dir ? Comp::lt(threadK[i], threadK[i - 1]) :
                          Comp::gt(threadK[i], threadK[i - 1]);

        K tmpK = threadK[i];
        threadK[i] = swap ? threadK[i - 1] : tmpK;
        threadK[i - 1] = swap ? tmpK : threadK[i - 1];

        V tmpV = threadV[i];
        threadV[i] = swap ? threadV[i - 1] : tmpV;
        threadV[i - 1] = swap ? tmpV : threadV[i - 1];
      }
    }
  }

  __device__ inline void checkThreadQ() {
    // There is no need to merge queues if no thread-local queue is
    // better than the warp-wide queue
    bool needSort = (Dir ?
                     Comp::gt(threadK[0], warpKTop) :
                     Comp::lt(threadK[0], warpKTop));
    if (!__any(needSort)) {
      return;
    }

    // This has a trailing warpFence
    mergeWarpQ();

    // We have to beat at least this element
    warpKTop = warpK[kMinus1];

    warpFence();
  }

  /// This function handles sorting and merging together the
  /// per-thread queues with the warp-wide queue, creating a sorted
  /// list across both
  __device__ inline void mergeWarpQ() {
    int laneId = getLaneId();

    // Sort all of the per-thread queues
    warpSortAnyRegisters<K, V, NumThreadQ, !Dir, Comp>(threadK, threadV);

    constexpr int kNumWarpQRegisters = NumWarpQ / kWarpSize;
    K warpKRegisters[kNumWarpQRegisters];
    V warpVRegisters[kNumWarpQRegisters];

#pragma unroll
    for (int i = 0; i < kNumWarpQRegisters; ++i) {
      warpKRegisters[i] = warpK[i * kWarpSize + laneId];
      warpVRegisters[i] = warpV[i * kWarpSize + laneId];
    }

    warpFence();

    // The warp queue is already sorted, and now that we've sorted the
    // per-thread queue, merge both sorted lists together, producing
    // one sorted list
    warpMergeAnyRegisters<K, V, kNumWarpQRegisters, NumThreadQ, !Dir, Comp>(
        warpKRegisters, warpVRegisters, threadK, threadV);

    // Write back out the warp queue
#pragma unroll
    for (int i = 0; i < kNumWarpQRegisters; ++i) {
      warpK[i * kWarpSize + laneId] = warpKRegisters[i];
      warpV[i * kWarpSize + laneId] = warpVRegisters[i];
    }

    warpFence();

    // Re-map the registers for our per-thread queues
    K tmpThreadK[NumThreadQ];
    V tmpThreadV[NumThreadQ];

#pragma unroll
    for (int i = 0; i < NumThreadQ; ++i) {
      tmpThreadK[i] = threadK[i];
      tmpThreadV[i] = threadV[i];
    }

#pragma unroll
    for (int i = 0; i < NumThreadQ; ++i) {
      // After merging, the data is in the order small -> large.
      // We wish to reload data in our thread queues, which have the
      // order large -> small by indexing, so it will be reverse order.
      // However, we also wish to give every thread an equal shot at the
      // largest elements, so we interleave the loading.
      threadK[i] = tmpThreadK[NumThreadQ - i - 1];
      threadV[i] = tmpThreadV[NumThreadQ - i - 1];
    }
  }

  /// WARNING: all threads in a warp must participate in this.
  /// Otherwise, you must call the constituent parts separately.
  __device__ inline void add(K k, V v) {
    addThreadQ(k, v);
    checkThreadQ();
  }

  __device__ inline void reduce() {
    // Have all warps dump and merge their queues; this will produce
    // the final per-warp results
    mergeWarpQ();

    // block-wide dep; thus far, all warps have been completely
    // independent
    __syncthreads();

    // All warp queues are contiguous in smem.
    // Now, we have kNumWarps lists of NumWarpQ elements.
    // This is a power of 2.
    FinalBlockMerge<kNumWarps, ThreadsPerBlock,
                    K, V, NumWarpQ, Dir, Comp>::merge(sharedK, sharedV);

    // The block-wide merge has a trailing syncthreads
  }

  // threadK[0] is lowest (Dir) or highest (!Dir)
  K threadK[NumThreadQ];
  V threadV[NumThreadQ];

  // Queues for all warps
  K* sharedK;
  V* sharedV;

  // Our warp's queue (points into sharedK/sharedV)
  // warpK[0] is highest (Dir) or lowest (!Dir)
  K* warpK;
  V* warpV;

  // Warp queue head value cached in a register, so we needn't read
  // shared memory as frequently
  K warpKTop;

  // This is a cached k-1 value
  int kMinus1;
};

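// The sketch below (wrapped in `#if 0`, so it is not compiled) is not part of
// the original header; it is a minimal illustration of how BlockSelect is
// typically driven from a kernel. The kernel name `blockSelectMinSketch`, the
// raw-pointer layout, and the `initK` sentinel parameter (e.g. the maximum
// representable key when selecting minima) are assumptions made purely for
// the example; the Comparator<T> type is the one from Comparators.cuh.
#if 0
// Assumes one block per row, launched with ThreadsPerBlock threads per block.
template <int NumWarpQ, int NumThreadQ, int ThreadsPerBlock>
__global__ void blockSelectMinSketch(const float* distances, // [numRows][numCols]
                                     int numCols,
                                     float* outK,            // [numRows][k]
                                     int* outV,              // [numRows][k]
                                     int k,
                                     float initK) {          // sentinel key
  constexpr int kNumWarps = ThreadsPerBlock / kWarpSize;

  __shared__ float smemK[kNumWarps * NumWarpQ];
  __shared__ int smemV[kNumWarps * NumWarpQ];

  // Dir == false selects the k smallest keys
  BlockSelect<float, int, false, Comparator<float>,
              NumWarpQ, NumThreadQ, ThreadsPerBlock>
      heap(initK, -1, smemK, smemV, k);

  const float* row = distances + (size_t) blockIdx.x * numCols;

  // Whole warps must participate in add(); process the part of the row that
  // is a multiple of kWarpSize first, then handle the remainder per-thread
  int limit = (numCols / kWarpSize) * kWarpSize;

  int i = threadIdx.x;
  for (; i < limit; i += ThreadsPerBlock) {
    heap.add(row[i], i);
  }
  if (i < numCols) {
    heap.addThreadQ(row[i], i);
  }

  heap.reduce();

  // After reduce(), the first k entries of the shared queues hold the result
  for (int j = threadIdx.x; j < k; j += ThreadsPerBlock) {
    outK[(size_t) blockIdx.x * k + j] = smemK[j];
    outV[(size_t) blockIdx.x * k + j] = smemV[j];
  }
}
#endif
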
/// Specialization for k == 1 (NumWarpQ == 1)
template <typename K,
          typename V,
          bool Dir,
          typename Comp,
          int NumThreadQ,
          int ThreadsPerBlock>
struct BlockSelect<K, V, Dir, Comp, 1, NumThreadQ, ThreadsPerBlock> {
  static constexpr int kNumWarps = ThreadsPerBlock / kWarpSize;

  __device__ inline BlockSelect(K initK, V initV, K* smemK, V* smemV, int k) :
      sharedK(smemK),
      sharedV(smemV),
      threadK(initK),
      threadV(initV) {
  }

  __device__ inline void addThreadQ(K k, V v) {
    bool swap = Dir ? Comp::gt(k, threadK) : Comp::lt(k, threadK);
    threadK = swap ? k : threadK;
    threadV = swap ? v : threadV;
  }

  __device__ inline void checkThreadQ() {
    // We don't need to do anything here, since the warp doesn't
    // cooperate until the end
  }

  __device__ inline void add(K k, V v) {
    addThreadQ(k, v);
  }

  __device__ inline void reduce() {
    // Reduce within the warp
    Pair<K, V> pair(threadK, threadV);

    if (Dir) {
      pair =
          warpReduceAll<Pair<K, V>, Max<Pair<K, V>>>(pair, Max<Pair<K, V>>());
    } else {
      pair =
          warpReduceAll<Pair<K, V>, Min<Pair<K, V>>>(pair, Min<Pair<K, V>>());
    }

    // Each warp writes out a single value
    int laneId = getLaneId();
    int warpId = threadIdx.x / kWarpSize;

    if (laneId == 0) {
      sharedK[warpId] = pair.k;
      sharedV[warpId] = pair.v;
    }

    __syncthreads();

    // We typically use this for small blocks (<= 128); just having the
    // first thread in the block perform the reduction across warps is
    // faster
    if (threadIdx.x == 0) {
      threadK = sharedK[0];
      threadV = sharedV[0];

#pragma unroll
      for (int i = 1; i < kNumWarps; ++i) {
        K k = sharedK[i];
        V v = sharedV[i];

        bool swap = Dir ? Comp::gt(k, threadK) : Comp::lt(k, threadK);
        threadK = swap ? k : threadK;
        threadV = swap ? v : threadV;
      }

      // Hopefully a thread's smem reads/writes are ordered wrt
      // itself, so no barrier needed :)
      sharedK[0] = threadK;
      sharedV[0] = threadV;
    }

    // In case other threads wish to read this value
    __syncthreads();
  }

  // threadK is lowest (Dir) or highest (!Dir)
  K threadK;
  V threadV;

  // Where we reduce in smem
  K* sharedK;
  V* sharedV;
};

//
// per-warp WarpSelect
//

// `Dir` true, produce largest values.
// `Dir` false, produce smallest values.
template <typename K,
          typename V,
          bool Dir,
          typename Comp,
          int NumWarpQ,
          int NumThreadQ,
          int ThreadsPerBlock>
struct WarpSelect {
  static constexpr int kNumWarpQRegisters = NumWarpQ / kWarpSize;

  __device__ inline WarpSelect(K initK, V initV, int k) :
      warpKTop(initK),
      kLane((k - 1) % kWarpSize) {
    static_assert(utils::isPowerOf2(ThreadsPerBlock),
                  "threads must be a power-of-2");
    static_assert(utils::isPowerOf2(NumWarpQ),
                  "warp queue must be power-of-2");

    // Fill the per-thread queue keys with the default value; values
    // can remain uninitialized
#pragma unroll
    for (int i = 0; i < NumThreadQ; ++i) {
      threadK[i] = initK;
      threadV[i] = initV;
    }

    // Fill the warp queue with the default value
#pragma unroll
    for (int i = 0; i < kNumWarpQRegisters; ++i) {
      warpK[i] = initK;
      warpV[i] = initV;
    }
  }

  __device__ inline void addThreadQ(K k, V v) {
    // If we're greater or equal to the highest element, then we don't
    // need to add. In the equal to case, the element we have is
    // sufficient.
    if (Dir ? Comp::gt(k, threadK[0]) : Comp::lt(k, threadK[0])) {
      threadK[0] = k;
      threadV[0] = v;

      // Perform in-register bubble sort of this new element
#pragma unroll
      for (int i = 1; i < NumThreadQ; ++i) {
        bool swap = Dir ? Comp::lt(threadK[i], threadK[i - 1]) :
                          Comp::gt(threadK[i], threadK[i - 1]);

        K tmpK = threadK[i];
        threadK[i] = swap ? threadK[i - 1] : tmpK;
        threadK[i - 1] = swap ? tmpK : threadK[i - 1];

        V tmpV = threadV[i];
        threadV[i] = swap ? threadV[i - 1] : tmpV;
        threadV[i - 1] = swap ? tmpV : threadV[i - 1];
      }
    }
  }

  __device__ inline void checkThreadQ() {
    // There is no need to merge queues if no thread-local queue is
    // better than the warp-wide queue
    bool needSort = (Dir ?
                     Comp::gt(threadK[0], warpKTop) :
                     Comp::lt(threadK[0], warpKTop));
    if (!__any(needSort)) {
      return;
    }

    mergeWarpQ();

    // We have to beat at least this element
    warpKTop = shfl(warpK[kNumWarpQRegisters - 1], kLane);
  }

  /// This function handles sorting and merging together the
  /// per-thread queues with the warp-wide queue, creating a sorted
  /// list across both
  __device__ inline void mergeWarpQ() {
    // Sort all of the per-thread queues
    warpSortAnyRegisters<K, V, NumThreadQ, !Dir, Comp>(threadK, threadV);

    // The warp queue is already sorted, and now that we've sorted the
    // per-thread queue, merge both sorted lists together, producing
    // one sorted list
    warpMergeAnyRegisters<K, V, kNumWarpQRegisters, NumThreadQ, !Dir, Comp>(
        warpK, warpV, threadK, threadV);

    // Re-map the registers for our per-thread queues
    K tmpThreadK[NumThreadQ];
    V tmpThreadV[NumThreadQ];

#pragma unroll
    for (int i = 0; i < NumThreadQ; ++i) {
      tmpThreadK[i] = threadK[i];
      tmpThreadV[i] = threadV[i];
    }

#pragma unroll
    for (int i = 0; i < NumThreadQ; ++i) {
      // After merging, the data is in the order small -> large.
      // We wish to reload data in our thread queues, which have the
      // order large -> small by indexing, so it will be reverse order.
      // However, we also wish to give every thread an equal shot at the
      // largest elements, so we interleave the loading.
      threadK[i] = tmpThreadK[NumThreadQ - i - 1];
      threadV[i] = tmpThreadV[NumThreadQ - i - 1];
    }
  }

  /// WARNING: all threads in a warp must participate in this.
  /// Otherwise, you must call the constituent parts separately.
  __device__ inline void add(K k, V v) {
    addThreadQ(k, v);
    checkThreadQ();
  }

  __device__ inline void reduce() {
    // Have all warps dump and merge their queues; this will produce
    // the final per-warp results
    mergeWarpQ();
  }

  /// Dump final k selected values for this warp out
  __device__ inline void writeOut(K* outK, V* outV, int k) {
    int laneId = getLaneId();

#pragma unroll
    for (int i = 0; i < kNumWarpQRegisters; ++i) {
      int idx = i * kWarpSize + laneId;

      if (idx < k) {
        outK[idx] = warpK[i];
        outV[idx] = warpV[i];
      }
    }
  }

  // threadK[0] is lowest (Dir) or highest (!Dir)
  K threadK[NumThreadQ];
  V threadV[NumThreadQ];

  // warpK[0] is highest (Dir) or lowest (!Dir)
  K warpK[kNumWarpQRegisters];
  V warpV[kNumWarpQRegisters];

  // Warp queue head value cached in a register, so we needn't read
  // shared memory as frequently
  K warpKTop;

  // This is the lane from which we should load an approximation (>= k)
  // to the kth element, taken from the last register in the warp queue
  // (i.e., warpK[kNumWarpQRegisters - 1]).
  int kLane;
};

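// The sketch below (again wrapped in `#if 0`, not compiled) is not part of
// the original header; it illustrates the typical WarpSelect pattern, in
// which each warp independently selects over one row and writes its own k
// results via writeOut(). The kernel name `warpSelectMinSketch`, the
// raw-pointer layout, and the `initK` sentinel parameter are assumptions
// made purely for illustration.
#if 0
// Assumes one warp per row, launched with ThreadsPerBlock threads per block.
template <int NumWarpQ, int NumThreadQ, int ThreadsPerBlock>
__global__ void warpSelectMinSketch(const float* distances, // [numRows][numCols]
                                    int numRows,
                                    int numCols,
                                    float* outK,             // [numRows][k]
                                    int* outV,               // [numRows][k]
                                    int k,
                                    float initK) {           // sentinel key
  constexpr int kNumWarps = ThreadsPerBlock / kWarpSize;

  // Dir == false selects the k smallest keys
  WarpSelect<float, int, false, Comparator<float>,
             NumWarpQ, NumThreadQ, ThreadsPerBlock>
      heap(initK, -1, k);

  // One row per warp
  int row = blockIdx.x * kNumWarps + threadIdx.x / kWarpSize;
  if (row >= numRows) {
    return;
  }

  int laneId = getLaneId();
  const float* rowData = distances + (size_t) row * numCols;

  // Whole warps must participate in add(); handle the tail per-thread
  int limit = (numCols / kWarpSize) * kWarpSize;

  int i = laneId;
  for (; i < limit; i += kWarpSize) {
    heap.add(rowData[i], i);
  }
  if (i < numCols) {
    heap.addThreadQ(rowData[i], i);
  }

  heap.reduce();
  heap.writeOut(outK + (size_t) row * k, outV + (size_t) row * k, k);
}
#endif
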
/// Specialization for k == 1 (NumWarpQ == 1)
template <typename K,
          typename V,
          bool Dir,
          typename Comp,
          int NumThreadQ,
          int ThreadsPerBlock>
struct WarpSelect<K, V, Dir, Comp, 1, NumThreadQ, ThreadsPerBlock> {
  static constexpr int kNumWarps = ThreadsPerBlock / kWarpSize;

  __device__ inline WarpSelect(K initK, V initV, int k) :
      threadK(initK),
      threadV(initV) {
  }

  __device__ inline void addThreadQ(K k, V v) {
    bool swap = Dir ? Comp::gt(k, threadK) : Comp::lt(k, threadK);
    threadK = swap ? k : threadK;
    threadV = swap ? v : threadV;
  }

  __device__ inline void checkThreadQ() {
    // We don't need to do anything here, since the warp doesn't
    // cooperate until the end
  }

  __device__ inline void add(K k, V v) {
    addThreadQ(k, v);
  }

  __device__ inline void reduce() {
    // Reduce within the warp
    Pair<K, V> pair(threadK, threadV);

    if (Dir) {
      pair =
          warpReduceAll<Pair<K, V>, Max<Pair<K, V>>>(pair, Max<Pair<K, V>>());
    } else {
      pair =
          warpReduceAll<Pair<K, V>, Min<Pair<K, V>>>(pair, Min<Pair<K, V>>());
    }

    threadK = pair.k;
    threadV = pair.v;
  }

  /// Dump final k selected values for this warp out
  __device__ inline void writeOut(K* outK, V* outV, int k) {
    if (getLaneId() == 0) {
      *outK = threadK;
      *outV = threadV;
    }
  }

  // threadK is lowest (Dir) or highest (!Dir)
  K threadK;
  V threadV;
};

} } // namespace