Faiss
 All Classes Namespaces Functions Variables Typedefs Enumerations Enumerator Friends
DeviceDefs.cuh
1 
2 /**
3  * Copyright (c) 2015-present, Facebook, Inc.
4  * All rights reserved.
5  *
6  * This source code is licensed under the CC-by-NC license found in the
7  * LICENSE file in the root directory of this source tree.
8  */
9 
10 // Copyright 2004-present Facebook. All Rights Reserved.
11 
12 #pragma once
13 
14 namespace faiss { namespace gpu {
15 
// kWarpSize: number of threads per warp, as a compile-time constant that is
// defined in both the device and the host compilation passes.
#ifdef __CUDA_ARCH__
// Device pass. The CUDA programming guide documents a warp size of 32 for
// the published compute capabilities; the bound below covers them up through
// compute capability 12.0. Anything newer still stops with an explicit error
// so that the assumption is re-checked rather than silently inherited.
#if __CUDA_ARCH__ <= 1200
constexpr int kWarpSize = 32;
#else
#error Unknown __CUDA_ARCH__; please define parameters for compute capability
#endif // __CUDA_ARCH__ types
#else
// Host pass: __CUDA_ARCH__ is not defined here, so provide a dummy value for
// the host compiler that matches the device value.
constexpr int kWarpSize = 32;
#endif // __CUDA_ARCH__
28 
// Warp-level fence placed at the points where the surrounding code relies on
// the lanes of a warp observing each other's writes in order.
//
// Preconditions (NOTE(review): callers are not visible from this header --
// confirm): on targets where the barrier is emitted, every non-exited lane of
// the warp must reach the same warpFence() call, because __syncwarp() is used
// with its default full-warp mask.
__forceinline__ __device__ void warpFence() {
  // Technically, memory barriers are required by the CUDA programming model,
  // since warp-synchronous programming is no longer guaranteed.
  //
  // There are two components to such a barrier:
  // - a barrier known to the compiler, so that it will not schedule loads
  //   and stores across it;
  // - a HW-level barrier that guarantees that writes are seen in the proper
  //   order.
  //
  // __threadfence_block() is a stronger constraint than what we really want
  // out of the hardware, which is a warp-wide barrier.
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
  // Volta and newer schedule the threads of a warp independently, so implicit
  // warp synchrony can no longer be assumed. __syncwarp() is the warp-wide
  // barrier: it also orders memory accesses among the participating lanes.
  __syncwarp();
#else
  // On the older hardware this code was originally tested on, warp
  // synchronous programming appeared to be a reality; by all tests it
  // appeared safe and race-free, and removing the fence resulted in 10%+
  // faster performance. The fence is therefore left disabled here. The call
  // sites are chosen judiciously, so uncommenting the line below restores
  // CUDA programming model-safe ordering on these targets as well.
  //
  // FIXME: we should probably qualify as volatile as well, since the
  // compiler could technically preserve values across loops? This seems
  // very impractical for the compiler to do, however.

  // __threadfence_block();
#endif
}
61 
62 } } // namespace