Faiss
 All Classes Namespaces Functions Variables Typedefs Enumerations Enumerator Friends
DeviceDefs.cuh
1 /**
2  * Copyright (c) 2015-present, Facebook, Inc.
3  * All rights reserved.
4  *
5  * This source code is licensed under the BSD+Patents license found in the
6  * LICENSE file in the root directory of this source tree.
7  */
8 
9 // Copyright 2004-present Facebook. All Rights Reserved.
10 
11 #pragma once
12 
13 namespace faiss { namespace gpu {
14 
// Number of threads in a warp.
//
// The CUDA C++ Programming Guide lists a warp size of 32 for every compute
// capability, so the value does not have to be selected per __CUDA_ARCH__.
// A guard of the form `__CUDA_ARCH__ <= 620 ... #else #error` would turn
// every device pass for a newer architecture (e.g. sm_70) into a hard
// compile error even though the constant itself is unchanged there.
//
// The same definition also serves the host compilation pass, where
// __CUDA_ARCH__ is not defined.
constexpr int kWarpSize = 32;
27 
/// Warp-level ordering point placed between dependent accesses made by
/// different lanes of the same warp.
///
/// Takes no arguments and returns nothing. When built with a CUDA 9+
/// toolkit it executes `__syncwarp()`; with older toolkits it is a no-op.
///
/// NOTE(review): `__syncwarp()` is called with its default full-warp mask,
/// which assumes that every non-exited lane of the warp reaches this call --
/// confirm against the call sites.
__forceinline__ __device__ void warpFence() {
  // The CUDA programming model requires explicit synchronization here,
  // since warp-synchronous execution is not guaranteed.
  //
  // Two things are needed:
  // - a barrier known to the compiler, so that it does not schedule loads
  //   and stores across this point;
  // - a hardware-level barrier that guarantees that writes are observed
  //   in the proper order.
  //
  // __threadfence_block() is a stronger constraint than what is really
  // wanted (a warp-wide barrier), and leaving it out was measured to be
  // 10%+ faster, so the pre-CUDA 9 path below keeps it commented out and
  // relies on implicit warp synchrony.
  //
  // With independent thread scheduling (Volta and later) lanes of a warp
  // are no longer guaranteed to execute in lockstep, so that assumption
  // does not hold. __syncwarp() is exactly the warp-wide barrier wanted
  // here: it synchronizes the participating lanes and orders their memory
  // accesses.
  //
  // CUDART_VERSION comes from the runtime headers that nvcc includes
  // implicitly; if it is not defined the `#if` evaluates to false and the
  // legacy behavior is kept.
  //
  // FIXME: the affected memory should probably be qualified as volatile as
  // well, since the compiler could technically keep values in registers
  // across loops? This seems very impractical for the compiler to do,
  // however.
#if defined(CUDART_VERSION) && CUDART_VERSION >= 9000
  __syncwarp();
#else
  // For the time being, assume warp synchronicity on older toolkits.
  // __threadfence_block();
#endif
}
60 
61 } } // namespace