Faiss
Main Page
Namespaces
Classes
Files
File List
All
Classes
Namespaces
Functions
Variables
Typedefs
Enumerations
Enumerator
Friends
gpu
utils
DeviceDefs.cuh
1
2
/**
3
* Copyright (c) 2015-present, Facebook, Inc.
4
* All rights reserved.
5
*
6
* This source code is licensed under the CC-by-NC license found in the
7
* LICENSE file in the root directory of this source tree.
8
*/
9
10
// Copyright 2004-present Facebook. All Rights Reserved.
11
12
#pragma once
13
14
namespace
faiss {
namespace
gpu {
15
16
// Number of threads in a warp.
//
// This used to be defined separately for the device pass (guarded by
// `__CUDA_ARCH__ <= 600`, with an #error for anything newer) and for the
// host pass (a "dummy" value). Both branches always yielded 32, and the
// guard made the header fail to compile for every device target above
// sm_60 (e.g. sm_61, sm_70), even though the warp size on those targets
// is 32 as well.
//
// A single unconditional definition keeps host and device code in
// agreement, which matters because the constant is usable in template
// arguments and launch-configuration math evaluated on the host.
//
// NOTE(review): if a future architecture ever ships a different warp
// width, this constant (and every kernel tuned around it) must be
// revisited; add a targeted `#if __CUDA_ARCH__` check at that point.
constexpr int kWarpSize = 32;
28
29
// Warp-scope fence used between dependent warp-level exchanges.
//
// Under the CUDA programming model a barrier is required here, since
// warp-synchronous execution is not guaranteed. Two things are needed:
//  - a barrier known to the compiler, so that it does not schedule loads
//    and stores across this point;
//  - a hardware-level barrier, so that writes become visible in the
//    proper order.
//
// __threadfence_block() would provide that, but it is a stronger
// constraint than what is actually wanted (a warp-wide barrier), and
// omitting it was measured by the original authors to be 10%+ faster.
// The function was therefore historically a no-op that relied on
// implicit warp-synchronous behaviour.
//
// That assumption no longer holds with independent thread scheduling
// (Volta and later), so when building with a CUDA 9+ toolchain we issue
// __syncwarp(), which is exactly the warp-wide barrier described above
// and also orders memory accesses among the participating lanes.
// With the default full mask, all lanes of the warp are expected to
// reach this call.
//
// On older toolchains (where __syncwarp() does not exist) the historical
// behaviour is kept: no fence is emitted. Uncomment the
// __threadfence_block() line below to get programming-model-safe
// ordering there.
//
// FIXME: we should probably qualify the memory involved as volatile as
// well, since the compiler could technically preserve values across
// loops? This seems very impractical for the compiler to do, however.
__forceinline__ __device__ void warpFence() {
#if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9)
  // nvcc predefines __CUDACC_VER_MAJOR__, so no extra header is needed
  // to detect the toolchain version.
  __syncwarp();
#else
  // Legacy path: assume warp-synchronous execution.
  // __threadfence_block();
#endif
}
61
62
} }
// namespace
Generated by
1.8.5