Faiss
Main Page
Namespaces
Classes
Files
File List
All
Classes
Namespaces
Functions
Variables
Typedefs
Enumerations
Enumerator
Friends
gpu
utils
DeviceDefs.cuh
1
/**
2
* Copyright (c) 2015-present, Facebook, Inc.
3
* All rights reserved.
4
*
5
* This source code is licensed under the BSD+Patents license found in the
6
* LICENSE file in the root directory of this source tree.
7
*/
8
9
// Copyright 2004-present Facebook. All Rights Reserved.
10
11
#pragma once
12
13
namespace
faiss {
namespace
gpu {
14
15
// Warp width used by the GPU utility code.
//
// The value is provided in two situations:
//  - device compilation passes targeting compute capability 6.2 or lower;
//  - host compilation passes (__CUDA_ARCH__ undefined), where it only serves
//    as a placeholder so that code naming the constant still compiles.
//
// Device passes for any newer compute capability are rejected on purpose,
// until parameters for that architecture have been defined here.
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ > 620
#error Unknown __CUDA_ARCH__; please define parameters for compute capability
#else
constexpr int kWarpSize = 32;
#endif // defined(__CUDA_ARCH__) && __CUDA_ARCH__ > 620
27
28
// Placeholder for a warp-level memory fence. The fence itself is currently
// commented out, so a call to this function performs no operation.
//
// Why a fence is formally needed:
//   The CUDA programming model no longer guarantees warp-synchronous
//   behavior, so memory barriers are technically required. A barrier has
//   two parts:
//     1. a compiler-visible barrier, so that loads and stores are not
//        scheduled across it;
//     2. a hardware-level barrier, so that writes are observed in the
//        proper order.
//
// Why it is disabled:
//   __threadfence_block() is a stronger constraint than what is actually
//   wanted from the hardware, namely a warp-wide barrier. On the hardware
//   tested so far, warp-synchronous programming appeared safe and race-free
//   without it, and leaving the fence out gave 10%+ faster performance.
//
// Why the function still exists:
//   That behavior may not hold in the future; according to what Nvidia says
//   in the Kepler guide, it can change with compiler/toolchain issues or
//   with future hardware. Call sites are chosen judiciously, so re-enabling
//   the fence below restores CUDA programming-model-safe ordering.
//
// FIXME: the affected data should probably be qualified as volatile as
// well, since the compiler could technically keep values live across
// loops? That seems very impractical for the compiler to do, however.
//
// NOTE(review): presumably this must be revisited before the compute
// capability guard on kWarpSize is extended to newer architectures --
// confirm against the CUDA documentation for those targets.
__forceinline__ __device__ void warpFence() {
  // Deliberately left disabled; see the rationale above.
  // __threadfence_block();
}
60
61
} }
// namespace
Generated by
1.8.5