namespace faiss {
namespace gpu {

// Extracts the `len`-bit field of `val` that starts at bit `pos`
// (PTX `bfe.u32`); the field is returned right-aligned, zero-extended.
// Out-of-range pos/len behavior follows the PTX bfe specification.
__device__ __forceinline__ unsigned int getBitfield(
        unsigned int val,
        int pos,
        int len) {
    unsigned int ret;
    asm("bfe.u32 %0, %1, %2, %3;" : "=r"(ret) : "r"(val), "r"(pos), "r"(len));
    return ret;
}
// 64-bit overload: extracts the `len`-bit field of `val` starting at bit
// `pos` (PTX `bfe.u64`); returned right-aligned, zero-extended.
__device__ __forceinline__ unsigned long getBitfield(
        unsigned long val,
        int pos,
        int len) {
    unsigned long ret;
    asm("bfe.u64 %0, %1, %2, %3;" : "=l"(ret) : "l"(val), "r"(pos), "r"(len));
    return ret;
}
// Inserts the low `len` bits of `toInsert` into `val` at bit position
// `pos` (PTX `bfi.b32`) and returns the merged word; `val` itself is
// not modified.
__device__ __forceinline__ unsigned int setBitfield(
        unsigned int val,
        unsigned int toInsert,
        int pos,
        int len) {
    unsigned int ret;
    asm("bfi.b32 %0, %1, %2, %3, %4;"
        : "=r"(ret)
        : "r"(toInsert), "r"(val), "r"(pos), "r"(len));
    return ret;
}
// Returns this thread's lane index within its warp, i.e. the value of
// the %laneid special register (range [0, warpSize)).
__device__ __forceinline__ int getLaneId() {
    int laneId;
    // '%%' is required so the emitted PTX reads the %laneid special
    // register; a single '%' would be parsed as an asm operand reference.
    asm("mov.s32 %0, %%laneid;" : "=r"(laneId));
    return laneId;
}
// Returns a 32-bit mask with bits set for all lanes in this warp whose
// lane id is strictly lower than ours (%lanemask_lt special register).
__device__ __forceinline__ unsigned getLaneMaskLt() {
    unsigned mask;
    asm("mov.u32 %0, %%lanemask_lt;" : "=r"(mask));
    return mask;
}
// Returns a 32-bit mask with bits set for all lanes whose lane id is
// lower than or equal to ours (%lanemask_le special register).
__device__ __forceinline__ unsigned getLaneMaskLe() {
    unsigned mask;
    asm("mov.u32 %0, %%lanemask_le;" : "=r"(mask));
    return mask;
}
// Returns a 32-bit mask with bits set for all lanes whose lane id is
// strictly greater than ours (%lanemask_gt special register).
__device__ __forceinline__ unsigned getLaneMaskGt() {
    unsigned mask;
    asm("mov.u32 %0, %%lanemask_gt;" : "=r"(mask));
    return mask;
}
// Returns a 32-bit mask with bits set for all lanes whose lane id is
// greater than or equal to ours (%lanemask_ge special register).
__device__ __forceinline__ unsigned getLaneMaskGe() {
    unsigned mask;
    asm("mov.u32 %0, %%lanemask_ge;" : "=r"(mask));
    return mask;
}
// Blocks until `numThreads` threads have arrived at named barrier
// `name` (PTX `bar.sync`). All participating threads must pass the same
// `name` and `numThreads`; `numThreads` must be a multiple of the warp
// size per the PTX bar specification. The "memory" clobber makes this a
// compiler-level memory fence as well.
__device__ __forceinline__ void namedBarrierWait(int name, int numThreads) {
    asm volatile("bar.sync %0, %1;" : : "r"(name), "r"(numThreads) : "memory");
}
// Signals arrival at named barrier `name` without waiting for the other
// threads (PTX `bar.arrive`); pairs with namedBarrierWait on the
// complementary thread set. Same argument requirements as
// namedBarrierWait.
__device__ __forceinline__ void namedBarrierArrived(int name, int numThreads) {
    asm volatile(
            "bar.arrive %0, %1;" : : "r"(name), "r"(numThreads) : "memory");
}
// Hints the hardware to prefetch the global-memory line containing `p`
// into the L2 cache (PTX `prefetch.global.L2`); has no architectural
// effect on program results.
__device__ __forceinline__ void prefetchL2(const void* p) {
    asm volatile("prefetch.global.L2 [%0];" : : "l"(p));
}
// Hints the hardware to prefetch the global-memory line containing `p`
// into the L1 cache (PTX `prefetch.global.L1`); has no architectural
// effect on program results.
__device__ __forceinline__ void prefetchL1(const void* p) {
    asm volatile("prefetch.global.L1 [%0];" : : "l"(p));
}