namespace faiss {
namespace gpu {
// Extract a `len`-bit field starting at bit `pos` from `val` (PTX bfe.u32)
__device__ __forceinline__ unsigned int getBitfield(
        unsigned int val,
        int pos,
        int len) {
    unsigned int ret;
    asm("bfe.u32 %0, %1, %2, %3;" : "=r"(ret) : "r"(val), "r"(pos), "r"(len));
    return ret;
}
// 64-bit overload of getBitfield (PTX bfe.u64)
__device__ __forceinline__ unsigned long getBitfield(
        unsigned long val,
        int pos,
        int len) {
    unsigned long ret;
    asm("bfe.u64 %0, %1, %2, %3;" : "=l"(ret) : "l"(val), "r"(pos), "r"(len));
    return ret;
}
// Insert the low `len` bits of `toInsert` into `val` at bit offset `pos`
// (PTX bfi.b32)
__device__ __forceinline__ unsigned int setBitfield(
        unsigned int val,
        unsigned int toInsert,
        int pos,
        int len) {
    unsigned int ret;
    asm("bfi.b32 %0, %1, %2, %3, %4;"
        : "=r"(ret)
        : "r"(toInsert), "r"(val), "r"(pos), "r"(len));
    return ret;
}
// Returns the calling thread's lane index within its warp (0-31)
__device__ __forceinline__ int getLaneId() {
    int laneId;
    asm("mov.s32 %0, %%laneid;" : "=r"(laneId));
    return laneId;
}
// Bitmask of all lanes in this warp with an index lower than ours
__device__ __forceinline__ unsigned getLaneMaskLt() {
    unsigned mask;
    asm("mov.u32 %0, %%lanemask_lt;" : "=r"(mask));
    return mask;
}
// Bitmask of all lanes with an index lower than or equal to ours
__device__ __forceinline__ unsigned getLaneMaskLe() {
    unsigned mask;
    asm("mov.u32 %0, %%lanemask_le;" : "=r"(mask));
    return mask;
}
// Bitmask of all lanes with an index higher than ours
__device__ __forceinline__ unsigned getLaneMaskGt() {
    unsigned mask;
    asm("mov.u32 %0, %%lanemask_gt;" : "=r"(mask));
    return mask;
}
// Bitmask of all lanes with an index higher than or equal to ours
__device__ __forceinline__ unsigned getLaneMaskGe() {
    unsigned mask;
    asm("mov.u32 %0, %%lanemask_ge;" : "=r"(mask));
    return mask;
}
// Block on named barrier `name` until `numThreads` threads have arrived
__device__ __forceinline__ void namedBarrierWait(int name, int numThreads) {
    asm volatile("bar.sync %0, %1;" : : "r"(name), "r"(numThreads) : "memory");
}
// Signal arrival at named barrier `name` without blocking
__device__ __forceinline__ void namedBarrierArrived(int name, int numThreads) {
    asm volatile(
            "bar.arrive %0, %1;" : : "r"(name), "r"(numThreads) : "memory");
}
// Prefetch the global-memory line containing `p` into the L2 cache
__device__ __forceinline__ void prefetchL2(const void* p) {
    asm volatile("prefetch.global.L2 [%0];" : : "l"(p));
}
// Prefetch the global-memory line containing `p` into the L1 cache
__device__ __forceinline__ void prefetchL1(const void* p) {
    asm volatile("prefetch.global.L1 [%0];" : : "l"(p));
}