15 namespace faiss {
namespace gpu {
// Extract a bit field of [len] bits starting at bit [pos] from [val],
// zero-extended into the low bits of the result (PTX bfe.u32).
__device__ __forceinline__ unsigned int getBitfield(
        unsigned int val,
        int pos,
        int len) {
    unsigned int ret;
    asm("bfe.u32 %0, %1, %2, %3;"
        : "=r"(ret)
        : "r"(val), "r"(pos), "r"(len));
    return ret;
}
// 64-bit overload: extract a bit field of [len] bits starting at bit [pos]
// from [val], zero-extended into the low bits of the result (PTX bfe.u64).
// Note the "l" constraint for the 64-bit value operands.
__device__ __forceinline__ unsigned long getBitfield(
        unsigned long val,
        int pos,
        int len) {
    unsigned long ret;
    asm("bfe.u64 %0, %1, %2, %3;"
        : "=l"(ret)
        : "l"(val), "r"(pos), "r"(len));
    return ret;
}
// Insert the low [len] bits of [toInsert] into [val] at bit offset [pos],
// returning the combined word (PTX bfi.b32). [val] itself is not modified.
__device__ __forceinline__ unsigned int setBitfield(
        unsigned int val,
        unsigned int toInsert,
        int pos,
        int len) {
    unsigned int ret;
    asm("bfi.b32 %0, %1, %2, %3, %4;"
        : "=r"(ret)
        : "r"(toInsert), "r"(val), "r"(pos), "r"(len));
    return ret;
}
// Returns the calling thread's lane index within its warp (0..31), read
// from the %laneid special register. The literal '%' must be written as
// "%%" inside an asm template that has operands.
__device__ __forceinline__ int getLaneId() {
    int laneId;
    asm("mov.s32 %0, %%laneid;" : "=r"(laneId));
    return laneId;
}
// Returns a 32-bit mask with bits set for all lanes in the warp with an
// index strictly less than the calling lane's (%lanemask_lt).
__device__ __forceinline__ unsigned getLaneMaskLt() {
    unsigned mask;
    asm("mov.u32 %0, %%lanemask_lt;" : "=r"(mask));
    return mask;
}
// Returns a 32-bit mask with bits set for all lanes in the warp with an
// index less than or equal to the calling lane's (%lanemask_le).
__device__ __forceinline__ unsigned getLaneMaskLe() {
    unsigned mask;
    asm("mov.u32 %0, %%lanemask_le;" : "=r"(mask));
    return mask;
}
// Returns a 32-bit mask with bits set for all lanes in the warp with an
// index strictly greater than the calling lane's (%lanemask_gt).
__device__ __forceinline__ unsigned getLaneMaskGt() {
    unsigned mask;
    asm("mov.u32 %0, %%lanemask_gt;" : "=r"(mask));
    return mask;
}
// Returns a 32-bit mask with bits set for all lanes in the warp with an
// index greater than or equal to the calling lane's (%lanemask_ge).
__device__ __forceinline__ unsigned getLaneMaskGe() {
    unsigned mask;
    asm("mov.u32 %0, %%lanemask_ge;" : "=r"(mask));
    return mask;
}
// Block at named barrier [name] until [numThreads] threads have arrived
// (PTX bar.sync with a thread count). Per the PTX ISA, [numThreads] must
// be a multiple of the warp size. The "memory" clobber makes this a
// compiler-level memory fence as well, so the compiler cannot reorder
// memory operations across the barrier.
__device__ __forceinline__ void namedBarrierWait(
        int name,
        int numThreads) {
    asm volatile("bar.sync %0, %1;" : : "r"(name), "r"(numThreads) : "memory");
}
// Signal arrival at named barrier [name] without waiting for the other
// threads (PTX bar.arrive); pairs with namedBarrierWait on the waiting
// side. Per the PTX ISA, [numThreads] must be a multiple of the warp
// size. The "memory" clobber prevents the compiler from reordering
// memory operations across the arrival.
__device__ __forceinline__ void namedBarrierArrived(
        int name,
        int numThreads) {
    asm volatile("bar.arrive %0, %1;"
                 :
                 : "r"(name), "r"(numThreads)
                 : "memory");
}
// Hint to prefetch the global-memory line containing [p] into the L2
// cache (PTX prefetch.global.L2). Purely a performance hint; has no
// observable effect on program results.
__device__ __forceinline__ void prefetchL2(const void* p) {
    asm volatile("prefetch.global.L2 [%0];" : : "l"(p));
}
// Hint to prefetch the global-memory line containing [p] into the L1
// cache (PTX prefetch.global.L1). Purely a performance hint; has no
// observable effect on program results.
__device__ __forceinline__ void prefetchL1(const void* p) {
    asm volatile("prefetch.global.L1 [%0];" : : "l"(p));
}