namespace faiss {
namespace gpu {
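
// Extracts a bit field of `len` bits starting at bit `pos` from `val`,
// using the PTX bfe.u32 (bit field extract) instruction.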
__device__ __forceinline__ unsigned int getBitfield(
        unsigned int val,
        int pos,
        int len) {
    unsigned int ret;
    asm("bfe.u32 %0, %1, %2, %3;" : "=r"(ret) : "r"(val), "r"(pos), "r"(len));
    return ret;
}
__device__ __forceinline__ unsigned long getBitfield(
        unsigned long val,
        int pos,
        int len) {
    unsigned long ret;
    asm("bfe.u64 %0, %1, %2, %3;" : "=l"(ret) : "l"(val), "r"(pos), "r"(len));
    return ret;
}
__device__ __forceinline__ unsigned int setBitfield(
        unsigned int val,
        unsigned int toInsert,
        int pos,
        int len) {
    unsigned int ret;
    asm("bfi.b32 %0, %1, %2, %3, %4;"
        : "=r"(ret)
        : "r"(toInsert), "r"(val), "r"(pos), "r"(len));
    return ret;
}
__device__ __forceinline__ int getLaneId() {
    int laneId;
    asm("mov.s32 %0, %%laneid;" : "=r"(laneId));
    return laneId;
}
__device__ __forceinline__ unsigned getLaneMaskLt() {
    unsigned mask;
    asm("mov.u32 %0, %%lanemask_lt;" : "=r"(mask));
    return mask;
}

__device__ __forceinline__ unsigned getLaneMaskLe() {
    unsigned mask;
    asm("mov.u32 %0, %%lanemask_le;" : "=r"(mask));
    return mask;
}

__device__ __forceinline__ unsigned getLaneMaskGt() {
    unsigned mask;
    asm("mov.u32 %0, %%lanemask_gt;" : "=r"(mask));
    return mask;
}

__device__ __forceinline__ unsigned getLaneMaskGe() {
    unsigned mask;
    asm("mov.u32 %0, %%lanemask_ge;" : "=r"(mask));
    return mask;
}
__device__ __forceinline__ void namedBarrierWait(int name, int numThreads) {
    asm volatile("bar.sync %0, %1;" : : "r"(name), "r"(numThreads) : "memory");
}
__device__ __forceinline__ void namedBarrierArrived(int name, int numThreads) {
    asm volatile("bar.arrive %0, %1;"
                 :
                 : "r"(name), "r"(numThreads)
                 : "memory");
}
__device__ __forceinline__ void prefetchL2(const void* p) {
    asm volatile("prefetch.global.L2 [%0];" : : "l"(p));
}
__device__ __forceinline__ void prefetchL1(const void* p) {
    asm volatile("prefetch.global.L1 [%0];" : : "l"(p));
}

} // namespace gpu
} // namespace faiss