faiss/gpu/utils/PtxUtils.cuh

89 lines
2.3 KiB
Plaintext

/**
* Copyright (c) 2015-present, Facebook, Inc.
* All rights reserved.
*
* This source code is licensed under the CC-by-NC license found in the
* LICENSE file in the root directory of this source tree.
*/
// Copyright 2004-present Facebook. All Rights Reserved.
#pragma once
#include <cuda.h>
namespace faiss { namespace gpu {
// Extracts a bit field from a 32-bit value via the PTX `bfe.u32`
// (bit field extract, unsigned) instruction.
//
//   val: word to read from
//   pos: bit index at which the field starts
//   len: width of the field in bits
//
// Returns the extracted field in the low-order bits of the result.
__device__ __forceinline__ unsigned int
getBitfield(unsigned int val, int pos, int len) {
    unsigned int field;
    asm("bfe.u32 %0, %1, %2, %3;"
        : "=r"(field)
        : "r"(val), "r"(pos), "r"(len));
    return field;
}
// Extracts a bit field from a 64-bit value via the PTX `bfe.u64`
// (bit field extract, unsigned) instruction.
//
//   val: word to read from
//   pos: bit index at which the field starts
//   len: width of the field in bits
//
// Returns the extracted field in the low-order bits of the result.
__device__ __forceinline__
unsigned long getBitfield(unsigned long val, int pos, int len) {
    // The "l" constraint binds a 64-bit PTX register, but `unsigned long` is
    // only 32 bits wide on LLP64 targets (e.g. Windows). Route the operands
    // through explicitly 64-bit temporaries so the operand size always
    // matches the constraint; on LP64 targets this is a no-op.
    unsigned long long wideVal = val;
    unsigned long long wideField;
    asm("bfe.u64 %0, %1, %2, %3;"
        : "=l"(wideField)
        : "l"(wideVal), "r"(pos), "r"(len));
    return static_cast<unsigned long>(wideField);
}
// Inserts a bit field into a 32-bit value via the PTX `bfi.b32`
// (bit field insert) instruction.
//
//   val:      base word that receives the field
//   toInsert: source of the bits to insert (taken from its low-order bits)
//   pos:      bit index in `val` at which the field starts
//   len:      width of the field in bits
//
// Returns `val` with the selected field replaced; `val` itself is not
// modified.
__device__ __forceinline__ unsigned int
setBitfield(unsigned int val, unsigned int toInsert, int pos, int len) {
    unsigned int merged;
    // Operand order for bfi is: destination, bits to insert, base, pos, len.
    asm("bfi.b32 %0, %1, %2, %3, %4;"
        : "=r"(merged)
        : "r"(toInsert), "r"(val), "r"(pos), "r"(len));
    return merged;
}
// Returns the calling thread's lane index within its warp, read from the
// PTX special register %laneid.
__device__ __forceinline__ int getLaneId() {
    int laneId;
    // A literal '%' in an asm template that has operands must be written as
    // "%%"; a single '%' is reserved for operand references such as %0. This
    // also matches how the %lanemask_* registers are read in this file.
    asm("mov.s32 %0, %%laneid;" : "=r"(laneId));
    return laneId;
}
// Reads the PTX special register %lanemask_lt: per the PTX ISA, a 32-bit
// mask with a bit set for every lane whose index is strictly less than the
// calling thread's lane index.
__device__ __forceinline__ unsigned getLaneMaskLt() {
    unsigned lowerLanes;
    asm("mov.u32 %0, %%lanemask_lt;"
        : "=r"(lowerLanes));
    return lowerLanes;
}
// Reads the PTX special register %lanemask_le: per the PTX ISA, a 32-bit
// mask with a bit set for every lane whose index is less than or equal to
// the calling thread's lane index.
__device__ __forceinline__ unsigned getLaneMaskLe() {
    unsigned lowerOrSelf;
    asm("mov.u32 %0, %%lanemask_le;"
        : "=r"(lowerOrSelf));
    return lowerOrSelf;
}
// Reads the PTX special register %lanemask_gt: per the PTX ISA, a 32-bit
// mask with a bit set for every lane whose index is strictly greater than
// the calling thread's lane index.
__device__ __forceinline__ unsigned getLaneMaskGt() {
    unsigned higherLanes;
    asm("mov.u32 %0, %%lanemask_gt;"
        : "=r"(higherLanes));
    return higherLanes;
}
// Reads the PTX special register %lanemask_ge: per the PTX ISA, a 32-bit
// mask with a bit set for every lane whose index is greater than or equal
// to the calling thread's lane index.
__device__ __forceinline__ unsigned getLaneMaskGe() {
    unsigned selfOrHigher;
    asm("mov.u32 %0, %%lanemask_ge;"
        : "=r"(selfOrHigher));
    return selfOrHigher;
}
// Issues the PTX `bar.sync` instruction on a named barrier.
//
//   name:       barrier identifier passed as the first operand of bar.sync
//   numThreads: thread count passed as the second operand of bar.sync
//
// The asm is volatile with a "memory" clobber, so the compiler will neither
// remove it nor move memory accesses across it.
// NOTE(review): the PTX ISA places restrictions on the barrier id range and
// on the thread count (warp-size multiple) -- confirm at call sites.
__device__ __forceinline__ void namedBarrierWait(int name, int numThreads) {
    asm volatile("bar.sync %0, %1;"
                 : /* no outputs */
                 : "r"(name), "r"(numThreads)
                 : "memory");
}
// Issues the PTX `bar.arrive` instruction on a named barrier.
//
//   name:       barrier identifier passed as the first operand of bar.arrive
//   numThreads: thread count passed as the second operand of bar.arrive
//
// The asm is volatile with a "memory" clobber, so the compiler will neither
// remove it nor move memory accesses across it.
// NOTE(review): per the PTX ISA, bar.arrive signals arrival without blocking
// the caller; it is presumably meant to pair with namedBarrierWait on the
// same barrier id -- confirm at call sites.
__device__ __forceinline__ void namedBarrierArrived(int name, int numThreads) {
    asm volatile("bar.arrive %0, %1;"
                 : /* no outputs */
                 : "r"(name), "r"(numThreads)
                 : "memory");
}
// FIXME: prefetch does nothing (in SASS) on Maxwell

// Issues the PTX `prefetch.global.L2` instruction for the address `p`.
// Nothing is returned and `p` is not dereferenced by this function itself;
// the instruction is a cache hint only.
// NOTE(review): the `.global` qualifier implies `p` must be a global-memory
// address -- confirm at call sites.
__device__ __forceinline__ void prefetchL2(const void *p) {
    asm volatile("prefetch.global.L2 [%0];"
                 : /* no outputs */
                 : "l"(p));
}
// Issues the PTX `prefetch.global.L1` instruction for the address `p`.
// Nothing is returned and `p` is not dereferenced by this function itself;
// the instruction is a cache hint only.
// NOTE(review): the `.global` qualifier implies `p` must be a global-memory
// address -- confirm at call sites.
__device__ __forceinline__ void prefetchL1(const void *p) {
    asm volatile("prefetch.global.L1 [%0];"
                 : /* no outputs */
                 : "l"(p));
}
} } // namespace