faiss/gpu/impl/PQCodeLoad.cuh

/**
 * Copyright (c) Facebook, Inc. and its affiliates.
 *
 * This source code is licensed under the MIT license found in the
 * LICENSE file in the root directory of this source tree.
 */

#pragma once

#include <faiss/gpu/utils/PtxUtils.cuh>

namespace faiss { namespace gpu {

#if __CUDA_ARCH__ >= 350
// Use the CC 3.5+ read-only texture cache (nc)
#define LD_NC_V1 "ld.global.cs.nc.u32"
#define LD_NC_V2 "ld.global.cs.nc.v2.u32"
#define LD_NC_V4 "ld.global.cs.nc.v4.u32"
#else
// Read normally
#define LD_NC_V1 "ld.global.cs.u32"
#define LD_NC_V2 "ld.global.cs.v2.u32"
#define LD_NC_V4 "ld.global.cs.v4.u32"
#endif // __CUDA_ARCH__
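
// Note: per the PTX ISA, the ".nc" qualifier routes these loads through the
// read-only (non-coherent) data cache, the same path used by __ldg(), while
// ".cs" hints a streaming (evict-first) access pattern, since each PQ code is
// typically read only once per query.
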
///
/// This file contains loader functions for PQ codes of various byte
/// length.
///
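
// Each specialization LoadCode32<NumBytes>::load(code32, p, offset) reads the
// NumBytes-byte code of the vector at index `offset` from `p` into
// ceil(NumBytes / 4) 32-bit words; an illustrative usage sketch appears at the
// end of this file.
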
// Type-specific wrappers around the PTX bfe.* instruction, for
// quantization code extraction
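// (e.g. getByte(0xaabbccddU, 8, 8) extracts bits [8, 16) of the word and
// returns 0xcc; the single-byte overload simply returns its value unchanged.)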
inline __device__ unsigned int getByte(unsigned char v,
                                       int pos,
                                       int width) {
  return v;
}

inline __device__ unsigned int getByte(unsigned short v,
                                       int pos,
                                       int width) {
  return getBitfield((unsigned int) v, pos, width);
}

inline __device__ unsigned int getByte(unsigned int v,
                                       int pos,
                                       int width) {
  return getBitfield(v, pos, width);
}

inline __device__ unsigned int getByte(unsigned long v,
                                       int pos,
                                       int width) {
  return getBitfield(v, pos, width);
}

template <int NumSubQuantizers>
struct LoadCode32 {};

template<>
struct LoadCode32<1> {
  static inline __device__ void load(unsigned int code32[1],
                                     unsigned char* p,
                                     int offset) {
    p += offset * 1;
    asm("ld.global.cs.u8 {%0}, [%1];" :
        "=r"(code32[0]) : "l"(p));
  }
};

template<>
struct LoadCode32<2> {
  static inline __device__ void load(unsigned int code32[1],
                                     unsigned char* p,
                                     int offset) {
    p += offset * 2;
    asm("ld.global.cs.u16 {%0}, [%1];" :
        "=r"(code32[0]) : "l"(p));
  }
};

template<>
struct LoadCode32<3> {
  static inline __device__ void load(unsigned int code32[1],
                                     unsigned char* p,
                                     int offset) {
    p += offset * 3;
    unsigned int a;
    unsigned int b;
    unsigned int c;

    // FIXME: this is a non-coalesced, unaligned, non-vectorized load
    // unfortunately need to reorganize memory layout by warp
    asm("ld.global.cs.u8 {%0}, [%1 + 0];" :
        "=r"(a) : "l"(p));
    asm("ld.global.cs.u8 {%0}, [%1 + 1];" :
        "=r"(b) : "l"(p));
    asm("ld.global.cs.u8 {%0}, [%1 + 2];" :
        "=r"(c) : "l"(p));

    // FIXME: this is also slow, since we have to recover the
    // individual bytes loaded
    code32[0] = (c << 16) | (b << 8) | a;
  }
};

template<>
struct LoadCode32<4> {
  static inline __device__ void load(unsigned int code32[1],
                                     unsigned char* p,
                                     int offset) {
    p += offset * 4;
    asm("ld.global.cs.u32 {%0}, [%1];" :
        "=r"(code32[0]) : "l"(p));
  }
};

template<>
struct LoadCode32<8> {
  static inline __device__ void load(unsigned int code32[2],
                                     unsigned char* p,
                                     int offset) {
    p += offset * 8;
    asm("ld.global.cs.v2.u32 {%0, %1}, [%2];" :
        "=r"(code32[0]), "=r"(code32[1]) : "l"(p));
  }
};

template<>
struct LoadCode32<12> {
  static inline __device__ void load(unsigned int code32[3],
                                     unsigned char* p,
                                     int offset) {
    p += offset * 12;
    // FIXME: this is a non-coalesced, unaligned, non-vectorized load
    // unfortunately need to reorganize memory layout by warp
    asm(LD_NC_V1 " {%0}, [%1 + 0];" :
        "=r"(code32[0]) : "l"(p));
    asm(LD_NC_V1 " {%0}, [%1 + 4];" :
        "=r"(code32[1]) : "l"(p));
    asm(LD_NC_V1 " {%0}, [%1 + 8];" :
        "=r"(code32[2]) : "l"(p));
  }
};

template<>
struct LoadCode32<16> {
  static inline __device__ void load(unsigned int code32[4],
                                     unsigned char* p,
                                     int offset) {
    p += offset * 16;
    asm("ld.global.cs.v4.u32 {%0, %1, %2, %3}, [%4];" :
        "=r"(code32[0]), "=r"(code32[1]),
        "=r"(code32[2]), "=r"(code32[3]) : "l"(p));
  }
};

template<>
struct LoadCode32<20> {
  static inline __device__ void load(unsigned int code32[5],
                                     unsigned char* p,
                                     int offset) {
    p += offset * 20;
    // FIXME: this is a non-coalesced, unaligned, non-vectorized load
    // unfortunately need to reorganize memory layout by warp
    asm(LD_NC_V1 " {%0}, [%1 + 0];" :
        "=r"(code32[0]) : "l"(p));
    asm(LD_NC_V1 " {%0}, [%1 + 4];" :
        "=r"(code32[1]) : "l"(p));
    asm(LD_NC_V1 " {%0}, [%1 + 8];" :
        "=r"(code32[2]) : "l"(p));
    asm(LD_NC_V1 " {%0}, [%1 + 12];" :
        "=r"(code32[3]) : "l"(p));
    asm(LD_NC_V1 " {%0}, [%1 + 16];" :
        "=r"(code32[4]) : "l"(p));
  }
};

template<>
struct LoadCode32<24> {
  static inline __device__ void load(unsigned int code32[6],
                                     unsigned char* p,
                                     int offset) {
    p += offset * 24;
    // FIXME: this is a non-coalesced, unaligned, 2-vectorized load
    // unfortunately need to reorganize memory layout by warp
    asm(LD_NC_V2 " {%0, %1}, [%2 + 0];" :
        "=r"(code32[0]), "=r"(code32[1]) : "l"(p));
    asm(LD_NC_V2 " {%0, %1}, [%2 + 8];" :
        "=r"(code32[2]), "=r"(code32[3]) : "l"(p));
    asm(LD_NC_V2 " {%0, %1}, [%2 + 16];" :
        "=r"(code32[4]), "=r"(code32[5]) : "l"(p));
  }
};

template<>
struct LoadCode32<28> {
  static inline __device__ void load(unsigned int code32[7],
                                     unsigned char* p,
                                     int offset) {
    p += offset * 28;
    // FIXME: this is a non-coalesced, unaligned, non-vectorized load
    // unfortunately need to reorganize memory layout by warp
    asm(LD_NC_V1 " {%0}, [%1 + 0];" :
        "=r"(code32[0]) : "l"(p));
    asm(LD_NC_V1 " {%0}, [%1 + 4];" :
        "=r"(code32[1]) : "l"(p));
    asm(LD_NC_V1 " {%0}, [%1 + 8];" :
        "=r"(code32[2]) : "l"(p));
    asm(LD_NC_V1 " {%0}, [%1 + 12];" :
        "=r"(code32[3]) : "l"(p));
    asm(LD_NC_V1 " {%0}, [%1 + 16];" :
        "=r"(code32[4]) : "l"(p));
    asm(LD_NC_V1 " {%0}, [%1 + 20];" :
        "=r"(code32[5]) : "l"(p));
    asm(LD_NC_V1 " {%0}, [%1 + 24];" :
        "=r"(code32[6]) : "l"(p));
  }
};

template<>
struct LoadCode32<32> {
  static inline __device__ void load(unsigned int code32[8],
                                     unsigned char* p,
                                     int offset) {
    p += offset * 32;
    // FIXME: this is a non-coalesced load
    // unfortunately need to reorganize memory layout by warp
    asm(LD_NC_V4 " {%0, %1, %2, %3}, [%4];" :
        "=r"(code32[0]), "=r"(code32[1]),
        "=r"(code32[2]), "=r"(code32[3]) : "l"(p));
    asm(LD_NC_V4 " {%0, %1, %2, %3}, [%4 + 16];" :
        "=r"(code32[4]), "=r"(code32[5]),
        "=r"(code32[6]), "=r"(code32[7]) : "l"(p));
  }
};

template<>
struct LoadCode32<40> {
  static inline __device__ void load(unsigned int code32[10],
                                     unsigned char* p,
                                     int offset) {
    p += offset * 40;
    // FIXME: this is a non-coalesced, unaligned, 2-vectorized load
    // unfortunately need to reorganize memory layout by warp
    asm(LD_NC_V2 " {%0, %1}, [%2 + 0];" :
        "=r"(code32[0]), "=r"(code32[1]) : "l"(p));
    asm(LD_NC_V2 " {%0, %1}, [%2 + 8];" :
        "=r"(code32[2]), "=r"(code32[3]) : "l"(p));
    asm(LD_NC_V2 " {%0, %1}, [%2 + 16];" :
        "=r"(code32[4]), "=r"(code32[5]) : "l"(p));
    asm(LD_NC_V2 " {%0, %1}, [%2 + 24];" :
        "=r"(code32[6]), "=r"(code32[7]) : "l"(p));
    asm(LD_NC_V2 " {%0, %1}, [%2 + 32];" :
        "=r"(code32[8]), "=r"(code32[9]) : "l"(p));
  }
};

template<>
struct LoadCode32<48> {
  static inline __device__ void load(unsigned int code32[12],
                                     unsigned char* p,
                                     int offset) {
    p += offset * 48;
    // FIXME: this is a non-coalesced load
    // unfortunately need to reorganize memory layout by warp
    asm(LD_NC_V4 " {%0, %1, %2, %3}, [%4];" :
        "=r"(code32[0]), "=r"(code32[1]),
        "=r"(code32[2]), "=r"(code32[3]) : "l"(p));
    asm(LD_NC_V4 " {%0, %1, %2, %3}, [%4 + 16];" :
        "=r"(code32[4]), "=r"(code32[5]),
        "=r"(code32[6]), "=r"(code32[7]) : "l"(p));
    asm(LD_NC_V4 " {%0, %1, %2, %3}, [%4 + 32];" :
        "=r"(code32[8]), "=r"(code32[9]),
        "=r"(code32[10]), "=r"(code32[11]) : "l"(p));
  }
};

template<>
struct LoadCode32<56> {
  static inline __device__ void load(unsigned int code32[14],
                                     unsigned char* p,
                                     int offset) {
    p += offset * 56;
    // FIXME: this is a non-coalesced, unaligned, 2-vectorized load
    // unfortunately need to reorganize memory layout by warp
    asm(LD_NC_V2 " {%0, %1}, [%2 + 0];" :
        "=r"(code32[0]), "=r"(code32[1]) : "l"(p));
    asm(LD_NC_V2 " {%0, %1}, [%2 + 8];" :
        "=r"(code32[2]), "=r"(code32[3]) : "l"(p));
    asm(LD_NC_V2 " {%0, %1}, [%2 + 16];" :
        "=r"(code32[4]), "=r"(code32[5]) : "l"(p));
    asm(LD_NC_V2 " {%0, %1}, [%2 + 24];" :
        "=r"(code32[6]), "=r"(code32[7]) : "l"(p));
    asm(LD_NC_V2 " {%0, %1}, [%2 + 32];" :
        "=r"(code32[8]), "=r"(code32[9]) : "l"(p));
    asm(LD_NC_V2 " {%0, %1}, [%2 + 40];" :
        "=r"(code32[10]), "=r"(code32[11]) : "l"(p));
    asm(LD_NC_V2 " {%0, %1}, [%2 + 48];" :
        "=r"(code32[12]), "=r"(code32[13]) : "l"(p));
  }
};

template<>
struct LoadCode32<64> {
  static inline __device__ void load(unsigned int code32[16],
                                     unsigned char* p,
                                     int offset) {
    p += offset * 64;
    // FIXME: this is a non-coalesced load
    // unfortunately need to reorganize memory layout by warp
    asm(LD_NC_V4 " {%0, %1, %2, %3}, [%4];" :
        "=r"(code32[0]), "=r"(code32[1]),
        "=r"(code32[2]), "=r"(code32[3]) : "l"(p));
    asm(LD_NC_V4 " {%0, %1, %2, %3}, [%4 + 16];" :
        "=r"(code32[4]), "=r"(code32[5]),
        "=r"(code32[6]), "=r"(code32[7]) : "l"(p));
    asm(LD_NC_V4 " {%0, %1, %2, %3}, [%4 + 32];" :
        "=r"(code32[8]), "=r"(code32[9]),
        "=r"(code32[10]), "=r"(code32[11]) : "l"(p));
    asm(LD_NC_V4 " {%0, %1, %2, %3}, [%4 + 48];" :
        "=r"(code32[12]), "=r"(code32[13]),
        "=r"(code32[14]), "=r"(code32[15]) : "l"(p));
  }
};

template<>
struct LoadCode32<96> {
  static inline __device__ void load(unsigned int code32[24],
                                     unsigned char* p,
                                     int offset) {
    p += offset * 96;
    // FIXME: this is a non-coalesced load
    // unfortunately need to reorganize memory layout by warp
    asm(LD_NC_V4 " {%0, %1, %2, %3}, [%4];" :
        "=r"(code32[0]), "=r"(code32[1]),
        "=r"(code32[2]), "=r"(code32[3]) : "l"(p));
    asm(LD_NC_V4 " {%0, %1, %2, %3}, [%4 + 16];" :
        "=r"(code32[4]), "=r"(code32[5]),
        "=r"(code32[6]), "=r"(code32[7]) : "l"(p));
    asm(LD_NC_V4 " {%0, %1, %2, %3}, [%4 + 32];" :
        "=r"(code32[8]), "=r"(code32[9]),
        "=r"(code32[10]), "=r"(code32[11]) : "l"(p));
    asm(LD_NC_V4 " {%0, %1, %2, %3}, [%4 + 48];" :
        "=r"(code32[12]), "=r"(code32[13]),
        "=r"(code32[14]), "=r"(code32[15]) : "l"(p));
    asm(LD_NC_V4 " {%0, %1, %2, %3}, [%4 + 64];" :
        "=r"(code32[16]), "=r"(code32[17]),
        "=r"(code32[18]), "=r"(code32[19]) : "l"(p));
    asm(LD_NC_V4 " {%0, %1, %2, %3}, [%4 + 80];" :
        "=r"(code32[20]), "=r"(code32[21]),
        "=r"(code32[22]), "=r"(code32[23]) : "l"(p));
  }
};
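
// Illustrative sketch (hypothetical helper, not part of the original Faiss
// code): one way a kernel could combine LoadCode32 and getByte to fetch the
// 16-byte PQ code of a single vector and unpack its sixteen 8-bit
// sub-quantizer indices. The name and signature here are examples only.
inline __device__ void exampleLoadAndUnpackCode16(unsigned char* codes,
                                                  int vectorIdx,
                                                  unsigned char out[16]) {
  unsigned int code32[4];

  // Vectorized 16-byte load of the code for vector `vectorIdx`
  LoadCode32<16>::load(code32, codes, vectorIdx);

#pragma unroll
  for (int i = 0; i < 16; ++i) {
    // Each 32-bit word packs four 8-bit codes; take byte (i % 4) of word (i / 4)
    out[i] = (unsigned char) getByte(code32[i / 4], (i % 4) * 8, 8);
  }
}
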
} } // namespace