13 #include "../utils/PtxUtils.cuh"
15 namespace faiss {
namespace gpu {
// PTX load mnemonics used by the multi-word load() helpers in this file.
// LD_NC_V1 / V2 / V4 name the scalar, 2-wide and 4-wide 32-bit global loads.
// When device code is compiled for compute capability 3.5 or newer
// (__CUDA_ARCH__ >= 350) the mnemonics carry the PTX `.nc` qualifier
// (non-coherent, read-only data cache path). Every variant keeps the `.cs`
// (cache-streaming) operator.
17 #if __CUDA_ARCH__ >= 350
19 #define LD_NC_V1 "ld.global.cs.nc.u32"
20 #define LD_NC_V2 "ld.global.cs.nc.v2.u32"
21 #define LD_NC_V4 "ld.global.cs.nc.v4.u32"
// NOTE(review): no `#else` is visible between the two groups of definitions
// in this listing (the embedded numbering skips from 21 to 24). Without one,
// the definitions below would redefine the macros when __CUDA_ARCH__ >= 350
// and nothing would be defined otherwise. Confirm the file has `#else` here.
// Variants without `.nc`: plain cache-streaming global loads.
24 #define LD_NC_V1 "ld.global.cs.u32"
25 #define LD_NC_V2 "ld.global.cs.v2.u32"
26 #define LD_NC_V4 "ld.global.cs.v4.u32"
27 #endif // __CUDA_ARCH__
// getByte overload set: one device-side overload per integer width of `v`,
// each returning unsigned int. The trailing parameters (`pos`, `width`) are
// used in the bodies below but are declared on signature lines that are not
// visible in this listing.
//
// unsigned char overload: only the first parameter is visible here; the rest
// of the signature and the body are not shown, so its behaviour cannot be
// stated from this view.
36 inline __device__
unsigned int getByte(
unsigned char v,
// unsigned short overload: widens `v` to unsigned int, then forwards to
// getBitfield(v, pos, width). getBitfield is not defined in this file;
// presumably it comes from ../utils/PtxUtils.cuh -- TODO confirm.
42 inline __device__
unsigned int getByte(
unsigned short v,
45 return getBitfield((
unsigned int) v, pos, width);
// unsigned int overload: forwards `v` unchanged to getBitfield.
48 inline __device__
unsigned int getByte(
unsigned int v,
51 return getBitfield(v, pos, width);
// unsigned long overload: forwards `v` unchanged to getBitfield; this relies
// on a getBitfield overload accepting unsigned long existing elsewhere --
// TODO confirm. The result is returned as unsigned int.
54 inline __device__
unsigned int getByte(
unsigned long v,
57 return getBitfield(v, pos, width);
// Template header with a single non-type parameter, NumSubQuantizers.
// The entity it declares is not visible in this listing; presumably it is the
// primary template whose specializations hold the load() functions that
// follow -- TODO confirm against the full file.
60 template <
int NumSubQuantizers>
// Single-byte loader: issues one PTX `ld.global.cs.u8` on the address held in
// `p` and stores the result in code32[0] through a 32-bit register ("=r").
// A .u8 load into a 32-bit destination is zero-extended per the PTX ISA.
// `p` is bound with the "l" (64-bit register) constraint; its declaration and
// any offset applied to it are on lines not visible in this listing.
// NOTE(review): the asm is not `volatile` and has no "memory" clobber, so the
// compiler may treat it as a pure function of `p`. That is fine only if the
// addressed memory is not written while the kernel runs -- confirm.
65 static inline __device__
void load(
unsigned int code32[1],
69 asm(
"ld.global.cs.u8 {%0}, [%1];" :
70 "=r"(code32[0]) :
"l"(p));
// Two-byte loader: issues one PTX `ld.global.cs.u16` on the address held in
// `p` and stores the (zero-extended) 16-bit value in code32[0].
// `p` is declared on a line not visible in this listing. PTX requires the
// address of a .u16 access to be 2-byte aligned; that must be guaranteed by
// code outside this view -- TODO confirm.
76 static inline __device__
void load(
unsigned int code32[1],
80 asm(
"ld.global.cs.u16 {%0}, [%1];" :
81 "=r"(code32[0]) :
"l"(p));
// Three-byte loader: performs three separate `ld.global.cs.u8` loads at byte
// offsets 0, 1 and 2 from the base address, then packs them into code32[0].
// The operand lists of the three asm statements (and the declarations of
// a, b, c) are on lines not visible in this listing; presumably a, b, c
// receive the bytes at offsets 0, 1, 2 respectively -- TODO confirm.
// Byte-wise loads impose no alignment requirement on the base address.
87 static inline __device__
void load(
unsigned int code32[1],
97 asm(
"ld.global.cs.u8 {%0}, [%1 + 0];" :
99 asm(
"ld.global.cs.u8 {%0}, [%1 + 1];" :
101 asm(
"ld.global.cs.u8 {%0}, [%1 + 2];" :
// Pack: a occupies bits 0-7, b bits 8-15, c bits 16-23. Bits 24-31 are zero
// provided a, b, c each hold a value below 256.
106 code32[0] = (c << 16) | (b << 8) | a;
// One-word loader: issues a single `ld.global.cs.u32` on the address held in
// `p` and writes the 32-bit result to code32[0].
// `p` is declared on a line not visible in this listing; a .u32 access
// requires a 4-byte-aligned address per the PTX ISA.
// NOTE(review): this uses a literal mnemonic rather than the LD_NC_V1 macro
// used by the multi-word loaders, so it never takes the `.nc` path. Confirm
// that is intentional.
112 static inline __device__
void load(
unsigned int code32[1],
116 asm(
"ld.global.cs.u32 {%0}, [%1];" :
117 "=r"(code32[0]) :
"l"(p));
// Two-word loader: one vector `ld.global.cs.v2.u32` fills code32[0] and
// code32[1] from the 8 bytes starting at `p`.
// `p` is declared on a line not visible in this listing. A v2.u32 access
// requires an 8-byte-aligned address per the PTX ISA; the alignment must be
// guaranteed by the caller or data layout -- TODO confirm.
// NOTE(review): literal mnemonic, not LD_NC_V2, so no `.nc` path here.
123 static inline __device__
void load(
unsigned int code32[2],
127 asm(
"ld.global.cs.v2.u32 {%0, %1}, [%2];" :
128 "=r"(code32[0]),
"=r"(code32[1]) :
"l"(p));
// Three-word (12-byte) loader: three scalar LD_NC_V1 loads at byte offsets
// 0, 4 and 8 from `p` fill code32[0..2]. The instruction text is built by
// string-literal concatenation of the LD_NC_V1 macro with the operand
// template.
// Scalar loads are presumably used because a 12-byte record cannot be assumed
// to be 8- or 16-byte aligned as the vector forms require -- TODO confirm.
// `p` is declared on a line not visible in this listing.
134 static inline __device__
void load(
unsigned int code32[3],
140 asm(LD_NC_V1
" {%0}, [%1 + 0];" :
141 "=r"(code32[0]) :
"l"(p));
142 asm(LD_NC_V1
" {%0}, [%1 + 4];" :
143 "=r"(code32[1]) :
"l"(p));
144 asm(LD_NC_V1
" {%0}, [%1 + 8];" :
145 "=r"(code32[2]) :
"l"(p));
// Four-word (16-byte) loader: one vector `ld.global.cs.v4.u32` fills
// code32[0..3] from the 16 bytes starting at `p`.
// `p` is declared on a line not visible in this listing. A v4.u32 access
// requires a 16-byte-aligned address per the PTX ISA -- the guarantee must
// come from code outside this view; TODO confirm.
// NOTE(review): literal mnemonic, not LD_NC_V4, so no `.nc` path here.
151 static inline __device__
void load(
unsigned int code32[4],
155 asm(
"ld.global.cs.v4.u32 {%0, %1, %2, %3}, [%4];" :
156 "=r"(code32[0]),
"=r"(code32[1]),
157 "=r"(code32[2]),
"=r"(code32[3]) :
"l"(p));
// Five-word (20-byte) loader: five scalar LD_NC_V1 loads at byte offsets
// 0, 4, 8, 12 and 16 from `p` fill code32[0..4] in order.
// Scalar loads are presumably chosen because a 20-byte record only guarantees
// 4-byte alignment -- TODO confirm.
// `p` is declared on a line not visible in this listing.
163 static inline __device__
void load(
unsigned int code32[5],
169 asm(LD_NC_V1
" {%0}, [%1 + 0];" :
170 "=r"(code32[0]) :
"l"(p));
171 asm(LD_NC_V1
" {%0}, [%1 + 4];" :
172 "=r"(code32[1]) :
"l"(p));
173 asm(LD_NC_V1
" {%0}, [%1 + 8];" :
174 "=r"(code32[2]) :
"l"(p));
175 asm(LD_NC_V1
" {%0}, [%1 + 12];" :
176 "=r"(code32[3]) :
"l"(p));
177 asm(LD_NC_V1
" {%0}, [%1 + 16];" :
178 "=r"(code32[4]) :
"l"(p));
// Six-word (24-byte) loader: three 2-wide LD_NC_V2 loads at byte offsets
// 0, 8 and 16 from `p` fill code32[0..5], two words per load.
// Each v2.u32 access requires an 8-byte-aligned address per the PTX ISA;
// the alignment of `p` is established on lines not visible here -- TODO
// confirm.
184 static inline __device__
void load(
unsigned int code32[6],
190 asm(LD_NC_V2
" {%0, %1}, [%2 + 0];" :
191 "=r"(code32[0]),
"=r"(code32[1]) :
"l"(p));
192 asm(LD_NC_V2
" {%0, %1}, [%2 + 8];" :
193 "=r"(code32[2]),
"=r"(code32[3]) :
"l"(p));
194 asm(LD_NC_V2
" {%0, %1}, [%2 + 16];" :
195 "=r"(code32[4]),
"=r"(code32[5]) :
"l"(p));
// Seven-word (28-byte) loader: seven scalar LD_NC_V1 loads at byte offsets
// 0, 4, ..., 24 from `p` fill code32[0..6] in order.
// Scalar loads are presumably chosen because a 28-byte record only guarantees
// 4-byte alignment -- TODO confirm.
// `p` is declared on a line not visible in this listing.
201 static inline __device__
void load(
unsigned int code32[7],
207 asm(LD_NC_V1
" {%0}, [%1 + 0];" :
208 "=r"(code32[0]) :
"l"(p));
209 asm(LD_NC_V1
" {%0}, [%1 + 4];" :
210 "=r"(code32[1]) :
"l"(p));
211 asm(LD_NC_V1
" {%0}, [%1 + 8];" :
212 "=r"(code32[2]) :
"l"(p));
213 asm(LD_NC_V1
" {%0}, [%1 + 12];" :
214 "=r"(code32[3]) :
"l"(p));
215 asm(LD_NC_V1
" {%0}, [%1 + 16];" :
216 "=r"(code32[4]) :
"l"(p));
217 asm(LD_NC_V1
" {%0}, [%1 + 20];" :
218 "=r"(code32[5]) :
"l"(p));
219 asm(LD_NC_V1
" {%0}, [%1 + 24];" :
220 "=r"(code32[6]) :
"l"(p));
// Eight-word (32-byte) loader: two 4-wide LD_NC_V4 loads at byte offsets
// 0 and 16 from `p` fill code32[0..3] and code32[4..7].
// Each v4.u32 access requires a 16-byte-aligned address per the PTX ISA;
// the alignment of `p` is established on lines not visible here -- TODO
// confirm.
226 static inline __device__
void load(
unsigned int code32[8],
232 asm(LD_NC_V4
" {%0, %1, %2, %3}, [%4];" :
233 "=r"(code32[0]),
"=r"(code32[1]),
234 "=r"(code32[2]),
"=r"(code32[3]) :
"l"(p));
235 asm(LD_NC_V4
" {%0, %1, %2, %3}, [%4 + 16];" :
236 "=r"(code32[4]),
"=r"(code32[5]),
237 "=r"(code32[6]),
"=r"(code32[7]) :
"l"(p));
// Ten-word (40-byte) loader: five 2-wide LD_NC_V2 loads at byte offsets
// 0, 8, 16, 24 and 32 from `p` fill code32[0..9], two words per load.
// The 2-wide form is presumably used because a 40-byte record guarantees
// 8-byte but not 16-byte alignment -- TODO confirm.
// `p` is declared on a line not visible in this listing.
243 static inline __device__
void load(
unsigned int code32[10],
249 asm(LD_NC_V2
" {%0, %1}, [%2 + 0];" :
250 "=r"(code32[0]),
"=r"(code32[1]) :
"l"(p));
251 asm(LD_NC_V2
" {%0, %1}, [%2 + 8];" :
252 "=r"(code32[2]),
"=r"(code32[3]) :
"l"(p));
253 asm(LD_NC_V2
" {%0, %1}, [%2 + 16];" :
254 "=r"(code32[4]),
"=r"(code32[5]) :
"l"(p));
255 asm(LD_NC_V2
" {%0, %1}, [%2 + 24];" :
256 "=r"(code32[6]),
"=r"(code32[7]) :
"l"(p));
257 asm(LD_NC_V2
" {%0, %1}, [%2 + 32];" :
258 "=r"(code32[8]),
"=r"(code32[9]) :
"l"(p));
// Twelve-word (48-byte) loader: three 4-wide LD_NC_V4 loads at byte offsets
// 0, 16 and 32 from `p` fill code32[0..11], four words per load.
// Each v4.u32 access requires a 16-byte-aligned address per the PTX ISA;
// the alignment of `p` is established on lines not visible here -- TODO
// confirm.
264 static inline __device__
void load(
unsigned int code32[12],
270 asm(LD_NC_V4
" {%0, %1, %2, %3}, [%4];" :
271 "=r"(code32[0]),
"=r"(code32[1]),
272 "=r"(code32[2]),
"=r"(code32[3]) :
"l"(p));
273 asm(LD_NC_V4
" {%0, %1, %2, %3}, [%4 + 16];" :
274 "=r"(code32[4]),
"=r"(code32[5]),
275 "=r"(code32[6]),
"=r"(code32[7]) :
"l"(p));
276 asm(LD_NC_V4
" {%0, %1, %2, %3}, [%4 + 32];" :
277 "=r"(code32[8]),
"=r"(code32[9]),
278 "=r"(code32[10]),
"=r"(code32[11]) :
"l"(p));
// Fourteen-word (56-byte) loader: seven 2-wide LD_NC_V2 loads at byte
// offsets 0, 8, ..., 48 from `p` fill code32[0..13], two words per load.
// The 2-wide form is presumably used because a 56-byte record guarantees
// 8-byte but not 16-byte alignment -- TODO confirm.
// `p` is declared on a line not visible in this listing.
284 static inline __device__
void load(
unsigned int code32[14],
290 asm(LD_NC_V2
" {%0, %1}, [%2 + 0];" :
291 "=r"(code32[0]),
"=r"(code32[1]) :
"l"(p));
292 asm(LD_NC_V2
" {%0, %1}, [%2 + 8];" :
293 "=r"(code32[2]),
"=r"(code32[3]) :
"l"(p));
294 asm(LD_NC_V2
" {%0, %1}, [%2 + 16];" :
295 "=r"(code32[4]),
"=r"(code32[5]) :
"l"(p));
296 asm(LD_NC_V2
" {%0, %1}, [%2 + 24];" :
297 "=r"(code32[6]),
"=r"(code32[7]) :
"l"(p));
298 asm(LD_NC_V2
" {%0, %1}, [%2 + 32];" :
299 "=r"(code32[8]),
"=r"(code32[9]) :
"l"(p));
300 asm(LD_NC_V2
" {%0, %1}, [%2 + 40];" :
301 "=r"(code32[10]),
"=r"(code32[11]) :
"l"(p));
302 asm(LD_NC_V2
" {%0, %1}, [%2 + 48];" :
303 "=r"(code32[12]),
"=r"(code32[13]) :
"l"(p));
// Sixteen-word (64-byte) loader: four 4-wide LD_NC_V4 loads at byte offsets
// 0, 16, 32 and 48 from `p` fill code32[0..15], four words per load.
// Each v4.u32 access requires a 16-byte-aligned address per the PTX ISA;
// the alignment of `p` is established on lines not visible here -- TODO
// confirm.
309 static inline __device__
void load(
unsigned int code32[16],
315 asm(LD_NC_V4
" {%0, %1, %2, %3}, [%4];" :
316 "=r"(code32[0]),
"=r"(code32[1]),
317 "=r"(code32[2]),
"=r"(code32[3]) :
"l"(p));
318 asm(LD_NC_V4
" {%0, %1, %2, %3}, [%4 + 16];" :
319 "=r"(code32[4]),
"=r"(code32[5]),
320 "=r"(code32[6]),
"=r"(code32[7]) :
"l"(p));
321 asm(LD_NC_V4
" {%0, %1, %2, %3}, [%4 + 32];" :
322 "=r"(code32[8]),
"=r"(code32[9]),
323 "=r"(code32[10]),
"=r"(code32[11]) :
"l"(p));
324 asm(LD_NC_V4
" {%0, %1, %2, %3}, [%4 + 48];" :
325 "=r"(code32[12]),
"=r"(code32[13]),
326 "=r"(code32[14]),
"=r"(code32[15]) :
"l"(p));
// Twenty-four-word (96-byte) loader: six 4-wide LD_NC_V4 loads at byte
// offsets 0, 16, 32, 48, 64 and 80 from `p` fill code32[0..23], four words
// per load.
// Each v4.u32 access requires a 16-byte-aligned address per the PTX ISA;
// the alignment of `p` is established on lines not visible here -- TODO
// confirm.
// NOTE(review): 24 output registers are live at once; the register-pressure
// impact on callers cannot be judged from this view.
332 static inline __device__
void load(
unsigned int code32[24],
338 asm(LD_NC_V4
" {%0, %1, %2, %3}, [%4];" :
339 "=r"(code32[0]),
"=r"(code32[1]),
340 "=r"(code32[2]),
"=r"(code32[3]) :
"l"(p));
341 asm(LD_NC_V4
" {%0, %1, %2, %3}, [%4 + 16];" :
342 "=r"(code32[4]),
"=r"(code32[5]),
343 "=r"(code32[6]),
"=r"(code32[7]) :
"l"(p));
344 asm(LD_NC_V4
" {%0, %1, %2, %3}, [%4 + 32];" :
345 "=r"(code32[8]),
"=r"(code32[9]),
346 "=r"(code32[10]),
"=r"(code32[11]) :
"l"(p));
347 asm(LD_NC_V4
" {%0, %1, %2, %3}, [%4 + 48];" :
348 "=r"(code32[12]),
"=r"(code32[13]),
349 "=r"(code32[14]),
"=r"(code32[15]) :
"l"(p));
350 asm(LD_NC_V4
" {%0, %1, %2, %3}, [%4 + 64];" :
351 "=r"(code32[16]),
"=r"(code32[17]),
352 "=r"(code32[18]),
"=r"(code32[19]) :
"l"(p));
353 asm(LD_NC_V4
" {%0, %1, %2, %3}, [%4 + 80];" :
354 "=r"(code32[20]),
"=r"(code32[21]),
355 "=r"(code32[22]),
"=r"(code32[23]) :
"l"(p));