11 #include "../utils/PtxUtils.cuh"
13 namespace faiss {
namespace gpu {
// PTX load mnemonics used by the multi-word code loaders in this file.
// Exactly one of the two groups must be active: without the #else the
// second group would silently override the first on SM 3.5+, and neither
// group would be defined on older architectures or during the host pass.
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
// SM 3.5+: streaming (.cs) loads routed through the non-coherent (.nc)
// read-only data path.
#define LD_NC_V1 "ld.global.cs.nc.u32"
#define LD_NC_V2 "ld.global.cs.nc.v2.u32"
#define LD_NC_V4 "ld.global.cs.nc.v4.u32"
#else
// Older architectures (and the host compilation pass, where __CUDA_ARCH__
// is undefined): plain streaming global loads.
#define LD_NC_V1 "ld.global.cs.u32"
#define LD_NC_V2 "ld.global.cs.v2.u32"
#define LD_NC_V4 "ld.global.cs.v4.u32"
#endif // __CUDA_ARCH__
// getByte overload whose source value is an 8-bit unsigned char.
34 inline __device__
unsigned int getByte(
unsigned char v,
// getByte overload whose source value is a 16-bit unsigned short.
// The value is widened to unsigned int and the result of
// getBitfield(v, pos, width) is returned.
40 inline __device__
unsigned int getByte(
unsigned short v,
43 return getBitfield((
unsigned int) v, pos, width);
// getByte overload whose source value is an unsigned int.
// Forwards directly to getBitfield(v, pos, width) with no conversion.
46 inline __device__
unsigned int getByte(
unsigned int v,
49 return getBitfield(v, pos, width);
// getByte overload whose source value is an unsigned long.
// Forwards v unchanged to getBitfield(v, pos, width) and returns the result
// as unsigned int.
// NOTE(review): this relies on a getBitfield overload that accepts the wider
// type (presumably declared in PtxUtils.cuh) — confirm.
52 inline __device__
unsigned int getByte(
unsigned long v,
55 return getBitfield(v, pos, width);
58 template <
int NumSubQuantizers>
// Loads a one-byte code into code32[0].
// Issues a single `ld.global.cs.u8` (global load with the streaming cache
// hint) from address p; the destination is a 32-bit register ("=r") and the
// address is passed as a 64-bit register operand ("l").
63 static inline __device__
void load(
unsigned int code32[1],
67 asm(
"ld.global.cs.u8 {%0}, [%1];" :
68 "=r"(code32[0]) :
"l"(p));
// Loads a two-byte code into code32[0].
// Issues a single `ld.global.cs.u16` from address p into a 32-bit register.
// NOTE(review): a u16 load needs p to be 2-byte aligned; the alignment of p
// is not established here — confirm at the call site.
74 static inline __device__
void load(
unsigned int code32[1],
78 asm(
"ld.global.cs.u16 {%0}, [%1];" :
79 "=r"(code32[0]) :
"l"(p));
// Loads a three-byte code into code32[0].
// Three separate `ld.global.cs.u8` loads read the bytes at offsets 0, 1 and
// 2 from the base address, so no alignment beyond one byte is needed.
// The bytes are then packed into one word: a in bits 0-7, b in bits 8-15,
// c in bits 16-23.
// NOTE(review): presumably a, b and c receive the bytes at offsets 0, 1 and
// 2 respectively — confirm against the asm output operands.
85 static inline __device__
void load(
unsigned int code32[1],
95 asm(
"ld.global.cs.u8 {%0}, [%1 + 0];" :
97 asm(
"ld.global.cs.u8 {%0}, [%1 + 1];" :
99 asm(
"ld.global.cs.u8 {%0}, [%1 + 2];" :
104 code32[0] = (c << 16) | (b << 8) | a;
// Loads one 32-bit word from address p into code32[0] with a single
// `ld.global.cs.u32`.
// NOTE(review): the instruction string is hard-coded here rather than using
// LD_NC_V1, so this load never takes the `.nc` variant — confirm intentional.
110 static inline __device__
void load(
unsigned int code32[1],
114 asm(
"ld.global.cs.u32 {%0}, [%1];" :
115 "=r"(code32[0]) :
"l"(p));
// Loads two consecutive 32-bit words (8 bytes) from address p into
// code32[0..1] with one vectorized `ld.global.cs.v2.u32`.
// NOTE(review): a v2.u32 load requires an 8-byte-aligned address; the
// alignment of p is not established here — confirm at the call site.
// NOTE(review): hard-coded instruction string rather than LD_NC_V2, so the
// `.nc` variant is never used for this size — confirm intentional.
121 static inline __device__
void load(
unsigned int code32[2],
125 asm(
"ld.global.cs.v2.u32 {%0, %1}, [%2];" :
126 "=r"(code32[0]),
"=r"(code32[1]) :
"l"(p));
// Loads three consecutive 32-bit words (12 bytes) from address p into
// code32[0..2].
// Uses three scalar LD_NC_V1 loads at byte offsets 0, 4 and 8 rather than a
// vector load, so only 4-byte alignment of p is required.
132 static inline __device__
void load(
unsigned int code32[3],
138 asm(LD_NC_V1
" {%0}, [%1 + 0];" :
139 "=r"(code32[0]) :
"l"(p));
140 asm(LD_NC_V1
" {%0}, [%1 + 4];" :
141 "=r"(code32[1]) :
"l"(p));
142 asm(LD_NC_V1
" {%0}, [%1 + 8];" :
143 "=r"(code32[2]) :
"l"(p));
// Loads four consecutive 32-bit words (16 bytes) from address p into
// code32[0..3] with one vectorized `ld.global.cs.v4.u32`.
// NOTE(review): a v4.u32 load requires a 16-byte-aligned address; the
// alignment of p is not established here — confirm at the call site.
// NOTE(review): hard-coded instruction string rather than LD_NC_V4, so the
// `.nc` variant is never used for this size — confirm intentional.
149 static inline __device__
void load(
unsigned int code32[4],
153 asm(
"ld.global.cs.v4.u32 {%0, %1, %2, %3}, [%4];" :
154 "=r"(code32[0]),
"=r"(code32[1]),
155 "=r"(code32[2]),
"=r"(code32[3]) :
"l"(p));
// Loads five consecutive 32-bit words (20 bytes) from address p into
// code32[0..4].
// Uses five scalar LD_NC_V1 loads at byte offsets 0, 4, 8, 12 and 16, so
// only 4-byte alignment of p is required.
161 static inline __device__
void load(
unsigned int code32[5],
167 asm(LD_NC_V1
" {%0}, [%1 + 0];" :
168 "=r"(code32[0]) :
"l"(p));
169 asm(LD_NC_V1
" {%0}, [%1 + 4];" :
170 "=r"(code32[1]) :
"l"(p));
171 asm(LD_NC_V1
" {%0}, [%1 + 8];" :
172 "=r"(code32[2]) :
"l"(p));
173 asm(LD_NC_V1
" {%0}, [%1 + 12];" :
174 "=r"(code32[3]) :
"l"(p));
175 asm(LD_NC_V1
" {%0}, [%1 + 16];" :
176 "=r"(code32[4]) :
"l"(p));
// Loads six consecutive 32-bit words (24 bytes) from address p into
// code32[0..5].
// Uses three two-word LD_NC_V2 vector loads at byte offsets 0, 8 and 16.
// NOTE(review): each v2 load requires an 8-byte-aligned address; the
// alignment of p is not established here — confirm at the call site.
182 static inline __device__
void load(
unsigned int code32[6],
188 asm(LD_NC_V2
" {%0, %1}, [%2 + 0];" :
189 "=r"(code32[0]),
"=r"(code32[1]) :
"l"(p));
190 asm(LD_NC_V2
" {%0, %1}, [%2 + 8];" :
191 "=r"(code32[2]),
"=r"(code32[3]) :
"l"(p));
192 asm(LD_NC_V2
" {%0, %1}, [%2 + 16];" :
193 "=r"(code32[4]),
"=r"(code32[5]) :
"l"(p));
// Loads seven consecutive 32-bit words (28 bytes) from address p into
// code32[0..6].
// Uses seven scalar LD_NC_V1 loads at byte offsets 0 through 24 in steps of
// 4, so only 4-byte alignment of p is required.
199 static inline __device__
void load(
unsigned int code32[7],
205 asm(LD_NC_V1
" {%0}, [%1 + 0];" :
206 "=r"(code32[0]) :
"l"(p));
207 asm(LD_NC_V1
" {%0}, [%1 + 4];" :
208 "=r"(code32[1]) :
"l"(p));
209 asm(LD_NC_V1
" {%0}, [%1 + 8];" :
210 "=r"(code32[2]) :
"l"(p));
211 asm(LD_NC_V1
" {%0}, [%1 + 12];" :
212 "=r"(code32[3]) :
"l"(p));
213 asm(LD_NC_V1
" {%0}, [%1 + 16];" :
214 "=r"(code32[4]) :
"l"(p));
215 asm(LD_NC_V1
" {%0}, [%1 + 20];" :
216 "=r"(code32[5]) :
"l"(p));
217 asm(LD_NC_V1
" {%0}, [%1 + 24];" :
218 "=r"(code32[6]) :
"l"(p));
// Loads eight consecutive 32-bit words (32 bytes) from address p into
// code32[0..7].
// Uses two four-word LD_NC_V4 vector loads at byte offsets 0 and 16.
// NOTE(review): each v4 load requires a 16-byte-aligned address; the
// alignment of p is not established here — confirm at the call site.
224 static inline __device__
void load(
unsigned int code32[8],
230 asm(LD_NC_V4
" {%0, %1, %2, %3}, [%4];" :
231 "=r"(code32[0]),
"=r"(code32[1]),
232 "=r"(code32[2]),
"=r"(code32[3]) :
"l"(p));
233 asm(LD_NC_V4
" {%0, %1, %2, %3}, [%4 + 16];" :
234 "=r"(code32[4]),
"=r"(code32[5]),
235 "=r"(code32[6]),
"=r"(code32[7]) :
"l"(p));
// Loads ten consecutive 32-bit words (40 bytes) from address p into
// code32[0..9].
// Uses five two-word LD_NC_V2 vector loads at byte offsets 0 through 32 in
// steps of 8.
// NOTE(review): each v2 load requires an 8-byte-aligned address; the
// alignment of p is not established here — confirm at the call site.
241 static inline __device__
void load(
unsigned int code32[10],
247 asm(LD_NC_V2
" {%0, %1}, [%2 + 0];" :
248 "=r"(code32[0]),
"=r"(code32[1]) :
"l"(p));
249 asm(LD_NC_V2
" {%0, %1}, [%2 + 8];" :
250 "=r"(code32[2]),
"=r"(code32[3]) :
"l"(p));
251 asm(LD_NC_V2
" {%0, %1}, [%2 + 16];" :
252 "=r"(code32[4]),
"=r"(code32[5]) :
"l"(p));
253 asm(LD_NC_V2
" {%0, %1}, [%2 + 24];" :
254 "=r"(code32[6]),
"=r"(code32[7]) :
"l"(p));
255 asm(LD_NC_V2
" {%0, %1}, [%2 + 32];" :
256 "=r"(code32[8]),
"=r"(code32[9]) :
"l"(p));
// Loads twelve consecutive 32-bit words (48 bytes) from address p into
// code32[0..11].
// Uses three four-word LD_NC_V4 vector loads at byte offsets 0, 16 and 32.
// NOTE(review): each v4 load requires a 16-byte-aligned address; the
// alignment of p is not established here — confirm at the call site.
262 static inline __device__
void load(
unsigned int code32[12],
268 asm(LD_NC_V4
" {%0, %1, %2, %3}, [%4];" :
269 "=r"(code32[0]),
"=r"(code32[1]),
270 "=r"(code32[2]),
"=r"(code32[3]) :
"l"(p));
271 asm(LD_NC_V4
" {%0, %1, %2, %3}, [%4 + 16];" :
272 "=r"(code32[4]),
"=r"(code32[5]),
273 "=r"(code32[6]),
"=r"(code32[7]) :
"l"(p));
274 asm(LD_NC_V4
" {%0, %1, %2, %3}, [%4 + 32];" :
275 "=r"(code32[8]),
"=r"(code32[9]),
276 "=r"(code32[10]),
"=r"(code32[11]) :
"l"(p));
// Loads fourteen consecutive 32-bit words (56 bytes) from address p into
// code32[0..13].
// Uses seven two-word LD_NC_V2 vector loads at byte offsets 0 through 48 in
// steps of 8.
// NOTE(review): each v2 load requires an 8-byte-aligned address; the
// alignment of p is not established here — confirm at the call site.
282 static inline __device__
void load(
unsigned int code32[14],
288 asm(LD_NC_V2
" {%0, %1}, [%2 + 0];" :
289 "=r"(code32[0]),
"=r"(code32[1]) :
"l"(p));
290 asm(LD_NC_V2
" {%0, %1}, [%2 + 8];" :
291 "=r"(code32[2]),
"=r"(code32[3]) :
"l"(p));
292 asm(LD_NC_V2
" {%0, %1}, [%2 + 16];" :
293 "=r"(code32[4]),
"=r"(code32[5]) :
"l"(p));
294 asm(LD_NC_V2
" {%0, %1}, [%2 + 24];" :
295 "=r"(code32[6]),
"=r"(code32[7]) :
"l"(p));
296 asm(LD_NC_V2
" {%0, %1}, [%2 + 32];" :
297 "=r"(code32[8]),
"=r"(code32[9]) :
"l"(p));
298 asm(LD_NC_V2
" {%0, %1}, [%2 + 40];" :
299 "=r"(code32[10]),
"=r"(code32[11]) :
"l"(p));
300 asm(LD_NC_V2
" {%0, %1}, [%2 + 48];" :
301 "=r"(code32[12]),
"=r"(code32[13]) :
"l"(p));
// Loads sixteen consecutive 32-bit words (64 bytes) from address p into
// code32[0..15].
// Uses four four-word LD_NC_V4 vector loads at byte offsets 0, 16, 32 and 48.
// NOTE(review): each v4 load requires a 16-byte-aligned address; the
// alignment of p is not established here — confirm at the call site.
307 static inline __device__
void load(
unsigned int code32[16],
313 asm(LD_NC_V4
" {%0, %1, %2, %3}, [%4];" :
314 "=r"(code32[0]),
"=r"(code32[1]),
315 "=r"(code32[2]),
"=r"(code32[3]) :
"l"(p));
316 asm(LD_NC_V4
" {%0, %1, %2, %3}, [%4 + 16];" :
317 "=r"(code32[4]),
"=r"(code32[5]),
318 "=r"(code32[6]),
"=r"(code32[7]) :
"l"(p));
319 asm(LD_NC_V4
" {%0, %1, %2, %3}, [%4 + 32];" :
320 "=r"(code32[8]),
"=r"(code32[9]),
321 "=r"(code32[10]),
"=r"(code32[11]) :
"l"(p));
322 asm(LD_NC_V4
" {%0, %1, %2, %3}, [%4 + 48];" :
323 "=r"(code32[12]),
"=r"(code32[13]),
324 "=r"(code32[14]),
"=r"(code32[15]) :
"l"(p));
// Loads twenty-four consecutive 32-bit words (96 bytes) from address p into
// code32[0..23].
// Uses six four-word LD_NC_V4 vector loads at byte offsets 0 through 80 in
// steps of 16.
// NOTE(review): each v4 load requires a 16-byte-aligned address; the
// alignment of p is not established here — confirm at the call site.
330 static inline __device__
void load(
unsigned int code32[24],
336 asm(LD_NC_V4
" {%0, %1, %2, %3}, [%4];" :
337 "=r"(code32[0]),
"=r"(code32[1]),
338 "=r"(code32[2]),
"=r"(code32[3]) :
"l"(p));
339 asm(LD_NC_V4
" {%0, %1, %2, %3}, [%4 + 16];" :
340 "=r"(code32[4]),
"=r"(code32[5]),
341 "=r"(code32[6]),
"=r"(code32[7]) :
"l"(p));
342 asm(LD_NC_V4
" {%0, %1, %2, %3}, [%4 + 32];" :
343 "=r"(code32[8]),
"=r"(code32[9]),
344 "=r"(code32[10]),
"=r"(code32[11]) :
"l"(p));
345 asm(LD_NC_V4
" {%0, %1, %2, %3}, [%4 + 48];" :
346 "=r"(code32[12]),
"=r"(code32[13]),
347 "=r"(code32[14]),
"=r"(code32[15]) :
"l"(p));
348 asm(LD_NC_V4
" {%0, %1, %2, %3}, [%4 + 64];" :
349 "=r"(code32[16]),
"=r"(code32[17]),
350 "=r"(code32[18]),
"=r"(code32[19]) :
"l"(p));
351 asm(LD_NC_V4
" {%0, %1, %2, %3}, [%4 + 80];" :
352 "=r"(code32[20]),
"=r"(code32[21]),
353 "=r"(code32[22]),
"=r"(code32[23]) :
"l"(p));