12 #include "../utils/PtxUtils.cuh"
14 namespace faiss {
namespace gpu {
// PTX load mnemonics spliced into the inline-asm strings of the loaders in
// this file (string-literal concatenation, e.g. `asm(LD_NC_V1 " {%0}, ...")`).
//
// For device compilation targeting compute capability 3.5 or newer, select
// the `.nc` (non-coherent, read-only data path) form of the streaming (`.cs`)
// global load. Otherwise fall back to the plain `.cs` global load. The two
// sets are mutually exclusive, hence the #else: without it every macro would
// be defined twice on the >= 350 path and not at all on the other.
#if __CUDA_ARCH__ >= 350
#define LD_NC_V1 "ld.global.cs.nc.u32"
#define LD_NC_V2 "ld.global.cs.nc.v2.u32"
#define LD_NC_V4 "ld.global.cs.nc.v4.u32"
#else
#define LD_NC_V1 "ld.global.cs.u32"
#define LD_NC_V2 "ld.global.cs.v2.u32"
#define LD_NC_V4 "ld.global.cs.v4.u32"
#endif // __CUDA_ARCH__
// getByte overload taking an unsigned char value and returning unsigned int.
// NOTE(review): the remaining parameters and the body are not visible in this
// excerpt, so what is returned for the 8-bit case cannot be stated here.
35 inline __device__
unsigned int getByte(
unsigned char v,
// getByte overload for an unsigned short value: widens v to unsigned int and
// forwards it, together with pos and width, to getBitfield.
// NOTE(review): pos and width are presumably parameters declared on lines not
// visible in this excerpt — confirm.
41 inline __device__
unsigned int getByte(
unsigned short v,
44 return getBitfield((
unsigned int) v, pos, width);
// getByte overload for an unsigned int value: forwards v, pos and width to
// getBitfield unchanged and returns its result.
// NOTE(review): pos and width are presumably parameters declared on lines not
// visible in this excerpt — confirm.
47 inline __device__
unsigned int getByte(
unsigned int v,
50 return getBitfield(v, pos, width);
// getByte overload for an unsigned long value: forwards v, pos and width to
// getBitfield and returns the result as unsigned int.
// NOTE(review): which getBitfield overload accepts unsigned long, and whether
// its result is narrowed on return, is not visible here — confirm.
53 inline __device__
unsigned int getByte(
unsigned long v,
56 return getBitfield(v, pos, width);
// Template parameter list with a single non-type parameter, the integer
// NumSubQuantizers. The declaration it introduces is not visible in this
// excerpt.
59 template <
int NumSubQuantizers>
// load() variant filling a one-word code buffer from a single byte.
// Issues one `ld.global.cs.u8` from address p; the result is written to the
// 32-bit register bound to code32[0].
// NOTE(review): the declaration of p and any offset arithmetic applied to it
// are not visible in this excerpt.
64 static inline __device__
void load(
unsigned int code32[1],
68 asm(
"ld.global.cs.u8 {%0}, [%1];" :
69 "=r"(code32[0]) :
"l"(p));
// load() variant filling a one-word code buffer from a 16-bit value.
// Issues one `ld.global.cs.u16` from address p into code32[0].
// NOTE(review): a u16 load needs a 2-byte-aligned address per the PTX rules;
// how p is derived is not visible in this excerpt — confirm alignment.
75 static inline __device__
void load(
unsigned int code32[1],
79 asm(
"ld.global.cs.u16 {%0}, [%1];" :
80 "=r"(code32[0]) :
"l"(p));
// load() variant filling a one-word code buffer from three separate bytes.
// Three `ld.global.cs.u8` loads read byte offsets +0, +1 and +2 from the
// address operand; the values are then packed into code32[0].
// NOTE(review): the asm operand lists and the declarations of a, b and c are
// not visible in this excerpt; presumably a, b, c receive the bytes at
// offsets 0, 1, 2 respectively — confirm.
86 static inline __device__
void load(
unsigned int code32[1],
// Byte at offset 0.
96 asm(
"ld.global.cs.u8 {%0}, [%1 + 0];" :
// Byte at offset 1.
98 asm(
"ld.global.cs.u8 {%0}, [%1 + 1];" :
// Byte at offset 2.
100 asm(
"ld.global.cs.u8 {%0}, [%1 + 2];" :
// Pack: a stays in the low bits, b is shifted up by 8, c by 16.
105 code32[0] = (c << 16) | (b << 8) | a;
// load() variant filling a one-word code buffer with a single 32-bit load.
// Issues one `ld.global.cs.u32` from address p into code32[0].
// NOTE(review): this spells the mnemonic literally instead of using LD_NC_V1
// as the multi-load variants in this file do, so it never takes the `.nc`
// path — confirm whether that is intentional.
111 static inline __device__
void load(
unsigned int code32[1],
115 asm(
"ld.global.cs.u32 {%0}, [%1];" :
116 "=r"(code32[0]) :
"l"(p));
// load() variant filling a two-word code buffer with one vectorized load.
// Issues one `ld.global.cs.v2.u32` from address p into code32[0..1].
// NOTE(review): this spells the mnemonic literally instead of using LD_NC_V2
// as other variants in this file do — confirm whether that is intentional.
// A v2.u32 load needs an 8-byte-aligned address per the PTX rules; the
// derivation of p is not visible here.
122 static inline __device__
void load(
unsigned int code32[2],
126 asm(
"ld.global.cs.v2.u32 {%0, %1}, [%2];" :
127 "=r"(code32[0]),
"=r"(code32[1]) :
"l"(p));
// load() variant filling a three-word code buffer.
// Uses three scalar 32-bit loads (LD_NC_V1) at byte offsets 0, 4 and 8 from
// p, one per element of code32. LD_NC_V1 expands to a PTX mnemonic string
// that is concatenated with the operand template that follows it.
133 static inline __device__
void load(
unsigned int code32[3],
// Word 0 <- bytes [0, 4).
139 asm(LD_NC_V1
" {%0}, [%1 + 0];" :
140 "=r"(code32[0]) :
"l"(p));
// Word 1 <- bytes [4, 8).
141 asm(LD_NC_V1
" {%0}, [%1 + 4];" :
142 "=r"(code32[1]) :
"l"(p));
// Word 2 <- bytes [8, 12).
143 asm(LD_NC_V1
" {%0}, [%1 + 8];" :
144 "=r"(code32[2]) :
"l"(p));
// load() variant filling a four-word code buffer with one vectorized load.
// Issues one `ld.global.cs.v4.u32` from address p into code32[0..3].
// NOTE(review): this spells the mnemonic literally instead of using LD_NC_V4
// as other variants in this file do — confirm whether that is intentional.
// A v4.u32 load needs a 16-byte-aligned address per the PTX rules; the
// derivation of p is not visible here.
150 static inline __device__
void load(
unsigned int code32[4],
154 asm(
"ld.global.cs.v4.u32 {%0, %1, %2, %3}, [%4];" :
155 "=r"(code32[0]),
"=r"(code32[1]),
156 "=r"(code32[2]),
"=r"(code32[3]) :
"l"(p));
// load() variant filling a five-word code buffer.
// Uses five scalar 32-bit loads (LD_NC_V1) at byte offsets 0, 4, 8, 12 and
// 16 from p, writing code32[0] through code32[4] in order.
162 static inline __device__
void load(
unsigned int code32[5],
// Word 0 <- offset 0.
168 asm(LD_NC_V1
" {%0}, [%1 + 0];" :
169 "=r"(code32[0]) :
"l"(p));
// Word 1 <- offset 4.
170 asm(LD_NC_V1
" {%0}, [%1 + 4];" :
171 "=r"(code32[1]) :
"l"(p));
// Word 2 <- offset 8.
172 asm(LD_NC_V1
" {%0}, [%1 + 8];" :
173 "=r"(code32[2]) :
"l"(p));
// Word 3 <- offset 12.
174 asm(LD_NC_V1
" {%0}, [%1 + 12];" :
175 "=r"(code32[3]) :
"l"(p));
// Word 4 <- offset 16.
176 asm(LD_NC_V1
" {%0}, [%1 + 16];" :
177 "=r"(code32[4]) :
"l"(p));
// load() variant filling a six-word code buffer.
// Uses three two-word vector loads (LD_NC_V2) at byte offsets 0, 8 and 16
// from p; each load fills a consecutive pair of code32 elements.
183 static inline __device__
void load(
unsigned int code32[6],
// Words 0-1 <- offset 0.
189 asm(LD_NC_V2
" {%0, %1}, [%2 + 0];" :
190 "=r"(code32[0]),
"=r"(code32[1]) :
"l"(p));
// Words 2-3 <- offset 8.
191 asm(LD_NC_V2
" {%0, %1}, [%2 + 8];" :
192 "=r"(code32[2]),
"=r"(code32[3]) :
"l"(p));
// Words 4-5 <- offset 16.
193 asm(LD_NC_V2
" {%0, %1}, [%2 + 16];" :
194 "=r"(code32[4]),
"=r"(code32[5]) :
"l"(p));
// load() variant filling a seven-word code buffer.
// Uses seven scalar 32-bit loads (LD_NC_V1) at byte offsets 0, 4, ..., 24
// from p, writing code32[0] through code32[6] in order.
200 static inline __device__
void load(
unsigned int code32[7],
// Word 0 <- offset 0.
206 asm(LD_NC_V1
" {%0}, [%1 + 0];" :
207 "=r"(code32[0]) :
"l"(p));
// Word 1 <- offset 4.
208 asm(LD_NC_V1
" {%0}, [%1 + 4];" :
209 "=r"(code32[1]) :
"l"(p));
// Word 2 <- offset 8.
210 asm(LD_NC_V1
" {%0}, [%1 + 8];" :
211 "=r"(code32[2]) :
"l"(p));
// Word 3 <- offset 12.
212 asm(LD_NC_V1
" {%0}, [%1 + 12];" :
213 "=r"(code32[3]) :
"l"(p));
// Word 4 <- offset 16.
214 asm(LD_NC_V1
" {%0}, [%1 + 16];" :
215 "=r"(code32[4]) :
"l"(p));
// Word 5 <- offset 20.
216 asm(LD_NC_V1
" {%0}, [%1 + 20];" :
217 "=r"(code32[5]) :
"l"(p));
// Word 6 <- offset 24.
218 asm(LD_NC_V1
" {%0}, [%1 + 24];" :
219 "=r"(code32[6]) :
"l"(p));
// load() variant filling an eight-word code buffer.
// Uses two four-word vector loads (LD_NC_V4) at byte offsets 0 and 16 from
// p; each load fills four consecutive code32 elements.
225 static inline __device__
void load(
unsigned int code32[8],
// Words 0-3 <- offset 0.
231 asm(LD_NC_V4
" {%0, %1, %2, %3}, [%4];" :
232 "=r"(code32[0]),
"=r"(code32[1]),
233 "=r"(code32[2]),
"=r"(code32[3]) :
"l"(p));
// Words 4-7 <- offset 16.
234 asm(LD_NC_V4
" {%0, %1, %2, %3}, [%4 + 16];" :
235 "=r"(code32[4]),
"=r"(code32[5]),
236 "=r"(code32[6]),
"=r"(code32[7]) :
"l"(p));
// load() variant filling a ten-word code buffer.
// Uses five two-word vector loads (LD_NC_V2) at byte offsets 0, 8, 16, 24
// and 32 from p; each load fills a consecutive pair of code32 elements.
242 static inline __device__
void load(
unsigned int code32[10],
// Words 0-1 <- offset 0.
248 asm(LD_NC_V2
" {%0, %1}, [%2 + 0];" :
249 "=r"(code32[0]),
"=r"(code32[1]) :
"l"(p));
// Words 2-3 <- offset 8.
250 asm(LD_NC_V2
" {%0, %1}, [%2 + 8];" :
251 "=r"(code32[2]),
"=r"(code32[3]) :
"l"(p));
// Words 4-5 <- offset 16.
252 asm(LD_NC_V2
" {%0, %1}, [%2 + 16];" :
253 "=r"(code32[4]),
"=r"(code32[5]) :
"l"(p));
// Words 6-7 <- offset 24.
254 asm(LD_NC_V2
" {%0, %1}, [%2 + 24];" :
255 "=r"(code32[6]),
"=r"(code32[7]) :
"l"(p));
// Words 8-9 <- offset 32.
256 asm(LD_NC_V2
" {%0, %1}, [%2 + 32];" :
257 "=r"(code32[8]),
"=r"(code32[9]) :
"l"(p));
// load() variant filling a twelve-word code buffer.
// Uses three four-word vector loads (LD_NC_V4) at byte offsets 0, 16 and 32
// from p; each load fills four consecutive code32 elements.
263 static inline __device__
void load(
unsigned int code32[12],
// Words 0-3 <- offset 0.
269 asm(LD_NC_V4
" {%0, %1, %2, %3}, [%4];" :
270 "=r"(code32[0]),
"=r"(code32[1]),
271 "=r"(code32[2]),
"=r"(code32[3]) :
"l"(p));
// Words 4-7 <- offset 16.
272 asm(LD_NC_V4
" {%0, %1, %2, %3}, [%4 + 16];" :
273 "=r"(code32[4]),
"=r"(code32[5]),
274 "=r"(code32[6]),
"=r"(code32[7]) :
"l"(p));
// Words 8-11 <- offset 32.
275 asm(LD_NC_V4
" {%0, %1, %2, %3}, [%4 + 32];" :
276 "=r"(code32[8]),
"=r"(code32[9]),
277 "=r"(code32[10]),
"=r"(code32[11]) :
"l"(p));
// load() variant filling a fourteen-word code buffer.
// Uses seven two-word vector loads (LD_NC_V2) at byte offsets 0, 8, ..., 48
// from p; each load fills a consecutive pair of code32 elements.
283 static inline __device__
void load(
unsigned int code32[14],
// Words 0-1 <- offset 0.
289 asm(LD_NC_V2
" {%0, %1}, [%2 + 0];" :
290 "=r"(code32[0]),
"=r"(code32[1]) :
"l"(p));
// Words 2-3 <- offset 8.
291 asm(LD_NC_V2
" {%0, %1}, [%2 + 8];" :
292 "=r"(code32[2]),
"=r"(code32[3]) :
"l"(p));
// Words 4-5 <- offset 16.
293 asm(LD_NC_V2
" {%0, %1}, [%2 + 16];" :
294 "=r"(code32[4]),
"=r"(code32[5]) :
"l"(p));
// Words 6-7 <- offset 24.
295 asm(LD_NC_V2
" {%0, %1}, [%2 + 24];" :
296 "=r"(code32[6]),
"=r"(code32[7]) :
"l"(p));
// Words 8-9 <- offset 32.
297 asm(LD_NC_V2
" {%0, %1}, [%2 + 32];" :
298 "=r"(code32[8]),
"=r"(code32[9]) :
"l"(p));
// Words 10-11 <- offset 40.
299 asm(LD_NC_V2
" {%0, %1}, [%2 + 40];" :
300 "=r"(code32[10]),
"=r"(code32[11]) :
"l"(p));
// Words 12-13 <- offset 48.
301 asm(LD_NC_V2
" {%0, %1}, [%2 + 48];" :
302 "=r"(code32[12]),
"=r"(code32[13]) :
"l"(p));
// load() variant filling a sixteen-word code buffer.
// Uses four four-word vector loads (LD_NC_V4) at byte offsets 0, 16, 32 and
// 48 from p; each load fills four consecutive code32 elements.
308 static inline __device__
void load(
unsigned int code32[16],
// Words 0-3 <- offset 0.
314 asm(LD_NC_V4
" {%0, %1, %2, %3}, [%4];" :
315 "=r"(code32[0]),
"=r"(code32[1]),
316 "=r"(code32[2]),
"=r"(code32[3]) :
"l"(p));
// Words 4-7 <- offset 16.
317 asm(LD_NC_V4
" {%0, %1, %2, %3}, [%4 + 16];" :
318 "=r"(code32[4]),
"=r"(code32[5]),
319 "=r"(code32[6]),
"=r"(code32[7]) :
"l"(p));
// Words 8-11 <- offset 32.
320 asm(LD_NC_V4
" {%0, %1, %2, %3}, [%4 + 32];" :
321 "=r"(code32[8]),
"=r"(code32[9]),
322 "=r"(code32[10]),
"=r"(code32[11]) :
"l"(p));
// Words 12-15 <- offset 48.
323 asm(LD_NC_V4
" {%0, %1, %2, %3}, [%4 + 48];" :
324 "=r"(code32[12]),
"=r"(code32[13]),
325 "=r"(code32[14]),
"=r"(code32[15]) :
"l"(p));
// load() variant filling a twenty-four-word code buffer.
// Uses six four-word vector loads (LD_NC_V4) at byte offsets 0, 16, ..., 80
// from p; each load fills four consecutive code32 elements.
331 static inline __device__
void load(
unsigned int code32[24],
// Words 0-3 <- offset 0.
337 asm(LD_NC_V4
" {%0, %1, %2, %3}, [%4];" :
338 "=r"(code32[0]),
"=r"(code32[1]),
339 "=r"(code32[2]),
"=r"(code32[3]) :
"l"(p));
// Words 4-7 <- offset 16.
340 asm(LD_NC_V4
" {%0, %1, %2, %3}, [%4 + 16];" :
341 "=r"(code32[4]),
"=r"(code32[5]),
342 "=r"(code32[6]),
"=r"(code32[7]) :
"l"(p));
// Words 8-11 <- offset 32.
343 asm(LD_NC_V4
" {%0, %1, %2, %3}, [%4 + 32];" :
344 "=r"(code32[8]),
"=r"(code32[9]),
345 "=r"(code32[10]),
"=r"(code32[11]) :
"l"(p));
// Words 12-15 <- offset 48.
346 asm(LD_NC_V4
" {%0, %1, %2, %3}, [%4 + 48];" :
347 "=r"(code32[12]),
"=r"(code32[13]),
348 "=r"(code32[14]),
"=r"(code32[15]) :
"l"(p));
// Words 16-19 <- offset 64.
349 asm(LD_NC_V4
" {%0, %1, %2, %3}, [%4 + 64];" :
350 "=r"(code32[16]),
"=r"(code32[17]),
351 "=r"(code32[18]),
"=r"(code32[19]) :
"l"(p));
// Words 20-23 <- offset 80.
352 asm(LD_NC_V4
" {%0, %1, %2, %3}, [%4 + 80];" :
353 "=r"(code32[20]),
"=r"(code32[21]),
354 "=r"(code32[22]),
"=r"(code32[23]) :
"l"(p));