14 #include "../utils/PtxUtils.cuh"
16 namespace faiss {
namespace gpu {
// getByte overload for an unsigned char value `v`; returns unsigned int.
25 inline __device__
unsigned int getByte(
unsigned char v,
// getByte overload for an unsigned short value `v`.
// Widens `v` to unsigned int, forwards it with `pos` and `width` to
// getBitfield, and returns that result.
// NOTE(review): getBitfield is not defined in this file (presumably it
// comes from PtxUtils.cuh) -- confirm its exact extraction semantics there.
31 inline __device__
unsigned int getByte(
unsigned short v,
34 return getBitfield((
unsigned int) v, pos, width);
// getByte overload for an unsigned int value `v`.
// Forwards `v`, `pos` and `width` unchanged to getBitfield and returns
// its result.
// NOTE(review): getBitfield is not defined in this file (presumably it
// comes from PtxUtils.cuh) -- confirm its exact extraction semantics there.
37 inline __device__
unsigned int getByte(
unsigned int v,
40 return getBitfield(v, pos, width);
// getByte overload for an unsigned long value `v`.
// Forwards `v`, `pos` and `width` to getBitfield; the declared return
// type is unsigned int.
// NOTE(review): this relies on getBitfield accepting an unsigned long
// argument (a dedicated overload or an implicit conversion) -- confirm
// which one applies, since a narrowing conversion would drop high bits.
43 inline __device__
unsigned int getByte(
unsigned long v,
46 return getBitfield(v, pos, width);
// Template header with a single compile-time int parameter.
// NOTE(review): presumably NumSubQuantizers selects which of the `load`
// variants below is used -- confirm against the declaration this header
// introduces.
49 template <
int NumSubQuantizers>
// Loads a single byte from address `p` into code32[0].
54 static inline __device__
void load(
unsigned int code32[1],
// One PTX 8-bit load from the global state space. `.cs` is the
// cache-streaming hint (data likely to be accessed once). The result is
// written to a 32-bit register operand ("=r"); the address is passed as
// a 64-bit register operand ("l").
58 asm(
"ld.global.cs.u8 {%0}, [%1];" :
59 "=r"(code32[0]) :
"l"(p));
// Loads a single 16-bit value from address `p` into code32[0].
65 static inline __device__
void load(
unsigned int code32[1],
// One PTX 16-bit load from the global state space with the `.cs`
// (cache-streaming) hint, written to a 32-bit register operand.
// NOTE(review): PTX requires the address to be aligned to the access
// size (2 bytes here) -- confirm callers guarantee this.
69 asm(
"ld.global.cs.u16 {%0}, [%1];" :
70 "=r"(code32[0]) :
"l"(p));
// Loads three consecutive bytes with three separate 8-bit global loads
// (address offsets +0, +1, +2) and packs them into code32[0].
76 static inline __device__
void load(
unsigned int code32[1],
// Byte at offset 0.
86 asm(
"ld.global.cs.u8 {%0}, [%1 + 0];" :
// Byte at offset 1.
88 asm(
"ld.global.cs.u8 {%0}, [%1 + 1];" :
// Byte at offset 2.
90 asm(
"ld.global.cs.u8 {%0}, [%1 + 2];" :
// Pack the three values into one word: `a` in bits 0-7, `b` shifted to
// bits 8-15, `c` shifted to bits 16-23.
// NOTE(review): presumably a, b and c receive the bytes at offsets 0, 1
// and 2 respectively -- confirm against the asm output operands.
95 code32[0] = (c << 16) | (b << 8) | a;
// Loads a single 32-bit word from address `p` into code32[0].
101 static inline __device__
void load(
unsigned int code32[1],
// One PTX 32-bit load from the global state space with the `.cs`
// (cache-streaming) hint.
// NOTE(review): PTX requires the address to be 4-byte aligned for this
// access -- confirm callers guarantee this.
105 asm(
"ld.global.cs.u32 {%0}, [%1];" :
106 "=r"(code32[0]) :
"l"(p));
// Loads two 32-bit words (8 bytes) from address `p` into code32[0..1].
112 static inline __device__
void load(
unsigned int code32[2],
// A single vectorized PTX load (`.v2.u32`) fills both words at once.
// NOTE(review): PTX requires the address to be aligned to the full
// 8-byte access -- confirm callers guarantee this.
116 asm(
"ld.global.cs.v2.u32 {%0, %1}, [%2];" :
117 "=r"(code32[0]),
"=r"(code32[1]) :
"l"(p));
// Loads three 32-bit words (12 bytes) from address `p` into code32[0..2]
// using three scalar 32-bit loads at byte offsets 0, 4 and 8.
// Each load uses `.cs` (cache-streaming hint) and `.nc` (non-coherent
// cache path).
// NOTE(review): `.nc` is only safe if this memory is not written while
// the kernel runs -- confirm against callers.
123 static inline __device__
void load(
unsigned int code32[3],
129 asm(
"ld.global.cs.nc.u32 {%0}, [%1 + 0];" :
130 "=r"(code32[0]) :
"l"(p));
131 asm(
"ld.global.cs.nc.u32 {%0}, [%1 + 4];" :
132 "=r"(code32[1]) :
"l"(p));
133 asm(
"ld.global.cs.nc.u32 {%0}, [%1 + 8];" :
134 "=r"(code32[2]) :
"l"(p));
// Loads four 32-bit words (16 bytes) from address `p` into code32[0..3].
140 static inline __device__
void load(
unsigned int code32[4],
// A single vectorized PTX load (`.v4.u32`) with the `.cs`
// (cache-streaming) hint fills all four words at once.
// NOTE(review): PTX requires the address to be aligned to the full
// 16-byte access -- confirm callers guarantee this.
144 asm(
"ld.global.cs.v4.u32 {%0, %1, %2, %3}, [%4];" :
145 "=r"(code32[0]),
"=r"(code32[1]),
146 "=r"(code32[2]),
"=r"(code32[3]) :
"l"(p));
// Loads five 32-bit words (20 bytes) from address `p` into code32[0..4]
// using five scalar 32-bit loads at byte offsets 0, 4, 8, 12 and 16.
// Each load uses `.cs` (cache-streaming hint) and `.nc` (non-coherent
// cache path).
// NOTE(review): `.nc` is only safe if this memory is not written while
// the kernel runs -- confirm against callers.
152 static inline __device__
void load(
unsigned int code32[5],
158 asm(
"ld.global.cs.nc.u32 {%0}, [%1 + 0];" :
159 "=r"(code32[0]) :
"l"(p));
160 asm(
"ld.global.cs.nc.u32 {%0}, [%1 + 4];" :
161 "=r"(code32[1]) :
"l"(p));
162 asm(
"ld.global.cs.nc.u32 {%0}, [%1 + 8];" :
163 "=r"(code32[2]) :
"l"(p));
164 asm(
"ld.global.cs.nc.u32 {%0}, [%1 + 12];" :
165 "=r"(code32[3]) :
"l"(p));
166 asm(
"ld.global.cs.nc.u32 {%0}, [%1 + 16];" :
167 "=r"(code32[4]) :
"l"(p));
// Loads six 32-bit words (24 bytes) from address `p` into code32[0..5]
// using three 2-wide vector loads (`.v2.u32`) at byte offsets 0, 8, 16.
// Each load uses `.cs` (cache-streaming hint) and `.nc` (non-coherent
// cache path).
// NOTE(review): PTX requires 8-byte alignment for each v2.u32 access, and
// `.nc` is only safe if the memory is not written while the kernel runs
// -- confirm both against callers.
173 static inline __device__
void load(
unsigned int code32[6],
179 asm(
"ld.global.cs.nc.v2.u32 {%0, %1}, [%2 + 0];" :
180 "=r"(code32[0]),
"=r"(code32[1]) :
"l"(p));
181 asm(
"ld.global.cs.nc.v2.u32 {%0, %1}, [%2 + 8];" :
182 "=r"(code32[2]),
"=r"(code32[3]) :
"l"(p));
183 asm(
"ld.global.cs.nc.v2.u32 {%0, %1}, [%2 + 16];" :
184 "=r"(code32[4]),
"=r"(code32[5]) :
"l"(p));
// Loads seven 32-bit words (28 bytes) from address `p` into code32[0..6]
// using seven scalar 32-bit loads at byte offsets 0, 4, ..., 24.
// Each load uses `.cs` (cache-streaming hint) and `.nc` (non-coherent
// cache path).
// NOTE(review): `.nc` is only safe if this memory is not written while
// the kernel runs -- confirm against callers.
190 static inline __device__
void load(
unsigned int code32[7],
196 asm(
"ld.global.cs.nc.u32 {%0}, [%1 + 0];" :
197 "=r"(code32[0]) :
"l"(p));
198 asm(
"ld.global.cs.nc.u32 {%0}, [%1 + 4];" :
199 "=r"(code32[1]) :
"l"(p));
200 asm(
"ld.global.cs.nc.u32 {%0}, [%1 + 8];" :
201 "=r"(code32[2]) :
"l"(p));
202 asm(
"ld.global.cs.nc.u32 {%0}, [%1 + 12];" :
203 "=r"(code32[3]) :
"l"(p));
204 asm(
"ld.global.cs.nc.u32 {%0}, [%1 + 16];" :
205 "=r"(code32[4]) :
"l"(p));
206 asm(
"ld.global.cs.nc.u32 {%0}, [%1 + 20];" :
207 "=r"(code32[5]) :
"l"(p));
208 asm(
"ld.global.cs.nc.u32 {%0}, [%1 + 24];" :
209 "=r"(code32[6]) :
"l"(p));
// Loads eight 32-bit words (32 bytes) from address `p` into code32[0..7]
// using two 4-wide vector loads (`.v4.u32`) at byte offsets 0 and 16.
// Each load uses `.cs` (cache-streaming hint) and `.nc` (non-coherent
// cache path).
// NOTE(review): PTX requires 16-byte alignment for each v4.u32 access,
// and `.nc` is only safe if the memory is not written while the kernel
// runs -- confirm both against callers.
215 static inline __device__
void load(
unsigned int code32[8],
221 asm(
"ld.global.cs.nc.v4.u32 {%0, %1, %2, %3}, [%4];" :
222 "=r"(code32[0]),
"=r"(code32[1]),
223 "=r"(code32[2]),
"=r"(code32[3]) :
"l"(p));
224 asm(
"ld.global.cs.nc.v4.u32 {%0, %1, %2, %3}, [%4 + 16];" :
225 "=r"(code32[4]),
"=r"(code32[5]),
226 "=r"(code32[6]),
"=r"(code32[7]) :
"l"(p));
// Loads ten 32-bit words (40 bytes) from address `p` into code32[0..9]
// using five 2-wide vector loads (`.v2.u32`) at byte offsets
// 0, 8, 16, 24 and 32.
// Each load uses `.cs` (cache-streaming hint) and `.nc` (non-coherent
// cache path).
// NOTE(review): PTX requires 8-byte alignment for each v2.u32 access, and
// `.nc` is only safe if the memory is not written while the kernel runs
// -- confirm both against callers.
232 static inline __device__
void load(
unsigned int code32[10],
238 asm(
"ld.global.cs.nc.v2.u32 {%0, %1}, [%2 + 0];" :
239 "=r"(code32[0]),
"=r"(code32[1]) :
"l"(p));
240 asm(
"ld.global.cs.nc.v2.u32 {%0, %1}, [%2 + 8];" :
241 "=r"(code32[2]),
"=r"(code32[3]) :
"l"(p));
242 asm(
"ld.global.cs.nc.v2.u32 {%0, %1}, [%2 + 16];" :
243 "=r"(code32[4]),
"=r"(code32[5]) :
"l"(p));
244 asm(
"ld.global.cs.nc.v2.u32 {%0, %1}, [%2 + 24];" :
245 "=r"(code32[6]),
"=r"(code32[7]) :
"l"(p));
246 asm(
"ld.global.cs.nc.v2.u32 {%0, %1}, [%2 + 32];" :
247 "=r"(code32[8]),
"=r"(code32[9]) :
"l"(p));
// Loads twelve 32-bit words (48 bytes) from address `p` into
// code32[0..11] using three 4-wide vector loads (`.v4.u32`) at byte
// offsets 0, 16 and 32.
// Each load uses `.cs` (cache-streaming hint) and `.nc` (non-coherent
// cache path).
// NOTE(review): PTX requires 16-byte alignment for each v4.u32 access,
// and `.nc` is only safe if the memory is not written while the kernel
// runs -- confirm both against callers.
253 static inline __device__
void load(
unsigned int code32[12],
259 asm(
"ld.global.cs.nc.v4.u32 {%0, %1, %2, %3}, [%4];" :
260 "=r"(code32[0]),
"=r"(code32[1]),
261 "=r"(code32[2]),
"=r"(code32[3]) :
"l"(p));
262 asm(
"ld.global.cs.nc.v4.u32 {%0, %1, %2, %3}, [%4 + 16];" :
263 "=r"(code32[4]),
"=r"(code32[5]),
264 "=r"(code32[6]),
"=r"(code32[7]) :
"l"(p));
265 asm(
"ld.global.cs.nc.v4.u32 {%0, %1, %2, %3}, [%4 + 32];" :
266 "=r"(code32[8]),
"=r"(code32[9]),
267 "=r"(code32[10]),
"=r"(code32[11]) :
"l"(p));
// Loads fourteen 32-bit words (56 bytes) from address `p` into
// code32[0..13] using seven 2-wide vector loads (`.v2.u32`) at byte
// offsets 0, 8, ..., 48.
// Each load uses `.cs` (cache-streaming hint) and `.nc` (non-coherent
// cache path).
// NOTE(review): PTX requires 8-byte alignment for each v2.u32 access, and
// `.nc` is only safe if the memory is not written while the kernel runs
// -- confirm both against callers.
273 static inline __device__
void load(
unsigned int code32[14],
279 asm(
"ld.global.cs.nc.v2.u32 {%0, %1}, [%2 + 0];" :
280 "=r"(code32[0]),
"=r"(code32[1]) :
"l"(p));
281 asm(
"ld.global.cs.nc.v2.u32 {%0, %1}, [%2 + 8];" :
282 "=r"(code32[2]),
"=r"(code32[3]) :
"l"(p));
283 asm(
"ld.global.cs.nc.v2.u32 {%0, %1}, [%2 + 16];" :
284 "=r"(code32[4]),
"=r"(code32[5]) :
"l"(p));
285 asm(
"ld.global.cs.nc.v2.u32 {%0, %1}, [%2 + 24];" :
286 "=r"(code32[6]),
"=r"(code32[7]) :
"l"(p));
287 asm(
"ld.global.cs.nc.v2.u32 {%0, %1}, [%2 + 32];" :
288 "=r"(code32[8]),
"=r"(code32[9]) :
"l"(p));
289 asm(
"ld.global.cs.nc.v2.u32 {%0, %1}, [%2 + 40];" :
290 "=r"(code32[10]),
"=r"(code32[11]) :
"l"(p));
291 asm(
"ld.global.cs.nc.v2.u32 {%0, %1}, [%2 + 48];" :
292 "=r"(code32[12]),
"=r"(code32[13]) :
"l"(p));
// Loads sixteen 32-bit words (64 bytes) from address `p` into
// code32[0..15] using four 4-wide vector loads (`.v4.u32`) at byte
// offsets 0, 16, 32 and 48.
// Each load uses `.cs` (cache-streaming hint) and `.nc` (non-coherent
// cache path).
// NOTE(review): PTX requires 16-byte alignment for each v4.u32 access,
// and `.nc` is only safe if the memory is not written while the kernel
// runs -- confirm both against callers.
298 static inline __device__
void load(
unsigned int code32[16],
304 asm(
"ld.global.cs.nc.v4.u32 {%0, %1, %2, %3}, [%4];" :
305 "=r"(code32[0]),
"=r"(code32[1]),
306 "=r"(code32[2]),
"=r"(code32[3]) :
"l"(p));
307 asm(
"ld.global.cs.nc.v4.u32 {%0, %1, %2, %3}, [%4 + 16];" :
308 "=r"(code32[4]),
"=r"(code32[5]),
309 "=r"(code32[6]),
"=r"(code32[7]) :
"l"(p));
310 asm(
"ld.global.cs.nc.v4.u32 {%0, %1, %2, %3}, [%4 + 32];" :
311 "=r"(code32[8]),
"=r"(code32[9]),
312 "=r"(code32[10]),
"=r"(code32[11]) :
"l"(p));
313 asm(
"ld.global.cs.nc.v4.u32 {%0, %1, %2, %3}, [%4 + 48];" :
314 "=r"(code32[12]),
"=r"(code32[13]),
315 "=r"(code32[14]),
"=r"(code32[15]) :
"l"(p));
// Loads twenty-four 32-bit words (96 bytes) from address `p` into
// code32[0..23] using six 4-wide vector loads (`.v4.u32`) at byte
// offsets 0, 16, 32, 48, 64 and 80.
// Each load uses `.cs` (cache-streaming hint) and `.nc` (non-coherent
// cache path).
// NOTE(review): PTX requires 16-byte alignment for each v4.u32 access,
// and `.nc` is only safe if the memory is not written while the kernel
// runs -- confirm both against callers.
321 static inline __device__
void load(
unsigned int code32[24],
327 asm(
"ld.global.cs.nc.v4.u32 {%0, %1, %2, %3}, [%4];" :
328 "=r"(code32[0]),
"=r"(code32[1]),
329 "=r"(code32[2]),
"=r"(code32[3]) :
"l"(p));
330 asm(
"ld.global.cs.nc.v4.u32 {%0, %1, %2, %3}, [%4 + 16];" :
331 "=r"(code32[4]),
"=r"(code32[5]),
332 "=r"(code32[6]),
"=r"(code32[7]) :
"l"(p));
333 asm(
"ld.global.cs.nc.v4.u32 {%0, %1, %2, %3}, [%4 + 32];" :
334 "=r"(code32[8]),
"=r"(code32[9]),
335 "=r"(code32[10]),
"=r"(code32[11]) :
"l"(p));
336 asm(
"ld.global.cs.nc.v4.u32 {%0, %1, %2, %3}, [%4 + 48];" :
337 "=r"(code32[12]),
"=r"(code32[13]),
338 "=r"(code32[14]),
"=r"(code32[15]) :
"l"(p));
339 asm(
"ld.global.cs.nc.v4.u32 {%0, %1, %2, %3}, [%4 + 64];" :
340 "=r"(code32[16]),
"=r"(code32[17]),
341 "=r"(code32[18]),
"=r"(code32[19]) :
"l"(p));
342 asm(
"ld.global.cs.nc.v4.u32 {%0, %1, %2, %3}, [%4 + 80];" :
343 "=r"(code32[20]),
"=r"(code32[21]),
344 "=r"(code32[22]),
"=r"(code32[23]) :
"l"(p));