14 #include "../utils/PtxUtils.cuh"
16 namespace faiss {
namespace gpu {
// getByte overload whose first parameter is an unsigned char value `v`.
// NOTE(review): the remaining parameters and the body of this overload are
// not visible in this view — confirm against the full file before relying
// on any particular behavior.
25 inline __device__
unsigned int getByte(
unsigned char v,
// getByte overload for an unsigned short value `v`: casts `v` to
// unsigned int and returns getBitfield(v, pos, width).
// NOTE(review): `pos` and `width` are presumably the remaining parameters;
// their declarations are not visible here — confirm.
31 inline __device__
unsigned int getByte(
unsigned short v,
34 return getBitfield((
unsigned int) v, pos, width);
// getByte overload for an unsigned int value `v`: forwards `v` unchanged
// and returns getBitfield(v, pos, width).
// NOTE(review): `pos` and `width` are presumably the remaining parameters;
// their declarations are not visible here — confirm.
37 inline __device__
unsigned int getByte(
unsigned int v,
40 return getBitfield(v, pos, width);
// getByte overload for an unsigned long value `v`: passes `v` directly
// (no explicit cast) and returns getBitfield(v, pos, width).
// NOTE(review): which getBitfield overload this resolves to depends on
// declarations not visible here; if only a 32-bit overload exists, `v`
// would be narrowed implicitly — confirm.
43 inline __device__
unsigned int getByte(
unsigned long v,
46 return getBitfield(v, pos, width);
49 template <
int NumSubQuantizers>
// load variant filling a single 32-bit word: one `ld.global.cs.u8`
// instruction reads one byte from global memory at address `p` and writes
// it to code32[0]. `.cs` is the PTX cache-streaming operator.
// NOTE(review): the declaration of `p` and any other parameters are not
// visible here; `p` is bound with the "l" (64-bit) asm constraint.
54 static inline __device__
void load(
unsigned int code32[1],
58 asm(
"ld.global.cs.u8 {%0}, [%1];" :
59 "=r"(code32[0]) :
"l"(p));
// load variant filling a single 32-bit word: one `ld.global.cs.u16`
// instruction reads two bytes from global memory at address `p` and writes
// them to code32[0]. `.cs` is the PTX cache-streaming operator.
// NOTE(review): the declaration of `p` and any other parameters are not
// visible here; a 16-bit load presumably needs `p` to be 2-byte aligned —
// confirm callers guarantee this.
65 static inline __device__
void load(
unsigned int code32[1],
69 asm(
"ld.global.cs.u16 {%0}, [%1];" :
70 "=r"(code32[0]) :
"l"(p));
// load variant filling a single 32-bit word from three separate byte loads:
// three `ld.global.cs.u8` instructions read at byte offsets 0, 1 and 2 from
// the address operand, and the results are packed into code32[0].
// NOTE(review): the asm operand lists and the declarations of `a`, `b` and
// `c` are not visible here; presumably each holds the byte loaded at
// offset 0, 1 and 2 respectively — confirm.
76 static inline __device__
void load(
unsigned int code32[1],
86 asm(
"ld.global.cs.u8 {%0}, [%1 + 0];" :
88 asm(
"ld.global.cs.u8 {%0}, [%1 + 1];" :
90 asm(
"ld.global.cs.u8 {%0}, [%1 + 2];" :
// Pack the three values: `a` in bits 0-7, `b` shifted to bit 8, `c` to bit 16.
95 code32[0] = (c << 16) | (b << 8) | a;
// load variant filling a single 32-bit word: one `ld.global.cs.u32`
// instruction reads four bytes from global memory at address `p` into
// code32[0]. `.cs` is the PTX cache-streaming operator.
// NOTE(review): the declaration of `p` and any other parameters are not
// visible here. Unlike the multi-word scalar loads in this file, this
// instruction omits `.nc` — confirm whether that difference is intentional.
101 static inline __device__
void load(
unsigned int code32[1],
105 asm(
"ld.global.cs.u32 {%0}, [%1];" :
106 "=r"(code32[0]) :
"l"(p));
// load variant filling two 32-bit words: a single vectorized
// `ld.global.cs.v2.u32` instruction reads eight contiguous bytes from
// global memory at address `p` into code32[0] and code32[1].
// NOTE(review): the declaration of `p` and any other parameters are not
// visible here. PTX vector loads require the address to be aligned to the
// full access size (8 bytes here) — confirm callers guarantee this.
112 static inline __device__
void load(
unsigned int code32[2],
116 asm(
"ld.global.cs.v2.u32 {%0, %1}, [%2];" :
117 "=r"(code32[0]),
"=r"(code32[1]) :
"l"(p));
// load variant filling three 32-bit words: three scalar
// `ld.global.cs.nc.u32` instructions read at byte offsets 0, 4 and 8 from
// `p` into code32[0..2] (12 contiguous bytes in total).
// `.cs` is the PTX cache-streaming operator; `.nc` selects the
// non-coherent (read-only) load path.
// NOTE(review): the declaration of `p` and any other parameters are not
// visible here. `.nc` presumably requires the data to stay unmodified
// while the kernel runs — confirm.
123 static inline __device__
void load(
unsigned int code32[3],
129 asm(
"ld.global.cs.nc.u32 {%0}, [%1 + 0];" :
130 "=r"(code32[0]) :
"l"(p));
131 asm(
"ld.global.cs.nc.u32 {%0}, [%1 + 4];" :
132 "=r"(code32[1]) :
"l"(p));
133 asm(
"ld.global.cs.nc.u32 {%0}, [%1 + 8];" :
134 "=r"(code32[2]) :
"l"(p));
// load variant filling four 32-bit words: a single vectorized
// `ld.global.cs.v4.u32` instruction reads sixteen contiguous bytes from
// global memory at address `p` into code32[0..3].
// NOTE(review): the declaration of `p` and any other parameters are not
// visible here. PTX vector loads require the address to be aligned to the
// full access size (16 bytes here) — confirm callers guarantee this.
// This instruction omits `.nc`, unlike the other v4 loads in this file —
// confirm whether that difference is intentional.
140 static inline __device__
void load(
unsigned int code32[4],
144 asm(
"ld.global.cs.v4.u32 {%0, %1, %2, %3}, [%4];" :
145 "=r"(code32[0]),
"=r"(code32[1]),
146 "=r"(code32[2]),
"=r"(code32[3]) :
"l"(p));
// load variant filling five 32-bit words: five scalar
// `ld.global.cs.nc.u32` instructions read at byte offsets 0, 4, ..., 16
// from `p` into code32[0..4] (20 contiguous bytes in total).
// `.cs` is the PTX cache-streaming operator; `.nc` selects the
// non-coherent (read-only) load path.
// NOTE(review): the declaration of `p` and any other parameters are not
// visible here.
152 static inline __device__
void load(
unsigned int code32[5],
158 asm(
"ld.global.cs.nc.u32 {%0}, [%1 + 0];" :
159 "=r"(code32[0]) :
"l"(p));
160 asm(
"ld.global.cs.nc.u32 {%0}, [%1 + 4];" :
161 "=r"(code32[1]) :
"l"(p));
162 asm(
"ld.global.cs.nc.u32 {%0}, [%1 + 8];" :
163 "=r"(code32[2]) :
"l"(p));
164 asm(
"ld.global.cs.nc.u32 {%0}, [%1 + 12];" :
165 "=r"(code32[3]) :
"l"(p));
166 asm(
"ld.global.cs.nc.u32 {%0}, [%1 + 16];" :
167 "=r"(code32[4]) :
"l"(p));
// load variant filling six 32-bit words: six scalar
// `ld.global.cs.nc.u32` instructions read at byte offsets 0, 4, ..., 20
// from `p` into code32[0..5] (24 contiguous bytes in total).
// `.cs` is the PTX cache-streaming operator; `.nc` selects the
// non-coherent (read-only) load path.
// NOTE(review): the declaration of `p` and any other parameters are not
// visible here. If `p` is guaranteed 8-byte aligned these could be v2
// vector loads — confirm alignment before changing.
173 static inline __device__
void load(
unsigned int code32[6],
179 asm(
"ld.global.cs.nc.u32 {%0}, [%1 + 0];" :
180 "=r"(code32[0]) :
"l"(p));
181 asm(
"ld.global.cs.nc.u32 {%0}, [%1 + 4];" :
182 "=r"(code32[1]) :
"l"(p));
183 asm(
"ld.global.cs.nc.u32 {%0}, [%1 + 8];" :
184 "=r"(code32[2]) :
"l"(p));
185 asm(
"ld.global.cs.nc.u32 {%0}, [%1 + 12];" :
186 "=r"(code32[3]) :
"l"(p));
187 asm(
"ld.global.cs.nc.u32 {%0}, [%1 + 16];" :
188 "=r"(code32[4]) :
"l"(p));
189 asm(
"ld.global.cs.nc.u32 {%0}, [%1 + 20];" :
190 "=r"(code32[5]) :
"l"(p));
// load variant filling seven 32-bit words: seven scalar
// `ld.global.cs.nc.u32` instructions read at byte offsets 0, 4, ..., 24
// from `p` into code32[0..6] (28 contiguous bytes in total).
// `.cs` is the PTX cache-streaming operator; `.nc` selects the
// non-coherent (read-only) load path.
// NOTE(review): the declaration of `p` and any other parameters are not
// visible here.
196 static inline __device__
void load(
unsigned int code32[7],
202 asm(
"ld.global.cs.nc.u32 {%0}, [%1 + 0];" :
203 "=r"(code32[0]) :
"l"(p));
204 asm(
"ld.global.cs.nc.u32 {%0}, [%1 + 4];" :
205 "=r"(code32[1]) :
"l"(p));
206 asm(
"ld.global.cs.nc.u32 {%0}, [%1 + 8];" :
207 "=r"(code32[2]) :
"l"(p));
208 asm(
"ld.global.cs.nc.u32 {%0}, [%1 + 12];" :
209 "=r"(code32[3]) :
"l"(p));
210 asm(
"ld.global.cs.nc.u32 {%0}, [%1 + 16];" :
211 "=r"(code32[4]) :
"l"(p));
212 asm(
"ld.global.cs.nc.u32 {%0}, [%1 + 20];" :
213 "=r"(code32[5]) :
"l"(p));
214 asm(
"ld.global.cs.nc.u32 {%0}, [%1 + 24];" :
215 "=r"(code32[6]) :
"l"(p));
// load variant filling eight 32-bit words: two vectorized
// `ld.global.cs.nc.v4.u32` instructions read 16 bytes each, at byte
// offsets 0 and 16 from `p`, into code32[0..3] and code32[4..7].
// `.cs` is the PTX cache-streaming operator; `.nc` selects the
// non-coherent (read-only) load path.
// NOTE(review): the declaration of `p` and any other parameters are not
// visible here. PTX vector loads require the address to be aligned to the
// full access size (16 bytes here) — confirm callers guarantee this.
221 static inline __device__
void load(
unsigned int code32[8],
227 asm(
"ld.global.cs.nc.v4.u32 {%0, %1, %2, %3}, [%4];" :
228 "=r"(code32[0]),
"=r"(code32[1]),
229 "=r"(code32[2]),
"=r"(code32[3]) :
"l"(p));
230 asm(
"ld.global.cs.nc.v4.u32 {%0, %1, %2, %3}, [%4 + 16];" :
231 "=r"(code32[4]),
"=r"(code32[5]),
232 "=r"(code32[6]),
"=r"(code32[7]) :
"l"(p));
// load variant filling ten 32-bit words: ten scalar
// `ld.global.cs.nc.u32` instructions read at byte offsets 0, 4, ..., 36
// from `p` into code32[0..9] (40 contiguous bytes in total).
// `.cs` is the PTX cache-streaming operator; `.nc` selects the
// non-coherent (read-only) load path.
// NOTE(review): the declaration of `p` and any other parameters are not
// visible here.
238 static inline __device__
void load(
unsigned int code32[10],
244 asm(
"ld.global.cs.nc.u32 {%0}, [%1 + 0];" :
245 "=r"(code32[0]) :
"l"(p));
246 asm(
"ld.global.cs.nc.u32 {%0}, [%1 + 4];" :
247 "=r"(code32[1]) :
"l"(p));
248 asm(
"ld.global.cs.nc.u32 {%0}, [%1 + 8];" :
249 "=r"(code32[2]) :
"l"(p));
250 asm(
"ld.global.cs.nc.u32 {%0}, [%1 + 12];" :
251 "=r"(code32[3]) :
"l"(p));
252 asm(
"ld.global.cs.nc.u32 {%0}, [%1 + 16];" :
253 "=r"(code32[4]) :
"l"(p));
254 asm(
"ld.global.cs.nc.u32 {%0}, [%1 + 20];" :
255 "=r"(code32[5]) :
"l"(p));
256 asm(
"ld.global.cs.nc.u32 {%0}, [%1 + 24];" :
257 "=r"(code32[6]) :
"l"(p));
258 asm(
"ld.global.cs.nc.u32 {%0}, [%1 + 28];" :
259 "=r"(code32[7]) :
"l"(p));
260 asm(
"ld.global.cs.nc.u32 {%0}, [%1 + 32];" :
261 "=r"(code32[8]) :
"l"(p));
262 asm(
"ld.global.cs.nc.u32 {%0}, [%1 + 36];" :
263 "=r"(code32[9]) :
"l"(p));
// load variant filling twelve 32-bit words: twelve scalar
// `ld.global.cs.nc.u32` instructions read at byte offsets 0, 4, ..., 44
// from `p` into code32[0..11] (48 contiguous bytes in total).
// `.cs` is the PTX cache-streaming operator; `.nc` selects the
// non-coherent (read-only) load path.
// NOTE(review): the declaration of `p` and any other parameters are not
// visible here. If `p` is guaranteed 16-byte aligned these could be three
// v4 vector loads — confirm alignment before changing.
269 static inline __device__
void load(
unsigned int code32[12],
275 asm(
"ld.global.cs.nc.u32 {%0}, [%1 + 0];" :
276 "=r"(code32[0]) :
"l"(p));
277 asm(
"ld.global.cs.nc.u32 {%0}, [%1 + 4];" :
278 "=r"(code32[1]) :
"l"(p));
279 asm(
"ld.global.cs.nc.u32 {%0}, [%1 + 8];" :
280 "=r"(code32[2]) :
"l"(p));
281 asm(
"ld.global.cs.nc.u32 {%0}, [%1 + 12];" :
282 "=r"(code32[3]) :
"l"(p));
283 asm(
"ld.global.cs.nc.u32 {%0}, [%1 + 16];" :
284 "=r"(code32[4]) :
"l"(p));
285 asm(
"ld.global.cs.nc.u32 {%0}, [%1 + 20];" :
286 "=r"(code32[5]) :
"l"(p));
287 asm(
"ld.global.cs.nc.u32 {%0}, [%1 + 24];" :
288 "=r"(code32[6]) :
"l"(p));
289 asm(
"ld.global.cs.nc.u32 {%0}, [%1 + 28];" :
290 "=r"(code32[7]) :
"l"(p));
291 asm(
"ld.global.cs.nc.u32 {%0}, [%1 + 32];" :
292 "=r"(code32[8]) :
"l"(p));
293 asm(
"ld.global.cs.nc.u32 {%0}, [%1 + 36];" :
294 "=r"(code32[9]) :
"l"(p));
295 asm(
"ld.global.cs.nc.u32 {%0}, [%1 + 40];" :
296 "=r"(code32[10]) :
"l"(p));
297 asm(
"ld.global.cs.nc.u32 {%0}, [%1 + 44];" :
298 "=r"(code32[11]) :
"l"(p));
// load variant filling fourteen 32-bit words: fourteen scalar
// `ld.global.cs.nc.u32` instructions read at byte offsets 0, 4, ..., 52
// from `p` into code32[0..13] (56 contiguous bytes in total).
// `.cs` is the PTX cache-streaming operator; `.nc` selects the
// non-coherent (read-only) load path.
// NOTE(review): the declaration of `p` and any other parameters are not
// visible here.
304 static inline __device__
void load(
unsigned int code32[14],
310 asm(
"ld.global.cs.nc.u32 {%0}, [%1 + 0];" :
311 "=r"(code32[0]) :
"l"(p));
312 asm(
"ld.global.cs.nc.u32 {%0}, [%1 + 4];" :
313 "=r"(code32[1]) :
"l"(p));
314 asm(
"ld.global.cs.nc.u32 {%0}, [%1 + 8];" :
315 "=r"(code32[2]) :
"l"(p));
316 asm(
"ld.global.cs.nc.u32 {%0}, [%1 + 12];" :
317 "=r"(code32[3]) :
"l"(p));
318 asm(
"ld.global.cs.nc.u32 {%0}, [%1 + 16];" :
319 "=r"(code32[4]) :
"l"(p));
320 asm(
"ld.global.cs.nc.u32 {%0}, [%1 + 20];" :
321 "=r"(code32[5]) :
"l"(p));
322 asm(
"ld.global.cs.nc.u32 {%0}, [%1 + 24];" :
323 "=r"(code32[6]) :
"l"(p));
324 asm(
"ld.global.cs.nc.u32 {%0}, [%1 + 28];" :
325 "=r"(code32[7]) :
"l"(p));
326 asm(
"ld.global.cs.nc.u32 {%0}, [%1 + 32];" :
327 "=r"(code32[8]) :
"l"(p));
328 asm(
"ld.global.cs.nc.u32 {%0}, [%1 + 36];" :
329 "=r"(code32[9]) :
"l"(p));
330 asm(
"ld.global.cs.nc.u32 {%0}, [%1 + 40];" :
331 "=r"(code32[10]) :
"l"(p));
332 asm(
"ld.global.cs.nc.u32 {%0}, [%1 + 44];" :
333 "=r"(code32[11]) :
"l"(p));
334 asm(
"ld.global.cs.nc.u32 {%0}, [%1 + 48];" :
335 "=r"(code32[12]) :
"l"(p));
336 asm(
"ld.global.cs.nc.u32 {%0}, [%1 + 52];" :
337 "=r"(code32[13]) :
"l"(p));
// load variant filling sixteen 32-bit words: four vectorized
// `ld.global.cs.nc.v4.u32` instructions read 16 bytes each, at byte
// offsets 0, 16, 32 and 48 from `p`, into code32[0..15] (64 contiguous
// bytes in total).
// `.cs` is the PTX cache-streaming operator; `.nc` selects the
// non-coherent (read-only) load path.
// NOTE(review): the declaration of `p` and any other parameters are not
// visible here. PTX vector loads require the address to be aligned to the
// full access size (16 bytes here) — confirm callers guarantee this.
343 static inline __device__
void load(
unsigned int code32[16],
349 asm(
"ld.global.cs.nc.v4.u32 {%0, %1, %2, %3}, [%4];" :
350 "=r"(code32[0]),
"=r"(code32[1]),
351 "=r"(code32[2]),
"=r"(code32[3]) :
"l"(p));
352 asm(
"ld.global.cs.nc.v4.u32 {%0, %1, %2, %3}, [%4 + 16];" :
353 "=r"(code32[4]),
"=r"(code32[5]),
354 "=r"(code32[6]),
"=r"(code32[7]) :
"l"(p));
355 asm(
"ld.global.cs.nc.v4.u32 {%0, %1, %2, %3}, [%4 + 32];" :
356 "=r"(code32[8]),
"=r"(code32[9]),
357 "=r"(code32[10]),
"=r"(code32[11]) :
"l"(p));
358 asm(
"ld.global.cs.nc.v4.u32 {%0, %1, %2, %3}, [%4 + 48];" :
359 "=r"(code32[12]),
"=r"(code32[13]),
360 "=r"(code32[14]),
"=r"(code32[15]) :
"l"(p));