PQCodeLoad.cuh
/**
 * Copyright (c) Facebook, Inc. and its affiliates.
 *
 * This source code is licensed under the MIT license found in the
 * LICENSE file in the root directory of this source tree.
 */

#pragma once

#include "../utils/PtxUtils.cuh"

namespace faiss { namespace gpu {
#if __CUDA_ARCH__ >= 350
// Use the CC 3.5+ read-only texture cache (nc)
#define LD_NC_V1 "ld.global.cs.nc.u32"
#define LD_NC_V2 "ld.global.cs.nc.v2.u32"
#define LD_NC_V4 "ld.global.cs.nc.v4.u32"
#else
// Read normally
#define LD_NC_V1 "ld.global.cs.u32"
#define LD_NC_V2 "ld.global.cs.v2.u32"
#define LD_NC_V4 "ld.global.cs.v4.u32"
#endif // __CUDA_ARCH__
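
// For reference, a minimal sketch of the same read-only path using the
// __ldg() intrinsic (the helper name is illustrative, not part of this
// header). Note the inline asm built from the macros above additionally
// carries the .cs streaming-cache hint, which __ldg() alone does not
// express.
#if __CUDA_ARCH__ >= 350
inline __device__ unsigned int ldNCExample(const unsigned int* p) {
  // __ldg compiles to ld.global.nc.u32 for a 32-bit load on CC 3.5+
  return __ldg(p);
}
#endif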

///
/// This file contains loader functions for PQ codes of various byte
/// length.
///

// Type-specific wrappers around the PTX bfe.* instruction, for
// quantization code extraction
inline __device__ unsigned int getByte(unsigned char v,
                                       int pos,
                                       int width) {
  // A single byte is already a whole code; pos and width are unused here
  return v;
}

inline __device__ unsigned int getByte(unsigned short v,
                                       int pos,
                                       int width) {
  return getBitfield((unsigned int) v, pos, width);
}

inline __device__ unsigned int getByte(unsigned int v,
                                       int pos,
                                       int width) {
  return getBitfield(v, pos, width);
}

inline __device__ unsigned int getByte(unsigned long v,
                                       int pos,
                                       int width) {
  return getBitfield(v, pos, width);
}
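
// A minimal sketch of the extraction these wrappers support (the
// function name is illustrative, not part of this header): unpacking
// four 8-bit sub-quantizer codes from one loaded 32-bit word. Codes sit
// little-endian within the word, with byte 0 in the low bits.
inline __device__ void unpackFourCodesExample(unsigned int word,
                                              unsigned int out[4]) {
  for (int i = 0; i < 4; ++i) {
    out[i] = getByte(word, i * 8, 8);
  }
}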

template <int NumSubQuantizers>
struct LoadCode32 {};

template<>
struct LoadCode32<1> {
  static inline __device__ void load(unsigned int code32[1],
                                     unsigned char* p,
                                     int offset) {
    p += offset * 1;
    asm("ld.global.cs.u8 {%0}, [%1];" :
        "=r"(code32[0]) : "l"(p));
  }
};

template<>
struct LoadCode32<2> {
  static inline __device__ void load(unsigned int code32[1],
                                     unsigned char* p,
                                     int offset) {
    p += offset * 2;
    asm("ld.global.cs.u16 {%0}, [%1];" :
        "=r"(code32[0]) : "l"(p));
  }
};

template<>
struct LoadCode32<3> {
  static inline __device__ void load(unsigned int code32[1],
                                     unsigned char* p,
                                     int offset) {
    p += offset * 3;
    unsigned int a;
    unsigned int b;
    unsigned int c;

    // FIXME: this is a non-coalesced, unaligned, non-vectorized load
    // unfortunately need to reorganize memory layout by warp
    asm("ld.global.cs.u8 {%0}, [%1 + 0];" :
        "=r"(a) : "l"(p));
    asm("ld.global.cs.u8 {%0}, [%1 + 1];" :
        "=r"(b) : "l"(p));
    asm("ld.global.cs.u8 {%0}, [%1 + 2];" :
        "=r"(c) : "l"(p));

    // FIXME: this is also slow, since we have to recover the
    // individual bytes loaded
    code32[0] = (c << 16) | (b << 8) | a;
  }
};

template<>
struct LoadCode32<4> {
  static inline __device__ void load(unsigned int code32[1],
                                     unsigned char* p,
                                     int offset) {
    p += offset * 4;
    asm("ld.global.cs.u32 {%0}, [%1];" :
        "=r"(code32[0]) : "l"(p));
  }
};

template<>
struct LoadCode32<8> {
  static inline __device__ void load(unsigned int code32[2],
                                     unsigned char* p,
                                     int offset) {
    p += offset * 8;
    asm("ld.global.cs.v2.u32 {%0, %1}, [%2];" :
        "=r"(code32[0]), "=r"(code32[1]) : "l"(p));
  }
};

template<>
struct LoadCode32<12> {
  static inline __device__ void load(unsigned int code32[3],
                                     unsigned char* p,
                                     int offset) {
    p += offset * 12;
    // FIXME: this is a non-coalesced, unaligned, non-vectorized load
    // unfortunately need to reorganize memory layout by warp
    asm(LD_NC_V1 " {%0}, [%1 + 0];" :
        "=r"(code32[0]) : "l"(p));
    asm(LD_NC_V1 " {%0}, [%1 + 4];" :
        "=r"(code32[1]) : "l"(p));
    asm(LD_NC_V1 " {%0}, [%1 + 8];" :
        "=r"(code32[2]) : "l"(p));
  }
};

template<>
struct LoadCode32<16> {
  static inline __device__ void load(unsigned int code32[4],
                                     unsigned char* p,
                                     int offset) {
    p += offset * 16;
    asm("ld.global.cs.v4.u32 {%0, %1, %2, %3}, [%4];" :
        "=r"(code32[0]), "=r"(code32[1]),
        "=r"(code32[2]), "=r"(code32[3]) : "l"(p));
  }
};

template<>
struct LoadCode32<20> {
  static inline __device__ void load(unsigned int code32[5],
                                     unsigned char* p,
                                     int offset) {
    p += offset * 20;
    // FIXME: this is a non-coalesced, unaligned, non-vectorized load
    // unfortunately need to reorganize memory layout by warp
    asm(LD_NC_V1 " {%0}, [%1 + 0];" :
        "=r"(code32[0]) : "l"(p));
    asm(LD_NC_V1 " {%0}, [%1 + 4];" :
        "=r"(code32[1]) : "l"(p));
    asm(LD_NC_V1 " {%0}, [%1 + 8];" :
        "=r"(code32[2]) : "l"(p));
    asm(LD_NC_V1 " {%0}, [%1 + 12];" :
        "=r"(code32[3]) : "l"(p));
    asm(LD_NC_V1 " {%0}, [%1 + 16];" :
        "=r"(code32[4]) : "l"(p));
  }
};

template<>
struct LoadCode32<24> {
  static inline __device__ void load(unsigned int code32[6],
                                     unsigned char* p,
                                     int offset) {
    p += offset * 24;
    // FIXME: this is a non-coalesced, unaligned, 2-vectorized load
    // unfortunately need to reorganize memory layout by warp
    asm(LD_NC_V2 " {%0, %1}, [%2 + 0];" :
        "=r"(code32[0]), "=r"(code32[1]) : "l"(p));
    asm(LD_NC_V2 " {%0, %1}, [%2 + 8];" :
        "=r"(code32[2]), "=r"(code32[3]) : "l"(p));
    asm(LD_NC_V2 " {%0, %1}, [%2 + 16];" :
        "=r"(code32[4]), "=r"(code32[5]) : "l"(p));
  }
};

template<>
struct LoadCode32<28> {
  static inline __device__ void load(unsigned int code32[7],
                                     unsigned char* p,
                                     int offset) {
    p += offset * 28;
    // FIXME: this is a non-coalesced, unaligned, non-vectorized load
    // unfortunately need to reorganize memory layout by warp
    asm(LD_NC_V1 " {%0}, [%1 + 0];" :
        "=r"(code32[0]) : "l"(p));
    asm(LD_NC_V1 " {%0}, [%1 + 4];" :
        "=r"(code32[1]) : "l"(p));
    asm(LD_NC_V1 " {%0}, [%1 + 8];" :
        "=r"(code32[2]) : "l"(p));
    asm(LD_NC_V1 " {%0}, [%1 + 12];" :
        "=r"(code32[3]) : "l"(p));
    asm(LD_NC_V1 " {%0}, [%1 + 16];" :
        "=r"(code32[4]) : "l"(p));
    asm(LD_NC_V1 " {%0}, [%1 + 20];" :
        "=r"(code32[5]) : "l"(p));
    asm(LD_NC_V1 " {%0}, [%1 + 24];" :
        "=r"(code32[6]) : "l"(p));
  }
};

template<>
struct LoadCode32<32> {
  static inline __device__ void load(unsigned int code32[8],
                                     unsigned char* p,
                                     int offset) {
    p += offset * 32;
    // FIXME: this is a non-coalesced load
    // unfortunately need to reorganize memory layout by warp
    asm(LD_NC_V4 " {%0, %1, %2, %3}, [%4];" :
        "=r"(code32[0]), "=r"(code32[1]),
        "=r"(code32[2]), "=r"(code32[3]) : "l"(p));
    asm(LD_NC_V4 " {%0, %1, %2, %3}, [%4 + 16];" :
        "=r"(code32[4]), "=r"(code32[5]),
        "=r"(code32[6]), "=r"(code32[7]) : "l"(p));
  }
};

template<>
struct LoadCode32<40> {
  static inline __device__ void load(unsigned int code32[10],
                                     unsigned char* p,
                                     int offset) {
    p += offset * 40;
    // FIXME: this is a non-coalesced, unaligned, 2-vectorized load
    // unfortunately need to reorganize memory layout by warp
    asm(LD_NC_V2 " {%0, %1}, [%2 + 0];" :
        "=r"(code32[0]), "=r"(code32[1]) : "l"(p));
    asm(LD_NC_V2 " {%0, %1}, [%2 + 8];" :
        "=r"(code32[2]), "=r"(code32[3]) : "l"(p));
    asm(LD_NC_V2 " {%0, %1}, [%2 + 16];" :
        "=r"(code32[4]), "=r"(code32[5]) : "l"(p));
    asm(LD_NC_V2 " {%0, %1}, [%2 + 24];" :
        "=r"(code32[6]), "=r"(code32[7]) : "l"(p));
    asm(LD_NC_V2 " {%0, %1}, [%2 + 32];" :
        "=r"(code32[8]), "=r"(code32[9]) : "l"(p));
  }
};

template<>
struct LoadCode32<48> {
  static inline __device__ void load(unsigned int code32[12],
                                     unsigned char* p,
                                     int offset) {
    p += offset * 48;
    // FIXME: this is a non-coalesced load
    // unfortunately need to reorganize memory layout by warp
    asm(LD_NC_V4 " {%0, %1, %2, %3}, [%4];" :
        "=r"(code32[0]), "=r"(code32[1]),
        "=r"(code32[2]), "=r"(code32[3]) : "l"(p));
    asm(LD_NC_V4 " {%0, %1, %2, %3}, [%4 + 16];" :
        "=r"(code32[4]), "=r"(code32[5]),
        "=r"(code32[6]), "=r"(code32[7]) : "l"(p));
    asm(LD_NC_V4 " {%0, %1, %2, %3}, [%4 + 32];" :
        "=r"(code32[8]), "=r"(code32[9]),
        "=r"(code32[10]), "=r"(code32[11]) : "l"(p));
  }
};

template<>
struct LoadCode32<56> {
  static inline __device__ void load(unsigned int code32[14],
                                     unsigned char* p,
                                     int offset) {
    p += offset * 56;
    // FIXME: this is a non-coalesced, unaligned, 2-vectorized load
    // unfortunately need to reorganize memory layout by warp
    asm(LD_NC_V2 " {%0, %1}, [%2 + 0];" :
        "=r"(code32[0]), "=r"(code32[1]) : "l"(p));
    asm(LD_NC_V2 " {%0, %1}, [%2 + 8];" :
        "=r"(code32[2]), "=r"(code32[3]) : "l"(p));
    asm(LD_NC_V2 " {%0, %1}, [%2 + 16];" :
        "=r"(code32[4]), "=r"(code32[5]) : "l"(p));
    asm(LD_NC_V2 " {%0, %1}, [%2 + 24];" :
        "=r"(code32[6]), "=r"(code32[7]) : "l"(p));
    asm(LD_NC_V2 " {%0, %1}, [%2 + 32];" :
        "=r"(code32[8]), "=r"(code32[9]) : "l"(p));
    asm(LD_NC_V2 " {%0, %1}, [%2 + 40];" :
        "=r"(code32[10]), "=r"(code32[11]) : "l"(p));
    asm(LD_NC_V2 " {%0, %1}, [%2 + 48];" :
        "=r"(code32[12]), "=r"(code32[13]) : "l"(p));
  }
};

template<>
struct LoadCode32<64> {
  static inline __device__ void load(unsigned int code32[16],
                                     unsigned char* p,
                                     int offset) {
    p += offset * 64;
    // FIXME: this is a non-coalesced load
    // unfortunately need to reorganize memory layout by warp
    asm(LD_NC_V4 " {%0, %1, %2, %3}, [%4];" :
        "=r"(code32[0]), "=r"(code32[1]),
        "=r"(code32[2]), "=r"(code32[3]) : "l"(p));
    asm(LD_NC_V4 " {%0, %1, %2, %3}, [%4 + 16];" :
        "=r"(code32[4]), "=r"(code32[5]),
        "=r"(code32[6]), "=r"(code32[7]) : "l"(p));
    asm(LD_NC_V4 " {%0, %1, %2, %3}, [%4 + 32];" :
        "=r"(code32[8]), "=r"(code32[9]),
        "=r"(code32[10]), "=r"(code32[11]) : "l"(p));
    asm(LD_NC_V4 " {%0, %1, %2, %3}, [%4 + 48];" :
        "=r"(code32[12]), "=r"(code32[13]),
        "=r"(code32[14]), "=r"(code32[15]) : "l"(p));
  }
};

template<>
struct LoadCode32<96> {
  static inline __device__ void load(unsigned int code32[24],
                                     unsigned char* p,
                                     int offset) {
    p += offset * 96;
    // FIXME: this is a non-coalesced load
    // unfortunately need to reorganize memory layout by warp
    asm(LD_NC_V4 " {%0, %1, %2, %3}, [%4];" :
        "=r"(code32[0]), "=r"(code32[1]),
        "=r"(code32[2]), "=r"(code32[3]) : "l"(p));
    asm(LD_NC_V4 " {%0, %1, %2, %3}, [%4 + 16];" :
        "=r"(code32[4]), "=r"(code32[5]),
        "=r"(code32[6]), "=r"(code32[7]) : "l"(p));
    asm(LD_NC_V4 " {%0, %1, %2, %3}, [%4 + 32];" :
        "=r"(code32[8]), "=r"(code32[9]),
        "=r"(code32[10]), "=r"(code32[11]) : "l"(p));
    asm(LD_NC_V4 " {%0, %1, %2, %3}, [%4 + 48];" :
        "=r"(code32[12]), "=r"(code32[13]),
        "=r"(code32[14]), "=r"(code32[15]) : "l"(p));
    asm(LD_NC_V4 " {%0, %1, %2, %3}, [%4 + 64];" :
        "=r"(code32[16]), "=r"(code32[17]),
        "=r"(code32[18]), "=r"(code32[19]) : "l"(p));
    asm(LD_NC_V4 " {%0, %1, %2, %3}, [%4 + 80];" :
        "=r"(code32[20]), "=r"(code32[21]),
        "=r"(code32[22]), "=r"(code32[23]) : "l"(p));
  }
};
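
// A minimal usage sketch (the function name is illustrative, not part
// of this header): combining LoadCode32 and getByte to decode one
// encoded vector's sub-quantizer codes, assuming NumSubQuantizers is a
// multiple of 4 so each loaded 32-bit word packs four 8-bit codes.
template <int NumSubQuantizers>
inline __device__ void decodeCodesExample(
    unsigned char* codeBase,
    int vec,
    unsigned int out[NumSubQuantizers]) {
  unsigned int code32[NumSubQuantizers / 4];
  LoadCode32<NumSubQuantizers>::load(code32, codeBase, vec);

  for (int i = 0; i < NumSubQuantizers; ++i) {
    // Code i lives in word i / 4, starting 8 * (i % 4) bits up
    out[i] = getByte(code32[i / 4], (i % 4) * 8, 8);
  }
}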

} } // namespace