PQCodeLoad.cuh
/**
 * Copyright (c) 2015-present, Facebook, Inc.
 * All rights reserved.
 *
 * This source code is licensed under the BSD+Patents license found in the
 * LICENSE file in the root directory of this source tree.
 */


#pragma once

#include "../utils/PtxUtils.cuh"

namespace faiss { namespace gpu {

#if __CUDA_ARCH__ >= 350
// Use the CC 3.5+ read-only texture cache (nc)
#define LD_NC_V1 "ld.global.cs.nc.u32"
#define LD_NC_V2 "ld.global.cs.nc.v2.u32"
#define LD_NC_V4 "ld.global.cs.nc.v4.u32"
#else
// Read normally
#define LD_NC_V1 "ld.global.cs.u32"
#define LD_NC_V2 "ld.global.cs.v2.u32"
#define LD_NC_V4 "ld.global.cs.v4.u32"
#endif // __CUDA_ARCH__
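// Note (added for clarity): the .cs cache operator marks these as streaming
// loads (evict-first data expected to be read only once); the .nc variant
// additionally routes the load through the read-only data (LDG) cache path
// available on compute capability 3.5 and later.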

///
/// This file contains loader functions for PQ codes of various byte
/// lengths.
///

// Type-specific wrappers around the PTX bfe.* instruction, for
// quantization code extraction
inline __device__ unsigned int getByte(unsigned char v,
                                       int pos,
                                       int width) {
  return v;
}

inline __device__ unsigned int getByte(unsigned short v,
                                       int pos,
                                       int width) {
  return getBitfield((unsigned int) v, pos, width);
}

inline __device__ unsigned int getByte(unsigned int v,
                                       int pos,
                                       int width) {
  return getBitfield(v, pos, width);
}

inline __device__ unsigned int getByte(unsigned long v,
                                       int pos,
                                       int width) {
  return getBitfield(v, pos, width);
}

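// Illustrative note (not part of the original header): a packed 32-bit word
// holding four 8-bit sub-quantizer codes can be unpacked with getByte, e.g.
//
//   unsigned int secondCode = getByte(code32, 8, 8); // bits [8, 16) of code32
//
// The unsigned char overload above simply returns its argument, since a
// single byte already holds exactly one 8-bit code.
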
template <int NumSubQuantizers>
struct LoadCode32 {};

template<>
struct LoadCode32<1> {
  static inline __device__ void load(unsigned int code32[1],
                                     unsigned char* p,
                                     int offset) {
    p += offset * 1;
    asm("ld.global.cs.u8 {%0}, [%1];" :
        "=r"(code32[0]) : "l"(p));
  }
};

template<>
struct LoadCode32<2> {
  static inline __device__ void load(unsigned int code32[1],
                                     unsigned char* p,
                                     int offset) {
    p += offset * 2;
    asm("ld.global.cs.u16 {%0}, [%1];" :
        "=r"(code32[0]) : "l"(p));
  }
};

template<>
struct LoadCode32<3> {
  static inline __device__ void load(unsigned int code32[1],
                                     unsigned char* p,
                                     int offset) {
    p += offset * 3;
    unsigned int a;
    unsigned int b;
    unsigned int c;

    // FIXME: this is a non-coalesced, unaligned, non-vectorized load
    // unfortunately need to reorganize memory layout by warp
    asm("ld.global.cs.u8 {%0}, [%1 + 0];" :
        "=r"(a) : "l"(p));
    asm("ld.global.cs.u8 {%0}, [%1 + 1];" :
        "=r"(b) : "l"(p));
    asm("ld.global.cs.u8 {%0}, [%1 + 2];" :
        "=r"(c) : "l"(p));

    // FIXME: this is also slow, since we have to recover the
    // individual bytes loaded
    code32[0] = (c << 16) | (b << 8) | a;
  }
};

template<>
struct LoadCode32<4> {
  static inline __device__ void load(unsigned int code32[1],
                                     unsigned char* p,
                                     int offset) {
    p += offset * 4;
    asm("ld.global.cs.u32 {%0}, [%1];" :
        "=r"(code32[0]) : "l"(p));
  }
};

template<>
struct LoadCode32<8> {
  static inline __device__ void load(unsigned int code32[2],
                                     unsigned char* p,
                                     int offset) {
    p += offset * 8;
    asm("ld.global.cs.v2.u32 {%0, %1}, [%2];" :
        "=r"(code32[0]), "=r"(code32[1]) : "l"(p));
  }
};

template<>
struct LoadCode32<12> {
  static inline __device__ void load(unsigned int code32[3],
                                     unsigned char* p,
                                     int offset) {
    p += offset * 12;
    // FIXME: this is a non-coalesced, unaligned, non-vectorized load
    // unfortunately need to reorganize memory layout by warp
    asm(LD_NC_V1 " {%0}, [%1 + 0];" :
        "=r"(code32[0]) : "l"(p));
    asm(LD_NC_V1 " {%0}, [%1 + 4];" :
        "=r"(code32[1]) : "l"(p));
    asm(LD_NC_V1 " {%0}, [%1 + 8];" :
        "=r"(code32[2]) : "l"(p));
  }
};

template<>
struct LoadCode32<16> {
  static inline __device__ void load(unsigned int code32[4],
                                     unsigned char* p,
                                     int offset) {
    p += offset * 16;
    asm("ld.global.cs.v4.u32 {%0, %1, %2, %3}, [%4];" :
        "=r"(code32[0]), "=r"(code32[1]),
        "=r"(code32[2]), "=r"(code32[3]) : "l"(p));
  }
};

template<>
struct LoadCode32<20> {
  static inline __device__ void load(unsigned int code32[5],
                                     unsigned char* p,
                                     int offset) {
    p += offset * 20;
    // FIXME: this is a non-coalesced, unaligned, non-vectorized load
    // unfortunately need to reorganize memory layout by warp
    asm(LD_NC_V1 " {%0}, [%1 + 0];" :
        "=r"(code32[0]) : "l"(p));
    asm(LD_NC_V1 " {%0}, [%1 + 4];" :
        "=r"(code32[1]) : "l"(p));
    asm(LD_NC_V1 " {%0}, [%1 + 8];" :
        "=r"(code32[2]) : "l"(p));
    asm(LD_NC_V1 " {%0}, [%1 + 12];" :
        "=r"(code32[3]) : "l"(p));
    asm(LD_NC_V1 " {%0}, [%1 + 16];" :
        "=r"(code32[4]) : "l"(p));
  }
};

template<>
struct LoadCode32<24> {
  static inline __device__ void load(unsigned int code32[6],
                                     unsigned char* p,
                                     int offset) {
    p += offset * 24;
    // FIXME: this is a non-coalesced, unaligned, 2-vectorized load
    // unfortunately need to reorganize memory layout by warp
    asm(LD_NC_V2 " {%0, %1}, [%2 + 0];" :
        "=r"(code32[0]), "=r"(code32[1]) : "l"(p));
    asm(LD_NC_V2 " {%0, %1}, [%2 + 8];" :
        "=r"(code32[2]), "=r"(code32[3]) : "l"(p));
    asm(LD_NC_V2 " {%0, %1}, [%2 + 16];" :
        "=r"(code32[4]), "=r"(code32[5]) : "l"(p));
  }
};

template<>
struct LoadCode32<28> {
  static inline __device__ void load(unsigned int code32[7],
                                     unsigned char* p,
                                     int offset) {
    p += offset * 28;
    // FIXME: this is a non-coalesced, unaligned, non-vectorized load
    // unfortunately need to reorganize memory layout by warp
    asm(LD_NC_V1 " {%0}, [%1 + 0];" :
        "=r"(code32[0]) : "l"(p));
    asm(LD_NC_V1 " {%0}, [%1 + 4];" :
        "=r"(code32[1]) : "l"(p));
    asm(LD_NC_V1 " {%0}, [%1 + 8];" :
        "=r"(code32[2]) : "l"(p));
    asm(LD_NC_V1 " {%0}, [%1 + 12];" :
        "=r"(code32[3]) : "l"(p));
    asm(LD_NC_V1 " {%0}, [%1 + 16];" :
        "=r"(code32[4]) : "l"(p));
    asm(LD_NC_V1 " {%0}, [%1 + 20];" :
        "=r"(code32[5]) : "l"(p));
    asm(LD_NC_V1 " {%0}, [%1 + 24];" :
        "=r"(code32[6]) : "l"(p));
  }
};

template<>
struct LoadCode32<32> {
  static inline __device__ void load(unsigned int code32[8],
                                     unsigned char* p,
                                     int offset) {
    p += offset * 32;
    // FIXME: this is a non-coalesced load
    // unfortunately need to reorganize memory layout by warp
    asm(LD_NC_V4 " {%0, %1, %2, %3}, [%4];" :
        "=r"(code32[0]), "=r"(code32[1]),
        "=r"(code32[2]), "=r"(code32[3]) : "l"(p));
    asm(LD_NC_V4 " {%0, %1, %2, %3}, [%4 + 16];" :
        "=r"(code32[4]), "=r"(code32[5]),
        "=r"(code32[6]), "=r"(code32[7]) : "l"(p));
  }
};

template<>
struct LoadCode32<40> {
  static inline __device__ void load(unsigned int code32[10],
                                     unsigned char* p,
                                     int offset) {
    p += offset * 40;
    // FIXME: this is a non-coalesced, unaligned, 2-vectorized load
    // unfortunately need to reorganize memory layout by warp
    asm(LD_NC_V2 " {%0, %1}, [%2 + 0];" :
        "=r"(code32[0]), "=r"(code32[1]) : "l"(p));
    asm(LD_NC_V2 " {%0, %1}, [%2 + 8];" :
        "=r"(code32[2]), "=r"(code32[3]) : "l"(p));
    asm(LD_NC_V2 " {%0, %1}, [%2 + 16];" :
        "=r"(code32[4]), "=r"(code32[5]) : "l"(p));
    asm(LD_NC_V2 " {%0, %1}, [%2 + 24];" :
        "=r"(code32[6]), "=r"(code32[7]) : "l"(p));
    asm(LD_NC_V2 " {%0, %1}, [%2 + 32];" :
        "=r"(code32[8]), "=r"(code32[9]) : "l"(p));
  }
};

template<>
struct LoadCode32<48> {
  static inline __device__ void load(unsigned int code32[12],
                                     unsigned char* p,
                                     int offset) {
    p += offset * 48;
    // FIXME: this is a non-coalesced load
    // unfortunately need to reorganize memory layout by warp
    asm(LD_NC_V4 " {%0, %1, %2, %3}, [%4];" :
        "=r"(code32[0]), "=r"(code32[1]),
        "=r"(code32[2]), "=r"(code32[3]) : "l"(p));
    asm(LD_NC_V4 " {%0, %1, %2, %3}, [%4 + 16];" :
        "=r"(code32[4]), "=r"(code32[5]),
        "=r"(code32[6]), "=r"(code32[7]) : "l"(p));
    asm(LD_NC_V4 " {%0, %1, %2, %3}, [%4 + 32];" :
        "=r"(code32[8]), "=r"(code32[9]),
        "=r"(code32[10]), "=r"(code32[11]) : "l"(p));
  }
};

template<>
struct LoadCode32<56> {
  static inline __device__ void load(unsigned int code32[14],
                                     unsigned char* p,
                                     int offset) {
    p += offset * 56;
    // FIXME: this is a non-coalesced, unaligned, 2-vectorized load
    // unfortunately need to reorganize memory layout by warp
    asm(LD_NC_V2 " {%0, %1}, [%2 + 0];" :
        "=r"(code32[0]), "=r"(code32[1]) : "l"(p));
    asm(LD_NC_V2 " {%0, %1}, [%2 + 8];" :
        "=r"(code32[2]), "=r"(code32[3]) : "l"(p));
    asm(LD_NC_V2 " {%0, %1}, [%2 + 16];" :
        "=r"(code32[4]), "=r"(code32[5]) : "l"(p));
    asm(LD_NC_V2 " {%0, %1}, [%2 + 24];" :
        "=r"(code32[6]), "=r"(code32[7]) : "l"(p));
    asm(LD_NC_V2 " {%0, %1}, [%2 + 32];" :
        "=r"(code32[8]), "=r"(code32[9]) : "l"(p));
    asm(LD_NC_V2 " {%0, %1}, [%2 + 40];" :
        "=r"(code32[10]), "=r"(code32[11]) : "l"(p));
    asm(LD_NC_V2 " {%0, %1}, [%2 + 48];" :
        "=r"(code32[12]), "=r"(code32[13]) : "l"(p));
  }
};

template<>
struct LoadCode32<64> {
  static inline __device__ void load(unsigned int code32[16],
                                     unsigned char* p,
                                     int offset) {
    p += offset * 64;
    // FIXME: this is a non-coalesced load
    // unfortunately need to reorganize memory layout by warp
    asm(LD_NC_V4 " {%0, %1, %2, %3}, [%4];" :
        "=r"(code32[0]), "=r"(code32[1]),
        "=r"(code32[2]), "=r"(code32[3]) : "l"(p));
    asm(LD_NC_V4 " {%0, %1, %2, %3}, [%4 + 16];" :
        "=r"(code32[4]), "=r"(code32[5]),
        "=r"(code32[6]), "=r"(code32[7]) : "l"(p));
    asm(LD_NC_V4 " {%0, %1, %2, %3}, [%4 + 32];" :
        "=r"(code32[8]), "=r"(code32[9]),
        "=r"(code32[10]), "=r"(code32[11]) : "l"(p));
    asm(LD_NC_V4 " {%0, %1, %2, %3}, [%4 + 48];" :
        "=r"(code32[12]), "=r"(code32[13]),
        "=r"(code32[14]), "=r"(code32[15]) : "l"(p));
  }
};

template<>
struct LoadCode32<96> {
  static inline __device__ void load(unsigned int code32[24],
                                     unsigned char* p,
                                     int offset) {
    p += offset * 96;
    // FIXME: this is a non-coalesced load
    // unfortunately need to reorganize memory layout by warp
    asm(LD_NC_V4 " {%0, %1, %2, %3}, [%4];" :
        "=r"(code32[0]), "=r"(code32[1]),
        "=r"(code32[2]), "=r"(code32[3]) : "l"(p));
    asm(LD_NC_V4 " {%0, %1, %2, %3}, [%4 + 16];" :
        "=r"(code32[4]), "=r"(code32[5]),
        "=r"(code32[6]), "=r"(code32[7]) : "l"(p));
    asm(LD_NC_V4 " {%0, %1, %2, %3}, [%4 + 32];" :
        "=r"(code32[8]), "=r"(code32[9]),
        "=r"(code32[10]), "=r"(code32[11]) : "l"(p));
    asm(LD_NC_V4 " {%0, %1, %2, %3}, [%4 + 48];" :
        "=r"(code32[12]), "=r"(code32[13]),
        "=r"(code32[14]), "=r"(code32[15]) : "l"(p));
    asm(LD_NC_V4 " {%0, %1, %2, %3}, [%4 + 64];" :
        "=r"(code32[16]), "=r"(code32[17]),
        "=r"(code32[18]), "=r"(code32[19]) : "l"(p));
    asm(LD_NC_V4 " {%0, %1, %2, %3}, [%4 + 80];" :
        "=r"(code32[20]), "=r"(code32[21]),
        "=r"(code32[22]), "=r"(code32[23]) : "l"(p));
  }
};

} } // namespace
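
For reference, here is a minimal usage sketch, not part of the header itself: a hypothetical kernel that includes PQCodeLoad.cuh, loads the 8-byte PQ code of one database vector per thread with LoadCode32<8>::load, and unpacks the individual 8-bit sub-quantizer indices with getByte to sum per-subquantizer lookup distances. The kernel name and its parameters (codes, codeTable, outDistances, numVecs) are illustrative only and do not appear in Faiss.

#include "PQCodeLoad.cuh"

using faiss::gpu::LoadCode32;
using faiss::gpu::getByte;

// Hypothetical example kernel: one thread per vector, 8 sub-quantizers of
// 8 bits each (i.e. 8 code bytes per vector).
__global__ void pqLookupExample(unsigned char* codes,   // [numVecs][8] packed PQ codes
                                const float* codeTable, // [8][256] per-subquantizer distances
                                float* outDistances,    // [numVecs]
                                int numVecs) {
  int vec = blockIdx.x * blockDim.x + threadIdx.x;
  if (vec >= numVecs) {
    return;
  }

  // Pull in all 8 code bytes for this vector as two 32-bit words
  unsigned int code32[2];
  LoadCode32<8>::load(code32, codes, vec);

  float dist = 0.0f;
  for (int sq = 0; sq < 8; ++sq) {
    // Each 32-bit word packs four 8-bit sub-quantizer codes
    unsigned int c = getByte(code32[sq / 4], (sq % 4) * 8, 8);
    dist += codeTable[sq * 256 + c];
  }

  outDistances[vec] = dist;
}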