PQCodeLoad.cuh
/**
 * Copyright (c) 2015-present, Facebook, Inc.
 * All rights reserved.
 *
 * This source code is licensed under the CC-by-NC license found in the
 * LICENSE file in the root directory of this source tree.
 */

// Copyright 2004-present Facebook. All Rights Reserved.

#pragma once

#include "../utils/PtxUtils.cuh"

namespace faiss { namespace gpu {

#if __CUDA_ARCH__ >= 350
// Use the CC 3.5+ read-only texture cache (nc)
#define LD_NC_V1 "ld.global.cs.nc.u32"
#define LD_NC_V2 "ld.global.cs.nc.v2.u32"
#define LD_NC_V4 "ld.global.cs.nc.v4.u32"
#else
// Read normally
#define LD_NC_V1 "ld.global.cs.u32"
#define LD_NC_V2 "ld.global.cs.v2.u32"
#define LD_NC_V4 "ld.global.cs.v4.u32"
#endif // __CUDA_ARCH__

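// Note (added for clarity, not in the original header): the .cs qualifier is
// the PTX "cache streaming" hint, marking data expected to be read only once,
// while .nc routes the load through the read-only (texture) data cache, which
// is available from compute capability 3.5 onward.
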
///
/// This file contains loader functions for PQ codes of various byte
/// length.
///

// Type-specific wrappers around the PTX bfe.* instruction, for
// quantization code extraction
inline __device__ unsigned int getByte(unsigned char v,
                                       int pos,
                                       int width) {
  return v;
}

inline __device__ unsigned int getByte(unsigned short v,
                                       int pos,
                                       int width) {
  return getBitfield((unsigned int) v, pos, width);
}

inline __device__ unsigned int getByte(unsigned int v,
                                       int pos,
                                       int width) {
  return getBitfield(v, pos, width);
}

inline __device__ unsigned int getByte(unsigned long v,
                                       int pos,
                                       int width) {
  return getBitfield(v, pos, width);
}

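// Illustrative note (added, not part of the original header): once a 32-bit
// word of packed 8-bit PQ codes has been loaded, byte j of memory occupies
// bits [8*j, 8*j + 8) of the register (little-endian), so the j-th code can
// be extracted with the bfe-backed overloads above, e.g.
//
//   unsigned int c0 = getByte(code32[0], 0, 8);   // first code in the word
//   unsigned int c3 = getByte(code32[0], 24, 8);  // fourth code in the word
//
// The unsigned char overload simply returns its argument: an 8-bit value is
// already a single code and needs no bitfield extraction.
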
template <int NumSubQuantizers>
struct LoadCode32 {};

template<>
struct LoadCode32<1> {
  static inline __device__ void load(unsigned int code32[1],
                                     unsigned char* p,
                                     int offset) {
    p += offset * 1;
    asm("ld.global.cs.u8 {%0}, [%1];" :
        "=r"(code32[0]) : "l"(p));
  }
};

template<>
struct LoadCode32<2> {
  static inline __device__ void load(unsigned int code32[1],
                                     unsigned char* p,
                                     int offset) {
    p += offset * 2;
    asm("ld.global.cs.u16 {%0}, [%1];" :
        "=r"(code32[0]) : "l"(p));
  }
};

template<>
struct LoadCode32<3> {
  static inline __device__ void load(unsigned int code32[1],
                                     unsigned char* p,
                                     int offset) {
    p += offset * 3;
    unsigned int a;
    unsigned int b;
    unsigned int c;

    // FIXME: this is a non-coalesced, unaligned, non-vectorized load
    // unfortunately need to reorganize memory layout by warp
    asm("ld.global.cs.u8 {%0}, [%1 + 0];" :
        "=r"(a) : "l"(p));
    asm("ld.global.cs.u8 {%0}, [%1 + 1];" :
        "=r"(b) : "l"(p));
    asm("ld.global.cs.u8 {%0}, [%1 + 2];" :
        "=r"(c) : "l"(p));

    // FIXME: this is also slow, since we have to recover the
    // individual bytes loaded
    code32[0] = (c << 16) | (b << 8) | a;
  }
};

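// Illustrative note (added, not part of the original header): the three byte
// loads and shifts above are simply a little-endian reconstruction of a
// 24-bit value, roughly equivalent (ignoring the cache hints) to
//
//   code32[0] = (unsigned int) p[0]
//             | ((unsigned int) p[1] << 8)
//             | ((unsigned int) p[2] << 16);
//
// A vectorized load is not used here because consecutive 3-byte codes are not
// 4-byte aligned.
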
template<>
struct LoadCode32<4> {
  static inline __device__ void load(unsigned int code32[1],
                                     unsigned char* p,
                                     int offset) {
    p += offset * 4;
    asm("ld.global.cs.u32 {%0}, [%1];" :
        "=r"(code32[0]) : "l"(p));
  }
};

template<>
struct LoadCode32<8> {
  static inline __device__ void load(unsigned int code32[2],
                                     unsigned char* p,
                                     int offset) {
    p += offset * 8;
    asm("ld.global.cs.v2.u32 {%0, %1}, [%2];" :
        "=r"(code32[0]), "=r"(code32[1]) : "l"(p));
  }
};

template<>
struct LoadCode32<12> {
  static inline __device__ void load(unsigned int code32[3],
                                     unsigned char* p,
                                     int offset) {
    p += offset * 12;
    // FIXME: this is a non-coalesced, unaligned, non-vectorized load
    // unfortunately need to reorganize memory layout by warp
    asm(LD_NC_V1 " {%0}, [%1 + 0];" :
        "=r"(code32[0]) : "l"(p));
    asm(LD_NC_V1 " {%0}, [%1 + 4];" :
        "=r"(code32[1]) : "l"(p));
    asm(LD_NC_V1 " {%0}, [%1 + 8];" :
        "=r"(code32[2]) : "l"(p));
  }
};

template<>
struct LoadCode32<16> {
  static inline __device__ void load(unsigned int code32[4],
                                     unsigned char* p,
                                     int offset) {
    p += offset * 16;
    asm("ld.global.cs.v4.u32 {%0, %1, %2, %3}, [%4];" :
        "=r"(code32[0]), "=r"(code32[1]),
        "=r"(code32[2]), "=r"(code32[3]) : "l"(p));
  }
};

template<>
struct LoadCode32<20> {
  static inline __device__ void load(unsigned int code32[5],
                                     unsigned char* p,
                                     int offset) {
    p += offset * 20;
    // FIXME: this is a non-coalesced, unaligned, non-vectorized load
    // unfortunately need to reorganize memory layout by warp
    asm(LD_NC_V1 " {%0}, [%1 + 0];" :
        "=r"(code32[0]) : "l"(p));
    asm(LD_NC_V1 " {%0}, [%1 + 4];" :
        "=r"(code32[1]) : "l"(p));
    asm(LD_NC_V1 " {%0}, [%1 + 8];" :
        "=r"(code32[2]) : "l"(p));
    asm(LD_NC_V1 " {%0}, [%1 + 12];" :
        "=r"(code32[3]) : "l"(p));
    asm(LD_NC_V1 " {%0}, [%1 + 16];" :
        "=r"(code32[4]) : "l"(p));
  }
};

template<>
struct LoadCode32<24> {
  static inline __device__ void load(unsigned int code32[6],
                                     unsigned char* p,
                                     int offset) {
    p += offset * 24;
    // FIXME: this is a non-coalesced, unaligned, 2-vectorized load
    // unfortunately need to reorganize memory layout by warp
    asm(LD_NC_V2 " {%0, %1}, [%2 + 0];" :
        "=r"(code32[0]), "=r"(code32[1]) : "l"(p));
    asm(LD_NC_V2 " {%0, %1}, [%2 + 8];" :
        "=r"(code32[2]), "=r"(code32[3]) : "l"(p));
    asm(LD_NC_V2 " {%0, %1}, [%2 + 16];" :
        "=r"(code32[4]), "=r"(code32[5]) : "l"(p));
  }
};

template<>
struct LoadCode32<28> {
  static inline __device__ void load(unsigned int code32[7],
                                     unsigned char* p,
                                     int offset) {
    p += offset * 28;
    // FIXME: this is a non-coalesced, unaligned, non-vectorized load
    // unfortunately need to reorganize memory layout by warp
    asm(LD_NC_V1 " {%0}, [%1 + 0];" :
        "=r"(code32[0]) : "l"(p));
    asm(LD_NC_V1 " {%0}, [%1 + 4];" :
        "=r"(code32[1]) : "l"(p));
    asm(LD_NC_V1 " {%0}, [%1 + 8];" :
        "=r"(code32[2]) : "l"(p));
    asm(LD_NC_V1 " {%0}, [%1 + 12];" :
        "=r"(code32[3]) : "l"(p));
    asm(LD_NC_V1 " {%0}, [%1 + 16];" :
        "=r"(code32[4]) : "l"(p));
    asm(LD_NC_V1 " {%0}, [%1 + 20];" :
        "=r"(code32[5]) : "l"(p));
    asm(LD_NC_V1 " {%0}, [%1 + 24];" :
        "=r"(code32[6]) : "l"(p));
  }
};

template<>
struct LoadCode32<32> {
  static inline __device__ void load(unsigned int code32[8],
                                     unsigned char* p,
                                     int offset) {
    p += offset * 32;
    // FIXME: this is a non-coalesced load
    // unfortunately need to reorganize memory layout by warp
    asm(LD_NC_V4 " {%0, %1, %2, %3}, [%4];" :
        "=r"(code32[0]), "=r"(code32[1]),
        "=r"(code32[2]), "=r"(code32[3]) : "l"(p));
    asm(LD_NC_V4 " {%0, %1, %2, %3}, [%4 + 16];" :
        "=r"(code32[4]), "=r"(code32[5]),
        "=r"(code32[6]), "=r"(code32[7]) : "l"(p));
  }
};

template<>
struct LoadCode32<40> {
  static inline __device__ void load(unsigned int code32[10],
                                     unsigned char* p,
                                     int offset) {
    p += offset * 40;
    // FIXME: this is a non-coalesced, unaligned, 2-vectorized load
    // unfortunately need to reorganize memory layout by warp
    asm(LD_NC_V2 " {%0, %1}, [%2 + 0];" :
        "=r"(code32[0]), "=r"(code32[1]) : "l"(p));
    asm(LD_NC_V2 " {%0, %1}, [%2 + 8];" :
        "=r"(code32[2]), "=r"(code32[3]) : "l"(p));
    asm(LD_NC_V2 " {%0, %1}, [%2 + 16];" :
        "=r"(code32[4]), "=r"(code32[5]) : "l"(p));
    asm(LD_NC_V2 " {%0, %1}, [%2 + 24];" :
        "=r"(code32[6]), "=r"(code32[7]) : "l"(p));
    asm(LD_NC_V2 " {%0, %1}, [%2 + 32];" :
        "=r"(code32[8]), "=r"(code32[9]) : "l"(p));
  }
};

template<>
struct LoadCode32<48> {
  static inline __device__ void load(unsigned int code32[12],
                                     unsigned char* p,
                                     int offset) {
    p += offset * 48;
    // FIXME: this is a non-coalesced load
    // unfortunately need to reorganize memory layout by warp
    asm(LD_NC_V4 " {%0, %1, %2, %3}, [%4];" :
        "=r"(code32[0]), "=r"(code32[1]),
        "=r"(code32[2]), "=r"(code32[3]) : "l"(p));
    asm(LD_NC_V4 " {%0, %1, %2, %3}, [%4 + 16];" :
        "=r"(code32[4]), "=r"(code32[5]),
        "=r"(code32[6]), "=r"(code32[7]) : "l"(p));
    asm(LD_NC_V4 " {%0, %1, %2, %3}, [%4 + 32];" :
        "=r"(code32[8]), "=r"(code32[9]),
        "=r"(code32[10]), "=r"(code32[11]) : "l"(p));
  }
};

template<>
struct LoadCode32<56> {
  static inline __device__ void load(unsigned int code32[14],
                                     unsigned char* p,
                                     int offset) {
    p += offset * 56;
    // FIXME: this is a non-coalesced, unaligned, 2-vectorized load
    // unfortunately need to reorganize memory layout by warp
    asm(LD_NC_V2 " {%0, %1}, [%2 + 0];" :
        "=r"(code32[0]), "=r"(code32[1]) : "l"(p));
    asm(LD_NC_V2 " {%0, %1}, [%2 + 8];" :
        "=r"(code32[2]), "=r"(code32[3]) : "l"(p));
    asm(LD_NC_V2 " {%0, %1}, [%2 + 16];" :
        "=r"(code32[4]), "=r"(code32[5]) : "l"(p));
    asm(LD_NC_V2 " {%0, %1}, [%2 + 24];" :
        "=r"(code32[6]), "=r"(code32[7]) : "l"(p));
    asm(LD_NC_V2 " {%0, %1}, [%2 + 32];" :
        "=r"(code32[8]), "=r"(code32[9]) : "l"(p));
    asm(LD_NC_V2 " {%0, %1}, [%2 + 40];" :
        "=r"(code32[10]), "=r"(code32[11]) : "l"(p));
    asm(LD_NC_V2 " {%0, %1}, [%2 + 48];" :
        "=r"(code32[12]), "=r"(code32[13]) : "l"(p));
  }
};

template<>
struct LoadCode32<64> {
  static inline __device__ void load(unsigned int code32[16],
                                     unsigned char* p,
                                     int offset) {
    p += offset * 64;
    // FIXME: this is a non-coalesced load
    // unfortunately need to reorganize memory layout by warp
    asm(LD_NC_V4 " {%0, %1, %2, %3}, [%4];" :
        "=r"(code32[0]), "=r"(code32[1]),
        "=r"(code32[2]), "=r"(code32[3]) : "l"(p));
    asm(LD_NC_V4 " {%0, %1, %2, %3}, [%4 + 16];" :
        "=r"(code32[4]), "=r"(code32[5]),
        "=r"(code32[6]), "=r"(code32[7]) : "l"(p));
    asm(LD_NC_V4 " {%0, %1, %2, %3}, [%4 + 32];" :
        "=r"(code32[8]), "=r"(code32[9]),
        "=r"(code32[10]), "=r"(code32[11]) : "l"(p));
    asm(LD_NC_V4 " {%0, %1, %2, %3}, [%4 + 48];" :
        "=r"(code32[12]), "=r"(code32[13]),
        "=r"(code32[14]), "=r"(code32[15]) : "l"(p));
  }
};

template<>
struct LoadCode32<96> {
  static inline __device__ void load(unsigned int code32[24],
                                     unsigned char* p,
                                     int offset) {
    p += offset * 96;
    // FIXME: this is a non-coalesced load
    // unfortunately need to reorganize memory layout by warp
    asm(LD_NC_V4 " {%0, %1, %2, %3}, [%4];" :
        "=r"(code32[0]), "=r"(code32[1]),
        "=r"(code32[2]), "=r"(code32[3]) : "l"(p));
    asm(LD_NC_V4 " {%0, %1, %2, %3}, [%4 + 16];" :
        "=r"(code32[4]), "=r"(code32[5]),
        "=r"(code32[6]), "=r"(code32[7]) : "l"(p));
    asm(LD_NC_V4 " {%0, %1, %2, %3}, [%4 + 32];" :
        "=r"(code32[8]), "=r"(code32[9]),
        "=r"(code32[10]), "=r"(code32[11]) : "l"(p));
    asm(LD_NC_V4 " {%0, %1, %2, %3}, [%4 + 48];" :
        "=r"(code32[12]), "=r"(code32[13]),
        "=r"(code32[14]), "=r"(code32[15]) : "l"(p));
    asm(LD_NC_V4 " {%0, %1, %2, %3}, [%4 + 64];" :
        "=r"(code32[16]), "=r"(code32[17]),
        "=r"(code32[18]), "=r"(code32[19]) : "l"(p));
    asm(LD_NC_V4 " {%0, %1, %2, %3}, [%4 + 80];" :
        "=r"(code32[20]), "=r"(code32[21]),
        "=r"(code32[22]), "=r"(code32[23]) : "l"(p));
  }
};

} } // namespace
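
// ---------------------------------------------------------------------------
// Illustrative usage sketch (added, not part of the original header). The
// kernel name, distance-table layout, and all parameters below are
// hypothetical; it only shows how LoadCode32 and getByte are meant to compose
// in a PQ scanning kernel, assuming 16 sub-quantizers and 8-bit codes:
//
//   __global__ void pqScanExample(unsigned char* codes,        // [numVecs][16]
//                                 const float* distanceTable,  // [16][256]
//                                 float* outDistances,         // [numVecs]
//                                 int numVecs) {
//     int vec = blockIdx.x * blockDim.x + threadIdx.x;
//     if (vec >= numVecs) {
//       return;
//     }
//
//     // Pull all 16 code bytes for this vector into four 32-bit registers
//     unsigned int code32[4];
//     faiss::gpu::LoadCode32<16>::load(code32, codes, vec);
//
//     float dist = 0.0f;
//     for (int sq = 0; sq < 16; ++sq) {
//       // Extract the sq-th 8-bit code from the packed registers
//       unsigned int c =
//           faiss::gpu::getByte(code32[sq / 4], (sq % 4) * 8, 8);
//       dist += distanceTable[sq * 256 + c];
//     }
//
//     outDistances[vec] = dist;
//   }
// ---------------------------------------------------------------------------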