PQCodeLoad.cuh
/**
 * Copyright (c) 2015-present, Facebook, Inc.
 * All rights reserved.
 *
 * This source code is licensed under the CC-by-NC license found in the
 * LICENSE file in the root directory of this source tree.
 */

// Copyright 2004-present Facebook. All Rights Reserved.

#pragma once

#include "../utils/PtxUtils.cuh"

namespace faiss { namespace gpu {

///
/// This file contains loader functions for PQ codes of various byte
/// lengths.
///

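//
// A rough picture of intended use (a sketch only, not part of this header):
// a PQ scanning kernel picks the LoadCode32<> specialization that matches
// its number of 8-bit codes per vector, loads the packed codes for one
// vector into 32-bit registers, and then extracts each sub-quantizer code
// with the getByte() wrappers below in order to index its distance lookup
// tables. A concrete sketch follows the getByte() overloads.
//
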
// Type-specific wrappers around the PTX bfe.* instruction, for
// quantization code extraction
inline __device__ unsigned int getByte(unsigned char v,
                                       int pos,
                                       int width) {
  return v;
}

inline __device__ unsigned int getByte(unsigned short v,
                                       int pos,
                                       int width) {
  return getBitfield((unsigned int) v, pos, width);
}

inline __device__ unsigned int getByte(unsigned int v,
                                       int pos,
                                       int width) {
  return getBitfield(v, pos, width);
}

inline __device__ unsigned int getByte(unsigned long v,
                                       int pos,
                                       int width) {
  return getBitfield(v, pos, width);
}
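
// A minimal usage sketch, shown only for illustration ("codeList" and
// "vectorIndex" are assumed names, not part of this header). For 16
// sub-quantizers with 8-bit codes, the 16 packed code bytes of one vector
// are pulled into four 32-bit registers and then unpacked one code at a
// time:
//
//   unsigned int code32[4];
//   LoadCode32<16>::load(code32, codeList, vectorIndex);
//
//   for (int i = 0; i < 16; ++i) {
//     // byte (i % 4) of word (i / 4), given the little-endian packing
//     // produced by the loaders below
//     unsigned int code = getByte(code32[i / 4], (i % 4) * 8, 8);
//     // ... use `code` to index the distance table for sub-quantizer i ...
//   }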

template <int NumSubQuantizers>
struct LoadCode32 {};

template<>
struct LoadCode32<1> {
  static inline __device__ void load(unsigned int code32[1],
                                     unsigned char* p,
                                     int offset) {
    p += offset * 1;
    asm("ld.global.cs.u8 {%0}, [%1];" :
        "=r"(code32[0]) : "l"(p));
  }
};

template<>
struct LoadCode32<2> {
  static inline __device__ void load(unsigned int code32[1],
                                     unsigned char* p,
                                     int offset) {
    p += offset * 2;
    asm("ld.global.cs.u16 {%0}, [%1];" :
        "=r"(code32[0]) : "l"(p));
  }
};

template<>
struct LoadCode32<3> {
  static inline __device__ void load(unsigned int code32[1],
                                     unsigned char* p,
                                     int offset) {
    p += offset * 3;
    unsigned int a;
    unsigned int b;
    unsigned int c;

    // FIXME: this is a non-coalesced, unaligned, non-vectorized load
    // unfortunately need to reorganize memory layout by warp
    asm("ld.global.cs.u8 {%0}, [%1 + 0];" :
        "=r"(a) : "l"(p));
    asm("ld.global.cs.u8 {%0}, [%1 + 1];" :
        "=r"(b) : "l"(p));
    asm("ld.global.cs.u8 {%0}, [%1 + 2];" :
        "=r"(c) : "l"(p));

    // FIXME: this is also slow, since we have to recover the
    // individual bytes loaded
    code32[0] = (c << 16) | (b << 8) | a;
  }
};

template<>
struct LoadCode32<4> {
  static inline __device__ void load(unsigned int code32[1],
                                     unsigned char* p,
                                     int offset) {
    p += offset * 4;
    asm("ld.global.cs.u32 {%0}, [%1];" :
        "=r"(code32[0]) : "l"(p));
  }
};

template<>
struct LoadCode32<8> {
  static inline __device__ void load(unsigned int code32[2],
                                     unsigned char* p,
                                     int offset) {
    p += offset * 8;
    asm("ld.global.cs.v2.u32 {%0, %1}, [%2];" :
        "=r"(code32[0]), "=r"(code32[1]) : "l"(p));
  }
};

template<>
struct LoadCode32<12> {
  static inline __device__ void load(unsigned int code32[3],
                                     unsigned char* p,
                                     int offset) {
    p += offset * 12;
    // FIXME: this is a non-coalesced, unaligned, non-vectorized load
    // unfortunately need to reorganize memory layout by warp
    asm("ld.global.cs.nc.u32 {%0}, [%1 + 0];" :
        "=r"(code32[0]) : "l"(p));
    asm("ld.global.cs.nc.u32 {%0}, [%1 + 4];" :
        "=r"(code32[1]) : "l"(p));
    asm("ld.global.cs.nc.u32 {%0}, [%1 + 8];" :
        "=r"(code32[2]) : "l"(p));
  }
};

template<>
struct LoadCode32<16> {
  static inline __device__ void load(unsigned int code32[4],
                                     unsigned char* p,
                                     int offset) {
    p += offset * 16;
    asm("ld.global.cs.v4.u32 {%0, %1, %2, %3}, [%4];" :
        "=r"(code32[0]), "=r"(code32[1]),
        "=r"(code32[2]), "=r"(code32[3]) : "l"(p));
  }
};

template<>
struct LoadCode32<20> {
  static inline __device__ void load(unsigned int code32[5],
                                     unsigned char* p,
                                     int offset) {
    p += offset * 20;
    // FIXME: this is a non-coalesced, unaligned, non-vectorized load
    // unfortunately need to reorganize memory layout by warp
    asm("ld.global.cs.nc.u32 {%0}, [%1 + 0];" :
        "=r"(code32[0]) : "l"(p));
    asm("ld.global.cs.nc.u32 {%0}, [%1 + 4];" :
        "=r"(code32[1]) : "l"(p));
    asm("ld.global.cs.nc.u32 {%0}, [%1 + 8];" :
        "=r"(code32[2]) : "l"(p));
    asm("ld.global.cs.nc.u32 {%0}, [%1 + 12];" :
        "=r"(code32[3]) : "l"(p));
    asm("ld.global.cs.nc.u32 {%0}, [%1 + 16];" :
        "=r"(code32[4]) : "l"(p));
  }
};

template<>
struct LoadCode32<24> {
  static inline __device__ void load(unsigned int code32[6],
                                     unsigned char* p,
                                     int offset) {
    p += offset * 24;
    // FIXME: this is a non-coalesced, unaligned, non-vectorized load
    // unfortunately need to reorganize memory layout by warp
    asm("ld.global.cs.nc.u32 {%0}, [%1 + 0];" :
        "=r"(code32[0]) : "l"(p));
    asm("ld.global.cs.nc.u32 {%0}, [%1 + 4];" :
        "=r"(code32[1]) : "l"(p));
    asm("ld.global.cs.nc.u32 {%0}, [%1 + 8];" :
        "=r"(code32[2]) : "l"(p));
    asm("ld.global.cs.nc.u32 {%0}, [%1 + 12];" :
        "=r"(code32[3]) : "l"(p));
    asm("ld.global.cs.nc.u32 {%0}, [%1 + 16];" :
        "=r"(code32[4]) : "l"(p));
    asm("ld.global.cs.nc.u32 {%0}, [%1 + 20];" :
        "=r"(code32[5]) : "l"(p));
  }
};

template<>
struct LoadCode32<28> {
  static inline __device__ void load(unsigned int code32[7],
                                     unsigned char* p,
                                     int offset) {
    p += offset * 28;
    // FIXME: this is a non-coalesced, unaligned, non-vectorized load
    // unfortunately need to reorganize memory layout by warp
    asm("ld.global.cs.nc.u32 {%0}, [%1 + 0];" :
        "=r"(code32[0]) : "l"(p));
    asm("ld.global.cs.nc.u32 {%0}, [%1 + 4];" :
        "=r"(code32[1]) : "l"(p));
    asm("ld.global.cs.nc.u32 {%0}, [%1 + 8];" :
        "=r"(code32[2]) : "l"(p));
    asm("ld.global.cs.nc.u32 {%0}, [%1 + 12];" :
        "=r"(code32[3]) : "l"(p));
    asm("ld.global.cs.nc.u32 {%0}, [%1 + 16];" :
        "=r"(code32[4]) : "l"(p));
    asm("ld.global.cs.nc.u32 {%0}, [%1 + 20];" :
        "=r"(code32[5]) : "l"(p));
    asm("ld.global.cs.nc.u32 {%0}, [%1 + 24];" :
        "=r"(code32[6]) : "l"(p));
  }
};

template<>
struct LoadCode32<32> {
  static inline __device__ void load(unsigned int code32[8],
                                     unsigned char* p,
                                     int offset) {
    p += offset * 32;
    // FIXME: this is a non-coalesced load
    // unfortunately need to reorganize memory layout by warp
    asm("ld.global.cs.nc.v4.u32 {%0, %1, %2, %3}, [%4];" :
        "=r"(code32[0]), "=r"(code32[1]),
        "=r"(code32[2]), "=r"(code32[3]) : "l"(p));
    asm("ld.global.cs.nc.v4.u32 {%0, %1, %2, %3}, [%4 + 16];" :
        "=r"(code32[4]), "=r"(code32[5]),
        "=r"(code32[6]), "=r"(code32[7]) : "l"(p));
  }
};

template<>
struct LoadCode32<40> {
  static inline __device__ void load(unsigned int code32[10],
                                     unsigned char* p,
                                     int offset) {
    p += offset * 40;
    // FIXME: this is a non-coalesced, unaligned, non-vectorized load
    // unfortunately need to reorganize memory layout by warp
    asm("ld.global.cs.nc.u32 {%0}, [%1 + 0];" :
        "=r"(code32[0]) : "l"(p));
    asm("ld.global.cs.nc.u32 {%0}, [%1 + 4];" :
        "=r"(code32[1]) : "l"(p));
    asm("ld.global.cs.nc.u32 {%0}, [%1 + 8];" :
        "=r"(code32[2]) : "l"(p));
    asm("ld.global.cs.nc.u32 {%0}, [%1 + 12];" :
        "=r"(code32[3]) : "l"(p));
    asm("ld.global.cs.nc.u32 {%0}, [%1 + 16];" :
        "=r"(code32[4]) : "l"(p));
    asm("ld.global.cs.nc.u32 {%0}, [%1 + 20];" :
        "=r"(code32[5]) : "l"(p));
    asm("ld.global.cs.nc.u32 {%0}, [%1 + 24];" :
        "=r"(code32[6]) : "l"(p));
    asm("ld.global.cs.nc.u32 {%0}, [%1 + 28];" :
        "=r"(code32[7]) : "l"(p));
    asm("ld.global.cs.nc.u32 {%0}, [%1 + 32];" :
        "=r"(code32[8]) : "l"(p));
    asm("ld.global.cs.nc.u32 {%0}, [%1 + 36];" :
        "=r"(code32[9]) : "l"(p));
  }
};

template<>
struct LoadCode32<48> {
  static inline __device__ void load(unsigned int code32[12],
                                     unsigned char* p,
                                     int offset) {
    p += offset * 48;
    // FIXME: this is a non-coalesced, unaligned, non-vectorized load
    // unfortunately need to reorganize memory layout by warp
    asm("ld.global.cs.nc.u32 {%0}, [%1 + 0];" :
        "=r"(code32[0]) : "l"(p));
    asm("ld.global.cs.nc.u32 {%0}, [%1 + 4];" :
        "=r"(code32[1]) : "l"(p));
    asm("ld.global.cs.nc.u32 {%0}, [%1 + 8];" :
        "=r"(code32[2]) : "l"(p));
    asm("ld.global.cs.nc.u32 {%0}, [%1 + 12];" :
        "=r"(code32[3]) : "l"(p));
    asm("ld.global.cs.nc.u32 {%0}, [%1 + 16];" :
        "=r"(code32[4]) : "l"(p));
    asm("ld.global.cs.nc.u32 {%0}, [%1 + 20];" :
        "=r"(code32[5]) : "l"(p));
    asm("ld.global.cs.nc.u32 {%0}, [%1 + 24];" :
        "=r"(code32[6]) : "l"(p));
    asm("ld.global.cs.nc.u32 {%0}, [%1 + 28];" :
        "=r"(code32[7]) : "l"(p));
    asm("ld.global.cs.nc.u32 {%0}, [%1 + 32];" :
        "=r"(code32[8]) : "l"(p));
    asm("ld.global.cs.nc.u32 {%0}, [%1 + 36];" :
        "=r"(code32[9]) : "l"(p));
    asm("ld.global.cs.nc.u32 {%0}, [%1 + 40];" :
        "=r"(code32[10]) : "l"(p));
    asm("ld.global.cs.nc.u32 {%0}, [%1 + 44];" :
        "=r"(code32[11]) : "l"(p));
  }
};

template<>
struct LoadCode32<56> {
  static inline __device__ void load(unsigned int code32[14],
                                     unsigned char* p,
                                     int offset) {
    p += offset * 56;
    // FIXME: this is a non-coalesced, unaligned, non-vectorized load
    // unfortunately need to reorganize memory layout by warp
    asm("ld.global.cs.nc.u32 {%0}, [%1 + 0];" :
        "=r"(code32[0]) : "l"(p));
    asm("ld.global.cs.nc.u32 {%0}, [%1 + 4];" :
        "=r"(code32[1]) : "l"(p));
    asm("ld.global.cs.nc.u32 {%0}, [%1 + 8];" :
        "=r"(code32[2]) : "l"(p));
    asm("ld.global.cs.nc.u32 {%0}, [%1 + 12];" :
        "=r"(code32[3]) : "l"(p));
    asm("ld.global.cs.nc.u32 {%0}, [%1 + 16];" :
        "=r"(code32[4]) : "l"(p));
    asm("ld.global.cs.nc.u32 {%0}, [%1 + 20];" :
        "=r"(code32[5]) : "l"(p));
    asm("ld.global.cs.nc.u32 {%0}, [%1 + 24];" :
        "=r"(code32[6]) : "l"(p));
    asm("ld.global.cs.nc.u32 {%0}, [%1 + 28];" :
        "=r"(code32[7]) : "l"(p));
    asm("ld.global.cs.nc.u32 {%0}, [%1 + 32];" :
        "=r"(code32[8]) : "l"(p));
    asm("ld.global.cs.nc.u32 {%0}, [%1 + 36];" :
        "=r"(code32[9]) : "l"(p));
    asm("ld.global.cs.nc.u32 {%0}, [%1 + 40];" :
        "=r"(code32[10]) : "l"(p));
    asm("ld.global.cs.nc.u32 {%0}, [%1 + 44];" :
        "=r"(code32[11]) : "l"(p));
    asm("ld.global.cs.nc.u32 {%0}, [%1 + 48];" :
        "=r"(code32[12]) : "l"(p));
    asm("ld.global.cs.nc.u32 {%0}, [%1 + 52];" :
        "=r"(code32[13]) : "l"(p));
  }
};

template<>
struct LoadCode32<64> {
  static inline __device__ void load(unsigned int code32[16],
                                     unsigned char* p,
                                     int offset) {
    p += offset * 64;
    // FIXME: this is a non-coalesced load
    // unfortunately need to reorganize memory layout by warp
    asm("ld.global.cs.nc.v4.u32 {%0, %1, %2, %3}, [%4];" :
        "=r"(code32[0]), "=r"(code32[1]),
        "=r"(code32[2]), "=r"(code32[3]) : "l"(p));
    asm("ld.global.cs.nc.v4.u32 {%0, %1, %2, %3}, [%4 + 16];" :
        "=r"(code32[4]), "=r"(code32[5]),
        "=r"(code32[6]), "=r"(code32[7]) : "l"(p));
    asm("ld.global.cs.nc.v4.u32 {%0, %1, %2, %3}, [%4 + 32];" :
        "=r"(code32[8]), "=r"(code32[9]),
        "=r"(code32[10]), "=r"(code32[11]) : "l"(p));
    asm("ld.global.cs.nc.v4.u32 {%0, %1, %2, %3}, [%4 + 48];" :
        "=r"(code32[12]), "=r"(code32[13]),
        "=r"(code32[14]), "=r"(code32[15]) : "l"(p));
  }
};

} } // namespace