PQCodeLoad.cuh
/**
 * Copyright (c) 2015-present, Facebook, Inc.
 * All rights reserved.
 *
 * This source code is licensed under the CC-by-NC license found in the
 * LICENSE file in the root directory of this source tree.
 */

// Copyright 2004-present Facebook. All Rights Reserved.

#pragma once

#include "../utils/PtxUtils.cuh"
namespace faiss { namespace gpu {

///
/// This file contains loader functions for PQ codes of various byte
/// length.
///

// Type-specific wrappers around the PTX bfe.* instruction, for
// quantization code extraction

// An 8-bit code occupies the entire byte, so no bitfield extraction
// is needed; `pos` and `width` are ignored here
inline __device__ unsigned int getByte(unsigned char v,
                                       int pos,
                                       int width) {
  return v;
}

inline __device__ unsigned int getByte(unsigned short v,
                                       int pos,
                                       int width) {
  return getBitfield((unsigned int) v, pos, width);
}

inline __device__ unsigned int getByte(unsigned int v,
                                       int pos,
                                       int width) {
  return getBitfield(v, pos, width);
}

inline __device__ unsigned int getByte(unsigned long v,
                                       int pos,
                                       int width) {
  return getBitfield(v, pos, width);
}
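
// A minimal usage sketch (not part of the original file): extracting
// one packed sub-quantizer code from a 32-bit word with getByte. The
// kernel name and data layout below are hypothetical, chosen only to
// illustrate the call.
__global__ void getByteExampleKernel(const unsigned int* packed,
                                     unsigned int* out) {
  unsigned int v = packed[threadIdx.x];

  // Extract the third 8-bit code (bits 16..23); equivalent to
  // (v >> 16) & 0xff, but maps to a single bfe instruction
  out[threadIdx.x] = getByte(v, 16, 8);
}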

// Loads the packed PQ code for a single vector into 32-bit words.
// NumSubQuantizers is the code size in bytes; each specialization
// fills code32 with ceil(NumSubQuantizers / 4) words. `p` points to
// the base of the packed code array and `offset` is the index of the
// vector within it.
template <int NumSubQuantizers>
struct LoadCode32 {};

template<>
struct LoadCode32<1> {
  static inline __device__ void load(unsigned int code32[1],
                                     unsigned char* p,
                                     int offset) {
    p += offset * 1;
    asm("ld.global.cs.u8 {%0}, [%1];" :
        "=r"(code32[0]) : "l"(p));
  }
};

template<>
struct LoadCode32<2> {
  static inline __device__ void load(unsigned int code32[1],
                                     unsigned char* p,
                                     int offset) {
    p += offset * 2;
    asm("ld.global.cs.u16 {%0}, [%1];" :
        "=r"(code32[0]) : "l"(p));
  }
};

template<>
struct LoadCode32<3> {
  static inline __device__ void load(unsigned int code32[1],
                                     unsigned char* p,
                                     int offset) {
    p += offset * 3;
    unsigned int a;
    unsigned int b;
    unsigned int c;

    // FIXME: this is a non-coalesced, unaligned, non-vectorized load
    // unfortunately need to reorganize memory layout by warp
    asm("ld.global.cs.u8 {%0}, [%1 + 0];" :
        "=r"(a) : "l"(p));
    asm("ld.global.cs.u8 {%0}, [%1 + 1];" :
        "=r"(b) : "l"(p));
    asm("ld.global.cs.u8 {%0}, [%1 + 2];" :
        "=r"(c) : "l"(p));

    // FIXME: this is also slow, since we have to recover the
    // individual bytes loaded
    code32[0] = (c << 16) | (b << 8) | a;
  }
};

template<>
struct LoadCode32<4> {
  static inline __device__ void load(unsigned int code32[1],
                                     unsigned char* p,
                                     int offset) {
    p += offset * 4;
    asm("ld.global.cs.u32 {%0}, [%1];" :
        "=r"(code32[0]) : "l"(p));
  }
};

template<>
struct LoadCode32<8> {
  static inline __device__ void load(unsigned int code32[2],
                                     unsigned char* p,
                                     int offset) {
    p += offset * 8;
    asm("ld.global.cs.v2.u32 {%0, %1}, [%2];" :
        "=r"(code32[0]), "=r"(code32[1]) : "l"(p));
  }
};

template<>
struct LoadCode32<12> {
  static inline __device__ void load(unsigned int code32[3],
                                     unsigned char* p,
                                     int offset) {
    p += offset * 12;
    // FIXME: this is a non-coalesced, unaligned, non-vectorized load
    // unfortunately need to reorganize memory layout by warp
    asm("ld.global.cs.nc.u32 {%0}, [%1 + 0];" :
        "=r"(code32[0]) : "l"(p));
    asm("ld.global.cs.nc.u32 {%0}, [%1 + 4];" :
        "=r"(code32[1]) : "l"(p));
    asm("ld.global.cs.nc.u32 {%0}, [%1 + 8];" :
        "=r"(code32[2]) : "l"(p));
  }
};

template<>
struct LoadCode32<16> {
  static inline __device__ void load(unsigned int code32[4],
                                     unsigned char* p,
                                     int offset) {
    p += offset * 16;
    asm("ld.global.cs.v4.u32 {%0, %1, %2, %3}, [%4];" :
        "=r"(code32[0]), "=r"(code32[1]),
        "=r"(code32[2]), "=r"(code32[3]) : "l"(p));
  }
};

template<>
struct LoadCode32<20> {
  static inline __device__ void load(unsigned int code32[5],
                                     unsigned char* p,
                                     int offset) {
    p += offset * 20;
    // FIXME: this is a non-coalesced, unaligned, non-vectorized load
    // unfortunately need to reorganize memory layout by warp
    asm("ld.global.cs.nc.u32 {%0}, [%1 + 0];" :
        "=r"(code32[0]) : "l"(p));
    asm("ld.global.cs.nc.u32 {%0}, [%1 + 4];" :
        "=r"(code32[1]) : "l"(p));
    asm("ld.global.cs.nc.u32 {%0}, [%1 + 8];" :
        "=r"(code32[2]) : "l"(p));
    asm("ld.global.cs.nc.u32 {%0}, [%1 + 12];" :
        "=r"(code32[3]) : "l"(p));
    asm("ld.global.cs.nc.u32 {%0}, [%1 + 16];" :
        "=r"(code32[4]) : "l"(p));
  }
};

template<>
struct LoadCode32<24> {
  static inline __device__ void load(unsigned int code32[6],
                                     unsigned char* p,
                                     int offset) {
    p += offset * 24;
    // FIXME: this is a non-coalesced, unaligned, 2-vectorized load
    // unfortunately need to reorganize memory layout by warp
    asm("ld.global.cs.nc.v2.u32 {%0, %1}, [%2 + 0];" :
        "=r"(code32[0]), "=r"(code32[1]) : "l"(p));
    asm("ld.global.cs.nc.v2.u32 {%0, %1}, [%2 + 8];" :
        "=r"(code32[2]), "=r"(code32[3]) : "l"(p));
    asm("ld.global.cs.nc.v2.u32 {%0, %1}, [%2 + 16];" :
        "=r"(code32[4]), "=r"(code32[5]) : "l"(p));
  }
};

template<>
struct LoadCode32<28> {
  static inline __device__ void load(unsigned int code32[7],
                                     unsigned char* p,
                                     int offset) {
    p += offset * 28;
    // FIXME: this is a non-coalesced, unaligned, non-vectorized load
    // unfortunately need to reorganize memory layout by warp
    asm("ld.global.cs.nc.u32 {%0}, [%1 + 0];" :
        "=r"(code32[0]) : "l"(p));
    asm("ld.global.cs.nc.u32 {%0}, [%1 + 4];" :
        "=r"(code32[1]) : "l"(p));
    asm("ld.global.cs.nc.u32 {%0}, [%1 + 8];" :
        "=r"(code32[2]) : "l"(p));
    asm("ld.global.cs.nc.u32 {%0}, [%1 + 12];" :
        "=r"(code32[3]) : "l"(p));
    asm("ld.global.cs.nc.u32 {%0}, [%1 + 16];" :
        "=r"(code32[4]) : "l"(p));
    asm("ld.global.cs.nc.u32 {%0}, [%1 + 20];" :
        "=r"(code32[5]) : "l"(p));
    asm("ld.global.cs.nc.u32 {%0}, [%1 + 24];" :
        "=r"(code32[6]) : "l"(p));
  }
};

template<>
struct LoadCode32<32> {
  static inline __device__ void load(unsigned int code32[8],
                                     unsigned char* p,
                                     int offset) {
    p += offset * 32;
    // FIXME: this is a non-coalesced load
    // unfortunately need to reorganize memory layout by warp
    asm("ld.global.cs.nc.v4.u32 {%0, %1, %2, %3}, [%4];" :
        "=r"(code32[0]), "=r"(code32[1]),
        "=r"(code32[2]), "=r"(code32[3]) : "l"(p));
    asm("ld.global.cs.nc.v4.u32 {%0, %1, %2, %3}, [%4 + 16];" :
        "=r"(code32[4]), "=r"(code32[5]),
        "=r"(code32[6]), "=r"(code32[7]) : "l"(p));
  }
};

template<>
struct LoadCode32<40> {
  static inline __device__ void load(unsigned int code32[10],
                                     unsigned char* p,
                                     int offset) {
    p += offset * 40;
    // FIXME: this is a non-coalesced, unaligned, 2-vectorized load
    // unfortunately need to reorganize memory layout by warp
    asm("ld.global.cs.nc.v2.u32 {%0, %1}, [%2 + 0];" :
        "=r"(code32[0]), "=r"(code32[1]) : "l"(p));
    asm("ld.global.cs.nc.v2.u32 {%0, %1}, [%2 + 8];" :
        "=r"(code32[2]), "=r"(code32[3]) : "l"(p));
    asm("ld.global.cs.nc.v2.u32 {%0, %1}, [%2 + 16];" :
        "=r"(code32[4]), "=r"(code32[5]) : "l"(p));
    asm("ld.global.cs.nc.v2.u32 {%0, %1}, [%2 + 24];" :
        "=r"(code32[6]), "=r"(code32[7]) : "l"(p));
    asm("ld.global.cs.nc.v2.u32 {%0, %1}, [%2 + 32];" :
        "=r"(code32[8]), "=r"(code32[9]) : "l"(p));
  }
};

template<>
struct LoadCode32<48> {
  static inline __device__ void load(unsigned int code32[12],
                                     unsigned char* p,
                                     int offset) {
    p += offset * 48;
    // FIXME: this is a non-coalesced load
    // unfortunately need to reorganize memory layout by warp
    asm("ld.global.cs.nc.v4.u32 {%0, %1, %2, %3}, [%4];" :
        "=r"(code32[0]), "=r"(code32[1]),
        "=r"(code32[2]), "=r"(code32[3]) : "l"(p));
    asm("ld.global.cs.nc.v4.u32 {%0, %1, %2, %3}, [%4 + 16];" :
        "=r"(code32[4]), "=r"(code32[5]),
        "=r"(code32[6]), "=r"(code32[7]) : "l"(p));
    asm("ld.global.cs.nc.v4.u32 {%0, %1, %2, %3}, [%4 + 32];" :
        "=r"(code32[8]), "=r"(code32[9]),
        "=r"(code32[10]), "=r"(code32[11]) : "l"(p));
  }
};

template<>
struct LoadCode32<56> {
  static inline __device__ void load(unsigned int code32[14],
                                     unsigned char* p,
                                     int offset) {
    p += offset * 56;
    // FIXME: this is a non-coalesced, unaligned, 2-vectorized load
    // unfortunately need to reorganize memory layout by warp
    asm("ld.global.cs.nc.v2.u32 {%0, %1}, [%2 + 0];" :
        "=r"(code32[0]), "=r"(code32[1]) : "l"(p));
    asm("ld.global.cs.nc.v2.u32 {%0, %1}, [%2 + 8];" :
        "=r"(code32[2]), "=r"(code32[3]) : "l"(p));
    asm("ld.global.cs.nc.v2.u32 {%0, %1}, [%2 + 16];" :
        "=r"(code32[4]), "=r"(code32[5]) : "l"(p));
    asm("ld.global.cs.nc.v2.u32 {%0, %1}, [%2 + 24];" :
        "=r"(code32[6]), "=r"(code32[7]) : "l"(p));
    asm("ld.global.cs.nc.v2.u32 {%0, %1}, [%2 + 32];" :
        "=r"(code32[8]), "=r"(code32[9]) : "l"(p));
    asm("ld.global.cs.nc.v2.u32 {%0, %1}, [%2 + 40];" :
        "=r"(code32[10]), "=r"(code32[11]) : "l"(p));
    asm("ld.global.cs.nc.v2.u32 {%0, %1}, [%2 + 48];" :
        "=r"(code32[12]), "=r"(code32[13]) : "l"(p));
  }
};

template<>
struct LoadCode32<64> {
  static inline __device__ void load(unsigned int code32[16],
                                     unsigned char* p,
                                     int offset) {
    p += offset * 64;
    // FIXME: this is a non-coalesced load
    // unfortunately need to reorganize memory layout by warp
    asm("ld.global.cs.nc.v4.u32 {%0, %1, %2, %3}, [%4];" :
        "=r"(code32[0]), "=r"(code32[1]),
        "=r"(code32[2]), "=r"(code32[3]) : "l"(p));
    asm("ld.global.cs.nc.v4.u32 {%0, %1, %2, %3}, [%4 + 16];" :
        "=r"(code32[4]), "=r"(code32[5]),
        "=r"(code32[6]), "=r"(code32[7]) : "l"(p));
    asm("ld.global.cs.nc.v4.u32 {%0, %1, %2, %3}, [%4 + 32];" :
        "=r"(code32[8]), "=r"(code32[9]),
        "=r"(code32[10]), "=r"(code32[11]) : "l"(p));
    asm("ld.global.cs.nc.v4.u32 {%0, %1, %2, %3}, [%4 + 48];" :
        "=r"(code32[12]), "=r"(code32[13]),
        "=r"(code32[14]), "=r"(code32[15]) : "l"(p));
  }
};

template<>
struct LoadCode32<96> {
  static inline __device__ void load(unsigned int code32[24],
                                     unsigned char* p,
                                     int offset) {
    p += offset * 96;
    // FIXME: this is a non-coalesced load
    // unfortunately need to reorganize memory layout by warp
    asm("ld.global.cs.nc.v4.u32 {%0, %1, %2, %3}, [%4];" :
        "=r"(code32[0]), "=r"(code32[1]),
        "=r"(code32[2]), "=r"(code32[3]) : "l"(p));
    asm("ld.global.cs.nc.v4.u32 {%0, %1, %2, %3}, [%4 + 16];" :
        "=r"(code32[4]), "=r"(code32[5]),
        "=r"(code32[6]), "=r"(code32[7]) : "l"(p));
    asm("ld.global.cs.nc.v4.u32 {%0, %1, %2, %3}, [%4 + 32];" :
        "=r"(code32[8]), "=r"(code32[9]),
        "=r"(code32[10]), "=r"(code32[11]) : "l"(p));
    asm("ld.global.cs.nc.v4.u32 {%0, %1, %2, %3}, [%4 + 48];" :
        "=r"(code32[12]), "=r"(code32[13]),
        "=r"(code32[14]), "=r"(code32[15]) : "l"(p));
    asm("ld.global.cs.nc.v4.u32 {%0, %1, %2, %3}, [%4 + 64];" :
        "=r"(code32[16]), "=r"(code32[17]),
        "=r"(code32[18]), "=r"(code32[19]) : "l"(p));
    asm("ld.global.cs.nc.v4.u32 {%0, %1, %2, %3}, [%4 + 80];" :
        "=r"(code32[20]), "=r"(code32[21]),
        "=r"(code32[22]), "=r"(code32[23]) : "l"(p));
  }
};
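
// A minimal usage sketch (not part of the original file): a
// hypothetical kernel that loads the full 8-byte code of one vector
// per thread and unpacks its sub-quantizer codes with getByte. The
// kernel name and output layout are illustrative assumptions, not the
// kernels Faiss actually builds on top of these loaders.
__global__ void loadCode32ExampleKernel(unsigned char* codes,
                                        unsigned int* out,
                                        int numVecs) {
  int vec = blockIdx.x * blockDim.x + threadIdx.x;
  if (vec >= numVecs) {
    return;
  }

  // Two 32-bit words hold the 8 one-byte codes for this vector
  unsigned int code32[2];
  LoadCode32<8>::load(code32, codes, vec);

  for (int i = 0; i < 8; ++i) {
    // Select the word containing byte i, then extract bits
    // (i % 4) * 8 .. (i % 4) * 8 + 7
    out[vec * 8 + i] = getByte(code32[i / 4], (i % 4) * 8, 8);
  }
}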

} } // namespace