mirror of
https://github.com/facebookresearch/faiss.git
synced 2025-06-03 21:54:02 +08:00
Various bugfixes from GitHub issues; k-means with some frozen centroids; GPU: better tiling for large flat datasets; default AVX for vector ops.
716 lines
90 KiB
HTML
716 lines
90 KiB
HTML
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
|
|
<html xmlns="http://www.w3.org/1999/xhtml">
|
|
<head>
|
|
<meta http-equiv="Content-Type" content="text/xhtml;charset=UTF-8"/>
|
|
<meta http-equiv="X-UA-Compatible" content="IE=9"/>
|
|
<meta name="generator" content="Doxygen 1.8.5"/>
|
|
<title>Faiss: /data/users/matthijs/github_faiss/faiss/gpu/impl/PQScanMultiPassNoPrecomputed.cu Source File</title>
|
|
<link href="tabs.css" rel="stylesheet" type="text/css"/>
|
|
<script type="text/javascript" src="jquery.js"></script>
|
|
<script type="text/javascript" src="dynsections.js"></script>
|
|
<link href="search/search.css" rel="stylesheet" type="text/css"/>
|
|
<script type="text/javascript" src="search/search.js"></script>
|
|
<script type="text/javascript">
|
|
$(document).ready(function() { searchBox.OnSelectItem(0); });
|
|
</script>
|
|
<link href="doxygen.css" rel="stylesheet" type="text/css" />
|
|
</head>
|
|
<body>
|
|
<div id="top"><!-- do not remove this div, it is closed by doxygen! -->
|
|
<div id="titlearea">
|
|
<table cellspacing="0" cellpadding="0">
|
|
<tbody>
|
|
<tr style="height: 56px;">
|
|
<td style="padding-left: 0.5em;">
|
|
<div id="projectname">Faiss
|
|
</div>
|
|
</td>
|
|
</tr>
|
|
</tbody>
|
|
</table>
|
|
</div>
|
|
<!-- end header part -->
|
|
<!-- Generated by Doxygen 1.8.5 -->
|
|
<script type="text/javascript">
|
|
var searchBox = new SearchBox("searchBox", "search",false,'Search');
|
|
</script>
|
|
<div id="navrow1" class="tabs">
|
|
<ul class="tablist">
|
|
<li><a href="index.html"><span>Main Page</span></a></li>
|
|
<li><a href="namespaces.html"><span>Namespaces</span></a></li>
|
|
<li><a href="annotated.html"><span>Classes</span></a></li>
|
|
<li class="current"><a href="files.html"><span>Files</span></a></li>
|
|
<li>
|
|
<div id="MSearchBox" class="MSearchBoxInactive">
|
|
<span class="left">
|
|
<img id="MSearchSelect" src="search/mag_sel.png"
|
|
onmouseover="return searchBox.OnSearchSelectShow()"
|
|
onmouseout="return searchBox.OnSearchSelectHide()"
|
|
alt=""/>
|
|
<input type="text" id="MSearchField" value="Search" accesskey="S"
|
|
onfocus="searchBox.OnSearchFieldFocus(true)"
|
|
onblur="searchBox.OnSearchFieldFocus(false)"
|
|
onkeyup="searchBox.OnSearchFieldChange(event)"/>
|
|
</span><span class="right">
|
|
<a id="MSearchClose" href="javascript:searchBox.CloseResultsWindow()"><img id="MSearchCloseImg" border="0" src="search/close.png" alt=""/></a>
|
|
</span>
|
|
</div>
|
|
</li>
|
|
</ul>
|
|
</div>
|
|
<div id="navrow2" class="tabs2">
|
|
<ul class="tablist">
|
|
<li><a href="files.html"><span>File List</span></a></li>
|
|
</ul>
|
|
</div>
|
|
<!-- window showing the filter options -->
|
|
<div id="MSearchSelectWindow"
|
|
onmouseover="return searchBox.OnSearchSelectShow()"
|
|
onmouseout="return searchBox.OnSearchSelectHide()"
|
|
onkeydown="return searchBox.OnSearchSelectKey(event)">
|
|
<a class="SelectItem" href="javascript:void(0)" onclick="searchBox.OnSelectItem(0)"><span class="SelectionMark"> </span>All</a><a class="SelectItem" href="javascript:void(0)" onclick="searchBox.OnSelectItem(1)"><span class="SelectionMark"> </span>Classes</a><a class="SelectItem" href="javascript:void(0)" onclick="searchBox.OnSelectItem(2)"><span class="SelectionMark"> </span>Namespaces</a><a class="SelectItem" href="javascript:void(0)" onclick="searchBox.OnSelectItem(3)"><span class="SelectionMark"> </span>Functions</a><a class="SelectItem" href="javascript:void(0)" onclick="searchBox.OnSelectItem(4)"><span class="SelectionMark"> </span>Variables</a><a class="SelectItem" href="javascript:void(0)" onclick="searchBox.OnSelectItem(5)"><span class="SelectionMark"> </span>Typedefs</a><a class="SelectItem" href="javascript:void(0)" onclick="searchBox.OnSelectItem(6)"><span class="SelectionMark"> </span>Enumerations</a><a class="SelectItem" href="javascript:void(0)" onclick="searchBox.OnSelectItem(7)"><span class="SelectionMark"> </span>Enumerator</a><a class="SelectItem" href="javascript:void(0)" onclick="searchBox.OnSelectItem(8)"><span class="SelectionMark"> </span>Friends</a></div>
|
|
|
|
<!-- iframe showing the search results (closed by default) -->
|
|
<div id="MSearchResultsWindow">
|
|
<iframe src="javascript:void(0)" frameborder="0"
|
|
name="MSearchResults" id="MSearchResults">
|
|
</iframe>
|
|
</div>
|
|
|
|
<div id="nav-path" class="navpath">
|
|
<ul>
|
|
<li class="navelem"><a class="el" href="dir_6b3ae6988449b0834e9596fad5d75199.html">gpu</a></li><li class="navelem"><a class="el" href="dir_49d1182a3b8dfb62757c53ae905481ad.html">impl</a></li> </ul>
|
|
</div>
|
|
</div><!-- top -->
|
|
<div class="header">
|
|
<div class="headertitle">
|
|
<div class="title">PQScanMultiPassNoPrecomputed.cu</div> </div>
|
|
</div><!--header-->
|
|
<div class="contents">
|
|
<div class="fragment"><div class="line"><a name="l00001"></a><span class="lineno"> 1</span> <span class="comment">/**</span></div>
|
|
<div class="line"><a name="l00002"></a><span class="lineno"> 2</span> <span class="comment"> * Copyright (c) 2015-present, Facebook, Inc.</span></div>
|
|
<div class="line"><a name="l00003"></a><span class="lineno"> 3</span> <span class="comment"> * All rights reserved.</span></div>
|
|
<div class="line"><a name="l00004"></a><span class="lineno"> 4</span> <span class="comment"> *</span></div>
|
|
<div class="line"><a name="l00005"></a><span class="lineno"> 5</span> <span class="comment"> * This source code is licensed under the BSD+Patents license found in the</span></div>
|
|
<div class="line"><a name="l00006"></a><span class="lineno"> 6</span> <span class="comment"> * LICENSE file in the root directory of this source tree.</span></div>
|
|
<div class="line"><a name="l00007"></a><span class="lineno"> 7</span> <span class="comment"> */</span></div>
|
|
<div class="line"><a name="l00008"></a><span class="lineno"> 8</span> </div>
|
|
<div class="line"><a name="l00009"></a><span class="lineno"> 9</span> <span class="comment">// Copyright 2004-present Facebook. All Rights Reserved.</span></div>
|
|
<div class="line"><a name="l00010"></a><span class="lineno"> 10</span> </div>
|
|
<div class="line"><a name="l00011"></a><span class="lineno"> 11</span> <span class="preprocessor">#include "PQScanMultiPassNoPrecomputed.cuh"</span></div>
|
|
<div class="line"><a name="l00012"></a><span class="lineno"> 12</span> <span class="preprocessor">#include "../GpuResources.h"</span></div>
|
|
<div class="line"><a name="l00013"></a><span class="lineno"> 13</span> <span class="preprocessor">#include "PQCodeDistances.cuh"</span></div>
|
|
<div class="line"><a name="l00014"></a><span class="lineno"> 14</span> <span class="preprocessor">#include "PQCodeLoad.cuh"</span></div>
|
|
<div class="line"><a name="l00015"></a><span class="lineno"> 15</span> <span class="preprocessor">#include "IVFUtils.cuh"</span></div>
|
|
<div class="line"><a name="l00016"></a><span class="lineno"> 16</span> <span class="preprocessor">#include "../utils/ConversionOperators.cuh"</span></div>
|
|
<div class="line"><a name="l00017"></a><span class="lineno"> 17</span> <span class="preprocessor">#include "../utils/DeviceTensor.cuh"</span></div>
|
|
<div class="line"><a name="l00018"></a><span class="lineno"> 18</span> <span class="preprocessor">#include "../utils/DeviceUtils.h"</span></div>
|
|
<div class="line"><a name="l00019"></a><span class="lineno"> 19</span> <span class="preprocessor">#include "../utils/Float16.cuh"</span></div>
|
|
<div class="line"><a name="l00020"></a><span class="lineno"> 20</span> <span class="preprocessor">#include "../utils/LoadStoreOperators.cuh"</span></div>
|
|
<div class="line"><a name="l00021"></a><span class="lineno"> 21</span> <span class="preprocessor">#include "../utils/NoTypeTensor.cuh"</span></div>
|
|
<div class="line"><a name="l00022"></a><span class="lineno"> 22</span> <span class="preprocessor">#include "../utils/StaticUtils.h"</span></div>
|
|
<div class="line"><a name="l00023"></a><span class="lineno"> 23</span> </div>
|
|
<div class="line"><a name="l00024"></a><span class="lineno"> 24</span> <span class="preprocessor">#include "../utils/HostTensor.cuh"</span></div>
|
|
<div class="line"><a name="l00025"></a><span class="lineno"> 25</span> </div>
|
|
<div class="line"><a name="l00026"></a><span class="lineno"> 26</span> <span class="keyword">namespace </span>faiss { <span class="keyword">namespace </span>gpu {</div>
|
|
<div class="line"><a name="l00027"></a><span class="lineno"> 27</span> </div>
|
|
<div class="line"><a name="l00028"></a><span class="lineno"> 28</span> <span class="comment">// This must be kept in sync with PQCodeDistances.cu</span></div>
|
|
<div class="line"><a name="l00029"></a><span class="lineno"> 29</span> <span class="keywordtype">bool</span> isSupportedNoPrecomputedSubDimSize(<span class="keywordtype">int</span> dims) {</div>
|
|
<div class="line"><a name="l00030"></a><span class="lineno"> 30</span>  <span class="keywordflow">switch</span> (dims) {</div>
|
|
<div class="line"><a name="l00031"></a><span class="lineno"> 31</span>  <span class="keywordflow">case</span> 1:</div>
|
|
<div class="line"><a name="l00032"></a><span class="lineno"> 32</span>  <span class="keywordflow">case</span> 2:</div>
|
|
<div class="line"><a name="l00033"></a><span class="lineno"> 33</span>  <span class="keywordflow">case</span> 3:</div>
|
|
<div class="line"><a name="l00034"></a><span class="lineno"> 34</span>  <span class="keywordflow">case</span> 4:</div>
|
|
<div class="line"><a name="l00035"></a><span class="lineno"> 35</span>  <span class="keywordflow">case</span> 6:</div>
|
|
<div class="line"><a name="l00036"></a><span class="lineno"> 36</span>  <span class="keywordflow">case</span> 8:</div>
|
|
<div class="line"><a name="l00037"></a><span class="lineno"> 37</span>  <span class="keywordflow">case</span> 10:</div>
|
|
<div class="line"><a name="l00038"></a><span class="lineno"> 38</span>  <span class="keywordflow">case</span> 12:</div>
|
|
<div class="line"><a name="l00039"></a><span class="lineno"> 39</span>  <span class="keywordflow">case</span> 16:</div>
|
|
<div class="line"><a name="l00040"></a><span class="lineno"> 40</span>  <span class="keywordflow">case</span> 20:</div>
|
|
<div class="line"><a name="l00041"></a><span class="lineno"> 41</span>  <span class="keywordflow">case</span> 24:</div>
|
|
<div class="line"><a name="l00042"></a><span class="lineno"> 42</span>  <span class="keywordflow">case</span> 28:</div>
|
|
<div class="line"><a name="l00043"></a><span class="lineno"> 43</span>  <span class="keywordflow">case</span> 32:</div>
|
|
<div class="line"><a name="l00044"></a><span class="lineno"> 44</span>  <span class="keywordflow">return</span> <span class="keyword">true</span>;</div>
|
|
<div class="line"><a name="l00045"></a><span class="lineno"> 45</span>  <span class="keywordflow">default</span>:</div>
|
|
<div class="line"><a name="l00046"></a><span class="lineno"> 46</span>  <span class="comment">// FIXME: larger sizes require too many registers - we need the</span></div>
|
|
<div class="line"><a name="l00047"></a><span class="lineno"> 47</span>  <span class="comment">// MM implementation working</span></div>
|
|
<div class="line"><a name="l00048"></a><span class="lineno"> 48</span>  <span class="keywordflow">return</span> <span class="keyword">false</span>;</div>
|
|
<div class="line"><a name="l00049"></a><span class="lineno"> 49</span>  }</div>
|
|
<div class="line"><a name="l00050"></a><span class="lineno"> 50</span> }</div>
|
|
<div class="line"><a name="l00051"></a><span class="lineno"> 51</span> </div>
|
|
<div class="line"><a name="l00052"></a><span class="lineno"> 52</span> <span class="keyword">template</span> &lt;<span class="keyword">typename</span> LookupT, <span class="keyword">typename</span> LookupVecT&gt;</div>
|
|
<div class="line"><a name="l00053"></a><span class="lineno"><a class="line" href="structfaiss_1_1gpu_1_1LoadCodeDistances.html"> 53</a></span> <span class="keyword">struct </span><a class="code" href="structfaiss_1_1gpu_1_1LoadCodeDistances.html">LoadCodeDistances</a> {</div>
|
|
<div class="line"><a name="l00054"></a><span class="lineno"> 54</span>  <span class="keyword">static</span> <span class="keyword">inline</span> __device__ <span class="keywordtype">void</span> load(LookupT* smem,</div>
|
|
<div class="line"><a name="l00055"></a><span class="lineno"> 55</span>  LookupT* codes,</div>
|
|
<div class="line"><a name="l00056"></a><span class="lineno"> 56</span>  <span class="keywordtype">int</span> numCodes) {</div>
|
|
<div class="line"><a name="l00057"></a><span class="lineno"> 57</span>  constexpr <span class="keywordtype">int</span> kWordSize = <span class="keyword">sizeof</span>(LookupVecT) / <span class="keyword">sizeof</span>(LookupT);</div>
|
|
<div class="line"><a name="l00058"></a><span class="lineno"> 58</span> </div>
|
|
<div class="line"><a name="l00059"></a><span class="lineno"> 59</span>  <span class="comment">// We can only use the vector type if the data is guaranteed to be</span></div>
|
|
<div class="line"><a name="l00060"></a><span class="lineno"> 60</span>  <span class="comment">// aligned. The codes are innermost, so if it is evenly divisible,</span></div>
|
|
<div class="line"><a name="l00061"></a><span class="lineno"> 61</span>  <span class="comment">// then any slice will be aligned.</span></div>
|
|
<div class="line"><a name="l00062"></a><span class="lineno"> 62</span>  <span class="keywordflow">if</span> (numCodes % kWordSize == 0) {</div>
|
|
<div class="line"><a name="l00063"></a><span class="lineno"> 63</span>  <span class="comment">// Load the data by float4 for efficiency, and then handle any remainder</span></div>
|
|
<div class="line"><a name="l00064"></a><span class="lineno"> 64</span>  <span class="comment">// limitVec is the number of whole vec words we can load, in terms</span></div>
|
|
<div class="line"><a name="l00065"></a><span class="lineno"> 65</span>  <span class="comment">// of whole blocks performing the load</span></div>
|
|
<div class="line"><a name="l00066"></a><span class="lineno"> 66</span>  constexpr <span class="keywordtype">int</span> kUnroll = 2;</div>
|
|
<div class="line"><a name="l00067"></a><span class="lineno"> 67</span>  <span class="keywordtype">int</span> limitVec = numCodes / (kUnroll * kWordSize * blockDim.x);</div>
|
|
<div class="line"><a name="l00068"></a><span class="lineno"> 68</span>  limitVec *= kUnroll * blockDim.x;</div>
|
|
<div class="line"><a name="l00069"></a><span class="lineno"> 69</span> </div>
|
|
<div class="line"><a name="l00070"></a><span class="lineno"> 70</span>  LookupVecT* smemV = (LookupVecT*) smem;</div>
|
|
<div class="line"><a name="l00071"></a><span class="lineno"> 71</span>  LookupVecT* codesV = (LookupVecT*) codes;</div>
|
|
<div class="line"><a name="l00072"></a><span class="lineno"> 72</span> </div>
|
|
<div class="line"><a name="l00073"></a><span class="lineno"> 73</span>  <span class="keywordflow">for</span> (<span class="keywordtype">int</span> i = threadIdx.x; i &lt; limitVec; i += kUnroll * blockDim.x) {</div>
|
|
<div class="line"><a name="l00074"></a><span class="lineno"> 74</span>  LookupVecT vals[kUnroll];</div>
|
|
<div class="line"><a name="l00075"></a><span class="lineno"> 75</span> </div>
|
|
<div class="line"><a name="l00076"></a><span class="lineno"> 76</span> <span class="preprocessor">#pragma unroll</span></div>
|
|
<div class="line"><a name="l00077"></a><span class="lineno"> 77</span> <span class="preprocessor"></span> <span class="keywordflow">for</span> (<span class="keywordtype">int</span> j = 0; j &lt; kUnroll; ++j) {</div>
|
|
<div class="line"><a name="l00078"></a><span class="lineno"> 78</span>  vals[j] =</div>
|
|
<div class="line"><a name="l00079"></a><span class="lineno"> 79</span>  <a class="code" href="structfaiss_1_1gpu_1_1LoadStore.html">LoadStore&lt;LookupVecT&gt;::load</a>(&amp;codesV[i + j * blockDim.x]);</div>
|
|
<div class="line"><a name="l00080"></a><span class="lineno"> 80</span>  }</div>
|
|
<div class="line"><a name="l00081"></a><span class="lineno"> 81</span> </div>
|
|
<div class="line"><a name="l00082"></a><span class="lineno"> 82</span> <span class="preprocessor">#pragma unroll</span></div>
|
|
<div class="line"><a name="l00083"></a><span class="lineno"> 83</span> <span class="preprocessor"></span> <span class="keywordflow">for</span> (<span class="keywordtype">int</span> j = 0; j &lt; kUnroll; ++j) {</div>
|
|
<div class="line"><a name="l00084"></a><span class="lineno"> 84</span>  <a class="code" href="structfaiss_1_1gpu_1_1LoadStore.html">LoadStore&lt;LookupVecT&gt;::store</a>(&amp;smemV[i + j * blockDim.x], vals[j]);</div>
|
|
<div class="line"><a name="l00085"></a><span class="lineno"> 85</span>  }</div>
|
|
<div class="line"><a name="l00086"></a><span class="lineno"> 86</span>  }</div>
|
|
<div class="line"><a name="l00087"></a><span class="lineno"> 87</span> </div>
|
|
<div class="line"><a name="l00088"></a><span class="lineno"> 88</span>  <span class="comment">// This is where we start loading the remainder that does not evenly</span></div>
|
|
<div class="line"><a name="l00089"></a><span class="lineno"> 89</span>  <span class="comment">// fit into kUnroll x blockDim.x</span></div>
|
|
<div class="line"><a name="l00090"></a><span class="lineno"> 90</span>  <span class="keywordtype">int</span> remainder = limitVec * kWordSize;</div>
|
|
<div class="line"><a name="l00091"></a><span class="lineno"> 91</span> </div>
|
|
<div class="line"><a name="l00092"></a><span class="lineno"> 92</span>  <span class="keywordflow">for</span> (<span class="keywordtype">int</span> i = remainder + threadIdx.x; i &lt; numCodes; i += blockDim.x) {</div>
|
|
<div class="line"><a name="l00093"></a><span class="lineno"> 93</span>  smem[i] = codes[i];</div>
|
|
<div class="line"><a name="l00094"></a><span class="lineno"> 94</span>  }</div>
|
|
<div class="line"><a name="l00095"></a><span class="lineno"> 95</span>  } <span class="keywordflow">else</span> {</div>
|
|
<div class="line"><a name="l00096"></a><span class="lineno"> 96</span>  <span class="comment">// Potential unaligned load</span></div>
|
|
<div class="line"><a name="l00097"></a><span class="lineno"> 97</span>  constexpr <span class="keywordtype">int</span> kUnroll = 4;</div>
|
|
<div class="line"><a name="l00098"></a><span class="lineno"> 98</span> </div>
|
|
<div class="line"><a name="l00099"></a><span class="lineno"> 99</span>  <span class="keywordtype">int</span> limit = utils::roundDown(numCodes, kUnroll * blockDim.x);</div>
|
|
<div class="line"><a name="l00100"></a><span class="lineno"> 100</span> </div>
|
|
<div class="line"><a name="l00101"></a><span class="lineno"> 101</span>  <span class="keywordtype">int</span> i = threadIdx.x;</div>
|
|
<div class="line"><a name="l00102"></a><span class="lineno"> 102</span>  <span class="keywordflow">for</span> (; i &lt; limit; i += kUnroll * blockDim.x) {</div>
|
|
<div class="line"><a name="l00103"></a><span class="lineno"> 103</span>  LookupT vals[kUnroll];</div>
|
|
<div class="line"><a name="l00104"></a><span class="lineno"> 104</span> </div>
|
|
<div class="line"><a name="l00105"></a><span class="lineno"> 105</span> <span class="preprocessor">#pragma unroll</span></div>
|
|
<div class="line"><a name="l00106"></a><span class="lineno"> 106</span> <span class="preprocessor"></span> <span class="keywordflow">for</span> (<span class="keywordtype">int</span> j = 0; j &lt; kUnroll; ++j) {</div>
|
|
<div class="line"><a name="l00107"></a><span class="lineno"> 107</span>  vals[j] = codes[i + j * blockDim.x];</div>
|
|
<div class="line"><a name="l00108"></a><span class="lineno"> 108</span>  }</div>
|
|
<div class="line"><a name="l00109"></a><span class="lineno"> 109</span> </div>
|
|
<div class="line"><a name="l00110"></a><span class="lineno"> 110</span> <span class="preprocessor">#pragma unroll</span></div>
|
|
<div class="line"><a name="l00111"></a><span class="lineno"> 111</span> <span class="preprocessor"></span> <span class="keywordflow">for</span> (<span class="keywordtype">int</span> j = 0; j &lt; kUnroll; ++j) {</div>
|
|
<div class="line"><a name="l00112"></a><span class="lineno"> 112</span>  smem[i + j * blockDim.x] = vals[j];</div>
|
|
<div class="line"><a name="l00113"></a><span class="lineno"> 113</span>  }</div>
|
|
<div class="line"><a name="l00114"></a><span class="lineno"> 114</span>  }</div>
|
|
<div class="line"><a name="l00115"></a><span class="lineno"> 115</span> </div>
|
|
<div class="line"><a name="l00116"></a><span class="lineno"> 116</span>  <span class="keywordflow">for</span> (; i &lt; numCodes; i += blockDim.x) {</div>
|
|
<div class="line"><a name="l00117"></a><span class="lineno"> 117</span>  smem[i] = codes[i];</div>
|
|
<div class="line"><a name="l00118"></a><span class="lineno"> 118</span>  }</div>
|
|
<div class="line"><a name="l00119"></a><span class="lineno"> 119</span>  }</div>
|
|
<div class="line"><a name="l00120"></a><span class="lineno"> 120</span>  }</div>
|
|
<div class="line"><a name="l00121"></a><span class="lineno"> 121</span> };</div>
|
|
<div class="line"><a name="l00122"></a><span class="lineno"> 122</span> </div>
|
|
<div class="line"><a name="l00123"></a><span class="lineno"> 123</span> <span class="keyword">template</span> &lt;<span class="keywordtype">int</span> NumSubQuantizers, <span class="keyword">typename</span> LookupT, <span class="keyword">typename</span> LookupVecT&gt;</div>
|
|
<div class="line"><a name="l00124"></a><span class="lineno"> 124</span> __global__ <span class="keywordtype">void</span></div>
|
|
<div class="line"><a name="l00125"></a><span class="lineno"> 125</span> pqScanNoPrecomputedMultiPass(<a class="code" href="classfaiss_1_1gpu_1_1Tensor.html">Tensor&lt;float, 2, true&gt;</a> queries,</div>
|
|
<div class="line"><a name="l00126"></a><span class="lineno"> 126</span>  <a class="code" href="classfaiss_1_1gpu_1_1Tensor.html">Tensor&lt;float, 3, true&gt;</a> pqCentroids,</div>
|
|
<div class="line"><a name="l00127"></a><span class="lineno"> 127</span>  <a class="code" href="classfaiss_1_1gpu_1_1Tensor.html">Tensor&lt;int, 2, true&gt;</a> topQueryToCentroid,</div>
|
|
<div class="line"><a name="l00128"></a><span class="lineno"> 128</span>  <a class="code" href="classfaiss_1_1gpu_1_1Tensor.html">Tensor&lt;LookupT, 4, true&gt;</a> codeDistances,</div>
|
|
<div class="line"><a name="l00129"></a><span class="lineno"> 129</span>  <span class="keywordtype">void</span>** listCodes,</div>
|
|
<div class="line"><a name="l00130"></a><span class="lineno"> 130</span>  <span class="keywordtype">int</span>* listLengths,</div>
|
|
<div class="line"><a name="l00131"></a><span class="lineno"> 131</span>  <a class="code" href="classfaiss_1_1gpu_1_1Tensor.html">Tensor&lt;int, 2, true&gt;</a> prefixSumOffsets,</div>
|
|
<div class="line"><a name="l00132"></a><span class="lineno"> 132</span>  <a class="code" href="classfaiss_1_1gpu_1_1Tensor.html">Tensor&lt;float, 1, true&gt;</a> distance) {</div>
|
|
<div class="line"><a name="l00133"></a><span class="lineno"> 133</span>  <span class="keyword">const</span> <span class="keyword">auto</span> codesPerSubQuantizer = pqCentroids.<a class="code" href="classfaiss_1_1gpu_1_1Tensor.html#a6699c311648457f257afa340c61f417c">getSize</a>(2);</div>
|
|
<div class="line"><a name="l00134"></a><span class="lineno"> 134</span> </div>
|
|
<div class="line"><a name="l00135"></a><span class="lineno"> 135</span>  <span class="comment">// Where the pq code -> residual distance is stored</span></div>
|
|
<div class="line"><a name="l00136"></a><span class="lineno"> 136</span>  <span class="keyword">extern</span> __shared__ <span class="keywordtype">char</span> smemCodeDistances[];</div>
|
|
<div class="line"><a name="l00137"></a><span class="lineno"> 137</span>  LookupT* codeDist = (LookupT*) smemCodeDistances;</div>
|
|
<div class="line"><a name="l00138"></a><span class="lineno"> 138</span> </div>
|
|
<div class="line"><a name="l00139"></a><span class="lineno"> 139</span>  <span class="comment">// Each block handles a single query</span></div>
|
|
<div class="line"><a name="l00140"></a><span class="lineno"> 140</span>  <span class="keyword">auto</span> queryId = blockIdx.y;</div>
|
|
<div class="line"><a name="l00141"></a><span class="lineno"> 141</span>  <span class="keyword">auto</span> probeId = blockIdx.x;</div>
|
|
<div class="line"><a name="l00142"></a><span class="lineno"> 142</span> </div>
|
|
<div class="line"><a name="l00143"></a><span class="lineno"> 143</span>  <span class="comment">// This is where we start writing out data</span></div>
|
|
<div class="line"><a name="l00144"></a><span class="lineno"> 144</span>  <span class="comment">// We ensure that before the array (at offset -1), there is a 0 value</span></div>
|
|
<div class="line"><a name="l00145"></a><span class="lineno"> 145</span>  <span class="keywordtype">int</span> outBase = *(prefixSumOffsets[queryId][probeId].<a class="code" href="classfaiss_1_1gpu_1_1Tensor.html#a50411ce4d0fa32ef715e3321b6e33212">data</a>() - 1);</div>
|
|
<div class="line"><a name="l00146"></a><span class="lineno"> 146</span>  <span class="keywordtype">float</span>* distanceOut = distance[outBase].<a class="code" href="classfaiss_1_1gpu_1_1Tensor.html#a50411ce4d0fa32ef715e3321b6e33212">data</a>();</div>
|
|
<div class="line"><a name="l00147"></a><span class="lineno"> 147</span> </div>
|
|
<div class="line"><a name="l00148"></a><span class="lineno"> 148</span>  <span class="keyword">auto</span> listId = topQueryToCentroid[queryId][probeId];</div>
|
|
<div class="line"><a name="l00149"></a><span class="lineno"> 149</span>  <span class="comment">// Safety guard in case NaNs in input cause no list ID to be generated</span></div>
|
|
<div class="line"><a name="l00150"></a><span class="lineno"> 150</span>  <span class="keywordflow">if</span> (listId == -1) {</div>
|
|
<div class="line"><a name="l00151"></a><span class="lineno"> 151</span>  <span class="keywordflow">return</span>;</div>
|
|
<div class="line"><a name="l00152"></a><span class="lineno"> 152</span>  }</div>
|
|
<div class="line"><a name="l00153"></a><span class="lineno"> 153</span> </div>
|
|
<div class="line"><a name="l00154"></a><span class="lineno"> 154</span>  <span class="keywordtype">unsigned</span> <span class="keywordtype">char</span>* codeList = (<span class="keywordtype">unsigned</span> <span class="keywordtype">char</span>*) listCodes[listId];</div>
|
|
<div class="line"><a name="l00155"></a><span class="lineno"> 155</span>  <span class="keywordtype">int</span> limit = listLengths[listId];</div>
|
|
<div class="line"><a name="l00156"></a><span class="lineno"> 156</span> </div>
|
|
<div class="line"><a name="l00157"></a><span class="lineno"> 157</span>  constexpr <span class="keywordtype">int</span> kNumCode32 = NumSubQuantizers &lt;= 4 ? 1 :</div>
|
|
<div class="line"><a name="l00158"></a><span class="lineno"> 158</span>  (NumSubQuantizers / 4);</div>
|
|
<div class="line"><a name="l00159"></a><span class="lineno"> 159</span>  <span class="keywordtype">unsigned</span> <span class="keywordtype">int</span> code32[kNumCode32];</div>
|
|
<div class="line"><a name="l00160"></a><span class="lineno"> 160</span>  <span class="keywordtype">unsigned</span> <span class="keywordtype">int</span> nextCode32[kNumCode32];</div>
|
|
<div class="line"><a name="l00161"></a><span class="lineno"> 161</span> </div>
|
|
<div class="line"><a name="l00162"></a><span class="lineno"> 162</span>  <span class="comment">// We double-buffer the code loading, which improves memory utilization</span></div>
|
|
<div class="line"><a name="l00163"></a><span class="lineno"> 163</span>  <span class="keywordflow">if</span> (threadIdx.x &lt; limit) {</div>
|
|
<div class="line"><a name="l00164"></a><span class="lineno"> 164</span>  LoadCode32&lt;NumSubQuantizers&gt;::load(code32, codeList, threadIdx.x);</div>
|
|
<div class="line"><a name="l00165"></a><span class="lineno"> 165</span>  }</div>
|
|
<div class="line"><a name="l00166"></a><span class="lineno"> 166</span> </div>
|
|
<div class="line"><a name="l00167"></a><span class="lineno"> 167</span>  LoadCodeDistances&lt;LookupT, LookupVecT&gt;::load(</div>
|
|
<div class="line"><a name="l00168"></a><span class="lineno"> 168</span>  codeDist,</div>
|
|
<div class="line"><a name="l00169"></a><span class="lineno"> 169</span>  codeDistances[queryId][probeId].data(),</div>
|
|
<div class="line"><a name="l00170"></a><span class="lineno"> 170</span>  codeDistances.<a class="code" href="classfaiss_1_1gpu_1_1Tensor.html#a6699c311648457f257afa340c61f417c">getSize</a>(2) * codeDistances.<a class="code" href="classfaiss_1_1gpu_1_1Tensor.html#a6699c311648457f257afa340c61f417c">getSize</a>(3));</div>
|
|
<div class="line"><a name="l00171"></a><span class="lineno"> 171</span> </div>
|
|
<div class="line"><a name="l00172"></a><span class="lineno"> 172</span>  <span class="comment">// Prevent WAR dependencies</span></div>
|
|
<div class="line"><a name="l00173"></a><span class="lineno"> 173</span>  __syncthreads();</div>
|
|
<div class="line"><a name="l00174"></a><span class="lineno"> 174</span> </div>
|
|
<div class="line"><a name="l00175"></a><span class="lineno"> 175</span>  <span class="comment">// Each thread handles one code element in the list, with a</span></div>
|
|
<div class="line"><a name="l00176"></a><span class="lineno"> 176</span>  <span class="comment">// block-wide stride</span></div>
|
|
<div class="line"><a name="l00177"></a><span class="lineno"> 177</span>  <span class="keywordflow">for</span> (<span class="keywordtype">int</span> codeIndex = threadIdx.x;</div>
|
|
<div class="line"><a name="l00178"></a><span class="lineno"> 178</span>  codeIndex &lt; limit;</div>
|
|
<div class="line"><a name="l00179"></a><span class="lineno"> 179</span>  codeIndex += blockDim.x) {</div>
|
|
<div class="line"><a name="l00180"></a><span class="lineno"> 180</span>  <span class="comment">// Prefetch next codes</span></div>
|
|
<div class="line"><a name="l00181"></a><span class="lineno"> 181</span>  <span class="keywordflow">if</span> (codeIndex + blockDim.x &lt; limit) {</div>
|
|
<div class="line"><a name="l00182"></a><span class="lineno"> 182</span>  LoadCode32&lt;NumSubQuantizers&gt;::load(</div>
|
|
<div class="line"><a name="l00183"></a><span class="lineno"> 183</span>  nextCode32, codeList, codeIndex + blockDim.x);</div>
|
|
<div class="line"><a name="l00184"></a><span class="lineno"> 184</span>  }</div>
|
|
<div class="line"><a name="l00185"></a><span class="lineno"> 185</span> </div>
|
|
<div class="line"><a name="l00186"></a><span class="lineno"> 186</span>  <span class="keywordtype">float</span> dist = 0.0f;</div>
|
|
<div class="line"><a name="l00187"></a><span class="lineno"> 187</span> </div>
|
|
<div class="line"><a name="l00188"></a><span class="lineno"> 188</span> <span class="preprocessor">#pragma unroll</span></div>
|
|
<div class="line"><a name="l00189"></a><span class="lineno"> 189</span> <span class="preprocessor"></span> <span class="keywordflow">for</span> (<span class="keywordtype">int</span> word = 0; word &lt; kNumCode32; ++word) {</div>
|
|
<div class="line"><a name="l00190"></a><span class="lineno"> 190</span>  constexpr <span class="keywordtype">int</span> kBytesPerCode32 =</div>
|
|
<div class="line"><a name="l00191"></a><span class="lineno"> 191</span>  NumSubQuantizers &lt; 4 ? NumSubQuantizers : 4;</div>
|
|
<div class="line"><a name="l00192"></a><span class="lineno"> 192</span> </div>
|
|
<div class="line"><a name="l00193"></a><span class="lineno"> 193</span>  <span class="keywordflow">if</span> (kBytesPerCode32 == 1) {</div>
|
|
<div class="line"><a name="l00194"></a><span class="lineno"> 194</span>  <span class="keyword">auto</span> code = code32[0];</div>
|
|
<div class="line"><a name="l00195"></a><span class="lineno">  195</span>&#160;        dist = ConvertTo&lt;float&gt;::to(codeDist[code]);</div>
|
|
<div class="line"><a name="l00196"></a><span class="lineno"> 196</span> </div>
|
|
<div class="line"><a name="l00197"></a><span class="lineno"> 197</span>  } <span class="keywordflow">else</span> {</div>
|
|
<div class="line"><a name="l00198"></a><span class="lineno"> 198</span> <span class="preprocessor">#pragma unroll</span></div>
|
|
<div class="line"><a name="l00199"></a><span class="lineno">  199</span>&#160;<span class="preprocessor"></span>        <span class="keywordflow">for</span> (<span class="keywordtype">int</span> byte = 0; byte &lt; kBytesPerCode32; ++byte) {</div>
|
|
<div class="line"><a name="l00200"></a><span class="lineno"> 200</span>  <span class="keyword">auto</span> code = getByte(code32[word], byte * 8, 8);</div>
|
|
<div class="line"><a name="l00201"></a><span class="lineno"> 201</span> </div>
|
|
<div class="line"><a name="l00202"></a><span class="lineno"> 202</span>  <span class="keyword">auto</span> offset =</div>
|
|
<div class="line"><a name="l00203"></a><span class="lineno"> 203</span>  codesPerSubQuantizer * (word * kBytesPerCode32 + byte);</div>
|
|
<div class="line"><a name="l00204"></a><span class="lineno"> 204</span> </div>
|
|
<div class="line"><a name="l00205"></a><span class="lineno">  205</span>&#160;          dist += ConvertTo&lt;float&gt;::to(codeDist[offset + code]);</div>
|
|
<div class="line"><a name="l00206"></a><span class="lineno"> 206</span>  }</div>
|
|
<div class="line"><a name="l00207"></a><span class="lineno"> 207</span>  }</div>
|
|
<div class="line"><a name="l00208"></a><span class="lineno"> 208</span>  }</div>
|
|
<div class="line"><a name="l00209"></a><span class="lineno"> 209</span> </div>
|
|
<div class="line"><a name="l00210"></a><span class="lineno"> 210</span>  <span class="comment">// Write out intermediate distance result</span></div>
|
|
<div class="line"><a name="l00211"></a><span class="lineno"> 211</span>  <span class="comment">// We do not maintain indices here, in order to reduce global</span></div>
|
|
<div class="line"><a name="l00212"></a><span class="lineno"> 212</span>  <span class="comment">// memory traffic. Those are recovered in the final selection step.</span></div>
|
|
<div class="line"><a name="l00213"></a><span class="lineno"> 213</span>  distanceOut[codeIndex] = dist;</div>
|
|
<div class="line"><a name="l00214"></a><span class="lineno"> 214</span> </div>
|
|
<div class="line"><a name="l00215"></a><span class="lineno"> 215</span>  <span class="comment">// Rotate buffers</span></div>
|
|
<div class="line"><a name="l00216"></a><span class="lineno"> 216</span> <span class="preprocessor">#pragma unroll</span></div>
|
|
<div class="line"><a name="l00217"></a><span class="lineno">  217</span>&#160;<span class="preprocessor"></span>    <span class="keywordflow">for</span> (<span class="keywordtype">int</span> word = 0; word &lt; kNumCode32; ++word) {</div>
|
|
<div class="line"><a name="l00218"></a><span class="lineno"> 218</span>  code32[word] = nextCode32[word];</div>
|
|
<div class="line"><a name="l00219"></a><span class="lineno"> 219</span>  }</div>
|
|
<div class="line"><a name="l00220"></a><span class="lineno"> 220</span>  }</div>
|
|
<div class="line"><a name="l00221"></a><span class="lineno"> 221</span> }</div>
|
|
<div class="line"><a name="l00222"></a><span class="lineno"> 222</span> </div>
|
|
<div class="line"><a name="l00223"></a><span class="lineno"> 223</span> <span class="keywordtype">void</span></div>
|
|
<div class="line"><a name="l00224"></a><span class="lineno">  224</span>&#160;runMultiPassTile(Tensor&lt;float, 2, true&gt;&amp; queries,</div>
|
|
<div class="line"><a name="l00225"></a><span class="lineno">  225</span>&#160;                 Tensor&lt;float, 2, true&gt;&amp; centroids,</div>
|
|
<div class="line"><a name="l00226"></a><span class="lineno">  226</span>&#160;                 Tensor&lt;float, 3, true&gt;&amp; pqCentroidsInnermostCode,</div>
|
|
<div class="line"><a name="l00227"></a><span class="lineno">  227</span>&#160;                 NoTypeTensor&lt;4, true&gt;&amp; codeDistances,</div>
|
|
<div class="line"><a name="l00228"></a><span class="lineno">  228</span>&#160;                 Tensor&lt;int, 2, true&gt;&amp; topQueryToCentroid,</div>
|
|
<div class="line"><a name="l00229"></a><span class="lineno"> 229</span>  <span class="keywordtype">bool</span> useFloat16Lookup,</div>
|
|
<div class="line"><a name="l00230"></a><span class="lineno"> 230</span>  <span class="keywordtype">int</span> bytesPerCode,</div>
|
|
<div class="line"><a name="l00231"></a><span class="lineno"> 231</span>  <span class="keywordtype">int</span> numSubQuantizers,</div>
|
|
<div class="line"><a name="l00232"></a><span class="lineno"> 232</span>  <span class="keywordtype">int</span> numSubQuantizerCodes,</div>
|
|
<div class="line"><a name="l00233"></a><span class="lineno">  233</span>&#160;                 thrust::device_vector&lt;void*&gt;&amp; listCodes,</div>
|
|
<div class="line"><a name="l00234"></a><span class="lineno">  234</span>&#160;                 thrust::device_vector&lt;void*&gt;&amp; listIndices,</div>
|
|
<div class="line"><a name="l00235"></a><span class="lineno"> 235</span>  IndicesOptions indicesOptions,</div>
|
|
<div class="line"><a name="l00236"></a><span class="lineno">  236</span>&#160;                 thrust::device_vector&lt;int&gt;&amp; listLengths,</div>
|
|
<div class="line"><a name="l00237"></a><span class="lineno">  237</span>&#160;                 Tensor&lt;char, 1, true&gt;&amp; thrustMem,</div>
|
|
<div class="line"><a name="l00238"></a><span class="lineno">  238</span>&#160;                 Tensor&lt;int, 2, true&gt;&amp; prefixSumOffsets,</div>
|
|
<div class="line"><a name="l00239"></a><span class="lineno">  239</span>&#160;                 Tensor&lt;float, 1, true&gt;&amp; allDistances,</div>
|
|
<div class="line"><a name="l00240"></a><span class="lineno">  240</span>&#160;                 Tensor&lt;float, 3, true&gt;&amp; heapDistances,</div>
|
|
<div class="line"><a name="l00241"></a><span class="lineno">  241</span>&#160;                 Tensor&lt;int, 3, true&gt;&amp; heapIndices,</div>
|
|
<div class="line"><a name="l00242"></a><span class="lineno"> 242</span>  <span class="keywordtype">int</span> k,</div>
|
|
<div class="line"><a name="l00243"></a><span class="lineno">  243</span>&#160;                 Tensor&lt;float, 2, true&gt;&amp; outDistances,</div>
|
|
<div class="line"><a name="l00244"></a><span class="lineno">  244</span>&#160;                 Tensor&lt;long, 2, true&gt;&amp; outIndices,</div>
|
|
<div class="line"><a name="l00245"></a><span class="lineno"> 245</span>  cudaStream_t stream) {</div>
|
|
<div class="line"><a name="l00246"></a><span class="lineno"> 246</span> <span class="preprocessor">#ifndef FAISS_USE_FLOAT16</span></div>
|
|
<div class="line"><a name="l00247"></a><span class="lineno"> 247</span> <span class="preprocessor"></span> FAISS_ASSERT(!useFloat16Lookup);</div>
|
|
<div class="line"><a name="l00248"></a><span class="lineno"> 248</span> <span class="preprocessor">#endif</span></div>
|
|
<div class="line"><a name="l00249"></a><span class="lineno"> 249</span> <span class="preprocessor"></span></div>
|
|
<div class="line"><a name="l00250"></a><span class="lineno"> 250</span>  <span class="comment">// Calculate offset lengths, so we know where to write out</span></div>
|
|
<div class="line"><a name="l00251"></a><span class="lineno"> 251</span>  <span class="comment">// intermediate results</span></div>
|
|
<div class="line"><a name="l00252"></a><span class="lineno"> 252</span>  runCalcListOffsets(topQueryToCentroid, listLengths, prefixSumOffsets,</div>
|
|
<div class="line"><a name="l00253"></a><span class="lineno"> 253</span>  thrustMem, stream);</div>
|
|
<div class="line"><a name="l00254"></a><span class="lineno"> 254</span> </div>
|
|
<div class="line"><a name="l00255"></a><span class="lineno"> 255</span>  <span class="comment">// Calculate residual code distances, since this is without</span></div>
|
|
<div class="line"><a name="l00256"></a><span class="lineno"> 256</span>  <span class="comment">// precomputed codes</span></div>
|
|
<div class="line"><a name="l00257"></a><span class="lineno"> 257</span>  runPQCodeDistances(pqCentroidsInnermostCode,</div>
|
|
<div class="line"><a name="l00258"></a><span class="lineno"> 258</span>  queries,</div>
|
|
<div class="line"><a name="l00259"></a><span class="lineno"> 259</span>  centroids,</div>
|
|
<div class="line"><a name="l00260"></a><span class="lineno"> 260</span>  topQueryToCentroid,</div>
|
|
<div class="line"><a name="l00261"></a><span class="lineno"> 261</span>  codeDistances,</div>
|
|
<div class="line"><a name="l00262"></a><span class="lineno"> 262</span>  useFloat16Lookup,</div>
|
|
<div class="line"><a name="l00263"></a><span class="lineno"> 263</span>  stream);</div>
|
|
<div class="line"><a name="l00264"></a><span class="lineno"> 264</span> </div>
|
|
<div class="line"><a name="l00265"></a><span class="lineno"> 265</span>  <span class="comment">// Convert all codes to a distance, and write out (distance,</span></div>
|
|
<div class="line"><a name="l00266"></a><span class="lineno"> 266</span>  <span class="comment">// index) values for all intermediate results</span></div>
|
|
<div class="line"><a name="l00267"></a><span class="lineno"> 267</span>  {</div>
|
|
<div class="line"><a name="l00268"></a><span class="lineno"> 268</span>  <span class="keyword">auto</span> kThreadsPerBlock = 256;</div>
|
|
<div class="line"><a name="l00269"></a><span class="lineno"> 269</span> </div>
|
|
<div class="line"><a name="l00270"></a><span class="lineno"> 270</span>  <span class="keyword">auto</span> grid = dim3(topQueryToCentroid.getSize(1),</div>
|
|
<div class="line"><a name="l00271"></a><span class="lineno"> 271</span>  topQueryToCentroid.getSize(0));</div>
|
|
<div class="line"><a name="l00272"></a><span class="lineno"> 272</span>  <span class="keyword">auto</span> block = dim3(kThreadsPerBlock);</div>
|
|
<div class="line"><a name="l00273"></a><span class="lineno"> 273</span> </div>
|
|
<div class="line"><a name="l00274"></a><span class="lineno"> 274</span>  <span class="comment">// pq centroid distances</span></div>
|
|
<div class="line"><a name="l00275"></a><span class="lineno"> 275</span>  <span class="keyword">auto</span> smem = <span class="keyword">sizeof</span>(float);</div>
|
|
<div class="line"><a name="l00276"></a><span class="lineno"> 276</span> <span class="preprocessor">#ifdef FAISS_USE_FLOAT16</span></div>
|
|
<div class="line"><a name="l00277"></a><span class="lineno"> 277</span> <span class="preprocessor"></span> <span class="keywordflow">if</span> (useFloat16Lookup) {</div>
|
|
<div class="line"><a name="l00278"></a><span class="lineno"> 278</span>  smem = <span class="keyword">sizeof</span>(half);</div>
|
|
<div class="line"><a name="l00279"></a><span class="lineno"> 279</span>  }</div>
|
|
<div class="line"><a name="l00280"></a><span class="lineno"> 280</span> <span class="preprocessor">#endif</span></div>
|
|
<div class="line"><a name="l00281"></a><span class="lineno"> 281</span> <span class="preprocessor"></span> smem *= numSubQuantizers * numSubQuantizerCodes;</div>
|
|
<div class="line"><a name="l00282"></a><span class="lineno">  282</span>&#160;    FAISS_ASSERT(smem &lt;= getMaxSharedMemPerBlockCurrentDevice());</div>
|
|
<div class="line"><a name="l00283"></a><span class="lineno"> 283</span> </div>
|
|
<div class="line"><a name="l00284"></a><span class="lineno"> 284</span> <span class="preprocessor">#define RUN_PQ_OPT(NUM_SUB_Q, LOOKUP_T, LOOKUP_VEC_T) \</span></div>
|
|
<div class="line"><a name="l00285"></a><span class="lineno"> 285</span> <span class="preprocessor"> do { \</span></div>
|
|
<div class="line"><a name="l00286"></a><span class="lineno">  286</span>&#160;<span class="preprocessor">    auto codeDistancesT = codeDistances.toTensor&lt;LOOKUP_T&gt;();           \</span></div>
|
|
<div class="line"><a name="l00287"></a><span class="lineno"> 287</span> <span class="preprocessor"> \</span></div>
|
|
<div class="line"><a name="l00288"></a><span class="lineno">  288</span>&#160;<span class="preprocessor">    pqScanNoPrecomputedMultiPass&lt;NUM_SUB_Q, LOOKUP_T, LOOKUP_VEC_T&gt;     \</span></div>
|
|
<div class="line"><a name="l00289"></a><span class="lineno">  289</span>&#160;<span class="preprocessor">      &lt;&lt;&lt;grid, block, smem, stream&gt;&gt;&gt;(                                  \</span></div>
|
|
<div class="line"><a name="l00290"></a><span class="lineno"> 290</span> <span class="preprocessor"> queries, \</span></div>
|
|
<div class="line"><a name="l00291"></a><span class="lineno"> 291</span> <span class="preprocessor"> pqCentroidsInnermostCode, \</span></div>
|
|
<div class="line"><a name="l00292"></a><span class="lineno"> 292</span> <span class="preprocessor"> topQueryToCentroid, \</span></div>
|
|
<div class="line"><a name="l00293"></a><span class="lineno"> 293</span> <span class="preprocessor"> codeDistancesT, \</span></div>
|
|
<div class="line"><a name="l00294"></a><span class="lineno"> 294</span> <span class="preprocessor"> listCodes.data().get(), \</span></div>
|
|
<div class="line"><a name="l00295"></a><span class="lineno"> 295</span> <span class="preprocessor"> listLengths.data().get(), \</span></div>
|
|
<div class="line"><a name="l00296"></a><span class="lineno"> 296</span> <span class="preprocessor"> prefixSumOffsets, \</span></div>
|
|
<div class="line"><a name="l00297"></a><span class="lineno"> 297</span> <span class="preprocessor"> allDistances); \</span></div>
|
|
<div class="line"><a name="l00298"></a><span class="lineno"> 298</span> <span class="preprocessor"> } while (0)</span></div>
|
|
<div class="line"><a name="l00299"></a><span class="lineno"> 299</span> <span class="preprocessor"></span></div>
|
|
<div class="line"><a name="l00300"></a><span class="lineno"> 300</span> <span class="preprocessor">#ifdef FAISS_USE_FLOAT16</span></div>
|
|
<div class="line"><a name="l00301"></a><span class="lineno"> 301</span> <span class="preprocessor"></span><span class="preprocessor">#define RUN_PQ(NUM_SUB_Q) \</span></div>
|
|
<div class="line"><a name="l00302"></a><span class="lineno"> 302</span> <span class="preprocessor"> do { \</span></div>
|
|
<div class="line"><a name="l00303"></a><span class="lineno"> 303</span> <span class="preprocessor"> if (useFloat16Lookup) { \</span></div>
|
|
<div class="line"><a name="l00304"></a><span class="lineno"> 304</span> <span class="preprocessor"> RUN_PQ_OPT(NUM_SUB_Q, half, Half8); \</span></div>
|
|
<div class="line"><a name="l00305"></a><span class="lineno"> 305</span> <span class="preprocessor"> } else { \</span></div>
|
|
<div class="line"><a name="l00306"></a><span class="lineno"> 306</span> <span class="preprocessor"> RUN_PQ_OPT(NUM_SUB_Q, float, float4); \</span></div>
|
|
<div class="line"><a name="l00307"></a><span class="lineno"> 307</span> <span class="preprocessor"> } \</span></div>
|
|
<div class="line"><a name="l00308"></a><span class="lineno"> 308</span> <span class="preprocessor"> } while (0)</span></div>
|
|
<div class="line"><a name="l00309"></a><span class="lineno"> 309</span> <span class="preprocessor"></span><span class="preprocessor">#else</span></div>
|
|
<div class="line"><a name="l00310"></a><span class="lineno"> 310</span> <span class="preprocessor"></span><span class="preprocessor">#define RUN_PQ(NUM_SUB_Q) \</span></div>
|
|
<div class="line"><a name="l00311"></a><span class="lineno"> 311</span> <span class="preprocessor"> do { \</span></div>
|
|
<div class="line"><a name="l00312"></a><span class="lineno"> 312</span> <span class="preprocessor"> RUN_PQ_OPT(NUM_SUB_Q, float, float4); \</span></div>
|
|
<div class="line"><a name="l00313"></a><span class="lineno"> 313</span> <span class="preprocessor"> } while (0)</span></div>
|
|
<div class="line"><a name="l00314"></a><span class="lineno"> 314</span> <span class="preprocessor"></span><span class="preprocessor">#endif // FAISS_USE_FLOAT16</span></div>
|
|
<div class="line"><a name="l00315"></a><span class="lineno"> 315</span> <span class="preprocessor"></span></div>
|
|
<div class="line"><a name="l00316"></a><span class="lineno"> 316</span>  <span class="keywordflow">switch</span> (bytesPerCode) {</div>
|
|
<div class="line"><a name="l00317"></a><span class="lineno"> 317</span>  <span class="keywordflow">case</span> 1:</div>
|
|
<div class="line"><a name="l00318"></a><span class="lineno"> 318</span>  RUN_PQ(1);</div>
|
|
<div class="line"><a name="l00319"></a><span class="lineno"> 319</span>  <span class="keywordflow">break</span>;</div>
|
|
<div class="line"><a name="l00320"></a><span class="lineno"> 320</span>  <span class="keywordflow">case</span> 2:</div>
|
|
<div class="line"><a name="l00321"></a><span class="lineno"> 321</span>  RUN_PQ(2);</div>
|
|
<div class="line"><a name="l00322"></a><span class="lineno"> 322</span>  <span class="keywordflow">break</span>;</div>
|
|
<div class="line"><a name="l00323"></a><span class="lineno"> 323</span>  <span class="keywordflow">case</span> 3:</div>
|
|
<div class="line"><a name="l00324"></a><span class="lineno"> 324</span>  RUN_PQ(3);</div>
|
|
<div class="line"><a name="l00325"></a><span class="lineno"> 325</span>  <span class="keywordflow">break</span>;</div>
|
|
<div class="line"><a name="l00326"></a><span class="lineno"> 326</span>  <span class="keywordflow">case</span> 4:</div>
|
|
<div class="line"><a name="l00327"></a><span class="lineno"> 327</span>  RUN_PQ(4);</div>
|
|
<div class="line"><a name="l00328"></a><span class="lineno"> 328</span>  <span class="keywordflow">break</span>;</div>
|
|
<div class="line"><a name="l00329"></a><span class="lineno"> 329</span>  <span class="keywordflow">case</span> 8:</div>
|
|
<div class="line"><a name="l00330"></a><span class="lineno"> 330</span>  RUN_PQ(8);</div>
|
|
<div class="line"><a name="l00331"></a><span class="lineno"> 331</span>  <span class="keywordflow">break</span>;</div>
|
|
<div class="line"><a name="l00332"></a><span class="lineno"> 332</span>  <span class="keywordflow">case</span> 12:</div>
|
|
<div class="line"><a name="l00333"></a><span class="lineno"> 333</span>  RUN_PQ(12);</div>
|
|
<div class="line"><a name="l00334"></a><span class="lineno"> 334</span>  <span class="keywordflow">break</span>;</div>
|
|
<div class="line"><a name="l00335"></a><span class="lineno"> 335</span>  <span class="keywordflow">case</span> 16:</div>
|
|
<div class="line"><a name="l00336"></a><span class="lineno"> 336</span>  RUN_PQ(16);</div>
|
|
<div class="line"><a name="l00337"></a><span class="lineno"> 337</span>  <span class="keywordflow">break</span>;</div>
|
|
<div class="line"><a name="l00338"></a><span class="lineno"> 338</span>  <span class="keywordflow">case</span> 20:</div>
|
|
<div class="line"><a name="l00339"></a><span class="lineno"> 339</span>  RUN_PQ(20);</div>
|
|
<div class="line"><a name="l00340"></a><span class="lineno"> 340</span>  <span class="keywordflow">break</span>;</div>
|
|
<div class="line"><a name="l00341"></a><span class="lineno"> 341</span>  <span class="keywordflow">case</span> 24:</div>
|
|
<div class="line"><a name="l00342"></a><span class="lineno"> 342</span>  RUN_PQ(24);</div>
|
|
<div class="line"><a name="l00343"></a><span class="lineno"> 343</span>  <span class="keywordflow">break</span>;</div>
|
|
<div class="line"><a name="l00344"></a><span class="lineno"> 344</span>  <span class="keywordflow">case</span> 28:</div>
|
|
<div class="line"><a name="l00345"></a><span class="lineno"> 345</span>  RUN_PQ(28);</div>
|
|
<div class="line"><a name="l00346"></a><span class="lineno"> 346</span>  <span class="keywordflow">break</span>;</div>
|
|
<div class="line"><a name="l00347"></a><span class="lineno"> 347</span>  <span class="keywordflow">case</span> 32:</div>
|
|
<div class="line"><a name="l00348"></a><span class="lineno"> 348</span>  RUN_PQ(32);</div>
|
|
<div class="line"><a name="l00349"></a><span class="lineno"> 349</span>  <span class="keywordflow">break</span>;</div>
|
|
<div class="line"><a name="l00350"></a><span class="lineno"> 350</span>  <span class="keywordflow">case</span> 40:</div>
|
|
<div class="line"><a name="l00351"></a><span class="lineno"> 351</span>  RUN_PQ(40);</div>
|
|
<div class="line"><a name="l00352"></a><span class="lineno"> 352</span>  <span class="keywordflow">break</span>;</div>
|
|
<div class="line"><a name="l00353"></a><span class="lineno"> 353</span>  <span class="keywordflow">case</span> 48:</div>
|
|
<div class="line"><a name="l00354"></a><span class="lineno"> 354</span>  RUN_PQ(48);</div>
|
|
<div class="line"><a name="l00355"></a><span class="lineno"> 355</span>  <span class="keywordflow">break</span>;</div>
|
|
<div class="line"><a name="l00356"></a><span class="lineno"> 356</span>  <span class="keywordflow">case</span> 56:</div>
|
|
<div class="line"><a name="l00357"></a><span class="lineno"> 357</span>  RUN_PQ(56);</div>
|
|
<div class="line"><a name="l00358"></a><span class="lineno"> 358</span>  <span class="keywordflow">break</span>;</div>
|
|
<div class="line"><a name="l00359"></a><span class="lineno"> 359</span>  <span class="keywordflow">case</span> 64:</div>
|
|
<div class="line"><a name="l00360"></a><span class="lineno"> 360</span>  RUN_PQ(64);</div>
|
|
<div class="line"><a name="l00361"></a><span class="lineno"> 361</span>  <span class="keywordflow">break</span>;</div>
|
|
<div class="line"><a name="l00362"></a><span class="lineno"> 362</span>  <span class="keywordflow">case</span> 96:</div>
|
|
<div class="line"><a name="l00363"></a><span class="lineno"> 363</span>  RUN_PQ(96);</div>
|
|
<div class="line"><a name="l00364"></a><span class="lineno"> 364</span>  <span class="keywordflow">break</span>;</div>
|
|
<div class="line"><a name="l00365"></a><span class="lineno"> 365</span>  <span class="keywordflow">default</span>:</div>
|
|
<div class="line"><a name="l00366"></a><span class="lineno"> 366</span>  FAISS_ASSERT(<span class="keyword">false</span>);</div>
|
|
<div class="line"><a name="l00367"></a><span class="lineno"> 367</span>  <span class="keywordflow">break</span>;</div>
|
|
<div class="line"><a name="l00368"></a><span class="lineno"> 368</span>  }</div>
|
|
<div class="line"><a name="l00369"></a><span class="lineno"> 369</span> </div>
|
|
<div class="line"><a name="l00370"></a><span class="lineno"> 370</span> <span class="preprocessor">#undef RUN_PQ</span></div>
|
|
<div class="line"><a name="l00371"></a><span class="lineno"> 371</span> <span class="preprocessor"></span><span class="preprocessor">#undef RUN_PQ_OPT</span></div>
|
|
<div class="line"><a name="l00372"></a><span class="lineno"> 372</span> <span class="preprocessor"></span> }</div>
|
|
<div class="line"><a name="l00373"></a><span class="lineno"> 373</span> </div>
|
|
<div class="line"><a name="l00374"></a><span class="lineno"> 374</span>  CUDA_TEST_ERROR();</div>
|
|
<div class="line"><a name="l00375"></a><span class="lineno"> 375</span> </div>
|
|
<div class="line"><a name="l00376"></a><span class="lineno"> 376</span>  <span class="comment">// k-select the output in chunks, to increase parallelism</span></div>
|
|
<div class="line"><a name="l00377"></a><span class="lineno"> 377</span>  runPass1SelectLists(prefixSumOffsets,</div>
|
|
<div class="line"><a name="l00378"></a><span class="lineno"> 378</span>  allDistances,</div>
|
|
<div class="line"><a name="l00379"></a><span class="lineno"> 379</span>  topQueryToCentroid.getSize(1),</div>
|
|
<div class="line"><a name="l00380"></a><span class="lineno"> 380</span>  k,</div>
|
|
<div class="line"><a name="l00381"></a><span class="lineno"> 381</span>  <span class="keyword">false</span>, <span class="comment">// L2 distance chooses smallest</span></div>
|
|
<div class="line"><a name="l00382"></a><span class="lineno"> 382</span>  heapDistances,</div>
|
|
<div class="line"><a name="l00383"></a><span class="lineno"> 383</span>  heapIndices,</div>
|
|
<div class="line"><a name="l00384"></a><span class="lineno"> 384</span>  stream);</div>
|
|
<div class="line"><a name="l00385"></a><span class="lineno"> 385</span> </div>
|
|
<div class="line"><a name="l00386"></a><span class="lineno"> 386</span>  <span class="comment">// k-select final output</span></div>
|
|
<div class="line"><a name="l00387"></a><span class="lineno">  387</span>&#160;  <span class="keyword">auto</span> flatHeapDistances = heapDistances.downcastInner&lt;2&gt;();</div>
|
|
<div class="line"><a name="l00388"></a><span class="lineno">  388</span>&#160;  <span class="keyword">auto</span> flatHeapIndices = heapIndices.downcastInner&lt;2&gt;();</div>
|
|
<div class="line"><a name="l00389"></a><span class="lineno"> 389</span> </div>
|
|
<div class="line"><a name="l00390"></a><span class="lineno"> 390</span>  runPass2SelectLists(flatHeapDistances,</div>
|
|
<div class="line"><a name="l00391"></a><span class="lineno"> 391</span>  flatHeapIndices,</div>
|
|
<div class="line"><a name="l00392"></a><span class="lineno"> 392</span>  listIndices,</div>
|
|
<div class="line"><a name="l00393"></a><span class="lineno"> 393</span>  indicesOptions,</div>
|
|
<div class="line"><a name="l00394"></a><span class="lineno"> 394</span>  prefixSumOffsets,</div>
|
|
<div class="line"><a name="l00395"></a><span class="lineno"> 395</span>  topQueryToCentroid,</div>
|
|
<div class="line"><a name="l00396"></a><span class="lineno"> 396</span>  k,</div>
|
|
<div class="line"><a name="l00397"></a><span class="lineno"> 397</span>  <span class="keyword">false</span>, <span class="comment">// L2 distance chooses smallest</span></div>
|
|
<div class="line"><a name="l00398"></a><span class="lineno"> 398</span>  outDistances,</div>
|
|
<div class="line"><a name="l00399"></a><span class="lineno"> 399</span>  outIndices,</div>
|
|
<div class="line"><a name="l00400"></a><span class="lineno"> 400</span>  stream);</div>
|
|
<div class="line"><a name="l00401"></a><span class="lineno"> 401</span> }</div>
|
|
<div class="line"><a name="l00402"></a><span class="lineno"> 402</span> </div>
|
|
<div class="line"><a name="l00403"></a><span class="lineno">  403</span>&#160;<span class="keywordtype">void</span> runPQScanMultiPassNoPrecomputed(Tensor&lt;float, 2, true&gt;&amp; queries,</div>
|
|
<div class="line"><a name="l00404"></a><span class="lineno">  404</span>&#160;                                     Tensor&lt;float, 2, true&gt;&amp; centroids,</div>
|
|
<div class="line"><a name="l00405"></a><span class="lineno">  405</span>&#160;                                     Tensor&lt;float, 3, true&gt;&amp; pqCentroidsInnermostCode,</div>
|
|
<div class="line"><a name="l00406"></a><span class="lineno">  406</span>&#160;                                     Tensor&lt;int, 2, true&gt;&amp; topQueryToCentroid,</div>
|
|
<div class="line"><a name="l00407"></a><span class="lineno"> 407</span>  <span class="keywordtype">bool</span> useFloat16Lookup,</div>
|
|
<div class="line"><a name="l00408"></a><span class="lineno"> 408</span>  <span class="keywordtype">int</span> bytesPerCode,</div>
|
|
<div class="line"><a name="l00409"></a><span class="lineno"> 409</span>  <span class="keywordtype">int</span> numSubQuantizers,</div>
|
|
<div class="line"><a name="l00410"></a><span class="lineno"> 410</span>  <span class="keywordtype">int</span> numSubQuantizerCodes,</div>
|
|
<div class="line"><a name="l00411"></a><span class="lineno">  411</span>&#160;                                     thrust::device_vector&lt;void*&gt;&amp; listCodes,</div>
|
|
<div class="line"><a name="l00412"></a><span class="lineno">  412</span>&#160;                                     thrust::device_vector&lt;void*&gt;&amp; listIndices,</div>
|
|
<div class="line"><a name="l00413"></a><span class="lineno"> 413</span>  IndicesOptions indicesOptions,</div>
|
|
<div class="line"><a name="l00414"></a><span class="lineno">  414</span>&#160;                                     thrust::device_vector&lt;int&gt;&amp; listLengths,</div>
|
|
<div class="line"><a name="l00415"></a><span class="lineno"> 415</span>  <span class="keywordtype">int</span> maxListLength,</div>
|
|
<div class="line"><a name="l00416"></a><span class="lineno"> 416</span>  <span class="keywordtype">int</span> k,</div>
|
|
<div class="line"><a name="l00417"></a><span class="lineno"> 417</span>  <span class="comment">// output</span></div>
|
|
<div class="line"><a name="l00418"></a><span class="lineno">  418</span>&#160;                                     Tensor&lt;float, 2, true&gt;&amp; outDistances,</div>
|
|
<div class="line"><a name="l00419"></a><span class="lineno"> 419</span>  <span class="comment">// output</span></div>
|
|
<div class="line"><a name="l00420"></a><span class="lineno">  420</span>&#160;                                     Tensor&lt;long, 2, true&gt;&amp; outIndices,</div>
|
|
<div class="line"><a name="l00421"></a><span class="lineno"> 421</span>  GpuResources* res) {</div>
|
|
<div class="line"><a name="l00422"></a><span class="lineno"> 422</span>  constexpr <span class="keywordtype">int</span> kMinQueryTileSize = 8;</div>
|
|
<div class="line"><a name="l00423"></a><span class="lineno"> 423</span>  constexpr <span class="keywordtype">int</span> kMaxQueryTileSize = 128;</div>
|
|
<div class="line"><a name="l00424"></a><span class="lineno"> 424</span>  constexpr <span class="keywordtype">int</span> kThrustMemSize = 16384;</div>
|
|
<div class="line"><a name="l00425"></a><span class="lineno"> 425</span> </div>
|
|
<div class="line"><a name="l00426"></a><span class="lineno"> 426</span>  <span class="keywordtype">int</span> nprobe = topQueryToCentroid.getSize(1);</div>
|
|
<div class="line"><a name="l00427"></a><span class="lineno"> 427</span> </div>
|
|
<div class="line"><a name="l00428"></a><span class="lineno">  428</span>&#160;  <span class="keyword">auto</span>&amp; mem = res-&gt;getMemoryManagerCurrentDevice();</div>
|
|
<div class="line"><a name="l00429"></a><span class="lineno"> 429</span>  <span class="keyword">auto</span> stream = res->getDefaultStreamCurrentDevice();</div>
|
|
<div class="line"><a name="l00430"></a><span class="lineno"> 430</span> </div>
|
|
<div class="line"><a name="l00431"></a><span class="lineno"> 431</span>  <span class="comment">// Make a reservation for Thrust to do its dirty work (global memory</span></div>
|
|
<div class="line"><a name="l00432"></a><span class="lineno"> 432</span>  <span class="comment">// cross-block reduction space); hopefully this is large enough.</span></div>
|
|
<div class="line"><a name="l00433"></a><span class="lineno">  433</span>&#160;  DeviceTensor&lt;char, 1, true&gt; thrustMem1(</div>
|
|
<div class="line"><a name="l00434"></a><span class="lineno"> 434</span>  mem, {kThrustMemSize}, stream);</div>
|
|
<div class="line"><a name="l00435"></a><span class="lineno">  435</span>&#160;  DeviceTensor&lt;char, 1, true&gt; thrustMem2(</div>
|
|
<div class="line"><a name="l00436"></a><span class="lineno"> 436</span>  mem, {kThrustMemSize}, stream);</div>
|
|
<div class="line"><a name="l00437"></a><span class="lineno">  437</span>&#160;  DeviceTensor&lt;char, 1, true&gt;* thrustMem[2] =</div>
|
|
<div class="line"><a name="l00438"></a><span class="lineno">  438</span>&#160;    {&amp;thrustMem1, &amp;thrustMem2};</div>
|
|
<div class="line"><a name="l00439"></a><span class="lineno"> 439</span> </div>
|
|
<div class="line"><a name="l00440"></a><span class="lineno"> 440</span>  <span class="comment">// How much temporary storage is available?</span></div>
|
|
<div class="line"><a name="l00441"></a><span class="lineno"> 441</span>  <span class="comment">// If possible, we'd like to fit within the space available.</span></div>
|
|
<div class="line"><a name="l00442"></a><span class="lineno"> 442</span>  <span class="keywordtype">size_t</span> sizeAvailable = mem.getSizeAvailable();</div>
|
|
<div class="line"><a name="l00443"></a><span class="lineno"> 443</span> </div>
|
|
<div class="line"><a name="l00444"></a><span class="lineno"> 444</span>  <span class="comment">// We run two passes of heap selection</span></div>
|
|
<div class="line"><a name="l00445"></a><span class="lineno"> 445</span>  <span class="comment">// This is the size of the first-level heap passes</span></div>
|
|
<div class="line"><a name="l00446"></a><span class="lineno"> 446</span>  constexpr <span class="keywordtype">int</span> kNProbeSplit = 8;</div>
|
|
<div class="line"><a name="l00447"></a><span class="lineno"> 447</span>  <span class="keywordtype">int</span> pass2Chunks = std::min(nprobe, kNProbeSplit);</div>
|
|
<div class="line"><a name="l00448"></a><span class="lineno"> 448</span> </div>
|
|
<div class="line"><a name="l00449"></a><span class="lineno"> 449</span>  <span class="keywordtype">size_t</span> sizeForFirstSelectPass =</div>
|
|
<div class="line"><a name="l00450"></a><span class="lineno"> 450</span>  pass2Chunks * k * (<span class="keyword">sizeof</span>(float) + <span class="keyword">sizeof</span>(<span class="keywordtype">int</span>));</div>
|
|
<div class="line"><a name="l00451"></a><span class="lineno"> 451</span> </div>
|
|
<div class="line"><a name="l00452"></a><span class="lineno"> 452</span>  <span class="comment">// How much temporary storage we need per each query</span></div>
|
|
<div class="line"><a name="l00453"></a><span class="lineno"> 453</span>  <span class="keywordtype">size_t</span> sizePerQuery =</div>
|
|
<div class="line"><a name="l00454"></a><span class="lineno"> 454</span>  2 * <span class="comment">// streams</span></div>
|
|
<div class="line"><a name="l00455"></a><span class="lineno"> 455</span>  ((nprobe * <span class="keyword">sizeof</span>(int) + <span class="keyword">sizeof</span>(<span class="keywordtype">int</span>)) + <span class="comment">// prefixSumOffsets</span></div>
|
|
<div class="line"><a name="l00456"></a><span class="lineno"> 456</span>  nprobe * maxListLength * <span class="keyword">sizeof</span>(<span class="keywordtype">float</span>) + <span class="comment">// allDistances</span></div>
|
|
<div class="line"><a name="l00457"></a><span class="lineno"> 457</span>  <span class="comment">// residual distances</span></div>
|
|
<div class="line"><a name="l00458"></a><span class="lineno"> 458</span>  nprobe * numSubQuantizers * numSubQuantizerCodes * <span class="keyword">sizeof</span>(float) +</div>
|
|
<div class="line"><a name="l00459"></a><span class="lineno"> 459</span>  sizeForFirstSelectPass);</div>
|
|
<div class="line"><a name="l00460"></a><span class="lineno"> 460</span> </div>
|
|
<div class="line"><a name="l00461"></a><span class="lineno"> 461</span>  <span class="keywordtype">int</span> queryTileSize = (int) (sizeAvailable / sizePerQuery);</div>
|
|
<div class="line"><a name="l00462"></a><span class="lineno"> 462</span> </div>
|
|
<div class="line"><a name="l00463"></a><span class="lineno"> 463</span>  <span class="keywordflow">if</span> (queryTileSize &lt; kMinQueryTileSize) {</div>
|
|
<div class="line"><a name="l00464"></a><span class="lineno"> 464</span>  queryTileSize = kMinQueryTileSize;</div>
|
|
<div class="line"><a name="l00465"></a><span class="lineno"> 465</span>  } <span class="keywordflow">else</span> <span class="keywordflow">if</span> (queryTileSize > kMaxQueryTileSize) {</div>
|
|
<div class="line"><a name="l00466"></a><span class="lineno"> 466</span>  queryTileSize = kMaxQueryTileSize;</div>
|
|
<div class="line"><a name="l00467"></a><span class="lineno"> 467</span>  }</div>
|
|
<div class="line"><a name="l00468"></a><span class="lineno"> 468</span> </div>
|
|
<div class="line"><a name="l00469"></a><span class="lineno"> 469</span>  <span class="comment">// FIXME: we should adjust queryTileSize to deal with this, since</span></div>
|
|
<div class="line"><a name="l00470"></a><span class="lineno"> 470</span>  <span class="comment">// indexing is in int32</span></div>
|
|
<div class="line"><a name="l00471"></a><span class="lineno"> 471</span>  FAISS_ASSERT(queryTileSize * nprobe * maxListLength &lt;</div>
|
|
<div class="line"><a name="l00472"></a><span class="lineno"> 472</span>  std::numeric_limits&lt;int&gt;::max());</div>
|
|
<div class="line"><a name="l00473"></a><span class="lineno"> 473</span> </div>
|
|
<div class="line"><a name="l00474"></a><span class="lineno"> 474</span>  <span class="comment">// Temporary memory buffers</span></div>
|
|
<div class="line"><a name="l00475"></a><span class="lineno"> 475</span>  <span class="comment">// Make sure there is space prior to the start which will be 0, and</span></div>
|
|
<div class="line"><a name="l00476"></a><span class="lineno"> 476</span>  <span class="comment">// will handle the boundary condition without branches</span></div>
|
|
<div class="line"><a name="l00477"></a><span class="lineno"> 477</span>  DeviceTensor&lt;int, 1, true&gt; prefixSumOffsetSpace1(</div>
|
|
<div class="line"><a name="l00478"></a><span class="lineno"> 478</span>  mem, {queryTileSize * nprobe + 1}, stream);</div>
|
|
<div class="line"><a name="l00479"></a><span class="lineno"> 479</span>  DeviceTensor&lt;int, 1, true&gt; prefixSumOffsetSpace2(</div>
|
|
<div class="line"><a name="l00480"></a><span class="lineno"> 480</span>  mem, {queryTileSize * nprobe + 1}, stream);</div>
|
|
<div class="line"><a name="l00481"></a><span class="lineno"> 481</span> </div>
|
|
<div class="line"><a name="l00482"></a><span class="lineno"> 482</span>  DeviceTensor&lt;int, 2, true&gt; prefixSumOffsets1(</div>
|
|
<div class="line"><a name="l00483"></a><span class="lineno"> 483</span>  prefixSumOffsetSpace1[1].data(),</div>
|
|
<div class="line"><a name="l00484"></a><span class="lineno"> 484</span>  {queryTileSize, nprobe});</div>
|
|
<div class="line"><a name="l00485"></a><span class="lineno"> 485</span>  DeviceTensor&lt;int, 2, true&gt; prefixSumOffsets2(</div>
|
|
<div class="line"><a name="l00486"></a><span class="lineno"> 486</span>  prefixSumOffsetSpace2[1].data(),</div>
|
|
<div class="line"><a name="l00487"></a><span class="lineno"> 487</span>  {queryTileSize, nprobe});</div>
|
|
<div class="line"><a name="l00488"></a><span class="lineno"> 488</span>  DeviceTensor&lt;int, 2, true&gt;* prefixSumOffsets[2] =</div>
|
|
<div class="line"><a name="l00489"></a><span class="lineno"> 489</span>  {&amp;prefixSumOffsets1, &amp;prefixSumOffsets2};</div>
|
|
<div class="line"><a name="l00490"></a><span class="lineno"> 490</span> </div>
|
|
<div class="line"><a name="l00491"></a><span class="lineno"> 491</span>  <span class="comment">// Make sure the element before prefixSumOffsets is 0, since we</span></div>
|
|
<div class="line"><a name="l00492"></a><span class="lineno"> 492</span>  <span class="comment">// depend upon simple, boundary-less indexing to get proper results</span></div>
|
|
<div class="line"><a name="l00493"></a><span class="lineno"> 493</span>  CUDA_VERIFY(cudaMemsetAsync(prefixSumOffsetSpace1.data(),</div>
|
|
<div class="line"><a name="l00494"></a><span class="lineno"> 494</span>  0,</div>
|
|
<div class="line"><a name="l00495"></a><span class="lineno"> 495</span>  <span class="keyword">sizeof</span>(int),</div>
|
|
<div class="line"><a name="l00496"></a><span class="lineno"> 496</span>  stream));</div>
|
|
<div class="line"><a name="l00497"></a><span class="lineno"> 497</span>  CUDA_VERIFY(cudaMemsetAsync(prefixSumOffsetSpace2.data(),</div>
|
|
<div class="line"><a name="l00498"></a><span class="lineno"> 498</span>  0,</div>
|
|
<div class="line"><a name="l00499"></a><span class="lineno"> 499</span>  <span class="keyword">sizeof</span>(int),</div>
|
|
<div class="line"><a name="l00500"></a><span class="lineno"> 500</span>  stream));</div>
|
|
<div class="line"><a name="l00501"></a><span class="lineno"> 501</span> </div>
|
|
<div class="line"><a name="l00502"></a><span class="lineno"> 502</span>  <span class="keywordtype">int</span> codeDistanceTypeSize = <span class="keyword">sizeof</span>(float);</div>
|
|
<div class="line"><a name="l00503"></a><span class="lineno"> 503</span> <span class="preprocessor">#ifdef FAISS_USE_FLOAT16</span></div>
|
|
<div class="line"><a name="l00504"></a><span class="lineno"> 504</span> <span class="preprocessor"></span> <span class="keywordflow">if</span> (useFloat16Lookup) {</div>
|
|
<div class="line"><a name="l00505"></a><span class="lineno"> 505</span>  codeDistanceTypeSize = <span class="keyword">sizeof</span>(half);</div>
|
|
<div class="line"><a name="l00506"></a><span class="lineno"> 506</span>  }</div>
|
|
<div class="line"><a name="l00507"></a><span class="lineno"> 507</span> <span class="preprocessor">#else</span></div>
|
|
<div class="line"><a name="l00508"></a><span class="lineno"> 508</span> <span class="preprocessor"></span> FAISS_ASSERT(!useFloat16Lookup);</div>
|
|
<div class="line"><a name="l00509"></a><span class="lineno"> 509</span>  <span class="keywordtype">int</span> codeSize = <span class="keyword">sizeof</span>(float);</div>
|
|
<div class="line"><a name="l00510"></a><span class="lineno"> 510</span> <span class="preprocessor">#endif</span></div>
|
|
<div class="line"><a name="l00511"></a><span class="lineno"> 511</span> <span class="preprocessor"></span></div>
|
|
<div class="line"><a name="l00512"></a><span class="lineno"> 512</span>  <span class="keywordtype">int</span> totalCodeDistancesSize =</div>
|
|
<div class="line"><a name="l00513"></a><span class="lineno"> 513</span>  queryTileSize * nprobe * numSubQuantizers * numSubQuantizerCodes *</div>
|
|
<div class="line"><a name="l00514"></a><span class="lineno"> 514</span>  codeDistanceTypeSize;</div>
|
|
<div class="line"><a name="l00515"></a><span class="lineno"> 515</span> </div>
|
|
<div class="line"><a name="l00516"></a><span class="lineno"> 516</span>  DeviceTensor&lt;char, 1, true&gt; codeDistances1Mem(</div>
|
|
<div class="line"><a name="l00517"></a><span class="lineno"> 517</span>  mem, {totalCodeDistancesSize}, stream);</div>
|
|
<div class="line"><a name="l00518"></a><span class="lineno"> 518</span>  NoTypeTensor&lt;4, true&gt; codeDistances1(</div>
|
|
<div class="line"><a name="l00519"></a><span class="lineno"> 519</span>  codeDistances1Mem.data(),</div>
|
|
<div class="line"><a name="l00520"></a><span class="lineno"> 520</span>  codeDistanceTypeSize,</div>
|
|
<div class="line"><a name="l00521"></a><span class="lineno"> 521</span>  {queryTileSize, nprobe, numSubQuantizers, numSubQuantizerCodes});</div>
|
|
<div class="line"><a name="l00522"></a><span class="lineno"> 522</span> </div>
|
|
<div class="line"><a name="l00523"></a><span class="lineno"> 523</span>  DeviceTensor&lt;char, 1, true&gt; codeDistances2Mem(</div>
|
|
<div class="line"><a name="l00524"></a><span class="lineno"> 524</span>  mem, {totalCodeDistancesSize}, stream);</div>
|
|
<div class="line"><a name="l00525"></a><span class="lineno"> 525</span>  NoTypeTensor&lt;4, true&gt; codeDistances2(</div>
|
|
<div class="line"><a name="l00526"></a><span class="lineno"> 526</span>  codeDistances2Mem.data(),</div>
|
|
<div class="line"><a name="l00527"></a><span class="lineno"> 527</span>  codeDistanceTypeSize,</div>
|
|
<div class="line"><a name="l00528"></a><span class="lineno"> 528</span>  {queryTileSize, nprobe, numSubQuantizers, numSubQuantizerCodes});</div>
|
|
<div class="line"><a name="l00529"></a><span class="lineno"> 529</span> </div>
|
|
<div class="line"><a name="l00530"></a><span class="lineno"> 530</span>  NoTypeTensor&lt;4, true&gt;* codeDistances[2] =</div>
|
|
<div class="line"><a name="l00531"></a><span class="lineno"> 531</span>  {&amp;codeDistances1, &amp;codeDistances2};</div>
|
|
<div class="line"><a name="l00532"></a><span class="lineno"> 532</span> </div>
|
|
<div class="line"><a name="l00533"></a><span class="lineno"> 533</span>  DeviceTensor&lt;float, 1, true&gt; allDistances1(</div>
|
|
<div class="line"><a name="l00534"></a><span class="lineno"> 534</span>  mem, {queryTileSize * nprobe * maxListLength}, stream);</div>
|
|
<div class="line"><a name="l00535"></a><span class="lineno"> 535</span>  DeviceTensor&lt;float, 1, true&gt; allDistances2(</div>
|
|
<div class="line"><a name="l00536"></a><span class="lineno"> 536</span>  mem, {queryTileSize * nprobe * maxListLength}, stream);</div>
|
|
<div class="line"><a name="l00537"></a><span class="lineno"> 537</span>  DeviceTensor&lt;float, 1, true&gt;* allDistances[2] =</div>
|
|
<div class="line"><a name="l00538"></a><span class="lineno"> 538</span>  {&amp;allDistances1, &amp;allDistances2};</div>
|
|
<div class="line"><a name="l00539"></a><span class="lineno"> 539</span> </div>
|
|
<div class="line"><a name="l00540"></a><span class="lineno"> 540</span>  DeviceTensor&lt;float, 3, true&gt; heapDistances1(</div>
|
|
<div class="line"><a name="l00541"></a><span class="lineno"> 541</span>  mem, {queryTileSize, pass2Chunks, k}, stream);</div>
|
|
<div class="line"><a name="l00542"></a><span class="lineno"> 542</span>  DeviceTensor&lt;float, 3, true&gt; heapDistances2(</div>
|
|
<div class="line"><a name="l00543"></a><span class="lineno"> 543</span>  mem, {queryTileSize, pass2Chunks, k}, stream);</div>
|
|
<div class="line"><a name="l00544"></a><span class="lineno"> 544</span>  DeviceTensor&lt;float, 3, true&gt;* heapDistances[2] =</div>
|
|
<div class="line"><a name="l00545"></a><span class="lineno"> 545</span>  {&amp;heapDistances1, &amp;heapDistances2};</div>
|
|
<div class="line"><a name="l00546"></a><span class="lineno"> 546</span> </div>
|
|
<div class="line"><a name="l00547"></a><span class="lineno"> 547</span>  DeviceTensor&lt;int, 3, true&gt; heapIndices1(</div>
|
|
<div class="line"><a name="l00548"></a><span class="lineno"> 548</span>  mem, {queryTileSize, pass2Chunks, k}, stream);</div>
|
|
<div class="line"><a name="l00549"></a><span class="lineno"> 549</span>  DeviceTensor&lt;int, 3, true&gt; heapIndices2(</div>
|
|
<div class="line"><a name="l00550"></a><span class="lineno"> 550</span>  mem, {queryTileSize, pass2Chunks, k}, stream);</div>
|
|
<div class="line"><a name="l00551"></a><span class="lineno"> 551</span>  DeviceTensor&lt;int, 3, true&gt;* heapIndices[2] =</div>
|
|
<div class="line"><a name="l00552"></a><span class="lineno"> 552</span>  {&amp;heapIndices1, &amp;heapIndices2};</div>
|
|
<div class="line"><a name="l00553"></a><span class="lineno"> 553</span> </div>
|
|
<div class="line"><a name="l00554"></a><span class="lineno"> 554</span>  <span class="keyword">auto</span> streams = res->getAlternateStreamsCurrentDevice();</div>
|
|
<div class="line"><a name="l00555"></a><span class="lineno"> 555</span>  streamWait(streams, {stream});</div>
|
|
<div class="line"><a name="l00556"></a><span class="lineno"> 556</span> </div>
|
|
<div class="line"><a name="l00557"></a><span class="lineno"> 557</span>  <span class="keywordtype">int</span> curStream = 0;</div>
|
|
<div class="line"><a name="l00558"></a><span class="lineno"> 558</span> </div>
|
|
<div class="line"><a name="l00559"></a><span class="lineno"> 559</span>  <span class="keywordflow">for</span> (<span class="keywordtype">int</span> query = 0; query &lt; queries.getSize(0); query += queryTileSize) {</div>
|
|
<div class="line"><a name="l00560"></a><span class="lineno"> 560</span>  <span class="keywordtype">int</span> numQueriesInTile =</div>
|
|
<div class="line"><a name="l00561"></a><span class="lineno"> 561</span>  std::min(queryTileSize, queries.getSize(0) - query);</div>
|
|
<div class="line"><a name="l00562"></a><span class="lineno"> 562</span> </div>
|
|
<div class="line"><a name="l00563"></a><span class="lineno"> 563</span>  <span class="keyword">auto</span> prefixSumOffsetsView =</div>
|
|
<div class="line"><a name="l00564"></a><span class="lineno"> 564</span>  prefixSumOffsets[curStream]->narrowOutermost(0, numQueriesInTile);</div>
|
|
<div class="line"><a name="l00565"></a><span class="lineno"> 565</span> </div>
|
|
<div class="line"><a name="l00566"></a><span class="lineno"> 566</span>  <span class="keyword">auto</span> codeDistancesView =</div>
|
|
<div class="line"><a name="l00567"></a><span class="lineno"> 567</span>  codeDistances[curStream]->narrowOutermost(0, numQueriesInTile);</div>
|
|
<div class="line"><a name="l00568"></a><span class="lineno"> 568</span>  <span class="keyword">auto</span> coarseIndicesView =</div>
|
|
<div class="line"><a name="l00569"></a><span class="lineno"> 569</span>  topQueryToCentroid.narrowOutermost(query, numQueriesInTile);</div>
|
|
<div class="line"><a name="l00570"></a><span class="lineno"> 570</span>  <span class="keyword">auto</span> queryView =</div>
|
|
<div class="line"><a name="l00571"></a><span class="lineno"> 571</span>  queries.narrowOutermost(query, numQueriesInTile);</div>
|
|
<div class="line"><a name="l00572"></a><span class="lineno"> 572</span> </div>
|
|
<div class="line"><a name="l00573"></a><span class="lineno"> 573</span>  <span class="keyword">auto</span> heapDistancesView =</div>
|
|
<div class="line"><a name="l00574"></a><span class="lineno"> 574</span>  heapDistances[curStream]->narrowOutermost(0, numQueriesInTile);</div>
|
|
<div class="line"><a name="l00575"></a><span class="lineno"> 575</span>  <span class="keyword">auto</span> heapIndicesView =</div>
|
|
<div class="line"><a name="l00576"></a><span class="lineno"> 576</span>  heapIndices[curStream]->narrowOutermost(0, numQueriesInTile);</div>
|
|
<div class="line"><a name="l00577"></a><span class="lineno"> 577</span> </div>
|
|
<div class="line"><a name="l00578"></a><span class="lineno"> 578</span>  <span class="keyword">auto</span> outDistanceView =</div>
|
|
<div class="line"><a name="l00579"></a><span class="lineno"> 579</span>  outDistances.narrowOutermost(query, numQueriesInTile);</div>
|
|
<div class="line"><a name="l00580"></a><span class="lineno"> 580</span>  <span class="keyword">auto</span> outIndicesView =</div>
|
|
<div class="line"><a name="l00581"></a><span class="lineno"> 581</span>  outIndices.narrowOutermost(query, numQueriesInTile);</div>
|
|
<div class="line"><a name="l00582"></a><span class="lineno"> 582</span> </div>
|
|
<div class="line"><a name="l00583"></a><span class="lineno"> 583</span>  runMultiPassTile(queryView,</div>
|
|
<div class="line"><a name="l00584"></a><span class="lineno"> 584</span>  centroids,</div>
|
|
<div class="line"><a name="l00585"></a><span class="lineno"> 585</span>  pqCentroidsInnermostCode,</div>
|
|
<div class="line"><a name="l00586"></a><span class="lineno"> 586</span>  codeDistancesView,</div>
|
|
<div class="line"><a name="l00587"></a><span class="lineno"> 587</span>  coarseIndicesView,</div>
|
|
<div class="line"><a name="l00588"></a><span class="lineno"> 588</span>  useFloat16Lookup,</div>
|
|
<div class="line"><a name="l00589"></a><span class="lineno"> 589</span>  bytesPerCode,</div>
|
|
<div class="line"><a name="l00590"></a><span class="lineno"> 590</span>  numSubQuantizers,</div>
|
|
<div class="line"><a name="l00591"></a><span class="lineno"> 591</span>  numSubQuantizerCodes,</div>
|
|
<div class="line"><a name="l00592"></a><span class="lineno"> 592</span>  listCodes,</div>
|
|
<div class="line"><a name="l00593"></a><span class="lineno"> 593</span>  listIndices,</div>
|
|
<div class="line"><a name="l00594"></a><span class="lineno"> 594</span>  indicesOptions,</div>
|
|
<div class="line"><a name="l00595"></a><span class="lineno"> 595</span>  listLengths,</div>
|
|
<div class="line"><a name="l00596"></a><span class="lineno"> 596</span>  *thrustMem[curStream],</div>
|
|
<div class="line"><a name="l00597"></a><span class="lineno"> 597</span>  prefixSumOffsetsView,</div>
|
|
<div class="line"><a name="l00598"></a><span class="lineno"> 598</span>  *allDistances[curStream],</div>
|
|
<div class="line"><a name="l00599"></a><span class="lineno"> 599</span>  heapDistancesView,</div>
|
|
<div class="line"><a name="l00600"></a><span class="lineno"> 600</span>  heapIndicesView,</div>
|
|
<div class="line"><a name="l00601"></a><span class="lineno"> 601</span>  k,</div>
|
|
<div class="line"><a name="l00602"></a><span class="lineno"> 602</span>  outDistanceView,</div>
|
|
<div class="line"><a name="l00603"></a><span class="lineno"> 603</span>  outIndicesView,</div>
|
|
<div class="line"><a name="l00604"></a><span class="lineno"> 604</span>  streams[curStream]);</div>
|
|
<div class="line"><a name="l00605"></a><span class="lineno"> 605</span> </div>
|
|
<div class="line"><a name="l00606"></a><span class="lineno"> 606</span>  curStream = (curStream + 1) % 2;</div>
|
|
<div class="line"><a name="l00607"></a><span class="lineno"> 607</span>  }</div>
|
|
<div class="line"><a name="l00608"></a><span class="lineno"> 608</span> </div>
|
|
<div class="line"><a name="l00609"></a><span class="lineno"> 609</span>  streamWait({stream}, streams);</div>
|
|
<div class="line"><a name="l00610"></a><span class="lineno"> 610</span> }</div>
|
|
<div class="line"><a name="l00611"></a><span class="lineno"> 611</span> </div>
|
|
<div class="line"><a name="l00612"></a><span class="lineno"> 612</span> } } <span class="comment">// namespace</span></div>
|
|
<div class="ttc" id="structfaiss_1_1gpu_1_1LoadStore_html"><div class="ttname"><a href="structfaiss_1_1gpu_1_1LoadStore.html">faiss::gpu::LoadStore</a></div><div class="ttdef"><b>Definition:</b> <a href="LoadStoreOperators_8cuh_source.html#l00024">LoadStoreOperators.cuh:24</a></div></div>
|
|
<div class="ttc" id="classfaiss_1_1gpu_1_1Tensor_html_a6699c311648457f257afa340c61f417c"><div class="ttname"><a href="classfaiss_1_1gpu_1_1Tensor.html#a6699c311648457f257afa340c61f417c">faiss::gpu::Tensor::getSize</a></div><div class="ttdeci">__host__ __device__ IndexT getSize(int i) const </div><div class="ttdef"><b>Definition:</b> <a href="Tensor_8cuh_source.html#l00226">Tensor.cuh:226</a></div></div>
|
|
<div class="ttc" id="classfaiss_1_1gpu_1_1Tensor_html_a50411ce4d0fa32ef715e3321b6e33212"><div class="ttname"><a href="classfaiss_1_1gpu_1_1Tensor.html#a50411ce4d0fa32ef715e3321b6e33212">faiss::gpu::Tensor::data</a></div><div class="ttdeci">__host__ __device__ DataPtrType data()</div><div class="ttdoc">Returns a raw pointer to the start of our data. </div><div class="ttdef"><b>Definition:</b> <a href="Tensor_8cuh_source.html#l00178">Tensor.cuh:178</a></div></div>
|
|
<div class="ttc" id="classfaiss_1_1gpu_1_1Tensor_html"><div class="ttname"><a href="classfaiss_1_1gpu_1_1Tensor.html">faiss::gpu::Tensor</a></div><div class="ttdoc">Our tensor type. </div><div class="ttdef"><b>Definition:</b> <a href="Tensor_8cuh_source.html#l00030">Tensor.cuh:30</a></div></div>
|
|
<div class="ttc" id="structfaiss_1_1gpu_1_1LoadCodeDistances_html"><div class="ttname"><a href="structfaiss_1_1gpu_1_1LoadCodeDistances.html">faiss::gpu::LoadCodeDistances</a></div><div class="ttdef"><b>Definition:</b> <a href="PQScanMultiPassNoPrecomputed_8cu_source.html#l00053">PQScanMultiPassNoPrecomputed.cu:53</a></div></div>
|
|
</div><!-- fragment --></div><!-- contents -->
|
|
<!-- start footer part -->
|
|
<hr class="footer"/><address class="footer"><small>
|
|
Generated by  <a href="http://www.doxygen.org/index.html">
|
|
<img class="footer" src="doxygen.png" alt="doxygen"/>
|
|
</a> 1.8.5
|
|
</small></address>
|
|
</body>
|
|
</html>
|