/**
 * Copyright (c) Facebook, Inc. and its affiliates.
 *
 * This source code is licensed under the MIT license found in the
 * LICENSE file in the root directory of this source tree.
 */
#pragma once
#include <cublas_v2.h>
#include "Float16.cuh"
#include "Tensor.cuh"
namespace faiss { namespace gpu {
class DeviceMemory;
/// Computes C = alpha * A * B + beta * C for single-precision matrices,
/// enqueued on `stream` using the given cuBLAS `handle`.
/// Expects row major layout, not fortran/blas column major!
/// transC/transA/transB request a transposed view of the respective
/// operand. `useHgemm` exists for signature parity with the half-precision
/// overload and is ignored for float32.
void runMatrixMult(Tensor<float, 2, true>& c, bool transC,
                   Tensor<float, 2, true>& a, bool transA,
                   Tensor<float, 2, true>& b, bool transB,
                   float alpha,
                   float beta,
                   bool useHgemm, // ignored for float32
                   cublasHandle_t handle,
                   cudaStream_t stream);
#ifdef FAISS_USE_FLOAT16
/// Computes C = alpha * A * B + beta * C for half-precision matrices,
/// enqueued on `stream` using the given cuBLAS `handle`.
/// Expects row major layout, not fortran/blas column major!
/// transC/transA/transB request a transposed view of the respective
/// operand. When `useHgemm` is true the multiply presumably runs as a
/// native half gemm rather than with float accumulation — confirm in the
/// implementation (.cu) before relying on precision behavior.
void runMatrixMult(Tensor<half, 2, true>& c, bool transC,
                   Tensor<half, 2, true>& a, bool transA,
                   Tensor<half, 2, true>& b, bool transB,
                   float alpha,
                   float beta,
                   bool useHgemm,
                   cublasHandle_t handle,
                   cudaStream_t stream);
#endif
/// For each slice `i` along the outermost dimension, computes
/// C_i = alpha * A_i * B_i + beta * C_i by issuing one gemm per slice
/// (iterated, not batched) on `stream` via the given cuBLAS `handle`.
/// Expects row major layout, not fortran/blas column major!
void runIteratedMatrixMult(
    Tensor<float, 3, true>& c, bool transC,
    Tensor<float, 3, true>& a, bool transA,
    Tensor<float, 3, true>& b, bool transB,
    float alpha,
    float beta,
    cublasHandle_t handle,
    cudaStream_t stream);
/// For each slice `i` along the outermost dimension, computes
/// C_i = alpha * A_i * B_i + beta * C_i using a batched gemm on `stream`
/// via the given cuBLAS `handle`. Temporary device storage (presumably
/// for the per-slice pointer arrays a batched gemm needs — confirm in
/// the implementation) is drawn from `mem`.
/// Expects row major layout, not fortran/blas column major!
void runBatchMatrixMult(
    Tensor<float, 3, true>& c, bool transC,
    Tensor<float, 3, true>& a, bool transA,
    Tensor<float, 3, true>& b, bool transB,
    float alpha,
    float beta,
    DeviceMemory& mem,
    cublasHandle_t handle,
    cudaStream_t stream);
} } // namespace