/**
 * Copyright (c) 2015-present, Facebook, Inc.
 * All rights reserved.
 *
 * This source code is licensed under the CC-by-NC license found in the
 * LICENSE file in the root directory of this source tree.
 */
// Copyright 2004-present Facebook. All Rights Reserved.

#pragma once

// NOTE(review): the angle-bracket contents of this header were stripped
// (empty #include, `Tensor` with no template arguments). The include target
// and the Tensor template arguments below are reconstructed — float/half
// element type from the FAISS_USE_FLOAT16 guard and Float16.cuh include,
// 2-D vs 3-D rank from the single-matrix vs batched doc comments. Confirm
// against Tensor.cuh and the corresponding .cu definitions.

#include <cublas_v2.h> // cublasHandle_t used by every declaration below
#include "Float16.cuh"
#include "Tensor.cuh"

namespace faiss { namespace gpu {

class DeviceMemory;

/// C = alpha * A * B + beta * C
/// Expects row major layout, not fortran/blas column major!
void runMatrixMult(Tensor<float, 2, true>& c, bool transC,
                   Tensor<float, 2, true>& a, bool transA,
                   Tensor<float, 2, true>& b, bool transB,
                   float alpha,
                   float beta,
                   cublasHandle_t handle,
                   cudaStream_t stream);

#ifdef FAISS_USE_FLOAT16
/// C = alpha * A * B + beta * C
/// Expects row major layout, not fortran/blas column major!
/// Half-precision overload; only available when FAISS_USE_FLOAT16 is defined.
void runMatrixMult(Tensor<half, 2, true>& c, bool transC,
                   Tensor<half, 2, true>& a, bool transA,
                   Tensor<half, 2, true>& b, bool transB,
                   float alpha,
                   float beta,
                   cublasHandle_t handle,
                   cudaStream_t stream);
#endif

/// C_i = alpha * A_i * B_i + beta * C_i
/// where `i` is the outermost dimension, via iterated gemm
/// Expects row major layout, not fortran/blas column major!
void runIteratedMatrixMult(Tensor<float, 3, true>& c, bool transC,
                           Tensor<float, 3, true>& a, bool transA,
                           Tensor<float, 3, true>& b, bool transB,
                           float alpha,
                           float beta,
                           cublasHandle_t handle,
                           cudaStream_t stream);

/// C_i = alpha * A_i * B_i + beta * C_i
/// where `i` is the outermost dimension, via batched gemm
/// Expects row major layout, not fortran/blas column major!
/// `mem` supplies scratch space for the device pointer arrays that batched
/// gemm requires.
void runBatchMatrixMult(Tensor<float, 3, true>& c, bool transC,
                        Tensor<float, 3, true>& a, bool transA,
                        Tensor<float, 3, true>& b, bool transB,
                        float alpha,
                        float beta,
                        DeviceMemory& mem,
                        cublasHandle_t handle,
                        cudaStream_t stream);

} } // namespace faiss::gpu