diff --git a/faiss/gpu/CMakeLists.txt b/faiss/gpu/CMakeLists.txt index c97bae703..080a95cbc 100644 --- a/faiss/gpu/CMakeLists.txt +++ b/faiss/gpu/CMakeLists.txt @@ -314,7 +314,9 @@ foreach(header ${FAISS_GPU_HEADERS}) endforeach() if (USE_ROCM) - target_link_libraries(faiss_gpu PRIVATE $<$:hip::host> $<$:roc::hipblas>) + target_link_libraries(faiss_gpu PRIVATE + $<$:hip::host> + $<$:roc::hipblas>) target_compile_options(faiss_gpu PRIVATE) else() # Prepares a host linker script and enables host linker to support @@ -333,6 +335,13 @@ else() target_link_options(faiss_gpu PRIVATE "${CMAKE_CURRENT_BINARY_DIR}/fatbin.ld") find_package(CUDAToolkit REQUIRED) - target_link_libraries(faiss_gpu PRIVATE CUDA::cudart CUDA::cublas $<$:raft::raft> $<$:raft::compiled> $<$:nvidia::cutlass::cutlass> $<$:OpenMP::OpenMP_CXX>) - target_compile_options(faiss_gpu PRIVATE $<$:-Xfatbin=-compress-all --expt-extended-lambda --expt-relaxed-constexpr $<$:-Xcompiler=${OpenMP_CXX_FLAGS}>>) + target_link_libraries(faiss_gpu PRIVATE CUDA::cudart CUDA::cublas + $<$:raft::raft> + $<$:raft::compiled> + $<$:nvidia::cutlass::cutlass> + $<$:OpenMP::OpenMP_CXX>) + target_compile_options(faiss_gpu PRIVATE + $<$:-Xfatbin=-compress-all + --expt-extended-lambda --expt-relaxed-constexpr + $<$:-Xcompiler=${OpenMP_CXX_FLAGS}>>) endif() diff --git a/faiss/gpu/hipify.sh b/faiss/gpu/hipify.sh index 09d466545..dc0af11a7 100755 --- a/faiss/gpu/hipify.sh +++ b/faiss/gpu/hipify.sh @@ -3,35 +3,35 @@ # go one level up from faiss/gpu top=$(dirname "${BASH_SOURCE[0]}")/.. echo "top=$top" -cd $top -echo "pwd=`pwd`" +cd "$top" || exit +echo "pwd=$(pwd)" # create all destination directories for hipified files into sibling 'gpu-rocm' directory -for src in $(find ./gpu -type d) +while IFS= read -r -d '' src do - dst=$(echo $src | sed 's/gpu/gpu-rocm/') + dst="${src//gpu/gpu-rocm}" echo "Creating $dst" - mkdir -p $dst -done + mkdir -p "$dst" +done < <(find ./gpu -type d -print0) # run hipify-perl against all *.cu *.cuh *.h *.cpp files, no renaming # run all files in parallel to speed up for ext in cu cuh h cpp do - for src in $(find ./gpu -name "*.$ext") + while IFS= read -r -d '' src do - dst=$(echo $src | sed 's@./gpu@./gpu-rocm@') - hipify-perl -o=$dst.tmp $src & - done + dst="${src//\.\/gpu/\.\/gpu-rocm}" + hipify-perl -o="$dst.tmp" "$src" & + done < <(find ./gpu -name "*.$ext" -print0) done wait # rename all hipified *.cu files to *.hip -for src in $(find ./gpu-rocm -name "*.cu.tmp") +while IFS= read -r -d '' src do dst=${src%.cu.tmp}.hip.tmp - mv $src $dst -done + mv "$src" "$dst" +done < <(find ./gpu-rocm -name "*.cu.tmp" -print0) # replace header include statements "@#include @' $src - sed -i 's@#include @#include @' $src - done + sed -i 's@#include @#include @' "$src" + sed -i 's@#include @#include @' "$src" + done < <(find ./gpu-rocm -name "*.$ext.tmp" -print0) done # hipify was run in parallel above # don't copy the tmp file if it is unchanged for ext in hip cuh h cpp do - for src in $(find ./gpu-rocm -name "*.$ext.tmp") + while IFS= read -r -d '' src do dst=${src%.tmp} - if test -f $dst + if test -f "$dst" then - if diff -q $src $dst >& /dev/null + if diff -q "$src" "$dst" >& /dev/null then echo "$dst [unchanged]" - rm $src + rm "$src" else echo "$dst" - mv $src $dst + mv "$src" "$dst" fi else echo "$dst" - mv $src $dst + mv "$src" "$dst" fi - done + done < <(find ./gpu-rocm -name "*.$ext.tmp" -print0) done # copy over CMakeLists.txt -for src in $(find ./gpu -name "CMakeLists.txt") +while IFS= read -r -d '' src do - dst=$(echo $src | sed 's@./gpu@./gpu-rocm@') - if test -f $dst + dst="${src//\.\/gpu/\.\/gpu-rocm}" + if test -f "$dst" then - if diff -q $src $dst >& /dev/null + if diff -q "$src" "$dst" >& /dev/null then echo "$dst [unchanged]" else echo "$dst" - cp $src $dst + cp "$src" "$dst" fi else echo "$dst" - cp $src $dst + cp "$src" "$dst" fi -done +done < <(find ./gpu -name "CMakeLists.txt" -print0) # Copy over other files -for ext in py +other_exts="py" +for ext in $other_exts do - for src in $(find ./gpu -name "*.$ext") + while IFS= read -r -d '' src do - dst=$(echo $src | sed 's@./gpu@./gpu-rocm@') - if test -f $dst + dst="${src//\.\/gpu/\.\/gpu-rocm}" + if test -f "$dst" then - if diff -q $src $dst >& /dev/null + if diff -q "$src" "$dst" >& /dev/null then echo "$dst [unchanged]" else echo "$dst" - cp $src $dst + cp "$src" "$dst" fi else echo "$dst" - cp $src $dst + cp "$src" "$dst" fi - done + done < <(find ./gpu -name "*.$ext" -print0) done - ################################################################################### # C_API Support ################################################################################### @@ -122,36 +122,36 @@ done # This points to the faiss/c_api dir top_c_api=$(dirname "${BASH_SOURCE[0]}")/../../c_api echo "top=$top_c_api" -cd ../$top_c_api -echo "pwd=`pwd`" +cd "../$top_c_api" || exit +echo "pwd=$(pwd)" # create all destination directories for hipified files into sibling 'gpu-rocm' directory -for src in $(find ./gpu -type d) +while IFS= read -r -d '' src do - dst=$(echo $src | sed 's/gpu/gpu-rocm/') + dst="${src//gpu/gpu-rocm}" echo "Creating $dst" - mkdir -p $dst -done + mkdir -p "$dst" +done < <(find ./gpu -type d -print0) # run hipify-perl against all *.cu *.cuh *.h *.cpp files, no renaming # run all files in parallel to speed up for ext in cu cuh h cpp c do - for src in $(find ./gpu -name "*.$ext") + while IFS= read -r -d '' src do - dst=$(echo $src | sed 's@./gpu@./gpu-rocm@') - hipify-perl -o=$dst.tmp $src & - done + dst="${src//\.\/gpu/\.\/gpu-rocm}" + hipify-perl -o="$dst.tmp" "$src" & + done < <(find ./gpu -name "*.$ext" -print0) done wait # rename all hipified *.cu files to *.hip -for src in $(find ./gpu-rocm -name "*.cu.tmp") +while IFS= read -r -d '' src do dst=${src%.cu.tmp}.hip.tmp - mv $src $dst -done + mv "$src" "$dst" +done < <(find ./gpu-rocm -name "*.cu.tmp" -print0) # replace header include statements "@#include @' $src - sed -i 's@#include @#include @' $src - done + sed -i 's@#include @#include @' "$src" + sed -i 's@#include @#include @' "$src" + done < <(find ./gpu-rocm -name "*.$ext.tmp" -print0) done # hipify was run in parallel above # don't copy the tmp file if it is unchanged for ext in hip cuh h cpp c do - for src in $(find ./gpu-rocm -name "*.$ext.tmp") + while IFS= read -r -d '' src do dst=${src%.tmp} - if test -f $dst + if test -f "$dst" then - if diff -q $src $dst >& /dev/null + if diff -q "$src" "$dst" >& /dev/null then echo "$dst [unchanged]" - rm $src + rm "$src" else echo "$dst" - mv $src $dst + mv "$src" "$dst" fi else echo "$dst" - mv $src $dst + mv "$src" "$dst" fi - done + done < <(find ./gpu-rocm -name "*.$ext.tmp" -print0) done # copy over CMakeLists.txt -for src in $(find ./gpu -name "CMakeLists.txt") +while IFS= read -r -d '' src do - dst=$(echo $src | sed 's@./gpu@./gpu-rocm@') - if test -f $dst + dst="${src//\.\/gpu/\.\/gpu-rocm}" + if test -f "$dst" then - if diff -q $src $dst >& /dev/null + if diff -q "$src" "$dst" >& /dev/null then echo "$dst [unchanged]" else echo "$dst" - cp $src $dst + cp "$src" "$dst" fi else echo "$dst" - cp $src $dst + cp "$src" "$dst" fi -done +done < <(find ./gpu -name "CMakeLists.txt" -print0) diff --git a/faiss/gpu/impl/IVFAppend.cu b/faiss/gpu/impl/IVFAppend.cu index 8ee85eaed..65af470cd 100644 --- a/faiss/gpu/impl/IVFAppend.cu +++ b/faiss/gpu/impl/IVFAppend.cu @@ -411,7 +411,8 @@ __global__ void ivfInterleavedAppend( EncodeT* listStart = ((EncodeT*)listData[listId]); // Each warp within the block handles a different chunk of kWarpSize - auto warpVec = alignedListVecStart + warpId * kWarpSize; + auto warpVec = alignedListVecStart + + (faiss::gpu::Tensor::DataType)warpId * kWarpSize; // The warp data starts here EncodeT* warpData = listStart + (warpVec / kWarpSize) * wordsPerVectorBlock; diff --git a/faiss/gpu/impl/InterleavedCodes.cpp b/faiss/gpu/impl/InterleavedCodes.cpp index bd9464d5c..5a2dc4b77 100644 --- a/faiss/gpu/impl/InterleavedCodes.cpp +++ b/faiss/gpu/impl/InterleavedCodes.cpp @@ -168,7 +168,7 @@ void unpackInterleavedWord( int dims, int bitsPerCode) { int warpSize = getWarpSizeCurrentDevice(); - int wordsPerDimBlock = warpSize * bitsPerCode / (8 * sizeof(T)); + int wordsPerDimBlock = (size_t)warpSize * bitsPerCode / (8 * sizeof(T)); int wordsPerBlock = wordsPerDimBlock * dims; int numBlocks = utils::divUp(numVecs, warpSize); @@ -446,7 +446,7 @@ void packInterleavedWord( int dims, int bitsPerCode) { int warpSize = getWarpSizeCurrentDevice(); - int wordsPerDimBlock = warpSize * bitsPerCode / (8 * sizeof(T)); + int wordsPerDimBlock = (size_t)warpSize * bitsPerCode / (8 * sizeof(T)); int wordsPerBlock = wordsPerDimBlock * dims; int numBlocks = utils::divUp(numVecs, warpSize); diff --git a/faiss/gpu/test/CMakeLists.txt b/faiss/gpu/test/CMakeLists.txt index 2983ddc21..64a916fe2 100644 --- a/faiss/gpu/test/CMakeLists.txt +++ b/faiss/gpu/test/CMakeLists.txt @@ -21,10 +21,14 @@ include(GoogleTest) add_library(faiss_gpu_test_helper TestUtils.cpp) if(USE_ROCM) - target_link_libraries(faiss_gpu_test_helper PUBLIC faiss gtest $<$:hip::host>) + target_link_libraries(faiss_gpu_test_helper PUBLIC + faiss gtest $<$:hip::host>) else() find_package(CUDAToolkit REQUIRED) - target_link_libraries(faiss_gpu_test_helper PUBLIC faiss gtest CUDA::cudart $<$:raft::raft> $<$:raft::compiled>) + target_link_libraries(faiss_gpu_test_helper PUBLIC + faiss gtest CUDA::cudart + $<$:raft::raft> + $<$:raft::compiled>) endif() macro(faiss_gpu_test file) diff --git a/faiss/python/CMakeLists.txt b/faiss/python/CMakeLists.txt index 84bc33142..104ecc2fe 100644 --- a/faiss/python/CMakeLists.txt +++ b/faiss/python/CMakeLists.txt @@ -67,23 +67,34 @@ if(TARGET faiss) # Manually add headers as extra dependencies of swigfaiss. set(SWIG_MODULE_swigfaiss_EXTRA_DEPS) foreach(h ${FAISS_HEADERS}) - list(APPEND SWIG_MODULE_swigfaiss_EXTRA_DEPS "${faiss_SOURCE_DIR}/faiss/${h}") - list(APPEND SWIG_MODULE_swigfaiss_avx2_EXTRA_DEPS "${faiss_SOURCE_DIR}/faiss/${h}") - list(APPEND SWIG_MODULE_swigfaiss_avx512_EXTRA_DEPS "${faiss_SOURCE_DIR}/faiss/${h}") - list(APPEND SWIG_MODULE_swigfaiss_sve_EXTRA_DEPS "${faiss_SOURCE_DIR}/faiss/${h}") + list(APPEND SWIG_MODULE_swigfaiss_EXTRA_DEPS + "${faiss_SOURCE_DIR}/faiss/${h}") + list(APPEND SWIG_MODULE_swigfaiss_avx2_EXTRA_DEPS + "${faiss_SOURCE_DIR}/faiss/${h}") + list(APPEND SWIG_MODULE_swigfaiss_avx512_EXTRA_DEPS + "${faiss_SOURCE_DIR}/faiss/${h}") + list(APPEND SWIG_MODULE_swigfaiss_sve_EXTRA_DEPS + "${faiss_SOURCE_DIR}/faiss/${h}") endforeach() if(USE_ROCM) foreach(h ${FAISS_GPU_HEADERS}) - list(APPEND SWIG_MODULE_swigfaiss_EXTRA_DEPS "${faiss_SOURCE_DIR}/faiss/gpu-rocm/${h}") - list(APPEND SWIG_MODULE_swigfaiss_avx2_EXTRA_DEPS "${faiss_SOURCE_DIR}/faiss/gpu-rocm/${h}") - list(APPEND SWIG_MODULE_swigfaiss_avx512_EXTRA_DEPS "${faiss_SOURCE_DIR}/faiss/gpu-rocm/${h}") + list(APPEND SWIG_MODULE_swigfaiss_EXTRA_DEPS + "${faiss_SOURCE_DIR}/faiss/gpu-rocm/${h}") + list(APPEND SWIG_MODULE_swigfaiss_avx2_EXTRA_DEPS + "${faiss_SOURCE_DIR}/faiss/gpu-rocm/${h}") + list(APPEND SWIG_MODULE_swigfaiss_avx512_EXTRA_DEPS + "${faiss_SOURCE_DIR}/faiss/gpu-rocm/${h}") endforeach() else() foreach(h ${FAISS_GPU_HEADERS}) - list(APPEND SWIG_MODULE_swigfaiss_EXTRA_DEPS "${faiss_SOURCE_DIR}/faiss/gpu/${h}") - list(APPEND SWIG_MODULE_swigfaiss_avx2_EXTRA_DEPS "${faiss_SOURCE_DIR}/faiss/gpu/${h}") - list(APPEND SWIG_MODULE_swigfaiss_avx512_EXTRA_DEPS "${faiss_SOURCE_DIR}/faiss/gpu/${h}") - list(APPEND SWIG_MODULE_swigfaiss_sve_EXTRA_DEPS "${faiss_SOURCE_DIR}/faiss/gpu/${h}") + list(APPEND SWIG_MODULE_swigfaiss_EXTRA_DEPS + "${faiss_SOURCE_DIR}/faiss/gpu/${h}") + list(APPEND SWIG_MODULE_swigfaiss_avx2_EXTRA_DEPS + "${faiss_SOURCE_DIR}/faiss/gpu/${h}") + list(APPEND SWIG_MODULE_swigfaiss_avx512_EXTRA_DEPS + "${faiss_SOURCE_DIR}/faiss/gpu/${h}") + list(APPEND SWIG_MODULE_swigfaiss_sve_EXTRA_DEPS + "${faiss_SOURCE_DIR}/faiss/gpu/${h}") endforeach() endif() else() @@ -159,18 +170,29 @@ endif() if(FAISS_ENABLE_GPU) if(USE_ROCM) find_package(HIP REQUIRED) - target_link_libraries(swigfaiss PRIVATE $<$:hip::host>) - target_link_libraries(swigfaiss_avx2 PRIVATE $<$:hip::host>) - target_link_libraries(swigfaiss_avx512 PRIVATE $<$:hip::host>) + target_link_libraries(swigfaiss PRIVATE + $<$:hip::host>) + target_link_libraries(swigfaiss_avx2 PRIVATE + $<$:hip::host>) + target_link_libraries(swigfaiss_avx512 PRIVATE + $<$:hip::host>) else() find_package(CUDAToolkit REQUIRED) if(FAISS_ENABLE_RAFT) find_package(raft COMPONENTS compiled distributed) endif() - target_link_libraries(swigfaiss PRIVATE CUDA::cudart $<$:raft::raft> $<$:nvidia::cutlass::cutlass>) - target_link_libraries(swigfaiss_avx2 PRIVATE CUDA::cudart $<$:raft::raft> $<$:nvidia::cutlass::cutlass>) - target_link_libraries(swigfaiss_avx512 PRIVATE CUDA::cudart $<$:raft::raft> $<$:nvidia::cutlass::cutlass>) - target_link_libraries(swigfaiss_sve PRIVATE CUDA::cudart $<$:raft::raft> $<$:nvidia::cutlass::cutlass>) + target_link_libraries(swigfaiss PRIVATE CUDA::cudart + $<$:raft::raft> + $<$:nvidia::cutlass::cutlass>) + target_link_libraries(swigfaiss_avx2 PRIVATE CUDA::cudart + $<$:raft::raft> + $<$:nvidia::cutlass::cutlass>) + target_link_libraries(swigfaiss_avx512 PRIVATE CUDA::cudart + $<$:raft::raft> + $<$:nvidia::cutlass::cutlass>) + target_link_libraries(swigfaiss_sve PRIVATE CUDA::cudart + $<$:raft::raft> + $<$:nvidia::cutlass::cutlass>) endif() endif()