docs/html/DeviceDefs_8cuh_source.html

/**

 * Copyright (c) 2015-present, Facebook, Inc.

 * All rights reserved.

 *

 * This source code is licensed under the CC-by-NC license found in the

 * LICENSE file in the root directory of this source tree.

 */


// Copyright 2004-present Facebook. All Rights Reserved.


#pragma once


namespace faiss { namespace gpu {


// Number of threads in a warp.
//
// The warp size is 32 on every CUDA compute capability published so
// far, so the constant is defined unconditionally. It was previously
// gated on `__CUDA_ARCH__ <= 600`, which turned any build targeting a
// newer architecture (sm_61, sm_70, ...) into a hard `#error` even
// though the value would have been the same.
//
// The same definition is visible to the host compilation pass (where
// __CUDA_ARCH__ is undefined), so code shared between host and device
// sees one consistent value.
constexpr int kWarpSize = 32;


// Warp-wide barrier used between dependent accesses made by different
// lanes of the same warp.
//
// Under the CUDA programming model two things are needed for such
// accesses to be ordered correctly:
// -a barrier known to the compiler, such that the compiler will not
//  schedule loads and stores across it;
// -a HW-level barrier that guarantees that writes are seen in the
//  proper order.
//
// __threadfence_block() provides this but is a stronger constraint
// than what we really want, which is only a warp-wide barrier, and it
// was measured to cost 10%+ in performance. Historically this function
// was therefore left empty, relying on implicit warp-synchronous
// execution.
//
// That assumption does not hold with independent thread scheduling
// (Volta and later), so when the toolkit provides __syncwarp()
// (CUDA 9+) we use it: it is exactly the warp-scoped barrier wanted
// here, and it also orders memory accesses among the participating
// lanes.
//
// NOTE(review): __syncwarp() with its default mask expects every lane
// of the warp to reach the call; call sites are assumed to be in
// warp-uniform control flow -- confirm against callers.
//
// FIXME: we should probably qualify shared data as volatile as well on
// the legacy path, since the compiler could technically preserve
// values across loops? This seems very impractical for the compiler
// to do, however.
__forceinline__ __device__ void warpFence() {
#if (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9)) || \
    (defined(CUDART_VERSION) && (CUDART_VERSION >= 9000))
  // Warp-level execution barrier with memory ordering among the lanes.
  __syncwarp();
#else
  // Pre-CUDA 9 toolkits have no __syncwarp(). Keep the legacy behavior
  // of assuming warp-synchronous execution; uncommenting the fence
  // below restores CUDA programming model-safe ordering at a
  // performance cost.
  //
  //  __threadfence_block();
#endif
}


} } // namespace