diff --git a/gpu/impl/BroadcastSum.cu b/gpu/impl/BroadcastSum.cu
index eece86de9..7736e4b3b 100644
--- a/gpu/impl/BroadcastSum.cu
+++ b/gpu/impl/BroadcastSum.cu
@@ -47,7 +47,7 @@ __global__ void sumAlongColumns(Tensor<T, 1, true> input,
 
       if (endRow) {
         for (int row = rowStart; row < output.getSize(0); ++row) {
-          T out = output[row][col].ldg();
+          T out = output[row][col];
           out = Math<T>::add(out, val);
           output[row][col] = out;
         }
@@ -57,7 +57,7 @@ __global__ void sumAlongColumns(Tensor<T, 1, true> input,
         for (int row = rowStart; row < rowEnd; row += kRowUnroll) {
 #pragma unroll
           for (int i = 0; i < kRowUnroll; ++i) {
-            rows[i] = output[row + i][col].ldg();
+            rows[i] = output[row + i][col];
           }
 
 #pragma unroll
@@ -86,7 +86,7 @@ __global__ void sumAlongColumns(Tensor<T, 1, true> input,
       for (int row = rowStart; row < output.getSize(0); ++row) {
 #pragma unroll
         for (int i = 0; i < kColLoad; ++i) {
-          T out = output[row][col + i * blockDim.x].ldg();
+          T out = output[row][col + i * blockDim.x];
           out = Math<T>::add(out, val[i]);
           output[row][col + i * blockDim.x] = out;
         }
@@ -100,7 +100,7 @@ __global__ void sumAlongColumns(Tensor<T, 1, true> input,
 #pragma unroll
           for (int j = 0; j < kColLoad; ++j) {
             rows[i * kColLoad + j] =
-              output[row + i][col + j * blockDim.x].ldg();
+              output[row + i][col + j * blockDim.x];
           }
         }
 
diff --git a/gpu/utils/Tensor-inl.cuh b/gpu/utils/Tensor-inl.cuh
index d6d9762fa..50a801fc9 100644
--- a/gpu/utils/Tensor-inl.cuh
+++ b/gpu/utils/Tensor-inl.cuh
@@ -310,6 +310,11 @@ Tensor<T, Dim, Contig, IndexT, PtrTraits>::canCastResize() const {
   static_assert(sizeof(U) >= sizeof(T), "only handles greater sizes");
   constexpr int kMultiple = sizeof(U) / sizeof(T);
 
+  // Ensure that the base pointer is sizeof(U) aligned
+  if (((uintptr_t) data_) % sizeof(U) != 0) {
+    return false;
+  }
+
   // Check all outer strides
   for (int i = 0; i < Dim - 1; ++i) {
     if (stride_[i] % kMultiple != 0) {
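Two notes on the rationale, each with a small self-contained sketch; the kernel and helper below are illustrative stand-ins, not part of the Faiss API.

Tensor's `.ldg()` accessor wraps CUDA's `__ldg()`, which loads through the read-only (texture) data cache. That cache is not kept coherent with writes issued during the same kernel launch, so `__ldg()` is only valid for data that stays read-only for the kernel's entire lifetime. `output` is read, modified, and written back by these kernels, which is presumably why the patch drops `.ldg()` in favor of plain loads. A minimal sketch of the distinction, using a hypothetical `accumulate` kernel:

    #include <cuda_runtime.h>

    // out is both read and written during the launch, so it must use a
    // plain load; in is never written, so __ldg() is safe there.
    __global__ void accumulate(const float* __restrict__ in, float* out, int n) {
      int i = blockIdx.x * blockDim.x + threadIdx.x;
      if (i < n) {
        float v = out[i];            // plain load: correct for read-modify-write data
        // float v = __ldg(&out[i]); // unsafe: read-only cache may return stale data
        out[i] = v + __ldg(&in[i]);  // in is read-only for the whole kernel
      }
    }

The new test in `canCastResize()` matters because casting a tensor of `T` to a tensor of a larger `U` (e.g. `half` to `half2` for vectorized access) reinterprets the base pointer, and GPU loads and stores must be naturally aligned: a `sizeof(U)`-wide access through a pointer not aligned to `sizeof(U)` is undefined. A host-side sketch of the same check, with a hypothetical `canReinterpretAs` helper:

    #include <cstdint>
    #include <cstdio>

    // Mirrors the check the patch adds: reinterpreting T* as a wider U*
    // is only safe when the base pointer is sizeof(U)-aligned.
    template <typename U, typename T>
    bool canReinterpretAs(const T* p) {
      static_assert(sizeof(U) >= sizeof(T), "only handles greater sizes");
      return reinterpret_cast<std::uintptr_t>(p) % sizeof(U) == 0;
    }

    int main() {
      alignas(4) std::uint16_t buf[4];  // stand-ins for half / half2 storage
      std::printf("%d\n", canReinterpretAs<std::uint32_t>(&buf[0]));  // 1: 4-byte aligned
      std::printf("%d\n", canReinterpretAs<std::uint32_t>(&buf[1]));  // 0: offset by 2 bytes
      return 0;
    }

The existing stride checks only rejected layouts whose element offsets could not be expressed in units of `U`; they said nothing about where the allocation itself starts, and the added base-pointer test closes that gap.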