Adding chunk_sum for amp f16
coreylowman committed Jul 24, 2023
1 parent a143a0a commit aa6a0b5
Showing 2 changed files with 50 additions and 29 deletions.
2 changes: 1 addition & 1 deletion src/tensor_ops/sum_to/cuda_kernel.rs
@@ -25,7 +25,7 @@ impl HasCudaKernel<f16> for Cuda {
 #[cfg(feature = "f16")]
 impl HasCudaKernel<AMP<f16>> for Cuda {
     const MOD: &'static str = "sum_amp_f16";
-    const FNS: &'static [&'static str] = &["sum_to_fwd_amp_f16", "sum_to_bwd_amp_f16"];
+    const FNS: &'static [&'static str] = &["sum_to_fwd_amp_f16", "sum_to_bwd_f16"];
 }
 
 impl HasCudaKernel<f32> for Cuda {
77 changes: 49 additions & 28 deletions src/tensor_ops/sum_to/sum_to.cu
@@ -81,52 +81,73 @@ SUM(__half, sum_to_fwd_f16, sum_to_bwd_f16);
 SUM(float, sum_to_fwd_f32, sum_to_bwd_f32);
 SUM(double, sum_to_fwd_f64, sum_to_bwd_f64);
 
-extern "C" __global__ void sum_to_fwd_amp_f16(
-    const size_t numel,
-    const size_t num_dims,
-    const __half elems_per_thread,
+__device__ void chunk_sum_amp_f16(
     const size_t chunk_len,
-    const size_t *info,
-    const __half *inp,
-    __half *out
+    const __half data,
+    __half* out
 ) {
+    __shared__ float buf[1024];
+
     // assumes that threads where i >= numel have already exited
     unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
+    unsigned int block_i = threadIdx.x;
 
-    if (i >= numel) {
+    // Fall back to atomicAdd if chunk_len is small to reduce overhead
+    if (chunk_len <= 2) {
+        atomicAdd(out + i / chunk_len, data);
         return;
     }
+    buf[block_i] = data;
 
-    const size_t *dims = info;
-    const size_t *strides = info + num_dims;
+    unsigned int chunk_i = i % chunk_len;
+    unsigned int chunk_start = max((int)(block_i - chunk_i), 0);
+    unsigned int chunk_end = min((unsigned int)(block_i + chunk_len - chunk_i), blockDim.x);
 
-    unsigned int inp_i = get_strided_index(i, num_dims, dims, strides);
-    chunk_sum(chunk_len, inp[inp_i] * elems_per_thread, out);
+    chunk_i = block_i - chunk_start;
+
+    size_t max_chunk_len = min(chunk_end - chunk_start, blockDim.x);
+    size_t incr = next_power_of_two(max_chunk_len) >> 1;
+
+    __syncthreads();
+
+    // Uses sequential addressing as discussed in
+    // https://developer.download.nvidia.com/assets/cuda/files/reduction.pdf
+    for (; incr > 0; incr >>= 1) {
+        unsigned int block_i_2 = block_i + incr;
+
+        if (block_i_2 < chunk_end && chunk_i < incr) {
+            // This is sound because __syncthreads and the conditions above
+            // ensure that no data races occur
+            buf[block_i] += buf[block_i_2];
+        }
+
+        __syncthreads();
+    }
+
+    if (block_i == chunk_start) {
+        __half y = buf[block_i];
+        atomicAdd(out + i / chunk_len, y);
+    }
 }
 
-// Accepts pre-broadcasted strides for both input & output.
-// So both inp & out are expected to be broadcasted to the same size.
-extern "C" __global__ void sum_to_bwd_amp_f16(
+extern "C" __global__ void sum_to_fwd_amp_f16(
     const size_t numel,
     const size_t num_dims,
     const __half elems_per_thread,
+    const size_t chunk_len,
     const size_t *info,
-    __half *grad_inp,
-    const __half *grad_out
+    const __half *inp,
+    __half *out
 ) {
-    unsigned int inp_i = blockIdx.x * blockDim.x + threadIdx.x;
+    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
 
-    if (inp_i >= numel) {
+    if (i >= numel) {
         return;
     }
 
     const size_t *dims = info;
-    const size_t *inp_strides = info + num_dims;
-    const size_t *out_strides = info + 2 * num_dims;
-
-    unsigned int out_i = restrided(inp_i, num_dims, dims, inp_strides, out_strides);
+    const size_t *strides = info + num_dims;
 
-    // NOTE: since size of output is less than input, only 1 thread will be writing to inp
-    //       at a time. this means we don't have to worry about multiple concurrent writes
-    //       like we do with fwd.
-    grad_inp[inp_i] += grad_out[out_i] * elems_per_thread;
-}
+    unsigned int inp_i = get_strided_index(i, num_dims, dims, strides);
+    chunk_sum_amp_f16(chunk_len, inp[inp_i] * elems_per_thread, out);
+}
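
Note (not part of the commit): the new chunk_sum_amp_f16 stores its __half inputs into a float shared buffer, reduces in f32 using the sequential-addressing scheme from the linked reduction.pdf slides, and converts back to __half only for the final atomicAdd. The standalone sketch below shows just that accumulation pattern in isolation; the kernel name block_sum_amp_f16, the block size of 256, and the test values are illustrative assumptions, and each block reduces its whole tile rather than per-chunk sub-ranges as the real kernel does.

// Standalone sketch of the AMP-style reduction: f16 inputs, f32 accumulation,
// one f16 result per block. Not taken from dfdx; names and sizes are illustrative.
#include <cstdio>
#include <cuda_fp16.h>

__global__ void block_sum_amp_f16(const __half *inp, __half *out, size_t n) {
    __shared__ float buf[256];
    unsigned int block_i = threadIdx.x;
    unsigned int i = blockIdx.x * blockDim.x + block_i;

    // Load into shared memory, widening each __half to float (the "AMP" part).
    buf[block_i] = (i < n) ? __half2float(inp[i]) : 0.0f;
    __syncthreads();

    // Sequential addressing: halve the active range each iteration
    // (assumes blockDim.x is a power of two, here 256).
    for (unsigned int incr = blockDim.x / 2; incr > 0; incr >>= 1) {
        if (block_i < incr) {
            buf[block_i] += buf[block_i + incr];
        }
        __syncthreads();
    }

    if (block_i == 0) {
        // Round back to __half only once, after all f32 accumulation is done.
        out[blockIdx.x] = __float2half(buf[0]);
    }
}

int main() {
    const size_t n = 1024;
    const unsigned int block = 256;
    const unsigned int grid = (unsigned int)((n + block - 1) / block);

    __half *inp, *out;
    cudaMallocManaged(&inp, n * sizeof(__half));
    cudaMallocManaged(&out, grid * sizeof(__half));
    for (size_t i = 0; i < n; i++) {
        inp[i] = __float2half(0.25f);
    }

    block_sum_amp_f16<<<grid, block>>>(inp, out, n);
    cudaDeviceSynchronize();

    // Each full block of 256 values of 0.25 should sum to exactly 64.0.
    for (unsigned int b = 0; b < grid; b++) {
        printf("block %u partial sum: %f\n", b, __half2float(out[b]));
    }
    cudaFree(inp);
    cudaFree(out);
    return 0;
}

The real chunk_sum_amp_f16 additionally handles chunks that cover only part of a block (via chunk_start/chunk_end) and combines partial sums across blocks with atomicAdd on the __half output, but the precision story is the same: every intermediate add happens in f32.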
