coreylowman · swfsql · Jan 26, 2024 · Jan 26, 2024 · Feb 7, 2024 · Feb 6, 2024
diff --git a/dfdx-core/Cargo.toml b/dfdx-core/Cargo.toml
@@ -35,7 +35,7 @@ num-traits = { workspace = true }
 safetensors = { workspace = true, optional = true }
 memmap2 = { workspace = true, optional = true }
 half = { version = "2.3.1", optional = true, features = ["num-traits", "rand_distr"] }
-gemm = { version = "0.16.14", default-features = false, optional = true, features = ["rayon"] }
+gemm = { version = "0.17.1", default-features = false, optional = true, features = ["rayon"] }
 rayon = { version = "1.7.0", optional = true }
 libm = { workspace = true }
 wgpu = { version = "0.18.0", features = ["glsl", "spirv"], optional = true }

diff --git a/dfdx-core/src/data/collate.rs b/dfdx-core/src/data/collate.rs
@@ -1,4 +1,4 @@
 use std::{mem::MaybeUninit, vec::Vec};

 /// Collates `Self` into some other type.
 /// Generally similar to an unzip method;
@@ -55,6 +55,7 @@
 impl<'a, A, B> Collate for Vec<&'a (A, B)> {
     type Collated = (Vec<&'a A>, Vec<&'a B>);
     fn collated(self) -> Self::Collated {
+        #[allow(clippy::map_identity)] // not an identity map: turns each `&(A, B)` into `(&A, &B)`
         self.into_iter().map(|(a, b)| (a, b)).unzip()
     }
 }

diff --git a/dfdx-core/src/lib.rs b/dfdx-core/src/lib.rs
@@ -9,7 +9,7 @@
 //! The following sections provide some high level core concepts & examples, and
 //! there is more detailed documentation in each of dfdx's submodules.
 //!
 //! See [feature_flags] for details on feature flags.
 //!
 //! # Shapes & Tensors
 //!
@@ -59,7 +59,7 @@
 //! There are two options for this currently, with more planned to be added in the future:
 //!
 //! 1. [tensor::Cpu] - for tensors stored on the heap
 //! 2. [tensor::Cuda] - for tensors stored in GPU memory
 //!
 //! Both devices implement [Default], you can also create them with a certain seed
 //! and ordinal.
@@ -85,8 +85,8 @@
 //! | Unary Operations | `a.sqrt()` | `a.sqrt()` | `a.sqrt()` |
 //! | Binary Operations | `a + b` | `a + b` | `a + b` |
 //! | gemm/gemv | [tensor_ops::matmul] | `a @ b` | `a @ b` |
 //! | 2d Convolution | [tensor_ops::TryConv2D] | - | `torch.conv2d` |
 //! | 2d Transposed Convolution | [tensor_ops::TryConvTrans2D] | - | `torch.conv_transpose2d` |
 //! | Slicing | [tensor_ops::slice] | `a[...]` | `a[...]` |
 //! | Select | [tensor_ops::SelectTo] | `a[...]` | `torch.select` |
 //! | Gather | [tensor_ops::GatherTo] | `np.take` | `torch.gather` |
@@ -128,44 +128,6 @@
     pub use crate::tensor_ops::*;
 }
 
-/// Sets a CPU `sse` flag to flush denormal floating point numbers to zero. The opposite of this is [keep_denormals()].
-///
-/// Some resources:
-/// 1. [Effects of Flush-To-Zero mode](https://developer.arm.com/documentation/dui0473/c/neon-and-vfp-programming/the-effects-of-using-flush-to-zero-mode?lang=en)
-/// 2. [When to use Flush-To-Zero mode](https://developer.arm.com/documentation/dui0473/c/neon-and-vfp-programming/when-to-use-flush-to-zero-mode?lang=en)
-pub fn flush_denormals_to_zero() {
-    #[cfg(all(target_arch = "x86", target_feature = "sse"))]
-    {
-        use std::arch::x86::{_MM_FLUSH_ZERO_ON, _MM_SET_FLUSH_ZERO_MODE};
-        unsafe { _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON) }
-    }
-
-    #[cfg(all(target_arch = "x86_64", target_feature = "sse"))]
-    {
-        use std::arch::x86_64::{_MM_FLUSH_ZERO_ON, _MM_SET_FLUSH_ZERO_MODE};
-        unsafe { _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON) }
-    }
-}
-
-/// Sets a CPU flag to keep denormal floating point numbers. The opposite of this is [flush_denormals_to_zero()].
-///
-/// Some resources:
-/// 1. [Effects of Flush-To-Zero mode](https://developer.arm.com/documentation/dui0473/c/neon-and-vfp-programming/the-effects-of-using-flush-to-zero-mode?lang=en)
-/// 2. [When to use Flush-To-Zero mode](https://developer.arm.com/documentation/dui0473/c/neon-and-vfp-programming/when-to-use-flush-to-zero-mode?lang=en)
-pub fn keep_denormals() {
-    #[cfg(all(target_arch = "x86", target_feature = "sse"))]
-    {
-        use std::arch::x86::{_MM_FLUSH_ZERO_OFF, _MM_SET_FLUSH_ZERO_MODE};
-        unsafe { _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_OFF) }
-    }
-
-    #[cfg(all(target_arch = "x86_64", target_feature = "sse"))]
-    {
-        use std::arch::x86_64::{_MM_FLUSH_ZERO_OFF, _MM_SET_FLUSH_ZERO_MODE};
-        unsafe { _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_OFF) }
-    }
-}
-
 #[cfg(test)]
 pub(crate) mod tests {
     pub use num_traits::{Float, NumCast, Zero};

diff --git a/dfdx-core/src/tensor/gradients.rs b/dfdx-core/src/tensor/gradients.rs
@@ -153,7 +153,7 @@ impl<E, D: Storage<E>> Gradients<E, D> {
     #[inline]
     pub(crate) fn many_and_ref<L: Shape, R: Shape>(
         &mut self,
-        ls: &Vec<impl Tensorlike<L, E, D>>,
+        ls: &[impl Tensorlike<L, E, D>],
         r: &impl Tensorlike<R, E, D>,
     ) -> (Vec<&mut D::Vec>, &D::Vec) {
         for i in 0..ls.len() {

diff --git a/dfdx-core/src/tensor_ops/convtrans2d/mod.rs b/dfdx-core/src/tensor_ops/convtrans2d/mod.rs
@@ -51,7 +51,7 @@ pub(super) trait ConvTrans2DKernel<E: Dtype>: Storage<E> {
     ) -> Result<(), Error>;
 }
 
-pub trait TryConvTrans2D<Stride, Padding, Dilation, Groups>: Sized {
+pub trait TryConvTrans2D<Stride, Padding, Dilation, Groups, OutputPadding>: Sized {
     type Convolved;
 
     /// Applies a 2D transposed convolution to the input tensor.
@@ -61,8 +61,9 @@ pub trait TryConvTrans2D<Stride, Padding, Dilation, Groups>: Sized {
         padding: Padding,
         dilation: Dilation,
         groups: Groups,
+        output_padding: OutputPadding,
     ) -> Self::Convolved {
-        self.try_convtrans2d(stride, padding, dilation, groups)
+        self.try_convtrans2d(stride, padding, dilation, groups, output_padding)
             .unwrap()
     }
 
@@ -73,6 +74,7 @@ pub trait TryConvTrans2D<Stride, Padding, Dilation, Groups>: Sized {
         padding: Padding,
         dilation: Dilation,
         groups: Groups,
+        output_padding: OutputPadding,
     ) -> Result<Self::Convolved, Error>;
 }
 
@@ -82,27 +84,31 @@ impl<
         const PADDING: usize,
         const DILATION: usize,
         Groups: Dim,
+        const OUTPUT_PADDING: usize,
         const DIM: usize,
-    > TryConvTrans2D<Const<STRIDE>, Const<PADDING>, Const<DILATION>, Groups>
+    > TryConvTrans2D<Const<STRIDE>, Const<PADDING>, Const<DILATION>, Groups, Const<OUTPUT_PADDING>>
     for (Const<DIM>, Const<KERNEL>)
 where
-    Const<{ (DIM - 1) * STRIDE - 2 * PADDING + DILATION * (KERNEL - 1) + 1 }>: Sized,
+    Const<{ (DIM - 1) * STRIDE - 2 * PADDING + DILATION * (KERNEL - 1) + 1 + OUTPUT_PADDING }>:
+        Sized,
 {
-    type Convolved = Const<{ (DIM - 1) * STRIDE - 2 * PADDING + DILATION * (KERNEL - 1) + 1 }>;
+    type Convolved =
+        Const<{ (DIM - 1) * STRIDE - 2 * PADDING + DILATION * (KERNEL - 1) + 1 + OUTPUT_PADDING }>;
 
     fn try_convtrans2d(
         self,
         _: Const<STRIDE>,
         _: Const<PADDING>,
         _: Const<DILATION>,
         _: Groups,
+        _: Const<OUTPUT_PADDING>,
     ) -> Result<Self::Convolved, Error> {
         Ok(Const)
     }
 }
 
-impl<Kernel: Dim, Stride: Dim, Padding: Dim, Dilation: Dim, Groups: Dim>
-    TryConvTrans2D<Stride, Padding, Dilation, Groups> for (usize, Kernel)
+impl<Kernel: Dim, Stride: Dim, Padding: Dim, Dilation: Dim, Groups: Dim, OutputPadding: Dim>
+    TryConvTrans2D<Stride, Padding, Dilation, Groups, OutputPadding> for (usize, Kernel)
 {
     type Convolved = usize;
 
@@ -112,18 +118,33 @@ impl<Kernel: Dim, Stride: Dim, Padding: Dim, Dilation: Dim, Groups: Dim>
         padding: Padding,
         dilation: Dilation,
         _: Groups,
+        output_padding: OutputPadding,
     ) -> Result<Self::Convolved, Error> {
         let (dim, kernel) = self;
-        Ok(
-            ((dim - 1) * stride.size() + dilation.size() * (kernel.size() - 1) + 1)
-                .checked_sub(2 * padding.size())
-                .unwrap(),
-        )
+        Ok(((dim - 1) * stride.size()
+            + dilation.size() * (kernel.size() - 1)
+            + 1
+            + output_padding.size())
+        .checked_sub(2 * padding.size())
+        .unwrap())
     }
 }
 
-impl<InpChan, OutChanOverGroups, Kernel, Stride, Padding, Dilation, Groups, H, W, E, D, T>
-    TryConvTrans2D<Stride, Padding, Dilation, Groups>
+impl<
+        InpChan,
+        OutChanOverGroups,
+        Kernel,
+        Stride,
+        Padding,
+        Dilation,
+        Groups,
+        OutputPadding,
+        H,
+        W,
+        E,
+        D,
+        T,
+    > TryConvTrans2D<Stride, Padding, Dilation, Groups, OutputPadding>
     for (
         Tensor<(InpChan, H, W), E, D, T>,
         Tensor<(InpChan, OutChanOverGroups, Kernel, Kernel), E, D>,
@@ -136,23 +157,26 @@ where
     Padding: Dim,
     Dilation: Dim,
     Groups: Dim,
+    OutputPadding: Dim,
     H: Dim,
     W: Dim,
     E: Dtype,
     D: ConvTrans2DKernel<E> + crate::tensor_ops::reshape_to::ReshapeKernel<E>,
     T: Tape<E, D>,
     OutChanOverGroups: std::ops::Mul<Groups>,
     <OutChanOverGroups as std::ops::Mul<Groups>>::Output: Dim,
-    (H, Kernel): TryConvTrans2D<Stride, Padding, Dilation, Groups>,
-    (W, Kernel): TryConvTrans2D<Stride, Padding, Dilation, Groups>,
-    <(H, Kernel) as TryConvTrans2D<Stride, Padding, Dilation, Groups>>::Convolved: Dim,
-    <(W, Kernel) as TryConvTrans2D<Stride, Padding, Dilation, Groups>>::Convolved: Dim,
+    (H, Kernel): TryConvTrans2D<Stride, Padding, Dilation, Groups, OutputPadding>,
+    (W, Kernel): TryConvTrans2D<Stride, Padding, Dilation, Groups, OutputPadding>,
+    <(H, Kernel) as TryConvTrans2D<Stride, Padding, Dilation, Groups, OutputPadding>>::Convolved:
+        Dim,
+    <(W, Kernel) as TryConvTrans2D<Stride, Padding, Dilation, Groups, OutputPadding>>::Convolved:
+        Dim,
 {
     type Convolved = Tensor<
         (
             <OutChanOverGroups as std::ops::Mul<Groups>>::Output,
-            <(H, Kernel) as TryConvTrans2D<Stride, Padding, Dilation, Groups>>::Convolved,
-            <(W, Kernel) as TryConvTrans2D<Stride, Padding, Dilation, Groups>>::Convolved,
+            <(H, Kernel) as TryConvTrans2D<Stride, Padding, Dilation, Groups, OutputPadding>>::Convolved,
+            <(W, Kernel) as TryConvTrans2D<Stride, Padding, Dilation, Groups, OutputPadding>>::Convolved,
         ),
         E,
         D,
@@ -165,11 +189,13 @@ where
         padding: Padding,
         dilation: Dilation,
         groups: Groups,
+        output_padding: OutputPadding,
     ) -> Result<Self::Convolved, Error> {
         let (img, filters) = self;
         let (inp_chan, h, w) = img.shape;
         let img = img.try_reshape_like(&(Const::<1>, inp_chan, h, w))?;
-        let out = (img, filters).try_convtrans2d(stride, padding, dilation, groups)?;
+        let out =
+            (img, filters).try_convtrans2d(stride, padding, dilation, groups, output_padding)?;
         let (_, out_chan, out_h, out_w) = out.shape;
         out.try_reshape_like(&(out_chan, out_h, out_w))
     }
@@ -182,13 +208,14 @@ impl<
         Padding,
         Dilation,
         Groups,
+        OutputPadding,
         Batch,
         H,
         W,
         E,
         D,
         T,
-    > TryConvTrans2D<Stride, Padding, Dilation, Groups>
+    > TryConvTrans2D<Stride, Padding, Dilation, Groups, OutputPadding>
     for (
         Tensor<(Batch, InpChan, H, W), E, D, T>,
         Tensor<(InpChan, OutChanOverGroups, Kernel, Kernel), E, D>,
@@ -201,6 +228,7 @@ where
     Padding: Dim,
     Dilation: Dim,
     Groups: Dim,
+    OutputPadding: Dim,
     Batch: Dim,
     H: Dim,
     W: Dim,
@@ -209,17 +237,19 @@ where
     T: Tape<E, D>,
     OutChanOverGroups: std::ops::Mul<Groups>,
     <OutChanOverGroups as std::ops::Mul<Groups>>::Output: Dim,
-    (H, Kernel): TryConvTrans2D<Stride, Padding, Dilation, Groups>,
-    (W, Kernel): TryConvTrans2D<Stride, Padding, Dilation, Groups>,
-    <(H, Kernel) as TryConvTrans2D<Stride, Padding, Dilation, Groups>>::Convolved: Dim,
-    <(W, Kernel) as TryConvTrans2D<Stride, Padding, Dilation, Groups>>::Convolved: Dim,
+    (H, Kernel): TryConvTrans2D<Stride, Padding, Dilation, Groups, OutputPadding>,
+    (W, Kernel): TryConvTrans2D<Stride, Padding, Dilation, Groups, OutputPadding>,
+    <(H, Kernel) as TryConvTrans2D<Stride, Padding, Dilation, Groups, OutputPadding>>::Convolved:
+        Dim,
+    <(W, Kernel) as TryConvTrans2D<Stride, Padding, Dilation, Groups, OutputPadding>>::Convolved:
+        Dim,
 {
     type Convolved = Tensor<
         (
             Batch,
             <OutChanOverGroups as std::ops::Mul<Groups>>::Output,
-            <(H, Kernel) as TryConvTrans2D<Stride, Padding, Dilation, Groups>>::Convolved,
-            <(W, Kernel) as TryConvTrans2D<Stride, Padding, Dilation, Groups>>::Convolved,
+            <(H, Kernel) as TryConvTrans2D<Stride, Padding, Dilation, Groups, OutputPadding>>::Convolved,
+            <(W, Kernel) as TryConvTrans2D<Stride, Padding, Dilation, Groups, OutputPadding>>::Convolved,
         ),
         E,
         D,
@@ -232,6 +262,7 @@ where
         padding: Padding,
         dilation: Dilation,
         groups: Groups,
+        output_padding: OutputPadding,
     ) -> Result<Self::Convolved, Error> {
         let (img, filters) = self;
         assert_eq!(img.shape.1, filters.shape.0);
@@ -242,8 +273,8 @@ where
         if img.strides != img.shape.strides() || filters.strides != filters.shape.strides() {
             panic!("Image & filter inputs to conv2d must be contiguous");
         }
-        let h_out = (h, kernel).convtrans2d(stride, padding, dilation, groups);
-        let w_out = (w, kernel).convtrans2d(stride, padding, dilation, groups);
+        let h_out = (h, kernel).convtrans2d(stride, padding, dilation, groups, output_padding);
+        let w_out = (w, kernel).convtrans2d(stride, padding, dilation, groups, output_padding);
         let op = ConvTrans2DOp {
             stride: stride.size(),
             padding: padding.size(),

diff --git a/dfdx-core/src/tensor_ops/convtrans2d/tests.rs b/dfdx-core/src/tensor_ops/convtrans2d/tests.rs
@@ -33,8 +33,8 @@ fn test_convtrans2d_default() {
             ],
         ])
         .to_dtype::<TestDtype>();
-    let y =
-        (x.leaky_trace(), w.clone()).convtrans2d(Const::<1>, Const::<0>, Const::<1>, Const::<1>);
+    let y = (x.leaky_trace(), w.clone())
+        .convtrans2d(Const::<1>, Const::<0>, Const::<1>, Const::<1>, Const::<0>);
     #[rustfmt::skip]
     assert_close_to_literal!(
         y,
@@ -125,8 +125,8 @@ fn test_convtrans2d_stride_2() {
             ],
         ])
         .to_dtype::<TestDtype>();
-    let y =
-        (x.leaky_trace(), w.clone()).convtrans2d(Const::<2>, Const::<0>, Const::<1>, Const::<1>);
+    let y = (x.leaky_trace(), w.clone())
+        .convtrans2d(Const::<2>, Const::<0>, Const::<1>, Const::<1>, Const::<0>);
     #[rustfmt::skip]
     assert_close_to_literal!(
         y,
@@ -223,8 +223,8 @@ fn test_convtrans2d_padded() {
             ],
         ])
         .to_dtype::<TestDtype>();
-    let y =
-        (x.leaky_trace(), w.clone()).convtrans2d(Const::<1>, Const::<1>, Const::<1>, Const::<1>);
+    let y = (x.leaky_trace(), w.clone())
+        .convtrans2d(Const::<1>, Const::<1>, Const::<1>, Const::<1>, Const::<0>);
     assert_close_to_literal!(
         y,
         [
@@ -283,8 +283,8 @@ fn test_convtrans2d_batched() {
     let x: Tensor<Rank3<3, 28, 28>, TestDtype, _> = dev.sample_normal();
     let w: Tensor<Rank4<3, 5, 6, 6>, TestDtype, _> = dev.sample_normal();
 
-    let y: Tensor<Rank3<5, 83, 83>, _, _, _> =
-        (x.leaky_trace(), w.clone()).convtrans2d(Const::<3>, Const::<2>, Const::<1>, Const::<1>);
+    let y: Tensor<Rank3<5, 83, 83>, _, _, _> = (x.leaky_trace(), w.clone())
+        .convtrans2d(Const::<3>, Const::<2>, Const::<1>, Const::<1>, Const::<0>);
     let y0 = y.retaped::<NoneTape>();
     let grads0 = y.square().mean().backward();
     let x0 = grads0.get(&x);
@@ -294,8 +294,8 @@ fn test_convtrans2d_batched() {
         .broadcast::<Rank4<10, 3, 28, 28>, _>()
         .reshape::<Rank4<10, 3, 28, 28>>();
 
-    let y: Tensor<Rank4<10, 5, 83, 83>, _, _, _> =
-        (x.leaky_trace(), w.clone()).convtrans2d(Const::<3>, Const::<2>, Const::<1>, Const::<1>);
+    let y: Tensor<Rank4<10, 5, 83, 83>, _, _, _> = (x.leaky_trace(), w.clone())
+        .convtrans2d(Const::<3>, Const::<2>, Const::<1>, Const::<1>, Const::<0>);
     for i in 0..10 {
         assert_close_to_tensor!(y0, y.retaped::<NoneTape>().select(dev.tensor(i)), 1e-5);
     }
@@ -341,8 +341,8 @@ fn test_convtrans2d_grouped() {
             ],
         ])
         .to_dtype::<TestDtype>();
-    let y =
-        (x.leaky_trace(), w.clone()).convtrans2d(Const::<1>, Const::<0>, Const::<1>, Const::<2>);
+    let y = (x.leaky_trace(), w.clone())
+        .convtrans2d(Const::<1>, Const::<0>, Const::<1>, Const::<2>, Const::<0>);
     #[rustfmt::skip]
     assert_close_to_literal!(
         y,
@@ -451,8 +451,8 @@ fn test_convtrans2d_dilated() {
             ],
         ])
         .to_dtype::<TestDtype>();
-    let y =
-        (x.leaky_trace(), w.clone()).convtrans2d(Const::<1>, Const::<0>, Const::<2>, Const::<1>);
+    let y = (x.leaky_trace(), w.clone())
+        .convtrans2d(Const::<1>, Const::<0>, Const::<2>, Const::<1>, Const::<0>);
     #[rustfmt::skip]
     assert_close_to_literal!(
         y,