Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Run tests with miri #899

Closed
wants to merge 12 commits into from
11 changes: 10 additions & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -8,4 +8,13 @@ safetensors = { version = "0.4.0", default-features = false }
memmap2 = { version = "0.9.0", default-features = false }
rand = { version = "0.8.5", default-features = false, features = ["std_rng"] }
rand_distr = { version = "0.4.3", default-features = false }
libm = "0.2.8"
libm = "0.2.8"

[patch.crates-io]
crossbeam = { git = "https://github.com/crossbeam-rs/crossbeam", rev = "a57e655eef415c21babddc4ba0217b6ca7acd0a2" }
crossbeam-epoch = { git = "https://github.com/crossbeam-rs/crossbeam", rev = "a57e655eef415c21babddc4ba0217b6ca7acd0a2" }
crossbeam-channel = { git = "https://github.com/crossbeam-rs/crossbeam", rev = "a57e655eef415c21babddc4ba0217b6ca7acd0a2" }
crossbeam-deque = { git = "https://github.com/crossbeam-rs/crossbeam", rev = "a57e655eef415c21babddc4ba0217b6ca7acd0a2" }
crossbeam-queue = { git = "https://github.com/crossbeam-rs/crossbeam", rev = "a57e655eef415c21babddc4ba0217b6ca7acd0a2" }
crossbeam-skiplist = { git = "https://github.com/crossbeam-rs/crossbeam", rev = "a57e655eef415c21babddc4ba0217b6ca7acd0a2" }
crossbeam-utils = { git = "https://github.com/crossbeam-rs/crossbeam", rev = "a57e655eef415c21babddc4ba0217b6ca7acd0a2" }
2 changes: 1 addition & 1 deletion dfdx-core/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ num-traits = { workspace = true }
safetensors = { workspace = true, optional = true }
memmap2 = { workspace = true, optional = true }
half = { version = "2.3.1", optional = true, features = ["num-traits", "rand_distr"] }
gemm = { version = "0.16.14", default-features = false, optional = true, features = ["rayon"] }
gemm = { version = "0.17.1", default-features = false, optional = true, features = ["rayon"] }
rayon = { version = "1.7.0", optional = true }
libm = { workspace = true }
wgpu = { version = "0.18.0", features = ["glsl", "spirv"], optional = true }
Expand Down
1 change: 1 addition & 0 deletions dfdx-core/src/data/collate.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
use std::{mem::MaybeUninit, vec::Vec};

Check warning on line 1 in dfdx-core/src/data/collate.rs

View workflow job for this annotation

GitHub Actions / cargo-test-nightly

the item `Vec` is imported redundantly

Check warning on line 1 in dfdx-core/src/data/collate.rs

View workflow job for this annotation

GitHub Actions / cargo-test-nightly

the item `Vec` is imported redundantly

Check warning on line 1 in dfdx-core/src/data/collate.rs

View workflow job for this annotation

GitHub Actions / cargo-test-nightly

the item `Vec` is imported redundantly

Check warning on line 1 in dfdx-core/src/data/collate.rs

View workflow job for this annotation

GitHub Actions / cargo-test-nightly

the item `Vec` is imported redundantly

Check warning on line 1 in dfdx-core/src/data/collate.rs

View workflow job for this annotation

GitHub Actions / cargo-test-nightly

the item `Vec` is imported redundantly

/// Collates `Self` into some other type.
/// Generally similar to an unzip method;
Expand Down Expand Up @@ -55,6 +55,7 @@
impl<'a, A, B> Collate for Vec<&'a (A, B)> {
    type Collated = (Vec<&'a A>, Vec<&'a B>);
    /// Splits a vector of borrowed pairs into a pair of vectors of borrows.
    fn collated(self) -> Self::Collated {
        // This map is NOT an identity: each `&'a (A, B)` item is destructured
        // (via match ergonomics) into a `(&'a A, &'a B)` tuple so that `unzip`
        // can separate the two halves. Clippy's `map_identity` lint misfires
        // here, hence the allow.
        #[allow(clippy::map_identity)]
        self.into_iter().map(|(a, b)| (a, b)).unzip()
    }
}
Expand Down
38 changes: 0 additions & 38 deletions dfdx-core/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
//! The following sections provide some high level core concepts & examples, and
//! there is more detailed documentation in each of dfdx's submodules.
//!
//! See [feature_flags] for details on feature flags.

Check warning on line 12 in dfdx-core/src/lib.rs

View workflow job for this annotation

GitHub Actions / cargo-check

unresolved link to `feature_flags`
//!
//! # Shapes & Tensors
//!
Expand Down Expand Up @@ -59,7 +59,7 @@
//! There are two options for this currently, with more planned to be added in the future:
//!
//! 1. [tensor::Cpu] - for tensors stored on the heap
//! 2. [tensor::Cuda] - for tensors stored in GPU memory

Check warning on line 62 in dfdx-core/src/lib.rs

View workflow job for this annotation

GitHub Actions / cargo-check

unresolved link to `tensor::Cuda`
//!
//! Both devices implement [Default], you can also create them with a certain seed
//! and ordinal.
Expand All @@ -85,8 +85,8 @@
//! | Unary Operations | `a.sqrt()` | `a.sqrt()` | `a.sqrt()` |
//! | Binary Operations | `a + b` | `a + b` | `a + b` |
//! | gemm/gemv | [tensor_ops::matmul] | `a @ b` | `a @ b` |
//! | 2d Convolution | [tensor_ops::TryConv2D] | - | `torch.conv2d` |

Check warning on line 88 in dfdx-core/src/lib.rs

View workflow job for this annotation

GitHub Actions / cargo-check

unresolved link to `tensor_ops::TryConv2D`
//! | 2d Transposed Convolution | [tensor_ops::TryConvTrans2D] | - | `torch.conv_transpose2d` |

Check warning on line 89 in dfdx-core/src/lib.rs

View workflow job for this annotation

GitHub Actions / cargo-check

unresolved link to `tensor_ops::TryConvTrans2D`
//! | Slicing | [tensor_ops::slice] | `a[...]` | `a[...]` |
//! | Select | [tensor_ops::SelectTo] | `a[...]` | `torch.select` |
//! | Gather | [tensor_ops::GatherTo] | `np.take` | `torch.gather` |
Expand Down Expand Up @@ -128,44 +128,6 @@
pub use crate::tensor_ops::*;
}

/// Sets a CPU `sse` flag to flush denormal floating point numbers to zero. The opposite of this is [keep_denormals()].
///
/// Some resources:
/// 1. [Effects of Flush-To-Zero mode](https://developer.arm.com/documentation/dui0473/c/neon-and-vfp-programming/the-effects-of-using-flush-to-zero-mode?lang=en)
/// 2. [When to use Flush-To-Zero mode](https://developer.arm.com/documentation/dui0473/c/neon-and-vfp-programming/when-to-use-flush-to-zero-mode?lang=en)
pub fn flush_denormals_to_zero() {
#[cfg(all(target_arch = "x86", target_feature = "sse"))]
{
use std::arch::x86::{_MM_FLUSH_ZERO_ON, _MM_SET_FLUSH_ZERO_MODE};
unsafe { _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON) }
}

#[cfg(all(target_arch = "x86_64", target_feature = "sse"))]
{
use std::arch::x86_64::{_MM_FLUSH_ZERO_ON, _MM_SET_FLUSH_ZERO_MODE};
unsafe { _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON) }
}
}

/// Sets a CPU flag to keep denormal floating point numbers. The opposite of this is [flush_denormals_to_zero()].
///
/// Some resources:
/// 1. [Effects of Flush-To-Zero mode](https://developer.arm.com/documentation/dui0473/c/neon-and-vfp-programming/the-effects-of-using-flush-to-zero-mode?lang=en)
/// 2. [When to use Flush-To-Zero mode](https://developer.arm.com/documentation/dui0473/c/neon-and-vfp-programming/when-to-use-flush-to-zero-mode?lang=en)
pub fn keep_denormals() {
#[cfg(all(target_arch = "x86", target_feature = "sse"))]
{
use std::arch::x86::{_MM_FLUSH_ZERO_OFF, _MM_SET_FLUSH_ZERO_MODE};
unsafe { _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_OFF) }
}

#[cfg(all(target_arch = "x86_64", target_feature = "sse"))]
{
use std::arch::x86_64::{_MM_FLUSH_ZERO_OFF, _MM_SET_FLUSH_ZERO_MODE};
unsafe { _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_OFF) }
}
}

#[cfg(test)]
pub(crate) mod tests {
pub use num_traits::{Float, NumCast, Zero};
Expand Down
74 changes: 71 additions & 3 deletions dfdx-core/src/tensor/cache.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
use std::{alloc::Layout, collections::BTreeMap, vec::Vec};

Check warning on line 1 in dfdx-core/src/tensor/cache.rs

View workflow job for this annotation

GitHub Actions / cargo-test-nightly

the item `Vec` is imported redundantly

Check warning on line 1 in dfdx-core/src/tensor/cache.rs

View workflow job for this annotation

GitHub Actions / cargo-test-nightly

the item `Vec` is imported redundantly

Check warning on line 1 in dfdx-core/src/tensor/cache.rs

View workflow job for this annotation

GitHub Actions / cargo-test-nightly

the item `Vec` is imported redundantly

Check warning on line 1 in dfdx-core/src/tensor/cache.rs

View workflow job for this annotation

GitHub Actions / cargo-test-nightly

the item `Vec` is imported redundantly

#[cfg(not(feature = "no-std"))]
use std::sync::RwLock;
Expand Down Expand Up @@ -33,21 +33,35 @@
/// valid allocation. When the last value is removed from the list, the key
/// is removed.
#[derive(Debug)]
pub(crate) struct TensorCache<Ptr> {
pub(crate) struct TensorCache<Ptr: CachePtr<DeviceDev>, DeviceDev = ()> {
pub(crate) allocations: RwLock<BTreeMap<AllocationKey, Vec<Ptr>>>,
pub(crate) enabled: RwLock<bool>,
device_dev: DeviceDev,
}

impl<Ptr> Default for TensorCache<Ptr> {
impl<Ptr: CachePtr<DeviceDev>, DeviceDev: Default> Default for TensorCache<Ptr, DeviceDev> {
fn default() -> Self {
Self {
allocations: Default::default(),
enabled: RwLock::new(false),
device_dev: DeviceDev::default(),
}
}
}

impl<Ptr> TensorCache<Ptr> {
// Constructor taking an explicit device handle; not yet used by every
// backend, hence the dead_code allowance.
#[allow(dead_code)]
impl<Ptr: CachePtr<DeviceDev>, DeviceDev> TensorCache<Ptr, DeviceDev> {
    /// Initiate an empty [TensorCache] with a given `device_dev`.
    ///
    /// The cache starts disabled (`enabled` is `false`) and holds no
    /// allocations; `device_dev` is kept so cached pointers can later be
    /// deallocated against the device they came from.
    pub(crate) fn new(device_dev: DeviceDev) -> Self {
        Self {
            allocations: Default::default(),
            enabled: RwLock::new(false),
            device_dev,
        }
    }
}

impl<Ptr: CachePtr<DeviceDev>, DeviceDev> TensorCache<Ptr, DeviceDev> {
/// Returns the number of allocations in the cache.
#[allow(unused)]
pub(crate) fn len(&self) -> usize {
Expand Down Expand Up @@ -183,6 +197,60 @@
}
}

impl<Ptr: CachePtr<DeviceDev>, DeviceDev> TensorCache<Ptr, DeviceDev> {
    /// Deallocates all cached memory on the device and empties the cache.
    ///
    /// Every pointer in every allocation bucket is handed to
    /// [`CachePtr::dealloc`] together with its [`AllocationKey`], then the key
    /// map itself is cleared. As written this only ever returns `Ok(())`; the
    /// `Result` return type leaves room for fallible device deallocation
    /// without a signature change.
    pub(crate) fn try_clear(&self) -> Result<(), crate::prelude::Error> {
        // `std::sync::RwLock::write` returns a `Result` (lock poisoning),
        // while the no-std lock returns the guard directly — hence the
        // cfg-gated unwrap.
        let mut cache = {
            #[cfg(not(feature = "no-std"))]
            {
                self.allocations.write().unwrap()
            }
            #[cfg(feature = "no-std")]
            {
                self.allocations.write()
            }
        };

        // Drain each bucket, deallocating every pointer against the device
        // handle this cache was constructed with.
        for (&key, allocations) in cache.iter_mut() {
            for alloc in allocations.drain(..) {
                alloc.dealloc(&key, &self.device_dev);
            }
        }
        // Remove the (now empty) buckets and their keys as well.
        cache.clear();
        Ok(())
    }
}

impl<Ptr: CachePtr<DeviceDev>, DeviceDev> Drop for TensorCache<Ptr, DeviceDev> {
    /// Frees all cached allocations when the cache itself is dropped.
    fn drop(&mut self) {
        // NOTE(review): `try_clear` currently always returns `Ok(())`, so this
        // unwrap cannot fire on the returned value — but the RwLock unwrap
        // inside `try_clear` can still panic on lock poisoning, and panicking
        // inside `drop` during an unwind aborts the process. Worth confirming
        // this is acceptable.
        self.try_clear().unwrap();
    }
}

/// Functionality internalized by the pointer.
pub(crate) trait CachePtr<Dev>: Sized {
    // by default no deallocation is made for any cache ptr
    // ie. they leak
    /// Deallocates the memory referred by this pointer.
    fn dealloc(self, _key: &AllocationKey, _dev: &Dev) {}
}

// Plain value types own no device resources, so "deallocating" them is the
// default no-op trait body. One macro invocation replaces fifteen identical
// impl lines.
macro_rules! impl_noop_cache_ptr {
    ($($ty:ty),* $(,)?) => {
        $(impl<Dev> CachePtr<Dev> for $ty {})*
    };
}

impl_noop_cache_ptr!(
    bool, u8, u16, u32, u64, u128, usize, i8, i16, i32, i64, i128, isize, f32, f64,
);

#[cfg(test)]
mod test {
use super::*;
Expand Down
84 changes: 42 additions & 42 deletions dfdx-core/src/tensor/cpu/device.rs
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ pub struct Cpu {
/// A thread safe random number generator.
pub(crate) rng: Arc<Mutex<StdRng>>,
/// A thread safe cache of memory allocations that can be reused.
pub(crate) cache: Arc<TensorCache<BytesPtr>>,
pub(crate) cache: Arc<TensorCache<BytesPtr, CpuDevice>>,
}

impl Default for Cpu {
Expand All @@ -47,14 +47,53 @@ impl Cpu {
}
}

/// Unit struct to represent information needed for managing allocations on the Cpu.
#[derive(Clone, Debug, Default)]
pub(crate) struct CpuDevice;

impl crate::tensor::cache::CachePtr<CpuDevice> for BytesPtr {
    /// Reconstructs the `Vec` this pointer was leaked from and drops it,
    /// returning the memory to the global allocator. The element type of the
    /// reconstructed `Vec` is picked purely from the allocation's alignment
    /// (1/2/4/8 bytes), which is sound because any bit pattern is a valid
    /// unsigned integer.
    fn dealloc(self, key: &crate::tensor::cache::AllocationKey, _dev: &CpuDevice) {
        // NOTE(review): a `key.size` of 0 would panic on the `%` below; this
        // assumes zero-sized allocations are never inserted into the cache —
        // worth confirming at the insertion site.
        assert!(key.num_bytes % key.size == 0);
        assert!(key.num_bytes < isize::MAX as usize);
        let len = key.num_bytes / key.size;
        let cap = len;
        // SAFETY:
        // - "ptr must have been allocated using the global allocator, such as via the alloc::alloc function."
        //   - ✅ cpu uses global allocator
        // - "T needs to have the same alignment as what ptr was allocated with."
        //   - ✅ we are matching on the alignment below
        // - "The size of T times the capacity needs to be the same size as the pointer was allocated with."
        //   - ✅ covered by `key.num_bytes / key.size` and the `key.num_bytes % key.size == 0` assertion above
        // - "length needs to be less than or equal to capacity."
        //   - ✅ they are equal
        // - "The first length values must be properly initialized values of type T."
        //   - ✅ any bit pattern is valid for unsigned ints used below
        // - "capacity needs to be the capacity that the pointer was allocated with."
        //   - ✅ handled by assertion above (key.num_bytes % key.size == 0)
        // - "The allocated size in bytes must be no larger than isize::MAX. See the safety documentation of pointer::offset."
        //   - ✅ handled by assertion above
        debug_assert_eq!(std::alloc::Layout::new::<u8>().align(), 1);
        debug_assert_eq!(std::alloc::Layout::new::<u16>().align(), 2);
        debug_assert_eq!(std::alloc::Layout::new::<u32>().align(), 4);
        debug_assert_eq!(std::alloc::Layout::new::<u64>().align(), 8);
        match key.alignment {
            1 => unsafe { drop(Vec::from_raw_parts(self.0, len, cap)) },
            2 => unsafe { drop(Vec::from_raw_parts(self.0 as *mut u16, len, cap)) },
            4 => unsafe { drop(Vec::from_raw_parts(self.0 as *mut u32, len, cap)) },
            8 => unsafe { drop(Vec::from_raw_parts(self.0 as *mut u64, len, cap)) },
            _ => unreachable!(),
        };
    }
}

/// A [Vec] that can be cloned without allocating new memory.
/// When [Drop]ed it will insert its data into the cache.
#[derive(Debug)]
pub struct CachableVec<E> {
/// The data stored in this vector.
pub(crate) data: Vec<E>,
/// A cache of memory allocations that can be reused.
pub(crate) cache: Arc<TensorCache<BytesPtr>>,
pub(crate) cache: Arc<TensorCache<BytesPtr, CpuDevice>>,
}

impl<E: Clone> Clone for CachableVec<E> {
Expand Down Expand Up @@ -166,45 +205,6 @@ impl Cache for Cpu {
}

fn try_empty_cache(&self) -> Result<(), Error> {
#[cfg(not(feature = "no-std"))]
let mut cache = self.cache.allocations.write().unwrap();
#[cfg(feature = "no-std")]
let mut cache = self.cache.allocations.write();
for (&key, allocations) in cache.iter_mut() {
assert!(key.num_bytes % key.size == 0);
assert!(key.num_bytes < isize::MAX as usize);
let len = key.num_bytes / key.size;
let cap = len;
for alloc in allocations.drain(..) {
// SAFETY:
// - "ptr must have been allocated using the global allocator, such as via the alloc::alloc function."
// - ✅ cpu uses global allocator
// - "T needs to have the same alignment as what ptr was allocated with."
// - ✅ we are matching on the alignment below
// - "The size of T times the capacity needs to be the same size as the pointer was allocated with."
// - ✅ covered by `key.num_bytes / key.size` and the `key.num_bytes % key.size == 0` assertion above
// - "length needs to be less than or equal to capacity."
// - ✅ they are equal
// - "The first length values must be properly initialized values of type T."
// - ✅ any bit pattern is valid for unsigned ints used below
// - "capacity needs to be the capacity that the pointer was allocated with."
// - ✅ handled by assertion above (key.num_bytes % key.size == 0)
// - "The allocated size in bytes must be no larger than isize::MAX. See the safety documentation of pointer::offset."
// - ✅ handled by assertion above
debug_assert_eq!(std::alloc::Layout::new::<u8>().align(), 1);
debug_assert_eq!(std::alloc::Layout::new::<u16>().align(), 2);
debug_assert_eq!(std::alloc::Layout::new::<u32>().align(), 4);
debug_assert_eq!(std::alloc::Layout::new::<u64>().align(), 8);
match key.alignment {
1 => unsafe { drop(Vec::from_raw_parts(alloc.0, len, cap)) },
2 => unsafe { drop(Vec::from_raw_parts(alloc.0 as *mut u16, len, cap)) },
4 => unsafe { drop(Vec::from_raw_parts(alloc.0 as *mut u32, len, cap)) },
8 => unsafe { drop(Vec::from_raw_parts(alloc.0 as *mut u64, len, cap)) },
_ => unreachable!(),
};
}
}
cache.clear();
Ok(())
self.cache.try_clear()
}
}
38 changes: 20 additions & 18 deletions dfdx-core/src/tensor/cuda/device.rs
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ pub struct Cuda {
/// A second stream for kernels to optionally execute on.
pub(crate) par_stream: Arc<CudaStream>,
pub(crate) workspace: Arc<Mutex<CudaSlice<u8>>>,
pub(crate) cache: Arc<TensorCache<CUdeviceptr>>,
pub(crate) cache: Arc<TensorCache<CudaBytesPtr, Arc<CudaDevice>>>,
}

impl From<CublasError> for Error {
Expand Down Expand Up @@ -77,6 +77,7 @@ impl Cuda {
let cudnn = cudarc::cudnn::Cudnn::new(dev.clone())?;
let par_stream = Arc::new(dev.fork_default_stream()?);
let workspace = Arc::new(Mutex::new(dev.alloc_zeros::<u8>(1)?));
let cache = Arc::new(TensorCache::new(Arc::clone(&dev)));
Ok(Self {
cpu,
dev,
Expand All @@ -85,7 +86,7 @@ impl Cuda {
cudnn,
par_stream,
workspace,
cache: Default::default(),
cache,
})
}
}
Expand All @@ -100,7 +101,7 @@ impl Cuda {
) -> Result<CudaSlice<E>, Error> {
let data = self.cache.try_pop::<E>(len).map_or_else(
|| self.dev.alloc::<E>(len),
|ptr| Ok(self.dev.upgrade_device_ptr(ptr, len)),
|ptr| Ok(self.dev.upgrade_device_ptr(ptr.0, len)),
)?;
Ok(data)
}
Expand All @@ -122,14 +123,26 @@ impl Cuda {
}
}

/// A pointer to bytes on the Cuda device. Used in conjunction with [TensorCache].
#[repr(transparent)]
#[derive(Clone, Debug)]
pub struct CudaBytesPtr(pub(crate) CUdeviceptr);

impl crate::tensor::cache::CachePtr<Arc<CudaDevice>> for CudaBytesPtr {
    /// Frees the device allocation by re-wrapping the raw device pointer in a
    /// slice of `key.num_bytes` bytes and dropping it.
    fn dealloc(self, key: &crate::tensor::cache::AllocationKey, dev: &Arc<CudaDevice>) {
        // assumes this pointer was originally leaked from an allocation of
        // exactly `key.num_bytes` bytes on `dev` — TODO confirm against the
        // insertion path that populates the cache.
        let data = unsafe { dev.upgrade_device_ptr::<u8>(self.0, key.num_bytes) };
        drop(data);
    }
}

/// A [CudaSlice] that can be cloned without allocating new memory.
/// When [Drop]ed it will insert its data into the cache.
#[derive(Debug)]
pub struct CachableCudaSlice<E> {
/// The actual data.
pub(crate) data: CudaSlice<E>,
/// A cache of device pointers that can be reused.
pub(crate) cache: Arc<TensorCache<CUdeviceptr>>,
pub(crate) cache: Arc<TensorCache<CudaBytesPtr, Arc<CudaDevice>>>,
}

impl<E: cudarc::driver::DeviceRepr> Clone for CachableCudaSlice<E> {
Expand All @@ -142,7 +155,7 @@ impl<E: cudarc::driver::DeviceRepr> Clone for CachableCudaSlice<E> {
// SAFETY:
// 1. we know that ptr is valid for `num_bytes` because it was registered for that.
// 2. we are about to set the memory with dtod_copy
let mut slice = unsafe { dev.upgrade_device_ptr(ptr, len) };
let mut slice = unsafe { dev.upgrade_device_ptr(ptr.0, len) };
dev.dtod_copy(&self.data, &mut slice).unwrap();
slice
},
Expand Down Expand Up @@ -209,7 +222,7 @@ impl<E> Drop for CachableCudaSlice<E> {
let numel = data.len();
// Get access to the raw pointer without freeing it.
let ptr = data.leak();
self.cache.insert::<E>(numel, ptr);
self.cache.insert::<E>(numel, CudaBytesPtr(ptr));
}
}
}
Expand All @@ -232,18 +245,7 @@ impl Cache for Cuda {
}

fn try_empty_cache(&self) -> Result<(), Error> {
#[cfg(not(feature = "no-std"))]
let mut cache = self.cache.allocations.write().unwrap();
#[cfg(feature = "no-std")]
let mut cache = self.cache.allocations.write();
for (&key, allocations) in cache.iter_mut() {
for alloc in allocations.drain(..) {
let data = unsafe { self.dev.upgrade_device_ptr::<u8>(alloc, key.num_bytes) };
drop(data);
}
}
cache.clear();
Ok(())
self.cache.try_clear()
}
}

Expand Down
2 changes: 1 addition & 1 deletion dfdx-core/src/tensor/gradients.rs
Original file line number Diff line number Diff line change
Expand Up @@ -153,7 +153,7 @@ impl<E, D: Storage<E>> Gradients<E, D> {
#[inline]
pub(crate) fn many_and_ref<L: Shape, R: Shape>(
&mut self,
ls: &Vec<impl Tensorlike<L, E, D>>,
ls: &[impl Tensorlike<L, E, D>],
r: &impl Tensorlike<R, E, D>,
) -> (Vec<&mut D::Vec>, &D::Vec) {
for i in 0..ls.len() {
Expand Down
Loading
Loading