Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Run tests with miri #899

Closed
wants to merge 12 commits into from
11 changes: 10 additions & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -8,4 +8,13 @@ safetensors = { version = "0.4.0", default-features = false }
memmap2 = { version = "0.9.0", default-features = false }
rand = { version = "0.8.5", default-features = false, features = ["std_rng"] }
rand_distr = { version = "0.4.3", default-features = false }
libm = "0.2.8"
libm = "0.2.8"

[patch.crates-io]
crossbeam = { git = "https://github.com/crossbeam-rs/crossbeam", rev = "a57e655eef415c21babddc4ba0217b6ca7acd0a2" }
crossbeam-epoch = { git = "https://github.com/crossbeam-rs/crossbeam", rev = "a57e655eef415c21babddc4ba0217b6ca7acd0a2" }
crossbeam-channel = { git = "https://github.com/crossbeam-rs/crossbeam", rev = "a57e655eef415c21babddc4ba0217b6ca7acd0a2" }
crossbeam-deque = { git = "https://github.com/crossbeam-rs/crossbeam", rev = "a57e655eef415c21babddc4ba0217b6ca7acd0a2" }
crossbeam-queue = { git = "https://github.com/crossbeam-rs/crossbeam", rev = "a57e655eef415c21babddc4ba0217b6ca7acd0a2" }
crossbeam-skiplist = { git = "https://github.com/crossbeam-rs/crossbeam", rev = "a57e655eef415c21babddc4ba0217b6ca7acd0a2" }
crossbeam-utils = { git = "https://github.com/crossbeam-rs/crossbeam", rev = "a57e655eef415c21babddc4ba0217b6ca7acd0a2" }
2 changes: 1 addition & 1 deletion dfdx-core/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ num-traits = { workspace = true }
safetensors = { workspace = true, optional = true }
memmap2 = { workspace = true, optional = true }
half = { version = "2.3.1", optional = true, features = ["num-traits", "rand_distr"] }
gemm = { version = "0.16.14", default-features = false, optional = true, features = ["rayon"] }
gemm = { version = "0.17.1", default-features = false, optional = true, features = ["rayon"] }
rayon = { version = "1.7.0", optional = true }
libm = { workspace = true }
wgpu = { version = "0.18.0", features = ["glsl", "spirv"], optional = true }
Expand Down
1 change: 1 addition & 0 deletions dfdx-core/src/data/collate.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
use std::{mem::MaybeUninit, vec::Vec};

Check warning on line 1 in dfdx-core/src/data/collate.rs

View workflow job for this annotation

GitHub Actions / cargo-test-nightly

the item `Vec` is imported redundantly

Check warning on line 1 in dfdx-core/src/data/collate.rs

View workflow job for this annotation

GitHub Actions / cargo-test-nightly

the item `Vec` is imported redundantly

Check warning on line 1 in dfdx-core/src/data/collate.rs

View workflow job for this annotation

GitHub Actions / cargo-test-nightly

the item `Vec` is imported redundantly

Check warning on line 1 in dfdx-core/src/data/collate.rs

View workflow job for this annotation

GitHub Actions / cargo-test-nightly

the item `Vec` is imported redundantly

Check warning on line 1 in dfdx-core/src/data/collate.rs

View workflow job for this annotation

GitHub Actions / cargo-test-nightly

the item `Vec` is imported redundantly

/// Collates `Self` into some other type.
/// Generally similar to an unzip method;
Expand Down Expand Up @@ -55,6 +55,7 @@
impl<'a, A, B> Collate for Vec<&'a (A, B)> {
    type Collated = (Vec<&'a A>, Vec<&'a B>);
    /// Splits a vector of borrowed pairs into a pair of vectors of borrows.
    fn collated(self) -> Self::Collated {
        // This map is NOT an identity: each `&'a (A, B)` item is destructured
        // (via match ergonomics) into a `(&'a A, &'a B)` tuple so that `unzip`
        // can separate the two halves. Clippy's `map_identity` lint misfires
        // here, hence the allow.
        #[allow(clippy::map_identity)]
        self.into_iter().map(|(a, b)| (a, b)).unzip()
    }
}
Expand Down
38 changes: 0 additions & 38 deletions dfdx-core/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
//! The following sections provide some high level core concepts & examples, and
//! there is more detailed documentation in each of dfdx's submodules.
//!
//! See [feature_flags] for details on feature flags.

Check warning on line 12 in dfdx-core/src/lib.rs

View workflow job for this annotation

GitHub Actions / cargo-check

unresolved link to `feature_flags`
//!
//! # Shapes & Tensors
//!
Expand Down Expand Up @@ -59,7 +59,7 @@
//! There are two options for this currently, with more planned to be added in the future:
//!
//! 1. [tensor::Cpu] - for tensors stored on the heap
//! 2. [tensor::Cuda] - for tensors stored in GPU memory

Check warning on line 62 in dfdx-core/src/lib.rs

View workflow job for this annotation

GitHub Actions / cargo-check

unresolved link to `tensor::Cuda`
//!
//! Both devices implement [Default], you can also create them with a certain seed
//! and ordinal.
Expand All @@ -85,8 +85,8 @@
//! | Unary Operations | `a.sqrt()` | `a.sqrt()` | `a.sqrt()` |
//! | Binary Operations | `a + b` | `a + b` | `a + b` |
//! | gemm/gemv | [tensor_ops::matmul] | `a @ b` | `a @ b` |
//! | 2d Convolution | [tensor_ops::TryConv2D] | - | `torch.conv2d` |

Check warning on line 88 in dfdx-core/src/lib.rs

View workflow job for this annotation

GitHub Actions / cargo-check

unresolved link to `tensor_ops::TryConv2D`
//! | 2d Transposed Convolution | [tensor_ops::TryConvTrans2D] | - | `torch.conv_transpose2d` |

Check warning on line 89 in dfdx-core/src/lib.rs

View workflow job for this annotation

GitHub Actions / cargo-check

unresolved link to `tensor_ops::TryConvTrans2D`
//! | Slicing | [tensor_ops::slice] | `a[...]` | `a[...]` |
//! | Select | [tensor_ops::SelectTo] | `a[...]` | `torch.select` |
//! | Gather | [tensor_ops::GatherTo] | `np.take` | `torch.gather` |
Expand Down Expand Up @@ -128,44 +128,6 @@
pub use crate::tensor_ops::*;
}

/// Sets a CPU `sse` flag to flush denormal floating point numbers to zero. The opposite of this is [keep_denormals()].
///
/// Some resources:
/// 1. [Effects of Flush-To-Zero mode](https://developer.arm.com/documentation/dui0473/c/neon-and-vfp-programming/the-effects-of-using-flush-to-zero-mode?lang=en)
/// 2. [When to use Flush-To-Zero mode](https://developer.arm.com/documentation/dui0473/c/neon-and-vfp-programming/when-to-use-flush-to-zero-mode?lang=en)
pub fn flush_denormals_to_zero() {
#[cfg(all(target_arch = "x86", target_feature = "sse"))]
{
use std::arch::x86::{_MM_FLUSH_ZERO_ON, _MM_SET_FLUSH_ZERO_MODE};
unsafe { _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON) }
}

#[cfg(all(target_arch = "x86_64", target_feature = "sse"))]
{
use std::arch::x86_64::{_MM_FLUSH_ZERO_ON, _MM_SET_FLUSH_ZERO_MODE};
unsafe { _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON) }
}
}

/// Sets a CPU flag to keep denormal floating point numbers. The opposite of this is [flush_denormals_to_zero()].
///
/// Some resources:
/// 1. [Effects of Flush-To-Zero mode](https://developer.arm.com/documentation/dui0473/c/neon-and-vfp-programming/the-effects-of-using-flush-to-zero-mode?lang=en)
/// 2. [When to use Flush-To-Zero mode](https://developer.arm.com/documentation/dui0473/c/neon-and-vfp-programming/when-to-use-flush-to-zero-mode?lang=en)
pub fn keep_denormals() {
#[cfg(all(target_arch = "x86", target_feature = "sse"))]
{
use std::arch::x86::{_MM_FLUSH_ZERO_OFF, _MM_SET_FLUSH_ZERO_MODE};
unsafe { _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_OFF) }
}

#[cfg(all(target_arch = "x86_64", target_feature = "sse"))]
{
use std::arch::x86_64::{_MM_FLUSH_ZERO_OFF, _MM_SET_FLUSH_ZERO_MODE};
unsafe { _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_OFF) }
}
}

#[cfg(test)]
pub(crate) mod tests {
pub use num_traits::{Float, NumCast, Zero};
Expand Down
74 changes: 71 additions & 3 deletions dfdx-core/src/tensor/cache.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
use std::{alloc::Layout, collections::BTreeMap, vec::Vec};

Check warning on line 1 in dfdx-core/src/tensor/cache.rs

View workflow job for this annotation

GitHub Actions / cargo-test-nightly

the item `Vec` is imported redundantly

Check warning on line 1 in dfdx-core/src/tensor/cache.rs

View workflow job for this annotation

GitHub Actions / cargo-test-nightly

the item `Vec` is imported redundantly

Check warning on line 1 in dfdx-core/src/tensor/cache.rs

View workflow job for this annotation

GitHub Actions / cargo-test-nightly

the item `Vec` is imported redundantly

Check warning on line 1 in dfdx-core/src/tensor/cache.rs

View workflow job for this annotation

GitHub Actions / cargo-test-nightly

the item `Vec` is imported redundantly

#[cfg(not(feature = "no-std"))]
use std::sync::RwLock;
Expand Down Expand Up @@ -33,21 +33,35 @@
/// valid allocation. When the last value is removed from the list, the key
/// is removed.
#[derive(Debug)]
pub(crate) struct TensorCache<Ptr> {
pub(crate) struct TensorCache<Ptr: CachePtr<DeviceDev>, DeviceDev = ()> {
pub(crate) allocations: RwLock<BTreeMap<AllocationKey, Vec<Ptr>>>,
pub(crate) enabled: RwLock<bool>,
device_dev: DeviceDev,
}

impl<Ptr> Default for TensorCache<Ptr> {
impl<Ptr: CachePtr<DeviceDev>, DeviceDev: Default> Default for TensorCache<Ptr, DeviceDev> {
fn default() -> Self {
Self {
allocations: Default::default(),
enabled: RwLock::new(false),
device_dev: DeviceDev::default(),
}
}
}

impl<Ptr> TensorCache<Ptr> {
// Constructor taking an explicit device handle; not yet used by every
// backend, hence the dead_code allowance.
#[allow(dead_code)]
impl<Ptr: CachePtr<DeviceDev>, DeviceDev> TensorCache<Ptr, DeviceDev> {
    /// Initiate an empty [TensorCache] with a given `device_dev`.
    ///
    /// The cache starts disabled (`enabled` is `false`) and holds no
    /// allocations; `device_dev` is kept so cached pointers can later be
    /// deallocated against the device they came from.
    pub(crate) fn new(device_dev: DeviceDev) -> Self {
        Self {
            allocations: Default::default(),
            enabled: RwLock::new(false),
            device_dev,
        }
    }
}

impl<Ptr: CachePtr<DeviceDev>, DeviceDev> TensorCache<Ptr, DeviceDev> {
/// Returns the number of allocations in the cache.
#[allow(unused)]
pub(crate) fn len(&self) -> usize {
Expand Down Expand Up @@ -183,6 +197,60 @@
}
}

impl<Ptr: CachePtr<DeviceDev>, DeviceDev> TensorCache<Ptr, DeviceDev> {
    /// Deallocates all cached memory on the device and empties the cache.
    ///
    /// Every pointer in every allocation bucket is handed to
    /// [`CachePtr::dealloc`] together with its [`AllocationKey`], then the key
    /// map itself is cleared. As written this only ever returns `Ok(())`; the
    /// `Result` return type leaves room for fallible device deallocation
    /// without a signature change.
    pub(crate) fn try_clear(&self) -> Result<(), crate::prelude::Error> {
        // `std::sync::RwLock::write` returns a `Result` (lock poisoning),
        // while the no-std lock returns the guard directly — hence the
        // cfg-gated unwrap.
        let mut cache = {
            #[cfg(not(feature = "no-std"))]
            {
                self.allocations.write().unwrap()
            }
            #[cfg(feature = "no-std")]
            {
                self.allocations.write()
            }
        };

        // Drain each bucket, deallocating every pointer against the device
        // handle this cache was constructed with.
        for (&key, allocations) in cache.iter_mut() {
            for alloc in allocations.drain(..) {
                alloc.dealloc(&key, &self.device_dev);
            }
        }
        // Remove the (now empty) buckets and their keys as well.
        cache.clear();
        Ok(())
    }
}

impl<Ptr: CachePtr<DeviceDev>, DeviceDev> Drop for TensorCache<Ptr, DeviceDev> {
    /// Frees all cached allocations when the cache itself is dropped.
    fn drop(&mut self) {
        // NOTE(review): `try_clear` currently always returns `Ok(())`, so this
        // unwrap cannot fire on the returned value — but the RwLock unwrap
        // inside `try_clear` can still panic on lock poisoning, and panicking
        // inside `drop` during an unwind aborts the process. Worth confirming
        // this is acceptable.
        self.try_clear().unwrap();
    }
}

/// Functionality internalized by the pointer.
pub(crate) trait CachePtr<Dev>: Sized {
    // by default no deallocation is made for any cache ptr
    // ie. they leak
    /// Deallocates the memory referred by this pointer.
    fn dealloc(self, _key: &AllocationKey, _dev: &Dev) {}
}

// Plain value types own no device resources, so "deallocating" them is the
// default no-op trait body. One macro invocation replaces fifteen identical
// impl lines.
macro_rules! impl_noop_cache_ptr {
    ($($ty:ty),* $(,)?) => {
        $(impl<Dev> CachePtr<Dev> for $ty {})*
    };
}

impl_noop_cache_ptr!(
    bool, u8, u16, u32, u64, u128, usize, i8, i16, i32, i64, i128, isize, f32, f64,
);

#[cfg(test)]
mod test {
use super::*;
Expand Down
84 changes: 42 additions & 42 deletions dfdx-core/src/tensor/cpu/device.rs
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ pub struct Cpu {
/// A thread safe random number generator.
pub(crate) rng: Arc<Mutex<StdRng>>,
/// A thread safe cache of memory allocations that can be reused.
pub(crate) cache: Arc<TensorCache<BytesPtr>>,
pub(crate) cache: Arc<TensorCache<BytesPtr, CpuDevice>>,
}

impl Default for Cpu {
Expand All @@ -47,14 +47,53 @@ impl Cpu {
}
}

/// Unit struct to represent information needed for managing allocations on the Cpu.
#[derive(Clone, Debug, Default)]
pub(crate) struct CpuDevice;

impl crate::tensor::cache::CachePtr<CpuDevice> for BytesPtr {
    /// Reconstructs the `Vec` this pointer was leaked from and drops it,
    /// returning the memory to the global allocator. The element type of the
    /// reconstructed `Vec` is picked purely from the allocation's alignment
    /// (1/2/4/8 bytes), which is sound because any bit pattern is a valid
    /// unsigned integer.
    fn dealloc(self, key: &crate::tensor::cache::AllocationKey, _dev: &CpuDevice) {
        // NOTE(review): a `key.size` of 0 would panic on the `%` below; this
        // assumes zero-sized allocations are never inserted into the cache —
        // worth confirming at the insertion site.
        assert!(key.num_bytes % key.size == 0);
        assert!(key.num_bytes < isize::MAX as usize);
        let len = key.num_bytes / key.size;
        let cap = len;
        // SAFETY:
        // - "ptr must have been allocated using the global allocator, such as via the alloc::alloc function."
        //   - ✅ cpu uses global allocator
        // - "T needs to have the same alignment as what ptr was allocated with."
        //   - ✅ we are matching on the alignment below
        // - "The size of T times the capacity needs to be the same size as the pointer was allocated with."
        //   - ✅ covered by `key.num_bytes / key.size` and the `key.num_bytes % key.size == 0` assertion above
        // - "length needs to be less than or equal to capacity."
        //   - ✅ they are equal
        // - "The first length values must be properly initialized values of type T."
        //   - ✅ any bit pattern is valid for unsigned ints used below
        // - "capacity needs to be the capacity that the pointer was allocated with."
        //   - ✅ handled by assertion above (key.num_bytes % key.size == 0)
        // - "The allocated size in bytes must be no larger than isize::MAX. See the safety documentation of pointer::offset."
        //   - ✅ handled by assertion above
        debug_assert_eq!(std::alloc::Layout::new::<u8>().align(), 1);
        debug_assert_eq!(std::alloc::Layout::new::<u16>().align(), 2);
        debug_assert_eq!(std::alloc::Layout::new::<u32>().align(), 4);
        debug_assert_eq!(std::alloc::Layout::new::<u64>().align(), 8);
        match key.alignment {
            1 => unsafe { drop(Vec::from_raw_parts(self.0, len, cap)) },
            2 => unsafe { drop(Vec::from_raw_parts(self.0 as *mut u16, len, cap)) },
            4 => unsafe { drop(Vec::from_raw_parts(self.0 as *mut u32, len, cap)) },
            8 => unsafe { drop(Vec::from_raw_parts(self.0 as *mut u64, len, cap)) },
            _ => unreachable!(),
        };
    }
}

/// A [Vec] that can be cloned without allocating new memory.
/// When [Drop]ed it will insert its data into the cache.
#[derive(Debug)]
pub struct CachableVec<E> {
/// The data stored in this vector.
pub(crate) data: Vec<E>,
/// A cache of memory allocations that can be reused.
pub(crate) cache: Arc<TensorCache<BytesPtr>>,
pub(crate) cache: Arc<TensorCache<BytesPtr, CpuDevice>>,
}

impl<E: Clone> Clone for CachableVec<E> {
Expand Down Expand Up @@ -166,45 +205,6 @@ impl Cache for Cpu {
}

fn try_empty_cache(&self) -> Result<(), Error> {
#[cfg(not(feature = "no-std"))]
let mut cache = self.cache.allocations.write().unwrap();
#[cfg(feature = "no-std")]
let mut cache = self.cache.allocations.write();
for (&key, allocations) in cache.iter_mut() {
assert!(key.num_bytes % key.size == 0);
assert!(key.num_bytes < isize::MAX as usize);
let len = key.num_bytes / key.size;
let cap = len;
for alloc in allocations.drain(..) {
// SAFETY:
// - "ptr must have been allocated using the global allocator, such as via the alloc::alloc function."
// - ✅ cpu uses global allocator
// - "T needs to have the same alignment as what ptr was allocated with."
// - ✅ we are matching on the alignment below
// - "The size of T times the capacity needs to be the same size as the pointer was allocated with."
// - ✅ covered by `key.num_bytes / key.size` and the `key.num_bytes % key.size == 0` assertion above
// - "length needs to be less than or equal to capacity."
// - ✅ they are equal
// - "The first length values must be properly initialized values of type T."
// - ✅ any bit pattern is valid for unsigned ints used below
// - "capacity needs to be the capacity that the pointer was allocated with."
// - ✅ handled by assertion above (key.num_bytes % key.size == 0)
// - "The allocated size in bytes must be no larger than isize::MAX. See the safety documentation of pointer::offset."
// - ✅ handled by assertion above
debug_assert_eq!(std::alloc::Layout::new::<u8>().align(), 1);
debug_assert_eq!(std::alloc::Layout::new::<u16>().align(), 2);
debug_assert_eq!(std::alloc::Layout::new::<u32>().align(), 4);
debug_assert_eq!(std::alloc::Layout::new::<u64>().align(), 8);
match key.alignment {
1 => unsafe { drop(Vec::from_raw_parts(alloc.0, len, cap)) },
2 => unsafe { drop(Vec::from_raw_parts(alloc.0 as *mut u16, len, cap)) },
4 => unsafe { drop(Vec::from_raw_parts(alloc.0 as *mut u32, len, cap)) },
8 => unsafe { drop(Vec::from_raw_parts(alloc.0 as *mut u64, len, cap)) },
_ => unreachable!(),
};
}
}
cache.clear();
Ok(())
self.cache.try_clear()
}
}
38 changes: 20 additions & 18 deletions dfdx-core/src/tensor/cuda/device.rs
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ pub struct Cuda {
/// A second stream for kernels to optionally execute on.
pub(crate) par_stream: Arc<CudaStream>,
pub(crate) workspace: Arc<Mutex<CudaSlice<u8>>>,
pub(crate) cache: Arc<TensorCache<CUdeviceptr>>,
pub(crate) cache: Arc<TensorCache<CudaBytesPtr, Arc<CudaDevice>>>,
}

impl From<CublasError> for Error {
Expand Down Expand Up @@ -77,6 +77,7 @@ impl Cuda {
let cudnn = cudarc::cudnn::Cudnn::new(dev.clone())?;
let par_stream = Arc::new(dev.fork_default_stream()?);
let workspace = Arc::new(Mutex::new(dev.alloc_zeros::<u8>(1)?));
let cache = Arc::new(TensorCache::new(Arc::clone(&dev)));
Ok(Self {
cpu,
dev,
Expand All @@ -85,7 +86,7 @@ impl Cuda {
cudnn,
par_stream,
workspace,
cache: Default::default(),
cache,
})
}
}
Expand All @@ -100,7 +101,7 @@ impl Cuda {
) -> Result<CudaSlice<E>, Error> {
let data = self.cache.try_pop::<E>(len).map_or_else(
|| self.dev.alloc::<E>(len),
|ptr| Ok(self.dev.upgrade_device_ptr(ptr, len)),
|ptr| Ok(self.dev.upgrade_device_ptr(ptr.0, len)),
)?;
Ok(data)
}
Expand All @@ -122,14 +123,26 @@ impl Cuda {
}
}

/// A pointer to bytes on the Cuda device. Used in conjunction with [TensorCache].
#[repr(transparent)]
#[derive(Clone, Debug)]
pub struct CudaBytesPtr(pub(crate) CUdeviceptr);

impl crate::tensor::cache::CachePtr<Arc<CudaDevice>> for CudaBytesPtr {
    /// Frees the device allocation by re-wrapping the raw device pointer in a
    /// slice of `key.num_bytes` bytes and dropping it.
    fn dealloc(self, key: &crate::tensor::cache::AllocationKey, dev: &Arc<CudaDevice>) {
        // assumes this pointer was originally leaked from an allocation of
        // exactly `key.num_bytes` bytes on `dev` — TODO confirm against the
        // insertion path that populates the cache.
        let data = unsafe { dev.upgrade_device_ptr::<u8>(self.0, key.num_bytes) };
        drop(data);
    }
}

/// A [CudaSlice] that can be cloned without allocating new memory.
/// When [Drop]ed it will insert its data into the cache.
#[derive(Debug)]
pub struct CachableCudaSlice<E> {
/// The actual data.
pub(crate) data: CudaSlice<E>,
/// A cache of device pointers that can be reused.
pub(crate) cache: Arc<TensorCache<CUdeviceptr>>,
pub(crate) cache: Arc<TensorCache<CudaBytesPtr, Arc<CudaDevice>>>,
}

impl<E: cudarc::driver::DeviceRepr> Clone for CachableCudaSlice<E> {
Expand All @@ -142,7 +155,7 @@ impl<E: cudarc::driver::DeviceRepr> Clone for CachableCudaSlice<E> {
// SAFETY:
// 1. we know that ptr is valid for `num_bytes` because it was registered for that.
// 2. we are about to set the memory with dtod_copy
let mut slice = unsafe { dev.upgrade_device_ptr(ptr, len) };
let mut slice = unsafe { dev.upgrade_device_ptr(ptr.0, len) };
dev.dtod_copy(&self.data, &mut slice).unwrap();
slice
},
Expand Down Expand Up @@ -209,7 +222,7 @@ impl<E> Drop for CachableCudaSlice<E> {
let numel = data.len();
// Get access to the raw pointer without freeing it.
let ptr = data.leak();
self.cache.insert::<E>(numel, ptr);
self.cache.insert::<E>(numel, CudaBytesPtr(ptr));
}
}
}
Expand All @@ -232,18 +245,7 @@ impl Cache for Cuda {
}

fn try_empty_cache(&self) -> Result<(), Error> {
#[cfg(not(feature = "no-std"))]
let mut cache = self.cache.allocations.write().unwrap();
#[cfg(feature = "no-std")]
let mut cache = self.cache.allocations.write();
for (&key, allocations) in cache.iter_mut() {
for alloc in allocations.drain(..) {
let data = unsafe { self.dev.upgrade_device_ptr::<u8>(alloc, key.num_bytes) };
drop(data);
}
}
cache.clear();
Ok(())
self.cache.try_clear()
}
}

Expand Down
2 changes: 1 addition & 1 deletion dfdx-core/src/tensor/gradients.rs
Original file line number Diff line number Diff line change
Expand Up @@ -153,7 +153,7 @@ impl<E, D: Storage<E>> Gradients<E, D> {
#[inline]
pub(crate) fn many_and_ref<L: Shape, R: Shape>(
&mut self,
ls: &Vec<impl Tensorlike<L, E, D>>,
ls: &[impl Tensorlike<L, E, D>],
r: &impl Tensorlike<R, E, D>,
) -> (Vec<&mut D::Vec>, &D::Vec) {
for i in 0..ls.len() {
Expand Down
Loading
Loading