Skip to content

Commit

Permalink
I've got abs working with f32 for rust-gpu and raw spirv
Browse files Browse the repository at this point in the history
  • Loading branch information
favilo committed Dec 27, 2023
1 parent c1b440b commit 701cd7b
Show file tree
Hide file tree
Showing 32 changed files with 305 additions and 305 deletions.
9 changes: 7 additions & 2 deletions Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,5 +1,10 @@
[workspace]
members = ["dfdx-core", "dfdx-derives", "dfdx"]
members = [
"dfdx-core",
"dfdx-derives",
"dfdx",
"dfdx-core/wgpu_kernels/abs"
]
resolver = "2"

[workspace.dependencies]
Expand All @@ -8,4 +13,4 @@ safetensors = { version = "0.4.0", default-features = false }
memmap2 = { version = "0.9.0", default-features = false }
rand = { version = "0.8.5", default-features = false, features = ["std_rng"] }
rand_distr = { version = "0.4.3", default-features = false }
libm = "0.2.8"
libm = "0.2.8"
4 changes: 3 additions & 1 deletion dfdx-core/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -50,9 +50,10 @@ indicatif = "0.17.3"

[build-dependencies]
glob = { version = "0.3.1", optional = true }
spirv-builder = { version = "0.9.0", optional = true }

[features]
default = ["std", "fast-alloc", "cpu"]
default = ["std", "fast-alloc", "webgpu"]
nightly = ["half?/use-intrinsics", "gemm?/nightly"]

std = ["cudarc?/std", "rand_distr/std_math", "gemm?/std"]
Expand All @@ -69,6 +70,7 @@ webgpu = [
"dep:thingbuf",
"dep:naga",
"dep:glob",
"dep:spirv-builder",
"wgpu/expose-ids",
]

Expand Down
47 changes: 5 additions & 42 deletions dfdx-core/build.rs
Original file line number Diff line number Diff line change
Expand Up @@ -217,48 +217,11 @@ mod cuda {
#[cfg(feature = "webgpu")]
mod webgpu {
pub fn build_spv() {
let out_dir = std::env::var("OUT_DIR").unwrap();
let kernel_paths: Vec<std::path::PathBuf> = glob::glob("src/**/*.glsl")
.unwrap()
.map(|p| p.unwrap())
.collect();
for path in &kernel_paths {
println!("cargo:rerun-if-changed={}", path.display());
}

kernel_paths
.iter()
.for_each(|p| println!("cargo:rerun-if-changed={}", p.display()));

let children = kernel_paths
.iter()
.map(|p| {
// TODO: we need to build this for both float and double
let out_path: std::path::PathBuf = out_dir.clone().into();
let base = p.file_stem().unwrap();
let new_name = format!("{}.float.spv", base.to_str().unwrap());
let out_file = &out_path.join(new_name);
eprintln!("out_file: {:?}", out_file);
std::process::Command::new("glslc")
.args(["-std=460core"])
.args(["-fshader-stage=compute"])
.args(["-DTYPENAME=float"])
.args(["-o", &out_file.as_os_str().to_str().unwrap()])
.arg(p)
.stdout(std::process::Stdio::piped())
.stderr(std::process::Stdio::piped())
.spawn()
.expect("glslc failed to start. Ensure that you have shaderc installed and that `glslc` is in your PATH.")
})
.collect::<Vec<_>>();
for (kernel_path, child) in kernel_paths.iter().zip(children.into_iter()) {
let output = child.wait_with_output().expect("glslc failed to run. Ensure that you have shaderc installed and that `glslc` is in your PATH.");
assert!(
output.status.success(),
"glslc error while compiling {kernel_path:?}:\n\n# stdout\n{:#}\n\n# stderr\n{:#}",
String::from_utf8_lossy(&output.stdout),
String::from_utf8_lossy(&output.stderr)
);
for kernel in std::fs::read_dir("wgpu_kernels").expect("Error finding kernels folder") {
let path = kernel.expect("Invalid path in kernels folder").path();
spirv_builder::SpirvBuilder::new(path, "spirv-unknown-vulkan1.1")
.build()
.expect("Kernel failed to compile");
}
}
}
34 changes: 17 additions & 17 deletions dfdx-core/src/tensor/webgpu/device.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@ use wgpu::{
};

use crate::{
prelude::webgpu_kernels::HasGlslType,
shapes::{Shape, Unit},
tensor::{
cache::TensorCache, cpu::Cpu, Cache, Error, NoneTape, RandomU64, Storage, Synchronize,
Expand All @@ -21,6 +20,7 @@ use core::any::TypeId;
#[cfg(not(feature = "no-std"))]
use std::sync::{Mutex, RwLock};

use std::borrow::Cow;
use std::{collections::HashMap, marker::PhantomData, sync::Arc, vec::Vec};

use super::allocate::round_to_buffer_alignment;
Expand Down Expand Up @@ -145,7 +145,7 @@ impl Webgpu {
let adapter = Arc::new(adapter);
let descriptor = DeviceDescriptor {
label: None,
features: Features::default() | Features::SPIRV_SHADER_PASSTHROUGH,
features: Features::SPIRV_SHADER_PASSTHROUGH | Features::TIMESTAMP_QUERY,
limits: Default::default(),
};
let (dev, queue) =
Expand Down Expand Up @@ -221,23 +221,23 @@ impl Webgpu {
self.cs_cache.read().contains_key(&name)
}

pub(crate) fn load_shader_module<E>(&self, name: TypeId, source: &[u8])
where
E: HasGlslType,
{
pub(crate) fn load_shader_module<E>(&self, name: TypeId, source: &[u8]) {
// NOTE: The shader is loaded as raw SPIR-V (passthrough) rather than being
// translated by `wgpu`. This is presumably also how atomic operations will
// have to be implemented with `wgpu` -- TODO confirm.
//
// let module = Arc::new(unsafe {
// self.dev.create_shader_module_spirv(&ShaderModuleDescriptorSpirV {
// label: None,
// source: make_spirv_raw(source),
// })
// });
let module = Arc::new(self.dev.create_shader_module(ShaderModuleDescriptor {
label: None,
source: make_spirv(source),
}));

let source = Cow::Owned(make_spirv_raw(source).into_owned());
let module = Arc::new(unsafe {
self.dev
.create_shader_module_spirv(&ShaderModuleDescriptorSpirV {
label: None,
source,
})
});
// let module = Arc::new(self.dev.create_shader_module(ShaderModuleDescriptor {
// label: None,
// source: make_spirv(source),
// }));

#[cfg(not(feature = "no-std"))]
self.cs_cache.write().unwrap().insert(name, module);
#[cfg(feature = "no-std")]
Expand Down
28 changes: 0 additions & 28 deletions dfdx-core/src/tensor_ops/abs/abs.bwd.glsl

This file was deleted.

22 changes: 0 additions & 22 deletions dfdx-core/src/tensor_ops/abs/abs.fwd.glsl

This file was deleted.

9 changes: 3 additions & 6 deletions dfdx-core/src/tensor_ops/abs/webgpu_kernel.rs
Original file line number Diff line number Diff line change
@@ -1,16 +1,13 @@
use super::AbsKernelOp;
use crate::tensor_ops::webgpu_kernels::webgpu_unary;

const GLSL_FWD: &str = include_str!("abs.fwd.glsl");
const GLSL_BWD: &str = include_str!("abs.bwd.glsl");
const SPV_FWD: &[u8] = include_bytes!(concat!(env!("OUT_DIR"), "/abs.fwd.float.spv"));
const SPV_BWD: &[u8] = include_bytes!(concat!(env!("OUT_DIR"), "/abs.bwd.float.spv"));
const SPV_FWD: &[u8] = include_bytes!(env!("abs.spv"));

webgpu_unary!(AbsKernelOp, f32, SPV_FWD, SPV_BWD);
webgpu_unary!(AbsKernelOp, f32, SPV_FWD, "abs_fwd_f32", "abs_bwd_f32");

#[cfg(test)]
mod tests {
use crate::{tensor::*, tensor_ops::*, tests::*};
use crate::{prelude::*, tensor::*, tests::*};

#[test]
fn test_webgpu_abs() {
Expand Down
8 changes: 7 additions & 1 deletion dfdx-core/src/tensor_ops/accurate_gelu/webgpu_kernel.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,10 @@ use crate::prelude::webgpu_kernels::webgpu_unary;

const WGSL: &[u8] = b"TODO";

webgpu_unary!(super::AccurateGeLUKernelOp, f32, WGSL, WGSL);
webgpu_unary!(
super::AccurateGeLUKernelOp,
f32,
WGSL,
"accurate_gelu_fwd_f32",
"accurate_gelu_bwd_f32",
);
8 changes: 7 additions & 1 deletion dfdx-core/src/tensor_ops/add/webgpu_kernel.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,13 @@ use crate::prelude::{

const WGSL: &[u8] = b"TODO";

webgpu_unary!(Scalar<f32>, f32, WGSL, WGSL);
webgpu_unary!(
Scalar<f32>,
f32,
WGSL,
"scalar_add_fwd_f32",
"scalar_add_bwd_f32",
);

impl<E: Dtype> BinaryKernel<super::BinaryAddKernelOp, E> for Webgpu {
const BACKWARD_WITHOUT_DATA: bool = true;
Expand Down
8 changes: 7 additions & 1 deletion dfdx-core/src/tensor_ops/clamp/webgpu_kernel.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,10 @@ use crate::prelude::webgpu_kernels::webgpu_unary;

const WGSL: &[u8] = b"TODO";

webgpu_unary!(super::ClampKernelOp<f32>, f32, WGSL, WGSL);
webgpu_unary!(
super::ClampKernelOp<f32>,
f32,
WGSL,
"clamp_fwd_f32",
"clamp_bwd_f32",
);
2 changes: 1 addition & 1 deletion dfdx-core/src/tensor_ops/cos/webgpu_kernel.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,4 @@ use crate::prelude::webgpu_kernels::webgpu_unary;

const WGSL: &[u8] = b"TODO";

webgpu_unary!(super::CosKernelOp, f32, WGSL, WGSL);
webgpu_unary!(super::CosKernelOp, f32, WGSL, "cos_fwd_f32", "cos_bwd_f32",);
8 changes: 7 additions & 1 deletion dfdx-core/src/tensor_ops/div/webgpu_kernel.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,13 @@ use crate::prelude::{ops::BinaryKernel, webgpu_kernels::webgpu_unary, Dtype, Web

const WGSL: &[u8] = b"TODO";

webgpu_unary!(const_df() Scalar<f32>, f32, WGSL, WGSL);
webgpu_unary!(const_df()
Scalar<f32>,
f32,
WGSL,
"scalar_div_fwd_f32",
"scalar_div_bwd_f32",
);

impl<E: Dtype> BinaryKernel<super::BinaryDivKernelOp, E> for Webgpu {
const BACKWARD_WITHOUT_DATA: bool = true;
Expand Down
2 changes: 1 addition & 1 deletion dfdx-core/src/tensor_ops/exp/webgpu_kernel.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,4 @@ use crate::prelude::webgpu_kernels::webgpu_unary;

const WGSL: &[u8] = b"TODO";

webgpu_unary!(super::ExpKernelOp, f32, WGSL, WGSL);
webgpu_unary!(super::ExpKernelOp, f32, WGSL, "exp_fwd_f32", "exp_bwd_f32",);
8 changes: 7 additions & 1 deletion dfdx-core/src/tensor_ops/fast_gelu/webgpu_kernel.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,10 @@ use crate::prelude::webgpu_kernels::webgpu_unary;

const WGSL: &[u8] = b"TODO";

webgpu_unary!(super::FastGeLUKernelOp, f32, WGSL, WGSL);
webgpu_unary!(
super::FastGeLUKernelOp,
f32,
WGSL,
"fast_gelu_fwd_f32",
"fast_gelu_bwd_f32",
);
2 changes: 1 addition & 1 deletion dfdx-core/src/tensor_ops/ln/webgpu_kernel.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,4 @@ use crate::prelude::webgpu_kernels::webgpu_unary;

const WGSL: &[u8] = b"TODO";

webgpu_unary!(super::LnKernelOp, f32, WGSL, WGSL);
webgpu_unary!(super::LnKernelOp, f32, WGSL, "ln_fwd_f32", "ln_bwd_f32",);
8 changes: 7 additions & 1 deletion dfdx-core/src/tensor_ops/mul/webgpu_kernel.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,13 @@ use crate::prelude::{ops::BinaryKernel, webgpu_kernels::webgpu_unary, Dtype, Web

const WGSL: &[u8] = b"TODO";

webgpu_unary!(const_df() Scalar<f32>, f32, WGSL, WGSL);
webgpu_unary!(const_df()
Scalar<f32>,
f32,
WGSL,
"scalar_mul_fwd_f32",
"scalar_mul_bwd_f32",
);

impl<E: Dtype> BinaryKernel<super::BinaryMulKernelOp, E> for Webgpu {
const BACKWARD_WITHOUT_DATA: bool = true;
Expand Down
8 changes: 7 additions & 1 deletion dfdx-core/src/tensor_ops/nans_to/webgpu_kernel.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,10 @@ use crate::prelude::webgpu_kernels::webgpu_unary;

const WGSL: &[u8] = b"TODO";

webgpu_unary!(NansToKernelOp<f32>, f32, WGSL, WGSL);
webgpu_unary!(
NansToKernelOp<f32>,
f32,
WGSL,
"nans_to_fwd_f32",
"nans_to_bwd_f32",
);
8 changes: 7 additions & 1 deletion dfdx-core/src/tensor_ops/negate/webgpu_kernel.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,10 @@ use crate::prelude::webgpu_kernels::webgpu_unary;

const WGSL: &[u8] = b"TODO";

webgpu_unary!(super::NegateKernelOp, f32, WGSL, WGSL);
webgpu_unary!(
super::NegateKernelOp,
f32,
WGSL,
"negate_fwd_f32",
"negate_bwd_f32",
);
8 changes: 7 additions & 1 deletion dfdx-core/src/tensor_ops/pow/webgpu_kernel.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,13 @@ use crate::prelude::{ops::UnaryKernel, webgpu_kernels::webgpu_unary, Dtype, Webg

const WGSL: &[u8] = b"TODO";

webgpu_unary!(super::PowfKernelOp<f32>, f32, WGSL, WGSL);
webgpu_unary!(
super::PowfKernelOp<f32>,
f32,
WGSL,
"pow_fwd_f32",
"pow_bwd_f32",
);

// TODO: Conflicting implementations of trait `UnaryKernel` for type `Webgpu`:
impl UnaryKernel<super::PowiKernelOp, f32> for Webgpu
Expand Down
8 changes: 7 additions & 1 deletion dfdx-core/src/tensor_ops/recip/webgpu_kernel.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,10 @@ use crate::prelude::webgpu_kernels::webgpu_unary;

const WGSL: &[u8] = b"TODO";

webgpu_unary!(df(f(x)) super::RecipKernelOp, f32, WGSL, WGSL);
webgpu_unary!(df(f(x))
super::RecipKernelOp,
f32,
WGSL,
"recip_fwd_f32",
"recip_bwd_f32",
);
8 changes: 7 additions & 1 deletion dfdx-core/src/tensor_ops/relu/webgpu_kernel.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,10 @@ use crate::prelude::webgpu_kernels::webgpu_unary;

const WGSL: &[u8] = b"TODO";

webgpu_unary!(super::ReLUKernelOp, f32, WGSL, WGSL);
webgpu_unary!(
super::ReLUKernelOp,
f32,
WGSL,
"relu_fwd_f32",
"relu_bwd_f32",
);
Loading

0 comments on commit 701cd7b

Please sign in to comment.