I've got abs working with f32 for rust-gpu and raw SPIR-V

coreylowman · Dec 27, 2023 · 701cd7b
1 parent c1b440b
commit 701cd7b
Show file tree

Hide file tree

Showing 32 changed files with 305 additions and 305 deletions.
diff --git a/Cargo.toml b/Cargo.toml
@@ -1,5 +1,10 @@
 [workspace]
-members = ["dfdx-core", "dfdx-derives", "dfdx"]
+members = [
+    "dfdx-core", 
+    "dfdx-derives", 
+    "dfdx", 
+    "dfdx-core/wgpu_kernels/abs"
+]
 resolver = "2"
 
 [workspace.dependencies]
@@ -8,4 +13,4 @@ safetensors = { version = "0.4.0", default-features = false }
 memmap2 = { version = "0.9.0", default-features = false }
 rand = { version = "0.8.5", default-features = false, features = ["std_rng"] }
 rand_distr = { version = "0.4.3", default-features = false }
-libm = "0.2.8"
+libm = "0.2.8"
diff --git a/dfdx-core/Cargo.toml b/dfdx-core/Cargo.toml
@@ -50,9 +50,10 @@ indicatif = "0.17.3"
 
 [build-dependencies]
 glob = { version = "0.3.1", optional = true }
+spirv-builder = { version = "0.9.0", optional = true }
 
 [features]
-default = ["std", "fast-alloc", "cpu"]
+default = ["std", "fast-alloc", "webgpu"]
 nightly = ["half?/use-intrinsics", "gemm?/nightly"]
 
 std = ["cudarc?/std", "rand_distr/std_math", "gemm?/std"]
@@ -69,6 +70,7 @@ webgpu = [
     "dep:thingbuf", 
     "dep:naga", 
     "dep:glob",
+    "dep:spirv-builder",
     "wgpu/expose-ids",
 ]
 

diff --git a/dfdx-core/build.rs b/dfdx-core/build.rs
@@ -217,48 +217,11 @@ mod cuda {
 #[cfg(feature = "webgpu")]
 mod webgpu {
     pub fn build_spv() {
-        let out_dir = std::env::var("OUT_DIR").unwrap();
-        let kernel_paths: Vec<std::path::PathBuf> = glob::glob("src/**/*.glsl")
-            .unwrap()
-            .map(|p| p.unwrap())
-            .collect();
-        for path in &kernel_paths {
-            println!("cargo:rerun-if-changed={}", path.display());
-        }
-
-        kernel_paths
-            .iter()
-            .for_each(|p| println!("cargo:rerun-if-changed={}", p.display()));
-
-        let children = kernel_paths
-                .iter()
-                .map(|p| {
-                    // TODO: we need to build this for both float and double
-                    let out_path: std::path::PathBuf = out_dir.clone().into();
-                    let base = p.file_stem().unwrap();
-                    let new_name = format!("{}.float.spv", base.to_str().unwrap());
-                    let out_file = &out_path.join(new_name);
-                    eprintln!("out_file: {:?}", out_file);
-                    std::process::Command::new("glslc")
-                        .args(["-std=460core"])
-                        .args(["-fshader-stage=compute"])
-                        .args(["-DTYPENAME=float"])
-                        .args(["-o", &out_file.as_os_str().to_str().unwrap()])
-                        .arg(p)
-                        .stdout(std::process::Stdio::piped())
-                        .stderr(std::process::Stdio::piped())
-                        .spawn()
-                        .expect("glslc failed to start. Ensure that you have shaderc installed and that `glslc` is in your PATH.")
-                })
-                .collect::<Vec<_>>();
-        for (kernel_path, child) in kernel_paths.iter().zip(children.into_iter()) {
-            let output = child.wait_with_output().expect("glslc failed to run. Ensure that you have shaderc installed and that `glslc` is in your PATH.");
-            assert!(
-                output.status.success(),
-                "glslc error while compiling {kernel_path:?}:\n\n# stdout\n{:#}\n\n# stderr\n{:#}",
-                String::from_utf8_lossy(&output.stdout),
-                String::from_utf8_lossy(&output.stderr)
-            );
+        for kernel in std::fs::read_dir("wgpu_kernels").expect("Error finding kernels folder") {
+            let path = kernel.expect("Invalid path in kernels folder").path();
+            spirv_builder::SpirvBuilder::new(path, "spirv-unknown-vulkan1.1")
+                .build()
+                .expect("Kernel failed to compile");
         }
     }
 }
diff --git a/dfdx-core/src/tensor/webgpu/device.rs b/dfdx-core/src/tensor/webgpu/device.rs
@@ -6,7 +6,6 @@ use wgpu::{
 };
 
 use crate::{
-    prelude::webgpu_kernels::HasGlslType,
     shapes::{Shape, Unit},
     tensor::{
         cache::TensorCache, cpu::Cpu, Cache, Error, NoneTape, RandomU64, Storage, Synchronize,
@@ -21,6 +20,7 @@ use core::any::TypeId;
 #[cfg(not(feature = "no-std"))]
 use std::sync::{Mutex, RwLock};
 
+use std::borrow::Cow;
 use std::{collections::HashMap, marker::PhantomData, sync::Arc, vec::Vec};
 
 use super::allocate::round_to_buffer_alignment;
@@ -145,7 +145,7 @@ impl Webgpu {
         let adapter = Arc::new(adapter);
         let descriptor = DeviceDescriptor {
             label: None,
-            features: Features::default() | Features::SPIRV_SHADER_PASSTHROUGH,
+            features: Features::SPIRV_SHADER_PASSTHROUGH | Features::TIMESTAMP_QUERY,
             limits: Default::default(),
         };
         let (dev, queue) =
@@ -221,23 +221,23 @@ impl Webgpu {
         self.cs_cache.read().contains_key(&name)
     }
 
-    pub(crate) fn load_shader_module<E>(&self, name: TypeId, source: &[u8])
-    where
-        E: HasGlslType,
-    {
+    pub(crate) fn load_shader_module<E>(&self, name: TypeId, source: &[u8]) {
         // TODO: Get raw SpirV working. I am guessing that is how we are going
         // to have to implement atomic stuff with `wgpu`.
-        //
-        // let module = Arc::new(unsafe {
-        //     self.dev.create_shader_module_spirv(&ShaderModuleDescriptorSpirV {
-        //         label: None,
-        //         source: make_spirv_raw(source),
-        //     })
-        // });
-        let module = Arc::new(self.dev.create_shader_module(ShaderModuleDescriptor {
-            label: None,
-            source: make_spirv(source),
-        }));
+
+        let source = Cow::Owned(make_spirv_raw(source).into_owned());
+        let module = Arc::new(unsafe {
+            self.dev
+                .create_shader_module_spirv(&ShaderModuleDescriptorSpirV {
+                    label: None,
+                    source,
+                })
+        });
+        // let module = Arc::new(self.dev.create_shader_module(ShaderModuleDescriptor {
+        //     label: None,
+        //     source: make_spirv(source),
+        // }));
+
         #[cfg(not(feature = "no-std"))]
         self.cs_cache.write().unwrap().insert(name, module);
         #[cfg(feature = "no-std")]

diff --git a/dfdx-core/src/tensor_ops/abs/abs.bwd.glsl b/dfdx-core/src/tensor_ops/abs/abs.bwd.glsl
diff --git a/dfdx-core/src/tensor_ops/abs/abs.fwd.glsl b/dfdx-core/src/tensor_ops/abs/abs.fwd.glsl
diff --git a/dfdx-core/src/tensor_ops/abs/webgpu_kernel.rs b/dfdx-core/src/tensor_ops/abs/webgpu_kernel.rs
@@ -1,16 +1,13 @@
 use super::AbsKernelOp;
 use crate::tensor_ops::webgpu_kernels::webgpu_unary;
 
-const GLSL_FWD: &str = include_str!("abs.fwd.glsl");
-const GLSL_BWD: &str = include_str!("abs.bwd.glsl");
-const SPV_FWD: &[u8] = include_bytes!(concat!(env!("OUT_DIR"), "/abs.fwd.float.spv"));
-const SPV_BWD: &[u8] = include_bytes!(concat!(env!("OUT_DIR"), "/abs.bwd.float.spv"));
+const SPV_FWD: &[u8] = include_bytes!(env!("abs.spv"));
 
-webgpu_unary!(AbsKernelOp, f32, SPV_FWD, SPV_BWD);
+webgpu_unary!(AbsKernelOp, f32, SPV_FWD, "abs_fwd_f32", "abs_bwd_f32");
 
 #[cfg(test)]
 mod tests {
-    use crate::{tensor::*, tensor_ops::*, tests::*};
+    use crate::{prelude::*, tensor::*, tests::*};
 
     #[test]
     fn test_webgpu_abs() {

diff --git a/dfdx-core/src/tensor_ops/accurate_gelu/webgpu_kernel.rs b/dfdx-core/src/tensor_ops/accurate_gelu/webgpu_kernel.rs
@@ -2,4 +2,10 @@ use crate::prelude::webgpu_kernels::webgpu_unary;
 
 const WGSL: &[u8] = b"TODO";
 
-webgpu_unary!(super::AccurateGeLUKernelOp, f32, WGSL, WGSL);
+webgpu_unary!(
+    super::AccurateGeLUKernelOp,
+    f32,
+    WGSL,
+    "accurate_gelu_fwd_f32",
+    "accurate_gelu_bwd_f32",
+);
diff --git a/dfdx-core/src/tensor_ops/add/webgpu_kernel.rs b/dfdx-core/src/tensor_ops/add/webgpu_kernel.rs
@@ -9,7 +9,13 @@ use crate::prelude::{
 
 const WGSL: &[u8] = b"TODO";
 
-webgpu_unary!(Scalar<f32>, f32, WGSL, WGSL);
+webgpu_unary!(
+    Scalar<f32>,
+    f32,
+    WGSL,
+    "scalar_add_fwd_f32",
+    "scalar_add_bwd_f32",
+);
 
 impl<E: Dtype> BinaryKernel<super::BinaryAddKernelOp, E> for Webgpu {
     const BACKWARD_WITHOUT_DATA: bool = true;

diff --git a/dfdx-core/src/tensor_ops/clamp/webgpu_kernel.rs b/dfdx-core/src/tensor_ops/clamp/webgpu_kernel.rs
@@ -2,4 +2,10 @@ use crate::prelude::webgpu_kernels::webgpu_unary;
 
 const WGSL: &[u8] = b"TODO";
 
-webgpu_unary!(super::ClampKernelOp<f32>, f32, WGSL, WGSL);
+webgpu_unary!(
+    super::ClampKernelOp<f32>,
+    f32,
+    WGSL,
+    "clamp_fwd_f32",
+    "clamp_bwd_f32",
+);
diff --git a/dfdx-core/src/tensor_ops/cos/webgpu_kernel.rs b/dfdx-core/src/tensor_ops/cos/webgpu_kernel.rs
@@ -2,4 +2,4 @@ use crate::prelude::webgpu_kernels::webgpu_unary;
 
 const WGSL: &[u8] = b"TODO";
 
-webgpu_unary!(super::CosKernelOp, f32, WGSL, WGSL);
+webgpu_unary!(super::CosKernelOp, f32, WGSL, "cos_fwd_f32", "cos_bwd_f32",);
diff --git a/dfdx-core/src/tensor_ops/div/webgpu_kernel.rs b/dfdx-core/src/tensor_ops/div/webgpu_kernel.rs
@@ -5,7 +5,13 @@ use crate::prelude::{ops::BinaryKernel, webgpu_kernels::webgpu_unary, Dtype, Web
 
 const WGSL: &[u8] = b"TODO";
 
-webgpu_unary!(const_df() Scalar<f32>, f32, WGSL, WGSL);
+webgpu_unary!(const_df()
+    Scalar<f32>,
+    f32,
+    WGSL,
+    "scalar_div_fwd_f32",
+    "scalar_div_bwd_f32",
+);
 
 impl<E: Dtype> BinaryKernel<super::BinaryDivKernelOp, E> for Webgpu {
     const BACKWARD_WITHOUT_DATA: bool = true;

diff --git a/dfdx-core/src/tensor_ops/exp/webgpu_kernel.rs b/dfdx-core/src/tensor_ops/exp/webgpu_kernel.rs
@@ -2,4 +2,4 @@ use crate::prelude::webgpu_kernels::webgpu_unary;
 
 const WGSL: &[u8] = b"TODO";
 
-webgpu_unary!(super::ExpKernelOp, f32, WGSL, WGSL);
+webgpu_unary!(super::ExpKernelOp, f32, WGSL, "exp_fwd_f32", "exp_bwd_f32",);
diff --git a/dfdx-core/src/tensor_ops/fast_gelu/webgpu_kernel.rs b/dfdx-core/src/tensor_ops/fast_gelu/webgpu_kernel.rs
@@ -2,4 +2,10 @@ use crate::prelude::webgpu_kernels::webgpu_unary;
 
 const WGSL: &[u8] = b"TODO";
 
-webgpu_unary!(super::FastGeLUKernelOp, f32, WGSL, WGSL);
+webgpu_unary!(
+    super::FastGeLUKernelOp,
+    f32,
+    WGSL,
+    "fast_gelu_fwd_f32",
+    "fast_gelu_bwd_f32",
+);
diff --git a/dfdx-core/src/tensor_ops/ln/webgpu_kernel.rs b/dfdx-core/src/tensor_ops/ln/webgpu_kernel.rs
@@ -2,4 +2,4 @@ use crate::prelude::webgpu_kernels::webgpu_unary;
 
 const WGSL: &[u8] = b"TODO";
 
-webgpu_unary!(super::LnKernelOp, f32, WGSL, WGSL);
+webgpu_unary!(super::LnKernelOp, f32, WGSL, "ln_fwd_f32", "ln_bwd_f32",);
diff --git a/dfdx-core/src/tensor_ops/mul/webgpu_kernel.rs b/dfdx-core/src/tensor_ops/mul/webgpu_kernel.rs
@@ -5,7 +5,13 @@ use crate::prelude::{ops::BinaryKernel, webgpu_kernels::webgpu_unary, Dtype, Web
 
 const WGSL: &[u8] = b"TODO";
 
-webgpu_unary!(const_df() Scalar<f32>, f32, WGSL, WGSL);
+webgpu_unary!(const_df()
+    Scalar<f32>,
+    f32,
+    WGSL,
+    "scalar_mul_fwd_f32",
+    "scalar_mul_bwd_f32",
+);
 
 impl<E: Dtype> BinaryKernel<super::BinaryMulKernelOp, E> for Webgpu {
     const BACKWARD_WITHOUT_DATA: bool = true;

diff --git a/dfdx-core/src/tensor_ops/nans_to/webgpu_kernel.rs b/dfdx-core/src/tensor_ops/nans_to/webgpu_kernel.rs
@@ -3,4 +3,10 @@ use crate::prelude::webgpu_kernels::webgpu_unary;
 
 const WGSL: &[u8] = b"TODO";
 
-webgpu_unary!(NansToKernelOp<f32>, f32, WGSL, WGSL);
+webgpu_unary!(
+    NansToKernelOp<f32>,
+    f32,
+    WGSL,
+    "nans_to_fwd_f32",
+    "nans_to_bwd_f32",
+);
diff --git a/dfdx-core/src/tensor_ops/negate/webgpu_kernel.rs b/dfdx-core/src/tensor_ops/negate/webgpu_kernel.rs
@@ -2,4 +2,10 @@ use crate::prelude::webgpu_kernels::webgpu_unary;
 
 const WGSL: &[u8] = b"TODO";
 
-webgpu_unary!(super::NegateKernelOp, f32, WGSL, WGSL);
+webgpu_unary!(
+    super::NegateKernelOp,
+    f32,
+    WGSL,
+    "negate_fwd_f32",
+    "negate_bwd_f32",
+);
diff --git a/dfdx-core/src/tensor_ops/pow/webgpu_kernel.rs b/dfdx-core/src/tensor_ops/pow/webgpu_kernel.rs
@@ -4,7 +4,13 @@ use crate::prelude::{ops::UnaryKernel, webgpu_kernels::webgpu_unary, Dtype, Webg
 
 const WGSL: &[u8] = b"TODO";
 
-webgpu_unary!(super::PowfKernelOp<f32>, f32, WGSL, WGSL);
+webgpu_unary!(
+    super::PowfKernelOp<f32>,
+    f32,
+    WGSL,
+    "pow_fwd_f32",
+    "pow_bwd_f32",
+);
 
 // TODO: Conflicting implementations of trait `UnaryKernel` for type `Webgpu`:
 impl UnaryKernel<super::PowiKernelOp, f32> for Webgpu

diff --git a/dfdx-core/src/tensor_ops/recip/webgpu_kernel.rs b/dfdx-core/src/tensor_ops/recip/webgpu_kernel.rs
@@ -2,4 +2,10 @@ use crate::prelude::webgpu_kernels::webgpu_unary;
 
 const WGSL: &[u8] = b"TODO";
 
-webgpu_unary!(df(f(x)) super::RecipKernelOp, f32, WGSL, WGSL);
+webgpu_unary!(df(f(x))
+    super::RecipKernelOp,
+    f32,
+    WGSL,
+    "recip_fwd_f32",
+    "recip_bwd_f32",
+);
diff --git a/dfdx-core/src/tensor_ops/relu/webgpu_kernel.rs b/dfdx-core/src/tensor_ops/relu/webgpu_kernel.rs
@@ -2,4 +2,10 @@ use crate::prelude::webgpu_kernels::webgpu_unary;
 
 const WGSL: &[u8] = b"TODO";
 
-webgpu_unary!(super::ReLUKernelOp, f32, WGSL, WGSL);
+webgpu_unary!(
+    super::ReLUKernelOp,
+    f32,
+    WGSL,
+    "relu_fwd_f32",
+    "relu_bwd_f32",
+);