From a8aa3302549c04973e72f396a3bdbe746951ba35 Mon Sep 17 00:00:00 2001 From: Matthew Scroggs Date: Wed, 17 Jul 2024 15:24:04 +0100 Subject: [PATCH] add slides --- slides/2024/raz.html | 312 +++++++++++++++++++++++++++++++++++++++++++ talks/raz.yml | 1 + 2 files changed, 313 insertions(+) create mode 100644 slides/2024/raz.html diff --git a/slides/2024/raz.html b/slides/2024/raz.html new file mode 100644 index 0000000..68a5832 --- /dev/null +++ b/slides/2024/raz.html @@ -0,0 +1,312 @@ +
+

Building a compile-time SIMD optimized smoothing filter

+

By Miguel Raz Guzmán Macedo
+Scientific Computing in Rust 2024

+
+
+

The problem

+
  • You want to smooth your time-series finance/particle physics/econometrics data
  • You want to do it fast, like, really fast
  • You want to make sure the data still preserves some of its statistical properties
+
+
+

The solution

+
  • What you want is a Savitzky-Golay filter
  • Or a weighted moving average, rolling window average, convolution...
  • It's basically a fancy dot product with one vector being precomputed (see the sketch below)
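A minimal sketch of that idea (not the talk's implementation; the weights are the classic 5-point quadratic Savitzky-Golay smoothing coefficients):

// Smoothing one sample = dot product of a data window with precomputed weights.
const SG_5_2: [f32; 5] = [-3.0 / 35.0, 12.0 / 35.0, 17.0 / 35.0, 12.0 / 35.0, -3.0 / 35.0];

fn smooth_point(window: &[f32; 5]) -> f32 {
    window.iter().zip(SG_5_2.iter()).map(|(x, w)| x * w).sum()
}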
+
+
+

Why it's juicy

+
  • A dot product has perf opportunities at many levels
  • Wide set of applications
  • The coefficient precomputation lends itself to compile-time shenanigans and speedups
+
+
+

Dot products galore

+

It's just a map-reduce with a binary_op and a plus reduction!

+
  • In Julia, mapreduce(*, +, a, b) - a rough Rust analogue is sketched below
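As a sketch (illustrative only; the function name is made up):

// mapreduce(*, +, a, b): map a binary op (*) over pairs, reduce with +.
fn mapreduce_dot(a: &[f64], b: &[f64]) -> f64 {
    a.iter().zip(b.iter()).fold(0.0, |acc, (x, y)| acc + x * y)
}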
+

But it's also useful for

+ +
+
+

Benchmarks

+

Scipy

+
+# (setup not shown: np is numpy and f is presumably scipy.signal.savgol_filter)
   ...: N = 100_000_000
+#          👆
+   ...: dataf32 = np.arange(1, N, dtype=np.float32)
+   ...: dataf64 = np.arange(1, N, dtype=np.float64)
+   ...: %timeit f(dataf32, 5, 2, mode = "wrap")
+   ...: %timeit f(dataf64, 5, 2, mode = "wrap")
+f32 savgol (5,2) for 100000000 elements
+1.38 s ± 31.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
+# 👆
+f64 savgol (5,2) for 100000000 elements
+1.69 s ± 121 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
+# 👆
+
+
  1. Time isn't scaling with bitwidth -> not memory bound!
  2. All benchmarking issues are mine, please let me know if I've goofed
+
+
+

Julia

+
julia> using BenchmarkTools
+julia> using StagedFilters
+julia> N = 100_000_000;
+julia> data = rand(Float32, N); smoothed = similar(data);
+julia> @btime smooth!($SavitzkyGolayFilter{2,2}, $data, $smoothed);
+  61.839 ms (0 allocations: 0 bytes)
+  # 👆 ~22x faster than Scipy
+julia> dataf64 = rand(Float64, N); smoothedf64 = similar(dataf64);
+julia> @btime smooth!($SavitzkyGolayFilter{2,2}, $dataf64, $smoothedf64);
+  125.020 ms (0 allocations: 0 bytes)
+  # 👆 ~13x faster than Scipy
+
+
+
+

Rust

+
use divan::black_box as bb; // assumed: `bb` is a black-box (e.g. divan::black_box), not shown on the slide

#[divan::bench(sample_size = 3, sample_count = 3)]
+fn savgol_f32() -> f32 {
+    let n = 100_000_000;
+    let v = vec![10.0f32; n];
+    let mut buf = vec![0.0f32; n];
+    sav_gol_f32::<2, 2>(bb(&mut buf), bb(&v));
+    buf[0]
+}
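For context, a divan bench target also needs the usual entry point (standard divan boilerplate, shown as a reminder rather than the talk's exact file):

// benches/divan.rs: collect and run every #[divan::bench] function.
fn main() {
    divan::main();
}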
+
+

Whelp...

+
mrg:~/staged-sg-filter$ RUSTFLAGS="-C target-feature=+fma,+avx2 -C target-cpu=native" cargo bench
+  #                                                   👆 ~2x faster than without
+     Running benches/divan.rs (target/release/deps/divan-a75fca219433bc49)
+Timer precision: 20 ns
+divan          fastest       │ slowest       │ median        │ mean          │ samples │ iters
+├─ savgol_f32  521.4 ms      │ 530.7 ms      │ 526.2 ms      │ 526.1 ms      │ 3       │ 9
+  #             👆 ~2.5x faster than Scipy
+╰─ savgol_f64  1.046 s       │ 1.059 s       │ 1.048 s       │ 1.051 s       │ 3       │ 9
+  #             👆 ~1.6x faster than Scipy
+
+
+
+

Counting cycles

+

Processing 100M elements on a computer with a peak of 1e11 FLOP/s:

Language   f32 time [s]   Elements / ns   ns / Element
Python     1.38           0.072           13.8
Julia      0.061          1.617           0.618
Rust       0.52           0.19            5.21
+
  • Surprising result - I don't know where this difference between Julia and Rust is coming from!
  • Rough count, assuming ~10 FLOPs per element for the (2,2) filter: 100M elements ≈ 1e9 FLOPs ≈ 10 ms at peak, so even the Julia number is ~6x away from that bound
+
+
+

The tight loop

+
vfnmadd132ss  xmm0, xmm8, dword ptr [rcx + 4*r10 - 12] # xmm0 = -(xmm0 * mem) + xmm8
+vfmadd231ss   xmm0, xmm3, dword ptr [rcx + 4*r10 - 8]  # xmm0 = (xmm3 * mem) + xmm0
+vfmadd231ss   xmm0, xmm4, dword ptr [rcx + 4*r10 - 4]  # xmm0 = (xmm4 * mem) + xmm0
+vfmadd231ss   xmm0, xmm5, dword ptr [rcx]              # xmm0 = (xmm5 * mem) + xmm0
+vfnmadd231ss  xmm0, xmm13, dword ptr [rcx + 4*rdi + 8] # xmm0 = -(xmm13 * mem) + xmm0
+
+

Tight assembly! Entering 👩‍🍳 💋 territory

+
  • The dot product is the most intensive part of the code -> expect a series of vfmadds
  • Exercise left to the reader - exploit AVX512 for even more FLOP/s!
+
+
+

Ingredients for an enviable tight loop 🍅🥦🥕

+
  1. Move all uncertainty to compile time: const generics, const coefficients, const fns
  2. Use iterator idioms with mechanical sympathy (avoid mutation)
  3. Use fused multiply-adds and don't forget to set RUSTFLAGS="-C target-cpu=native -C target-feature=+avx2,+fma" cargo run --release
  4. Use tools to take it one small step at a time
     1. a battery of unit tests and cargo asm invocations
+
+
+

Rusty implementation

+
pub fn sav_gol_f32<const WINDOW: usize, const M: usize>(buf: &mut [f32], data: &[f32]) {
+  //                👆 1. Const generics
+    let coeffs = get_coeffs_f32::<WINDOW, M>();
+  //                               👆 1. Const generics
+    let window_size = 2 * WINDOW + 1;
+    let body_size = data.len() - (window_size - 1);
+    buf.iter_mut()
+        .skip(window_size / 2)
+        .zip(data.windows(window_size))
+                  //👆 2. Use iterator idioms (avoid mutation)
+        .take(body_size)
+        .for_each(|(buf, data)| {
+            dot_prod_update_f32(buf, data, coeffs);
+        });
+}
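An illustrative call site (sizes picked arbitrarily): the window half-width and polynomial order travel as const generic parameters.

let data = vec![1.0f32; 1_000];
let mut smoothed = vec![0.0f32; data.len()];
sav_gol_f32::<2, 2>(&mut smoothed, &data);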
+
+
+
+

Const all the things

+
    let coeffs = get_coeffs_f32::<WINDOW, M>();
+
+
  • Unfortunately floating point ops are not yet const, but will be Soon™ (?)
+

Experiments:

+
  • How much can you stretch compile-time shenanigans?
  • nalgebra and friends were not helpful
  • Easiest to just precompute many cases in Julia and copy/paste them with vim into an unholy const table:
+
pub const COEFFS_F32: [[&[f32]; 25]; 10] = ...;
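A lookup along those lines might look like this sketch (the indexing convention here is an assumption for illustration, not the crate's actual scheme):

// Hypothetical: pick the precomputed coefficient slice for polynomial order M
// and window half-width WINDOW; panics if the combination isn't in the table.
pub fn get_coeffs_f32<const WINDOW: usize, const M: usize>() -> &'static [f32] {
    COEFFS_F32[M][2 * WINDOW + 1]
}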
+
+
+
+

Pass the const ball ⚽ to your iterators

+

prefer

+
    buf.iter_mut()
+        .skip(window_size / 2)
+        .zip(data.windows(window_size))
+        .take(body_size)
+        .for_each(|(buf, data)| {
+            dot_prod_update_f32(buf, data, coeffs);
+        });
+
+

to an indexing approach - tracking the mutability of the inner index can wreck the compiler's analysis and make it bail out early (a sketch of the indexing version follows the list below).

+
  • Try using cargo-remark
  • Try figuring out all the places where Rust can take advantage of known trip counts, or even of multiplication by a known constant
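For contrast, a sketch of the indexing-style version being argued against (hypothetical code, same semantics as the iterator version above):

// Indexing version: the bounds checks and the mutable loop index give the
// optimizer more chances to bail out of vectorizing the inner dot product.
pub fn sav_gol_f32_indexed(buf: &mut [f32], data: &[f32], coeffs: &[f32]) {
    let window_size = coeffs.len();
    for i in 0..data.len() - (window_size - 1) {
        let mut acc = 0.0f32;
        for j in 0..window_size {
            acc += data[i + j] * coeffs[j];
        }
        buf[i + window_size / 2] = acc;
    }
}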
+
+
+

Small bites at a time

+

Separating out

+
#[inline]
+pub fn dot_prod_update(buf: &mut f64, data: &[f64], coeffs: &[f64]) {
+    // One output sample = dot product of the data window with the
+    // precomputed coefficients, accumulated with fused multiply-adds.
+    *buf = data
+        .iter()
+        .zip(coeffs.iter())
+        .fold(0.0, |acc, (a, b)| a.mul_add(*b, acc));
+}
+
+

meant I could iterate faster with unit tests and swap out fold for sum, a.mul_add(*b, acc) for other idioms, etc.
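A representative unit test for that helper (a sketch; the expected value is just 1*4 + 2*5 + 3*6 = 32):

#[test]
fn dot_prod_update_matches_hand_computation() {
    let mut out = 0.0;
    dot_prod_update(&mut out, &[1.0, 2.0, 3.0], &[4.0, 5.0, 6.0]);
    assert_eq!(out, 32.0);
}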

+
  • Homework: Try to use an exact-sized iterator and deal with the remainder separately (one possible shape is sketched below)
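One possible shape for that homework (a sketch under the assumption that both slices have equal length, not a worked solution):

// Bulk of the dot product in fixed-size chunks, tail folded in separately.
fn dot_chunked(a: &[f32], b: &[f32]) -> f32 {
    const LANES: usize = 8;
    let split = a.len() - a.len() % LANES;
    let (a_body, a_tail) = a.split_at(split);
    let (b_body, b_tail) = b.split_at(split);
    let mut acc = 0.0f32;
    for (ca, cb) in a_body.chunks_exact(LANES).zip(b_body.chunks_exact(LANES)) {
        for (x, y) in ca.iter().zip(cb) {
            acc = x.mul_add(*y, acc);
        }
    }
    for (x, y) in a_tail.iter().zip(b_tail) {
        acc = x.mul_add(*y, acc);
    }
    acc
}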
+
+
+

Low hanging fruit

+

+// Toy loop: 1.0/2.0 folds to the constant 0.5, and the compile-time-known
+// trip count decides how LLVM unrolls the loop (see the assembly below).
+pub fn div_runtime_loop(x: f32) -> f32 {
+    let mut res = x;
+    for _ in 0..148 {
+        res += 1.0 / 2.0;
+    }
+    res
+}
+
+
+
+

Gives...

+
.LCPI0_0:
+        .long   0x3f000000
+example::div_runtime_loop::haf9d2398cf2f6940:
+        mov     eax, 144
+        movss   xmm1, dword ptr [rip + .LCPI0_0]
+.LBB0_1:
+        addss   xmm0, xmm1
+        addss   xmm0, xmm1
+        addss   xmm0, xmm1
+        addss   xmm0, xmm1
+        addss   xmm0, xmm1
+        test    eax, eax
+        je      .LBB0_3
+        addss   xmm0, xmm1
+        addss   xmm0, xmm1
+        addss   xmm0, xmm1
+        add     eax, -8
+        jmp     .LBB0_1
+.LBB0_3:
+        ret
+
+
+
+

But if the trip count is 149...

+
.LCPI0_0:
+        .long   0x3f000000
+example::div_runtime_loop::haf9d2398cf2f6940:
+        movss   xmm1, dword ptr [rip + .LCPI0_0]
+        addss   xmm0, xmm1
+        addss   xmm0, xmm1
+        addss   xmm0, xmm1
+        addss   xmm0, xmm1
+        addss   xmm0, xmm1
+        addss   xmm0, xmm1
+        addss   xmm0, xmm1
+        addss   xmm0, xmm1
+        addss   xmm0, xmm1
+        addss   xmm0, xmm1
+        ; ... 149 times, very, very bad!
+
+
  • My takeaway: be like a 5-year-old and "lick"/test everything!
+
+
+

Compilers are hard

+

Poke at the link yourself:

+

steve cannon

+
+
+

Thanks to Jubilee

+
+
\ No newline at end of file diff --git a/talks/raz.yml b/talks/raz.yml index f875f53..daf9178 100644 --- a/talks/raz.yml +++ b/talks/raz.yml @@ -9,3 +9,4 @@ speaker: github: miguelraz mastodon: "https://hachyderm.io/@miguelraz_t" duration: short +slides: 2024/raz.html