From 739007f1483c4cdeffe5121732422b078188ef56 Mon Sep 17 00:00:00 2001 From: re0312 <45868716+re0312@users.noreply.github.com> Date: Wed, 4 Sep 2024 07:41:10 +0800 Subject: [PATCH] Opportunistically use dense iter for archetypal iteration in Par_iter (#14673) # Objective - follow of #14049 ,we could use it on our Parallel Iterator,this pr also unified the used function in both regular iter and parallel iterations. ## Performance ![image](https://github.com/user-attachments/assets/cba700bc-169c-4b58-b504-823bdca8ec05) no performance regression for regular itertaion 3.5X faster in hybrid parallel iteraion,this number is far greater than the benefits obtained in regular iteration(~1.81) because mutable iterations on continuous memory can effectively reduce the cost of mataining core cache coherence --- .../iteration/iter_simple_foreach_hybrid.rs | 2 +- benches/benches/bevy_ecs/iteration/mod.rs | 5 + .../par_iter_simple_foreach_hybrid.rs | 45 +++++++ crates/bevy_ecs/src/query/iter.rs | 112 +++++++++++------- crates/bevy_ecs/src/query/state.rs | 33 +----- 5 files changed, 121 insertions(+), 76 deletions(-) create mode 100644 benches/benches/bevy_ecs/iteration/par_iter_simple_foreach_hybrid.rs diff --git a/benches/benches/bevy_ecs/iteration/iter_simple_foreach_hybrid.rs b/benches/benches/bevy_ecs/iteration/iter_simple_foreach_hybrid.rs index 73eb55cfdbf8e..0db614c9aca44 100644 --- a/benches/benches/bevy_ecs/iteration/iter_simple_foreach_hybrid.rs +++ b/benches/benches/bevy_ecs/iteration/iter_simple_foreach_hybrid.rs @@ -20,7 +20,7 @@ impl<'w> Benchmark<'w> { let mut v = vec![]; for _ in 0..10000 { - world.spawn((TableData(0.0), SparseData(0.0))).id(); + world.spawn((TableData(0.0), SparseData(0.0))); v.push(world.spawn(TableData(0.)).id()); } diff --git a/benches/benches/bevy_ecs/iteration/mod.rs b/benches/benches/bevy_ecs/iteration/mod.rs index baa1bb385bb87..a5dcca441c2fe 100644 --- a/benches/benches/bevy_ecs/iteration/mod.rs +++ b/benches/benches/bevy_ecs/iteration/mod.rs @@ -20,6 +20,7 @@ mod iter_simple_system; mod iter_simple_wide; mod iter_simple_wide_sparse_set; mod par_iter_simple; +mod par_iter_simple_foreach_hybrid; use heavy_compute::*; @@ -135,4 +136,8 @@ fn par_iter_simple(c: &mut Criterion) { b.iter(move || bench.run()); }); } + group.bench_function(format!("hybrid"), |b| { + let mut bench = par_iter_simple_foreach_hybrid::Benchmark::new(); + b.iter(move || bench.run()); + }); } diff --git a/benches/benches/bevy_ecs/iteration/par_iter_simple_foreach_hybrid.rs b/benches/benches/bevy_ecs/iteration/par_iter_simple_foreach_hybrid.rs new file mode 100644 index 0000000000000..e2044c0956287 --- /dev/null +++ b/benches/benches/bevy_ecs/iteration/par_iter_simple_foreach_hybrid.rs @@ -0,0 +1,45 @@ +use bevy_ecs::prelude::*; +use bevy_tasks::{ComputeTaskPool, TaskPool}; +use rand::{prelude::SliceRandom, SeedableRng}; +use rand_chacha::ChaCha8Rng; + +#[derive(Component, Copy, Clone)] +struct TableData(f32); + +#[derive(Component, Copy, Clone)] +#[component(storage = "SparseSet")] +struct SparseData(f32); + +fn deterministic_rand() -> ChaCha8Rng { + ChaCha8Rng::seed_from_u64(42) +} +pub struct Benchmark<'w>(World, QueryState<(&'w mut TableData, &'w SparseData)>); + +impl<'w> Benchmark<'w> { + pub fn new() -> Self { + let mut world = World::new(); + ComputeTaskPool::get_or_init(TaskPool::default); + + let mut v = vec![]; + for _ in 0..100000 { + world.spawn((TableData(0.0), SparseData(0.0))); + v.push(world.spawn(TableData(0.)).id()); + } + + // by shuffling ,randomize the archetype iteration order to significantly deviate from the table order. This maximizes the loss of cache locality during archetype-based iteration. + v.shuffle(&mut deterministic_rand()); + for e in v.into_iter() { + world.entity_mut(e).despawn(); + } + + let query = world.query::<(&mut TableData, &SparseData)>(); + Self(world, query) + } + + #[inline(never)] + pub fn run(&mut self) { + self.1 + .par_iter_mut(&mut self.0) + .for_each(|(mut v1, v2)| v1.0 += v2.0) + } +} diff --git a/crates/bevy_ecs/src/query/iter.rs b/crates/bevy_ecs/src/query/iter.rs index 5c1912b7bcc18..90749bcee963b 100644 --- a/crates/bevy_ecs/src/query/iter.rs +++ b/crates/bevy_ecs/src/query/iter.rs @@ -122,6 +122,67 @@ impl<'w, 's, D: QueryData, F: QueryFilter> QueryIter<'w, 's, D, F> { } } + /// Executes the equivalent of [`Iterator::fold`] over a contiguous segment + /// from an storage. + /// + /// # Safety + /// - `range` must be in `[0, storage::entity_count)` or None. + #[inline] + pub(super) unsafe fn fold_over_storage_range( + &mut self, + mut accum: B, + func: &mut Func, + storage: StorageId, + range: Option>, + ) -> B + where + Func: FnMut(B, D::Item<'w>) -> B, + { + if self.cursor.is_dense { + // SAFETY: `self.cursor.is_dense` is true, so storage ids are guaranteed to be table ids. + let table_id = unsafe { storage.table_id }; + // SAFETY: Matched table IDs are guaranteed to still exist. + let table = unsafe { self.tables.get(table_id).debug_checked_unwrap() }; + + let range = range.unwrap_or(0..table.entity_count()); + accum = + // SAFETY: + // - The fetched table matches both D and F + // - caller ensures `range` is within `[0, table.entity_count)` + // - The if block ensures that the query iteration is dense + unsafe { self.fold_over_table_range(accum, func, table, range) }; + } else { + // SAFETY: `self.cursor.is_dense` is false, so storage ids are guaranteed to be archetype ids. + let archetype_id = unsafe { storage.archetype_id }; + // SAFETY: Matched archetype IDs are guaranteed to still exist. + let archetype = unsafe { self.archetypes.get(archetype_id).debug_checked_unwrap() }; + // SAFETY: Matched table IDs are guaranteed to still exist. + let table = unsafe { self.tables.get(archetype.table_id()).debug_checked_unwrap() }; + + let range = range.unwrap_or(0..archetype.len()); + + // When an archetype and its table have equal entity counts, dense iteration can be safely used. + // this leverages cache locality to optimize performance. + if table.entity_count() == archetype.len() { + accum = + // SAFETY: + // - The fetched archetype matches both D and F + // - The provided archetype and its' table have the same length. + // - caller ensures `range` is within `[0, archetype.len)` + // - The if block ensures that the query iteration is not dense. + unsafe { self.fold_over_dense_archetype_range(accum, func, archetype,range) }; + } else { + accum = + // SAFETY: + // - The fetched archetype matches both D and F + // - caller ensures `range` is within `[0, archetype.len)` + // - The if block ensures that the query iteration is not dense. + unsafe { self.fold_over_archetype_range(accum, func, archetype,range) }; + } + } + accum + } + /// Executes the equivalent of [`Iterator::fold`] over a contiguous segment /// from an table. /// @@ -143,7 +204,7 @@ impl<'w, 's, D: QueryData, F: QueryFilter> QueryIter<'w, 's, D, F> { if table.is_empty() { return accum; } - assert!( + debug_assert!( rows.end <= u32::MAX as usize, "TableRow is only valid up to u32::MAX" ); @@ -267,12 +328,11 @@ impl<'w, 's, D: QueryData, F: QueryFilter> QueryIter<'w, 's, D, F> { if archetype.is_empty() { return accum; } - assert!( + debug_assert!( rows.end <= u32::MAX as usize, "TableRow is only valid up to u32::MAX" ); let table = self.tables.get(archetype.table_id()).debug_checked_unwrap(); - debug_assert!( archetype.len() == table.entity_count(), "archetype and it's table must have the same length. " @@ -1032,48 +1092,10 @@ impl<'w, 's, D: QueryData, F: QueryFilter> Iterator for QueryIter<'w, 's, D, F> accum = func(accum, item); } - if self.cursor.is_dense { - for id in self.cursor.storage_id_iter.clone() { - // SAFETY: `self.cursor.is_dense` is true, so storage ids are guaranteed to be table ids. - let table_id = unsafe { id.table_id }; - // SAFETY: Matched table IDs are guaranteed to still exist. - let table = unsafe { self.tables.get(table_id).debug_checked_unwrap() }; - - accum = - // SAFETY: - // - The fetched table matches both D and F - // - The provided range is equivalent to [0, table.entity_count) - // - The if block ensures that the query iteration is dense - unsafe { self.fold_over_table_range(accum, &mut func, table, 0..table.entity_count()) }; - } - } else { - for id in self.cursor.storage_id_iter.clone() { - // SAFETY: `self.cursor.is_dense` is false, so storage ids are guaranteed to be archetype ids. - let archetype_id = unsafe { id.archetype_id }; - // SAFETY: Matched archetype IDs are guaranteed to still exist. - let archetype = unsafe { self.archetypes.get(archetype_id).debug_checked_unwrap() }; - // SAFETY: Matched table IDs are guaranteed to still exist. - let table = unsafe { self.tables.get(archetype.table_id()).debug_checked_unwrap() }; - - // When an archetype and its table have equal entity counts, dense iteration can be safely used. - // this leverages cache locality to optimize performance. - if table.entity_count() == archetype.len() { - accum = - // SAFETY: - // - The fetched archetype matches both D and F - // - The provided archetype and its' table have the same length. - // - The provided range is equivalent to [0, archetype.len) - // - The if block ensures that the query iteration is not dense. - unsafe { self.fold_over_dense_archetype_range(accum, &mut func, archetype, 0..archetype.len()) }; - } else { - accum = - // SAFETY: - // - The fetched archetype matches both D and F - // - The provided range is equivalent to [0, archetype.len) - // - The if block ensures that the query iteration is not dense. - unsafe { self.fold_over_archetype_range(accum, &mut func, archetype, 0..archetype.len()) }; - } - } + for id in self.cursor.storage_id_iter.clone().copied() { + // SAFETY: + // - The range(None) is equivalent to [0, storage.entity_count) + accum = unsafe { self.fold_over_storage_range(accum, &mut func, id, None) }; } accum } diff --git a/crates/bevy_ecs/src/query/state.rs b/crates/bevy_ecs/src/query/state.rs index b94cb4aa90454..93d71551492d6 100644 --- a/crates/bevy_ecs/src/query/state.rs +++ b/crates/bevy_ecs/src/query/state.rs @@ -1505,25 +1505,7 @@ impl QueryState { let mut iter = self.iter_unchecked_manual(world, last_run, this_run); let mut accum = init_accum(); for storage_id in queue { - if self.is_dense { - let id = storage_id.table_id; - let table = &world.storages().tables.get(id).debug_checked_unwrap(); - accum = iter.fold_over_table_range( - accum, - &mut func, - table, - 0..table.entity_count(), - ); - } else { - let id = storage_id.archetype_id; - let archetype = world.archetypes().get(id).debug_checked_unwrap(); - accum = iter.fold_over_archetype_range( - accum, - &mut func, - archetype, - 0..archetype.len(), - ); - } + accum = iter.fold_over_storage_range(accum, &mut func, storage_id, None); } }); }; @@ -1539,17 +1521,8 @@ impl QueryState { #[cfg(feature = "trace")] let _span = self.par_iter_span.enter(); let accum = init_accum(); - if self.is_dense { - let id = storage_id.table_id; - let table = world.storages().tables.get(id).debug_checked_unwrap(); - self.iter_unchecked_manual(world, last_run, this_run) - .fold_over_table_range(accum, &mut func, table, batch); - } else { - let id = storage_id.archetype_id; - let archetype = world.archetypes().get(id).debug_checked_unwrap(); - self.iter_unchecked_manual(world, last_run, this_run) - .fold_over_archetype_range(accum, &mut func, archetype, batch); - } + self.iter_unchecked_manual(world, last_run, this_run) + .fold_over_storage_range(accum, &mut func, storage_id, Some(batch)); }); } };