diff --git a/ortools/util/aligned_memory.h b/ortools/util/aligned_memory.h
new file mode 100644
index 00000000000..6bbf4aa32ee
--- /dev/null
+++ b/ortools/util/aligned_memory.h
@@ -0,0 +1,101 @@
+// Copyright 2010-2022 Google LLC
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Provides functions and data structures that make it easier to work with
+// aligned memory:
+//
+// - AlignedAllocator, an extension of std::allocator that also takes an
+//   explicit memory alignment parameter. The memory blocks returned by the
+//   allocator are aligned to this number of bytes, i.e. the address of the
+//   beginning of the block will be N * alignment_bytes for some N.
+// - AlignedVector<>, a specialization of std::vector<> that uses the aligned
+//   allocator to create blocks with explicit alignment.
+//
+// - AlignUp and AlignDown are functions that align a pointer to the given
+//   number of bytes.
+
+#ifndef OR_TOOLS_UTIL_ALIGNED_MEMORY_H_
+#define OR_TOOLS_UTIL_ALIGNED_MEMORY_H_
+
+#include <cstddef>
+#include <cstdint>
+#include <vector>
+
+#include "ortools/util/aligned_memory_internal.h"
+
+namespace operations_research {
+
+// Functions for working with pointers and rounding them up and down to a
+// given alignment.
+
+// Returns the nearest greater or equal address that is a multiple of
+// alignment_bytes. When ptr is already aligned to alignment_bytes, returns it
+// unchanged.
+template <size_t alignment_bytes, typename Value>
+inline Value* AlignUp(Value* ptr) {
+  const std::uintptr_t int_ptr = reinterpret_cast<std::uintptr_t>(ptr);
+  const std::uintptr_t misalignment = int_ptr % alignment_bytes;
+  if (misalignment == 0) return ptr;
+  return reinterpret_cast<Value*>(int_ptr - misalignment + alignment_bytes);
+}
+
+// Returns the nearest smaller or equal address that is a multiple of
+// alignment_bytes. When ptr is already aligned to alignment_bytes, returns it
+// unchanged.
+template <size_t alignment_bytes, typename Value>
+inline Value* AlignDown(Value* ptr) {
+  const std::intptr_t int_ptr = reinterpret_cast<std::intptr_t>(ptr);
+  const std::intptr_t misalignment = int_ptr % alignment_bytes;
+  return reinterpret_cast<Value*>(int_ptr - misalignment);
+}
+
+// Returns true when `ptr` is aligned to `alignment_bytes` bytes.
+template <size_t alignment_bytes, typename Value>
+inline bool IsAligned(Value* ptr) {
+  return reinterpret_cast<std::uintptr_t>(ptr) % alignment_bytes == 0;
+}
+
+// Support for aligned containers in STL.
+
+// An allocator that always aligns its memory to `alignment_bytes`.
+template <typename T, size_t alignment_bytes>
+using AlignedAllocator =
+    internal::AllocatorWithAlignment<T, alignment_bytes,
+                                     /*misalignment_bytes=*/0>;
+
+// A version of std::vector whose data() pointer is always aligned to
+// `alignment_bytes`.
+template <typename T, size_t alignment_bytes>
+using AlignedVector = std::vector<T, AlignedAllocator<T, alignment_bytes>>;
+
+// Intentionally misaligned containers for testing correctness and performance
+// of code that may depend on a certain alignment.
+namespace use_only_in_tests {
+
+// A version of AlignedAllocator for testing purposes that adds intentional
+// misalignment. The returned address has the form
+// alignment_bytes * N + misalignment_bytes.
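+//
+// For example, data() of a MisalignedVector<float, /*alignment_bytes=*/16,
+// /*misalignment_bytes=*/4> points to an address of the form 16 * N + 4,
+// which is useful for exercising code paths that must handle unaligned
+// input.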
+template <typename T, size_t alignment_bytes, size_t misalignment_bytes>
+using MisalignedAllocator =
+    internal::AllocatorWithAlignment<T, alignment_bytes, misalignment_bytes>;
+
+// A specialization of std::vector<> that uses MisalignedAllocator with the
+// given parameters.
+template <typename T, size_t alignment_bytes, size_t misalignment_bytes>
+using MisalignedVector =
+    std::vector<T, MisalignedAllocator<T, alignment_bytes,
+                                       misalignment_bytes>>;
+
+}  // namespace use_only_in_tests
+
+}  // namespace operations_research
+
+#endif  // OR_TOOLS_UTIL_ALIGNED_MEMORY_H_
diff --git a/ortools/util/aligned_memory_internal.h b/ortools/util/aligned_memory_internal.h
new file mode 100644
index 00000000000..5ec45661263
--- /dev/null
+++ b/ortools/util/aligned_memory_internal.h
@@ -0,0 +1,74 @@
+// Copyright 2010-2022 Google LLC
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef OR_TOOLS_UTIL_ALIGNED_MEMORY_INTERNAL_H_
+#define OR_TOOLS_UTIL_ALIGNED_MEMORY_INTERNAL_H_
+
+#include <cstdint>
+#include <cstdlib>
+#include <memory>
+
+#include "ortools/base/mathutil.h"
+
+namespace operations_research {
+
+namespace internal {
+
+template <typename T, size_t alignment_bytes, size_t misalignment_bytes>
+struct AllocatorWithAlignment : public std::allocator<T> {
+  // Allocates memory for num_items items of type T. The memory must be freed
+  // using deallocate(); releasing it with free() or `delete` might cause
+  // unexpected behavior when misalignment is used.
+  T* allocate(size_t num_items) {
+    // Having misalignment_bytes >= alignment_bytes is useless, because all
+    // misalignments are equivalent modulo `alignment_bytes`. Disallowing it
+    // allows us to simplify the code below.
+    static_assert(alignment_bytes == 0 ||
+                  misalignment_bytes < alignment_bytes);
+
+    // `std::aligned_alloc(alignment, size)` requires that `size` is a
+    // multiple of `alignment`, and might return a nullptr when this is not
+    // respected. To be safe, we round the number of bytes up to a multiple of
+    // the alignment.
+    const size_t num_required_bytes =
+        misalignment_bytes + num_items * sizeof(T);
+
+    const size_t num_allocated_bytes =
+        MathUtil::RoundUpTo(num_required_bytes, alignment_bytes);
+
+    std::uintptr_t ptr = reinterpret_cast<std::uintptr_t>(
+        std::aligned_alloc(alignment_bytes, num_allocated_bytes));
+    return reinterpret_cast<T*>(ptr + misalignment_bytes);
+  }
+  // A version of allocate() that takes a hint; we just ignore the hint.
+  T* allocate(size_t n, const void*) { return allocate(n); }
+
+  // Frees memory allocated by allocate().
+  void deallocate(T* p, size_t) {
+    std::uintptr_t aligned_pointer =
+        reinterpret_cast<std::uintptr_t>(p) - misalignment_bytes;
+    free(reinterpret_cast<void*>(aligned_pointer));
+  }
+
+  // Rebind must be specialized to produce AllocatorWithAlignment and not
+  // std::allocator. It uses the same alignment and misalignment as its
+  // source.
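+  //
+  // For example, rebinding AllocatorWithAlignment<float, 32, 0> to double
+  // yields AllocatorWithAlignment<double, 32, 0> rather than
+  // std::allocator<double>.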
+  template <typename U>
+  struct rebind {
+    using other =
+        AllocatorWithAlignment<U, alignment_bytes, misalignment_bytes>;
+  };
+};
+
+}  // namespace internal
+
+}  // namespace operations_research
+
+#endif  // OR_TOOLS_UTIL_ALIGNED_MEMORY_INTERNAL_H_
diff --git a/ortools/util/vector_sum.h b/ortools/util/vector_sum.h
new file mode 100644
index 00000000000..debc9df3ec1
--- /dev/null
+++ b/ortools/util/vector_sum.h
@@ -0,0 +1,43 @@
+// Copyright 2010-2022 Google LLC
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Fast summation of arrays (vectors, spans) of numbers.
+//
+// Speed: up to 2x faster than Eigen for float arrays with ~100 elements or
+//   more (as of 2023-05).
+// Precision: better than or comparable to std::accumulate<> on the same value
+//   type. That said, the precision is inferior to precise summation
+//   algorithms such as ::AccurateSum.
+
+#ifndef OR_TOOLS_UTIL_VECTOR_SUM_H_
+#define OR_TOOLS_UTIL_VECTOR_SUM_H_
+
+#include "absl/types/span.h"
+#include "ortools/util/vector_sum_internal.h"
+
+namespace operations_research {
+
+// Computes the sum of `values`, assuming that the first element of `values`
+// is aligned to 16 bytes.
+inline float AlignedVectorSum(absl::Span<const float> values) {
+  return internal::VectorSum<4, 4, /*assume_aligned_at_start=*/true>(values);
+}
+
+// Computes the sum of `values` without making any assumptions about its
+// alignment.
+inline float VectorSum(absl::Span<const float> values) {
+  return internal::VectorSum<4, 4, /*assume_aligned_at_start=*/false>(values);
+}
+
+}  // namespace operations_research
+
+#endif  // OR_TOOLS_UTIL_VECTOR_SUM_H_
diff --git a/ortools/util/vector_sum_internal.h b/ortools/util/vector_sum_internal.h
new file mode 100644
index 00000000000..76b0669ca12
--- /dev/null
+++ b/ortools/util/vector_sum_internal.h
@@ -0,0 +1,175 @@
+// Copyright 2010-2022 Google LLC
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef OR_TOOLS_UTIL_VECTOR_SUM_INTERNAL_H_
+#define OR_TOOLS_UTIL_VECTOR_SUM_INTERNAL_H_
+
+#include <algorithm>
+#include <cstddef>
+#include <cstdint>
+#include <numeric>
+
+#include "absl/base/attributes.h"
+#include "absl/base/optimization.h"
+#include "absl/types/span.h"
+#include "ortools/util/aligned_memory.h"
+
+namespace operations_research {
+namespace internal {
+
+// A contiguous block of memory that contains `size` values of type `Value`;
+// the first value is aligned to `alignment` bytes.
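+//
+// Sum() reduces the block with a balanced tree of additions; e.g. for
+// size = 4 it computes (values[0] + values[2]) + (values[1] + values[3]),
+// which keeps the partial sums independent and easy to vectorize.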
+template <typename Value, size_t size, size_t alignment = sizeof(Value) * size>
+struct alignas(alignment) AlignedBlock {
+  Value values[size] = {};
+
+  Value Sum() const {
+    alignas(alignment) Value sum[size];
+    std::copy(std::begin(values), std::end(values), std::begin(sum));
+    for (int i = size; i > 1; i /= 2) {
+      int middle = i / 2;
+      for (int j = 0; j < middle; ++j) {
+        sum[j] += sum[middle + j];
+      }
+    }
+    return sum[0];
+  }
+};
+
+// In-place addition of two AlignedBlock values. Adds `in` to `out`, storing
+// the result in `out`. Unless something steps in, this compiles into a single
+// `*addp*` instruction.
+template <typename Value, size_t size, size_t alignment>
+void AddInPlace(AlignedBlock<Value, size, alignment>& out,
+                const AlignedBlock<Value, size, alignment>& in) {
+  for (int i = 0; i < size; ++i) {
+    out.values[i] += in.values[i];
+  }
+}
+
+// Computes the sum of `num_blocks` aligned blocks. Proceeds in three phases:
+// 1. Parallel sum with N = `num_blocks_per_iteration` independent block-sized
+//    accumulators. At the end, accumulator i contains partial sums of blocks
+//    i, i + N, i + 2*N, ..., i + M*N, where M is the largest integer such
+//    that i + M*N < num_blocks for all i.
+// 2. Parallel addition of the remaining blocks. All remaining blocks are
+//    added to accumulators 0, ..., num_remaining_blocks - 1.
+// 3. Sum of accumulators: all accumulators are added together. The result is
+//    a single block-sized accumulator that is returned to the caller.
+//
+// The code of the function was specifically tuned for 32-bit floating point
+// values, and works best with block_size = 4 and num_blocks_per_iteration =
+// 4. It will likely work with other types, but may require extra care and
+// possibly different parameters.
+//
+// NOTE(user): As of 2023-04-28, Clang's auto-vectorizer is *very* brittle,
+// and most attempts to reduce the last accumulator from step 3 into a single
+// value prevent the rest of the function from being vectorized. To mitigate
+// this behavior, we return the whole block (which should normally fit into a
+// single vector register). We also mark the function with
+// `ABSL_ATTRIBUTE_NOINLINE` to prevent the inliner from merging this function
+// with any additional code that would prevent vectorization.
+template <size_t num_blocks_per_iteration, typename Value, size_t size,
+          size_t alignment>
+AlignedBlock<Value, size, alignment> ABSL_ATTRIBUTE_NOINLINE AlignedBlockSum(
+    const AlignedBlock<Value, size, alignment>* blocks, size_t num_blocks) {
+  using Block = AlignedBlock<Value, size, alignment>;
+  static_assert(sizeof(Block[2]) == sizeof(Block::values) * 2,
+                "The values in the block are not packed.");
+
+  Block sum[num_blocks_per_iteration];
+
+  const int leftover_blocks = num_blocks % num_blocks_per_iteration;
+  const int packed_blocks = num_blocks - leftover_blocks;
+
+  const Block* aligned_block_end = blocks + packed_blocks;
+
+  // Phase 1: Parallel sum of the bulk of the data.
+  if (packed_blocks >= num_blocks_per_iteration) {
+    std::copy(blocks, blocks + num_blocks_per_iteration, sum);
+  }
+  for (int i = num_blocks_per_iteration; i < packed_blocks;
+       i += num_blocks_per_iteration) {
+    for (int j = 0; j < num_blocks_per_iteration; ++j) {
+      AddInPlace(sum[j], blocks[i + j]);
+    }
+  }
+
+  // Phase 2: Semi-parallel sum of the remaining up to
+  // num_blocks_per_iteration - 1 blocks.
+  for (int i = 0; i < leftover_blocks; ++i) {
+    AddInPlace(sum[i], aligned_block_end[i]);
+  }
+
+  // Phase 3: Reduce the accumulator blocks to a single block.
+  // NOTE(user): When this code is auto-vectorized correctly, the initial
+  // copy is a no-op, and the for loop below translates to
+  // num_blocks_per_iteration - 1 vector add instructions. In 2023-05, I
+  // experimented with other versions (e.g.
+  // using sum[0] as the target or making res a const reference to sum[0]),
+  // but in most cases they broke vectorization of the whole function.
+  AlignedBlock<Value, size, alignment> res = sum[0];
+  for (int i = 1; i < num_blocks_per_iteration; ++i) {
+    AddInPlace(res, sum[i]);
+  }
+
+  return res;
+}
+
+// Computes the sum of the values in `values` by adding
+// `num_blocks_per_iteration` blocks of `block_size` values at a time.
+// By default, the sum does not make any assumptions about the size or
+// alignment of `values`. When the first item of `values` is known to be
+// aligned to `block_size * sizeof(Value)` bytes, `assume_aligned_at_start`
+// can be set to true to save a small amount of time.
+template <size_t block_size, size_t num_blocks_per_iteration,
+          bool assume_aligned_at_start = false, typename Value>
+Value VectorSum(absl::Span<const Value> values) {
+  using Block = AlignedBlock<Value, block_size>;
+  const Value* start_ptr = values.data();
+  const int size = values.size();
+  // With fewer than two blocks, there's not a lot to vectorize, and a simple
+  // sequential sum is usually faster.
+  if (size == 0) return Value{0};
+  if (size < 2 * block_size) {
+    return std::accumulate(start_ptr + 1, start_ptr + size, *start_ptr);
+  }
+
+  if (assume_aligned_at_start) {
+    ABSL_ASSUME(reinterpret_cast<std::uintptr_t>(start_ptr) % alignof(Block) ==
+                0);
+  }
+  const Value* aligned_start_ptr =
+      assume_aligned_at_start ? start_ptr : AlignUp<alignof(Block)>(start_ptr);
+  const Block* blocks = reinterpret_cast<const Block*>(aligned_start_ptr);
+  const Value* end_ptr = start_ptr + size;
+  const Value* aligned_end_ptr = AlignDown<alignof(Block)>(end_ptr);
+  ABSL_ASSUME(aligned_start_ptr <= aligned_end_ptr);
+  const size_t num_blocks = (aligned_end_ptr - aligned_start_ptr) / block_size;
+  ABSL_ASSUME(
+      reinterpret_cast<std::uintptr_t>(aligned_end_ptr) % alignof(Block) == 0);
+
+  Value leading_items_sum{};
+  if (!assume_aligned_at_start) {
+    leading_items_sum = std::accumulate(start_ptr, aligned_start_ptr, Value{});
+  }
+  Block res =
+      AlignedBlockSum<num_blocks_per_iteration>(blocks, num_blocks);
+  Value block_sum = res.Sum();
+  return std::accumulate(aligned_end_ptr, end_ptr,
+                         block_sum + leading_items_sum);
+}
+
+}  // namespace internal
+}  // namespace operations_research
+
+#endif  // OR_TOOLS_UTIL_VECTOR_SUM_INTERNAL_H_
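
Example usage (a minimal sketch; the container name, element count, and values
are arbitrary):

  #include "absl/types/span.h"
  #include "ortools/util/aligned_memory.h"
  #include "ortools/util/vector_sum.h"

  using operations_research::AlignedVector;

  // data() of an AlignedVector<float, 16> is 16-byte aligned, which is
  // exactly what AlignedVectorSum() assumes.
  AlignedVector<float, 16> data(1000, 0.5f);
  float aligned_sum =
      operations_research::AlignedVectorSum(absl::MakeConstSpan(data));

  // For input with unknown alignment, use VectorSum() instead.
  float generic_sum =
      operations_research::VectorSum(absl::MakeConstSpan(data));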