-
Notifications
You must be signed in to change notification settings - Fork 2.6k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Sequence of 1-to-4 values with prefix sum to support Punycode decoding.
PiperOrigin-RevId: 642696557 Change-Id: Ia6b8e174ddb55e44bd082bf0d81d2f9c53c94016
- Loading branch information
1 parent
17137c0
commit fc76120
Showing
5 changed files
with
283 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,126 @@ | ||
// Copyright 2024 The Abseil Authors | ||
// | ||
// Licensed under the Apache License, Version 2.0 (the "License"); | ||
// you may not use this file except in compliance with the License. | ||
// You may obtain a copy of the License at | ||
// | ||
// https://www.apache.org/licenses/LICENSE-2.0 | ||
// | ||
// Unless required by applicable law or agreed to in writing, software | ||
// distributed under the License is distributed on an "AS IS" BASIS, | ||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
// See the License for the specific language governing permissions and | ||
// limitations under the License. | ||
|
||
#ifndef ABSL_DEBUGGING_INTERNAL_BOUNDED_UTF8_LENGTH_SEQUENCE_H_ | ||
#define ABSL_DEBUGGING_INTERNAL_BOUNDED_UTF8_LENGTH_SEQUENCE_H_ | ||
|
||
#include <cstdint> | ||
|
||
#include "absl/base/config.h" | ||
#include "absl/numeric/bits.h" | ||
|
||
namespace absl { | ||
ABSL_NAMESPACE_BEGIN | ||
namespace debugging_internal { | ||
|
||
// A sequence of up to max_elements integers between 1 and 4 inclusive, whose | ||
// insertion operation computes the sum of all the elements before the insertion | ||
// point. This is useful in decoding Punycode, where one needs to know where in | ||
// a UTF-8 byte stream the n-th code point begins. | ||
// | ||
// BoundedUtf8LengthSequence is async-signal-safe and suitable for use in | ||
// symbolizing stack traces in a signal handler, provided max_elements is not | ||
// improvidently large. For inputs of lengths accepted by the Rust demangler, | ||
// up to a couple hundred code points, InsertAndReturnSumOfPredecessors should | ||
// run in a few dozen clock cycles, on par with the other arithmetic required | ||
// for Punycode decoding. | ||
// | ||
// The template parameter max_elements is the fixed capacity of the sequence. | ||
// It must be a positive multiple of 32 (enforced by a static_assert in the | ||
// class) because each 64-bit word of storage packs 32 two-bit elements. | ||
template <uint32_t max_elements> | ||
class BoundedUtf8LengthSequence { | ||
public: | ||
// Constructs an empty sequence. | ||
BoundedUtf8LengthSequence() = default; | ||
|
||
// Inserts `utf8_length` at position `index`, shifting any existing elements | ||
// at or beyond `index` one position to the right. If the sequence is already | ||
// full, the rightmost element is discarded. | ||
// | ||
// Returns the sum of the elements at positions 0 to `index - 1` inclusive. | ||
// If `index` is greater than the number of elements already inserted, the | ||
// excess positions in the range count 1 apiece. | ||
// | ||
// Out-of-range arguments are clamped rather than rejected: an `index` of | ||
// max_elements or more is treated as max_elements - 1, and a `utf8_length` | ||
// of 0 or more than 4 is treated as 1. | ||
// | ||
// REQUIRES: index < max_elements and 1 <= utf8_length <= 4. | ||
uint32_t InsertAndReturnSumOfPredecessors( | ||
uint32_t index, uint32_t utf8_length) { | ||
// The caller shouldn't pass out-of-bounds inputs, but if it does happen, | ||
// clamp the values and try to continue. If we're being called from a | ||
// signal handler, the last thing we want to do is crash. Emitting | ||
// malformed UTF-8 is a lesser evil. | ||
if (index >= max_elements) index = max_elements - 1; | ||
if (utf8_length == 0 || utf8_length > 4) utf8_length = 1; | ||
|
||
// Locate the two-bit field for position `index`: word_index selects the | ||
// 64-bit word, bit_index is the offset of the field's low bit within that | ||
// word, and ones_bit is a word with only that low bit set. | ||
const uint32_t word_index = index/32; | ||
const uint32_t bit_index = 2 * (index % 32); | ||
const uint64_t ones_bit = uint64_t{1} << bit_index; | ||
|
||
// Compute the sum of predecessors. | ||
// - Each value from 1 to 4 is represented by a bit field with value from | ||
// 0 to 3, so the desired sum is index plus the sum of the | ||
// representations actually stored. | ||
// - For each bit field, a set low bit should contribute 1 to the sum, and | ||
// a set high bit should contribute 2. | ||
// - Another way to say the same thing is that each set bit contributes 1, | ||
// and each set high bit contributes an additional 1. | ||
// - So the sum we want is index + popcount(everything) + popcount(bits in | ||
// odd positions). | ||
const uint64_t odd_bits_mask = 0xaaaaaaaaaaaaaaaa; | ||
// lower_seminibbles_mask covers every bit strictly below the insertion | ||
// field; higher_seminibbles_mask covers the insertion field and above. | ||
const uint64_t lower_seminibbles_mask = ones_bit - 1; | ||
const uint64_t higher_seminibbles_mask = ~lower_seminibbles_mask; | ||
// Start with the predecessors that share a word with the insertion point. | ||
const uint64_t same_word_bits_below_insertion = | ||
rep_[word_index] & lower_seminibbles_mask; | ||
int full_popcount = absl::popcount(same_word_bits_below_insertion); | ||
int odd_popcount = | ||
absl::popcount(same_word_bits_below_insertion & odd_bits_mask); | ||
// Then add the contribution of every whole word below the insertion word. | ||
for (uint32_t j = word_index; j > 0; --j) { | ||
const uint64_t word_below_insertion = rep_[j - 1]; | ||
full_popcount += absl::popcount(word_below_insertion); | ||
odd_popcount += absl::popcount(word_below_insertion & odd_bits_mask); | ||
} | ||
const uint32_t sum_of_predecessors = | ||
index + static_cast<uint32_t>(full_popcount + odd_popcount); | ||
|
||
// Now insert utf8_length's representation, shifting successors up one | ||
// place. | ||
// | ||
// Words above the insertion word are processed from the highest downward, | ||
// so each one picks up the top field of the word below it (>> 62) before | ||
// that lower word is itself modified. The top field of the highest word | ||
// is shifted out and lost, which discards the rightmost element. | ||
for (uint32_t j = max_elements/32 - 1; j > word_index; --j) { | ||
rep_[j] = (rep_[j] << 2) | (rep_[j - 1] >> 62); | ||
} | ||
// Rebuild the insertion word from three parts: the untouched fields below | ||
// the insertion point, the new field holding utf8_length - 1, and the | ||
// fields at or above the insertion point moved up by one field. | ||
rep_[word_index] = | ||
(rep_[word_index] & lower_seminibbles_mask) | | ||
(uint64_t{utf8_length - 1} << bit_index) | | ||
((rep_[word_index] & higher_seminibbles_mask) << 2); | ||
|
||
return sum_of_predecessors; | ||
} | ||
|
||
private: | ||
// If the (32 * i + j)-th element of the represented sequence has the value k | ||
// (0 <= j < 32, 1 <= k <= 4), then bits 2 * j and 2 * j + 1 of rep_[i] | ||
// contain the seminibble (k - 1). | ||
// | ||
// In particular, the zero-initialization of rep_ makes positions not holding | ||
// any inserted element count as 1 in InsertAndReturnSumOfPredecessors. | ||
// | ||
// Example: rep_ = {0xb1, ... the rest zeroes ...} represents the sequence | ||
// (2, 1, 4, 3, ... the rest 1's ...). Constructing the sequence of Unicode | ||
// code points "Àa🂻中" = {U+00C0, U+0061, U+1F0BB, U+4E2D} (among many | ||
// other examples) would yield this value of rep_. | ||
static_assert(max_elements > 0 && max_elements % 32 == 0, | ||
"max_elements must be a positive multiple of 32"); | ||
uint64_t rep_[max_elements/32] = {}; | ||
}; | ||
|
||
} // namespace debugging_internal | ||
ABSL_NAMESPACE_END | ||
} // namespace absl | ||
|
||
#endif // ABSL_DEBUGGING_INTERNAL_BOUNDED_UTF8_LENGTH_SEQUENCE_H_ |
126 changes: 126 additions & 0 deletions
126
absl/debugging/internal/bounded_utf8_length_sequence_test.cc
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,126 @@ | ||
// Copyright 2024 The Abseil Authors | ||
// | ||
// Licensed under the Apache License, Version 2.0 (the "License"); | ||
// you may not use this file except in compliance with the License. | ||
// You may obtain a copy of the License at | ||
// | ||
// https://www.apache.org/licenses/LICENSE-2.0 | ||
// | ||
// Unless required by applicable law or agreed to in writing, software | ||
// distributed under the License is distributed on an "AS IS" BASIS, | ||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
// See the License for the specific language governing permissions and | ||
// limitations under the License. | ||
|
||
#include "absl/debugging/internal/bounded_utf8_length_sequence.h" | ||
|
||
#include <cstdint> | ||
|
||
#include "gtest/gtest.h" | ||
#include "absl/base/config.h" | ||
|
||
namespace absl { | ||
ABSL_NAMESPACE_BEGIN | ||
namespace debugging_internal { | ||
namespace { | ||
|
||
// Stores a length of 1 at position 0, then checks that an insertion at | ||
// position 1 reports that single predecessor as a prefix sum of 1. | ||
TEST(BoundedUtf8LengthSequenceTest, RemembersAValueOfOneCorrectly) { | ||
BoundedUtf8LengthSequence<32> seq; | ||
ASSERT_EQ(seq.InsertAndReturnSumOfPredecessors(0, 1), 0); | ||
EXPECT_EQ(seq.InsertAndReturnSumOfPredecessors(1, 1), 1); | ||
} | ||
|
||
// Stores a length of 2 at position 0, then checks that an insertion at | ||
// position 1 reports a prefix sum of 2. | ||
TEST(BoundedUtf8LengthSequenceTest, RemembersAValueOfTwoCorrectly) { | ||
BoundedUtf8LengthSequence<32> seq; | ||
ASSERT_EQ(seq.InsertAndReturnSumOfPredecessors(0, 2), 0); | ||
EXPECT_EQ(seq.InsertAndReturnSumOfPredecessors(1, 1), 2); | ||
} | ||
|
||
// Stores a length of 3 at position 0, then checks that an insertion at | ||
// position 1 reports a prefix sum of 3. | ||
TEST(BoundedUtf8LengthSequenceTest, RemembersAValueOfThreeCorrectly) { | ||
BoundedUtf8LengthSequence<32> seq; | ||
ASSERT_EQ(seq.InsertAndReturnSumOfPredecessors(0, 3), 0); | ||
EXPECT_EQ(seq.InsertAndReturnSumOfPredecessors(1, 1), 3); | ||
} | ||
|
||
// Stores a length of 4 at position 0, then checks that an insertion at | ||
// position 1 reports a prefix sum of 4. | ||
TEST(BoundedUtf8LengthSequenceTest, RemembersAValueOfFourCorrectly) { | ||
BoundedUtf8LengthSequence<32> seq; | ||
ASSERT_EQ(seq.InsertAndReturnSumOfPredecessors(0, 4), 0); | ||
EXPECT_EQ(seq.InsertAndReturnSumOfPredecessors(1, 1), 4); | ||
} | ||
|
||
// Appends the lengths 1, 4, 2, 3 one after another; each insertion at the end | ||
// must return the running total of everything appended so far | ||
// (0, 1, 1+4, 1+4+2, 1+4+2+3). | ||
TEST(BoundedUtf8LengthSequenceTest, RemembersSeveralAppendedValues) { | ||
BoundedUtf8LengthSequence<32> seq; | ||
ASSERT_EQ(seq.InsertAndReturnSumOfPredecessors(0, 1), 0); | ||
ASSERT_EQ(seq.InsertAndReturnSumOfPredecessors(1, 4), 1); | ||
ASSERT_EQ(seq.InsertAndReturnSumOfPredecessors(2, 2), 5); | ||
ASSERT_EQ(seq.InsertAndReturnSumOfPredecessors(3, 3), 7); | ||
ASSERT_EQ(seq.InsertAndReturnSumOfPredecessors(4, 1), 10); | ||
} | ||
|
||
// Prepends 4, 3, 2, 1 at position 0, leaving the sequence (1, 2, 3, 4), then | ||
// probes the prefix sums from the right end toward the left. Each probe | ||
// inserts a 1 at its own index, which only shifts elements at or beyond that | ||
// index, so the prefix sums checked by the later (lower-index) probes are | ||
// unaffected. | ||
TEST(BoundedUtf8LengthSequenceTest, RemembersSeveralPrependedValues) { | ||
BoundedUtf8LengthSequence<32> seq; | ||
ASSERT_EQ(seq.InsertAndReturnSumOfPredecessors(0, 4), 0); | ||
ASSERT_EQ(seq.InsertAndReturnSumOfPredecessors(0, 3), 0); | ||
ASSERT_EQ(seq.InsertAndReturnSumOfPredecessors(0, 2), 0); | ||
ASSERT_EQ(seq.InsertAndReturnSumOfPredecessors(0, 1), 0); | ||
ASSERT_EQ(seq.InsertAndReturnSumOfPredecessors(4, 1), 10); | ||
ASSERT_EQ(seq.InsertAndReturnSumOfPredecessors(3, 1), 6); | ||
ASSERT_EQ(seq.InsertAndReturnSumOfPredecessors(2, 1), 3); | ||
ASSERT_EQ(seq.InsertAndReturnSumOfPredecessors(1, 1), 1); | ||
} | ||
|
||
// Walks a single 2 from position 0 to position 31 of a 32-element sequence by | ||
// repeatedly inserting 1 at the front. While the 2 sits at a position below | ||
// 31 it is a predecessor of position 31, so the prefix sum there is | ||
// 30 * 1 + 2 = 32. Once the 2 reaches position 31 it is no longer a | ||
// predecessor, and the sum drops to 31. | ||
TEST(BoundedUtf8LengthSequenceTest, RepeatedInsertsShiftValuesOutTheRightEnd) { | ||
BoundedUtf8LengthSequence<32> seq; | ||
ASSERT_EQ(seq.InsertAndReturnSumOfPredecessors(0, 2), 0); | ||
for (uint32_t i = 1; i < 31; ++i) { | ||
ASSERT_EQ(seq.InsertAndReturnSumOfPredecessors(0, 1), 0) | ||
<< "while moving the 2 into position " << i; | ||
ASSERT_EQ(seq.InsertAndReturnSumOfPredecessors(31, 1), 32) | ||
<< "after moving the 2 into position " << i; | ||
} | ||
ASSERT_EQ(seq.InsertAndReturnSumOfPredecessors(0, 1), 0) | ||
<< "while moving the 2 into position 31"; | ||
EXPECT_EQ(seq.InsertAndReturnSumOfPredecessors(31, 1), 31) | ||
<< "after moving the 2 into position 31"; | ||
} | ||
|
||
// Fills positions 0 through 31 (the first storage word) with 2's, then | ||
// inserts twice at position 32 (the second storage word). Both insertions | ||
// must report 32 * 2 = 64, showing that the first word is not disturbed by | ||
// an insertion into the second. | ||
TEST(BoundedUtf8LengthSequenceTest, InsertsIntoWord1LeaveWord0Untouched) { | ||
BoundedUtf8LengthSequence<64> seq; | ||
for (uint32_t i = 0; i < 32; ++i) { | ||
ASSERT_EQ(seq.InsertAndReturnSumOfPredecessors(i, 2), 2 * i) | ||
<< "at index " << i; | ||
} | ||
EXPECT_EQ(seq.InsertAndReturnSumOfPredecessors(32, 1), 64); | ||
EXPECT_EQ(seq.InsertAndReturnSumOfPredecessors(32, 1), 64); | ||
} | ||
|
||
// Places 2, 3, 4 at positions 29, 30, 31 (the top of the first storage word), | ||
// then pushes them two places right so the 3 and 4 cross into the second | ||
// word. The prefix sums at positions 34 and 32 confirm the values kept their | ||
// order and magnitude across the word boundary. | ||
TEST(BoundedUtf8LengthSequenceTest, InsertsIntoWord0ShiftValuesIntoWord1) { | ||
BoundedUtf8LengthSequence<64> seq; | ||
ASSERT_EQ(seq.InsertAndReturnSumOfPredecessors(29, 2), 29); | ||
ASSERT_EQ(seq.InsertAndReturnSumOfPredecessors(30, 3), 31); | ||
ASSERT_EQ(seq.InsertAndReturnSumOfPredecessors(31, 4), 34); | ||
|
||
// Pushing two 1's on the front moves the 3 and 4 into the high word. | ||
ASSERT_EQ(seq.InsertAndReturnSumOfPredecessors(0, 1), 0); | ||
ASSERT_EQ(seq.InsertAndReturnSumOfPredecessors(0, 1), 0); | ||
ASSERT_EQ(seq.InsertAndReturnSumOfPredecessors(34, 1), 31 + 2 + 3 + 4); | ||
ASSERT_EQ(seq.InsertAndReturnSumOfPredecessors(32, 1), 31 + 2); | ||
} | ||
|
||
// Uses a three-word sequence with a 3 at position 31 and a 4 at position 63, | ||
// the last positions of the first and second words. A single insertion at | ||
// the front must carry each of them into the next word (to positions 32 and | ||
// 64). The probes run from high index to low so that each probe's own | ||
// insertion does not move the elements the later probes measure. | ||
TEST(BoundedUtf8LengthSequenceTest, ValuesAreShiftedCorrectlyAmongThreeWords) { | ||
BoundedUtf8LengthSequence<96> seq; | ||
ASSERT_EQ(seq.InsertAndReturnSumOfPredecessors(31, 3), 31); | ||
ASSERT_EQ(seq.InsertAndReturnSumOfPredecessors(63, 4), 62 + 3); | ||
|
||
// This insertion moves both the 3 and the 4 up a word. | ||
ASSERT_EQ(seq.InsertAndReturnSumOfPredecessors(0, 1), 0); | ||
ASSERT_EQ(seq.InsertAndReturnSumOfPredecessors(65, 1), 63 + 3 + 4); | ||
ASSERT_EQ(seq.InsertAndReturnSumOfPredecessors(64, 1), 63 + 3); | ||
ASSERT_EQ(seq.InsertAndReturnSumOfPredecessors(33, 1), 32 + 3); | ||
ASSERT_EQ(seq.InsertAndReturnSumOfPredecessors(32, 1), 32); | ||
} | ||
|
||
} // namespace | ||
} // namespace debugging_internal | ||
ABSL_NAMESPACE_END | ||
} // namespace absl |