Skip to content

Commit

Permalink
Improve BED and .chrom.sizes parser (#106) [ci full]
Browse files Browse the repository at this point in the history
Support reading files with string fields enclosed by single or double quotes
  • Loading branch information
robomics committed May 11, 2023
1 parent c276349 commit 9cdf681
Show file tree
Hide file tree
Showing 6 changed files with 71 additions and 11 deletions.
2 changes: 2 additions & 0 deletions src/common/include/modle/common/utils.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,8 @@ class LockRangeShared {
LockRangeShared& operator=(LockRangeShared&& other) noexcept = delete;
};

[[nodiscard]] constexpr std::string_view strip_quote_pairs(std::string_view s) noexcept;

} // namespace modle::utils

#include "../../../utils_impl.hpp" // IWYU pragma: export
Expand Down
12 changes: 12 additions & 0 deletions src/common/utils_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -204,6 +204,18 @@ template <class MutexT>
LockRangeShared<MutexT>::~LockRangeShared() noexcept {
  // Walk the stored mutexes front to back and call unlock_shared() on each one.
  for (auto &mtx : this->_mutexes) {
    mtx.unlock_shared();
  }
}

/// Remove one pair of matching quotes enclosing a string.
///
/// @param s  View over the text to inspect. The underlying characters are never modified.
/// @return   A view over s without its first and last character when s is at least two
///           characters long and both characters are the same quote character
///           (either ' or "). In every other case s is returned unchanged.
constexpr std::string_view strip_quote_pairs(std::string_view s) noexcept {
  // A single character cannot be a pair of quotes.
  if (s.size() < 2) {
    return s;
  }

  const char opening = s.front();
  const bool opens_with_quote = opening == '\'' || opening == '"';

  // The closing quote must be the same character as the opening one: a string such as
  // 'abc" is not enclosed by a pair of quotes and must be left untouched.
  if (opens_with_quote && s.back() == opening) {
    s.remove_prefix(1);
    s.remove_suffix(1);
  }
  return s;
}
} // namespace modle::utils

// IWYU pragma: private, include "modle/utils.hpp"
Expand Down
24 changes: 16 additions & 8 deletions src/libmodle_io/bed.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -35,23 +35,31 @@ namespace modle::bed {

std::string RGB::to_string() const { return fmt::to_string(*this); }

// Two colors compare equal only when the r, g and b channels all match.
bool RGB::operator==(const modle::bed::RGB& other) const noexcept {
  if (this->r != other.r) {
    return false;
  }
  if (this->g != other.g) {
    return false;
  }
  return this->b == other.b;
}

// Exact negation of operator==.
bool RGB::operator!=(const modle::bed::RGB& other) const noexcept {
  const bool same_color = *this == other;
  return !same_color;
}

/// Parse the strand token found at position idx of toks and store it in field.
///
/// The token is first passed through utils::strip_quote_pairs, then looked up in
/// bed_strand_encoding.
///
/// @param toks   Tokens of the record being parsed.
/// @param idx    Position of the strand token inside toks.
/// @param field  Destination for the value associated with the token.
/// @throws std::runtime_error when the (unquoted) token is not found in bed_strand_encoding.
void BED::parse_strand_or_throw(const std::vector<std::string_view>& toks, u8 idx, char& field) {
  // Look up and report the unquoted token only, so that quoted and unquoted strands
  // are handled the same way and the error message shows what was actually matched.
  const auto tok = utils::strip_quote_pairs(toks[idx]);
  const auto match = bed_strand_encoding.find(tok);
  if (match == bed_strand_encoding.end()) {
    throw std::runtime_error(fmt::format(FMT_STRING("unrecognized strand \"{}\""), tok));
  }
  field = *match.second;
}

void BED::parse_rgb_or_throw(const std::vector<std::string_view>& toks, u8 idx, RGB& field) {
if (toks[idx] == "0") {
const auto tok = utils::strip_quote_pairs(toks[idx]);
if (tok == "0") {
field = RGB{0, 0, 0};
return;
}
const std::vector<std::string_view> channels = absl::StrSplit(toks[idx], ',');
const std::vector<std::string_view> channels = absl::StrSplit(tok, ',');
if (channels.size() != 3) {
throw std::runtime_error(fmt::format(FMT_STRING("RGB: expected 3 fields, got {}: \"{}\""),
channels.size(), toks[idx]));
throw std::runtime_error(
fmt::format(FMT_STRING("RGB: expected 3 fields, got {}: \"{}\""), channels.size(), tok));
}
utils::parse_numeric_or_throw(channels, 0, field.r);
utils::parse_numeric_or_throw(channels, 1, field.g);
Expand Down Expand Up @@ -118,7 +126,7 @@ void BED::validate_record(const std::vector<std::string_view>& toks, const Diale
}

/// Store the chromosome name of the record in this->chrom.
///
/// The token at BED_CHROM_IDX is passed through utils::strip_quote_pairs before being
/// stored, so the raw token is never assigned on its own.
///
/// @param toks  Tokens of the record being parsed.
void BED::parse_chrom(const std::vector<std::string_view>& toks) {
  this->chrom = utils::strip_quote_pairs(toks[BED_CHROM_IDX]);
}

void BED::parse_chrom_start(const std::vector<std::string_view>& toks) {
Expand All @@ -138,7 +146,7 @@ bool BED::parse_chrom_end(const std::vector<std::string_view>& toks) {

/// Store the name of the record in this->name.
///
/// The token at BED_NAME_IDX is passed through utils::strip_quote_pairs before being
/// stored, so the raw token is never assigned on its own.
///
/// @param toks  Tokens of the record being parsed.
/// @return true when this->_standard equals BED4, false otherwise
///         (presumably tells the caller whether this was the last field to parse - confirm).
bool BED::parse_name(const std::vector<std::string_view>& toks) {
  assert(this->_standard >= BED4);
  this->name = utils::strip_quote_pairs(toks[BED_NAME_IDX]);
  return this->_standard == BED4;
}

Expand Down
8 changes: 5 additions & 3 deletions src/libmodle_io/chrom_sizes.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
#include <vector> // for vector

#include "modle/bed/bed.hpp" // for BED
#include "modle/common/utils.hpp"

namespace modle::chrom_sizes {

Expand All @@ -41,14 +42,15 @@ std::vector<bed::BED> Parser::parse_all(char sep) {
}
DISABLE_WARNING_PUSH
DISABLE_WARNING_NULL_DEREF
if (const auto chrom_name = *splitter.begin(); chrom_names.contains(chrom_name)) {
const auto chrom_name = utils::strip_quote_pairs(*splitter.begin());
if (chrom_names.contains(chrom_name)) {
throw std::runtime_error(
fmt::format(FMT_STRING("found multiple records for chrom \"{}\""), chrom_name));
}
DISABLE_WARNING_POP
chrom_sizes.emplace_back(
fmt::format(FMT_COMPILE("{}\t0\t{}"), *splitter.begin(), *std::next(splitter.begin())),
id++, bed::BED::BED3);
fmt::format(FMT_COMPILE("{}\t0\t{}"), chrom_name, *std::next(splitter.begin())), id++,
bed::BED::BED3);
} catch (const std::runtime_error& e) {
throw std::runtime_error(
fmt::format(FMT_STRING("encountered a malformed record at line {} of file \"{}\": {}.\n "
Expand Down
2 changes: 2 additions & 0 deletions src/libmodle_io/include/bed/modle/bed/bed.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,8 @@ struct RGB {
u8 g;
u8 b;

bool operator==(const RGB& other) const noexcept;
bool operator!=(const RGB& other) const noexcept;
[[nodiscard]] std::string to_string() const;
};

Expand Down
34 changes: 34 additions & 0 deletions test/units/libmodle_io/bed_parser_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,40 @@ TEST_CASE("BED Parser simple", "[parsers][BED][io][short]") {
CHECK(records[0].strand == '+');
}

// NOLINTNEXTLINE(readability-function-cognitive-complexity)
TEST_CASE("BED: strip quotes", "[parsers][BED][io][short]") {
// Checks how bed::BED handles fields wrapped in quotes.
SECTION("valid") {
// Nine tab-separated fields; the 4th, 6th and 9th are wrapped in double quotes.
constexpr std::string_view line{
"chr1\t"
"0\t"
"10\t"
"\"name\"\t"
"0.0\t"
"\"+\"\t"
"0\t"
"1\t"
"\"0,0,0\""};

const bed::BED record(line);
CHECK(record.chrom == "chr1");
CHECK(record.chrom_start == 0);
CHECK(record.chrom_end == 10);
// The quoted fields are expected to come back without their quotes.
CHECK(record.name == "name");
CHECK(record.score == 0.0);
CHECK(record.strand == '+');
CHECK(record.thick_start == 0);
CHECK(record.thick_end == 1);
CHECK(*record.rgb == RGB{});

// A leading quote with no closing counterpart is expected to be kept as part of the value.
CHECK(bed::BED("\"chr1\t0\t1").chrom == "\"chr1");
}

SECTION("invalid") {
// Quoting the 2nd field (parsed into chrom_start above) is expected to throw.
CHECK_THROWS(bed::BED("chr1\t\"0\"\t1"));
// Quoting the 5th field (parsed into score above) is expected to throw.
CHECK_THROWS(bed::BED("chr1\t0\t1\t.\t\"0.0\""));
}
}

// NOLINTNEXTLINE(readability-function-cognitive-complexity)
TEST_CASE("BED Parser simple: BED6 -> BED3", "[parsers][BED][io][short]") {
const auto bed_file = data_dir() / "genomic_intervals" / "intervals.bed6.xz";
Expand Down

0 comments on commit 9cdf681

Please sign in to comment.