Skip to content

Commit

Permalink
Merge pull request #58 from marbl/prefix-filter
Browse files Browse the repository at this point in the history
MashMap v3.0.6
  • Loading branch information
bkille authored Jul 10, 2023
2 parents c6978dd + b440e04 commit 4f4df5d
Show file tree
Hide file tree
Showing 8 changed files with 329 additions and 88 deletions.
2 changes: 2 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,8 @@ endif ()
if (${CMAKE_BUILD_TYPE} MATCHES Debug)
set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -O -g")
set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O -g")
add_compile_options(-fsanitize=address)
add_link_options(-fsanitize=address)
else()
# Use all standard-compliant optimizations - always add these:
set (CMAKE_C_FLAGS "${OpenMP_C_FLAGS} ${PIC_FLAG} ${EXTRA_FLAGS}")
Expand Down
1 change: 1 addition & 0 deletions src/common/progress.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
#pragma once

#include <cmath>
#include <iostream>
#include <string>
#include <atomic>
Expand Down
1 change: 1 addition & 0 deletions src/common/utils.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
#pragma once

#include <cstdint>
#include <cmath>
#include <string>

Expand Down
37 changes: 30 additions & 7 deletions src/map/include/base_types.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
#include <tuple>
#include <vector>
#include <chrono>
#include "common/progress.hpp"

namespace skch
{
Expand Down Expand Up @@ -165,6 +166,7 @@ namespace skch

//--for split read mapping

long double kmerComplexity; // Estimated sequence complexity
int n_merged; // how many mappings we've merged into this one
offset_t splitMappingId; // To identify split mappings that are chained
uint8_t discard; // set to 1 for deletion
Expand Down Expand Up @@ -204,10 +206,11 @@ namespace skch
//Container to save copy of kseq object
struct InputSeqContainer
{
seqno_t seqCounter; //sequence counter
offset_t len; //sequence length
std::string seq; //sequence string
std::string seqName; //sequence id
seqno_t seqCounter; //sequence counter
offset_t len; //sequence length
std::string seq; //sequence string
std::string seqName; //sequence id


/*
* @brief constructor
Expand All @@ -216,12 +219,30 @@ namespace skch
* @param[in] len length of sequence
*/
InputSeqContainer(const std::string& s, const std::string& id, seqno_t seqcount)
: seq(s)
, seqName(id)
: seqCounter(seqcount)
, len(s.length())
, seqCounter(seqcount) { }
, seq(s)
, seqName(id) { }
};

struct InputSeqProgContainer : InputSeqContainer
{
using InputSeqContainer::InputSeqContainer;
progress_meter::ProgressMeter& progress; //progress meter (shared)


/*
* @brief constructor
* @param[in] kseq_seq complete read or reference sequence
* @param[in] kseq_id sequence id name
* @param[in] len length of sequence
*/
InputSeqProgContainer(const std::string& s, const std::string& id, seqno_t seqcount, progress_meter::ProgressMeter& pm)
: InputSeqContainer(s, id, seqcount)
, progress(pm) { }
};


//Output type of map function
struct MapModuleOutput
{
Expand All @@ -248,6 +269,8 @@ namespace skch
std::string seqName; //sequence name
MinmerVec minmerTableQuery; //Vector of minmers in the query
MinmerVec seedHits; //Vector of minmers in the reference
int refGroup; //Prefix group of sequence
float kmerComplexity; //Estimated sequence complexity
};
}

Expand Down
37 changes: 35 additions & 2 deletions src/map/include/commonFunc.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -186,9 +186,25 @@ namespace skch {
ankerl::unordered_dense::map<hash_t, MinmerInfo> sketched_vals;
std::vector<hash_t> sketched_heap;
sketched_heap.reserve(sketchSize+1);

// Get distance until last "N"
int ambig_kmer_count = 0;
for (int i = kmerSize - 1; i >= 0; i--)
{
if (seq[i] == 'N')
{
ambig_kmer_count = i+1;
break;
}
}

for(offset_t i = 0; i < len - kmerSize + 1; i++)
{

if (seq[i+kmerSize-1] == 'N')
{
ambig_kmer_count = kmerSize;
}
//Hash kmers
hash_t hashFwd = CommonFunc::getHash(seq + i, kmerSize);
hash_t hashBwd;
Expand All @@ -199,7 +215,7 @@ namespace skch {
hashBwd = std::numeric_limits<hash_t>::max(); //Pick a dummy high value so that it is ignored later

//Consider non-symmetric kmers only
if(hashBwd != hashFwd)
if(hashBwd != hashFwd && ambig_kmer_count == 0)
{
//Take minimum value of kmer and its reverse complement
hash_t currentKmer = std::min(hashFwd, hashBwd);
Expand Down Expand Up @@ -237,6 +253,10 @@ namespace skch {
}
}
}
if (ambig_kmer_count > 0)
{
ambig_kmer_count--;
}
}

minmerIndex.resize(sketched_heap.size());
Expand Down Expand Up @@ -293,6 +313,10 @@ namespace skch {

//if(alphabetSize == 4) //not protein
//CommonFunc::reverseComplement(seq, seqRev.get(), len);

// Get distance until last "N"
int ambig_kmer_count = 0;


for(offset_t i = 0; i < len - kmerSize + 1; i++)
{
Expand Down Expand Up @@ -369,8 +393,12 @@ namespace skch {
Q.pop_front();
}

if (seq[i+kmerSize-1] == 'N')
{
ambig_kmer_count = kmerSize;
}
//Consider non-symmetric kmers only
if(hashBwd != hashFwd)
if(hashBwd != hashFwd && ambig_kmer_count == 0)
{
// Add current hash to window
Q.push_back(std::make_tuple(currentKmer, currentStrand, i));
Expand Down Expand Up @@ -399,6 +427,11 @@ namespace skch {
std::push_heap(heapWindow.begin(), heapWindow.end(), KIHeap_cmp);
}
}
if (ambig_kmer_count > 0)
{
ambig_kmer_count--;
}




Expand Down
Loading

0 comments on commit 4f4df5d

Please sign in to comment.