Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adds --keeplength flag to maff-add action. #84

Merged
merged 2 commits into from
Sep 26, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 11 additions & 4 deletions q2_alignment/_mafft.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,8 @@ def run_command(cmd, output_fp, verbose=True):
subprocess.run(cmd, stdout=output_f, check=True)


def _mafft(sequences_fp, alignment_fp, n_threads, parttree, addfragments):
def _mafft(sequences_fp, alignment_fp, n_threads, parttree, addfragments,
keeplength):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
keeplength):
keeplength=False):

Copy link
Member Author

@mikerobeson mikerobeson Aug 22, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I left this as is for now as it is similar to addfragments. I just I just followed that lead of adding False in the mafft function. I can change that it that is the better way to go. :-)

# Save original sequence IDs since long ids (~250 chars) can be truncated
# by mafft. We'll replace the IDs in the aligned sequences file output by
# mafft with the originals.
Expand Down Expand Up @@ -91,9 +92,13 @@ def _mafft(sequences_fp, alignment_fp, n_threads, parttree, addfragments):
if parttree:
cmd += ['--parttree']

if keeplength:
cmd += ['--keeplength']

if alignment_fp is not None:
add_flag = '--addfragments' if addfragments else '--add'
cmd += [add_flag, sequences_fp, alignment_fp]

else:
cmd += [sequences_fp]

Expand Down Expand Up @@ -125,15 +130,17 @@ def mafft(sequences: DNAFASTAFormat,
n_threads: int = 1,
parttree: bool = False) -> AlignedDNAFASTAFormat:
sequences_fp = str(sequences)
return _mafft(sequences_fp, None, n_threads, parttree, False)
return _mafft(sequences_fp, None, n_threads, parttree, False, False)


def mafft_add(alignment: AlignedDNAFASTAFormat,
sequences: DNAFASTAFormat,
n_threads: int = 1,
parttree: bool = False,
addfragments: bool = False) -> AlignedDNAFASTAFormat:
addfragments: bool = False,
keeplength: bool = False) -> AlignedDNAFASTAFormat:
alignment_fp = str(alignment)
sequences_fp = str(sequences)
return _mafft(
sequences_fp, alignment_fp, n_threads, parttree, addfragments)
sequences_fp, alignment_fp, n_threads, parttree, addfragments,
keeplength)
10 changes: 8 additions & 2 deletions q2_alignment/plugin_setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,8 @@
'sequences': FeatureData[Sequence]},
parameters={'n_threads': Threads,
'parttree': Bool,
'addfragments': Bool},
'addfragments': Bool,
'keeplength': Bool},
outputs=[('expanded_alignment', FeatureData[AlignedSequence])],
input_descriptions={'alignment': 'The alignment to which '
'sequences should be added.',
Expand All @@ -60,7 +61,12 @@
'addfragments': 'Optimize for the addition of short sequence '
'fragments (for example, primer or amplicon '
'sequences). If not set, default sequence addition '
'is used.'},
'is used.',
'keeplength': 'If selected, the alignment length will be unchanged. '
'Any added sequence that would otherwise introduce new '
'insertions into the alignment, will have those '
'insertions deleted, to preserve original alignment '
'length.'},
output_descriptions={
'expanded_alignment': 'Alignment containing the provided aligned and '
'unaligned sequences.'},
Expand Down
5 changes: 5 additions & 0 deletions q2_alignment/tests/data/aligned-dna-sequences-2.fasta
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
>aln-seq-1
AGGG-GGC
>aln-seq-2
AGGGTGGC

4 changes: 4 additions & 0 deletions q2_alignment/tests/data/unaligned-dna-sequences-2.fasta
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
>seq-3
AGGTTTTGGC
>seq-4
AGGTTATGGC
59 changes: 59 additions & 0 deletions q2_alignment/tests/test_mafft.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,45 @@ def _prepare_sequence_data(self):

return alignment, sequences, exp

def _prepare_sequence_data_2(self):
# for new alignment using `--keeplength` parameter
sequences_fp = self.get_data_path('unaligned-dna-sequences-2.fasta')
sequences = DNAFASTAFormat(sequences_fp, mode='r')
alignment_fp = self.get_data_path('aligned-dna-sequences-2.fasta')
alignment = AlignedDNAFASTAFormat(alignment_fp, mode='r')
exp = skbio.TabularMSA(
[skbio.DNA('AGGG-GGC',
metadata={'id': 'aln-seq-1', 'description': ''}),
skbio.DNA('AGGGTGGC',
metadata={'id': 'aln-seq-2', 'description': ''}),
skbio.DNA('AGGTTGGC',
metadata={'id': 'seq-3', 'description': ''}),
skbio.DNA('AGGATGGC',
metadata={'id': 'seq-4', 'description': ''})]
)

return alignment, sequences, exp

def _prepare_sequence_data_3(self):
# NOT using `--keeplength` parameter. To compare with
# _prepare_sequence_data_2 output
sequences_fp = self.get_data_path('unaligned-dna-sequences-2.fasta')
sequences = DNAFASTAFormat(sequences_fp, mode='r')
alignment_fp = self.get_data_path('aligned-dna-sequences-2.fasta')
alignment = AlignedDNAFASTAFormat(alignment_fp, mode='r')
exp = skbio.TabularMSA(
[skbio.DNA('AGG--G---GGC',
metadata={'id': 'aln-seq-1', 'description': ''}),
skbio.DNA('AGG--G--TGGC',
metadata={'id': 'aln-seq-2', 'description': ''}),
skbio.DNA('AGG--TTTTGGC',
metadata={'id': 'seq-3', 'description': ''}),
skbio.DNA('AGGTTA--TGGC',
metadata={'id': 'seq-4', 'description': ''})]
)

return alignment, sequences, exp

def test_mafft_add(self):
alignment, sequences, exp = self._prepare_sequence_data()

Expand All @@ -122,6 +161,26 @@ def test_mafft_add_fragments(self):
constructor=skbio.DNA)
self.assertEqual(obs, exp)

def test_mafft_add_fragments_keeplength(self):
alignment, sequences, exp = self._prepare_sequence_data_2()

with redirected_stdio(stderr=os.devnull):
result = mafft_add(alignment, sequences, addfragments=True,
keeplength=True)
obs = skbio.io.read(str(result), into=skbio.TabularMSA,
constructor=skbio.DNA)
self.assertEqual(obs, exp)

def test_mafft_add_fragments_no_keeplength(self):
alignment, sequences, exp = self._prepare_sequence_data_3()

with redirected_stdio(stderr=os.devnull):
result = mafft_add(alignment, sequences, addfragments=True,
keeplength=False)
obs = skbio.io.read(str(result), into=skbio.TabularMSA,
constructor=skbio.DNA)
self.assertEqual(obs, exp)

def test_mafft_add_flags(self):
alignment, sequences, exp = self._prepare_sequence_data()

Expand Down
Loading