Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[WIP] V022 WebAssembly #3

Draft
wants to merge 5 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 43 additions & 0 deletions .biolib/config.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
biolib_version: 1
modules:
main:
executor: 'bl-python3:*'
path: covid_spike_classification/__main__.py
tracy:
executor: 'bl-emscripten:*'
path: wasm/tracy.wasm
samtools:
executor: 'bl-emscripten:*'
path: wasm/samtools.wasm
bowtie2:
executor: 'bl-emscripten:*'
path: wasm/bowtie2-align-s.wasm
bowtie2-build:
executor: 'bl-emscripten:*'
path: wasm/bowtie2-build-s.wasm
arguments:
-
default_value: ''
description: 'A zip file containing the sequencing files to call variants on:'
key: ''
key_value_separator: ' '
required: true
type: file
-
default_value: ''
description: ''
key: '--stdout'
key_value_separator: ' '
required: true
type: hidden
-
default_value: ab1
description: 'Select which input format to expect:'
key: '--input-format'
key_value_separator: ' '
required: true
type: dropdown
options:
ab1: ab1
fasta: fasta
fastq: fastq
32 changes: 32 additions & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
name: ci

on: push

env:
# The location of your BioLib project in the format <account_handle>/<project_name>
BIOLIB_PROJECT_URI: ssi/covid-spike-classification

jobs:
publish-to-biolib:
name: Publish new version to BioLib
runs-on: ubuntu-latest
# Only changes to the master branch should trigger a push to BioLib
if: github.ref == 'refs/heads/master'

steps:
- name: Checkout
uses: actions/checkout@v2

- name: Setup Python
uses: actions/setup-python@v2
with:
python-version: 3.8

- name: Install BioLib CLI
run: pip3 install pybiolib

- name: Publish new version to BioLib
run: biolib push $BIOLIB_PROJECT_URI
env:
BIOLIB_EMAIL: ${{ secrets.BIOLIB_EMAIL }}
BIOLIB_PASSWORD: ${{ secrets.BIOLIB_PASSWORD }}
7 changes: 2 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@ Using Sanger-sequenced RT-PCR product of the spike protein, this tool should pic
mutations currently tracked and give a table with one row per sample and a yes/no/failed column per
tracked mutation.

This workflow is built and maintained at https://github.com/kblin/covid-spike-classification

## Installation

For now, `covid-spike-classification` is distributed via this git repository.
Expand Down Expand Up @@ -49,8 +51,3 @@ Notably, you can provide the input either as a ZIP file or as a directory, as lo
to run the analysis on.

See also the `--help` output for more detailed usage information.


## License
All code is available under the Apache License version 2, see the
[`LICENSE`](LICENSE) file for details.
2 changes: 1 addition & 1 deletion covid_spike_classification/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.2.2"
__version__ = "0.2.4"
19 changes: 15 additions & 4 deletions covid_spike_classification/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,12 @@
import datetime
import os
import shutil
import sys
import tempfile
from js import BioLib

from .config import CSCConfig
from covid_spike_classification.config import CSCConfig

from .core import (
from covid_spike_classification.core import (
REGIONS,
basecall,
map_reads,
Expand All @@ -23,6 +23,8 @@ def main():
help="A zip file or directory containing the ab1 files to call variants on.")
parser.add_argument("-r", "--reference", default=os.path.join(os.getcwd(), "ref", "NC_045512.fasta"),
help="Reference FASTA file to use (default: %(default)s).")
parser.add_argument("-i", "--input-format", choices=["ab1", "fasta", "fastq"], default="ab1",
help="Select which input format to expect. Choices: %(choices)s. default: %(default)s")
parser.add_argument("-o", "--outdir",
default=datetime.datetime.now().strftime("%Y-%m-%d"),
help="File to write result CSV and fastq files to (default: %(default)s).")
Expand All @@ -34,7 +36,7 @@ def main():
help="Debug mode: Keep bam file around when the parsing crashes")
parser.add_argument("--show-unexpected", action="store_true", default=False,
help="Show unexpected mutations instead of reporting 'no known mutation'")
parser.add_argument("-n", "--name-variants", action="store_true", default=False,
parser.add_argument("-n", "--name-variants", action="store_false", default=True,
help="Add a column naming known variants")
parser.add_argument("-z", "--zip-results", action="store_true", default=False,
help="Create a zipfile from the output directory instead of the output directory.")
Expand All @@ -44,6 +46,14 @@ def main():

os.makedirs(args.outdir, exist_ok=True)

# Build indices
BioLib.call_task(
"bowtie2-build",
b'',
"/ref/NC_045512.fasta /ref/NC_045512.index",
["/ref", "/wasm/bowtie2-build-s.wasm"],
True)

with tempfile.TemporaryDirectory() as tmpdir:
basecall(tmpdir, config)
map_reads(tmpdir, config)
Expand All @@ -55,5 +65,6 @@ def main():
shutil.make_archive(config.outdir, "zip", root_dir=config.outdir)
shutil.rmtree(config.outdir, ignore_errors=True)


if __name__ == "__main__":
main()
8 changes: 8 additions & 0 deletions covid_spike_classification/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
class CSCConfig:
__slots__ = (
'debug',
'_failed',
'input_format',
'name_variants',
'outdir',
'quiet',
Expand All @@ -15,8 +17,14 @@ class CSCConfig:

def __init__(self, **kwargs):
for attr in self.__slots__:
# skip internal slots
if attr.startswith("_"):
continue
setattr(self, attr, kwargs[attr])

# set up internal slots
self._failed = set()

@classmethod
def from_args(cls, namespace):
kwargs = {}
Expand Down
133 changes: 95 additions & 38 deletions covid_spike_classification/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,11 @@
import os
import shutil
import subprocess
import sys
import zipfile
from js import BioLib


from .config import CSCConfig

from Bio.Seq import Seq

Expand Down Expand Up @@ -58,58 +61,105 @@ class BaseDeletedError(RuntimeError):


def basecall(tmpdir, config):
BioLib.set_execution_progress(5)
if config.input_format != "ab1":
return
fastq_dir = os.path.join(tmpdir, "fastqs")
os.makedirs(fastq_dir)

if os.path.isdir(config.reads):
ab1_dir = config.reads
else:
ab1_dir = os.path.join(tmpdir, "ab1s")
os.makedirs(ab1_dir)
with zipfile.ZipFile(config.reads) as sanger_zip:
ab1_files = [finfo for finfo in sanger_zip.infolist() if finfo.filename.endswith(".ab1")]
for sanger_file in ab1_files:
sanger_zip.extract(sanger_file, ab1_dir)

for sanger_file in glob.glob(os.path.join(ab1_dir, "*.ab1")):
ab1_dir = _extract_if_zip(tmpdir, config)

sanger_files = glob.glob(os.path.join(ab1_dir, "*.ab1"))
sanger_files_len = len(sanger_files)
for idx, sanger_file in enumerate(sanger_files):
BioLib.set_execution_progress(5 + (10 * (idx / sanger_files_len)))
base_name = os.path.basename(sanger_file)
cmd = ["tracy", "basecall", "-f", "fastq", "-o", os.path.join(fastq_dir, f"{base_name}.fastq"), sanger_file]
fastq_file = os.path.join(fastq_dir, f"{base_name}.fastq")
print("", file=open(fastq_file, 'w'))

cmd = ["tracy", "basecall", "-f", "fastq", "-o", fastq_file, sanger_file]
kwargs = {}
if config.quiet:
kwargs["stdout"] = subprocess.DEVNULL
kwargs["stderr"] = subprocess.DEVNULL
subprocess.check_call(cmd, **kwargs)
result = BioLib.call_task(cmd[0], b'', cmd[1:], [sanger_file, '/wasm/tracy.wasm', fastq_file], True)

shutil.copytree(fastq_dir, config.outdir, dirs_exist_ok=True)
# copy all content of fastq_dir to config.outdir
for path in os.listdir(fastq_dir):
source_path = os.path.join(fastq_dir, path)
destination_path = os.path.join(config.outdir, path)
if os.path.isdir(source_path):
shutil.copytree(source_path, destination_path)
else:
shutil.copy2(source_path, destination_path)


def map_reads(tmpdir, config):
fastq_dir = os.path.join(tmpdir, "fastqs")

if config.input_format == "ab1":
# fastqs live in the tmpdir
sequence_dir = os.path.join(tmpdir, "fastqs")
file_ending = "fastq"
else:
sequence_dir = _extract_if_zip(tmpdir, config)
file_ending = config.input_format


bam_dir = os.path.join(tmpdir, "bams")
os.makedirs(bam_dir)

# ditch the .fasta file ending
name, _ = os.path.splitext(config.reference)
ref = f"{name}.index"

sam_view_cmd = ["samtools", "view", "-Sb", "-"]
sam_sort_cmd = ["samtools", "sort", "-"]

stderr = subprocess.DEVNULL if config.quiet else None

for fastq_file in glob.glob(os.path.join(fastq_dir, "*.fastq")):
fqs = glob.glob(os.path.join(sequence_dir, f"*.{file_ending}"))
fqs_length = len(fqs)

for idx, fastq_file in enumerate(fqs):
BioLib.set_execution_progress(15 + (25 * (idx / fqs_length)))
base_name = os.path.basename(fastq_file)
bam_file = os.path.join(bam_dir, f"{base_name}.bam")
bai_file = os.path.join(bam_dir, f"{base_name}.bam.bai")
sam_view_result_file = '/sam_view_result.bam'

# Create output files on disk
print("", file=open(sam_view_result_file, 'w'))
print("", file=open(bam_file, 'w'))
print("", file=open(bai_file, 'w'))

sam_view_cmd = ["samtools", "view", "-Sb", "-o", "sam_view_result.bam", "-"]
sam_sort_cmd = ["samtools", "sort", "-o", bam_file, "sam_view_result.bam"]
bowtie_cmd = ["bowtie2", "-x", ref, "--very-sensitive-local", "-U", fastq_file, "--qc-filter"]
if config.input_format == "fasta":
bowtie_cmd.append("-f")
sam_idx_cmd = ["samtools", "index", bam_file]

with open(bam_file, "w") as handle:
bowtie = subprocess.Popen(bowtie_cmd, stdout=subprocess.PIPE, stderr=stderr)
sam_view = subprocess.Popen(sam_view_cmd, stdin=bowtie.stdout, stdout=subprocess.PIPE, stderr=stderr)
sam_sort = subprocess.Popen(sam_sort_cmd, stdin=sam_view.stdout, stdout=handle, stderr=stderr)
sam_sort.wait()

subprocess.check_call(sam_idx_cmd, stderr=stderr)
bowtie = BioLib.call_task(bowtie_cmd[0], "", bowtie_cmd[1:],
["/ref", fastq_file, '/wasm/bowtie2-align-s.wasm'], True)
sam_view = BioLib.call_task(sam_view_cmd[0], bytes(bowtie.stdout), sam_view_cmd[1:],
[sam_view_result_file, '/wasm/samtools.wasm'], True)
sam_sort = BioLib.call_task(sam_sort_cmd[0], "", sam_sort_cmd[1:],
[sam_view_result_file, bam_file, "/wasm/samtools.wasm"], True)
if bowtie.exitcode != 0 or sam_view.exitcode != 0 or sam_sort.exitcode != 0:
config._failed.add(bam_file)
continue
result = BioLib.call_task(sam_idx_cmd[0], "", sam_idx_cmd[1:],
[bam_file, "/wasm/samtools.wasm", bai_file], True)


def _extract_if_zip(tmpdir: str, config: CSCConfig) -> str:
"""Extract the reads from a zipfile if input is indeed a zip file."""
if os.path.isdir(config.reads):
return config.reads
else:
extracted_dir = os.path.join(tmpdir, f"{config.input_format}s")
os.makedirs(extracted_dir)
with zipfile.ZipFile(config.reads) as zip_file:
files = [finfo for finfo in zip_file.infolist() if finfo.filename.endswith(f".{config.input_format}")]
for extract_file in files:
zip_file.extract(extract_file, extracted_dir)
return extracted_dir


def check_variants(tmpdir, config):
Expand All @@ -122,12 +172,22 @@ def check_variants(tmpdir, config):
columns.append("comment")

print(*columns, sep=",", file=outfile)
for bam_file in sorted(glob.glob(os.path.join(bam_dir, "*.bam"))):
bams = sorted(glob.glob(os.path.join(bam_dir, "*.bam")))
bams_length = len(bams)
for idx, bam_file in enumerate(bams):
BioLib.set_execution_progress(40 + (60 * (idx/bams_length)))
base_name = os.path.basename(bam_file)
sample_id = base_name.split(".")[0]
parts = [sample_id]
found_mutations = set()

if bam_file in config._failed:
for variant in variants:
parts.append("NA")
parts.append("read failed to align")
print(*parts, sep=",", file=outfile)
continue

for variant in variants:
region = REGIONS[variant]
try:
Expand Down Expand Up @@ -164,10 +224,10 @@ def check_variants(tmpdir, config):
else:
comment_parts.append(f"possibly found {variant}")

comment += ";".join(comment_parts)
comment += "; ".join(comment_parts)
if "N501Y" in found_mutations and "E484K" in found_mutations:
comment += "; important mutations found"
comment = comment.strip()
if not comment:
comment = "NA"

parts.append(comment)

Expand All @@ -192,9 +252,9 @@ def name_variants(found_mutations):

def call_variant(reference, bam_file, region):
cmd = ["samtools", "mpileup", "-f", reference, "-r", region, bam_file]
mpileup = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.DEVNULL)
result = mpileup.communicate()[0].decode("utf-8")
before, after, quality = parse_pileup(result)
result = BioLib.call_task(cmd[0], b"", cmd[1:],
[bam_file, reference, bam_file+'.bai', "/wasm/samtools.wasm"], True)
before, after, quality = parse_pileup(bytes(result.stdout).decode())

if "*" in after:
raise BaseDeletedError()
Expand Down Expand Up @@ -222,6 +282,3 @@ def parse_pileup(pileup):
quality.append(ord(parts[5])-33)

return before, after, quality



Loading