From ce1ce5b10637e9f0edd36c73d0c8542aeb6f4c59 Mon Sep 17 00:00:00 2001 From: James Stevenson Date: Fri, 12 Jan 2024 15:50:38 -0500 Subject: [PATCH] feat: use wags-tails for data management --- pyproject.toml | 2 +- rust/src/lib.rs | 46 ++++++--------------------------------- src/chainlifter/lifter.py | 42 +++++++++++++++++++++++++++++++++-- 3 files changed, 48 insertions(+), 42 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 1c3739a..089cedb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -24,7 +24,7 @@ classifiers = [ "Intended Audience :: Science/Research", "License :: OSI Approved :: MIT License", ] -dependencies = [] +dependencies = ["wags-tails"] [project.optional-dependencies] test = [ diff --git a/rust/src/lib.rs b/rust/src/lib.rs index d03f30a..bd6e522 100644 --- a/rust/src/lib.rs +++ b/rust/src/lib.rs @@ -1,46 +1,12 @@ //! Provide Rust-based chainfile wrapping classes. use chain::core::{Coordinate, Interval, Strand}; use chainfile as chain; -use directories::BaseDirs; -use pyo3::exceptions::PyValueError; +use pyo3::exceptions::{PyValueError, PyFileNotFoundError}; use pyo3::prelude::*; -use std::env; -use std::fs; use std::fs::File; use std::io::BufReader; use std::path::Path; -fn get_chainfile_dir() -> String { - let env_var_name = "CHAINLIFTER_DATA_DIR"; - if let Ok(value) = env::var(env_var_name) { - return value; - } else if let Some(base_dirs) = BaseDirs::new() { - let data_dir = base_dirs.home_dir(); - let base_chainfile_dir = format!("{}/.local/share/chainlifter", data_dir.display()); - return base_chainfile_dir; - } else { - panic!("Unable to get ChainLifter data directory.") - } -} - -/// Acquire chainfile. -/// TODO: fetch from remote if not available locally, probably via config -/// TODO: throw exceptions if unable to acquire -/// TODO: specify base dir -fn get_chainfile(from_db: &str, to_db: &str) -> String { - let base_chainfile_dir = get_chainfile_dir(); - fs::create_dir_all(base_chainfile_dir.clone()).unwrap(); - let path = format!( - "{}/hg{}ToHg{}.over.chain", - base_chainfile_dir, from_db, to_db - ); - if Path::new(&path).exists() { - path - } else { - "this isn't going to work".to_string() - } -} - /// Define core ChainLifter class to be used by Python interface. /// Effectively just a wrapper on top of the chainfile crate's Machine struct. #[pyclass] @@ -51,14 +17,16 @@ pub struct ChainLifter { #[pymethods] impl ChainLifter { #[new] - pub fn new(from_db: &str, to_db: &str) -> ChainLifter { - let chainfile_name: String = get_chainfile(from_db, to_db); - let data = BufReader::new(File::open(&chainfile_name).unwrap()); + pub fn new(chainfile_path: &str) -> PyResult { + if !Path::new(&chainfile_path).exists() { + return Err(PyFileNotFoundError::new_err("Chainfile doesn't exist")); + } + let data = BufReader::new(File::open(&chainfile_path).unwrap()); let reader = chain::Reader::new(data); let machine = chain::liftover::machine::Builder::default() .try_build_from(reader) .unwrap(); - ChainLifter { machine } + Ok(ChainLifter { machine }) } /// Perform liftover diff --git a/src/chainlifter/lifter.py b/src/chainlifter/lifter.py index ec78e88..6ec2dc5 100644 --- a/src/chainlifter/lifter.py +++ b/src/chainlifter/lifter.py @@ -1,5 +1,12 @@ """Perform chainfile-driven liftover.""" +from pathlib import Path from enum import Enum +from typing import Callable + +import requests +from wags_tails import CustomData, DataSource +from wags_tails.utils.downloads import handle_gzip, download_http +from wags_tails.utils.storage import get_data_dir import chainlifter._core as _core @@ -22,7 +29,38 @@ def __init__(self, from_db: str, to_db: str) -> None: :param from_db: database name, e.g. ``"19"`` :param to_db: database name, e.g. ``"38"`` """ - self._chainlifter = _core.ChainLifter(from_db, to_db) + data_handler = CustomData( + f"chainfile_{from_db}_to_{to_db}", + "chain", + lambda: "", + self._download_function_builder(from_db, to_db), + data_dir=get_data_dir() / "ucsc-chainfile", + ) + file, _ = data_handler.get_latest() + self._chainlifter = _core.ChainLifter(str(file.absolute())) + + @staticmethod + def _download_function_builder(from_db: str, to_db: str) -> Callable: + """Build downloader function for chainfile corresponding to source/destination + params. + + Wags-Tails' custom data handler takes a downloader callback function. We + construct it here, curried with from/to values in the download URL. + + :param from_db: genome lifting from + :param to_db: genome lifting to + :return: Function that downloads appropriate chainfile from UCSC + """ + def _download_data(version: str, file: Path) -> None: + """Download and gunzip chainfile from UCSC. + + :param version: not used + :param file: path to save file to + """ + url = f"https://hgdownload.soe.ucsc.edu/goldenPath/{from_db}/liftOver/{from_db}To{to_db.title()}.over.chain.gz" + download_http(url, file, handler=handle_gzip) + + return _download_data def convert_coordinate( self, chrom: str, pos: int, strand: Strand = Strand.POSITIVE @@ -35,7 +73,7 @@ def convert_coordinate( from chainlifter.lifter import ChainLifter, Strand - lifter = ChainLifter("19", "38") + lifter = ChainLifter("hg19", "hg38") lifter.convert_coordinate("chr7", 140453136, Strand.POSITIVE) # returns [['chr7', '140753336', '+']]