From eced99eee12c4a247ffcb107d530396f5622daa4 Mon Sep 17 00:00:00 2001 From: James Stevenson Date: Thu, 30 May 2024 15:38:56 -0400 Subject: [PATCH 1/6] fix: don't configure logging on library import --- src/disease/__init__.py | 10 --------- src/disease/cli.py | 33 +++++++++++++++++++++++------- src/disease/database/dynamodb.py | 1 - src/disease/database/postgresql.py | 23 +++++++++++---------- 4 files changed, 38 insertions(+), 29 deletions(-) diff --git a/src/disease/__init__.py b/src/disease/__init__.py index 35b1dbf..92f5785 100644 --- a/src/disease/__init__.py +++ b/src/disease/__init__.py @@ -1,20 +1,10 @@ """The VICC library for normalizing diseases.""" -import logging from pathlib import Path from .version import __version__ # noqa: F401 APP_ROOT = Path(__file__).resolve().parents[0] -# set up logging -logging.basicConfig( - filename="disease.log", - format="[%(asctime)s] - %(name)s - %(levelname)s : %(message)s", -) -logger = logging.getLogger("disease") -logger.setLevel(logging.DEBUG) -logger.handlers = [] - from disease.schemas import ( # noqa: E402 NamespacePrefix, diff --git a/src/disease/cli.py b/src/disease/cli.py index c9b4f57..02f2861 100644 --- a/src/disease/cli.py +++ b/src/disease/cli.py @@ -1,4 +1,5 @@ """Provides a CLI util to make updates to normalizer database.""" +import logging import os from pathlib import Path from timeit import default_timer as timer @@ -6,7 +7,7 @@ import click -from disease import SOURCES_FOR_MERGE, SOURCES_LOWER_LOOKUP, logger +from disease import SOURCES_FOR_MERGE, SOURCES_LOWER_LOOKUP from disease.database.database import ( AbstractDatabase, DatabaseException, @@ -16,6 +17,17 @@ ) from disease.schemas import SourceName +_logger = logging.getLogger(__name__) + + +def _configure_logging() -> None: + """Configure logging.""" + logging.basicConfig( + filename=f"{__package__}.log", + format="[%(asctime)s] - %(name)s - %(levelname)s : %(message)s", + ) + logging.getLogger(__package__).setLevel(logging.DEBUG) + @click.command() @click.option("--db_url", help="URL endpoint for the application database.") @@ -28,6 +40,7 @@ def check_db(db_url: str, verbose: bool = False) -> None: :param db_url: URL to normalizer database :param verbose: if true, print result to console """ # noqa: D301 + _configure_logging() db = create_db(db_url, False) if not db.check_schema_initialized(): if verbose: @@ -39,8 +52,10 @@ def check_db(db_url: str, verbose: bool = False) -> None: click.echo("Health check failed: DB is incompletely populated.") click.get_current_context().exit(1) + msg = "DB health check successful: tables appear complete." 
if verbose: - click.echo("DB health check successful: tables appear complete.") + click.echo(msg) + _logger.info(msg) @click.command() @@ -55,6 +70,7 @@ def update_from_remote(data_url: Optional[str], db_url: str) -> None: :param data_url: user-specified location to pull DB dump from :param db_url: URL to normalizer database """ # noqa: D301 + _configure_logging() if not click.confirm("Are you sure you want to overwrite existing data?"): click.get_current_context().exit() if not data_url: @@ -88,6 +104,7 @@ def dump_database(output_directory: Path, db_url: str) -> None: :param output_directory: path to existing directory :param db_url: URL to normalizer database """ # noqa: D301 + _configure_logging() if not output_directory: output_directory = Path() @@ -102,6 +119,7 @@ def dump_database(output_directory: Path, db_url: str) -> None: except DatabaseException as e: click.echo(f"Encountered exception during update: {e!s}") click.get_current_context().exit(1) + _logger.info("Database dump successful.") def _update_sources( @@ -136,14 +154,14 @@ def _delete_source(n: SourceName, db: AbstractDatabase) -> float: """ msg = f"Deleting {n.value}..." click.echo(f"\n{msg}") - logger.info(msg) + _logger.info(msg) start_delete = timer() db.delete_source(n) end_delete = timer() delete_time = end_delete - start_delete msg = f"Deleted {n.value} in {delete_time:.5f} seconds." click.echo(f"{msg}\n") - logger.info(msg) + _logger.info(msg) return delete_time @@ -164,7 +182,7 @@ def _load_source( """ msg = f"Loading {n.value}..." click.echo(msg) - logger.info(msg) + _logger.info(msg) start_load = timer() # used to get source class name from string @@ -183,10 +201,10 @@ def _load_source( load_time = end_load - start_load msg = f"Loaded {n.value} in {load_time:.5f} seconds." click.echo(msg) - logger.info(msg) + _logger.info(msg) msg = f"Total time for {n.value}: {(delete_time + load_time):.5f} seconds." click.echo(msg) - logger.info(msg) + _logger.info(msg) def _delete_normalized_data(database: AbstractDatabase) -> None: @@ -270,6 +288,7 @@ def update_db( :param update_merged: if true, update normalized records :param from_local: if true, use locally available data only """ # noqa: D301 + _configure_logging() db = create_db(db_url, aws_instance) if update_all: diff --git a/src/disease/database/dynamodb.py b/src/disease/database/dynamodb.py index ce99c95..0d353f4 100644 --- a/src/disease/database/dynamodb.py +++ b/src/disease/database/dynamodb.py @@ -32,7 +32,6 @@ ) _logger = logging.getLogger() -_logger.setLevel(logging.DEBUG) class DynamoDbDatabase(AbstractDatabase): diff --git a/src/disease/database/postgresql.py b/src/disease/database/postgresql.py index 8e0065f..a3285fb 100644 --- a/src/disease/database/postgresql.py +++ b/src/disease/database/postgresql.py @@ -26,7 +26,7 @@ SourceName, ) -logger = logging.getLogger() +_logger = logging.getLogger() SCRIPTS_DIR = Path(__file__).parent / "postgresql" @@ -118,7 +118,7 @@ def drop_db(self) -> None: with self.conn.cursor() as cur: cur.execute(self._drop_db_query) self.conn.commit() - logger.info("Dropped all existing disease normalizer tables.") + _logger.info("Dropped all existing disease normalizer tables.") def check_schema_initialized(self) -> bool: """Check if database schema is properly initialized. 
@@ -131,7 +131,7 @@ def check_schema_initialized(self) -> bool: except DuplicateTable: self.conn.rollback() else: - logger.info("Disease table existence check failed.") + _logger.info("Disease table existence check failed.") self.conn.rollback() return False try: @@ -140,7 +140,7 @@ def check_schema_initialized(self) -> bool: except DuplicateObject: self.conn.rollback() else: - logger.info("Disease foreign key existence check failed.") + _logger.info("Disease foreign key existence check failed.") self.conn.rollback() return False try: @@ -151,7 +151,7 @@ def check_schema_initialized(self) -> bool: except DuplicateTable: self.conn.rollback() else: - logger.info("Disease normalized view lookup failed.") + _logger.info("Disease normalized view lookup failed.") self.conn.rollback() return False try: @@ -160,7 +160,7 @@ def check_schema_initialized(self) -> bool: except DuplicateTable: self.conn.rollback() else: - logger.info("Disease indexes check failed.") + _logger.info("Disease indexes check failed.") self.conn.rollback() return False @@ -182,21 +182,21 @@ def check_tables_populated(self) -> bool: cur.execute(self._check_sources_query) results = cur.fetchall() if len(results) < len(SourceName): - logger.info("Disease sources table is missing expected sources.") + _logger.info("Disease sources table is missing expected sources.") return False with self.conn.cursor() as cur: cur.execute(self._check_concepts_query) result = cur.fetchone() if not result or result[0] < 1: - logger.info("Disease records table is empty.") + _logger.info("Disease records table is empty.") return False with self.conn.cursor() as cur: cur.execute(self._check_merged_query) result = cur.fetchone() if not result or result[0] < 1: - logger.info("Normalized disease records table is empty.") + _logger.info("Normalized disease records table is empty.") return False return True @@ -258,7 +258,7 @@ def _drop_indexes(self) -> None: def _create_tables(self) -> None: """Create all tables, indexes, and views.""" - logger.debug("Creating new disease normalizer tables.") + _logger.debug("Creating new disease normalizer tables.") tables_query = (SCRIPTS_DIR / "create_tables.sql").read_bytes() with self.conn.cursor() as cur: @@ -569,7 +569,7 @@ def add_record(self, record: Dict, src_name: SourceName) -> None: cur.execute(self._insert_assoc_query, [a, concept_id]) self.conn.commit() except UniqueViolation: - logger.error("Record with ID %s already exists", concept_id) + _logger.error("Record with ID %s already exists", concept_id) self.conn.rollback() _add_merged_record_query = b""" @@ -784,6 +784,7 @@ def export_db(self, output_directory: Path) -> None: pw_param = f"-W {self.conn.info.password}" if self.conn.info.password else "-w" system_call = f"pg_dump -E UTF8 -f {output_location} -U {user} {pw_param} -h {host} -p {port} {database_name}" + _logger.debug("Executing DB dump system call: %s", system_call) result = os.system(system_call) # noqa: S605 if result != 0: msg = f"System call '{system_call}' returned failing exit code." 
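With the import-time basicConfig() call removed, the library follows the standard Python logging convention: modules create their own loggers with logging.getLogger(__name__) and leave handler, format, and level configuration to the application (or, for the CLI commands above, to the new _configure_logging() helper). A minimal sketch of how a downstream application would now opt in to log output — the filename and level below are illustrative choices made by the application, not library defaults:

    import logging

    import disease  # importing no longer touches global logging state

    # The application decides where records go; the library's module-level
    # loggers (e.g. "disease.etl.merge") propagate up to this configuration.
    logging.basicConfig(
        filename="my_app.log",  # illustrative destination
        format="[%(asctime)s] - %(name)s - %(levelname)s : %(message)s",
    )
    logging.getLogger("disease").setLevel(logging.DEBUG)

An application that configures nothing simply sees no library log output, instead of getting a disease.log file written into its working directory as a side effect of import.
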
From c4110c22ff11aa7daac9e769c3016456309c6c53 Mon Sep 17 00:00:00 2001 From: James Stevenson Date: Thu, 30 May 2024 15:44:05 -0400 Subject: [PATCH 2/6] more logging statements --- src/disease/cli.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/disease/cli.py b/src/disease/cli.py index 02f2861..de00fe5 100644 --- a/src/disease/cli.py +++ b/src/disease/cli.py @@ -87,6 +87,7 @@ def update_from_remote(data_url: Optional[str], db_url: str) -> None: except DatabaseException as e: click.echo(f"Encountered exception during update: {e!s}") click.get_current_context().exit(1) + _logger.info("Successfully loaded data from remote snapshot.") @click.command() @@ -318,6 +319,7 @@ def update_db( sources_to_update = {SourceName(SOURCES_LOWER_LOOKUP[s]) for s in sources_split} _update_sources(sources_to_update, db, update_merged, from_local) + _logger.info("Database update successful.") if __name__ == "__main__": From 4e7ed4f9ee182e7463d61876025f22599c49e989 Mon Sep 17 00:00:00 2001 From: James Stevenson Date: Thu, 30 May 2024 15:45:40 -0400 Subject: [PATCH 3/6] get another one --- src/disease/etl/base.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/disease/etl/base.py b/src/disease/etl/base.py index 0e2407a..b7e166b 100644 --- a/src/disease/etl/base.py +++ b/src/disease/etl/base.py @@ -2,6 +2,7 @@ Generally, users shouldn't ever have to work directly with the classes contained within. """ +import logging from abc import ABC, abstractmethod from pathlib import Path from typing import Dict, List, Optional, Set, Union @@ -10,7 +11,7 @@ from owlready2.rdflib_store import TripleLiteRDFlibGraph as RDFGraph from wags_tails import CustomData, DataSource, DoData, MondoData, NcitData, OncoTreeData -from disease import ITEM_TYPES, SOURCES_FOR_MERGE, logger +from disease import ITEM_TYPES, SOURCES_FOR_MERGE from disease.database import AbstractDatabase from disease.schemas import Disease, SourceName @@ -21,6 +22,8 @@ SourceName.DO: DoData, } +_logger = logging.getLogger(__name__) + class Base(ABC): """The ETL base class.""" @@ -104,7 +107,7 @@ def _load_disease(self, disease: Dict) -> None: disease[attr_type] = list(set(value)) items = {item.lower() for item in value} if (attr_type == "aliases") and (len(items) > 20): - logger.debug(f"{concept_id} has > 20 aliases.") + _logger.debug("%s has > 20 aliases.", concept_id) del disease[attr_type] continue From b36b4aaed78351a7a73f93d02c14d9cfcb7870da Mon Sep 17 00:00:00 2001 From: James Stevenson Date: Thu, 30 May 2024 15:49:45 -0400 Subject: [PATCH 4/6] get another one --- src/disease/etl/merge.py | 36 ++++++++++++++++++++---------------- 1 file changed, 20 insertions(+), 16 deletions(-) diff --git a/src/disease/etl/merge.py b/src/disease/etl/merge.py index c6c9487..90a9da2 100644 --- a/src/disease/etl/merge.py +++ b/src/disease/etl/merge.py @@ -1,13 +1,15 @@ """Create concept groups and merged records.""" +import logging from timeit import default_timer as timer from typing import Collection, Dict, List, Set, Tuple from tqdm import tqdm -from disease import logger from disease.database.database import AbstractDatabase from disease.schemas import SourcePriority +_logger = logging.getLogger(__name__) + class Merge: """Manage construction of record mergers for normalization.""" @@ -31,37 +33,38 @@ def create_merged_concepts(self, record_ids: Collection[str]) -> None: :param record_ids: concept identifiers from which groups should be generated. 
""" # build groups - logger.info(f"Generating record ID sets from {len(record_ids)} records") + _logger.info("Generating record ID sets from %s records", len(record_ids)) start = timer() for concept_id in tqdm(record_ids, ncols=80, disable=self._silent): try: record = self._database.get_record_by_id(concept_id) except AttributeError: - logger.error( - f"`create_merged_concepts` received invalid " - f"concept ID: {concept_id}" + _logger.error( + "`create_merged_concepts` received invalid concept ID: %s", + concept_id, ) continue if not record: - logger.error(f"generate_merged_concepts couldn't find " f"{concept_id}") + _logger.error("generate_merged_concepts couldn't find %s", concept_id) continue xrefs = record.get("xrefs", None) group = {*xrefs, concept_id} if xrefs else {concept_id} self._groups.append((concept_id, group)) end = timer() self._database.complete_write_transaction() - logger.debug(f"Built record ID sets in {end - start} seconds") + _logger.debug("Built record ID sets in %s seconds", end - start) # build merged concepts - logger.info("Creating merged records and updating database...") + _logger.info("Creating merged records and updating database...") start = timer() for record_id, group in tqdm(self._groups, ncols=80, disable=self._silent): try: merged_record, merged_ids = self._generate_merged_record(group) except AttributeError: - logger.error( - "`create_merged_concepts` received invalid group:" - f"{group} for concept {record_id}" + _logger.error( + "`create_merged_concepts` received invalid group: %s for concept %s", + group, + record_id, ) continue self._database.add_merged_record(merged_record) @@ -71,8 +74,8 @@ def create_merged_concepts(self, record_ids: Collection[str]) -> None: self._database.update_merge_ref(concept_id, merge_ref) self._database.complete_write_transaction() end = timer() - logger.info("merged concept generation successful.") - logger.debug(f"Generated and added concepts in {end - start} seconds)") + _logger.info("merged concept generation successful.") + _logger.debug("Generated and added concepts in %s seconds", end - start) def _generate_merged_record(self, record_id_set: Set[str]) -> Tuple[Dict, List]: """Generate merged record from provided concept ID group. 
@@ -94,9 +97,10 @@ def _generate_merged_record(self, record_id_set: Set[str]) -> Tuple[Dict, List]: records.append(record) final_ids.append(record["concept_id"]) else: - logger.error( - f"generate_merged_record could not retrieve " - f"record for {record_id} in {record_id_set}" + _logger.error( + "generate_merged_record could not retrieve record for %s in %s", + record_id, + record_id_set, ) def record_order(record: Dict) -> Tuple: From 054ee6b48c5d59824f7a02acf27d71bde77fa64d Mon Sep 17 00:00:00 2001 From: James Stevenson Date: Thu, 30 May 2024 15:51:12 -0400 Subject: [PATCH 5/6] get another one --- src/disease/etl/ncit.py | 6 ++++-- src/disease/etl/oncotree.py | 6 ++++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/src/disease/etl/ncit.py b/src/disease/etl/ncit.py index ab6d74e..e8937ab 100644 --- a/src/disease/etl/ncit.py +++ b/src/disease/etl/ncit.py @@ -1,14 +1,16 @@ """Get NCIt disease data.""" +import logging import re from typing import Set import owlready2 as owl from tqdm import tqdm -from disease import logger from disease.etl.base import OWLBase from disease.schemas import NamespacePrefix, SourceMeta, SourceName +_logger = logging.getLogger(__name__) + icdo_re = re.compile("[0-9]+/[0-9]+") @@ -56,7 +58,7 @@ def _transform_data(self) -> None: if disease_class.P108: label = disease_class.P108.first() else: - logger.warning(f"No label for concept {concept_id}") + _logger.warning("No label for concept %s", concept_id) continue aliases = [a for a in disease_class.P90 if a != label] diff --git a/src/disease/etl/oncotree.py b/src/disease/etl/oncotree.py index 5fc94fe..c6f0f71 100644 --- a/src/disease/etl/oncotree.py +++ b/src/disease/etl/oncotree.py @@ -1,13 +1,15 @@ """Get OncoTree data.""" import json +import logging from typing import Dict from tqdm import tqdm -from disease import logger from disease.etl.base import Base from disease.schemas import NamespacePrefix, SourceMeta +_logger = logging.getLogger(__name__) + class OncoTree(Base): """Gather and load data from OncoTree.""" @@ -69,7 +71,7 @@ def _add_disease(self, disease_node: Dict) -> None: normed_id = f"{normed_prefix}:{code}" disease["xrefs"].append(normed_id) else: - logger.warning(f"Unrecognized prefix: {prefix}") + _logger.warning("Unrecognized prefix: %s", prefix) continue self._load_disease(disease) From dc4ce6e8e30b4a1f28cc740c5bfbf2b90a76d732 Mon Sep 17 00:00:00 2001 From: James Stevenson Date: Thu, 30 May 2024 16:17:47 -0400 Subject: [PATCH 6/6] get another one --- src/disease/query.py | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/src/disease/query.py b/src/disease/query.py index 034c2e1..baee6dd 100644 --- a/src/disease/query.py +++ b/src/disease/query.py @@ -1,12 +1,13 @@ """Provides query handler class, which receives and responses to user search queries.""" import datetime +import logging import re from typing import Dict, Optional, Set, Tuple from botocore.exceptions import ClientError from ga4gh.core import core_models -from disease import NAMESPACE_LOOKUP, PREFIX_LOOKUP, SOURCES_LOWER_LOOKUP, logger +from disease import NAMESPACE_LOOKUP, PREFIX_LOOKUP, SOURCES_LOWER_LOOKUP from disease.database.database import AbstractDatabase from disease.schemas import ( Disease, @@ -20,6 +21,8 @@ from .version import __version__ +_logger = logging.getLogger(__name__) + class InvalidParameterException(Exception): # noqa: N818 """Exception for invalid parameter args provided by the user.""" @@ -56,8 +59,8 @@ def _emit_warnings(self, query_str: str) -> 
Optional[Dict]: nbsp = re.search("\xa0| ", query_str) if nbsp: warnings = {"nbsp": "Query contains non breaking space characters."} - logger.warning( - f"Query ({query_str}) contains non breaking space characters." + _logger.warning( + "Query (%s) contains non breaking space characters.", query_str ) return warnings @@ -111,12 +114,12 @@ def _fetch_records( concept_id.lower(), case_sensitive=False ) if match is None: - logger.error(f"Reference to {concept_id} failed.") + _logger.error("Reference to %s failed.", concept_id) else: (response, src) = self._add_record(response, match, match_type) matched_sources.add(src) except ClientError as e: - logger.error(e.response["Error"]["Message"]) + _logger.error(e.response["Error"]["Message"]) return response, matched_sources def _fill_no_matches(self, resp: Dict[str, Dict]) -> Dict: @@ -368,7 +371,7 @@ def _record_order(self, record: Dict) -> Tuple[int, str]: elif src == SourceName.DO.value: source_rank = 5 else: - logger.warning(f"query.record_order: Invalid source name for " f"{record}") + _logger.warning("query.record_order: Invalid source name for %s", record) source_rank = 4 return source_rank, record["concept_id"] @@ -382,9 +385,11 @@ def _handle_failed_merge_ref( :param str query: original query value :return: Normalized response with no match """ - logger.error( - f"Merge ref lookup failed for ref {record['merge_ref']} " - f"in record {record['concept_id']} from query {query}" + _logger.error( + "Merge ref lookup failed for ref %s in record %s from query %s", + record["merge_ref"], + record["concept_id"], + query, ) response["match_type"] = MatchType.NO_MATCH return NormalizationService(**response)
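
Alongside the logger renames, the series rewrites every f-string log call into %-style arguments (e.g. _logger.error("Reference to %s failed.", concept_id)). A minimal sketch of the difference — the concept ID below is a made-up value for illustration:

    import logging

    logging.basicConfig(level=logging.INFO)
    _logger = logging.getLogger(__name__)

    concept_id = "ncit:C0000"  # made-up concept ID for illustration

    # Eager: the f-string is formatted before logging decides whether to
    # emit, so suppressed DEBUG calls still pay the formatting cost.
    _logger.debug(f"Reference to {concept_id} failed.")

    # Lazy: the template and arguments are stored on the LogRecord, and
    # formatting happens only if the record passes the level checks.
    _logger.debug("Reference to %s failed.", concept_id)

Deferring formatting avoids wasted work on filtered-out records, and keeping the unformatted template on the record also lets downstream tooling group identical messages regardless of their arguments.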