fix: don't configure logging on library import #189

Merged 6 commits on Jun 4, 2024
10 changes: 0 additions & 10 deletions src/disease/__init__.py
@@ -1,20 +1,10 @@
"""The VICC library for normalizing diseases."""
import logging
from pathlib import Path

from .version import __version__ # noqa: F401

APP_ROOT = Path(__file__).resolve().parents[0]

# set up logging
logging.basicConfig(
filename="disease.log",
format="[%(asctime)s] - %(name)s - %(levelname)s : %(message)s",
)
logger = logging.getLogger("disease")
logger.setLevel(logging.DEBUG)
logger.handlers = []


from disease.schemas import ( # noqa: E402
NamespacePrefix,
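With the basicConfig() call and handler manipulation removed from src/disease/__init__.py, importing disease no longer writes a disease.log file or overrides the importing application's logging setup. A minimal sketch of the library-side pattern the package moves to (illustrative only, not the package's exact code):

import logging

# A library module only creates a named logger at import time; it does not
# call basicConfig(), change levels, or attach handlers. Whether and where
# messages appear is left entirely to the application's configuration.
_logger = logging.getLogger(__name__)

_logger.debug("Handled (or dropped) according to the application's config.")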
35 changes: 28 additions & 7 deletions src/disease/cli.py
@@ -1,12 +1,13 @@
"""Provides a CLI util to make updates to normalizer database."""
import logging
import os
from pathlib import Path
from timeit import default_timer as timer
from typing import Collection, List, Optional, Set

import click

from disease import SOURCES_FOR_MERGE, SOURCES_LOWER_LOOKUP, logger
from disease import SOURCES_FOR_MERGE, SOURCES_LOWER_LOOKUP
from disease.database.database import (
AbstractDatabase,
DatabaseException,
@@ -16,6 +17,17 @@
)
from disease.schemas import SourceName

_logger = logging.getLogger(__name__)


def _configure_logging() -> None:
"""Configure logging."""
logging.basicConfig(
filename=f"{__package__}.log",
format="[%(asctime)s] - %(name)s - %(levelname)s : %(message)s",
)
logging.getLogger(__package__).setLevel(logging.DEBUG)


@click.command()
@click.option("--db_url", help="URL endpoint for the application database.")
@@ -28,6 +40,7 @@ def check_db(db_url: str, verbose: bool = False) -> None:
:param db_url: URL to normalizer database
:param verbose: if true, print result to console
""" # noqa: D301
_configure_logging()
db = create_db(db_url, False)
if not db.check_schema_initialized():
if verbose:
@@ -39,8 +52,10 @@ def check_db(db_url: str, verbose: bool = False) -> None:
click.echo("Health check failed: DB is incompletely populated.")
click.get_current_context().exit(1)

msg = "DB health check successful: tables appear complete."
if verbose:
click.echo("DB health check successful: tables appear complete.")
click.echo(msg)
_logger.info(msg)


@click.command()
@@ -55,6 +70,7 @@ def update_from_remote(data_url: Optional[str], db_url: str) -> None:
:param data_url: user-specified location to pull DB dump from
:param db_url: URL to normalizer database
""" # noqa: D301
_configure_logging()
if not click.confirm("Are you sure you want to overwrite existing data?"):
click.get_current_context().exit()
if not data_url:
@@ -71,6 +87,7 @@ def update_from_remote(data_url: Optional[str], db_url: str) -> None:
except DatabaseException as e:
click.echo(f"Encountered exception during update: {e!s}")
click.get_current_context().exit(1)
_logger.info("Successfully loaded data from remote snapshot.")


@click.command()
@@ -88,6 +105,7 @@ def dump_database(output_directory: Path, db_url: str) -> None:
:param output_directory: path to existing directory
:param db_url: URL to normalizer database
""" # noqa: D301
_configure_logging()
if not output_directory:
output_directory = Path()

@@ -102,6 +120,7 @@ def dump_database(output_directory: Path, db_url: str) -> None:
except DatabaseException as e:
click.echo(f"Encountered exception during update: {e!s}")
click.get_current_context().exit(1)
_logger.info("Database dump successful.")


def _update_sources(
@@ -136,14 +155,14 @@ def _delete_source(n: SourceName, db: AbstractDatabase) -> float:
"""
msg = f"Deleting {n.value}..."
click.echo(f"\n{msg}")
logger.info(msg)
_logger.info(msg)
start_delete = timer()
db.delete_source(n)
end_delete = timer()
delete_time = end_delete - start_delete
msg = f"Deleted {n.value} in {delete_time:.5f} seconds."
click.echo(f"{msg}\n")
logger.info(msg)
_logger.info(msg)
return delete_time


@@ -164,7 +183,7 @@ def _load_source(
"""
msg = f"Loading {n.value}..."
click.echo(msg)
logger.info(msg)
_logger.info(msg)
start_load = timer()

# used to get source class name from string
@@ -183,10 +202,10 @@
load_time = end_load - start_load
msg = f"Loaded {n.value} in {load_time:.5f} seconds."
click.echo(msg)
logger.info(msg)
_logger.info(msg)
msg = f"Total time for {n.value}: {(delete_time + load_time):.5f} seconds."
click.echo(msg)
logger.info(msg)
_logger.info(msg)


def _delete_normalized_data(database: AbstractDatabase) -> None:
@@ -270,6 +289,7 @@ def update_db(
:param update_merged: if true, update normalized records
:param from_local: if true, use locally available data only
""" # noqa: D301
_configure_logging()
db = create_db(db_url, aws_instance)

if update_all:
@@ -299,6 +319,7 @@

sources_to_update = {SourceName(SOURCES_LOWER_LOOKUP[s]) for s in sources_split}
_update_sources(sources_to_update, db, update_merged, from_local)
_logger.info("Database update successful.")


if __name__ == "__main__":
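cli.py now owns logging configuration: _configure_logging() sets up the file handler and DEBUG level for the package, and each command calls it on entry. As a result, an application that merely imports the library keeps full control of logging. A hedged example of what downstream application code can now do (settings shown are illustrative):

import logging

import disease  # importing the package no longer configures logging

# The application's own configuration is now the only one in effect.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(levelname)s %(name)s: %(message)s",
)
# Optionally adjust just the library's loggers, e.g. to quiet them:
logging.getLogger("disease").setLevel(logging.WARNING)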
1 change: 0 additions & 1 deletion src/disease/database/dynamodb.py
@@ -32,7 +32,6 @@
)

_logger = logging.getLogger()
_logger.setLevel(logging.DEBUG)


class DynamoDbDatabase(AbstractDatabase):
23 changes: 12 additions & 11 deletions src/disease/database/postgresql.py
@@ -26,7 +26,7 @@
SourceName,
)

logger = logging.getLogger()
_logger = logging.getLogger()


SCRIPTS_DIR = Path(__file__).parent / "postgresql"
@@ -118,7 +118,7 @@ def drop_db(self) -> None:
with self.conn.cursor() as cur:
cur.execute(self._drop_db_query)
self.conn.commit()
logger.info("Dropped all existing disease normalizer tables.")
_logger.info("Dropped all existing disease normalizer tables.")

def check_schema_initialized(self) -> bool:
"""Check if database schema is properly initialized.
@@ -131,7 +131,7 @@ def check_schema_initialized(self) -> bool:
except DuplicateTable:
self.conn.rollback()
else:
logger.info("Disease table existence check failed.")
_logger.info("Disease table existence check failed.")
self.conn.rollback()
return False
try:
@@ -140,7 +140,7 @@
except DuplicateObject:
self.conn.rollback()
else:
logger.info("Disease foreign key existence check failed.")
_logger.info("Disease foreign key existence check failed.")
self.conn.rollback()
return False
try:
@@ -151,7 +151,7 @@
except DuplicateTable:
self.conn.rollback()
else:
logger.info("Disease normalized view lookup failed.")
_logger.info("Disease normalized view lookup failed.")
self.conn.rollback()
return False
try:
@@ -160,7 +160,7 @@
except DuplicateTable:
self.conn.rollback()
else:
logger.info("Disease indexes check failed.")
_logger.info("Disease indexes check failed.")
self.conn.rollback()
return False

@@ -182,21 +182,21 @@ def check_tables_populated(self) -> bool:
cur.execute(self._check_sources_query)
results = cur.fetchall()
if len(results) < len(SourceName):
logger.info("Disease sources table is missing expected sources.")
_logger.info("Disease sources table is missing expected sources.")
return False

with self.conn.cursor() as cur:
cur.execute(self._check_concepts_query)
result = cur.fetchone()
if not result or result[0] < 1:
logger.info("Disease records table is empty.")
_logger.info("Disease records table is empty.")
return False

with self.conn.cursor() as cur:
cur.execute(self._check_merged_query)
result = cur.fetchone()
if not result or result[0] < 1:
logger.info("Normalized disease records table is empty.")
_logger.info("Normalized disease records table is empty.")
return False

return True
@@ -258,7 +258,7 @@ def _drop_indexes(self) -> None:

def _create_tables(self) -> None:
"""Create all tables, indexes, and views."""
logger.debug("Creating new disease normalizer tables.")
_logger.debug("Creating new disease normalizer tables.")
tables_query = (SCRIPTS_DIR / "create_tables.sql").read_bytes()

with self.conn.cursor() as cur:
@@ -569,7 +569,7 @@ def add_record(self, record: Dict, src_name: SourceName) -> None:
cur.execute(self._insert_assoc_query, [a, concept_id])
self.conn.commit()
except UniqueViolation:
logger.error("Record with ID %s already exists", concept_id)
_logger.error("Record with ID %s already exists", concept_id)
self.conn.rollback()

_add_merged_record_query = b"""
@@ -784,6 +784,7 @@ def export_db(self, output_directory: Path) -> None:
pw_param = f"-W {self.conn.info.password}" if self.conn.info.password else "-w"

system_call = f"pg_dump -E UTF8 -f {output_location} -U {user} {pw_param} -h {host} -p {port} {database_name}"
_logger.debug("Executing DB dump system call: %s", system_call)
result = os.system(system_call) # noqa: S605
if result != 0:
msg = f"System call '{system_call}' returned failing exit code."
7 changes: 5 additions & 2 deletions src/disease/etl/base.py
@@ -2,6 +2,7 @@

Generally, users shouldn't ever have to work directly with the classes contained within.
"""
import logging
from abc import ABC, abstractmethod
from pathlib import Path
from typing import Dict, List, Optional, Set, Union
@@ -10,7 +11,7 @@
from owlready2.rdflib_store import TripleLiteRDFlibGraph as RDFGraph
from wags_tails import CustomData, DataSource, DoData, MondoData, NcitData, OncoTreeData

from disease import ITEM_TYPES, SOURCES_FOR_MERGE, logger
from disease import ITEM_TYPES, SOURCES_FOR_MERGE
from disease.database import AbstractDatabase
from disease.schemas import Disease, SourceName

@@ -21,6 +22,8 @@
SourceName.DO: DoData,
}

_logger = logging.getLogger(__name__)


class Base(ABC):
"""The ETL base class."""
@@ -104,7 +107,7 @@ def _load_disease(self, disease: Dict) -> None:
disease[attr_type] = list(set(value))
items = {item.lower() for item in value}
if (attr_type == "aliases") and (len(items) > 20):
logger.debug(f"{concept_id} has > 20 aliases.")
_logger.debug("%s has > 20 aliases.", concept_id)
del disease[attr_type]
continue

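In etl/base.py, the module now defines its own logger with logging.getLogger(__name__), so it is named disease.etl.base. Logger names form a dot-separated hierarchy, which is why the CLI's _configure_logging(), which sets the level on logging.getLogger(__package__) (i.e. "disease"), still governs these module-level loggers. A small sketch of that behavior, using only the standard library:

import logging

logging.basicConfig(filename="disease.log")
logging.getLogger("disease").setLevel(logging.DEBUG)

# A child logger created elsewhere with getLogger(__name__) inherits the
# effective level from its "disease" parent because its own level is NOTSET.
child = logging.getLogger("disease.etl.base")
assert child.getEffectiveLevel() == logging.DEBUG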
36 changes: 20 additions & 16 deletions src/disease/etl/merge.py
@@ -1,13 +1,15 @@
"""Create concept groups and merged records."""
import logging
from timeit import default_timer as timer
from typing import Collection, Dict, List, Set, Tuple

from tqdm import tqdm

from disease import logger
from disease.database.database import AbstractDatabase
from disease.schemas import SourcePriority

_logger = logging.getLogger(__name__)


class Merge:
"""Manage construction of record mergers for normalization."""
@@ -31,37 +33,38 @@ def create_merged_concepts(self, record_ids: Collection[str]) -> None:
:param record_ids: concept identifiers from which groups should be generated.
"""
# build groups
logger.info(f"Generating record ID sets from {len(record_ids)} records")
_logger.info("Generating record ID sets from %s records", len(record_ids))
start = timer()
for concept_id in tqdm(record_ids, ncols=80, disable=self._silent):
try:
record = self._database.get_record_by_id(concept_id)
except AttributeError:
logger.error(
f"`create_merged_concepts` received invalid "
f"concept ID: {concept_id}"
_logger.error(
"`create_merged_concepts` received invalid concept ID: %s",
concept_id,
)
continue
if not record:
logger.error(f"generate_merged_concepts couldn't find " f"{concept_id}")
_logger.error("generate_merged_concepts couldn't find %s", concept_id)
continue
xrefs = record.get("xrefs", None)
group = {*xrefs, concept_id} if xrefs else {concept_id}
self._groups.append((concept_id, group))
end = timer()
self._database.complete_write_transaction()
logger.debug(f"Built record ID sets in {end - start} seconds")
_logger.debug("Built record ID sets in %s seconds", end - start)

# build merged concepts
logger.info("Creating merged records and updating database...")
_logger.info("Creating merged records and updating database...")
start = timer()
for record_id, group in tqdm(self._groups, ncols=80, disable=self._silent):
try:
merged_record, merged_ids = self._generate_merged_record(group)
except AttributeError:
logger.error(
"`create_merged_concepts` received invalid group:"
f"{group} for concept {record_id}"
_logger.error(
"`create_merged_concepts` received invalid group: %s for concept %s",
group,
record_id,
)
continue
self._database.add_merged_record(merged_record)
@@ -71,8 +74,8 @@ def create_merged_concepts(self, record_ids: Collection[str]) -> None:
self._database.update_merge_ref(concept_id, merge_ref)
self._database.complete_write_transaction()
end = timer()
logger.info("merged concept generation successful.")
logger.debug(f"Generated and added concepts in {end - start} seconds)")
_logger.info("merged concept generation successful.")
_logger.debug("Generated and added concepts in %s seconds", end - start)

def _generate_merged_record(self, record_id_set: Set[str]) -> Tuple[Dict, List]:
"""Generate merged record from provided concept ID group.
@@ -94,9 +97,10 @@ def _generate_merged_record(self, record_id_set: Set[str]) -> Tuple[Dict, List]:
records.append(record)
final_ids.append(record["concept_id"])
else:
logger.error(
f"generate_merged_record could not retrieve "
f"record for {record_id} in {record_id_set}"
_logger.error(
"generate_merged_record could not retrieve record for %s in %s",
record_id,
record_id_set,
)

def record_order(record: Dict) -> Tuple:
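The log calls in etl/merge.py also switch from f-strings to the logging module's lazy %-style arguments, so the message is only interpolated if a handler actually emits the record. A quick illustration (the record IDs are made up for the example):

import logging

_logger = logging.getLogger(__name__)
record_ids = ["ncit:C0001", "mondo:0000001"]  # hypothetical IDs

# Eager: the f-string is always built, even if INFO is disabled.
_logger.info(f"Generating record ID sets from {len(record_ids)} records")

# Lazy: interpolation is deferred to the logging framework.
_logger.info("Generating record ID sets from %s records", len(record_ids))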