Skip to content

Commit

Permalink
wip: initial work for updating vrs version
Browse files Browse the repository at this point in the history
  • Loading branch information
korikuzma committed Jul 11, 2024
1 parent 8a45881 commit 34a8917
Show file tree
Hide file tree
Showing 10 changed files with 74 additions and 123 deletions.
14 changes: 4 additions & 10 deletions docs/source/normalizing_data/normalization.rst
Original file line number Diff line number Diff line change
Expand Up @@ -73,26 +73,23 @@ Normalized records are structured as `Genes <https://github.com/ga4gh/vrs/tree/2
"label": "BRAF",
"extensions": [
{
"type": "Extension",
"name": "symbol_status",
"value": "approved"
},
{
"type": "Extension",
"name": "approved_name",
"value": "B-Raf proto-oncogene, serine/threonine kinase"
},
{
"type": "Extension",
"name": "strand",
"value": "-"
},
{
"type": "Extension",
"name": "ensembl_locations",
"value": [
{
"id": "ga4gh:SL.WJ0hsPzXuK54mQyVysTqUNV5jaCATnRf",
"id": "ga4gh:SL.fUv91vYrVHBMg-B_QW7UpOQj50g_49hb",
"digest": "fUv91vYrVHBMg-B_QW7UpOQj50g_49hb",
"type": "SequenceLocation",
"sequenceReference": {
"type": "SequenceReference",
Expand All @@ -104,11 +101,11 @@ Normalized records are structured as `Genes <https://github.com/ga4gh/vrs/tree/2
]
},
{
"type": "Extension",
"name": "ncbi_locations",
"value": [
{
"id": "ga4gh:SL.uNBZoxhjhohl24VlIut-JxPJAGfJ7EQE",
"id": "ga4gh:SL.0nPwKHYNnTmJ06G-gSmz8BEhB_NTp-0B",
"digest": "0nPwKHYNnTmJ06G-gSmz8BEhB_NTp-0B",
"type": "SequenceLocation",
"sequenceReference": {
"type": "SequenceReference",
Expand All @@ -120,17 +117,14 @@ Normalized records are structured as `Genes <https://github.com/ga4gh/vrs/tree/2
]
},
{
"type": "Extension",
"name": "hgnc_locus_type",
"value": "gene with protein product"
},
{
"type": "Extension",
"name": "ncbi_gene_type",
"value": "protein-coding"
},
{
"type": "Extension",
"name": "ensembl_biotype",
"value": "protein_coding"
}
Expand Down
6 changes: 4 additions & 2 deletions docs/source/normalizing_data/sources.rst
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,8 @@ Ensembl
"location_annotations": [],
"locations": [
{
"id": "ga4gh:SL.WJ0hsPzXuK54mQyVysTqUNV5jaCATnRf",
"id": "ga4gh:SL.fUv91vYrVHBMg-B_QW7UpOQj50g_49hb",
"digest": "fUv91vYrVHBMg-B_QW7UpOQj50g_49hb",
"label": null,
"description": null,
"extensions": null,
Expand Down Expand Up @@ -122,7 +123,8 @@ The `NCBI Gene Database <https://www.ncbi.nlm.nih.gov/gene/>`_ is a service prov
"location_annotations": [],
"locations": [
{
"id": "ga4gh:SL.uNBZoxhjhohl24VlIut-JxPJAGfJ7EQE",
"id": "ga4gh:SL.0nPwKHYNnTmJ06G-gSmz8BEhB_NTp-0B",
"digest": "0nPwKHYNnTmJ06G-gSmz8BEhB_NTp-0B",
"type": "SequenceLocation",
"sequenceReference": {
"type": "SequenceReference",
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ dependencies = [
"uvicorn",
"click",
"boto3",
"ga4gh.vrs~=2.0.0a1",
# "ga4gh.vrs~=2.0.0a1",
]
dynamic = ["version"]

Expand Down
22 changes: 11 additions & 11 deletions src/gene/query.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from datetime import datetime
from typing import Any, Callable, Dict, List, Optional, Set, Tuple, TypeVar

from ga4gh.core import core_models, ga4gh_identify
from ga4gh.core import domain_models, entity_models, ga4gh_identify
from ga4gh.vrs import models

from gene import ITEM_TYPES, NAMESPACE_LOOKUP, PREFIX_LOOKUP
Expand Down Expand Up @@ -420,7 +420,7 @@ def _add_gene(
:param possible_concepts: List of other normalized concepts found
:return: Response with core Gene
"""
gene_obj = core_models.Gene(
gene_obj = domain_models.Gene(
id=f"normalize.gene.{record['concept_id']}",
label=record["symbol"],
)
Expand All @@ -431,11 +431,11 @@ def _add_gene(
for source_id in source_ids:
system, code = source_id.split(":")
mappings.append(
core_models.Mapping(
coding=core_models.Coding(
code=core_models.Code(code), system=system.lower()
entity_models.ConceptMapping(
coding=entity_models.Coding(
code=entity_models.Code(code), system=system.lower()
),
relation=core_models.Relation.RELATED_MATCH,
relation=entity_models.Relation.RELATED_MATCH,
)
)
if mappings:
Expand All @@ -450,7 +450,7 @@ def _add_gene(
val = [val]
aliases.update(val)
if aliases:
gene_obj.aliases = list(aliases)
gene_obj.alternativeLabels = list(aliases)

# extensions
extensions = []
Expand All @@ -464,7 +464,7 @@ def _add_gene(
for ext_label, record_label in extension_and_record_labels:
if record_label in record and record[record_label]:
extensions.append(
core_models.Extension(name=ext_label, value=record[record_label])
entity_models.Extension(name=ext_label, value=record[record_label])
)

record_locations = {}
Expand All @@ -485,15 +485,15 @@ def _add_gene(

if transformed_locs:
extensions.append(
core_models.Extension(name=loc_name, value=transformed_locs)
entity_models.Extension(name=loc_name, value=transformed_locs)
)

# handle gene types separately because they're wonky
if record["item_type"] == RecordType.IDENTITY:
gene_type = record.get("gene_type")
if gene_type:
extensions.append(
core_models.Extension(
entity_models.Extension(
name=GeneTypeFieldName[record["src_name"].upper()].value,
value=gene_type,
)
Expand All @@ -504,7 +504,7 @@ def _add_gene(
values = record.get(field_name, [])
for value in values:
extensions.append(
core_models.Extension(name=field_name, value=value)
entity_models.Extension(name=field_name, value=value)
)
if extensions:
gene_obj.extensions = extensions
Expand Down
13 changes: 6 additions & 7 deletions src/gene/schemas.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
from enum import Enum, IntEnum
from typing import Dict, List, Literal, Optional, Union

from ga4gh.core import core_models
from ga4gh.core import domain_models
from ga4gh.vrs import models
from pydantic import (
BaseModel,
Expand Down Expand Up @@ -321,7 +321,7 @@ class NormalizeService(BaseNormalizationService):
"""Define model for returning normalized concept."""

normalized_id: Optional[str] = None
gene: Optional[core_models.Gene] = None
gene: Optional[domain_models.Gene] = None
source_meta_: Dict[SourceName, SourceMeta] = {}

model_config = ConfigDict(
Expand Down Expand Up @@ -402,12 +402,10 @@ class NormalizeService(BaseNormalizationService):
{
"name": "approved_name",
"value": "B-Raf proto-oncogene, serine/threonine kinase",
"type": "Extension",
},
{
"name": "symbol_status",
"value": "approved",
"type": "Extension",
},
# {
# "name": "chromosome_location",
Expand All @@ -419,7 +417,6 @@ class NormalizeService(BaseNormalizationService):
# "end": "q34",
# "start": "q34",
# },
# "type": "Extension"
# }
],
},
Expand Down Expand Up @@ -578,7 +575,8 @@ class UnmergedNormalizationService(BaseNormalizationService):
"location_annotations": [],
"locations": [
{
"id": "ga4gh:SL.dnydHb2Bnv5pwXjI4MpJmrZUADf5QLe1", # noqa: E501
"id": "ga4gh:SL.4taOKYezIxUvFozs6c6OC0bJAQ2zwjxu", # noqa: E501
"digest": "4taOKYezIxUvFozs6c6OC0bJAQ2zwjxu",
"type": "SequenceLocation",
"sequenceReference": {
"type": "SequenceReference",
Expand Down Expand Up @@ -630,7 +628,8 @@ class UnmergedNormalizationService(BaseNormalizationService):
# "end": "q22.1"
},
{
"id": "ga4gh:SL.U7vPSlX8eyCKdFSiROIsc9om0Y7pCm2g", # noqa: E501
"id": "ga4gh:SL.OWr9DoyBhr2zpf4uLLcZSvsTSIDElU6R", # noqa: E501
"digest": "OWr9DoyBhr2zpf4uLLcZSvsTSIDElU6R",
"type": "SequenceLocation",
"sequenceReference": {
"type": "SequenceReference",
Expand Down
3 changes: 3 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,9 @@ def _compare_records(normalized_gene, test_gene, match_type):
assert normalized_gene.symbol == test_gene.symbol
assert len(normalized_gene.locations) == len(test_gene.locations)
for loc in normalized_gene.locations:
assert loc.id.split("ga4gh:SL.")[-1] == loc.digest
loc.id = None
loc.digest = None
assert loc in test_gene.locations
assert set(normalized_gene.location_annotations) == set(
test_gene.location_annotations
Expand Down
5 changes: 0 additions & 5 deletions tests/unit/test_ensembl_source.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,6 @@ def ddx11l1():
"location_annotations": [],
"locations": [
{
"id": "ga4gh:SL.Ihi0T86UoFIEbH0DHttX2nIw_BdOkI5L",
"end": 14409,
"start": 11868,
"sequenceReference": {
Expand Down Expand Up @@ -68,7 +67,6 @@ def tp53():
"location_annotations": [],
"locations": [
{
"id": "ga4gh:SL.TlGoA-JmP3Xky3RhJ6_UU3eJKq8EpEp9",
"end": 7687538,
"start": 7661778,
"sequenceReference": {
Expand Down Expand Up @@ -100,7 +98,6 @@ def ATP6AP1_DT(): # noqa: N802
"location_annotations": [],
"locations": [
{
"id": "ga4gh:SL.bPbeeEGSqjlZJ1Ddmg5T9ptJz9tKxYi3",
"end": 154428526,
"start": 154424377,
"sequenceReference": {
Expand Down Expand Up @@ -132,7 +129,6 @@ def hsa_mir_1253():
"location_annotations": [],
"locations": [
{
"id": "ga4gh:SL.x4kOE6ZXG-xY7nm6bu2W7lvm6ljaJXzR",
"end": 2748182,
"start": 2748077,
"sequenceReference": {
Expand Down Expand Up @@ -164,7 +160,6 @@ def spry3():
"location_annotations": [],
"locations": [
{
"id": "ga4gh:SL.fxU7Axal2_GbyOfW8NQf0plM-SUWFCB0",
"end": 155782459,
"start": 155612571,
"sequenceReference": {
Expand Down
10 changes: 0 additions & 10 deletions tests/unit/test_ncbi_source.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,6 @@ def dpf1():
# "type": "ChromosomeLocation"
# },
{
"id": "ga4gh:SL.0bmpLh_dlBRrzfviiQY9Vg4iEH0XeR20",
"end": 38229695,
"start": 38211005,
"sequenceReference": {
Expand Down Expand Up @@ -106,7 +105,6 @@ def pdp1_symbol():
# "type": "ChromosomeLocation"
# },
{
"id": "ga4gh:SL.-455M-S51D8nXPFoGH0dYNFVFAJxm5dG",
"end": 93926068,
"start": 93916922,
"sequenceReference": {
Expand Down Expand Up @@ -146,7 +144,6 @@ def pdp1_alias():
# "type": "ChromosomeLocation"
# },
{
"id": "ga4gh:SL.VI_0P0-ei90MDsLjAeUrDfeXBlZVJtJY",
"end": 4665258,
"start": 4662293,
"sequenceReference": {
Expand Down Expand Up @@ -195,7 +192,6 @@ def spry3():
# "type": "ChromosomeLocation"
# },
{
"id": "ga4gh:SL.2N5aguRIvBdGemRgABZFutmLTV925dsV",
"end": 155782459,
"start": 155612585,
"sequenceReference": {
Expand All @@ -205,7 +201,6 @@ def spry3():
"type": "SequenceLocation",
},
{
"id": "ga4gh:SL.U9E9WtQdzFc4elR3t1qw48nueHgfWFWL",
"end": 56968979,
"start": 56954315,
"sequenceReference": {
Expand Down Expand Up @@ -290,7 +285,6 @@ def znf84():
# "type": "ChromosomeLocation"
# },
{
"id": "ga4gh:SL.IRsls9vud2-CiA7Jq4L3ry2VVK7LoNud",
"end": 133063299,
"start": 133037508,
"sequenceReference": {
Expand Down Expand Up @@ -339,7 +333,6 @@ def slc25a6():
# "end": "p11.2"
# },
{
"id": "ga4gh:SL.dvD-ZopQGZkVWx4Z-vFpP9ateicPHgQ6",
"type": "SequenceLocation",
"sequenceReference": {
"type": "SequenceReference",
Expand All @@ -349,7 +342,6 @@ def slc25a6():
"end": 1392113,
},
{
"id": "ga4gh:SL.bv3LobZZ-sERq5cIthyS4w_tmSwV2QSg",
"type": "SequenceLocation",
"sequenceReference": {
"type": "SequenceReference",
Expand Down Expand Up @@ -484,7 +476,6 @@ def prkrap1():
# "type": "ChromosomeLocation"
# },
{
"id": "ga4gh:SL.LwWy5JYncZVnOM9hWiLWW_z0n2eY-peb",
"end": 3941874,
"start": 3940269,
"sequenceReference": {
Expand All @@ -494,7 +485,6 @@ def prkrap1():
"type": "SequenceLocation",
},
{
"id": "ga4gh:SL.q36ql_fX4HrZy_G2EXX_SGWl-7X5Bq6c",
"end": 3932085,
"start": 3930480,
"sequenceReference": {
Expand Down
Loading

0 comments on commit 34a8917

Please sign in to comment.