Skip to content

Commit

Permalink
fix: return all locations in response (#126, #152)
Browse files Browse the repository at this point in the history
  • Loading branch information
korikuzma committed Oct 27, 2022
1 parent 4e915eb commit 06537d2
Show file tree
Hide file tree
Showing 12 changed files with 510 additions and 191 deletions.
6 changes: 5 additions & 1 deletion gene/etl/merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,7 +135,7 @@ def record_order(record):
# merge from constituent records
set_fields = ["aliases", "associated_with", "previous_symbols"]
scalar_fields = ["symbol", "symbol_status", "label", "strand",
"location_annotations", "locations"]
"location_annotations"]
for record in records:
for field in set_fields:
merged_attrs[field] |= set(record.get(field, set()))
Expand All @@ -144,6 +144,10 @@ def record_order(record):
if field not in merged_attrs and field in record:
merged_attrs[field] = record[field]

locations = record.get("locations")
if locations:
merged_attrs[f"{record['src_name'].lower()}_locations"] = locations

gene_type = record.get("gene_type")
if gene_type:
merged_field = GeneTypeFieldName[record["src_name"].upper()]
Expand Down
65 changes: 45 additions & 20 deletions gene/query.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,20 @@ def fetch_meta(self, src_name: str) -> SourceMeta:
except ClientError as e:
logger.error(e.response['Error']['Message'])

@staticmethod
def _transform_sequence_location(loc: Dict) -> models.SequenceLocation:
    """Build a VRS SequenceLocation model from a stored sequence location.

    :param Dict loc: Sequence location; the ``sequence_id``, ``start`` and
        ``end`` entries are read, and ``start``/``end`` are coerced with ``int``
    :return: VRS sequence location wrapping a SequenceInterval whose bounds
        are Number models
    """
    # Lookups and model construction are kept in the same order as the
    # keyword arguments would be evaluated: sequence_id, start, then end.
    sequence_id = loc["sequence_id"]
    interval_start = models.Number(value=int(loc["start"]), type="Number")
    interval_end = models.Number(value=int(loc["end"]), type="Number")
    interval = models.SequenceInterval(
        type="SequenceInterval",
        start=interval_start,
        end=interval_end,
    )
    return models.SequenceLocation(
        type="SequenceLocation",
        sequence_id=sequence_id,
        interval=interval,
    )

@staticmethod
def _transform_chromosome_location(loc: Dict) -> models.ChromosomeLocation:
"""Transform a chromosome location to VRS chromosome location
Expand All @@ -98,6 +112,18 @@ def _transform_chromosome_location(loc: Dict) -> models.ChromosomeLocation:
end=loc["end"]))
return transformed_loc

def _transform_location(self, loc: Dict) -> Dict:
    """Convert one stored location into an identified VRS location dictionary.

    :param Dict loc: Sequence or Chromosome location; its ``type`` entry
        selects the transformation (anything other than a sequence location
        is handled as a chromosome location)
    :return: VRS sequence or chromosome location represented as a dictionary,
        with its ``_id`` set from ``ga4gh_identify``
    """
    transform = (
        self._transform_sequence_location
        if loc["type"] == VRSTypes.SEQUENCE_LOCATION
        else self._transform_chromosome_location
    )
    vrs_location = transform(loc)
    # The identifier is computed from the model before it is serialized.
    vrs_location._id = ga4gh_identify(vrs_location)
    return vrs_location.as_dict()

def _transform_locations(self, record: Dict) -> Dict:
"""Transform gene locations to VRS Chromosome/Sequence Locations
Expand All @@ -107,21 +133,7 @@ def _transform_locations(self, record: Dict) -> Dict:
record_locations = list()
if "locations" in record:
for loc in record["locations"]:
if loc["type"] == VRSTypes.SEQUENCE_LOCATION:
transformed_loc = models.SequenceLocation(
type="SequenceLocation",
sequence_id=loc["sequence_id"],
interval=models.SequenceInterval(
type="SequenceInterval",
start=models.Number(value=int(loc["start"]), type="Number"),
end=models.Number(value=int(loc["end"]), type="Number")))
else:
transformed_loc = self._transform_chromosome_location(loc)

transformed_loc._id = ga4gh_identify(transformed_loc)
transformed_loc = transformed_loc.as_dict()
record_locations.append(transformed_loc)

record_locations.append(self._transform_location(loc))
record["locations"] = record_locations
return record

Expand Down Expand Up @@ -437,20 +449,33 @@ def add_gene_descriptor(
extension_and_record_labels = [
("symbol_status", "symbol_status"),
("approved_name", "label"),
("chromosome_location", "locations"),
("associated_with", "associated_with"),
("previous_symbols", "previous_symbols"),
("location_annotations", "location_annotations")
]
for ext_label, record_label in extension_and_record_labels:
if record_label in record and record[record_label]:
if ext_label == 'chromosome_location':
loc = self._transform_chromosome_location(record[record_label][0])
loc._id = ga4gh_identify(loc)
record[record_label] = loc.as_dict()
extensions.append(Extension(
name=ext_label,
value=record[record_label]
))

record_locations = dict()
if record["item_type"] == "identity":
locs = record.get("locations")
if locs:
record_locations[f"{record['src_name'].lower()}_locations"] = locs
elif record["item_type"] == "merger":
for k, v in record.items():
if k.endswith("locations") and v:
record_locations[k] = v

for loc_name, locations in record_locations.items():
transformed_locs = list()
for loc in locations:
transformed_locs.append(self._transform_location(loc))
extensions.append(Extension(name=loc_name, value=transformed_locs))

# handle gene types separately because they're wonky
if record["item_type"] == "identity":
gene_type = record.get("gene_type")
Expand Down
2 changes: 1 addition & 1 deletion gene/version.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
"""Gene normalizer version"""
__version__ = "0.1.29"
__version__ = "0.1.30"
6 changes: 3 additions & 3 deletions tests/unit/data/etl_data/ensembl_108.gff3
Original file line number Diff line number Diff line change
Expand Up @@ -200,7 +200,7 @@
#!genebuild-last-updated 2021-03
1 GRCh38 chromosome 1 248956422 . . . ID=chromosome:1;Alias=CM000663.2,chr1,NC_000001.11
###
1 havana pseudogene 11869 14409 . + . ID=gene:ENSG00000223972;Name=DDX11L1;biotype=transcribed_unprocessed_pseudogene;description=DEAD/H-box helicase 11 like 1 (pseudogene) [Source:HGNC Symbol%3BAcc:HGNC:37102];gene_id=ENSG00000223972;logic_name=havana_homo_sapiens;version=5
1 havana pseudogene 12010 13670 . + . ID=gene:ENSG00000223972;Name=DDX11L1;biotype=transcribed_unprocessed_pseudogene;description=DEAD/H-box helicase 11 like 1 (pseudogene) [Source:HGNC Symbol%3BAcc:HGNC:37102];gene_id=ENSG00000223972;logic_name=havana_homo_sapiens;version=6
###
1 ensembl_havana gene 220148293 220272453 . - . ID=gene:ENSG00000118873;Name=RAB3GAP2;biotype=protein_coding;description=RAB3 GTPase activating non-catalytic protein subunit 2 [Source:HGNC Symbol%3BAcc:HGNC:17168];gene_id=ENSG00000118873;logic_name=ensembl_havana_gene_homo_sapiens;version=16
###
Expand All @@ -220,7 +220,7 @@
###
9 GRCh38 chromosome 1 138394717 . . . ID=chromosome:9;Alias=CM000671.2,chr9,NC_000009.12
###
9 ensembl_havana gene 130713016 130887675 . + . ID=gene:ENSG00000097007;Name=ABL1;biotype=protein_coding;description=ABL proto-oncogene 1%2C non-receptor tyrosine kinase [Source:HGNC Symbol%3BAcc:HGNC:76];gene_id=ENSG00000097007;logic_name=ensembl_havana_gene_homo_sapiens;version=19
9 ensembl_havana gene 130713043 130887675 . + . ID=gene:ENSG00000097007;Name=ABL1;biotype=protein_coding;description=ABL proto-oncogene 1%2C non-receptor tyrosine kinase [Source:HGNC Symbol%3BAcc:HGNC:76];gene_id=ENSG00000097007;logic_name=ensembl_havana_gene_homo_sapiens;version=20
###
11 GRCh38 chromosome 1 135086622 . . . ID=chromosome:11;Alias=CM000673.2,chr11,NC_000011.10
###
Expand All @@ -242,7 +242,7 @@
###
X GRCh38 chromosome 1 156040895 . . . ID=chromosome:X;Alias=CM000685.2,chrX,NC_000023.11
###
X havana ncRNA_gene 154424378 154428512 . - . ID=gene:ENSG00000197180;Name=CH17-340M24.3;biotype=lncRNA;description=uncharacterized protein BC009467 [Source:NCBI gene (formerly Entrezgene)%3BAcc:158960];gene_id=ENSG00000197180;logic_name=havana_homo_sapiens;version=3
X havana ncRNA_gene 154424378 154428526 . - . ID=gene:ENSG00000197180;Name=ATP6AP1-DT;biotype=lncRNA;description=ATP6AP1 divergent transcript [Source:HGNC Symbol%3BAcc:HGNC:25138];gene_id=ENSG00000197180;logic_name=havana_homo_sapiens;version=4
###
X ensembl_havana gene 155612572 155782459 . + . ID=gene:ENSG00000168939;Name=SPRY3;biotype=protein_coding;description=sprouty RTK signaling antagonist 3 [Source:HGNC Symbol%3BAcc:HGNC:11271];gene_id=ENSG00000168939;logic_name=ensembl_havana_gene_homo_sapiens;version=12
###
174 changes: 101 additions & 73 deletions tests/unit/data/etl_data/hgnc_20210810.json
Original file line number Diff line number Diff line change
Expand Up @@ -251,17 +251,10 @@
"location_sortable": "22pter-q11"
},
{
"date_approved_reserved": "1999-09-29",
"alias_name": [
"chromatin assembly factor I (150 kDa)"
],
"vega_id": "OTTHUMG00000181922",
"locus_group": "protein-coding gene",
"mane_select": [
"ENST00000301280.10",
"NM_005483.3"
],
"status": "Approved",
"ucsc_id": "uc002mal.4",
"name": "chromatin assembly factor 1 subunit A",
"entrez_id": "10036",
"alias_symbol": [
"CAF1P150",
"CAF1B",
Expand All @@ -270,47 +263,54 @@
"P150",
"MGC71229"
],
"_version_": 1707696198253543425,
"uuid": "cbaac19b-6e86-4b58-9053-e34c3aa5d99e",
"prev_name": [
"chromatin assembly factor 1, subunit A (p150)"
],
"refseq_accession": [
"NM_005483"
"mane_select": [
"ENST00000301280.10",
"NM_005483.3"
],
"locus_type": "gene with protein product",
"agr": "HGNC:1910",
"locus_group": "protein-coding gene",
"hgnc_id": "HGNC:1910",
"rgd_id": [
"RGD:1590865"
"pubmed_id": [
7600578
],
"symbol": "CHAF1A",
"locus_type": "gene with protein product",
"status": "Approved",
"mgd_id": [
"MGI:1351331"
],
"ccds_id": [
"CCDS32875"
],
"ensembl_gene_id": "ENSG00000167670",
"entrez_id": "10036",
"uuid": "61416b73-dffa-4eb9-9af7-a82d97a84e77",
"date_name_changed": "2015-11-23",
"omim_id": [
"601246"
],
"symbol": "CHAF1A",
"date_name_changed": "2015-11-23",
"location": "19p13.3",
"name": "chromatin assembly factor 1 subunit A",
"date_modified": "2019-08-21",
"mgd_id": [
"MGI:1351331"
"rgd_id": [
"RGD:1590865"
],
"ucsc_id": "uc002mal.4",
"agr": "HGNC:1910",
"location_sortable": "19p13.3",
"date_modified": "2019-08-21",
"uniprot_ids": [
"Q13111"
],
"ccds_id": [
"CCDS32875"
"date_approved_reserved": "1999-09-29",
"refseq_accession": [
"NM_005483"
],
"alias_name": [
"chromatin assembly factor I (150 kDa)"
],
"ena": [
"U20979"
],
"pubmed_id": [
7600578
],
"location_sortable": "19p13.3"
"_version_": 1747674142400839680,
"location": "19p13.3"
},
{
"date_approved_reserved": "2003-11-13",
Expand Down Expand Up @@ -459,68 +459,71 @@
"location_sortable": "Xp22.32 and Yp11.3"
},
{
"date_approved_reserved": "2005-05-06",
"alias_name": [
"iGb3 synthase",
"isoglobotriaosylceramide synthase"
"status": "Approved",
"mgd_id": [
"MGI:2685279"
],
"vega_id": "OTTHUMG00000004125",
"locus_group": "protein-coding gene",
"hgnc_id": "HGNC:30005",
"symbol": "A3GALT2",
"entrez_id": "127550",
"mane_select": [
"ENST00000442999.3",
"NM_001080438.1"
],
"status": "Approved",
"alias_symbol": [
"IGBS3S",
"IGB3S"
],
"_version_": 1707696195380445184,
"uuid": "ec929101-693b-4afc-ae1b-bbe1d38f9c62",
"prev_name": [
"alpha 1,3-galactosyltransferase 2, pseudogene"
],
"refseq_accession": [
"NM_001080438"
],
"locus_type": "gene with protein product",
"agr": "HGNC:30005",
"hgnc_id": "HGNC:30005",
"rgd_id": [
"RGD:727913"
"prev_symbol": [
"A3GALT2P"
],
"ensembl_gene_id": "ENSG00000184389",
"entrez_id": "127550",
"gene_group": [
"Glycosyltransferase family 6"
],
"symbol": "A3GALT2",
"date_name_changed": "2013-03-11",
"location": "1p35.1",
"name": "alpha 1,3-galactosyltransferase 2",
"date_modified": "2018-02-08",
"mgd_id": [
"MGI:2685279"
],
"ucsc_id": "uc031plq.1",
"prev_symbol": [
"A3GALT2P"
"alias_name": [
"iGb3 synthase",
"isoglobotriaosylceramide synthase"
],
"_version_": 1747674139854897153,
"uniprot_ids": [
"U3KPV4"
],
"rgd_id": [
"RGD:727913"
],
"agr": "HGNC:30005",
"date_modified": "2018-02-08",
"ensembl_gene_id": "ENSG00000184389",
"date_name_changed": "2013-03-11",
"locus_type": "gene with protein product",
"ccds_id": [
"CCDS60080"
],
"gene_group_id": [
429
],
"date_symbol_changed": "2013-03-11",
"pubmed_id": [
10854427,
18630988
],
"location_sortable": "01p35.1"
"alias_symbol": [
"IGBS3S",
"IGB3S"
],
"date_symbol_changed": "2013-03-11",
"locus_group": "protein-coding gene",
"vega_id": "OTTHUMG00000004125",
"ucsc_id": "uc031plq.1",
"gene_group_id": [
429
],
"location": "1p35.1",
"date_approved_reserved": "2005-05-06",
"refseq_accession": [
"NM_001080438"
],
"omim_id": [
"619850"
],
"location_sortable": "01p35.1",
"uuid": "2192efcd-2c34-43b1-aca0-2fc9f72ced47"
},
{
"date_approved_reserved": "2009-02-18",
Expand Down Expand Up @@ -1169,8 +1172,33 @@
],
"entrez_id": "109280162",
"location_sortable": "10q23.3 or 10q24.2"
}
},
{
"name": "interferon production regulator",
"locus_group": "other",
"entrez_id": "3466",
"symbol": "IFNR",
"pubmed_id": [
1906174,
1193239
],
"hgnc_id": "HGNC:5447",
"status": "Approved",
"locus_type": "unknown",
"uuid": "58487ff1-1a71-435a-89b2-02f7e275b3af",
"location_sortable": "16",
"date_modified": "2019-06-26",
"omim_id": [
"147573"
],
"curator_notes": [
"This gene has the locus type unknown because it has never been mapped to the human genome."
],
"date_approved_reserved": "1986-01-01",
"location": "16",
"_version_": 1747674151184760833
}
],
"start": 0
}
}
}
Loading

0 comments on commit 06537d2

Please sign in to comment.