Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix KeyError due to absence of secondary structure sections #308

Merged
merged 1 commit into from
Jul 23, 2024
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
58 changes: 39 additions & 19 deletions evcouplings/compare/pdb.py
Original file line number Diff line number Diff line change
Expand Up @@ -500,15 +500,23 @@ def __init__(self, filehandle, keep_full_data=False):
name: _decode(data[source_column]) for source_column, name in ATOM_TARGET_COLS.items()
})

# decode information into dataframe with BioPython helper method
self.helix_table = pd.DataFrame({
name: _decode(data[source_column]) for source_column, name in HELIX_TARGET_COLS.items()
})

# decode information into dataframe with BioPython helper method
self.sheet_table = pd.DataFrame({
name: _decode(data[source_column]) for source_column, name in SHEET_TARGET_COLS.items()
})
# decode information into dataframe with BioPython helper method; note this section may not be
# present if no helices exist in the structure
try:
self.helix_table = pd.DataFrame({
name: _decode(data[source_column]) for source_column, name in HELIX_TARGET_COLS.items()
})
except KeyError:
self.helix_table = None

# decode information into dataframe with BioPython helper method; note this section may not be
# present if no sheets exist in the structure
try:
self.sheet_table = pd.DataFrame({
name: _decode(data[source_column]) for source_column, name in SHEET_TARGET_COLS.items()
})
except KeyError:
self.sheet_table = None

# create secondary structure table for merging to chain tables
# (will only contain helix/H and strand/E, coil/C will need to be filled in)
Expand All @@ -517,6 +525,10 @@ def __init__(self, filehandle, keep_full_data=False):
("H", self.helix_table),
("E", self.sheet_table)
]:
# skip if secondary structure element not present in PDB file at all
if sse_table is None:
continue

for _, row in sse_table.iterrows():
assert row.beg_label_asym_id == row.end_label_asym_id
for seq_id in range(row.beg_label_seq_id, row.end_label_seq_id + 1):
Expand All @@ -527,11 +539,14 @@ def __init__(self, filehandle, keep_full_data=False):
})

# drop duplicates; there are overlapping helix segment annotations, e.g. for PDB 6cup:A:Asp92
self.secondary_structure = pd.DataFrame(
sse_raw
).drop_duplicates(
subset=["label_asym_id", "label_seq_id"]
)
if len(sse_raw) > 0:
self.secondary_structure = pd.DataFrame(
sse_raw
).drop_duplicates(
subset=["label_asym_id", "label_seq_id"]
)
else:
self.secondary_structure = None

# store information about models/chains for quick retrieval and verification;
# subtract 0 to start numbering consistently with how this was handled with MMTF
Expand Down Expand Up @@ -692,11 +707,16 @@ def get_chain(self, chain, model=0, is_author_id=True):
res.index.name = "residue_index"

# merge secondary structure information (left outer join as coil is missing from table)
res_sse = res.merge(
self.secondary_structure,
on=("label_seq_id", "label_asym_id"),
how="left"
)
if self.secondary_structure is not None:
res_sse = res.merge(
self.secondary_structure,
on=("label_seq_id", "label_asym_id"),
how="left"
)
else:
res_sse = res.assign(
sec_struct_3state=np.nan
)

res_sse.loc[
res_sse.sec_struct_3state.isnull() & (res_sse.label_seq_id > 0),
Expand Down
Loading