-
Notifications
You must be signed in to change notification settings - Fork 0
/
scoutToIMPXFDR.py
166 lines (143 loc) · 7.73 KB
/
scoutToIMPXFDR.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
#!/usr/bin/env python3
# Scout Result file to MS Annika Result file converter
# 2022 (c) Micha Johannes Birklbauer
# https://github.com/michabirklbauer/
# micha.birklbauer@gmail.com
import argparse
import pandas as pd
import traceback as tb
__version = "1.0.0"
__date = "2023-10-17"
"""
DESCRIPTION:
A script to convert Scout *.csv result files to MS Annika format as
Microsoft Excel worksheets for usage with IMP-X-FDR (v1.1.0).
USAGE:
scoutToIMPXFDR.py f [f ...]
[-o OUTPUT]
[-xl CROSSLINKER]
[-xlmod CROSSLINKER_MODIFICATION]
[-h]
[--version]
positional arguments:
f Scout result file to process, if second filename
is given it will be used as the output name!
optional arguments:
-h, --help show this help message and exit.
-o OUTPUT, --output OUTPUT
Name of the output file.
-xl CROSSLINKER, --crosslinker CROSSLINKER
Name of the Crosslinker e.g. DSSO.
-xlmod CROSSLINKER_MODIFICATION, --crosslinker-modification CROSSLINKER_MODIFICATION
Residue that the Crosslinker binds to e.g. K for DSSO.
--version show program's version number and exit.
"""
#### MS Annika Result columns mapping ####
# Checked: (bool) TRUE | FALSE -> create with FALSE
# Crosslinker: (string) e.g. DSSO -> create with crosslinker name
# Crosslink Type: (string selection) Intra | Inter -> mapped "Link-Type"
# # CSMs: (int) -> mapped "CSM count"
# # Proteins: (int) -> create with zeros
# Sequence A: (string) e.g. [K]SSAAR -> mapped "Alpha peptide"
# Accession A: (string) e.g. P0A7X3 -> mapped "Alpha protein mapping(s)"
# Position A: int -> mapped "Alpha peptide position"
# Sequence B: (string) -> ^
# Accession B: (string) -> ^
# Position B: (int) -> ^
# Protein Descriptions A: (string) -> mapped "Alpha protein mapping(s)"
# Protein Descriptions B: (string) -> ^
# Best CSM Score: (double) -> mapped "Score"
# In protein A: (int) -> create with zeros
# In protein B: (int) -> create with zeros
# Decoy: (bool) TRUE | FALSE -> create with FALSE
# Modifications A: (string) e.g. K1(DSSO);M1(Oxidation) -> create with xl name and modification
# Modifications B: (string) -> create with xl name and modification
# Confidence: (string selection) High | Medium | Low -> create with High
# function that returns pandas dataframe in annika format
def create_annika_result(scout_filename: str, crosslinker: str = "DSSO", crosslinker_aa: str = "K") -> pd.DataFrame:
if len(crosslinker_aa) != 1:
raise Exception("Crosslinker modifications that affect more than one amino acid are not supported! Exiting...")
# load file
scout_df = pd.read_csv(scout_filename)
nrows = scout_df.shape[0]
# columns
Checked = ["FALSE" for i in range(nrows)]
Crosslinker = [crosslinker for i in range(nrows)]
Crosslink_Type = scout_df["Link-Type"].apply(lambda x: "Intra" if "intra" in x.lower() else "Inter").tolist()
CSMs = scout_df["CSM count"].tolist()
Proteins = [0 for i in range(nrows)]
Sequence_A = scout_df["Alpha peptide"].apply(lambda x: x.replace(" ", "")).tolist()
Accession_A = scout_df["Alpha protein mapping(s)"].tolist()
Position_A = scout_df["Alpha peptide position"].tolist()
Sequence_B = scout_df["Beta peptide"].apply(lambda x: x.replace(" ", "")).tolist()
Accession_B = scout_df["Beta protein mapping(s)"].tolist()
Position_B = scout_df["Beta peptide position"].tolist()
Protein_Descriptions_A = scout_df["Alpha protein mapping(s)"].tolist()
Protein_Descriptions_B = scout_df["Beta protein mapping(s)"].tolist()
Best_CSM_Score = scout_df["Score"].tolist()
In_protein_A = [0 for i in range(nrows)]
In_protein_B = [0 for i in range(nrows)]
Decoy = ["FALSE" for i in range(nrows)]
Modifications_A = scout_df["Alpha peptide position"].apply(lambda x: crosslinker_aa + str(x) + "(" + crosslinker + ")").tolist()
Modifications_B = scout_df["Beta peptide position"].apply(lambda x: crosslinker_aa + str(x) + "(" + crosslinker + ")").tolist()
Confidence = ["High" for i in range(nrows)]
# create annika dataframe
annika_df = pd.DataFrame({"Checked": Checked,
"Crosslinker": Crosslinker,
"Crosslink Type": Crosslink_Type,
"# CSMs": CSMs,
"# Proteins": Proteins,
"Sequence A": Sequence_A,
"Accession A": Accession_A,
"Position A": Position_A,
"Sequence B": Sequence_B,
"Accession B": Accession_B,
"Position B": Position_B,
"Protein Descriptions A": Protein_Descriptions_A,
"Protein Descriptions B": Protein_Descriptions_B,
"Best CSM Score": Best_CSM_Score,
"In protein A": In_protein_A,
"In protein B": In_protein_B,
"Decoy": Decoy,
"Modifications A": Modifications_A,
"Modifications B": Modifications_B,
"Confidence": Confidence})
return annika_df
# read Scout result and write MS Annika result (in xlsx format)
def main() -> pd.DataFrame:
parser = argparse.ArgumentParser()
parser.add_argument(metavar = "f",
dest = "files",
help = "Scout result file to process, if second filename is given it will be used as the output name!",
type = str,
nargs = "+")
parser.add_argument("-o", "--output",
dest = "output",
default = None,
help = "Name of the output file.",
type = str)
parser.add_argument("-xl", "--crosslinker",
dest = "crosslinker",
default = "DSSO",
help = "Name of the Crosslinker e.g. DSSO.",
type = str)
parser.add_argument("-xlmod", "--crosslinker-modification",
dest = "crosslinker_modification",
default = "K",
help = "Residue that the Crosslinker binds to e.g. K for DSSO.",
type = str)
parser.add_argument("--version",
action = "version",
version = __version)
args = parser.parse_args()
input_file = args.files[0]
output_file = args.files[0].split(".csv")[0] + ".xlsx"
if len(args.files) > 1:
output_file = args.files[1].split(".xlsx")[0] + ".xlsx"
if args.output is not None:
output_file = args.output.split(".xlsx")[0] + ".xlsx"
scout_resultdf = create_annika_result(input_file, args.crosslinker, args.crosslinker_modification)
scout_resultdf.to_excel(output_file, sheet_name = "Crosslinks", index = False, engine = "xlsxwriter")
return scout_resultdf
if __name__ == "__main__":
scout_df = main()