-
Notifications
You must be signed in to change notification settings - Fork 8
/
snpedia.py
202 lines (163 loc) · 7.78 KB
/
snpedia.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
import sys,pickle,snp
from wikitools import wiki,category,page
from glob import glob
def snpedia_items(): # get all snps from snpedia
try:
snpedia_infile = open("snpedia.data","rb") # try getting cached snps
except IOError:
snpedia = snpedia_getter() # if none avail get from snpedia
else:
snpedia = pickle.load(snpedia_infile) # else load from cache
snpedia_infile.close()
return snpedia
def snpedia_getter():
site = wiki.Wiki("http://snpedia.com/api.php") # open snpedia
snps = category.Category(site, "Is_a_snp")
snpedia = {}
for article in snps.getAllMembersGen(namespaces=[0]): # get all snp-names
snpedia[article.title.lower()] = "in snpedia"
print article.title
snpedia_outfile = open("snpedia.data","wb") # save all snps to cache
pickle.dump(snpedia,snpedia_outfile)
snpedia_outfile.close()
return snpedia
def snp_compare(snpedia,my_snps): # compare my snps to snpedia
filtered_snps = []
for single_snp in my_snps:
if len(single_snp.genotype) >= 2: # only use homozygotous snps
if single_snp.genotype[0] == single_snp.genotype[1]:
if snpedia.has_key(single_snp.name):
filtered_snps.append(single_snp)
return filtered_snps
def snpedia_genotypes(my_filtered_snps): # get genotypes for snps from snpedia
try:
genotype_infile = open("genotypes.data","rb") # again, try to cache data to save load
except IOError:
genotypes = genotype_getter(my_filtered_snps)
else:
genotypes = pickle.load(genotype_infile)
genotype_infile.close()
return genotypes
def genotype_getter(my_filtered_snps):
site = wiki.Wiki("http://snpedia.com/api.php")
genotypes = {}
for single_snp in my_filtered_snps:
type_counter = 1
wikipage = page.Page(site,single_snp.name)
snp_page = wikipage.getWikiText()
while snp_page.find("geno"+str(type_counter)) != -1:
if genotypes.has_key(single_snp.name):
current_genotypes = genotypes[single_snp.name]
type_start = snp_page.find("geno"+str(type_counter))
type_start = snp_page.find("(",type_start)
type_stop = snp_page.find(")",type_start)
current_genotypes.append(str(snp_page[type_start:type_stop+1]))
genotypes[single_snp.name] = current_genotypes
else:
type_start = snp_page.find("geno"+str(type_counter))
type_start = snp_page.find("(",type_start)
type_stop = snp_page.find(")",type_start)
genotypes[single_snp.name] = [str(snp_page[type_start:type_stop+1])]
type_counter +=1
print "Got genotypes for " + str(single_snp.name)
genotype_outfile = open("genotypes.data","wb")
pickle.dump(genotypes,genotype_outfile)
genotype_outfile.close()
return genotypes
def genotype_descriptions(genotypes): # get descriptions for relevant genotypes from snpedia
try:
description_infile = open("description.data","rb") # again, try to cache data to save load...
except IOError:
descriptions = description_getter(genotypes)
else:
descriptions = pickle.load(description_infile)
description_infile.close()
return descriptions
def description_getter(genotypes):
site = wiki.Wiki("http://snpedia.com/api.php")
genotype_descriptions = {}
for single_type in genotypes:
for variant in genotypes[single_type]:
genotype_name = str(single_type)+str(variant)
print genotype_name
wikipage = page.Page(site,genotype_name)
if wikipage.exists == True:
genotype_page = wikipage.getWikiText()
if genotype_page.find("summary=") != -1:
summary_start = genotype_page.find("summary=") + 8
summary_stop = genotype_page.find("\n",summary_start)
print genotype_page[summary_start:summary_stop]
genotype_descriptions[genotype_name] = genotype_page[summary_start:summary_stop]
description_outfile = open("description.data","wb")
pickle.dump(genotype_descriptions,description_outfile)
description_outfile.close()
return genotype_descriptions
def genotype_comparer(my_filtered_snps,genotypes,descriptions):
counter = 1
bases = {'A': "T", "T" : "A", "G" : "C", "C" :"G", "I" : "I", "D" : "D"}
candidates = {}
for single_snp in my_filtered_snps:
# iterate over all SNPs from user
my_genotype = ""
if genotypes.has_key(single_snp.name) and single_snp.genotype != "--": # is this SNP called and are there genotypes for this SNP?
genotype = genotypes[single_snp.name]
for single_genotype in genotype: # iterate over all known genotypes of a SNP
if "("+single_snp.genotype[0]+";"+single_snp.genotype[1]+")" == single_genotype:
my_genotype = "("+single_snp.genotype[0]+";"+single_snp.genotype[1]+")" # check for genotype of user (or its complementary bases)
elif "("+bases[single_snp.genotype[0]]+";"+bases[single_snp.genotype[1]]+")" == single_genotype:
my_genotype = "("+bases[single_snp.genotype[0]]+";"+bases[single_snp.genotype[1]]+")"
if my_genotype != "": # if genotype of user is known: go on!
output = "-----------\nMy genotype at " + single_snp.name+": "+single_snp.genotype+" or "+my_genotype+"\n"
for single_genotype in genotype:
genotype_name = single_snp.name + single_genotype
if descriptions.has_key(genotype_name):
output = output + genotype_name +" is associated with: "+ descriptions[genotype_name]+"\n"
if output.find("associated") != -1:
print counter
print output
candidates[single_snp.name] = single_snp
counter += 1
return candidates
def freq_counter(candidates):
other_genomes = glob("other_genomes/*.txt")
variance = {}
for single_genome in other_genomes:
second_genome = {}
second_genome = snp.reader_dict(single_genome)
for my_snp in candidates:
if second_genome.has_key(my_snp):
if candidates[my_snp].other_genotypes.has_key(second_genome[my_snp].genotype):
candidates[my_snp].other_genotypes[second_genome[my_snp].genotype] += 1
else:
candidates[my_snp].other_genotypes[second_genome[my_snp].genotype] = 1
print candidates[my_snp].other_genotypes
for single_snp in candidates:
if variance.has_key(len(candidates[single_snp].other_genotypes)):
variance[len(candidates[single_snp].other_genotypes)] += 1
else:
variance[len(candidates[single_snp].other_genotypes)] = 1
if len(candidates[single_snp].other_genotypes) > 3:
print single_snp + ": " + str(candidates[single_snp].other_genotypes)
print variance
def main():
print "Start getting SNPs from snpedia.com"
snpedia_snps = snpedia_items() # get snps as hash: the snp-names are saved as keys
print "Got " +str(len(snpedia_snps)) + " SNPs from snpedia.com\n"
print "Start getting SNPs from user"
my_snps = snp.reader("genome_Bastian_Greshake_Full_20110503120911.txt") # get snps of user, saved as a list
print "Got " + str(len(my_snps)) + " SNPs from user\n"
print "Start filtering SNPs"
my_filtered_snps = snp_compare(snpedia_snps,my_snps) # get homozygotous snps of user which are in snpedia, saved as list
print "Got " +str(len(my_filtered_snps)) + " SNPs that are available on SNPedia & are homozygotous\n"
print "Start getting genotypes"
genotypes = snpedia_genotypes(my_filtered_snps) # get different genotypes for each snp, saved as hash. key = snp, value = list of genotypes
print "Got " +str(len(genotypes)) + " SNPs from SNPedia which have known genotypes\n"
print "Start getting genotype descriptions"
descriptions = genotype_descriptions(genotypes) # get different descriptions for each genotype, saved as hash. key = genotype name, value = description
print "Got "+ str(len(descriptions)) +" genotypes with a sufficient description\n"
print "Start getting relevant genotypes\n"
candidates = genotype_comparer(my_filtered_snps,genotypes,descriptions)
print "Got relevant genotypes\n"
print "Start counting frequency of candidates\n"
freq_counter(candidates)
main()