Skip to content

Commit

Permalink
v1.0.2
Browse files Browse the repository at this point in the history
  • Loading branch information
alek0991 committed Apr 30, 2018
1 parent 1228d6d commit 96ac20a
Show file tree
Hide file tree
Showing 5 changed files with 88 additions and 63 deletions.
2 changes: 2 additions & 0 deletions ChangeLog.md
Original file line number Diff line number Diff line change
@@ -1,2 +1,4 @@
* Version 1.0.2
- Track execution progress and fixing minor bugs.
* Version 1.0.1
- Keeps not SNPs (indels, MNPs, and etc) as long as they have only one ALT (binary alleles).
2 changes: 1 addition & 1 deletion help.txt
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ usage: isafe.py [-h] [--format FORMAT] --input INPUT --output OUTPUT
iSAFE: (i)ntegrated (S)election of (A)llele (F)avored by (E)volution
====================================================================
Source code & further instructions can be found at: https://github.com/alek0991/iSAFE
iSAFE v1.0.1
iSAFE v1.0.2
--------------------------------------------------------------------

optional arguments:
Expand Down
2 changes: 1 addition & 1 deletion src/bcftools.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,7 +131,7 @@ def get_snp_matrix(chrom, region_start, region_end, case_vcf, AA_file, cont_vcf=

total_window_size=region_end-region_start
if status:
print 'Loading %0.3fMbp, %s:%i-%i'%(total_window_size/1e6, chrom, region_start, region_end)
print 'Loading %0.3fMbp, %s:%i-%i, please wait ...'%(total_window_size/1e6, chrom, region_start, region_end)
df = get_combined_vcf(chrom, region_start, region_end, case_vcf, cont_vcf=cont_vcf, case_IDs=case_IDs, cont_IDs=cont_IDs)
dfI = get_AA_df(AA_file, df)
I = df.index.get_level_values("POS").isin(dfI.loc[dfI['FLIP']==True, "POS"])
Expand Down
17 changes: 13 additions & 4 deletions src/isafe.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,16 @@
from safeclass import SafeClass
from utils import *
from bcftools import get_snp_matrix
import time
import sys
import os
def run():
# command line parser
parser = argparse.ArgumentParser(description='===================================================================='
'\niSAFE: (i)ntegrated (S)election of (A)llele (F)avored by (E)volution'
'\n===================================================================='
'\nSource code & further instructions can be found at: https://github.com/alek0991/iSAFE'
'\niSAFE v1.0.1'
'\niSAFE v1.0.2'
'\n--------------------------------------------------------------------', formatter_class=argparse.RawTextHelpFormatter)

# optional arguments
Expand Down Expand Up @@ -143,6 +145,10 @@ def run():
if (args.RandomSampleRate>=1)|(args.RandomSampleRate<0):
raise ValueError("--RandomSampleRate must be non-negative and less than 1.")
status = not args.StatusOff
if status:
sys.stdout.write('-~'*30+'\n')
sys.stdout.write(time.ctime()+'\n')

if args.region is not None:
try:
chrom, region_start, region_end = parse_region(args.region.replace(',', ''))
Expand Down Expand Up @@ -174,7 +180,7 @@ def run():
total_window_size / 1e6, args.MaxRegionSize / 1e6))
if num_gaps>0:
if not args.IgnoreGaps:
raise ValueError("There is %i gaps with size greater than %ikbp."%(num_gaps, args.MaxGapSize/1e3))
raise ValueError("There is %i gaps with size greater than %ikbp. Set --IgnoreGaps flag to ignore gaps."%(num_gaps, args.MaxGapSize/1e3))
else:
if not args.WarningOff:
warnings.warn("Warning: There is %i gaps with size greater than %ikbp." % (num_gaps, args.MaxGapSize/ 1e3))
Expand All @@ -192,7 +198,7 @@ def run():
df_final['POS'] = snp_matrix.index
df_final[["POS", "SAFE", "DAF", "phi", "kappa"]].to_csv("%s.SAFE.out"%args.output, index=None,sep='\t')
if status:
print "SAFE Done!"
print "SAFE completed successfully."
else:
if (NumSNPs < args.MinRegionSize_ps) | (total_window_size < args.MinRegionSize_bp):
raise ValueError((
Expand All @@ -208,5 +214,8 @@ def run():
psi_k1 = obj_isafe.creat_psi_k1_dataframe()
psi_k1.index.rename("#POS", inplace=True)
psi_k1.to_csv("%s.Psi.out"%args.output, sep='\t')
if status:
sys.stdout.write(time.ctime()+'\n')
sys.stdout.write('-~' * 30+'\n')
if __name__ == '__main__':
run()
run()
128 changes: 71 additions & 57 deletions src/isafeclass.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,11 +17,19 @@ def create_rolling_indices(total_variant_count, w_size, w_step):
return rolling_indices


def creat_windows_summary_stats(snp_matrix, w_size, w_step):
def creat_windows_summary_stats(snp_matrix, w_size, w_step, step_counter='XX', status=True):
total_variant_count = snp_matrix.shape[0]
rolling_indices = create_rolling_indices(total_variant_count, w_size, w_step)
windows_stats = {}
progress0 = 0
for i, I in enumerate(rolling_indices):
progress = np.round(100.0 * i / len(rolling_indices))
if (progress>progress0):
if status:
sys.stdout.write("\r\tStep %s: %.2i%%" % (step_counter, progress))
sys.stdout.flush()
progress0 = progress

window_i_stats = {}
df = snp_matrix.iloc[I]
snp_matrix_i = df.values.T
Expand Down Expand Up @@ -65,9 +73,16 @@ def isafe_kernel(haf, snp):
p = (phi - kappa) / sigma
return p

def creat_matrix_Psi_k(M, Dw, Ifp):
def creat_matrix_Psi_k(M, Dw, Ifp, step_counter='XX', status=True):
P = np.zeros((len(Ifp), len(Dw)))
progress0 = 0
for i in range(len(Ifp)):
progress = np.round(100.0 * i / len(Ifp))
if (progress>progress0):
if status:
sys.stdout.write("\r\tStep %s: %.2i%%" % (step_counter, progress))
sys.stdout.flush()
progress0 = progress
for j in Dw.keys():
output = isafe_kernel(Dw[j]["haf"], M[:, Ifp[i]])
P[i, j] = output
Expand All @@ -78,48 +93,48 @@ def step_function(P0):
P[P < 0] = 0
return P

def apply_isafe(snp_matrix, w_size, w_step, top_k1, top_k2, status = False):
if status:
toolbar_width = 4
sys.stdout.write("[%s]" % (" " * toolbar_width))
sys.stdout.flush()
sys.stdout.write("\b" * (toolbar_width + 1))
sys.stdout.write("-")
sys.stdout.flush()
WS = creat_windows_summary_stats(snp_matrix, w_size, w_step)
if status:
sys.stdout.write("-")
sys.stdout.flush()
df_snps = creat_snps_information_df(WS)
df_top_k1 = get_top_k_snps_in_each_window(df_snps, k=top_k1)
cand_snp_k1 = np.sort(df_top_k1["ordinal_pos"].unique())
Psi_k1 = creat_matrix_Psi_k(snp_matrix.values.T, WS, cand_snp_k1)
if status:
sys.stdout.write("-")
sys.stdout.flush()
df_top_k2 = get_top_k_snps_in_each_window(df_snps, k=top_k2)
temp = np.sort(df_top_k2["ordinal_pos"].unique())
cand_snp_k2 = np.sort(np.setdiff1d(temp, cand_snp_k1))
Psi_k2 = creat_matrix_Psi_k(snp_matrix.values.T, WS, cand_snp_k2)
if status:
sys.stdout.write("-")
sys.stdout.flush()


alpha = Psi_k1.sum(0) / Psi_k1.sum()

iSAFE1 = pd.DataFrame(data={"ordinal_pos": cand_snp_k1, "isafe": np.dot(Psi_k1, alpha)})
iSAFE2 = pd.DataFrame(data={"ordinal_pos": cand_snp_k2, "isafe": np.dot(Psi_k2, alpha)})

iSAFE1["tier"] = 1
iSAFE2["tier"] = 2
iSAFE = pd.concat([iSAFE1, iSAFE2]).reset_index(drop=True)
iSAFE["id"] = snp_matrix.iloc[iSAFE["ordinal_pos"]].index
freq = snp_matrix.mean(1).values.squeeze()
iSAFE["freq"] = freq[iSAFE["ordinal_pos"].values.squeeze()]
if status:
sys.stdout.write("\niSAFE Done!\n")
return iSAFE[["ordinal_pos", "id", "isafe", "freq", "tier"]], Psi_k1
# def apply_isafe(snp_matrix, w_size, w_step, top_k1, top_k2, status = False):
# if status:
# toolbar_width = 4
# sys.stdout.write("[%s]" % (" " * toolbar_width))
# sys.stdout.flush()
# sys.stdout.write("\b" * (toolbar_width + 1))
# sys.stdout.write("-")
# sys.stdout.flush()
# WS = creat_windows_summary_stats(snp_matrix, w_size, w_stepcreat_windows_summary_stats)
# if status:
# sys.stdout.write("-")
# sys.stdout.flush()
# df_snps = creat_snps_information_df(WS)
# df_top_k1 = get_top_k_snps_in_each_window(df_snps, k=top_k1)
# cand_snp_k1 = np.sort(df_top_k1["ordinal_pos"].unique())
# Psi_k1 = creat_matrix_Psi_k(snp_matrix.values.T, WS, cand_snp_k1)
# if status:
# sys.stdout.write("-")
# sys.stdout.flush()
# df_top_k2 = get_top_k_snps_in_each_window(df_snps, k=top_k2)
# temp = np.sort(df_top_k2["ordinal_pos"].unique())
# cand_snp_k2 = np.sort(np.setdiff1d(temp, cand_snp_k1))
# Psi_k2 = creat_matrix_Psi_k(snp_matrix.values.T, WS, cand_snp_k2)
# if status:
# sys.stdout.write("-")
# sys.stdout.flush()
#
#
# alpha = Psi_k1.sum(0) / Psi_k1.sum()
#
# iSAFE1 = pd.DataFrame(data={"ordinal_pos": cand_snp_k1, "isafe": np.dot(Psi_k1, alpha)})
# iSAFE2 = pd.DataFrame(data={"ordinal_pos": cand_snp_k2, "isafe": np.dot(Psi_k2, alpha)})
#
# iSAFE1["tier"] = 1
# iSAFE2["tier"] = 2
# iSAFE = pd.concat([iSAFE1, iSAFE2]).reset_index(drop=True)
# iSAFE["id"] = snp_matrix.iloc[iSAFE["ordinal_pos"]].index
# freq = snp_matrix.mean(1).values.squeeze()
# iSAFE["freq"] = freq[iSAFE["ordinal_pos"].values.squeeze()]
# if status:
# sys.stdout.write("\niSAFE Done!\n")
# return iSAFE[["ordinal_pos", "id", "isafe", "freq", "tier"]], Psi_k1

class iSafeClass:
def __init__(self, snp_matrix, w_size, w_step, top_k1, top_k2):
Expand All @@ -138,31 +153,30 @@ def __init__(self, snp_matrix, w_size, w_step, top_k1, top_k2):
self.isafe = None

def fire(self, status=True):
step_counter = '1/3'
if status:
toolbar_width = 4
sys.stdout.write("[%s]" % (" " * toolbar_width))
sys.stdout.flush()
sys.stdout.write("\b" * (toolbar_width + 1))
sys.stdout.write("-")
sys.stdout.flush()
self.windows_summary = creat_windows_summary_stats(self.snp_matrix, self.w_size, self.w_step)
sys.stdout.write("Data loaded successfully.\nRunning iSAFE:\n")
self.windows_summary = creat_windows_summary_stats(self.snp_matrix, self.w_size, self.w_step, step_counter=step_counter, status=status)
if status:
sys.stdout.write("-")
sys.stdout.write("\r\tStep %s: 100%%\n"%(step_counter))
sys.stdout.flush()
self.snps_summary = creat_snps_information_df(self.windows_summary)
df_top_k1 = get_top_k_snps_in_each_window(self.snps_summary, k=self.top_k1)
self.ordinal_pos_snps_k1 = np.sort(df_top_k1["ordinal_pos"].unique())
self.psi_k1 = creat_matrix_Psi_k(self.snp_matrix.values.T, self.windows_summary, self.ordinal_pos_snps_k1)
step_counter = '2/3'
self.psi_k1 = creat_matrix_Psi_k(self.snp_matrix.values.T, self.windows_summary, self.ordinal_pos_snps_k1, step_counter=step_counter, status=status)
if status:
sys.stdout.write("-")
sys.stdout.write("\r\tStep %s: 100%%\n"%step_counter)
sys.stdout.flush()
df_top_k2 = get_top_k_snps_in_each_window(self.snps_summary, k=self.top_k2)
temp = np.sort(df_top_k2["ordinal_pos"].unique())
self.ordinal_pos_snps_k2 = np.sort(np.setdiff1d(temp, self.ordinal_pos_snps_k1))
self.psi_k2 = creat_matrix_Psi_k(self.snp_matrix.values.T, self.windows_summary, self.ordinal_pos_snps_k2)
step_counter = '3/3'
self.psi_k2 = creat_matrix_Psi_k(self.snp_matrix.values.T, self.windows_summary, self.ordinal_pos_snps_k2, step_counter=step_counter, status=status)
if status:
sys.stdout.write("-")
sys.stdout.write("\r\tStep %s: 100%%\n" % step_counter)
sys.stdout.flush()

self.alpha = self.psi_k1.sum(0) / self.psi_k1.sum()

iSAFE1 = pd.DataFrame(data={"ordinal_pos": self.ordinal_pos_snps_k1, "isafe": np.dot(self.psi_k1, self.alpha)})
Expand All @@ -175,7 +189,7 @@ def fire(self, status=True):
freq = self.snp_matrix.mean(1).values.squeeze()
iSAFE["freq"] = freq[iSAFE["ordinal_pos"].values.squeeze()]
if status:
sys.stdout.write("\niSAFE Done!\n")
sys.stdout.write("iSAFE completed successfully.\n")
self.isafe = iSAFE[["ordinal_pos", "id", "isafe", "freq", "tier"]]

def creat_psi_k1_dataframe(self):
Expand Down

0 comments on commit 96ac20a

Please sign in to comment.