From cc0e95b4364f0968d89f70cd7321fb967532a383 Mon Sep 17 00:00:00 2001 From: Karl Kumbier Date: Mon, 29 Apr 2024 14:46:47 -0700 Subject: [PATCH] updates for paper revision --- README.md | 46 +- paper_figures/Dockerfile | 33 + paper_figures/Figure_ASO/.DS_Store | Bin 10244 -> 8196 bytes paper_figures/Figure_ASO/aso_models.R | 509 ++++++++------- paper_figures/Figure_ASO/raw_fus_phenotype.R | 312 +++++----- paper_figures/Figure_Scores/.DS_Store | Bin 10244 -> 10244 bytes paper_figures/Figure_Scores/README.md | 33 + .../Figure_Scores/fig_coordinate_space.R | 104 ++++ .../Figure_Scores/fig_model_comparison.R | 114 ++++ paper_figures/Figure_Scores/fig_qc_pca.R | 123 ++++ paper_figures/Figure_Scores/fig_qc_plates.R | 178 ++++++ paper_figures/Figure_Scores/fig_scores.R | 588 ++++++++++++++++++ paper_figures/Figure_Scores/run_models.sh | 16 + .../Figure_Scores/train_fus_models.R | 73 +++ paper_figures/Figure_Search/.DS_Store | Bin 10244 -> 10244 bytes paper_figures/Figure_Search/README.md | 38 ++ .../Figure_Search/fig_marker_tuning.R | 43 +- .../Figure_Search/fig_pred_by_marker.R | 60 ++ paper_figures/Figure_Search/run.sh | 4 + .../Figure_Search/train_fus_screen.R | 119 ++-- .../Figure_Search/train_marker_screens.R | 129 ++-- .../Figure_Transcriptomics/.DS_Store | Bin 10244 -> 10244 bytes .../Figure_Transcriptomics/README.md | 48 ++ .../Figure_Transcriptomics/fig_age_cor.R | 90 +++ .../Figure_Transcriptomics/fig_fibro.R | 332 ++++++++++ .../Figure_Transcriptomics/fig_spine.R | 119 ++++ .../Figure_Transcriptomics/fit_fibro.R | 101 +++ .../Figure_Transcriptomics/fit_spine.R | 133 ++++ .../Figure_Transcriptomics/load_fibroblast.R | 99 ++- .../Figure_Transcriptomics/load_spine.R | 93 ++- .../Figure_Transcriptomics/pca_analysis.R | 36 +- .../Figure_Transcriptomics/utilities.R | 253 ++++---- paper_figures/color_palette.R | 61 +- paper_figures/setup.R | 41 ++ paper_figures/theme.R | 4 + paper_figures/utilities.R | 296 +++++---- preprocessing/make_eigenfeatures.R | 125 ++++ preprocessing/make_eigenfeatures_well.R | 150 +++++ preprocessing/utilities.R | 60 ++ 39 files changed, 3593 insertions(+), 970 deletions(-) create mode 100644 paper_figures/Dockerfile create mode 100644 paper_figures/Figure_Scores/README.md create mode 100644 paper_figures/Figure_Scores/fig_coordinate_space.R create mode 100644 paper_figures/Figure_Scores/fig_model_comparison.R create mode 100644 paper_figures/Figure_Scores/fig_qc_pca.R create mode 100644 paper_figures/Figure_Scores/fig_qc_plates.R create mode 100644 paper_figures/Figure_Scores/fig_scores.R create mode 100644 paper_figures/Figure_Scores/run_models.sh create mode 100644 paper_figures/Figure_Scores/train_fus_models.R create mode 100644 paper_figures/Figure_Search/README.md create mode 100644 paper_figures/Figure_Search/fig_pred_by_marker.R create mode 100644 paper_figures/Figure_Search/run.sh create mode 100644 paper_figures/Figure_Transcriptomics/README.md create mode 100644 paper_figures/Figure_Transcriptomics/fig_age_cor.R create mode 100644 paper_figures/Figure_Transcriptomics/fig_fibro.R create mode 100644 paper_figures/Figure_Transcriptomics/fig_spine.R create mode 100644 paper_figures/Figure_Transcriptomics/fit_fibro.R create mode 100644 paper_figures/Figure_Transcriptomics/fit_spine.R create mode 100644 paper_figures/setup.R create mode 100644 paper_figures/theme.R create mode 100644 preprocessing/make_eigenfeatures.R create mode 100644 preprocessing/make_eigenfeatures_well.R create mode 100644 preprocessing/utilities.R diff --git a/README.md b/README.md index 8e8ae63..369e969 100755 --- a/README.md +++ b/README.md @@ -1,15 +1,41 @@ +# FUS-ALS Molecular ALS Phenotype Scores This directory contains scripts used to process and analyze screens from the ALS -patient-derived fibroblast study. Analyses for each screen are organized within -individual directories corresponding to Figures 2-4 of the manuscript -Figure_Scores=Fig. 2, Figure_ASO=Fig. 3, Figure_Transcriptomics=Fig. 4, -Figure_Search=Extended data Figs. E1-E3. +patient-derived fibroblast study. We provide a Dockerfile to install all +dependencies for running analyses reported in Kumbier et al. 2024. -Data required to run the scripts can be downloaded from Zenodo: -https://zenodo.org/record/7247995. To run scripts, Initialize `.Renviron` -variables: +``` + docker build -t als/als . + docker run -it --rm -v :/ALS als/als +``` -- `ALS_PAPER`=`` +The core analysis scripts used for each screen / analysis include: -- `ALS_DATA`=`` +## General scripts +- `preprocessing`: directory containing scripts for processing raw image-derived +features into the selected set of ALS-relevant eigenfeatures. Proecsses image +features are available in the `data_profiles` directory of the zenodo +repository. Raw image features and images are available upon request. -The `data` directory on Zenodo must be placed in this directory. +- `utilities.R`: script containg functions for fitting models along with helper +functions for data processing and visualization. + +- `color_palette.R`: sets color palette used in figures. + +## Analysis specific scripts +Scripts used to generate figures for the paper have been copied to +`paper_figures` and are for the most part organized by figure. + +- `Figure_Search`: analyses of imaging marker set search—reported in + supplemental section. + +- `Figure_Scores`: analyses for image-based scores (i-MAP scores). + +- `Figure_Transcriptomics`: analyses for transcriptomic-based scores (t-MAP +scores). + +- `Figure_ASO`: analyses from ASO screen. *Note:* some of the sporadic figures +are generated in `Figure_Transcriptomics` + +## Data +Data for image-based cell profiles are contained in `data_profiles`, organized +by screen. All other data can be found in `data`. diff --git a/paper_figures/Dockerfile b/paper_figures/Dockerfile new file mode 100644 index 0000000..0368929 --- /dev/null +++ b/paper_figures/Dockerfile @@ -0,0 +1,33 @@ +## Base docker image +FROM ubuntu:latest +ENV DEBIAN_FRONTEND noninteractive + +RUN apt-get update && \ + apt-get install -y r-base \ + libssl-dev \ + libcurl4-gnutls-dev \ + libxml2-dev \ + libfontconfig1-dev \ + libharfbuzz-dev \ + libfribidi-dev \ + libfreetype6-dev \ + libpng-dev \ + libtiff5-dev \ + libjpeg-dev \ + libgit2-dev \ + git \ + curl \ + python3-venv + +RUN apt install -y cmake +## TODO: install python for keras + +## Initialize Renviron varaibles +RUN touch ~/.Renviron +RUN echo "ALS_PAPER=/ALS/" >> ~/.Renviron +RUN echo "ALS_DATA=/ALS/data_profiles/" >> ~/.Renviron + +## run the script +COPY ./setup.R /setup.R +RUN Rscript /setup.R + diff --git a/paper_figures/Figure_ASO/.DS_Store b/paper_figures/Figure_ASO/.DS_Store index c522d7d9a6851abcb578234850ce8249ced3c78a..f7a9fbe724b041077adc29609628115322f4b282 100644 GIT binary patch literal 8196 zcmeHMYitx%6h3EK+a69oiLqQcV@RB zsi`s1hy;B6FeZi{XhMt$MvM{v5;4X{xOOPbCY|| zz2|Z7+;hJ(XYVWkU~67)1gHi8MpaN=M$I%u$VHt}lz=1bkn915!2ul%u)v+79Wp`> zgdPYz5PBf=KcPI-I4z!ZZk21;|1$Ami3ctFP_mF9rb957fJgBuF^tCL;K zR|kws8n&SaLJv&!fXMyjkfQs#1IbDE_j=gVX*rp+<)leRgw;D#>#zHTdH^Lb045Ah z*23uYTImrt8_abfowZEYYiRfam6FmK_snE7Ss6Q+7;;7uUdqe6t*P98G3s_4D_!iT z(tAx~B+2KsIkuNFZ6oI!n7TnK2YO9gcSc&Bysf*wZ9)o|%G4yUK6b3Jxh}STUE^q7 z>{#QvwM})gjSbDCqbi$UyJqvAeI0t{X;P$9=R+83x7?AaP@Ok|wltS|fJ z)!h>$ZlO)?Q@K{$=U2sjs#;kj(cPmZ`J4i6rYq~2j_vL>axT#%uXJhCof*g8oil{I zH{)3OVcSje*_o7;$y+JUXtAt>`K00YdBzb>^!i;V=e7BYeJ0roUXt^?YwR(|z$_T; zWae^I&y~+#u()=`>b1>VS~_+fX@4l$$YZ-LwaL z3#MzPEu-5`4IBKds*dwnl@*byNDbq)tS_H7pU9iuQJpkM=VnSW& zFN`ciTU!=cuJRP?cIlDj6sxp4mAh*9VLDD}x<+ecTvvOnRNAm4YrWRQ z_%5}#ApItp8@0`BHrA!%zJYJyJNO=c zf?we`_#OVh5-i6GoP`Up7MJ4vxE$+n12*Ay z92Z+<3H6Hc;TD2uSB&YRAlk8Q`yGmcugfLw=gynINCd-%O%L6`j6_)xSQ7#3V|3CX z`WW@&;#P1)s;X-i+^a=<2*k#b+l3ULDd5V+h4$i@7UeSt=5n}n84d3th|A&1RnaI{ zD6-_Reoa*4B@{QRZ>z3~YC*TDS<`qa#g95(Q&k<&wrE>b3R(`nV!VZ>5y9{eV0i}K zhIipZxJ0o0k^p%XuEC#h9cN%AR$&b;#3(Kz7_P(z2#TwHC~m^dxCJ{1j-5UnyKyh} z;XVRo26Z&hCP?N2P(FoY_%uF)C-5Y`h%e#G_zJ#`Z{hh{px0Rhouv5QP3R3f?d?IL zVPmcH4xi$eS|&>MzmaB#CcM_KiJVsibRm(mEyo@tj)4DM$WhW9p_DlQc}kv13f&YGp_R0Q9@juXA{havT2fh-=-F-b{6bCfLTDGa4!Nhc3(_F#Kg!wI4yoxF&D$qII*x^ClQS6Yst<|&4`3GBaC{~ z)LQYa{mXvvi;#jD`Z8zGJCmW8LxzY-hNt%qOJ;5r9yMdm94>@onrd2gb>9J(lRIyI z9-qhO^G^pxjM0Dc$a@2A)ICc|{6Yelk6ZU(TYn;rc+th{o)?8?Q#wYRqIiTWF4uTIdg{g?fa1skg6R z2#O1nxSH;mr5d`~r^HQ^L2-eJtL}^#`i{6l>-!^ymWb+RP%Mgsv`9h=SxT#>1=Qyi zbHGyeTl8ztG~!mf-EcrfU0N0t#e}KsR8T=pDt=TmMev+1ShQsMit4&`8#cFZ-M(X@ z@R0>#kx(oQB38Pk4yj6fSICSgx~%HMy`!pZjr1o~Qw?iMj~kBpOe(vayWGB19J{ov?Tl2_VIYue)qrm~~3 zQd%wan79sJ46%kaLaorV4>#E(nyjQ=Xq0-jP*~ACqRhr=5}Kv{Bm>hUcCo^ZLa{V} zTNTA6njTj6gw!fMLx5VTv{k~N?pA84L@j( zgyKr~D6((DDevZad*iU2_SPY*mN8iwpp~MkCsiaiZZYFGU*%iHNHqAOmfXoFB@}UgM zL4rzn6zZV?8sTwhg*Na*2RsD<=z%`yhrKWa!=S>mV8R$2gcsmNcp2V+H{mUK7f!+{ zI1OjuJY0fL;4)l=FX0+|4cFlp_!Vx!U5?{&xLmH3E8~`NwcL6(aV}*!yI4Mza?SXa zvK%w%90XiwCi|%HcWn8e?BmA*+aeYgmn>RZwQ5ZR+Q}qy7}ZkjO6ROi)M*=O6Vh?m$yRQ{EU_~c6|HU9$rX{me`3BzPLi|a0{Ik) z`8~N#Zjw9XF48d{ilEdcWF;(xm9Pp{Lp7{{CRh*6umP#r0b5`zbV3&rvlpp30D}+# z85GdLfEdKVf=f;yhTre_AO6Vx&%&F3 zx=j6`L&~TjXQ_>iq#yQoXy=YzG7#4_Lm$QxO8r9S5z#A0&}1$2#MDP?=p`c=UXr?+ zbRV2WGtG0JPZ`Vu^PFYqIqT3_w$wLeah!*7HTcZxIO%G$tZ}2zK9dbHvbG36wS^8WX2Fl5Ho`fcd`F;Hq4=ibj+aS+#lG( z9JJE0yLI#JQ=6C-%t>>~LjIX%I+jp!5hpsAnPy;`H;L&C_;J+!((%h^on}EFJ#Y&u z9UE(2iw3m5C5r*QLQ%vP{zmz{&vvu0J88fip?4d98U`c z>{+;pi-3#3!-+r+o0gZ}|98y(|Np}|d3RPW0xkmo3IdSp5BS@#t5n3D*(<$k_u{)3 zU;5B?<4oErn9wG)+`f*-484xG?% - dplyr::rename(CellLine=`Cell line`) %>% - mutate(Sex=toupper(Sex)) %>% - dplyr::rename(Genetics=`Disease Gene`) %>% - mutate(Genetics=ifelse(Genetics == 'WT', 'Healthy', Genetics)) %>% - mutate(Genetics=ifelse(Genetics == 'FUS', 'FUS-ALS', Genetics)) %>% - mutate(Genetics=str_replace_all(Genetics, 'ALS with no known mutation', 'sporadic')) +params <- str_c(region, "_", marker, ":", marker.select) +output.fig.dir <- file.path(figure.dir, "fig") +dir.create(output.fig.dir, recursive = TRUE, showWarnings = FALSE) + +load(file.path(data.dir, str_c("profiles_", marker, ".Rdata"))) # Initialize models to be used for supervised learning -model <- function(x, y) irf(x, y, n.core=n.core) +model <- function(x, y) irf(x, y, n.core = n.core) model_predict <- predict_irf -importance_norm <- function(x) as.matrix(x / max(x)) -#' ## Loading data -#' Our dataset consists of phenotypic profiles derived from `r marker`. Each -#' observation in the dataset corresponds to a single `r type`. We subset to -#' features as indicated by `cytosolic` and `fus.only` variables. +# " ## Loading data +# " Our dataset consists of phenotypic profiles. Each +# " observation in the dataset corresponds to a single cell. #+load_data # Load in dataset for select marker set -load(file.path(data.dir, str_c('profiles_', marker, '.Rdata'))) - -x <- mutate(x, PlateID=str_remove_all(PlateID, '.*plate_')) %>% - mutate(ID=str_c(ID, 1:n())) %>% - dplyr::rename(ASO=ASOtreatment) %>% - mutate(ASO=str_remove_all(ASO, 'EP_')) %>% - mutate(ASO=str_replace_all(ASO, 'ASO', ' ASO')) %>% - mutate(ASO=str_replace_all(ASO, 'NTC', 'NTC 10uM')) %>% - mutate(ASO=str_replace_all(ASO, 'low ASO', 'ASO 1uM')) %>% - mutate(ASO=str_replace_all(ASO, 'high ASO', 'ASO 10uM')) %>% - mutate(ASO=factor(ASO, treatment.levels)) %>% - dplyr::select(-Genetics) %>% - left_join(xcell, by='CellLine') +x <- mutate(x, PlateID = str_remove_all(PlateID, ".*plate_")) %>% + mutate(ID = str_c(ID, seq_len(n()))) %>% + dplyr::rename(ASO = ASOtreatment) %>% + mutate(ASO = str_remove_all(ASO, "EP_")) %>% + mutate(ASO = str_replace_all(ASO, "ASO", " ASO")) %>% + mutate(ASO = str_replace_all(ASO, "NTC", "NTC 10uM")) %>% + mutate(ASO = str_replace_all(ASO, "low ASO", "ASO 1uM")) %>% + mutate(ASO = str_replace_all(ASO, "high ASO", "ASO 10uM")) %>% + mutate(ASO = factor(ASO, treatments)) # Initialize gene/cell line key genetic.key <- dplyr::select(x, CellLine, Genetics) %>% distinct() -mutation.key <- setNames(xcell$`Figure Names`, xcell$CellLine) # Initialize channel / region IDs -channels <- sapply(str_split(colnames(x), '\\.\\.'), '[', 3) +channels <- sapply(str_split(colnames(x), "\\.\\."), "[", 3) +channels[is.na(channels)] <- 0 -regions <- sapply(str_split(colnames(x), '\\.\\.'), '[', 2) -regions <- lapply(regions, str_split, pattern='_') +regions <- sapply(str_split(colnames(x), "\\.\\."), "[", 2) +regions <- lapply(regions, str_split, pattern = "_") regions <- sapply(regions, function(z) tail(z[[1]], 1)) +regions[is.na(regions)] <- "X" + +# Filter to selected region / marker +id <- rep(TRUE, ncol(x)) +if (region == "cytosolic") id <- id & regions %in% c("X", "N", "NN") +if (marker.select == "FUS") id <- id & channels %in% c("0", "2") +if (marker.select == "CD63") id <- id & channels %in% c("0", "2") +if (marker.select == "EEA1") id <- id & channels %in% c("0", "3") +x <- dplyr::select_if(x, id) -# Initialize and (optionally) drop FUS nuclear features -channels <- str_split(channels, '') -regions <- str_split(regions, '') +# " ## ASO models +# " We evaluate the degree to which ASO treatment "pushes" cell lines (disease +# " to health, health to disease) in the context of supervised models trained +# " to discriminate between ALS/FUS. Models are trained on DMSO treated cell +# " lines and evaluated on lines treated with (i) H2O (ii) NTC (iii) low ASO +# " and (iv) high ASO. +#+ leave_cell_out, fig.height = 12, fig.width = 16 +aso_fit <- function( + x, + model = irf, + model_predict = predict_irf) { + # Check for valid input + if (nrow(x) == 0) { + return(NULL) + } -nuc.fus <- mapply(function(ch, rg) { - any(ch == 2 & rg == 'D') | any(ch == 2 & rg == 'C') -}, channels, regions) + # Rank-normalize features within plate + ecdf <- function(x) rank(x) / length(x) + xfeat <- ungroup(x) %>% + dplyr::select(matches("(^X|PlateID)")) %>% + group_by(PlateID) %>% + mutate_if(is.numeric, ecdf) %>% + ungroup() %>% + dplyr::select(-PlateID) -nuc.fus[is.na(nuc.fus)] <- FALSE -if (cytosolic) x <- dplyr::select_if(x, !nuc.fus) + x <- dplyr::select(x, -matches("^X")) + x <- cbind(x, xfeat) -# Initialize and (optionally) drop EEA1 features -channels <- sapply(str_split(colnames(x), '\\.\\.'), '[', 3) -channels[is.na(channels)] <- 0 -if (fus.only) x <- dplyr::select_if(x, !str_detect(channels, '3')) - - -#' ## ASO models -#' We evaluate the degree to which ASO treatment "pushes" cell lines (disease -#' to health, health to disease) in the context of supervised models trained -#' to discriminate between ALS/FUS. Models are trained on DMSO treated cell -#' lines and evaluated on lines treated with (i) H2O (ii) NTC (iii) low ASO -#' and (iv) high ASO. -#+ leave_cell_out -aso_fit <- function(x, - model=irf, - model_predict=predict_irf) { - - # Check for valid input - if (nrow(x) == 0) return(NULL) - # Set feature matrix and reponse vector - y <- as.numeric(x$Genetics != 'Healthy') - + y <- as.numeric(x$Genetics != "Healthy") + # Downsample for class balance xds <- downSample(x, as.factor(y)) - xx <- dplyr::select(xds, matches('^X')) + xx <- dplyr::select(xds, matches("^X")) y <- as.numeric(xds$Class) - 1 - + # Set training/test indices id.train <- which(xds$ASO == train.condition) id.test <- which(xds$ASO != train.condition) fit <- fit_model(xx, y, id.train, model, model_predict) - + predicted <- data.frame( - Ypred=fit$ypred, - Ytest=y[id.test], - Genetics=xds$Genetics[id.test], - CellLine=xds$CellLine[id.test], - Treatment=xds$ASO[id.test], - WellID=xds$WellID[id.test], - ID=xds$ID[id.test], - Marker=marker, - Plate=xds$PlateID[id.test] + Ypred = fit$ypred, + Ytest = y[id.test], + Genetics = xds$Genetics[id.test], + CellLine = xds$CellLine[id.test], + Treatment = xds$ASO[id.test], + WellID = xds$WellID[id.test], + ID = xds$ID[id.test], + Marker = marker, + Plate = xds$PlateID[id.test] ) - - return(list(fit=fit, predicted=predicted)) + + return(list(fit = fit, predicted = predicted)) } @@ -173,111 +155,198 @@ aso_fit <- function(x, set.seed(47) cell.models <- aso_fit(x, model, model_predict) -#' ### Fig 3c -#' Model prediction scores by genetics, treatment. -#+ raw_predictions, fig.height=8, fig.width=12 -if (cytosolic & fus.only) { - main <- 'FUS only,\nexcluding nuclear FUS features' -} else if (cytosolic) { - main <- 'FUS & EEA1,\nexcluding nuclear FUS features' -} else if (fus.only) { - main <- 'FUS only,\nall features' -} else { - main <- 'FUS & EEA1,\nall features' -} +# " ### Predictions by genetics +# " Model prediction scores by genetics, treatment. +#+ raw_predictions, fig.height = 8, fig.width = 12 +main <- str_c(marker.select, ", ", region) -# Comparisons of raw predictions -xplot <- filter(cell.models$predicted, !Treatment %in% train.condition) %>% +xplot <- filter(cell.models$predicted) %>% group_by(Genetics, CellLine, Treatment) %>% - summarize(Ypred=mean(Ypred), .groups='drop') %>% - mutate(ChannelS='FUS, EEA1') %>% - filter(Treatment != 'untreated') %>% - mutate(Treatment=factor(Treatment, levels=levels(Treatment)[3:5])) - -comparison <- list(c("Healthy", "FUS-ALS")) - -p <- ggboxplot(xplot, x="Genetics", fill="Genetics", y="Ypred", facet.by='Treatment', - outlier.shape=NA, coef=NULL) + - stat_boxplot(coef=NULL, aes(fill=Genetics)) + - geom_jitter(size=3, width=0.1) + - stat_compare_means(comparisons=comparison, label="p.signif", size=7, p.adjust.method='bonferroni') + - ylab(NULL) + - xlab(NULL) + - scale_fill_manual(values=col.pal) + + summarize(Ypred = mean(Ypred), .groups = "drop") + +fout <- str_c("pred_", params, ".pdf") +fout <- file.path(output.fig.dir, fout) + +if (save.fig) pdf(file = fout, height = 8, width = 8) +ggplot(xplot, aes(x = Treatment, y = Ypred)) + + geom_boxplot(aes(fill = Genetics)) + + scale_fill_manual(values = col.pal) + ylim(0:1) + - theme_ipsum( - plot_title_size=28, - axis_title_size=24, - strip_text_size=20, - axis_text_size=20, - base_size=20, - base_family='sans', - axis=TRUE, - axis_col='#000000', - grid=FALSE - ) + - theme(legend.position='none') + - theme(axis.text.x=element_blank()) + ggtitle(main) +if (save.fig) dev.off() -pdf(file.path(output.fig.dir, 'fig5c.pdf'), height=8, width=8) -plot(p) -dev.off() - -#' ### Fig 3d -#' Model prediction scores by cell line, treatment. -#+ raw_predictions_cell_line, fig.height=8, fig.width=12 -# Raw predictions -p <- filter(cell.models$predicted, !Treatment %in% train.condition) %>% - mutate(Treatment=factor(Treatment, levels=treatment.levels)) %>% +##################### +# Genetic predictions +##################### +# Average predictions by well and normalize relative to NTC +xplot <- filter(cell.models$predicted, !Treatment %in% train.condition) %>% group_by(Genetics, CellLine, Treatment, WellID, Plate) %>% - summarize(Ypred=mean(Ypred), Count=n(), .groups='drop') %>% - filter(Treatment != 'untreated') %>% - mutate(CellLineL=mutation.key[CellLine]) %>% - ggplot(aes(x=Treatment, y=Ypred, fill=Genetics)) + - geom_boxplot(outlier.shape=NA, coef=NULL) + - geom_jitter(width=0.25) + - facet_wrap(~CellLineL, nrow=2) + + summarize(Ypred = mean(Ypred), .groups = "drop") %>% + filter(Treatment != "untreated") %>% + group_by(CellLine) %>% + mutate(Ypred = Ypred / mean(Ypred[Treatment == "NTC 10uM"])) %>% + ungroup() + +# Initialize plot range +xrange <- group_by(xplot, Genetics, Treatment) %>% + summarize(Mean = mean(Ypred), SD = sd(Ypred)) %>% + mutate(MaxRange = Mean + SD) %>% + mutate(MinRange = Mean - SD) + +xrange <- c(min(xrange$MinRange), max(xrange$MaxRange)) +eps <- diff(xrange) / 10 + +# Compute significance levels for treatment effect +lme.test <- lapply(unique(xplot$Genetics), function(g) { + xg <- filter(xplot, Genetics == g) + mixed <- lmerTest::lmer(Ypred ~ Treatment + (1 | CellLine), data = xg) + out <- summary(mixed)$coefficients[3, 5] + out <- data.frame(Treatment = "ASO 10uM", p = out) %>% + mutate(Treatment = str_remove_all(Treatment, "Treatment")) %>% + mutate(Genetics = g) + return(out) +}) + +test.plot <- rbindlist(lme.test) %>% + mutate(padj = p.adjust(p, method = correction)) %>% + mutate(padj = signif(padj, 3)) %>% + mutate(x = ifelse(Treatment == "ASO 1uM", 1, 2)) %>% + mutate(x = 2) %>% + mutate(y = max(xrange) - eps) %>% + mutate(y = ifelse(Treatment == "ASO 1uM", y + eps / 2, y)) %>% + mutate(xend = x + 1) %>% + mutate(yend = y) %>% + mutate(xtext = 2) %>% + mutate(ytext = y + eps / 4) %>% + mutate(x = 1) %>% + mutate(plab = label_pval(padj)) %>% + filter(Genetics %in% genetics.keep) + +p <- filter(xplot, Genetics %in% genetics.keep) %>% + ggbarplot( + x = "Treatment", + fill = "Genetics", + y = "Ypred", + facet.by = "Genetics", + merge = TRUE, + add = "mean_se", + error.plot = "upper_errorbar" + ) + + geom_text( + data = filter(test.plot, str_detect(Treatment, "1uM")), + aes(x = xtext, y = ytext, label = plab), size = 8 + ) + + geom_text( + data = filter(test.plot, str_detect(Treatment, "10uM")), + aes(x = xtext, y = ytext, label = plab), size = 8 + ) + + geom_segment( + data = test.plot, + aes(x = x, y = y, xend = xend, yend = yend) + ) + ylab(NULL) + xlab(NULL) + - theme(legend.position='none') + - theme(axis.text.x=element_text(angle=90)) + - scale_fill_manual(values=col.pal) + - scale_color_manual(values=col.pal) + + theme(legend.position = "none") + + theme(axis.text.x = element_text(angle = 90)) + + scale_fill_manual(values = col.pal) + + coord_cartesian(ylim = xrange) + ggtitle(main) + + geom_hline(yintercept = 1, col = "grey", lty = 2) + + ylab("i-MAP score, relative to NTC") + + theme(text = element_text(size = 22)) -pdf(file.path(output.fig.dir, 'fig5d.pdf'), height=9, width=20) +w <- ifelse(drop.sporadic, 8, 5) +fout <- str_c("adj_cell_line_", params, "_", drop.sporadic, ".pdf") +fout <- file.path(output.fig.dir, fout) +if (save.fig) pdf(file = fout, height = 8, width = w) plot(p) -dev.off() - -save(file=file.path(output.fig.dir, 'model.Rdata'), cell.models) - -# Save table of plates used -if (save.plates) { - write.table( - file=plate.file, - unique(x$PlateID), - append=TRUE, - row.names=FALSE, - col.names=FALSE, - quote=FALSE - ) - - # Save plate maps - plate.layout <- cell.models$predicted %>% - mutate(PlateID=str_remove_all(ID, '_[0-9]*-[0-9]*$')) %>% - dplyr::select(PlateID, CellLine, WellID) %>% - distinct() %>% - mutate(Row=as.numeric(str_remove_all(WellID, '-.*$'))) %>% - mutate(Col=as.numeric(str_remove_all(WellID, '^.*-'))) - - for (p in unique(plate.layout$PlateID)) { - platemap <- matrix('', nrow=16, ncol=24) - pm <- filter(plate.layout, PlateID == p) - for (i in 1:nrow(pm)) platemap[pm$Row[i], pm$Col[i]] <- pm$CellLine[i] - colnames(platemap) <- 1:24 - - print(table(platemap)) - fout <- file.path(platemap.dir, str_c(p, '.csv')) - write.csv(file=fout, platemap) - } +if (save.fig) dev.off() + + +####################### +# Cell line predictions +####################### +# Initialize plot range +xrange <- group_by(xplot, CellLine, Treatment) %>% + summarize(Mean = mean(Ypred), SD = sd(Ypred)) %>% + mutate(MaxRange = Mean + SD) %>% + mutate(MinRange = Mean - SD) + +xrange <- c(min(xrange$MinRange), max(xrange$MaxRange)) +eps <- diff(xrange) / 8 + +# Compute significance levels for treatment effect +rs.test <- lapply(unique(xplot$CellLine), function(cl) { + xc <- filter(xplot, CellLine == cl) + xt2 <- filter(xc, Treatment == "ASO 10uM") + xt1 <- filter(xc, Treatment == "ASO 1uM") + xt0 <- filter(xc, Treatment == "NTC 10uM") + + rs.20 <- wilcox.test(xt2$Ypred, xt0$Ypred, alternative = "less") + rs.10 <- wilcox.test(xt1$Ypred, xt0$Ypred, alternative = "less") + + out <- data.frame(Treatment = c("ASO 10uM")) %>% + mutate(p = c(rs.20$p.value)) %>% + mutate(CellLine = cl) + + return(out) +}) + +test.plot <- rbindlist(rs.test) %>% + mutate(padj = p.adjust(p, method = correction)) %>% + mutate(padj = signif(padj, 3)) %>% + mutate(x = ifelse(Treatment == "ASO 1uM", 1, 2)) %>% + mutate(y = max(xrange) - eps) %>% + mutate(y = ifelse(Treatment == "ASO 1uM", y + eps / 2, y)) %>% + mutate(xend = x + 1) %>% + mutate(yend = y) %>% + mutate(xtext = 2) %>% # x + 0.5) %>% + mutate(ytext = y + eps / 2) %>% + mutate(x = 1) %>% + mutate(plab = label_pval(padj)) + +if (drop.sporadic) { + test.plot <- filter(test.plot, !str_detect(CellLine, "S")) +} else { + test.plot <- filter(test.plot, str_detect(CellLine, "S")) } + +# Plot FUS, healthy +p <- filter(xplot, Genetics %in% genetics.keep) %>% + ggbarplot( + x = "Treatment", + fill = "Genetics", + y = "Ypred", + facet.by = "CellLine", + merge = TRUE, + add = "mean_se", + error.plot = "upper_errorbar", + nrow = 1 + ) + + geom_text( + data = filter(test.plot, str_detect(Treatment, "1uM")), + aes(x = xtext, y = ytext, label = plab), size = 7 + ) + + geom_text( + data = filter(test.plot, str_detect(Treatment, "10uM")), + aes(x = xtext, y = ytext, label = plab), size = 7 + ) + + geom_segment( + data = test.plot, + aes(x = x, y = y, xend = xend, yend = yend) + ) + + ylab(NULL) + + xlab(NULL) + + theme(legend.position = "none") + + theme(axis.text.x = element_text(angle = 90)) + + scale_fill_manual(values = col.pal) + + coord_cartesian(ylim = xrange) + + ggtitle(main) + + ylab("i-MAP score, relative to NTC") + + theme(text = element_text(size = 22)) + +fout <- str_c("well_f-h_", params, "_", drop.sporadic, ".pdf") +fout <- file.path(output.fig.dir, fout) +if (save.fig) pdf(file = fout, height = 8, width = 16) +plot(p) +if (save.fig) dev.off() diff --git a/paper_figures/Figure_ASO/raw_fus_phenotype.R b/paper_figures/Figure_ASO/raw_fus_phenotype.R index 3ab142a..9b0e88d 100644 --- a/paper_figures/Figure_ASO/raw_fus_phenotype.R +++ b/paper_figures/Figure_ASO/raw_fus_phenotype.R @@ -1,89 +1,70 @@ -#' # Figure 5 #' The following script generates plots for qPCR levels following ASO knockdown -#' of FUS (Fig. S5a) and FUS intensity levels in microscopy screens (Fig. 5a). -#' +#' of FUS and FUS intensity levels in microscopy screens. +#' #' Requires input datasets #' `data/2022_ASOscreen_v3_qPCR results.xls` #' `data/080122_ASOScreenV2_Cell` -#' -#' Karl Kumbier, 3/6/2023 +#' +#' Karl Kumbier, 12/19/2023 library(data.table) -library(readxl) library(tidyverse) library(tidytext) library(ggsci) library(ggpubr) -library(hrbrthemes) - -theme_set( - theme_ipsum( - plot_title_size=28, - axis_title_size=24, - strip_text_size=24, - axis_text_size=22, - base_size=22, - base_family='sans', - axis=TRUE, - axis_col='#000000', - grid=FALSE - ) -) +library(lmerTest) + +theme_set(theme_bw(base_size = 22)) # Set parameters for analysis save.fig <- TRUE -treatment.levels <- c('untreated', 'H2O', 'NTC 10uM', 'ASO 1uM', 'ASO 10uM') -heat.pal <- c('#FFFFFF', pal_material('blue-grey')(10)) +treatment.levels <- c("untreated", "NTC 10uM", "ASO 1uM", "ASO 10uM") +heat.pal <- c("#FFFFFF", pal_material("blue-grey")(10)) +correction <- "BH" # Set paths to data directories -fig.base.dir <- Sys.getenv('ALS_PAPER') -data.base.dir <- Sys.getenv('ALS_DATA') -figure.dir <- file.path(fig.base.dir, 'paper_figures', 'Figure_ASO') +fig.base.dir <- Sys.getenv("ALS_PAPER") +data.base.dir <- Sys.getenv("ALS_DATA") +figure.dir <- file.path(fig.base.dir, "paper_figures", "Figure_ASO") -source(file.path(fig.base.dir, 'paper_figures', 'utilities.R')) -source(file.path(fig.base.dir, 'paper_figures', 'color_palette.R')) +source(file.path(fig.base.dir, "paper_figures", "utilities.R")) +source(file.path(fig.base.dir, "paper_figures", "color_palette.R")) # Set paths to data directories -data.dir <- file.path(data.base.dir, '080122_ASOScreenV2', 'Cell_Filtered') -library.file <- file.path(fig.base.dir, 'data', 'Fibroblasts_Library.csv') -qpcr.file <- file.path(fig.base.dir, 'data', '2022_ASOscreen_qPCR.xls') - -output.fig.dir <- file.path(figure.dir, 'fig/') -dir.create(output.fig.dir, recursive=TRUE, showWarnings=FALSE) +data.dir <- file.path(data.base.dir, "120423_ASO14", "Cell_Filtered") +library.file <- file.path(fig.base.dir, "data", "Fibroblasts_Library.csv") +qpcr.file <- file.path(fig.base.dir, "data", "20231007_ASO14_qPCRresults.csv") +output.fig.dir <- file.path(figure.dir, "fig/") +dir.create(output.fig.dir, recursive = TRUE, showWarnings = FALSE) # Load mutation table xcell <- fread(library.file) %>% - dplyr::rename(Figure=`Figure Names`) %>% - dplyr::rename(CellLine=`Cell line`) + dplyr::rename(Figure = `Figure Names`) %>% + dplyr::rename(CellLine = `Cell line`) mutation.key <- setNames(xcell$Figure, xcell$CellLine) ################################################################################ # FUS qPCR by cell line. ################################################################################ -qpcr <- read_excel(qpcr.file, sheet=5) - -colnames(qpcr)[1] <- 'CellLine' -colnames(qpcr) <- str_remove_all(colnames(qpcr), ' ') - -p <- reshape2::melt(qpcr) %>% - filter(variable != 'Untreated') %>% - mutate(variable=str_replace_all(variable, 'ASO', ' ASO')) %>% - mutate(variable=str_replace_all(variable, 'NTC', 'NTC 10uM')) %>% - mutate(variable=str_replace_all(variable, 'low ASO', 'ASO 1uM')) %>% - mutate(variable=str_replace_all(variable, 'high ASO', 'ASO 10uM')) %>% - mutate(CellLineL=mutation.key[CellLine]) %>% - mutate(ASO=factor(variable, levels=treatment.levels)) %>% - mutate(CellLineL=str_remove_all(CellLineL, '-.*$')) %>% - ggplot(aes(x=CellLine, y=value, fill=ASO)) + - geom_bar(stat='identity', position=position_dodge(), color='black') + - scale_fill_manual(values=heat.pal[c(3, 6, 9)]) + +qpcr <- fread(qpcr.file) + +p <- qpcr %>% + filter(ASO != "Un") %>% + mutate(ASO = str_replace_all(ASO, "1uM", "ASO 1uM")) %>% + mutate(ASO = str_replace_all(ASO, "10uM", "ASO 10uM")) %>% + mutate(ASO = str_replace_all(ASO, "NTC", "NTC 10uM")) %>% + mutate(CellLineL = mutation.key[CellLine]) %>% + mutate(ASO = factor(ASO, levels = treatment.levels)) %>% + mutate(CellLineL = str_remove_all(CellLineL, "-.*$")) %>% + ggplot(aes(x = CellLine, y = `FUS (normalized to NTC)`, fill = ASO)) + + geom_bar(stat = "identity", position = position_dodge(), color = "black") + + scale_fill_manual(values = heat.pal[c(3, 6, 9)]) + xlab(NULL) + - ylab('qPCR-FUS') + - facet_wrap(~CellLineL, nrow=2, scales='free_x') + - theme(axis.text.x=element_blank()) + facet_wrap(~CellLineL, nrow = 2, scales = "free_x") + + theme(axis.text.x = element_blank()) -pdf(file.path(output.fig.dir, 'fig_supp_qpcr.pdf'), height=8, width=16) +pdf(file.path(output.fig.dir, "fig_supp_qpcr.pdf"), height = 8, width = 16) plot(p) dev.off() @@ -91,106 +72,121 @@ dev.off() ################################################################################ # Raw nuclear and cytoplasmic FUS intensities ################################################################################ -load(file.path(data.dir, 'profiles_raw.Rdata')) -x <- mutate(x, ASO=str_remove_all(ASOtreatment, 'EP_')) %>% - mutate(ASO=str_replace_all(ASO, 'ASO', ' ASO')) %>% - mutate(ASO=str_replace_all(ASO, 'NTC', 'NTC 10uM')) %>% - mutate(ASO=str_replace_all(ASO, 'low ASO', 'ASO 1uM')) %>% - mutate(ASO=str_replace_all(ASO, 'high ASO', 'ASO 10uM')) %>% - mutate(ASO=factor(ASO, treatment.levels)) %>% - mutate(Genetics=str_replace_all(Genetics, 'FUS', 'FUS-ALS')) - -# Plot change in nuclear/cytoplasmic FUS -fselect <- c('X..Iav_in_dna_region..2', 'X..Iav_in_non_dna_region..2') -mselect <- c('Genetics', 'ASO') +load(file.path(data.dir, "profiles_raw.Rdata")) +x <- mutate(x, ASO = str_remove_all(ASOtreatment, "EP_")) %>% + mutate(ASO = str_replace_all(ASO, "ASO", " ASO")) %>% + mutate(ASO = str_replace_all(ASO, "NTC", "NTC 10uM")) %>% + mutate(ASO = str_replace_all(ASO, "low ASO", "ASO 1uM")) %>% + mutate(ASO = str_replace_all(ASO, "high ASO", "ASO 10uM")) %>% + mutate(ASO = factor(ASO, treatment.levels)) %>% + filter(Genetics != "sporadic") + +params.plot <- list( + list( + fselect = as.symbol("X..Iav_in_dna_region..2"), + main = "Nuclear FUS, average pixel intensity", + fout = "nuclear_fus.pdf" + ), + list( + fselect = as.symbol("X..Iav_in_non_dna_region..2"), + main = "Cytosolic FUS, average pixel intensity", + fout = "cytosolic_fus.pdf" + ), + list( + fselect = as.symbol("X..Imax_in_non_dna_region..3"), + main = "Cytosolic EEA1, max pixel intensity", + fout = "cytosolic_eea1.pdf" + ) +) + +mselect <- c("Genetics", "ASO") # Compute well-level median intensities -xgroup <- group_by(x, ID, ASO, Genetics) %>% - filter(ASO != 'H2O') %>% - filter(ASO != 'untreated') %>% - summarize_if(is.numeric, median) %>% - ungroup() %>% - dplyr::select(-ID) - -comparisons <- list(c("NTC 10uM", "ASO 1uM"), c("ASO 1uM", "ASO 10uM")) - -p1 <- dplyr::rename(xgroup, NucFUS=`X..Iav_in_dna_region..2`) %>% - dplyr::select(one_of(c('NucFUS', mselect))) %>% - ggbarplot(x="ASO", y="NucFUS", fill="ASO", facet.by="Genetics", add="mean_sd") + - ylab('Intensity') + - scale_fill_manual(values=heat.pal[c(3, 6, 9)], labels=NULL) + - xlab(NULL) + - ylab('Nuclear FUS pixel intensity') + - theme_ipsum( - plot_title_size=28, - axis_title_size=24, - strip_text_size=20, - axis_text_size=20, - base_size=20, - base_family='sans', - axis=TRUE, - axis_col='#000000', - grid=FALSE - ) + - theme(legend.position='none') + - stat_compare_means(comparisons=comparisons, label="p.signif", size=7, p.adjust.method='bonferroni') + - theme(axis.text.x=element_blank()) - -p2 <- dplyr::rename(xgroup, CytFUS=`X..Iav_in_non_dna_region..2`) %>% - dplyr::select(one_of(c('CytFUS', mselect))) %>% - ggbarplot(x="ASO", y="CytFUS", fill="ASO", facet.by="Genetics", add="mean_sd")+ - ylab('Intensity') + - scale_fill_manual(values=heat.pal[c(3, 6, 9)], labels=NULL) + - xlab(NULL) + - ylab('Cytosolic FUS pixel intensity') + - theme_ipsum( - plot_title_size=28, - axis_title_size=24, - strip_text_size=20, - axis_text_size=20, - base_size=20, - base_family='sans', - axis=TRUE, - axis_col='#000000', - grid=FALSE - ) + - theme(legend.position='none') + - stat_compare_means(comparisons=comparisons, label="p.signif", size=7, p.adjust.method='bonferroni') + - theme(axis.text.x=element_blank()) - -p3 <- mutate(xgroup, ratio=(`X..Iav_in_dna_region..2` / `X..Iav_in_non_dna_region..2`)) %>% - dplyr::select(one_of(c('ratio', mselect))) %>% - ggbarplot(x="ASO", y="ratio", fill="ASO", facet.by="Genetics",add="mean_sd")+ - ylab('Intensity') + - scale_fill_manual(values=heat.pal[c(3, 6, 9)]) + - labs(fill=NULL) + - xlab(NULL) + - ylab('Nuclea') + - theme_ipsum( - plot_title_size=28, - axis_title_size=24, - strip_text_size=20, - axis_text_size=20, - base_size=20, - base_family='sans', - axis=TRUE, - axis_col='#000000', - grid=FALSE - ) + - stat_compare_means(comparisons=comparisons, label="p.signif", size=7, p.adjust.method='bonferroni') + - ylab('Nuclear / cytosolic FUS pixel intensity ratio') + - theme(axis.text.x=element_blank()) - - - -if (save.fig) pdf(file=str_c(output.fig.dir, 'fig5a_nuc.pdf'), height=8, width=10) -plot(p1) -if (save.fig) dev.off() - -if (save.fig) pdf(file=str_c(output.fig.dir, 'fig5a_cyt.pdf'), height=8, width=10) -plot(p2) -if (save.fig) dev.off() - -if (save.fig) pdf(file=str_c(output.fig.dir, 'fig5a_rat.pdf'), height=8, width=10) -plot(p3) -if (save.fig) dev.off() +for (prm in params.plot) { + fselect <- prm$fselect + main <- prm$main + fout <- file.path(output.fig.dir, prm$fout) + + xplot <- group_by(x, ID, ASO, Genetics, CellLine, WellID, PlateID) %>% + summarize_if(is.numeric, mean) %>% + ungroup() %>% + dplyr::select(-ID) %>% + filter(ASO != "untreated") %>% + dplyr::rename(Treatment = ASO) + + # Initialize plot range + xrange <- group_by(xplot, Genetics, Treatment) %>% + summarize( + Mean = mean(!!fselect), + SD = sd(!!fselect) + ) %>% + mutate(MaxRange = Mean + SD) %>% + mutate(MinRange = Mean - SD) + + + xrange <- c(min(xrange$MinRange), max(xrange$MaxRange)) + + eps <- diff(xrange) / 10 + + # Compute significance levels for treatment effect + lme.test <- lapply(unique(xplot$Genetics), function(g) { + xg <- filter(xplot, Genetics == g) + f <- str_c(as.character(fselect), "~ Treatment + (1 | CellLine)") + mixed <- lmer(as.formula(f), data = xg) + out <- summary(mixed)$coefficients[3, 5] + out <- data.frame(Treatment = "ASO 10uM", p = out) %>% + mutate(Treatment = str_remove_all(Treatment, "Treatment")) %>% + mutate(Genetics = g) + return(out) + }) + + test.plot <- rbindlist(lme.test) %>% + mutate(padj = p.adjust(p, method = correction)) %>% + mutate(padj = signif(padj, 3)) %>% + mutate(x = ifelse(Treatment == "ASO 1uM", 1, 2)) %>% + mutate(x = 2) %>% + mutate(y = max(xrange) - eps) %>% + mutate(y = ifelse(Treatment == "ASO 1uM", y + eps / 2, y)) %>% + mutate(xend = x + 1) %>% + mutate(yend = y) %>% + mutate(xtext = 2) %>% + mutate(ytext = y + eps / 4) %>% + mutate(x = 1) %>% + mutate(plab = label_pval(padj)) + + p <- xplot %>% + ggbarplot( + x = "Treatment", + fill = "Genetics", + y = as.character(fselect), + facet.by = "Genetics", + merge = TRUE, + add = "mean_se", + error.plot = "upper_errorbar" + ) + + geom_text( + data = filter(test.plot, str_detect(Treatment, "1uM")), + aes(x = xtext, y = ytext, label = plab), size = 8, + ) + + geom_text( + data = filter(test.plot, str_detect(Treatment, "10uM")), + aes(x = xtext, y = ytext, label = plab), size = 8, + ) + + geom_segment( + data = test.plot, + aes(x = x, y = y, xend = xend, yend = yend) + ) + + ylab(main) + + xlab(NULL) + + theme(legend.position = "none") + + theme(axis.text.x = element_text(angle = 90)) + + scale_fill_manual(values = col.pal) + + coord_cartesian(ylim = xrange) + + geom_hline(yintercept = 1, col = "grey", lty = 2) + + theme(text = element_text(size = 22)) + + + if (save.fig) pdf(file = fout, height = 8, width = 10) + plot(p) + if (save.fig) dev.off() +} diff --git a/paper_figures/Figure_Scores/.DS_Store b/paper_figures/Figure_Scores/.DS_Store index d2e5b0a81e916e733769dfe75dd04e96ac97a89e..a01f7271dc3889f2e9fb98bf20ce79252f7d94f1 100644 GIT binary patch literal 10244 zcmeGhZEPG@apvsAyZD-{9XoL@PHs~d63CI>`71Uw^7-tfNt58zJ|}jZPHajr;rLlDn#&u(ng?GD$rm20fDG!g@jb*?b|)? zo#O&RYSZSP^mg8RGxPT5>%E;h0KimPNdU+IAW&yeY6S?109^!wjp12e)tYDpsYj{P zSOS>W@|99fH*>3K8w)p{214|cnEjfmH-QFL(0=%0uB25Cax82=B&8Oyew`!a2?#V)kQg zI|u#~l>!YJvI*wNw1wQF?FZ)bu3>%325e}cp@ExH1K!yo)ZO(y@H9LN&%w{(*YG<^g8u}6hqvHucnAK4>u>`~*oN)64ZCp%Mlps1IEedj z43FY5oWSEajb|}~S5x<0A#;@Q*_y~RzAI0zD6Zj-v!5`vJ@G8E7 zui`a)1AmWy#5eKJ_z!#=-wBDK_RywKM`%}QcPJ71OlTyOWIvih!@GfCme2iX=rE1E zGKy~dWw6lKt{*VbgLfW06u#@m4Vc$>nWv+3^A?%K&AvO5RR`n#=(}LhwOwzni2>J} zUDMZlA_da6?(N~|G`YKaPkjLTfho1H@Dr{v>YcZSqtZGG2tNJ9E}E{VVBpg|v1n8Z zQ4I2FJP{2`O%!3G%i9`@hP~m?a5yYAQ>1as{_QjcZ!(&qO<3Fr|ECB45?+PZ;7#}o zdGNc~j3Rd7PTYl`CVw5lQ5?quc#wSdZk(c$4)fb{B2C>R}U`q>u%i_v1+urm2%9Lg3rs3D01kk~R0@a{waOjy| zRf(b!<8S@Tq(x{I&P>s5!B3G?)|z9pv}x)&f1J&opduXA2**sr$!c^?xeRn*6Uph5 z${IgYlR=I2gZtSrj^ZZvX6L=Dc1s z%&#(W=o`|H^jrA6`n;b;tK_PayGndx4?5}RwLq2*;5;mlc`Bd=4D^D3c)f21+Pbuq z=Nd-orx70@T*D&;x|56RXZX%~Ti7g8@cBTwWRdaQb6ao;o?#s{ejCrSzl; zeyX~TP_1(Z&$PMS&%AW+!`_+ZR>6zr%9*wr8$8o$cdgUuMqX2g40(_VMJTxe#7;Pq zJj(uvc+`K8vQ8)Kt2(&=`NO)I_ih%-$B{+r-gn*R>a~V8psf$&xEbj0i^+ZQI6G|; zeZ7M*c{skW?n3x)j++{xY8-U$>0gl8HTa;LK0ew1;zQl8o7T~+IO|$8H7>jNWv1Pl zJ?f0)UD0*sVgDTk&Ervh(1sa`55(l5M4xMz-aeWP#(V!)!_b#RgTAl#^{zgtg_I+EyWWpE5*At pA6oN^J&Y$w>kKY)NF!?R`yT-{_W`9Fjr+fG|JQw0+zk2uzX7uz;EDhM delta 1831 zcmd^9UrbY19R7W`4YuJrZh=yYD_mD3Zfi<`0(FY98;VYi1e`M)3&DE9&bGJE0z*O? zqGpp#qn>TANYs}-Y;JYq!QlR6*@I@;mY5|-T=w8Hvj<~z$-**qzZRlbsZI90b92wV z=bYbn&hPi-eBYt=L+!^MkWciNo$xw^uiES+ZkY~53ejE)Q;d2jNm|-wDKn>4^HEcJ znZ;~LWrB(+L^?kM#z5+(1jQK?B8|M%u_dC1_C)kO9M3yakYDfpY~w$o<*+W%zc|lGf!&K zSM!wTb8VBZkS-yRYsJV5rn_2l#R#ABtaW!dq{lkL2Q)ERGE&*C6c2^9;T`d0SdS%k zX#)xVx;JEtAWGj>CXiCV^|+{p0!nxpe0FMo*Qjz+$~p?yOj` z<^@?+%u2$#FkJ8AAuSGGd`Yzl-{3N?U{+kSyHiW9r%^n^P3^%yu4ToOBkrMM^2Hf-g6Ug)LybBqSP+WX_;e?x z#eHXgjbn7QuEF2rQGG^jjEz|>c6aUCj_&=DXiPsMRywLu6TDZ!1I_hQ$S4P*a`WTs z#@e}s_@Q~KplhhLVulxT+_TGO19!|fQ diff --git a/paper_figures/Figure_Scores/README.md b/paper_figures/Figure_Scores/README.md new file mode 100644 index 0000000..0d35ce1 --- /dev/null +++ b/paper_figures/Figure_Scores/README.md @@ -0,0 +1,33 @@ +# FUS-ALS imaging scores +Scripts in this directory train and evaluate imaging-based FUS-ALS vs. Healthy +classifiers and generate figures summarizing model performance. Data are derived +from: + +- `data_profiles/032422_WTFUSSporadics/Cell_Filtered` +- `data_profiles/032422_WTFUSSporadics/Well` + +## Modeling +Models can be fit using the `train_fus_models.R` script. + +### Parameters in modeling scripts +- `cytosolic`: TRUE/FALSE, indicating whether models should be restricted to + cytosolic FUS features. + +- `model`: modeling function that takes arguments `x` (feature matrix) and `y` + (response vector) and returns a fitted model. + +- `model_predict: model prediction function that takes arguments `model` (a + fitted mode) and `x` (feature matrix) and returns predictions on `x`. + +## Figures +Scripts to generate figures for the marker search screens require fitted models, +generated using the script described above. `fig_scores.R` generates summary +plots based on the FUS-ALS vs. WT classifier that appear in the main text of the +manuscript. To generate summaries for different models (e.g., irf, logistic +regression, neural network), change the `model.str` argument in this script. +`fig_coordinate_space.R` generates plots of the cell lines in 2D coordinate +space based on raw features. `fig_qc_pca.R` plots points in PCA space. +`fig_qc_plates.R` runs models and evaluates performance on hold-out plates. +`fig_model_comparisons.R` generates plots comparing predictions across logistic +regression, irf, and neural networks. `run_models.sh` provides a shell wrapper +to iterate over models and cytosolic / full cell analyses. diff --git a/paper_figures/Figure_Scores/fig_coordinate_space.R b/paper_figures/Figure_Scores/fig_coordinate_space.R new file mode 100644 index 0000000..49e2247 --- /dev/null +++ b/paper_figures/Figure_Scores/fig_coordinate_space.R @@ -0,0 +1,104 @@ +################################################################################ +#+ setup, message=FALSElibrary(tidyverse) +################################################################################ +library(tidyverse) +library(data.table) +library(gridExtra) +library(ggsci) +library(patchwork) +library(Cairo) + +theme_set(theme_bw(base_size = 22)) + + +fig.base.dir <- Sys.getenv("ALS_PAPER") +data.base.dir <- Sys.getenv("ALS_DATA") +figure.dir <- file.path(fig.base.dir, "paper_figures", "Figure_Scores") + +source(file.path(fig.base.dir, "paper_figures", "color_palette.R")) + +# Set paths to data directories +data.dir <- file.path(data.base.dir, "032422_WTFUSSporadics", "Well") +library.file <- file.path(fig.base.dir, "data", "Fibroblasts_Library.csv") + +# Fit model to select data +output.fig.dir <- file.path(figure.dir, "fig/") +load(file.path(data.dir, "profiles_raw.Rdata")) + +# Analysis parameters +marker <- "FUS_EEA1" +f1 <- "X..Iav_in_non_dna_region..2" # x-axis feature +f2 <- "X..Iav_in_cell_region..3" # y-axis feature +save.fig <- TRUE + +################################################################################ +# Load in raw feature and merge with cell line metadata +################################################################################ +# Read in cell line genetics +xgen <- fread(library.file) %>% + dplyr::rename(CellLine = `Cell line`, Genetics = `Disease Gene`) %>% + mutate(Genetics = ifelse(Genetics == "FUS", "FUS-ALS", Genetics)) %>% + mutate(Genetics = ifelse(Genetics == "WT", "Healthy", Genetics)) + +x <- left_join(x, xgen, by = "CellLine") + +# Filter to FUS/WT cell lines +x <- filter(x, Genetics %in% c("FUS-ALS", "Healthy")) + +################################################################################ +# Plot raw feature distributions +################################################################################ +pxlim <- range(x[[f1]]) +pylim <- range(x[[f2]]) + +# Generate raw feature plot +g <- "FUS-ALS" + +# Compute centroids & standard errors +xmean <- dplyr::select(x, one_of(f1, f2, "CellLine", "Genetics", "Mutation")) %>% + group_by(CellLine, Genetics, Mutation) %>% + summarize_if(is.numeric, mean) %>% + mutate(Mutation = str_remove_all(Mutation, "FUS-")) %>% + mutate(Mutation = str_remove_all(Mutation, "FUS ")) + +pxlim <- c(min(x[[f1]]), max(x[[f1]])) +pylim <- c(min(x[[f2]]), max(x[[f2]])) + +# Generate plots +s1 <- ggplot(xmean, aes_string(x = f1, y = f2)) + + geom_point(size = 4, aes(col = Genetics)) + + geom_point(shape = 1, size = 4) + + geom_point(data = x, alpha = 0.5, aes(col = Genetics)) + + xlim(pxlim) + + ylim(pylim) + + xlab("Cytosolic FUS intensity (KS)") + + ylab("Cellular EEA1 intensity (KS)") + + scale_color_manual(values = col.pal) + + theme(legend.position = "none") + + theme(plot.margin = ggplot2::margin(0, 0, 0, 0)) + +d1 <- filter(xmean, Genetics %in% c("Healthy", g)) %>% + ggplot(aes(fill = Genetics)) + + geom_boxplot(aes_string(x = f1), coef = NULL) + + scale_fill_manual(values = col.pal) + + theme_void() + + theme(legend.position = "none") + + theme(plot.title = element_text(face = "bold", size = 18)) + + xlim(pxlim) + +d2 <- filter(xmean, Genetics %in% c("Healthy", g)) %>% + ggplot(aes(fill = Genetics)) + + geom_boxplot(aes_string(x = f2), coef = NULL) + + scale_fill_manual(values = col.pal) + + theme_void() + + theme(legend.position = "none") + + coord_flip() + + xlim(pylim) + +pg <- d1 + plot_spacer() + s1 + d2 + + plot_layout(ncol = 2, nrow = 2, widths = c(4, 0.5), heights = c(0.5, 4)) + +fout <- file.path(output.fig.dir, "fig_projection.pdf") +if (save.fig) pdf(file = fout, height = 8, width = 8) +plot(pg) +if (save.fig) dev.off() diff --git a/paper_figures/Figure_Scores/fig_model_comparison.R b/paper_figures/Figure_Scores/fig_model_comparison.R new file mode 100644 index 0000000..210fef1 --- /dev/null +++ b/paper_figures/Figure_Scores/fig_model_comparison.R @@ -0,0 +1,114 @@ +################################################################################ +#+ setup, message=FALSE +################################################################################ +library(tidyverse) +library(data.table) +library(ggsci) +library(PRROC) + +col.pal <- pal_jama()(7)[c(2, 6, 7)] + +# Paths to data directories +fig.base.dir <- Sys.getenv("ALS_PAPER") +data.base.dir <- Sys.getenv("ALS_DATA") +figure.dir <- file.path(fig.base.dir, "paper_figures", "Figure_Scores") +output.fig.dir <- file.path(figure.dir, "fig_comparisons") +dir.create(output.fig.dir, recursive = FALSE) + +source(file.path(fig.base.dir, "paper_figures", "theme.R")) +theme_set(theme_als()) + +save.fig <- TRUE +models <- c("fcnn", "logistic", "irf") +cytosolic <- FALSE + +# Load predictions from each model +xpred <- lapply(models, function(m) { + subdir <- ifelse(cytosolic, "fig_cytosolic", "fig_full") + load(file.path(figure.dir, m, subdir, "model.Rdata")) + return(mutate(cell.models$ypred, Model = m)) +}) + + +################################################################################ +#' # ROC curves +#+ roc, message=FALSE +################################################################################ +roc <- function(ypred, ytrue) { + roc.curve(scores.class0 = ypred, weights.class0 = ytrue, curve = TRUE) +} + +roc.cell <- rbindlist(xpred) %>% + filter(Genetics %in% c("FUS-ALS", "Healthy")) %>% + group_by(Model) %>% + summarize(ROC = list(roc(Ypred, Ytest))) %>% + mutate(Type = "Single cell") + +# Aggregate predictions by single well +roc.well <- rbindlist(xpred) %>% + filter(Genetics %in% c("FUS-ALS", "Healthy")) %>% + group_by(CellLine, WellID, Ytest, PlateID, Model) %>% + summarize(Ypred = mean(Ypred), .groups = "drop") %>% + group_by(Model) %>% + summarize(ROC = list(roc(Ypred, Ytest))) %>% + mutate(Type = "Well replicate") + +# Aggregate predictions by cell line +roc.line <- rbindlist(xpred) %>% + filter(Genetics %in% c("FUS-ALS", "Healthy")) %>% + group_by(CellLine, Ytest, Model) %>% + summarize(Ypred = mean(Ypred), .groups = "drop") %>% + group_by(Model) %>% + summarize(ROC = list(roc(Ypred, Ytest))) %>% + mutate(Type = "Cell line") + +rocs <- rbind(roc.cell, roc.well, roc.line) + +# Initialize type labels +aucs <- mutate(rocs, AUC = sapply(ROC, function(z) z$auc)) %>% + mutate(AUC = round(AUC, 2)) %>% + mutate(AUC = str_c("AUROC = ", AUC)) %>% + mutate(X = 0.75) %>% + mutate(Y = ifelse(Model == "fcnn", 0.2, 0.15)) %>% + mutate(Y = ifelse(Model == "logistic", 0.1, Y)) %>% + mutate(AUC=Model) + +rocs <- mutate(rocs, FPR = sapply(ROC, function(z) z$curve[, 1])) %>% + mutate(TPR = sapply(ROC, function(z) z$curve[, 2])) + +xplot <- lapply(1:nrow(rocs), function(i) { + out <- data.frame(FPR = rocs$FPR[[i]], TPR = rocs$TPR[[i]]) %>% + mutate(Model = rocs$Model[i]) %>% + mutate(Type = rocs$Type[i]) + return(out) +}) + +type.levels <- c("Single cell", "Well replicate", "Cell line") +aucs <- mutate(aucs, Type = factor(Type, levels = type.levels)) +xplot <- rbindlist(xplot) %>% mutate(Type = factor(Type, levels = type.levels)) + +if (save.fig) pdf(file = file.path(output.fig.dir, "fig_supp_roc_comparison.pdf"), h = 8, w = 16) +ggplot(xplot, aes(col = Model)) + + geom_line(aes(x = FPR, y = TPR), size = 2) + + geom_abline(col = "grey", lty = 2) + + scale_color_manual(values = col.pal) + + xlab("False positive rate") + + ylab("True positive rate") + + theme(legend.position = "none") + + labs(color = NULL) + + facet_wrap(~Type) + + geom_text(data = aucs, aes(x = X, y = Y, label = AUC)) +if (save.fig) dev.off() + + +if (save.fig) pdf(file = file.path(output.fig.dir, "fig_roc_comparison_well.pdf"), h = 8, w = 8) +filter(xplot, Type == "Well replicate") %>% + ggplot(aes(col = Model)) + + geom_line(aes(x = FPR, y = TPR), size = 2) + + scale_color_manual(values = col.pal) + + xlab("False positive rate") + + ylab("True positive rate") + + theme(legend.position = c(0.85, 0.15)) + + labs(col = NULL) + + theme(legend.text = element_text(size = 24)) +if (save.fig) dev.off() diff --git a/paper_figures/Figure_Scores/fig_qc_pca.R b/paper_figures/Figure_Scores/fig_qc_pca.R new file mode 100644 index 0000000..bcd097f --- /dev/null +++ b/paper_figures/Figure_Scores/fig_qc_pca.R @@ -0,0 +1,123 @@ +#+ setup, message=FALSE +library(tidyverse) +library(data.table) +library(ggsci) +library(gridExtra) +library(viridis) + +theme_set(theme_bw(base_size = 22)) +size <- 16 + +# Paths to data directories +als.dir <- Sys.getenv("ALS_PAPER") +figure.dir <- file.path(als.dir, "paper_figures", "Figure_Scores", "fig_qc") +dir.create(figure.dir, showWarnings = FALSE) + +# Source utility functions +source(file.path(als.dir, "paper_figures", "utilities.R")) +source(file.path(als.dir, "paper_figures", "color_palette.R")) +source(file.path(fig.base.dir, "paper_figures", "theme.R")) +theme_set(theme_als()) + +library.file <- file.path(als.dir, "data", "Fibroblasts_Library.csv") + +# Parameters for analysis + +# Initialize cell line metadata table +xcell <- fread(library.file) %>% + dplyr::rename(CellLine = `Cell line`) %>% + mutate(Sex = toupper(Sex)) %>% + dplyr::rename(Genetics = `Disease Gene`) %>% + dplyr::rename(Figure = `Figure Names`) %>% + mutate(Genetics = str_replace_all(Genetics, "ALS with no known mutation", "sporadic")) + +data.dir <- file.path(als.dir, "data_profiles", "032422_WTFUSSporadics", "Cell_Filtered") +load(file.path(data.dir, "profiles_raw.Rdata")) + +# Fix typo in F7 cell line passage number +x <- left_join(x, xcell, by = "CellLine") %>% + mutate(Plate = str_remove_all(PlateID, "^.*plate_")) %>% + mutate(DiseaseStatus = ifelse(Genetics.x != "Healthy", "ALS", "Healthy")) %>% + mutate(ALS = Genetics.x != "Healthy") %>% + dplyr::rename(Age = `Age of biopsy`) %>% + dplyr::rename(Passage = CellLinePassageNumber) %>% + dplyr::rename(Genetics = Genetics.x) %>% + dplyr::rename(Site = `Origin Institution`) %>% + mutate(Passage = ifelse(CellLine == "F7", 20, Passage)) + + +################################################################################ +# PCA plots +################################################################################ +xfeat <- dplyr::select(x, matches("^X")) +xpca <- prcomp(apply(xfeat, MAR = 2, rank)) +pct.var <- xpca$sdev^2 / sum(xpca$sdev^2) + +plot_pc <- function(x, var, pal, pct.var, facet = FALSE) { + # Wrapper function to plot PCs 1-4 + + cols <- c("PlateID", "WellID", var) + if (facet) cols <- c(cols, "ALS") + pct.var <- round(pct.var, 3) * 100 + + p1 <- ggplot(x, aes(x = PC1, y = PC2)) + + geom_point(aes_string(col = var)) + + scale_y_continuous(labels = scales::scientific, breaks = c(-2e5, 0, 2e5)) + + scale_x_continuous(labels = scales::scientific, breaks = c(-4e5, 0, 4e5)) + + xlab(str_c("PC1, (", pct.var[1], "%)")) + + ylab(str_c("PC2, (", pct.var[2], "%)")) + + p2 <- ggplot(x, aes(x = PC3, y = PC4)) + + geom_point(aes_string(col = var)) + + scale_y_continuous(labels = scales::scientific, breaks = c(-2e5, 0, 2e5)) + + scale_x_continuous(labels = scales::scientific, breaks = c(-5e5, 0, 5e5)) + + xlab(str_c("PC3, (", pct.var[3], "%)")) + + ylab(str_c("PC4, (", pct.var[4], "%)")) + + if (is.numeric(x[[var]])) { + p1 <- p1 + scale_color_gradientn(colors = pal, limits = force) + p2 <- p2 + scale_color_gradientn(colors = pal, limits = force) + } else { + p1 <- p1 + scale_color_manual(values = pal, limits = force) + p2 <- p2 + scale_color_manual(values = pal, limits = force) + } + + if (!facet) { + return(grid.arrange(p1, p2, nrow = 1)) + } +} + +# Compute well-level averages of PCA-projections +cols <- c("PlateID", "WellID", "ALS", "Genetics", "Sex", "Age", "Passage", "Plate", "Site") + +xplot <- cbind(xpca$x, x) %>% + group_by_at(vars(one_of(cols))) %>% + summarize_if(is.numeric, mean) + +pdf(file.path(figure.dir, "pca_als.pdf"), height = size / 2, width = size) +plot_pc(xplot, "ALS", pal_jama()(7), pct.var) +dev.off() + +pdf(file.path(figure.dir, "pca_gene.pdf"), height = size / 2, width = size) +plot_pc(xplot, "Genetics", col.pal, pct.var) +dev.off() + +pdf(file.path(figure.dir, "pca_site.pdf"), height = size / 2, width = size) +plot_pc(xplot, "Site", pal_jama()(7), pct.var) +dev.off() + +pdf(file.path(figure.dir, "pca_sex.pdf"), height = size / 2, width = size) +plot_pc(xplot, "Sex", pal_jama()(7), pct.var) +dev.off() + +pdf(file.path(figure.dir, "pca_age.pdf"), height = size / 2, width = size) +plot_pc(xplot, "Age", viridis(10), pct.var) +dev.off() + +pdf(file.path(figure.dir, "pca_passage.pdf"), height = size / 2, width = size) +plot_pc(xplot, "Passage", viridis(10), pct.var) +dev.off() + +pdf(file.path(figure.dir, "pca_plate.pdf"), height = size / 2, width = size) +plot_pc(xplot, "Plate", pal_jama()(7), pct.var) +dev.off() diff --git a/paper_figures/Figure_Scores/fig_qc_plates.R b/paper_figures/Figure_Scores/fig_qc_plates.R new file mode 100644 index 0000000..c8b64e8 --- /dev/null +++ b/paper_figures/Figure_Scores/fig_qc_plates.R @@ -0,0 +1,178 @@ +#+ setup, message=FALSE +library(tidyverse) +library(data.table) +library(ggsci) +library(gridExtra) +library(caret) + +theme_set(theme_bw(base_size = 22)) + +# Paths to data directories +als.dir <- Sys.getenv("ALS_PAPER") +figure.dir <- file.path(als.dir, "paper_figures", "Figure_Scores", "fig_qc") +dir.create(figure.dir, showWarnings = FALSE) + +# Source utility functions +source(file.path(als.dir, "paper_figures", "utilities.R")) +source(file.path(als.dir, "paper_figures", "color_palette.R")) +library.file <- file.path(als.dir, "data", "Fibroblasts_Library.csv") + +# Parameters for analysis +save.fig <- TRUE # save figures or generate in notebook +heat.pal <- pal_material("blue-grey")(10) + +model <- irf +model_predict <- predict_irf + +# Initialize cell line metadata table +xcell <- fread(library.file) %>% + dplyr::rename(CellLine = `Cell line`) %>% + mutate(Sex = toupper(Sex)) %>% + dplyr::rename(Genetics = `Disease Gene`) %>% + dplyr::rename(Figure = `Figure Names`) %>% + mutate(Genetics = str_replace_all(Genetics, "WT", "Healthy")) %>% + mutate(Genetics = str_replace_all(Genetics, "FUS", "FUS-ALS")) + + +#' # Plate generalization +#+ cell_counts +data.dir <- file.path(als.dir, "data_profiles", "032422_WTFUSSporadics", "Cell_Filtered") +load(file.path(data.dir, "profiles_FUS_EEA1.Rdata")) + +plate_fit <- function(x, + model = irf, + model_predict = predict_irf) { + # Wrapper function to call plate_fit_single predict over all plates + + # Initialize cell lines to evaluate + plates <- unique(x$PlateID) + + # Rank-normalize features within plate + ecdf <- function(x) rank(x) / length(x) + xfeat <- ungroup(x) %>% + dplyr::select(matches("(^X|PlateID)")) %>% + group_by(PlateID) %>% + mutate_if(is.numeric, ecdf) %>% + ungroup() %>% + dplyr::select(-PlateID) + + x <- dplyr::select(x, -matches("^X")) + x <- cbind(x, xfeat) + + # Drop sporadic lines from model trai + fits <- lapply(plates, plate_fit_single, + x = x, + model = model, + model_predict = model_predict + ) + + # Initialize importance + return(rbindlist(fits)) +} + +plate_fit_single <- function(x, + plate.test, + model = irf, + model_predict = predict_irf) { + # Check for valid input + print(plate.test) + if (nrow(x) == 0) { + return(NULL) + } + + # Set training/test indices based on hold-out place + id.test <- which(x$Plate %in% plate.test) + id.train <- setdiff(1:nrow(x), id.test) + + if (length(id.test) == 0) { + return(NULL) + } + + # Set feature matrix and response vector + y <- as.numeric(x$Genetics != "Healthy") + xx <- dplyr::select(x, matches("^X")) + + # Take balanced class sample + ytab <- data.frame(Y = as.factor(y)) %>% + mutate(Idx = 1:n()) %>% + filter(Idx %in% id.train) + + id.downsample <- downSample(ytab, as.factor(ytab$Y)) + + # Fit model + id.train <- intersect(id.train, id.downsample$Idx) + fit <- fit_model(xx, y, id.train, model, model_predict, test.id = id.test) + + # Aggregate predictions across each model + ypred <- dplyr::select(x[id.test, ], WellID, PlateID, CellLine, Genetics) %>% + mutate(Ypred = fit$ypred) %>% + mutate(Ytest = y[id.test]) + + return(ypred) +} + +x <- filter(x, Genetics != "sporadic") +plate.models <- plate_fit(x, model, model_predict) + +################################################################################ +# Plate generalization by cell line +################################################################################ +# Aggregate FUS v. WT predictions by cell line +predictions <- plate.models %>% + group_by(CellLine, Ytest, PlateID) %>% + summarize(Ypred = mean(Ypred), .groups = "drop") %>% + left_join(xcell, by = "CellLine") + +# Compute test statistic for difference in prediction scores +p <- predictions %>% + filter(Genetics %in% c("FUS-ALS", "Healthy")) %>% + mutate(PlateID = str_remove_all(PlateID, "^.*_plate_")) %>% + ggplot(aes(x = reorder(Genetics, Ypred, median), y = Ypred)) + + geom_boxplot(aes(fill = Genetics), outlier.shape = NA, coef = NULL) + + geom_jitter(width = 0.2, size = 3, alpha = 0.6) + + ylab("i-MAP score") + + xlab(NULL) + + facet_wrap(~PlateID, ncol = 1) + + theme(legend.position = "none") + + theme(axis.text.x = element_text(angle = 90)) + + scale_fill_manual(values = col.pal) + + scale_color_manual(values = col.pal) + + ylim(0:1) + + +if (save.fig) pdf(file = file.path(figure.dir, "plate_generalization_a.pdf"), h = 16, w = 4) +plot(p) +if (save.fig) dev.off() + +################################################################################ +# Plate generalization by well replicate +################################################################################ +# Aggregate predictions by well replicate +predictions <- plate.models %>% + group_by(CellLine, WellID, Ytest, PlateID) %>% + summarize(Ypred = mean(Ypred), .groups = "drop") %>% + left_join(xcell, by = "CellLine") + +predictions.agg <- filter(predictions, Genetics %in% c("FUS-ALS", "Healthy")) %>% + group_by(CellLine, Genetics, PlateID) %>% + summarize(Ypred = median(Ypred), .groups = "drop") %>% + arrange(Ypred) + +# Plot predictions, FUS v. WT +p <- filter(predictions, Genetics %in% c("FUS-ALS", "Healthy")) %>% + mutate(PlateID = str_remove_all(PlateID, "^.*_plate_")) %>% + ggplot(aes(x = reorder(Figure, Ypred, median), y = Ypred)) + + geom_boxplot(aes(fill = Genetics), outlier.shape = NA, coef = NULL) + + geom_jitter(width = 0.2, alpha = 0.6) + + theme(axis.text.x = element_text(angle = 90)) + + theme(legend.position = "none") + + xlab(NULL) + + ylab("i-MAP score") + + scale_color_manual(values = col.pal) + + scale_fill_manual(values = col.pal) + + facet_wrap(~PlateID, ncol = 1) + + ylim(c(0, 1)) + +if (save.fig) pdf(file = file.path(figure.dir, "plate_generalization_b.pdf"), h = 16, w = 16) +plot(p) +if (save.fig) dev.off() diff --git a/paper_figures/Figure_Scores/fig_scores.R b/paper_figures/Figure_Scores/fig_scores.R new file mode 100644 index 0000000..21f30b1 --- /dev/null +++ b/paper_figures/Figure_Scores/fig_scores.R @@ -0,0 +1,588 @@ +################################################################################ +#+ setup, message=FALSE +################################################################################ +library(tidyverse) +library(data.table) +library(tidytext) +library(scales) +library(ggsci) +library(ggrepel) +library(PRROC) + +# Set user args for model choice +args <- R.utils::commandArgs(asValues = TRUE) +cytosolic <- as.logical(args$CYTOSOLIC) # FALSE +model.str <- args$MODEL #' irf' + +if (!length(cytosolic)) cytosolic <- FALSE +if (!length(model.str)) model.str <- "irf" + +# Paths to data directories +fig.base.dir <- Sys.getenv("ALS_PAPER") +data.base.dir <- Sys.getenv("ALS_DATA") +figure.dir <- file.path(fig.base.dir, "paper_figures", "Figure_Scores") +platemap.dir <- file.path(fig.base.dir, "data", "platemaps") +dir.create(platemap.dir) + +source(file.path(fig.base.dir, "paper_figures", "utilities.R")) +source(file.path(fig.base.dir, "paper_figures", "color_palette.R")) +source(file.path(fig.base.dir, "paper_figures", "theme.R")) +theme_set(theme_als()) + +# Set paths to data directories +data.dir <- file.path(data.base.dir, "032422_WTFUSSporadics", "Cell_Filtered") +library.file <- file.path(fig.base.dir, "data", "Fibroblasts_Library.csv") +plate.file <- file.path(fig.base.dir, "data", "plate_list.txt") + +marker <- "FUS_EEA1" +save.fig <- TRUE +save.plate <- FALSE + +importance_norm <- function(x) x / max(x) +ext <- ifelse(cytosolic, "_cytosolic", "_full") +output.fig.dir <- file.path(figure.dir, model.str, str_c("fig", ext)) +dir.create(output.fig.dir, showWarnings = FALSE) + +################################################################################ +#' # Load data +#+ parameters, message=FALSE +################################################################################ +# Load in datasets for selected marker +input.file <- str_c("profiles_", marker, ".Rdata") +load(file.path(data.dir, input.file)) + +# Initialize cell line metadata table +xcell <- fread(library.file) %>% dplyr::rename(CellLine = `Cell line`) + +xcell <- dplyr::select(x, CellLine, CellLinePassageNumber, Genetics) %>% + distinct() %>% + left_join(xcell, by = "CellLine") %>% + mutate(Sex = toupper(Sex)) %>% + dplyr::rename(Passage = CellLinePassageNumber) %>% + dplyr::rename(Site = `Origin Institution`) + +if (!file.exists(file.path(output.fig.dir, "model.Rdata"))) { + stop("Model not fit, run `train_fus_models.R`") +} else { + load(file.path(output.fig.dir, "model.Rdata")) +} + +################################################################################ +#' # Predictions by cell line +#+ predictions_cell_line, message=FALSE +################################################################################ +# Aggregate FUS v. Healthy predictions by cell line +predictions <- cell.models$ypred %>% + group_by(CellLine, Ytest) %>% + summarize(Ypred = mean(Ypred), .groups = "drop") %>% + left_join(xcell, by = "CellLine") + +################################################################################ +# FUS-ALS vs. WT +################################################################################ +# Compute test statistic for difference in prediction scores +x.wt.fus <- filter(predictions, Genetics != "sporadic") +fit <- lm(Ypred ~ Genetics + Sex + Site + Passage, data = x.wt.fus) +pval.lm <- summary(fit)$coefficients["GeneticsHealthy", "Pr(>|t|)"] +print(pval.lm) + +yfus <- predictions$Ypred[predictions$Genetics == "FUS-ALS"] +ywt <- predictions$Ypred[predictions$Genetics == "Healthy"] +pval.sr <- wilcox.test(yfus, ywt, alternative = "greater")$p.value +print(pval.sr) + +pval.lab <- label_pval(pval.sr) +ymax <- max(predictions$Ypred) + 1e-2 + +p <- predictions %>% + filter(Genetics != "sporadic") %>% + ggplot(aes(x = reorder(Genetics, Ypred, median), y = Ypred)) + + geom_boxplot(aes(fill = Genetics), outlier.shape = NA, coef = NULL) + + geom_jitter(width = 0.2, size = 3, alpha = 0.6) + + geom_text(x = 1.5, y = ymax + 5e-2, label = pval.lab, size = 8) + + geom_segment(x = 1, xend = 2, y = ymax, yend = ymax, col = "#0B4668", linewidth = 1) + + ylab("i-MAP score") + + xlab(NULL) + + theme(legend.position = "none") + + theme(axis.text.x = element_text(angle = 90)) + + scale_fill_manual(values = col.pal) + + scale_color_manual(values = col.pal) + + ylim(c(0, 1)) + +if (save.fig) pdf(file = file.path(output.fig.dir, "fig_a.pdf"), h = 8, w = 4) +plot(p) +if (save.fig) dev.off() + +################################################################################ +# sporadic vs. WT +################################################################################ +# Compute test statistic for difference in prediction scores +x.wt.sp <- filter(predictions, Genetics != "FUS-ALS") +fit <- lm(Ypred ~ Genetics + Sex + Site + Passage, data = x.wt.sp) +pval.lm <- summary(fit)$coefficients["Geneticssporadic", "Pr(>|t|)"] + +ysporadic <- predictions$Ypred[predictions$Genetics == "sporadic"] +pval.sr <- wilcox.test(ysporadic, ywt, alternative = "greater")$p.value + +pval.lab.s <- label_pval(pval.lm) +ymax.s <- max(predictions$Ypred[predictions$Genetics == "sporadic"]) + 5e-2 + +p <- predictions %>% + ggplot(aes(x = reorder(Genetics, Ypred, median), y = Ypred)) + + geom_boxplot(aes(fill = Genetics), outlier.shape = NA, coef = NULL) + + geom_jitter(width = 0.2, size = 3, alpha = 0.6) + + geom_text(x = 1.5, y = ymax + 5e-2, label = pval.lab, size = 8) + + geom_segment(x = 1, xend = 3, y = ymax, yend = ymax, col = "#0B4668", size = 1) + + geom_text(x = 1.5, y = ymax.s + 5e-2, label = pval.lab.s, size = 8) + + geom_segment(x = 1, xend = 2, y = ymax.s, yend = ymax.s, col = "#0B4668", size = 1) + + ylab("i-MAP score") + + xlab(NULL) + + theme(legend.position = "none") + + theme(axis.text.x = element_text(angle = 90)) + + scale_fill_manual(values = col.pal) + + scale_color_manual(values = col.pal) + + ylim(0:1) + +if (save.fig) pdf(file = file.path(output.fig.dir, "fig_sa.pdf"), h = 8, w = 4) +plot(p) +if (save.fig) dev.off() + +################################################################################ +# Predictions vs. metadata features +################################################################################ +p1 <- predictions %>% + filter(Genetics == "Healthy") %>% + ggplot(aes(x = Sex, y = Ypred, fill = Sex)) + + geom_boxplot(outlier.shape = NA, coef = NULL) + + geom_jitter(width = 0.2, size = 3, alpha = 0.6) + + xlab("Sex") + + theme(legend.position = "none") + + ylab("i-MAP score") + + scale_fill_hue(l = 55) + + labs(fill = NULL) + + ylim(0:1) + +p2 <- predictions %>% + filter(Genetics == "Healthy") %>% + ggplot(aes(x = Site, y = Ypred, fill = Site)) + + geom_boxplot(outlier.shape = NA, coef = NULL) + + geom_jitter(width = 0.2, size = 3, alpha = 0.6) + + xlab("Origin institution") + + theme(legend.position = "none") + + ylab(NULL) + + scale_fill_jama() + + labs(fill = NULL) + + ylim(0:1) + +p3 <- predictions %>% + filter(Genetics == "Healthy") %>% + ggplot(aes(x = Passage, y = Ypred)) + + geom_smooth(method = "lm", se = FALSE) + + geom_point(size = 3) + + theme(legend.position = "none") + + ylab(NULL) + + xlab("Cell Line Passage Number") + + ylim(0:1) + +if (save.fig) pdf(file = file.path(output.fig.dir, "fig_supp_metadata.pdf"), h = 8, w = 24) +gridExtra::grid.arrange(p1, p2, p3, nrow = 1) +if (save.fig) dev.off() + +# Aggregate by cell line / plate +predictions <- cell.models$ypred %>% + group_by(CellLine, Ytest, PlateID) %>% + summarize(Ypred = mean(Ypred), .groups = "drop") %>% + left_join(xcell, by = "CellLine") %>% + dplyr::rename(Figure = `Figure Names`) %>% + mutate(Plate = as.numeric(as.factor(PlateID))) %>% + mutate(Plate = str_c("Plate ", Plate)) + +p4 <- predictions %>% + filter(Genetics == "Healthy") %>% + ggplot(aes(x = Plate, y = Ypred, fill = PlateID)) + + geom_boxplot(outlier.shape = NA, coef = NULL) + + geom_jitter(width = 0.2, size = 3, alpha = 0.6) + + theme(legend.position = "none") + + ylab("i-MAP score") + + xlab(NULL) + + scale_fill_hue(l = 55) + + labs(fill = NULL) + + ylim(0:1) + +# Aggregate by cell line / plate +predictions <- cell.models$ypred %>% + group_by(CellLine, Ytest, PlateID, WellID) %>% + summarize(Ypred = mean(Ypred), .groups = "drop") %>% + left_join(xcell, by = "CellLine") %>% + dplyr::rename(Figure = `Figure Names`) %>% + mutate(Row = str_remove_all(WellID, "-.*$")) %>% + mutate(Row = as.numeric(Row)) %>% + mutate(Col = str_remove_all(WellID, "^.*-")) %>% + mutate(Col = as.numeric(Col)) %>% + mutate(Plate = as.numeric(as.factor(PlateID))) %>% + mutate(Plate = str_c("Plate ", Plate)) + +p5 <- predictions %>% + mutate(Ypred = ifelse(Genetics == "Healthy", Ypred, NA)) %>% + ggplot(aes(x = Col, y = Row, fill = Ypred)) + + geom_tile() + + ylab("Plate Column") + + xlab("Plate Row") + + scale_fill_viridis_c(limits = 0:1) + + facet_wrap(~Plate) + + labs(fill = "i-MAP score") + + +if (save.fig) pdf(file = file.path(output.fig.dir, "fig_supp_plate.pdf"), h = 8, w = 8) +plot(p4) +if (save.fig) dev.off() + +if (save.fig) pdf(file = file.path(output.fig.dir, "fig_supp_well.pdf"), h = 12, w = 24) +plot(p5) +if (save.fig) dev.off() + + +################################################################################ +#' # Predictions by well replicate +#+ predictions_by_well, message=FALSE +################################################################################ +# Aggregate predictions by well replicate +predictions <- cell.models$ypred %>% + group_by(CellLine, WellID, Ytest, PlateID) %>% + summarize(Ypred = mean(Ypred), .groups = "drop") %>% + left_join(xcell, by = "CellLine") %>% + dplyr::rename(Figure = `Figure Names`) + +################################################################################ +# FUS-ALS vs. WT +################################################################################ +predictions.agg <- filter(predictions, Genetics %in% c("FUS-ALS", "Healthy")) %>% + group_by(CellLine, Genetics) %>% + summarize(Ypred = median(Ypred), .groups = "drop") %>% + arrange(Ypred) %>% + mutate(Face = ifelse(CellLine %in% c("F12", "F1"), "bold", "plain")) + +# Plot predictions, FUS v. Healthy +p <- filter(predictions, Genetics %in% c("FUS-ALS", "Healthy")) %>% + ggplot(aes(x = reorder(Figure, Ypred, median), y = Ypred)) + + geom_boxplot(aes(fill = Genetics), outlier.shape = NA, coef = NULL) + + geom_jitter(width = 0.2, alpha = 0.6) + + theme(axis.text.x = element_text(angle = 90, face = predictions.agg$Face)) + + theme(legend.position = "none") + + xlab(NULL) + + ylab("i-MAP score") + + scale_color_manual(values = col.pal) + + scale_fill_manual(values = col.pal) + + ylim(c(0, 1)) + +if (save.fig) pdf(file = file.path(output.fig.dir, "fig_b.pdf"), h = 8, w = 16) +plot(p) +if (save.fig) dev.off() + +################################################################################ +# sporadic vs. WT +################################################################################ +n.sporadic.top <- 5 + +# plot predictions, sporadic v. Healthy +predictions.agg <- filter(predictions, Genetics %in% c("sporadic", "Healthy")) %>% + group_by(CellLine, Genetics) %>% + summarize(Ypred = median(Ypred), .groups = "drop") %>% + arrange(Ypred) + +p <- filter(predictions, Genetics %in% c("sporadic", "Healthy")) %>% + ggplot(aes(x = reorder(Figure, Ypred, median), y = Ypred)) + + geom_boxplot(aes(fill = Genetics), outlier.shape = NA, coef = NULL) + + geom_jitter(width = 0.2, alpha = 0.6) + + theme(axis.text.x = element_text(angle = 90, face = predictions.agg$Face)) + + theme(legend.position = "none") + + xlab(NULL) + + ylab("i-MAP score") + + scale_color_manual(values = col.pal) + + scale_fill_manual(values = col.pal) + + ylim(c(0, 1)) + +if (save.fig) pdf(file = file.path(output.fig.dir, "fig_sb.pdf"), h = 8, w = 16) +plot(p) +if (save.fig) dev.off() + +################################################################################ +#' # ROC curves +#+ roc, message=FALSE +################################################################################ +predictions.cell <- filter(cell.models$ypred, Genetics %in% c("FUS-ALS", "Healthy")) + +# Aggregate predictions by single well +predictions.well <- filter(cell.models$ypred, Genetics %in% c("FUS-ALS", "Healthy")) %>% + group_by(CellLine, WellID, Ytest, PlateID) %>% + summarize(Ypred = mean(Ypred), .groups = "drop") %>% + left_join(xcell, by = "CellLine") + +# Aggregate predictions by cell line +predictions.line <- filter(cell.models$ypred, Genetics %in% c("FUS-ALS", "Healthy")) %>% + group_by(CellLine, Ytest) %>% + summarize(Ypred = mean(Ypred), .groups = "drop") %>% + left_join(xcell, by = "CellLine") + +# Compute ROC curves +roc.cell <- roc.curve( + scores.class0 = predictions.cell$Ypred, + weights.class0 = predictions.cell$Ytest, + curve = TRUE +) + +roc.well <- roc.curve( + scores.class0 = predictions.well$Ypred, + weights.class0 = predictions.well$Ytest, + curve = TRUE +) + +roc.line <- roc.curve( + scores.class0 = predictions.line$Ypred, + weights.class0 = predictions.line$Ytest, + curve = TRUE +) + +# Initialize type labels +type.levels <- c("Single cell", "Well replicate", "Cell line") +aucs <- c(roc.cell$auc, roc.well$auc, roc.line$auc) %>% round(2) +type.levels <- str_c(type.levels, ", AUROC = ", aucs) + +# Plot ROC curves +xplot <- rbind( + data.frame(roc.cell$curve[, 1:2], Type = type.levels[1]), + data.frame(roc.well$curve[, 1:2], Type = type.levels[2]), + data.frame(roc.line$curve[, 1:2], Type = type.levels[3]) +) + +if (save.fig) pdf(file = file.path(output.fig.dir, "fig_c.pdf"), h = 8, w = 8) +mutate(xplot, Type = factor(Type, levels = type.levels)) %>% + ggplot(aes(x = X1, y = X2, col = Type)) + + geom_line(size = 2) + + geom_abline(col = "grey", lty = 2) + + scale_color_jama() + + xlab("False positive rate") + + ylab("True positive rate") + + theme(legend.position = c(0.75, 0.15)) + + theme(legend.text = element_text(size = 16)) + + labs(color = NULL) +if (save.fig) dev.off() + + +################################################################################ +#' # Feature importance +#+ importance, message=FALSE +################################################################################ +clean_marker <- function(x, markers = c("2" = "FUS", "3" = "EEA1")) { + # Map channel indices to marker names + channels <- str_split(x, "\\.\\.") %>% + sapply("[", 2) %>% + str_split("") + return(sapply(channels, function(ch) str_c(markers[ch], collapse = "/"))) +} + +clean_region <- function(x) { + # Map region indicators to full names + region.key <- c(D = "nucleus", N = "cytoplasm", C = "cell") + x <- str_remove_all(x, "^.*_") + regions <- str_split(x, "\\.\\.") %>% + sapply("[", 1) %>% + str_split("") + return(sapply(regions, function(r) str_c(region.key[r], collapse = "/"))) +} + +clean_feature <- function(x) { + # Clean feature name, remove region / marker indicators + x <- str_remove_all(x, "_(D|C|N)*\\.\\.[0-9]*$") + return(str_replace_all(x, "_", " ") %>% str_c("\n")) +} + +# Max-normalize feature importance for each hold-out cell line +importance <- apply(abs(cell.models$importance), MAR = 2, importance_norm) +features <- rownames(cell.models$importance) %>% str_remove_all("^X\\.\\.") + +# Compute maximum importance within feature category, average across models +importance.avg <- reshape2::melt(importance) %>% + mutate(Feature = features[Var1]) %>% + dplyr::rename(Model = Var2) %>% + mutate(Feature = str_remove_all(Feature, "\\.\\.[0-9]*$")) %>% + group_by(Model, Feature) %>% + summarize(Importance = max(value), .groups = "drop") %>% + group_by(Feature) %>% + summarize(SD = sd(Importance), Importance = mean(Importance)) %>% + mutate(Marker = clean_marker(Feature)) %>% + mutate(Region = clean_region(Feature)) %>% + mutate(Feature = clean_feature(Feature)) %>% + arrange(desc(Importance)) + +if (save.fig) pdf(file = file.path(output.fig.dir, "fig_d.pdf"), h = 7, w = 8) +top_n(importance.avg, 5, Importance) %>% + mutate(Feature = str_c(Feature, Region, ", ", Marker)) %>% + ggplot(aes(x = reorder(Feature, Importance), y = Importance)) + + geom_bar(stat = "identity", fill = "#0B4668", color = "black", width = 0.5) + + geom_errorbar(aes(ymin = Importance, ymax = Importance + SD), width = 0.25) + + theme(axis.text.y = element_text(size = 16)) + + theme(panel.grid.major.y = element_blank()) + + theme(panel.grid.minor.y = element_blank()) + + coord_flip() + + xlab(NULL) + + ylab(NULL) + + ggtitle(NULL) + + theme(text = element_text(size = 22)) +if (save.fig) dev.off() + +fout <- file.path(output.fig.dir, "importance.csv") +write.csv(file = fout, importance.avg) + +################################################################################ +#' # Age correlation +#+ age, message=FALSE +################################################################################ +# Aggregate model predictions by cell line +predictions <- cell.models$ypred %>% + dplyr::rename(Plate = PlateID) %>% + group_by(CellLine, Ytest, Genetics, WellID, Plate) %>% + summarize(Ypred = mean(Ypred), .groups = "drop") + +# Plot age v. predictions by genetics +xcell.s <- dplyr::select(xcell, -Genetics) %>% + dplyr::rename(Age = `Age of biopsy`) %>% + dplyr::rename(AgeOfOnset = `Age of onset`) %>% + dplyr::rename(Figure = `Figure Names`) + +xplot <- predictions %>% left_join(xcell.s, by = "CellLine") + +xplot.avg <- group_by(predictions, CellLine, Genetics) %>% + summarize(Ypred = mean(Ypred), .groups = "drop") %>% + left_join(xcell.s, by = "CellLine") %>% + mutate(AgeOfOnset = ifelse(AgeOfOnset == "Control", Age, AgeOfOnset)) %>% + mutate(AgeOfOnset = ifelse(AgeOfOnset == "control", Age, AgeOfOnset)) %>% + mutate(AgeOfOnset = ifelse(AgeOfOnset == "", Age, AgeOfOnset)) %>% + mutate(AgeOfOnset = as.numeric(AgeOfOnset)) %>% + mutate(Figure = str_remove_all(Figure, "-.*$")) %>% + dplyr::rename(Site = Site) + +# Compute age/prediction correlations within genetic background +xfus <- filter(xplot.avg, Genetics == "FUS-ALS") +cor.fus <- cor.test(xfus$Ypred, xfus$Age, alternative = "less") + +xwt <- filter(xplot.avg, Genetics == "Healthy") +cor.wt <- cor.test(xwt$Ypred, xwt$Age, alternative = "less") + +xsporadic <- filter(xplot.avg, Genetics == "sporadic") +cor.spor <- cor.test(xsporadic$Ypred, xsporadic$Age, alternative = "less") + +rhos <- round(c(cor.fus$estimate, cor.wt$estimate, cor.spor$estimate), 2) +pvals <- c(cor.fus$p.value, cor.wt$p.value, cor.spor$p.value) +pvals <- p.adjust(pvals, method='bonferroni') +pval.lab <- str_c("(", sapply(pvals, label_pval), ")") + +xplot.text <- data.frame(Genetics = c("FUS-ALS", "Healthy", "sporadic")) %>% + mutate(text = str_c("Cor = ", rhos, pval.lab)) %>% + mutate(Ypred = c(0.15, 0.1, 0.05)) %>% + mutate(Age = 55) + +# Plot predictions v. age, FUS v. Healthy +if (save.fig) pdf(file = file.path(output.fig.dir, "fig_supp_aob.pdf"), h = 10, w = 10) +filter(xplot.avg, Genetics %in% c("Healthy", "FUS-ALS")) %>% + ggplot(aes(x = Age, y = Ypred, col = Genetics)) + + geom_point(size = 3) + + geom_line(stat = "smooth", method = "lm", formula = y ~ x, alpha = 0.7, size = 1.5) + + geom_text_repel(aes(label = CellLine), size = 7, min.segment.length = 0.01) + + geom_text(data = filter(xplot.text, Genetics != "sporadic"), aes(label = text), size = 10) + + scale_color_manual(values = col.pal) + + ylim(0:1) + + ylab("i-MAP score") + + theme(legend.position = "none") +if (save.fig) dev.off() + +if (save.fig) pdf(file = file.path(output.fig.dir, "fig_supp_sporadic_aob.pdf"), h = 10, w = 10) +filter(xplot.avg, Genetics %in% c("Healthy", "sporadic")) %>% + ggplot(aes(x = Age, y = Ypred, col = Genetics)) + + geom_point(size = 3) + + geom_line(stat = "smooth", method = "lm", formula = y ~ x, alpha = 0.7, size = 1.5) + + geom_text_repel(aes(label = CellLine), size = 7, min.segment.length = 0.01) + + geom_text(data = filter(xplot.text, Genetics != "FUS-ALS"), aes(label = text), size = 10) + + scale_color_manual(values = col.pal) + + ylim(0:1) + + ylab("i-MAP score") + + theme(legend.position = "none") +if (save.fig) dev.off() + + +################################################################################ +# Age of onset +################################################################################ +# Compute age/prediction correlations within genetic background +cor.fus.aoo <- cor.test(xfus$Ypred, xfus$AgeOfOnset, alternative = "less") + +rhos <- round(c(cor.fus.aoo$estimate, cor.wt$estimate, cor.spor$estimate), 2) +pvals <- c(cor.fus.aoo$p.value, cor.wt$p.value, cor.spor$p.value) +pvals <- p.adjust(pvals, method='bonferroni') +pval.lab <- str_c("(", sapply(pvals, label_pval), ")") + +xplot.text <- data.frame(Genetics = c("FUS-ALS", "Healthy", "sporadic")) %>% + mutate(text = str_c("Cor = ", rhos, pval.lab)) %>% + mutate(Ypred = c(0.15, 0.1, 0.05)) %>% + mutate(AgeOfOnset = 55) + +# Plot predictions v. age of onset, FUS v. Healthy +if (save.fig) pdf(file = file.path(output.fig.dir, "fig_g.pdf"), h = 12, w = 12) +filter(xplot.avg, Genetics %in% c("Healthy", "FUS-ALS")) %>% + ggplot(aes(x = AgeOfOnset, y = Ypred, col = Genetics)) + + geom_point(size = 3) + + geom_line(stat = "smooth", method = "lm", formula = y ~ x, size = 2) + + geom_text_repel(aes(label = CellLine), size = 7, min.segment.length = 0.01) + + geom_text(data = filter(xplot.text, Genetics != "sporadic"), aes(label = text), size = 10) + + scale_color_manual(values = col.pal) + + ylim(0:1) + + ylab("i-MAP score") + + xlab("Age of onset") + + theme(legend.position = "none") +if (save.fig) dev.off() + + +################################################################################ +# Save table of predictions by cell line +################################################################################ +# Predictions by genetics +predictions <- cell.models$ypred %>% + group_by(CellLine, Ytest) %>% + summarize(Ypred = mean(Ypred), .groups = "drop") %>% + left_join(xcell, by = "CellLine") + +save(file = file.path(output.fig.dir, "predictions.Rdata"), predictions) + +################################################################################ +# Save table of plates used +################################################################################ +if (save.plate) { + plate.list <- unique(predictions$Plate) + + write.table( + file = plate.file, + plate.list, + append = TRUE, + row.names = FALSE, + col.names = FALSE, + quote = FALSE + ) + + # Save plate maps + plate.layout <- cell.models$ypred %>% + dplyr::select(PlateID, CellLine, WellID) %>% + distinct() %>% + mutate(Row = as.numeric(str_remove_all(WellID, "-.*$"))) %>% + mutate(Col = as.numeric(str_remove_all(WellID, "^.*-"))) + + for (p in unique(plate.layout$PlateID)) { + platemap <- matrix("", nrow = 16, ncol = 24) + pm <- filter(plate.layout, PlateID == p) + for (i in 1:nrow(pm)) platemap[pm$Row[i], pm$Col[i]] <- pm$CellLine[i] + colnames(platemap) <- 1:24 + + print(table(platemap)) + fout <- file.path(platemap.dir, str_c(p, ".csv")) + write.csv(file = fout, platemap) + } +} diff --git a/paper_figures/Figure_Scores/run_models.sh b/paper_figures/Figure_Scores/run_models.sh new file mode 100644 index 0000000..8af9176 --- /dev/null +++ b/paper_figures/Figure_Scores/run_models.sh @@ -0,0 +1,16 @@ +declare -a MODEL=("irf" "fcnn" "logistic") +declare -a CYTOSOLIC=("TRUE" "FALSE") + +for m in "${MODEL[@]}" +do + + for c in "${CYTOSOLIC[@]}" + do + echo "RUNNING: $m, $c" + Rscript train_fus_models.R --MODEL "$m" --CYTOSOLIC "$c" + Rscript fig_scores.R --MODEL "$m" --CYTOSOLIC "$c" + echo "####################" + done + +done + diff --git a/paper_figures/Figure_Scores/train_fus_models.R b/paper_figures/Figure_Scores/train_fus_models.R new file mode 100644 index 0000000..c282a8a --- /dev/null +++ b/paper_figures/Figure_Scores/train_fus_models.R @@ -0,0 +1,73 @@ +################################################################################ +#+ setup, message=FALSE +################################################################################ +library(tidyverse) +library(data.table) + +# Set user args for model choice +args <- R.utils::commandArgs(asValues = TRUE) +cytosolic <- as.logical(args$CYTOSOLIC) # TRUE/FALSE +model.str <- args$MODEL # 'irf', 'logistic', or 'fcnn' + +# Paths to data directories +fig.base.dir <- Sys.getenv("ALS_PAPER") +data.base.dir <- Sys.getenv("ALS_DATA") +figure.dir <- file.path(fig.base.dir, "paper_figures", "Figure_Scores") +platemap.dir <- file.path(fig.base.dir, "data", "platemaps") +dir.create(platemap.dir) + +source(file.path(fig.base.dir, "paper_figures", "utilities.R")) + +# Set paths to data directories +data.dir <- file.path(data.base.dir, "032422_WTFUSSporadics", "Cell_Filtered") +plate.file <- file.path(fig.base.dir, "data", "plate_list.txt") + +n.core <- 10 +marker <- "FUS_EEA1" +save.fig <- FALSE +save.plate <- FALSE + +################################################################################ +#+ parameters, message=FALSE +################################################################################ + +# Initialize models to be used for supervised learning +if (model.str == "fcnn") { + model <- fcnn + model_predict <- predict_fcnn +} else if (model.str == "logistic") { + model <- logistic + model_predict <- predict_logistic +} else { + model <- function(x, y) irf(x, y, n.core = n.core) + model_predict <- predict_irf +} + +ext <- ifelse(cytosolic, "_cytosolic", "_full") +output.fig.dir <- file.path(figure.dir, model.str, str_c("fig", ext)) +dir.create(output.fig.dir, showWarnings = FALSE, recursive = TRUE) + +################################################################################ +#' # Load data +#+ parameters, message=FALSE +################################################################################ +# Load in datasets for selected marker +input.file <- str_c("profiles_", marker, ".Rdata") +load(file.path(data.dir, input.file)) + +# Filter to cytosolic features +if (cytosolic) x <- select_cytosolic(x) + +################################################################################ +#' # Predictive analysis +#+ modeling, message=FALSE +################################################################################ +# Fit model to select data +set.seed(47) + +if (!file.exists(file.path(output.fig.dir, "model.Rdata"))) { + cell.models <- cell_fit(x, model, model_predict) + save(file = file.path(output.fig.dir, "model.Rdata"), cell.models) +} else { + warning("Model file already exists, model not fit") +} diff --git a/paper_figures/Figure_Search/.DS_Store b/paper_figures/Figure_Search/.DS_Store index fe8eb2dcb92d09c8fc15bc4ff30a7703f54fd01a..95ea6914022c19af35b9e5801ad168be2f1b620c 100644 GIT binary patch delta 220 zcmZn(XbIThF2J~JvWGySrc`ycp@EUHj)IAqVXclrwV|oGnT~>`u}N($Cx@uAzI9N1 zc1~_ye$V6w0`iQ#lLWKhQZ1CxdlL32F4JFP2#Mao7ok9v26Y?a)^;~)0h|mKX^ro delta 264 zcmZn(XbIThF2J~BvWGxnqC|DInTeT>f|;RZt&T#qrICS-f{C$NZ7nBO1edeg&;Psn=KH%x u91-er^HW@sa`KaaA{>~q1sTYuBy469XXV_?s_=(pGqdOcMk*M`!UzE5zDVl; diff --git a/paper_figures/Figure_Search/README.md b/paper_figures/Figure_Search/README.md new file mode 100644 index 0000000..a580224 --- /dev/null +++ b/paper_figures/Figure_Search/README.md @@ -0,0 +1,38 @@ +# Marker search screens +Scripts in this directory are used to train and evalaute models used in marker +set search screens and generate figures summarizing model performance. Data are +derived from three screens: + +- `data_profiles/110420_BiomarkerScreen/Cell_Filtered` +- `data_profiles/121820_SporadicScreen/Cell_Filtered` +- `data_profiles/030321_FUSWT/Cell_Filtered` + +## Model fitting +Models for the first two screens can be fit by running the +`train_marker_screens.R` script. Select the screen to analyze by setting the +`screen` variable as one of '110420_BiomarkerScreen' or '121820_SporadicScreen'. +Models for the third screen can be trained using the `train_fus.R` script. Each +of these scripts saves the output model in `output.fig.dir/model_.Rdata` + +### Parameters in modeling scripts + +- `genetics.train`: cell line genetics to be used for model training. Default is + to use all genetics available for a given marker set. (only defined for + `train_marker_screens.R`) + +- `model`: modeling function that takes arguments `x` (feature matrix) and `y` + (response vector) and returns a fitted model. + +- `model_predict: model prediction function that takes arguments `model` (a + fitted mode) and `x` (feature matrix) and returns predictions on `x`. + +- `markers`: marker sets to be used for model. Single marker models can be + defined from a marker pair as _:. (only defined for + `train_fus_screen.R`) + +## Figures +Scripts to generate figures for the marker search screens require fitted models, +generated using the scripts described above. `fig_heatmap.R` produces a heatmap +for a selected `screen` reporting classification accuracy by genetics / marker. +`fig_marker_tuning.R` produces figures summarizing marker set vs. individual +marker performance for the FUS screen. diff --git a/paper_figures/Figure_Search/fig_marker_tuning.R b/paper_figures/Figure_Search/fig_marker_tuning.R index bd8955d..4333986 100644 --- a/paper_figures/Figure_Search/fig_marker_tuning.R +++ b/paper_figures/Figure_Search/fig_marker_tuning.R @@ -1,39 +1,14 @@ -#' # Figure S1 -#' The following script fits models by marker set for the FUS, CD63, EEA1 -#' comparison screen (Fig. S1). We consider model performance with respect to -#' both marker pairs and individual markers. -#' -#' Requires input datasets -#' `data_profiles/030321_FUSWT_Cell` -#' -#' Requires models trained using scripts: -#' `train_fus_screen.R` -#' -#' Karl Kumbier 2/23/2022` +################################################################################ #+ setup, message=FALSE +################################################################################ library(tidyverse) library(data.table) library(tidytext) library(PRROC) library(scales) library(ggsci) -library(hrbrthemes) library(gridExtra) -theme_set( - theme_ipsum( - plot_title_size=24, - axis_title_size=24, - strip_text_size=24, - axis_text_size=22, - base_size=24, - base_family='sans', - axis=TRUE, - axis_col='#000000', - grid=FALSE - ) -) - # Set paths to data directories fig.base.dir <- Sys.getenv('ALS_PAPER') data.base.dir <- Sys.getenv('ALS_DATA') @@ -41,6 +16,8 @@ figure.dir <- file.path(fig.base.dir, 'paper_figures', 'Figure_Search') source(file.path(fig.base.dir, 'paper_figures', 'utilities.R')) source(file.path(fig.base.dir, 'paper_figures', 'color_palette.R')) +source(file.path(fig.base.dir, "paper_figures", "theme.R")) +theme_set(theme_als()) # Set paths to data directories data.dir <- file.path(data.base.dir, '030321_FUSWT', 'Cell_Filtered') @@ -53,14 +30,10 @@ save.fig <- TRUE output.fig.dir <- file.path(figure.dir, 'fig/') load(file.path(output.fig.dir, 'model_030321_FUSWT.Rdata')) - +################################################################################ #' ## Accuracy by marker set -#' To evaluate performance of marker sets, we compute AUROC on held-out cell -#' lines at three data resolutions: (i) cell line (ii) well replicate -#' (iii) single cell. Since NS003 cannot be accurately classified using any -#' marker set, but is an outlier wrt some, we drop this line when computing -#' AUROC. #+ accuracy +################################################################################ accuracy <- lapply(fits, function(z) { # Drop cell line not classified by any marker set @@ -115,13 +88,13 @@ xplot <- lapply(unique(predictions$Markers), function(m) { } p <- predictions.m %>% - ggplot(aes(x=reorder(Figure, Ypred, median), y=Ypred, col=Genetics)) + + ggplot(aes(x=reorder(CellLine, Ypred, median), y=Ypred, col=Genetics)) + geom_boxplot(aes(fill=Genetics), outlier.shape=NA, coef=NULL, alpha=0.75) + geom_jitter(width=0.15, alpha=0.9) + theme(legend.position='none') + theme(axis.text.x=element_text(angle=90)) + xlab(NULL) + - ylab('FUS-ALS phenotype score') + + ylab('i-MAP score') + scale_color_manual(values=col.pal) + scale_fill_manual(values=col.pal) + ylim(0:1) + diff --git a/paper_figures/Figure_Search/fig_pred_by_marker.R b/paper_figures/Figure_Search/fig_pred_by_marker.R new file mode 100644 index 0000000..1274d73 --- /dev/null +++ b/paper_figures/Figure_Search/fig_pred_by_marker.R @@ -0,0 +1,60 @@ +################################################################################ +#+ setup, message=FALSE +################################################################################ +library(tidyverse) +library(data.table) +library(RColorBrewer) +library(scales) +library(superheat) +library(ggsci) + +# Paths to data directories +fig.base.dir <- Sys.getenv('ALS_PAPER') +data.base.dir <- Sys.getenv('ALS_DATA') +figure.dir <- file.path(fig.base.dir, 'paper_figures', 'Figure_Search') + +#screen <- '110420_BiomarkerScreen' +#screen <- '121820_SporadicScreen' +screen <- '030321_FUSWT' + +# Source utility functions +source(file.path(fig.base.dir, 'paper_figures', 'utilities.R')) +source(file.path(fig.base.dir, 'paper_figures', 'color_palette.R')) +source(file.path(fig.base.dir, "paper_figures", "theme.R")) +theme_set(theme_als()) + +# Parameters for visualization +save.fig <- TRUE + +# Load in fitted model for current screen +output.fig.dir <- file.path(figure.dir, 'fig') +load(file.path(output.fig.dir, str_c('model_', screen, '.Rdata'))) + + +################################################################################ +#+ cell_prediction_plots, fig.width=18, fig.height=12, warning=FALSE +################################################################################ +# Predictions by genetics +predictions <- rbindlist(fits) %>% + mutate(Genetics=factor(Genetics, levels=genetic.order)) %>% + filter(!str_detect(Markers, ':')) %>% + group_by(PlateID, WellID, CellLine, Genetics, Markers) %>% + summarize(Ypred = mean(Ypred)) + +nrow <- ifelse(screen == "110420_BiomarkerScreen", 2, 1) +p <- ggplot(predictions, aes(x = reorder(CellLine, Ypred), y = Ypred)) + + geom_boxplot(aes(fill = Genetics, col = Genetics), alpha = 0.5) + + geom_jitter(width = 0.2, alpha = 0.8, aes(col = Genetics)) + + facet_wrap(~Markers, nrow = nrow) + + scale_fill_manual(values = col.pal) + + scale_color_manual(values = col.pal) + + theme(legend.position = "none") + + ylab("i-MAP score") + + xlab(NULL) + + ylim(c(0, 1)) + + theme(text = element_text(size = 13)) + +fout <- file.path(output.fig.dir, str_c("fig_", screen, ".pdf")) +if (save.fig) pdf(file = fout, height = nrow * 5, width=16) +plot(p) +if (save.fig) dev.off() diff --git a/paper_figures/Figure_Search/run.sh b/paper_figures/Figure_Search/run.sh new file mode 100644 index 0000000..47cdcde --- /dev/null +++ b/paper_figures/Figure_Search/run.sh @@ -0,0 +1,4 @@ +Rscript train_marker_screens.R --SCREEN 110420_BiomarkerScreen +Rscript train_marker_screens.R --SCREEN 121820_SporadicScreen +Rscript train_fus_screen.R + diff --git a/paper_figures/Figure_Search/train_fus_screen.R b/paper_figures/Figure_Search/train_fus_screen.R index fea9e56..af2ee7e 100644 --- a/paper_figures/Figure_Search/train_fus_screen.R +++ b/paper_figures/Figure_Search/train_fus_screen.R @@ -1,87 +1,76 @@ -#' The following script fits models by marker set for the FUS, CD63, EEA1 -#' comparison screen. We consider model performance with respect to both marker -#' pairs and individual markers. -#' -#' Requires input datasets -#' `data_profiles/030321_FUSWT_Cell` -#' -#' Karl Kumbier 2/23/2022 +############################################################################### #+ setup, message=FALSE +############################################################################### library(tidyverse) library(data.table) -library(scales) # Set paths to data directories -fig.base.dir <- Sys.getenv('ALS_PAPER') -data.base.dir <- Sys.getenv('ALS_DATA') -figure.dir <- file.path(fig.base.dir, 'paper_figures', 'Figure_Search') +fig.base.dir <- Sys.getenv("ALS_PAPER") +data.base.dir <- Sys.getenv("ALS_DATA") +figure.dir <- file.path(fig.base.dir, "paper_figures", "Figure_Search") -data.dir <- file.path(data.base.dir, '030321_FUSWT', 'Cell_Filtered') -library.file <- file.path(fig.base.dir, 'data', 'Fibroblasts_Library.csv') -plate.file <- file.path(fig.base.dir, 'data', 'plate_list.txt') -platemap.dir <- file.path(fig.base.dir, 'data', 'platemaps') +data.dir <- file.path(data.base.dir, "030321_FUSWT", "Cell_Filtered") +library.file <- file.path(fig.base.dir, "data", "Fibroblasts_Library.csv") +plate.file <- file.path(fig.base.dir, "data", "plate_list.txt") +platemap.dir <- file.path(fig.base.dir, "data", "platemaps") -output.fig.dir <- file.path(figure.dir, 'fig') -dir.create(output.fig.dir, recursive=FALSE) +output.fig.dir <- file.path(figure.dir, "fig") +dir.create(output.fig.dir, recursive = FALSE) dir.create(platemap.dir) # Source utility funcitons -source(file.path(fig.base.dir, 'paper_figures', 'utilities.R')) +source(file.path(fig.base.dir, "paper_figures", "utilities.R")) + +# save plate IDs to experiment summary file +save.plates <- FALSE # Parameters for analysis -type <- 'Cell' n.core <- 16 +############################################################################### +#+ parameters +############################################################################### markers <- c( - 'FUS_EEA1', 'CD63_EEA1', 'CD63_FUS', - 'FUS_EEA1:FUS', 'CD63_EEA1:CD63', 'CD63_FUS:CD63', - 'FUS_EEA1:EEA1', 'CD63_EEA1:EEA1', 'CD63_FUS:FUS' + "FUS_EEA1", "CD63_EEA1", "CD63_FUS", + "FUS_EEA1:FUS", "CD63_EEA1:CD63", "CD63_FUS:CD63", + "FUS_EEA1:EEA1", "CD63_EEA1:EEA1", "CD63_FUS:FUS" ) -save.plates <- FALSE - # Initialize models to be used for supervised learning -model <- function(x, y) irf(x, y, n.core=n.core) +model <- function(x, y) irf(x, y, n.core = n.core) model_predict <- predict_irf -#' # Loading data -#' Our dataset consists of phenotypic profiles derived from `r marker`. Each -#' observation in the dataset corresponds to a single `r type`. +############################################################################### +#' # Load data #+load_data +############################################################################### # Load in datasets for selected marker load_marker <- function(marker) { # wrapper function for loading data from select marker set - load(file.path(data.dir, str_c('profiles_', marker, '.Rdata'))) + load(file.path(data.dir, str_c("profiles_", marker, ".Rdata"))) return(x) } -#' ## Predictive analysis -#' Here we address the question of which marker is best for separating WT from -#' FUS cell lines. Toward this end, we train classifiers to discriminate between -#' disease and healthy `r type`s. Classifiers are trained using features -#' derived from select markers / marker sets. -#' -#' To assess the generalizability of our predictive models to new cell lines, we -#' consider a leave one cell line out strategy. Specifically, we remove a single -#' cell line, train models on the remaining cell lines, and evaluate predictions -#' on the held-out line. -#+ leave_cell_out +################################################################################ +#' # Predictive analysis +#+ modeling +################################################################################ marker_fit <- function(marker) { # Wrapper function to call cell_fit over each marker set - - marker.split <- str_split(marker, ':')[[1]] - + + marker.split <- str_split(marker, ":")[[1]] + # Fit model to predict each cell lines x <- load_marker(marker.split[1]) # Filter to singletons if (length(marker.split) == 2) { - marker.set <- str_split(marker.split[1], '_')[[1]] + marker.set <- str_split(marker.split[1], "_")[[1]] single.marker <- marker.split[2] channel <- which(marker.set == single.marker) + 1 - - xchannel <- sapply(str_split(colnames(x), '\\.\\.'), '[', 3) - id.meta <- !str_detect(colnames(x), '^X') + + xchannel <- sapply(str_split(colnames(x), "\\.\\."), "[", 3) + id.meta <- !str_detect(colnames(x), "^X") id.marker <- xchannel %in% c(channel, str_c(channel, channel)) x <- select_if(x, id.marker | id.meta) } @@ -93,11 +82,11 @@ marker_fit <- function(marker) { # Fit model to select data set.seed(47) -fout <- file.path(output.fig.dir, 'model_030321_FUSWT.Rdata') +fout <- file.path(output.fig.dir, "model_030321_FUSWT.Rdata") fits <- lapply(markers, marker_fit) fits <- lapply(fits, function(f) f$ypred) -save(file=fout, fits) +save(file = fout, fits) ################################################################################ @@ -106,30 +95,30 @@ save(file=fout, fits) if (save.plates) { predictions <- rbindlist(fits) plate.list <- unique(predictions$PlateID) - + write.table( - file=plate.file, - plate.list, - append=TRUE, - row.names=FALSE, - col.names=FALSE, - quote=FALSE + file = plate.file, + plate.list, + append = TRUE, + row.names = FALSE, + col.names = FALSE, + quote = FALSE ) - + # Save plate maps plate.layout <- dplyr::select(predictions, PlateID, CellLine, WellID) %>% distinct() %>% - mutate(Row=as.numeric(str_remove_all(WellID, '-.*$'))) %>% - mutate(Col=as.numeric(str_remove_all(WellID, '^.*-'))) - + mutate(Row = as.numeric(str_remove_all(WellID, "-.*$"))) %>% + mutate(Col = as.numeric(str_remove_all(WellID, "^.*-"))) + for (p in unique(plate.layout$PlateID)) { - platemap <- matrix('', nrow=16, ncol=24) + platemap <- matrix("", nrow = 16, ncol = 24) pm <- filter(plate.layout, PlateID == p) for (i in 1:nrow(pm)) platemap[pm$Row[i], pm$Col[i]] <- pm$CellLine[i] colnames(platemap) <- 1:24 - + print(table(platemap)) - fout <- file.path(platemap.dir, str_c(p, '.csv')) - write.csv(file=fout, platemap) + fout <- file.path(platemap.dir, str_c(p, ".csv")) + write.csv(file = fout, platemap) } } diff --git a/paper_figures/Figure_Search/train_marker_screens.R b/paper_figures/Figure_Search/train_marker_screens.R index 4e61f13..4eeec5d 100644 --- a/paper_figures/Figure_Search/train_marker_screens.R +++ b/paper_figures/Figure_Search/train_marker_screens.R @@ -1,117 +1,106 @@ -#' The following script generates heatmaps of prediction accuracy by genetics -#' and marker set as reported in paper Fig. 1b. Accuracy is measured within each -#' screen individually. -#' -#' Requires input datasets -#' `data_profiles/110420_BiomarkerScreen/Cell_Filtered` -#' `data_profiles/121820_SporadicScreen/Cell_Filtered` -#' -#' Karl Kumbier 2/23/2022` -#+ setup, message=FALSE +############################################################################### +#+ setup, message=FALSE, echo=FALSE +############################################################################### library(tidyverse) library(data.table) -library(scales) # Paths to data directories -fig.base.dir <- Sys.getenv('ALS_PAPER') -data.base.dir <- Sys.getenv('ALS_DATA') -figure.dir <- file.path(fig.base.dir, 'paper_figures', 'Figure_Search') - -#screen <- '110420_BiomarkerScreen' -screen <- '121820_SporadicScreen' -data.dir <- file.path(data.base.dir, screen, 'Cell_Filtered') -plate.file <- file.path(fig.base.dir, 'data', 'plate_list.txt') -platemap.dir <- file.path(fig.base.dir, 'data', 'platemaps') - -output.fig.dir <- file.path(figure.dir, 'fig') -dir.create(output.fig.dir, recursive=FALSE) +fig.base.dir <- Sys.getenv("ALS_PAPER") +data.base.dir <- Sys.getenv("ALS_DATA") +figure.dir <- file.path(fig.base.dir, "paper_figures", "Figure_Search") + +#screen <- "121820_SporadicScreen" +args <- R.utils::commandArgs(asValues=TRUE) +screen <- args$SCREEN +if (is.null(screen)) screen <- "110420_BiomarkerScreen" + +data.dir <- file.path(data.base.dir, screen, "Cell_Filtered") +plate.file <- file.path(fig.base.dir, "data", "plate_list.txt") +platemap.dir <- file.path(fig.base.dir, "data", "platemaps") + +output.fig.dir <- file.path(figure.dir, "fig") +dir.create(output.fig.dir, recursive = FALSE) dir.create(platemap.dir) # Source utility functions -source(file.path(fig.base.dir, 'paper_figures', 'utilities.R')) -source(file.path(fig.base.dir, 'paper_figures', 'color_palette.R')) +source(file.path(fig.base.dir, "paper_figures", "utilities.R")) -# Parameters for analysis -n.core <- 16 # number of cores for fitting models -type <- 'Cell' # sample type for modeling -genetics.train <- c('FUS-ALS', 'C9orf72', 'sporadic', 'TBK1') # genetics for training models -save.plates <- TRUE # save plate IDs to experimental summary file +# save plate IDs to experiment summary file +save.plates <- TRUE -# Initialize plot ordering for genetics -genetic.order <- c('Healthy', 'TBK1', 'sporadic', 'C9orf72', 'FUS-ALS') +# number of cores to use +n.core <- 16 -# Initialize models to be used for supervised learning -model <- function(x, y) irf(x, y, n.core=n.core) +############################################################################### +#+ parameters +############################################################################### +# genetics used for model training +genetics.train <- c("FUS-ALS") + +# models used for supervised learning +model <- function(x, y) irf(x, y, n.core = n.core) model_predict <- predict_irf -#' # Loading data -#' Our dataset consists of phenotypic profiles derived from multiple marker sets. -#' Each observation in the dataset corresponds to a single `r type`. For each -#' marker set, we evaluate the predictive strength of a single marker by -#' ALS genetic subtype. +############################################################################### +#' # Load data #+load_data - +############################################################################### # Load in datasets for each marker set -input.files <- list.files(data.dir) %>% str_subset('Rdata') +input.files <- list.files(data.dir) %>% str_subset("Rdata") xlist <- lapply(input.files, function(f) { load(file.path(data.dir, f)) - x <- filter(x, str_detect(Compounds, '(DMSO|None|Control)')) + x <- filter(x, str_detect(Compounds, "(DMSO|None|Control)")) + x <- filter(x, Genetics %in% c("FUS-ALS", "Healthy")) return(as.data.frame(x)) }) -#' ## Predictive analysis - cell line generalization -#' To assess the generalizability of our predictive models to new cell lines, we -#' consider a leave one cell line out strategy. Specifically, we remove a single -#' cell line from our dataset, train models on the remaining cell lines from the -#' following genetic backgrounds `r genetics.train`, and evaluate predictions on -#' the held-out line. The wrapper functions defined below run this analysis. -#+ leave_cell_out - -#' Below, we run the analysis described above over each cell line. -#+ cell_model, cache=TRUE +############################################################################### +#' # Predictive analysis +#+ modeling, cache=TRUE +############################################################################### set.seed(47) -fout <- file.path(output.fig.dir, str_c('model_', screen, '.Rdata')) +fout <- file.path(output.fig.dir, str_c("model_", screen, ".Rdata")) # Fit model to select data fits <- lapply(xlist, function(z) { - print('-----------------------------------------------') + print("-----------------------------------------------") print(z$Markers[1]) cell_fit(z, model, model_predict, genetics.train)$ypred }) fits <- fits[!sapply(fits, is.null)] -save(file=fout, fits) +save(file = fout, fits) # Save table of plates used if (save.plates) { - plate.list <- lapply(xlist, function(z) unique(z$PlateID)) %>% unlist - + plate.list <- lapply(xlist, function(z) unique(z$PlateID)) %>% unlist() + write.table( - file=plate.file, - unique(plate.list), - append=TRUE, - row.names=FALSE, - col.names=FALSE, - quote=FALSE + file = plate.file, + unique(plate.list), + append = TRUE, + row.names = FALSE, + col.names = FALSE, + quote = FALSE ) - + # Save plate maps plate.layout <- lapply(xlist, function(z) { dplyr::select(z, PlateID, CellLine, RowID, ColID) %>% distinct() }) - plate.layout <- rbindlist(plate.layout) - + plate.layout <- rbindlist(plate.layout) + for (p in unique(plate.layout$PlateID)) { - platemap <- matrix('', nrow=16, ncol=24) + platemap <- matrix("", nrow = 16, ncol = 24) pm <- filter(plate.layout, PlateID == p) for (i in 1:nrow(pm)) platemap[pm$RowID[i], pm$ColID[i]] <- pm$CellLine[i] colnames(platemap) <- 1:24 - + print(table(platemap)) - fout <- file.path(platemap.dir, str_c(p, '.csv')) - write.csv(file=fout, platemap) + fout <- file.path(platemap.dir, str_c(p, ".csv")) + write.csv(file = fout, platemap) } } diff --git a/paper_figures/Figure_Transcriptomics/.DS_Store b/paper_figures/Figure_Transcriptomics/.DS_Store index 743769fd11f613c105a0310f076ebbcda1ffde02..39bdc4a1b05b1235a92fd2381d70f822a4a6eca6 100644 GIT binary patch literal 10244 zcmeHMU2GIp6h3EKV3rPawt@_>tqWBtuz{A}R1~&45;9*{p~a9u3+6fdGL3fh!2G??aR-qj8^(dz61VsNpF9(Q;z* zLeKoiKhY>-5RLnE+@r*yf}Sc-PZj+X1O40qJ#~`D{qf@-_0$2mWtzvgjDCiKe(Hfy zJ{&Oa(Vz_w2oRWx0L$IW-~bpf)NYpM@AxZ8R$bNS~wnq}K>gJthHMfWLdTiTF6`$4AZbKhQ%JW)m%T*0a&v6Aq(`m@w zKEu-Nk!Cw@X%5#-cmgpbhLW-}I@+)?7G1ZtVJsFMZCJZzLoC`@zj16VB+jp0y{S8S z(9Br&{oXdw|5O0mV`(`?9e+3GP(vbZXR=(prsiyk#3{6R`z)&~?(;%%pP|_m(j29C zcW<9Gkd&(mw4u(dYuJ{vThBQ}lX9g)8*fkBR##4E^1iff<};R)l*4J&Oy^D2)tgK+ zVI0+oSLvj4wrO_Rj=`oG=y&zQE|X0W^$$9B&TZk7C1pA9=-oQaWfb&Qn$hVu$Rtyg zG4%4>1(78;tysOTadXSIj;?c+a=7Zcd6G0pkXo*Bz|eEOBZlS<52#LBw=}~V>MIzI zkuvojOU>wEd^i+3S0#rlm4(V;QLYvH^C{y{-f)lbxvFavX|G0BciI}8O_NWGAud^3 zqbPfrq$Q_QtBE3K-@HtbdK|Vm)gQPrCP_U9>7C0CnPLm9Z;+H;Q%&intc5(a^&2Fm zuiybI@9>$oY?73IdUYAviWA0|&)yPOlzq%{hw8YUS>2*ly+`KxE^3smqBy{$O`4`_ z+}`$fMHzI4)tufrLcS<}>aTSt@%dXM37e};Gh9383wLp$Y*+<@9Gza6=kKB@))pPD zHeTmFPCzJB8?X0Ah)@k{pcOh`KjdH(j>8Ff5nhE;@EW`epTGsU2;ad^@H6}dzr!Cm z181U$A*{m1ScfZcCB|?)HeoZiUK4NAM`#i^uRdK8O$D z349u#!DsO~dXgCF3B_z`}A7x8<%gg@Yq_=_-Gs1PJ!zECGD7gh)>g>}Mh z!tKHiZ{1L5n_F>gn9QC%kg*-aMWr`Q}}F}xec@g8#K2gsct!AJ2id>o(P?)*HyfG^=mJcXz6b$kQg z#IyJ@eu-aAa@mfFE{os%<+2&Owbeh^#Bc9g6uv8Zwy9LA{ev_+TzZV0VMhEwC?s;0 zX7611_wB?uo~+z%)ybVX0&W2eNc|XM2I&}_>Q~d^D=oE1lLRF|AV45MAV45M zAV6TcAyDQmD9Y~t+y4Il|LL~uU_Ai>0Rq<$0W5D#v^LX(#l(K{-0ZGBN_9U~R@k_4 zk8%}i_;ox{ejQI8b{)?zK!{ho@nCv!;xjt#QF@~CpZ^)~Z*I?m{6EP5$v6ExK4bsS H^8Y^pnfLjP literal 10244 zcmeHMU2GIp6h3EKVAc+FO3U9$+l9)H+LV?*iwG|5q7+Js-Ts5FyE{WWFgsIrW_N*x zq-cykfDcA}Q#9(MXcROUB}OsP2TdddBT>Qw2BL}a!I)s8c<#Nku-g_)Oh}N-P41kz z=iYPfJ>Q%&bM74gz?O{G1P}rMnJS~ILYD~&>t}UbkvvW%Q6zbQT@ZuqU_b)W<3{@o z9|0c$9|0c$9|0eMTLA(3X0sxf3F=>c1bhU11g;{$-VY(FjD|frDky(-(1mXSh?WwY z7rLi?0GA1QH0;q)LAfaD6uSrXMA6$~pb)425F1W3?9ov{g*c!P2lQq}Z-;_>c9IM6 z;ecU5{i~0FkHB~Y*xr3AXrO~bwQ&D#Lj!cSnRYT^+6fxP0_z>B4Oe)fJAj(&NX9(A z0fS)CnScWsIumdp4k;K0lV%#Pd)*k_#jA2i&LCYANaBGplEtgKCix#$h1Fwpm+Zrrf+G#4;5orY}dH7w1}wb&U;bGU3&bjSfY5L4%jj5IYaZLAM9 z-`BjaJ~Xn5-7jCZa^JpyJfm*O>dx3sGiBKfJSynDSIHTQ%Ry>-Mw^38(WIRk<^$$D z@moQUlWi4-mMqR2dNyxp;Epn7YOuSfyI1M&Gm^FyHTLQ;wcH_V+o-uEt!K%~-lT12 zQkD}_E0S?DnK9$8zRon+VB(cK$+T^nTWrU04cqGPbM@VBUQtn3-+*JM-BzwFrm7i7 z@6<_`k<}xlkyK4*vvmwJHBOJ7c8J!68->+@BhzNhowwkgCCgW>ZGE_X%jt5pV!BeL z% zD-8kF33TnG=PyK?lx4E21-i|6LN^6#rP3^`n*zOAVWD8IR@TTBSRd%4$C#pZJAxe! zw#e$2@?&i7_PFDA4C@w6&#Z{l1cUvIzfRM1%`+m{7ThGOr{n==D4y0ka^#mnQCqM> zmieP4KGU>bGhF*>XOwlShLttQbJs3<8fmt9d$P4pV8?pGAGlmbumt$mQ1z+zssTO*{2g^~QP4t+y+{ua){dAJ0>!)2U| zWvF5;F2)92PR{!fZop00j+@DW`)~laawi@n7v6z8(Zwve@g8#IC-EtK8lS;u@dbPl zU*gXEI(O!`@Cd$(@8eNCh9~e-{0u+GllTRGgWutWF)rI)?6UaNHJ44(C5>VkS%%M+}V(0(sumAu5wzc0k%tydSfO759NHo$y zNaXjbV?kkO?IEgJL=E>2XO W=HK+60sj8~=A54S`~UyV{{J7YA9?)% diff --git a/paper_figures/Figure_Transcriptomics/README.md b/paper_figures/Figure_Transcriptomics/README.md new file mode 100644 index 0000000..9c2239d --- /dev/null +++ b/paper_figures/Figure_Transcriptomics/README.md @@ -0,0 +1,48 @@ +# FUS-ALS transcriptomic scores +Scripts in this directory train and evaluate transcriptomic-based FUS-ALS vs. +Healthy classifiers and generate figures summarizing model performance. Data for +both spinal cord and fibroblast classifiers are derived from: + +- **fibroblast raw read counts:** `data/050322_RNAseq/read_counts.Rdata` +- **spinal cord raw read counts:** `data/050322_RNAseq/nygc/read_counts.Rdata` + +## Preprocessing +Spinal cord and fibroblast RNA-seq datasets are preprocessed from raw read +counts using the `load_spine.R` and `load_fibro.R` scripts. Running these +scripts generates the `processed_*.Rdata` files contained in +`data/050322_RNAseq`. The resulting files contain sparse matrices with +both (i) raw read counts and (ii) TPM for genes that are both annotated in +the reactome pathway database and expressed in fibroblasts / spinal cord +samples. + +## Modeling +Models can be fit using the `fit_*.R` scripts. The spinal cord models train ALS +vs. healthy control classifiers on patient-derived spinal cord samples, using +subsets of genes corresponding to reactome pathways. The fibroblast models train +classifiers using the subset of reactome pathways with better than random +prediction in the spinal cord data. + +### Parameters in modeling scripts +#### Spinal cord model +- `min.healthy`: minimum number of healthy controls for a site to be included in + analysis. I.e., data from sites with fewer than `min.healthy` HCs are dropped. + +- `n.null`: number of bootstrap samples to use in generating null distribution + on metadata features. + +- `min.size`, `max.size`: minimum and maximum pathway sizes included in + analysis. + +#### Fibroblast model +- `n.rep`: number of training model replicates to fit for each hold-out cell + line. Upsampling for class balance, and optional bootstrap samples, result in + variation across model replicates. +- `bootstrap`: T/F, should models be trained on bootstrap samples of the + training data. +- `gene_transform`: transformation applied to each feature. Default is rank + transform. + + +## Figures +`fig_*.R` scripts generate figures reported in the manuscript. Models must be +trained first, using the `fit_*.R` scripts described above. diff --git a/paper_figures/Figure_Transcriptomics/fig_age_cor.R b/paper_figures/Figure_Transcriptomics/fig_age_cor.R new file mode 100644 index 0000000..567baeb --- /dev/null +++ b/paper_figures/Figure_Transcriptomics/fig_age_cor.R @@ -0,0 +1,90 @@ +################################################################################ +#+ setup, message=FALSE +################################################################################ +library(tidyverse) +library(data.table) +library(tidytext) +library(ggrepel) +theme_set(theme_bw(base_size = 22)) + +save.fig <- TRUE +age.of.onset <- TRUE + +# Paths to data directories +fig.base.dir <- Sys.getenv("ALS_PAPER") +data.base.dir <- Sys.getenv("ALS_DATA") +figure.dir <- file.path(fig.base.dir, "paper_figures", "Figure_Transcriptomics") +output.fig.dir <- file.path(figure.dir, "fig") + +source(file.path(fig.base.dir, "paper_figures", "utilities.R")) +source(file.path(fig.base.dir, "paper_figures", "color_palette.R")) +library.file <- file.path(fig.base.dir, "data", "Fibroblasts_Library.csv") + +load(file.path(output.fig.dir, "fibro_pw_models_auroc.Rdata")) +load(file.path(output.fig.dir, "spine_auc_enrich_group.Rdata")) + +# Initialize top-leve pathway grouping +groups <- setNames(pw.enriched$RootName, pw.enriched$Name) + +# Initialize cell line metadata table +xcell <- fread(library.file) %>% + dplyr::rename(CellLine = `Cell line`) %>% + dplyr::rename(Age = `Age of biopsy`) %>% + dplyr::rename(AgeOfOnset = `Age of onset`) %>% + dplyr::rename(Figure = `Figure Names`) %>% + dplyr::select(CellLine, Age, AgeOfOnset, Figure) + +# Select top-performaing pathway classifiers from spinal cord data +top.spine.pw <- group_by(top.pw, RootName) %>% + mutate(Count = n()) %>% + filter(Count >= 3) %>% + top_n(1, AUC) %>% + filter(!is.na(RootName)) + +# Initialize enzemblized fibroblast predictions +ypred.ens <- filter(ypred, Name %in% top.spine.pw$Name) %>% + filter(Genetics != "sporadic") %>% + group_by(CellLine, Genetics, Sex, Site) %>% + summarize(YpredSeq = mean(YpredSeq), Ypred = mean(Ypred)) %>% + ungroup() + +# Merge predictions with cell metadata +predictions <- left_join(ypred.ens, xcell, by = "CellLine") %>% + dplyr::select(-Ypred) %>% + dplyr::rename(Ypred = YpredSeq) %>% + mutate(AgeOfOnset = ifelse(AgeOfOnset == "Control", Age, AgeOfOnset)) %>% + mutate(AgeOfOnset = ifelse(AgeOfOnset == "control", Age, AgeOfOnset)) %>% + mutate(AgeOfOnset = ifelse(AgeOfOnset == "", Age, AgeOfOnset)) %>% + mutate(AgeOfOnset = as.numeric(AgeOfOnset)) %>% + mutate(Figure = str_remove_all(Figure, "-.*$")) + +if (age.of.onset) predictions$Age <- predictions$AgeOfOnset + +# Compute age / prediction correlation by group +xplot.text <- group_by(predictions, Genetics) %>% + summarize( + Cor = cor.test(Ypred, Age, alternative = "less")$estimate, + CorP = cor.test(Ypred, Age, alternative = "less")$p.value + ) %>% + mutate(CorP = p.adjust(CorP, method = "bonferroni")) %>% + mutate(Cor = round(Cor, 3)) %>% + mutate(CorP = round(CorP, 3)) %>% + mutate(PLab = sapply(CorP, label_pval)) %>% + mutate(Ypred = ifelse(Genetics == "Healthy", 0.95, 0.85)) %>% + mutate(text = str_c("Cor = ", Cor, PLab)) + +# Plot predictions v. age, FUS v. Healthy +fout <- file.path(output.fig.dir, "fig_supp_rna_aob.pdf") +if (save.fig) pdf(file = fout, h = 8, w = 8) +xplot.text$Age <- 60 +filter(predictions, Genetics %in% c("Healthy", "FUS-ALS")) %>% + ggplot(aes(x = Age, y = Ypred, col = Genetics)) + + geom_point(size = 3) + + geom_line(stat = "smooth", method = "lm", formula = y ~ x, size = 2) + + geom_text(data = filter(xplot.text, Genetics != "sporadic"), aes(label = text), size = 8) + + geom_text_repel(aes(label = CellLine)) + + scale_color_manual(values = col.pal) + + ylim(0:1) + + ylab("t-MAP score") + + theme(legend.position = "none") +if (save.fig) dev.off() diff --git a/paper_figures/Figure_Transcriptomics/fig_fibro.R b/paper_figures/Figure_Transcriptomics/fig_fibro.R new file mode 100644 index 0000000..b47a9be --- /dev/null +++ b/paper_figures/Figure_Transcriptomics/fig_fibro.R @@ -0,0 +1,332 @@ +######################################################## i####################### +#+ setup, echo=FALSE, warning=FALSE, message=FALSE +################################################################################ +library(data.table) +library(tidyverse) +library(ggsci) +library(Matrix) +library(PRROC) +library(ggrepel) +library(org.Hs.eg.db) +library(RColorBrewer) + +options(stringsAsFactors = FALSE) + +auroc <- function(yp, yt) roc.curve(scores.class0 = yp, weights.class0 = yt)$auc + +# Paths to data directories +fig.base.dir <- Sys.getenv("ALS_PAPER") +data.dir <- file.path(fig.base.dir, "data/050322_RNAseq/") +figure.dir <- file.path(fig.base.dir, "paper_figures", "Figure_Transcriptomics") +pathway.file <- file.path(data.dir, "reactome_pathway.Rdata") +gene.file <- file.path(fig.base.dir, "data", "transcriptomic_genes.csv") + +output.fig.dir <- file.path(figure.dir, "fig") +dir.create(output.fig.dir, showWarnings = FALSE) + +source(file.path(figure.dir, "utilities.R")) +source(file.path(fig.base.dir, "paper_figures", "utilities.R")) +source(file.path(fig.base.dir, "paper_figures", "color_palette.R")) +source(file.path(fig.base.dir, "paper_figures", "theme.R")) +theme_set(theme_als()) + +# Parameters for analysis +heat.pal <- pal_material("blue-grey")(10) +save.fig <- TRUE + +################################################################################ +#+ load_data +################################################################################ +load(file.path(output.fig.dir, "fibro_models.Rdata")) +load(file.path(output.fig.dir, "spine_models.Rdata")) +load(pathway.file) + +xgene <- fread(gene.file) %>% + mutate(Disease = ifelse(!Disease %in% c("ALS", ""), "ND", Disease)) + +# Initialize pathway groups +groups <- setNames(pw.enriched$RootName, pw.enriched$Name) + +# Initialize keys for converting between ENSEMBL / Symbol gene IDs +genes <- rbindlist(coefs)$Var1 %>% + as.character() %>% + str_subset("^ENS") + +gene.key.se <- mapIds( + x = org.Hs.eg.db, + keys = genes, + column = "SYMBOL", + keytype = "ENSEMBL" +) + +# Select top-performing pathway classifiers from spinal cord data +top.spine.pw <- filter(pw.enriched, RootName != "Other") %>% + group_by(RootName) %>% + top_n(1, AUC) + +# Compute fibroblast AUROC for pathways / groups +auc.fibro <- filter(ypred, Genetics != "sporadic") %>% + group_by(Name) %>% + summarize(AUC = auroc(YpredSeq, ALS)) %>% + arrange(desc(AUC)) + +auc.fibro.group <- mutate(auc.fibro, Group = groups[Name]) %>% + group_by(Group) %>% + summarize(AUC = max(AUC), Name = Name[which.max(AUC)]) %>% + arrange(AUC) + +# Generate pathway enrichment table +xtab <- left_join(pw.enriched, auc.fibro, by='Name') %>% + dplyr::rename(AUROC_Spine=AUC.x) %>% + dplyr::rename(AUROC_Fibroblast=AUC.y) %>% + group_by(RootName) %>% + arrange(desc(AUROC_Fibroblast), .by_group = TRUE) %>% + ungroup() %>% + dplyr::select(Name, RootName, Size, AUROC_Fibroblast, AUROC_Spine) + +fout <- file.path(output.fig.dir, 'pw_enrichment_table.csv') +write.csv(file = fout, xtab, row.names = FALSE) + +################################################################################ +#' ## Distribution of predictions by pathway +#+ pw_dist, fig.height=12, fig.width=12 +################################################################################ +ypred.ens <- filter(ypred, Name %in% top.spine.pw$Name) %>% + filter(Genetics != "sporadic") %>% + group_by(CellLine, Genetics, Sex, Site) %>% + summarize(YpredSeq = mean(YpredSeq), Ypred = mean(Ypred)) + +fit <- lm(YpredSeq ~ Genetics + Sex + Site, data = ypred.ens) +pval.lm <- summary(fit)$coefficients["GeneticsHealthy", "Pr(>|t|)"] +print(pval.lm) + +yfus <- ypred.ens$YpredSeq[ypred.ens$Genetics == "FUS-ALS"] +ywt <- ypred.ens$YpredSeq[ypred.ens$Genetics == "Healthy"] +pval.sr <- wilcox.test(yfus, ywt, alternative = "greater")$p.value +print(pval.sr) + +pval.lab <- label_pval(pval.sr) +ymax <- max(ypred.ens$Ypred) + 1e-2 + +fout <- file.path(output.fig.dir, "predictions_by_genetics.pdf") +if (save.fig) pdf(fout, height = 8, width = 4) +ggplot(ypred.ens, aes(x = Genetics, y = YpredSeq, fill = Genetics)) + + geom_boxplot(outlier.shape=NA) + + geom_jitter(size = 3, width = 0.2, alpha = 0.6) + + geom_text(x = 1.5, y = ymax + 5e-2, label = pval.lab, size = 8) + + geom_segment(x = 1, xend = 2, y = ymax, yend = ymax, col = "#0B4668", linewidth = 1) + + scale_fill_manual(values = col.pal) + + ylab("t-MAP score") + + xlab(NULL) + + theme(legend.position = "none") + + ylim(0:1) +if (save.fig) dev.off() + +################################################################################ +#' ## Imaging vs. sequencing predictions for select pathway +#+ pw_predictions, fig.height=12, fig.width=12 +################################################################################ +xthr <- filter(ypred.ens, Genetics == "Healthy") %>% arrange(desc(Ypred)) +xthr <- xthr$Ypred[1] + +ythr <- filter(ypred.ens, Genetics == "Healthy") %>% arrange(desc(YpredSeq)) +ythr <- ythr$YpredSeq[1] + +fout <- file.path(output.fig.dir, "pathway_predictions.pdf") +if (save.fig) pdf(fout, height = 10, width = 10) +ypred.ens %>% + ggplot(aes(x = Ypred, y = YpredSeq, col = Genetics)) + + geom_vline(xintercept = xthr, col = "grey", lty = 2, linewidth = 1.5) + + geom_hline(yintercept = ythr, col = "grey", lty = 2, linewidth = 1.5) + + geom_point(size = 4) + + geom_text_repel(aes(label = CellLine), size = 6) + + scale_color_manual(values = col.pal) + + theme(legend.position = "none") + + xlab("i-MAP score") + + ylab("t-MAP score") + + xlim(0:1) + + ylim(0:1) +if (save.fig) dev.off() + + +# Plot ensemblized predictions with sporadics +ypred.ens.full <- filter(ypred, Name %in% top.spine.pw$Name) %>% + group_by(CellLine, Genetics, Sex, Site) %>% + summarize(YpredSeq = mean(YpredSeq), Ypred = mean(Ypred)) + +fout <- file.path(output.fig.dir, "pathway_predictions_full.pdf") +if (save.fig) pdf(fout, height = 10, width = 10) +ypred.ens.full %>% + ggplot(aes(x = Ypred, y = YpredSeq, col = Genetics)) + + geom_vline(xintercept = xthr, col = "grey", lty = 2, linewidth = 1.5) + + geom_hline(yintercept = ythr, col = "grey", lty = 2, linewidth = 1.5) + + geom_point(size = 4) + + geom_text_repel(aes(label = CellLine), size = 6) + + scale_color_manual(values = col.pal) + + theme(legend.position = "none") + + xlab("i-MAP score") + + ylab("t-MAP score") + + xlim(0:1) + + ylim(0:1) +if (save.fig) dev.off() + +################################################################################ +#+ ## imaging and pathway predictions by cell line +#+ predictions_by_celline +################################################################################ +xplot <- group_by(ypred, Name) %>% + mutate(SeqThreshold = max(YpredSeq[ALS == 0])) %>% + mutate(Threshold = max(Ypred[ALS == 0])) %>% + ungroup() %>% + mutate(SepSeq = YpredSeq > SeqThreshold) %>% + mutate(Sep = Ypred > Threshold) %>% + mutate(Group = groups[Name]) %>% + group_by(Group, CellLine, Genetics) %>% + summarize(Sep = any(Sep), SepSeq = any(SepSeq)) + +xplot.seq <- filter(xplot, SepSeq) %>% + dplyr::select(CellLine, Group, Genetics) + +xplot.img <- filter(xplot, Sep) %>% + dplyr::select(CellLine, Group, Genetics) %>% + mutate(Group = "Imaging") + +cell.order <- dplyr::select(ypred, CellLine, Ypred, ALS) %>% + distinct() %>% + arrange(desc(Ypred)) + +fout <- file.path(output.fig.dir, "cell_line_sep.pdf") +if (save.fig) pdf(fout, height = 8, width = 10) +rbind(xplot.seq, xplot.img) %>% + filter(Genetics != "sporadic") %>% + mutate(Group = factor(Group, levels = c(auc.fibro.group$Group, "Imaging"))) %>% + mutate(CellLine = factor(CellLine, levels = cell.order$CellLine)) %>% + ggplot(aes(x = CellLine, y = Group)) + + geom_point(size = 8) + + theme(axis.text.x = element_text(color = col.pal["FUS-ALS"])) + + geom_hline(yintercept = 8.5, lty = 2, col = "#363636") + + xlab(NULL) + + ylab(NULL) + + theme(legend.position = "none") +if (save.fig) dev.off() + +################################################################################ +#' ## Number of cell lines classified by modality +#+ num_cell_line, fig.height=12, fig.width=12 +################################################################################ +xplot <- auc.fibro.group %>% + mutate(Group = factor(Group, levels = Group)) %>% + filter(Group != "Other") + +fout <- file.path(output.fig.dir, "num_classified_auc.pdf") +if (save.fig) pdf(fout, height = 8, width = 2) +ggplot(xplot, aes(x = Group, y = AUC)) + + geom_point(size = 6, color = "#0B4668") + + geom_segment(aes(xend = Group, yend = 0.7), lwd = 3, color = "#0B4668") + + coord_flip() + + theme(legend.position = "none") + + xlab(NULL) + + theme(axis.text.y = element_blank()) + + ylab("AUROC") + + scale_y_continuous(breaks = c(0.7, 1), limits = c(0.7, 1)) +if (save.fig) dev.off() + +################################################################################ +#' ## Minimal gene set +#' We sought to identify minimal gene sets cable of predicting ALS vs. health +#' within each PW group. This strategy was analogous to imaging: (i) define a +#' refined, pathway focused assay (ii) train classifier (iii) predict hold-out. +#' +#' As with imaging, we focused on sparse (minimal) set of readouts for +#' prediction. Focusing on spinal cord enriched PWs to prioritize ALS relevant +#' pathways. +#+ coeffs, fig.height=8, fig.width=12 +################################################################################ +#' ## Minimal gene set +coef.select.full <- rbindlist(coefs) %>% + filter(str_detect(Var1, "^ENSG")) %>% + mutate(Group = groups[Name]) %>% + filter(Group != "Other") %>% + group_by(Var1, Group, Name) %>% + summarize(Stability = mean(AbsBeta != 0), AbsBeta = mean(AbsBeta)) %>% + mutate(Var1 = as.character(Var1)) %>% + group_by(Group, Var1) %>% + summarize(Stability = max(Stability), AbsBeta = max(AbsBeta)) %>% + mutate(SYMBOL = gene.key.se[Var1]) %>% + filter(Stability >= 0.5) + +# Initialize stability score / coefficient matrices +coef.select <- group_by(coef.select.full, Group) %>% + top_n(10, Stability) %>% + mutate(Group = factor(Group, levels = auc.fibro.group$Group)) + +xgene <- dplyr::rename(xgene, SYMBOL = Gene)[-14, ] +xgene <- mutate(xgene, Disease = factor(Disease, levels = c("ALS", "ND", ""))) + +# Plot gene fingerprints by pathway +gene.order <- left_join(coef.select, xgene, by = "SYMBOL") %>% + group_by(SYMBOL) %>% + summarize(Stability = max(Stability)) %>% + arrange(desc(Stability)) + +fout <- file.path(output.fig.dir, "gene_fingerprint.pdf") +if (save.fig) pdf(fout, height = 8, width = 24) +mutate(coef.select, SYMBOL = factor(SYMBOL, levels = gene.order$SYMBOL)) %>% + ggplot(aes(x = SYMBOL, y = Group)) + + geom_point(aes(col = Stability), size=8) + + geom_point(shape = 1, size=8) + + scale_color_material("blue-grey", breaks=c(0.5, 1), limits=c(0.5, 1)) + + theme(axis.text.x = element_text(angle = 90)) + + xlab(NULL) + + ylab(NULL) + + theme(legend.position = "bottom") + + theme(legend.title = element_blank()) + + theme(legend.key.width = unit(2, "cm")) +if (save.fig) dev.off() + + +fout <- file.path(output.fig.dir, "gene_refs.pdf") +if (save.fig) pdf(fout, height = 3, width = 24) +mutate(xgene, SYMBOL = factor(SYMBOL, levels = gene.order$SYMBOL)) %>% + ggplot(aes(x = SYMBOL, y = 1, shape = Disease)) + + geom_point(size = 8) + + theme(axis.text.x = element_text(angle = 90)) + + theme(axis.text.y = element_blank()) + + theme(panel.grid.minor.y = element_blank()) + + theme(panel.grid.major.y = element_blank()) + + scale_shape_manual(guide = "none", values = c(8, 10, 32)) + + xlab(NULL) + + ylab(NULL) + + ylim(0.95, 1.05) +if (save.fig) dev.off() + +# Plot gene fingerprints aggregated +fout <- file.path(output.fig.dir, "gene_fingerprint_agg.pdf") +if (save.fig) pdf(fout, height = 7, width = 20) +left_join(gene.order, xgene, by = "SYMBOL") %>% + mutate(Disease = as.character(Disease)) %>% + mutate(Disease = ifelse(Disease == "", "No reported ND", Disease)) %>% + mutate(Disease = ifelse(is.na(Disease), "No reported ND", Disease)) %>% + mutate(SYMBOL = factor(SYMBOL, levels = rev(gene.order$SYMBOL))) %>% + ggplot(aes(y = SYMBOL, x = Stability, col = Disease)) + + geom_point(size=5) + + geom_segment(aes(yend=SYMBOL), xend = 0.5, linewidth = 3) + + scale_color_jama() + + theme(axis.text.x = element_text(angle = 90)) + + xlab('Stability score') + + ylab(NULL) + + labs(col=NULL) + + theme(legend.position = c(0.1, 0.85)) + + theme(legend.text = element_text(size = 18)) + + coord_flip() +if (save.fig) dev.off() + +# Write gene fingerprint table +xtab <- left_join(coef.select.full, xgene, by = "SYMBOL") %>% + dplyr::rename(RootName = Group) %>% + group_by(RootName) %>% + arrange(desc(Stability), .by_group = TRUE) %>% + dplyr::select(RootName, SYMBOL, Stability, `Protein name`, Function, Disease, Reference, `Author, Year`) + +fout <- file.path(output.fig.dir, 'gene_enrichment_table.csv') +write.csv(file = fout, xtab, row.names = FALSE) \ No newline at end of file diff --git a/paper_figures/Figure_Transcriptomics/fig_spine.R b/paper_figures/Figure_Transcriptomics/fig_spine.R new file mode 100644 index 0000000..0198f36 --- /dev/null +++ b/paper_figures/Figure_Transcriptomics/fig_spine.R @@ -0,0 +1,119 @@ +################################################################################ +#+ setup, echo=FALSE, warning=FALSE, message=FALSE +################################################################################ +library(data.table) +library(tidyverse) +library(Matrix) +library(ggsci) +library(PRROC) +library(ggh4x) +library(parallel) +library(RColorBrewer) +options(stringsAsFactors = FALSE) +theme_set(theme_bw(base_size = 22)) + +# Paths to data directories +fig.base.dir <- Sys.getenv("ALS_PAPER") +data.dir <- file.path(fig.base.dir, "data/050322_RNAseq/") +figure.dir <- file.path(fig.base.dir, "paper_figures", "Figure_Transcriptomics") +pathway.file <- file.path(data.dir, "reactome_pathway.Rdata") +hierarchy.file <- file.path(fig.base.dir, "data", "reactome_hierarchy.Rdata") +output.fig.dir <- file.path(figure.dir, "fig") + +disc.col.pal <- pal_npg()(10)[-8] +save.fig <- FALSE +n.null <- 100000 +p.thresh <- 0 + +load(file.path(output.fig.dir, "spine_models.Rdata")) +load(hierarchy.file) + +source(file.path(fig.base.dir, "paper_figures", "color_palette.R")) + +clean_name <- function(x, n = 10) { + # Prettify pathway names for figure axes + x <- str_split(x, " ")[[1]] + #if (length(x) > 12) n <- 6 + nbreak <- floor(length(x) / n) + + # adjust n for roughly equal words per line + n <- ceiling(length(x) / (nbreak + 1)) + + if (nbreak > 0) { + for (i in 1:nbreak) { + if ((i * n) > length(x)) break + x[i * n] <- str_c(x[i * n], "\n") + } + } + + return(str_c(x, collapse = " ")) +} + +################################################################################ +#+ pathways, fig.height=12, fig.width=16 +# Initialize top-level pathways in hierarchy +################################################################################ +root.nodes <- filter(nodes, Top)$id +root.children <- lapply(root.nodes, get_children, nodes = nodes, edges = edges) +root.children <- mapply(function(r, ch) c(r, ch), root.nodes, root.children, SIMPLIFY = FALSE) + +root.children <- reshape2::melt(root.children) %>% + dplyr::rename(Root = L1) %>% + dplyr::rename(Depth = L2) %>% + dplyr::rename(id = value) %>% + left_join(nodes, by = "id") %>% + dplyr::rename(Name = label) + +root.key <- filter(root.children, Top) %>% + dplyr::select(Root, Name) %>% + dplyr::rename(RootName = Name) + +pw.enriched <- filter(pw.aucs, p <= p.thresh) %>% + left_join(root.children, by = "Name") %>% + left_join(root.key, by = "Root") %>% + group_by(RootName) %>% + mutate(Count = n()) %>% + mutate(RootName = ifelse(Count < 3, "Other", RootName)) %>% + ungroup() %>% + filter(!(Name == "Retinoid metabolism and transport" & RootName == "Other")) + +# Plot pathways with enrichment +limits <- c(0.25, 1) +breaks <- seq(0.25, 1, by=0.25) + +p1 <- filter(pw.enriched, p == 0) %>% + mutate(Name=str_sub(Name, 1, 50)) %>% + ggplot(aes(x=AUC, y=RootName)) + + #geom_boxplot() + + geom_jitter(height=0.25, width=0) + + ylab(NULL) + + scale_x_continuous(limits=limits, breaks=breaks) + + xlab(NULL) + + theme(axis.text.x = element_blank()) + + labs(fill=NULL) + +p2 <- ggplot(null.accuracy) + + geom_histogram(aes(x=AUC)) + + scale_x_continuous(limits=limits, breaks=breaks) + + ylab(NULL) + + xlab('AUROC') + +# build the plots +p1.common <- ggplot_gtable(ggplot_build(p1)) +p2.common <- ggplot_gtable(ggplot_build(p2)) +p2.common$widths <- p1.common$widths + +fout <- file.path(output.fig.dir, 'spine_auroc.pdf') +pdf(file=fout, height=12, width=8) +grid.arrange(p1.common, p2.common, nrow=2, heights=c(5, 1)) +dev.off() + + +# Save Rdata object +fout <- file.path(output.fig.dir, "spine_models.Rdata") +save(file = fout, pw.aucs, pw.models, null.pred, null.accuracy, pw.enriched) + +# Save table of pathway enrichment +fout <- file.path(output.fig.dir, "spine_enrichment_table.csv") +xtab <- dplyr::select(pw.enriched, Name, RootName, Size, AUC, p) +write.csv(file = fout, xtab, row.names=FALSE) diff --git a/paper_figures/Figure_Transcriptomics/fit_fibro.R b/paper_figures/Figure_Transcriptomics/fit_fibro.R new file mode 100644 index 0000000..731c5c4 --- /dev/null +++ b/paper_figures/Figure_Transcriptomics/fit_fibro.R @@ -0,0 +1,101 @@ +################################################################################ +#+ setup, echo=FALSE, warning=FALSE, message=FALSE +################################################################################ +library(data.table) +library(tidyverse) +library(Matrix) +library(mltools) +library(ggrepel) +library(parallel) + +options(stringsAsFactors = FALSE) +theme_set(theme_bw(base_size = 22)) + +# Paths to data directories +fig.base.dir <- Sys.getenv("ALS_PAPER") +data.dir <- file.path(fig.base.dir, "data/050322_RNAseq/") +figure.dir <- file.path(fig.base.dir, "paper_figures", "Figure_Transcriptomics") +library.file <- file.path(fig.base.dir, "data", "Fibroblasts_Library.csv") +pathway.file <- file.path(data.dir, "reactome_pathway.Rdata") + +output.fig.dir <- file.path(figure.dir, "fig") +dir.create(output.fig.dir, showWarnings = FALSE) +source(file.path(figure.dir, "utilities.R")) + +# Parameters for analysis +n.core <- 12 +nrep <- 10 +bootstrap <- TRUE + +gene_transform <- function(x) rank(x) +model <- fit_lm_classification +model_predict <- predict_lm_classification + +################################################################################ +#' ## Data preprocessing +#+ load_data, echo=FALSE, warning=FALSE, message=FALSE +################################################################################ +load(file.path(data.dir, "fibro_processed.Rdata")) +load(file.path(output.fig.dir, "spine_models.Rdata")) +load(file.path(output.fig.dir, "genes.Rdata")) +load(pathway.file) + +colnames(xmeta.fibro) <- str_replace_all(colnames(xmeta.fibro), " ", "_") + +x <- x.fibro[, shared.genes] +xmeta <- mutate(xmeta.fibro, Y = as.numeric(ALS)) +rm(x.fibro, xmeta.fibro) + +# Initialize one-hot encoding of metadata features +xonehot <- dplyr::select(xmeta, Site, Sex) %>% + mutate_all(as.factor) %>% + as.data.table() %>% + mltools::one_hot() + +# Normalize features for modeling +x <- apply(x, MAR = 2, gene_transform) + +# Load in cell line library metadata +xcell <- fread(library.file) %>% + dplyr::rename(CellLine = `Cell line`) %>% + dplyr::rename(Fig = `Figure Names`) + +################################################################################ +#' ## Modeling +#' Transcription model of ALS. We fit supervised models trained to predict +#' disease status. We use a leave-one-out sample split to evaluate predictions +#' on held-out WT & FUS cell lines. Genes used as features in classification +#' models are taken from pathways enriched in patient-derived spinal cord +#' samples. +#+ models, fig.height=8, fig.width=12, cache=TRUE +################################################################################ +pw.genes <- pw.enriched$ENSEMBL +pathways <- pw.enriched$Name + +fit <- mclapply(pw.genes, function(g) { + fit_genes(x, xmeta, xonehot, g, model, model_predict, nrep, bootstrap = bootstrap) +}, mc.cores = n.core) + +ypred <- lapply(1:length(fit), function(i) { + return(mutate(fit[[i]]$Ypred, Name = pathways[i])) +}) + +ypred <- rbindlist(ypred) %>% + group_by(CellLine, Genetics, Name, Site, Sex) %>% + summarize(YpredSeq = mean(YpredSeq), Ypred = mean(Ypred), .groups = "drop") %>% + mutate(ALS = as.numeric(Genetics != "Healthy")) + +coefs <- lapply(1:length(fit), function(i) { + coef.i <- reshape2::melt(fit[[i]]$Coef) %>% + mutate(Name = pathways[i]) %>% + dplyr::rename(CellIdx = L1) %>% + filter(Var1 != "Intercept") %>% + mutate(AbsBeta = abs(value)) %>% + mutate(SignBeta = sign(value)) %>% + dplyr::select(Name, Var1, CellIdx, AbsBeta, SignBeta) + + return(coef.i) +}) + +fout <- file.path(output.fig.dir, "fibro_models.Rdata") +save(file = fout, fit, ypred, coefs) diff --git a/paper_figures/Figure_Transcriptomics/fit_spine.R b/paper_figures/Figure_Transcriptomics/fit_spine.R new file mode 100644 index 0000000..964ddb3 --- /dev/null +++ b/paper_figures/Figure_Transcriptomics/fit_spine.R @@ -0,0 +1,133 @@ +############################################################################### +#+ setup, echo=FALSE, warning=FALSE, message=FALSE +################################################################################ +library(data.table) +library(tidyverse) +library(Matrix) +library(caret) +library(iRF) +library(PRROC) +library(parallel) + +options(stringsAsFactors = FALSE) +theme_set(theme_bw(base_size = 22)) + +# Paths to data directories +fig.base.dir <- Sys.getenv("ALS_PAPER") +data.dir <- file.path(fig.base.dir, "data/050322_RNAseq/") +figure.dir <- file.path(fig.base.dir, "paper_figures", "Figure_Transcriptomics") +output.fig.dir <- file.path(figure.dir, "fig") + +n.core <- 12 +train.prop <- 0.5 # training proportion +min.healthy <- 3 # minimum number of healthy samples to include site in analysis +n.null <- 100000 # number of bootstrap subsamples for null distribution +min.size <- 5 # minimum pathway size included in analysis +max.size <- 500 # maximmum pathway size included in analysis + +source(file.path(figure.dir, "utilities.R")) +load(file.path(data.dir, "spine_processed.Rdata")) +load(file.path(output.fig.dir, "genes.Rdata")) +load(file.path(data.dir, "reactome_pathway.Rdata")) + +auc <- function(x) { + # Compute AUROC from data frame with columns `Ypred`, `Ytrue` + roc.curve(scores.class0 = x$Ypred, weights.class0 = x$Ytrue)$auc +} + +pw_predict <- function(x, y, id.train, genes) { + # Fit ALS vs. health classification models on subset of genes + fselect <- str_c(genes, collapse = "|") + fselect <- str_c("(", fselect, "|Sex|^Site$|^Age$|pct_)") + x <- dplyr::select(x, matches(fselect)) + return(fit_sample_split(x, y, id.train)) +} + +################################################################################ +#' ## Load data +#' Below Initialize spinal cord data sets used in classification models. We +#' include sample metadata features: `Sex`, `Site` (of sample collection), +#' `Age`, and `pct_x` (x corresponding to ancestry computed from WGS data). +#+ modeling +################################################################################ +xmeta <- dplyr::select(xmeta.spine, matches("(^Sex$|^Site$|^Age$|pct_)")) +x.spine <- x.spine[, shared.genes] + +x <- cbind(x.spine, xmeta) +y <- as.factor(xmeta.spine$ALS) + +# Filter to sites with both ALS and healthy control samples +sites <- group_by(xmeta.spine, Site) %>% summarize(NHealthy = sum(!ALS)) +dplyr::select(xmeta.spine, Site, ALS) %>% table() + +genes <- colnames(x.spine) +id.drop <- xmeta$Site %in% sites$Site[sites$NHealthy < min.healthy] + +x <- x[!id.drop, ] +y <- y[!id.drop] +xmeta <- xmeta[!id.drop, ] +x.spine <- x.spine[!id.drop, ] + +# Take balanced sample within site +set.seed(47) + +xtrain <- data.frame(x, Y = y) %>% + mutate(Idx = 1:n()) %>% + group_by(Site, Y) %>% + sample_frac(train.prop) + +id.train <- xtrain$Idx +id.test <- setdiff(1:nrow(x), id.train) +ytrue <- as.numeric(y) - 1 + +################################################################################ +#' ## Null model +#' Performance of a null model is estimated +#+ null_model +################################################################################ +bs.train <- replicate(n.null, sample(id.train, replace = TRUE), simplify = FALSE) +bs.test <- replicate(n.null, sample(id.test, replace = TRUE), simplify = FALSE) + +fit.null <- mcmapply(function(trn, tst) { + fit <- fit_sample_split(xmeta, y, trn, tst) + ypred <- data.frame(Ypred = fit$ypred, Ytrue = ytrue[fit$id]) + acc <- data.frame(AUC = auc(ypred)) + return(list(ypred = ypred, acc = acc)) +}, bs.train, bs.test, mc.cores = n.core, SIMPLIFY = FALSE) + +null.accuracy <- rbindlist(lapply(fit.null, function(z) z$acc)) +null.pred <- lapply(fit.null, function(z) z$ypred) + +################################################################################ +#' ## Pathway models +#' For each reactome pathway, we fit ALS vs. healthy classifiers using the +#' subset of genes associated with that pathway. +#+ pw_models +################################################################################ +reactome.pathways <- reactome.pathways %>% + mutate(ENSEMBL = sapply(ENSEMBL, intersect, y = genes)) %>% + mutate(Size = sapply(ENSEMBL, length)) %>% + filter(Size > min.size & Size <= max.size) %>% + dplyr::select(-Genes, -ID) + +# Fit classification models for each reactome pathway +pw.models <- mclapply(reactome.pathways$Name, function(pw) { + genes <- filter(reactome.pathways, Name == pw)$ENSEMBL + genes <- na.omit(unlist(genes)) + fit.g <- pw_predict(x, y, id.train, genes) + ypred <- data.frame(Ypred = fit.g$ypred, Ytrue = ytrue[fit.g$id]) + + return(list( + acc = data.frame(AUC = auc(ypred), Name = pw), + ypred = ypred, + importance = fit.g$importance + )) +}, mc.cores = n.core) + +pw.aucs <- rbindlist(lapply(pw.models, function(z) z$acc)) %>% + arrange(desc(AUC)) %>% + left_join(reactome.pathways, by = "Name") %>% + mutate(p = sapply(AUC, function(z) mean(null.accuracy$AUC > z))) + +fout <- file.path(output.fig.dir, "spine_models.Rdata") +save(file = fout, pw.aucs, pw.models, null.pred, null.accuracy) diff --git a/paper_figures/Figure_Transcriptomics/load_fibroblast.R b/paper_figures/Figure_Transcriptomics/load_fibroblast.R index 10eb4b1..71b3170 100644 --- a/paper_figures/Figure_Transcriptomics/load_fibroblast.R +++ b/paper_figures/Figure_Transcriptomics/load_fibroblast.R @@ -1,74 +1,65 @@ -#' The following loads RNA-seq data for patient derived fibroblast samples. -#' -#' Requires input datasets -#' `data/050322_RNAseq` -#' `data/reactome_pathway.Rdata` -#' `scripts/Figure_scores/models/predictions.Rdata` -#' -#' **Note:** RNA-seq read count datasets can be processed from raw fastq files -#' using the `als_rnaseq_fibro` pipeline. -#' Karl Kumbier 10/19/2022library(data.table) -library(tidyverse) +################################################################################ +#+ setup +################################################################################ library(Matrix) -library(clusterProfiler) +library(tidyverse) +library(data.table) +library(DESeq2) -options(stringsAsFactors=FALSE) +options(stringsAsFactors = FALSE) -################################################################################ -# Load data and initialize analysis paramters -################################################################################ # Paths to data directories -fig.base.dir <- Sys.getenv('ALS_PAPER') -data.dir <- file.path(fig.base.dir, 'data/050322_RNAseq/') +fig.base.dir <- Sys.getenv("ALS_PAPER") +data.dir <- file.path(fig.base.dir, "data/050322_RNAseq/") -pred.dir <- file.path(fig.base.dir, 'paper_figures', 'Figure_Scores', 'fig_full') -pathway.file <- str_c(data.dir, 'reactome_pathway.Rdata') -library.file <- file.path(fig.base.dir, 'data', 'Fibroblasts_Library.csv') -qc.file <- file.path(fig.base.dir, 'data', 'data Table 2.1 Sample Sequencing Statistics.csv') +pred.dir <- file.path(fig.base.dir, "paper_figures", "Figure_Scores", "irf", "fig_full") +pathway.file <- str_c(data.dir, "reactome_pathway.Rdata") +library.file <- file.path(fig.base.dir, "data", "Fibroblasts_Library.csv") +qc.file <- file.path(fig.base.dir, "data", "data Table 2.1 Sample Sequencing Statistics.csv") -load(file.path(data.dir, 'read_counts.Rdata')) -load(file.path(pred.dir, 'predictions.Rdata')) -xqc <- fread(qc.file) %>% dplyr::rename(CellLine=`Sample ID`) +load(file.path(data.dir, "read_counts.Rdata")) +load(file.path(pred.dir, "predictions.Rdata")) +xqc <- fread(qc.file) %>% dplyr::rename(CellLine = `Sample ID`) # Source utility functions -source(file.path(fig.base.dir, 'paper_figures', 'utilities.R')) +source(file.path(fig.base.dir, "paper_figures", "utilities.R")) expression.threshold <- 2.5 # minimum log TPM + 1 level for gene filtering ################################################################################ # Initialize cell line metadata ################################################################################ # Read in cell line meta data -cell.lines <- str_remove_all(bam.files, '^.*/') %>% str_remove_all('Aligned.*$') +cell.lines <- str_remove_all(bam.files, "^.*/") %>% str_remove_all("Aligned.*$") xcell <- fread(library.file) %>% - dplyr::rename(CellLine=`Cell line`) %>% - dplyr::rename(Genetics=`Disease Gene`) - -xmeta <- data.frame(CellLine=cell.lines) %>% - left_join(xcell, by='CellLine') %>% - mutate(Genetics=str_replace_all(Genetics, 'ALS with no known mutation', 'sporadic')) %>% - mutate(Genetics=str_replace_all(Genetics, 'WT', 'Healthy')) %>% - mutate(Genetics=str_replace_all(Genetics, 'FUS', 'FUS-ALS')) %>% - mutate(Sex=ifelse(Sex == 'm', 'M', Sex)) %>% - mutate(Site=`Origin Institution`) %>% - mutate(Institution=Site) %>% - left_join(xqc, by='CellLine') + dplyr::rename(CellLine = `Cell line`) %>% + dplyr::rename(Genetics = `Disease Gene`) + +xmeta <- data.frame(CellLine = cell.lines) %>% + left_join(xcell, by = "CellLine") %>% + mutate(Genetics = str_replace_all(Genetics, "ALS with no known mutation", "sporadic")) %>% + mutate(Genetics = str_replace_all(Genetics, "WT", "Healthy")) %>% + mutate(Genetics = str_replace_all(Genetics, "FUS", "FUS-ALS")) %>% + mutate(Sex = ifelse(Sex == "m", "M", Sex)) %>% + mutate(Site = `Origin Institution`) %>% + mutate(Institution = Site) %>% + left_join(xqc, by = "CellLine") # Initialize cell line genetics key gene.tab <- dplyr::select(xmeta, CellLine, Genetics) gene.tab <- setNames(gene.tab$Genetics, gene.tab$CellLine) # Merge metadata with imaging predictions -predictions <- group_by(predictions, CellLine) %>% +predictions <- group_by(predictions, CellLine) %>% arrange(desc(Ypred)) %>% dplyr::select(CellLine, CellLinePassageNumber, Ypred) -xmeta <- left_join(xmeta, predictions, by='CellLine') %>% - mutate(ALS=Genetics != 'Healthy') +xmeta <- left_join(xmeta, predictions, by = "CellLine") %>% + mutate(ALS = Genetics != "Healthy") #' # Preprocessing #' Raw read counts are preprocessed as follows: -#' +#' #' 1. Normalize expression within sample as transcript per million (TPM) #' 2. Drop genes with low expression: average log(TPM + 1) < `r expression.threshold` #' 3. Filter reactome pathway genes @@ -79,24 +70,26 @@ genes <- filter(genes, ensembl_gene_id %in% colnames(x)) %>% arrange(match(ensembl_gene_id, colnames(x))) exon.length <- setNames(genes$ExonSize, genes$ensembl_gene_id) -genes.drop <- which(is.na(exon.length)) %>% names -x <- as.matrix(x[,!colnames(x) %in% genes.drop]) +genes.drop <- which(is.na(exon.length)) %>% names() +x <- as.matrix(x[, !colnames(x) %in% genes.drop]) xcount <- x -x <- sapply(colnames(x), function(g) x[,g] / exon.length[g]) -x <- t(apply(x, MAR=1, function(z) 1e6 * z / sum(z))) +x <- sapply(colnames(x), function(g) x[, g] / exon.length[g]) +x <- t(apply(x, MAR = 1, function(z) 1e6 * z / sum(z))) # Drop low expression genes -xcount <- xcount[,colMedians(log(x + 1)) >= expression.threshold] -x <- x[,colMedians(log(x + 1)) >= expression.threshold] +xcount <- xcount[, colMedians(log(x + 1)) >= expression.threshold] +x <- x[, colMedians(log(x + 1)) >= expression.threshold] # Filter to genes with annotated pathways load(pathway.file) -xcount <- xcount[,colnames(x) %in% unlist(reactome.pathways$ENSEMBL)] -x <- x[,colnames(x) %in% unlist(reactome.pathways$ENSEMBL)] +xcount <- xcount[, colnames(x) %in% unlist(reactome.pathways$ENSEMBL)] +x <- x[, colnames(x) %in% unlist(reactome.pathways$ENSEMBL)] # Rename for joint analysis xcount.fibro <- xcount x.fibro <- x -xmeta.fibro <- mutate(xmeta, Type='Fibroblast') -rm(x, xmeta) \ No newline at end of file +xmeta.fibro <- mutate(xmeta, Type = "Fibroblast") + +fout <- file.path(data.dir, "fibro_processed.Rdata") +save(file = fout, x.fibro, xcount.fibro, xmeta.fibro) diff --git a/paper_figures/Figure_Transcriptomics/load_spine.R b/paper_figures/Figure_Transcriptomics/load_spine.R index 9b71cef..7e7dc63 100644 --- a/paper_figures/Figure_Transcriptomics/load_spine.R +++ b/paper_figures/Figure_Transcriptomics/load_spine.R @@ -1,63 +1,52 @@ -#' The following loads RNA-seq data for patient derived spinal cord samples. -#' -#' Requires input datasets -#' `data/050322_RNAseq` -#' `data/reactome_pathway.Rdata` -#' -#' **Note:** RNA-seq read count datasets can be processed from raw fastq files -#' using the `als_rnaseq_spine` pipeline. -#' -#' Karl Kumbier 10/19/2022 +################################################################################ +#+ setup +################################################################################ library(Matrix) library(tidyverse) library(data.table) +library(DESeq2) -################################################################################ -# Load data and initialize analysis paramters -################################################################################ # Paths to data directories -fig.base.dir <- Sys.getenv('ALS_PAPER') -data.dir <- file.path(fig.base.dir, 'data/050322_RNAseq/') -nygc.dir <- file.path(data.dir, 'nygc/') +fig.base.dir <- Sys.getenv("ALS_PAPER") +data.dir <- file.path(fig.base.dir, "data/050322_RNAseq/") +nygc.dir <- file.path(data.dir, "nygc/") -pred.dir <- file.path(fig.base.dir, 'paper_figures', 'Figure_Scores', 'fig_full') -pathway.file <- str_c(data.dir, 'reactome_pathway.Rdata') -library.file <- file.path(fig.base.dir, 'data', 'Fibroblasts_Library.csv') -pathway.file <- file.path(data.dir, 'reactome_pathway.Rdata') +pathway.file <- str_c(data.dir, "reactome_pathway.Rdata") +library.file <- file.path(fig.base.dir, "data", "Fibroblasts_Library.csv") # Source utility functions -source(file.path(fig.base.dir, 'paper_figures', 'utilities.R')) +source(file.path(fig.base.dir, "paper_figures", "utilities.R")) expression.threshold <- 2.5 # minimum log read count threshold # Load exon length table -load(file.path(data.dir, 'read_counts.Rdata')) +load(file.path(data.dir, "read_counts.Rdata")) gene.length <- dplyr::select(genes, ensembl_gene_id, ExonSize) # Load spinal cord read count data -load(file.path(nygc.dir, 'read_counts_aw.Rdata')) -genes <- left_join(genes, gene.length, by='ensembl_gene_id') +load(file.path(nygc.dir, "read_counts_aw.Rdata")) +genes <- left_join(genes, gene.length, by = "ensembl_gene_id") # Initialize patient populations used for analysis -als.groups <- 'ALS Spectrum MND' -control.groups <- 'Non-Neurological Control' -site <- 'Spinal_Cord_Cervical' +als.groups <- "ALS Spectrum MND" +control.groups <- "Non-Neurological Control" +site <- "Spinal_Cord_Cervical" # Initialize metadata table -meta.file <- 'RNA_Metadata_200212/Delivered Quotes-Table 1.csv' +meta.file <- "RNA_Metadata_200212/Delivered Quotes-Table 1.csv" xmeta <- fread(file.path(nygc.dir, meta.file)) - + colnames(xmeta) <- colnames(xmeta) %>% - str_remove_all(' ') %>% - str_remove_all('\\(.*$') + str_remove_all(" ") %>% + str_remove_all("\\(.*$") -xmeta <- dplyr::rename(xmeta, Mutation=ReportedGenomicMutations) %>% - mutate(SubjectGroup=str_replace_all(SubjectGroup, 'DIs', 'Dis')) +xmeta <- dplyr::rename(xmeta, Mutation = ReportedGenomicMutations) %>% + mutate(SubjectGroup = str_replace_all(SubjectGroup, "DIs", "Dis")) # Filter to metadata with RNA-seq samples xmeta <- filter(xmeta, ExternalSampleId %in% xbam$ExternalSampleId) #' # Preprocessing -#' +#' #' We preprocess data as follows: #' 1. Filter to samples derived from patient spinal cords #' 2. Filter to ALS and non-neurological control samples @@ -70,8 +59,8 @@ xmeta <- filter(xmeta, ExternalSampleId %in% xbam$ExternalSampleId) id.spine <- xmeta$SampleSource %in% site id.groups <- xmeta$SubjectGroup %in% c(als.groups, control.groups) -x <- x[id.spine & id.groups,] -xmeta <- xmeta[id.spine & id.groups,] +x <- x[id.spine & id.groups, ] +xmeta <- xmeta[id.spine & id.groups, ] # Normalize read counts as transcripts per killobase million (TPM) genes <- filter(genes, ensembl_gene_id %in% colnames(x)) %>% @@ -79,31 +68,33 @@ genes <- filter(genes, ensembl_gene_id %in% colnames(x)) %>% exon.length <- setNames(genes$ExonSize, genes$ensembl_gene_id) -genes.drop <- which(is.na(exon.length)) %>% names -x <- as.matrix(x[,!colnames(x) %in% genes.drop]) +genes.drop <- which(is.na(exon.length)) %>% names() +x <- as.matrix(x[, !colnames(x) %in% genes.drop]) xcount <- x -x <- sapply(colnames(x), function(g) x[,g] / exon.length[g]) -x <- t(apply(x, MAR=1, function(z) 1e6 * z / sum(z))) +x <- sapply(colnames(x), function(g) x[, g] / exon.length[g]) +x <- t(apply(x, MAR = 1, function(z) 1e6 * z / sum(z))) # Drop low expression genes -xcount <- xcount[,colMedians(log(x + 1)) >= expression.threshold] -x <- x[,colMedians(log(x + 1)) >= expression.threshold] +xcount <- xcount[, colMedians(log(x + 1)) >= expression.threshold] +x <- x[, colMedians(log(x + 1)) >= expression.threshold] # Filter to genes with annotated pathways load(pathway.file) -xcount <- xcount[,colnames(x) %in% unlist(reactome.pathways$ENSEMBL)] -x <- x[,colnames(x) %in% unlist(reactome.pathways$ENSEMBL)] +xcount <- xcount[, colnames(x) %in% unlist(reactome.pathways$ENSEMBL)] +x <- x[, colnames(x) %in% unlist(reactome.pathways$ENSEMBL)] # Standardize metadata feature names -xmeta <- mutate(xmeta, ALS=SubjectGroup %in% als.groups) %>% - dplyr::rename(Site=SiteSpecimenCollected) %>% - dplyr::rename(Age=AgeatDeath) %>% - mutate(Age=ifelse(Age == '90 or Older', 90, Age)) %>% - mutate(Age=as.numeric(Age)) +xmeta <- mutate(xmeta, ALS = SubjectGroup %in% als.groups) %>% + dplyr::rename(Site = SiteSpecimenCollected) %>% + dplyr::rename(Age = AgeatDeath) %>% + mutate(Age = ifelse(Age == "90 or Older", 90, Age)) %>% + mutate(Age = as.numeric(Age)) # Rename for joint analysis x.spine <- x xcount.spine <- xcount -xmeta.spine <- mutate(xmeta, Type='Spine') -rm(x, xmeta) +xmeta.spine <- mutate(xmeta, Type = "Spine") + +fout <- file.path(data.dir, "spine_processed.Rdata") +save(file = fout, x.spine, xcount.spine, xmeta.spine) diff --git a/paper_figures/Figure_Transcriptomics/pca_analysis.R b/paper_figures/Figure_Transcriptomics/pca_analysis.R index a6dd207..b6705da 100644 --- a/paper_figures/Figure_Transcriptomics/pca_analysis.R +++ b/paper_figures/Figure_Transcriptomics/pca_analysis.R @@ -1,4 +1,4 @@ -#' In this script, we run PCA analysis on expression data to investigate primary +#' In this script, we run PCA analysis on expression data and report primary #' drivers of variation in fibroblast / spinal cord expression data. #' #' Requires input datasets @@ -17,26 +17,12 @@ #+ setup, echo=FALSE, warning=FALSE, message=FALSE library(data.table) library(tidyverse) -library(hrbrthemes) library(Matrix) library(gridExtra) library(ggsci) options(stringsAsFactors=FALSE) - -theme_set( - theme_ipsum( - plot_title_size=24, - axis_title_size=18, - strip_text_size=18, - axis_text_size=18, - base_size=18, - axis=TRUE, - base_family='sans', - axis_col='#000000', - grid=FALSE - ) -) +theme_set(theme_bw(base_size=18)) grid.arrange4 <- function(...) grid.arrange(..., ncol=4) @@ -45,16 +31,16 @@ grid.arrange4 <- function(...) grid.arrange(..., ncol=4) ################################################################################ fig.base.dir <- Sys.getenv('ALS_PAPER') data.dir <- file.path(fig.base.dir, 'data/050322_RNAseq/') -figure.dir <- file.path(fig.base.dir, 'paper_figures', 'Figure3') -output.fig.dir <- file.path(figure.dir, 'fig3') +figure.dir <- file.path(fig.base.dir, 'paper_figures', 'Figure_Transcriptomics') +output.fig.dir <- file.path(figure.dir, 'fig') dir.create(output.fig.dir, showWarnings=FALSE) ################################################################################ # Load data and initialize analysis parameters ################################################################################ source(file.path(figure.dir, 'utilities.R')) -source(file.path(figure.dir, 'load_fibroblast.R')) -source(file.path(figure.dir, 'load_spine.R')) +load(file.path(data.dir, 'fibro_processed.Rdata')) +load(file.path(data.dir, 'spine_processed.Rdata')) # Initialize analysis parameters save.fig <- TRUE @@ -65,9 +51,6 @@ gene_transform <- function(x) log(x + 1) colnames(xmeta.fibro) <- str_replace_all(colnames(xmeta.fibro), ' ', '_') xmeta.fibro <- xmeta.fibro %>% mutate_if(is.character, as.factor) -# Fix typo in cell line passage number -xmeta.fibro$CellLinePassageNumber[xmeta.fibro$CellLine == 'ND40077'] <- 20 - # Clean names of fibro metadata xmeta.fibro <- xmeta.fibro %>% dplyr::rename(Age=Age_of_biopsy) %>% @@ -86,7 +69,7 @@ pca <- prcomp(gene_transform(x.fibro)) xpca <- cbind(pca$x, xmeta.fibro) pct.var <- round(pca$sdev ^ 2 / sum(pca$sdev ^ 2), 3) * 100 -meta.features <- c('Site', 'Sex', 'ALS', 'Age', 'Passage') +meta.features <- c('Site', 'Sex', 'Genetics', 'Age') p <- lapply(meta.features, function(f) { out1 <- ggplot(xpca, aes_string(col=f)) + @@ -105,6 +88,9 @@ p <- lapply(meta.features, function(f) { out1 <- out1 + scale_color_viridis_c() out2 <- out2 + scale_color_viridis_c() + } else if (f == "Genetics") { + out1 <- out1 + scale_color_manual(values = col.pal) + out2 <- out2 + scale_color_manual(values = col.pal) } else { out1 <- out1 + scale_color_jama() out2 <- out2 + scale_color_jama() @@ -144,7 +130,7 @@ p <- lapply(meta.features, function(f) { out1 <- out1 + scale_color_viridis_c() out2 <- out2 + scale_color_viridis_c() - } else { + } else { out1 <- out1 + scale_color_manual(values=disc.pal) out2 <- out2 + scale_color_manual(values=disc.pal) } diff --git a/paper_figures/Figure_Transcriptomics/utilities.R b/paper_figures/Figure_Transcriptomics/utilities.R index e342a56..b865bdd 100644 --- a/paper_figures/Figure_Transcriptomics/utilities.R +++ b/paper_figures/Figure_Transcriptomics/utilities.R @@ -1,145 +1,132 @@ -library(DESeq2) -library(fgsea) +library(glmnet) +library(iRF) ################################################################################ -# Differential expression and gene set enrichment analysis +# Supervised analysis ################################################################################ -deseq_design <- function(x, xmeta, design, ...) { - # Wrapper function for running deseq over targeted samples - deseq <- DESeqDataSetFromMatrix( - countData=t(round(x)), - colData=xmeta, - design=as.formula(design), - ) +fit_holdout <- function(x, xmeta, lines.predict, fit_, predict_, bootstrap=FALSE) { + # Wrapper function for supervised analysis with holdout cell lines + train.id <- !xmeta$CellLine %in% lines.predict + test.id <- xmeta$CellLine %in% lines.predict - deseq <- DESeq(deseq, ...) - return(deseq) -} - - -gsea <- function(enrichment, pathway.file, pthr=0.01, ...) { - # Compute gene set enrichment for selected pathway - - load(pathway.file) - pathways <- reactome.pathways$Genes - names(pathways) <- reactome.pathways$Name - - entrez <- clusterProfiler::bitr( - names(enrichment), - fromType="ENSEMBL", - toType="ENTREZID", - OrgDb='org.Hs.eg.db' - ) + x <- mutate(data.frame(x), Y=xmeta$Y) + xus <- caret::upSample(x[train.id,], as.factor(xmeta$ALS[train.id])) - # Join entrez ids with enrichment data - gene.set <- data.frame(Loading=unname(enrichment)) %>% - mutate(ENSEMBL=names(enrichment)) %>% - right_join(entrez, by='ENSEMBL') - - # Run gene set enrichment analysis for bag data - gene.list <- gene.set$Loading - names(gene.list) <- gene.set$ENTREZID - - out <- fgsea( - pathways=pathways, - stats=gene.list, - scoreType='pos', - eps=1e-30, - ... - ) + if (bootstrap) { + xus <- group_by(xus, Class) %>% + sample_frac(1, replace=TRUE) %>% + ungroup() + } - out <- filter(out, padj < pthr) %>% arrange(padj) - out.collapse <- collapsePathways(out, pathways, gene.list) - out.f <- filter(out, pathway %in% out.collapse$mainPathways) - return(list(pw=out.f, filtered=out.collapse, pw.full=out)) -} - - -jaccard <- function(x, y) { - # Compute jaccard distance between two sets - return(length(intersect(x, y)) / length(union(x, y))) + xi <- as.matrix(dplyr::select(xus, -Class, -Y)) + fit <- fit_(x=as.matrix(xi), y=xus$Y) + + # Get model coefficients + betas <- coef(fit, s='lambda.min') + if (sum(betas) == 0) { + betas <- coef(fit$glmnet.fit) + betas.sum <- Matrix::colSums(betas) + betas <- betas[,min(which(betas.sum != 0))] + } + + x <- dplyr::select(x, -Y) %>% as.matrix + ypred <- predict_(fit, x[test.id,]) + ypred <- mutate(xmeta[test.id,], YpredSeq=ypred[,1]) + return(list(Ypred=ypred, Coef=betas)) } -pathway_distance <- function(gene.set, pw.names=NULL) { - # Compute jaccard distance matrix over set of pathways - pairs <- combn(1:length(gene.set), 2, simplify=FALSE) - dvec <- sapply(pairs, function(p) jaccard(gene.set[[p[1]]], gene.set[[p[2]]])) - - # Initialize pathway-level distance matrix - dmat <- matrix(0, nrow=length(gene.set), ncol=length(gene.set)) - - dmat[lower.tri(dmat)] <- dvec - dmat <- dmat + t(dmat) - diag(dmat) <- 1 +fit_genes <- function(x, xmeta, xonehot, genes, fit_, predict_, + nrep=10, bootstrap=FALSE) { - if (!is.null(pw.names)) { - rownames(dmat) <- pw.names - colnames(dmat) <- pw.names - } + # Runs leave-one out prediction analysis, fitting models on selected subset of + # full gene set. + xg <- x[,intersect(genes, colnames(x))] + xg <- cbind(xg, xonehot) - return(dmat) -} + # Initialize subset of FUS/WT cell lines for binary classification + cell.lines <- filter(xmeta, Genetics != 'sporadic')$CellLine + s.cell.lines <- filter(xmeta, Genetics == 'sporadic')$CellLine -subsample_deseq <- function(x, xmeta, - designs='~Genetics + Site', - name='Genetics_Healthy_vs_FUS.ALS') { - # Wrapper function for computing deseq over subsamples samples - - id <- sample(nrow(x), 0.75 * nrow(x)) - - out <- sapply(designs, function(d) { - deseq <- deseq_design( - x[id,], - xmeta[id,], - design=d, - fitType='mean' - ) + out <- lapply(1:length(cell.lines), function(i) { + set.seed(i) + lines.predict <- c(s.cell.lines, cell.lines[i]) + + out <- replicate(nrep, { + fit_holdout(xg, xmeta, lines.predict, fit_, predict_, bootstrap) + }, simplify=FALSE) - res <- results(deseq, name=name) - return(setNames(-log10(res$pvalue), rownames(res))) + ypred <- lapply(out, function(z) z$Ypred) %>% rbindlist + coefs <- sapply(out, function(z) as.matrix(z$Coef)) + rownames(coefs) <- c('Intercept', colnames(xg)) + return(list(Ypred=ypred, Coef=coefs)) }) - return(out) + ypred <- lapply(out, function(z) z$Ypred) %>% rbindlist + coefs <- lapply(out, function(z) z$Coef) + return(list(Ypred=ypred, Coef=coefs)) } -################################################################################ -# Supervised analysis -################################################################################ -fit_holdout <- function(x, xmeta, lines.predict, fit_, predict_, importance_) { - # Wrapper function for supervised analysis with holdout cell lines - train.id <- !xmeta$CellLine %in% lines.predict - test.id <- xmeta$CellLine %in% lines.predict - y <- as.numeric(xmeta$Y) - labels <- as.numeric(xmeta$ALS) + +fit_sample_split <- function(x, y, id.train, id.test=NULL, n.iter=1, n.core=1) { + # Fits iRF and predicts RF models on sample split + if (is.null(id.test)) id.test <- setdiff(1:nrow(x), id.train) + + fit.trn <- iRF( + x=x[id.train,], + y=y[id.train], + type='ranger', + n.iter=n.iter, + n.core=n.core, + respect.unordered.factors=TRUE + ) + fit.tst <- iRF( + x=x[id.test,], + y=y[id.test], + type='ranger', + n.iter=n.iter, + n.core=n.core, + respect.unordered.factors=TRUE + ) - x <- mutate(data.frame(x), Y=xmeta$Y) - xus <- caret::upSample(x[train.id,], as.factor(labels[train.id])) - xi <- as.matrix(dplyr::select(xus, -Class, -Y)) - yi <- xus$Y - fit <- fit_(x=xi, y=yi) + ypred.train <- predict(fit.tst$rf.list, x[id.train], predict.all=TRUE) + ypred.train <- rowMeans(ypred.train$predictions) - x <- dplyr::select(x, -Y) %>% as.matrix - ypred <- predict_(fit, x[test.id,]) - ypred <- data.frame(CellLine=xmeta$CellLine[test.id]) %>% - mutate(YpredSeq=ypred[,1]) %>% - mutate(Genetics=xmeta$Genetics[test.id]) + ypred.test <- predict(fit.trn$rf.list, x[id.test], predict.all=TRUE) + ypred.test <- rowMeans(ypred.test$predictions) + ypred <- c(ypred.train, ypred.test) + id.split <- c(id.train, id.test) - return(ypred) -} - - -fit_model <- function(x, y, id.train, model, model_predict) { - # Wrapper function for fitting model and predicting on held-out samples - fit <- model(x[id.train,], y[id.train]) - ypred <- model_predict(fit, x[!id.train,]) - return(list(fit=fit, ypred=ypred, ytest=y[!id.train])) + importance.trn <- fit.trn$rf.list$variable.importance + importance.tst <- fit.tst$rf.list$variable.importance + importance <- (importance.trn + importance.tst) / 2 + + return(list(id=id.split, ypred=ypred, importance=importance)) } ################################################################################ # Wrapper functions for running supervised analysis ################################################################################ +fit_lm <- function(x, y, ...) { + # Standardized function for sparse logistic regression + return(cv.glmnet(x=x, y=y, nfolds=3, intercept=FALSE, ...)) +} + +predict_lm <- function(fit, x) { + # Standardized function for sparse logistic regression predictions + lambda.min <- fit$lambda.min + + if (which(fit$lambda == lambda.min) == 1) { + ypred <- predict(fit$glmnet.fit, x, type='response') + null.model <- colMeans(ypred == 0.5) == 1 + ypred <- ypred[,min(which(!null.model))] %>% as.matrix + } else { + ypred <- predict(fit, x, s='lambda.min', type='response') + } + return(ypred) +} + fit_lm_classification <- function(x, y, ...) { # Standardized function for sparse logistic regression return(cv.glmnet(x=x, y=y, nfolds=3, family='binomial', intercept=FALSE, ...)) @@ -160,3 +147,37 @@ predict_lm_classification <- function(fit, x) { return(ypred) } + +################################################################################ +# General utility functions +################################################################################ +reweight_importance <- function(importance, xcor) { + # Computes correlation-weighted feature importance. + importance <- importance[rownames(xcor)] + out <- abs(xcor) %*% as.matrix(importance) + return(out[,1]) +} + +get_children <- function(nodes, edges, ids, children=list()) { + # Traverse graph to select children of selected node + if (!any(ids %in% edges$from)) return(children) + + new.children <- edges$to[edges$from %in% ids] + children <- c(children, list(new.children)) + return(get_children(nodes, edges, new.children, children)) +} + +set_node_gradient <- function(x, col.pal=viridis(100), col.range=NULL) { + if (all(is.na(x))) return(rep('#707579', length(x))) + + if (is.null(col.range)) { + col.range <- c(min(x, na.rm=TRUE), max(x, na.rm=TRUE)) + } + + if (length(unique(col.range)) == 1) stop('Color range must contain unique levels') + cut.seq <- seq(col.range[1], col.range[2], length.out=length(col.pal)) + out <- rev(col.pal)[cut(x, cut.seq, include.lowest=TRUE)] + out[is.na(out)] <- '#707579' + return(out) +} + diff --git a/paper_figures/color_palette.R b/paper_figures/color_palette.R index 2f43f24..debdc5d 100644 --- a/paper_figures/color_palette.R +++ b/paper_figures/color_palette.R @@ -1,58 +1,5 @@ -library(tidyverse) -library(RColorBrewer) -library(data.table) +library(ggsci) -genetics.include <- c('C9orf72', 'FUS-ALS', 'sporadic', 'TBK1', 'Healthy') - -library.file <- 'Fibroblasts_Library.csv' -library.path <- file.path(Sys.getenv('ALS_PAPER'), 'data', library.file) - -# Load in cell line metadata -xcell <- fread(library.path) %>% - dplyr::rename(Genetics=`Disease Gene`, CellLine=`Cell line`) %>% - mutate(Genetics=ifelse(Genetics == 'WT', 'Healthy', Genetics)) %>% - mutate(Genetics=ifelse(Genetics == 'FUS', 'FUS-ALS', Genetics)) %>% - mutate(Genetics=str_replace_all(Genetics, 'ALS with no known mutation', 'sporadic')) - -# Initialize cell line/age key -age.key <- setNames(xcell$`Age of biopsy`, xcell$CellLine) - -# Initialize cell line/genetics key -xg <- dplyr::select(xcell, CellLine, Genetics) %>% - distinct() %>% - group_by(Genetics) %>% - summarize(CellLine=list(unique(CellLine))) %>% - filter(Genetics %in% genetics.include) - -# Initialize base color palette -col.pals <- list( - brewer.pal(8, 'Purples')[-(1:3)], - brewer.pal(8, 'Reds')[-(1:2)], - brewer.pal(8, 'Greys')[-(1:3)], - brewer.pal(8, 'YlOrRd')[4:5], - brewer.pal(8, 'Greens')[-(1:3)] -) - -# Initialize color palette for cell lines -col.pal.c <- lapply(1:nrow(xg), function(i) { - ncell <- length(xg$CellLine[[i]]) - cols <- colorRampPalette(col.pals[[i]])(ncell) - - cells.i <- xg$CellLine[[i]] - age.i <- age.key[cells.i] %>% sort %>% rev - - names(cols) <- names(age.i) - return(cols) -}) - -col.pal.c <- unlist(col.pal.c) - -# Initialize color palette for genetics -col.pal.g <- sapply(1:nrow(xg), function(i) { - ncell <- length(xg$CellLine[[i]]) + 1 - cols <- colorRampPalette(col.pals[[i]])(ncell) - return(cols[floor(ncell / 2)]) -}) - -names(col.pal.g) <- xg$Genetics -col.pal <- c(col.pal.c, col.pal.g) +col.pal <- pal_jama()(7) +genetics.include <- c("FUS-ALS", "sporadic", "Healthy", "ALS") +names(col.pal)[c(4, 5, 1, 3)] <- genetics.include diff --git a/paper_figures/setup.R b/paper_figures/setup.R new file mode 100644 index 0000000..e84974d --- /dev/null +++ b/paper_figures/setup.R @@ -0,0 +1,41 @@ +# Keras install +install.packages("keras") +reticulate::install_python(version = "3.10") +keras::install_keras() + +# Standard package installs +install.packages("data.table") +install.packages("tidyverse") +install.packages("Matrix") +install.packages("viridis") +install.packages("ggsci") +install.packages("caret") +install.packages("glmnet") +install.packages("PRROC") +install.packages("RColorBrewer") +install.packages("scales") +install.packages("superheat") +install.packages("gridExtra") +install.packages("hrbrthemes") +install.packages("ggpubr") +install.packages("patchwork") +install.packages("Cairo") +install.packages("kableExtra") +install.packages("clusterProfiler") +install.packages("mltools") +install.packages("ggrepel") +install.packages("parallel") +install.packages("ggh4x") +install.packages("tidytext") +install.packages("lme4") +install.packages("lmerTest") +install.packages("R.utils") + +# Bioconductor package installs +install.packages("BiocManager") +BiocManager::install("org.Hs.eg.db") +BiocManager::install("DESeq2") + +# Devtools installs +install.packages("devtools") +devtools::install_github("karlkumbier/iRF2.0") diff --git a/paper_figures/theme.R b/paper_figures/theme.R new file mode 100644 index 0000000..6dcc044 --- /dev/null +++ b/paper_figures/theme.R @@ -0,0 +1,4 @@ +theme_als <- function(){ + theme_bw() %+replace% + theme(panel.grid.minor = element_blank()) +} diff --git a/paper_figures/utilities.R b/paper_figures/utilities.R index 0a0c96a..7d453a7 100644 --- a/paper_figures/utilities.R +++ b/paper_figures/utilities.R @@ -1,126 +1,136 @@ -#' This script contains helper functions for running the predictive analyses -#' described in Kumbier et al. 2022. The core function `fit_model` can be -#' used with any supervised model by passing in functions with standardized -#' inputs/outputs for model fitting and prediction. Currently, models for -#' iterative random forests and logistic regression with L1 penalty are -#' supported. -#' -#' The functions `cell_fit` and `cell_fit_single` are used to run leave one -#' cell line out analyses described in the paper. -#' -#' Note: irf is set to use all available cores. -#' -#' Karl Kumbier 10/18/2022 +# " This script contains helper functions for running the predictive analyses. +# " The core function `fit_model` can be used with any supervised model by +# " passing in functions with standardized inputs/outputs for model fitting and +# " prediction. Currently, models for iterative random forests and logistic +# " regression with L1 penalty are supported. +# " +# " Note: irf is set to use all available cores. +# " +# " Karl Kumbier 10/4/2023 library(iRF) library(glmnet) +library(keras) library(caret) library(PRROC) -cell_fit <- function(x, model, model_predict, genetics.train='FUS-ALS', verbose=TRUE) { +cell_fit <- function( + x, model, model_predict, + genetics.train = "FUS-ALS", # nolint + verbose = TRUE, + ncell = 250) { # Wrapper function to call cell_fit_single predict over all cell lines. - - # Rank-normalize features within plate + # Normalize features within plate ecdf <- function(x) rank(x) / length(x) xfeat <- ungroup(x) %>% - dplyr::select(matches('(^X|PlateID)')) %>% + dplyr::select(matches("(^X|PlateID)")) %>% group_by(PlateID) %>% mutate_if(is.numeric, ecdf) %>% ungroup() %>% dplyr::select(-PlateID) - x <- dplyr::select(x, -matches('^X')) + x <- dplyr::select(x, -matches("^X")) x <- cbind(x, xfeat) # Initialize grid for hold-out cell lines, training genetics cell.lines <- unique(x$CellLine) - + # Drop sporadic lines from model training — evaluated in all models - sporadics <- filter(x, Genetics == 'sporadic')$CellLine - if (!'sporadic' %in% genetics.train) cell.lines <- setdiff(cell.lines, sporadics) - + sporadics <- filter(x, Genetics == "sporadic")$CellLine + if (!"sporadic" %in% genetics.train) { + cell.lines <- setdiff(cell.lines, sporadics) + } + + # Fit model fits <- lapply(cell.lines, function(cl) { if (verbose) print(cl) cell_fit_single( - x=x, - cell.test=cl, - genetics.train=genetics.train, - model=model, - model_predict=model_predict + x = x, + cell.test = cl, + genetics.train = genetics.train, + model = model, + model_predict = model_predict, + ncell = ncell ) }) - + id.drop <- sapply(fits, is.null) fits <- fits[!id.drop] - + # Initialize importance importance <- sapply(fits, function(f) f$fit$fit$importance) - ypred <- lapply(fits, function(f) f$ypred) %>% rbindlist - return(list(ypred=ypred, importance=importance)) + ypred <- lapply(fits, function(f) f$ypred) %>% rbindlist() + return(list(ypred = ypred, importance = importance)) } -cell_fit_single <- function(x, cell.test, model, model_predict, genetics.train='FUS-ALS', ncell=200) { +cell_fit_single <- function(x, cell.test, model, model_predict, + genetics.train = "FUS-ALS", ncell = 250) { # Runs leave one cell line out prediction analysis. # # x: data frame containing predictive features (named as ^X.*), response # (Genetics), CellLine, WellID, PlateID, and Markers. - # cell.test: held-out cell line to be used for model evaluation. + # cell.test: held-out cell line(s) to be evaluated. # model: standardized model fitting function. Takes arguments x, y only. # model_predict: standardized model prediction function. Takes arguments # fit (fitted model) and x only # genetics.train: ALS genetic backgrounds to be used for model training. - + # ncell: number of cells per well to use in training + # Check for valid input - print(str_c('Fitting ', cell.test, '...')) - if (nrow(x) == 0) return(NULL) - + model.line <- cell.test + print(str_c("Fitting ", cell.test, "...")) + if (nrow(x) == 0) { + return(NULL) + } + # Set training/test indices based on hold-out cell line - sporadics <- filter(x, Genetics == 'sporadic')$CellLine - cells.test <- cell.test - if (!'sporadic' %in% genetics.train) cells.test <- c(cells.test, unique(sporadics)) - cells.train <- setdiff(unique(x$CellLine), cells.test) - + sporadics <- filter(x, Genetics == "sporadic")$CellLine + if (!"sporadic" %in% genetics.train) cell.test <- c(cell.test, unique(sporadics)) + cells.train <- setdiff(unique(x$CellLine), cell.test) + # Set training/test indices based on hold-out cell line id.cell.train <- which(x$CellLine %in% cells.train) - id.genetics <- which(x$Genetics %in% c(genetics.train, 'Healthy')) - + id.genetics <- which(x$Genetics %in% c(genetics.train, "Healthy")) + id.train <- intersect(id.cell.train, id.genetics) - id.test <- which(x$CellLine %in% cells.test) - if (length(id.test) == 0) return(NULL) - + id.test <- which(x$CellLine %in% cell.test) + if (length(id.test) == 0) { + return(NULL) + } + # Set feature matrix and response vector - y <- as.numeric(x$Genetics != 'Healthy') - xx <- select(x, matches('^X')) - + y <- as.numeric(x$Genetics != "Healthy") + xx <- select(x, matches("^X")) + # Initialize cell index table for down sampling - xidx <- dplyr::select(x, ID) %>% - mutate(Y=y, Idx=1:n()) %>% + xidx <- dplyr::select(x, ID) %>% + mutate(Y = y, Idx = 1:n()) %>% filter(Idx %in% id.train) - + # Down sample for equal cell counts from each well - xcount <- group_by(xidx, ID) %>% count() + xcount <- group_by(xidx, ID) %>% dplyr::count() sample.size <- min(c(xcount$n, ncell)) xidx <- group_by(xidx, ID) %>% sample_n(sample.size) - + # Down sample for class balance xidx <- downSample(xidx, as.factor(xidx$Y)) - + # Fit model id.train <- intersect(id.train, xidx$Idx) - fit <- fit_model(xx, y, id.train, model, model_predict, test.id=id.test) - + fit <- fit_model(xx, y, id.train, model, model_predict, test.id = id.test) + # Initialize prediction table output - f.select <- c('CellLine', 'Genetics', 'WellID', 'PlateID', 'Markers') - out <- dplyr::select(x[id.test,], one_of(f.select)) %>% - mutate(Ypred=fit$ypred) %>% - mutate(Ytest=as.numeric(Genetics != 'Healthy')) %>% - mutate(ModelLine=cell.test) %>% - mutate(TestIdx=id.test) - - return(list(fit=fit, ypred=out)) + f.select <- c("CellLine", "Genetics", "WellID", "PlateID", "Markers") + out <- dplyr::select(x[id.test, ], one_of(f.select)) %>% + mutate(Ypred = fit$ypred) %>% + mutate(Ytest = as.numeric(Genetics != "Healthy")) %>% + mutate(ModelLine = model.line) %>% + mutate(TestIdx = id.test) + + return(list(fit = fit, ypred = out)) } -fit_model <- function(x, y, train.id, model, model_predict, test.id=NULL) { +fit_model <- function(x, y, train.id, model, model_predict, test.id = NULL) { # Core function for model fitting. Fits the specified model, generates # predictions, and evaluates AUROC. # @@ -130,80 +140,146 @@ fit_model <- function(x, y, train.id, model, model_predict, test.id=NULL) { # model: standardized model fitting function. Takes arguments x, y only. # model_predict: standardized model prediction function. Takes arguments # fit (fitted model) and x only - # test.id: rows of x to generate predictions for. If NULL, all non training + # test.id: rows of x to generate predictions for. If NULL, all non training # samples will be used. - + # Set test indices if not specified - if (is.null(test.id)) test.id <- setdiff(1:nrow(x), train.id) - - # Fit model - fit <- model(x=x[train.id,], y=y[train.id]) - + if (is.null(test.id)) test.id <- setdiff(seq_len(nrow(x)), train.id) + + # Fit model + fit <- model(x = x[train.id, ], y = y[train.id]) + # Fit model and generate predictions - ypred <- model_predict(fit, x[test.id,]) + ypred <- model_predict(fit, x[test.id, ]) ytest <- y[test.id] - roc <- roc.curve(scores.class0=ypred, weights.class0=ytest) - return(list(fit=fit, auc=roc$auc, y=y, ypred=ypred, id.test=test.id)) + roc <- roc.curve(scores.class0 = ypred, weights.class0 = ytest) + + return(list( + fit = fit, + auc = roc$auc, + y = y, + ypred = ypred, + id.test = test.id + )) } logistic <- function(x, y) { # Standardized function for L1 logistic regression. - fit <- cv.glmnet(x=as.matrix(x), y=y, family='binomial') - fit$importance <- coef(fit, s='lambda.min')[,1] + + print("Fitting LR") + fit <- cv.glmnet( + x = as.matrix(x), + y = y, + family = "binomial", + type.logistic = "modified.Newton", + thresh = 1e-5 + ) + print("LR fit") + + fit$importance <- coef(fit, s = "lambda.min")[-1, 1] return(fit) } + predict_logistic <- function(fit, x) { # Standardized function for logistic regression prediction. - ypred <- predict(fit, as.matrix(x), s='lambda.min', type='response') - return(ypred[,1]) + ypred <- predict(fit, as.matrix(x), s = "lambda.min", type = "response") + return(ypred[, 1]) } -irf <- function(x, y, n.core=1) { - # Standardized function for irf. +irf <- function(x, y, n.core = 1) { + # Standardized function for irf. if (all(y %in% c(0, 1))) y <- as.factor(y) - fit <- iRF(x=x, y=y, type='ranger', verbose=FALSE, n.iter=2, n.core=n.core) + + fit <- iRF( + x = x, + y = y, + type = "ranger", + verbose = FALSE, + n.iter = 2, + n.core = n.core + ) + fit$importance <- fit$rf.list$variable.importance return(fit) } predict_irf <- function(fit, x) { # Standardized function for irf prediction. - ypred <- predict(fit$rf.list, x, predict.all=TRUE) + ypred <- predict(fit$rf.list, x, predict.all = TRUE) ypred <- rowMeans(ypred$prediction) return(ypred) } +fcnn <- function(x, y) { + # Standardized function for FC network + + if (all(y %in% 0:1)) { + y <- to_categorical(y, 2) + x <- as.matrix(x) + } else { + stop("Multiclass and regression not supported") + } + + p <- c(ncol(x)) + + model <- keras_model_sequential() %>% + layer_dense(units = 16, activation = "relu", input_shape = p) %>% + layer_dropout(rate = 0.5) %>% + layer_dense(units = 16, activation = "relu") %>% + layer_dropout(rate = 0.5) %>% + layer_dense(units = 2, activation = "softmax") + + model %>% compile( + loss = "categorical_crossentropy", + optimizer = optimizer_rmsprop(), + metrics = c("accuracy") + ) + + history <- model %>% fit( + x, y, + epochs = 30, + batch_size = 128, + validation_split = 0.2 + ) + + + # Evaluate feature importance + ximportance <- diag(ncol(x)) + importance <- predict(model, ximportance)[, 2] + importance <- abs(importance - mean(importance)) + names(importance) <- colnames(x) + + fit <- list() + fit$model <- model + fit$importance <- importance + return(fit) +} + +predict_fcnn <- function(fit, x) { + return(predict(fit$model, as.matrix(x))[, 2]) +} + + +############################################################################### +# Additional utility functions used in analysis +############################################################################### label_pval <- function(x) { # Generate significance *s for p-values - pval.lab.s <- ' ns' - pval.lab.s <- ifelse(x < 0.05, '*', pval.lab.s) - pval.lab.s <- ifelse(x < 0.01, '**', pval.lab.s) - pval.lab.s <- ifelse(x < 0.001, '***', pval.lab.s) + pval.lab.s <- " ns" + pval.lab.s <- ifelse(x < 0.05, "*", pval.lab.s) + pval.lab.s <- ifelse(x < 0.01, "**", pval.lab.s) + pval.lab.s <- ifelse(x < 0.001, "***", pval.lab.s) return(pval.lab.s) } -select_cytosolic_fus <- function(x) { - # Subset feature matrix to cytosolic fus features - assumes FUS is channel 2 - - # Initialize channel / region IDs - channels <- sapply(str_split(colnames(x), '\\.\\.'), '[', 3) - - regions <- sapply(str_split(colnames(x), '\\.\\.'), '[', 2) - regions <- lapply(regions, str_split, pattern='_') +select_cytosolic <- function(x) { + regions <- sapply(str_split(colnames(x), "\\.\\."), "[", 2) + regions <- lapply(regions, str_split, pattern = "_") regions <- sapply(regions, function(z) tail(z[[1]], 1)) - - # Initialize FUS nuclear features - channels <- str_split(channels, '') - regions <- str_split(regions, '') - - nuc.fus <- mapply(function(ch, rg) { - any(ch == 2 & rg == 'D') | any(ch == 2 & rg == 'C') - }, channels, regions) - - nuc.fus[is.na(nuc.fus)] <- FALSE - - # Drop nuclear features - return(dplyr::select_if(x, !nuc.fus)) + regions[is.na(regions)] <- "X" + + # Filter to selected region / marker + return(dplyr::select_if(x, regions %in% c("X", "N", "NN"))) } diff --git a/preprocessing/make_eigenfeatures.R b/preprocessing/make_eigenfeatures.R new file mode 100644 index 0000000..b9056cc --- /dev/null +++ b/preprocessing/make_eigenfeatures.R @@ -0,0 +1,125 @@ +#' This script is used to generate eigenfeatures from raw imaging data. +#' Eigenfeatures are computed for a single screen / marker set for either +#' single cells or KS normalized well replicates. +#' +#' Requires input data +#' +#' ``: raw imaging data from a screen with both +#' `cbfeature_profiles.csv` and `metadata.csv` files. These can be generated +#' using the `rprofiler` package in awlabtools (see `GENERATE_PROFILE` shell +#' script in each screen subdirectory). +#' `Fibroblasts_Library.csv`: cell line metadata +#' `features_select_expanded.csv`: table of features to be included and their +#' associated feature groups. +#' +#' Karl Kumbier 10/20/2022 +library(tidyverse) +library(data.table) + +als.dir <- Sys.getenv("ALS_PAPER") +source(file.path(als.dir, "preprocessing/utilities.R")) +args <- R.utils::commandArgs(asValues = TRUE) + +# Variable parameters for eigenfeature generation +pct.var <- 0.95 # % variance explained by eigenfeatures within category +type <- "cell" # type pf sample, being 'cell' or 'well' +screen <- args$SCREEN #' 032422_WTFUSporadics' # screen to be analyzed +raw <- as.logical(args$RAW) # FALSE + +# Directory containing cell line and feature metadata files +library.path <- file.path(als.dir, "data", "Fibroblasts_Library.csv") +feature.path <- file.path(als.dir, "data", "features_select_expanded.csv") + +# Directories with raw imaging data, metadata, and eigenfeature output +base.dir <- "/awlab/projects/2017_01_ProjectALS/" +data.dir <- file.path(base.dir, "plates") +metadata.dir <- file.path(base.dir, "data", screen, "Cell_Filtered/") +output.dir <- file.path(base.dir, "data", screen, "Cell_Filtered/") + +################################################################################ +# Load data +################################################################################ +# Load in metadata +xmeta <- fread(str_c(metadata.dir, "metadata.csv")) %>% + mutate(Markers = str_remove_all(Markers, " ")) %>% + mutate(Markers = str_remove_all(Markers, "Hoechst_")) %>% + mutate(Markers = str_remove_all(Markers, "_Mitotracker")) + +# Load in cell line data +xcell <- fread(library.path) %>% + dplyr::rename(CellLine = `Cell line`) %>% + dplyr::rename(Genetics = `Disease Gene`) %>% + dplyr::select(CellLine, Genetics) %>% + mutate(CellLine = str_remove_all(CellLine, " ")) %>% + mutate(Genetics = ifelse(str_detect(Genetics, "mutation"), "sporadic", Genetics)) %>% + mutate(Genetics = ifelse(Genetics == "WT", "Healthy", Genetics)) %>% + mutate(Genetics = ifelse(Genetics == "FUS", "FUS-ALS", Genetics)) + +xmeta <- left_join(xmeta, xcell, by = "CellLine") + +# Clean markerset names +markers <- unique(xmeta$Markers) + +# Initialize feature grouping key +xmarker <- fread(feature.path) %>% + rename(Feature = feature_name) %>% + rename(FeatureAbbr = feature_short) %>% + rename(Channel = channel) %>% + rename(Group = group) %>% + distinct() + +# Generate key for mapping feature full names to feature short names +feature.key <- select(xmarker, Feature, FeatureAbbr) %>% distinct() + +# Drop channels for select markersets +lapply(markers, function(m) { + print(m) + xmeta.m <- dplyr::filter(xmeta, Markers == m) + plate.ids <- unique(xmeta.m$PlateID) + + xm <- lapply(plate.ids, function(p) { + data.file <- str_c("cbfeature_", type, "_profiles.csv") + data.path <- file.path(data.dir, p, "ks_profiles", data.file) + fread(data.path) + }) + + # Merge data / metadata + xm <- rbindlist(xm) %>% + mutate(ID = str_c(PlateID, "_", ID)) %>% + dplyr::select(-PlateID) %>% + left_join(xmeta.m, by = "ID") + + if (raw) { + features <- str_replace_all(xmarker$Feature, "\\*", "\\.\\*") + features <- str_c("X\\.\\.", features) %>% str_c(collapse = "|") + xf <- dplyr::select(xm, matches(str_c("(", features, ")"))) + + channels <- str_remove_all(colnames(xf), "^.*\\.\\.") %>% str_split("") + id.channel <- sapply(channels, function(ch) all(ch %in% 2:3)) + xf <- dplyr::select_if(xf, id.channel) + xf.meta <- dplyr::select(xm, -matches("^X")) + x <- cbind(xf.meta, xf) %>% mutate_if(is.numeric, median_impute) + save(file = file.path(output.dir, "profiles_raw.Rdata"), x) + } else { + # Reduce dataset based on select features + xm <- subset_features(xm, m, xmarker$Channel, xmarker$Feature, pct.var) + + # Update column names with feature abbrevaiation + col.names <- str_split(colnames(xm), "\\.\\.") + col.names <- sapply(col.names, function(z) { + if (length(z) == 1) { + return(z) + } + z.feat <- str_remove_all(z[2], "\\.") + z.feat <- feature.key$FeatureAbbr[feature.key$Feature == z.feat] + out <- str_c(z[1], "..", z.feat, "..", z[3]) + return(str_replace_all(out, ":PC", "..")) + }) + + colnames(xm) <- col.names + x <- xm + + output.path <- str_c(output.dir, "profiles_", m, ".Rdata") + save(file = output.path, x) + } +}) diff --git a/preprocessing/make_eigenfeatures_well.R b/preprocessing/make_eigenfeatures_well.R new file mode 100644 index 0000000..0b97777 --- /dev/null +++ b/preprocessing/make_eigenfeatures_well.R @@ -0,0 +1,150 @@ +library(tidyverse) +library(data.table) +library(readxl) + +setwd("~/github/projectALS/") +source("121820_SporadicScreen/utilities.R") +output.dir <- "data/profiles/121820_SporadicScreen_MergedWell_Condensed/" +dir.create(output.dir, showWarnings = FALSE) + +################################################################################ +# Data loading +################################################################################ +# Set paths to datasets used for integrated analysis +data.path.1 <- "data/profiles/110420_BiomarkerScreen/cbfeature_profiles.csv" +data.path.2 <- "data/profiles/121820_SporadicScreen/cbfeature_profiles.csv" + +meta.path.1 <- "data/profiles/110420_BiomarkerScreen/metadata.csv" +meta.path.2 <- "data/profiles/121820_SporadicScreen/metadata.csv" + +desc.path.1 <- "data/profiles/110420_BiomarkerScreen/feature_descriptors.csv" +desc.path.2 <- "data/profiles/121820_SporadicScreen/feature_descriptors.csv" +feature.path <- "data/features_select_condensed.csv" + +# Load in data from 11/20 and 12/20 screens +x1 <- fread(data.path.1) %>% + dplyr::mutate(ID = str_c(PlateID, ID, sep = "_")) %>% + dplyr::select(-PlateID) %>% + dplyr::filter(!Bootstrap) + +x2 <- fread(data.path.2) %>% + dplyr::mutate(ID = str_c(PlateID, ID, sep = "_")) %>% + dplyr::select(-PlateID) %>% + dplyr::filter(!Bootstrap) + +# Load in metadata from 11/20 and 12/20 screens +xmeta1 <- fread(meta.path.1) +x1 <- left_join(x1, xmeta1, by = "ID") +x1 <- as.data.frame(x1) %>% mutate(Screen = "1120") + +# Match marker set names +x1$Markers <- str_replace_all(x1$Markers, "_EEA1 _CD63", "_CD63 _EEA1") +x1$Markers <- str_replace_all(x1$Markers, "p-p62", "phospho-p62") + +xmeta2 <- fread(meta.path.2) +x2 <- left_join(x2, xmeta2, by = "ID") +x2 <- as.data.frame(x2) %>% mutate(Screen = "1220") + +# Merge data and metadata +x <- rbindlist(list(x1, x2), fill = TRUE) + +# Update marker set names +x <- mutate(x, Markers = str_remove_all(Markers, " ")) +markers <- unique(x$Markers) + +# Update compound names +x <- mutate(x, Compounds = str_replace_all(Compounds, "None", "Control")) %>% + mutate(Compounds = str_replace_all(Compounds, "0.1%_DMSO", "Control")) %>% + mutate(CellLine = str_remove_all(CellLine, " ")) %>% + mutate(CellLine = str_replace_all(CellLine, "308", "ALS308")) %>% + mutate(CellLine = str_replace_all(CellLine, "309", "ALS309")) + +# Load in cell line data +xcl <- fread("data/2020_Fibroblasts_Library.csv") %>% + dplyr::rename(CellLine = `Cell line`) %>% + dplyr::rename(CellLineGenetics = `Disease Gene`) %>% + dplyr::select(CellLine, CellLineGenetics) %>% + mutate(CellLine = str_remove_all(CellLine, " ")) + +x <- left_join(x, xcl, by = "CellLine") + +# Load in feature desc file +xdesc1 <- fread(desc.path.1) %>% + mutate(Full = str_c("X..", Feature, "..", Channel)) %>% + mutate(Full = str_replace_all(Full, "-", ".")) + +xdesc2 <- fread(desc.path.2) %>% + mutate(Full = str_c("X..", Feature, "..", Channel)) %>% + mutate(Full = str_replace_all(Full, "-", ".")) + +xdesc <- rbindlist(list(xdesc1, xdesc2)) %>% distinct() + +# Load in select features file +xmarker <- fread(feature.path) %>% + rename(Feature = feature_name) %>% + rename(FeatureAbbr = feature_short) %>% + rename(Channel = channel) %>% + rename(Group = group) %>% + distinct() %>% + select(-V5, -V6) + +################################################################################ +# Data preprocessing +################################################################################ +# Markers for which to generate datasets +markers <- c( + "CD63_EEA1", + "EEA1", + "CD63", + "p62_phospho-p62S403", + "p62", + "phospho-p62S403", + "LAMP1_TFEB", + "LAMP1", + "TFEB", + "FUS", + "TIAR" +) + +# Filter to WT cell lines shared across screens +x.lines <- filter(x, CellLineGenetics == "WT") %>% + group_by(Screen) %>% + summarize(CellLine = list(unique(CellLine)), .groups = "drop") + +# Normalize data by median shift over shared WT cell lines +shared.lines <- reduce(x.lines$CellLine, intersect) + +# Generate key for mapping feature full names to feature short names +feature.key <- select(xmarker, Feature, FeatureAbbr) %>% distinct() + +# Process each markerset independently and return as list +lapply(markers, function(m) { + # Drop channels for single marker evaluation + if (m %in% c("FUS", "EEA1", "phospho-p62S403", "TFEB")) { + xmarker <- filter(xmarker, Channel == 3) + } else if (m %in% c("TIAR", "p62", "CD63", "LAMP1")) { + xmarker <- filter(xmarker, Channel == 2) + } + + print(str_c("Processing markerset: ", m, "...")) + xm <- subset_features(x, m, xmarker$Channel, xmarker$Feature, 0.95) + xm <- batch_normalize(as.data.frame(xm), shared.lines) + + # Update column names with feature abbrevaiation + col.names <- str_split(colnames(xm), "\\.\\.") + col.names <- sapply(col.names, function(z) { + if (length(z) == 1) { + return(z) + } + z.feat <- str_remove_all(z[2], "\\.") + z.feat <- feature.key$FeatureAbbr[feature.key$Feature == z.feat] + out <- str_c(z[1], "..", z.feat, "..", z[3]) + return(str_replace_all(out, ":PC", "..")) + }) + + colnames(xm) <- col.names + x <- xm + + output.path <- str_c(output.dir, "cbfeature_profiles_", m, ".Rdata") + save(file = output.path, x) +}) diff --git a/preprocessing/utilities.R b/preprocessing/utilities.R new file mode 100644 index 0000000..5ac4f97 --- /dev/null +++ b/preprocessing/utilities.R @@ -0,0 +1,60 @@ +library(tidyverse) +library(data.table) + +median_impute <- function(x) { + x[is.na(x)] <- median(x, na.rm = TRUE) + return(x) +} + +subset_features <- function(x, marker, channels, features, pct.var = 0) { + # Filter dataset to selected markers/channels/features + + # Filter to selected marker set + xm <- filter(x, str_detect(Markers, marker)) %>% + select_if(function(col) !all(is.na(col))) + + # Filter to select channels + channels.select <- unlist(str_split(channels, "")) + c.drop <- str_c(setdiff(1:4, channels.select), collapse = "|") + f.drop <- str_c("X\\.\\..*\\.\\..*(", c.drop, ").*$") + id.drop <- str_detect(colnames(xm), f.drop) + xm <- select_if(xm, !id.drop) + + # Filter to select features + features <- str_replace_all(features, "\\*", "\\.\\*") + features.select <- str_c("X..", features, "..", channels) + xf <- lapply(features.select, function(f) select(xm, matches(f))) + + # Reduce dimensions of filtered data + if (pct.var != 0) { + xf <- lapply(xf, reduce_dim, pct.var = pct.var) + p <- sapply(xf, ncol) + features.select <- rep(features.select, times = p) + xf <- do.call(cbind, xf) + colnames(xf) <- str_c(features.select, ":", colnames(xf)) + } else if (pct.var == "average") { + xf <- sapply(xf, rowMeans) + colnames(xf) <- features.select + } else { + xf <- do.call(cbind, xf) + xf <- mutate_all(xf, median_impute) + } + + # Aggregatedata and metadata + meta.cols <- str_subset(colnames(xm), "^X", negate = TRUE) + xm <- select(xm, one_of(meta.cols)) + out <- cbind(xm, xf) + return(out) +} + +reduce_dim <- function(x, pct.var = 0.9) { + # Reduce dataset dimensionality with PCA + x <- mutate_all(x, median_impute) + pca <- prcomp(x) + + # Evaluate # PCs required to obtain specified level of explained variability + var.exp <- cumsum(pca$sdev^2) / sum(pca$sdev^2) + n.pc <- min(which(var.exp > pct.var)) + xout <- pca$x[, 1:n.pc, drop = FALSE] + return(xout) +}