From cc0e95b4364f0968d89f70cd7321fb967532a383 Mon Sep 17 00:00:00 2001
From: Karl Kumbier <kkumbier@berkeley.edu>
Date: Mon, 29 Apr 2024 14:46:47 -0700
Subject: [PATCH] updates for paper revision

---
 README.md                                     |  46 +-
 paper_figures/Dockerfile                      |  33 +
 paper_figures/Figure_ASO/.DS_Store            | Bin 10244 -> 8196 bytes
 paper_figures/Figure_ASO/aso_models.R         | 509 ++++++++-------
 paper_figures/Figure_ASO/raw_fus_phenotype.R  | 312 +++++-----
 paper_figures/Figure_Scores/.DS_Store         | Bin 10244 -> 10244 bytes
 paper_figures/Figure_Scores/README.md         |  33 +
 .../Figure_Scores/fig_coordinate_space.R      | 104 ++++
 .../Figure_Scores/fig_model_comparison.R      | 114 ++++
 paper_figures/Figure_Scores/fig_qc_pca.R      | 123 ++++
 paper_figures/Figure_Scores/fig_qc_plates.R   | 178 ++++++
 paper_figures/Figure_Scores/fig_scores.R      | 588 ++++++++++++++++++
 paper_figures/Figure_Scores/run_models.sh     |  16 +
 .../Figure_Scores/train_fus_models.R          |  73 +++
 paper_figures/Figure_Search/.DS_Store         | Bin 10244 -> 10244 bytes
 paper_figures/Figure_Search/README.md         |  38 ++
 .../Figure_Search/fig_marker_tuning.R         |  43 +-
 .../Figure_Search/fig_pred_by_marker.R        |  60 ++
 paper_figures/Figure_Search/run.sh            |   4 +
 .../Figure_Search/train_fus_screen.R          | 119 ++--
 .../Figure_Search/train_marker_screens.R      | 129 ++--
 .../Figure_Transcriptomics/.DS_Store          | Bin 10244 -> 10244 bytes
 .../Figure_Transcriptomics/README.md          |  48 ++
 .../Figure_Transcriptomics/fig_age_cor.R      |  90 +++
 .../Figure_Transcriptomics/fig_fibro.R        | 332 ++++++++++
 .../Figure_Transcriptomics/fig_spine.R        | 119 ++++
 .../Figure_Transcriptomics/fit_fibro.R        | 101 +++
 .../Figure_Transcriptomics/fit_spine.R        | 133 ++++
 .../Figure_Transcriptomics/load_fibroblast.R  |  99 ++-
 .../Figure_Transcriptomics/load_spine.R       |  93 ++-
 .../Figure_Transcriptomics/pca_analysis.R     |  36 +-
 .../Figure_Transcriptomics/utilities.R        | 253 ++++----
 paper_figures/color_palette.R                 |  61 +-
 paper_figures/setup.R                         |  41 ++
 paper_figures/theme.R                         |   4 +
 paper_figures/utilities.R                     | 296 +++++----
 preprocessing/make_eigenfeatures.R            | 125 ++++
 preprocessing/make_eigenfeatures_well.R       | 150 +++++
 preprocessing/utilities.R                     |  60 ++
 39 files changed, 3593 insertions(+), 970 deletions(-)
 create mode 100644 paper_figures/Dockerfile
 create mode 100644 paper_figures/Figure_Scores/README.md
 create mode 100644 paper_figures/Figure_Scores/fig_coordinate_space.R
 create mode 100644 paper_figures/Figure_Scores/fig_model_comparison.R
 create mode 100644 paper_figures/Figure_Scores/fig_qc_pca.R
 create mode 100644 paper_figures/Figure_Scores/fig_qc_plates.R
 create mode 100644 paper_figures/Figure_Scores/fig_scores.R
 create mode 100644 paper_figures/Figure_Scores/run_models.sh
 create mode 100644 paper_figures/Figure_Scores/train_fus_models.R
 create mode 100644 paper_figures/Figure_Search/README.md
 create mode 100644 paper_figures/Figure_Search/fig_pred_by_marker.R
 create mode 100644 paper_figures/Figure_Search/run.sh
 create mode 100644 paper_figures/Figure_Transcriptomics/README.md
 create mode 100644 paper_figures/Figure_Transcriptomics/fig_age_cor.R
 create mode 100644 paper_figures/Figure_Transcriptomics/fig_fibro.R
 create mode 100644 paper_figures/Figure_Transcriptomics/fig_spine.R
 create mode 100644 paper_figures/Figure_Transcriptomics/fit_fibro.R
 create mode 100644 paper_figures/Figure_Transcriptomics/fit_spine.R
 create mode 100644 paper_figures/setup.R
 create mode 100644 paper_figures/theme.R
 create mode 100644 preprocessing/make_eigenfeatures.R
 create mode 100644 preprocessing/make_eigenfeatures_well.R
 create mode 100644 preprocessing/utilities.R

diff --git a/README.md b/README.md
index 8e8ae63..369e969 100755
--- a/README.md
+++ b/README.md
@@ -1,15 +1,41 @@
+# FUS-ALS Molecular ALS Phenotype Scores
 This directory contains scripts used to process and analyze screens from the ALS
-patient-derived fibroblast study. Analyses for each screen are organized within
-individual directories corresponding to Figures 2-4 of the manuscript
-Figure_Scores=Fig. 2, Figure_ASO=Fig. 3, Figure_Transcriptomics=Fig. 4,
-Figure_Search=Extended data Figs. E1-E3. 
+patient-derived fibroblast study. We provide a Dockerfile to install all
+dependencies for running analyses reported in Kumbier et al. 2024. 
 
-Data required to run the scripts can be downloaded from Zenodo:
-https://zenodo.org/record/7247995. To run scripts, Initialize `.Renviron`
-variables:
+```
+  docker build -t als/als .
+  docker run -it --rm -v <path to project_als directory>:/ALS als/als
+```
 
-- `ALS_PAPER`=`<path_to_this_directory>`
+The core analysis scripts used for each screen / analysis include:
 
-- `ALS_DATA`=`<path_to_data_profiles_directory>` 
+## General scripts
+- `preprocessing`: directory containing scripts for processing raw image-derived
+features into the selected set of ALS-relevant eigenfeatures. Proecsses image
+features are available in the `data_profiles` directory of the zenodo
+repository. Raw image features and images are available upon request.
 
-The `data` directory on Zenodo must be placed in this directory.
+- `utilities.R`: script containg functions for fitting models along with helper
+functions for data processing and visualization.
+
+- `color_palette.R`: sets color palette used in figures.
+
+## Analysis specific scripts
+Scripts used to generate figures for the paper have been copied to
+`paper_figures` and are for the most part organized by figure.
+
+- `Figure_Search`: analyses of imaging marker set search—reported in
+  supplemental section. 
+
+- `Figure_Scores`: analyses for image-based scores (i-MAP scores).
+
+- `Figure_Transcriptomics`: analyses for transcriptomic-based scores (t-MAP
+scores).
+
+- `Figure_ASO`: analyses from ASO screen. *Note:* some of the sporadic figures
+are generated in `Figure_Transcriptomics`
+
+## Data
+Data for image-based cell profiles are contained in `data_profiles`, organized
+by screen. All other data can be found in `data`.
diff --git a/paper_figures/Dockerfile b/paper_figures/Dockerfile
new file mode 100644
index 0000000..0368929
--- /dev/null
+++ b/paper_figures/Dockerfile
@@ -0,0 +1,33 @@
+## Base docker image 
+FROM ubuntu:latest 
+ENV DEBIAN_FRONTEND noninteractive
+
+RUN apt-get update && \
+    apt-get install -y r-base \
+      libssl-dev \
+      libcurl4-gnutls-dev \
+      libxml2-dev \
+      libfontconfig1-dev \
+      libharfbuzz-dev \
+      libfribidi-dev \
+      libfreetype6-dev \
+      libpng-dev \
+      libtiff5-dev \
+      libjpeg-dev \
+      libgit2-dev \
+      git \
+      curl \
+      python3-venv
+
+RUN apt install -y cmake
+## TODO: install python for keras
+
+## Initialize Renviron varaibles
+RUN touch ~/.Renviron
+RUN echo "ALS_PAPER=/ALS/" >> ~/.Renviron
+RUN echo "ALS_DATA=/ALS/data_profiles/" >> ~/.Renviron
+
+## run the script
+COPY ./setup.R /setup.R
+RUN Rscript /setup.R
+
diff --git a/paper_figures/Figure_ASO/.DS_Store b/paper_figures/Figure_ASO/.DS_Store
index c522d7d9a6851abcb578234850ce8249ced3c78a..f7a9fbe724b041077adc29609628115322f4b282 100644
GIT binary patch
literal 8196
zcmeHMYitx%6h3EK<j%CEm)EvM7FG)Ku#}cZE05sz1r*dG+tN}V>+a69oiLqQcV@RB
zsi`s1hy;B6FeZi{XhMt$MvM{v5;4X<L~MK{8byDZm`F65_zTaSI}0r>{xOOPbCY||
zz2|Z7+;hJ(XYVWkU~67)1gHi8MpaN=M$I%u$VHt}lz=1bkn915!2ul%u)v+79Wp`>
zgdPYz5PBf=K<I)0g9j*Qi^6V`_oXy!Ll1-=xGOy%-VZUVg2n?nCaM4GpvJ!f5G^IP
zU#L&_0N*AS(0D+{B=t>cPI-I4z!ZZk21;|1$Ami3ctFP_mF9rb957fJgBuF^tCL;K
zR|kws8n&SaLJv&!fXMyjkfQs#1IbDE_j=gVX*rp+<)leRgw;D#>#zHTdH^Lb045Ah
z*23uYTImrt8_abfowZEYYiRfam6FmK_snE7Ss6Q+7;;7uUdqe6t*P98G3s_4D_!iT
z(tAx~B+2KsIkuNFZ6oI!n7TnK2YO9gcSc&Bysf*wZ9)o|%G4yUK6b3Jxh}STUE^q7
z>{#QvwM})gjSbDCqbi$UyJqvA<Y8;rc8>eI0t{X;P$9=R+83x7?AaP@Ok|wltS|fJ
z)!h>$ZlO)?Q@K{$=U2sjs#;kj(cPmZ`J4i6rYq~2j_vL>axT#%uXJhCof*g8oil{I
zH{)3OVcSje*_o7;$y+JUXtAt>`K00YdBzb>^!i;V=e7BYeJ0roUXt^?YwR(|z$_T;
zWae^I&y~+#u()=`>b1>VS~_+<I$klWlFy!_Y5n9X+cO7DBiA!x>fX@4l$$YZ-LwaL
z3#MzPEu-5`4IBKds*dwnl@*byNDbq)tS_H7pU9iuQJ<?Vj4V?5`|N;D>pkM=VnSW&
zFN`c<Y`@^Q<qT>iTU!=cuJRP?cIlDj6sxp4mAh*9VLDD}x<+ecTvvOnRNAm4YrWRQ
z_%5}#ApItp8@0`BHrA<qbS%TP9aplqM&gWr!VZbGA5OVmSJtp^K2<wo`-G%L*NG|o
zv2A-K((ewXaz@t(L5yEv%;)bSzJEX@5%YBDrsw1WVV5uDrd==zzjTxm#*DGr_zDT1
zNlvMaH%P8ZS+54_DaY-A1N7Z23rFDuoP<+w8qUIbxBwU7Q@9MD!4>!%zJYJyJNO=c
zf?we`_#OVh5-i6GoP`Up7MJ4vxE$+n12*A<xDmJFcHDtGaTh*@{kR_w;vpPB2eas+
zhokr`K8MfaaeNKW;2ZcRp2H9DB7TII@MFA;SMdk@5r4+(3Mi;3N|{ov+$SGeDaJ>y
z92Z+<3H6Hc;TD2uSB&YRAlk8Q`yGmcugfLw=gynINCd-%O%L6`j6_)xSQ7#3V|3CX
z`WW@&;#P1)s;X-i+^a=<2*k#b+l3ULDd5V+h4$i@7UeSt=5n}n84d3th|A&1RnaI{
zD6-_Reoa*4B@{QRZ>z3~YC*TDS<`qa#g95(Q&k<&wrE>b3R(`nV!VZ>5y9{eV0i}K
zhIipZxJ0o0k^p%XuEC#h9cN%AR$&b;#3(Kz7_P(z2#TwHC~m^dxCJ{1j-5UnyKyh}
z;XVRo26Z&hCP?N2P(FoY_%uF)C-5Y`h%e#G_zJ#`Z{hh{px0Rhouv5QP3R3f?d?IL
zVPmcH4xi$eS|&>MzmaB#CcM_KiJVsibRm(mEyo@tj)4DM$WhW9p_DlQc}kv13f&<J
zV4KpYM?svW{?r62p5yd{1WLm@SxV&M)#5im$L+uW-^oV|*A6`pdf;F70G75V+FNOn
zVq(8_J%t!Cs*h6@Md*!5>YGp_R0Q9@juXA{havT2fh-=-F-b{6<v;%r5a$0d|NB?k
LX?`8tMfv|Px_*yi

literal 10244
zcmeHMTWl0n7(V~B(AjA#Q(EX{3R!66l9t*FlokZH?G|W?max51&}DaL+K$Z5tUI$?
zu%Pu#0r8gjU<?nw@BjfZ7$Nb2AR3}UQ5!E29*9w%B)s^559)vB%(mO5-BhB;!%Q;g
zpL6-o<vZVh{{7DqLZB@n*AwC;gz)%MDHdQd!$a%od(Xo;C6y=v^$Cd*gUCc78VTOh
z+AVMqa1n43a1n43a1nSI5I~<zA3mQ+H+2zk5pWThM1Y<TUVN$aIdYsy`{=;L4gpZ+
zVzK?;H^u?Bj@OYsM~*XTt74xldH_cioDc(;I@Wt>bCf<ujx)*B2{3g6&dA_IC}59{
zdS1IZ0UwiY>LTDGa4!Nhc3(_F#Kg!wI4yoxF&D$qII*x^ClQS6Yst<|&4`3GBaC{~
z)LQYa{mXvvi;#jD`Z8zGJCmW8LxzY-hNt%qOJ;5r9yMdm94>@onrd2gb>9J(lRIyI
z9-qhO^G^pxjM0D<vJz%nD88HKdJIDgr@lksK2;eFiY4uaZiQ4`iQ5XQtYDKp{i-e-
zqise)mrYwYz$$nT?+J?KV`KGe*EQF8n;RPr)p*BR8tQ7iYa5#m9rEytsvg_8Gq_KS
z>c$a@2A)ICc|{6Yelk6ZU(TYn;rc+th{o)?8?Q#wYRqIiTWF4uTIdg{g?fa1skg6R
z2#O1nxSH;mr5d`~r^HQ^L2-eJtL}^#`i{6l>-!^ymWb+RP%Mgsv`9h=SxT#>1=Qyi
zbHGyeTl8ztG~!mf-EcrfU0N0t#e}KsR8T=pDt=TmMev+1ShQsMit4&`8#cFZ-M(X@
z@R0>#kx(oQB38Pk4yj6fSICSgx~%HMy`!pZjr1o~Qw?iMj~<FDo+7C6OcX9IEi12(
zB-UU`A=Pvs5mui|sMZ0ois$VymP%e>kBpOe(vayWGB19J{ov?Tl2_VIYue)qrm~~3
zQd%wan79sJ46%kaLaorV4>#E(nyjQ=Xq0-jP*~ACqRhr=5}Kv{Bm>hUcCo^ZLa{V}
zTNTA6njTj6gw!fMLx<fSGOg~IqSGl|!8%-;-z!PGXkDu;+Y>5VTv{k~N?pA84L@j(
zgyKr~D6((DDevZad*iU2_SPY*mN8iwpp~MkCsiaiZZYFGU*%iHNHq<WDqkHd^`K)d
zM+5YcZDbFcaEu%vN69hr8aYl*kPpbm<P!Oud_}I2Z^;C?L4GE`k>AOmfXoFB@}UgM
zL4rzn6zZV?8sTwhg*Na*2RsD<=z%`yhrKWa!=S>mV8R$2gcsmNcp2V+H{mUK7f!+{
zI1OjuJY0fL;4)l=FX0+|4cFlp_!Vx!U5?{&xLmH3E8~`NwcL6(aV}*!yI4Mza?SXa
zvK%w%90XiwCi|%HcWn8e?BmA*+aeYgmn>RZwQ5ZR+Q}qy7}ZkjO6ROi)M*=O6V<ku
z43bUQrXy4?N!iMj^)PGdMTRT;d1`24p3{2Savm8j&U3OWR${hGoX@fq)ipd?lgQaw
zp`xzdgEk}PIN53sGM)}92+b`Xv>h?m$yRQ{EU_~c6|HU9$rX{me`3BzPLi|a0{Ik)
z`8~N#Zjw9XF48d{ilEdcWF;(xm9Pp{Lp7{{CRh*6umP#r0b5`zbV3&rvlpp30D}+#
z85GdLfEdKVf<rbvUxLGM1dhTncpZuQ4xE7ZGibLhMK~tm>=f;yhTre_AO6Vx&%&F3
zx=j6`L&~TjXQ_>iq#yQoXy=YzG7#4_Lm$QxO8r9S5z#A0&}1$2#MDP?=p`c=UXr?+
zbRV2WGtG0JPZ`Vu^PFYqIqT3_w$wLeah!*7HTcZxIO%G$tZ<q$y?th0_SWe;+4#lP
zgXbpfoy^jI`tHt6W7>}2zK9dbHvbG36wS^8WX2Fl5Ho`fcd`F;Hq4=ibj+aS+#lG(
z9JJE0yLI#JQ=6C-%t>>~LjIX%I+jp!5hpsAnPy;`H;L&C_;J+!((%h^on}EFJ#Y&u
z9UE(2iw3m5C5r*QLQ%<LK-9o87L@@|G5+Wek<k>vP{zmz{&vvu0J88fip?4d98U`c
z>{+;pi-3#3!-+r+o0gZ}|98y(|Np}|d3RPW0xkmo3IdSp5BS@#t5n3D*(<$k_u{)3
zU;5B?<4oErn9wG)+`f*-484xG?<KIzxrwmreNMY^CT(@pv(-_u+x-9h&wvMTKjy~&
W_kA(t#{bkTvc#Yn5#N2q|9=BWs`vo_

diff --git a/paper_figures/Figure_ASO/aso_models.R b/paper_figures/Figure_ASO/aso_models.R
index 7b0eecb..6b313f9 100644
--- a/paper_figures/Figure_ASO/aso_models.R
+++ b/paper_figures/Figure_ASO/aso_models.R
@@ -1,171 +1,153 @@
-#' # Figure 5c-d
-#' The following script fits and evaluates FUS vs. WT classifiers on ASO-treated
-#' cell lines. Models can be fit for the full feature set or cytosolic features
-#' only by setting the variable `cytosolic = FALSE` or `cytosolic = TRUE`
-#' respectively. Models can be fit using EEA1/FUS features or only FUS features 
-#' by setting the variable `fus.only = FALSE` or `fus.only = TRUE` respectively.
-#' 
-#' Models are trained using data from EP cells and evaluated on EP cells treated
-#' with NTC, low ASO and high ASO.
+# " # ASO analysis
+# " The following script fits and evaluates FUS vs. WT classifiers on ASO-treated
+# " cell lines. Models can be fit for the full feature set or cytosolic features
+# " only by setting the variable `cytosolic  =  FALSE` or `cytosolic  =  TRUE`
+# " respectively. Models can be fit using EEA1/FUS features or only FUS features
+# " by setting the variable `fus.only  =  FALSE` or `fus.only  =  TRUE` respectively.
+# "
+# " Models are trained using data from EP cells and evaluated on EP cells treated
+# " with NTC, low ASO and high ASO.
 library(data.table)
-library(readxl)
 library(tidyverse)
 library(tidytext)
 library(ggsci)
 library(ggpubr)
-library(hrbrthemes)
 library(caret)
 library(gridExtra)
-library(see)
-
-theme_set(
-  theme_ipsum(
-    plot_title_size=28,
-    axis_title_size=24,
-    strip_text_size=24, 
-    axis_text_size=22,
-    base_size=22,
-    base_family='sans',
-    axis=TRUE,
-    axis_col='#000000',
-    grid=FALSE
-  )
-)
+library(lme4)
+library(lmerTest)
 
 # Paths to data directories
-fig.base.dir <- Sys.getenv('ALS_PAPER')
-data.base.dir <- Sys.getenv('ALS_DATA')
-figure.dir <- file.path(fig.base.dir, 'paper_figures', 'Figure_ASO')
-platemap.dir <- file.path(fig.base.dir, 'data', 'platemaps')
-dir.create(platemap.dir)
+fig.base.dir <- Sys.getenv("ALS_PAPER")
+data.base.dir <- Sys.getenv("ALS_DATA")
+figure.dir <- file.path(fig.base.dir, "paper_figures", "Figure_ASO")
+meta.dir <- file.path(fig.base.dir, "data")
 
-source(file.path(fig.base.dir, 'paper_figures', 'utilities.R'))
-source(file.path(fig.base.dir, 'paper_figures', 'color_palette.R'))
+source(file.path(fig.base.dir, "paper_figures", "utilities.R"))
+source(file.path(fig.base.dir, "paper_figures", "color_palette.R"))
+source(file.path(fig.base.dir, "paper_figures", "theme.R"))
+theme_set(theme_als())
 
 # Set paths to data directories
-data.dir <- file.path(data.base.dir, '080122_ASOScreenV2', 'Cell_Filtered')
-library.file <- file.path(fig.base.dir, 'data', 'Fibroblasts_Library.csv')
-plate.file <- file.path(fig.base.dir, 'data', 'plate_list.txt')
+data.dir <- file.path(data.base.dir, "120423_ASO14", "Cell_Filtered")
 
 # Set parameters for analysis
 n.core <- 16
-compound <- 'DMSO'
-marker <- 'FUS_EEA1'
-cytosolic <- FALSE
-fus.only <- FALSE
+marker <- "FUS_EEA1" # marker set used in screen
+marker.select <- "FUS_EEA1" # marker(s) used in model
+region <- "cytosolic"
+drop.sporadic <- FALSE # if TRUE, plots FUS/healthy, else plots sporadic
 save.fig <- TRUE
-save.plates <- TRUE
-train.condition <- 'H2O'
-treatment.levels <- c('untreated', 'H2O', 'NTC 10uM', 'ASO 1uM', 'ASO 10uM')
+correction <- "bonferroni"
+train.condition <- "untreated"
+treatments <- c("untreated", "NTC 10uM", "ASO 1uM", "ASO 10uM")
+
+if (drop.sporadic) {
+  genetics.keep <- c("Healthy", "FUS-ALS")
+} else {
+  genetics.keep <- "sporadic"
+}
 
 # Set paths to data directories
-output.fig.dir <- file.path(figure.dir, 'fig')
-if (cytosolic) output.fig.dir <- str_c(output.fig.dir, '_cytosolic')
-if (fus.only) output.fig.dir <- str_c(output.fig.dir, '_fus')
-dir.create(output.fig.dir, recursive=TRUE, showWarnings=FALSE)
-
-# Load mutation table
-xcell <- fread(library.file) %>% 
-    dplyr::rename(CellLine=`Cell line`) %>%
-    mutate(Sex=toupper(Sex)) %>%
-    dplyr::rename(Genetics=`Disease Gene`) %>%
-    mutate(Genetics=ifelse(Genetics == 'WT', 'Healthy', Genetics)) %>%
-    mutate(Genetics=ifelse(Genetics == 'FUS', 'FUS-ALS', Genetics)) %>%
-    mutate(Genetics=str_replace_all(Genetics, 'ALS with no known mutation', 'sporadic'))
+params <- str_c(region, "_", marker, ":", marker.select)
+output.fig.dir <- file.path(figure.dir, "fig")
+dir.create(output.fig.dir, recursive = TRUE, showWarnings = FALSE)
+
+load(file.path(data.dir, str_c("profiles_", marker, ".Rdata")))
 
 # Initialize models to be used for supervised learning
-model <- function(x, y) irf(x, y, n.core=n.core)
+model <- function(x, y) irf(x, y, n.core = n.core)
 model_predict <- predict_irf
-importance_norm <- function(x) as.matrix(x / max(x))
 
-#' ## Loading data
-#' Our dataset consists of phenotypic profiles derived from `r marker`. Each 
-#' observation in the dataset corresponds to a single `r type`. We subset to
-#' features as indicated by `cytosolic` and `fus.only` variables.
+# " ## Loading data
+# " Our dataset consists of phenotypic profiles. Each
+# " observation in the dataset corresponds to a single cell.
 #+load_data
 # Load in dataset for select marker set
-load(file.path(data.dir, str_c('profiles_', marker, '.Rdata')))
-
-x <- mutate(x, PlateID=str_remove_all(PlateID, '.*plate_')) %>%
-  mutate(ID=str_c(ID, 1:n())) %>%
-  dplyr::rename(ASO=ASOtreatment) %>%
-  mutate(ASO=str_remove_all(ASO, 'EP_')) %>%
-  mutate(ASO=str_replace_all(ASO, 'ASO', ' ASO')) %>%
-  mutate(ASO=str_replace_all(ASO, 'NTC', 'NTC 10uM')) %>%
-  mutate(ASO=str_replace_all(ASO, 'low ASO', 'ASO 1uM')) %>%
-  mutate(ASO=str_replace_all(ASO, 'high ASO', 'ASO 10uM')) %>%
-  mutate(ASO=factor(ASO, treatment.levels)) %>%
-  dplyr::select(-Genetics) %>%
-  left_join(xcell, by='CellLine')
+x <- mutate(x, PlateID = str_remove_all(PlateID, ".*plate_")) %>%
+  mutate(ID = str_c(ID, seq_len(n()))) %>%
+  dplyr::rename(ASO = ASOtreatment) %>%
+  mutate(ASO = str_remove_all(ASO, "EP_")) %>%
+  mutate(ASO = str_replace_all(ASO, "ASO", " ASO")) %>%
+  mutate(ASO = str_replace_all(ASO, "NTC", "NTC 10uM")) %>%
+  mutate(ASO = str_replace_all(ASO, "low ASO", "ASO 1uM")) %>%
+  mutate(ASO = str_replace_all(ASO, "high ASO", "ASO 10uM")) %>%
+  mutate(ASO = factor(ASO, treatments))
 
 # Initialize gene/cell line key
 genetic.key <- dplyr::select(x, CellLine, Genetics) %>% distinct()
-mutation.key <- setNames(xcell$`Figure Names`, xcell$CellLine)
 
 # Initialize channel / region IDs
-channels <- sapply(str_split(colnames(x), '\\.\\.'), '[', 3)
+channels <- sapply(str_split(colnames(x), "\\.\\."), "[", 3)
+channels[is.na(channels)] <- 0
 
-regions <- sapply(str_split(colnames(x), '\\.\\.'), '[', 2)
-regions <- lapply(regions, str_split, pattern='_') 
+regions <- sapply(str_split(colnames(x), "\\.\\."), "[", 2)
+regions <- lapply(regions, str_split, pattern = "_")
 regions <- sapply(regions, function(z) tail(z[[1]], 1))
+regions[is.na(regions)] <- "X"
+
+# Filter to selected region / marker
+id <- rep(TRUE, ncol(x))
+if (region == "cytosolic") id <- id & regions %in% c("X", "N", "NN")
+if (marker.select == "FUS") id <- id & channels %in% c("0", "2")
+if (marker.select == "CD63") id <- id & channels %in% c("0", "2")
+if (marker.select == "EEA1") id <- id & channels %in% c("0", "3")
+x <- dplyr::select_if(x, id)
 
-# Initialize and (optionally) drop FUS nuclear features
-channels <- str_split(channels, '')
-regions <- str_split(regions, '')
+# " ## ASO models
+# " We evaluate the degree to which ASO treatment "pushes" cell lines (disease
+# " to health, health to disease) in the context of supervised models trained
+# " to discriminate between ALS/FUS. Models are trained on DMSO treated cell
+# " lines and evaluated on lines treated with (i) H2O (ii) NTC (iii) low ASO
+# " and (iv) high ASO.
+#+ leave_cell_out, fig.height = 12, fig.width = 16
+aso_fit <- function(
+    x,
+    model = irf,
+    model_predict = predict_irf) {
+  # Check for valid input
+  if (nrow(x) == 0) {
+    return(NULL)
+  }
 
-nuc.fus <- mapply(function(ch, rg) {
-  any(ch == 2 & rg == 'D') | any(ch == 2 & rg == 'C')
-}, channels, regions)
+  # Rank-normalize features within plate
+  ecdf <- function(x) rank(x) / length(x)
+  xfeat <- ungroup(x) %>%
+    dplyr::select(matches("(^X|PlateID)")) %>%
+    group_by(PlateID) %>%
+    mutate_if(is.numeric, ecdf) %>%
+    ungroup() %>%
+    dplyr::select(-PlateID)
 
-nuc.fus[is.na(nuc.fus)] <- FALSE
-if (cytosolic) x <- dplyr::select_if(x, !nuc.fus)
+  x <- dplyr::select(x, -matches("^X"))
+  x <- cbind(x, xfeat)
 
-# Initialize and (optionally) drop EEA1 features
-channels <- sapply(str_split(colnames(x), '\\.\\.'), '[', 3)
-channels[is.na(channels)] <- 0
-if (fus.only) x <- dplyr::select_if(x, !str_detect(channels, '3'))
-
-
-#' ## ASO models
-#' We evaluate the degree to which ASO treatment "pushes" cell lines (disease 
-#' to health, health to disease) in the context of supervised models trained
-#' to discriminate between ALS/FUS. Models are trained on DMSO treated cell
-#' lines and evaluated on lines treated with (i) H2O (ii) NTC (iii) low ASO
-#' and (iv) high ASO.
-#+ leave_cell_out
-aso_fit <- function(x,
-                    model=irf,
-                    model_predict=predict_irf) {
-  
-  # Check for valid input
-  if (nrow(x) == 0) return(NULL)
-  
   # Set feature matrix and reponse vector
-  y <- as.numeric(x$Genetics != 'Healthy')
-  
+  y <- as.numeric(x$Genetics != "Healthy")
+
   # Downsample for class balance
   xds <- downSample(x, as.factor(y))
-  xx <- dplyr::select(xds, matches('^X'))  
+  xx <- dplyr::select(xds, matches("^X"))
   y <- as.numeric(xds$Class) - 1
-  
+
   # Set training/test indices
   id.train <- which(xds$ASO == train.condition)
   id.test <- which(xds$ASO != train.condition)
   fit <- fit_model(xx, y, id.train, model, model_predict)
-  
+
   predicted <- data.frame(
-    Ypred=fit$ypred,
-    Ytest=y[id.test],
-    Genetics=xds$Genetics[id.test],
-    CellLine=xds$CellLine[id.test],
-    Treatment=xds$ASO[id.test],
-    WellID=xds$WellID[id.test],
-    ID=xds$ID[id.test],
-    Marker=marker,
-    Plate=xds$PlateID[id.test]
+    Ypred = fit$ypred,
+    Ytest = y[id.test],
+    Genetics = xds$Genetics[id.test],
+    CellLine = xds$CellLine[id.test],
+    Treatment = xds$ASO[id.test],
+    WellID = xds$WellID[id.test],
+    ID = xds$ID[id.test],
+    Marker = marker,
+    Plate = xds$PlateID[id.test]
   )
-  
-  return(list(fit=fit, predicted=predicted))
+
+  return(list(fit = fit, predicted = predicted))
 }
 
 
@@ -173,111 +155,198 @@ aso_fit <- function(x,
 set.seed(47)
 cell.models <- aso_fit(x, model, model_predict)
 
-#' ### Fig 3c
-#' Model prediction scores by genetics, treatment. 
-#+ raw_predictions, fig.height=8, fig.width=12
-if (cytosolic & fus.only) {
-  main <- 'FUS only,\nexcluding nuclear FUS features'
-} else if (cytosolic) {
-  main <- 'FUS & EEA1,\nexcluding nuclear FUS features'
-} else if (fus.only) {
-  main <- 'FUS only,\nall features'
-} else {
-  main <- 'FUS & EEA1,\nall features'
-}
+# " ### Predictions by genetics
+# " Model prediction scores by genetics, treatment.
+#+ raw_predictions, fig.height = 8, fig.width = 12
+main <- str_c(marker.select, ", ", region)
 
-# Comparisons of raw predictions
-xplot <- filter(cell.models$predicted, !Treatment %in% train.condition) %>%
+xplot <- filter(cell.models$predicted) %>%
   group_by(Genetics, CellLine, Treatment) %>%
-  summarize(Ypred=mean(Ypred), .groups='drop') %>%
-  mutate(ChannelS='FUS, EEA1') %>% 
-  filter(Treatment != 'untreated') %>%
-  mutate(Treatment=factor(Treatment, levels=levels(Treatment)[3:5]))
-
-comparison <- list(c("Healthy", "FUS-ALS"))
-                   
-p <- ggboxplot(xplot, x="Genetics", fill="Genetics", y="Ypred", facet.by='Treatment', 
-          outlier.shape=NA, coef=NULL) +
-  stat_boxplot(coef=NULL, aes(fill=Genetics)) +
-  geom_jitter(size=3, width=0.1) +
-  stat_compare_means(comparisons=comparison, label="p.signif", size=7, p.adjust.method='bonferroni') +
-  ylab(NULL) +
-  xlab(NULL) +
-  scale_fill_manual(values=col.pal) +
+  summarize(Ypred = mean(Ypred), .groups = "drop")
+
+fout <- str_c("pred_", params, ".pdf")
+fout <- file.path(output.fig.dir, fout)
+
+if (save.fig) pdf(file = fout, height = 8, width = 8)
+ggplot(xplot, aes(x = Treatment, y = Ypred)) +
+  geom_boxplot(aes(fill = Genetics)) +
+  scale_fill_manual(values = col.pal) +
   ylim(0:1) +
-  theme_ipsum(
-    plot_title_size=28,
-    axis_title_size=24,
-    strip_text_size=20, 
-    axis_text_size=20,
-    base_size=20,
-    base_family='sans',
-    axis=TRUE,
-    axis_col='#000000',
-    grid=FALSE
-  ) +
-  theme(legend.position='none') +
-  theme(axis.text.x=element_blank())
+  ggtitle(main)
+if (save.fig) dev.off()
 
-pdf(file.path(output.fig.dir, 'fig5c.pdf'), height=8, width=8)
-plot(p)
-dev.off()
-
-#' ### Fig 3d
-#' Model prediction scores by cell line, treatment. 
-#+ raw_predictions_cell_line, fig.height=8, fig.width=12
-# Raw predictions
-p <- filter(cell.models$predicted, !Treatment %in% train.condition) %>%
-  mutate(Treatment=factor(Treatment, levels=treatment.levels)) %>%
+#####################
+# Genetic predictions
+#####################
+# Average predictions by well and normalize relative to NTC
+xplot <- filter(cell.models$predicted, !Treatment %in% train.condition) %>%
   group_by(Genetics, CellLine, Treatment, WellID, Plate) %>%
-  summarize(Ypred=mean(Ypred), Count=n(), .groups='drop') %>%
-  filter(Treatment != 'untreated') %>%
-  mutate(CellLineL=mutation.key[CellLine]) %>%
-  ggplot(aes(x=Treatment, y=Ypred, fill=Genetics)) +
-  geom_boxplot(outlier.shape=NA, coef=NULL) +
-  geom_jitter(width=0.25) +
-  facet_wrap(~CellLineL, nrow=2) +
+  summarize(Ypred = mean(Ypred), .groups = "drop") %>%
+  filter(Treatment != "untreated") %>%
+  group_by(CellLine) %>%
+  mutate(Ypred = Ypred / mean(Ypred[Treatment == "NTC 10uM"])) %>%
+  ungroup()
+
+# Initialize plot range
+xrange <- group_by(xplot, Genetics, Treatment) %>%
+  summarize(Mean = mean(Ypred), SD = sd(Ypred)) %>%
+  mutate(MaxRange = Mean + SD) %>%
+  mutate(MinRange = Mean - SD)
+
+xrange <- c(min(xrange$MinRange), max(xrange$MaxRange))
+eps <- diff(xrange) / 10
+
+# Compute significance levels for treatment effect
+lme.test <- lapply(unique(xplot$Genetics), function(g) {
+  xg <- filter(xplot, Genetics == g)
+  mixed <- lmerTest::lmer(Ypred ~ Treatment + (1 | CellLine), data = xg)
+  out <- summary(mixed)$coefficients[3, 5]
+  out <- data.frame(Treatment = "ASO 10uM", p = out) %>% 
+    mutate(Treatment = str_remove_all(Treatment, "Treatment")) %>%
+    mutate(Genetics = g)
+  return(out)
+})
+
+test.plot <- rbindlist(lme.test) %>%
+  mutate(padj = p.adjust(p, method = correction)) %>%
+  mutate(padj = signif(padj, 3)) %>%
+  mutate(x = ifelse(Treatment == "ASO 1uM", 1, 2)) %>%
+  mutate(x = 2) %>%
+  mutate(y = max(xrange) - eps) %>%
+  mutate(y = ifelse(Treatment == "ASO 1uM", y + eps / 2, y)) %>%
+  mutate(xend = x + 1) %>%
+  mutate(yend = y) %>%
+  mutate(xtext = 2) %>%
+  mutate(ytext = y + eps / 4) %>%
+  mutate(x = 1) %>%
+  mutate(plab = label_pval(padj)) %>%
+  filter(Genetics %in% genetics.keep)
+
+p <- filter(xplot, Genetics %in% genetics.keep) %>%
+  ggbarplot(
+    x = "Treatment",
+    fill = "Genetics",
+    y = "Ypred",
+    facet.by = "Genetics",
+    merge = TRUE,
+    add = "mean_se",
+    error.plot = "upper_errorbar"
+  ) +
+  geom_text(
+    data = filter(test.plot, str_detect(Treatment, "1uM")),
+    aes(x = xtext, y = ytext, label = plab), size = 8
+  ) +
+  geom_text(
+    data = filter(test.plot, str_detect(Treatment, "10uM")),
+    aes(x = xtext, y = ytext, label = plab), size = 8
+  ) +
+  geom_segment(
+    data = test.plot,
+    aes(x = x, y = y, xend = xend, yend = yend)
+  ) +
   ylab(NULL) +
   xlab(NULL) +
-  theme(legend.position='none') +
-  theme(axis.text.x=element_text(angle=90)) +
-  scale_fill_manual(values=col.pal) +
-  scale_color_manual(values=col.pal) +
+  theme(legend.position = "none") +
+  theme(axis.text.x = element_text(angle = 90)) +
+  scale_fill_manual(values = col.pal) +
+  coord_cartesian(ylim = xrange) +
   ggtitle(main) +
+  geom_hline(yintercept = 1, col = "grey", lty = 2) +
+  ylab("i-MAP score, relative to NTC") +
+  theme(text = element_text(size = 22))
 
-pdf(file.path(output.fig.dir, 'fig5d.pdf'), height=9, width=20)
+w <- ifelse(drop.sporadic, 8, 5)
+fout <- str_c("adj_cell_line_", params, "_", drop.sporadic, ".pdf")
+fout <- file.path(output.fig.dir, fout)
+if (save.fig) pdf(file = fout, height = 8, width = w)
 plot(p)
-dev.off()
-
-save(file=file.path(output.fig.dir, 'model.Rdata'), cell.models)
-
-# Save table of plates used
-if (save.plates) {
-  write.table(
-    file=plate.file, 
-    unique(x$PlateID), 
-    append=TRUE, 
-    row.names=FALSE, 
-    col.names=FALSE, 
-    quote=FALSE
-  )
-  
-  # Save plate maps
-  plate.layout <- cell.models$predicted %>%
-    mutate(PlateID=str_remove_all(ID, '_[0-9]*-[0-9]*$')) %>%
-    dplyr::select(PlateID, CellLine, WellID) %>%
-    distinct() %>%
-    mutate(Row=as.numeric(str_remove_all(WellID, '-.*$'))) %>%
-    mutate(Col=as.numeric(str_remove_all(WellID, '^.*-')))
-  
-  for (p in unique(plate.layout$PlateID)) {
-    platemap <- matrix('', nrow=16, ncol=24)
-    pm <- filter(plate.layout, PlateID == p)
-    for (i in 1:nrow(pm)) platemap[pm$Row[i], pm$Col[i]] <- pm$CellLine[i]
-    colnames(platemap) <- 1:24
-    
-    print(table(platemap))
-    fout <- file.path(platemap.dir, str_c(p, '.csv'))
-    write.csv(file=fout, platemap)
-  }
+if (save.fig) dev.off()
+
+
+#######################
+# Cell line predictions
+#######################
+# Initialize plot range
+xrange <- group_by(xplot, CellLine, Treatment) %>%
+  summarize(Mean = mean(Ypred), SD = sd(Ypred)) %>%
+  mutate(MaxRange = Mean + SD) %>%
+  mutate(MinRange = Mean - SD)
+
+xrange <- c(min(xrange$MinRange), max(xrange$MaxRange))
+eps <- diff(xrange) / 8
+
+# Compute significance levels for treatment effect
+rs.test <- lapply(unique(xplot$CellLine), function(cl) {
+  xc <- filter(xplot, CellLine == cl)
+  xt2 <- filter(xc, Treatment == "ASO 10uM")
+  xt1 <- filter(xc, Treatment == "ASO 1uM")
+  xt0 <- filter(xc, Treatment == "NTC 10uM")
+
+  rs.20 <- wilcox.test(xt2$Ypred, xt0$Ypred, alternative = "less")
+  rs.10 <- wilcox.test(xt1$Ypred, xt0$Ypred, alternative = "less")
+
+  out <- data.frame(Treatment = c("ASO 10uM")) %>%
+    mutate(p = c(rs.20$p.value)) %>%
+    mutate(CellLine = cl)
+
+  return(out)
+})
+
+test.plot <- rbindlist(rs.test) %>%
+  mutate(padj = p.adjust(p, method = correction)) %>%
+  mutate(padj = signif(padj, 3)) %>%
+  mutate(x = ifelse(Treatment == "ASO 1uM", 1, 2)) %>%
+  mutate(y = max(xrange) - eps) %>%
+  mutate(y = ifelse(Treatment == "ASO 1uM", y + eps / 2, y)) %>%
+  mutate(xend = x + 1) %>%
+  mutate(yend = y) %>%
+  mutate(xtext = 2) %>% # x + 0.5) %>%
+  mutate(ytext = y + eps / 2) %>%
+  mutate(x = 1) %>%
+  mutate(plab = label_pval(padj))
+
+if (drop.sporadic) {
+  test.plot <- filter(test.plot, !str_detect(CellLine, "S"))
+} else {
+  test.plot <- filter(test.plot, str_detect(CellLine, "S"))
 }
+
+# Plot FUS, healthy
+p <- filter(xplot, Genetics %in% genetics.keep) %>%
+  ggbarplot(
+    x = "Treatment",
+    fill = "Genetics",
+    y = "Ypred",
+    facet.by = "CellLine",
+    merge = TRUE,
+    add = "mean_se",
+    error.plot = "upper_errorbar",
+    nrow = 1
+  ) +
+  geom_text(
+    data = filter(test.plot, str_detect(Treatment, "1uM")),
+    aes(x = xtext, y = ytext, label = plab), size = 7
+  ) +
+  geom_text(
+    data = filter(test.plot, str_detect(Treatment, "10uM")),
+    aes(x = xtext, y = ytext, label = plab), size = 7
+  ) +
+  geom_segment(
+    data = test.plot,
+    aes(x = x, y = y, xend = xend, yend = yend)
+  ) +
+  ylab(NULL) +
+  xlab(NULL) +
+  theme(legend.position = "none") +
+  theme(axis.text.x = element_text(angle = 90)) +
+  scale_fill_manual(values = col.pal) +
+  coord_cartesian(ylim = xrange) +
+  ggtitle(main) +
+  ylab("i-MAP score, relative to NTC") +
+  theme(text = element_text(size = 22))
+
+fout <- str_c("well_f-h_", params, "_", drop.sporadic, ".pdf")
+fout <- file.path(output.fig.dir, fout)
+if (save.fig) pdf(file = fout, height = 8, width = 16)
+plot(p)
+if (save.fig) dev.off()
diff --git a/paper_figures/Figure_ASO/raw_fus_phenotype.R b/paper_figures/Figure_ASO/raw_fus_phenotype.R
index 3ab142a..9b0e88d 100644
--- a/paper_figures/Figure_ASO/raw_fus_phenotype.R
+++ b/paper_figures/Figure_ASO/raw_fus_phenotype.R
@@ -1,89 +1,70 @@
-#' # Figure 5
 #' The following script generates plots for qPCR levels following ASO knockdown
-#' of FUS (Fig. S5a) and FUS intensity levels in microscopy screens (Fig. 5a).
-#' 
+#' of FUS and FUS intensity levels in microscopy screens.
+#'
 #' Requires input datasets
 #'   `data/2022_ASOscreen_v3_qPCR results.xls`
 #'   `data/080122_ASOScreenV2_Cell`
-#'   
-#' Karl Kumbier, 3/6/2023
+#'
+#' Karl Kumbier, 12/19/2023
 library(data.table)
-library(readxl)
 library(tidyverse)
 library(tidytext)
 library(ggsci)
 library(ggpubr)
-library(hrbrthemes)
-
-theme_set(
-  theme_ipsum(
-    plot_title_size=28,
-    axis_title_size=24,
-    strip_text_size=24, 
-    axis_text_size=22,
-    base_size=22,
-    base_family='sans',
-    axis=TRUE,
-    axis_col='#000000',
-    grid=FALSE
-  )
-)
+library(lmerTest)
+
+theme_set(theme_bw(base_size = 22))
 
 # Set parameters for analysis
 save.fig <- TRUE
-treatment.levels <- c('untreated', 'H2O', 'NTC 10uM', 'ASO 1uM', 'ASO 10uM')
-heat.pal <- c('#FFFFFF', pal_material('blue-grey')(10))
+treatment.levels <- c("untreated", "NTC 10uM", "ASO 1uM", "ASO 10uM")
+heat.pal <- c("#FFFFFF", pal_material("blue-grey")(10))
+correction <- "BH"
 
 # Set paths to data directories
-fig.base.dir <- Sys.getenv('ALS_PAPER')
-data.base.dir <- Sys.getenv('ALS_DATA')
-figure.dir <- file.path(fig.base.dir, 'paper_figures', 'Figure_ASO')
+fig.base.dir <- Sys.getenv("ALS_PAPER")
+data.base.dir <- Sys.getenv("ALS_DATA")
+figure.dir <- file.path(fig.base.dir, "paper_figures", "Figure_ASO")
 
-source(file.path(fig.base.dir, 'paper_figures', 'utilities.R'))
-source(file.path(fig.base.dir, 'paper_figures', 'color_palette.R'))
+source(file.path(fig.base.dir, "paper_figures", "utilities.R"))
+source(file.path(fig.base.dir, "paper_figures", "color_palette.R"))
 
 # Set paths to data directories
-data.dir <- file.path(data.base.dir, '080122_ASOScreenV2', 'Cell_Filtered')
-library.file <- file.path(fig.base.dir, 'data', 'Fibroblasts_Library.csv')
-qpcr.file <- file.path(fig.base.dir, 'data', '2022_ASOscreen_qPCR.xls')
-
-output.fig.dir <- file.path(figure.dir, 'fig/')
-dir.create(output.fig.dir, recursive=TRUE, showWarnings=FALSE)
+data.dir <- file.path(data.base.dir, "120423_ASO14", "Cell_Filtered")
+library.file <- file.path(fig.base.dir, "data", "Fibroblasts_Library.csv")
+qpcr.file <- file.path(fig.base.dir, "data", "20231007_ASO14_qPCRresults.csv")
 
+output.fig.dir <- file.path(figure.dir, "fig/")
+dir.create(output.fig.dir, recursive = TRUE, showWarnings = FALSE)
 
 # Load mutation table
 xcell <- fread(library.file) %>%
-  dplyr::rename(Figure=`Figure Names`) %>%
-  dplyr::rename(CellLine=`Cell line`)
+  dplyr::rename(Figure = `Figure Names`) %>%
+  dplyr::rename(CellLine = `Cell line`)
 
 mutation.key <- setNames(xcell$Figure, xcell$CellLine)
 
 ################################################################################
 # FUS qPCR by cell line.
 ################################################################################
-qpcr <- read_excel(qpcr.file, sheet=5)
-
-colnames(qpcr)[1] <- 'CellLine'
-colnames(qpcr) <- str_remove_all(colnames(qpcr), ' ')
-
-p <- reshape2::melt(qpcr) %>%
-  filter(variable != 'Untreated') %>%
-  mutate(variable=str_replace_all(variable, 'ASO', ' ASO')) %>%
-  mutate(variable=str_replace_all(variable, 'NTC', 'NTC 10uM')) %>%
-  mutate(variable=str_replace_all(variable, 'low ASO', 'ASO 1uM')) %>%
-  mutate(variable=str_replace_all(variable, 'high ASO', 'ASO 10uM')) %>%
-  mutate(CellLineL=mutation.key[CellLine]) %>%
-  mutate(ASO=factor(variable, levels=treatment.levels)) %>%
-  mutate(CellLineL=str_remove_all(CellLineL, '-.*$')) %>%
-  ggplot(aes(x=CellLine, y=value, fill=ASO)) +
-  geom_bar(stat='identity', position=position_dodge(), color='black') +
-  scale_fill_manual(values=heat.pal[c(3, 6, 9)]) +
+qpcr <- fread(qpcr.file)
+
+p <- qpcr %>%
+  filter(ASO != "Un") %>%
+  mutate(ASO = str_replace_all(ASO, "1uM", "ASO 1uM")) %>%
+  mutate(ASO = str_replace_all(ASO, "10uM", "ASO 10uM")) %>%
+  mutate(ASO = str_replace_all(ASO, "NTC", "NTC 10uM")) %>%
+  mutate(CellLineL = mutation.key[CellLine]) %>%
+  mutate(ASO = factor(ASO, levels = treatment.levels)) %>%
+  mutate(CellLineL = str_remove_all(CellLineL, "-.*$")) %>%
+  ggplot(aes(x = CellLine, y = `FUS (normalized to NTC)`, fill = ASO)) +
+  geom_bar(stat = "identity", position = position_dodge(), color = "black") +
+  scale_fill_manual(values = heat.pal[c(3, 6, 9)]) +
   xlab(NULL) +
-  ylab('qPCR-FUS') +
-  facet_wrap(~CellLineL, nrow=2, scales='free_x') +
-  theme(axis.text.x=element_blank())
+  facet_wrap(~CellLineL, nrow = 2, scales = "free_x") +
+  theme(axis.text.x = element_blank())
 
-pdf(file.path(output.fig.dir, 'fig_supp_qpcr.pdf'), height=8, width=16)
+pdf(file.path(output.fig.dir, "fig_supp_qpcr.pdf"), height = 8, width = 16)
 plot(p)
 dev.off()
 
@@ -91,106 +72,121 @@ dev.off()
 ################################################################################
 # Raw nuclear and cytoplasmic FUS intensities
 ################################################################################
-load(file.path(data.dir, 'profiles_raw.Rdata'))
-x <- mutate(x, ASO=str_remove_all(ASOtreatment, 'EP_')) %>%
-  mutate(ASO=str_replace_all(ASO, 'ASO', ' ASO')) %>%
-  mutate(ASO=str_replace_all(ASO, 'NTC', 'NTC 10uM')) %>%
-  mutate(ASO=str_replace_all(ASO, 'low ASO', 'ASO 1uM')) %>%
-  mutate(ASO=str_replace_all(ASO, 'high ASO', 'ASO 10uM')) %>%
-  mutate(ASO=factor(ASO, treatment.levels)) %>%
-  mutate(Genetics=str_replace_all(Genetics, 'FUS', 'FUS-ALS'))
-
-# Plot change in nuclear/cytoplasmic FUS
-fselect <- c('X..Iav_in_dna_region..2', 'X..Iav_in_non_dna_region..2')
-mselect <- c('Genetics', 'ASO')
+load(file.path(data.dir, "profiles_raw.Rdata"))
+x <- mutate(x, ASO = str_remove_all(ASOtreatment, "EP_")) %>%
+  mutate(ASO = str_replace_all(ASO, "ASO", " ASO")) %>%
+  mutate(ASO = str_replace_all(ASO, "NTC", "NTC 10uM")) %>%
+  mutate(ASO = str_replace_all(ASO, "low ASO", "ASO 1uM")) %>%
+  mutate(ASO = str_replace_all(ASO, "high ASO", "ASO 10uM")) %>%
+  mutate(ASO = factor(ASO, treatment.levels)) %>%
+  filter(Genetics != "sporadic")
+
+params.plot <- list(
+  list(
+    fselect = as.symbol("X..Iav_in_dna_region..2"),
+    main = "Nuclear FUS, average pixel intensity",
+    fout = "nuclear_fus.pdf"
+  ),
+  list(
+    fselect = as.symbol("X..Iav_in_non_dna_region..2"),
+    main = "Cytosolic FUS, average pixel intensity",
+    fout = "cytosolic_fus.pdf"
+  ),
+  list(
+    fselect = as.symbol("X..Imax_in_non_dna_region..3"),
+    main = "Cytosolic EEA1, max pixel intensity",
+    fout = "cytosolic_eea1.pdf"
+  )
+)
+
+mselect <- c("Genetics", "ASO")
 
 # Compute well-level median intensities
-xgroup <- group_by(x, ID, ASO, Genetics) %>% 
-  filter(ASO != 'H2O') %>%
-  filter(ASO != 'untreated') %>%
-  summarize_if(is.numeric, median) %>%
-  ungroup() %>%
-  dplyr::select(-ID)
-
-comparisons <- list(c("NTC 10uM", "ASO 1uM"), c("ASO 1uM", "ASO 10uM"))
-
-p1 <- dplyr::rename(xgroup, NucFUS=`X..Iav_in_dna_region..2`) %>% 
-  dplyr::select(one_of(c('NucFUS', mselect))) %>%
-  ggbarplot(x="ASO", y="NucFUS", fill="ASO", facet.by="Genetics", add="mean_sd") +
-  ylab('Intensity') +
-  scale_fill_manual(values=heat.pal[c(3, 6, 9)], labels=NULL) +
-  xlab(NULL) +
-  ylab('Nuclear FUS pixel intensity') +
-  theme_ipsum(
-    plot_title_size=28,
-    axis_title_size=24,
-    strip_text_size=20, 
-    axis_text_size=20,
-    base_size=20,
-    base_family='sans',
-    axis=TRUE,
-    axis_col='#000000',
-    grid=FALSE
-  ) +
-  theme(legend.position='none') +
-  stat_compare_means(comparisons=comparisons, label="p.signif", size=7, p.adjust.method='bonferroni') +
-  theme(axis.text.x=element_blank()) 
-
-p2 <- dplyr::rename(xgroup, CytFUS=`X..Iav_in_non_dna_region..2`) %>% 
-  dplyr::select(one_of(c('CytFUS', mselect))) %>%
-  ggbarplot(x="ASO", y="CytFUS", fill="ASO", facet.by="Genetics", add="mean_sd")+
-  ylab('Intensity') +
-  scale_fill_manual(values=heat.pal[c(3, 6, 9)], labels=NULL) +
-  xlab(NULL) +
-  ylab('Cytosolic FUS pixel intensity') +
-  theme_ipsum(
-    plot_title_size=28,
-    axis_title_size=24,
-    strip_text_size=20, 
-    axis_text_size=20,
-    base_size=20,
-    base_family='sans',
-    axis=TRUE,
-    axis_col='#000000',
-    grid=FALSE
-  ) +
-  theme(legend.position='none') +
-  stat_compare_means(comparisons=comparisons, label="p.signif", size=7, p.adjust.method='bonferroni') +
-  theme(axis.text.x=element_blank())
-
-p3 <- mutate(xgroup, ratio=(`X..Iav_in_dna_region..2` / `X..Iav_in_non_dna_region..2`)) %>% 
-  dplyr::select(one_of(c('ratio', mselect))) %>%
-  ggbarplot(x="ASO", y="ratio", fill="ASO", facet.by="Genetics",add="mean_sd")+
-  ylab('Intensity') +
-  scale_fill_manual(values=heat.pal[c(3, 6, 9)]) +
-  labs(fill=NULL) +
-  xlab(NULL) +
-  ylab('Nuclea') +
-  theme_ipsum(
-    plot_title_size=28,
-    axis_title_size=24,
-    strip_text_size=20, 
-    axis_text_size=20,
-    base_size=20,
-    base_family='sans',
-    axis=TRUE,
-    axis_col='#000000',
-    grid=FALSE
-  ) +
-  stat_compare_means(comparisons=comparisons, label="p.signif", size=7, p.adjust.method='bonferroni') +
-  ylab('Nuclear / cytosolic FUS pixel intensity ratio') +
-  theme(axis.text.x=element_blank())
-
-
-
-if (save.fig) pdf(file=str_c(output.fig.dir, 'fig5a_nuc.pdf'), height=8, width=10)
-plot(p1)
-if (save.fig) dev.off()
-
-if (save.fig) pdf(file=str_c(output.fig.dir, 'fig5a_cyt.pdf'), height=8, width=10)
-plot(p2)
-if (save.fig) dev.off()
-
-if (save.fig) pdf(file=str_c(output.fig.dir, 'fig5a_rat.pdf'), height=8, width=10)
-plot(p3)
-if (save.fig) dev.off()
+for (prm in params.plot) {
+  fselect <- prm$fselect
+  main <- prm$main
+  fout <- file.path(output.fig.dir, prm$fout)
+
+  xplot <- group_by(x, ID, ASO, Genetics, CellLine, WellID, PlateID) %>%
+    summarize_if(is.numeric, mean) %>%
+    ungroup() %>%
+    dplyr::select(-ID) %>%
+    filter(ASO != "untreated") %>%
+    dplyr::rename(Treatment = ASO)
+
+  # Initialize plot range
+  xrange <- group_by(xplot, Genetics, Treatment) %>%
+    summarize(
+      Mean = mean(!!fselect),
+      SD = sd(!!fselect)
+    ) %>%
+    mutate(MaxRange = Mean + SD) %>%
+    mutate(MinRange = Mean - SD)
+
+
+  xrange <- c(min(xrange$MinRange), max(xrange$MaxRange))
+
+  eps <- diff(xrange) / 10
+
+  # Compute significance levels for treatment effect
+  lme.test <- lapply(unique(xplot$Genetics), function(g) {
+    xg <- filter(xplot, Genetics == g)
+    f <- str_c(as.character(fselect), "~ Treatment + (1 | CellLine)")
+    mixed <- lmer(as.formula(f), data = xg)
+    out <- summary(mixed)$coefficients[3, 5]
+    out <- data.frame(Treatment = "ASO 10uM", p = out) %>%
+      mutate(Treatment = str_remove_all(Treatment, "Treatment")) %>%
+      mutate(Genetics = g)
+    return(out)
+  })
+
+  test.plot <- rbindlist(lme.test) %>%
+    mutate(padj = p.adjust(p, method = correction)) %>%
+    mutate(padj = signif(padj, 3)) %>%
+    mutate(x = ifelse(Treatment == "ASO 1uM", 1, 2)) %>%
+    mutate(x = 2) %>%
+    mutate(y = max(xrange) - eps) %>%
+    mutate(y = ifelse(Treatment == "ASO 1uM", y + eps / 2, y)) %>%
+    mutate(xend = x + 1) %>%
+    mutate(yend = y) %>%
+    mutate(xtext = 2) %>%
+    mutate(ytext = y + eps / 4) %>%
+    mutate(x = 1) %>%
+    mutate(plab = label_pval(padj))
+
+  p <- xplot %>%
+    ggbarplot(
+      x = "Treatment",
+      fill = "Genetics",
+      y = as.character(fselect),
+      facet.by = "Genetics",
+      merge = TRUE,
+      add = "mean_se",
+      error.plot = "upper_errorbar"
+    ) +
+    geom_text(
+      data = filter(test.plot, str_detect(Treatment, "1uM")),
+      aes(x = xtext, y = ytext, label = plab), size = 8,
+    ) +
+    geom_text(
+      data = filter(test.plot, str_detect(Treatment, "10uM")),
+      aes(x = xtext, y = ytext, label = plab), size = 8,
+    ) +
+    geom_segment(
+      data = test.plot,
+      aes(x = x, y = y, xend = xend, yend = yend)
+    ) +
+    ylab(main) +
+    xlab(NULL) +
+    theme(legend.position = "none") +
+    theme(axis.text.x = element_text(angle = 90)) +
+    scale_fill_manual(values = col.pal) +
+    coord_cartesian(ylim = xrange) +
+    geom_hline(yintercept = 1, col = "grey", lty = 2)  +
+    theme(text = element_text(size = 22))
+
+
+  if (save.fig) pdf(file = fout, height = 8, width = 10)
+  plot(p)
+  if (save.fig) dev.off()
+}
diff --git a/paper_figures/Figure_Scores/.DS_Store b/paper_figures/Figure_Scores/.DS_Store
index d2e5b0a81e916e733769dfe75dd04e96ac97a89e..a01f7271dc3889f2e9fb98bf20ce79252f7d94f1 100644
GIT binary patch
literal 10244
zcmeGhZEPG@apvsAyZD-{9XoL@PHs~d63CI>`71Uw^7-tfNt58zJ|}jZ<Z^erKCiO3
zd)eJPH!-P9K_Jmq1%6bez%PWL>PHajr;rLlDn#&u(ng?GD$rm20fDG!g@jb*?b|)?
zo#O&RYSZSP^mg8RGxPT5>%E;h0KimPNdU+IAW&yeY6S?109^!wjp12e)tYDpsYj{P
zSOS>W@|99fH*>3K8w)p{214|cnEjfmH-QFL(0=%0uB2<W6OaEvk}%ZNyl#DqAbzYh
ze9MN7Qd|3`wvM*W&7E7iI=4n5-5F_9E}K7Ju*|ZdjG4M=rS!5oSFki?s+7%ZMq$QL
z?{TKIOKL`HuOppQm9r~L>5C<z=~!8Cx4Y9)Rm0tHVt(FMotccZ(au|@t{*pT&CyI_
zHh`TyZ7-OMhaI(e*qArzjTKE@oG39e`-E!Q1XuCT%w{bsYZTPf#WQ*~r|J_%%5)sF
z$SB!vdDA#<scfeijmAuto{_(CtM(+1_=DYiAVCiw0Q4lMf9j3}3DbsYsG``DJnmry
z*aM?*3{Jp#xCCE^@4yxKF<gZ=;IHscY{9K~EAGL5+>ax82=B&8Oyew`!a2?#V)kQg
zI|u#~l>!YJvI*wNw1wQF?FZ)bu3>%325e}cp@ExH1K!yo)ZO<tr7jvmH#E@DK&=MY
z`5;qgFzLZ%4&9@J9_$m6;n^po+h|OAfV(DpFzLZ%4qaB_aajp+R`gN~<m|LAGjf7S
z4=!`a*#S8_p!a6<G8A;*PJFUU4w&Sy@oZ?Ip@G#JpnTuY{q#a&DsL7`j41Tf899Kl
zD`PyXZ`$T(g|A~UCyb=0(<j1|m93F#)h19U^QK-d8g@o%%V+g`S<gD^sIIe=iq`5A
zyFlt#hnhKMo0dZfB5jtDq_V9}s<f9@QO9XVyPBs-C_c4eOZU!?f9m%B;r*$DM<%YZ
z95&n$-h8s6*;-Clrx!KFS(we*dDT!TUp=KMs^Mt!nrew{7!%V*wy0ifm)bUVQNlY@
z&S@9QnsZU=5d^;1j)*Ku&j_a#dh;<;VVsJKiR_5nCJ1wEeauqn$(O^sBA*nctdO=z
zYQ9A*yjPTLF@0V$Jk~_GPmmOG+D%CLZiC?=K{_IytnfURFLs3Q5X2cWUXgyZ&DTaE
zqk?o*ILlsqG;2E(CDovJGZDsiL?W|neN<6Y#gia%C~}t|y&{~l7qXT*v1q6k-{?@}
zbAsSz!L-hHI-qEdSw&7Vq@)=Y%^}rOVoacH3xV|4ONm_-QtBIL4n|F&q_r2u;V7J=
z(!&xw2#>(y@H9LN&%w{(*YG<^g8u}6hqvHucnAK4>u>`~*oN)64ZCp%Mlps1IEedj
z43FY5oWSEajb|}~S<GPpOLzfobnp^>5x<0A#;@Q*_y~RzAI0zD6Zj-v!5`vJ@G8E7
zui`a)1AmWy#5eKJ_z!#=-wBDK_RywKM`%}QcPJ71OlTyOWIvih!@GfCme2iX=rE1E
zGKy~dWw6lKt{*VbgLfW06u#@m4Vc$>nWv+3^A?%K&AvO5RR`n#=(}LhwOwzni2>J}
zUDMZlA_da6?(N~|G`YKaPkjLTfho1H@Dr{v>YcZSqtZGG2tNJ9E}E{VVBpg|v1n8Z
zQ4I2FJP{2`O%!3G%i9`@hP~m?a5yYAQ>1as{_QjcZ!(&qO<3Fr|ECB45?+PZ;7#}o
zdGNc~j3Rd7PTYl`CVw5lQ5?quc#wSdZk(c$4)fb{<h6NJu!tsku0_84dAt{w@e6n#
z-j5I9gZMD{@1yuF{5F0MzmGq_$M8q^G(L;Z;q&+c{tSPGFXL~xKUTfeD@^%bim$JD
zY0(@X_lmW`Ti?)zFUc>B2C>R}U`q>u%i_v1+urm2%9Lg3rs3D01kk~R0@a{waOjy|
zRf(b!<8S@Tq(x{I&P>s5!B3G?)|z9pv}x)&f1J&opduXA2**sr$!c^?xeRn*6Uph5
z<W1&ciWMSlmv@RI1W^z(QrFVbP+~YH5A^k3j>${IgYlR=I2gZtSrj^ZZvX6L=Dc1s
z%&#(W=o`|H^jrA6`n;b;tK_PayGndx4?5}RwLq2*;5;mlc`Bd=4D^D3c)f21+Pbuq
z=<n~1$@>Nd-orx70@T*D&;x|56RXZX%~Ti7g8@cBTwWRdaQb6ao;o?#s{ejCrSzl;
zeyX~TP_1(Z&$PMS&%AW+!`_+ZR>6zr%9*wr8$8o$cdgUuMqX2g40(_VMJTxe#7;Pq
zJj(uvc+`K8vQ8)Kt2(&=`NO)I_ih%-$B{+r-gn*R>a~V8psf$&xEbj0i^+ZQI6G|;
zeZ7M*c{skW?n3x)j++{xY8-U$>0gl8HTa;LK0ew1;zQl8o7T~+IO|$8H7>jNWv1Pl
zJ?f0)UD0*sVgDTk&Ervh(1sa`55(l5M4xMz-aeWP#(V!)!_b#R<NjxEah<k~ugR?h
z53K$F|8+PUOAQS)H1JW;05*?Lji*Q&Ki{brjMds@>gTAl#^{zgtg_I+EyWWpE5*At
pA6oN^J&Y$w>kKY)NF!?R`yT-{_W`9Fjr+fG|JQw0+zk2uzX7uz;EDhM

delta 1831
zcmd^9UrbY19R7W`4YuJrZh=yYD_mD3Zfi<`0(FY98;VYi1e`M)3&DE9&bGJE0z*O?
zqGpp#qn>TANYs}-Y;JYq!QlR6*@I@;mY5|-T=w8Hvj<~z$-**qzZRlbsZI90b92wV
z=bYbn&hPi-eBYt=L+!^MkWciNo$xw^uiES+ZkY~53ejE)Q;d2jNm|-wDKn>4^HEcJ
znZ;~LWrB(+L^?kM#z5+(1jQK?B8|M%u_dC1_C)kO9M3yakYDfpY~w$o<<!fE^f4+#
zBYcp~Xq_TF7kUi&i4mE~4~_KIb%yk4JaYndeQwGh9o1V~7gMyXn>*+W%zc|lGf!&K
zSM<IA5fa~LNa+n<CNax{+Iwk`YmS_(tg<bc6qV)Ur49Z@kJ?-x81ty3YwCR-)!)!0
zE>!wTb8VBZkS-yRYsJV5rn_2l#R#ABtaW!dq{lkL2Q)ERGE&*C6c2^9;T`d0SdS%k
zX#)xVx;JEtAWGj>CXiCV^|+{p0<?j4a%DP=(|a^Y=V^v6()aW;{X*C2Ce72|beHbI
z1Q~WXP!1=afg7t(g9fZa3)bUhbYdrV;Z<}&LofCrj2MRT77kz(<Cws^IErKV5R*8C
zDSU*+G|uA-OyfKJfLUC{9IoRx{Ei#AgMV-r_asGfN)^%y$t%@L0VybLlG;Rbi8}}5
zJtgj<w#{u@<i<yMsL#)Xb@r(u#a3KWYA<uBtDdX%2U>!nxpe0FMo*Qjz+$~p?yOj`
z<^@?+%u2$#FkJ8AAuSGGd`Y<MD^q!i{E4p8uXKa{q<Ive5DKcW8eaIBwoTZKt=NX`
zOx<q0hS#(7^)Y#GVn31?Vgg5)!h<-BxA6{+;3VG12U!}Ya2jWD79YcL4xi#PT)+&z
z#8>zl-{3N?U{+kSyHiW9r%^n^P3^%yu4ToOBkrMM^2Hf-g6Ug)LybBqSP+WX_;e?x
z#eHXgjbn7QuEF2rQGG^jjEz|>c6aUCj_&=DXiPsMRywLu6TDZ!1I_hQ$S4P*a`WTs
z#@e<X@2FoihDY{@8l2!sF$*<`A6?5NQpK!G4a=OyAX{C=EHfrLz-F0PYL}Sjl#F&+
zNSrpK;%paLU(SV+`W~XvHgmPfmN7tXY9V(U*@$Jf^S?}@s5&UxWJPpUPt|oR%3wn4
X&>}s_@Q~KplhhLVulxT+_TGO19!|fQ

diff --git a/paper_figures/Figure_Scores/README.md b/paper_figures/Figure_Scores/README.md
new file mode 100644
index 0000000..0d35ce1
--- /dev/null
+++ b/paper_figures/Figure_Scores/README.md
@@ -0,0 +1,33 @@
+# FUS-ALS imaging scores
+Scripts in this directory train and evaluate imaging-based FUS-ALS vs. Healthy
+classifiers and generate figures summarizing model performance. Data are derived
+from:
+
+- `data_profiles/032422_WTFUSSporadics/Cell_Filtered`
+- `data_profiles/032422_WTFUSSporadics/Well`
+
+## Modeling
+Models can be fit using the `train_fus_models.R` script.
+
+### Parameters in modeling scripts
+- `cytosolic`: TRUE/FALSE, indicating whether models should be restricted to
+  cytosolic FUS features.
+
+- `model`: modeling function that takes arguments `x` (feature matrix) and `y`
+  (response vector) and returns a fitted model.
+
+- `model_predict: model prediction function that takes arguments `model` (a
+  fitted mode) and `x` (feature matrix) and returns predictions on `x`.
+
+## Figures
+Scripts to generate figures for the marker search screens require fitted models,
+generated using the script described above. `fig_scores.R` generates summary
+plots based on the FUS-ALS vs. WT classifier that appear in the main text of the
+manuscript. To generate summaries for different models (e.g., irf, logistic
+regression, neural network), change the `model.str` argument in this script.
+`fig_coordinate_space.R` generates plots of the cell lines in 2D coordinate
+space based on raw features. `fig_qc_pca.R` plots points in PCA space.
+`fig_qc_plates.R` runs models and evaluates performance on hold-out plates.
+`fig_model_comparisons.R` generates plots comparing predictions across logistic
+regression, irf, and neural networks. `run_models.sh` provides a shell wrapper
+to iterate over models and cytosolic / full cell analyses.
diff --git a/paper_figures/Figure_Scores/fig_coordinate_space.R b/paper_figures/Figure_Scores/fig_coordinate_space.R
new file mode 100644
index 0000000..49e2247
--- /dev/null
+++ b/paper_figures/Figure_Scores/fig_coordinate_space.R
@@ -0,0 +1,104 @@
+################################################################################
+#+ setup, message=FALSElibrary(tidyverse)
+################################################################################
+library(tidyverse)
+library(data.table)
+library(gridExtra)
+library(ggsci)
+library(patchwork)
+library(Cairo)
+
+theme_set(theme_bw(base_size = 22))
+
+
+fig.base.dir <- Sys.getenv("ALS_PAPER")
+data.base.dir <- Sys.getenv("ALS_DATA")
+figure.dir <- file.path(fig.base.dir, "paper_figures", "Figure_Scores")
+
+source(file.path(fig.base.dir, "paper_figures", "color_palette.R"))
+
+# Set paths to data directories
+data.dir <- file.path(data.base.dir, "032422_WTFUSSporadics", "Well")
+library.file <- file.path(fig.base.dir, "data", "Fibroblasts_Library.csv")
+
+# Fit model to select data
+output.fig.dir <- file.path(figure.dir, "fig/")
+load(file.path(data.dir, "profiles_raw.Rdata"))
+
+# Analysis parameters
+marker <- "FUS_EEA1"
+f1 <- "X..Iav_in_non_dna_region..2" # x-axis feature
+f2 <- "X..Iav_in_cell_region..3" # y-axis feature
+save.fig <- TRUE
+
+################################################################################
+# Load in raw feature and merge with cell line metadata
+################################################################################
+# Read in cell line genetics
+xgen <- fread(library.file) %>%
+  dplyr::rename(CellLine = `Cell line`, Genetics = `Disease Gene`) %>%
+  mutate(Genetics = ifelse(Genetics == "FUS", "FUS-ALS", Genetics)) %>%
+  mutate(Genetics = ifelse(Genetics == "WT", "Healthy", Genetics))
+
+x <- left_join(x, xgen, by = "CellLine")
+
+# Filter to FUS/WT cell lines
+x <- filter(x, Genetics %in% c("FUS-ALS", "Healthy"))
+
+################################################################################
+# Plot raw feature distributions
+################################################################################
+pxlim <- range(x[[f1]])
+pylim <- range(x[[f2]])
+
+# Generate raw feature plot
+g <- "FUS-ALS"
+
+# Compute centroids & standard errors
+xmean <- dplyr::select(x, one_of(f1, f2, "CellLine", "Genetics", "Mutation")) %>%
+  group_by(CellLine, Genetics, Mutation) %>%
+  summarize_if(is.numeric, mean) %>%
+  mutate(Mutation = str_remove_all(Mutation, "FUS-")) %>%
+  mutate(Mutation = str_remove_all(Mutation, "FUS "))
+
+pxlim <- c(min(x[[f1]]), max(x[[f1]]))
+pylim <- c(min(x[[f2]]), max(x[[f2]]))
+
+# Generate plots
+s1 <- ggplot(xmean, aes_string(x = f1, y = f2)) +
+  geom_point(size = 4, aes(col = Genetics)) +
+  geom_point(shape = 1, size = 4) +
+  geom_point(data = x, alpha = 0.5, aes(col = Genetics)) +
+  xlim(pxlim) +
+  ylim(pylim) +
+  xlab("Cytosolic FUS intensity (KS)") +
+  ylab("Cellular EEA1 intensity (KS)") +
+  scale_color_manual(values = col.pal) +
+  theme(legend.position = "none") +
+  theme(plot.margin = ggplot2::margin(0, 0, 0, 0))
+
+d1 <- filter(xmean, Genetics %in% c("Healthy", g)) %>%
+  ggplot(aes(fill = Genetics)) +
+  geom_boxplot(aes_string(x = f1), coef = NULL) +
+  scale_fill_manual(values = col.pal) +
+  theme_void() +
+  theme(legend.position = "none") +
+  theme(plot.title = element_text(face = "bold", size = 18)) +
+  xlim(pxlim)
+
+d2 <- filter(xmean, Genetics %in% c("Healthy", g)) %>%
+  ggplot(aes(fill = Genetics)) +
+  geom_boxplot(aes_string(x = f2), coef = NULL) +
+  scale_fill_manual(values = col.pal) +
+  theme_void() +
+  theme(legend.position = "none") +
+  coord_flip() +
+  xlim(pylim)
+
+pg <- d1 + plot_spacer() + s1 + d2 +
+  plot_layout(ncol = 2, nrow = 2, widths = c(4, 0.5), heights = c(0.5, 4))
+
+fout <- file.path(output.fig.dir, "fig_projection.pdf")
+if (save.fig) pdf(file = fout, height = 8, width = 8)
+plot(pg)
+if (save.fig) dev.off()
diff --git a/paper_figures/Figure_Scores/fig_model_comparison.R b/paper_figures/Figure_Scores/fig_model_comparison.R
new file mode 100644
index 0000000..210fef1
--- /dev/null
+++ b/paper_figures/Figure_Scores/fig_model_comparison.R
@@ -0,0 +1,114 @@
+################################################################################
+#+ setup, message=FALSE
+################################################################################
+library(tidyverse)
+library(data.table)
+library(ggsci)
+library(PRROC)
+
+col.pal <- pal_jama()(7)[c(2, 6, 7)]
+
+# Paths to data directories
+fig.base.dir <- Sys.getenv("ALS_PAPER")
+data.base.dir <- Sys.getenv("ALS_DATA")
+figure.dir <- file.path(fig.base.dir, "paper_figures", "Figure_Scores")
+output.fig.dir <- file.path(figure.dir, "fig_comparisons")
+dir.create(output.fig.dir, recursive = FALSE)
+
+source(file.path(fig.base.dir, "paper_figures", "theme.R"))
+theme_set(theme_als())
+
+save.fig <- TRUE
+models <- c("fcnn", "logistic", "irf")
+cytosolic <- FALSE
+
+# Load predictions from each model
+xpred <- lapply(models, function(m) {
+  subdir <- ifelse(cytosolic, "fig_cytosolic", "fig_full")
+  load(file.path(figure.dir, m, subdir, "model.Rdata"))
+  return(mutate(cell.models$ypred, Model = m))
+})
+
+
+################################################################################
+#' # ROC curves
+#+ roc, message=FALSE
+################################################################################
+roc <- function(ypred, ytrue) {
+  roc.curve(scores.class0 = ypred, weights.class0 = ytrue, curve = TRUE)
+}
+
+roc.cell <- rbindlist(xpred) %>%
+  filter(Genetics %in% c("FUS-ALS", "Healthy")) %>%
+  group_by(Model) %>%
+  summarize(ROC = list(roc(Ypred, Ytest))) %>%
+  mutate(Type = "Single cell")
+
+# Aggregate predictions by single well
+roc.well <- rbindlist(xpred) %>%
+  filter(Genetics %in% c("FUS-ALS", "Healthy")) %>%
+  group_by(CellLine, WellID, Ytest, PlateID, Model) %>%
+  summarize(Ypred = mean(Ypred), .groups = "drop") %>%
+  group_by(Model) %>%
+  summarize(ROC = list(roc(Ypred, Ytest))) %>%
+  mutate(Type = "Well replicate")
+
+# Aggregate predictions by cell line
+roc.line <- rbindlist(xpred) %>%
+  filter(Genetics %in% c("FUS-ALS", "Healthy")) %>%
+  group_by(CellLine, Ytest, Model) %>%
+  summarize(Ypred = mean(Ypred), .groups = "drop") %>%
+  group_by(Model) %>%
+  summarize(ROC = list(roc(Ypred, Ytest))) %>%
+  mutate(Type = "Cell line")
+
+rocs <- rbind(roc.cell, roc.well, roc.line)
+
+# Initialize type labels
+aucs <- mutate(rocs, AUC = sapply(ROC, function(z) z$auc)) %>%
+  mutate(AUC = round(AUC, 2)) %>%
+  mutate(AUC = str_c("AUROC = ", AUC)) %>%
+  mutate(X = 0.75) %>%
+  mutate(Y = ifelse(Model == "fcnn", 0.2, 0.15)) %>%
+  mutate(Y = ifelse(Model == "logistic", 0.1, Y)) %>%
+  mutate(AUC=Model)
+
+rocs <- mutate(rocs, FPR = sapply(ROC, function(z) z$curve[, 1])) %>%
+  mutate(TPR = sapply(ROC, function(z) z$curve[, 2]))
+
+xplot <- lapply(1:nrow(rocs), function(i) {
+  out <- data.frame(FPR = rocs$FPR[[i]], TPR = rocs$TPR[[i]]) %>%
+    mutate(Model = rocs$Model[i]) %>%
+    mutate(Type = rocs$Type[i])
+  return(out)
+})
+
+type.levels <- c("Single cell", "Well replicate", "Cell line")
+aucs <- mutate(aucs, Type = factor(Type, levels = type.levels))
+xplot <- rbindlist(xplot) %>% mutate(Type = factor(Type, levels = type.levels))
+
+if (save.fig) pdf(file = file.path(output.fig.dir, "fig_supp_roc_comparison.pdf"), h = 8, w = 16)
+ggplot(xplot, aes(col = Model)) +
+  geom_line(aes(x = FPR, y = TPR), size = 2) +
+  geom_abline(col = "grey", lty = 2) +
+  scale_color_manual(values = col.pal) +
+  xlab("False positive rate") +
+  ylab("True positive rate") +
+  theme(legend.position = "none") +
+  labs(color = NULL) +
+  facet_wrap(~Type) +
+  geom_text(data = aucs, aes(x = X, y = Y, label = AUC))
+if (save.fig) dev.off()
+
+
+if (save.fig) pdf(file = file.path(output.fig.dir, "fig_roc_comparison_well.pdf"), h = 8, w = 8)
+filter(xplot, Type == "Well replicate") %>%
+  ggplot(aes(col = Model)) +
+  geom_line(aes(x = FPR, y = TPR), size = 2) +
+  scale_color_manual(values = col.pal) +
+  xlab("False positive rate") +
+  ylab("True positive rate") +
+  theme(legend.position = c(0.85, 0.15)) +
+  labs(col = NULL) +
+  theme(legend.text = element_text(size = 24))
+if (save.fig) dev.off()
diff --git a/paper_figures/Figure_Scores/fig_qc_pca.R b/paper_figures/Figure_Scores/fig_qc_pca.R
new file mode 100644
index 0000000..bcd097f
--- /dev/null
+++ b/paper_figures/Figure_Scores/fig_qc_pca.R
@@ -0,0 +1,123 @@
+#+ setup, message=FALSE
+library(tidyverse)
+library(data.table)
+library(ggsci)
+library(gridExtra)
+library(viridis)
+
+theme_set(theme_bw(base_size = 22))
+size <- 16
+
+# Paths to data directories
+als.dir <- Sys.getenv("ALS_PAPER")
+figure.dir <- file.path(als.dir, "paper_figures", "Figure_Scores", "fig_qc")
+dir.create(figure.dir, showWarnings = FALSE)
+
+# Source utility functions
+source(file.path(als.dir, "paper_figures", "utilities.R"))
+source(file.path(als.dir, "paper_figures", "color_palette.R"))
+source(file.path(fig.base.dir, "paper_figures", "theme.R"))
+theme_set(theme_als())
+
+library.file <- file.path(als.dir, "data", "Fibroblasts_Library.csv")
+
+# Parameters for analysis
+
+# Initialize cell line metadata table
+xcell <- fread(library.file) %>%
+  dplyr::rename(CellLine = `Cell line`) %>%
+  mutate(Sex = toupper(Sex)) %>%
+  dplyr::rename(Genetics = `Disease Gene`) %>%
+  dplyr::rename(Figure = `Figure Names`) %>%
+  mutate(Genetics = str_replace_all(Genetics, "ALS with no known mutation", "sporadic"))
+
+data.dir <- file.path(als.dir, "data_profiles", "032422_WTFUSSporadics", "Cell_Filtered")
+load(file.path(data.dir, "profiles_raw.Rdata"))
+
+# Fix typo in F7 cell line passage number
+x <- left_join(x, xcell, by = "CellLine") %>%
+  mutate(Plate = str_remove_all(PlateID, "^.*plate_")) %>%
+  mutate(DiseaseStatus = ifelse(Genetics.x != "Healthy", "ALS", "Healthy")) %>%
+  mutate(ALS = Genetics.x != "Healthy") %>%
+  dplyr::rename(Age = `Age of biopsy`) %>%
+  dplyr::rename(Passage = CellLinePassageNumber) %>%
+  dplyr::rename(Genetics = Genetics.x) %>%
+  dplyr::rename(Site = `Origin Institution`) %>%
+  mutate(Passage = ifelse(CellLine == "F7", 20, Passage))
+
+
+################################################################################
+# PCA plots
+################################################################################
+xfeat <- dplyr::select(x, matches("^X"))
+xpca <- prcomp(apply(xfeat, MAR = 2, rank))
+pct.var <- xpca$sdev^2 / sum(xpca$sdev^2)
+
+plot_pc <- function(x, var, pal, pct.var, facet = FALSE) {
+  # Wrapper function to plot PCs 1-4
+
+  cols <- c("PlateID", "WellID", var)
+  if (facet) cols <- c(cols, "ALS")
+  pct.var <- round(pct.var, 3) * 100
+
+  p1 <- ggplot(x, aes(x = PC1, y = PC2)) +
+    geom_point(aes_string(col = var)) +
+    scale_y_continuous(labels = scales::scientific, breaks = c(-2e5, 0, 2e5)) +
+    scale_x_continuous(labels = scales::scientific, breaks = c(-4e5, 0, 4e5)) +
+    xlab(str_c("PC1, (", pct.var[1], "%)")) +
+    ylab(str_c("PC2, (", pct.var[2], "%)"))
+
+  p2 <- ggplot(x, aes(x = PC3, y = PC4)) +
+    geom_point(aes_string(col = var)) +
+    scale_y_continuous(labels = scales::scientific, breaks = c(-2e5, 0, 2e5)) +
+    scale_x_continuous(labels = scales::scientific, breaks = c(-5e5, 0, 5e5)) +
+    xlab(str_c("PC3, (", pct.var[3], "%)")) +
+    ylab(str_c("PC4, (", pct.var[4], "%)"))
+
+  if (is.numeric(x[[var]])) {
+    p1 <- p1 + scale_color_gradientn(colors = pal, limits = force)
+    p2 <- p2 + scale_color_gradientn(colors = pal, limits = force)
+  } else {
+    p1 <- p1 + scale_color_manual(values = pal, limits = force)
+    p2 <- p2 + scale_color_manual(values = pal, limits = force)
+  }
+
+  if (!facet) {
+    return(grid.arrange(p1, p2, nrow = 1))
+  }
+}
+
+# Compute well-level averages of PCA-projections
+cols <- c("PlateID", "WellID", "ALS", "Genetics", "Sex", "Age", "Passage", "Plate", "Site")
+
+xplot <- cbind(xpca$x, x) %>%
+  group_by_at(vars(one_of(cols))) %>%
+  summarize_if(is.numeric, mean)
+
+pdf(file.path(figure.dir, "pca_als.pdf"), height = size / 2, width = size)
+plot_pc(xplot, "ALS", pal_jama()(7), pct.var)
+dev.off()
+
+pdf(file.path(figure.dir, "pca_gene.pdf"), height = size / 2, width = size)
+plot_pc(xplot, "Genetics", col.pal, pct.var)
+dev.off()
+
+pdf(file.path(figure.dir, "pca_site.pdf"), height = size / 2, width = size)
+plot_pc(xplot, "Site", pal_jama()(7), pct.var)
+dev.off()
+
+pdf(file.path(figure.dir, "pca_sex.pdf"), height = size / 2, width = size)
+plot_pc(xplot, "Sex", pal_jama()(7), pct.var)
+dev.off()
+
+pdf(file.path(figure.dir, "pca_age.pdf"), height = size / 2, width = size)
+plot_pc(xplot, "Age", viridis(10), pct.var)
+dev.off()
+
+pdf(file.path(figure.dir, "pca_passage.pdf"), height = size / 2, width = size)
+plot_pc(xplot, "Passage", viridis(10), pct.var)
+dev.off()
+
+pdf(file.path(figure.dir, "pca_plate.pdf"), height = size / 2, width = size)
+plot_pc(xplot, "Plate", pal_jama()(7), pct.var)
+dev.off()
diff --git a/paper_figures/Figure_Scores/fig_qc_plates.R b/paper_figures/Figure_Scores/fig_qc_plates.R
new file mode 100644
index 0000000..c8b64e8
--- /dev/null
+++ b/paper_figures/Figure_Scores/fig_qc_plates.R
@@ -0,0 +1,178 @@
+#+ setup, message=FALSE
+library(tidyverse)
+library(data.table)
+library(ggsci)
+library(gridExtra)
+library(caret)
+
+theme_set(theme_bw(base_size = 22))
+
+# Paths to data directories
+als.dir <- Sys.getenv("ALS_PAPER")
+figure.dir <- file.path(als.dir, "paper_figures", "Figure_Scores", "fig_qc")
+dir.create(figure.dir, showWarnings = FALSE)
+
+# Source utility functions
+source(file.path(als.dir, "paper_figures", "utilities.R"))
+source(file.path(als.dir, "paper_figures", "color_palette.R"))
+library.file <- file.path(als.dir, "data", "Fibroblasts_Library.csv")
+
+# Parameters for analysis
+save.fig <- TRUE # save figures or generate in notebook
+heat.pal <- pal_material("blue-grey")(10)
+
+model <- irf
+model_predict <- predict_irf
+
+# Initialize cell line metadata table
+xcell <- fread(library.file) %>%
+  dplyr::rename(CellLine = `Cell line`) %>%
+  mutate(Sex = toupper(Sex)) %>%
+  dplyr::rename(Genetics = `Disease Gene`) %>%
+  dplyr::rename(Figure = `Figure Names`) %>%
+  mutate(Genetics = str_replace_all(Genetics, "WT", "Healthy")) %>%
+  mutate(Genetics = str_replace_all(Genetics, "FUS", "FUS-ALS"))
+
+
+#' # Plate generalization
+#+ cell_counts
+data.dir <- file.path(als.dir, "data_profiles", "032422_WTFUSSporadics", "Cell_Filtered")
+load(file.path(data.dir, "profiles_FUS_EEA1.Rdata"))
+
+plate_fit <- function(x,
+                      model = irf,
+                      model_predict = predict_irf) {
+  # Wrapper function to call plate_fit_single predict over all plates
+
+  # Initialize cell lines to evaluate
+  plates <- unique(x$PlateID)
+
+  # Rank-normalize features within plate
+  ecdf <- function(x) rank(x) / length(x)
+  xfeat <- ungroup(x) %>%
+    dplyr::select(matches("(^X|PlateID)")) %>%
+    group_by(PlateID) %>%
+    mutate_if(is.numeric, ecdf) %>%
+    ungroup() %>%
+    dplyr::select(-PlateID)
+
+  x <- dplyr::select(x, -matches("^X"))
+  x <- cbind(x, xfeat)
+
+  # Drop sporadic lines from model trai
+  fits <- lapply(plates, plate_fit_single,
+    x = x,
+    model = model,
+    model_predict = model_predict
+  )
+
+  # Initialize importance
+  return(rbindlist(fits))
+}
+
+plate_fit_single <- function(x,
+                             plate.test,
+                             model = irf,
+                             model_predict = predict_irf) {
+  # Check for valid input
+  print(plate.test)
+  if (nrow(x) == 0) {
+    return(NULL)
+  }
+
+  # Set training/test indices based on hold-out place
+  id.test <- which(x$Plate %in% plate.test)
+  id.train <- setdiff(1:nrow(x), id.test)
+
+  if (length(id.test) == 0) {
+    return(NULL)
+  }
+
+  # Set feature matrix and response vector
+  y <- as.numeric(x$Genetics != "Healthy")
+  xx <- dplyr::select(x, matches("^X"))
+
+  # Take balanced class sample
+  ytab <- data.frame(Y = as.factor(y)) %>%
+    mutate(Idx = 1:n()) %>%
+    filter(Idx %in% id.train)
+
+  id.downsample <- downSample(ytab, as.factor(ytab$Y))
+
+  # Fit model
+  id.train <- intersect(id.train, id.downsample$Idx)
+  fit <- fit_model(xx, y, id.train, model, model_predict, test.id = id.test)
+
+  # Aggregate predictions across each model
+  ypred <- dplyr::select(x[id.test, ], WellID, PlateID, CellLine, Genetics) %>%
+    mutate(Ypred = fit$ypred) %>%
+    mutate(Ytest = y[id.test])
+
+  return(ypred)
+}
+
+x <- filter(x, Genetics != "sporadic")
+plate.models <- plate_fit(x, model, model_predict)
+
+################################################################################
+# Plate generalization by cell line
+################################################################################
+# Aggregate FUS v. WT predictions by cell line
+predictions <- plate.models %>%
+  group_by(CellLine, Ytest, PlateID) %>%
+  summarize(Ypred = mean(Ypred), .groups = "drop") %>%
+  left_join(xcell, by = "CellLine")
+
+# Compute test statistic for difference in prediction scores
+p <- predictions %>%
+  filter(Genetics %in% c("FUS-ALS", "Healthy")) %>%
+  mutate(PlateID = str_remove_all(PlateID, "^.*_plate_")) %>%
+  ggplot(aes(x = reorder(Genetics, Ypred, median), y = Ypred)) +
+  geom_boxplot(aes(fill = Genetics), outlier.shape = NA, coef = NULL) +
+  geom_jitter(width = 0.2, size = 3, alpha = 0.6) +
+  ylab("i-MAP score") +
+  xlab(NULL) +
+  facet_wrap(~PlateID, ncol = 1) +
+  theme(legend.position = "none") +
+  theme(axis.text.x = element_text(angle = 90)) +
+  scale_fill_manual(values = col.pal) +
+  scale_color_manual(values = col.pal) +
+  ylim(0:1)
+
+
+if (save.fig) pdf(file = file.path(figure.dir, "plate_generalization_a.pdf"), h = 16, w = 4)
+plot(p)
+if (save.fig) dev.off()
+
+################################################################################
+# Plate generalization by well replicate
+################################################################################
+# Aggregate predictions by well replicate
+predictions <- plate.models %>%
+  group_by(CellLine, WellID, Ytest, PlateID) %>%
+  summarize(Ypred = mean(Ypred), .groups = "drop") %>%
+  left_join(xcell, by = "CellLine")
+
+predictions.agg <- filter(predictions, Genetics %in% c("FUS-ALS", "Healthy")) %>%
+  group_by(CellLine, Genetics, PlateID) %>%
+  summarize(Ypred = median(Ypred), .groups = "drop") %>%
+  arrange(Ypred)
+
+# Plot predictions, FUS v. WT
+p <- filter(predictions, Genetics %in% c("FUS-ALS", "Healthy")) %>%
+  mutate(PlateID = str_remove_all(PlateID, "^.*_plate_")) %>%
+  ggplot(aes(x = reorder(Figure, Ypred, median), y = Ypred)) +
+  geom_boxplot(aes(fill = Genetics), outlier.shape = NA, coef = NULL) +
+  geom_jitter(width = 0.2, alpha = 0.6) +
+  theme(axis.text.x = element_text(angle = 90)) +
+  theme(legend.position = "none") +
+  xlab(NULL) +
+  ylab("i-MAP score") +
+  scale_color_manual(values = col.pal) +
+  scale_fill_manual(values = col.pal) +
+  facet_wrap(~PlateID, ncol = 1) +
+  ylim(c(0, 1))
+
+if (save.fig) pdf(file = file.path(figure.dir, "plate_generalization_b.pdf"), h = 16, w = 16)
+plot(p)
+if (save.fig) dev.off()
diff --git a/paper_figures/Figure_Scores/fig_scores.R b/paper_figures/Figure_Scores/fig_scores.R
new file mode 100644
index 0000000..21f30b1
--- /dev/null
+++ b/paper_figures/Figure_Scores/fig_scores.R
@@ -0,0 +1,588 @@
+################################################################################
+#+ setup, message=FALSE
+################################################################################
+library(tidyverse)
+library(data.table)
+library(tidytext)
+library(scales)
+library(ggsci)
+library(ggrepel)
+library(PRROC)
+
+# Set user args for model choice
+args <- R.utils::commandArgs(asValues = TRUE)
+cytosolic <- as.logical(args$CYTOSOLIC) # FALSE
+model.str <- args$MODEL #' irf'
+
+if (!length(cytosolic)) cytosolic <- FALSE
+if (!length(model.str)) model.str <- "irf"
+
+# Paths to data directories
+fig.base.dir <- Sys.getenv("ALS_PAPER")
+data.base.dir <- Sys.getenv("ALS_DATA")
+figure.dir <- file.path(fig.base.dir, "paper_figures", "Figure_Scores")
+platemap.dir <- file.path(fig.base.dir, "data", "platemaps")
+dir.create(platemap.dir)
+
+source(file.path(fig.base.dir, "paper_figures", "utilities.R"))
+source(file.path(fig.base.dir, "paper_figures", "color_palette.R"))
+source(file.path(fig.base.dir, "paper_figures", "theme.R"))
+theme_set(theme_als())
+
+# Set paths to data directories
+data.dir <- file.path(data.base.dir, "032422_WTFUSSporadics", "Cell_Filtered")
+library.file <- file.path(fig.base.dir, "data", "Fibroblasts_Library.csv")
+plate.file <- file.path(fig.base.dir, "data", "plate_list.txt")
+
+marker <- "FUS_EEA1"
+save.fig <- TRUE
+save.plate <- FALSE
+
+importance_norm <- function(x) x / max(x)
+ext <- ifelse(cytosolic, "_cytosolic", "_full")
+output.fig.dir <- file.path(figure.dir, model.str, str_c("fig", ext))
+dir.create(output.fig.dir, showWarnings = FALSE)
+
+################################################################################
+#' # Load data
+#+ parameters, message=FALSE
+################################################################################
+# Load in datasets for selected marker
+input.file <- str_c("profiles_", marker, ".Rdata")
+load(file.path(data.dir, input.file))
+
+# Initialize cell line metadata table
+xcell <- fread(library.file) %>% dplyr::rename(CellLine = `Cell line`)
+
+xcell <- dplyr::select(x, CellLine, CellLinePassageNumber, Genetics) %>%
+  distinct() %>%
+  left_join(xcell, by = "CellLine") %>%
+  mutate(Sex = toupper(Sex)) %>%
+  dplyr::rename(Passage = CellLinePassageNumber) %>%
+  dplyr::rename(Site = `Origin Institution`)
+
+if (!file.exists(file.path(output.fig.dir, "model.Rdata"))) {
+  stop("Model not fit, run `train_fus_models.R`")
+} else {
+  load(file.path(output.fig.dir, "model.Rdata"))
+}
+
+################################################################################
+#' # Predictions by cell line
+#+ predictions_cell_line, message=FALSE
+################################################################################
+# Aggregate FUS v. Healthy predictions by cell line
+predictions <- cell.models$ypred %>%
+  group_by(CellLine, Ytest) %>%
+  summarize(Ypred = mean(Ypred), .groups = "drop") %>%
+  left_join(xcell, by = "CellLine")
+
+################################################################################
+# FUS-ALS vs. WT
+################################################################################
+# Compute test statistic for difference in prediction scores
+x.wt.fus <- filter(predictions, Genetics != "sporadic")
+fit <- lm(Ypred ~ Genetics + Sex + Site + Passage, data = x.wt.fus)
+pval.lm <- summary(fit)$coefficients["GeneticsHealthy", "Pr(>|t|)"]
+print(pval.lm)
+
+yfus <- predictions$Ypred[predictions$Genetics == "FUS-ALS"]
+ywt <- predictions$Ypred[predictions$Genetics == "Healthy"]
+pval.sr <- wilcox.test(yfus, ywt, alternative = "greater")$p.value
+print(pval.sr)
+
+pval.lab <- label_pval(pval.sr)
+ymax <- max(predictions$Ypred) + 1e-2
+
+p <- predictions %>%
+  filter(Genetics != "sporadic") %>%
+  ggplot(aes(x = reorder(Genetics, Ypred, median), y = Ypred)) +
+  geom_boxplot(aes(fill = Genetics), outlier.shape = NA, coef = NULL) +
+  geom_jitter(width = 0.2, size = 3, alpha = 0.6) +
+  geom_text(x = 1.5, y = ymax + 5e-2, label = pval.lab, size = 8) +
+  geom_segment(x = 1, xend = 2, y = ymax, yend = ymax, col = "#0B4668", linewidth = 1) +
+  ylab("i-MAP score") +
+  xlab(NULL) +
+  theme(legend.position = "none") +
+  theme(axis.text.x = element_text(angle = 90)) +
+  scale_fill_manual(values = col.pal) +
+  scale_color_manual(values = col.pal) +
+  ylim(c(0, 1))
+
+if (save.fig) pdf(file = file.path(output.fig.dir, "fig_a.pdf"), h = 8, w = 4)
+plot(p)
+if (save.fig) dev.off()
+
+################################################################################
+# sporadic vs. WT
+################################################################################
+# Compute test statistic for difference in prediction scores
+x.wt.sp <- filter(predictions, Genetics != "FUS-ALS")
+fit <- lm(Ypred ~ Genetics + Sex + Site + Passage, data = x.wt.sp)
+pval.lm <- summary(fit)$coefficients["Geneticssporadic", "Pr(>|t|)"]
+
+ysporadic <- predictions$Ypred[predictions$Genetics == "sporadic"]
+pval.sr <- wilcox.test(ysporadic, ywt, alternative = "greater")$p.value
+
+pval.lab.s <- label_pval(pval.lm)
+ymax.s <- max(predictions$Ypred[predictions$Genetics == "sporadic"]) + 5e-2
+
+p <- predictions %>%
+  ggplot(aes(x = reorder(Genetics, Ypred, median), y = Ypred)) +
+  geom_boxplot(aes(fill = Genetics), outlier.shape = NA, coef = NULL) +
+  geom_jitter(width = 0.2, size = 3, alpha = 0.6) +
+  geom_text(x = 1.5, y = ymax + 5e-2, label = pval.lab, size = 8) +
+  geom_segment(x = 1, xend = 3, y = ymax, yend = ymax, col = "#0B4668", size = 1) +
+  geom_text(x = 1.5, y = ymax.s + 5e-2, label = pval.lab.s, size = 8) +
+  geom_segment(x = 1, xend = 2, y = ymax.s, yend = ymax.s, col = "#0B4668", size = 1) +
+  ylab("i-MAP score") +
+  xlab(NULL) +
+  theme(legend.position = "none") +
+  theme(axis.text.x = element_text(angle = 90)) +
+  scale_fill_manual(values = col.pal) +
+  scale_color_manual(values = col.pal) +
+  ylim(0:1)
+
+if (save.fig) pdf(file = file.path(output.fig.dir, "fig_sa.pdf"), h = 8, w = 4)
+plot(p)
+if (save.fig) dev.off()
+
+################################################################################
+# Predictions vs. metadata features
+################################################################################
+p1 <- predictions %>%
+  filter(Genetics == "Healthy") %>%
+  ggplot(aes(x = Sex, y = Ypred, fill = Sex)) +
+  geom_boxplot(outlier.shape = NA, coef = NULL) +
+  geom_jitter(width = 0.2, size = 3, alpha = 0.6) +
+  xlab("Sex") +
+  theme(legend.position = "none") +
+  ylab("i-MAP score") +
+  scale_fill_hue(l = 55) +
+  labs(fill = NULL) +
+  ylim(0:1)
+
+p2 <- predictions %>%
+  filter(Genetics == "Healthy") %>%
+  ggplot(aes(x = Site, y = Ypred, fill = Site)) +
+  geom_boxplot(outlier.shape = NA, coef = NULL) +
+  geom_jitter(width = 0.2, size = 3, alpha = 0.6) +
+  xlab("Origin institution") +
+  theme(legend.position = "none") +
+  ylab(NULL) +
+  scale_fill_jama() +
+  labs(fill = NULL) +
+  ylim(0:1)
+
+p3 <- predictions %>%
+  filter(Genetics == "Healthy") %>%
+  ggplot(aes(x = Passage, y = Ypred)) +
+  geom_smooth(method = "lm", se = FALSE) +
+  geom_point(size = 3) +
+  theme(legend.position = "none") +
+  ylab(NULL) +
+  xlab("Cell Line Passage Number") +
+  ylim(0:1)
+
+if (save.fig) pdf(file = file.path(output.fig.dir, "fig_supp_metadata.pdf"), h = 8, w = 24)
+gridExtra::grid.arrange(p1, p2, p3, nrow = 1)
+if (save.fig) dev.off()
+
+# Aggregate by cell line / plate
+predictions <- cell.models$ypred %>%
+  group_by(CellLine, Ytest, PlateID) %>%
+  summarize(Ypred = mean(Ypred), .groups = "drop") %>%
+  left_join(xcell, by = "CellLine") %>%
+  dplyr::rename(Figure = `Figure Names`) %>%
+  mutate(Plate = as.numeric(as.factor(PlateID))) %>%
+  mutate(Plate = str_c("Plate ", Plate))
+
+p4 <- predictions %>%
+  filter(Genetics == "Healthy") %>%
+  ggplot(aes(x = Plate, y = Ypred, fill = PlateID)) +
+  geom_boxplot(outlier.shape = NA, coef = NULL) +
+  geom_jitter(width = 0.2, size = 3, alpha = 0.6) +
+  theme(legend.position = "none") +
+  ylab("i-MAP score") +
+  xlab(NULL) +
+  scale_fill_hue(l = 55) +
+  labs(fill = NULL) +
+  ylim(0:1)
+
+# Aggregate by cell line / plate
+predictions <- cell.models$ypred %>%
+  group_by(CellLine, Ytest, PlateID, WellID) %>%
+  summarize(Ypred = mean(Ypred), .groups = "drop") %>%
+  left_join(xcell, by = "CellLine") %>%
+  dplyr::rename(Figure = `Figure Names`) %>%
+  mutate(Row = str_remove_all(WellID, "-.*$")) %>%
+  mutate(Row = as.numeric(Row)) %>%
+  mutate(Col = str_remove_all(WellID, "^.*-")) %>%
+  mutate(Col = as.numeric(Col)) %>%
+  mutate(Plate = as.numeric(as.factor(PlateID))) %>%
+  mutate(Plate = str_c("Plate ", Plate))
+
+p5 <- predictions %>%
+  mutate(Ypred = ifelse(Genetics == "Healthy", Ypred, NA)) %>%
+  ggplot(aes(x = Col, y = Row, fill = Ypred)) +
+  geom_tile() +
+  ylab("Plate Column") +
+  xlab("Plate Row") +
+  scale_fill_viridis_c(limits = 0:1) +
+  facet_wrap(~Plate) +
+  labs(fill = "i-MAP score")
+
+
+if (save.fig) pdf(file = file.path(output.fig.dir, "fig_supp_plate.pdf"), h = 8, w = 8)
+plot(p4)
+if (save.fig) dev.off()
+
+if (save.fig) pdf(file = file.path(output.fig.dir, "fig_supp_well.pdf"), h = 12, w = 24)
+plot(p5)
+if (save.fig) dev.off()
+
+
+################################################################################
+#' # Predictions by well replicate
+#+ predictions_by_well, message=FALSE
+################################################################################
+# Aggregate predictions by well replicate
+predictions <- cell.models$ypred %>%
+  group_by(CellLine, WellID, Ytest, PlateID) %>%
+  summarize(Ypred = mean(Ypred), .groups = "drop") %>%
+  left_join(xcell, by = "CellLine") %>%
+  dplyr::rename(Figure = `Figure Names`)
+
+################################################################################
+# FUS-ALS vs. WT
+################################################################################
+predictions.agg <- filter(predictions, Genetics %in% c("FUS-ALS", "Healthy")) %>%
+  group_by(CellLine, Genetics) %>%
+  summarize(Ypred = median(Ypred), .groups = "drop") %>%
+  arrange(Ypred) %>%
+  mutate(Face = ifelse(CellLine %in% c("F12", "F1"), "bold", "plain"))
+
+# Plot predictions, FUS v. Healthy
+p <- filter(predictions, Genetics %in% c("FUS-ALS", "Healthy")) %>%
+  ggplot(aes(x = reorder(Figure, Ypred, median), y = Ypred)) +
+  geom_boxplot(aes(fill = Genetics), outlier.shape = NA, coef = NULL) +
+  geom_jitter(width = 0.2, alpha = 0.6) +
+  theme(axis.text.x = element_text(angle = 90, face = predictions.agg$Face)) +
+  theme(legend.position = "none") +
+  xlab(NULL) +
+  ylab("i-MAP score") +
+  scale_color_manual(values = col.pal) +
+  scale_fill_manual(values = col.pal) +
+  ylim(c(0, 1))
+
+if (save.fig) pdf(file = file.path(output.fig.dir, "fig_b.pdf"), h = 8, w = 16)
+plot(p)
+if (save.fig) dev.off()
+
+################################################################################
+# sporadic vs. WT
+################################################################################
+n.sporadic.top <- 5
+
+# plot predictions, sporadic v. Healthy
+predictions.agg <- filter(predictions, Genetics %in% c("sporadic", "Healthy")) %>%
+  group_by(CellLine, Genetics) %>%
+  summarize(Ypred = median(Ypred), .groups = "drop") %>%
+  arrange(Ypred)
+
+p <- filter(predictions, Genetics %in% c("sporadic", "Healthy")) %>%
+  ggplot(aes(x = reorder(Figure, Ypred, median), y = Ypred)) +
+  geom_boxplot(aes(fill = Genetics), outlier.shape = NA, coef = NULL) +
+  geom_jitter(width = 0.2, alpha = 0.6) +
+  theme(axis.text.x = element_text(angle = 90, face = predictions.agg$Face)) +
+  theme(legend.position = "none") +
+  xlab(NULL) +
+  ylab("i-MAP score") +
+  scale_color_manual(values = col.pal) +
+  scale_fill_manual(values = col.pal) +
+  ylim(c(0, 1)) 
+
+if (save.fig) pdf(file = file.path(output.fig.dir, "fig_sb.pdf"), h = 8, w = 16)
+plot(p)
+if (save.fig) dev.off()
+
+################################################################################
+#' # ROC curves
+#+ roc, message=FALSE
+################################################################################
+predictions.cell <- filter(cell.models$ypred, Genetics %in% c("FUS-ALS", "Healthy"))
+
+# Aggregate predictions by single well
+predictions.well <- filter(cell.models$ypred, Genetics %in% c("FUS-ALS", "Healthy")) %>%
+  group_by(CellLine, WellID, Ytest, PlateID) %>%
+  summarize(Ypred = mean(Ypred), .groups = "drop") %>%
+  left_join(xcell, by = "CellLine")
+
+# Aggregate predictions by cell line
+predictions.line <- filter(cell.models$ypred, Genetics %in% c("FUS-ALS", "Healthy")) %>%
+  group_by(CellLine, Ytest) %>%
+  summarize(Ypred = mean(Ypred), .groups = "drop") %>%
+  left_join(xcell, by = "CellLine")
+
+# Compute ROC curves
+roc.cell <- roc.curve(
+  scores.class0 = predictions.cell$Ypred,
+  weights.class0 = predictions.cell$Ytest,
+  curve = TRUE
+)
+
+roc.well <- roc.curve(
+  scores.class0 = predictions.well$Ypred,
+  weights.class0 = predictions.well$Ytest,
+  curve = TRUE
+)
+
+roc.line <- roc.curve(
+  scores.class0 = predictions.line$Ypred,
+  weights.class0 = predictions.line$Ytest,
+  curve = TRUE
+)
+
+# Initialize type labels
+type.levels <- c("Single cell", "Well replicate", "Cell line")
+aucs <- c(roc.cell$auc, roc.well$auc, roc.line$auc) %>% round(2)
+type.levels <- str_c(type.levels, ", AUROC = ", aucs)
+
+# Plot ROC curves
+xplot <- rbind(
+  data.frame(roc.cell$curve[, 1:2], Type = type.levels[1]),
+  data.frame(roc.well$curve[, 1:2], Type = type.levels[2]),
+  data.frame(roc.line$curve[, 1:2], Type = type.levels[3])
+)
+
+if (save.fig) pdf(file = file.path(output.fig.dir, "fig_c.pdf"), h = 8, w = 8)
+mutate(xplot, Type = factor(Type, levels = type.levels)) %>%
+  ggplot(aes(x = X1, y = X2, col = Type)) +
+  geom_line(size = 2) +
+  geom_abline(col = "grey", lty = 2) +
+  scale_color_jama() +
+  xlab("False positive rate") +
+  ylab("True positive rate") +
+  theme(legend.position = c(0.75, 0.15)) +
+  theme(legend.text = element_text(size = 16)) +
+  labs(color = NULL)
+if (save.fig) dev.off()
+
+
+################################################################################
+#' # Feature importance
+#+ importance, message=FALSE
+################################################################################
+clean_marker <- function(x, markers = c("2" = "FUS", "3" = "EEA1")) {
+  # Map channel indices to marker names
+  channels <- str_split(x, "\\.\\.") %>%
+    sapply("[", 2) %>%
+    str_split("")
+  return(sapply(channels, function(ch) str_c(markers[ch], collapse = "/")))
+}
+
+clean_region <- function(x) {
+  # Map region indicators to full names
+  region.key <- c(D = "nucleus", N = "cytoplasm", C = "cell")
+  x <- str_remove_all(x, "^.*_")
+  regions <- str_split(x, "\\.\\.") %>%
+    sapply("[", 1) %>%
+    str_split("")
+  return(sapply(regions, function(r) str_c(region.key[r], collapse = "/")))
+}
+
+clean_feature <- function(x) {
+  # Clean feature name, remove region / marker indicators
+  x <- str_remove_all(x, "_(D|C|N)*\\.\\.[0-9]*$")
+  return(str_replace_all(x, "_", " ") %>% str_c("\n"))
+}
+
+# Max-normalize feature importance for each hold-out cell line
+importance <- apply(abs(cell.models$importance), MAR = 2, importance_norm)
+features <- rownames(cell.models$importance) %>% str_remove_all("^X\\.\\.")
+
+# Compute maximum importance within feature category, average across models
+importance.avg <- reshape2::melt(importance) %>%
+  mutate(Feature = features[Var1]) %>%
+  dplyr::rename(Model = Var2) %>%
+  mutate(Feature = str_remove_all(Feature, "\\.\\.[0-9]*$")) %>%
+  group_by(Model, Feature) %>%
+  summarize(Importance = max(value), .groups = "drop") %>%
+  group_by(Feature) %>%
+  summarize(SD = sd(Importance), Importance = mean(Importance)) %>%
+  mutate(Marker = clean_marker(Feature)) %>%
+  mutate(Region = clean_region(Feature)) %>%
+  mutate(Feature = clean_feature(Feature)) %>%
+  arrange(desc(Importance))
+
+if (save.fig) pdf(file = file.path(output.fig.dir, "fig_d.pdf"), h = 7, w = 8)
+top_n(importance.avg, 5, Importance) %>%
+  mutate(Feature = str_c(Feature, Region, ", ", Marker)) %>%
+  ggplot(aes(x = reorder(Feature, Importance), y = Importance)) +
+  geom_bar(stat = "identity", fill = "#0B4668", color = "black", width = 0.5) +
+  geom_errorbar(aes(ymin = Importance, ymax = Importance + SD), width = 0.25) +
+  theme(axis.text.y = element_text(size = 16)) +
+  theme(panel.grid.major.y = element_blank()) +
+  theme(panel.grid.minor.y = element_blank()) +
+  coord_flip() +
+  xlab(NULL) +
+  ylab(NULL) +
+  ggtitle(NULL) +
+  theme(text = element_text(size = 22))
+if (save.fig) dev.off()
+
+fout <- file.path(output.fig.dir, "importance.csv")
+write.csv(file = fout, importance.avg)
+
+################################################################################
+#' # Age correlation
+#+ age, message=FALSE
+################################################################################
+# Aggregate model predictions by cell line
+predictions <- cell.models$ypred %>%
+  dplyr::rename(Plate = PlateID) %>%
+  group_by(CellLine, Ytest, Genetics, WellID, Plate) %>%
+  summarize(Ypred = mean(Ypred), .groups = "drop")
+
+# Plot age v. predictions by genetics
+xcell.s <- dplyr::select(xcell, -Genetics) %>%
+  dplyr::rename(Age = `Age of biopsy`) %>%
+  dplyr::rename(AgeOfOnset = `Age of onset`) %>%
+  dplyr::rename(Figure = `Figure Names`)
+
+xplot <- predictions %>% left_join(xcell.s, by = "CellLine")
+
+xplot.avg <- group_by(predictions, CellLine, Genetics) %>%
+  summarize(Ypred = mean(Ypred), .groups = "drop") %>%
+  left_join(xcell.s, by = "CellLine") %>%
+  mutate(AgeOfOnset = ifelse(AgeOfOnset == "Control", Age, AgeOfOnset)) %>%
+  mutate(AgeOfOnset = ifelse(AgeOfOnset == "control", Age, AgeOfOnset)) %>%
+  mutate(AgeOfOnset = ifelse(AgeOfOnset == "", Age, AgeOfOnset)) %>%
+  mutate(AgeOfOnset = as.numeric(AgeOfOnset)) %>%
+  mutate(Figure = str_remove_all(Figure, "-.*$")) %>%
+  dplyr::rename(Site = Site)
+
+# Compute age/prediction correlations within genetic background
+xfus <- filter(xplot.avg, Genetics == "FUS-ALS")
+cor.fus <- cor.test(xfus$Ypred, xfus$Age, alternative = "less")
+
+xwt <- filter(xplot.avg, Genetics == "Healthy")
+cor.wt <- cor.test(xwt$Ypred, xwt$Age, alternative = "less")
+
+xsporadic <- filter(xplot.avg, Genetics == "sporadic")
+cor.spor <- cor.test(xsporadic$Ypred, xsporadic$Age, alternative = "less")
+
+rhos <- round(c(cor.fus$estimate, cor.wt$estimate, cor.spor$estimate), 2)
+pvals <- c(cor.fus$p.value, cor.wt$p.value, cor.spor$p.value)
+pvals <- p.adjust(pvals, method='bonferroni')
+pval.lab <- str_c("(", sapply(pvals, label_pval), ")")
+
+xplot.text <- data.frame(Genetics = c("FUS-ALS", "Healthy", "sporadic")) %>%
+  mutate(text = str_c("Cor = ", rhos, pval.lab)) %>%
+  mutate(Ypred = c(0.15, 0.1, 0.05)) %>%
+  mutate(Age = 55)
+
+# Plot predictions v. age, FUS v. Healthy
+if (save.fig) pdf(file = file.path(output.fig.dir, "fig_supp_aob.pdf"), h = 10, w = 10)
+filter(xplot.avg, Genetics %in% c("Healthy", "FUS-ALS")) %>%
+  ggplot(aes(x = Age, y = Ypred, col = Genetics)) +
+  geom_point(size = 3) +
+  geom_line(stat = "smooth", method = "lm", formula = y ~ x, alpha = 0.7, size = 1.5) +
+  geom_text_repel(aes(label = CellLine), size = 7, min.segment.length = 0.01) +
+  geom_text(data = filter(xplot.text, Genetics != "sporadic"), aes(label = text), size = 10) +
+  scale_color_manual(values = col.pal) +
+  ylim(0:1) +
+  ylab("i-MAP score") +
+  theme(legend.position = "none")
+if (save.fig) dev.off()
+
+if (save.fig) pdf(file = file.path(output.fig.dir, "fig_supp_sporadic_aob.pdf"), h = 10, w = 10)
+filter(xplot.avg, Genetics %in% c("Healthy", "sporadic")) %>%
+  ggplot(aes(x = Age, y = Ypred, col = Genetics)) +
+  geom_point(size = 3) +
+  geom_line(stat = "smooth", method = "lm", formula = y ~ x, alpha = 0.7, size = 1.5) +
+  geom_text_repel(aes(label = CellLine), size = 7, min.segment.length = 0.01) +
+  geom_text(data = filter(xplot.text, Genetics != "FUS-ALS"), aes(label = text), size = 10) +
+  scale_color_manual(values = col.pal) +
+  ylim(0:1) +
+  ylab("i-MAP score") +
+  theme(legend.position = "none")
+if (save.fig) dev.off()
+
+
+################################################################################
+# Age of onset
+################################################################################
+# Compute age/prediction correlations within genetic background
+cor.fus.aoo <- cor.test(xfus$Ypred, xfus$AgeOfOnset, alternative = "less")
+
+rhos <- round(c(cor.fus.aoo$estimate, cor.wt$estimate, cor.spor$estimate), 2)
+pvals <- c(cor.fus.aoo$p.value, cor.wt$p.value, cor.spor$p.value)
+pvals <- p.adjust(pvals, method='bonferroni')
+pval.lab <- str_c("(", sapply(pvals, label_pval), ")")
+
+xplot.text <- data.frame(Genetics = c("FUS-ALS", "Healthy", "sporadic")) %>%
+  mutate(text = str_c("Cor = ", rhos, pval.lab)) %>%
+  mutate(Ypred = c(0.15, 0.1, 0.05)) %>%
+  mutate(AgeOfOnset = 55)
+
+# Plot predictions v. age of onset, FUS v. Healthy
+if (save.fig) pdf(file = file.path(output.fig.dir, "fig_g.pdf"), h = 12, w = 12)
+filter(xplot.avg, Genetics %in% c("Healthy", "FUS-ALS")) %>%
+  ggplot(aes(x = AgeOfOnset, y = Ypred, col = Genetics)) +
+  geom_point(size = 3) +
+  geom_line(stat = "smooth", method = "lm", formula = y ~ x, size = 2) +
+  geom_text_repel(aes(label = CellLine), size = 7, min.segment.length = 0.01) +
+  geom_text(data = filter(xplot.text, Genetics != "sporadic"), aes(label = text), size = 10) +
+  scale_color_manual(values = col.pal) +
+  ylim(0:1) +
+  ylab("i-MAP score") +
+  xlab("Age of onset") +
+  theme(legend.position = "none")
+if (save.fig) dev.off()
+
+
+################################################################################
+# Save table of predictions by cell line
+################################################################################
+# Predictions by genetics
+predictions <- cell.models$ypred %>%
+  group_by(CellLine, Ytest) %>%
+  summarize(Ypred = mean(Ypred), .groups = "drop") %>%
+  left_join(xcell, by = "CellLine")
+
+save(file = file.path(output.fig.dir, "predictions.Rdata"), predictions)
+
+################################################################################
+# Save table of plates used
+################################################################################
+if (save.plate) {
+  plate.list <- unique(predictions$Plate)
+
+  write.table(
+    file = plate.file,
+    plate.list,
+    append = TRUE,
+    row.names = FALSE,
+    col.names = FALSE,
+    quote = FALSE
+  )
+
+  # Save plate maps
+  plate.layout <- cell.models$ypred %>%
+    dplyr::select(PlateID, CellLine, WellID) %>%
+    distinct() %>%
+    mutate(Row = as.numeric(str_remove_all(WellID, "-.*$"))) %>%
+    mutate(Col = as.numeric(str_remove_all(WellID, "^.*-")))
+
+  for (p in unique(plate.layout$PlateID)) {
+    platemap <- matrix("", nrow = 16, ncol = 24)
+    pm <- filter(plate.layout, PlateID == p)
+    for (i in 1:nrow(pm)) platemap[pm$Row[i], pm$Col[i]] <- pm$CellLine[i]
+    colnames(platemap) <- 1:24
+
+    print(table(platemap))
+    fout <- file.path(platemap.dir, str_c(p, ".csv"))
+    write.csv(file = fout, platemap)
+  }
+}
diff --git a/paper_figures/Figure_Scores/run_models.sh b/paper_figures/Figure_Scores/run_models.sh
new file mode 100644
index 0000000..8af9176
--- /dev/null
+++ b/paper_figures/Figure_Scores/run_models.sh
@@ -0,0 +1,16 @@
+declare -a MODEL=("irf" "fcnn" "logistic")
+declare -a CYTOSOLIC=("TRUE" "FALSE")
+
+for m in "${MODEL[@]}"
+do
+  
+  for c in "${CYTOSOLIC[@]}" 
+  do
+    echo "RUNNING: $m, $c"
+    Rscript train_fus_models.R --MODEL "$m" --CYTOSOLIC "$c"
+    Rscript fig_scores.R --MODEL "$m" --CYTOSOLIC "$c"
+    echo "####################"
+  done
+
+done
+
diff --git a/paper_figures/Figure_Scores/train_fus_models.R b/paper_figures/Figure_Scores/train_fus_models.R
new file mode 100644
index 0000000..c282a8a
--- /dev/null
+++ b/paper_figures/Figure_Scores/train_fus_models.R
@@ -0,0 +1,73 @@
+################################################################################
+#+ setup, message=FALSE
+################################################################################
+library(tidyverse)
+library(data.table)
+
+# Set user args for model choice
+args <- R.utils::commandArgs(asValues = TRUE)
+cytosolic <- as.logical(args$CYTOSOLIC) # TRUE/FALSE
+model.str <- args$MODEL # 'irf', 'logistic', or 'fcnn'
+
+# Paths to data directories
+fig.base.dir <- Sys.getenv("ALS_PAPER")
+data.base.dir <- Sys.getenv("ALS_DATA")
+figure.dir <- file.path(fig.base.dir, "paper_figures", "Figure_Scores")
+platemap.dir <- file.path(fig.base.dir, "data", "platemaps")
+dir.create(platemap.dir)
+
+source(file.path(fig.base.dir, "paper_figures", "utilities.R"))
+
+# Set paths to data directories
+data.dir <- file.path(data.base.dir, "032422_WTFUSSporadics", "Cell_Filtered")
+plate.file <- file.path(fig.base.dir, "data", "plate_list.txt")
+
+n.core <- 10
+marker <- "FUS_EEA1"
+save.fig <- FALSE
+save.plate <- FALSE
+
+################################################################################
+#+ parameters, message=FALSE
+################################################################################
+
+# Initialize models to be used for supervised learning
+if (model.str == "fcnn") {
+  model <- fcnn
+  model_predict <- predict_fcnn
+} else if (model.str == "logistic") {
+  model <- logistic
+  model_predict <- predict_logistic
+} else {
+  model <- function(x, y) irf(x, y, n.core = n.core)
+  model_predict <- predict_irf
+}
+
+ext <- ifelse(cytosolic, "_cytosolic", "_full")
+output.fig.dir <- file.path(figure.dir, model.str, str_c("fig", ext))
+dir.create(output.fig.dir, showWarnings = FALSE, recursive = TRUE)
+
+################################################################################
+#' # Load data
+#+ parameters, message=FALSE
+################################################################################
+# Load in datasets for selected marker
+input.file <- str_c("profiles_", marker, ".Rdata")
+load(file.path(data.dir, input.file))
+
+# Filter to cytosolic features
+if (cytosolic) x <- select_cytosolic(x)
+
+################################################################################
+#' # Predictive analysis
+#+ modeling, message=FALSE
+################################################################################
+# Fit model to select data
+set.seed(47)
+
+if (!file.exists(file.path(output.fig.dir, "model.Rdata"))) {
+  cell.models <- cell_fit(x, model, model_predict)
+  save(file = file.path(output.fig.dir, "model.Rdata"), cell.models)
+} else {
+  warning("Model file already exists, model not fit")
+}
diff --git a/paper_figures/Figure_Search/.DS_Store b/paper_figures/Figure_Search/.DS_Store
index fe8eb2dcb92d09c8fc15bc4ff30a7703f54fd01a..95ea6914022c19af35b9e5801ad168be2f1b620c 100644
GIT binary patch
delta 220
zcmZn(XbIThF2J~JvWGySrc`ycp@EUHj)IAqVXclrwV|oGnT~>`u}N($Cx@uAzI9N1
zc1~_ye$V6w0`iQ#lLW<B7=!QiZI%*T&T9YnR@=Ay2S*$k7#Nrt(ik!s(sT1uT#|C~
rlYk-|n6d>KhQZ1CxdlL32F4JFP2#Mao7ok9v26Y?a)^;~)0h|mKX^ro

delta 264
zcmZn(XbIThF2J~BvWGxnqC|DInTeT>f|;RZt&T#qrICS-f{C$NZ7nB<sItCwP<(by
zZeD&D0~jzeLTCnFC=H{!fedDbG=@xu^qh3V;N<+=0<hEzhC7>O1edeg&;Psn=KH%x
u91-er^HW@sa`KaaA{>~q1sTYuBy469XXV_?s_=(pGqdOcMk*M`!UzE5zDVl;

diff --git a/paper_figures/Figure_Search/README.md b/paper_figures/Figure_Search/README.md
new file mode 100644
index 0000000..a580224
--- /dev/null
+++ b/paper_figures/Figure_Search/README.md
@@ -0,0 +1,38 @@
+# Marker search screens
+Scripts in this directory are used to train and evalaute models used in marker
+set search screens and generate figures summarizing model performance. Data are
+derived from three screens:
+
+- `data_profiles/110420_BiomarkerScreen/Cell_Filtered`
+- `data_profiles/121820_SporadicScreen/Cell_Filtered`
+- `data_profiles/030321_FUSWT/Cell_Filtered`
+
+## Model fitting
+Models for the first two screens can be fit by running the
+`train_marker_screens.R` script. Select the screen to analyze by setting the
+`screen` variable as one of '110420_BiomarkerScreen' or '121820_SporadicScreen'.
+Models for the third screen can be trained using the `train_fus.R` script. Each
+of these scripts saves the output model in `output.fig.dir/model_<screen>.Rdata`
+
+### Parameters in modeling scripts
+
+- `genetics.train`: cell line genetics to be used for model training. Default is
+  to use all genetics available for a given marker set. (only defined for
+  `train_marker_screens.R`)
+
+- `model`: modeling function that takes arguments `x` (feature matrix) and `y`
+  (response vector) and returns a fitted model.
+
+- `model_predict: model prediction function that takes arguments `model` (a
+  fitted mode) and `x` (feature matrix) and returns predictions on `x`.
+
+- `markers`: marker sets to be used for model. Single marker models can be
+  defined from a marker pair as <marker1>_<marker2>:<marker*>. (only defined for
+  `train_fus_screen.R`)
+
+## Figures
+Scripts to generate figures for the marker search screens require fitted models,
+generated using the scripts described above. `fig_heatmap.R` produces a heatmap
+for a selected `screen` reporting classification accuracy by genetics / marker.
+`fig_marker_tuning.R` produces figures summarizing marker set vs. individual
+marker performance for the FUS screen.
diff --git a/paper_figures/Figure_Search/fig_marker_tuning.R b/paper_figures/Figure_Search/fig_marker_tuning.R
index bd8955d..4333986 100644
--- a/paper_figures/Figure_Search/fig_marker_tuning.R
+++ b/paper_figures/Figure_Search/fig_marker_tuning.R
@@ -1,39 +1,14 @@
-#' # Figure S1
-#' The following script fits models by marker set for the FUS, CD63, EEA1 
-#' comparison screen (Fig. S1). We consider model performance with respect to
-#' both marker pairs and individual markers. 
-#' 
-#' Requires input datasets
-#'   `data_profiles/030321_FUSWT_Cell`
-#'   
-#' Requires models trained using scripts:
-#'    `train_fus_screen.R`
-#'   
-#'   Karl Kumbier 2/23/2022`
+################################################################################
 #+ setup, message=FALSE
+################################################################################
 library(tidyverse)
 library(data.table)
 library(tidytext)
 library(PRROC)
 library(scales)
 library(ggsci)
-library(hrbrthemes)
 library(gridExtra)
 
-theme_set(
-  theme_ipsum(
-    plot_title_size=24,
-    axis_title_size=24,
-    strip_text_size=24, 
-    axis_text_size=22,
-    base_size=24,
-    base_family='sans',
-    axis=TRUE,
-    axis_col='#000000',
-    grid=FALSE
-  )
-)
-
 # Set paths to data directories
 fig.base.dir <- Sys.getenv('ALS_PAPER')
 data.base.dir <- Sys.getenv('ALS_DATA')
@@ -41,6 +16,8 @@ figure.dir <- file.path(fig.base.dir, 'paper_figures', 'Figure_Search')
 
 source(file.path(fig.base.dir, 'paper_figures', 'utilities.R'))
 source(file.path(fig.base.dir, 'paper_figures', 'color_palette.R'))
+source(file.path(fig.base.dir, "paper_figures", "theme.R"))
+theme_set(theme_als())
 
 # Set paths to data directories
 data.dir <- file.path(data.base.dir, '030321_FUSWT', 'Cell_Filtered')
@@ -53,14 +30,10 @@ save.fig <- TRUE
 output.fig.dir <- file.path(figure.dir, 'fig/')
 load(file.path(output.fig.dir, 'model_030321_FUSWT.Rdata'))
 
-
+################################################################################
 #' ## Accuracy by marker set
-#' To evaluate performance of marker sets, we compute AUROC on held-out cell 
-#' lines at three data resolutions: (i) cell line (ii) well replicate 
-#' (iii) single cell. Since NS003 cannot be accurately classified using any
-#' marker set, but is an outlier wrt some, we drop this line when computing
-#' AUROC.
 #+ accuracy
+################################################################################
 accuracy <- lapply(fits, function(z) {
   
   # Drop cell line not classified by any marker set
@@ -115,13 +88,13 @@ xplot <- lapply(unique(predictions$Markers), function(m) {
   }
   
   p <- predictions.m  %>%
-    ggplot(aes(x=reorder(Figure, Ypred, median), y=Ypred, col=Genetics)) +
+    ggplot(aes(x=reorder(CellLine, Ypred, median), y=Ypred, col=Genetics)) +
     geom_boxplot(aes(fill=Genetics), outlier.shape=NA, coef=NULL, alpha=0.75) +
     geom_jitter(width=0.15, alpha=0.9) +
     theme(legend.position='none') +
     theme(axis.text.x=element_text(angle=90)) +
     xlab(NULL) +
-    ylab('FUS-ALS phenotype score') +
+    ylab('i-MAP score') +
     scale_color_manual(values=col.pal) +
     scale_fill_manual(values=col.pal) +
     ylim(0:1) +
diff --git a/paper_figures/Figure_Search/fig_pred_by_marker.R b/paper_figures/Figure_Search/fig_pred_by_marker.R
new file mode 100644
index 0000000..1274d73
--- /dev/null
+++ b/paper_figures/Figure_Search/fig_pred_by_marker.R
@@ -0,0 +1,60 @@
+################################################################################
+#+ setup, message=FALSE
+################################################################################
+library(tidyverse)
+library(data.table)
+library(RColorBrewer)
+library(scales)
+library(superheat)
+library(ggsci)
+
+# Paths to data directories
+fig.base.dir <- Sys.getenv('ALS_PAPER')
+data.base.dir <- Sys.getenv('ALS_DATA') 
+figure.dir <- file.path(fig.base.dir, 'paper_figures', 'Figure_Search')
+
+#screen <- '110420_BiomarkerScreen'
+#screen <- '121820_SporadicScreen'
+screen <- '030321_FUSWT'
+
+# Source utility functions
+source(file.path(fig.base.dir, 'paper_figures', 'utilities.R'))
+source(file.path(fig.base.dir, 'paper_figures', 'color_palette.R'))
+source(file.path(fig.base.dir, "paper_figures", "theme.R"))
+theme_set(theme_als())
+
+# Parameters for visualization
+save.fig <- TRUE
+
+# Load in fitted model for current screen
+output.fig.dir <- file.path(figure.dir, 'fig')
+load(file.path(output.fig.dir, str_c('model_', screen, '.Rdata')))
+
+
+################################################################################
+#+ cell_prediction_plots, fig.width=18, fig.height=12, warning=FALSE
+################################################################################
+# Predictions by genetics
+predictions <- rbindlist(fits) %>%
+  mutate(Genetics=factor(Genetics, levels=genetic.order)) %>%
+  filter(!str_detect(Markers, ':')) %>%
+  group_by(PlateID, WellID, CellLine, Genetics, Markers) %>%
+  summarize(Ypred = mean(Ypred))
+
+nrow <- ifelse(screen == "110420_BiomarkerScreen", 2, 1)
+p <- ggplot(predictions, aes(x = reorder(CellLine, Ypred), y = Ypred)) +
+  geom_boxplot(aes(fill = Genetics, col = Genetics), alpha = 0.5) + 
+  geom_jitter(width = 0.2, alpha = 0.8, aes(col = Genetics)) +
+  facet_wrap(~Markers, nrow = nrow) + 
+  scale_fill_manual(values = col.pal) +
+  scale_color_manual(values = col.pal) +
+  theme(legend.position = "none") +
+  ylab("i-MAP score") + 
+  xlab(NULL) +
+  ylim(c(0, 1)) +
+  theme(text = element_text(size = 13))
+
+fout <- file.path(output.fig.dir, str_c("fig_", screen, ".pdf"))
+if (save.fig) pdf(file = fout, height = nrow * 5, width=16)
+plot(p)
+if (save.fig) dev.off()
diff --git a/paper_figures/Figure_Search/run.sh b/paper_figures/Figure_Search/run.sh
new file mode 100644
index 0000000..47cdcde
--- /dev/null
+++ b/paper_figures/Figure_Search/run.sh
@@ -0,0 +1,4 @@
+Rscript train_marker_screens.R --SCREEN 110420_BiomarkerScreen
+Rscript train_marker_screens.R --SCREEN 121820_SporadicScreen
+Rscript train_fus_screen.R 
+
diff --git a/paper_figures/Figure_Search/train_fus_screen.R b/paper_figures/Figure_Search/train_fus_screen.R
index fea9e56..af2ee7e 100644
--- a/paper_figures/Figure_Search/train_fus_screen.R
+++ b/paper_figures/Figure_Search/train_fus_screen.R
@@ -1,87 +1,76 @@
-#' The following script fits models by marker set for the FUS, CD63, EEA1 
-#' comparison screen. We consider model performance with respect to both marker 
-#' pairs and individual markers. 
-#' 
-#' Requires input datasets
-#'   `data_profiles/030321_FUSWT_Cell`
-#'   
-#'   Karl Kumbier 2/23/2022
+###############################################################################
 #+ setup, message=FALSE
+###############################################################################
 library(tidyverse)
 library(data.table)
-library(scales)
 
 # Set paths to data directories
-fig.base.dir <- Sys.getenv('ALS_PAPER')
-data.base.dir <- Sys.getenv('ALS_DATA')
-figure.dir <- file.path(fig.base.dir, 'paper_figures', 'Figure_Search')
+fig.base.dir <- Sys.getenv("ALS_PAPER")
+data.base.dir <- Sys.getenv("ALS_DATA")
+figure.dir <- file.path(fig.base.dir, "paper_figures", "Figure_Search")
 
-data.dir <- file.path(data.base.dir, '030321_FUSWT', 'Cell_Filtered')
-library.file <- file.path(fig.base.dir, 'data', 'Fibroblasts_Library.csv')
-plate.file <- file.path(fig.base.dir, 'data', 'plate_list.txt')
-platemap.dir <- file.path(fig.base.dir, 'data', 'platemaps')
+data.dir <- file.path(data.base.dir, "030321_FUSWT", "Cell_Filtered")
+library.file <- file.path(fig.base.dir, "data", "Fibroblasts_Library.csv")
+plate.file <- file.path(fig.base.dir, "data", "plate_list.txt")
+platemap.dir <- file.path(fig.base.dir, "data", "platemaps")
 
-output.fig.dir <- file.path(figure.dir, 'fig')
-dir.create(output.fig.dir, recursive=FALSE)
+output.fig.dir <- file.path(figure.dir, "fig")
+dir.create(output.fig.dir, recursive = FALSE)
 dir.create(platemap.dir)
 
 # Source utility funcitons
-source(file.path(fig.base.dir, 'paper_figures', 'utilities.R'))
+source(file.path(fig.base.dir, "paper_figures", "utilities.R"))
+
+# save plate IDs to experiment summary file
+save.plates <- FALSE
 
 # Parameters for analysis
-type <- 'Cell'
 n.core <- 16
 
+###############################################################################
+#+ parameters
+###############################################################################
 markers <- c(
-  'FUS_EEA1', 'CD63_EEA1', 'CD63_FUS',
-  'FUS_EEA1:FUS', 'CD63_EEA1:CD63', 'CD63_FUS:CD63',
-  'FUS_EEA1:EEA1', 'CD63_EEA1:EEA1', 'CD63_FUS:FUS'
+  "FUS_EEA1", "CD63_EEA1", "CD63_FUS",
+  "FUS_EEA1:FUS", "CD63_EEA1:CD63", "CD63_FUS:CD63",
+  "FUS_EEA1:EEA1", "CD63_EEA1:EEA1", "CD63_FUS:FUS"
 )
 
-save.plates <- FALSE
-
 # Initialize models to be used for supervised learning
-model <- function(x, y) irf(x, y, n.core=n.core) 
+model <- function(x, y) irf(x, y, n.core = n.core)
 model_predict <- predict_irf
 
-#' # Loading data
-#' Our dataset consists of phenotypic profiles derived from `r marker`. Each 
-#' observation in the dataset corresponds to a single `r type`.
+###############################################################################
+#' # Load data
 #+load_data
+###############################################################################
 # Load in datasets for selected marker
 load_marker <- function(marker) {
   # wrapper function for loading data from select marker set
-  load(file.path(data.dir, str_c('profiles_', marker, '.Rdata')))
+  load(file.path(data.dir, str_c("profiles_", marker, ".Rdata")))
   return(x)
 }
 
-#' ## Predictive analysis
-#' Here we address the question of which marker is best for separating WT from 
-#' FUS cell lines. Toward this end, we train classifiers to discriminate between 
-#' disease and healthy `r type`s. Classifiers are trained using features 
-#' derived from select markers / marker sets.
-#' 
-#' To assess the generalizability of our predictive models to new cell lines, we 
-#' consider a leave one cell line out strategy. Specifically, we remove a single 
-#' cell line, train models on the remaining cell lines, and evaluate predictions 
-#' on the held-out line. 
-#+ leave_cell_out
+################################################################################
+#' # Predictive analysis
+#+ modeling
+################################################################################
 marker_fit <- function(marker) {
   # Wrapper function to call cell_fit over each marker set
-  
-  marker.split <- str_split(marker, ':')[[1]]
-  
+
+  marker.split <- str_split(marker, ":")[[1]]
+
   # Fit model to predict each cell lines
   x <- load_marker(marker.split[1])
 
   # Filter to singletons
   if (length(marker.split) == 2) {
-    marker.set <- str_split(marker.split[1], '_')[[1]]
+    marker.set <- str_split(marker.split[1], "_")[[1]]
     single.marker <- marker.split[2]
     channel <- which(marker.set == single.marker) + 1
-    
-    xchannel <- sapply(str_split(colnames(x), '\\.\\.'), '[', 3)
-    id.meta <- !str_detect(colnames(x), '^X')
+
+    xchannel <- sapply(str_split(colnames(x), "\\.\\."), "[", 3)
+    id.meta <- !str_detect(colnames(x), "^X")
     id.marker <- xchannel %in% c(channel, str_c(channel, channel))
     x <- select_if(x, id.marker | id.meta)
   }
@@ -93,11 +82,11 @@ marker_fit <- function(marker) {
 
 # Fit model to select data
 set.seed(47)
-fout <- file.path(output.fig.dir, 'model_030321_FUSWT.Rdata')
+fout <- file.path(output.fig.dir, "model_030321_FUSWT.Rdata")
 
 fits <- lapply(markers, marker_fit)
 fits <- lapply(fits, function(f) f$ypred)
-save(file=fout, fits)
+save(file = fout, fits)
 
 
 ################################################################################
@@ -106,30 +95,30 @@ save(file=fout, fits)
 if (save.plates) {
   predictions <- rbindlist(fits)
   plate.list <- unique(predictions$PlateID)
-  
+
   write.table(
-    file=plate.file, 
-    plate.list, 
-    append=TRUE, 
-    row.names=FALSE, 
-    col.names=FALSE, 
-    quote=FALSE
+    file = plate.file,
+    plate.list,
+    append = TRUE,
+    row.names = FALSE,
+    col.names = FALSE,
+    quote = FALSE
   )
-  
+
   # Save plate maps
   plate.layout <- dplyr::select(predictions, PlateID, CellLine, WellID) %>%
     distinct() %>%
-    mutate(Row=as.numeric(str_remove_all(WellID, '-.*$'))) %>%
-    mutate(Col=as.numeric(str_remove_all(WellID, '^.*-')))
-  
+    mutate(Row = as.numeric(str_remove_all(WellID, "-.*$"))) %>%
+    mutate(Col = as.numeric(str_remove_all(WellID, "^.*-")))
+
   for (p in unique(plate.layout$PlateID)) {
-    platemap <- matrix('', nrow=16, ncol=24)
+    platemap <- matrix("", nrow = 16, ncol = 24)
     pm <- filter(plate.layout, PlateID == p)
     for (i in 1:nrow(pm)) platemap[pm$Row[i], pm$Col[i]] <- pm$CellLine[i]
     colnames(platemap) <- 1:24
-    
+
     print(table(platemap))
-    fout <- file.path(platemap.dir, str_c(p, '.csv'))
-    write.csv(file=fout, platemap)
+    fout <- file.path(platemap.dir, str_c(p, ".csv"))
+    write.csv(file = fout, platemap)
   }
 }
diff --git a/paper_figures/Figure_Search/train_marker_screens.R b/paper_figures/Figure_Search/train_marker_screens.R
index 4e61f13..4eeec5d 100644
--- a/paper_figures/Figure_Search/train_marker_screens.R
+++ b/paper_figures/Figure_Search/train_marker_screens.R
@@ -1,117 +1,106 @@
-#' The following script generates heatmaps of prediction accuracy by genetics 
-#' and marker set as reported in paper Fig. 1b. Accuracy is measured within each
-#' screen individually.
-#' 
-#' Requires input datasets
-#'   `data_profiles/110420_BiomarkerScreen/Cell_Filtered` 
-#'   `data_profiles/121820_SporadicScreen/Cell_Filtered`
-#' 
-#'   Karl Kumbier 2/23/2022`
-#+ setup, message=FALSE
+###############################################################################
+#+ setup, message=FALSE, echo=FALSE
+###############################################################################
 library(tidyverse)
 library(data.table)
-library(scales)
 
 # Paths to data directories
-fig.base.dir <- Sys.getenv('ALS_PAPER')
-data.base.dir <- Sys.getenv('ALS_DATA') 
-figure.dir <- file.path(fig.base.dir, 'paper_figures', 'Figure_Search')
-
-#screen <- '110420_BiomarkerScreen'
-screen <- '121820_SporadicScreen'
-data.dir <- file.path(data.base.dir, screen, 'Cell_Filtered')
-plate.file <- file.path(fig.base.dir, 'data', 'plate_list.txt')
-platemap.dir <- file.path(fig.base.dir, 'data', 'platemaps')
-
-output.fig.dir <- file.path(figure.dir, 'fig')
-dir.create(output.fig.dir, recursive=FALSE)
+fig.base.dir <- Sys.getenv("ALS_PAPER")
+data.base.dir <- Sys.getenv("ALS_DATA")
+figure.dir <- file.path(fig.base.dir, "paper_figures", "Figure_Search")
+
+#screen <- "121820_SporadicScreen"
+args <- R.utils::commandArgs(asValues=TRUE)
+screen <- args$SCREEN
+if (is.null(screen)) screen <- "110420_BiomarkerScreen" 
+
+data.dir <- file.path(data.base.dir, screen, "Cell_Filtered")
+plate.file <- file.path(fig.base.dir, "data", "plate_list.txt")
+platemap.dir <- file.path(fig.base.dir, "data", "platemaps")
+
+output.fig.dir <- file.path(figure.dir, "fig")
+dir.create(output.fig.dir, recursive = FALSE)
 dir.create(platemap.dir)
 
 # Source utility functions
-source(file.path(fig.base.dir, 'paper_figures', 'utilities.R'))
-source(file.path(fig.base.dir, 'paper_figures', 'color_palette.R'))
+source(file.path(fig.base.dir, "paper_figures", "utilities.R"))
 
-# Parameters for analysis
-n.core <- 16 # number of cores for fitting models
-type <- 'Cell' # sample type for modeling
-genetics.train <- c('FUS-ALS', 'C9orf72', 'sporadic', 'TBK1') # genetics for training models
-save.plates <- TRUE # save plate IDs to experimental summary file
+# save plate IDs to experiment summary file
+save.plates <- TRUE
 
-# Initialize plot ordering for genetics
-genetic.order <- c('Healthy', 'TBK1', 'sporadic', 'C9orf72', 'FUS-ALS')
+# number of cores to use
+n.core <- 16
 
-# Initialize models to be used for supervised learning
-model <- function(x, y) irf(x, y, n.core=n.core) 
+###############################################################################
+#+ parameters
+###############################################################################
+# genetics used for model training
+genetics.train <- c("FUS-ALS")
+
+# models used for supervised learning
+model <- function(x, y) irf(x, y, n.core = n.core)
 model_predict <- predict_irf
 
-#' # Loading data
-#' Our dataset consists of phenotypic profiles derived from multiple marker sets. 
-#' Each observation in the dataset corresponds to a single `r type`. For each 
-#' marker set, we evaluate the predictive strength of a single marker by 
-#' ALS genetic subtype.
+###############################################################################
+#' # Load data
 #+load_data
-
+###############################################################################
 # Load in datasets for each marker set
-input.files <- list.files(data.dir) %>% str_subset('Rdata')
+input.files <- list.files(data.dir) %>% str_subset("Rdata")
 
 xlist <- lapply(input.files, function(f) {
   load(file.path(data.dir, f))
-  x <- filter(x, str_detect(Compounds, '(DMSO|None|Control)')) 
+  x <- filter(x, str_detect(Compounds, "(DMSO|None|Control)"))
+  x <- filter(x, Genetics %in% c("FUS-ALS", "Healthy"))
   return(as.data.frame(x))
 })
 
-#' ## Predictive analysis - cell line generalization
-#' To assess the generalizability of our predictive models to new cell lines, we 
-#' consider a leave one cell line out strategy. Specifically, we remove a single 
-#' cell line from our dataset, train models on the remaining cell lines from the
-#' following genetic backgrounds `r genetics.train`, and evaluate predictions on 
-#' the held-out line. The wrapper functions defined below run this analysis.
-#+ leave_cell_out
-
-#' Below, we run the analysis described above over each cell line.
-#+ cell_model, cache=TRUE
+###############################################################################
+#' # Predictive analysis
+#+ modeling, cache=TRUE
+###############################################################################
 set.seed(47)
-fout <- file.path(output.fig.dir, str_c('model_', screen, '.Rdata'))
+fout <- file.path(output.fig.dir, str_c("model_", screen, ".Rdata"))
 
 # Fit model to select data
 fits <- lapply(xlist, function(z) {
-  print('-----------------------------------------------')
+  print("-----------------------------------------------")
   print(z$Markers[1])
   cell_fit(z, model, model_predict, genetics.train)$ypred
 })
 
 fits <- fits[!sapply(fits, is.null)]
-save(file=fout, fits)
+save(file = fout, fits)
 
 
 # Save table of plates used
 if (save.plates) {
-  plate.list <- lapply(xlist, function(z) unique(z$PlateID)) %>% unlist
-  
+  plate.list <- lapply(xlist, function(z) unique(z$PlateID)) %>% unlist()
+
   write.table(
-    file=plate.file, 
-    unique(plate.list), 
-    append=TRUE, 
-    row.names=FALSE, 
-    col.names=FALSE, 
-    quote=FALSE
+    file = plate.file,
+    unique(plate.list),
+    append = TRUE,
+    row.names = FALSE,
+    col.names = FALSE,
+    quote = FALSE
   )
-  
+
   # Save plate maps
   plate.layout <- lapply(xlist, function(z) {
     dplyr::select(z, PlateID, CellLine, RowID, ColID) %>% distinct()
   })
 
-  plate.layout <- rbindlist(plate.layout)  
-  
+  plate.layout <- rbindlist(plate.layout)
+
   for (p in unique(plate.layout$PlateID)) {
-    platemap <- matrix('', nrow=16, ncol=24)
+    platemap <- matrix("", nrow = 16, ncol = 24)
     pm <- filter(plate.layout, PlateID == p)
     for (i in 1:nrow(pm)) platemap[pm$RowID[i], pm$ColID[i]] <- pm$CellLine[i]
     colnames(platemap) <- 1:24
-    
+
     print(table(platemap))
-    fout <- file.path(platemap.dir, str_c(p, '.csv'))
-    write.csv(file=fout, platemap)
+    fout <- file.path(platemap.dir, str_c(p, ".csv"))
+    write.csv(file = fout, platemap)
   }
 }
diff --git a/paper_figures/Figure_Transcriptomics/.DS_Store b/paper_figures/Figure_Transcriptomics/.DS_Store
index 743769fd11f613c105a0310f076ebbcda1ffde02..39bdc4a1b05b1235a92fd2381d70f822a4a6eca6 100644
GIT binary patch
literal 10244
zcmeHMU2GIp6h3EKV3rPawt@_>tqWBtuz{A}R1~&<K&7Om+tMG9Wp-z1C(O<)JF{D$
zRvUdV5z)j4MWZGrYT{4yMG&JgXo4~RCQ3A*5BOkWG-%?BzX_guXBM|D8Xt&h%-rPO
zGxwZ(?z#7ybLQ^33jo-h*BSt#03cFjRF<f@N?~$Vrxd~G%pr=T4|Y0b+9~SEQ^p32
z0D%C30D%C30D%C3>45;9*{p~a9u3+6fdGL3fh!2G??aR-qj8^(dz61VsNpF9(Q;z*
zLeKoiKhY>-5RLnE+@r*yf}Sc-PZj+X1O40qJ#~`D{qf@-_0$2mWtzvgjDCiKe(Hfy
zJ{&Oa(Vz_w2oRWx0L$IW-~bpf)NYpM@AxZ8R$b<WtNEi+BTm-O@=vF-rs25t^`E0q
zRz73qEOC}tA>NS~wnq}K>gJthHMfWLdTiTF6`$4AZbKhQ%JW)m%T*0a&v6Aq(`m@w
zKEu-Nk!Cw@X%5#-cmgpbhLW-}I@+)?7G1ZtVJsFMZCJZzLoC`@zj16VB+jp0y{S8S
z(9Br&{oXdw|5O0mV`(`?9e+3GP(vbZXR=(prsiyk#3{6R`z)&~?(;%%pP|_m(j29C
zcW<9Gkd&(mw4u(dYuJ{vThBQ}lX9g)8*fkBR##4E^1iff<};R)l*4J&Oy^D2)tgK+
zVI0+oSLvj4wrO_Rj=`oG=y&zQE|X0W^$$9B&TZk7C1pA9=-oQaWfb&Qn$hVu$Rtyg
zG4%4>1(78;tysOTadXSIj;?c+a=7Zcd6G0pkXo*Bz|eEOBZlS<52#LBw=}~V>MIzI
zkuvojOU>wEd^i+3S0#rlm4(V;QLYvH^C{y{-f)lbxvFavX|G0BciI}8O_NWGAud^3
zqbPfrq$Q_QtBE3K-@HtbdK|Vm)gQPrCP_U9>7C0CnPLm9Z;+H;Q%&intc5(a^&2Fm
zuiybI@9>$oY?73IdUYAviWA0|&)yPOlzq%{hw8YUS>2*ly+`KxE^3smqBy{$O`4`_
z+}`$fMHzI4)tufrLcS<}>aTSt@%dXM37e};Gh9383wLp$Y*+<@9Gza6=kKB@))pPD
zHeTmFPCzJB8?X0Ah)@k{pcOh`KjdH(j>8Ff5nhE;@EW`epTGsU2;ad^@H6}dzr!Cm
z181U$A*{m1ScfZcCB|?)HeoZiU<dBP1omJr?!z?dcmRhmiv>K4NAM`#i^uRdK8O$D
z349u#!DsO~d<EaaGx#>XgCF3B_z`}A7x8<%gg@Yq_=_-Gs1PJ!zECGD7gh)>g>}Mh
z!tKHiZ{<QUt_F*H_4z$Si*e-?{I7)Y<kh1mvTggec4^1-jh^#!y?B{_{emb9qK&sU
zUGX>1L5n_F>gn9QC%kg*-aMWr`Q<rf;iAQnYUzez&`l1;UWyC-u*jTSp5={RD%~hj
zT*x!L?lNh)ND(2=^t!RgDv@GP7I;@`h^!4!q{(GoHx{X&AYn$OMrl)sB2F&%x|^kV
zhy}Y0ydP<aw22fo^0jj3_u(@*4`0J2a_7rfPEM@g5?qd}u@SdmoE*3nw_+!DVK=!k
zbK)ebm_iLRXptiyLI>}}F}xec@g8#K2gsct!AJ2id>o(P?)*HyfG^=mJcXz6b$kQg
z#IyJ@eu-aAa@mfFE{os%<+2&Owbeh^#Bc9g6uv8Zwy9LA{ev_+TzZV0VMhEwC?s;0
zX<I|YVPO*n6KqgHgCsw6Qxy6(q=>7611_wB?uo~+z%)ybVX0&W2eNc<TcA(X1Ge_3
zN{;WjG&W6El2k3KCLAG)2GuDxv3inj1>|XM2I&}_>Q~d^D=oE1lLRF|AV45MAV45M
zAV6TcAyDQmD9Y~t+y4Il|LL~uU_Ai>0Rq<$0W5D#v^LX(#l(K{-0ZGBN_9U~R@k_4
zk8%}i_;ox{ejQI8b{)?zK!{ho@nCv!;xjt#QF@~CpZ^)~Z*I?m{6EP5$v6ExK4bsS
H^8Y^pnfLjP

literal 10244
zcmeHMU2GIp6h3EKVAc+FO3U9$+l9)H+LV?*iwG|5q7+Js-Ts5FyE{WWFgsIrW_N*x
zq-cykfDcA}Q#9(MXcROUB}OsP2TdddBT>Qw2BL}a!I)s8c<#Nku-g_)Oh}N-P41kz
z=iYPfJ>Q%&bM74gz?O{G1P}rMnJS~ILYD~&>t}UbkvvW%Q6zbQT@ZuqU_b)W<3{@o
z9|0c$9|0c$9|0eMTLA(3X0sxf3F=>c1bhU11g;{$-VY(FjD|frDky(-(1mXSh?WwY
z7rLi?0GA1QH0;q)LAfaD6uSrXMA6$~pb)425F1W3?9ov{g*c!P2lQq}Z-;_>c9IM6
z;ecU5{i~0FkHB~Y*xr3AXrO~bwQ&D#Lj!cSnRYT^+6fxP0_z>B4Oe)fJAj(&NX9(A
z0fS)CnScWsIumdp4k;K0lV%#Pd)*k_#jA2i&LCYANaBGplEtgKCix#$h1Fwpm+<qA
z!D3wS^2DPOIcL~I?|5R^G#t0F@eE2OrIRL4k*COI@}toqI~R51ZpLYer?;|Jmu;Jg
z{CzypW9Ye<TGeV>Zrrf+G#4;5orY}dH7w1}wb&U;bGU3&bjSfY5L4%jj5IYaZLAM9
z-`BjaJ~Xn5-7jCZa^JpyJfm*O>dx3sGiBKfJSynDSIHTQ%Ry>-Mw^38(WIRk<^$$D
z@moQUlWi4-mMqR2dNyxp;Epn7YOuSfyI1M&Gm^FyHTLQ;wcH_V+o-uEt!K%~-lT12
zQkD}_E0S?DnK9$8zRon+VB(cK$+T^nTWrU04cqGPbM@VBUQtn3-+*JM-BzwFrm7i7
z@6<_`k<}xlkyK4*vvmwJHBOJ7c8J!68->+@BhzNhowwkgCCgW>ZGE_X%jt5pV!BeL
z%<Rn?j*&3+?wp~yL;Z0lsau+14Gs|GmTPP`^mL#CcLchucuGG#U9BjeJtr9K%Os3l
z8N=P9*2%KaRTB&a)MN5CjTW=n)|jNmdxAB=yJUGQ<F}@Dx(c?kD0p{3jmupQ0U=b>
zD-8kF33TnG=PyK?lx4E21-i|6LN^6#rP3^`n*zOAVWD8IR@TTBSRd%4$C#pZJAxe!
zw#e$2@?&i7_PFDA4C@w6&#Z{l1cUvIzfRM1%`+m{7ThGOr{n==D4y0ka^#mnQCqM>
zmieP4KGU>bGhF*>XOwlShLttQ<q3A-`Es4W9-uC~Slkw&6Hr-lPz?*AnUaAn3P2n7
z!$CL<N8uAV0bj!zI0qNtSGWv+!e8(=mY{+&a2C$SIXEBdZ~-pFW%vND!L_&%qu7N#
z*o)gx$L%<T23nZI-MAM=a6dkW2k;O+k1yk4d=1~gH}P%!06)az_z`}Er|<{-5zpa8
zyo5jFFZc&ukw8MJQkpAOOEprHv{YIxMWjchPGO0ZkE?;aksiN`Xg;o-toNFbUYR(0
zYB#iP*r=?y<)i2OX)InUt7goc6=Ffuyr$)<s|n}K`(vS>bJs3<<lMD+D2??=imE&3
zRx7pL<oo*WvC){1QgsURa``kHJ#W5Jt4^Zm5Y2^)s5zM;Lp1NLudP)j3P{mxY^qh%
z5(>8fmt9d$P4pV8?pGAGlmbumt$mQ1z+zssTO*{2g^~QP4t+y+{ua){dAJ0>!)2U|
zWvF5;F2)92PR{!fZop00j+@DW`)~laawi@n7v6z8(Zwve@g8#IC-EtK8lS;u@dbPl
zU*gXEI(O!`@Cd$(@8eNCh9~e-{0u+GllTRGgWutWF)rI)?6UaNHJ44<k%;$YlRw6b
z?;SdSl6}8Dhfdkr4SI)%3O*x2mR98DveC3<+SVX(SlCohreIUfpcVX8uqlr)iRXYj
zK8KK)WMO`y6ll1X2TYuMyjqagR8h7Orx}JIM|2O(uT$=krVJ!WycBOuFh9?a3iV7Q
zJxS6KqiRt#NtZ)fobhYtnMR>(C5>VkS%%M+<k`<4X-W|$EMGGx)WSaJg<8r;-X&R+
zDx)mV8)rYqtx$@T_{B%SN5DtGN8q+aphQ>}V(0(sumAu5wzc0k%tydSfO759NHo$y
zNaXjbV?kkO?IEg<Q)PvX8x@p`(1i_Qe105H4R##QH}k~v&Q^GT*c&%0C>JL=E>2XO
W=HK+60sj8~=A54S`~UyV{{J7YA9?)%

diff --git a/paper_figures/Figure_Transcriptomics/README.md b/paper_figures/Figure_Transcriptomics/README.md
new file mode 100644
index 0000000..9c2239d
--- /dev/null
+++ b/paper_figures/Figure_Transcriptomics/README.md
@@ -0,0 +1,48 @@
+# FUS-ALS transcriptomic scores
+Scripts in this directory train and evaluate transcriptomic-based FUS-ALS vs.
+Healthy classifiers and generate figures summarizing model performance. Data for
+both spinal cord and fibroblast classifiers are derived from:
+
+- **fibroblast raw read counts:** `data/050322_RNAseq/read_counts.Rdata`
+- **spinal cord raw read counts:** `data/050322_RNAseq/nygc/read_counts.Rdata`
+
+## Preprocessing
+Spinal cord and fibroblast RNA-seq datasets are preprocessed from raw read
+counts using the `load_spine.R` and `load_fibro.R` scripts. Running these
+scripts generates the `processed_*.Rdata` files contained in 
+`data/050322_RNAseq`. The resulting files contain sparse matrices with
+both (i) raw read counts and (ii) TPM for genes that are both annotated in 
+the reactome pathway database and expressed in fibroblasts / spinal cord 
+samples.
+
+## Modeling
+Models can be fit using the `fit_*.R` scripts. The spinal cord models train ALS
+vs. healthy control classifiers on patient-derived spinal cord samples, using
+subsets of genes corresponding to reactome pathways. The fibroblast models train
+classifiers using the subset of reactome pathways with better than random
+prediction in the spinal cord data.
+
+### Parameters in modeling scripts
+#### Spinal cord model
+- `min.healthy`: minimum number of healthy controls for a site to be included in
+  analysis. I.e., data from sites with fewer than `min.healthy` HCs are dropped.
+
+- `n.null`: number of bootstrap samples to use in generating null distribution
+  on metadata features.
+
+- `min.size`, `max.size`: minimum and maximum pathway sizes included in
+  analysis.
+
+#### Fibroblast model
+- `n.rep`: number of training model replicates to fit for each hold-out cell
+  line. Upsampling for class balance, and optional bootstrap samples, result in
+  variation across model replicates.
+- `bootstrap`: T/F, should models be trained on bootstrap samples of the
+  training data. 
+- `gene_transform`: transformation applied to each feature. Default is rank
+  transform.
+
+
+## Figures
+`fig_*.R` scripts generate figures reported in the manuscript. Models must be
+trained first, using the `fit_*.R` scripts described above.
diff --git a/paper_figures/Figure_Transcriptomics/fig_age_cor.R b/paper_figures/Figure_Transcriptomics/fig_age_cor.R
new file mode 100644
index 0000000..567baeb
--- /dev/null
+++ b/paper_figures/Figure_Transcriptomics/fig_age_cor.R
@@ -0,0 +1,90 @@
+################################################################################
+#+ setup, message=FALSE
+################################################################################
+library(tidyverse)
+library(data.table)
+library(tidytext)
+library(ggrepel)
+theme_set(theme_bw(base_size = 22))
+
+save.fig <- TRUE
+age.of.onset <- TRUE
+
+# Paths to data directories
+fig.base.dir <- Sys.getenv("ALS_PAPER")
+data.base.dir <- Sys.getenv("ALS_DATA")
+figure.dir <- file.path(fig.base.dir, "paper_figures", "Figure_Transcriptomics")
+output.fig.dir <- file.path(figure.dir, "fig")
+
+source(file.path(fig.base.dir, "paper_figures", "utilities.R"))
+source(file.path(fig.base.dir, "paper_figures", "color_palette.R"))
+library.file <- file.path(fig.base.dir, "data", "Fibroblasts_Library.csv")
+
+load(file.path(output.fig.dir, "fibro_pw_models_auroc.Rdata"))
+load(file.path(output.fig.dir, "spine_auc_enrich_group.Rdata"))
+
+# Initialize top-leve pathway grouping
+groups <- setNames(pw.enriched$RootName, pw.enriched$Name)
+
+# Initialize cell line metadata table
+xcell <- fread(library.file) %>%
+  dplyr::rename(CellLine = `Cell line`) %>%
+  dplyr::rename(Age = `Age of biopsy`) %>%
+  dplyr::rename(AgeOfOnset = `Age of onset`) %>%
+  dplyr::rename(Figure = `Figure Names`) %>%
+  dplyr::select(CellLine, Age, AgeOfOnset, Figure)
+
+# Select top-performaing pathway classifiers from spinal cord data
+top.spine.pw <- group_by(top.pw, RootName) %>%
+  mutate(Count = n()) %>%
+  filter(Count >= 3) %>%
+  top_n(1, AUC) %>%
+  filter(!is.na(RootName))
+
+# Initialize enzemblized fibroblast predictions
+ypred.ens <- filter(ypred, Name %in% top.spine.pw$Name) %>%
+  filter(Genetics != "sporadic") %>%
+  group_by(CellLine, Genetics, Sex, Site) %>%
+  summarize(YpredSeq = mean(YpredSeq), Ypred = mean(Ypred)) %>%
+  ungroup()
+
+# Merge predictions with cell metadata
+predictions <- left_join(ypred.ens, xcell, by = "CellLine") %>%
+  dplyr::select(-Ypred) %>%
+  dplyr::rename(Ypred = YpredSeq) %>%
+  mutate(AgeOfOnset = ifelse(AgeOfOnset == "Control", Age, AgeOfOnset)) %>%
+  mutate(AgeOfOnset = ifelse(AgeOfOnset == "control", Age, AgeOfOnset)) %>%
+  mutate(AgeOfOnset = ifelse(AgeOfOnset == "", Age, AgeOfOnset)) %>%
+  mutate(AgeOfOnset = as.numeric(AgeOfOnset)) %>%
+  mutate(Figure = str_remove_all(Figure, "-.*$"))
+
+if (age.of.onset) predictions$Age <- predictions$AgeOfOnset
+
+# Compute age / prediction correlation by group
+xplot.text <- group_by(predictions, Genetics) %>%
+  summarize(
+    Cor = cor.test(Ypred, Age, alternative = "less")$estimate,
+    CorP = cor.test(Ypred, Age, alternative = "less")$p.value
+  ) %>%
+  mutate(CorP = p.adjust(CorP, method = "bonferroni")) %>%
+  mutate(Cor = round(Cor, 3)) %>%
+  mutate(CorP = round(CorP, 3)) %>%
+  mutate(PLab = sapply(CorP, label_pval)) %>%
+  mutate(Ypred = ifelse(Genetics == "Healthy", 0.95, 0.85)) %>%
+  mutate(text = str_c("Cor = ", Cor, PLab))
+
+# Plot predictions v. age, FUS v. Healthy
+fout <- file.path(output.fig.dir, "fig_supp_rna_aob.pdf")
+if (save.fig) pdf(file = fout, h = 8, w = 8)
+xplot.text$Age <- 60
+filter(predictions, Genetics %in% c("Healthy", "FUS-ALS")) %>%
+  ggplot(aes(x = Age, y = Ypred, col = Genetics)) +
+  geom_point(size = 3) +
+  geom_line(stat = "smooth", method = "lm", formula = y ~ x, size = 2) +
+  geom_text(data = filter(xplot.text, Genetics != "sporadic"), aes(label = text), size = 8) +
+  geom_text_repel(aes(label = CellLine)) +
+  scale_color_manual(values = col.pal) +
+  ylim(0:1) +
+  ylab("t-MAP score") +
+  theme(legend.position = "none")
+if (save.fig) dev.off()
diff --git a/paper_figures/Figure_Transcriptomics/fig_fibro.R b/paper_figures/Figure_Transcriptomics/fig_fibro.R
new file mode 100644
index 0000000..b47a9be
--- /dev/null
+++ b/paper_figures/Figure_Transcriptomics/fig_fibro.R
@@ -0,0 +1,332 @@
+######################################################## i#######################
+#+ setup, echo=FALSE, warning=FALSE, message=FALSE
+################################################################################
+library(data.table)
+library(tidyverse)
+library(ggsci)
+library(Matrix)
+library(PRROC)
+library(ggrepel)
+library(org.Hs.eg.db)
+library(RColorBrewer)
+
+options(stringsAsFactors = FALSE)
+
+auroc <- function(yp, yt) roc.curve(scores.class0 = yp, weights.class0 = yt)$auc
+
+# Paths to data directories
+fig.base.dir <- Sys.getenv("ALS_PAPER")
+data.dir <- file.path(fig.base.dir, "data/050322_RNAseq/")
+figure.dir <- file.path(fig.base.dir, "paper_figures", "Figure_Transcriptomics")
+pathway.file <- file.path(data.dir, "reactome_pathway.Rdata")
+gene.file <- file.path(fig.base.dir, "data", "transcriptomic_genes.csv")
+
+output.fig.dir <- file.path(figure.dir, "fig")
+dir.create(output.fig.dir, showWarnings = FALSE)
+
+source(file.path(figure.dir, "utilities.R"))
+source(file.path(fig.base.dir, "paper_figures", "utilities.R"))
+source(file.path(fig.base.dir, "paper_figures", "color_palette.R"))
+source(file.path(fig.base.dir, "paper_figures", "theme.R"))
+theme_set(theme_als())
+
+# Parameters for analysis
+heat.pal <- pal_material("blue-grey")(10)
+save.fig <- TRUE
+
+################################################################################
+#+ load_data
+################################################################################
+load(file.path(output.fig.dir, "fibro_models.Rdata"))
+load(file.path(output.fig.dir, "spine_models.Rdata"))
+load(pathway.file)
+
+xgene <- fread(gene.file) %>%
+  mutate(Disease = ifelse(!Disease %in% c("ALS", ""), "ND", Disease))
+
+# Initialize pathway groups
+groups <- setNames(pw.enriched$RootName, pw.enriched$Name)
+
+# Initialize keys for converting between ENSEMBL / Symbol gene IDs
+genes <- rbindlist(coefs)$Var1 %>%
+  as.character() %>%
+  str_subset("^ENS")
+
+gene.key.se <- mapIds(
+  x = org.Hs.eg.db,
+  keys = genes,
+  column = "SYMBOL",
+  keytype = "ENSEMBL"
+)
+
+# Select top-performing pathway classifiers from spinal cord data
+top.spine.pw <- filter(pw.enriched, RootName != "Other") %>%
+  group_by(RootName) %>%
+  top_n(1, AUC)
+
+# Compute fibroblast AUROC for pathways / groups
+auc.fibro <- filter(ypred, Genetics != "sporadic") %>%
+  group_by(Name) %>%
+  summarize(AUC = auroc(YpredSeq, ALS)) %>%
+  arrange(desc(AUC))
+
+auc.fibro.group <- mutate(auc.fibro, Group = groups[Name]) %>%
+  group_by(Group) %>%
+  summarize(AUC = max(AUC), Name = Name[which.max(AUC)]) %>%
+  arrange(AUC)
+
+# Generate pathway enrichment table
+xtab <- left_join(pw.enriched, auc.fibro, by='Name') %>%
+  dplyr::rename(AUROC_Spine=AUC.x) %>%
+  dplyr::rename(AUROC_Fibroblast=AUC.y) %>%
+  group_by(RootName) %>%
+  arrange(desc(AUROC_Fibroblast), .by_group = TRUE) %>%
+  ungroup() %>%
+  dplyr::select(Name, RootName, Size, AUROC_Fibroblast, AUROC_Spine)
+
+fout <- file.path(output.fig.dir, 'pw_enrichment_table.csv')
+write.csv(file = fout, xtab, row.names = FALSE)
+
+################################################################################
+#' ## Distribution of predictions by pathway
+#+ pw_dist, fig.height=12, fig.width=12
+################################################################################
+ypred.ens <- filter(ypred, Name %in% top.spine.pw$Name) %>%
+  filter(Genetics != "sporadic") %>%
+  group_by(CellLine, Genetics, Sex, Site) %>%
+  summarize(YpredSeq = mean(YpredSeq), Ypred = mean(Ypred))
+
+fit <- lm(YpredSeq ~ Genetics + Sex + Site, data = ypred.ens)
+pval.lm <- summary(fit)$coefficients["GeneticsHealthy", "Pr(>|t|)"]
+print(pval.lm)
+
+yfus <- ypred.ens$YpredSeq[ypred.ens$Genetics == "FUS-ALS"]
+ywt <- ypred.ens$YpredSeq[ypred.ens$Genetics == "Healthy"]
+pval.sr <- wilcox.test(yfus, ywt, alternative = "greater")$p.value
+print(pval.sr)
+
+pval.lab <- label_pval(pval.sr)
+ymax <- max(ypred.ens$Ypred) + 1e-2
+
+fout <- file.path(output.fig.dir, "predictions_by_genetics.pdf")
+if (save.fig) pdf(fout, height = 8, width = 4)
+ggplot(ypred.ens, aes(x = Genetics, y = YpredSeq, fill = Genetics)) +
+  geom_boxplot(outlier.shape=NA) +
+  geom_jitter(size = 3, width = 0.2, alpha = 0.6) +
+  geom_text(x = 1.5, y = ymax + 5e-2, label = pval.lab, size = 8) +
+  geom_segment(x = 1, xend = 2, y = ymax, yend = ymax, col = "#0B4668", linewidth = 1) +
+  scale_fill_manual(values = col.pal) +
+  ylab("t-MAP score") +
+  xlab(NULL) +
+  theme(legend.position = "none") +
+  ylim(0:1)
+if (save.fig) dev.off()
+
+################################################################################
+#' ## Imaging vs. sequencing predictions for select pathway
+#+ pw_predictions, fig.height=12, fig.width=12
+################################################################################
+xthr <- filter(ypred.ens, Genetics == "Healthy") %>% arrange(desc(Ypred))
+xthr <- xthr$Ypred[1]
+
+ythr <- filter(ypred.ens, Genetics == "Healthy") %>% arrange(desc(YpredSeq))
+ythr <- ythr$YpredSeq[1]
+
+fout <- file.path(output.fig.dir, "pathway_predictions.pdf")
+if (save.fig) pdf(fout, height = 10, width = 10)
+ypred.ens %>%
+  ggplot(aes(x = Ypred, y = YpredSeq, col = Genetics)) +
+  geom_vline(xintercept = xthr, col = "grey", lty = 2, linewidth = 1.5) +
+  geom_hline(yintercept = ythr, col = "grey", lty = 2, linewidth = 1.5) +
+  geom_point(size = 4) +
+  geom_text_repel(aes(label = CellLine), size = 6) +
+  scale_color_manual(values = col.pal) +
+  theme(legend.position = "none") +
+  xlab("i-MAP score") +
+  ylab("t-MAP score") +
+  xlim(0:1) +
+  ylim(0:1)
+if (save.fig) dev.off()
+
+
+# Plot ensemblized predictions with sporadics
+ypred.ens.full <- filter(ypred, Name %in% top.spine.pw$Name) %>%
+  group_by(CellLine, Genetics, Sex, Site) %>%
+  summarize(YpredSeq = mean(YpredSeq), Ypred = mean(Ypred))
+
+fout <- file.path(output.fig.dir, "pathway_predictions_full.pdf")
+if (save.fig) pdf(fout, height = 10, width = 10)
+ypred.ens.full %>%
+  ggplot(aes(x = Ypred, y = YpredSeq, col = Genetics)) +
+  geom_vline(xintercept = xthr, col = "grey", lty = 2, linewidth = 1.5) +
+  geom_hline(yintercept = ythr, col = "grey", lty = 2, linewidth = 1.5) +
+  geom_point(size = 4) +
+  geom_text_repel(aes(label = CellLine), size = 6) +
+  scale_color_manual(values = col.pal) +
+  theme(legend.position = "none") +
+  xlab("i-MAP score") +
+  ylab("t-MAP score") +
+  xlim(0:1) +
+  ylim(0:1)
+if (save.fig) dev.off()
+
+################################################################################
+#+ ## imaging and pathway predictions by cell line
+#+ predictions_by_celline
+################################################################################
+xplot <- group_by(ypred, Name) %>%
+  mutate(SeqThreshold = max(YpredSeq[ALS == 0])) %>%
+  mutate(Threshold = max(Ypred[ALS == 0])) %>%
+  ungroup() %>%
+  mutate(SepSeq = YpredSeq > SeqThreshold) %>%
+  mutate(Sep = Ypred > Threshold) %>%
+  mutate(Group = groups[Name]) %>%
+  group_by(Group, CellLine, Genetics) %>%
+  summarize(Sep = any(Sep), SepSeq = any(SepSeq))
+
+xplot.seq <- filter(xplot, SepSeq) %>%
+  dplyr::select(CellLine, Group, Genetics)
+
+xplot.img <- filter(xplot, Sep) %>%
+  dplyr::select(CellLine, Group, Genetics) %>%
+  mutate(Group = "Imaging")
+
+cell.order <- dplyr::select(ypred, CellLine, Ypred, ALS) %>%
+  distinct() %>%
+  arrange(desc(Ypred))
+
+fout <- file.path(output.fig.dir, "cell_line_sep.pdf")
+if (save.fig) pdf(fout, height = 8, width = 10)
+rbind(xplot.seq, xplot.img) %>%
+  filter(Genetics != "sporadic") %>%
+  mutate(Group = factor(Group, levels = c(auc.fibro.group$Group, "Imaging"))) %>%
+  mutate(CellLine = factor(CellLine, levels = cell.order$CellLine)) %>%
+  ggplot(aes(x = CellLine, y = Group)) +
+  geom_point(size = 8) +
+  theme(axis.text.x = element_text(color = col.pal["FUS-ALS"])) +
+  geom_hline(yintercept = 8.5, lty = 2, col = "#363636") +
+  xlab(NULL) +
+  ylab(NULL) +
+  theme(legend.position = "none")
+if (save.fig) dev.off()
+
+################################################################################
+#' ## Number of cell lines classified by modality
+#+ num_cell_line, fig.height=12, fig.width=12
+################################################################################
+xplot <- auc.fibro.group %>% 
+  mutate(Group = factor(Group, levels = Group)) %>%
+  filter(Group != "Other")
+
+fout <- file.path(output.fig.dir, "num_classified_auc.pdf")
+if (save.fig) pdf(fout, height = 8, width = 2)
+ggplot(xplot, aes(x = Group, y = AUC)) +
+  geom_point(size = 6, color = "#0B4668") +
+  geom_segment(aes(xend = Group, yend = 0.7), lwd = 3, color = "#0B4668") +
+  coord_flip() +
+  theme(legend.position = "none") +
+  xlab(NULL) +
+  theme(axis.text.y = element_blank()) +
+  ylab("AUROC") +
+  scale_y_continuous(breaks = c(0.7, 1), limits = c(0.7, 1))
+if (save.fig) dev.off()
+
+################################################################################
+#' ## Minimal gene set
+#' We sought to identify minimal gene sets cable of predicting ALS vs. health
+#' within each PW group. This strategy was analogous to imaging: (i) define a
+#' refined, pathway focused assay (ii) train classifier (iii) predict hold-out.
+#'
+#' As with imaging, we focused on sparse (minimal) set of readouts for
+#' prediction. Focusing on spinal cord enriched PWs to prioritize ALS relevant
+#' pathways.
+#+ coeffs, fig.height=8, fig.width=12
+################################################################################
+#' ## Minimal gene set
+coef.select.full <- rbindlist(coefs) %>%
+  filter(str_detect(Var1, "^ENSG")) %>%
+  mutate(Group = groups[Name]) %>%
+  filter(Group != "Other") %>%
+  group_by(Var1, Group, Name) %>%
+  summarize(Stability = mean(AbsBeta != 0), AbsBeta = mean(AbsBeta)) %>%
+  mutate(Var1 = as.character(Var1)) %>%
+  group_by(Group, Var1) %>%
+  summarize(Stability = max(Stability), AbsBeta = max(AbsBeta)) %>%
+  mutate(SYMBOL = gene.key.se[Var1]) %>%
+  filter(Stability >= 0.5)
+
+# Initialize stability score / coefficient matrices
+coef.select <- group_by(coef.select.full, Group) %>%
+  top_n(10, Stability) %>%
+  mutate(Group = factor(Group, levels = auc.fibro.group$Group))
+
+xgene <- dplyr::rename(xgene, SYMBOL = Gene)[-14, ]
+xgene <- mutate(xgene, Disease = factor(Disease, levels = c("ALS", "ND", "")))
+
+# Plot gene fingerprints by pathway
+gene.order <- left_join(coef.select, xgene, by = "SYMBOL") %>%
+  group_by(SYMBOL) %>%
+  summarize(Stability = max(Stability)) %>%
+  arrange(desc(Stability))
+
+fout <- file.path(output.fig.dir, "gene_fingerprint.pdf")
+if (save.fig) pdf(fout, height = 8, width = 24)
+mutate(coef.select, SYMBOL = factor(SYMBOL, levels = gene.order$SYMBOL)) %>%
+  ggplot(aes(x = SYMBOL, y = Group)) +
+  geom_point(aes(col = Stability), size=8) +
+  geom_point(shape = 1, size=8) +
+  scale_color_material("blue-grey", breaks=c(0.5, 1), limits=c(0.5, 1)) +
+  theme(axis.text.x = element_text(angle = 90)) +
+  xlab(NULL) +
+  ylab(NULL) +
+  theme(legend.position = "bottom") +
+  theme(legend.title = element_blank()) +
+  theme(legend.key.width = unit(2, "cm"))
+if (save.fig) dev.off()
+
+
+fout <- file.path(output.fig.dir, "gene_refs.pdf")
+if (save.fig) pdf(fout, height = 3, width = 24)
+mutate(xgene, SYMBOL = factor(SYMBOL, levels = gene.order$SYMBOL)) %>%
+  ggplot(aes(x = SYMBOL, y = 1, shape = Disease)) +
+  geom_point(size = 8) +
+  theme(axis.text.x = element_text(angle = 90)) +
+  theme(axis.text.y = element_blank()) +
+  theme(panel.grid.minor.y = element_blank()) +
+  theme(panel.grid.major.y = element_blank()) +
+  scale_shape_manual(guide = "none", values = c(8, 10, 32)) +
+  xlab(NULL) +
+  ylab(NULL) +
+  ylim(0.95, 1.05)
+if (save.fig) dev.off()
+
+# Plot gene fingerprints aggregated
+fout <- file.path(output.fig.dir, "gene_fingerprint_agg.pdf")
+if (save.fig) pdf(fout, height = 7, width = 20)
+left_join(gene.order, xgene, by = "SYMBOL") %>%
+  mutate(Disease = as.character(Disease)) %>%
+  mutate(Disease = ifelse(Disease == "", "No reported ND", Disease)) %>%
+  mutate(Disease = ifelse(is.na(Disease), "No reported ND", Disease)) %>%
+  mutate(SYMBOL = factor(SYMBOL, levels = rev(gene.order$SYMBOL))) %>%
+  ggplot(aes(y = SYMBOL, x = Stability, col = Disease)) +
+  geom_point(size=5) +
+  geom_segment(aes(yend=SYMBOL), xend = 0.5, linewidth = 3) +
+  scale_color_jama() +
+  theme(axis.text.x = element_text(angle = 90)) +
+  xlab('Stability score') +
+  ylab(NULL) +
+  labs(col=NULL) +
+  theme(legend.position = c(0.1, 0.85)) +
+  theme(legend.text = element_text(size = 18)) +
+  coord_flip()
+if (save.fig) dev.off()
+
+# Write gene fingerprint table
+xtab <- left_join(coef.select.full, xgene, by = "SYMBOL") %>%
+  dplyr::rename(RootName = Group) %>%
+  group_by(RootName) %>%
+  arrange(desc(Stability), .by_group = TRUE) %>%
+  dplyr::select(RootName, SYMBOL, Stability, `Protein name`, Function, Disease, Reference, `Author, Year`)
+
+fout <- file.path(output.fig.dir, 'gene_enrichment_table.csv')
+write.csv(file = fout, xtab, row.names = FALSE)
\ No newline at end of file
diff --git a/paper_figures/Figure_Transcriptomics/fig_spine.R b/paper_figures/Figure_Transcriptomics/fig_spine.R
new file mode 100644
index 0000000..0198f36
--- /dev/null
+++ b/paper_figures/Figure_Transcriptomics/fig_spine.R
@@ -0,0 +1,119 @@
+################################################################################
+#+ setup, echo=FALSE, warning=FALSE, message=FALSE
+################################################################################
+library(data.table)
+library(tidyverse)
+library(Matrix)
+library(ggsci)
+library(PRROC)
+library(ggh4x)
+library(parallel)
+library(RColorBrewer)
+options(stringsAsFactors = FALSE)
+theme_set(theme_bw(base_size = 22))
+
+# Paths to data directories
+fig.base.dir <- Sys.getenv("ALS_PAPER")
+data.dir <- file.path(fig.base.dir, "data/050322_RNAseq/")
+figure.dir <- file.path(fig.base.dir, "paper_figures", "Figure_Transcriptomics")
+pathway.file <- file.path(data.dir, "reactome_pathway.Rdata")
+hierarchy.file <- file.path(fig.base.dir, "data", "reactome_hierarchy.Rdata")
+output.fig.dir <- file.path(figure.dir, "fig")
+
+disc.col.pal <- pal_npg()(10)[-8]
+save.fig <- FALSE
+n.null <- 100000
+p.thresh <- 0
+
+load(file.path(output.fig.dir, "spine_models.Rdata"))
+load(hierarchy.file)
+
+source(file.path(fig.base.dir, "paper_figures", "color_palette.R"))
+
+clean_name <- function(x, n = 10) {
+  # Prettify pathway names for figure axes
+  x <- str_split(x, " ")[[1]]
+  #if (length(x) > 12) n <- 6
+  nbreak <- floor(length(x) / n)
+  
+  # adjust n for roughly equal words per line
+  n <- ceiling(length(x) / (nbreak + 1))
+  
+  if (nbreak > 0) {
+    for (i in 1:nbreak) {
+      if ((i * n) > length(x)) break
+      x[i * n] <- str_c(x[i * n], "\n")
+    }
+  }
+  
+  return(str_c(x, collapse = " "))
+}
+
+################################################################################
+#+ pathways, fig.height=12, fig.width=16
+# Initialize top-level pathways in hierarchy
+################################################################################
+root.nodes <- filter(nodes, Top)$id
+root.children <- lapply(root.nodes, get_children, nodes = nodes, edges = edges)
+root.children <- mapply(function(r, ch) c(r, ch), root.nodes, root.children, SIMPLIFY = FALSE)
+
+root.children <- reshape2::melt(root.children) %>%
+  dplyr::rename(Root = L1) %>%
+  dplyr::rename(Depth = L2) %>%
+  dplyr::rename(id = value) %>%
+  left_join(nodes, by = "id") %>%
+  dplyr::rename(Name = label)
+
+root.key <- filter(root.children, Top) %>%
+  dplyr::select(Root, Name) %>%
+  dplyr::rename(RootName = Name)
+
+pw.enriched <- filter(pw.aucs, p <= p.thresh) %>%
+  left_join(root.children, by = "Name") %>%
+  left_join(root.key, by = "Root") %>%
+  group_by(RootName) %>%
+  mutate(Count = n()) %>%
+  mutate(RootName = ifelse(Count < 3, "Other", RootName)) %>%
+  ungroup() %>%
+  filter(!(Name == "Retinoid metabolism and transport" & RootName == "Other"))
+
+# Plot pathways with enrichment
+limits <- c(0.25, 1)
+breaks <- seq(0.25, 1, by=0.25)
+
+p1 <- filter(pw.enriched, p == 0) %>%
+  mutate(Name=str_sub(Name, 1, 50)) %>%
+  ggplot(aes(x=AUC, y=RootName)) +
+  #geom_boxplot() +
+  geom_jitter(height=0.25, width=0) +
+  ylab(NULL) +
+  scale_x_continuous(limits=limits, breaks=breaks) +
+  xlab(NULL) +
+  theme(axis.text.x = element_blank()) +
+  labs(fill=NULL)
+
+p2 <- ggplot(null.accuracy) +
+  geom_histogram(aes(x=AUC)) +
+  scale_x_continuous(limits=limits, breaks=breaks) +
+  ylab(NULL) +
+  xlab('AUROC')
+
+# build the plots 
+p1.common <- ggplot_gtable(ggplot_build(p1))
+p2.common <- ggplot_gtable(ggplot_build(p2))
+p2.common$widths <- p1.common$widths
+
+fout <- file.path(output.fig.dir, 'spine_auroc.pdf')
+pdf(file=fout, height=12, width=8)
+grid.arrange(p1.common, p2.common, nrow=2, heights=c(5, 1))
+dev.off()
+
+
+# Save Rdata object
+fout <- file.path(output.fig.dir, "spine_models.Rdata")
+save(file = fout, pw.aucs, pw.models, null.pred, null.accuracy, pw.enriched)
+
+# Save table of pathway enrichment
+fout <- file.path(output.fig.dir, "spine_enrichment_table.csv")
+xtab <- dplyr::select(pw.enriched, Name, RootName, Size, AUC, p)
+write.csv(file = fout, xtab, row.names=FALSE)
diff --git a/paper_figures/Figure_Transcriptomics/fit_fibro.R b/paper_figures/Figure_Transcriptomics/fit_fibro.R
new file mode 100644
index 0000000..731c5c4
--- /dev/null
+++ b/paper_figures/Figure_Transcriptomics/fit_fibro.R
@@ -0,0 +1,101 @@
+################################################################################
+#+ setup, echo=FALSE, warning=FALSE, message=FALSE
+################################################################################
+library(data.table)
+library(tidyverse)
+library(Matrix)
+library(mltools)
+library(ggrepel)
+library(parallel)
+
+options(stringsAsFactors = FALSE)
+theme_set(theme_bw(base_size = 22))
+
+# Paths to data directories
+fig.base.dir <- Sys.getenv("ALS_PAPER")
+data.dir <- file.path(fig.base.dir, "data/050322_RNAseq/")
+figure.dir <- file.path(fig.base.dir, "paper_figures", "Figure_Transcriptomics")
+library.file <- file.path(fig.base.dir, "data", "Fibroblasts_Library.csv")
+pathway.file <- file.path(data.dir, "reactome_pathway.Rdata")
+
+output.fig.dir <- file.path(figure.dir, "fig")
+dir.create(output.fig.dir, showWarnings = FALSE)
+source(file.path(figure.dir, "utilities.R"))
+
+# Parameters for analysis
+n.core <- 12
+nrep <- 10
+bootstrap <- TRUE
+
+gene_transform <- function(x) rank(x)
+model <- fit_lm_classification
+model_predict <- predict_lm_classification
+
+################################################################################
+#' ## Data preprocessing
+#+ load_data, echo=FALSE, warning=FALSE, message=FALSE
+################################################################################
+load(file.path(data.dir, "fibro_processed.Rdata"))
+load(file.path(output.fig.dir, "spine_models.Rdata"))
+load(file.path(output.fig.dir, "genes.Rdata"))
+load(pathway.file)
+
+colnames(xmeta.fibro) <- str_replace_all(colnames(xmeta.fibro), " ", "_")
+
+x <- x.fibro[, shared.genes]
+xmeta <- mutate(xmeta.fibro, Y = as.numeric(ALS))
+rm(x.fibro, xmeta.fibro)
+
+# Initialize one-hot encoding of metadata features
+xonehot <- dplyr::select(xmeta, Site, Sex) %>%
+  mutate_all(as.factor) %>%
+  as.data.table() %>%
+  mltools::one_hot()
+
+# Normalize features for modeling
+x <- apply(x, MAR = 2, gene_transform)
+
+# Load in cell line library metadata
+xcell <- fread(library.file) %>%
+  dplyr::rename(CellLine = `Cell line`) %>%
+  dplyr::rename(Fig = `Figure Names`)
+
+################################################################################
+#' ## Modeling
+#' Transcription model of ALS. We fit supervised models trained to predict
+#' disease status. We use a leave-one-out sample split to evaluate predictions
+#' on held-out WT & FUS cell lines. Genes used as features in classification
+#' models are taken from pathways enriched in patient-derived spinal cord
+#' samples.
+#+ models, fig.height=8, fig.width=12, cache=TRUE
+################################################################################
+pw.genes <- pw.enriched$ENSEMBL
+pathways <- pw.enriched$Name
+
+fit <- mclapply(pw.genes, function(g) {
+  fit_genes(x, xmeta, xonehot, g, model, model_predict, nrep, bootstrap = bootstrap)
+}, mc.cores = n.core)
+
+ypred <- lapply(1:length(fit), function(i) {
+  return(mutate(fit[[i]]$Ypred, Name = pathways[i]))
+})
+
+ypred <- rbindlist(ypred) %>%
+  group_by(CellLine, Genetics, Name, Site, Sex) %>%
+  summarize(YpredSeq = mean(YpredSeq), Ypred = mean(Ypred), .groups = "drop") %>%
+  mutate(ALS = as.numeric(Genetics != "Healthy"))
+
+coefs <- lapply(1:length(fit), function(i) {
+  coef.i <- reshape2::melt(fit[[i]]$Coef) %>%
+    mutate(Name = pathways[i]) %>%
+    dplyr::rename(CellIdx = L1) %>%
+    filter(Var1 != "Intercept") %>%
+    mutate(AbsBeta = abs(value)) %>%
+    mutate(SignBeta = sign(value)) %>%
+    dplyr::select(Name, Var1, CellIdx, AbsBeta, SignBeta)
+
+  return(coef.i)
+})
+
+fout <- file.path(output.fig.dir, "fibro_models.Rdata")
+save(file = fout, fit, ypred, coefs)
diff --git a/paper_figures/Figure_Transcriptomics/fit_spine.R b/paper_figures/Figure_Transcriptomics/fit_spine.R
new file mode 100644
index 0000000..964ddb3
--- /dev/null
+++ b/paper_figures/Figure_Transcriptomics/fit_spine.R
@@ -0,0 +1,133 @@
+###############################################################################
+#+ setup, echo=FALSE, warning=FALSE, message=FALSE
+################################################################################
+library(data.table)
+library(tidyverse)
+library(Matrix)
+library(caret)
+library(iRF)
+library(PRROC)
+library(parallel)
+
+options(stringsAsFactors = FALSE)
+theme_set(theme_bw(base_size = 22))
+
+# Paths to data directories
+fig.base.dir <- Sys.getenv("ALS_PAPER")
+data.dir <- file.path(fig.base.dir, "data/050322_RNAseq/")
+figure.dir <- file.path(fig.base.dir, "paper_figures", "Figure_Transcriptomics")
+output.fig.dir <- file.path(figure.dir, "fig")
+
+n.core <- 12
+train.prop <- 0.5 # training proportion
+min.healthy <- 3 # minimum number of healthy samples to include site in analysis
+n.null <- 100000 # number of bootstrap subsamples for null distribution
+min.size <- 5 # minimum pathway size included in analysis
+max.size <- 500 # maximmum pathway size included in analysis
+
+source(file.path(figure.dir, "utilities.R"))
+load(file.path(data.dir, "spine_processed.Rdata"))
+load(file.path(output.fig.dir, "genes.Rdata"))
+load(file.path(data.dir, "reactome_pathway.Rdata"))
+
+auc <- function(x) {
+  # Compute AUROC from data frame with columns `Ypred`, `Ytrue`
+  roc.curve(scores.class0 = x$Ypred, weights.class0 = x$Ytrue)$auc
+}
+
+pw_predict <- function(x, y, id.train, genes) {
+  # Fit ALS vs. health classification models on subset of genes
+  fselect <- str_c(genes, collapse = "|")
+  fselect <- str_c("(", fselect, "|Sex|^Site$|^Age$|pct_)")
+  x <- dplyr::select(x, matches(fselect))
+  return(fit_sample_split(x, y, id.train))
+}
+
+################################################################################
+#' ## Load data
+#' Below Initialize spinal cord data sets used in classification models. We
+#' include sample metadata features: `Sex`, `Site` (of sample collection),
+#' `Age`, and `pct_x` (x corresponding to ancestry computed from WGS data).
+#+ modeling
+################################################################################
+xmeta <- dplyr::select(xmeta.spine, matches("(^Sex$|^Site$|^Age$|pct_)"))
+x.spine <- x.spine[, shared.genes]
+
+x <- cbind(x.spine, xmeta)
+y <- as.factor(xmeta.spine$ALS)
+
+# Filter to sites with both ALS and healthy control samples
+sites <- group_by(xmeta.spine, Site) %>% summarize(NHealthy = sum(!ALS))
+dplyr::select(xmeta.spine, Site, ALS) %>% table()
+
+genes <- colnames(x.spine)
+id.drop <- xmeta$Site %in% sites$Site[sites$NHealthy < min.healthy]
+
+x <- x[!id.drop, ]
+y <- y[!id.drop]
+xmeta <- xmeta[!id.drop, ]
+x.spine <- x.spine[!id.drop, ]
+
+# Take balanced sample within site
+set.seed(47)
+
+xtrain <- data.frame(x, Y = y) %>%
+  mutate(Idx = 1:n()) %>%
+  group_by(Site, Y) %>%
+  sample_frac(train.prop)
+
+id.train <- xtrain$Idx
+id.test <- setdiff(1:nrow(x), id.train)
+ytrue <- as.numeric(y) - 1
+
+################################################################################
+#' ## Null model
+#' Performance of a null model is estimated
+#+ null_model
+################################################################################
+bs.train <- replicate(n.null, sample(id.train, replace = TRUE), simplify = FALSE)
+bs.test <- replicate(n.null, sample(id.test, replace = TRUE), simplify = FALSE)
+
+fit.null <- mcmapply(function(trn, tst) {
+  fit <- fit_sample_split(xmeta, y, trn, tst)
+  ypred <- data.frame(Ypred = fit$ypred, Ytrue = ytrue[fit$id])
+  acc <- data.frame(AUC = auc(ypred))
+  return(list(ypred = ypred, acc = acc))
+}, bs.train, bs.test, mc.cores = n.core, SIMPLIFY = FALSE)
+
+null.accuracy <- rbindlist(lapply(fit.null, function(z) z$acc))
+null.pred <- lapply(fit.null, function(z) z$ypred)
+
+################################################################################
+#' ## Pathway models
+#' For each reactome pathway, we fit ALS vs. healthy classifiers using the
+#' subset of genes associated with that pathway.
+#+ pw_models
+################################################################################
+reactome.pathways <- reactome.pathways %>%
+  mutate(ENSEMBL = sapply(ENSEMBL, intersect, y = genes)) %>%
+  mutate(Size = sapply(ENSEMBL, length)) %>%
+  filter(Size > min.size & Size <= max.size) %>%
+  dplyr::select(-Genes, -ID)
+
+# Fit classification models for each reactome pathway
+pw.models <- mclapply(reactome.pathways$Name, function(pw) {
+  genes <- filter(reactome.pathways, Name == pw)$ENSEMBL
+  genes <- na.omit(unlist(genes))
+  fit.g <- pw_predict(x, y, id.train, genes)
+  ypred <- data.frame(Ypred = fit.g$ypred, Ytrue = ytrue[fit.g$id])
+
+  return(list(
+    acc = data.frame(AUC = auc(ypred), Name = pw),
+    ypred = ypred,
+    importance = fit.g$importance
+  ))
+}, mc.cores = n.core)
+
+pw.aucs <- rbindlist(lapply(pw.models, function(z) z$acc)) %>%
+  arrange(desc(AUC)) %>%
+  left_join(reactome.pathways, by = "Name") %>%
+  mutate(p = sapply(AUC, function(z) mean(null.accuracy$AUC > z)))
+
+fout <- file.path(output.fig.dir, "spine_models.Rdata")
+save(file = fout, pw.aucs, pw.models, null.pred, null.accuracy)
diff --git a/paper_figures/Figure_Transcriptomics/load_fibroblast.R b/paper_figures/Figure_Transcriptomics/load_fibroblast.R
index 10eb4b1..71b3170 100644
--- a/paper_figures/Figure_Transcriptomics/load_fibroblast.R
+++ b/paper_figures/Figure_Transcriptomics/load_fibroblast.R
@@ -1,74 +1,65 @@
-#' The following loads RNA-seq data for patient derived fibroblast samples.
-#' 
-#' Requires input datasets
-#'   `data/050322_RNAseq`
-#'   `data/reactome_pathway.Rdata`
-#'   `scripts/Figure_scores/models/predictions.Rdata`
-#' 
-#' **Note:** RNA-seq read count datasets can be processed from raw fastq files 
-#' using the `als_rnaseq_fibro` pipeline.
-#' Karl Kumbier 10/19/2022library(data.table)
-library(tidyverse)
+################################################################################
+#+ setup
+################################################################################
 library(Matrix)
-library(clusterProfiler)
+library(tidyverse)
+library(data.table)
+library(DESeq2)
 
-options(stringsAsFactors=FALSE)
+options(stringsAsFactors = FALSE)
 
-################################################################################
-# Load data and initialize analysis paramters
-################################################################################
 # Paths to data directories
-fig.base.dir <- Sys.getenv('ALS_PAPER')
-data.dir <- file.path(fig.base.dir, 'data/050322_RNAseq/')
+fig.base.dir <- Sys.getenv("ALS_PAPER")
+data.dir <- file.path(fig.base.dir, "data/050322_RNAseq/")
 
-pred.dir <- file.path(fig.base.dir, 'paper_figures', 'Figure_Scores', 'fig_full') 
-pathway.file <- str_c(data.dir, 'reactome_pathway.Rdata')
-library.file <- file.path(fig.base.dir, 'data', 'Fibroblasts_Library.csv')
-qc.file <- file.path(fig.base.dir, 'data', 'data Table 2.1 Sample Sequencing Statistics.csv')
+pred.dir <- file.path(fig.base.dir, "paper_figures", "Figure_Scores", "irf", "fig_full")
+pathway.file <- str_c(data.dir, "reactome_pathway.Rdata")
+library.file <- file.path(fig.base.dir, "data", "Fibroblasts_Library.csv")
+qc.file <- file.path(fig.base.dir, "data", "data Table 2.1 Sample Sequencing Statistics.csv")
 
-load(file.path(data.dir, 'read_counts.Rdata'))
-load(file.path(pred.dir, 'predictions.Rdata'))
-xqc <- fread(qc.file) %>% dplyr::rename(CellLine=`Sample ID`)
+load(file.path(data.dir, "read_counts.Rdata"))
+load(file.path(pred.dir, "predictions.Rdata"))
+xqc <- fread(qc.file) %>% dplyr::rename(CellLine = `Sample ID`)
 
 # Source utility functions
-source(file.path(fig.base.dir, 'paper_figures', 'utilities.R'))
+source(file.path(fig.base.dir, "paper_figures", "utilities.R"))
 expression.threshold <- 2.5 # minimum log TPM + 1 level for gene filtering
 
 ################################################################################
 # Initialize cell line metadata
 ################################################################################
 # Read in cell line meta data
-cell.lines <- str_remove_all(bam.files, '^.*/') %>% str_remove_all('Aligned.*$')
+cell.lines <- str_remove_all(bam.files, "^.*/") %>% str_remove_all("Aligned.*$")
 
 xcell <- fread(library.file) %>%
-  dplyr::rename(CellLine=`Cell line`) %>%
-  dplyr::rename(Genetics=`Disease Gene`)
-
-xmeta <- data.frame(CellLine=cell.lines) %>% 
-  left_join(xcell, by='CellLine') %>%
-  mutate(Genetics=str_replace_all(Genetics, 'ALS with no known mutation', 'sporadic')) %>% 
-  mutate(Genetics=str_replace_all(Genetics, 'WT', 'Healthy')) %>%
-  mutate(Genetics=str_replace_all(Genetics, 'FUS', 'FUS-ALS')) %>%
-  mutate(Sex=ifelse(Sex == 'm', 'M', Sex)) %>%
-  mutate(Site=`Origin Institution`) %>%
-  mutate(Institution=Site) %>%
-  left_join(xqc, by='CellLine')
+  dplyr::rename(CellLine = `Cell line`) %>%
+  dplyr::rename(Genetics = `Disease Gene`)
+
+xmeta <- data.frame(CellLine = cell.lines) %>%
+  left_join(xcell, by = "CellLine") %>%
+  mutate(Genetics = str_replace_all(Genetics, "ALS with no known mutation", "sporadic")) %>%
+  mutate(Genetics = str_replace_all(Genetics, "WT", "Healthy")) %>%
+  mutate(Genetics = str_replace_all(Genetics, "FUS", "FUS-ALS")) %>%
+  mutate(Sex = ifelse(Sex == "m", "M", Sex)) %>%
+  mutate(Site = `Origin Institution`) %>%
+  mutate(Institution = Site) %>%
+  left_join(xqc, by = "CellLine")
 
 # Initialize cell line genetics key
 gene.tab <- dplyr::select(xmeta, CellLine, Genetics)
 gene.tab <- setNames(gene.tab$Genetics, gene.tab$CellLine)
 
 # Merge metadata with imaging predictions
-predictions <- group_by(predictions, CellLine) %>% 
+predictions <- group_by(predictions, CellLine) %>%
   arrange(desc(Ypred)) %>%
   dplyr::select(CellLine, CellLinePassageNumber, Ypred)
 
-xmeta <- left_join(xmeta, predictions, by='CellLine') %>%
-  mutate(ALS=Genetics != 'Healthy')
+xmeta <- left_join(xmeta, predictions, by = "CellLine") %>%
+  mutate(ALS = Genetics != "Healthy")
 
 #' # Preprocessing
 #' Raw read counts are preprocessed as follows:
-#' 
+#'
 #' 1. Normalize expression within sample as transcript per million (TPM)
 #' 2. Drop genes with low expression: average log(TPM + 1) < `r expression.threshold`
 #' 3. Filter reactome pathway genes
@@ -79,24 +70,26 @@ genes <- filter(genes, ensembl_gene_id %in% colnames(x)) %>%
   arrange(match(ensembl_gene_id, colnames(x)))
 
 exon.length <- setNames(genes$ExonSize, genes$ensembl_gene_id)
-genes.drop <- which(is.na(exon.length)) %>% names
-x <- as.matrix(x[,!colnames(x) %in% genes.drop])
+genes.drop <- which(is.na(exon.length)) %>% names()
+x <- as.matrix(x[, !colnames(x) %in% genes.drop])
 xcount <- x
 
-x <- sapply(colnames(x), function(g) x[,g] / exon.length[g])
-x <- t(apply(x, MAR=1, function(z) 1e6 * z / sum(z)))
+x <- sapply(colnames(x), function(g) x[, g] / exon.length[g])
+x <- t(apply(x, MAR = 1, function(z) 1e6 * z / sum(z)))
 
 # Drop low expression genes
-xcount <- xcount[,colMedians(log(x + 1)) >= expression.threshold]
-x <- x[,colMedians(log(x + 1)) >= expression.threshold]
+xcount <- xcount[, colMedians(log(x + 1)) >= expression.threshold]
+x <- x[, colMedians(log(x + 1)) >= expression.threshold]
 
 # Filter to genes with annotated pathways
 load(pathway.file)
-xcount <- xcount[,colnames(x) %in% unlist(reactome.pathways$ENSEMBL)]
-x <- x[,colnames(x) %in% unlist(reactome.pathways$ENSEMBL)]
+xcount <- xcount[, colnames(x) %in% unlist(reactome.pathways$ENSEMBL)]
+x <- x[, colnames(x) %in% unlist(reactome.pathways$ENSEMBL)]
 
 # Rename for joint analysis
 xcount.fibro <- xcount
 x.fibro <- x
-xmeta.fibro <- mutate(xmeta, Type='Fibroblast') 
-rm(x, xmeta)
\ No newline at end of file
+xmeta.fibro <- mutate(xmeta, Type = "Fibroblast")
+
+fout <- file.path(data.dir, "fibro_processed.Rdata")
+save(file = fout, x.fibro, xcount.fibro, xmeta.fibro)
diff --git a/paper_figures/Figure_Transcriptomics/load_spine.R b/paper_figures/Figure_Transcriptomics/load_spine.R
index 9b71cef..7e7dc63 100644
--- a/paper_figures/Figure_Transcriptomics/load_spine.R
+++ b/paper_figures/Figure_Transcriptomics/load_spine.R
@@ -1,63 +1,52 @@
-#' The following loads RNA-seq data for patient derived spinal cord samples.
-#' 
-#' Requires input datasets
-#'   `data/050322_RNAseq`
-#'   `data/reactome_pathway.Rdata`
-#' 
-#' **Note:** RNA-seq read count datasets can be processed from raw fastq files 
-#' using the `als_rnaseq_spine` pipeline.
-#' 
-#' Karl Kumbier 10/19/2022
+################################################################################
+#+ setup
+################################################################################
 library(Matrix)
 library(tidyverse)
 library(data.table)
+library(DESeq2)
 
-################################################################################
-# Load data and initialize analysis paramters
-################################################################################
 # Paths to data directories
-fig.base.dir <- Sys.getenv('ALS_PAPER')
-data.dir <- file.path(fig.base.dir, 'data/050322_RNAseq/')
-nygc.dir <- file.path(data.dir, 'nygc/')
+fig.base.dir <- Sys.getenv("ALS_PAPER")
+data.dir <- file.path(fig.base.dir, "data/050322_RNAseq/")
+nygc.dir <- file.path(data.dir, "nygc/")
 
-pred.dir <- file.path(fig.base.dir, 'paper_figures', 'Figure_Scores', 'fig_full') 
-pathway.file <- str_c(data.dir, 'reactome_pathway.Rdata')
-library.file <- file.path(fig.base.dir, 'data', 'Fibroblasts_Library.csv')
-pathway.file <- file.path(data.dir, 'reactome_pathway.Rdata')
+pathway.file <- str_c(data.dir, "reactome_pathway.Rdata")
+library.file <- file.path(fig.base.dir, "data", "Fibroblasts_Library.csv")
 
 # Source utility functions
-source(file.path(fig.base.dir, 'paper_figures', 'utilities.R'))
+source(file.path(fig.base.dir, "paper_figures", "utilities.R"))
 expression.threshold <- 2.5 # minimum log read count threshold
 
 # Load exon length table
-load(file.path(data.dir, 'read_counts.Rdata'))
+load(file.path(data.dir, "read_counts.Rdata"))
 gene.length <- dplyr::select(genes, ensembl_gene_id, ExonSize)
 
 # Load spinal cord read count data
-load(file.path(nygc.dir, 'read_counts_aw.Rdata'))
-genes <- left_join(genes, gene.length, by='ensembl_gene_id')
+load(file.path(nygc.dir, "read_counts_aw.Rdata"))
+genes <- left_join(genes, gene.length, by = "ensembl_gene_id")
 
 # Initialize patient populations used for analysis
-als.groups <- 'ALS Spectrum MND'
-control.groups <- 'Non-Neurological Control'
-site <- 'Spinal_Cord_Cervical'
+als.groups <- "ALS Spectrum MND"
+control.groups <- "Non-Neurological Control"
+site <- "Spinal_Cord_Cervical"
 
 # Initialize metadata table
-meta.file <- 'RNA_Metadata_200212/Delivered Quotes-Table 1.csv'
+meta.file <- "RNA_Metadata_200212/Delivered Quotes-Table 1.csv"
 xmeta <- fread(file.path(nygc.dir, meta.file))
-  
+
 colnames(xmeta) <- colnames(xmeta) %>%
-  str_remove_all(' ') %>%
-  str_remove_all('\\(.*$')
+  str_remove_all(" ") %>%
+  str_remove_all("\\(.*$")
 
-xmeta <- dplyr::rename(xmeta, Mutation=ReportedGenomicMutations) %>%
-  mutate(SubjectGroup=str_replace_all(SubjectGroup, 'DIs', 'Dis'))
+xmeta <- dplyr::rename(xmeta, Mutation = ReportedGenomicMutations) %>%
+  mutate(SubjectGroup = str_replace_all(SubjectGroup, "DIs", "Dis"))
 
 # Filter to metadata with RNA-seq samples
 xmeta <- filter(xmeta, ExternalSampleId %in% xbam$ExternalSampleId)
 
 #' # Preprocessing
-#' 
+#'
 #' We preprocess data as follows:
 #' 1. Filter to samples derived from patient spinal cords
 #' 2. Filter to ALS and non-neurological control samples
@@ -70,8 +59,8 @@ xmeta <- filter(xmeta, ExternalSampleId %in% xbam$ExternalSampleId)
 id.spine <- xmeta$SampleSource %in% site
 id.groups <- xmeta$SubjectGroup %in% c(als.groups, control.groups)
 
-x <- x[id.spine & id.groups,]
-xmeta <- xmeta[id.spine & id.groups,]
+x <- x[id.spine & id.groups, ]
+xmeta <- xmeta[id.spine & id.groups, ]
 
 # Normalize read counts as transcripts per killobase million (TPM)
 genes <- filter(genes, ensembl_gene_id %in% colnames(x)) %>%
@@ -79,31 +68,33 @@ genes <- filter(genes, ensembl_gene_id %in% colnames(x)) %>%
 
 exon.length <- setNames(genes$ExonSize, genes$ensembl_gene_id)
 
-genes.drop <- which(is.na(exon.length)) %>% names
-x <- as.matrix(x[,!colnames(x) %in% genes.drop])
+genes.drop <- which(is.na(exon.length)) %>% names()
+x <- as.matrix(x[, !colnames(x) %in% genes.drop])
 xcount <- x
 
-x <- sapply(colnames(x), function(g) x[,g] / exon.length[g])
-x <- t(apply(x, MAR=1, function(z) 1e6 * z / sum(z)))
+x <- sapply(colnames(x), function(g) x[, g] / exon.length[g])
+x <- t(apply(x, MAR = 1, function(z) 1e6 * z / sum(z)))
 
 # Drop low expression genes
-xcount <- xcount[,colMedians(log(x + 1)) >= expression.threshold]
-x <- x[,colMedians(log(x + 1)) >= expression.threshold]
+xcount <- xcount[, colMedians(log(x + 1)) >= expression.threshold]
+x <- x[, colMedians(log(x + 1)) >= expression.threshold]
 
 # Filter to genes with annotated pathways
 load(pathway.file)
-xcount <- xcount[,colnames(x) %in% unlist(reactome.pathways$ENSEMBL)]
-x <- x[,colnames(x) %in% unlist(reactome.pathways$ENSEMBL)]
+xcount <- xcount[, colnames(x) %in% unlist(reactome.pathways$ENSEMBL)]
+x <- x[, colnames(x) %in% unlist(reactome.pathways$ENSEMBL)]
 
 # Standardize metadata feature names
-xmeta <- mutate(xmeta, ALS=SubjectGroup %in% als.groups) %>%
-  dplyr::rename(Site=SiteSpecimenCollected) %>%
-  dplyr::rename(Age=AgeatDeath) %>%
-  mutate(Age=ifelse(Age == '90 or Older', 90, Age)) %>%
-  mutate(Age=as.numeric(Age))
+xmeta <- mutate(xmeta, ALS = SubjectGroup %in% als.groups) %>%
+  dplyr::rename(Site = SiteSpecimenCollected) %>%
+  dplyr::rename(Age = AgeatDeath) %>%
+  mutate(Age = ifelse(Age == "90 or Older", 90, Age)) %>%
+  mutate(Age = as.numeric(Age))
 
 # Rename for joint analysis
 x.spine <- x
 xcount.spine <- xcount
-xmeta.spine <- mutate(xmeta, Type='Spine')
-rm(x, xmeta)
+xmeta.spine <- mutate(xmeta, Type = "Spine")
+
+fout <- file.path(data.dir, "spine_processed.Rdata")
+save(file = fout, x.spine, xcount.spine, xmeta.spine)
diff --git a/paper_figures/Figure_Transcriptomics/pca_analysis.R b/paper_figures/Figure_Transcriptomics/pca_analysis.R
index a6dd207..b6705da 100644
--- a/paper_figures/Figure_Transcriptomics/pca_analysis.R
+++ b/paper_figures/Figure_Transcriptomics/pca_analysis.R
@@ -1,4 +1,4 @@
-#' In this script, we run PCA analysis on expression data to investigate primary
+#' In this script, we run PCA analysis on expression data and report primary
 #' drivers of variation in fibroblast / spinal cord expression data.
 #' 
 #' Requires input datasets
@@ -17,26 +17,12 @@
 #+ setup, echo=FALSE, warning=FALSE, message=FALSE
 library(data.table)
 library(tidyverse)
-library(hrbrthemes)
 library(Matrix)
 library(gridExtra)
 library(ggsci)
 
 options(stringsAsFactors=FALSE)
-
-theme_set(
-  theme_ipsum(
-    plot_title_size=24,
-    axis_title_size=18,
-    strip_text_size=18, 
-    axis_text_size=18,
-    base_size=18,
-    axis=TRUE,
-    base_family='sans',
-    axis_col='#000000',
-    grid=FALSE
-  )
-)
+theme_set(theme_bw(base_size=18))
 
 grid.arrange4 <- function(...) grid.arrange(..., ncol=4)
 
@@ -45,16 +31,16 @@ grid.arrange4 <- function(...) grid.arrange(..., ncol=4)
 ################################################################################
 fig.base.dir <- Sys.getenv('ALS_PAPER')
 data.dir <- file.path(fig.base.dir, 'data/050322_RNAseq/')
-figure.dir <- file.path(fig.base.dir, 'paper_figures', 'Figure3')
-output.fig.dir <- file.path(figure.dir, 'fig3')
+figure.dir <- file.path(fig.base.dir, 'paper_figures', 'Figure_Transcriptomics')
+output.fig.dir <- file.path(figure.dir, 'fig')
 dir.create(output.fig.dir, showWarnings=FALSE)
 
 ################################################################################
 # Load data and initialize analysis parameters
 ################################################################################
 source(file.path(figure.dir, 'utilities.R'))
-source(file.path(figure.dir, 'load_fibroblast.R'))
-source(file.path(figure.dir, 'load_spine.R'))
+load(file.path(data.dir, 'fibro_processed.Rdata'))
+load(file.path(data.dir, 'spine_processed.Rdata'))
 
 # Initialize analysis parameters
 save.fig <- TRUE
@@ -65,9 +51,6 @@ gene_transform <- function(x) log(x + 1)
 colnames(xmeta.fibro) <- str_replace_all(colnames(xmeta.fibro), ' ', '_')
 xmeta.fibro <- xmeta.fibro %>% mutate_if(is.character, as.factor)
 
-# Fix typo in cell line passage number
-xmeta.fibro$CellLinePassageNumber[xmeta.fibro$CellLine == 'ND40077'] <- 20
-
 # Clean names of fibro metadata
 xmeta.fibro <- xmeta.fibro %>%
   dplyr::rename(Age=Age_of_biopsy) %>%
@@ -86,7 +69,7 @@ pca <- prcomp(gene_transform(x.fibro))
 xpca <- cbind(pca$x, xmeta.fibro)
 pct.var <- round(pca$sdev ^ 2 / sum(pca$sdev ^ 2), 3) * 100
 
-meta.features <- c('Site', 'Sex', 'ALS', 'Age', 'Passage')
+meta.features <- c('Site', 'Sex', 'Genetics', 'Age')
 
 p <- lapply(meta.features, function(f) {
   out1 <- ggplot(xpca, aes_string(col=f)) +
@@ -105,6 +88,9 @@ p <- lapply(meta.features, function(f) {
     out1 <- out1 + scale_color_viridis_c()
     out2 <- out2 + scale_color_viridis_c()
     
+  } else if (f == "Genetics") {
+    out1 <- out1 + scale_color_manual(values = col.pal)
+    out2 <- out2 + scale_color_manual(values = col.pal)
   } else {
     out1 <- out1 + scale_color_jama()
     out2 <- out2 + scale_color_jama()
@@ -144,7 +130,7 @@ p <- lapply(meta.features, function(f) {
     out1 <- out1 + scale_color_viridis_c()
     out2 <- out2 + scale_color_viridis_c()
     
-  } else {
+  }  else {
     out1 <- out1 + scale_color_manual(values=disc.pal)
     out2 <- out2 + scale_color_manual(values=disc.pal)
   }
diff --git a/paper_figures/Figure_Transcriptomics/utilities.R b/paper_figures/Figure_Transcriptomics/utilities.R
index e342a56..b865bdd 100644
--- a/paper_figures/Figure_Transcriptomics/utilities.R
+++ b/paper_figures/Figure_Transcriptomics/utilities.R
@@ -1,145 +1,132 @@
-library(DESeq2)
-library(fgsea)
+library(glmnet)
+library(iRF)
 
 ################################################################################
-# Differential expression and gene set enrichment analysis
+# Supervised analysis
 ################################################################################
-deseq_design <- function(x, xmeta, design, ...) {
-  # Wrapper function for running deseq over targeted samples
-  deseq <- DESeqDataSetFromMatrix(
-    countData=t(round(x)),
-    colData=xmeta,
-    design=as.formula(design),
-  )
+fit_holdout <- function(x, xmeta, lines.predict, fit_, predict_, bootstrap=FALSE) {
+  # Wrapper function for supervised analysis with holdout cell lines
+  train.id <- !xmeta$CellLine %in% lines.predict
+  test.id <- xmeta$CellLine %in% lines.predict
   
-  deseq <- DESeq(deseq, ...)
-  return(deseq)
-}
-
-
-gsea <- function(enrichment, pathway.file, pthr=0.01, ...) {
-  # Compute gene set enrichment for selected pathway
-
-  load(pathway.file)
-  pathways <- reactome.pathways$Genes
-  names(pathways) <- reactome.pathways$Name
-
-  entrez <- clusterProfiler::bitr(
-    names(enrichment),
-    fromType="ENSEMBL",
-    toType="ENTREZID",
-    OrgDb='org.Hs.eg.db'
-  )
+  x <- mutate(data.frame(x), Y=xmeta$Y)
+  xus <- caret::upSample(x[train.id,], as.factor(xmeta$ALS[train.id]))
   
-  # Join entrez ids with enrichment data
-  gene.set <- data.frame(Loading=unname(enrichment)) %>%
-    mutate(ENSEMBL=names(enrichment)) %>%
-    right_join(entrez, by='ENSEMBL')
-  
-  # Run gene set enrichment analysis for bag data
-  gene.list <- gene.set$Loading
-  names(gene.list) <- gene.set$ENTREZID
-  
-  out <- fgsea(
-    pathways=pathways,
-    stats=gene.list,
-    scoreType='pos',
-    eps=1e-30,
-    ...
-  )
+  if (bootstrap) {
+    xus <- group_by(xus, Class) %>% 
+      sample_frac(1, replace=TRUE) %>%
+      ungroup()
+  }
   
-  out <- filter(out, padj < pthr) %>% arrange(padj)
-  out.collapse <- collapsePathways(out, pathways, gene.list)
-  out.f <- filter(out, pathway %in% out.collapse$mainPathways)
-  return(list(pw=out.f, filtered=out.collapse, pw.full=out))
-}
-
-
-jaccard <- function(x, y) {
-  # Compute jaccard distance between two sets
-  return(length(intersect(x, y)) / length(union(x, y)))
+  xi <- as.matrix(dplyr::select(xus, -Class, -Y))
+  fit <- fit_(x=as.matrix(xi), y=xus$Y)
+  
+  # Get model coefficients
+  betas <- coef(fit, s='lambda.min')
+  if (sum(betas) == 0) {
+    betas <- coef(fit$glmnet.fit)
+    betas.sum <- Matrix::colSums(betas)
+    betas <- betas[,min(which(betas.sum != 0))]
+  }
+  
+  x <- dplyr::select(x, -Y) %>% as.matrix
+  ypred <- predict_(fit, x[test.id,])
+  ypred <- mutate(xmeta[test.id,], YpredSeq=ypred[,1])
+  return(list(Ypred=ypred, Coef=betas))
 }
 
-pathway_distance <- function(gene.set, pw.names=NULL) {
-  # Compute jaccard distance matrix over set of pathways
-  pairs <- combn(1:length(gene.set), 2, simplify=FALSE)
-  dvec <- sapply(pairs, function(p) jaccard(gene.set[[p[1]]], gene.set[[p[2]]]))
-  
-  # Initialize pathway-level distance matrix
-  dmat <- matrix(0, nrow=length(gene.set), ncol=length(gene.set))
-  
-  dmat[lower.tri(dmat)] <- dvec
-  dmat <- dmat + t(dmat)
-  diag(dmat) <- 1
+fit_genes <- function(x, xmeta, xonehot, genes, fit_, predict_, 
+                      nrep=10, bootstrap=FALSE) {
   
-  if (!is.null(pw.names)) {
-    rownames(dmat) <- pw.names
-    colnames(dmat) <- pw.names
-  }
+  # Runs leave-one out prediction analysis, fitting models on selected subset of 
+  # full gene set.
+  xg <- x[,intersect(genes, colnames(x))]
+  xg <- cbind(xg, xonehot)
   
-  return(dmat)
-}
+  # Initialize subset of FUS/WT cell lines for binary classification
+  cell.lines <- filter(xmeta, Genetics != 'sporadic')$CellLine
+  s.cell.lines <- filter(xmeta, Genetics == 'sporadic')$CellLine
 
-subsample_deseq <- function(x, xmeta, 
-                            designs='~Genetics + Site',
-                            name='Genetics_Healthy_vs_FUS.ALS') {
-  # Wrapper function for computing deseq over subsamples samples
-  
-  id <- sample(nrow(x), 0.75 * nrow(x))
-  
-  out <- sapply(designs, function(d) {
-    deseq <- deseq_design(
-      x[id,], 
-      xmeta[id,],
-      design=d,
-      fitType='mean'
-    )
+  out <- lapply(1:length(cell.lines), function(i) {
+    set.seed(i)
+    lines.predict <- c(s.cell.lines, cell.lines[i])
+    
+    out <- replicate(nrep, {
+      fit_holdout(xg, xmeta, lines.predict, fit_, predict_, bootstrap)
+    }, simplify=FALSE)
     
-    res <- results(deseq, name=name)
-    return(setNames(-log10(res$pvalue), rownames(res)))
+    ypred <- lapply(out, function(z) z$Ypred) %>% rbindlist
+    coefs <- sapply(out, function(z) as.matrix(z$Coef))
+    rownames(coefs) <- c('Intercept', colnames(xg))
+    return(list(Ypred=ypred, Coef=coefs))
   })
   
-  return(out)
+  ypred <- lapply(out, function(z) z$Ypred) %>% rbindlist
+  coefs <- lapply(out, function(z) z$Coef)
+  return(list(Ypred=ypred, Coef=coefs))
 }
 
-################################################################################
-# Supervised analysis
-################################################################################
-fit_holdout <- function(x, xmeta, lines.predict, fit_, predict_, importance_) {
-  # Wrapper function for supervised analysis with holdout cell lines
-  train.id <- !xmeta$CellLine %in% lines.predict
-  test.id <- xmeta$CellLine %in% lines.predict
-  y <- as.numeric(xmeta$Y)
-  labels <- as.numeric(xmeta$ALS)
+
+fit_sample_split <- function(x, y, id.train, id.test=NULL, n.iter=1, n.core=1) {
+  # Fits iRF and predicts RF models on sample split
+  if (is.null(id.test)) id.test <- setdiff(1:nrow(x), id.train)
+  
+  fit.trn <- iRF(
+    x=x[id.train,],
+    y=y[id.train],
+    type='ranger',
+    n.iter=n.iter,
+    n.core=n.core,
+    respect.unordered.factors=TRUE
+  )
   
+  fit.tst <-  iRF(
+    x=x[id.test,],
+    y=y[id.test],
+    type='ranger',
+    n.iter=n.iter,
+    n.core=n.core,
+    respect.unordered.factors=TRUE
+  )
   
-  x <- mutate(data.frame(x), Y=xmeta$Y)
-  xus <- caret::upSample(x[train.id,], as.factor(labels[train.id]))
-  xi <- as.matrix(dplyr::select(xus, -Class, -Y))
-  yi <- xus$Y
-  fit <- fit_(x=xi, y=yi)
+  ypred.train <- predict(fit.tst$rf.list, x[id.train], predict.all=TRUE)
+  ypred.train <- rowMeans(ypred.train$predictions)
   
-  x <- dplyr::select(x, -Y) %>% as.matrix
-  ypred <- predict_(fit, x[test.id,])
-  ypred <- data.frame(CellLine=xmeta$CellLine[test.id]) %>%
-    mutate(YpredSeq=ypred[,1]) %>%
-    mutate(Genetics=xmeta$Genetics[test.id])
+  ypred.test <- predict(fit.trn$rf.list, x[id.test], predict.all=TRUE)
+  ypred.test <- rowMeans(ypred.test$predictions)
   
+  ypred <- c(ypred.train, ypred.test)
+  id.split <- c(id.train, id.test)
   
-  return(ypred)
-}
-
-
-fit_model <- function(x, y, id.train, model, model_predict) {
-  # Wrapper function for fitting model and predicting on held-out samples
-  fit <- model(x[id.train,], y[id.train])
-  ypred <- model_predict(fit, x[!id.train,])
-  return(list(fit=fit, ypred=ypred, ytest=y[!id.train]))
+  importance.trn <- fit.trn$rf.list$variable.importance
+  importance.tst <- fit.tst$rf.list$variable.importance
+  importance <- (importance.trn + importance.tst) / 2
+  
+  return(list(id=id.split, ypred=ypred, importance=importance))
 }
 
 ################################################################################
 # Wrapper functions for running supervised analysis
 ################################################################################
+fit_lm <- function(x, y, ...) {
+  # Standardized function for sparse logistic regression
+  return(cv.glmnet(x=x, y=y, nfolds=3, intercept=FALSE, ...))
+}
+
+predict_lm <- function(fit, x) {
+  # Standardized function for sparse logistic regression predictions
+  lambda.min <- fit$lambda.min
+  
+  if (which(fit$lambda == lambda.min) == 1) {
+    ypred <- predict(fit$glmnet.fit, x, type='response')
+    null.model <- colMeans(ypred == 0.5) == 1
+    ypred <- ypred[,min(which(!null.model))] %>% as.matrix
+  } else {
+    ypred <- predict(fit, x, s='lambda.min', type='response')
+  }
+  return(ypred)
+}
+
 fit_lm_classification <- function(x, y, ...) {
   # Standardized function for sparse logistic regression
   return(cv.glmnet(x=x, y=y, nfolds=3, family='binomial', intercept=FALSE, ...))
@@ -160,3 +147,37 @@ predict_lm_classification <- function(fit, x) {
   
   return(ypred)
 }
+
+################################################################################
+# General utility functions
+################################################################################
+reweight_importance <- function(importance, xcor) {
+  # Computes correlation-weighted feature importance. 
+  importance <- importance[rownames(xcor)]
+  out <- abs(xcor) %*% as.matrix(importance)
+  return(out[,1])
+}
+
+get_children <- function(nodes, edges, ids, children=list()) {
+  # Traverse graph to select children of selected node
+  if (!any(ids %in% edges$from)) return(children)
+  
+  new.children <- edges$to[edges$from %in% ids]
+  children <- c(children, list(new.children))
+  return(get_children(nodes, edges, new.children, children))
+}
+
+set_node_gradient <- function(x, col.pal=viridis(100), col.range=NULL) {
+  if (all(is.na(x))) return(rep('#707579', length(x)))
+
+  if (is.null(col.range)) {
+    col.range <- c(min(x, na.rm=TRUE), max(x, na.rm=TRUE))
+  }
+
+  if (length(unique(col.range)) == 1) stop('Color range must contain unique levels')
+  cut.seq <- seq(col.range[1], col.range[2], length.out=length(col.pal))
+  out <- rev(col.pal)[cut(x, cut.seq, include.lowest=TRUE)]
+  out[is.na(out)] <- '#707579'
+  return(out)
+}
+
diff --git a/paper_figures/color_palette.R b/paper_figures/color_palette.R
index 2f43f24..debdc5d 100644
--- a/paper_figures/color_palette.R
+++ b/paper_figures/color_palette.R
@@ -1,58 +1,5 @@
-library(tidyverse)
-library(RColorBrewer)
-library(data.table)
+library(ggsci)
 
-genetics.include <- c('C9orf72', 'FUS-ALS', 'sporadic', 'TBK1', 'Healthy')
-
-library.file <- 'Fibroblasts_Library.csv'
-library.path <- file.path(Sys.getenv('ALS_PAPER'), 'data', library.file)
-
-# Load in cell line metadata
-xcell <- fread(library.path) %>% 
-  dplyr::rename(Genetics=`Disease Gene`, CellLine=`Cell line`) %>%
-  mutate(Genetics=ifelse(Genetics == 'WT', 'Healthy', Genetics)) %>%
-  mutate(Genetics=ifelse(Genetics == 'FUS', 'FUS-ALS', Genetics)) %>%
-  mutate(Genetics=str_replace_all(Genetics, 'ALS with no known mutation', 'sporadic'))
-
-# Initialize cell line/age key
-age.key <- setNames(xcell$`Age of biopsy`, xcell$CellLine)
-
-# Initialize cell line/genetics key
-xg <- dplyr::select(xcell, CellLine, Genetics) %>% 
-  distinct() %>%
-  group_by(Genetics) %>% 
-  summarize(CellLine=list(unique(CellLine))) %>%
-  filter(Genetics %in% genetics.include)
-
-# Initialize base color palette
-col.pals <- list(
-  brewer.pal(8, 'Purples')[-(1:3)],
-  brewer.pal(8, 'Reds')[-(1:2)],
-  brewer.pal(8, 'Greys')[-(1:3)],
-  brewer.pal(8, 'YlOrRd')[4:5],
-  brewer.pal(8, 'Greens')[-(1:3)]
-)
-
-# Initialize color palette for cell lines
-col.pal.c <- lapply(1:nrow(xg), function(i) {
-  ncell <- length(xg$CellLine[[i]])
-  cols <- colorRampPalette(col.pals[[i]])(ncell)
-  
-  cells.i <- xg$CellLine[[i]]
-  age.i <- age.key[cells.i] %>% sort %>% rev
-  
-  names(cols) <- names(age.i)
-  return(cols)
-})
-
-col.pal.c <- unlist(col.pal.c)
-
-# Initialize color palette for genetics
-col.pal.g <- sapply(1:nrow(xg), function(i) {
-  ncell <- length(xg$CellLine[[i]]) + 1
-  cols <- colorRampPalette(col.pals[[i]])(ncell)
-  return(cols[floor(ncell / 2)])
-})
-
-names(col.pal.g) <- xg$Genetics
-col.pal <- c(col.pal.c, col.pal.g)
+col.pal <- pal_jama()(7)
+genetics.include <- c("FUS-ALS", "sporadic", "Healthy", "ALS")
+names(col.pal)[c(4, 5, 1, 3)] <- genetics.include
diff --git a/paper_figures/setup.R b/paper_figures/setup.R
new file mode 100644
index 0000000..e84974d
--- /dev/null
+++ b/paper_figures/setup.R
@@ -0,0 +1,41 @@
+# Keras install
+install.packages("keras")
+reticulate::install_python(version = "3.10")
+keras::install_keras()
+
+# Standard package installs
+install.packages("data.table")
+install.packages("tidyverse")
+install.packages("Matrix")
+install.packages("viridis")
+install.packages("ggsci")
+install.packages("caret")
+install.packages("glmnet")
+install.packages("PRROC")
+install.packages("RColorBrewer")
+install.packages("scales")
+install.packages("superheat")
+install.packages("gridExtra")
+install.packages("hrbrthemes")
+install.packages("ggpubr")
+install.packages("patchwork")
+install.packages("Cairo")
+install.packages("kableExtra")
+install.packages("clusterProfiler")
+install.packages("mltools")
+install.packages("ggrepel")
+install.packages("parallel")
+install.packages("ggh4x")
+install.packages("tidytext")
+install.packages("lme4")
+install.packages("lmerTest")
+install.packages("R.utils")
+
+# Bioconductor package installs
+install.packages("BiocManager")
+BiocManager::install("org.Hs.eg.db")
+BiocManager::install("DESeq2")
+
+# Devtools installs
+install.packages("devtools")
+devtools::install_github("karlkumbier/iRF2.0")
diff --git a/paper_figures/theme.R b/paper_figures/theme.R
new file mode 100644
index 0000000..6dcc044
--- /dev/null
+++ b/paper_figures/theme.R
@@ -0,0 +1,4 @@
+theme_als <- function(){
+	theme_bw() %+replace%
+	theme(panel.grid.minor = element_blank())
+}
diff --git a/paper_figures/utilities.R b/paper_figures/utilities.R
index 0a0c96a..7d453a7 100644
--- a/paper_figures/utilities.R
+++ b/paper_figures/utilities.R
@@ -1,126 +1,136 @@
-#' This script contains helper functions for running the predictive analyses 
-#' described in Kumbier et al. 2022. The core function `fit_model` can be 
-#' used with any supervised model by passing in functions with standardized
-#' inputs/outputs for model fitting and prediction. Currently, models for
-#' iterative random forests and logistic regression with L1 penalty are 
-#' supported.
-#' 
-#' The functions `cell_fit` and `cell_fit_single` are used to run leave one
-#' cell line out analyses described in the paper.
-#' 
-#' Note: irf is set to use all available cores.
-#' 
-#' Karl Kumbier 10/18/2022
+# " This script contains helper functions for running the predictive analyses.
+# " The core function `fit_model` can be used with any supervised model by
+# " passing in functions with standardized inputs/outputs for model fitting and
+# " prediction. Currently, models for iterative random forests and logistic
+# " regression with L1 penalty are supported.
+# "
+# " Note: irf is set to use all available cores.
+# "
+# " Karl Kumbier 10/4/2023
 library(iRF)
 library(glmnet)
+library(keras)
 library(caret)
 library(PRROC)
 
-cell_fit <- function(x, model, model_predict, genetics.train='FUS-ALS', verbose=TRUE) {
+cell_fit <- function(
+    x, model, model_predict,
+    genetics.train = "FUS-ALS", # nolint
+    verbose = TRUE,
+    ncell = 250) {
   # Wrapper function to call cell_fit_single predict over all cell lines.
 
-
-  # Rank-normalize features within plate
+  # Normalize features within plate
   ecdf <- function(x) rank(x) / length(x)
   xfeat <- ungroup(x) %>%
-    dplyr::select(matches('(^X|PlateID)')) %>%
+    dplyr::select(matches("(^X|PlateID)")) %>%
     group_by(PlateID) %>%
     mutate_if(is.numeric, ecdf) %>%
     ungroup() %>%
     dplyr::select(-PlateID)
 
-  x <- dplyr::select(x, -matches('^X'))
+  x <- dplyr::select(x, -matches("^X"))
   x <- cbind(x, xfeat)
 
   # Initialize grid for hold-out cell lines, training genetics
   cell.lines <- unique(x$CellLine)
-  
+
   # Drop sporadic lines from model training — evaluated in all models
-  sporadics <- filter(x, Genetics == 'sporadic')$CellLine
-  if (!'sporadic' %in% genetics.train) cell.lines <- setdiff(cell.lines, sporadics)
-  
+  sporadics <- filter(x, Genetics == "sporadic")$CellLine
+  if (!"sporadic" %in% genetics.train) {
+    cell.lines <- setdiff(cell.lines, sporadics)
+  }
+
+  # Fit model
   fits <- lapply(cell.lines, function(cl) {
     if (verbose) print(cl)
     cell_fit_single(
-      x=x,
-      cell.test=cl,
-      genetics.train=genetics.train,
-      model=model, 
-      model_predict=model_predict
+      x = x,
+      cell.test = cl,
+      genetics.train = genetics.train,
+      model = model,
+      model_predict = model_predict,
+      ncell = ncell
     )
   })
-  
+
   id.drop <- sapply(fits, is.null)
   fits <- fits[!id.drop]
-  
+
   # Initialize importance
   importance <- sapply(fits, function(f) f$fit$fit$importance)
-  ypred <- lapply(fits, function(f) f$ypred) %>% rbindlist
-  return(list(ypred=ypred, importance=importance))
+  ypred <- lapply(fits, function(f) f$ypred) %>% rbindlist()
+  return(list(ypred = ypred, importance = importance))
 }
 
-cell_fit_single <- function(x, cell.test, model, model_predict, genetics.train='FUS-ALS', ncell=200) {
+cell_fit_single <- function(x, cell.test, model, model_predict,
+                            genetics.train = "FUS-ALS", ncell = 250) {
   # Runs leave one cell line out prediction analysis.
   #
   # x: data frame containing predictive features (named as ^X.*), response
   #   (Genetics), CellLine, WellID, PlateID, and Markers.
-  # cell.test: held-out cell line to be used for model evaluation.
+  # cell.test: held-out cell line(s) to be evaluated.
   # model: standardized model fitting function. Takes arguments x, y only.
   # model_predict: standardized model prediction function. Takes arguments
   #   fit (fitted model) and x only
   # genetics.train: ALS genetic backgrounds to be used for model training.
-  
+  # ncell: number of cells per well to use in training
+
   # Check for valid input
-  print(str_c('Fitting ', cell.test, '...'))
-  if (nrow(x) == 0) return(NULL)
-  
+  model.line <- cell.test
+  print(str_c("Fitting ", cell.test, "..."))
+  if (nrow(x) == 0) {
+    return(NULL)
+  }
+
   # Set training/test indices based on hold-out cell line
-  sporadics <- filter(x, Genetics == 'sporadic')$CellLine
-  cells.test <- cell.test
-  if (!'sporadic' %in% genetics.train) cells.test <- c(cells.test, unique(sporadics))
-  cells.train <- setdiff(unique(x$CellLine), cells.test)
-  
+  sporadics <- filter(x, Genetics == "sporadic")$CellLine
+  if (!"sporadic" %in% genetics.train) cell.test <- c(cell.test, unique(sporadics))
+  cells.train <- setdiff(unique(x$CellLine), cell.test)
+
   # Set training/test indices based on hold-out cell line
   id.cell.train <- which(x$CellLine %in% cells.train)
-  id.genetics <- which(x$Genetics %in% c(genetics.train, 'Healthy'))
-  
+  id.genetics <- which(x$Genetics %in% c(genetics.train, "Healthy"))
+
   id.train <- intersect(id.cell.train, id.genetics)
-  id.test <- which(x$CellLine %in% cells.test)
-  if (length(id.test) == 0) return(NULL)
-  
+  id.test <- which(x$CellLine %in% cell.test)
+  if (length(id.test) == 0) {
+    return(NULL)
+  }
+
   # Set feature matrix and response vector
-  y <- as.numeric(x$Genetics != 'Healthy')
-  xx <- select(x, matches('^X'))
-  
+  y <- as.numeric(x$Genetics != "Healthy")
+  xx <- select(x, matches("^X"))
+
   # Initialize cell index table for down sampling
-  xidx <- dplyr::select(x, ID) %>% 
-    mutate(Y=y, Idx=1:n()) %>%
+  xidx <- dplyr::select(x, ID) %>%
+    mutate(Y = y, Idx = 1:n()) %>%
     filter(Idx %in% id.train)
-  
+
   # Down sample for equal cell counts from each well
-  xcount <- group_by(xidx, ID) %>% count()
+  xcount <- group_by(xidx, ID) %>% dplyr::count()
   sample.size <- min(c(xcount$n, ncell))
   xidx <- group_by(xidx, ID) %>% sample_n(sample.size)
-  
+
   # Down sample for class balance
   xidx <- downSample(xidx, as.factor(xidx$Y))
-  
+
   # Fit model
   id.train <- intersect(id.train, xidx$Idx)
-  fit <- fit_model(xx, y, id.train, model, model_predict, test.id=id.test)
-  
+  fit <- fit_model(xx, y, id.train, model, model_predict, test.id = id.test)
+
   # Initialize prediction table output
-  f.select <- c('CellLine', 'Genetics', 'WellID', 'PlateID', 'Markers')
-  out <- dplyr::select(x[id.test,], one_of(f.select)) %>%
-    mutate(Ypred=fit$ypred) %>%
-    mutate(Ytest=as.numeric(Genetics != 'Healthy')) %>%
-    mutate(ModelLine=cell.test) %>%
-    mutate(TestIdx=id.test)
-  
-  return(list(fit=fit, ypred=out))
+  f.select <- c("CellLine", "Genetics", "WellID", "PlateID", "Markers")
+  out <- dplyr::select(x[id.test, ], one_of(f.select)) %>%
+    mutate(Ypred = fit$ypred) %>%
+    mutate(Ytest = as.numeric(Genetics != "Healthy")) %>%
+    mutate(ModelLine = model.line) %>%
+    mutate(TestIdx = id.test)
+
+  return(list(fit = fit, ypred = out))
 }
 
-fit_model <- function(x, y, train.id, model, model_predict, test.id=NULL) {
+fit_model <- function(x, y, train.id, model, model_predict, test.id = NULL) {
   # Core function for model fitting. Fits the specified model, generates
   # predictions, and evaluates AUROC.
   #
@@ -130,80 +140,146 @@ fit_model <- function(x, y, train.id, model, model_predict, test.id=NULL) {
   # model: standardized model fitting function. Takes arguments x, y only.
   # model_predict: standardized model prediction function. Takes arguments
   #   fit (fitted model) and x only
-  # test.id: rows of x to generate predictions for. If NULL, all non training 
+  # test.id: rows of x to generate predictions for. If NULL, all non training
   #   samples will be used.
-  
+
   # Set test indices if not specified
-  if (is.null(test.id)) test.id <- setdiff(1:nrow(x), train.id)
-  
-  # Fit model  
-  fit <- model(x=x[train.id,], y=y[train.id])
-  
+  if (is.null(test.id)) test.id <- setdiff(seq_len(nrow(x)), train.id)
+
+  # Fit model
+  fit <- model(x = x[train.id, ], y = y[train.id])
+
   # Fit model and generate predictions
-  ypred <- model_predict(fit, x[test.id,])
+  ypred <- model_predict(fit, x[test.id, ])
   ytest <- y[test.id]
-  roc <- roc.curve(scores.class0=ypred, weights.class0=ytest)
-  return(list(fit=fit, auc=roc$auc, y=y, ypred=ypred, id.test=test.id))
+  roc <- roc.curve(scores.class0 = ypred, weights.class0 = ytest)
+
+  return(list(
+    fit = fit,
+    auc = roc$auc,
+    y = y,
+    ypred = ypred,
+    id.test = test.id
+  ))
 }
 
 
 logistic <- function(x, y) {
   # Standardized function for L1 logistic regression.
-  fit <- cv.glmnet(x=as.matrix(x), y=y, family='binomial')
-  fit$importance <- coef(fit, s='lambda.min')[,1]
+
+  print("Fitting LR")
+  fit <- cv.glmnet(
+    x = as.matrix(x),
+    y = y,
+    family = "binomial",
+    type.logistic = "modified.Newton",
+    thresh = 1e-5
+  )
+  print("LR fit")
+
+  fit$importance <- coef(fit, s = "lambda.min")[-1, 1]
   return(fit)
 }
 
+
 predict_logistic <- function(fit, x) {
   # Standardized function for logistic regression prediction.
-  ypred <- predict(fit, as.matrix(x), s='lambda.min', type='response')
-  return(ypred[,1])
+  ypred <- predict(fit, as.matrix(x), s = "lambda.min", type = "response")
+  return(ypred[, 1])
 }
 
-irf <- function(x, y, n.core=1) {
-  # Standardized function for irf. 
+irf <- function(x, y, n.core = 1) {
+  # Standardized function for irf.
   if (all(y %in% c(0, 1))) y <- as.factor(y)
-  fit <- iRF(x=x, y=y, type='ranger', verbose=FALSE, n.iter=2, n.core=n.core)
+
+  fit <- iRF(
+    x = x,
+    y = y,
+    type = "ranger",
+    verbose = FALSE,
+    n.iter = 2,
+    n.core = n.core
+  )
+
   fit$importance <- fit$rf.list$variable.importance
   return(fit)
 }
 
 predict_irf <- function(fit, x) {
   # Standardized function for irf prediction.
-  ypred <- predict(fit$rf.list, x, predict.all=TRUE)
+  ypred <- predict(fit$rf.list, x, predict.all = TRUE)
   ypred <- rowMeans(ypred$prediction)
   return(ypred)
 }
 
+fcnn <- function(x, y) {
+  # Standardized function for FC network
+
+  if (all(y %in% 0:1)) {
+    y <- to_categorical(y, 2)
+    x <- as.matrix(x)
+  } else {
+    stop("Multiclass and regression not supported")
+  }
+
+  p <- c(ncol(x))
+
+  model <- keras_model_sequential() %>%
+    layer_dense(units = 16, activation = "relu", input_shape = p) %>%
+    layer_dropout(rate = 0.5) %>%
+    layer_dense(units = 16, activation = "relu") %>%
+    layer_dropout(rate = 0.5) %>%
+    layer_dense(units = 2, activation = "softmax")
+
+  model %>% compile(
+    loss = "categorical_crossentropy",
+    optimizer = optimizer_rmsprop(),
+    metrics = c("accuracy")
+  )
+
+  history <- model %>% fit(
+    x, y,
+    epochs = 30,
+    batch_size = 128,
+    validation_split = 0.2
+  )
+
+
+  # Evaluate feature importance
+  ximportance <- diag(ncol(x))
+  importance <- predict(model, ximportance)[, 2]
+  importance <- abs(importance - mean(importance))
+  names(importance) <- colnames(x)
+
+  fit <- list()
+  fit$model <- model
+  fit$importance <- importance
+  return(fit)
+}
+
+predict_fcnn <- function(fit, x) {
+  return(predict(fit$model, as.matrix(x))[, 2])
+}
+
+
+###############################################################################
+# Additional utility functions used in analysis
+###############################################################################
 label_pval <- function(x) {
   # Generate significance *s for p-values
-  pval.lab.s <- ' ns'
-  pval.lab.s <- ifelse(x < 0.05, '*', pval.lab.s)
-  pval.lab.s <- ifelse(x < 0.01, '**', pval.lab.s)
-  pval.lab.s <- ifelse(x < 0.001, '***', pval.lab.s)
+  pval.lab.s <- " ns"
+  pval.lab.s <- ifelse(x < 0.05, "*", pval.lab.s)
+  pval.lab.s <- ifelse(x < 0.01, "**", pval.lab.s)
+  pval.lab.s <- ifelse(x < 0.001, "***", pval.lab.s)
   return(pval.lab.s)
 }
 
-select_cytosolic_fus <- function(x) {
-  # Subset feature matrix to cytosolic fus features - assumes FUS is channel 2
-  
-  # Initialize channel / region IDs
-  channels <- sapply(str_split(colnames(x), '\\.\\.'), '[', 3)
-  
-  regions <- sapply(str_split(colnames(x), '\\.\\.'), '[', 2)
-  regions <- lapply(regions, str_split, pattern='_') 
+select_cytosolic <- function(x) {
+  regions <- sapply(str_split(colnames(x), "\\.\\."), "[", 2)
+  regions <- lapply(regions, str_split, pattern = "_")
   regions <- sapply(regions, function(z) tail(z[[1]], 1))
-  
-  # Initialize FUS nuclear features
-  channels <- str_split(channels, '')
-  regions <- str_split(regions, '')
-  
-  nuc.fus <- mapply(function(ch, rg) {
-    any(ch == 2 & rg == 'D') |   any(ch == 2 & rg == 'C')
-  }, channels, regions)
-  
-  nuc.fus[is.na(nuc.fus)] <- FALSE
-  
-  # Drop nuclear features
-  return(dplyr::select_if(x, !nuc.fus))
+  regions[is.na(regions)] <- "X"
+
+  # Filter to selected region / marker
+  return(dplyr::select_if(x, regions %in% c("X", "N", "NN")))
 }
diff --git a/preprocessing/make_eigenfeatures.R b/preprocessing/make_eigenfeatures.R
new file mode 100644
index 0000000..b9056cc
--- /dev/null
+++ b/preprocessing/make_eigenfeatures.R
@@ -0,0 +1,125 @@
+#' This script is used to generate eigenfeatures from raw imaging data.
+#' Eigenfeatures are computed for a single screen / marker set for either
+#' single cells or KS normalized well replicates.
+#'
+#' Requires input data
+#'
+#'  `<screen>`: raw imaging data from a screen with both
+#'     `cbfeature_profiles.csv` and `metadata.csv` files. These can be generated
+#'     using the `rprofiler` package in awlabtools (see `GENERATE_PROFILE` shell
+#'     script in each screen subdirectory).
+#'  `Fibroblasts_Library.csv`: cell line metadata
+#'  `features_select_expanded.csv`: table of features to be included and their
+#'     associated feature groups.
+#'
+#' Karl Kumbier 10/20/2022
+library(tidyverse)
+library(data.table)
+
+als.dir <- Sys.getenv("ALS_PAPER")
+source(file.path(als.dir, "preprocessing/utilities.R"))
+args <- R.utils::commandArgs(asValues = TRUE)
+
+# Variable parameters for eigenfeature generation
+pct.var <- 0.95 # % variance explained by eigenfeatures within category
+type <- "cell" # type pf sample, being 'cell' or 'well'
+screen <- args$SCREEN #' 032422_WTFUSporadics' # screen to be analyzed
+raw <- as.logical(args$RAW) # FALSE
+
+# Directory containing cell line and feature metadata files
+library.path <- file.path(als.dir, "data", "Fibroblasts_Library.csv")
+feature.path <- file.path(als.dir, "data", "features_select_expanded.csv")
+
+# Directories with raw imaging data, metadata, and eigenfeature output
+base.dir <- "/awlab/projects/2017_01_ProjectALS/"
+data.dir <- file.path(base.dir, "plates")
+metadata.dir <- file.path(base.dir, "data", screen, "Cell_Filtered/")
+output.dir <- file.path(base.dir, "data", screen, "Cell_Filtered/")
+
+################################################################################
+# Load data
+################################################################################
+# Load in metadata
+xmeta <- fread(str_c(metadata.dir, "metadata.csv")) %>%
+  mutate(Markers = str_remove_all(Markers, " ")) %>%
+  mutate(Markers = str_remove_all(Markers, "Hoechst_")) %>%
+  mutate(Markers = str_remove_all(Markers, "_Mitotracker"))
+
+# Load in cell line data
+xcell <- fread(library.path) %>%
+  dplyr::rename(CellLine = `Cell line`) %>%
+  dplyr::rename(Genetics = `Disease Gene`) %>%
+  dplyr::select(CellLine, Genetics) %>%
+  mutate(CellLine = str_remove_all(CellLine, " ")) %>%
+  mutate(Genetics = ifelse(str_detect(Genetics, "mutation"), "sporadic", Genetics)) %>%
+  mutate(Genetics = ifelse(Genetics == "WT", "Healthy", Genetics)) %>%
+  mutate(Genetics = ifelse(Genetics == "FUS", "FUS-ALS", Genetics))
+
+xmeta <- left_join(xmeta, xcell, by = "CellLine")
+
+# Clean markerset names
+markers <- unique(xmeta$Markers)
+
+# Initialize feature grouping key
+xmarker <- fread(feature.path) %>%
+  rename(Feature = feature_name) %>%
+  rename(FeatureAbbr = feature_short) %>%
+  rename(Channel = channel) %>%
+  rename(Group = group) %>%
+  distinct()
+
+# Generate key for mapping feature full names to feature short names
+feature.key <- select(xmarker, Feature, FeatureAbbr) %>% distinct()
+
+# Drop channels for select markersets
+lapply(markers, function(m) {
+  print(m)
+  xmeta.m <- dplyr::filter(xmeta, Markers == m)
+  plate.ids <- unique(xmeta.m$PlateID)
+
+  xm <- lapply(plate.ids, function(p) {
+    data.file <- str_c("cbfeature_", type, "_profiles.csv")
+    data.path <- file.path(data.dir, p, "ks_profiles", data.file)
+    fread(data.path)
+  })
+
+  # Merge data / metadata
+  xm <- rbindlist(xm) %>%
+    mutate(ID = str_c(PlateID, "_", ID)) %>%
+    dplyr::select(-PlateID) %>%
+    left_join(xmeta.m, by = "ID")
+
+  if (raw) {
+    features <- str_replace_all(xmarker$Feature, "\\*", "\\.\\*")
+    features <- str_c("X\\.\\.", features) %>% str_c(collapse = "|")
+    xf <- dplyr::select(xm, matches(str_c("(", features, ")")))
+
+    channels <- str_remove_all(colnames(xf), "^.*\\.\\.") %>% str_split("")
+    id.channel <- sapply(channels, function(ch) all(ch %in% 2:3))
+    xf <- dplyr::select_if(xf, id.channel)
+    xf.meta <- dplyr::select(xm, -matches("^X"))
+    x <- cbind(xf.meta, xf) %>% mutate_if(is.numeric, median_impute)
+    save(file = file.path(output.dir, "profiles_raw.Rdata"), x)
+  } else {
+    # Reduce dataset based on select features
+    xm <- subset_features(xm, m, xmarker$Channel, xmarker$Feature, pct.var)
+
+    # Update column names with feature abbrevaiation
+    col.names <- str_split(colnames(xm), "\\.\\.")
+    col.names <- sapply(col.names, function(z) {
+      if (length(z) == 1) {
+        return(z)
+      }
+      z.feat <- str_remove_all(z[2], "\\.")
+      z.feat <- feature.key$FeatureAbbr[feature.key$Feature == z.feat]
+      out <- str_c(z[1], "..", z.feat, "..", z[3])
+      return(str_replace_all(out, ":PC", ".."))
+    })
+
+    colnames(xm) <- col.names
+    x <- xm
+
+    output.path <- str_c(output.dir, "profiles_", m, ".Rdata")
+    save(file = output.path, x)
+  }
+})
diff --git a/preprocessing/make_eigenfeatures_well.R b/preprocessing/make_eigenfeatures_well.R
new file mode 100644
index 0000000..0b97777
--- /dev/null
+++ b/preprocessing/make_eigenfeatures_well.R
@@ -0,0 +1,150 @@
+library(tidyverse)
+library(data.table)
+library(readxl)
+
+setwd("~/github/projectALS/")
+source("121820_SporadicScreen/utilities.R")
+output.dir <- "data/profiles/121820_SporadicScreen_MergedWell_Condensed/"
+dir.create(output.dir, showWarnings = FALSE)
+
+################################################################################
+# Data loading
+################################################################################
+# Set paths to datasets used for integrated analysis
+data.path.1 <- "data/profiles/110420_BiomarkerScreen/cbfeature_profiles.csv"
+data.path.2 <- "data/profiles/121820_SporadicScreen/cbfeature_profiles.csv"
+
+meta.path.1 <- "data/profiles/110420_BiomarkerScreen/metadata.csv"
+meta.path.2 <- "data/profiles/121820_SporadicScreen/metadata.csv"
+
+desc.path.1 <- "data/profiles/110420_BiomarkerScreen/feature_descriptors.csv"
+desc.path.2 <- "data/profiles/121820_SporadicScreen/feature_descriptors.csv"
+feature.path <- "data/features_select_condensed.csv"
+
+# Load in data from 11/20 and 12/20 screens
+x1 <- fread(data.path.1) %>%
+  dplyr::mutate(ID = str_c(PlateID, ID, sep = "_")) %>%
+  dplyr::select(-PlateID) %>%
+  dplyr::filter(!Bootstrap)
+
+x2 <- fread(data.path.2) %>%
+  dplyr::mutate(ID = str_c(PlateID, ID, sep = "_")) %>%
+  dplyr::select(-PlateID) %>%
+  dplyr::filter(!Bootstrap)
+
+# Load in metadata from 11/20 and 12/20 screens
+xmeta1 <- fread(meta.path.1)
+x1 <- left_join(x1, xmeta1, by = "ID")
+x1 <- as.data.frame(x1) %>% mutate(Screen = "1120")
+
+# Match marker set names
+x1$Markers <- str_replace_all(x1$Markers, "_EEA1 _CD63", "_CD63 _EEA1")
+x1$Markers <- str_replace_all(x1$Markers, "p-p62", "phospho-p62")
+
+xmeta2 <- fread(meta.path.2)
+x2 <- left_join(x2, xmeta2, by = "ID")
+x2 <- as.data.frame(x2) %>% mutate(Screen = "1220")
+
+# Merge data and metadata
+x <- rbindlist(list(x1, x2), fill = TRUE)
+
+# Update marker set names
+x <- mutate(x, Markers = str_remove_all(Markers, " "))
+markers <- unique(x$Markers)
+
+# Update compound names
+x <- mutate(x, Compounds = str_replace_all(Compounds, "None", "Control")) %>%
+  mutate(Compounds = str_replace_all(Compounds, "0.1%_DMSO", "Control")) %>%
+  mutate(CellLine = str_remove_all(CellLine, " ")) %>%
+  mutate(CellLine = str_replace_all(CellLine, "308", "ALS308")) %>%
+  mutate(CellLine = str_replace_all(CellLine, "309", "ALS309"))
+
+# Load in cell line data
+xcl <- fread("data/2020_Fibroblasts_Library.csv") %>%
+  dplyr::rename(CellLine = `Cell line`) %>%
+  dplyr::rename(CellLineGenetics = `Disease Gene`) %>%
+  dplyr::select(CellLine, CellLineGenetics) %>%
+  mutate(CellLine = str_remove_all(CellLine, " "))
+
+x <- left_join(x, xcl, by = "CellLine")
+
+# Load in feature desc file
+xdesc1 <- fread(desc.path.1) %>%
+  mutate(Full = str_c("X..", Feature, "..", Channel)) %>%
+  mutate(Full = str_replace_all(Full, "-", "."))
+
+xdesc2 <- fread(desc.path.2) %>%
+  mutate(Full = str_c("X..", Feature, "..", Channel)) %>%
+  mutate(Full = str_replace_all(Full, "-", "."))
+
+xdesc <- rbindlist(list(xdesc1, xdesc2)) %>% distinct()
+
+# Load in select features file
+xmarker <- fread(feature.path) %>%
+  rename(Feature = feature_name) %>%
+  rename(FeatureAbbr = feature_short) %>%
+  rename(Channel = channel) %>%
+  rename(Group = group) %>%
+  distinct() %>%
+  select(-V5, -V6)
+
+################################################################################
+# Data preprocessing
+################################################################################
+# Markers for which to generate datasets
+markers <- c(
+  "CD63_EEA1",
+  "EEA1",
+  "CD63",
+  "p62_phospho-p62S403",
+  "p62",
+  "phospho-p62S403",
+  "LAMP1_TFEB",
+  "LAMP1",
+  "TFEB",
+  "FUS",
+  "TIAR"
+)
+
+# Filter to WT cell lines shared across screens
+x.lines <- filter(x, CellLineGenetics == "WT") %>%
+  group_by(Screen) %>%
+  summarize(CellLine = list(unique(CellLine)), .groups = "drop")
+
+# Normalize data by median shift over shared WT cell lines
+shared.lines <- reduce(x.lines$CellLine, intersect)
+
+# Generate key for mapping feature full names to feature short names
+feature.key <- select(xmarker, Feature, FeatureAbbr) %>% distinct()
+
+# Process each markerset independently and return as list
+lapply(markers, function(m) {
+  # Drop channels for single marker evaluation
+  if (m %in% c("FUS", "EEA1", "phospho-p62S403", "TFEB")) {
+    xmarker <- filter(xmarker, Channel == 3)
+  } else if (m %in% c("TIAR", "p62", "CD63", "LAMP1")) {
+    xmarker <- filter(xmarker, Channel == 2)
+  }
+
+  print(str_c("Processing markerset: ", m, "..."))
+  xm <- subset_features(x, m, xmarker$Channel, xmarker$Feature, 0.95)
+  xm <- batch_normalize(as.data.frame(xm), shared.lines)
+
+  # Update column names with feature abbrevaiation
+  col.names <- str_split(colnames(xm), "\\.\\.")
+  col.names <- sapply(col.names, function(z) {
+    if (length(z) == 1) {
+      return(z)
+    }
+    z.feat <- str_remove_all(z[2], "\\.")
+    z.feat <- feature.key$FeatureAbbr[feature.key$Feature == z.feat]
+    out <- str_c(z[1], "..", z.feat, "..", z[3])
+    return(str_replace_all(out, ":PC", ".."))
+  })
+
+  colnames(xm) <- col.names
+  x <- xm
+
+  output.path <- str_c(output.dir, "cbfeature_profiles_", m, ".Rdata")
+  save(file = output.path, x)
+})
diff --git a/preprocessing/utilities.R b/preprocessing/utilities.R
new file mode 100644
index 0000000..5ac4f97
--- /dev/null
+++ b/preprocessing/utilities.R
@@ -0,0 +1,60 @@
+library(tidyverse)
+library(data.table)
+
+median_impute <- function(x) {
+  x[is.na(x)] <- median(x, na.rm = TRUE)
+  return(x)
+}
+
+subset_features <- function(x, marker, channels, features, pct.var = 0) {
+  # Filter dataset to selected markers/channels/features
+
+  # Filter to selected marker set
+  xm <- filter(x, str_detect(Markers, marker)) %>%
+    select_if(function(col) !all(is.na(col)))
+
+  # Filter to select channels
+  channels.select <- unlist(str_split(channels, ""))
+  c.drop <- str_c(setdiff(1:4, channels.select), collapse = "|")
+  f.drop <- str_c("X\\.\\..*\\.\\..*(", c.drop, ").*$")
+  id.drop <- str_detect(colnames(xm), f.drop)
+  xm <- select_if(xm, !id.drop)
+
+  # Filter to select features
+  features <- str_replace_all(features, "\\*", "\\.\\*")
+  features.select <- str_c("X..", features, "..", channels)
+  xf <- lapply(features.select, function(f) select(xm, matches(f)))
+
+  # Reduce dimensions of filtered data
+  if (pct.var != 0) {
+    xf <- lapply(xf, reduce_dim, pct.var = pct.var)
+    p <- sapply(xf, ncol)
+    features.select <- rep(features.select, times = p)
+    xf <- do.call(cbind, xf)
+    colnames(xf) <- str_c(features.select, ":", colnames(xf))
+  } else if (pct.var == "average") {
+    xf <- sapply(xf, rowMeans)
+    colnames(xf) <- features.select
+  } else {
+    xf <- do.call(cbind, xf)
+    xf <- mutate_all(xf, median_impute)
+  }
+
+  # Aggregatedata and metadata
+  meta.cols <- str_subset(colnames(xm), "^X", negate = TRUE)
+  xm <- select(xm, one_of(meta.cols))
+  out <- cbind(xm, xf)
+  return(out)
+}
+
+reduce_dim <- function(x, pct.var = 0.9) {
+  # Reduce dataset dimensionality with PCA
+  x <- mutate_all(x, median_impute)
+  pca <- prcomp(x)
+
+  # Evaluate # PCs required to obtain specified level of explained variability
+  var.exp <- cumsum(pca$sdev^2) / sum(pca$sdev^2)
+  n.pc <- min(which(var.exp > pct.var))
+  xout <- pca$x[, 1:n.pc, drop = FALSE]
+  return(xout)
+}