-
Notifications
You must be signed in to change notification settings - Fork 0
/
Step1_Seurat_processing_steps.R
228 lines (196 loc) · 14.6 KB
/
Step1_Seurat_processing_steps.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
#
# --------- General Seurat pipeline processing steps adopted ---------
#
# Here we list the general sequence of steps we took to process the 10X scRNAseq
# data in the paper titled, "CaSSiDI: novel single-cell "Cluster Similarity
# Scoring and Distinction Index" reveals critical functions for PirB and
# context-dependent Cebpb repression" https://www.nature.com/articles/s41418-024-01268-8
#
# These steps precede the computation of the Scores 1 to 8
# (see the two functions: "CaSSiDI_Scores_1_to_7.R", "CaSSiDI_Score8.R")
#
# Code tested on Seurat v3.1.0
#
# Generally following the steps in the Seurat v3.1 tutorial:
# https://satijalab.org/seurat/archive/v3.1/pbmc3k_tutorial
library(dplyr)
library(Matrix)
library(cowplot)
library(gridExtra)
library(RColorBrewer)
library(tools)
library(gplots)
library(ggplot2)
library(ComplexHeatmap)
library(circlize)
library(Seurat)
# Setting some basic variables
# The following general variables can be set as per preference
# Please make sure to change these as needed.
sample_type <- "scd" # A string describing the sample type ; examples: "scd", blood", "cancer"
subtype1 <- "WT" # subtype1 info ; examples: "WT", "KO", "tumor", "healthy"
subtype2 <- "KO" # subtype2 info ; examples: "WT", "KO", "tumor", "healthy"
subtype1_dataPath <- "./" # full path of the 10X data files - usually matrix.mtx, genes.tsv (or features.tsv), and barcodes.tsv files provided by 10X
subtype2_dataPath <- "./" # full path of the 10X data files - usually matrix.mtx, genes.tsv (or features.tsv), and barcodes.tsv files provided by 10X
spath <- "./" # path of a directory to save any output files
nPCs <- 10 # Number of principal components to use for FindNeighbors, TSNE, and UMAP plots
maytag1 <- "" # additional label to tag useful info to any results filenames ; examples: "resolution0p6"
maytag2 <- "" # additional label to tag useful info to any results filenames ; examples: "nPCs10"
resval <- 0.4 # resolution value for clustering resolution in FindClusters
# Some additional parameters to set, specifically related to Seurat specifications
# Please make sure to change these from the default values assigned, as needed.
#
pcstostore <- 100 # Total Number of PCs to compute and store
PCsToUse <- 1:nPCs # Number of principal components to use for FindNeighbors, TSNE, and UMAP plots
minCellsThr <- 3 # Seurat parameter - Include features detected in at least this many cells
minGenesThr <- 200 # Seurat parameter - Include cells where at least this many features are detected
maXnGeneThr <- 8000 # Seurat QC metric - We filter out cells that have unique gene counts over maXnGeneThr, to avoid multiplets
maXmitoThr <- 30 # Seurat parameter ; upper limit for mitochondrial contamination tolerance;
# This is actual percentage in SeuratV3. So, instead of .3, we should say 30.
adj_pvalue_thr <- 0.05 # Bonferroni-adjusted p-values threshold for marker genes list cutoff
cpperp <- 100 # RunTSNE Perplexity value
iter_tsne <- 3000 # RunTSNE maximum number of iterations
short <- substr(sample_type, 1, 4) # A shorter version of the sample_type string, to avoid filenames from getting too long
cat("%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%\n")
cat("Seurat RUN for Sample - ", sample_type, "\n")
cat("%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%\n")
# Making a named vector for the read10XResults function
datavec <- c(subtype1_dataPath, subtype2_dataPath)
names(datavec) <- c(subtype1, subtype2)
# Load the Aggregated dataset
scd.data <- Read10X(data.dir=datavec)
cat("\nNumber of [ GENES / CELLS ] in the RAW data: [", scd.data@Dim[1], "/", scd.data@Dim[2], "]\n")
cat("\nSubsetting and separating out the two subtypes ...\n")
scd1.data <- scd.data[,grep(paste0("^",subtype1), colnames(scd.data), value=TRUE)]
scd2.data <- scd.data[,grep(paste0("^",subtype2), colnames(scd.data), value=TRUE)]
cat("\nNumber of [ GENES / CELLS ] in :",subtype1," subset raw data [", scd1.data@Dim[1], "/", scd1.data@Dim[2], "]\n")
cat("\nNumber of [ GENES / CELLS ] in :",subtype2," subset raw data [", scd2.data@Dim[1], "/", scd2.data@Dim[2], "]\n")
cat("In BOTH subset data, WE KEEP all genes expressed in >=", minCellsThr,"CELLS | AND | KEEP all CELLS with >=", minGenesThr,"expressed GENES \n")
# Initialize the Seurat object with the raw (non-normalized) data. Keep all
# genes expressed in >= minCellsThr cells. Keep all cells with at least minGenesThr detected genes
scd1 <- CreateSeuratObject(counts = scd1.data, min.cells = minCellsThr, min.features = minGenesThr, project = "10X_MPhages")
scd2 <- CreateSeuratObject(counts = scd2.data, min.cells = minCellsThr, min.features = minGenesThr, project = "10X_MPhages")
cat("\nAfter creating the two Seurat objects, \n")
cat("\nNumber of [ GENES / CELLS ] IN",subtype1,"Seurat object: [", dim(GetAssayData(object = scd1))[1], "/", dim(GetAssayData(object = scd1))[2], "]\n")
cat("\nNumber of [ GENES / CELLS ] IN",subtype2,"Seurat object: [", dim(GetAssayData(object = scd2))[1], "/", dim(GetAssayData(object = scd2))[2], "]\n")
# MITO percentage calculations
scd1[["percent.mito"]] <- Seurat::PercentageFeatureSet(scd1, pattern = "^mt-")
scd2[["percent.mito"]] <- Seurat::PercentageFeatureSet(scd2, pattern = "^mt-")
# We filter out cells that have unique gene counts over maXnGeneThr or less than
# 200 ; also those greater than maXmitoThr
cat("\nApplying FURTHER FILTERING to both Seurat Objects: maXnGeneThr=", maXnGeneThr, " | maXmitoThr=", maXmitoThr, "\n")
scd1 <- subset(scd1, subset = nFeature_RNA > 200 & nFeature_RNA < maXnGeneThr & percent.mito < maXmitoThr)
scd2 <- subset(scd2, subset = nFeature_RNA > 200 & nFeature_RNA < maXnGeneThr & percent.mito < maXmitoThr)
cat("\nNumber of [ GENES / CELLS ] IN",subtype1,"Seurat object remaining AFTER maXnGeneThr/maXmitoThr Thresholding: [", dim(GetAssayData(object = scd1))[1], "/", dim(GetAssayData(object = scd1))[2], "]\n")
cat("\nNumber of [ GENES / CELLS ] IN",subtype2,"Seurat object remaining AFTER maXnGeneThr/maXmitoThr Thresholding: [", dim(GetAssayData(object = scd2))[1], "/", dim(GetAssayData(object = scd2))[2], "]\n")
# Simple Log NORMALIZATION - Here, we DO NOT do SCT, but do the standard normalization
# We also use this for the CellCycle scores computations
scd1 <- Seurat::NormalizeData(object = scd1, normalization.method = "LogNormalize", scale.factor = 10000)
scd2 <- Seurat::NormalizeData(object = scd2, normalization.method = "LogNormalize", scale.factor = 10000)
# Cell-Cycle Scoring
cat("\nComputing cell-cycle scores ... ")
# A list of cell cycle markers, from Tirosh et al, 2015, is loaded with Seurat.
# We can segregate this list into markers of G2/M phase and markers of S phase
s.genes <- toTitleCase(tolower(cc.genes$s.genes))
g2m.genes <- toTitleCase(tolower(cc.genes$g2m.genes))
# Assign Cell-Cycle Scores
scd1 <- Seurat::CellCycleScoring(object = scd1, s.features = s.genes, g2m.features = g2m.genes, set.ident = F)
scd2 <- Seurat::CellCycleScoring(object = scd2, s.features = s.genes, g2m.features = g2m.genes, set.ident = F)
cat("DONE!\n")
# Since we are NOT running SCT, but instead, are doing the old
# Log-Normalization, we follow the steps in the tutorial
# https://satijalab.org/seurat/archive/v3.1/pbmc3k_tutorial
# That is---we have to individually run NormalizeData, FindVariableFeatures, and ScaleData.
# If we were to use SCTransform, then all these three commands are integrated into
# that one command. Otherwise, got to do them individually.
cat("\nRUNNING FindVariableFeatures ... ")
scd1 <- FindVariableFeatures(scd1, selection.method = "vst", nfeatures = 750)
scd2 <- FindVariableFeatures(scd2, selection.method = "vst", nfeatures = 750)
cat("\nRUNNING ScaleData with regressing out ... ")
scd1 <- ScaleData(scd1, features = rownames(scd1), vars.to.regress = c("nCount_RNA", "percent.mito", "S.Score", "G2M.Score"))
scd2 <- ScaleData(scd2, features = rownames(scd2), vars.to.regress = c("nCount_RNA", "percent.mito", "S.Score", "G2M.Score"))
cat("DONE!\n")
# Now doing PCA
# PCA
cat("\nComputing PCA ... \n")
scd1 <- Seurat::RunPCA(object=scd1, features = VariableFeatures(object = scd1), npcs = pcstostore)
scd2 <- Seurat::RunPCA(object=scd2, features = VariableFeatures(object = scd2), npcs = pcstostore)
cat("\n--- PCA task DONE! ---\n")
# Finding KNNs
cat("\nFinding Neighbors ... \n")
scd1 <- FindNeighbors(object=scd1, reduction = "pca", dims = PCsToUse, verbose = TRUE)
scd2 <- FindNeighbors(object=scd2, reduction = "pca", dims = PCsToUse, verbose = TRUE)
cat("\n--- Finding Neighbors DONE! ---\n")
# Finding Clusters!
cat("\nFinding Clusters ... \n")
scd1 <- FindClusters(object=scd1, resolution = resval, temp.file.location = maintemp, verbose = TRUE)
scd2 <- FindClusters(object=scd2, resolution = resval, temp.file.location = maintemp, verbose = TRUE)
cat("\n--- Finding Clusters DONE! ---\n")
# RUNNING TSNE
cat("\nStarting RunTSNE for Perplexity value of ", cpperp, "... for ", subtype1, "\n")
scd1 <- RunTSNE(object=scd1, dims = PCsToUse, max_iter=iter_tsne, perplexity=cpperp)
cat("\nStarting RunTSNE for Perplexity value of ", cpperp, "... for ", subtype2, "\n")
scd2 <- RunTSNE(object=scd2, dims = PCsToUse, max_iter=iter_tsne, perplexity=cpperp)
cat("\nFinished All Run tSNEs!!!\n")
# Running UMAP
cat("\nStarting UMAP run for ", subtype1)
scd1 <- RunUMAP(object=scd1, reduction = "pca", dims = PCsToUse)
cat("\nStarting UMAP run for ", subtype2)
scd2 <- RunUMAP(object=scd2, reduction = "pca", dims = PCsToUse)
# PLOTTING TSNEs & UMAP
cat("\nPlotting TSNEs and UMAPs ...")
ttp1 <- DimPlot(object=scd1, pt.size = 0.5, reduction = "tsne", group.by = paste0("RNA_snn_res.",resval), label = T, label.size = 5)
ttp1 <- ttp1 + theme(legend.position="bottom", plot.title = element_text(size=10)) + labs(title=paste0(short," ",subtype1, " TSNE | Res: ",resval, " | PCs: ",nPCs))
ttp2 <- DimPlot(object=scd1, pt.size = 0.5, reduction = "umap", group.by = paste0("RNA_snn_res.",resval), label = T, label.size = 5)
ttp2 <- ttp2 + theme(legend.position="bottom", plot.title = element_text(size=10)) + labs(title=paste0(short," ",subtype1, " UMAP | Res: ",resval, " | PCs: ",nPCs))
#
ttp3 <- DimPlot(object=scd2, pt.size = 0.5, reduction = "tsne", group.by = paste0("RNA_snn_res.",resval), label = T, label.size = 5)
ttp3 <- ttp3 + theme(legend.position="bottom", plot.title = element_text(size=10)) + labs(title=paste0(short," ",subtype2, " TSNE | Res: ",resval, " | PCs: ",nPCs))
ttp4 <- DimPlot(object=scd2, pt.size = 0.5, reduction = "umap", group.by = paste0("RNA_snn_res.",resval), label = T, label.size = 5)
ttp4 <- ttp4 + theme(legend.position="bottom", plot.title = element_text(size=10)) + labs(title=paste0(short," ",subtype2, " UMAP | Res: ",resval, " | PCs: ",nPCs))
#
gg12 <- cowplot::plot_grid(ttp1,ttp2,ttp3,ttp4, nrow=2, ncol=2, align="hv")
cowplot::save_plot(paste(spath,sample_type,"_",maytag1,"_",maytag2,"_TSNEsUMAP_", PCsToUse[[length(PCsToUse)]], "_PCs_TSNEsperp_", cpperp,".pdf",sep=""),
gg12, ncol=1, base_width=10, base_height = 13)
cat(" DONE!\n")
# Setting Tibble Print options
options(tibble.width = Inf)
options(tibble.print_max = Inf)
# Set Idents to the desired ones, just to be sure
Idents(object=scd1) <- paste0("RNA_snn_res.",resval)
Idents(object=scd2) <- paste0("RNA_snn_res.",resval)
# FIND Markers
cat("\nFinding Pos markers ... \n")
scd1.pos.markers <- FindAllMarkers(object=scd1, logfc.threshold = 0.25, test.use = "wilcox", min.pct = 0.1, only.pos = TRUE)
scd2.pos.markers <- FindAllMarkers(object=scd2, logfc.threshold = 0.25, test.use = "wilcox", min.pct = 0.1, only.pos = TRUE)
scd1.pos.markers$pct.1_minus_pct.2 <- scd1.pos.markers$pct.1 - scd1.pos.markers$pct.2
scd2.pos.markers$pct.1_minus_pct.2 <- scd2.pos.markers$pct.1 - scd2.pos.markers$pct.2
cat("\nFinding PosNeg markers ... \n")
scd1.posneg.markers <- FindAllMarkers(object=scd1, logfc.threshold = 0.25, test.use = "wilcox", min.pct = 0.1, only.pos = FALSE)
scd2.posneg.markers <- FindAllMarkers(object=scd2, logfc.threshold = 0.25, test.use = "wilcox", min.pct = 0.1, only.pos = FALSE)
scd1.posneg.markers$pct.1_minus_pct.2 <- scd1.posneg.markers$pct.1 - scd1.posneg.markers$pct.2
scd2.posneg.markers$pct.1_minus_pct.2 <- scd2.posneg.markers$pct.1 - scd2.posneg.markers$pct.2
# Writing markers to CSVs
cat("\nWriting Pos and PosNeg markers to CSVs ... \n")
write.csv(scd1.posneg.markers, file=paste0(spath,sample_type,"_",maytag1,"_",maytag2,"_SeprT_Wilcox_PosNeg_Mrkrs_",subtype1,".csv"))
write.csv(scd2.posneg.markers, file=paste0(spath,sample_type,"_",maytag1,"_",maytag2,"_SeprT_Wilcox_PosNeg_Mrkrs_",subtype2,".csv"))
write.csv(scd1.pos.markers, file=paste0(spath,sample_type,"_",maytag1,"_",maytag2,"_SeprT_Wilcox_Pos_Mrkrs_",subtype1,".csv"))
write.csv(scd2.pos.markers, file=paste0(spath,sample_type,"_",maytag1,"_",maytag2,"_SeprT_Wilcox_Pos_Mrkrs_",subtype2,".csv"))
# -----------------------------------------------------------------------------------
# CODE FOR finding UNIQUELY-OCCURRING ONLY POS (UP) markers and BOTH POS and Neg markers! -
# unique meaning, that marker occurs ONLY in one cluster!
# THEN saving such unique markers separately!
# -----------------------------------------------------------------------------------
# All
scd1.posneg.markers_UniquelyOccrrng <- scd1.posneg.markers[!(duplicated(scd1.posneg.markers$gene) | duplicated(scd1.posneg.markers$gene, fromLast=TRUE)),]
scd2.posneg.markers_UniquelyOccrrng <- scd2.posneg.markers[!(duplicated(scd2.posneg.markers$gene) | duplicated(scd2.posneg.markers$gene, fromLast=TRUE)),]
scd1.pos.markers_UniquelyOccrrng <- scd1.pos.markers[!(duplicated(scd1.pos.markers$gene) | duplicated(scd1.pos.markers$gene, fromLast=TRUE)),]
scd2.pos.markers_UniquelyOccrrng <- scd2.pos.markers[!(duplicated(scd2.pos.markers$gene) | duplicated(scd2.pos.markers$gene, fromLast=TRUE)),]
cat("\nWriting UNIQUEly Occurring markers to CSVs ... \n")
write.csv(scd1.posneg.markers_UniquelyOccrrng, file=paste0(spath,sample_type,"_",maytag1,"_",maytag2,"_SeprT_Wilcox_ALL_PosNeg_UniquelyOccrrng_Mrkrs_",subtype1,".csv"))
write.csv(scd2.posneg.markers_UniquelyOccrrng, file=paste0(spath,sample_type,"_",maytag1,"_",maytag2,"_SeprT_Wilcox_ALL_PosNeg_UniquelyOccrrng_Mrkrs_",subtype2,".csv"))
write.csv(scd1.pos.markers_UniquelyOccrrng, file=paste0(spath,sample_type,"_",maytag1,"_",maytag2,"_SeprT_Wilcox_ALL_Pos_UniquelyOccrrng_Mrkrs_",subtype1,".csv"))
write.csv(scd2.pos.markers_UniquelyOccrrng, file=paste0(spath,sample_type,"_",maytag1,"_",maytag2,"_SeprT_Wilcox_ALL_Pos_UniquelyOccrrng_Mrkrs_",subtype2,".csv"))
# Saving workspace (saves the entire workspace, if needed)
# save.image(file=paste(spath,sample_type,"_",maytag1,"_",maytag2,".RData",sep=""))