Step1_Seurat_processing_steps.R

# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
# 
# ---------  General Seurat pipeline processing steps adopted ---------
#
# Here we list the general sequence of steps we took to process the 10X scRNAseq
# data in the paper titled, "CaSSiDI: novel single-cell "Cluster Similarity
# Scoring and Distinction Index" reveals critical functions for PirB and
# context-dependent Cebpb repression" https://www.nature.com/articles/s41418-024-01268-8
# 
# These steps precede the computation of the Scores 1 to 8 
# (see the two functions: "CaSSiDI_Scores_1_to_7.R", "CaSSiDI_Score8.R")
# 
# Code tested on Seurat v3.1.0
# 
# Generally following the steps in the Seurat v3.1 tutorial:
# https://satijalab.org/seurat/archive/v3.1/pbmc3k_tutorial

library(dplyr)
library(Matrix)
library(cowplot)
library(gridExtra)
library(RColorBrewer)
library(tools)
library(gplots)
library(ggplot2)
library(ComplexHeatmap)
library(circlize)
library(Seurat)

# Setting some basic variables
# The following general variables can be set as per preference
# Please make sure to change these as needed.
sample_type       <- "scd" # A string describing the sample type ; examples: "scd", blood", "cancer"
subtype1          <- "WT"   # subtype1 info ; examples: "WT", "KO", "tumor", "healthy"
subtype2          <- "KO"   # subtype2 info ; examples: "WT", "KO", "tumor", "healthy"
subtype1_dataPath <- "./"   # full path of the 10X data files - usually matrix.mtx, genes.tsv (or features.tsv), and barcodes.tsv files provided by 10X
subtype2_dataPath <- "./"   # full path of the 10X data files - usually matrix.mtx, genes.tsv (or features.tsv), and barcodes.tsv files provided by 10X
spath             <- "./"   # path of a directory to save any output files
nPCs              <- 10     # Number of principal components to use for FindNeighbors, TSNE, and UMAP plots
maytag1           <- ""     # additional label to tag useful info to any results filenames ; examples: "resolution0p6"
maytag2           <- ""     # additional label to tag useful info to any results filenames ; examples: "nPCs10"
resval          <- 0.4      # resolution value for clustering resolution in FindClusters

# Some additional parameters to set, specifically related to Seurat specifications
# Please make sure to change these from the default values assigned, as needed.
# 
pcstostore <- 100       # Total Number of PCs to compute and store
PCsToUse <- 1:nPCs      # Number of principal components to use for FindNeighbors, TSNE, and UMAP plots
minCellsThr <- 3        # Seurat parameter - Include features detected in at least this many cells
minGenesThr <- 200      # Seurat parameter - Include cells where at least this many features are detected
maXnGeneThr <- 8000     # Seurat QC metric - We filter out cells that have unique gene counts over maXnGeneThr, to avoid multiplets
maXmitoThr <- 30        # Seurat parameter ; upper limit for mitochondrial contamination tolerance; 
                        # This is actual percentage in SeuratV3. So, instead of .3, we should say 30.
adj_pvalue_thr <- 0.05  # Bonferroni-adjusted p-values threshold for marker genes list cutoff
cpperp <- 100           # RunTSNE Perplexity value
iter_tsne <- 3000       # RunTSNE maximum number of iterations
short <- substr(sample_type, 1, 4) # A shorter version of the sample_type string, to avoid filenames from getting too long

cat("%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%\n")
cat("Seurat RUN for Sample - ", sample_type, "\n")
cat("%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%\n")

# Making a named vector for the read10XResults function
datavec <- c(subtype1_dataPath, subtype2_dataPath)
names(datavec) <- c(subtype1, subtype2)

# Load the Aggregated dataset
scd.data <- Read10X(data.dir=datavec)

cat("\nNumber of [ GENES / CELLS ] in the RAW data: [", scd.data@Dim[1], "/", scd.data@Dim[2], "]\n")

cat("\nSubsetting and separating out the two subtypes ...\n")
scd1.data <- scd.data[,grep(paste0("^",subtype1), colnames(scd.data), value=TRUE)]
scd2.data <- scd.data[,grep(paste0("^",subtype2), colnames(scd.data), value=TRUE)]

cat("\nNumber of [ GENES / CELLS ] in :",subtype1," subset raw data [", scd1.data@Dim[1], "/", scd1.data@Dim[2], "]\n")
cat("\nNumber of [ GENES / CELLS ] in :",subtype2," subset raw data [", scd2.data@Dim[1], "/", scd2.data@Dim[2], "]\n")

cat("In BOTH subset data, WE KEEP all genes expressed in >=", minCellsThr,"CELLS | AND | KEEP all CELLS with >=", minGenesThr,"expressed GENES \n")

# Initialize the Seurat object with the raw (non-normalized) data.  Keep all
# genes expressed in >= minCellsThr cells. Keep all cells with at least minGenesThr detected genes
scd1 <- CreateSeuratObject(counts = scd1.data, min.cells = minCellsThr, min.features = minGenesThr, project = "10X_MPhages")
scd2 <- CreateSeuratObject(counts = scd2.data, min.cells = minCellsThr, min.features = minGenesThr, project = "10X_MPhages")

cat("\nAfter creating the two Seurat objects, \n")
cat("\nNumber of [ GENES / CELLS ] IN",subtype1,"Seurat object: [", dim(GetAssayData(object = scd1))[1], "/", dim(GetAssayData(object = scd1))[2], "]\n")
cat("\nNumber of [ GENES / CELLS ] IN",subtype2,"Seurat object: [", dim(GetAssayData(object = scd2))[1], "/", dim(GetAssayData(object = scd2))[2], "]\n")

# MITO percentage calculations
scd1[["percent.mito"]] <- Seurat::PercentageFeatureSet(scd1, pattern = "^mt-")
scd2[["percent.mito"]] <- Seurat::PercentageFeatureSet(scd2, pattern = "^mt-")

# We filter out cells that have unique gene counts over maXnGeneThr or less than
# 200 ; also those greater than maXmitoThr
cat("\nApplying FURTHER FILTERING to both Seurat Objects: maXnGeneThr=", maXnGeneThr, "  | maXmitoThr=", maXmitoThr, "\n")
scd1 <- subset(scd1, subset = nFeature_RNA > 200 & nFeature_RNA < maXnGeneThr & percent.mito < maXmitoThr)
scd2 <- subset(scd2, subset = nFeature_RNA > 200 & nFeature_RNA < maXnGeneThr & percent.mito < maXmitoThr)

cat("\nNumber of [ GENES / CELLS ] IN",subtype1,"Seurat object remaining AFTER maXnGeneThr/maXmitoThr Thresholding: [", dim(GetAssayData(object = scd1))[1], "/", dim(GetAssayData(object = scd1))[2], "]\n")
cat("\nNumber of [ GENES / CELLS ] IN",subtype2,"Seurat object remaining AFTER maXnGeneThr/maXmitoThr Thresholding: [", dim(GetAssayData(object = scd2))[1], "/", dim(GetAssayData(object = scd2))[2], "]\n")

# Simple Log NORMALIZATION - Here, we DO NOT do SCT, but do the standard normalization
# We also use this for the CellCycle scores computations
scd1 <- Seurat::NormalizeData(object = scd1, normalization.method = "LogNormalize", scale.factor = 10000)
scd2 <- Seurat::NormalizeData(object = scd2, normalization.method = "LogNormalize", scale.factor = 10000)

# Cell-Cycle Scoring
cat("\nComputing cell-cycle scores ... ")
# A list of cell cycle markers, from Tirosh et al, 2015, is loaded with Seurat.
# We can segregate this list into markers of G2/M phase and markers of S phase
s.genes <- toTitleCase(tolower(cc.genes$s.genes))
g2m.genes <- toTitleCase(tolower(cc.genes$g2m.genes))
# Assign Cell-Cycle Scores
scd1 <- Seurat::CellCycleScoring(object = scd1, s.features = s.genes, g2m.features = g2m.genes, set.ident = F)
scd2 <- Seurat::CellCycleScoring(object = scd2, s.features = s.genes, g2m.features = g2m.genes, set.ident = F)
cat("DONE!\n")

# Since we are NOT running SCT, but instead, are doing the old
# Log-Normalization, we follow the steps in the tutorial
# https://satijalab.org/seurat/archive/v3.1/pbmc3k_tutorial
# That is---we have to individually run NormalizeData, FindVariableFeatures, and ScaleData.  
# If we were to use SCTransform, then all these three commands are integrated into
# that one command.  Otherwise, got to do them individually.
cat("\nRUNNING FindVariableFeatures ... ")
scd1 <- FindVariableFeatures(scd1, selection.method = "vst", nfeatures = 750)
scd2 <- FindVariableFeatures(scd2, selection.method = "vst", nfeatures = 750)
cat("\nRUNNING ScaleData with regressing out ... ")
scd1 <- ScaleData(scd1, features = rownames(scd1), vars.to.regress = c("nCount_RNA", "percent.mito", "S.Score", "G2M.Score"))
scd2 <- ScaleData(scd2, features = rownames(scd2), vars.to.regress = c("nCount_RNA", "percent.mito", "S.Score", "G2M.Score"))
cat("DONE!\n")

# Now doing PCA
# PCA
cat("\nComputing PCA ... \n")
scd1 <- Seurat::RunPCA(object=scd1, features = VariableFeatures(object = scd1), npcs = pcstostore)
scd2 <- Seurat::RunPCA(object=scd2, features = VariableFeatures(object = scd2), npcs = pcstostore)
cat("\n--- PCA task DONE! ---\n")

# Finding KNNs
cat("\nFinding Neighbors ... \n")
scd1 <- FindNeighbors(object=scd1, reduction = "pca", dims = PCsToUse, verbose = TRUE)
scd2 <- FindNeighbors(object=scd2, reduction = "pca", dims = PCsToUse, verbose = TRUE)
cat("\n--- Finding Neighbors DONE! ---\n")

# Finding Clusters!
cat("\nFinding Clusters ... \n")
scd1 <- FindClusters(object=scd1, resolution = resval, temp.file.location = maintemp, verbose = TRUE)
scd2 <- FindClusters(object=scd2, resolution = resval, temp.file.location = maintemp, verbose = TRUE)
cat("\n--- Finding Clusters DONE! ---\n")

# RUNNING TSNE
cat("\nStarting RunTSNE for Perplexity value of ", cpperp, "... for ", subtype1, "\n")
scd1 <- RunTSNE(object=scd1, dims = PCsToUse, max_iter=iter_tsne, perplexity=cpperp)
cat("\nStarting RunTSNE for Perplexity value of ", cpperp, "... for ", subtype2, "\n")
scd2 <- RunTSNE(object=scd2, dims = PCsToUse, max_iter=iter_tsne, perplexity=cpperp)
cat("\nFinished All Run tSNEs!!!\n")

# Running UMAP
cat("\nStarting UMAP run for ", subtype1)
scd1 <- RunUMAP(object=scd1, reduction = "pca", dims = PCsToUse)
cat("\nStarting UMAP run for ", subtype2)
scd2 <- RunUMAP(object=scd2, reduction = "pca", dims = PCsToUse)

# PLOTTING TSNEs & UMAP
cat("\nPlotting TSNEs and UMAPs ...")
ttp1 <- DimPlot(object=scd1, pt.size = 0.5, reduction = "tsne", group.by = paste0("RNA_snn_res.",resval), label = T, label.size = 5)
ttp1 <- ttp1 + theme(legend.position="bottom", plot.title = element_text(size=10)) + labs(title=paste0(short," ",subtype1, " TSNE | Res: ",resval, " | PCs: ",nPCs))
ttp2 <- DimPlot(object=scd1, pt.size = 0.5, reduction = "umap", group.by = paste0("RNA_snn_res.",resval), label = T, label.size = 5)
ttp2 <- ttp2 + theme(legend.position="bottom", plot.title = element_text(size=10)) + labs(title=paste0(short," ",subtype1, " UMAP | Res: ",resval, " | PCs: ",nPCs))
# 
ttp3 <- DimPlot(object=scd2, pt.size = 0.5, reduction = "tsne", group.by = paste0("RNA_snn_res.",resval), label = T, label.size = 5)
ttp3 <- ttp3 + theme(legend.position="bottom", plot.title = element_text(size=10)) + labs(title=paste0(short," ",subtype2, " TSNE | Res: ",resval, " | PCs: ",nPCs))
ttp4 <- DimPlot(object=scd2, pt.size = 0.5, reduction = "umap", group.by = paste0("RNA_snn_res.",resval), label = T, label.size = 5)
ttp4 <- ttp4 + theme(legend.position="bottom", plot.title = element_text(size=10)) + labs(title=paste0(short," ",subtype2, " UMAP | Res: ",resval, " | PCs: ",nPCs))
# 
gg12 <- cowplot::plot_grid(ttp1,ttp2,ttp3,ttp4, nrow=2, ncol=2, align="hv")
cowplot::save_plot(paste(spath,sample_type,"_",maytag1,"_",maytag2,"_TSNEsUMAP_", PCsToUse[[length(PCsToUse)]], "_PCs_TSNEsperp_", cpperp,".pdf",sep=""),
                   gg12, ncol=1, base_width=10, base_height = 13)
cat(" DONE!\n")

# Setting Tibble Print options
options(tibble.width = Inf)
options(tibble.print_max = Inf)

# Set Idents to the desired ones, just to be sure
Idents(object=scd1) <- paste0("RNA_snn_res.",resval)
Idents(object=scd2) <- paste0("RNA_snn_res.",resval)

# FIND Markers 
cat("\nFinding Pos markers ... \n")
scd1.pos.markers <- FindAllMarkers(object=scd1, logfc.threshold = 0.25, test.use = "wilcox", min.pct = 0.1, only.pos = TRUE)
scd2.pos.markers <- FindAllMarkers(object=scd2, logfc.threshold = 0.25, test.use = "wilcox", min.pct = 0.1, only.pos = TRUE)
scd1.pos.markers$pct.1_minus_pct.2 <- scd1.pos.markers$pct.1 - scd1.pos.markers$pct.2
scd2.pos.markers$pct.1_minus_pct.2 <- scd2.pos.markers$pct.1 - scd2.pos.markers$pct.2

cat("\nFinding PosNeg markers ... \n")
scd1.posneg.markers <- FindAllMarkers(object=scd1, logfc.threshold = 0.25, test.use = "wilcox", min.pct = 0.1, only.pos = FALSE)
scd2.posneg.markers <- FindAllMarkers(object=scd2, logfc.threshold = 0.25, test.use = "wilcox", min.pct = 0.1, only.pos = FALSE)
scd1.posneg.markers$pct.1_minus_pct.2 <- scd1.posneg.markers$pct.1 - scd1.posneg.markers$pct.2
scd2.posneg.markers$pct.1_minus_pct.2 <- scd2.posneg.markers$pct.1 - scd2.posneg.markers$pct.2

# Writing markers to CSVs
cat("\nWriting Pos and PosNeg markers to CSVs ... \n")
write.csv(scd1.posneg.markers, file=paste0(spath,sample_type,"_",maytag1,"_",maytag2,"_SeprT_Wilcox_PosNeg_Mrkrs_",subtype1,".csv"))
write.csv(scd2.posneg.markers, file=paste0(spath,sample_type,"_",maytag1,"_",maytag2,"_SeprT_Wilcox_PosNeg_Mrkrs_",subtype2,".csv"))
write.csv(scd1.pos.markers, file=paste0(spath,sample_type,"_",maytag1,"_",maytag2,"_SeprT_Wilcox_Pos_Mrkrs_",subtype1,".csv"))
write.csv(scd2.pos.markers, file=paste0(spath,sample_type,"_",maytag1,"_",maytag2,"_SeprT_Wilcox_Pos_Mrkrs_",subtype2,".csv"))

# -----------------------------------------------------------------------------------
# CODE FOR finding UNIQUELY-OCCURRING ONLY POS (UP) markers and BOTH POS and Neg markers! -
# unique meaning, that marker occurs ONLY in one cluster!
# THEN saving such unique markers separately!
# -----------------------------------------------------------------------------------
# All
scd1.posneg.markers_UniquelyOccrrng <- scd1.posneg.markers[!(duplicated(scd1.posneg.markers$gene) | duplicated(scd1.posneg.markers$gene, fromLast=TRUE)),]
scd2.posneg.markers_UniquelyOccrrng <- scd2.posneg.markers[!(duplicated(scd2.posneg.markers$gene) | duplicated(scd2.posneg.markers$gene, fromLast=TRUE)),]
scd1.pos.markers_UniquelyOccrrng <- scd1.pos.markers[!(duplicated(scd1.pos.markers$gene) | duplicated(scd1.pos.markers$gene, fromLast=TRUE)),]
scd2.pos.markers_UniquelyOccrrng <- scd2.pos.markers[!(duplicated(scd2.pos.markers$gene) | duplicated(scd2.pos.markers$gene, fromLast=TRUE)),]
cat("\nWriting UNIQUEly Occurring markers to CSVs ... \n")
write.csv(scd1.posneg.markers_UniquelyOccrrng, file=paste0(spath,sample_type,"_",maytag1,"_",maytag2,"_SeprT_Wilcox_ALL_PosNeg_UniquelyOccrrng_Mrkrs_",subtype1,".csv"))
write.csv(scd2.posneg.markers_UniquelyOccrrng, file=paste0(spath,sample_type,"_",maytag1,"_",maytag2,"_SeprT_Wilcox_ALL_PosNeg_UniquelyOccrrng_Mrkrs_",subtype2,".csv"))
write.csv(scd1.pos.markers_UniquelyOccrrng, file=paste0(spath,sample_type,"_",maytag1,"_",maytag2,"_SeprT_Wilcox_ALL_Pos_UniquelyOccrrng_Mrkrs_",subtype1,".csv"))
write.csv(scd2.pos.markers_UniquelyOccrrng, file=paste0(spath,sample_type,"_",maytag1,"_",maytag2,"_SeprT_Wilcox_ALL_Pos_UniquelyOccrrng_Mrkrs_",subtype2,".csv"))

# Saving workspace (saves the entire workspace, if needed)
# save.image(file=paste(spath,sample_type,"_",maytag1,"_",maytag2,".RData",sep=""))