Comparison of scGPS to established pseudo-time analysis methods
Load the dataset and packages
library(Seurat)
library(slingshot)
library(RColorBrewer)
library(monocle3)
library(SingleCellExperiment)
library(dplyr)
library(ggplot2)
library(scGPS)
library(locfit)
library(networkD3)
library(dplyr)
library(tidyr)
library(ggpubr)
library(igraph)
rm(list = ls())
# Load the dataset we need
dat <- readRDS("aging-hsc-old_kowalczyk.rds")
Data preprocessing (PCA, UMAP, DE analysis) by Seurat
data_object <- CreateSeuratObject(counts = t(dat$counts))
data_object <- FindVariableFeatures(data_object, selection.method = "vst", nfeatures = 3000)
data_object <- ScaleData(data_object)
data_object <- RunPCA(data_object)
data_object <- FindNeighbors(data_object, dims = 1:50)
data_object <- FindClusters(data_object, resolution = seurat_cluster_res)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 873
## Number of edges: 79504
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.7671
## Number of communities: 3
## Elapsed time: 0 seconds
cl <- data_object$seurat_clusters
data_object <- RunUMAP(data_object,dims=1:50, n.neighbors=5, min.dist=0)
rd <- Embeddings(data_object, reduction = "umap")
Slingshot
sl <- slingshot(rd,cl)
# Visualise the trajectory in UMAP plot
data_frame <- as.data.frame(rd)
data_frame$cluster <- cl
g <- ggplot(data_frame, aes(UMAP_1, UMAP_2, color = cluster))
g <- g + geom_point(size = 1)
g <- g + scale_color_manual(values = colours_to_use[1:length(levels(cl))])
# Hide panel borders and remove grid lines
g <- g + theme(panel.border = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
axis.line = element_line(colour = "black"),
panel.background = element_rect(colour = "white", fill = "white"))
# Adding the curves
for (i in seq_along(slingCurves(sl))) {
curve_i <- slingCurves(sl)[[i]]
curve_i <- curve_i$s[curve_i$ord, ]
colnames(curve_i) <- c("UMAP_1", "UMAP_2")
g <- g + geom_path(data = as.data.frame(curve_i), col = "black", size = .6)
}
g
Monocole
#Data preprocessing
counts<- t(dat$counts)
colData <- as.data.frame(dat$prior_information$groups_id)
colData$timecourse_discrete <- dat$prior_information$timecourse_discrete[[1]]
start_id <- dat$prior_information$start_id
colData$cluster <- cl[colData$cell_id]
colnames <- colnames(counts)
colnames(counts) <- make.unique(colnames(counts))
row.names(colData) <- colnames(counts)
cds <- new_cell_data_set(counts, cell_metadata=colData)
cds@int_colData@listData$reducedDims$UMAP <- rd
cds <- cluster_cells(cds, reduction_method = "UMAP", cluster_method = "louvain")
cluster <- cds$cluster
names(cluster) <- cds$cell_id
cds@clusters@listData$UMAP$clusters <- cluster
#Run trajectory infrence
cds <- learn_graph(cds, use_partition=F)
##
|
| | 0%
|
|======================================================================| 100%
# Visualise the trajectory in UMAP plot
plot_monocle <- plot_cells(cds, group_cells_by = 'cluster',
label_cell_groups = F,
label_branch_points = F,
label_roots = F,
label_leaves = F,
cell_size = 1) +
theme(legend.position = "right") +
scale_color_manual(values = colours_to_use[1:length(levels(cl))])
plot_monocle
scGPS
# Data preprocessing
dat_expr1 <- t(dat$counts)
dat_celldata <- cl
dat_GeneMetaData1 <- data.frame("GeneSymbol" = dat$feature_info$feature_id)
mixedpop1 <- new_scGPS_object(ExpressionMatrix=dat_expr1, GeneMetadata = dat_GeneMetaData1,
CellMetadata =dat_celldata)
mixedpop2 = mixedpop1
# Find markers for clustering (by comparing cells in one cluster vs remaining cells)
cell_types1 = colData(mixedpop1)[,1]
cell_types2 = colData(mixedpop2)[,1]
DEgenes <- find_markers(expression_matrix=assays(mixedpop1)[["counts"]],
cluster = cell_types1,
selected_cluster=unique(cell_types1))
types1 <- levels(cell_types1)
types2 <- levels(cell_types2)
LSOLDA_dats <- list()
formatted <- list()
# List of colours
qual_col_pals = brewer.pal.info[brewer.pal.info$category == 'qual',]
col_vector = unlist(mapply(brewer.pal, qual_col_pals$maxcolors, rownames(qual_col_pals)))
# Loop through each cluster and perform analysis
for (i in 1:length(types1)) {
print(paste0("Start Round ", i))
c_selectID <- types1[i]
genes <- DEgenes$id[1:500]
LSOLDA_dats[[i]] <- bootstrap_prediction(nboots = scGPS_nboots,
mixedpop1 = mixedpop1,
mixedpop2 = mixedpop2,
genes = genes,
c_selectID = c_selectID,
listData = list(),
cluster_mixedpop1 = cell_types1,
cluster_mixedpop2 = cell_types2,
log_transform = TRUE)
print(paste0("Done round ", i))
formatted[[i]] <- reformat_LASSO(c_selectID = c_selectID,
mp_selectID = 2,
LSOLDA_dat = LSOLDA_dats[[i]],
nPredSubpop = length(types2),
Nodes_group = col_vector[i],
nboots = scGPS_nboots)
}
## [1] "Start Round 1"
## [1] "Done round 1"
## [1] "Start Round 2"
## [1] "Done round 2"
## [1] "Start Round 3"
## [1] "Done round 3"
# Add all the data together
combined <- do.call("rbind", formatted)
# Reformat so we only have the overall, not all individual bootstraps
combined <- combined[, c(scGPS_nboots+1, scGPS_nboots+2, scGPS_nboots+5)]
# Rename nodes
combined$Source <- gsub("_MP2", "", combined$Source)
combined$Target <- gsub("_MP2", "", combined$Target)
combined$Value <- round(combined$Value, digit=2) #2 decimals
g <- graph_from_data_frame(combined)
#cell_annot=broad_type
layout=NULL
legend_position <- "topleft"
node_size <- "topology"
vertex_sizes <- table(cl)/2
weight_factors <- rep(vertex_sizes, each = 4)
# Node colours
cols <- brewer.pal(9,"Set1")
temp <- gsub("C", "", V(g)$name)
cols <- cols[match(temp, 1:length(levels(cl))-1)]
E(g)$weight <- combined$Value * weight_factors / 1000
plot(g,
vertex.size = vertex_sizes,
vertex.shape = "circle",
vertex.color = cols,
edge.width = E(g)$weight,
label.font = 12,
label.dist = 5,
edge.label = combined$Value,
arrow.size = .3,
main = "scGPS Network"
)