Comparison of scGPS to established pseudo-time analysis methods

Load the dataset and packages

library(Seurat)
library(slingshot)
library(RColorBrewer)
library(monocle3)
library(SingleCellExperiment)
library(dplyr)
library(ggplot2)
library(scGPS)
library(locfit)
library(networkD3)
library(dplyr)
library(tidyr)
library(ggpubr)
library(igraph)
rm(list = ls())
# Load the dataset we need
dat <- readRDS("myoblast-differentiation_trapnell.rds")

Top Level Inputs

scGPS_nboots <- 10
seurat_cluster_res <- 0.2
colours_to_use <- brewer.pal(9,"Set1")

Data preprocessing (PCA, UMAP, DE analysis) by Seurat

data_object <- CreateSeuratObject(counts = t(dat$counts))
data_object <- FindVariableFeatures(data_object, selection.method = "vst", nfeatures = 3000)
data_object <- ScaleData(data_object)
data_object <- RunPCA(data_object)
data_object <- FindNeighbors(data_object, dims = 1:50)
data_object <- FindClusters(data_object, resolution = seurat_cluster_res)

## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 290
## Number of edges: 10809
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8257
## Number of communities: 3
## Elapsed time: 0 seconds

cl <- data_object$seurat_clusters
data_object <- RunUMAP(data_object,dims=1:50, n.neighbors=5, min.dist=0)
rd <- Embeddings(data_object, reduction = "umap")

Slingshot

sl <- slingshot(rd,cl)

# Visualise the trajectory in UMAP plot
data_frame <- as.data.frame(rd)
data_frame$cluster <- cl


g <- ggplot(data_frame, aes(UMAP_1, UMAP_2, color = cluster))
g <- g + geom_point(size = 1)
g <- g + scale_color_manual(values = colours_to_use[1:length(levels(cl))])
# Hide panel borders and remove grid lines
g <- g + theme(panel.border = element_blank(),
               panel.grid.major = element_blank(),
               panel.grid.minor = element_blank(),
               axis.line = element_line(colour = "black"),
               panel.background = element_rect(colour = "white", fill = "white"))

# Adding the curves
for (i in seq_along(slingCurves(sl))) {
  curve_i <- slingCurves(sl)[[i]]
  curve_i <- curve_i$s[curve_i$ord, ]
  colnames(curve_i) <- c("UMAP_1", "UMAP_2")
  g <- g + geom_path(data = as.data.frame(curve_i), col = "black", size = .6)
}
g

Monocole

#Data preprocessing
counts<- t(dat$counts)
colData <- as.data.frame(dat$prior_information$groups_id)
colData$timecourse_discrete <- dat$prior_information$timecourse_discrete[[1]]
start_id <- dat$prior_information$start_id
colData$cluster <- cl[colData$cell_id]
colnames <- colnames(counts)
colnames(counts) <- make.unique(colnames(counts))
row.names(colData) <- colnames(counts)
cds <- new_cell_data_set(counts, cell_metadata=colData)
cds@int_colData@listData$reducedDims$UMAP <- rd
cds <- cluster_cells(cds, reduction_method = "UMAP", cluster_method = "louvain")
cluster <- cds$cluster
names(cluster) <- cds$cell_id
cds@clusters@listData$UMAP$clusters <- cluster

#Run trajectory infrence
cds <- learn_graph(cds, use_partition=F)

## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |======================================================================| 100%

# Visualise the trajectory in UMAP plot
plot_monocle <- plot_cells(cds, group_cells_by = 'cluster',
                           label_cell_groups = F,
                           label_branch_points = F,
                           label_roots = F,
                           label_leaves = F,
                           cell_size = 1) + 
  theme(legend.position = "right") + 
  scale_color_manual(values = colours_to_use[1:length(levels(cl))])
plot_monocle

scGPS

# Data preprocessing
dat_expr1 <- t(dat$counts)
dat_celldata <- cl
dat_GeneMetaData1 <- data.frame("GeneSymbol" = dat$feature_info$feature_id)
mixedpop1 <- new_scGPS_object(ExpressionMatrix=dat_expr1, GeneMetadata = dat_GeneMetaData1,
                              CellMetadata =dat_celldata)
mixedpop2 = mixedpop1
# Find markers for clustering (by comparing cells in one cluster vs remaining cells)
cell_types1 = colData(mixedpop1)[,1]
cell_types2 = colData(mixedpop2)[,1]
DEgenes <- find_markers(expression_matrix=assays(mixedpop1)[["counts"]],
                        cluster = cell_types1,
                        selected_cluster=unique(cell_types1))
types1 <- levels(cell_types1)
types2 <- levels(cell_types2)
LSOLDA_dats <- list()
formatted <- list()
# List of colours
qual_col_pals = brewer.pal.info[brewer.pal.info$category == 'qual',]
col_vector = unlist(mapply(brewer.pal, qual_col_pals$maxcolors, rownames(qual_col_pals)))
# Loop through each cluster and perform analysis
for (i in 1:length(types1)) {
  print(paste0("Start Round ", i))
  c_selectID <- types1[i]
  genes <- DEgenes$id[1:500]
  LSOLDA_dats[[i]] <- bootstrap_prediction(nboots = scGPS_nboots,
                                           mixedpop1 = mixedpop1,
                                           mixedpop2 = mixedpop2,
                                           genes = genes,
                                           c_selectID = c_selectID,
                                           listData = list(),
                                           cluster_mixedpop1 = cell_types1,
                                           cluster_mixedpop2 = cell_types2,
                                           log_transform = TRUE)
  print(paste0("Done round ", i))
  formatted[[i]] <- reformat_LASSO(c_selectID = c_selectID,
                                   mp_selectID = 2,
                                   LSOLDA_dat = LSOLDA_dats[[i]],
                                   nPredSubpop = length(types2),
                                   Nodes_group = col_vector[i],
                                   nboots = scGPS_nboots)
}

## [1] "Start Round 1"
## [1] "Done round 1"
## [1] "Start Round 2"
## [1] "Done round 2"
## [1] "Start Round 3"
## [1] "Done round 3"

# Add all the data together
combined <- do.call("rbind", formatted)

# Reformat so we only have the overall, not all individual bootstraps
combined <- combined[, c(scGPS_nboots+1, scGPS_nboots+2, scGPS_nboots+5)]

# Rename nodes
combined$Source <- gsub("_MP2", "", combined$Source)
combined$Target <- gsub("_MP2", "", combined$Target)
combined$Value <- round(combined$Value, digit=2) #2 decimals
g <- graph_from_data_frame(combined)

#cell_annot=broad_type
layout=NULL
legend_position <- "topleft"
node_size <- "topology"
vertex_sizes <- table(cl)/2
weight_factors <- rep(vertex_sizes, each = 4) 

# Node colours
cols <- brewer.pal(9,"Set1")
temp <- gsub("C", "", V(g)$name)
cols <- cols[match(temp, 1:length(levels(cl))-1)]

E(g)$weight <- combined$Value * weight_factors / 1000

plot(g,
     vertex.size = vertex_sizes, 
     vertex.shape = "circle",
     vertex.color = cols,
     edge.width = E(g)$weight,
     label.font = 12, 
     label.dist = 5,
     edge.label = combined$Value,
     arrow.size = .3,
     main = "scGPS Network"
     )

Psudotime Comparison

Maika, Michael, Quan and Tianqi

08/12/2020

Comparison of scGPS to established pseudo-time analysis methods

Load the dataset and packages

Top Level Inputs

Data preprocessing (PCA, UMAP, DE analysis) by Seurat

Slingshot

Monocole

scGPS