scGPS
#Load everything for scGPS Benchmarking
library(scGPS)
## Loading required package: SummarizedExperiment
## Loading required package: GenomicRanges
## Loading required package: stats4
## Loading required package: BiocGenerics
## Loading required package: parallel
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
##
## clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
## clusterExport, clusterMap, parApply, parCapply, parLapply,
## parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, basename, cbind,
## colnames, dirname, do.call, duplicated, eval, evalq, Filter,
## Find, get, grep, grepl, intersect, is.unsorted, lapply, Map,
## mapply, match, mget, order, paste, pmax, pmax.int, pmin,
## pmin.int, Position, rank, rbind, Reduce, rownames, sapply,
## setdiff, sort, table, tapply, union, unique, unsplit, which,
## which.max, which.min
## Loading required package: S4Vectors
##
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:base':
##
## expand.grid
## Loading required package: IRanges
##
## Attaching package: 'IRanges'
## The following object is masked from 'package:grDevices':
##
## windows
## Loading required package: GenomeInfoDb
## Loading required package: Biobase
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
## Loading required package: DelayedArray
## Loading required package: matrixStats
##
## Attaching package: 'matrixStats'
## The following objects are masked from 'package:Biobase':
##
## anyMissing, rowMedians
## Loading required package: BiocParallel
##
## Attaching package: 'DelayedArray'
## The following objects are masked from 'package:matrixStats':
##
## colMaxs, colMins, colRanges, rowMaxs, rowMins, rowRanges
## The following objects are masked from 'package:base':
##
## aperm, apply, rowsum
## Loading required package: dynamicTreeCut
## Loading required package: SingleCellExperiment
## Registered S3 methods overwritten by 'ggplot2':
## method from
## [.quosures rlang
## c.quosures rlang
## print.quosures rlang
library("MultiAssayExperiment")
library("scater")
## Loading required package: ggplot2
##
## Attaching package: 'scater'
## The following object is masked from 'package:S4Vectors':
##
## rename
## The following object is masked from 'package:stats':
##
## filter
library("scran")
# Work on a copy of the MultiAssayExperiment supplied as `dataset`
kumar_dat <- dataset
# Gene-level, length-scaled TPMs: the "count_lstpm" assay of the "gene" experiment
gene_experiment <- experiments(kumar_dat)[["gene"]]
kumar_expr <- assays(gene_experiment)[["count_lstpm"]]
# Phenotype table, one row per cell
phn <- colData(kumar_dat)
# One label per cell, built by crossing the two phenotype columns below
pheno_columns <- c("source_name_ch1", "characteristics_ch1.1")
pheno_factor <- interaction(as.data.frame(phn[, pheno_columns]))
phn$phenoid <- as.character(pheno_factor)
# Replace the three long combined labels with compact ones
short_labels <- c(
  "Dgcr8 knockout mouse embryonic stem cells.culture conditions: serum+LIF" = "Dgcr8 knockout mouse serum+LIF",
  "v6.5 mouse embryonic stem cells.culture conditions: 2i+LIF" = "v6.5 mouse 2i+LIF",
  "v6.5 mouse embryonic stem cells.culture conditions: serum+LIF" = "v6.5 mouse serum+LIF"
)
phn$phenoid <- plyr::revalue(phn$phenoid, short_labels)
# Create the single cell experiment.
# The expression columns and the phenotype rows must describe the same cells
# in the same order. A bare all(a == b) is not a sufficient guard: it is TRUE
# when either side is NULL (all(logical(0))) and it recycles when the lengths
# differ, so presence and length are checked explicitly first.
stopifnot(
  !is.null(colnames(kumar_expr)),
  !is.null(rownames(phn)),
  length(colnames(kumar_expr)) == length(rownames(phn)),
  all(colnames(kumar_expr) == rownames(phn))
)
# Expression values go in as the "counts" assay, phenotypes as colData
SCE <- SingleCellExperiment(
  assays = list(counts = kumar_expr),
  colData = phn
)
# Keep only genes with a count above zero in at least one cell
is_expressed <- rowSums(counts(SCE) > 0) > 0
SCE <- SCE[is_expressed, ]
# Size factors for scran normalisation
SCE <- computeSumFactors(SCE)
# Normalise the "counts" assay and return log-scale values
SCE <- normalize(
  SCE,
  exprs_values = "counts",
  return_log = TRUE
)
# Add a counts-per-million assay
cpm(SCE) <- calculateCPM(SCE)
# Drop spike-in rows, i.e. row names beginning with "ERCC"
ercc_rows <- grepl("^ERCC", rownames(SCE))
SCE <- SCE[!ercc_rows, ]
# Timer starts here
start_time <- Sys.time()
# Inputs for the scGPS object: log-expression matrix, cell table, gene table
kumar_dat_exprs <- assays(SCE)[["logcounts"]]
kumar_dat_cellnames <- data.frame("cellBarcodes" = colnames(SCE))
kumar_dat_GeneMetaData <- data.frame("GeneSymbol" = rownames(SCE))
# Bundle the three pieces in scGPS format
mixedpop <- new_summarized_scGPS_object(
  ExpressionMatrix = kumar_dat_exprs,
  GeneMetadata = kumar_dat_GeneMetaData,
  CellMetadata = kumar_dat_cellnames
)
# Cluster the scGPS object with SCORE (CORE_bagging), PCA switched off
CORE_cluster_bagging <- CORE_bagging(
  mixedpop,
  remove_outlier = 0,
  PCA = FALSE
)
## Performing 1 round of filtering
## Identifying top variable genes
## Calculating distance matrix
## Performing hierarchical clustering
## Finding clustering information
## No more outliers detected in filtering round 1
## Identifying top variable genes
## Calculating distance matrix
## Performing hierarchical clustering
## Finding clustering information
## 268 cells left after filtering
## Running 20 bagging runs, with 0.8 subsampling...
## Done clustering, moving to stability calculation...
## Done finding optimal clustering
# Plot the clustering tree together with the full list of clustering results
plot_CORE(
  CORE_cluster_bagging$tree,
  list_clusters = CORE_cluster_bagging$Cluster
)
# Plot the tree again, using only the clustering selected by optimal_index
plot_optimal_CORE(
  original_tree = CORE_cluster_bagging$tree,
  optimal_cluster = unlist(
    CORE_cluster_bagging$Cluster[CORE_cluster_bagging$optimal_index]
  ),
  shift = -100
)
## Ordering and assigning labels...
## 2
## 45179
## Plotting the colored dendrogram now....
## Plotting the bar underneath now....
# Stop the timer.
# NOTE(review): the timed region also includes the plot_CORE() and
# plot_optimal_CORE() calls that run before this point - confirm that plotting
# is meant to count towards the benchmark.
end_time <- Sys.time()
# Pin the unit to seconds. `end_time - start_time` lets difftime choose
# secs/mins/hours automatically, so a longer run would be reported in a
# different unit and the raw numbers would not be comparable between methods.
time_difference_SCORE <- difftime(end_time, start_time, units = "secs")
# Put cell labels, phenotype labels and the optimal clustering side by side
optimal_clusters <- unlist(
  CORE_cluster_bagging$Cluster[CORE_cluster_bagging$optimal_index]
)
compare_frame <- data.frame(
  "Gene_label" = kumar_dat_cellnames$cellBarcodes, # alt: colData(SCE)$geo_accession
  "phenoid_list" = colData(SCE)$phenoid,
  "cluster" = optimal_clusters
)
# Adjusted Rand index: phenotype labels against the optimal clustering
AdjustedRandIndex_SCORE <- mclust::adjustedRandIndex(
  compare_frame[["phenoid_list"]],
  compare_frame[["cluster"]]
)
# Same index against the first entry of the clustering list
first_clusters <- unlist(CORE_cluster_bagging$Cluster[1])
HighResRand <- mclust::adjustedRandIndex(
  compare_frame[["phenoid_list"]],
  first_clusters
)
# Number of clusters reported for this run (the optimalMax element)
estimated_k_SCORE <- CORE_cluster_bagging$optimalMax
# Keep only the benchmark results and the input dataset in the workspace;
# everything else (including this helper vector) is removed
keep_objects <- c(
  "AdjustedRandIndex_SCORE", "time_difference_SCORE", "estimated_k_SCORE",
  "HighResRand", "dataset"
)
rm(list = setdiff(ls(), keep_objects))
# Print each remaining object under a header line with its name
for (obj in ls()) {
  cat("---", obj, "---\n")
  print(get(obj))
}
## --- AdjustedRandIndex_SCORE ---
## [1] 0.5736199
## --- dataset ---
## A MultiAssayExperiment object of 2 listed
## experiments with user-defined names and respective classes.
## Containing an ExperimentList class object of length 2:
## [1] gene: RangedSummarizedExperiment with 45686 rows and 268 columns
## [2] tx: RangedSummarizedExperiment with 113560 rows and 268 columns
## Features:
## experiments() - obtain the ExperimentList instance
## colData() - the primary/phenotype DataFrame
## sampleMap() - the sample availability DataFrame
## `$`, `[`, `[[` - extract colData columns, subset, or experiment
## *Format() - convert into a long or wide DataFrame
## assays() - convert ExperimentList to a SimpleList of matrices
## --- estimated_k_SCORE ---
## [1] 2
## --- HighResRand ---
## [1] 1
## --- time_difference_SCORE ---
## Time difference of 16.87992 secs
SC3
#Load everything for sc3
library("SC3")
library("MultiAssayExperiment")
library("scater")
# Work on a copy of the MultiAssayExperiment supplied as `dataset`
kumar_dat <- dataset
# Gene-level, length-scaled TPMs: the "count_lstpm" assay of the "gene" experiment
gene_experiment <- experiments(kumar_dat)[["gene"]]
kumar_expr <- assays(gene_experiment)[["count_lstpm"]]
# Phenotype table, one row per cell
phn <- colData(kumar_dat)
# One label per cell, built by crossing the two phenotype columns below
pheno_columns <- c("source_name_ch1", "characteristics_ch1.1")
pheno_factor <- interaction(as.data.frame(phn[, pheno_columns]))
phn$phenoid <- as.character(pheno_factor)
# Replace the three long combined labels with compact ones
short_labels <- c(
  "Dgcr8 knockout mouse embryonic stem cells.culture conditions: serum+LIF" = "Dgcr8 knockout mouse serum+LIF",
  "v6.5 mouse embryonic stem cells.culture conditions: 2i+LIF" = "v6.5 mouse 2i+LIF",
  "v6.5 mouse embryonic stem cells.culture conditions: serum+LIF" = "v6.5 mouse serum+LIF"
)
phn$phenoid <- plyr::revalue(phn$phenoid, short_labels)
# Create the single cell experiment.
# The expression columns and the phenotype rows must describe the same cells
# in the same order. A bare all(a == b) is not a sufficient guard: it is TRUE
# when either side is NULL (all(logical(0))) and it recycles when the lengths
# differ, so presence and length are checked explicitly first.
stopifnot(
  !is.null(colnames(kumar_expr)),
  !is.null(rownames(phn)),
  length(colnames(kumar_expr)) == length(rownames(phn)),
  all(colnames(kumar_expr) == rownames(phn))
)
# Expression values go in as the "counts" assay, phenotypes as colData
SCE <- SingleCellExperiment(
  assays = list(counts = kumar_expr),
  colData = phn
)
# Keep only genes with a count above zero in at least one cell
is_expressed <- rowSums(counts(SCE) > 0) > 0
SCE <- SCE[is_expressed, ]
# Add the "logcounts" assay by normalising the "counts" assay on the log scale
SCE <- normalize(
  SCE,
  exprs_values = "counts",
  return_log = TRUE
)
## Warning in .local(object, ...): using library sizes as size factors
# Drop spike-in rows, i.e. row names beginning with "ERCC"
ercc_rows <- grepl("^ERCC", rownames(SCE))
SCE <- SCE[!ercc_rows, ]
#Start the time here
# Everything from this timestamp up to the Sys.time() call that follows
# sc3_calc_consens() counts towards the SC3 run time.
start_time <- Sys.time()
#Run sc3 with an estimation for k
# Each step below takes SCE and assigns the returned object back to SCE, so
# every call works on the result of the previous one.
# Copy the row names of the counts assay into the feature_symbol column of
# rowData.
# NOTE(review): presumably SC3 expects this column - confirm in the SC3 docs.
rowData(SCE)$feature_symbol <- rownames(counts(SCE))
SCE <- sc3_prepare(SCE, n_cores = 1, gene_filter = TRUE)
## Setting SC3 parameters...
SCE <- sc3_estimate_k(SCE)
## Estimating k...
# Read the estimate back from metadata(SCE)$sc3$k_estimation as an integer;
# it restricts the k-means step below to that single k.
SC3_k_estimate <- as.integer(unlist(metadata(SCE)$sc3$k_estimation))
SCE <- sc3_calc_dists(SCE)
## Calculating distances between the cells...
SCE <- sc3_calc_transfs(SCE)
## Performing transformations and calculating eigenvectors...
# k-means is run only for the estimated k
SCE <- sc3_kmeans(SCE, ks = SC3_k_estimate)
## Performing k-means clustering...
SCE <- sc3_calc_consens(SCE)
## Calculating consensus matrix...
# Stop the timer
end_time <- Sys.time()
# Pin the unit to seconds. `end_time - start_time` lets difftime choose
# secs/mins/hours automatically, so a longer run would be reported in a
# different unit and the raw numbers would not be comparable between methods.
time_difference_SC3 <- difftime(end_time, start_time, units = "secs")
# Name of the colData column holding the SC3 clusters for the estimated k
cluster_column <- paste0("sc3_", SC3_k_estimate, "_clusters")
# Put cell labels, phenotype labels and SC3 cluster codes side by side
compare_frame <- data.frame(
  "Gene_label" = rownames(colData(SCE)),
  "phenoid_list" = colData(SCE)$phenoid,
  "cluster" = as.numeric(colData(SCE)[, cluster_column])
)
# Adjusted Rand index: phenotype labels against the SC3 clustering
AdjustedRandIndex_SC3 <- mclust::adjustedRandIndex(
  compare_frame[["phenoid_list"]],
  compare_frame[["cluster"]]
)
# Keep only the benchmark results and the input dataset in the workspace;
# everything else (including this helper vector) is removed
keep_objects <- c(
  "AdjustedRandIndex_SC3", "time_difference_SC3", "SC3_k_estimate", "dataset"
)
rm(list = setdiff(ls(), keep_objects))
# Print each remaining object under a header line with its name
for (obj in ls()) {
  cat("---", obj, "---\n")
  print(get(obj))
}
## --- AdjustedRandIndex_SC3 ---
## [1] 0.9943772
## --- dataset ---
## A MultiAssayExperiment object of 2 listed
## experiments with user-defined names and respective classes.
## Containing an ExperimentList class object of length 2:
## [1] gene: RangedSummarizedExperiment with 45686 rows and 268 columns
## [2] tx: RangedSummarizedExperiment with 113560 rows and 268 columns
## Features:
## experiments() - obtain the ExperimentList instance
## colData() - the primary/phenotype DataFrame
## sampleMap() - the sample availability DataFrame
## `$`, `[`, `[[` - extract colData columns, subset, or experiment
## *Format() - convert into a long or wide DataFrame
## assays() - convert ExperimentList to a SimpleList of matrices
## --- SC3_k_estimate ---
## [1] 4
## --- time_difference_SC3 ---
## Time difference of 29.73346 secs