Benchmaking for the baron dataset

Load the dataset to use

dataset <- readRDS(url("http://scrnaseq-public-datasets.s3.amazonaws.com/scater-objects/baron-human.rds"))

scGPS

#Load everyting for scGPS Benchmarking
library(scGPS)
## Loading required package: SummarizedExperiment
## Loading required package: GenomicRanges
## Loading required package: stats4
## Loading required package: BiocGenerics
## Loading required package: parallel
## 
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
## 
##     Filter, Find, Map, Position, Reduce, anyDuplicated, append,
##     as.data.frame, basename, cbind, colMeans, colSums, colnames,
##     dirname, do.call, duplicated, eval, evalq, get, grep, grepl,
##     intersect, is.unsorted, lapply, lengths, mapply, match, mget,
##     order, paste, pmax, pmax.int, pmin, pmin.int, rank, rbind,
##     rowMeans, rowSums, rownames, sapply, setdiff, sort, table,
##     tapply, union, unique, unsplit, which, which.max, which.min
## Loading required package: S4Vectors
## 
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:base':
## 
##     expand.grid
## Loading required package: IRanges
## Loading required package: GenomeInfoDb
## Loading required package: Biobase
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.
## Loading required package: DelayedArray
## Loading required package: matrixStats
## 
## Attaching package: 'matrixStats'
## The following objects are masked from 'package:Biobase':
## 
##     anyMissing, rowMedians
## Loading required package: BiocParallel
## 
## Attaching package: 'DelayedArray'
## The following objects are masked from 'package:matrixStats':
## 
##     colMaxs, colMins, colRanges, rowMaxs, rowMins, rowRanges
## The following objects are masked from 'package:base':
## 
##     aperm, apply
## Loading required package: dynamicTreeCut
## Loading required package: SingleCellExperiment
library(SingleCellExperiment)
library(scran)
library(scater)
## Loading required package: ggplot2
## 
## Attaching package: 'scater'
## The following object is masked from 'package:S4Vectors':
## 
##     rename
## The following object is masked from 'package:stats':
## 
##     filter
#Retrieve dataset
baron_dat <- dataset

#Find the genes with all zero entries and remove
keep_features <- rowSums(counts(baron_dat) > 0) > 0
baron_dat <- baron_dat[keep_features, ]

#Use Scran normalisation
baron_dat <- computeSumFactors(baron_dat)
baron_dat <- normalize(baron_dat)
## Warning in .get_all_sf_sets(object): spike-in set 'ERCC' should have its
## own size factors
#Used to add the counts per million
cpm(baron_dat) <- calculateCPM(baron_dat)
## Warning in .get_all_sf_sets(object): spike-in set 'ERCC' should have its
## own size factors
#Remove the spikes
is.spike <-grepl("^ERCC", rownames(baron_dat))
baron_dat <- baron_dat[!is.spike, ]

#Start the time here
start_time <- Sys.time()

#Extract the needed variables
baron_dat_exprs <- assays(baron_dat)[["logcounts"]]
baron_dat_cellnames <- colnames(baron_dat)
baron_dat_cellnames <- data.frame("cellBarcodes" = baron_dat_cellnames)
baron_dat_GeneMetaData <- rownames(baron_dat)
baron_dat_GeneMetaData <- data.frame("GeneSymbol" = baron_dat_GeneMetaData)

#Store Data in scGPS format
mixedpop <- new_summarized_scGPS_object(ExpressionMatrix = baron_dat_exprs, GeneMetadata = baron_dat_GeneMetaData, CellMetadata = baron_dat_cellnames)

#Cluster and plot data using SCORE
CORE_cluster_bagging <- CORE_bagging(mixedpop, remove_outlier = c(0), PCA=FALSE)
## Performing 1 round of filtering
## Identifying top variable genes
## Calculating distance matrix
## Performing hierarchical clustering
## Finding clustering information
## No more outliers detected in filtering round 1
## Identifying top variable genes
## Calculating distance matrix
## Performing hierarchical clustering
## Finding clustering information
## 8569 cells left after filtering
## Running 20 bagging runs, with 0.8 subsampling...
## Done clustering, moving to stability calculation...
## Done finding optimal clustering
plot_CORE(CORE_cluster_bagging$tree, list_clusters = CORE_cluster_bagging$Cluster)
plot of chunk unnamed-chunk-2

plot of chunk unnamed-chunk-2

plot_optimal_CORE(original_tree= CORE_cluster_bagging$tree, optimal_cluster = unlist(CORE_cluster_bagging$Cluster[CORE_cluster_bagging$optimal_index]), shift = -100)
## Ordering and assigning labels...
## 2
## 4001613NANANANANANANA
## 3
## 40016132897NANANANANANA
## 4
## 400161328973842NANANANANA
## 5
## 4001613289738425091NANANANA
## 6
## 40016132897384250916341NANANA
## 7
## 400161328973842509163417174NANA
## 8
## 4001613289738425091634171747618NA
## 9
## 40016132897384250916341717476188138
## Plotting the colored dendrogram now....
## Plotting the bar underneath now....
plot of chunk unnamed-chunk-2

plot of chunk unnamed-chunk-2

#Stop the time here and record
end_time <- Sys.time()
time_difference_SCORE <- end_time - start_time

#Make a dataframe with the results we want to examine
cell_types1 <- colData(baron_dat)$cell_type1
label_list <- unlist(baron_dat_cellnames$cellBarcodes)
cluster_list <- unlist(CORE_cluster_bagging$Cluster[CORE_cluster_bagging$optimal_index])
compare_frame <- data.frame("Gene_label" = label_list, "type1" = cell_types1,"cluster" = cluster_list)

#Find the Adjusted Rand Index
AdjustedRandIndex_SCORE <- mclust::adjustedRandIndex(compare_frame$type1, compare_frame$cluster)
HighResRandIndex_SCORE <- mclust::adjustedRandIndex(compare_frame$type1, unlist(CORE_cluster_bagging$Cluster[1]))


#Store the estimated k from the bagging runs
estimated_k_SCORE <- CORE_cluster_bagging$optimalMax

#Remove Unwanted data and save
rm(list = setdiff(ls(), c("AdjustedRandIndex_SCORE", "time_difference_SCORE", "estimated_k_SCORE", "HighResRandIndex_SCORE", "dataset")))
for ( obj in ls() ) { cat('---',obj,'---\n'); print(get(obj)) }
## --- AdjustedRandIndex_SCORE ---
## [1] 0.6134301
## --- HighResRandIndex_SCORE ---
## [1] 0.6134301
## --- dataset ---
## class: SingleCellExperiment 
## dim: 20125 8569 
## metadata(0):
## assays(2): counts logcounts
## rownames(20125): A1BG A1CF ... ZZZ3 pk
## rowData names(10): feature_symbol is_feature_control ...
##   total_counts log10_total_counts
## colnames(8569): human1_lib1.final_cell_0001
##   human1_lib1.final_cell_0002 ... human4_lib3.final_cell_0700
##   human4_lib3.final_cell_0701
## colData names(30): human cell_type1 ... pct_counts_ERCC
##   is_cell_control
## reducedDimNames(0):
## spikeNames(1): ERCC
## --- estimated_k_SCORE ---
## [1] 9
## --- time_difference_SCORE ---
## Time difference of 23.93494 mins

Benchmarking SC3 with the baron dataset

#Load everything for SC3
library(SC3)
library(scater)

#Retrieve the dataset
sce <- dataset

#Find the genes with all zero entries and remove
keep_features <- rowSums(counts(sce) > 0) > 0
sce <- sce[keep_features, ]

#Remove the spikes
is.spike <-grepl("^ERCC", rownames(sce))
table(is.spike)
## is.spike
## FALSE  TRUE 
## 17489    10
sce <- sce[!is.spike, ]

#Start the time here
start_time <- Sys.time()

#Run sc3 with an estimation for k
sce <- sc3_prepare(sce, n_cores = 1, gene_filter = TRUE, kmeans_nstart = 50)
## Setting SC3 parameters...
## Defining training cells for SVM using 5000 random cells...
sce <- sc3_estimate_k(sce)
## Estimating k...
SC3_k_estimate <- as.integer(unlist(metadata(sce)$sc3$k_estimation))
sce <- sc3_calc_dists(sce)
## Calculating distances between the cells...
sce <- sc3_calc_transfs(sce)
## Performing transformations and calculating eigenvectors...
sce <- sc3_kmeans(sce, ks = SC3_k_estimate)
## Performing k-means clustering...
sce <- sc3_calc_consens(sce)
## Calculating consensus matrix...
#Here we change to 50 as there are over 2000 cells as suggested in sc3 methods

#Stop the time here
end_time <- Sys.time()
time_difference_SC3 <- end_time - start_time

#Make a dataframe with the results we want to examine
cell_types1 <- colData(sce)$cell_type1
label_list <- rownames(colData(sce))
cluster_list <- as.numeric(colData(sce)[, paste0("sc3_", SC3_k_estimate, "_clusters")])
compare_frame <- data.frame("Gene_label" = label_list, "type1" = cell_types1, "cluster" = cluster_list)

#Find the Adjusted Rand Index
AdjustedRandIndex_SC3 <- mclust::adjustedRandIndex(compare_frame$type1, compare_frame$cluster)

#Remove unwanted data and save
rm(list = setdiff(ls(), c("AdjustedRandIndex_SC3", "time_difference_SC3", "SC3_k_estimate", "dataset")))
for ( obj in ls() ) { cat('---',obj,'---\n'); print(get(obj)) }
## --- AdjustedRandIndex_SC3 ---
## [1] 0.260237
## --- SC3_k_estimate ---
## [1] 54
## --- dataset ---
## class: SingleCellExperiment 
## dim: 20125 8569 
## metadata(0):
## assays(2): counts logcounts
## rownames(20125): A1BG A1CF ... ZZZ3 pk
## rowData names(10): feature_symbol is_feature_control ...
##   total_counts log10_total_counts
## colnames(8569): human1_lib1.final_cell_0001
##   human1_lib1.final_cell_0002 ... human4_lib3.final_cell_0700
##   human4_lib3.final_cell_0701
## colData names(30): human cell_type1 ... pct_counts_ERCC
##   is_cell_control
## reducedDimNames(0):
## spikeNames(1): ERCC
## --- time_difference_SC3 ---
## Time difference of 2.295582 hours