Benchmarking for the klein dataset

Load the dataset to use

# Download the serialized klein dataset.
# The URL connection is held in a variable so it can be closed explicitly once
# reading finishes (or fails); readRDS() does not dispose of a connection
# object created by the caller, which would otherwise linger until garbage
# collection and trigger a "closing unused connection" warning.
dataset_con <- url("https://scrnaseq-public-datasets.s3.amazonaws.com/scater-objects/klein.rds")
dataset <- tryCatch(readRDS(dataset_con), finally = close(dataset_con))
# The handle is no longer valid after close(); drop it from the workspace.
rm(dataset_con)

scGPS

#Load everything for scGPS Benchmarking
library(scGPS)

## Loading required package: SummarizedExperiment

## Loading required package: GenomicRanges

## Loading required package: stats4

## Loading required package: BiocGenerics

## Loading required package: parallel

## 
## Attaching package: 'BiocGenerics'

## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB

## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs

## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, basename, cbind,
##     colnames, dirname, do.call, duplicated, eval, evalq, Filter,
##     Find, get, grep, grepl, intersect, is.unsorted, lapply, Map,
##     mapply, match, mget, order, paste, pmax, pmax.int, pmin,
##     pmin.int, Position, rank, rbind, Reduce, rownames, sapply,
##     setdiff, sort, table, tapply, union, unique, unsplit, which,
##     which.max, which.min

## Loading required package: S4Vectors

## 
## Attaching package: 'S4Vectors'

## The following object is masked from 'package:base':
## 
##     expand.grid

## Loading required package: IRanges

## 
## Attaching package: 'IRanges'

## The following object is masked from 'package:grDevices':
## 
##     windows

## Loading required package: GenomeInfoDb

## Loading required package: Biobase

## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.

## Loading required package: DelayedArray

## Loading required package: matrixStats

## 
## Attaching package: 'matrixStats'

## The following objects are masked from 'package:Biobase':
## 
##     anyMissing, rowMedians

## Loading required package: BiocParallel

## 
## Attaching package: 'DelayedArray'

## The following objects are masked from 'package:matrixStats':
## 
##     colMaxs, colMins, colRanges, rowMaxs, rowMins, rowRanges

## The following objects are masked from 'package:base':
## 
##     aperm, apply, rowsum

## Loading required package: dynamicTreeCut

## Loading required package: SingleCellExperiment

## Registered S3 methods overwritten by 'ggplot2':
##   method         from 
##   [.quosures     rlang
##   c.quosures     rlang
##   print.quosures rlang

library(scater)

## Loading required package: ggplot2

## 
## Attaching package: 'scater'

## The following object is masked from 'package:S4Vectors':
## 
##     rename

## The following object is masked from 'package:stats':
## 
##     filter

library(scran)

#Retrieve the Dataset
# Work under a separate name; `dataset` itself is reused later in this file.
klein_dat <- dataset

#Remove features with no gene expression
# keep_features is TRUE for every row that has a count > 0 in at least one column.
keep_features <- rowSums(counts(klein_dat) > 0) > 0
# Tabulate how many rows will be dropped (FALSE) versus kept (TRUE).
table(keep_features)

## keep_features
## FALSE  TRUE 
##   128 24047

# Keep only the rows flagged TRUE in keep_features, then report the new dimensions.
klein_dat <- klein_dat[keep_features, ]
dim(klein_dat)

## [1] 24047  2717

#Use scran normalisation
# Size factors are computed first, then normalize() is applied to the same object.
# NOTE(review): presumably normalize() (re)computes the "logcounts" assay that is
# read further down - confirm against the installed scater version.
# NOTE(review): the run emits a warning that spike-in set 'ERCC' should have its
# own size factors; none are computed here.
klein_dat <- computeSumFactors(klein_dat)
klein_dat <- normalize(klein_dat)

## Warning in .get_all_sf_sets(object): spike-in set 'ERCC' should have its
## own size factors

#Create a count per million assay
# The result of calculateCPM() is stored on the object through the cpm() setter.
# NOTE(review): the cpm assay is not read again in the visible part of this file
# (the scGPS input below is taken from "logcounts").
cpm(klein_dat) <- calculateCPM(klein_dat)

## Warning in .get_all_sf_sets(object): spike-in set 'ERCC' should have its
## own size factors

#Remove spikes
# Flag rows whose name starts with "ERCC"; the flag is applied in a later step.
# NOTE(review): the table printed for this run shows no TRUE entries even though
# the object lists an 'ERCC' spike set - confirm the row-name pattern is the
# right way to identify spike-ins for this dataset.
is.spike <-grepl("^ERCC", rownames(klein_dat))
# Count flagged versus unflagged rows.
table(is.spike)

## is.spike
## FALSE 
## 24047

# Drop every row flagged as a spike-in
klein_dat <- klein_dat[!is.spike, ]

# Timing for the scGPS run begins here
start_time <- Sys.time()

# Gather the inputs scGPS needs: the "logcounts" assay plus one-column
# data frames holding the cell names and the gene names
klein_dat_exprs <- assays(klein_dat)[["logcounts"]]
klein_dat_cellnames <- data.frame("cellBarcodes" = colnames(klein_dat))
klein_dat_GeneMetaData <- data.frame("GeneSymbol" = rownames(klein_dat))

# Bundle the three pieces into an scGPS object
mixedpop <- new_summarized_scGPS_object(
  ExpressionMatrix = klein_dat_exprs,
  GeneMetadata = klein_dat_GeneMetaData,
  CellMetadata = klein_dat_cellnames
)

# Cluster the data using SCORE (CORE with bagging), without PCA
CORE_cluster_bagging <- CORE_bagging(
  mixedpop,
  remove_outlier = c(0),
  PCA = FALSE
)

## Performing 1 round of filtering

## Identifying top variable genes

## Calculating distance matrix

## Performing hierarchical clustering

## Finding clustering information

## No more outliers detected in filtering round 1

## Identifying top variable genes

## Calculating distance matrix

## Performing hierarchical clustering

## Finding clustering information

## 2717 cells left after filtering

## Running 20 bagging runs, with 0.8 subsampling...

## Done clustering, moving to stability calculation...

## Done finding optimal clustering

# Dendrogram annotated with every clustering result returned by the bagging run
plot_CORE(
  CORE_cluster_bagging$tree,
  list_clusters = CORE_cluster_bagging$Cluster
)

# Same tree, coloured by the clustering stored at optimal_index
plot_optimal_CORE(
  original_tree = CORE_cluster_bagging$tree,
  optimal_cluster = unlist(
    CORE_cluster_bagging$Cluster[CORE_cluster_bagging$optimal_index]
  ),
  shift = -100
)

## Ordering and assigning labels...

## 2

## 142413NANANANA

## 3

## 142413764NANANA

## 4

## 1424137641466NANA

## 5

## 14241376414662246NA

## 6

## 142413764146622462632

## Plotting the colored dendrogram now....

## Plotting the bar underneath now....

# Stop the time here
# NOTE(review): the timed interval starts before the scGPS inputs are extracted
# and also covers the two plotting calls, not just the clustering itself.
end_time <- Sys.time()
# Pin the unit to minutes: plain `end_time - start_time` lets R choose
# secs/mins/hours automatically, so stored timings from different runs would
# not be directly comparable as numbers.
time_difference_SCORE <- difftime(end_time, start_time, units = "mins")

# Collect what is needed for the comparison in one data frame.
# "Gene_label" holds the cell names despite its name; "type1" is the reference
# annotation taken from colData and "cluster" is the clustering at optimal_index.
cell_types1 <- colData(klein_dat)$cell_type1
label_list <- unlist(klein_dat_cellnames$cellBarcodes)
cluster_list <- unlist(
  CORE_cluster_bagging$Cluster[CORE_cluster_bagging$optimal_index]
)
compare_frame <- data.frame(
  "Gene_label" = label_list,
  "type1" = cell_types1,
  "cluster" = cluster_list
)

# Adjusted Rand index of the optimal clustering against the reference labels
AdjustedRandIndex_SCORE <- mclust::adjustedRandIndex(
  compare_frame$type1,
  compare_frame$cluster
)
estimated_k_SCORE <- CORE_cluster_bagging$optimalMax
# Adjusted Rand index for the first entry of the Cluster list
HighResRand <- mclust::adjustedRandIndex(
  compare_frame$type1,
  unlist(CORE_cluster_bagging$Cluster[1])
)

# Remove clutter from the environment: keep only the benchmark results and the
# untouched `dataset`, which the next section reuses
rm(list = setdiff(
  ls(),
  c(
    "AdjustedRandIndex_SCORE", "time_difference_SCORE",
    "estimated_k_SCORE", "HighResRand", "dataset"
  )
))
# Print every object that survived the cleanup
for (obj in ls()) {
  cat("---", obj, "---\n")
  print(get(obj))
}

## --- AdjustedRandIndex_SCORE ---
## [1] 0.8004233
## --- dataset ---
## class: SingleCellExperiment 
## dim: 24175 2717 
## metadata(0):
## assays(2): counts logcounts
## rownames(24175): 0610005C13Rik 0610007P14Rik ... n-R5s146 n-R5s149
## rowData names(10): feature_symbol is_feature_control ...
##   total_counts log10_total_counts
## colnames(2717): cell1 cell2 ... cell2716 cell2717
## colData names(29): cell_type1 total_features ... pct_counts_ERCC
##   is_cell_control
## reducedDimNames(0):
## spikeNames(1): ERCC
## --- estimated_k_SCORE ---
## [1] 6
## --- HighResRand ---
## [1] 0.8004233
## --- time_difference_SCORE ---
## Time difference of 3.342226 mins

SC3

#Load everything for SC3
library(SC3)
library(scater)

#Retrieve the Dataset
# Start again from the untouched `dataset` object.
# NOTE(review): unlike the scGPS section, no normalisation call appears in this
# section - confirm that is intended.
sce <- dataset

#Find the genes with all zero entries and remove
# keep_features is TRUE for every row that has a count > 0 in at least one column.
keep_features <- rowSums(counts(sce) > 0) > 0
sce <- sce[keep_features, ]

#Remove the spikes
# Flag rows whose name starts with "ERCC"; the flag is applied in a later step.
is.spike <-grepl("^ERCC", rownames(sce))
# Count flagged versus unflagged rows.
table(is.spike)

## is.spike
## FALSE 
## 24047

# Drop the rows flagged by is.spike.
sce <- sce[!is.spike, ]

#Start the time here
start_time <- Sys.time()

#Run sc3 with an estimation for k
# kmeans_nstart is set to 50; the author's note in this section attributes that
# value to the dataset having over 2000 cells.
sce <- sc3_prepare(sce, n_cores = 1, gene_filter = TRUE, kmeans_nstart = 50)

## Setting SC3 parameters...

# Let SC3 estimate the number of clusters; the estimate is read back from the
# object's metadata in the next step.
sce <- sc3_estimate_k(sce)

## Estimating k...

# Pull the estimated k out of the SC3 metadata as a plain integer.
SC3_k_estimate <- as.integer(unlist(metadata(sce)$sc3$k_estimation))
# Next SC3 stage: distances between the cells.
sce <- sc3_calc_dists(sce)

## Calculating distances between the cells...

# Next SC3 stage: transformations and eigenvectors.
sce <- sc3_calc_transfs(sce)

## Performing transformations and calculating eigenvectors...

# Run the k-means stage only for the estimated k.
sce <- sc3_kmeans(sce, ks = SC3_k_estimate)

## Performing k-means clustering...

# Final SC3 stage: consensus matrix; the cluster labels are read from colData afterwards.
sce <- sc3_calc_consens(sce)

## Calculating consensus matrix...

# Note on the sc3_prepare() call earlier in this section: kmeans_nstart is
# changed to 50 because there are over 2000 cells, as suggested in the SC3
# methods.

# Stop the time here
end_time <- Sys.time()
# Pin the unit to minutes: plain `end_time - start_time` lets R choose
# secs/mins/hours automatically, so stored timings from different runs would
# not be directly comparable as numbers.
time_difference_SC3 <- difftime(end_time, start_time, units = "mins")

# Make a data frame with the results we want to examine.
# "Gene_label" holds the cell names despite its name; "type1" is the reference
# annotation taken from colData and "cluster" is the SC3 clustering for the
# estimated k.
cell_types1 <- colData(sce)$cell_type1
label_list <- rownames(colData(sce))
cluster_list <- as.numeric(
  colData(sce)[, paste0("sc3_", SC3_k_estimate, "_clusters")]
)
compare_frame <- data.frame(
  "Gene_label" = label_list,
  "type1" = cell_types1,
  "cluster" = cluster_list
)

# Adjusted Rand index of the SC3 clustering against the reference labels
AdjustedRandIndex_SC3 <- mclust::adjustedRandIndex(
  compare_frame$type1,
  compare_frame$cluster
)

# Remove unwanted data: keep only the benchmark results and `dataset`
rm(list = setdiff(
  ls(),
  c(
    "AdjustedRandIndex_SC3", "time_difference_SC3",
    "SC3_k_estimate", "dataset"
  )
))
# Print every object that survived the cleanup
for (obj in ls()) {
  cat("---", obj, "---\n")
  print(get(obj))
}

## --- AdjustedRandIndex_SC3 ---
## [1] 0.6362361
## --- dataset ---
## class: SingleCellExperiment 
## dim: 24175 2717 
## metadata(0):
## assays(2): counts logcounts
## rownames(24175): 0610005C13Rik 0610007P14Rik ... n-R5s146 n-R5s149
## rowData names(10): feature_symbol is_feature_control ...
##   total_counts log10_total_counts
## colnames(2717): cell1 cell2 ... cell2716 cell2717
## colData names(29): cell_type1 total_features ... pct_counts_ERCC
##   is_cell_control
## reducedDimNames(0):
## spikeNames(1): ERCC
## --- SC3_k_estimate ---
## [1] 16
## --- time_difference_SC3 ---
## Time difference of 15.89139 mins

Benchmarking Klein Dataset

Quan and Michael

7/9/2018

Benchmarking for the klein dataset

Load the dataset to use

scGPS

SC3