Benchmarking of the kumar dataset

Load the dataset to use

#Download the conquer MultiAssayExperiment for the kumar dataset (GSE60749).
#The URL connection is kept in a variable so it can be closed explicitly, even
#when the download fails, instead of being left for the garbage collector
#(which closes it later with an "unused connection" warning).
dataset_con <- url("http://imlspenticton.uzh.ch/robinson_lab/conquer/data-mae/GSE60749-GPL13112.rds")
dataset <- tryCatch(readRDS(dataset_con), finally = close(dataset_con))
#The handle is no longer valid after close(); keep the workspace clean
rm(dataset_con)

scGPS

#Load everything for scGPS benchmarking
library(scGPS)

## Loading required package: SummarizedExperiment

## Loading required package: GenomicRanges

## Loading required package: stats4

## Loading required package: BiocGenerics

## Loading required package: parallel

## 
## Attaching package: 'BiocGenerics'

## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB

## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs

## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, basename, cbind,
##     colnames, dirname, do.call, duplicated, eval, evalq, Filter,
##     Find, get, grep, grepl, intersect, is.unsorted, lapply, Map,
##     mapply, match, mget, order, paste, pmax, pmax.int, pmin,
##     pmin.int, Position, rank, rbind, Reduce, rownames, sapply,
##     setdiff, sort, table, tapply, union, unique, unsplit, which,
##     which.max, which.min

## Loading required package: S4Vectors

## 
## Attaching package: 'S4Vectors'

## The following object is masked from 'package:base':
## 
##     expand.grid

## Loading required package: IRanges

## 
## Attaching package: 'IRanges'

## The following object is masked from 'package:grDevices':
## 
##     windows

## Loading required package: GenomeInfoDb

## Loading required package: Biobase

## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.

## Loading required package: DelayedArray

## Loading required package: matrixStats

## 
## Attaching package: 'matrixStats'

## The following objects are masked from 'package:Biobase':
## 
##     anyMissing, rowMedians

## Loading required package: BiocParallel

## 
## Attaching package: 'DelayedArray'

## The following objects are masked from 'package:matrixStats':
## 
##     colMaxs, colMins, colRanges, rowMaxs, rowMins, rowRanges

## The following objects are masked from 'package:base':
## 
##     aperm, apply, rowsum

## Loading required package: dynamicTreeCut

## Loading required package: SingleCellExperiment

## Registered S3 methods overwritten by 'ggplot2':
##   method         from 
##   [.quosures     rlang
##   c.quosures     rlang
##   print.quosures rlang

library("MultiAssayExperiment")
library("scater")

## Loading required package: ggplot2

## 
## Attaching package: 'scater'

## The following object is masked from 'package:S4Vectors':
## 
##     rename

## The following object is masked from 'package:stats':
## 
##     filter

library("scran")

#Work on the downloaded dataset under a section-local name
kumar_dat <- dataset

#Gene-level experiment -> length-scaled TPM matrix (the "count_lstpm" assay)
gene_experiment <- experiments(kumar_dat)[["gene"]]
kumar_expr <- assays(gene_experiment)[["count_lstpm"]]

#Phenotype table; phenoid combines the two descriptive columns into one label
phn <- colData(kumar_dat)
pheno_columns <- c("source_name_ch1", "characteristics_ch1.1")
pheno_factor <- interaction(as.data.frame(phn[, pheno_columns]))
phn$phenoid <- as.character(pheno_factor)

#Shorten the three phenotype labels
short_labels <- c(
  "Dgcr8 knockout mouse embryonic stem cells.culture conditions: serum+LIF" = "Dgcr8 knockout mouse serum+LIF",
  "v6.5 mouse embryonic stem cells.culture conditions: 2i+LIF" = "v6.5 mouse 2i+LIF",
  "v6.5 mouse embryonic stem cells.culture conditions: serum+LIF" = "v6.5 mouse serum+LIF"
)
phn$phenoid <- plyr::revalue(phn$phenoid, short_labels)

#Expression columns and phenotype rows must name the same cells in the same order
stopifnot(all(colnames(kumar_expr) == rownames(phn)))
SCE <- SingleCellExperiment(assays = list(counts = kumar_expr), colData = phn)

#Drop genes that are not detected in any cell
detected_in_any_cell <- rowSums(counts(SCE) > 0) > 0
SCE <- SCE[detected_in_any_cell, ]

#scran size factors first, then the log-transformed normalised assay
SCE <- computeSumFactors(SCE)
SCE <- normalize(SCE, exprs_values = "counts", return_log = TRUE)

#Add a counts-per-million assay
cpm(SCE) <- calculateCPM(SCE)

#Discard rows whose name starts with "ERCC" (spikes)
SCE <- SCE[!grepl("^ERCC", rownames(SCE)), ]

#Start the benchmark clock for scGPS
start_time <- Sys.time()

#Inputs for the scGPS object: log-expression matrix plus cell and gene tables
kumar_dat_exprs <- assays(SCE)[["logcounts"]]
kumar_dat_cellnames <- data.frame("cellBarcodes" = colnames(SCE))
kumar_dat_GeneMetaData <- data.frame("GeneSymbol" = rownames(SCE))

#Bundle the three pieces into the scGPS container
mixedpop <- new_summarized_scGPS_object(
  ExpressionMatrix = kumar_dat_exprs,
  GeneMetadata = kumar_dat_GeneMetaData,
  CellMetadata = kumar_dat_cellnames
)

#Cluster with SCORE (bagging version of CORE); no PCA, remove_outlier set to 0
CORE_cluster_bagging <- CORE_bagging(
  mixedpop,
  remove_outlier = c(0),
  PCA = FALSE
)

## Performing 1 round of filtering

## Identifying top variable genes

## Calculating distance matrix

## Performing hierarchical clustering

## Finding clustering information

## No more outliers detected in filtering round 1

## Identifying top variable genes

## Calculating distance matrix

## Performing hierarchical clustering

## Finding clustering information

## 268 cells left after filtering

## Running 20 bagging runs, with 0.8 subsampling...

## Done clustering, moving to stability calculation...

## Done finding optimal clustering

#Plot the clustering tree together with the full list of cluster assignments
plot_CORE(
  CORE_cluster_bagging$tree,
  list_clusters = CORE_cluster_bagging$Cluster
)

#Plot the tree again, showing only the clustering stored at the optimal index
optimal_clusters <- unlist(
  CORE_cluster_bagging$Cluster[CORE_cluster_bagging$optimal_index]
)
plot_optimal_CORE(
  original_tree = CORE_cluster_bagging$tree,
  optimal_cluster = optimal_clusters,
  shift = -100
)

## Ordering and assigning labels...

## 2

## 45179

## Plotting the colored dendrogram now....

## Plotting the bar underneath now....

#Stop the time here
end_time <- Sys.time()
#Pin the unit to seconds. A plain `end_time - start_time` lets difftime choose
#secs/mins/hours on its own, so timings of different methods could end up in
#different units. Note that this interval also covers the plotting calls made
#after clustering.
time_difference_SCORE <- difftime(end_time, start_time, units = "secs")

#Collect the reference phenotypes, the cell labels and the optimal clustering
phenoid_list <- unlist(colData(SCE)$phenoid)
#Alternative label source: unlist(colData(SCE)$geo_accession)
label_list <- unlist(kumar_dat_cellnames$cellBarcodes)
cluster_list <- unlist(CORE_cluster_bagging$Cluster[CORE_cluster_bagging$optimal_index])

#Every cell needs exactly one label and one cluster; fail loudly here rather
#than let data.frame() recycle a shorter vector
stopifnot(
  length(label_list) == length(phenoid_list),
  length(cluster_list) == length(phenoid_list)
)

#One row per cell ("Gene_label" holds the cell barcodes, despite its name)
compare_frame <- data.frame(
  "Gene_label" = label_list,
  "phenoid_list" = phenoid_list,
  "cluster" = cluster_list
)

#Adjusted Rand index of the optimal clustering against the phenotypes
AdjustedRandIndex_SCORE <- mclust::adjustedRandIndex(
  compare_frame$phenoid_list,
  compare_frame$cluster
)
#Same comparison for the first entry of the cluster list
#(presumably the highest-resolution clustering, going by the name - confirm)
HighResRand <- mclust::adjustedRandIndex(
  compare_frame$phenoid_list,
  unlist(CORE_cluster_bagging$Cluster[1])
)
estimated_k_SCORE <- CORE_cluster_bagging$optimalMax

#Keep only the benchmark results and the dataset, then print what is left
rm(list = setdiff(ls(), c("AdjustedRandIndex_SCORE", "time_difference_SCORE", "estimated_k_SCORE", "HighResRand", "dataset")))
for (obj in ls()) {
  cat('---',obj,'---\n')
  print(get(obj))
}

## --- AdjustedRandIndex_SCORE ---
## [1] 0.5736199
## --- dataset ---
## A MultiAssayExperiment object of 2 listed
##  experiments with user-defined names and respective classes. 
##  Containing an ExperimentList class object of length 2: 
##  [1] gene: RangedSummarizedExperiment with 45686 rows and 268 columns 
##  [2] tx: RangedSummarizedExperiment with 113560 rows and 268 columns 
## Features: 
##  experiments() - obtain the ExperimentList instance 
##  colData() - the primary/phenotype DataFrame 
##  sampleMap() - the sample availability DataFrame 
##  `$`, `[`, `[[` - extract colData columns, subset, or experiment 
##  *Format() - convert into a long or wide DataFrame 
##  assays() - convert ExperimentList to a SimpleList of matrices
## --- estimated_k_SCORE ---
## [1] 2
## --- HighResRand ---
## [1] 1
## --- time_difference_SCORE ---
## Time difference of 16.87992 secs

SC3

#Load everything for sc3
library("SC3")
library("MultiAssayExperiment")
library("scater")

#Work on the downloaded dataset under a section-local name
kumar_dat <- dataset

#Gene-level experiment -> length-scaled TPM matrix (the "count_lstpm" assay)
gene_experiment <- experiments(kumar_dat)[["gene"]]
kumar_expr <- assays(gene_experiment)[["count_lstpm"]]

#Phenotype table; phenoid combines the two descriptive columns into one label
phn <- colData(kumar_dat)
pheno_columns <- c("source_name_ch1", "characteristics_ch1.1")
pheno_factor <- interaction(as.data.frame(phn[, pheno_columns]))
phn$phenoid <- as.character(pheno_factor)

#Shorten the three phenotype labels
short_labels <- c(
  "Dgcr8 knockout mouse embryonic stem cells.culture conditions: serum+LIF" = "Dgcr8 knockout mouse serum+LIF",
  "v6.5 mouse embryonic stem cells.culture conditions: 2i+LIF" = "v6.5 mouse 2i+LIF",
  "v6.5 mouse embryonic stem cells.culture conditions: serum+LIF" = "v6.5 mouse serum+LIF"
)
phn$phenoid <- plyr::revalue(phn$phenoid, short_labels)

#Expression columns and phenotype rows must name the same cells in the same order
stopifnot(all(colnames(kumar_expr) == rownames(phn)))
SCE <- SingleCellExperiment(assays = list(counts = kumar_expr), colData = phn)

#Drop genes whose counts are zero in every cell
detected_in_any_cell <- rowSums(counts(SCE) > 0) > 0
SCE <- SCE[detected_in_any_cell, ]

#Add the log-transformed "logcounts" assay
SCE <- normalize(SCE, exprs_values = "counts", return_log = TRUE)

## Warning in .local(object, ...): using library sizes as size factors

#Discard rows whose name starts with "ERCC" (spikes)
SCE <- SCE[!grepl("^ERCC", rownames(SCE)), ]

#Start the benchmark clock for SC3
start_time <- Sys.time()

#Store the gene names as feature_symbol, then set SC3 up on a single core
#with gene filtering switched on
rowData(SCE)$feature_symbol <- rownames(SCE)
SCE <- sc3_prepare(SCE, n_cores = 1, gene_filter = TRUE)

## Setting SC3 parameters...

#Let SC3 estimate the number of clusters; the result is read back below
#from metadata(SCE)$sc3$k_estimation
SCE <- sc3_estimate_k(SCE)

## Estimating k...

#Pull the estimated k out of the SC3 metadata as an integer
SC3_k_estimate <- as.integer(unlist(metadata(SCE)$sc3$k_estimation))
#Distances between the cells
SCE <- sc3_calc_dists(SCE)

## Calculating distances between the cells...

#Transformations of the distances and their eigenvectors
SCE <- sc3_calc_transfs(SCE)

## Performing transformations and calculating eigenvectors...

#k-means clustering, run only for the estimated k
SCE <- sc3_kmeans(SCE, ks = SC3_k_estimate)

## Performing k-means clustering...

#Consensus matrix; the final cluster labels are read from colData(SCE) afterwards
SCE <- sc3_calc_consens(SCE)

## Calculating consensus matrix...

#Stop the time here
end_time <- Sys.time()
#Pin the unit to seconds. A plain `end_time - start_time` lets difftime choose
#secs/mins/hours on its own, so timings of different methods could end up in
#different units.
time_difference_SC3 <- difftime(end_time, start_time, units = "secs")

#Collect the reference phenotypes, the cell labels and the SC3 clusters for
#the estimated k (column "sc3_<k>_clusters" of colData)
phenoid_list <- colData(SCE)$phenoid
label_list <- rownames(colData(SCE))
cluster_column <- paste0("sc3_", SC3_k_estimate, "_clusters")
cluster_list <- as.numeric(colData(SCE)[, cluster_column])

#Every cell needs exactly one label and one cluster; fail loudly here rather
#than let data.frame() recycle a shorter vector
stopifnot(
  length(label_list) == length(phenoid_list),
  length(cluster_list) == length(phenoid_list)
)

#One row per cell ("Gene_label" holds the cell names, despite its name)
compare_frame <- data.frame(
  "Gene_label" = label_list,
  "phenoid_list" = phenoid_list,
  "cluster" = cluster_list
)

#Adjusted Rand index of the SC3 clustering against the phenotypes
AdjustedRandIndex_SC3 <- mclust::adjustedRandIndex(
  compare_frame$phenoid_list,
  compare_frame$cluster
)

#Keep only the benchmark results and the dataset, then print what is left
rm(list = setdiff(ls(), c("AdjustedRandIndex_SC3", "time_difference_SC3", "SC3_k_estimate", "dataset")))
for (obj in ls()) {
  cat('---',obj,'---\n')
  print(get(obj))
}

## --- AdjustedRandIndex_SC3 ---
## [1] 0.9943772
## --- dataset ---
## A MultiAssayExperiment object of 2 listed
##  experiments with user-defined names and respective classes. 
##  Containing an ExperimentList class object of length 2: 
##  [1] gene: RangedSummarizedExperiment with 45686 rows and 268 columns 
##  [2] tx: RangedSummarizedExperiment with 113560 rows and 268 columns 
## Features: 
##  experiments() - obtain the ExperimentList instance 
##  colData() - the primary/phenotype DataFrame 
##  sampleMap() - the sample availability DataFrame 
##  `$`, `[`, `[[` - extract colData columns, subset, or experiment 
##  *Format() - convert into a long or wide DataFrame 
##  assays() - convert ExperimentList to a SimpleList of matrices
## --- SC3_k_estimate ---
## [1] 4
## --- time_difference_SC3 ---
## Time difference of 29.73346 secs

benchMark

Quan and Michael

7/9/2018

Benchmarking of the kumar dataset

Load the dataset to use

scGPS

SC3