This vignette demonstrates integrating two separate scRNA-seq datasets using cyCombine. The integration is done on the SCTransformed counts.
library("Seurat")
library("dplyr")
library("ggplot2")
library("patchwork")
library("cyDefine") # Available at https://github.com/biosurf/cyDefine
library("cyCombine")
seed <- 942
system("mkdir -p data")
system("mkdir -p results")We are using the normal samples from a public COVID paper, Lee et al., 2020
# Individual Single-Cell RNA-seq PBMC Data from Lee et al. (DOI: 10.1126/sciimmunol.abd1554)
if (!file.exists("data/lee.rds")) system("wget https://datasets.cellxgene.cziscience.com/bdee03e4-14ee-4f3e-a225-3d12f032841f.rds -O data/lee.rds")
# Load and transform Lee
lee <- readRDS("data/lee.rds")
if (! "SCT" %in% Seurat::Assays(lee)) {
lee <- lee[, grepl("Normal", lee$sample)]
lee <- FindVariableFeatures(lee)
oopts <- options(future.globals.maxSize = 1.0 * 1e9)
on.exit(options(oopts))
lee <- Seurat::SCTransform(lee)
lee <- FindVariableFeatures(lee)
lee@reductions <- list()
lee[["RNA"]] <- NULL
}## Running SCTransform on assay: RNA
## vst.flavor='v2' set. Using model with fixed slope and excluding poisson genes.
## Calculating cell attributes from input UMI matrix: log_umi
## Variance stabilizing transformation of count matrix of size 19051 by 16298
## Model formula is y ~ log_umi
## Get Negative Binomial regression parameters per gene
## Using 2000 genes, 5000 cells
## Found 304 outliers - those will be ignored in fitting/regularization step
## Second step: Get residuals using fitted parameters for 19051 genes
## Computing corrected count matrix for 19051 genes
## Calculating gene attributes
## Wall clock passed: Time difference of 1.075148 mins
## Determine variable features
## Centering data matrix
## Place corrected count matrix in counts slot
## Set default assay to SCT
# Align annotation labels
lee$batch <- "lee"
# lee$sample <- "lee"
celltype <- lee$Cell.group
lee$celltype <- dplyr::case_when(
celltype == "B cell" ~ "B cells",
stringr::str_starts(celltype, "CD4") ~ "CD4pos",
stringr::str_starts(celltype, "CD8") ~ "CD8pos",
stringr::str_ends(celltype, "Monocyte") ~ "Monocytes",
celltype == "NK cell" ~ "NK cells",
celltype %in% c("cDC", "pDC") ~ paste0(celltype, "s"),
lee$cell_type == "gamma-delta T cell" ~ "TCRgd",
celltype == "Platelet" ~ "Platelets",
TRUE ~ "Other"
)# https://cellxgene.cziscience.com/collections/03f821b4-87be-4ff4-b65a-b5fc00061da7
# Local and systemic responses to SARS-CoV-2 infection in children and adults
if (!file.exists("data/yoshida.rds")) system("wget https://datasets.cellxgene.cziscience.com/d911082e-b5e2-40e6-8f08-fb53c7894622.rds -O data/yoshida.rds")
# Load and transform Lee
yoshida <- readRDS("data/yoshida.rds")
if (! "SCT" %in% Seurat::Assays(yoshida)) {
yoshida <- yoshida[, grepl("AN", yoshida$sample_id)]
yoshida <- FindVariableFeatures(yoshida)
oopts <- options(future.globals.maxSize = 2.0 * 1e9)
on.exit(options(oopts))
yoshida <- Seurat::SCTransform(yoshida)
yoshida <- FindVariableFeatures(yoshida)
yoshida@reductions <- list()
yoshida[["RNA"]] <- NULL
}## Running SCTransform on assay: RNA
## vst.flavor='v2' set. Using model with fixed slope and excluding poisson genes.
## Calculating cell attributes from input UMI matrix: log_umi
## Variance stabilizing transformation of count matrix of size 19220 by 46993
## Model formula is y ~ log_umi
## Get Negative Binomial regression parameters per gene
## Using 2000 genes, 5000 cells
## Found 665 outliers - those will be ignored in fitting/regularization step
## Second step: Get residuals using fitted parameters for 19220 genes
## Computing corrected count matrix for 19220 genes
## Calculating gene attributes
## Wall clock passed: Time difference of 2.829858 mins
## Determine variable features
## Centering data matrix
## Place corrected count matrix in counts slot
## Set default assay to SCT
yoshida$batch <- "yoshida"
# yoshida$sample <- "yoshida"
celltype <- yoshida$cell_type
yoshida$celltype <- dplyr::case_when(
grepl("B cell", celltype) ~ "B cells",
grepl("natural killer cell", celltype) ~ "NK cells",
celltype == "conventional dendritic cell" ~ "cDCs",
celltype == "plasmacytoid dendritic cell" ~ "pDCs",
celltype == "gamma-delta T cell" ~ "TCRgd",
celltype == "platelet" ~ "Platelets",
grepl("CD4", celltype) ~ "CD4pos",
celltype == "regulatory T cell" ~ "CD4pos",
grepl("CD8", celltype) ~ "CD8pos",
grepl("onocyte", celltype) ~ "Monocytes",
TRUE ~ "Other")We use the union of variable features to speed up the integration significantly. Please only use the integrated values for visualizations etc., perform your analyses and draw your conclusions based on the raw/unintegrated values.
In this vignette, we convert the Seurat object to a data.frame for compatibility with plotting and performance measures, but cyCombine can integrate the merged Seurat object directly.
seu <- merge(x = lee, y = yoshida, project = "integrate_RNAseq")
# Variable features
markers <- union(VariableFeatures(lee), VariableFeatures(yoshida))
rm(lee); rm(yoshida)
# Convert to data frame
uncorrected <- seu[markers,] |>
GetAssayData("SCT","data") |>
t() |>
as.data.frame() |>
dplyr::mutate(batch = seu$batch,
sample = seu$sample,
celltype = seu$celltype,
id = dplyr::row_number())## Warning in asMethod(object): sparse->dense coercion: allocating vector of size
## 1.3 GiB
rm(seu)
# saveRDS(uncorrected, "results/02_rnaseq_rnaseq_uncorrected.RDS")set.seed(seed)
uncor_batch <- plot_umap(
uncorrected,
down_sample = T,
sample_n = 5000,
markers = markers,
title = "Uncorrected",
ref_col = "celltype")## Generating UMAP
# saveRDS(uncor_batch, "results/02_rnaseq_rnaseq_uncor.RDS")
# ggplot2::ggsave(plot_cell, filename = "figs/02_rnaseq_rnaseq_uncor.RDS")
uncor_batchFor this integration, we are running two iterations of cyCombine to ensure a robust result. The first iteration uses a coarse 1x5 grid clustering, whereas the second iteration uses a somewhat fine-grained 5x5 clustering grid. Cosine distance is used with a FlowSOM clustering to emphasize the angle of variation rather than the amplitude. Note that cosine distance is not available in the default Kohonen cluster method.
system.time({
suppressMessages({ # Hide per-cluster correction info
corrected <- cyCombine(
uncorrected,
markers = markers,
cluster_method = "flowsom",
distf = "cosine",
seed = seed,
method = "ComBat",
xdim = c(1,5),
ydim = 5,# Note: the "5" is reused in the second iteration
mc.cores = 1
)
})
})## Found 217 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found 724 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found 210 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found 210 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found 223 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found 313 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found 228 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found 249 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found 259 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found 235 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found 201 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found 323 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found 367 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found 325 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found 381 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found 234 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found 216 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found 199 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found 438 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found 309 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found 292 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found 303 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found 317 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found 304 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found 726 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found 345 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found 224 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found 616 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found 268 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found 395 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## user system elapsed
## 557.086 85.322 434.973
# saveRDS(corrected, "results/02_rnaseq_rnaseq_corrected.RDS")set.seed(seed)
umap_batch <- corrected |>
group_by(sample) |>
slice_sample(n = 5000) |>
plot_umap(
return_data = T,
down_sample = F,
seed = seed,
metric = "cosine",
markers = markers,
title = "scRNA-seq Lee vs Yoshida",
legend_title = "Dataset",
col = "batch")## Generating UMAP
# ggplot2::ggsave(umap_batch$plot, filename = "figs/02_rnaseq_rnaseq_batch.png")
# saveRDS(umap_batch$plot, "results/02_rnaseq_rnaseq_batch.RDS")
umap_batch$plotNote: We use plot_embedding to reuse the UMAP
coordinates from the plot above. We could also have rerun
plot_umap for a simpler call.
celltype_colors <- cyDefine::get_distinct_colors(sort(unique(uncorrected$celltype)))
celltype_colors[["Other"]] <- "black"
set.seed(seed)
xlim <- c(min(umap_batch$data$UMAP1), max(umap_batch$data$UMAP1))
ylim <- c(min(umap_batch$data$UMAP2), max(umap_batch$data$UMAP2))
plot_cell <-
plot_embedding(
dplyr::filter(umap_batch$data, batch == "lee"),
colors = celltype_colors,
xlim = xlim,
ylim = ylim,
title = "scRNAseq - Lee",
col = "celltype",
legend_title = "Cell type") +
plot_embedding(
dplyr::filter(umap_batch$data, batch == "yoshida"),
colors = celltype_colors,
xlim = xlim,
ylim = ylim,
title = "scRNAseq - Yoshida",
col = "celltype",
legend_title = "Cell type") +
patchwork::plot_layout(
guides = "collect",
axes = "collect",
axis_titles = "collect")
# ggplot2::ggsave(plot_cell, filename = "figs/02_rnaseq_rnaseq_cell.png")
# saveRDS(plot_cell, "results/02_rnaseq_rnaseq_cell.RDS")
plot_cellcorrected$label <- create_som(corrected, ydim = 5, xdim = 5, seed = seed, markers = markers, cluster_method = "flowsom", distf = "cosine")## Building SOM
## Mapping data to SOM
uncorrected$label <- corrected$label
system.time({
lu <- cyCombine::compute_cilisi(
uncorrected,
markers = get_markers(uncorrected),
label_cols = "batch",
split_by = "label",
return_mean = F)
})## user system elapsed
## 17.366 0.389 17.824
message("CiLISI uncorrected: ", mean(unlist(lu)))## CiLISI uncorrected: 0.0100322993830509
system.time({
l <- cyCombine::compute_cilisi(
corrected,
markers = get_markers(corrected),
label_cols = "batch",
split_by = "label",
return_mean = F)
})## user system elapsed
## 19.496 0.309 19.867
message("CiLISI corrected: ", mean(unlist(l)))## CiLISI corrected: 0.592147820274851
mad <- cyCombine::evaluate_mad(
uncorrected,
corrected,
cell_col = "label",
markers = markers
)## Computing MAD for corrected data..
## Computing MAD for uncorrected data..
## The MAD score is: 0
emd <- cyCombine::evaluate_emd(
uncorrected,
corrected,
binSize = 1,
cell_col = "label",
markers = markers,
mc.cores = 39)## Computing EMD for corrected data..
## Binning destributions using binSize 1
## | | 0%, ETA NA |== | 4%, ETA 00:46 |===== | 8%, ETA 00:31 |======= | 12%, ETA 00:21 |========= | 16%, ETA 00:18 |============ | 20%, ETA 00:15 |============== | 24%, ETA 00:13 |================= | 28%, ETA 00:12 |=================== | 32%, ETA 00:11 |===================== | 36%, ETA 00:10 |======================== | 40%, ETA 00:09 |========================== | 44%, ETA 00:08 |============================ | 48%, ETA 00:07 |=============================== | 52%, ETA 00:07 |================================= | 56%, ETA 00:06 |=================================== | 60%, ETA 00:06 |====================================== | 64%, ETA 00:05 |======================================== | 68%, ETA 00:04 |========================================== | 72%, ETA 00:04 |============================================= | 76%, ETA 00:03 |=============================================== | 80%, ETA 00:03 |================================================== | 84%, ETA 00:02 |==================================================== | 88%, ETA 00:02 |====================================================== | 92%, ETA 00:01 |========================================================= | 96%, ETA 00:01 |=======================================================| 100%, Elapsed 00:13
## | | 0%, ETA NA |== | 4%, ETA 00:48 |===== | 8%, ETA 00:29 |======= | 12%, ETA 00:23 |========= | 16%, ETA 00:21 |============ | 20%, ETA 00:17 |============== | 24%, ETA 00:15 |================= | 28%, ETA 00:14 |=================== | 32%, ETA 00:12 |===================== | 36%, ETA 00:11 |======================== | 40%, ETA 00:10 |========================== | 44%, ETA 00:09 |============================ | 48%, ETA 00:09 |=============================== | 52%, ETA 00:08 |================================= | 56%, ETA 00:07 |=================================== | 60%, ETA 00:06 |====================================== | 64%, ETA 00:06 |======================================== | 68%, ETA 00:05 |========================================== | 72%, ETA 00:05 |============================================= | 76%, ETA 00:04 |=============================================== | 80%, ETA 00:03 |================================================== | 84%, ETA 00:03 |==================================================== | 88%, ETA 00:02 |====================================================== | 92%, ETA 00:01 |========================================================= | 96%, ETA 00:01 |=======================================================| 100%, Elapsed 00:16
## Computing marker-wise EMD for each cell cluster
## | | 0%, ETA NA |== | 4%, ETA 00:20 |===== | 8%, ETA 00:14 |======= | 12%, ETA 00:12 |========= | 16%, ETA 00:11 |============ | 20%, ETA 00:10 |============== | 24%, ETA 00:09 |================= | 28%, ETA 00:09 |=================== | 32%, ETA 00:08 |===================== | 36%, ETA 00:08 |======================== | 40%, ETA 00:07 |========================== | 44%, ETA 00:07 |============================ | 48%, ETA 00:06 |=============================== | 52%, ETA 00:06 |================================= | 56%, ETA 00:05 |=================================== | 60%, ETA 00:05 |====================================== | 64%, ETA 00:04 |======================================== | 68%, ETA 00:04 |========================================== | 72%, ETA 00:03 |============================================= | 76%, ETA 00:03 |=============================================== | 80%, ETA 00:02 |================================================== | 84%, ETA 00:02 |==================================================== | 88%, ETA 00:01 |====================================================== | 92%, ETA 00:01 |========================================================= | 96%, ETA 00:00 |=======================================================| 100%, Elapsed 00:11
## Computing EMD for uncorrected data..
## Binning destributions using binSize 1
## | | 0%, ETA NA |== | 4%, ETA 00:41 |===== | 8%, ETA 00:30 |======= | 12%, ETA 00:21 |========= | 16%, ETA 00:17 |============ | 20%, ETA 00:14 |============== | 24%, ETA 00:12 |================= | 28%, ETA 00:12 |=================== | 32%, ETA 00:11 |===================== | 36%, ETA 00:09 |======================== | 40%, ETA 00:09 |========================== | 44%, ETA 00:08 |============================ | 48%, ETA 00:07 |=============================== | 52%, ETA 00:07 |================================= | 56%, ETA 00:06 |=================================== | 60%, ETA 00:06 |====================================== | 64%, ETA 00:05 |======================================== | 68%, ETA 00:05 |========================================== | 72%, ETA 00:04 |============================================= | 76%, ETA 00:03 |=============================================== | 80%, ETA 00:03 |================================================== | 84%, ETA 00:02 |==================================================== | 88%, ETA 00:02 |====================================================== | 92%, ETA 00:01 |========================================================= | 96%, ETA 00:01 |=======================================================| 100%, Elapsed 00:13
## | | 0%, ETA NA |== | 4%, ETA 00:48 |===== | 8%, ETA 00:32 |======= | 12%, ETA 00:25 |========= | 16%, ETA 00:20 |============ | 20%, ETA 00:16 |============== | 24%, ETA 00:13 |================= | 28%, ETA 00:13 |=================== | 32%, ETA 00:12 |===================== | 36%, ETA 00:10 |======================== | 40%, ETA 00:10 |========================== | 44%, ETA 00:09 |============================ | 48%, ETA 00:09 |=============================== | 52%, ETA 00:08 |================================= | 56%, ETA 00:07 |=================================== | 60%, ETA 00:06 |====================================== | 64%, ETA 00:06 |======================================== | 68%, ETA 00:05 |========================================== | 72%, ETA 00:04 |============================================= | 76%, ETA 00:04 |=============================================== | 80%, ETA 00:03 |================================================== | 84%, ETA 00:02 |==================================================== | 88%, ETA 00:02 |====================================================== | 92%, ETA 00:01 |========================================================= | 96%, ETA 00:01 |=======================================================| 100%, Elapsed 00:16
## Computing marker-wise EMD for each cell cluster
## | | 0%, ETA NA |== | 4%, ETA 00:20 |===== | 8%, ETA 00:14 |======= | 12%, ETA 00:12 |========= | 16%, ETA 00:11 |============ | 20%, ETA 00:10 |============== | 24%, ETA 00:10 |================= | 28%, ETA 00:09 |=================== | 32%, ETA 00:08 |===================== | 36%, ETA 00:08 |======================== | 40%, ETA 00:07 |========================== | 44%, ETA 00:07 |============================ | 48%, ETA 00:06 |=============================== | 52%, ETA 00:06 |================================= | 56%, ETA 00:05 |=================================== | 60%, ETA 00:05 |====================================== | 64%, ETA 00:04 |======================================== | 68%, ETA 00:04 |========================================== | 72%, ETA 00:03 |============================================= | 76%, ETA 00:03 |=============================================== | 80%, ETA 00:02 |================================================== | 84%, ETA 00:02 |==================================================== | 88%, ETA 00:01 |====================================================== | 92%, ETA 00:01 |========================================================= | 96%, ETA 00:00 |=======================================================| 100%, Elapsed 00:11
## Removing EMDs below 2 both before and after correction
## The reduction is: 0.92
## Creating plots..
## Evaluation complete.
# saveRDS(list("emd" = emd, "mad" = mad, "CiLISI_uc" = lu, "CiLISI_c" = l), "results/02_rnaseq_rnaseq_counts_performance.RDS")Contact