Integrating RNAseq and RNAseq

This vignette demonstrates integrating two separate scRNA-seq datasets using cyCombine. The integration is done on the SCTransformed counts.

Dependencies

library("Seurat")
library("dplyr")
library("ggplot2")
library("patchwork")
library("cyDefine") # Available at https://github.com/biosurf/cyDefine
library("cyCombine")

seed <- 942
system("mkdir -p data")
system("mkdir -p results")

The datasets

scRNA-seq - Lee

We are using the normal samples from a public COVID paper, Lee et al., 2020

# Individual Single-Cell RNA-seq PBMC Data from Lee et al. (DOI: 10.1126/sciimmunol.abd1554)
if (!file.exists("data/lee.rds")) system("wget https://datasets.cellxgene.cziscience.com/bdee03e4-14ee-4f3e-a225-3d12f032841f.rds -O data/lee.rds")
  
# Load and transform Lee
lee <- readRDS("data/lee.rds")
if (! "SCT" %in% Seurat::Assays(lee)) {
  lee <- lee[, grepl("Normal", lee$sample)]
  lee <- FindVariableFeatures(lee)
  oopts <- options(future.globals.maxSize = 1.0 * 1e9) 
  on.exit(options(oopts))
  
  lee <- Seurat::SCTransform(lee)
  lee <- FindVariableFeatures(lee)
  
  lee@reductions <- list()
  lee[["RNA"]] <- NULL
}

## Running SCTransform on assay: RNA

## vst.flavor='v2' set. Using model with fixed slope and excluding poisson genes.

## Calculating cell attributes from input UMI matrix: log_umi

## Variance stabilizing transformation of count matrix of size 19051 by 16298

## Model formula is y ~ log_umi

## Get Negative Binomial regression parameters per gene

## Using 2000 genes, 5000 cells

## Found 304 outliers - those will be ignored in fitting/regularization step

## Second step: Get residuals using fitted parameters for 19051 genes

## Computing corrected count matrix for 19051 genes

## Calculating gene attributes

## Wall clock passed: Time difference of 1.075148 mins

## Determine variable features

## Centering data matrix

## Place corrected count matrix in counts slot

## Set default assay to SCT

# Align annotation labels
lee$batch <- "lee"
# lee$sample <- "lee"
celltype <- lee$Cell.group
lee$celltype <- dplyr::case_when(
         celltype == "B cell" ~ "B cells",
         stringr::str_starts(celltype, "CD4") ~ "CD4pos",
         stringr::str_starts(celltype, "CD8") ~ "CD8pos",
         stringr::str_ends(celltype, "Monocyte") ~ "Monocytes",
         celltype == "NK cell" ~ "NK cells",
         celltype %in% c("cDC", "pDC") ~ paste0(celltype, "s"),
         lee$cell_type == "gamma-delta T cell" ~ "TCRgd",
         celltype == "Platelet" ~ "Platelets",
         TRUE ~ "Other"
         )

scRNA-seq - Yoshida

# https://cellxgene.cziscience.com/collections/03f821b4-87be-4ff4-b65a-b5fc00061da7
# Local and systemic responses to SARS-CoV-2 infection in children and adults
if (!file.exists("data/yoshida.rds")) system("wget https://datasets.cellxgene.cziscience.com/d911082e-b5e2-40e6-8f08-fb53c7894622.rds -O data/yoshida.rds")

# Load and transform Lee
yoshida <- readRDS("data/yoshida.rds")

if (! "SCT" %in% Seurat::Assays(yoshida)) {
  yoshida <- yoshida[, grepl("AN", yoshida$sample_id)]
  
  yoshida <- FindVariableFeatures(yoshida)
  
  oopts <- options(future.globals.maxSize = 2.0 * 1e9) 
  on.exit(options(oopts))
  
  yoshida <- Seurat::SCTransform(yoshida)
  yoshida <- FindVariableFeatures(yoshida)
  
  yoshida@reductions <- list()
  yoshida[["RNA"]] <- NULL
  
}

## Running SCTransform on assay: RNA

## vst.flavor='v2' set. Using model with fixed slope and excluding poisson genes.

## Calculating cell attributes from input UMI matrix: log_umi

## Variance stabilizing transformation of count matrix of size 19220 by 46993

## Model formula is y ~ log_umi

## Get Negative Binomial regression parameters per gene

## Using 2000 genes, 5000 cells

## Found 665 outliers - those will be ignored in fitting/regularization step

## Second step: Get residuals using fitted parameters for 19220 genes

## Computing corrected count matrix for 19220 genes

## Calculating gene attributes

## Wall clock passed: Time difference of 2.829858 mins

## Determine variable features

## Centering data matrix

## Place corrected count matrix in counts slot

## Set default assay to SCT

yoshida$batch <- "yoshida"
# yoshida$sample <- "yoshida"
celltype <- yoshida$cell_type
yoshida$celltype <- dplyr::case_when(
  grepl("B cell", celltype) ~ "B cells",
  grepl("natural killer cell", celltype) ~ "NK cells",
  celltype == "conventional dendritic cell" ~ "cDCs",
  celltype == "plasmacytoid dendritic cell" ~ "pDCs",
  celltype == "gamma-delta T cell" ~ "TCRgd",
  celltype == "platelet" ~ "Platelets",
  grepl("CD4", celltype) ~ "CD4pos",
  celltype == "regulatory T cell" ~ "CD4pos",
  grepl("CD8", celltype) ~ "CD8pos",
  grepl("onocyte", celltype) ~ "Monocytes",
  TRUE ~ "Other")

Merge datasets

We use the union of variable features to speed up the integration significantly. Please only use the integrated values for visualizations etc., perform your analyses and draw your conclusions based on the raw/unintegrated values.

In this vignette, we convert the Seurat object to a data.frame for compatibility with plotting and performance measures, but cyCombine can integrate the merged Seurat object directly.

seu <- merge(x = lee, y = yoshida, project = "integrate_RNAseq")
# Variable features
markers <- union(VariableFeatures(lee), VariableFeatures(yoshida))
rm(lee); rm(yoshida)


# Convert to data frame
uncorrected <- seu[markers,] |> 
  GetAssayData("SCT","data") |>
  t() |> 
  as.data.frame() |> 
  dplyr::mutate(batch = seu$batch,
                sample = seu$sample,
                celltype = seu$celltype,
                id = dplyr::row_number())

## Warning in asMethod(object): sparse->dense coercion: allocating vector of size
## 1.3 GiB

rm(seu)
# saveRDS(uncorrected, "results/02_rnaseq_rnaseq_uncorrected.RDS")

Uncorrected

set.seed(seed)
uncor_batch <- plot_umap(
  uncorrected,
  down_sample = T,
  sample_n = 5000,
  markers = markers,
  title = "Uncorrected",
  ref_col = "celltype")

## Generating UMAP

# saveRDS(uncor_batch, "results/02_rnaseq_rnaseq_uncor.RDS")
# ggplot2::ggsave(plot_cell, filename = "figs/02_rnaseq_rnaseq_uncor.RDS")
uncor_batch

Batch correction

For this integration, we are running two iterations of cyCombine to ensure a robust result. The first iteration uses a coarse 1x5 grid clustering, whereas the second iteration uses a somewhat fine-grained 5x5 clustering grid. Cosine distance is used with a FlowSOM clustering to emphasize the angle of variation rather than the amplitude. Note that cosine distance is not available in the default Kohonen cluster method.

system.time({
  suppressMessages({ # Hide per-cluster correction info
  corrected <- cyCombine(
    uncorrected,
    markers = markers,
    cluster_method = "flowsom",
    distf = "cosine",
    seed = seed, 
    method = "ComBat",
    xdim = c(1,5),
    ydim = 5,# Note: the "5" is reused in the second iteration
    mc.cores = 1
  )
  })
})

## Found 217 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found 724 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found 210 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found 210 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found 223 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found 313 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found 228 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found 249 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found 259 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found 235 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found 201 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found 323 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found 367 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found 325 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found 381 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found 234 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found 216 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found 199 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found 438 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found 309 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found 292 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found 303 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found 317 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found 304 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found 726 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found 345 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found 224 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found 616 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found 268 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found 395 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.

##    user  system elapsed 
## 557.086  85.322 434.973

# saveRDS(corrected, "results/02_rnaseq_rnaseq_corrected.RDS")

Plot results - Batch

set.seed(seed)
umap_batch <- corrected |> 
  group_by(sample) |> 
  slice_sample(n = 5000) |> 
  plot_umap(
    return_data = T,
    down_sample = F, 
    seed = seed,
    metric = "cosine",
    markers = markers,
    title = "scRNA-seq Lee vs Yoshida",
    legend_title = "Dataset",
    col = "batch")

## Generating UMAP

# ggplot2::ggsave(umap_batch$plot, filename = "figs/02_rnaseq_rnaseq_batch.png")
# saveRDS(umap_batch$plot, "results/02_rnaseq_rnaseq_batch.RDS")
umap_batch$plot

Plot results - Cells

Note: We use plot_embedding to reuse the UMAP coordinates from the plot above. We could also have rerun plot_umap for a simpler call.

celltype_colors <- cyDefine::get_distinct_colors(sort(unique(uncorrected$celltype)))
celltype_colors[["Other"]] <- "black"
set.seed(seed)

xlim <- c(min(umap_batch$data$UMAP1), max(umap_batch$data$UMAP1))
ylim <- c(min(umap_batch$data$UMAP2), max(umap_batch$data$UMAP2))

plot_cell <- 
  plot_embedding(
    dplyr::filter(umap_batch$data, batch == "lee"),
    colors = celltype_colors, 
    xlim = xlim,
    ylim = ylim,
    title = "scRNAseq - Lee",
    col = "celltype",
    legend_title = "Cell type") + 
  plot_embedding(
    dplyr::filter(umap_batch$data, batch == "yoshida"),
    colors = celltype_colors, 
    xlim = xlim,
    ylim = ylim,
    title = "scRNAseq - Yoshida",
    col = "celltype",
    legend_title = "Cell type") +
  patchwork::plot_layout(
    guides = "collect",
    axes = "collect",
    axis_titles = "collect")

# ggplot2::ggsave(plot_cell, filename = "figs/02_rnaseq_rnaseq_cell.png")
# saveRDS(plot_cell, "results/02_rnaseq_rnaseq_cell.RDS")

plot_cell

Measure performance

EMD: Reduction in Earth Movers Distance [0,1] is a metric of how much more aligned the two modalities have become
MAD: Reduction in Median Absolute Deviation [0,1] is a metric of how much biological variance has been removed.
CiLISI: Cell-type aware iLISI [0,1] is the global mean of the iLISIs computed for each cell type or cluster.

corrected$label <- create_som(corrected, ydim = 5, xdim = 5, seed = seed, markers = markers, cluster_method = "flowsom", distf = "cosine")

## Building SOM

## Mapping data to SOM

uncorrected$label <- corrected$label

system.time({
  lu <- cyCombine::compute_cilisi(
    uncorrected, 
    markers = get_markers(uncorrected), 
    label_cols = "batch", 
    split_by = "label", 
    return_mean = F)
})

##    user  system elapsed 
##  17.366   0.389  17.824

message("CiLISI uncorrected: ", mean(unlist(lu)))

## CiLISI uncorrected: 0.0100322993830509

system.time({
  l <- cyCombine::compute_cilisi(
    corrected, 
    markers = get_markers(corrected), 
    label_cols = "batch", 
    split_by = "label", 
    return_mean = F)
})

##    user  system elapsed 
##  19.496   0.309  19.867

message("CiLISI corrected: ", mean(unlist(l)))

## CiLISI corrected: 0.592147820274851

mad <- cyCombine::evaluate_mad(
  uncorrected, 
  corrected, 
  cell_col = "label", 
  markers = markers
  )

## Computing MAD for corrected data..

## Computing MAD for uncorrected data..

## The MAD score is: 0

emd <- cyCombine::evaluate_emd(
  uncorrected, 
  corrected,
  binSize = 1, 
  cell_col = "label", 
  markers = markers, 
  mc.cores = 39)

## Computing EMD for corrected data..

## Binning destributions using binSize 1

##   |                                                              |   0%, ETA NA  |==                                                         |   4%, ETA 00:46  |=====                                                      |   8%, ETA 00:31  |=======                                                    |  12%, ETA 00:21  |=========                                                  |  16%, ETA 00:18  |============                                               |  20%, ETA 00:15  |==============                                             |  24%, ETA 00:13  |=================                                          |  28%, ETA 00:12  |===================                                        |  32%, ETA 00:11  |=====================                                      |  36%, ETA 00:10  |========================                                   |  40%, ETA 00:09  |==========================                                 |  44%, ETA 00:08  |============================                               |  48%, ETA 00:07  |===============================                            |  52%, ETA 00:07  |=================================                          |  56%, ETA 00:06  |===================================                        |  60%, ETA 00:06  |======================================                     |  64%, ETA 00:05  |========================================                   |  68%, ETA 00:04  |==========================================                 |  72%, ETA 00:04  |=============================================              |  76%, ETA 00:03  |===============================================            |  80%, ETA 00:03  |==================================================         |  84%, ETA 00:02  |====================================================       |  88%, ETA 00:02  |======================================================     |  92%, ETA 00:01  |=========================================================  |  96%, ETA 00:01  |=======================================================| 100%, Elapsed 00:13
##   |                                                              |   0%, ETA NA  |==                                                         |   4%, ETA 00:48  |=====                                                      |   8%, ETA 00:29  |=======                                                    |  12%, ETA 00:23  |=========                                                  |  16%, ETA 00:21  |============                                               |  20%, ETA 00:17  |==============                                             |  24%, ETA 00:15  |=================                                          |  28%, ETA 00:14  |===================                                        |  32%, ETA 00:12  |=====================                                      |  36%, ETA 00:11  |========================                                   |  40%, ETA 00:10  |==========================                                 |  44%, ETA 00:09  |============================                               |  48%, ETA 00:09  |===============================                            |  52%, ETA 00:08  |=================================                          |  56%, ETA 00:07  |===================================                        |  60%, ETA 00:06  |======================================                     |  64%, ETA 00:06  |========================================                   |  68%, ETA 00:05  |==========================================                 |  72%, ETA 00:05  |=============================================              |  76%, ETA 00:04  |===============================================            |  80%, ETA 00:03  |==================================================         |  84%, ETA 00:03  |====================================================       |  88%, ETA 00:02  |======================================================     |  92%, ETA 00:01  |=========================================================  |  96%, ETA 00:01  |=======================================================| 100%, Elapsed 00:16

## Computing marker-wise EMD for each cell cluster

##   |                                                              |   0%, ETA NA  |==                                                         |   4%, ETA 00:20  |=====                                                      |   8%, ETA 00:14  |=======                                                    |  12%, ETA 00:12  |=========                                                  |  16%, ETA 00:11  |============                                               |  20%, ETA 00:10  |==============                                             |  24%, ETA 00:09  |=================                                          |  28%, ETA 00:09  |===================                                        |  32%, ETA 00:08  |=====================                                      |  36%, ETA 00:08  |========================                                   |  40%, ETA 00:07  |==========================                                 |  44%, ETA 00:07  |============================                               |  48%, ETA 00:06  |===============================                            |  52%, ETA 00:06  |=================================                          |  56%, ETA 00:05  |===================================                        |  60%, ETA 00:05  |======================================                     |  64%, ETA 00:04  |========================================                   |  68%, ETA 00:04  |==========================================                 |  72%, ETA 00:03  |=============================================              |  76%, ETA 00:03  |===============================================            |  80%, ETA 00:02  |==================================================         |  84%, ETA 00:02  |====================================================       |  88%, ETA 00:01  |======================================================     |  92%, ETA 00:01  |=========================================================  |  96%, ETA 00:00  |=======================================================| 100%, Elapsed 00:11

## Computing EMD for uncorrected data..
## Binning destributions using binSize 1

##   |                                                              |   0%, ETA NA  |==                                                         |   4%, ETA 00:41  |=====                                                      |   8%, ETA 00:30  |=======                                                    |  12%, ETA 00:21  |=========                                                  |  16%, ETA 00:17  |============                                               |  20%, ETA 00:14  |==============                                             |  24%, ETA 00:12  |=================                                          |  28%, ETA 00:12  |===================                                        |  32%, ETA 00:11  |=====================                                      |  36%, ETA 00:09  |========================                                   |  40%, ETA 00:09  |==========================                                 |  44%, ETA 00:08  |============================                               |  48%, ETA 00:07  |===============================                            |  52%, ETA 00:07  |=================================                          |  56%, ETA 00:06  |===================================                        |  60%, ETA 00:06  |======================================                     |  64%, ETA 00:05  |========================================                   |  68%, ETA 00:05  |==========================================                 |  72%, ETA 00:04  |=============================================              |  76%, ETA 00:03  |===============================================            |  80%, ETA 00:03  |==================================================         |  84%, ETA 00:02  |====================================================       |  88%, ETA 00:02  |======================================================     |  92%, ETA 00:01  |=========================================================  |  96%, ETA 00:01  |=======================================================| 100%, Elapsed 00:13
##   |                                                              |   0%, ETA NA  |==                                                         |   4%, ETA 00:48  |=====                                                      |   8%, ETA 00:32  |=======                                                    |  12%, ETA 00:25  |=========                                                  |  16%, ETA 00:20  |============                                               |  20%, ETA 00:16  |==============                                             |  24%, ETA 00:13  |=================                                          |  28%, ETA 00:13  |===================                                        |  32%, ETA 00:12  |=====================                                      |  36%, ETA 00:10  |========================                                   |  40%, ETA 00:10  |==========================                                 |  44%, ETA 00:09  |============================                               |  48%, ETA 00:09  |===============================                            |  52%, ETA 00:08  |=================================                          |  56%, ETA 00:07  |===================================                        |  60%, ETA 00:06  |======================================                     |  64%, ETA 00:06  |========================================                   |  68%, ETA 00:05  |==========================================                 |  72%, ETA 00:04  |=============================================              |  76%, ETA 00:04  |===============================================            |  80%, ETA 00:03  |==================================================         |  84%, ETA 00:02  |====================================================       |  88%, ETA 00:02  |======================================================     |  92%, ETA 00:01  |=========================================================  |  96%, ETA 00:01  |=======================================================| 100%, Elapsed 00:16

## Computing marker-wise EMD for each cell cluster

##   |                                                              |   0%, ETA NA  |==                                                         |   4%, ETA 00:20  |=====                                                      |   8%, ETA 00:14  |=======                                                    |  12%, ETA 00:12  |=========                                                  |  16%, ETA 00:11  |============                                               |  20%, ETA 00:10  |==============                                             |  24%, ETA 00:10  |=================                                          |  28%, ETA 00:09  |===================                                        |  32%, ETA 00:08  |=====================                                      |  36%, ETA 00:08  |========================                                   |  40%, ETA 00:07  |==========================                                 |  44%, ETA 00:07  |============================                               |  48%, ETA 00:06  |===============================                            |  52%, ETA 00:06  |=================================                          |  56%, ETA 00:05  |===================================                        |  60%, ETA 00:05  |======================================                     |  64%, ETA 00:04  |========================================                   |  68%, ETA 00:04  |==========================================                 |  72%, ETA 00:03  |=============================================              |  76%, ETA 00:03  |===============================================            |  80%, ETA 00:02  |==================================================         |  84%, ETA 00:02  |====================================================       |  88%, ETA 00:01  |======================================================     |  92%, ETA 00:01  |=========================================================  |  96%, ETA 00:00  |=======================================================| 100%, Elapsed 00:11

## Removing EMDs below 2 both before and after correction

## The reduction is: 0.92

## Creating plots..

## Evaluation complete.

# saveRDS(list("emd" = emd, "mad" = mad, "CiLISI_uc" = lu, "CiLISI_c" = l), "results/02_rnaseq_rnaseq_counts_performance.RDS")

Contact