This vignette demonstrates integrating scRNA-seq and CyTOF using cyCombine. In order to assimilate the two data modalities, we use our own implementation of a Single-cell nearest-neighbor pseudobulking (scennep).

Dependencies

library("Seurat")
library("pbmcapply")
library("dplyr")
library("ggplot2")
library("patchwork")
library("cyDefine") # Available at https://github.com/biosurf/cyDefine
library("cyCombine")
library("scennep")

seed <- 942
mc.cores <- 39
rerun <- FALSE

system("mkdir -p data")
system("mkdir -p results")

The datasets

scRNA-seq

We are using the normal samples from a public COVID paper, Lee et al., 2020

# Individual Single-Cell RNA-seq PBMC Data from Lee et al. (DOI: 10.1126/sciimmunol.abd1554)
if (file.exists("data/01_lee.rds")) {
  rnaseq <- readRDS("data/01_lee.rds")
} else {
  system("wget https://datasets.cellxgene.cziscience.com/bdee03e4-14ee-4f3e-a225-3d12f032841f.rds -O data/lee.rds")
  
  # Load and transform Lee
  lee <- readRDS("data/lee.rds")
  if (! "SCT" %in% Seurat::Assays(lee)) {
    lee <- lee[, grepl("Normal", lee$sample)]
    lee <- FindVariableFeatures(lee)
    oopts <- options(future.globals.maxSize = 1.0 * 1e9) 
    on.exit(options(oopts))
    
    lee <- Seurat::SCTransform(lee)
    lee <- FindVariableFeatures(lee)
    
    lee@reductions <- list()
    lee[["RNA"]] <- NULL
  }
  
  # Align annotation labels
  lee$batch <- "lee"
  lee$sample <- "lee"
  celltype <- lee$Cell.group
  lee$celltype <- dplyr::case_when(
           celltype == "B cell" ~ "B cells",
           stringr::str_starts(celltype, "CD4") ~ "CD4pos",
           stringr::str_starts(celltype, "CD8") ~ "CD8pos",
           stringr::str_ends(celltype, "Monocyte") ~ "Monocytes",
           celltype == "NK cell" ~ "NK cells",
           celltype %in% c("cDC", "pDC") ~ paste0(celltype, "s"),
           lee$cell_type == "gamma-delta T cell" ~ "TCRgd",
           TRUE ~ "Other"
           )
  saveRDS(lee, "data/01_lee.rds")
  system("rm data/lee.rds")
  
  rnaseq <- lee
  rm(lee)
} 

CyTOF

The CyTOF dataset (FR-FCM-ZYAJ) was manually annotated by Mike Leipold. Only sample 001 is used.

if (file.exists("data/01_cytof_himc.RDS") & !rerun) {
  cytof <- readRDS("data/01_cytof_himc.RDS")
} else {
load("data/data_transformed.Rdata")

# Rename columns and filter to one healthy sample
himc <- expr_trans |> 
  dplyr::rename(
    "CD45" = "CD45RA",
    "HLA-DR" = HLADR,
    celltype = population) |> 
  dplyr::filter(
    sample == "001",
  ) |> 
  dplyr::mutate(
    batch = "CyTOF",
    sample = "CyTOF",
    celltype = case_when(
      celltype == "mDCs" ~ "cDCs",
      celltype == "Basophils" ~ "Other",
      celltype == "Platelets" ~ "Other",
      celltype == "NKT" ~ "Other",
      TRUE ~ celltype
    )
  )
rm(expr_trans)
}

Map CyTOF markers to gene names

if (!file.exists("data/01_cytof_himc.RDS") | rerun) {
# Download marker info
if (!file.exists("ab_info.xlsx")) system("wget https://ars.els-cdn.com/content/image/1-s2.0-S0092867421005833-mmc1.xlsx -O ab_info.xlsx")

# Load marker mapper
gene_marker <- readxl::read_excel("ab_info.xlsx")

# gene_marker <- 
gene_marker$`Ensembl Gene Id` <- gene_marker$`Ensembl Gene Id` |> stringr::str_remove_all(".*MUS.* ")

# Select relevant genes-marker pairs
gm <- sapply(colnames(himc), function(m) gene_marker$`Ensembl Gene Id`[stringr::str_detect(gene_marker$Specificity, m)][1])

# Manually assign gene IDs for remaining markers
gm[is.na(gm)]
gm[c("CCR7", "CD85j", "TCRgd", "celltype") ] <- c("ENSG00000126353", "ENSG00000104972", "ENSG00000211701", NA)
gm <- gm[!is.na(gm)]

# Replace column names
colnames(himc)[which(colnames(himc) %in% names(gm))] <- gm[colnames(himc)[which(colnames(himc) %in% names(gm))]]

saveRDS(himc, file = "data/01_cytof_himc.RDS")
cytof <- himc
rm(himc)
}

Pseudobulk neighbors of scRNA-seq

# Common markers
markers <- intersect(rownames(rnaseq), colnames(cytof))

system.time({
# Run scennep
rna <- scennep(
  rnaseq,
  mc.cores = mc.cores, 
  distance = "cosine",
  markers = markers,
  pb = FALSE,
  return_S4 = FALSE)
})
## Using the top 22 pcs for the SNN
## Building SNN graph with k = 20
## Pseudobulking each cell with its 20 nearest neighbors
##    user  system elapsed 
## 537.404  66.749  45.876

Merge datasets

rnaseq_df <- rna |> 
  t() |>
  tibble::as_tibble() |> 
  mutate(batch = "scRNA-seq",
         sample = "scRNA-seq",
         celltype = rnaseq$celltype) |>
  dplyr::select(dplyr::any_of(c(markers, non_markers)))

# Merge datasets
set.seed(seed)
uncorrected <- cytof |> 
  dplyr::slice_sample(n = ncol(rna)*2) |>
  dplyr::bind_rows(rnaseq_df) |>
  dplyr::select(dplyr::any_of(c(markers, non_markers))) |> 
  dplyr::mutate(id = dplyr::row_number())


# saveRDS(uncorrected, "results/02_rnaseq_cytof_uncorrected.RDS")

Uncorrected

set.seed(seed)
uncor <- uncorrected |> 
  group_by(sample) |> 
  slice_sample(n = 5000) |> 
  plot_umap(
    title = "scRNA-seq + Mass Cytometry",
    down_sample = F,
    metric = "cosine",
    markers = markers,
    col = "batch",
    legend_title = "Technology",)
## Generating UMAP
# saveRDS(uncor, "results/02_rnaseq_cytof_uncor.RDS")
uncor

Integration

For this integration, we are running three iterations of cyCombine to ensure a robust result. The first iteration excludes a clustering step, the second uses a coarse 1x5 grid clustering, and the final iteration uses a somewhat fine-grained 5x5 clustering grid. Cosine distance is used with a FlowSOM clustering to emphasize the angle of variation rather than the amplitude. Note that cosine distance is not available in the default Kohonen cluster method.

system.time({
  suppressMessages({ # Hide per-cluster correction info
    corrected <- cyCombine(
      uncorrected,
      markers = markers,
      cluster_method = "flowsom",
      distf = "cosine",
      seed = seed,
      xdim = c(1,1,5),
      ydim = c(1,5), # Note: the second "5" is reused in the third iteration
      mc.cores = 1, pb = F 
  )
  })
})
## Found 1 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found 1 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found 1 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found 1 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found 1 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found 2 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found 1 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found 1 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found 1 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found 1 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found 1 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found 1 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found 1 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found 1 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found 1 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found 2 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found 1 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found 1 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found 1 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found 1 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found 1 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found 1 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found 1 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found 1 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found 1 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found 1 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found 1 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found 1 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found 1 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found 1 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
##    user  system elapsed 
##  88.500  12.576   6.854
# saveRDS(corrected, "results/02_rnaseq_cytof_corrected.RDS")

Plot results - Batch

set.seed(seed)
umap_batch <- corrected |> 
  group_by(sample) |> 
  slice_sample(n = 5000) |> 
  plot_umap(
    return_data = T,
    down_sample = F, 
    seed = seed,
    metric = "cosine",
    markers = markers,
    title = "scRNA-seq + Mass Cytometry",
    legend_title = "Technology",
    col = "batch")
## Generating UMAP
# ggplot2::ggsave(umap_batch$plot, filename = "figs/02_rnaseq_cytof_batch.png")
# saveRDS(umap_batch$plot, "results/02_rnaseq_cytof_batch.RDS")
umap_batch$plot

Plot results - Cells

Note: We use plot_embedding to reuse the UMAP coordinates from the plot above. We could also have rerun plot_umap for a simpler call.

celltype_colors <- cyDefine::get_distinct_colors(sort(unique(uncorrected$celltype)))
celltype_colors[["Other"]] <- "black"

xlim <- c(min(umap_batch$data$UMAP1), max(umap_batch$data$UMAP1))
ylim <- c(min(umap_batch$data$UMAP2), max(umap_batch$data$UMAP2))

plot_cell <- 
  plot_embedding(
    dplyr::filter(umap_batch$data, batch == "scRNA-seq"),
    colors = celltype_colors, 
    xlim = xlim,
    ylim = ylim,
    title = "scRNA-seq",
    col = "celltype",
    legend_title = "Cell type") + 
  plot_embedding(
    dplyr::filter(umap_batch$data, batch == "CyTOF"),
    colors = celltype_colors, 
    xlim = xlim,
    ylim = ylim,
    title = "Mass Cytometry",
    col = "celltype",
    legend_title = "Cell type") +
  patchwork::plot_layout(
    guides = "collect",
    axes = "collect",
    axis_titles = "collect")

# saveRDS(plot_cell, "results/02_rnaseq_cytof_cell.RDS")
# ggplot2::ggsave(plot_cell, filename = "figs/02_rnaseq_cytof_cell.png")
plot_cell

Measure performance

  • EMD: Reduction in Earth Movers Distance [0,1] is a metric of how much more aligned the two modalities have become
  • MAD: Reduction in Median Absolute Deviation [0,1] is a metric of how much biological variance has been removed.
  • CiLISI: Cell-type aware iLISI [0,1] is the global mean of the iLISIs computed for each cell type or cluster.
corrected$label <- create_som(corrected, ydim = 5, xdim = 5, seed = seed, markers = markers, cluster_method = "flowsom", distf = "cosine")
## Building SOM
## Mapping data to SOM
uncorrected$label <- corrected$label

system.time({
  lu <- cyCombine::compute_cilisi(df = uncorrected, markers = get_markers(uncorrected), label_cols = "batch", split_by = "label", return_mean = F)
})
## Warning in compute_lisi(df = df_label, markers = markers, label_cols =
## label_cols): Cannot calculate more neighbors than there are points. Setting
## perplexity to 2
##    user  system elapsed 
##   0.435   0.008   0.445
message("CiLISI uncorrected: ", mean(unlist(lu)))
## CiLISI uncorrected: 0.00783590938374358
system.time({
  l <- cyCombine::compute_cilisi(df = corrected, markers = get_markers(corrected), label_cols = "batch", split_by = "label", return_mean = F)
})
## Warning in compute_lisi(df = df_label, markers = markers, label_cols =
## label_cols): Cannot calculate more neighbors than there are points. Setting
## perplexity to 2
##    user  system elapsed 
##   0.563   0.005   0.570
message("CiLISI corrected: ", mean(unlist(l)))
## CiLISI corrected: 0.485482059171325
mad <- cyCombine::evaluate_mad(uncorrected, corrected, cell_col = "label", markers = markers)
## Computing MAD for corrected data..
## Computing MAD for uncorrected data..
## The MAD score is: 0.08
emd <- cyCombine::evaluate_emd(uncorrected, corrected, cell_col = "label", markers = markers)
## Computing EMD for corrected data..
## Binning destributions using binSize 0.1
## Computing marker-wise EMD for each cell cluster
## Computing EMD for uncorrected data..
## Binning destributions using binSize 0.1
## Computing marker-wise EMD for each cell cluster
## Removing EMDs below 2 both before and after correction
## The reduction is: 0.8
## Creating plots..
## Evaluation complete.
# saveRDS(list("emd" = emd, "mad" = mad, "CiLISI_uc" = lu, "CiLISI_c" = l), "results/02_rnaseq_cytof_performance.RDS")
 

Contact