This vignette demonstrates integrating scRNA-seq and Spectral Flow Cytometry using cyCombine. In order to assimilate the two data modalities, we use our own implementation of a Single-cell nearest-neighbor pseudobulking (scennep).

Dependencies

library("Seurat")
library("pbmcapply")
library("dplyr")
library("readr")
library("ggplot2")
library("patchwork")
library("cyDefine") # Available at https://github.com/biosurf/cyDefine
library("cyCombine")
library("scennep")

seed <- 942
mc.cores <- 39
rerun <- FALSE
system("mkdir -p data")
system("mkdir -p results")

The datasets

scRNA-seq

We are using the normal samples from a public COVID paper, Lee et al., 2020

# Individual Single-Cell RNA-seq PBMC Data from Lee et al. (DOI: 10.1126/sciimmunol.abd1554)
if (file.exists("data/01_lee.rds")) {
  rnaseq <- readRDS("data/01_lee.rds")
} else {
  system("wget https://datasets.cellxgene.cziscience.com/bdee03e4-14ee-4f3e-a225-3d12f032841f.rds -O data/lee.rds")
  
  # Load and transform Lee
  lee <- readRDS("data/lee.rds")
  if (! "SCT" %in% Seurat::Assays(lee)) {
    lee <- lee[, grepl("Normal", lee$sample)]
    lee <- FindVariableFeatures(lee)
    oopts <- options(future.globals.maxSize = 1.0 * 1e9) 
    on.exit(options(oopts))
    
    lee <- Seurat::SCTransform(lee)
    lee <- FindVariableFeatures(lee)
    
    lee@reductions <- list()
    lee[["RNA"]] <- NULL
  }
  
  # Align annotation labels
  lee$batch <- "lee"
  lee$sample <- "lee"
  celltype <- lee$Cell.group
  lee$celltype <- dplyr::case_when(
           celltype == "B cell" ~ "B cells",
           stringr::str_starts(celltype, "CD4") ~ "CD4pos",
           stringr::str_starts(celltype, "CD8") ~ "CD8pos",
           stringr::str_ends(celltype, "Monocyte") ~ "Monocytes",
           celltype == "NK cell" ~ "NK cells",
           celltype %in% c("cDC", "pDC") ~ paste0(celltype, "s"),
           lee$cell_type == "gamma-delta T cell" ~ "TCRgd",
           # lee$cell_type == "platelet" ~ "Platelets",
           TRUE ~ "Other"
           )
  saveRDS(lee, "data/01_lee.rds")
  system("rm data/lee.rds")
  
  rnaseq <- lee
  rm(lee)
} 

Spectral Flow Cytometry

The Spectral Flow Cytometry (SFC) was acquired from FlowRepository (FR-FCM-Z2QV) and manually annotated using simple marker thresholds.

if (file.exists("data/01_spectral_park.RDS")  & !rerun) {
  spectral <- readRDS("data/01_spectral_park.RDS")
} else {
data_dir <- "../data/park2020_Z2QV"

# Download panel and metadata
if (!file.exists(file.path(data_dir, "panel_Park2020.csv"))) system("wget https://github.com/biosurf/cyCombine/files/13414991/panel_Park2020.csv -O data/FR-FCM-Z2QV/panel_Park2020.csv")
if (!file.exists(file.path(data_dir, "metadata_Park2020.xlsx"))) system("wget https://github.com/biosurf/cyCombine/files/13474508/Spectral.samples.cohort.xlsx -O data/FR-FCM-Z2QV/metadata_Park2020.xlsx")



sfc_panel <- read_csv(file.path(data_dir, "panel_Park2020.csv"))
sfc_metadata <- readxl::read_excel(file.path(data_dir, "metadata_Park2020.xlsx"))
sfc_metadata$FCS_name <- paste0("MC ", sfc_metadata$`Patient id`, ".fcs")

# Define markers
sfc_markers <- sfc_panel |> 
  dplyr::filter(Type != "none") |> 
  dplyr::pull(Marker) |> 
  stringr::str_remove_all("[ _-]")

spectral <- prepare_data(
  data_dir = data_dir,
  metadata = sfc_metadata,
  filename_col = "FCS_name",
  pattern = "^MC.+fcs", # Solely include MC files
  batch_ids = "Batch",
  condition = "Set",
  sample_ids = "Patient id",
  cofactor = 6000,
  derand = FALSE,
  markers = sfc_markers,
  down_sample = FALSE)

# Subset to a single sample for simplicity
spectral <- spectral %>%
  dplyr::filter(sample == "303444") |> 
  dplyr::mutate(CD45 = pmax(.data$CD45, .data$CD45RA)) |> 
  dplyr::select(-CD45RA)
}
## Rows: 47 Columns: 3
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (3): Channel, Marker, Type
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Preparing FCS files in directory ../data/park2020_Z2QV
## 
## Reading 4 files to a flowSet..
## 
## Converting flowset to data frame
## 
## Extracting expression data..
## 
## Your flowset is now converted into a dataframe.
## 
## Transforming data using asinh with a cofactor of 6000..
## 
## Done!

Cell annotation

if (!file.exists("data/01_spectral_park.RDS") | rerun) {
spectral <- spectral |> 
  dplyr::mutate(celltype = case_when(
    # CD3 > 2 & CD4 > 1.5 & CD8 > 1.5 ~ "Doublets",
    # CD3 > 2 & (CD19 > 1.5 | CD20 > 1.5) ~ "Doublets",
    # CD14 > 2 & (CD19 > 1.5 | CD20 > 1.5) ~ "Doublets",
    # CD14 > 2 & CD3 > 2 ~ "Doublets",
    HLADR > 2 & CD11c > 2 & (CD1c > 2.5 | CD141 > 2.5) ~ "cDCs",
    CD14 > 2.5 ~ "Monocytes",
    CD3 > 2 & CD4 > 2 ~ "CD4pos",
    CD3 > 2 & CD8 > 2 ~ "CD8pos",
    CD19 > 1.5 & CD20 > 1.5 ~ "B cells",
    CD56 > 2 & CD3 < 2 ~ "NK cells",
    TCRgd > 2 ~ "TCRgd",
    CD123 > 2 ~ "pDCs",
    TRUE ~ "Other"
  ))
}

Map SFC markers to gene names

if (!file.exists("data/01_spectral_park.RDS") | rerun) {
# Download marker info
if (!file.exists("ab_info.xlsx")) system("wget https://ars.els-cdn.com/content/image/1-s2.0-S0092867421005833-mmc1.xlsx -O ab_info.xlsx")

# Load marker mapper
gene_marker <- readxl::read_excel("ab_info.xlsx")

# gene_marker <- 
gene_marker$`Ensembl Gene Id` <- gene_marker$`Ensembl Gene Id` |> stringr::str_remove_all(".*MUS.* ")

# Select relevant genes-marker pairs
gm <- sapply(colnames(spectral), function(m) gene_marker$`Ensembl Gene Id`[stringr::str_detect(gene_marker$Specificity, m)][1])

# Manually assign gene IDs for remaining markers
gm[is.na(gm)]
gm[c("CCR7", "PD1", "TCRgd", "HLADR", "CD159c", "CXCR3", "CD159a", "IgD", "id", "celltype") ] <- c("ENSG00000126353", "ENSG00000188389", "ENSG00000211701", "ENSG00000204287", "ENSG00000205809", "ENSG00000186810", "ENSG00000134545", NA, "id", "celltype")
gm <- gm[!is.na(gm)]

# Replace column names
colnames(spectral)[which(colnames(spectral) %in% names(gm))] <- gm[colnames(spectral)[which(colnames(spectral) %in% names(gm))]]

saveRDS(spectral, file = "data/01_spectral_park.RDS")
}

Pseudobulk neighbors of scRNA-seq

# Common markers
markers <- intersect(get_markers(rnaseq), get_markers(spectral))

system.time({
# Run scennep
rna <- scennep(
  rnaseq,
  mc.cores = mc.cores, 
  distance = "cosine",
  markers = markers,
  pb = FALSE,
  return_S4 = FALSE)
})
## Using the top 22 pcs for the SNN
## Building SNN graph with k = 20
## Pseudobulking each cell with its 20 nearest neighbors
##    user  system elapsed 
## 534.778  65.436  45.569

Merge datasets

rnaseq_df <- rna |> 
  # GetAssayData("scennep","data") |>
  t() |>
  tibble::as_tibble() |> 
  mutate(batch = "scRNA-seq",
         sample = "scRNA-seq",
         celltype = rnaseq$celltype
         )  |>
  dplyr::select(dplyr::any_of(c(markers, non_markers)))

# Merge datasets
set.seed(seed)
uncorrected <- spectral |> 
  dplyr::slice_sample(n = ncol(rna)*2) |>
  dplyr::bind_rows(rnaseq_df) |> 
  dplyr::select(dplyr::any_of(c(markers, non_markers))) |> 
  dplyr::mutate(id = dplyr::row_number())

# saveRDS(uncorrected, "results/02_rnaseq_sfc_uncorrected.RDS")

Uncorrected

set.seed(seed)
umap_batch <- uncorrected |> 
  group_by(sample) |> 
  slice_sample(n = 5000) |> 
  plot_umap(
    down_sample = F,
    metric = "cosine",
    title = "scRNA-seq + Spectral Flow",
    markers = markers,
    col = "batch",
    legend_title = "Technology")
## Generating UMAP
# saveRDS(umap_batch, "results/02_rnaseq_sfc_uncor.RDS")
umap_batch

Integration

For this integration, we are running three iterations of cyCombine to ensure a robust result. The first iteration excludes a clustering step, the second uses a coarse 1x5 grid clustering, and the final iteration uses a somewhat fine-grained 5x5 clustering grid. Cosine distance is used with a FlowSOM clustering to emphasize the angle of variation rather than the amplitude. Note that cosine distance is not available in the default Kohonen cluster method.

system.time({
  suppressMessages({ # Hide per-cluster correction info
    corrected <- cyCombine(
      uncorrected,
      markers = markers,
      cluster_method = "flowsom",
      distf = "cosine",
      seed = seed,
      xdim = c(1,1,5),
      ydim = c(1,5), # Note: the second "5" is reused in the third iteration
      mc.cores = 1, pb = F 
  )
  })
})
## Found 1 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found 1 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found 2 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found 1 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found 1 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found 1 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found 1 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found 1 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found 1 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found 1 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found 1 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found 1 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found 1 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found 1 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found 1 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found 1 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found 1 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found 1 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found 1 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found 1 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found 1 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found 2 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found 2 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found 1 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found 1 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found 1 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found 10 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found 1 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found 1 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found 1 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
##    user  system elapsed 
##  82.179  19.434   8.227
# saveRDS(corrected, "results/02_rnaseq_sfc_corrected.RDS")

Plot results - Batch

set.seed(seed)
umap_batch <- corrected |> 
  group_by(sample) |> 
  slice_sample(n = 5000) |> 
  plot_umap(
    return_data = T,
    down_sample = F, 
    seed = seed,
    metric = "cosine",
    markers = markers,
    title = "scRNA-seq + Spectral Flow",
    legend_title = "Technology",
    col = "batch")
## Generating UMAP
# ggplot2::ggsave(umap_batch$plot, filename = "figs/02_rnaseq_sfc_batch.png")
# saveRDS(umap_batch$plot, "results/02_rnaseq_sfc_batch.RDS")
umap_batch$plot

Plot results - Cells

Note: We use plot_embedding to reuse the UMAP coordinates from the plot above. We could also have rerun plot_umap for a simpler call.

celltype_colors <- cyDefine::get_distinct_colors(sort(unique(uncorrected$celltype)))
celltype_colors[["Other"]] <- "black"

xlim <- c(min(umap_batch$data$UMAP1), max(umap_batch$data$UMAP1))
ylim <- c(min(umap_batch$data$UMAP2), max(umap_batch$data$UMAP2))

plot_cell <- 
  plot_embedding(
    dplyr::filter(umap_batch$data, batch == "scRNA-seq"),
    colors = celltype_colors, 
    xlim = xlim,
    ylim = ylim,
    title = "scRNA-seq",
    col = "celltype",
    legend_title = "Cell type") + 
  plot_embedding(
    dplyr::filter(umap_batch$data, batch == "Spectral"),
    colors = celltype_colors, 
    xlim = xlim,
    ylim = ylim,
    title = "Spectral Flow",
    col = "celltype",
    legend_title = "Cell type") + 
  patchwork::plot_layout(
    guides = "collect",
    axes = "collect",
    axis_titles = "collect")

# saveRDS(plot_cell, "results/02_rnaseq_sfc_cell.RDS")
# ggplot2::ggsave(plot_cell, filename = "figs/02_rnaseq_sfc_cell.png")
plot_cell

Measure performance

  • EMD: Reduction in Earth Movers Distance [0,1] is a metric of how much more aligned the two modalities have become
  • MAD: Reduction in Median Absolute Deviation [0,1] is a metric of how much biological variance has been removed.
  • CiLISI: Cell-type aware iLISI [0,1] is the global mean of the iLISIs computed for each cell type or cluster.
corrected$label <- create_som(corrected, ydim = 5, xdim = 5, seed = seed, markers = markers, cluster_method = "flowsom", distf = "cosine")
## Building SOM
## Mapping data to SOM
uncorrected$label <- corrected$label
uncorrected$id <- corrected$id

system.time({
  lu <- cyCombine::compute_cilisi(df = uncorrected, markers = get_markers(uncorrected), label_cols = "batch", split_by = "label", return_mean = F)
})
##    user  system elapsed 
##   0.486   0.007   0.495
message("CiLISI uncorrected: ", mean(unlist(lu)))
## CiLISI uncorrected: 0.0189570523167698
system.time({
  l <- cyCombine::compute_cilisi(df = corrected, markers = get_markers(corrected), label_cols = "batch", split_by = "label", return_mean = F)
})
##    user  system elapsed 
##   0.615   0.006   0.622
message("CiLISI corrected: ", mean(unlist(l)))
## CiLISI corrected: 0.518000456915949
mad <- cyCombine::evaluate_mad(uncorrected, corrected, cell_col = "label", markers = markers)
## Computing MAD for corrected data..
## Computing MAD for uncorrected data..
## The MAD score is: 0.05
emd <- cyCombine::evaluate_emd(uncorrected, corrected, cell_col = "label", markers = markers)
## Computing EMD for corrected data..
## Binning destributions using binSize 0.1
## Computing marker-wise EMD for each cell cluster
## Computing EMD for uncorrected data..
## Binning destributions using binSize 0.1
## Computing marker-wise EMD for each cell cluster
## Removing EMDs below 2 both before and after correction
## The reduction is: 0.71
## Creating plots..
## Evaluation complete.
# saveRDS(list("emd" = emd, "mad" = mad, "CiLISI_uc" = lu, "CiLISI_c" = l), "results/02_rnaseq_sfc_performance.RDS")
 

Contact