This vignette demonstrates integrating scRNA-seq and Spectral Flow Cytometry using cyCombine. In order to assimilate the two data modalities, we use our own implementation of a Single-cell nearest-neighbor pseudobulking (scennep).
library("Seurat")
library("pbmcapply")
library("dplyr")
library("readr")
library("ggplot2")
library("patchwork")
library("cyDefine") # Available at https://github.com/biosurf/cyDefine
library("cyCombine")
library("scennep")
seed <- 942
mc.cores <- 39
rerun <- FALSE
system("mkdir -p data")
system("mkdir -p results")We are using the normal samples from a public COVID paper, Lee et al., 2020
# Individual Single-Cell RNA-seq PBMC Data from Lee et al. (DOI: 10.1126/sciimmunol.abd1554)
if (file.exists("data/01_lee.rds")) {
rnaseq <- readRDS("data/01_lee.rds")
} else {
system("wget https://datasets.cellxgene.cziscience.com/bdee03e4-14ee-4f3e-a225-3d12f032841f.rds -O data/lee.rds")
# Load and transform Lee
lee <- readRDS("data/lee.rds")
if (! "SCT" %in% Seurat::Assays(lee)) {
lee <- lee[, grepl("Normal", lee$sample)]
lee <- FindVariableFeatures(lee)
oopts <- options(future.globals.maxSize = 1.0 * 1e9)
on.exit(options(oopts))
lee <- Seurat::SCTransform(lee)
lee <- FindVariableFeatures(lee)
lee@reductions <- list()
lee[["RNA"]] <- NULL
}
# Align annotation labels
lee$batch <- "lee"
lee$sample <- "lee"
celltype <- lee$Cell.group
lee$celltype <- dplyr::case_when(
celltype == "B cell" ~ "B cells",
stringr::str_starts(celltype, "CD4") ~ "CD4pos",
stringr::str_starts(celltype, "CD8") ~ "CD8pos",
stringr::str_ends(celltype, "Monocyte") ~ "Monocytes",
celltype == "NK cell" ~ "NK cells",
celltype %in% c("cDC", "pDC") ~ paste0(celltype, "s"),
lee$cell_type == "gamma-delta T cell" ~ "TCRgd",
# lee$cell_type == "platelet" ~ "Platelets",
TRUE ~ "Other"
)
saveRDS(lee, "data/01_lee.rds")
system("rm data/lee.rds")
rnaseq <- lee
rm(lee)
} The Spectral Flow Cytometry (SFC) was acquired from FlowRepository (FR-FCM-Z2QV) and manually annotated using simple marker thresholds.
if (file.exists("data/01_spectral_park.RDS") & !rerun) {
spectral <- readRDS("data/01_spectral_park.RDS")
} else {
data_dir <- "../data/park2020_Z2QV"
# Download panel and metadata
if (!file.exists(file.path(data_dir, "panel_Park2020.csv"))) system("wget https://github.com/biosurf/cyCombine/files/13414991/panel_Park2020.csv -O data/FR-FCM-Z2QV/panel_Park2020.csv")
if (!file.exists(file.path(data_dir, "metadata_Park2020.xlsx"))) system("wget https://github.com/biosurf/cyCombine/files/13474508/Spectral.samples.cohort.xlsx -O data/FR-FCM-Z2QV/metadata_Park2020.xlsx")
sfc_panel <- read_csv(file.path(data_dir, "panel_Park2020.csv"))
sfc_metadata <- readxl::read_excel(file.path(data_dir, "metadata_Park2020.xlsx"))
sfc_metadata$FCS_name <- paste0("MC ", sfc_metadata$`Patient id`, ".fcs")
# Define markers
sfc_markers <- sfc_panel |>
dplyr::filter(Type != "none") |>
dplyr::pull(Marker) |>
stringr::str_remove_all("[ _-]")
spectral <- prepare_data(
data_dir = data_dir,
metadata = sfc_metadata,
filename_col = "FCS_name",
pattern = "^MC.+fcs", # Solely include MC files
batch_ids = "Batch",
condition = "Set",
sample_ids = "Patient id",
cofactor = 6000,
derand = FALSE,
markers = sfc_markers,
down_sample = FALSE)
# Subset to a single sample for simplicity
spectral <- spectral %>%
dplyr::filter(sample == "303444") |>
dplyr::mutate(CD45 = pmax(.data$CD45, .data$CD45RA)) |>
dplyr::select(-CD45RA)
}## Rows: 47 Columns: 3
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (3): Channel, Marker, Type
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Preparing FCS files in directory ../data/park2020_Z2QV
##
## Reading 4 files to a flowSet..
##
## Converting flowset to data frame
##
## Extracting expression data..
##
## Your flowset is now converted into a dataframe.
##
## Transforming data using asinh with a cofactor of 6000..
##
## Done!
if (!file.exists("data/01_spectral_park.RDS") | rerun) {
spectral <- spectral |>
dplyr::mutate(celltype = case_when(
# CD3 > 2 & CD4 > 1.5 & CD8 > 1.5 ~ "Doublets",
# CD3 > 2 & (CD19 > 1.5 | CD20 > 1.5) ~ "Doublets",
# CD14 > 2 & (CD19 > 1.5 | CD20 > 1.5) ~ "Doublets",
# CD14 > 2 & CD3 > 2 ~ "Doublets",
HLADR > 2 & CD11c > 2 & (CD1c > 2.5 | CD141 > 2.5) ~ "cDCs",
CD14 > 2.5 ~ "Monocytes",
CD3 > 2 & CD4 > 2 ~ "CD4pos",
CD3 > 2 & CD8 > 2 ~ "CD8pos",
CD19 > 1.5 & CD20 > 1.5 ~ "B cells",
CD56 > 2 & CD3 < 2 ~ "NK cells",
TCRgd > 2 ~ "TCRgd",
CD123 > 2 ~ "pDCs",
TRUE ~ "Other"
))
}if (!file.exists("data/01_spectral_park.RDS") | rerun) {
# Download marker info
if (!file.exists("ab_info.xlsx")) system("wget https://ars.els-cdn.com/content/image/1-s2.0-S0092867421005833-mmc1.xlsx -O ab_info.xlsx")
# Load marker mapper
gene_marker <- readxl::read_excel("ab_info.xlsx")
# gene_marker <-
gene_marker$`Ensembl Gene Id` <- gene_marker$`Ensembl Gene Id` |> stringr::str_remove_all(".*MUS.* ")
# Select relevant genes-marker pairs
gm <- sapply(colnames(spectral), function(m) gene_marker$`Ensembl Gene Id`[stringr::str_detect(gene_marker$Specificity, m)][1])
# Manually assign gene IDs for remaining markers
gm[is.na(gm)]
gm[c("CCR7", "PD1", "TCRgd", "HLADR", "CD159c", "CXCR3", "CD159a", "IgD", "id", "celltype") ] <- c("ENSG00000126353", "ENSG00000188389", "ENSG00000211701", "ENSG00000204287", "ENSG00000205809", "ENSG00000186810", "ENSG00000134545", NA, "id", "celltype")
gm <- gm[!is.na(gm)]
# Replace column names
colnames(spectral)[which(colnames(spectral) %in% names(gm))] <- gm[colnames(spectral)[which(colnames(spectral) %in% names(gm))]]
saveRDS(spectral, file = "data/01_spectral_park.RDS")
}# Common markers
markers <- intersect(get_markers(rnaseq), get_markers(spectral))
system.time({
# Run scennep
rna <- scennep(
rnaseq,
mc.cores = mc.cores,
distance = "cosine",
markers = markers,
pb = FALSE,
return_S4 = FALSE)
})## Using the top 22 pcs for the SNN
## Building SNN graph with k = 20
## Pseudobulking each cell with its 20 nearest neighbors
## user system elapsed
## 534.778 65.436 45.569
rnaseq_df <- rna |>
# GetAssayData("scennep","data") |>
t() |>
tibble::as_tibble() |>
mutate(batch = "scRNA-seq",
sample = "scRNA-seq",
celltype = rnaseq$celltype
) |>
dplyr::select(dplyr::any_of(c(markers, non_markers)))
# Merge datasets
set.seed(seed)
uncorrected <- spectral |>
dplyr::slice_sample(n = ncol(rna)*2) |>
dplyr::bind_rows(rnaseq_df) |>
dplyr::select(dplyr::any_of(c(markers, non_markers))) |>
dplyr::mutate(id = dplyr::row_number())
# saveRDS(uncorrected, "results/02_rnaseq_sfc_uncorrected.RDS")set.seed(seed)
umap_batch <- uncorrected |>
group_by(sample) |>
slice_sample(n = 5000) |>
plot_umap(
down_sample = F,
metric = "cosine",
title = "scRNA-seq + Spectral Flow",
markers = markers,
col = "batch",
legend_title = "Technology")## Generating UMAP
# saveRDS(umap_batch, "results/02_rnaseq_sfc_uncor.RDS")
umap_batchFor this integration, we are running three iterations of cyCombine to ensure a robust result. The first iteration excludes a clustering step, the second uses a coarse 1x5 grid clustering, and the final iteration uses a somewhat fine-grained 5x5 clustering grid. Cosine distance is used with a FlowSOM clustering to emphasize the angle of variation rather than the amplitude. Note that cosine distance is not available in the default Kohonen cluster method.
system.time({
suppressMessages({ # Hide per-cluster correction info
corrected <- cyCombine(
uncorrected,
markers = markers,
cluster_method = "flowsom",
distf = "cosine",
seed = seed,
xdim = c(1,1,5),
ydim = c(1,5), # Note: the second "5" is reused in the third iteration
mc.cores = 1, pb = F
)
})
})## Found 1 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found 1 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found 2 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found 1 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found 1 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found 1 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found 1 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found 1 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found 1 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found 1 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found 1 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found 1 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found 1 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found 1 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found 1 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found 1 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found 1 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found 1 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found 1 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found 1 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found 1 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found 2 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found 2 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found 1 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found 1 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found 1 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found 10 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found 1 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found 1 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found 1 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## user system elapsed
## 82.179 19.434 8.227
# saveRDS(corrected, "results/02_rnaseq_sfc_corrected.RDS")set.seed(seed)
umap_batch <- corrected |>
group_by(sample) |>
slice_sample(n = 5000) |>
plot_umap(
return_data = T,
down_sample = F,
seed = seed,
metric = "cosine",
markers = markers,
title = "scRNA-seq + Spectral Flow",
legend_title = "Technology",
col = "batch")## Generating UMAP
# ggplot2::ggsave(umap_batch$plot, filename = "figs/02_rnaseq_sfc_batch.png")
# saveRDS(umap_batch$plot, "results/02_rnaseq_sfc_batch.RDS")
umap_batch$plotNote: We use plot_embedding to reuse the UMAP
coordinates from the plot above. We could also have rerun
plot_umap for a simpler call.
celltype_colors <- cyDefine::get_distinct_colors(sort(unique(uncorrected$celltype)))
celltype_colors[["Other"]] <- "black"
xlim <- c(min(umap_batch$data$UMAP1), max(umap_batch$data$UMAP1))
ylim <- c(min(umap_batch$data$UMAP2), max(umap_batch$data$UMAP2))
plot_cell <-
plot_embedding(
dplyr::filter(umap_batch$data, batch == "scRNA-seq"),
colors = celltype_colors,
xlim = xlim,
ylim = ylim,
title = "scRNA-seq",
col = "celltype",
legend_title = "Cell type") +
plot_embedding(
dplyr::filter(umap_batch$data, batch == "Spectral"),
colors = celltype_colors,
xlim = xlim,
ylim = ylim,
title = "Spectral Flow",
col = "celltype",
legend_title = "Cell type") +
patchwork::plot_layout(
guides = "collect",
axes = "collect",
axis_titles = "collect")
# saveRDS(plot_cell, "results/02_rnaseq_sfc_cell.RDS")
# ggplot2::ggsave(plot_cell, filename = "figs/02_rnaseq_sfc_cell.png")
plot_cellcorrected$label <- create_som(corrected, ydim = 5, xdim = 5, seed = seed, markers = markers, cluster_method = "flowsom", distf = "cosine")## Building SOM
## Mapping data to SOM
uncorrected$label <- corrected$label
uncorrected$id <- corrected$id
system.time({
lu <- cyCombine::compute_cilisi(df = uncorrected, markers = get_markers(uncorrected), label_cols = "batch", split_by = "label", return_mean = F)
})## user system elapsed
## 0.486 0.007 0.495
message("CiLISI uncorrected: ", mean(unlist(lu)))## CiLISI uncorrected: 0.0189570523167698
system.time({
l <- cyCombine::compute_cilisi(df = corrected, markers = get_markers(corrected), label_cols = "batch", split_by = "label", return_mean = F)
})## user system elapsed
## 0.615 0.006 0.622
message("CiLISI corrected: ", mean(unlist(l)))## CiLISI corrected: 0.518000456915949
mad <- cyCombine::evaluate_mad(uncorrected, corrected, cell_col = "label", markers = markers)## Computing MAD for corrected data..
## Computing MAD for uncorrected data..
## The MAD score is: 0.05
emd <- cyCombine::evaluate_emd(uncorrected, corrected, cell_col = "label", markers = markers)## Computing EMD for corrected data..
## Binning destributions using binSize 0.1
## Computing marker-wise EMD for each cell cluster
## Computing EMD for uncorrected data..
## Binning destributions using binSize 0.1
## Computing marker-wise EMD for each cell cluster
## Removing EMDs below 2 both before and after correction
## The reduction is: 0.71
## Creating plots..
## Evaluation complete.
# saveRDS(list("emd" = emd, "mad" = mad, "CiLISI_uc" = lu, "CiLISI_c" = l), "results/02_rnaseq_sfc_performance.RDS")Contact