This vignette demonstrates integrating scRNA-seq and CyTOF using cyCombine. In order to assimilate the two data modalities, we use our own implementation of a Single-cell nearest-neighbor pseudobulking (scennep).
library("Seurat")
library("pbmcapply")
library("dplyr")
library("ggplot2")
library("patchwork")
library("cyDefine") # Available at https://github.com/biosurf/cyDefine
library("cyCombine")
library("scennep")
seed <- 942
mc.cores <- 39
rerun <- FALSE
system("mkdir -p data")
system("mkdir -p results")We are using the normal samples from a public COVID paper, Lee et al., 2020
# Individual Single-Cell RNA-seq PBMC Data from Lee et al. (DOI: 10.1126/sciimmunol.abd1554)
if (file.exists("data/01_lee.rds")) {
rnaseq <- readRDS("data/01_lee.rds")
} else {
system("wget https://datasets.cellxgene.cziscience.com/bdee03e4-14ee-4f3e-a225-3d12f032841f.rds -O data/lee.rds")
# Load and transform Lee
lee <- readRDS("data/lee.rds")
if (! "SCT" %in% Seurat::Assays(lee)) {
lee <- lee[, grepl("Normal", lee$sample)]
lee <- FindVariableFeatures(lee)
oopts <- options(future.globals.maxSize = 1.0 * 1e9)
on.exit(options(oopts))
lee <- Seurat::SCTransform(lee)
lee <- FindVariableFeatures(lee)
lee@reductions <- list()
lee[["RNA"]] <- NULL
}
# Align annotation labels
lee$batch <- "lee"
lee$sample <- "lee"
celltype <- lee$Cell.group
lee$celltype <- dplyr::case_when(
celltype == "B cell" ~ "B cells",
stringr::str_starts(celltype, "CD4") ~ "CD4pos",
stringr::str_starts(celltype, "CD8") ~ "CD8pos",
stringr::str_ends(celltype, "Monocyte") ~ "Monocytes",
celltype == "NK cell" ~ "NK cells",
celltype %in% c("cDC", "pDC") ~ paste0(celltype, "s"),
lee$cell_type == "gamma-delta T cell" ~ "TCRgd",
TRUE ~ "Other"
)
saveRDS(lee, "data/01_lee.rds")
system("rm data/lee.rds")
rnaseq <- lee
rm(lee)
} The CyTOF dataset (FR-FCM-ZYAJ) was manually annotated by Mike Leipold. Only sample 001 is used.
if (file.exists("data/01_cytof_himc.RDS") & !rerun) {
cytof <- readRDS("data/01_cytof_himc.RDS")
} else {
load("data/data_transformed.Rdata")
# Rename columns and filter to one healthy sample
himc <- expr_trans |>
dplyr::rename(
"CD45" = "CD45RA",
"HLA-DR" = HLADR,
celltype = population) |>
dplyr::filter(
sample == "001",
) |>
dplyr::mutate(
batch = "CyTOF",
sample = "CyTOF",
celltype = case_when(
celltype == "mDCs" ~ "cDCs",
celltype == "Basophils" ~ "Other",
celltype == "Platelets" ~ "Other",
celltype == "NKT" ~ "Other",
TRUE ~ celltype
)
)
rm(expr_trans)
}if (!file.exists("data/01_cytof_himc.RDS") | rerun) {
# Download marker info
if (!file.exists("ab_info.xlsx")) system("wget https://ars.els-cdn.com/content/image/1-s2.0-S0092867421005833-mmc1.xlsx -O ab_info.xlsx")
# Load marker mapper
gene_marker <- readxl::read_excel("ab_info.xlsx")
# gene_marker <-
gene_marker$`Ensembl Gene Id` <- gene_marker$`Ensembl Gene Id` |> stringr::str_remove_all(".*MUS.* ")
# Select relevant genes-marker pairs
gm <- sapply(colnames(himc), function(m) gene_marker$`Ensembl Gene Id`[stringr::str_detect(gene_marker$Specificity, m)][1])
# Manually assign gene IDs for remaining markers
gm[is.na(gm)]
gm[c("CCR7", "CD85j", "TCRgd", "celltype") ] <- c("ENSG00000126353", "ENSG00000104972", "ENSG00000211701", NA)
gm <- gm[!is.na(gm)]
# Replace column names
colnames(himc)[which(colnames(himc) %in% names(gm))] <- gm[colnames(himc)[which(colnames(himc) %in% names(gm))]]
saveRDS(himc, file = "data/01_cytof_himc.RDS")
cytof <- himc
rm(himc)
}# Common markers
markers <- intersect(rownames(rnaseq), colnames(cytof))
system.time({
# Run scennep
rna <- scennep(
rnaseq,
mc.cores = mc.cores,
distance = "cosine",
markers = markers,
pb = FALSE,
return_S4 = FALSE)
})## Using the top 22 pcs for the SNN
## Building SNN graph with k = 20
## Pseudobulking each cell with its 20 nearest neighbors
## user system elapsed
## 537.404 66.749 45.876
rnaseq_df <- rna |>
t() |>
tibble::as_tibble() |>
mutate(batch = "scRNA-seq",
sample = "scRNA-seq",
celltype = rnaseq$celltype) |>
dplyr::select(dplyr::any_of(c(markers, non_markers)))
# Merge datasets
set.seed(seed)
uncorrected <- cytof |>
dplyr::slice_sample(n = ncol(rna)*2) |>
dplyr::bind_rows(rnaseq_df) |>
dplyr::select(dplyr::any_of(c(markers, non_markers))) |>
dplyr::mutate(id = dplyr::row_number())
# saveRDS(uncorrected, "results/02_rnaseq_cytof_uncorrected.RDS")set.seed(seed)
uncor <- uncorrected |>
group_by(sample) |>
slice_sample(n = 5000) |>
plot_umap(
title = "scRNA-seq + Mass Cytometry",
down_sample = F,
metric = "cosine",
markers = markers,
col = "batch",
legend_title = "Technology",)## Generating UMAP
# saveRDS(uncor, "results/02_rnaseq_cytof_uncor.RDS")
uncorFor this integration, we are running three iterations of cyCombine to ensure a robust result. The first iteration excludes a clustering step, the second uses a coarse 1x5 grid clustering, and the final iteration uses a somewhat fine-grained 5x5 clustering grid. Cosine distance is used with a FlowSOM clustering to emphasize the angle of variation rather than the amplitude. Note that cosine distance is not available in the default Kohonen cluster method.
system.time({
suppressMessages({ # Hide per-cluster correction info
corrected <- cyCombine(
uncorrected,
markers = markers,
cluster_method = "flowsom",
distf = "cosine",
seed = seed,
xdim = c(1,1,5),
ydim = c(1,5), # Note: the second "5" is reused in the third iteration
mc.cores = 1, pb = F
)
})
})## Found 1 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found 1 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found 1 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found 1 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found 1 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found 2 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found 1 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found 1 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found 1 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found 1 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found 1 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found 1 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found 1 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found 1 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found 1 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found 2 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found 1 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found 1 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found 1 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found 1 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found 1 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found 1 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found 1 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found 1 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found 1 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found 1 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found 1 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found 1 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found 1 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found 1 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## user system elapsed
## 88.500 12.576 6.854
# saveRDS(corrected, "results/02_rnaseq_cytof_corrected.RDS")set.seed(seed)
umap_batch <- corrected |>
group_by(sample) |>
slice_sample(n = 5000) |>
plot_umap(
return_data = T,
down_sample = F,
seed = seed,
metric = "cosine",
markers = markers,
title = "scRNA-seq + Mass Cytometry",
legend_title = "Technology",
col = "batch")## Generating UMAP
# ggplot2::ggsave(umap_batch$plot, filename = "figs/02_rnaseq_cytof_batch.png")
# saveRDS(umap_batch$plot, "results/02_rnaseq_cytof_batch.RDS")
umap_batch$plotNote: We use plot_embedding to reuse the UMAP
coordinates from the plot above. We could also have rerun
plot_umap for a simpler call.
celltype_colors <- cyDefine::get_distinct_colors(sort(unique(uncorrected$celltype)))
celltype_colors[["Other"]] <- "black"
xlim <- c(min(umap_batch$data$UMAP1), max(umap_batch$data$UMAP1))
ylim <- c(min(umap_batch$data$UMAP2), max(umap_batch$data$UMAP2))
plot_cell <-
plot_embedding(
dplyr::filter(umap_batch$data, batch == "scRNA-seq"),
colors = celltype_colors,
xlim = xlim,
ylim = ylim,
title = "scRNA-seq",
col = "celltype",
legend_title = "Cell type") +
plot_embedding(
dplyr::filter(umap_batch$data, batch == "CyTOF"),
colors = celltype_colors,
xlim = xlim,
ylim = ylim,
title = "Mass Cytometry",
col = "celltype",
legend_title = "Cell type") +
patchwork::plot_layout(
guides = "collect",
axes = "collect",
axis_titles = "collect")
# saveRDS(plot_cell, "results/02_rnaseq_cytof_cell.RDS")
# ggplot2::ggsave(plot_cell, filename = "figs/02_rnaseq_cytof_cell.png")
plot_cellcorrected$label <- create_som(corrected, ydim = 5, xdim = 5, seed = seed, markers = markers, cluster_method = "flowsom", distf = "cosine")## Building SOM
## Mapping data to SOM
uncorrected$label <- corrected$label
system.time({
lu <- cyCombine::compute_cilisi(df = uncorrected, markers = get_markers(uncorrected), label_cols = "batch", split_by = "label", return_mean = F)
})## Warning in compute_lisi(df = df_label, markers = markers, label_cols =
## label_cols): Cannot calculate more neighbors than there are points. Setting
## perplexity to 2
## user system elapsed
## 0.435 0.008 0.445
message("CiLISI uncorrected: ", mean(unlist(lu)))## CiLISI uncorrected: 0.00783590938374358
system.time({
l <- cyCombine::compute_cilisi(df = corrected, markers = get_markers(corrected), label_cols = "batch", split_by = "label", return_mean = F)
})## Warning in compute_lisi(df = df_label, markers = markers, label_cols =
## label_cols): Cannot calculate more neighbors than there are points. Setting
## perplexity to 2
## user system elapsed
## 0.563 0.005 0.570
message("CiLISI corrected: ", mean(unlist(l)))## CiLISI corrected: 0.485482059171325
mad <- cyCombine::evaluate_mad(uncorrected, corrected, cell_col = "label", markers = markers)## Computing MAD for corrected data..
## Computing MAD for uncorrected data..
## The MAD score is: 0.08
emd <- cyCombine::evaluate_emd(uncorrected, corrected, cell_col = "label", markers = markers)## Computing EMD for corrected data..
## Binning destributions using binSize 0.1
## Computing marker-wise EMD for each cell cluster
## Computing EMD for uncorrected data..
## Binning destributions using binSize 0.1
## Computing marker-wise EMD for each cell cluster
## Removing EMDs below 2 both before and after correction
## The reduction is: 0.8
## Creating plots..
## Evaluation complete.
# saveRDS(list("emd" = emd, "mad" = mad, "CiLISI_uc" = lu, "CiLISI_c" = l), "results/02_rnaseq_cytof_performance.RDS")Contact