This vignette will demonstrate the integration of spectral flow cytometry (SFC) and CyTOF protein expression measurements using cyCombine.
In this vignette, we will analyze the healthy donor PBMC SFC and CyTOF data, which is also presented in the three-platform vignette.
The SFC data is from Park et al. (2020) available from FlowRepository (ID: FR-FCM-Z2QV). We pre-gated to live single cells in FlowJo version 10 (Tree Star Inc). Singlets and non-debris were identified using forward and side-scatter. Dead cells were excluded using live/dead stains. Data from these gates were then exported in FCS format.
For the CyTOF data, we use the data from a single healthy donor processed at the Human Immune Monitoring Center. The sample was also derived from FlowRepository (ID: FR-FCM-ZYAJ) and pre-gated to live intact singlets in FlowJo version 10 (Tree Star Inc).
We start by loading some packages:
We start by defining some colors to use.
# Define some nice colors for plotting
color_clusters <- c("#DC050C", "#FB8072", "#1965B0", "#7BAFDE", "#882E72",
"#B17BA6", "#FF7F00", "#FDB462", "#E7298A", "#E78AC3",
"#33A02C", "#B2DF8A", "#55A1B1", "#8DD3C7", "#A6761D",
"#E6AB02", "#7570B3", "#BEAED4", "#666666", "#999999",
"#aa8282", "#d4b7b7", "#8600bf", "#ba5ce3", "#808000",
"#aeae5c", "#1e90ff", "#00bfff", "#FAE174", "#56ff0d",
"#ffff00", "#D4E1C8", "#D470C8", "#64C870", "#0AC8D4",
"#64C80C", "#641400", "#000000", "#0C00FA", "#7800FA",
"#FA00FA", "#00FAFA", "#707A00", "#0C0C91", "#B5651D")We are now ready to load the spectral flow data into a tibble.
# Directory with raw .fcs files
data_dir <- "Park_et_al_2020_Live+/"
# Panel and reading data
sfc_panel <- read_csv(paste0(data_dir, "/panel_Park2020.csv"))
sfc_markers <- sfc_panel %>%
dplyr::filter(Type != "none") %>%
pull(Marker) %>%
str_remove_all("[ _-]")
spectral <- prepare_data(data_dir = data_dir,
metadata = paste0(data_dir, "/Spectral samples cohort.xlsx"),
filename_col = "FCS_name",
batch_ids = "Batch",
condition = "Set",
sample_ids = "Patient id",
cofactor = 6000,
derand = FALSE,
markers = sfc_markers,
down_sample = FALSE)
# Subset to a single sample for simplicity
spectral <- spectral %>%
dplyr::filter(sample == "303444")Now, we a single sample consisting of 582,005 cells. We now want to generate some cell labels using the overlapping markers
# Defining the set of overlapping markers - there are 26
overlap_markers <- c('CD3', 'CD4', 'CD8', 'CD25', 'CCR7', 'CD45RA', 'TCRgd', 'CD27', 'CD19', 'CD20', 'CD24', 'IgD', 'CXCR5', 'CD56', 'CD57', 'CD16', 'CD38', 'CD28', 'CCR6', 'CD127', 'HLADR', 'CD14', 'CD11c', 'CD123', 'CXCR3', 'PD1')
# Time to label the cells. Here, we will apply a SOM and ConsensusClusterPlus
# Clustering with kohonen 10x10
set.seed(seed)
som_ <- spectral %>%
dplyr::select(dplyr::all_of(overlap_markers)) %>%
as.matrix() %>%
kohonen::som(grid = kohonen::somgrid(xdim = 10, ydim = 10),
rlen = 10,
dist.fcts = "euclidean")
cell_clustering_som <- som_$unit.classif
codes <- som_$codes[[1]]
# Meta-clustering with ConsensusClusterPlus
mc <- ConsensusClusterPlus::ConsensusClusterPlus(t(codes), maxK = 35, reps = 100,
pItem = 0.9, pFeature = 1, plot = F,
clusterAlg = "hc", innerLinkage = "average",
finalLinkage = "average",
distance = "euclidean", seed = seed)# Get cluster ids for each cell - here we choose to look at 30 meta-clusters but we will merge some in the next step
code_clustering1 <- mc[[30]]$consensusClass
spectral$label <- as.factor(code_clustering1[cell_clustering_som])# Down-sampling for plot (20,000 cells)
set.seed(seed)
spectral_sliced <- spectral %>%
dplyr::slice_sample(n = 20000)
# Let us visualize these clusters on a UMAP
umap <- spectral_sliced %>% plot_dimred(name = 'SFC clusters')
spectral_sliced <- cbind.data.frame(umap$data, spectral_sliced)
ggplot(spectral_sliced, aes(x = UMAP1, y = UMAP2)) +
geom_point(aes(color = label), alpha = 0.3, size = 0.4) +
theme_bw() + theme(plot.title = element_text(hjust = 0.5)) +
scale_color_manual(values = color_clusters) +
guides(colour = guide_legend(override.aes = list(alpha = 1, size = 1)))# And now let us use violin plots for the assignment of labels for each population
p <- list()
for (m in overlap_markers) {
p[[m]] <- ggplot(spectral, aes_string(x = 'label', y = m)) +
geom_violin(aes(fill = label)) +
scale_fill_manual(values = color_clusters) +
theme_bw()
}
p2 <- ggpubr::ggarrange(plotlist = p, common.legend = T)Based on these plots and the UMAP, it is possible to define labels for many of the clusters, although some also appear strange. This includes the very small cluster 2, which could be B-T cell doublets (CD19+CD3+). Cluster 22 was also very small and expressed only CD25, CD127, CD45RA. Cluster 24 also seemed very mixed with bimodal CD14 and CD11c distributions. Finally cluster 29 had only 3 cells, and was left unlabeled.
Regarding cluster 10, this was also tricky, but considering both its UMAP location and the intermediate level of CD4, which is comparable to clusters 1, 2, and 5, we are comfortable with labeling these as myeloid cells. The rest of the labels are assigned below.
spectral_cl_names <- c('1' = 'B cells', '2' = 'Unlabeled', '3' = 'Naive CD8+ T cells', '4' = 'Memory CD8+ T cells', '5' = 'gdT cells',
'6' = 'Memory CD4+ T cells', '7' = 'Naive CD4+ T cells', '8' = 'B cells (IgDlo)', '9' = 'gdT cells (CD57dim)', '10' = 'Memory CD8+ T cells (CD57+)',
'11' = 'NK cells (CD16+CD57+)', '12' = 'NK cells (CD16+CD57-)', '13' = 'NK cells (CD16-CD57-)', '14' = 'NK cells (CD16-CD57+)', '15' = 'Memory CD4+ T cells (CD57+ PD1+)',
'16' = 'NK cells (CD16-CD57-)', '17' = 'pDCs', '18' = 'Tregs', '19' = 'pDCs', '20' = 'B cells (CD19lo CD38++)',
'21' = 'CD3lo gdT cells' , '22' = 'Unlabeled', '23' = 'pDCs (CD38+)', '24' = 'Unlabeled', '25' = 'Memory CD4+ T cells',
'26' = 'mDCs' , '27' = 'Classical monocytes', '28' = 'Non-classical monocytes', '29' = 'Unlabeled', '30' = 'Classical monocytes')
# c('1' = '', '2' = '', '3' = '', '4' = '', '5' = '',
# '6' = '', '7' = '', '8' = '', '9' = '', '10' = '',
# '11' = '', '12' = '', '13' = '', '14' = '', '15' = '',
# '16' = '', '17' = '', '18' = '', '19' = '', '20' = '',
# '21' = '' , '22' = '', '23' = '', '24' = '', '25' = '',
# '26' = '' , '27' = '', '28' = '', '29' = '', '30' = '')
spectral$label <- spectral_cl_names[spectral$label]
# Removing the unlabeled cells
spectral <- spectral %>%
dplyr::filter(label != 'Unlabeled')After removal of the unlabeled cells, we have 573,397 cells remaining (98.5 %) in the SFC dataset and this portion of the data is now ready for batch correction. We will now look at the CyTOF data.
Then it is time to read the CyTOF data. We use a single sample (ctrls-001) from FlowRepository: FR-FCM-ZYAJ. We downloaded the version normalized with MATLAB and pre-gated it to live intact singlets using FlowJo.
### Get HIMC cytof data for one sample
# Panel and reading data
cytof_panel <- read_csv("cytof_panel_HIMC.csv")
cytof_markers <- cytof_panel %>%
dplyr::filter(Type != "none") %>%
pull(Marker) %>%
str_remove_all("[ _-]")
# Read the fcs file and make it a dataframe
cytof <- prepare_data(data_dir = '.',
cofactor = 5,
derand = T,
markers = cytof_markers,
down_sample = FALSE,
batch_ids = c('CyTOF'),
sample_ids = c('Sample1'))Now, we a single sample consisting of 174,601 cells. Similarly to the other datasets, we now need to generate some cell labels - based on the overlapping markers only. Let us look at this:
# Subset to overlap
cytof <- cytof %>%
select(all_of(c(overlap_markers, 'sample', 'batch')))
# Time to label the cells. Here, we will again apply a SOM and ConsensusClusterPlus
# Clustering with kohonen
set.seed(seed)
som_ <- cytof %>%
dplyr::select(dplyr::all_of(overlap_markers)) %>%
as.matrix() %>%
kohonen::som(grid = kohonen::somgrid(xdim = 10, ydim = 10),
rlen = 10,
dist.fcts = "euclidean")
cell_clustering_som <- som_$unit.classif
codes <- som_$codes[[1]]
# Meta-clustering with ConsensusClusterPlus
mc <- ConsensusClusterPlus::ConsensusClusterPlus(t(codes), maxK = 35, reps = 100,
pItem = 0.9, pFeature = 1, plot = F,
clusterAlg = "hc", innerLinkage = "average",
finalLinkage = "average",
distance = "euclidean", seed = seed)# Get cluster ids for each cell - here we also look at 30 meta-clusters but we will merge some in the next step
code_clustering1 <- mc[[30]]$consensusClass
cytof$label <- as.factor(code_clustering1[cell_clustering_som])# Down-sampling for plot (20,000 cells)
set.seed(seed)
cytof_sliced <- cytof %>%
dplyr::slice_sample(n = 20000)
# Let us visualize these clusters on a UMAP
umap <- cytof_sliced %>% plot_dimred(name = 'CyTOF clusters')
cytof_sliced <- cbind.data.frame(umap$data, cytof_sliced)
ggplot(cytof_sliced, aes(x = UMAP1, y = UMAP2)) +
geom_point(aes(color = label), alpha = 0.3, size = 0.4) +
theme_bw() + theme(plot.title = element_text(hjust = 0.5)) +
scale_color_manual(values = color_clusters) +
guides(colour = guide_legend(override.aes = list(alpha = 1, size = 1)))# And now let us use violin plots for the assignment of labels for each population
p <- list()
for (m in overlap_markers) {
p[[m]] <- ggplot(cytof, aes_string(x = 'label', y = m)) +
geom_violin(aes(fill = label)) +
scale_fill_manual(values = color_clusters) +
theme_bw()
}
p2 <- ggpubr::ggarrange(plotlist = p, common.legend = T)