1 Introduction

Chimeric Antigen Receptor T-cell (CAR-T) therapy represents a revolutionary approach to cancer treatment, where a patient’s T-cells are genetically modified to recognize and attack cancer cells. The success of CAR-T therapy critically depends on identifying suitable target antigens—proteins expressed on cancer cells that can serve as targets for the engineered T-cells.

This vignette exemplifies a comprehensive computational workflow for CAR-T target assessment at the isoform level. We demonstrate this workflow using ERBB2 (HER2/neu) as a case study, systematically evaluating isoforms based on subcellular localization, membrane topology, epitope retention, and expression patterns. ERBB” is a receptor tyrosine kinase overexpressed in approximately 25-30% of breast cancers Slamon, D.J. et al. and an established target for monoclonal antibody therapy (trastuzumab) Yoon, J. et al..

1.1 Required Packages

library(Biostrings)
library(bio3d)
library(biomaRt)
library(ggnewscale)
library(patchwork)
library(tidyverse) 
library(httr)
library(jsonlite)


options(timeout = 300)
theme_set(theme_light())

2 Phase 1: Isoform Discovery

2.1 Step 1: Retrieve Isoform Sequences

Download protein sequences for all HER2 isoforms from Ensembl (GRCh38, release 115). We retrieve sequences for all protein-coding transcripts to ensure comprehensive coverage.

# Connect to Ensembl
ensembl <- useEnsembl(biomart = "genes", dataset = "hsapiens_gene_ensembl")

# Get protein-coding ERBB2 transcripts
get_gene_transcripts <- function(gene_symbol, mart) {
  getBM(
    attributes = c("ensembl_gene_id", "ensembl_transcript_id",
                   "transcript_biotype", "transcript_length"),
    filters = "hgnc_symbol",
    values = gene_symbol,
    mart = mart
  ) %>%
    filter(transcript_biotype == "protein_coding") %>%
    arrange(desc(transcript_length))
}

her2_transcripts <- get_gene_transcripts("ERBB2", ensembl)

# Retrieve protein sequences
get_protein_sequences <- function(transcript_ids, mart) {
  seqs <- getBM(
    attributes = c("ensembl_transcript_id", "peptide"),
    filters = "ensembl_transcript_id",
    values = transcript_ids,
    mart = mart
  )

  sequences <- AAStringSet(seqs$peptide)
  names(sequences) <- seqs$ensembl_transcript_id
  sequences[width(sequences) > 0]  # Remove empty sequences
}

her2_sequences_all <- get_protein_sequences(her2_transcripts$ensembl_transcript_id, ensembl)
# saveRDS(her2_sequences_all, "cartcontent/results/HER2_analysis/her2_sequences_all.rds")
print(her2_sequences_all)

## AAStringSet object of length 28:
##      width seq                                              names               
##  [1]  1056 MELAALCRWGLLLALLPPGAAST...PAPGAGGMVHHRHRSSSTRNM* ENST00000584450
##  [2]   604 MKLRLPASPETHLDMLRHLYQGC...IWKFPDEEGACQPCPINCTHS* ENST00000578199
##  [3]   177 XPCHPECQPQNGSVTCFGPEADQ...KETELRKVKGIWIPDGENVKIP ENST00000582818
##  [4]   980 MELAALCRWGLLLALLPPGAAST...STFKGTPTAENPEYLGLDVPV* ENST00000445658
##  [5]   102 MKLRLPASPETHLDMLRHLYQGC...NYALAVLDNGDPLNNTTPVTGA ENST00000578709
##  ...   ... ...
## [24]  1140 MELAALCRWGLLLALLPPGAAST...STFKGTPTAENPEYLGLDVPV* ENST00000959775
## [25]  1267 MELAALCRWGLLLALLPPGAAST...STFKGTPTAENPEYLGLDVPV* ENST00000959774
## [26]  1016 MELAALCRWGLLLALLPPGAAST...SMLGASKPSCIPFTVKVQSHS* ENST00001116003
## [27]   634 MELAALCRWGLLLALLPPGAAST...IWKFPDEEGACQPCPINCTHS* ENST00001005734
## [28]  1256 MELAALCRWGLLLALLPPGAAST...STFKGTPTAENPEYLGLDVPV* ENST00001141787

2.2 Step 2: Download Expression Data

We use pre-processed TCGA/GTEx expression data from Xena Browser. This dataset provides transcript-level expression across cancer and normal tissues. Note: Ensembl’s September 2025 update added ~120,000 new transcripts not yet reflected in available expression databases, so some isoforms are absent from the expression data. Link to dataset: https://xenabrowser.net/datapages/?dataset=TcgaTargetGtex_rsem_isoform_tpm&host=https%3A%2F%2Ftoil.xenahubs.net&removeHub=https%3A%2F%2Fxena.treehouse.gi.ucsc.edu%3A443

# run on HPC
# File paths
expr_file <- "data/TcgaTargetGtex_rsem_isoform_tpm.gz"
filtered_file <- "results/her2_isoforms_filtered.txt"
output_rds <- "results/her2_expr_filtered.rds"

# HER2 transcripts to extract
her2_transcripts <- names(her2_sequences_all)

# Filter expression file line-by-line (efficient for large files)
con <- gzfile(expr_file, "r")
header <- readLines(con, n = 1)
writeLines(header, filtered_file)

line_count <- 0
matched_count <- 0

while (TRUE) {
  lines <- readLines(con, n = 1000)
  if (length(lines) == 0) break
  
  line_count <- line_count + length(lines)
  
  # Extract HER2 transcript lines
  pattern <- paste0("^(", paste(her2_transcripts, collapse = "|"), ")(\\.|\\t)")
  matching_lines <- grep(pattern, lines, value = TRUE)
  
  if (length(matching_lines) > 0) {
    write(matching_lines, file = filtered_file, append = TRUE)
    matched_count <- matched_count + length(matching_lines)
  }
}

close(con)

# Read filtered data and save as RDS
her2_expr <- read_tsv(filtered_file, show_col_types = FALSE)
saveRDS(her2_expr, output_rds)

2.3 Step 3: Identify Available Transcripts

Identify which HER2 isoforms have expression data and filter sequences accordingly.

# Load pre-filtered expression data

her2_expr <- readRDS("results/HER2_analysis/her2_expr_filtered.rds")

# Extract transcript IDs (remove version numbers)
her2_available_transcripts <- her2_expr %>% 
  mutate(transcript_clean = str_remove(sample, "\\..*")) %>%
  pull(transcript_clean) %>%
  unique()

# Filter sequences to those with expression data
sequence_names_clean <- str_remove(names(her2_sequences_all), "\\..*")
keep_indices <- sequence_names_clean %in% her2_available_transcripts
her2_sequences_filtered <- her2_sequences_all[keep_indices]

cat("Sequences with expression data:", length(her2_sequences_filtered), "\n")

## Sequences with expression data: 10

2.4 Step 4: Export for External Tools

Save sequences in FASTA format for external prediction tools (DeepLoc2 for subcellular localization, DeepTMHMM for membrane topology).

fasta_file <- "data/her2_isoforms_filtered.fasta"
writeXStringSet(her2_sequences_filtered, filepath = fasta_file)

cat("✓ FASTA saved:", fasta_file, "\n\n")
cat("Upload to:\n")
cat("  • DeepLoc2: https://services.healthtech.dtu.dk/services/DeepLoc-2.1/\n")
cat("  • DeepTMHMM: https://dtu.biolib.com/DeepTMHMM\n")

3 Phase 2: Structural Characterization

3.1 Step 5: Align Isoforms with MAFFT

Align sequences using MAFFT with the --localpair algorithm, which is optimized for sequences with large insertions/deletions from alternative splicing. This ensures proper alignment of conserved functional domains.

temp_input <- tempfile(fileext = ".fasta")
temp_output <- tempfile(fileext = ".fasta")

writeXStringSet(her2_sequences_filtered, temp_input)

# MAFFT alignment optimized for isoforms
system2("mafft", 
        args = c("--localpair", "--maxiterate", "1000", "--quiet", temp_input),
        stdout = temp_output)

her2_aligned <- readAAStringSet(temp_output)
unlink(c(temp_input, temp_output))

cat("Alignment width:", unique(width(her2_aligned)), "positions\n")
# writeXStringSet(her2_aligned, "../../results/HER2_analysis/her2_aligned_mafft.fasta")

3.2 Step 6: Import and Process Predictions

Import external prediction results and perform integrated analysis: classify isoforms by subcellular localization, map membrane topology to aligned sequences, and identify therapeutic epitope retention across all isoforms.

her2_aligned <- readAAStringSet("results/HER2_analysis/her2_aligned_mafft.fasta")
# ===== Import DeepLoc2 Results =====
deeploc_results <- read_csv("results/HER2_analysis/HER2_deeploc21_filtered.csv", 
                             show_col_types = FALSE)
deeploc_filtered <- deeploc_results %>%
  mutate(transcript_clean = str_remove(Protein_ID, "\\..*")) %>%
  filter(transcript_clean %in% her2_available_transcripts)

# ===== Import DeepTMHMM Results =====
parse_deeptmhmm <- function(file_path) {
  lines <- readLines(file_path)
  results <- list()
  i <- 1
  
  while (i <= length(lines)) {
    if (startsWith(lines[i], ">")) {
      transcript_id <- str_extract(lines[i], "(?<=>)[^ ]+")
      sequence <- lines[i + 1]
      topology <- lines[i + 2]
      
      results[[transcript_id]] <- list(
        transcript_id = transcript_id,
        sequence = sequence,
        topology = topology
      )
      i <- i + 3
    } else {
      i <- i + 1
    }
  }
  return(results)
}

deeptmhmm_results <- parse_deeptmhmm(
  "results/HER2_analysis/HER2_deeptmhmm_predicted_topologies_filtered.3line"
)

# ===== Classify Isoforms by Localization =====
transcript_classification <- deeploc_filtered %>%
  dplyr::select(Protein_ID, `Cell membrane`, Extracellular) %>%
  mutate(
    transcript_clean = str_remove(Protein_ID, "\\..*"),
    classification = case_when(
      `Cell membrane` > Extracellular ~ "Membrane-bound",
      Extracellular > `Cell membrane` ~ "Secreted",
      TRUE ~ "Ambiguous"
    )
  )

classification_colors <- c(
  "Membrane-bound" = "#E74C3C",
  "Secreted" = "#3498DB",
  "Ambiguous" = "#95A5A6"
)

# ===== Map Topology to Alignment =====
map_topology_to_alignment <- function(deeptmhmm_results, aligned_sequences) {
  map_dfr(names(deeptmhmm_results), function(tid) {
    topology <- deeptmhmm_results[[tid]]$topology
    aligned_seq <- as.character(aligned_sequences[tid])
    
    topology_chars <- strsplit(topology, "")[[1]]
    aligned_chars <- strsplit(aligned_seq, "")[[1]]
    aligned_topology <- character(length(aligned_chars))
    
    # Transfer topology to aligned positions
    original_pos <- 1
    for (aligned_pos in 1:length(aligned_chars)) {
      if (aligned_chars[aligned_pos] == "-") {
        aligned_topology[aligned_pos] <- "-"
      } else {
        if (original_pos <= length(topology_chars)) {
          aligned_topology[aligned_pos] <- topology_chars[original_pos]
        }
        original_pos <- original_pos + 1
      }
    }
    
    data.frame(
      position = 1:length(aligned_chars),
      aa = aligned_chars,
      topology = aligned_topology,
      transcript_id = tid
    ) %>%
      mutate(
        topology_type = case_when(
          topology == "S" ~ "Signal peptide",
          topology == "O" ~ "Extracellular",
          topology == "M" ~ "Transmembrane",
          topology == "I" ~ "Intracellular",
          topology == "-" ~ "Alignment gap",
          TRUE ~ "Unknown"
        )
      )
  })
}

aligned_segments <- map_topology_to_alignment(deeptmhmm_results, her2_aligned)

# ===== Map Epitopes Across Isoforms =====
# Therapeutic antibody epitope positions in canonical HER2
epitope_positions <- list(
  P = list(c(257, 267), c(308, 318)),  # Pertuzumab
  T = list(c(580, 582), c(593, 595), c(610, 625))  # Trastuzumab
)

# Load canonical HER2 sequence
her2_seq <- her2_sequences_all[["ENST00000269571"]]
her2_aa <- strsplit(as.character(her2_seq), "")[[1]]

# Pertuzumab epitope sequences
cat("Pertuzumab epi1 (257-267):", paste(her2_aa[257:267], collapse=""), "\n")

## Pertuzumab epi1 (257-267): HFNHSGICELH

cat("Pertuzumab epi2 (308-318):", paste(her2_aa[308:318], collapse=""), "\n")

## Pertuzumab epi2 (308-318): VGSCTLVCPLH

# Trastuzumab epitope sequences
cat("Trastuzumab epi1 (580-582):", paste(her2_aa[580:582], collapse=""), "\n")

## Trastuzumab epi1 (580-582): EAD

cat("Trastuzumab epi2 (593-595):", paste(her2_aa[593:595], collapse=""), "\n")

## Trastuzumab epi2 (593-595): PPF

cat("Trastuzumab epi3 (610-625):", paste(her2_aa[610:625], collapse=""), "\n")

## Trastuzumab epi3 (610-625): YMPIWKFPDEEGACQP

map_epitopes_from_alignment <- function(aligned_seqs, canonical_name, epitope_pos) {
  canonical_seq <- as.character(aligned_seqs[[canonical_name]])
  epitope_map_list <- list()
  
  for(seq_name in names(aligned_seqs)) {
    isoform_seq <- as.character(aligned_seqs[[seq_name]])
    canonical_ungapped_pos <- 0
    epitope_positions_aligned <- list()
    
    for(i in 1:nchar(canonical_seq)) {
      canon_char <- substr(canonical_seq, i, i)
      iso_char <- substr(isoform_seq, i, i)
      
      if(canon_char != "-") canonical_ungapped_pos <- canonical_ungapped_pos + 1
      
      # Check if position is in epitope and isoform has residue
      for(epitope_name in names(epitope_pos)) {
        for(region in epitope_pos[[epitope_name]]) {
          if(canonical_ungapped_pos >= region[1] && 
             canonical_ungapped_pos <= region[2] &&
             iso_char != "-") {
            epitope_positions_aligned[[length(epitope_positions_aligned) + 1]] <- data.frame(
              epitope = epitope_name,
              position = i,
              canonical_pos = canonical_ungapped_pos
            )
          }
        }
      }
    }
    
    if(length(epitope_positions_aligned) > 0) {
      epitope_map_list[[seq_name]] <- bind_rows(epitope_positions_aligned)
    }
  }
  
  bind_rows(epitope_map_list, .id = "transcript_id")
}

epitope_mapping <- map_epitopes_from_alignment(
  aligned_seqs = her2_aligned,
  canonical_name = "ENST00000269571",
  epitope_pos = epitope_positions
)

# Summarize epitope retention
epitope_summary <- epitope_mapping %>%
  group_by(transcript_id, epitope) %>%
  summarise(n_positions = n(), .groups = "drop") %>%
  pivot_wider(names_from = epitope, values_from = n_positions, values_fill = 0)

print(epitope_summary)

## # A tibble: 9 × 3
##   transcript_id       P     T
##   <chr>           <int> <int>
## 1 ENST00000269571    22    22
## 2 ENST00000445658    11    22
## 3 ENST00000578199    22    22
## 4 ENST00000578502    22    22
## 5 ENST00000580074     0    10
## 6 ENST00000582818     0    22
## 7 ENST00000584014    22    22
## 8 ENST00000584450    22    22
## 9 ENST00000584601    22    22

3.3 Step 7: Visualize Integrated Results

Create comprehensive visualization combining subcellular localization (DeepLoc2), membrane topology (DeepTMHMM), and epitope locations in a single integrated figure.

# Define isoform display order
transcript_order <- c(
  "ENST00000580074",
  "ENST00000578709",
  "ENST00000584601",
  "ENST00000584450",
  "ENST00000584014",
  "ENST00000582818",
  "ENST00000578502",  # Longest
  "ENST00000578199",  # Secreted
  "ENST00000445658",  # Truncated
  "ENST00000269571"   # Canonical
)

# ===== DeepLoc2 Heatmap =====
localization_cols <- c("Cell membrane", "Cytoplasm", "Endoplasmic reticulum",
                       "Extracellular", "Golgi apparatus", "Lysosome/Vacuole",
                       "Mitochondrion", "Nucleus", "Peroxisome", "Plastid")

plot_data_combined <- deeploc_filtered %>%
  mutate(Protein_ID = factor(Protein_ID, levels = transcript_order)) %>%
  dplyr::select(Protein_ID, all_of(localization_cols)) %>%
  pivot_longer(cols = -Protein_ID, names_to = "localization", values_to = "probability")

deeploc_heatmap <- ggplot(plot_data_combined, 
                          aes(x = localization, y = Protein_ID, fill = probability)) +
  geom_tile(color = "white", linewidth = 0.5) +
  scale_fill_gradientn(
    colors = c("#4A90B8", "#5E9DC3", "#FFFACD", "#F26D5E", "#D73027"),
    values = c(0, 0.2, 0.5, 0.8, 1),
    limits = c(0, 1)
  ) +
  labs(title = "HER2 Isoforms", x = NULL, y = NULL, fill = "Probability") +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1, size = 16),
    axis.text.y = element_text(size = 16, family = "mono"),
    plot.title = element_text(size = 18, face = "bold"),
    legend.text = element_text(size = 10),
    legend.title = element_text(size = 16),
    legend.position = "top",
    panel.grid = element_blank()
  )

# ===== Topology Plot with Epitopes =====
plot_topology_with_epitopes <- function(aligned_segments, epitope_data, transcript_order) {
  topology_colors <- c(
    "Signal peptide" = "#C41C24",
    "Extracellular" = "#FFB20F",
    "Transmembrane" = "#18848C",
    "Intracellular" = "#96BDC6",
    "Alignment gap" = "#EDE7E3"
  )
  
  epitope_colors <- c("P" = "#55a630", "T" = "#ff5d8f")
  
  aligned_segments <- aligned_segments %>%
    mutate(transcript_id = factor(transcript_id, levels = transcript_order))
  
  # Base topology plot
  p <- ggplot(aligned_segments, aes(x = position, y = transcript_id, fill = topology_type)) +
    geom_tile(height = 0.8, width = 1) +
    scale_fill_manual(values = topology_colors, name = "Topology") +
    scale_x_continuous(expand = c(0, 0), breaks = seq(0, max(aligned_segments$position), 400)) +
    labs(x = "Amino acid position (aligned)", y = NULL) +
    theme_minimal() +
    theme(
      axis.text.y = element_blank(),
      axis.text.x = element_text(size = 16),
      axis.title.x = element_text(size = 18),
      legend.position = "right",
      legend.text = element_text(size = 16),
      legend.title = element_text(size = 18),
      panel.grid = element_blank()
    )
  
  # Add epitope overlay
  if(!is.null(epitope_data) && nrow(epitope_data) > 0) {
    epitope_data <- epitope_data %>%
      mutate(transcript_id = factor(transcript_id, levels = transcript_order))
    
    p <- p +
      new_scale_fill() +
      geom_tile(data = epitope_data,
                aes(x = position, y = transcript_id, fill = epitope),
                height = 0.8, width = 1, alpha = 0.9, inherit.aes = FALSE) +
      scale_fill_manual(
        values = epitope_colors,
        name = "Epitope",
        labels = c("P" = "Pertuzumab", "T" = "Trastuzumab")
      )
  }
  
  return(p)
}

topology_plot <- plot_topology_with_epitopes(aligned_segments, epitope_mapping, transcript_order)

# ===== Combined Plot =====
combined_plot <- deeploc_heatmap + topology_plot + 
  plot_layout(widths = c(1, 1.5))

print(combined_plot)

# ggsave("results/HER2_analysis/figures_paper2026/her2_localization_topology_epitopes.pdf",
#        combined_plot, width = 20, height = 8)

4 Phase 3: Expression Analysis

4.1 Step 8: Prepare Expression Data

Filter to targetable isoforms (membrane-bound with retained epitopes) and merge with tissue annotations from TCGA/GTEx.

# Targetable isoforms based on topology and epitope analysis
targetable_isoforms <- c("ENST00000269571", "ENST00000584601", "ENST00000584450", 
                         "ENST00000584014", "ENST00000578502", "ENST00000578199", 
                         "ENST00000445658", "ENST00000582818")

# Load tissue annotations
phenotype <- read_tsv("data/HER2_TCGA_GTEX_category_2026.txt", show_col_types = FALSE) %>%
  separate(TCGA_GTEX_main_category, 
           into = c("dataset", "tissue_or_disease"), 
           sep = " ", extra = "merge") %>%
  mutate(tissue_or_disease = tolower(tissue_or_disease))

# Transform and filter expression data
her2_expr_long <- her2_expr %>%
  pivot_longer(cols = -sample, names_to = "sample_id", values_to = "log2_tpm001") %>%
  dplyr::rename(transcript_id = sample) %>%
  mutate(
    transcript_clean = str_remove(transcript_id, "\\..*"),
    # Convert log2(TPM + 0.001) to log2(TPM + 1)
    tpm = 2^log2_tpm001 - 0.001,
    tpm = pmax(tpm, 0),
    log2_tpm_plus1 = log2(tpm + 1)
  ) %>%
  filter(transcript_clean %in% targetable_isoforms)

# Merge with phenotype and classification
her2_expr_final <- her2_expr_long %>%
  left_join(phenotype, by = c("sample_id" = "sample")) %>%
  left_join(transcript_classification, by = "transcript_clean") %>%
  filter(!is.na(dataset))

4.2 Step 9: Visualize Expression Patterns

Compare expression across normal tissues (GTEx) and cancer types (TCGA) to assess tumor selectivity and on-target/off-tumor toxicity risk.

# Categorize normal tissues by clinical importance
tissue_categories <- list(
  "Critical" = c("brain", "blood", "blood vessel", "heart", "liver", "lung", "nerve"),
  "Important" = c("pancreas", "kidney", "stomach", "colon", "small intestine", "bladder"),
  "Others" = c("spleen", "uterus", "ovary", "testis", "prostate", "breast", "skin")
)

tissue_category_df <- data.frame(
  tissue_or_disease = unlist(tissue_categories),
  category = rep(names(tissue_categories), sapply(tissue_categories, length))
) %>%
  mutate(category = factor(category, levels = c("Critical", "Important", "Others")))

# ===== GTEx Data =====
gtex_data <- her2_expr_final %>%
  filter(dataset == "GTEX") %>%
  left_join(tissue_category_df, by = "tissue_or_disease") %>%
  filter(!is.na(category)) %>%
  arrange(category, transcript_clean) %>%
  group_by(category) %>%
  mutate(position_within_cat = as.numeric(factor(transcript_clean))) %>%
  ungroup() %>%
  mutate(
    gap_offset = case_when(
      category == "Critical" ~ 0,
      category == "Important" ~ 9,
      category == "Others" ~ 18
    ),
    x_position = gap_offset + position_within_cat
  )
gtex_data <- gtex_data %>%
  mutate(
    group_label = paste0(category, "_", transcript_clean),
  )

# ===== TCGA Data =====
tcga_cancers <- c("breast invasive carcinoma", "stomach adenocarcinoma", "glioblastoma multiforme")

tcga_data <- her2_expr_final %>%
  filter(dataset == "TCGA", tissue_or_disease %in% tcga_cancers) %>%
  arrange(tissue_or_disease, transcript_clean) %>%
  group_by(tissue_or_disease) %>%
  mutate(position_within_disease = as.numeric(factor(transcript_clean))) %>%
  ungroup() %>%
  mutate(
    gap_offset = case_when(
      tissue_or_disease == "breast invasive carcinoma" ~ 0,
      tissue_or_disease == "stomach adenocarcinoma" ~ 9,
      tissue_or_disease == "glioblastoma multiforme" ~ 18
    ),
    x_position = gap_offset + position_within_disease + 27 # Add offset to place after GTEx (26 + 1 gap)
  )

tcga_data <- tcga_data %>%
  mutate(
    group_label = paste0(tissue_or_disease, "_", transcript_clean),
  )

# ===== Combined Visualization =====
# Combine datasets
combined_data <- bind_rows(
  gtex_data %>% 
    dplyr::select(transcript_clean, log2_tpm_plus1, classification, 
           dataset, group_label, x_position),
  tcga_data %>% 
    dplyr::select(transcript_clean, log2_tpm_plus1, classification, 
           dataset, group_label, x_position)
)

expression_plot <- ggplot(combined_data, 
                          aes(x = x_position, 
                              y = log2_tpm_plus1, 
                              fill = classification, 
                              group = interaction(dataset, group_label))) +
  geom_boxplot(outlier.shape = NA) +
  scale_fill_manual(values = classification_colors, name = "Localization") +
  scale_x_continuous(
    breaks = combined_data %>% 
      distinct(x_position, group_label) %>% 
      pull(x_position),
    labels = combined_data %>% 
      distinct(x_position, group_label) %>% 
      pull(group_label) 
  ) +
  coord_cartesian(ylim = c(0, 7.5)) +
  geom_vline(xintercept = 26.5, linetype = "dashed", color = "gray50", linewidth = 0.5) +
  # Add dataset labels at the top
  annotate("text", x = 13, y = max(combined_data$log2_tpm_plus1) * 1.05, 
           label = "GTEx (Normal Tissues)", fontface = "bold", size = 4) +
  annotate("text", x = 40, y = max(combined_data$log2_tpm_plus1) * 1.05, 
           label = "TCGA (Cancer Types)", fontface = "bold", size = 4) +
  labs(
    title = "HER2 Isoform Expression Across Normal Tissues and Cancer Types",
    subtitle = "GTEx tissues grouped by essentiality, followed by TCGA cancer types",
    x = "Tissue/Cancer Type_Isoform",
    y = "Expression (log2(TPM + 1))"
  ) +
  theme_minimal() +
  theme(
    plot.margin = margin(t = 20, r = 10, b = 10, l = 40),
    axis.text.x = element_text(angle = 45, hjust = 1, size = 7),
    axis.text.y = element_text(size = 10),
    axis.title = element_text(size = 12),
    plot.title = element_text(size = 14, face = "bold"),
    plot.subtitle = element_text(size = 11),
    legend.position = "right",
    legend.text = element_text(size = 10)
  )

print(expression_plot)

# ggsave("results/HER2_analysis/figures_paper2026/her2_expression_gtex_tcga.pdf",
#        expression_plot, width = 16, height = 8)

4.3 Step 10: Off-target binding assessment

4.3.1 Sequence-level analysis (BLOSUM62)

ensembl_family <- useEnsembl(biomart = "genes", dataset = "hsapiens_gene_ensembl", 
                              mirror = "uswest")

genes <- c("ERBB2", "ERBB3", "ERBB4", "EGFR")

canonical_ids <- getBM(
  attributes = c("hgnc_symbol", "ensembl_transcript_id", "transcript_is_canonical"),
  filters = "hgnc_symbol",
  values = genes,
  mart = ensembl_family
) %>%
  filter(transcript_is_canonical == 1)

seqs <- getBM(
  attributes = c("ensembl_transcript_id", "peptide"),
  filters = "ensembl_transcript_id",
  values = canonical_ids$ensembl_transcript_id,
  mart = ensembl_family
) %>%
  filter(peptide != "") %>%
  left_join(canonical_ids %>% select(hgnc_symbol, ensembl_transcript_id),
            by = "ensembl_transcript_id")

family_sequences <- AAStringSet(seqs$peptide)
names(family_sequences) <- paste0(seqs$hgnc_symbol, "_", seqs$ensembl_transcript_id)

temp_input <- tempfile(fileext = ".fasta")
temp_output <- tempfile(fileext = ".fasta")
writeXStringSet(family_sequences, temp_input)

system2("mafft",
        args = c("--localpair", "--maxiterate", "1000", "--quiet", temp_input),
        stdout = temp_output)

family_aligned <- readAAStringSet(temp_output)
unlink(c(temp_input, temp_output))

# Epitope positions in canonical HER2
epitope_positions <- list(
  P = list(c(257, 267), c(308, 318)),               # Pertuzumab
  T = list(c(580, 582), c(593, 595), c(610, 625))   # Trastuzumab
)

her2_name <- names(family_aligned)[str_detect(names(family_aligned), "ERBB2")]

epitope_mapping <- map_epitopes_from_alignment(
  aligned_seqs = family_aligned,
  canonical_name = her2_name,
  epitope_pos = epitope_positions
)

her2_seq <- strsplit(as.character(family_aligned[[her2_name]]), "")[[1]]

substitution_table <- epitope_mapping %>%
  mutate(
    aa = map2_chr(transcript_id, position, function(tid, pos) {
      strsplit(as.character(family_aligned[[tid]]), "")[[1]][pos]
    }),
    her2_aa = her2_seq[position]
  ) %>%
  filter(transcript_id != her2_name) %>%
  select(transcript_id, epitope, canonical_pos, her2_aa, aa) %>%
  mutate(changed = aa != her2_aa)

data(BLOSUM62)

substitution_table_scored <- substitution_table %>%
  filter(aa != "-") %>%
  rowwise() %>%
  mutate(blosum62 = BLOSUM62[her2_aa, aa]) %>%
  ungroup()

# Gaps (missing residues) are scored separately, not by BLOSUM62
gap_table <- epitope_mapping %>%
  filter(transcript_id == her2_name) %>%
  select(epitope, canonical_pos, position) %>%
  crossing(transcript_id = setdiff(names(family_aligned), her2_name)) %>%
  mutate(
    aa = map2_chr(transcript_id, position, function(tid, pos) {
      strsplit(as.character(family_aligned[[tid]]), "")[[1]][pos]
    })
  ) %>%
  filter(aa == "-") %>%
  mutate(her2_aa = her2_seq[position], changed = TRUE, blosum62 = NA) %>%
  select(transcript_id, epitope, canonical_pos, her2_aa, aa, changed, blosum62)

full_substitution_table <- bind_rows(substitution_table_scored, gap_table) %>%
  arrange(transcript_id, epitope, canonical_pos)

full_summary <- full_substitution_table %>%
  filter(transcript_id != her2_name) %>%
  mutate(protein = case_when(
    str_detect(transcript_id, "ERBB2") ~ "HER2",
    str_detect(transcript_id, "ERBB3") ~ "HER3",
    str_detect(transcript_id, "ERBB4") ~ "HER4",
    str_detect(transcript_id, "EGFR")  ~ "EGFR"
  )) %>%
  group_by(protein, epitope) %>%
  summarise(
    substitutions = sum(changed & aa != "-"),
    n_negative_blosum = sum(blosum62 < 0, na.rm = TRUE),
    gaps = sum(aa == "-"),
    mean_blosum = round(mean(blosum62[changed & aa != "-"], na.rm = TRUE), 2),
    .groups = "drop"
  )
print(full_summary)

## # A tibble: 6 × 6
##   protein epitope substitutions n_negative_blosum  gaps mean_blosum
##   <chr>   <chr>           <int>             <int> <int>       <dbl>
## 1 EGFR    P                  16                10     0       -1.06
## 2 EGFR    T                  15                10     1       -0.6 
## 3 HER3    P                  12                 9     0       -1.42
## 4 HER3    T                  12                 6     2       -0.17
## 5 HER4    P                  13                 8     0       -0.85
## 6 HER4    T                  13                 8     2       -0.31

4.3.2 Structural analysis (backbone RMSD)

# RMSD calculated externally in PyMOL (Cα atoms only, no outlier rejection,
# cycles=0). The script below is shown for reproducibility but is not run
# as part of this vignette — results are hardcoded in rmsd_table.

# PyMOL: Pertuzumab epitope RMSD
fetch 1S78
fetch 1M6B
fetch 2AHX
fetch 1NQL

remove solvent
remove organic
remove 1M6B and chain B
remove 2AHX and chain B
remove 1NQL and chain B

hide everything
show cartoon, all

select ep_her2, 1S78 and chain A and (resi 235-245 or resi 286-296)
select ep_her3, 1M6B and chain A and (resi 229-239 or resi 279-289)
select ep_her4, 2AHX and chain A and (resi 226-236 or resi 276-286)
select ep_egfr, 1NQL and chain A and (resi 229-239 or resi 280-290)

align 1M6B and chain A and (resi 229-239 or resi 279-289) and name CA, 1S78 and chain A and (resi 235-245 or resi 286-296) and name CA, cycles=0
align 2AHX and chain A and (resi 226-236 or resi 276-286) and name CA, 1S78 and chain A and (resi 235-245 or resi 286-296) and name CA, cycles=0
align 1NQL and chain A and (resi 229-239 or resi 280-290) and name CA, 1S78 and chain A and (resi 235-245 or resi 286-296) and name CA, cycles=0


# PyMOL: Trastuzumab epitope RMSD
fetch 1N8Z
fetch 1M6B
fetch 2AHX
fetch 1NQL

remove solvent
remove organic
remove 1M6B and chain B
remove 2AHX and chain B
remove 1NQL and chain B

hide everything
show cartoon, all

select epitope_her2, 1N8Z and chain C and (resi 558-560 or resi 571-573 or resi 588-603)
select epitope_her3, 1M6B and chain A and (resi 550-552 or resi 563-565 or resi 580-593)
select epitope_her4, 2AHX and chain A and (resi 548-550 or resi 561-563 or resi 578-591)
select epitope_egfr, 1NQL and chain A and (resi 551-553 or resi 564-566 or resi 581-595)

align 1M6B and chain A and (resi 550-552 or resi 563-565 or resi 580-593) and name CA, 1N8Z and chain C and (resi 558-560 or resi 571-573 or resi 588-603) and name CA, cycles=0
align 2AHX and chain A and (resi 548-550 or resi 561-563 or resi 578-591) and name CA, 1N8Z and chain C and (resi 558-560 or resi 571-573 or resi 588-603) and name CA, cycles=0
align 1NQL and chain A and (resi 551-553 or resi 564-566 or resi 581-595) and name CA, 1N8Z and chain C and (resi 558-560 or resi 571-573 or resi 588-603) and name CA, cycles=0

rmsd_table <- tibble(
  epitope = c(rep("P", 3), rep("T", 3)),
  protein = rep(c("HER3", "HER4", "EGFR"), 2),
  backbone_rmsd = c(0.871, 0.800, 1.336, 0.587, 0.709, 0.620),
  rmsd_atoms = c(20, 20, 18, 5, 17, 15)
)
rmsd_table

## # A tibble: 6 × 4
##   epitope protein backbone_rmsd rmsd_atoms
##   <chr>   <chr>           <dbl>      <dbl>
## 1 P       HER3            0.871         20
## 2 P       HER4            0.8           20
## 3 P       EGFR            1.34          18
## 4 T       HER3            0.587          5
## 5 T       HER4            0.709         17
## 6 T       EGFR            0.62          15

4.3.3 B-factor analysis

base_path <- "results/HER2_analysis/Pymol/"

parse_bfactors <- function(pdb_file, resi_ranges, protein_name) {
  lines <- readLines(pdb_file)
  atom_lines <- lines[startsWith(lines, "ATOM")]
  
  tibble(
    resn = str_trim(substr(atom_lines, 18, 20)),
    resi = as.integer(str_trim(substr(atom_lines, 23, 26))),
    atom = str_trim(substr(atom_lines, 13, 16)),
    bfactor = as.numeric(substr(atom_lines, 61, 66))
  ) %>%
    filter(atom == "CA", resi %in% resi_ranges) %>%
    mutate(protein = protein_name) %>%
    select(protein, resi, resn, bfactor)
}

# Pertuzumab epitope B-factors (HER2 excluded — antibody-bound structure)
pert_bfactors <- bind_rows(
  parse_bfactors(file.path(base_path, "1S78_her2.pdb"), c(235:245, 286:296), "HER2"),
  parse_bfactors(file.path(base_path, "1M6B_her3.pdb"), c(229:239, 279:289), "HER3"),
  parse_bfactors(file.path(base_path, "2AHX_her4.pdb"), c(226:236, 276:286), "HER4"),
  parse_bfactors(file.path(base_path, "1NQL_egfr.pdb"), c(229:239, 280:290), "EGFR")
)

pert_bfactor_summary <- pert_bfactors %>%
  filter(protein != "HER2") %>%
  group_by(protein) %>%
  summarise(mean_bfactor = round(mean(bfactor), 2), .groups = "drop") %>%
  mutate(epitope = "P")

# Trastuzumab epitope B-factors (B = 0.0 residues excluded — unresolved density)
tras_bfactors <- bind_rows(
  parse_bfactors(file.path(base_path, "1N8Z_her2.pdb"), c(558:560, 571:573, 588:603), "HER2"),
  parse_bfactors(file.path(base_path, "1M6B_her3.pdb"), c(550:552, 563:565, 580:593), "HER3"),
  parse_bfactors(file.path(base_path, "2AHX_her4.pdb"), c(548:550, 561:563, 578:591), "HER4"),
  parse_bfactors(file.path(base_path, "1NQL_egfr.pdb"), c(551:553, 564:566, 581:595), "EGFR")
) %>%
  filter(bfactor > 0)

tras_expected <- c(HER2 = 22, HER3 = 20, HER4 = 20, EGFR = 21)

tras_bfactor_summary <- tras_bfactors %>%
  filter(protein != "HER2") %>%
  group_by(protein) %>%
  summarise(mean_bfactor = round(mean(bfactor), 2), n_resolved = n(), .groups = "drop") %>%
  mutate(
    n_expected = tras_expected[protein],
    n_unresolved = n_expected - n_resolved,
    epitope = "T"
  )

4.3.4 Electrostatic surface complementarity (APBS)

### Electrostatic surface complementarity (APBS)
# Antibody-antigen binding depends on electrostatic complementarity at the
# epitope surface. Potential maps and their comparison were generated as follows:
#
# 1. PDB2PQR (v3.7.1, AMBER force field) was run on each epitope-containing
#    structure to assign atomic charges and radii, producing a .pqr file and
#    an APBS input file (.in).
# 2. APBS (v3.4.1) was run on each .in file to produce an electrostatic
#    potential grid (.dx) for HER2 (1S78 for pertuzumab, 1N8Z for trastuzumab),
#    HER3 (1M6B), HER4 (2AHX), and EGFR (1NQL).
# 3. Epitope Cα coordinates were read directly from the original, UNALIGNED
#    structures (the same cleaned single-chain PDBs used for the B-factor
#    analysis), because each APBS grid is defined in its structure's own
#    native coordinate frame. Structures superimposed during the RMSD step
#    were NOT used here.
# 4. Each family member's epitope residues were matched to the corresponding
#    canonical HER2 positions using the same MAFFT alignment as the BLOSUM62
#    analysis, so alignment gaps are handled consistently across metrics.
#    Because crystal (PDB) residue numbering differs from Ensembl sequence
#    numbering, a constant per-protein offset (PDB = Ensembl - offset;
#    HER2 22, HER3 19, HER4 25, EGFR 24) was applied to convert alignment
#    positions to PDB residue numbers. Each offset was verified by
#    residue-identity matching (>99.8% agreement over ~550-610 residues).
# 5. The electrostatic potential was sampled at each epitope Cα position by
#    trilinear interpolation of the .dx grid (functions below).
# 6. HER2 was compared to each family member by Pearson correlation of the
#    per-residue potentials across canonical positions resolved in both
#    structures. This is a coarse, Cα-level proxy for the epitope's surface
#    electrostatics and is interpreted as one corroborating line of evidence
#    rather than a full surface-complementarity analysis.

Check sequence - PDB offset

check_offset <- function(aligned_obj, aligned_seq_name, pdb_file, protein) {
  # ungapped Ensembl sequence + its numbering (position in peptide)
  aln <- as.character(aligned_obj[[aligned_seq_name]])
  aa  <- strsplit(aln, "")[[1]]
  ens_pos <- cumsum(aa != "-")
  ens <- tibble(ens_num = ens_pos[aa != "-"],
                ens_aa  = aa[aa != "-"])

  # PDB Ca residues: number + one-letter aa
  pdb <- read.pdb(file.path(base_path, pdb_file))
  ca  <- pdb$atom[pdb$atom$elety == "CA", ]
  three2one <- bio3d::aa321(ca$resid)
  pdbdf <- tibble(pdb_num = ca$resno, pdb_aa = three2one)

  # try a range of constant offsets; score how many residues match
  cat("\n", protein, "(", pdb_file, ")\n")
  for (off in -40:40) {
    merged <- inner_join(ens, pdbdf,
                         by = c("ens_num" = "pdb_num"),
                         suffix = c("", "")) %>%
      mutate(pdb_num_shifted = ens_num) # placeholder
    # match ens residue n against pdb residue (n - off)
    m <- ens %>%
      mutate(target = ens_num - off) %>%
      inner_join(pdbdf, by = c("target" = "pdb_num"))
    if (nrow(m) > 20) {
      frac <- mean(m$ens_aa == m$pdb_aa)
      if (frac > 0.95) {
        cat("  offset =", off,
            " (PDB_resno = Ensembl_pos -", off, "),",
            "match:", round(frac, 3), "over", nrow(m), "residues\n")
      }
    }
  }
}

check_offset(family_aligned, "ERBB2_ENST00000269571", "1N8Z_her2.pdb", "HER2")

## 
##  HER2 ( 1N8Z_her2.pdb )
##   offset = 22  (PDB_resno = Ensembl_pos - 22 ), match: 1 over 581 residues

check_offset(family_aligned, "ERBB3_ENST00000267101", "1M6B_her3.pdb", "HER3")

## 
##  HER3 ( 1M6B_her3.pdb )
##   offset = 19  (PDB_resno = Ensembl_pos - 19 ), match: 0.998 over 549 residues

check_offset(family_aligned, "ERBB4_ENST00000342788", "2AHX_her4.pdb", "HER4")

## 
##  HER4 ( 2AHX_her4.pdb )
##   offset = 25  (PDB_resno = Ensembl_pos - 25 ), match: 0.998 over 611 residues

check_offset(family_aligned, "EGFR_ENST00000275493",  "1NQL_egfr.pdb", "EGFR")

## 
##  EGFR ( 1NQL_egfr.pdb )
##   offset = 24  (PDB_resno = Ensembl_pos - 24 ), match: 0.998 over 612 residues

# --- Per-protein offset: PDB_resno = Ensembl_pos - offset (from the identity check) ---
# HER2 exact over 581 residues; HER3/HER4/EGFR 99.8% over ~550-610 residues.
pdb_offsets <- c(ERBB2 = 22, ERBB3 = 19, ERBB4 = 25, EGFR = 24)

# --- DX file parser (handles scientific notation) ---
read_dx <- function(filepath) {
  lines <- readLines(filepath)
  
  grid_line <- grep("^object 1", lines, value = TRUE)
  dims <- as.integer(str_extract_all(grid_line, "\\d+")[[1]][2:4])
  
  num_pattern <- "-?[\\d]+\\.?[\\d]*[eE]?[+-]?\\d*"
  
  origin_line <- grep("^origin", lines, value = TRUE)
  origin <- as.numeric(str_match_all(origin_line, num_pattern)[[1]])
  
  delta_lines <- grep("^delta", lines, value = TRUE)
  dx <- as.numeric(str_match_all(delta_lines[1], num_pattern)[[1]])
  dy <- as.numeric(str_match_all(delta_lines[2], num_pattern)[[1]])
  dz <- as.numeric(str_match_all(delta_lines[3], num_pattern)[[1]])
  
  spacing <- c(dx[1], dy[2], dz[3])
  
  data_start <- max(grep("^object 3", lines)) + 1
  data_end <- grep("^attribute", lines)[1] - 1
  values <- as.numeric(unlist(strsplit(trimws(lines[data_start:data_end]), "\\s+")))
  
  list(dims = dims, origin = origin, spacing = spacing, values = values)
}

# --- Trilinear interpolation ---
get_potential <- function(dx_map, x, y, z) {
  fi <- (x - dx_map$origin[1]) / dx_map$spacing[1]
  fj <- (y - dx_map$origin[2]) / dx_map$spacing[2]
  fk <- (z - dx_map$origin[3]) / dx_map$spacing[3]
  
  i <- floor(fi); j <- floor(fj); k <- floor(fk)
  fx <- fi - i; fy <- fj - j; fz <- fk - k
  
  if (i < 0 || i >= dx_map$dims[1]-1 ||
      j < 0 || j >= dx_map$dims[2]-1 ||
      k < 0 || k >= dx_map$dims[3]-1) return(NA)
  
  idx <- function(ii, jj, kk) ii * dx_map$dims[2] * dx_map$dims[3] + jj * dx_map$dims[3] + kk + 1
  
  v000 <- dx_map$values[idx(i, j, k)]
  v100 <- dx_map$values[idx(i+1, j, k)]
  v010 <- dx_map$values[idx(i, j+1, k)]
  v110 <- dx_map$values[idx(i+1, j+1, k)]
  v001 <- dx_map$values[idx(i, j, k+1)]
  v101 <- dx_map$values[idx(i+1, j, k+1)]
  v011 <- dx_map$values[idx(i, j+1, k+1)]
  v111 <- dx_map$values[idx(i+1, j+1, k+1)]
  
  v000*(1-fx)*(1-fy)*(1-fz) + v100*fx*(1-fy)*(1-fz) +
    v010*(1-fx)*fy*(1-fz) + v110*fx*fy*(1-fz) +
    v001*(1-fx)*(1-fy)*fz + v101*fx*(1-fy)*fz +
    v011*(1-fx)*fy*fz + v111*fx*fy*fz
}

# --- Epitope Ca coordinates, read straight from the unaligned PDBs ---
get_epitope_coords <- function(pdb_file, resids, protein) {
  pdb <- read.pdb(file.path(base_path, pdb_file))
  sel <- pdb$atom$elety == "CA" & pdb$atom$resno %in% resids
  tibble(protein = protein, resno = pdb$atom$resno[sel],
         x = pdb$atom$x[sel], y = pdb$atom$y[sel], z = pdb$atom$z[sel])
}

# --- Canonical map derived from the SAME alignment as the BLOSUM step ---
canon_map_from_alignment <- function(aligned, epitope_mapping, offsets) {
  ens_pos_tbl <- map_dfr(names(aligned), function(nm) {
    aa  <- strsplit(as.character(aligned[[nm]]), "")[[1]]
    ens <- cumsum(aa != "-"); ens[aa == "-"] <- NA
    tibble(transcript_id = nm, position = seq_along(aa), ens_num = ens)
  })

  epitope_mapping %>%
    left_join(ens_pos_tbl, by = c("transcript_id", "position")) %>%
    mutate(
      gene = case_when(
        str_detect(transcript_id, "ERBB2") ~ "ERBB2",
        str_detect(transcript_id, "ERBB3") ~ "ERBB3",
        str_detect(transcript_id, "ERBB4") ~ "ERBB4",
        str_detect(transcript_id, "EGFR")  ~ "EGFR"
      ),
      protein = recode(gene, ERBB2 = "HER2", ERBB3 = "HER3",
                             ERBB4 = "HER4", EGFR = "EGFR"),
      resno = ens_num - offsets[gene]
    ) %>%
    select(protein, epitope, canonical_pos, resno)
}

all_canon <- canon_map_from_alignment(family_aligned, epitope_mapping, pdb_offsets)
pert_canon_map <- all_canon %>% filter(epitope == "P")
tras_canon_map <- all_canon %>% filter(epitope == "T")

# --- Run one epitope, matching on canonical position from the alignment map ---
run_apbs_epitope <- function(epitope_name, structs, canon_map) {
  potentials <- imap_dfr(structs, function(s, prot) {
    m    <- read_dx(file.path(base_path, s$dx))
    cmap <- canon_map %>% filter(protein == prot) %>% select(protein, resno, canonical_pos)
    get_epitope_coords(s$pdb, unlist(s$segments), prot) %>%
      left_join(cmap, by = c("protein", "resno")) %>%
      rowwise() %>%
      mutate(potential = get_potential(m, x, y, z)) %>%
      ungroup()
  })
  her2 <- potentials %>% filter(protein == "HER2") %>% select(canonical_pos, her2 = potential)
  map_dfr(c("HER3", "HER4", "EGFR"), function(prot) {
    other <- potentials %>% filter(protein == prot) %>% select(canonical_pos, other = potential)
    matched <- inner_join(her2, other, by = "canonical_pos") %>% drop_na(canonical_pos)
    tibble(epitope = epitope_name,
           comparison = paste0(prot, " vs HER2"),
           correlation = cor(matched$her2, matched$other, use = "complete.obs"),
           n_matched = nrow(matched))
  })
}

Correlations

# ===== PERTUZUMAB =====
pert_structs <- list(
  HER2 = list(pdb = "1S78_her2.pdb", dx = "1S78_her2.pqr.dx", segments = list(235:245, 286:296)),
  HER3 = list(pdb = "1M6B_her3.pdb", dx = "1M6B_her3.pqr.dx", segments = list(229:239, 279:289)),
  HER4 = list(pdb = "2AHX_her4.pdb", dx = "2AHX_her4.pqr.dx", segments = list(226:236, 276:286)),
  EGFR = list(pdb = "1NQL_egfr.pdb", dx = "1NQL_egfr.pqr.dx", segments = list(229:239, 280:290))
)
pert_correlations <- run_apbs_epitope("Pertuzumab", pert_structs, pert_canon_map)
cat("=== Pertuzumab APBS correlations ===\n"); print(pert_correlations)

## === Pertuzumab APBS correlations ===

## # A tibble: 3 × 4
##   epitope    comparison   correlation n_matched
##   <chr>      <chr>              <dbl>     <int>
## 1 Pertuzumab HER3 vs HER2      0.131         22
## 2 Pertuzumab HER4 vs HER2      0.288         22
## 3 Pertuzumab EGFR vs HER2     -0.0421        22

# ===== TRASTUZUMAB =====
tras_structs <- list(
  HER2 = list(pdb = "1N8Z_her2.pdb", dx = "1N8Z_her2.pqr.dx", segments = list(558:560, 571:573, 588:603)),
  HER3 = list(pdb = "1M6B_her3.pdb", dx = "1M6B_her3.pqr.dx", segments = list(550:552, 563:565, 580:593)),
  HER4 = list(pdb = "2AHX_her4.pdb", dx = "2AHX_her4.pqr.dx", segments = list(548:550, 561:563, 578:591)),
  EGFR = list(pdb = "1NQL_egfr.pdb", dx = "1NQL_egfr.pqr.dx", segments = list(551:553, 564:566, 581:595))
)
tras_correlations <- run_apbs_epitope("Trastuzumab", tras_structs, tras_canon_map)
cat("=== Trastuzumab APBS correlations ===\n"); print(tras_correlations)

## === Trastuzumab APBS correlations ===

## # A tibble: 3 × 4
##   epitope     comparison   correlation n_matched
##   <chr>       <chr>              <dbl>     <int>
## 1 Trastuzumab HER3 vs HER2       0.397         6
## 2 Trastuzumab HER4 vs HER2       0.572        19
## 3 Trastuzumab EGFR vs HER2       0.774        19

5 Session Info

sessionInfo()

## R version 4.5.1 (2025-06-13)
## Platform: aarch64-apple-darwin20
## Running under: macOS Tahoe 26.5.2
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRblas.0.dylib 
## LAPACK: /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.12.1
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## time zone: Europe/Copenhagen
## tzcode source: internal
## 
## attached base packages:
## [1] stats4    stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] jsonlite_2.0.0      httr_1.4.7          lubridate_1.9.4    
##  [4] forcats_1.0.1       stringr_1.6.0       dplyr_1.1.4        
##  [7] purrr_1.2.0         readr_2.1.6         tidyr_1.3.2        
## [10] tibble_3.3.0        ggplot2_4.0.1       tidyverse_2.0.0    
## [13] patchwork_1.3.2     ggnewscale_0.5.2    biomaRt_2.66.0     
## [16] bio3d_2.4-5         Biostrings_2.78.0   Seqinfo_1.0.0      
## [19] XVector_0.50.0      IRanges_2.44.0      S4Vectors_0.48.0   
## [22] BiocGenerics_0.56.0 generics_0.1.4     
## 
## loaded via a namespace (and not attached):
##  [1] KEGGREST_1.50.0      gtable_0.3.6         xfun_0.55           
##  [4] bslib_0.9.0          httr2_1.2.2          Biobase_2.70.0      
##  [7] tzdb_0.5.0           vctrs_0.6.5          tools_4.5.1         
## [10] curl_7.0.0           parallel_4.5.1       AnnotationDbi_1.72.0
## [13] RSQLite_2.4.5        blob_1.3.0           pkgconfig_2.0.3     
## [16] dbplyr_2.5.1         RColorBrewer_1.1-3   S7_0.2.1            
## [19] lifecycle_1.0.4      compiler_4.5.1       farver_2.1.2        
## [22] progress_1.2.3       htmltools_0.5.9      sass_0.4.10         
## [25] yaml_2.3.12          pillar_1.11.1        crayon_1.5.3        
## [28] jquerylib_0.1.4      cachem_1.1.0         tidyselect_1.2.1    
## [31] digest_0.6.39        stringi_1.8.7        labeling_0.4.3      
## [34] fastmap_1.2.0        grid_4.5.1           cli_3.6.5           
## [37] magrittr_2.0.4       utf8_1.2.6           withr_3.0.2         
## [40] prettyunits_1.2.0    filelock_1.0.3       scales_1.4.0        
## [43] rappdirs_0.3.3       bit64_4.6.0-1        timechange_0.3.0    
## [46] rmarkdown_2.30       bit_4.6.0            otel_0.2.0          
## [49] png_0.1-8            hms_1.1.4            memoise_2.0.1       
## [52] evaluate_1.0.5       knitr_1.51           BiocFileCache_3.0.0 
## [55] rlang_1.1.6          Rcpp_1.1.0           glue_1.8.0          
## [58] DBI_1.2.3            xml2_1.5.1           vroom_1.6.7         
## [61] rstudioapi_0.17.1    R6_2.6.1

CAR-T Target Assessment Workflow - coding

Giorgia Moranzoni

2026-07-06