1 Load libraries


library(dplyr)
library(tidyr)
library(readr)
library(ggplot2)
library(viridis)

# Optional: theme for plots
theme_set(theme_bw())

2 Load AD files


# Define files

files <- list(
N1 = "N1.AD.txt",
T1 = "T1.AD.txt",
X1 = "X1.AD.txt",
XX1 = "XX1.AD.txt"
)

# Function to read and process AD file

read_ad <- function(file){
read_tsv(file, col_names = c("CHROM","POS","REF","ALT","AD")) %>%
separate(AD, into=c("ref_count","alt_count"), sep=",", convert=TRUE) %>%
mutate(VAF = alt_count / (ref_count + alt_count))
}

# Load all samples into a list

ad_list <- lapply(files, read_ad)

# Name list elements

names(ad_list) <- names(files)

3 Compute basic statistics


# Compute basic statistics

vaf_summary <- lapply(ad_list, function(df){
summary(df$VAF)
})

vaf_summary
$N1
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
 0.0000  0.3596  0.4902  0.5537  0.9951  1.0000       1 

$T1
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
 0.0000  0.3529  0.5000  0.5663  0.9942  1.0000 

$X1
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
 0.0000  0.3626  0.5000  0.5692  1.0000  1.0000       1 

$XX1
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
 0.0000  0.3448  0.5000  0.5650  1.0000  1.0000       3 

4 Plot VAF distributions


# Combine all VAFs for plotting

vaf_df <- bind_rows(
lapply(names(ad_list), function(n) {
ad_list[[n]] %>% select(VAF) %>% mutate(Sample = n)
})
)

ggplot(vaf_df, aes(x=VAF, fill=Sample)) +
geom_histogram(bins=50, alpha=0.6, position="identity") +
scale_fill_viridis(discrete = TRUE) +
labs(title="VAF Distribution Across Samples",
x="Variant Allele Frequency (VAF)",
y="Number of Variants") +
theme_minimal()

NA
NA
NA

5 Identify shared variants


library(dplyr)

# Remove duplicate variants by CHROM and POS
T1_unique <- ad_list$T1 %>% distinct(CHROM, POS, .keep_all = TRUE)
XX1_unique <- ad_list$XX1 %>% distinct(CHROM, POS, .keep_all = TRUE)

# Find shared variants between tumor (T1) and cell line (XX1)
shared_T1_XX1 <- inner_join(
  T1_unique %>% select(CHROM, POS),
  XX1_unique %>% select(CHROM, POS),
  by = c("CHROM", "POS"),
  relationship = "many-to-many"  # explicitly allow many-to-many
)

cat("Number of shared variants between T1 and XX1:", nrow(shared_T1_XX1), "\n")
Number of shared variants between T1 and XX1: 44650 

6 Compute shared vs unique variants


get_variant_id <- function(df){
  df %>% mutate(VariantID = paste(CHROM, POS, sep="_")) %>% select(VariantID)
}

variant_ids <- lapply(ad_list, get_variant_id)
variant_ids <- lapply(variant_ids, pull)

shared_T1_X1 <- intersect(variant_ids$T1, variant_ids$X1)
shared_T1_XX1 <- intersect(variant_ids$T1, variant_ids$XX1)

unique_T1 <- setdiff(variant_ids$T1, union(variant_ids$X1, variant_ids$XX1))
unique_X1 <- setdiff(variant_ids$X1, variant_ids$T1)
unique_XX1 <- setdiff(variant_ids$XX1, variant_ids$T1)

variant_summary <- data.frame(
  Comparison = c("Shared_T1_X1","Shared_T1_XX1","Unique_T1","Unique_X1","Unique_XX1"),
  Count = c(length(shared_T1_X1), length(shared_T1_XX1),
            length(unique_T1), length(unique_X1), length(unique_XX1))
)
variant_summary

library(VennDiagram)
library(grid)

variants_sets <- list(
  T1 = variant_ids$T1,
  X1 = variant_ids$X1,
  XX1 = variant_ids$XX1
)

venn.diagram(
  x = variants_sets,
  filename = "Venn_Tumor_CellLines.png",
  fill = c("red", "green", "blue"),
  alpha = 0.5,
  cex = 1.5,
  cat.cex = 1.2,
  main = "Overlap of Variants: Tumor and Cell Lines"
)
[1] 1
# Use grid.draw to render
grid.newpage()
grid.draw(venn.plot)

7 VAF Heatmap for Shared Variants


library(tidyverse)
library(pheatmap)

# Combine all AD data
ad_df <- bind_rows(
  lapply(names(ad_list), function(s) {
    ad_list[[s]] %>% mutate(Sample = s)
  })
) %>%
  mutate(VariantID = paste(CHROM, POS, sep="_"))

# Shared variants between tumor and at least one cell line
shared_variants <- union(shared_T1_X1, shared_T1_XX1)

# FIX: Remove duplicates before pivoting
vaf_df <- ad_df %>%
  filter(VariantID %in% shared_variants) %>%
  group_by(VariantID, Sample) %>%      
  summarise(VAF = mean(VAF, na.rm = TRUE), .groups = "drop") %>%  # FIX
  pivot_wider(names_from = Sample, values_from = VAF, values_fill = 0)

# Convert to matrix
vaf_matrix <- as.matrix(vaf_df[,-1])
rownames(vaf_matrix) <- vaf_df$VariantID

# Heatmap
pheatmap(
  vaf_matrix,
  cluster_rows = TRUE,
  cluster_cols = TRUE,
  color = colorRampPalette(c("white","red"))(50),
  main = "VAF Heatmap: Shared Variants Tumor vs Cell Lines"
)

NA
NA
NA
LS0tCnRpdGxlOiAiV0VTIFZBRiBBbmFseXNpczogUGF0aWVudDEgYW5kIENlbGwgTGluZXMgTDEtTDIiCmF1dGhvcjogIk5hc2lyIE1haG1vb2QgQWJiYXNpIgpkYXRlOiAiYHIgZm9ybWF0KFN5cy50aW1lKCksICclQiAlZCwgJVknKWAiCm91dHB1dDoKICBodG1sX25vdGVib29rOgogICAgbnVtYmVyX3NlY3Rpb25zOiB0cnVlCiAgICB0b2M6IHRydWUKICAgIHRvY19mbG9hdDoKICAgICAgY29sbGFwc2VkOiB0cnVlCiAgICB0aGVtZTogam91cm5hbAotLS0KCiMgTG9hZCBsaWJyYXJpZXMKYGBge3IgLCBtZXNzYWdlPUZBTFNFfQoKbGlicmFyeShkcGx5cikKbGlicmFyeSh0aWR5cikKbGlicmFyeShyZWFkcikKbGlicmFyeShnZ3Bsb3QyKQpsaWJyYXJ5KHZpcmlkaXMpCgojIE9wdGlvbmFsOiB0aGVtZSBmb3IgcGxvdHMKdGhlbWVfc2V0KHRoZW1lX2J3KCkpCgpgYGAKCgoKIyBMb2FkIEFEIGZpbGVzCgpgYGB7ciAsIG1lc3NhZ2U9RkFMU0V9CgojIERlZmluZSBmaWxlcwoKZmlsZXMgPC0gbGlzdCgKTjEgPSAiTjEuQUQudHh0IiwKVDEgPSAiVDEuQUQudHh0IiwKWDEgPSAiWDEuQUQudHh0IiwKWFgxID0gIlhYMS5BRC50eHQiCikKCiMgRnVuY3Rpb24gdG8gcmVhZCBhbmQgcHJvY2VzcyBBRCBmaWxlCgpyZWFkX2FkIDwtIGZ1bmN0aW9uKGZpbGUpewpyZWFkX3RzdihmaWxlLCBjb2xfbmFtZXMgPSBjKCJDSFJPTSIsIlBPUyIsIlJFRiIsIkFMVCIsIkFEIikpICU+JQpzZXBhcmF0ZShBRCwgaW50bz1jKCJyZWZfY291bnQiLCJhbHRfY291bnQiKSwgc2VwPSIsIiwgY29udmVydD1UUlVFKSAlPiUKbXV0YXRlKFZBRiA9IGFsdF9jb3VudCAvIChyZWZfY291bnQgKyBhbHRfY291bnQpKQp9CgojIExvYWQgYWxsIHNhbXBsZXMgaW50byBhIGxpc3QKCmFkX2xpc3QgPC0gbGFwcGx5KGZpbGVzLCByZWFkX2FkKQoKIyBOYW1lIGxpc3QgZWxlbWVudHMKCm5hbWVzKGFkX2xpc3QpIDwtIG5hbWVzKGZpbGVzKQoKCmBgYAoKIyBDb21wdXRlIGJhc2ljIHN0YXRpc3RpY3MKCmBgYHtyICwgbWVzc2FnZT1GQUxTRX0KCiMgQ29tcHV0ZSBiYXNpYyBzdGF0aXN0aWNzCgp2YWZfc3VtbWFyeSA8LSBsYXBwbHkoYWRfbGlzdCwgZnVuY3Rpb24oZGYpewpzdW1tYXJ5KGRmJFZBRikKfSkKCnZhZl9zdW1tYXJ5CgoKYGBgCgoKCiMgUGxvdCBWQUYgZGlzdHJpYnV0aW9ucwoKYGBge3IgLCBtZXNzYWdlPUZBTFNFfQoKIyBDb21iaW5lIGFsbCBWQUZzIGZvciBwbG90dGluZwoKdmFmX2RmIDwtIGJpbmRfcm93cygKbGFwcGx5KG5hbWVzKGFkX2xpc3QpLCBmdW5jdGlvbihuKSB7CmFkX2xpc3RbW25dXSAlPiUgc2VsZWN0KFZBRikgJT4lIG11dGF0ZShTYW1wbGUgPSBuKQp9KQopCgpnZ3Bsb3QodmFmX2RmLCBhZXMoeD1WQUYsIGZpbGw9U2FtcGxlKSkgKwpnZW9tX2hpc3RvZ3JhbShiaW5zPTUwLCBhbHBoYT0wLjYsIHBvc2l0aW9uPSJpZGVudGl0eSIpICsKc2NhbGVfZmlsbF92aXJpZGlzKGRpc2NyZXRlID0gVFJVRSkgKwpsYWJzKHRpdGxlPSJWQUYgRGlzdHJpYnV0aW9uIEFjcm9zcyBTYW1wbGVzIiwKeD0iVmFyaWFudCBBbGxlbGUgRnJlcXVlbmN5IChWQUYpIiwKeT0iTnVtYmVyIG9mIFZhcmlhbnRzIikgKwp0aGVtZV9taW5pbWFsKCkKCgoKYGBgCgojIElkZW50aWZ5IHNoYXJlZCB2YXJpYW50cwoKYGBge3IgLCBtZXNzYWdlPUZBTFNFfQoKbGlicmFyeShkcGx5cikKCiMgUmVtb3ZlIGR1cGxpY2F0ZSB2YXJpYW50cyBieSBDSFJPTSBhbmQgUE9TClQxX3VuaXF1ZSA8LSBhZF9saXN0JFQxICU+JSBkaXN0aW5jdChDSFJPTSwgUE9TLCAua2VlcF9hbGwgPSBUUlVFKQpYWDFfdW5pcXVlIDwtIGFkX2xpc3QkWFgxICU+JSBkaXN0aW5jdChDSFJPTSwgUE9TLCAua2VlcF9hbGwgPSBUUlVFKQoKIyBGaW5kIHNoYXJlZCB2YXJpYW50cyBiZXR3ZWVuIHR1bW9yIChUMSkgYW5kIGNlbGwgbGluZSAoWFgxKQpzaGFyZWRfVDFfWFgxIDwtIGlubmVyX2pvaW4oCiAgVDFfdW5pcXVlICU+JSBzZWxlY3QoQ0hST00sIFBPUyksCiAgWFgxX3VuaXF1ZSAlPiUgc2VsZWN0KENIUk9NLCBQT1MpLAogIGJ5ID0gYygiQ0hST00iLCAiUE9TIiksCiAgcmVsYXRpb25zaGlwID0gIm1hbnktdG8tbWFueSIgICMgZXhwbGljaXRseSBhbGxvdyBtYW55LXRvLW1hbnkKKQoKY2F0KCJOdW1iZXIgb2Ygc2hhcmVkIHZhcmlhbnRzIGJldHdlZW4gVDEgYW5kIFhYMToiLCBucm93KHNoYXJlZF9UMV9YWDEpLCAiXG4iKQoKCgpgYGAKCgoKIyBDb21wdXRlIHNoYXJlZCB2cyB1bmlxdWUgdmFyaWFudHMKCmBgYHtyICwgbWVzc2FnZT1GQUxTRX0KCiMgRnVuY3Rpb24gdG8gZ2V0IENIUk9NK1BPUyBhcyB1bmlxdWUgaWRlbnRpZmllcgpnZXRfdmFyaWFudF9pZCA8LSBmdW5jdGlvbihkZil7CiAgZGYgJT4lIG11dGF0ZShWYXJpYW50SUQgPSBwYXN0ZShDSFJPTSwgUE9TLCBzZXA9Il8iKSkgJT4lIHNlbGVjdChWYXJpYW50SUQpCn0KCiMgR2V0IHZhcmlhbnQgSURzIGZvciBlYWNoIHNhbXBsZQp2YXJpYW50X2lkcyA8LSBsYXBwbHkoYWRfbGlzdCwgZ2V0X3ZhcmlhbnRfaWQpCgojIENvbnZlcnQgbGlzdCB0byBuYW1lZCB2ZWN0b3JzCnZhcmlhbnRfaWRzIDwtIGxhcHBseSh2YXJpYW50X2lkcywgcHVsbCkKCiMgRmluZCBzaGFyZWQgYmV0d2VlbiB0dW1vciBhbmQgZWFjaCBjZWxsIGxpbmUKc2hhcmVkX1QxX1gxIDwtIGludGVyc2VjdCh2YXJpYW50X2lkcyRUMSwgdmFyaWFudF9pZHMkWDEpCnNoYXJlZF9UMV9YWDEgPC0gaW50ZXJzZWN0KHZhcmlhbnRfaWRzJFQxLCB2YXJpYW50X2lkcyRYWDEpCgojIEZpbmQgdmFyaWFudHMgdW5pcXVlIHRvIFQxICh0dW1vcikKdW5pcXVlX1QxIDwtIHNldGRpZmYodmFyaWFudF9pZHMkVDEsIHVuaW9uKHZhcmlhbnRfaWRzJFgxLCB2YXJpYW50X2lkcyRYWDEpKQoKIyBGaW5kIHZhcmlhbnRzIHVuaXF1ZSB0byBlYWNoIGNlbGwgbGluZQp1bmlxdWVfWDEgPC0gc2V0ZGlmZih2YXJpYW50X2lkcyRYMSwgdmFyaWFudF9pZHMkVDEpCnVuaXF1ZV9YWDEgPC0gc2V0ZGlmZih2YXJpYW50X2lkcyRYWDEsIHZhcmlhbnRfaWRzJFQxKQoKIyBTdW1tYXJ5CnZhcmlhbnRfc3VtbWFyeSA8LSBkYXRhLmZyYW1lKAogIENvbXBhcmlzb24gPSBjKCJTaGFyZWRfVDFfWDEiLCJTaGFyZWRfVDFfWFgxIiwiVW5pcXVlX1QxIiwiVW5pcXVlX1gxIiwiVW5pcXVlX1hYMSIpLAogIENvdW50ID0gYyhsZW5ndGgoc2hhcmVkX1QxX1gxKSwgbGVuZ3RoKHNoYXJlZF9UMV9YWDEpLAogICAgICAgICAgICBsZW5ndGgodW5pcXVlX1QxKSwgbGVuZ3RoKHVuaXF1ZV9YMSksIGxlbmd0aCh1bmlxdWVfWFgxKSkKKQpgYGAKCgoKCmBgYHtyICwgZmlnLmhlaWdodD00LCBmaWcud2lkdGg9NH0KCmxpYnJhcnkoVmVubkRpYWdyYW0pCmxpYnJhcnkoZ3JpZCkKCnZhcmlhbnRzX3NldHMgPC0gbGlzdCgKICBUMSA9IHZhcmlhbnRfaWRzJFQxLAogIFgxID0gdmFyaWFudF9pZHMkWDEsCiAgWFgxID0gdmFyaWFudF9pZHMkWFgxCikKCnZlbm4uZGlhZ3JhbSgKICB4ID0gdmFyaWFudHNfc2V0cywKICBmaWxlbmFtZSA9ICJWZW5uX1R1bW9yX0NlbGxMaW5lcy5wbmciLAogIGZpbGwgPSBjKCJyZWQiLCAiZ3JlZW4iLCAiYmx1ZSIpLAogIGFscGhhID0gMC41LAogIGNleCA9IDEuNSwKICBjYXQuY2V4ID0gMS4yLAogIG1haW4gPSAiT3ZlcmxhcCBvZiBWYXJpYW50czogVHVtb3IgYW5kIENlbGwgTGluZXMiCikKCiMgVXNlIGdyaWQuZHJhdyB0byByZW5kZXIKZ3JpZC5uZXdwYWdlKCkKZ3JpZC5kcmF3KHZlbm4ucGxvdCkKCmBgYAojIFZBRiBIZWF0bWFwIGZvciBTaGFyZWQgVmFyaWFudHMKCmBgYHtyICwgbWVzc2FnZT1GQUxTRX0KCmxpYnJhcnkodGlkeXZlcnNlKQpsaWJyYXJ5KHBoZWF0bWFwKQoKIyBDb21iaW5lIGFsbCBBRCBkYXRhCmFkX2RmIDwtIGJpbmRfcm93cygKICBsYXBwbHkobmFtZXMoYWRfbGlzdCksIGZ1bmN0aW9uKHMpIHsKICAgIGFkX2xpc3RbW3NdXSAlPiUgbXV0YXRlKFNhbXBsZSA9IHMpCiAgfSkKKSAlPiUKICBtdXRhdGUoVmFyaWFudElEID0gcGFzdGUoQ0hST00sIFBPUywgc2VwPSJfIikpCgojIFNoYXJlZCB2YXJpYW50cyBiZXR3ZWVuIHR1bW9yIGFuZCBhdCBsZWFzdCBvbmUgY2VsbCBsaW5lCnNoYXJlZF92YXJpYW50cyA8LSB1bmlvbihzaGFyZWRfVDFfWDEsIHNoYXJlZF9UMV9YWDEpCgojIEZJWDogUmVtb3ZlIGR1cGxpY2F0ZXMgYmVmb3JlIHBpdm90aW5nCnZhZl9kZiA8LSBhZF9kZiAlPiUKICBmaWx0ZXIoVmFyaWFudElEICVpbiUgc2hhcmVkX3ZhcmlhbnRzKSAlPiUKICBncm91cF9ieShWYXJpYW50SUQsIFNhbXBsZSkgJT4lICAgICAgCiAgc3VtbWFyaXNlKFZBRiA9IG1lYW4oVkFGLCBuYS5ybSA9IFRSVUUpLCAuZ3JvdXBzID0gImRyb3AiKSAlPiUgICMgRklYCiAgcGl2b3Rfd2lkZXIobmFtZXNfZnJvbSA9IFNhbXBsZSwgdmFsdWVzX2Zyb20gPSBWQUYsIHZhbHVlc19maWxsID0gMCkKCiMgQ29udmVydCB0byBtYXRyaXgKdmFmX21hdHJpeCA8LSBhcy5tYXRyaXgodmFmX2RmWywtMV0pCnJvd25hbWVzKHZhZl9tYXRyaXgpIDwtIHZhZl9kZiRWYXJpYW50SUQKCiMgSGVhdG1hcApwaGVhdG1hcCgKICB2YWZfbWF0cml4LAogIGNsdXN0ZXJfcm93cyA9IFRSVUUsCiAgY2x1c3Rlcl9jb2xzID0gVFJVRSwKICBjb2xvciA9IGNvbG9yUmFtcFBhbGV0dGUoYygid2hpdGUiLCJyZWQiKSkoNTApLAogIG1haW4gPSAiVkFGIEhlYXRtYXA6IFNoYXJlZCBWYXJpYW50cyBUdW1vciB2cyBDZWxsIExpbmVzIgopCgoKCmBgYA==