# Data Creation
# Simulates a genes x samples expression matrix for one wild-type group and
# three TP53 mutant groups, then shifts TP53 (row 1) down in every mutant
# sample. All sizes are derived from the design below so they stay in sync.
set.seed(123) # fixed seed: every run reproduces the same simulated values

# Define groups: wild-type and 3 TP53 mutants, n_reps replicates each
genotypes <- c("WT", "R175H", "R248W", "R273H")
n_reps <- 5
groups <- factor(rep(genotypes, each = n_reps))
n_samples <- length(groups)

# Simulate expression values for 100 genes (one column per sample)
n_genes <- 100
expression_matrix <- matrix(
  rnorm(n_genes * n_samples, mean = 8, sd = 1.5),
  nrow = n_genes
)

# Make TP53 downregulated in mutants: subtract 3 from row 1 in non-WT columns
is_mutant <- groups != "WT"
expression_matrix[1, is_mutant] <- expression_matrix[1, is_mutant] - 3

# Row 1 is TP53; the remaining rows get placeholder names Gene_2, Gene_3, ...
# seq_len(n_genes)[-1] is empty (not c(2, 1)) when n_genes is 1.
rownames(expression_matrix) <- c(
  "TP53",
  paste0("Gene_", seq_len(n_genes)[-1])
)

# Sample names are <genotype>_rep<k>; k restarts at 1 within each genotype.
# The replicate index is built explicitly instead of relying on recycling.
replicate_id <- rep(seq_len(n_reps), times = length(genotypes))
colnames(expression_matrix) <- paste0(groups, "_rep", replicate_id)
# ─────────────────────────────────────────
# VISUALIZATION 1: Volcano Plot
# ─────────────────────────────────────────
# Background values for every gene: random log2 fold changes and uniform
# p-values. These are drawn independently of expression_matrix.
log2FC <- rnorm(n_genes, mean = 0, sd = 2)
pvalues <- runif(n_genes, 0, 1)

# TP53 and four highlighted genes sit in rows 1-5. Each gets a fixed, highly
# significant result and its own colour. The sign of `lfc` gives the direction
# (negative = downregulated, positive = upregulated). Rows 2-5 only carry
# placeholder names in the matrix, so `row_name` maps them to gene symbols.
highlighted <- data.frame(
  symbol = c("TP53", "MDM2", "BAX", "CDKN1A", "BCL2"),
  row_name = c("TP53", "Gene_2", "Gene_3", "Gene_4", "Gene_5"),
  lfc = c(-3.2, 2.8, -2.5, -2.2, 2.1),
  p = c(0.0001, 0.0002, 0.0003, 0.0004, 0.0005),
  color = c("red", "orange", "blue", "purple", "green"),
  stringsAsFactors = FALSE
)
top_rows <- seq_len(nrow(highlighted))
log2FC[top_rows] <- highlighted$lfc
pvalues[top_rows] <- highlighted$p

volcano_data <- data.frame(
  gene = rownames(expression_matrix),
  log2FC = log2FC,
  pvalue = pvalues,
  neglog10p = -log10(pvalues)
)

# Color coding: look each gene up in the highlight table; unmatched -> gray
colors <- highlighted$color[match(volcano_data$gene, highlighted$row_name)]
colors[is.na(colors)] <- "gray"

# Plot volcano, leaving one unit of headroom above the most significant point
y_top <- max(volcano_data$neglog10p) + 1
plot(
  x = volcano_data$log2FC,
  y = volcano_data$neglog10p,
  pch = 20,
  col = colors,
  xlab = "Log2 Fold Change",
  ylab = "-log10(p-value)",
  main = "Volcano Plot: TP53 Mutant vs Wild-Type\nDifferential Gene Expression",
  ylim = c(0, y_top)
)

# Add threshold lines: p = 0.05 (horizontal) and |log2FC| = 1 (vertical)
p_cutoff <- 0.05
lfc_cutoff <- 1
abline(h = -log10(p_cutoff), lty = 2, col = "black")
abline(v = c(-lfc_cutoff, lfc_cutoff), lty = 2, col = "black")

# Add gene labels to the right of each highlighted point, in matching colour
text(
  log2FC[top_rows], -log10(pvalues[top_rows]),
  labels = highlighted$symbol,
  pos = 4,
  col = highlighted$color,
  cex = 0.8
)

# Add legend
legend(
  "topright",
  legend = c(highlighted$symbol, "Other genes"),
  col = c(highlighted$color, "gray"),
  pch = 20,
  cex = 0.8
)
# ─────────────────────────────────────────
# VISUALIZATION 2: Heatmap
# ─────────────────────────────────────────
# Use the first 20 rows of the matrix (TP53 plus the next 19 genes). The rows
# are taken in matrix order; they are not ranked by any statistic.
heatmap_data <- head(expression_matrix, 20)

# Rename key genes for clarity: rows 2-5 carry placeholder names (Gene_2, ...)
# in the matrix; row 1 is already named TP53.
key_gene_symbols <- c("TP53", "MDM2", "BAX", "CDKN1A", "BCL2")
rownames(heatmap_data)[seq_along(key_gene_symbols)] <- key_gene_symbols

# Only the key genes get a row label; every other row is left blank
row_labels <- rownames(heatmap_data)
row_labels[!row_labels %in% key_gene_symbols] <- ""

# Color palette
col_palette <- colorRampPalette(c("yellow", "orange", "red"))(100)

# heatmap() draws its title at 1.5 * par("cex.main") and forwards `...` to
# image(), so a `cex.main` argument in the call never reaches the title. The
# title size is therefore set through par() and restored afterwards.
# heatmap() also sets the margins of its own panels from `margins`, so no
# par(mar = ...) call is made here (it had no effect on the heatmap and would
# leak into later plots).
old_par <- par(cex.main = 0.8)

# Plot heatmap (rows are z-scored via scale = "row")
heatmap(
  heatmap_data,
  col = col_palette,
  scale = "row",
  margins = c(6, 7),
  main = "Gene Expression: WT vs TP53 Mutants",
  xlab = "Samples",
  ylab = "Genes",
  labRow = row_labels,
  cexRow = 0.8,
  cexCol = 0.6
)

par(old_par)
# Appendix: This code was written with the assistance of claude.ai (Sonnet 4.6).
# Prompt 1: in the context of studying TP53 mutants with transcriptomics, how
#   would I simulate data with 3 mutant TP53 and 1 wild type?
# Prompt 2: What would be the best plots to represent the simulated data?
# Prompt 3: draw a volcano plot and heatmap for the data in R.
# Justification: I was unsure how to simulate the data correctly, so I used
#   Claude to point me in the right direction.