library(DESeq2)
library(ggplot2)
# Loading my dataset & metadata
counts_D2 <- read.delim("counts_D2", header = TRUE, sep = "\t", row.names = 1)
metadata_D2 <- read.csv("metadata_D2.csv", header = TRUE)
#checking the distribution of counts in my dataset
total_counts_per_gene_D2 <- rowSums(counts_D2)
hist(total_counts_per_gene_D2,
breaks=100,
main="Distribution of Total Counts per Gene - Dataset 2",
xlab="Total counts across all samples",
ylab="Number of genes")
summary(total_counts_per_gene_D2)
#ignoring low expressed gene to remove noise & get better results
print(paste("Genes before filtering:", nrow(counts_D2)))
counts_D2 <- counts_D2[which(rowSums(counts_D2) > 50), ]
print(paste("Genes after filtering:", nrow(counts_D2)))
#need to round counts for DESeq2 compatibility
counts_D2 <- round(counts_D2)
library(DESeq2)
#creating DESeq2 object with interaction design
dds_D2 <- DESeqDataSetFromMatrix(countData = counts_D2,
colData = metadata_D2,
design = ~ tissue + egfr_mutation + tissue:egfr_mutation)
dds_D2
#running the main DESeq2 analysis
dds_D2 <- DESeq(dds_D2)
#creating variance stabilized data for visualization
vsdata_D2 <- vst(dds_D2, blind = TRUE)
resultsNames(dds_D2)
vst_matrix_D2 <- assay(vsdata_D2)
head(vst_matrix_D2[1:5, 1:3])
library(reshape2)
vst_df_D2 <- as.data.frame(vst_matrix_D2)
vst_df_D2$gene_id <- rownames(vst_df_D2)
vst_final_D2 <- melt(vst_df_D2,
id.vars = "gene_id",
variable.name = "sample_id",
value.name = "expression_value")
# Check the result - should have exactly 3 columns
head(vst_final_D2)
results_WT_D2 <- results(dds_D2, name = "egfr_mutation_Wild.type_vs_Exon.19.deletion")
#SANITY CHECKS - because there's only 1 replicate for each mutation type
#mm <- model.matrix(design(dds_D2), colData(dds_D2))
#qr(mm)$rank
#ncol(mm)
# After running DESeq(dds_D2)
#res <- results(dds_D2, name = "egfr_mutation_Wild.type_vs_Exon.19.deletion")
#summary(res)
#(hist(res$pvalue, breaks=50))
#resLFC <- lfcShrink(dds_D2, coef="egfr_mutation_Wild.type_vs_Exon.19.deletion", type="apeglm")
#plotMA(resLFC)
library(ggplot2)
plotDispEsts(dds_D2)
# Save as PDF
pdf("DispersionPlot_D2.pdf", width = 6, height = 5)
plotDispEsts(dds_D2)
dev.off()
# Or save as high-resolution PNG
png("DispersionPlot_D2.png", width = 2000, height = 1600, res = 300)
plotDispEsts(dds_D2)
dev.off()
#creating an enhanced PCA plot to see sample clustering
pca_data_D2 <- plotPCA(vsdata_D2, intgroup = c("tissue", "egfr_mutation"), returnData = TRUE)
pca_plot_D2 <- ggplot(pca_data_D2, aes(PC1, PC2, color = interaction(egfr_mutation, tissue))) +
geom_point(size = 3, alpha = 0.8) +
xlab(paste0("PC1: ", round(100 * attr(pca_data_D2, "percentVar")[1]), "% variance")) +
ylab(paste0("PC2: ", round(100 * attr(pca_data_D2, "percentVar")[2]), "% variance")) +
labs(title = "PCA - Dataset 2: EGFR Mutations and Tissue Types",
color = "Group") +
theme_minimal() +
theme(legend.position = "right")
pca_plot_D2
#saving PCA plot in pdf & png
ggsave(("D2_PCA.pdf"), pca_plot_D2, width = 6, height = 5)
ggsave(("D2_PCA.png"), pca_plot_D2, width = 6, height = 5, dpi = 300)

#loading pheatmap for sample distance visualization
library(pheatmap)
#creating sample distance heatmap
sample_distances_D2 <- dist(t(assay(vsdata_D2)))
sampleDistMatrix_D2 <- as.matrix(sample_distances_D2)
pheatmap::pheatmap(sampleDistMatrix_D2,
clustering_distance_rows = sample_distances_D2,
clustering_distance_cols = sample_distances_D2,
annotation_col = as.data.frame(colData(dds_D2)[,c("tissue", "egfr_mutation")]),
main = "Sample-to-Sample Distances - Dataset 2",
filename = "SampleDistances_D2.png")
resultsNames(dds_D2)
[1] "Intercept"
[2] "tissue_Surgically.resected.tumor_vs_Patient.derived.xenograft.tumor"
[3] "egfr_mutation_Exon.21.L858R_vs_Exon.19.deletion"
[4] "egfr_mutation_Wild.type_vs_Exon.19.deletion"
[5] "tissueSurgically.resected.tumor.egfr_mutationExon.21.L858R"
[6] "tissueSurgically.resected.tumor.egfr_mutationWild.type"
summary_res_D2 <- results(dds_D2, name = "egfr_mutation_Wild.type_vs_Exon.19.deletion" )
summary(summary_res_D2)
out of 27943 with nonzero total read count
adjusted p-value < 0.1
LFC > 0 (up) : 2227, 8%
LFC < 0 (down) : 1198, 4.3%
outliers [1] : 531, 1.9%
low counts [2] : 3791, 14%
(mean count < 8)
[1] see 'cooksCutoff' argument of ?results
[2] see 'independentFiltering' argument of ?results
#NON-NEGOTIABLE for reliable results
library(apeglm)
# Shrink coefficients for all main comparisons
res_WT_vs_del19_D2 <- lfcShrink(dds_D2, coef="egfr_mutation_Wild.type_vs_Exon.19.deletion", type="apeglm") #primary comparison
res_tissue_D2 <- lfcShrink(dds_D2, coef="tissue_Surgically.resected.tumor_vs_Patient.derived.xenograft.tumor", type="apeglm")
res_21_vs_del19_D2 <- lfcShrink(dds_D2, coef="egfr_mutation_Exon.21.L858R_vs_Exon.19.deletion", type="apeglm")
# Interaction effects (tissue-specific mutation effects)
res_interaction_21_D2 <- lfcShrink(dds_D2, coef="tissueSurgically.resected.tumor.egfr_mutationExon.21.L858R", type="apeglm")
res_interaction_WT_D2 <- lfcShrink(dds_D2, coef="tissueSurgically.resected.tumor.egfr_mutationWild.type", type="apeglm")
library(AnnotationDbi)
library(org.Hs.eg.db)
#converting ENSEMBL IDs to gene symbols
gene_symbols_D2 <- mapIds(org.Hs.eg.db,
keys = rownames(dds_D2),
column = "SYMBOL",
keytype = "ENSEMBL",
multiVals = "first")
res_annotated_D2 <- results_WT_D2
res_annotated_D2$gene_symbol <- gene_symbols_D2
as.data.frame(res_annotated_D2)
#adding annotations to all my important results
resAnnotate_WT_D2 <- res_WT_vs_del19_D2
resAnnotate_WT_D2$gene_symbol <- gene_symbols_D2
resAnnotate_tissue_D2 <- res_tissue_D2
resAnnotate_tissue_D2$gene_symbol <- gene_symbols_D2
resAnnotate_21_D2 <- res_21_vs_del19_D2
resAnnotate_21_D2$gene_symbol <- gene_symbols_D2
# Summary statistics for each comparison
summary(res_tissue_D2, alpha=0.05)
summary(res_21_vs_del19_D2, alpha=0.05)
summary(res_WT_vs_del19_D2, alpha=0.05)
summary(res_interaction_21_D2, alpha=0.05)
summary(res_interaction_WT_D2, alpha=0.05)
# Extract significant genes (padj < 0.05, |LFC| > 1)
sig_tissue_D2 <- res_tissue_D2[which(res_tissue_D2$padj < 0.05 & abs(res_tissue_D2$log2FoldChange) > 1), ]
sig_21_D2 <- res_21_vs_del19_D2[which(res_21_vs_del19_D2$padj < 0.05 & abs(res_21_vs_del19_D2$log2FoldChange) > 1), ]
sig_WT_D2 <- res_WT_vs_del19_D2[which(res_WT_vs_del19_D2$padj < 0.05 & abs(res_WT_vs_del19_D2$log2FoldChange) > 1), ]
# See how many significant genes in each comparison
nrow(sig_tissue_D2) # Tissue effect genes
nrow(sig_21_D2) # L858R vs Del19 genes
nrow(sig_WT_D2) # WT vs Del19 genes (your drug targets!)
# Also complete the interaction extractions
sig_interaction_21_D2 <- res_interaction_21_D2[which(res_interaction_21_D2$padj < 0.05 & abs(res_interaction_21_D2$log2FoldChange) > 0.5), ]
sig_interaction_WT_D2 <- res_interaction_WT_D2[which(res_interaction_WT_D2$padj < 0.05 & abs(res_interaction_WT_D2$log2FoldChange) > 0.5), ]
nrow(sig_interaction_21_D2) # How many interaction genes
[1] 11
nrow(sig_interaction_WT_D2)
[1] 24
Visualization Plots
# MA plots for each comparison
plotMA(res_tissue_D2, ylim=c(-5,5), main="Tissue Effect")
plotMA(res_21_vs_del19_D2, ylim=c(-5,5), main="L858R vs Del19")
plotMA(res_WT_vs_del19_D2, ylim=c(-5,5), main="Wild Type vs Del19")
pdf(("MA_res_tissue_D2.pdf"), width = 6, height = 5)
plotMA(res_tissue_D2, ylim=c(-5,5), main="Tissue Effect")
dev.off()
png(("MA_res_tissue_D2.png"), width = 2000, height = 1600, res = 300)
plotMA(res_tissue_D2, ylim=c(-5,5), main="Tissue Effect")
dev.off()
pdf(("MA_res_21_vs_del19_D2.pdf"), width = 6, height = 5)
plotMA(res_21_vs_del19_D2, ylim=c(-5,5), main="L858R vs Del19")
dev.off()
png(("MA_res_21_vs_del19_D2.png"), width = 2000, height = 1600, res = 300)
plotMA(res_21_vs_del19_D2, ylim=c(-5,5), main="L858R vs Del19")
dev.off()
pdf(("MA_res_WT_vs_del19_D2.pdf"), width = 6, height = 5)
plotMA(res_WT_vs_del19_D2, ylim=c(-5,5), main="Wild Type vs Del19")
dev.off()
png(("MA_res_WT_vs_del19_D2.png"), width = 2000, height = 1600, res = 300)
plotMA(res_WT_vs_del19_D2, ylim=c(-5,5), main="Wild Type vs Del19")
dev.off()
# Volcano plots
library(EnhancedVolcano)
top_genes_WT_D2 <- rownames(resAnnotate_WT_D2[order(resAnnotate_WT_D2$padj), ][1:50, ])
volcano_WT_D2 <- EnhancedVolcano(resAnnotate_WT_D2,
lab = resAnnotate_WT_D2$gene_symbol,
x = 'log2FoldChange',
y = 'padj',
selectLab = resAnnotate_WT_D2$gene_symbol[rownames(resAnnotate_WT_D2) %in% top_genes_WT_D2],
title = 'Wild Type vs Exon 19 Deletion (Drug Resistance)')
print(volcano_WT_D2)
ggsave(("D2_Volcano.pdf"), volcano_WT_D2, width = 6, height = 5)
ggsave(("D2_Volcano.png"), volcano_WT_D2, width = 6, height = 5, dpi = 300)
# Heatmap of top differentially expressed genes - with values & genes
library(pheatmap)
keys <- rownames(resAnnotate_WT_D2)
values <- resAnnotate_WT_D2$gene_symbol
l <- list()
for (i in 1:length(keys)){
l[keys[i]] <- values[i]
}
l
library(dplyr)
top_WT_heatmap <- resAnnotate_WT_D2[!is.na(resAnnotate_WT_D2$padj) &
!is.na(resAnnotate_WT_D2$baseMean) &
!is.na(resAnnotate_WT_D2$log2FoldChange) &
(resAnnotate_WT_D2$padj < 0.05) &
(resAnnotate_WT_D2$baseMean > 50) &
(abs(resAnnotate_WT_D2$log2FoldChange) > 2),]
top_WT_heatmap <- top_WT_heatmap %>%
as.data.frame() %>%
head(50)
top_WT_heatmap
top_WT_heatmap <- top_WT_heatmap[order(top_WT_heatmap$log2FoldChange, decreasing = TRUE),]
#get normalized count data from vst object
mat <- assay(vsdata_D2)[rownames(top_WT_heatmap), metadata_D2$Run]
colnames(mat) <- metadata_D2$Run
base_mean <- rowMeans(mat)
mat.scaled <- t(apply(mat, 1, scale)) #center and scale each column (Z-score) then transpose
colnames(mat.scaled)<-colnames(mat)
num_keep <- 25
#1 to num_keep len-num_keep to len
rows_keep <- c(seq(1:num_keep), seq((nrow(mat.scaled)-num_keep), nrow(mat.scaled)) )
l2_val <- as.matrix(top_WT_heatmap[rows_keep,]$log2FoldChange) #getting log2 value for each gene we are keeping
colnames(l2_val)<-"logFC"
mean <- as.matrix(top_WT_heatmap[rows_keep,]$baseMean) #getting mean value for each gene we are keeping
colnames(mean)<-"AveExpr"
library(ComplexHeatmap)
library(RColorBrewer)
library(circlize)
#maps values between b/w/r for min and max l2 values
col_logFC <- colorRamp2(c(min(l2_val),0, max(l2_val)), c("blue", "white", "red"))
#maps between 0% quantile, and 75% quantile of mean values --- 0, 25, 50, 75, 100
col_AveExpr <- colorRamp2(c(quantile(mean)[1], quantile(mean)[4]), c("white", "red"))
col_zscore <- colorRamp2(c(-2, 0, 2), c("#4575b4", "white", "#d73027"))
ha <- HeatmapAnnotation(summary = anno_summary(gp = gpar(fill = 2),
height = unit(2, "cm")))
h1 <- Heatmap(mat.scaled[rows_keep,],
cluster_rows = TRUE, # Enable clustering for better grouping
column_labels = colnames(mat.scaled),
name="Z-score",
col = col_zscore, # Add color scheme
cluster_columns = TRUE,
show_row_names = FALSE, # Clean up row names
column_names_gp = gpar(fontsize = 9),
heatmap_legend_param = list(title_gp = gpar(fontsize = 10)))
h2 <- Heatmap(l2_val,
row_labels = top_WT_heatmap$gene_symbol_D2[rows_keep], # Use gene symbols
cluster_rows = FALSE,
name="logFC",
col = col_logFC,
width = unit(15, "mm"), # Fixed width
show_column_names = TRUE,
row_names_gp = gpar(fontsize = 8), # Smaller font for gene names
column_names_gp = gpar(fontsize = 9),
cell_fun = function(j, i, x, y, w, h, col) {
grid.text(round(l2_val[i, j], 1), x, y, gp = gpar(fontsize = 7)) # Smaller text
})
h3 <- Heatmap(mean,
row_labels = top_WT_heatmap$gene_symbol[rows_keep],
cluster_rows = FALSE,
name = "AveExpr",
col = col_AveExpr,
width = unit(15, "mm"), # Fixed width
show_row_names = TRUE, # Don't repeat gene names
show_column_names = TRUE,
column_names_gp = gpar(fontsize = 9),
row_names_gp = gpar(fontsize = 7),
cell_fun = function(j, i, x, y, w, h, col) {
grid.text(round(mean[i, j], 0), x, y, gp = gpar(fontsize = 7)) # Round to whole numbers
})
h_D2<-h1+h2+h3
h_D2
pdf("AnnotatedHeatmap_D2.pdf", width = 10, height = 6)
draw(h_D2)
dev.off()
png("AnnotatedHeatmap_D2.png", width = 3000, height = 2000, res = 300)
draw(h_D2)
dev.off()
#heatmap - for understanding experimental design
top_genes_heatmap_WT_D2 <- head(order(res_WT_vs_del19_D2$log2FoldChange, decreasing = TRUE), 50)
png("Top50_Heatmap_D2.png", width = 2000, height = 1600, res = 300)
pheatmap::pheatmap(assay(vsdata_D2)[top_genes_heatmap_WT_D2,],
annotation_col = as.data.frame(colData(dds_D2)[,c("tissue", "egfr_mutation")]),
scale = "row",
show_rownames = FALSE,
main = "Top 50 DEGs: WT vs Exon 19 Deletion",
cluster_cols = TRUE)
dev.off()
Pathway Analysis
#loading pathway analysis packages
library(DOSE)
library(enrichplot)
library(clusterProfiler)
# I'm separating upregulated and downregulated genes
sig_genes_up_WT_D2 <- rownames(sig_WT_D2)[sig_WT_D2$log2FoldChange > 1]
sig_genes_down_WT_D2 <- rownames(sig_WT_D2)[sig_WT_D2$log2FoldChange < -1]
print(paste("Upregulated:", length(sig_genes_up_WT_D2)))
print(paste("Downregulated:", length(sig_genes_down_WT_D2)))
# I'm converting ENSEMBL to ENTREZ for KEGG analysis
entrez_genes_up_WT_D2 <- mapIds(org.Hs.eg.db, sig_genes_up_WT_D2, "ENTREZID", "ENSEMBL")
entrez_genes_down_WT_D2 <- mapIds(org.Hs.eg.db, sig_genes_down_WT_D2, "ENTREZID", "ENSEMBL")
entrez_genes_up_WT_D2 <- entrez_genes_up_WT_D2[!is.na(entrez_genes_up_WT_D2)]
entrez_genes_down_WT_D2 <- entrez_genes_down_WT_D2[!is.na(entrez_genes_down_WT_D2)]
# I'm running GO Analysis
go_up_WT_D2 <- enrichGO(gene = sig_genes_up_WT_D2,
OrgDb = org.Hs.eg.db,
keyType = "ENSEMBL",
ont = "ALL", # BP, CC, MF
pAdjustMethod = "BH",
pvalueCutoff = 0.05)
as.data.frame(go_up_WT_D2)
top_go_up_D2 <- go_up_WT_D2@result[1:10, ]
as.data.frame(top_go_up_D2)
p1_D2 <- plot(barplot(go_up_WT_D2, showCategory = 15)+labs(title = "Upregulated GO Terms (WT vs Del19)"))
ggsave("GO_WT_Upregulated_D2.pdf", p1_D2, width = 7, height = 6)
ggsave("GO_WT_Upregulated_D2.png", p1_D2, width = 7, height = 6, dpi = 300)
go_down_WT_D2 <- enrichGO(gene = sig_genes_down_WT_D2,
OrgDb = org.Hs.eg.db,
keyType = "ENSEMBL",
ont = "ALL",
pAdjustMethod = "BH",
pvalueCutoff = 0.05)
as.data.frame(go_down_WT_D2)
top_go_down_D2 <- go_down_WT_D2@result[1:10, ]
as.data.frame(top_go_down_D2)
p2_D2 <- plot(barplot(go_down_WT_D2, showCategory = 15)+labs(title = "Downregulated GO Terms (WT vs Del19)"))
ggsave("GO_WT_Downregulated_D2.pdf", p2_D2, width = 7, height = 6)
ggsave("GO_WT_Downregulated_D2.png", p2_D2, width = 7, height = 6, dpi = 300)
#running KEGG Pathway Analysis
kegg_up_WT_D2 <- enrichKEGG(gene = entrez_genes_up_WT_D2,
organism = 'hsa',
pvalueCutoff = 0.05,
pAdjustMethod = "BH")
as.data.frame(kegg_up_WT_D2)
top_kegg_up_D2 <- kegg_up_WT_D2@result[1:10, ]
as.data.frame(top_kegg_up_D2)
p3_D2 <- plot(barplot(kegg_up_WT_D2, showCategory = 15)+labs(title = "Upregulated KEGG Pathways (WT vs Del19)"))
ggsave("KEGG_WT_Upregulated_D2.pdf", p3_D2, width = 7, height = 6)
ggsave("KEGG_WT_Upregulated_D2.png", p3_D2, width = 7, height = 6, dpi = 300)
kegg_down_WT_D2 <- enrichKEGG(gene = entrez_genes_down_WT_D2,
organism = 'hsa',
pvalueCutoff = 0.05,
pAdjustMethod = "BH")
as.data.frame(kegg_down_WT_D2)
top_kegg_up_D2 <- kegg_up_WT_D2@result[1:10, ]
as.data.frame(top_kegg_up_D2)
p4_D2 <- plot(barplot(kegg_down_WT_D2, showCategory = 15)+labs(title = "Downregulated KEGG Pathways (WT vs Del19)"))
ggsave("KEGG_WT_Downregulated_D2.pdf", p4_D2, width = 7, height = 6)
ggsave("KEGG_WT_Downregulated_D2.png", p4_D2, width = 7, height = 6, dpi = 300)
#loading GSEA packages
library(fgsea)
library(msigdbr)
#preparing ranked gene list for GSEA
gene_ranks_WT_D2 <- res_WT_vs_del19_D2$log2FoldChange
names(gene_ranks_WT_D2) <- rownames(res_WT_vs_del19_D2)
gene_ranks_WT_D2 <- gene_ranks_WT_D2[!is.na(gene_ranks_WT_D2)]
gene_ranks_WT_D2 <- sort(gene_ranks_WT_D2, decreasing = TRUE)
#getting Hallmark pathways for GSEA
gse_D2 <- gseGO(gene_ranks_WT_D2,
ont = "BP",
keyType = "ENSEMBL",
OrgDb = "org.Hs.eg.db",
eps = 1e-300)
as.data.frame(gse_D2)
gse_plot_D2 <- gseaplot(gse_D2, geneSetID = 1)
gse_plot_D2
ggsave("GSEA_pathway_D2.pdf", gse_plot_D2, width = 7, height = 6)
ggsave("GSEA_pathway_D2.png", gse_plot_D2, width = 7, height = 6, dpi = 300)

hallmark_sets_D2 <- msigdbr(species = "Homo sapiens", category = "H")
hallmark_list_D2 <- split(hallmark_sets_D2$ensembl_gene, hallmark_sets_D2$gs_name)
#running GSEA analysis
fgsea_results_WT_D2 <- fgsea(pathways = hallmark_list_D2,
stats = gene_ranks_WT_D2,
minSize = 15,
maxSize = 500)
print(head(fgsea_results_WT_D2[order(pval), ], 10))
#plotting top 3 pathways
print("Top 3 GSEA pathways:")
[1] "Top 3 GSEA pathways:"
top_pathways_D2 <- head(fgsea_results_WT_D2[order(pval), ], 3)
print(top_pathways_D2[, c("pathway", "pval", "NES")])
for(i in 1:3) {
pathway_name <- top_pathways_D2$pathway[i]
pathways_D2 <- plotEnrichment(hallmark_list_D2[[pathway_name]], gene_ranks_WT_D2) +
labs(title = paste("Pathway", i, ":", pathway_name))
print(pathways_D2)
ggsave("Top3_GSEA_D2.pdf", pathways_D2, width = 7, height = 6)
ggsave("Top3_GSEA_D2.png", pathways_D2, width = 7, height = 6, dpi = 300)
}



# Summary statistics table for D2
summary_stats_D2 <- data.frame(
Metric = c("Total Samples", "Genes Analyzed", "Significant DEGs - WT vs Deletion 19", "Upregulated - WT vs Deletion 19", "Downregulated - WT vs Deletion 19"),
Value = c(ncol(counts_D2),
nrow(dds_D2),
nrow(sig_WT_D2),
length(sig_genes_up_WT_D2),
length(sig_genes_down_WT_D2))
)
summary_stats_D2
# Top genes table
top_genes_table_D2 <- data.frame(
Gene_Symbol = gene_symbols_D2[rownames(head(sig_WT_D2[order(sig_WT_D2$padj),], 20))],
ENSEMBL_ID = rownames(head(sig_WT_D2[order(sig_WT_D2$padj),], 20)),
Log2FC = round(head(sig_WT_D2[order(sig_WT_D2$padj),], 20)$log2FoldChange, 3),
Padj = format(head(sig_WT_D2[order(sig_WT_D2$padj),], 20)$padj, scientific = TRUE, digits = 3),
Direction = ifelse(head(sig_WT_D2[order(sig_WT_D2$padj),], 20)$log2FoldChange > 0, "Up", "Down")
)
print("Top 20 Significant Genes: WT vs Deletion 19")
print(top_genes_table_D2)
# Export main results
write.csv(as.data.frame(res_WT_vs_del19_D2), "D2_WT_annotated_results.csv", row.names = TRUE)
write.csv(sig_WT_D2, "D2_WT_significant_genes.csv", row.names = TRUE)
write.csv(summary_stats_D2, "D2_summary_statistics.csv", row.names = FALSE)
write.csv(top_genes_table_D2, "D2_top20_genes.csv", row.names = FALSE)
# Export normalized data
write.csv(res_annotated_D2, "D2_results.csv")
write.csv(counts(dds_D2, normalized=TRUE), "D2_normalized_counts.csv", row.names = TRUE)
write.csv(vst_final_D2, "D2_vst_data.csv", row.names = TRUE)
# Export pathway results
if(nrow(go_up_WT_D2@result) > 0) {
write.csv(go_up_WT_D2@result, "D2_GO_upregulated.csv", row.names = FALSE)
}
if(nrow(go_down_WT_D2@result) > 0) {
write.csv(go_down_WT_D2@result, "D2_GO_downregulated.csv", row.names = FALSE)
}
if(nrow(kegg_up_WT_D2@result) > 0) {
write.csv(kegg_up_WT_D2@result, "D2_KEGG_upregulated.csv", row.names = FALSE)
}
if(nrow(kegg_down_WT_D2@result) > 0) {
write.csv(kegg_down_WT_D2@result, "D2_KEGG_downregulated.csv", row.names = FALSE)
}
# Remove the leadingEdge column (which contains lists)
fgsea_results_clean_D2 <- fgsea_results_WT_D2 %>%
select(-leadingEdge)
write.csv(fgsea_results_clean_D2, "D2_GSEA_hallmarks.csv", row.names = FALSE)
---
title: "Dataset 2"
output: html_notebook
---

```{r}
library(DESeq2)
library(ggplot2)
```

```{r}
# Loading my dataset & metadata
counts_D2 <- read.delim("counts_D2", header = TRUE, sep = "\t", row.names = 1)
metadata_D2 <- read.csv("metadata_D2.csv", header = TRUE)

```

```{r}

#checking the distribution of counts in my dataset
total_counts_per_gene_D2 <- rowSums(counts_D2)
hist(total_counts_per_gene_D2, 
     breaks=100, 
     main="Distribution of Total Counts per Gene - Dataset 2", 
     xlab="Total counts across all samples",
     ylab="Number of genes")
summary(total_counts_per_gene_D2)
```

```{r}
#ignoring low expressed gene to remove noise & get better results

print(paste("Genes before filtering:", nrow(counts_D2)))
counts_D2 <- counts_D2[which(rowSums(counts_D2) > 50), ]
print(paste("Genes after filtering:", nrow(counts_D2)))
```

```{r}
#need to round counts for DESeq2 compatibility
counts_D2 <- round(counts_D2)

library(DESeq2)

#creating DESeq2 object with interaction design
dds_D2 <- DESeqDataSetFromMatrix(countData = counts_D2,
                                colData = metadata_D2,        
                                design = ~ tissue + egfr_mutation + tissue:egfr_mutation)
dds_D2

#running the main DESeq2 analysis
dds_D2 <- DESeq(dds_D2)

#creating variance stabilized data for visualization
vsdata_D2 <- vst(dds_D2, blind = TRUE)
resultsNames(dds_D2)

vst_matrix_D2 <- assay(vsdata_D2)

head(vst_matrix_D2[1:5, 1:3]) 

library(reshape2)

vst_df_D2 <- as.data.frame(vst_matrix_D2)
vst_df_D2$gene_id <- rownames(vst_df_D2)

vst_final_D2 <- melt(vst_df_D2, 
                 id.vars = "gene_id", 
                 variable.name = "sample_id", 
                 value.name = "expression_value")

# Check the result - should have exactly 3 columns
head(vst_final_D2)

results_WT_D2 <- results(dds_D2, name = "egfr_mutation_Wild.type_vs_Exon.19.deletion")

#SANITY CHECKS - because there's only 1 replicate for each mutation type
#mm <- model.matrix(design(dds_D2), colData(dds_D2))
#qr(mm)$rank
#ncol(mm)

# After running DESeq(dds_D2)
#res <- results(dds_D2, name = "egfr_mutation_Wild.type_vs_Exon.19.deletion")
#summary(res)

#(hist(res$pvalue, breaks=50))

#resLFC <- lfcShrink(dds_D2, coef="egfr_mutation_Wild.type_vs_Exon.19.deletion", type="apeglm")
#plotMA(resLFC)

```

```{r}
library(ggplot2)
plotDispEsts(dds_D2)

# Save as PDF
pdf("DispersionPlot_D2.pdf", width = 6, height = 5)
plotDispEsts(dds_D2)
dev.off()

# Or save as high-resolution PNG
png("DispersionPlot_D2.png", width = 2000, height = 1600, res = 300)
plotDispEsts(dds_D2)
dev.off()
```

```{r}
#creating an enhanced PCA plot to see sample clustering
pca_data_D2 <- plotPCA(vsdata_D2, intgroup = c("tissue", "egfr_mutation"), returnData = TRUE)

pca_plot_D2 <- ggplot(pca_data_D2, aes(PC1, PC2, color = interaction(egfr_mutation, tissue))) +
  geom_point(size = 3, alpha = 0.8) +
  xlab(paste0("PC1: ", round(100 * attr(pca_data_D2, "percentVar")[1]), "% variance")) +
  ylab(paste0("PC2: ", round(100 * attr(pca_data_D2, "percentVar")[2]), "% variance")) +
  labs(title = "PCA - Dataset 2: EGFR Mutations and Tissue Types",
       color = "Group") +
  theme_minimal() +
  theme(legend.position = "right")
pca_plot_D2

#saving PCA plot in pdf & png 
ggsave(("D2_PCA.pdf"), pca_plot_D2, width = 6, height = 5)
ggsave(("D2_PCA.png"), pca_plot_D2, width = 6, height = 5, dpi = 300)
```

```{r}
#loading pheatmap for sample distance visualization
library(pheatmap)

#creating sample distance heatmap
sample_distances_D2 <- dist(t(assay(vsdata_D2)))
sampleDistMatrix_D2 <- as.matrix(sample_distances_D2)
pheatmap::pheatmap(sampleDistMatrix_D2,
         clustering_distance_rows = sample_distances_D2,
         clustering_distance_cols = sample_distances_D2,
         annotation_col = as.data.frame(colData(dds_D2)[,c("tissue", "egfr_mutation")]),
         main = "Sample-to-Sample Distances - Dataset 2",
         filename = "SampleDistances_D2.png")
```


```{r}
resultsNames(dds_D2)
summary_res_D2 <- results(dds_D2, name = "egfr_mutation_Wild.type_vs_Exon.19.deletion" )
summary(summary_res_D2)
```

```{r}
#NON-NEGOTIABLE for reliable results
library(apeglm)

# Shrink coefficients for all main comparisons

res_WT_vs_del19_D2 <- lfcShrink(dds_D2, coef="egfr_mutation_Wild.type_vs_Exon.19.deletion", type="apeglm") #primary comparison
res_tissue_D2 <- lfcShrink(dds_D2, coef="tissue_Surgically.resected.tumor_vs_Patient.derived.xenograft.tumor", type="apeglm")
res_21_vs_del19_D2 <- lfcShrink(dds_D2, coef="egfr_mutation_Exon.21.L858R_vs_Exon.19.deletion", type="apeglm")

# Interaction effects (tissue-specific mutation effects)
res_interaction_21_D2 <- lfcShrink(dds_D2, coef="tissueSurgically.resected.tumor.egfr_mutationExon.21.L858R", type="apeglm")
res_interaction_WT_D2 <- lfcShrink(dds_D2, coef="tissueSurgically.resected.tumor.egfr_mutationWild.type", type="apeglm")
```


```{r}
library(AnnotationDbi)
library(org.Hs.eg.db)

#converting ENSEMBL IDs to gene symbols
gene_symbols_D2 <- mapIds(org.Hs.eg.db,
                         keys = rownames(dds_D2),
                         column = "SYMBOL",
                         keytype = "ENSEMBL",
                         multiVals = "first")
res_annotated_D2 <- results_WT_D2
res_annotated_D2$gene_symbol <- gene_symbols_D2
as.data.frame(res_annotated_D2)

#adding annotations to all my important results
resAnnotate_WT_D2 <- res_WT_vs_del19_D2
resAnnotate_WT_D2$gene_symbol <- gene_symbols_D2

resAnnotate_tissue_D2 <- res_tissue_D2
resAnnotate_tissue_D2$gene_symbol <- gene_symbols_D2

resAnnotate_21_D2 <- res_21_vs_del19_D2
resAnnotate_21_D2$gene_symbol <- gene_symbols_D2

```


```{r}
# Summary statistics for each comparison
summary(res_tissue_D2, alpha=0.05)
summary(res_21_vs_del19_D2, alpha=0.05)
summary(res_WT_vs_del19_D2, alpha=0.05)
summary(res_interaction_21_D2, alpha=0.05)
summary(res_interaction_WT_D2, alpha=0.05)

# Extract significant genes (padj < 0.05, |LFC| > 1)
sig_tissue_D2 <- res_tissue_D2[which(res_tissue_D2$padj < 0.05 & abs(res_tissue_D2$log2FoldChange) > 1), ]
sig_21_D2 <- res_21_vs_del19_D2[which(res_21_vs_del19_D2$padj < 0.05 & abs(res_21_vs_del19_D2$log2FoldChange) > 1), ]
sig_WT_D2 <- res_WT_vs_del19_D2[which(res_WT_vs_del19_D2$padj < 0.05 & abs(res_WT_vs_del19_D2$log2FoldChange) > 1), ]

# See how many significant genes in each comparison
nrow(sig_tissue_D2)    # Tissue effect genes
nrow(sig_21_D2)     # L858R vs Del19 genes  
nrow(sig_WT_D2)        # WT vs Del19 genes (your drug targets!)



```

```{r}
# Also complete the interaction extractions
sig_interaction_21_D2 <- res_interaction_21_D2[which(res_interaction_21_D2$padj < 0.05 & abs(res_interaction_21_D2$log2FoldChange) > 0.5), ]
sig_interaction_WT_D2 <- res_interaction_WT_D2[which(res_interaction_WT_D2$padj < 0.05 & abs(res_interaction_WT_D2$log2FoldChange) > 0.5), ]

nrow(sig_interaction_21_D2)  # How many interaction genes
nrow(sig_interaction_WT_D2)
```
Visualization Plots
```{r}
# MA plots for each comparison
plotMA(res_tissue_D2, ylim=c(-5,5), main="Tissue Effect")
plotMA(res_21_vs_del19_D2, ylim=c(-5,5), main="L858R vs Del19")
plotMA(res_WT_vs_del19_D2, ylim=c(-5,5), main="Wild Type vs Del19")

pdf(("MA_res_tissue_D2.pdf"), width = 6, height = 5)
plotMA(res_tissue_D2, ylim=c(-5,5), main="Tissue Effect")
dev.off()

png(("MA_res_tissue_D2.png"), width = 2000, height = 1600, res = 300)
plotMA(res_tissue_D2, ylim=c(-5,5), main="Tissue Effect")
dev.off()

pdf(("MA_res_21_vs_del19_D2.pdf"), width = 6, height = 5)
plotMA(res_21_vs_del19_D2, ylim=c(-5,5), main="L858R vs Del19")
dev.off()

png(("MA_res_21_vs_del19_D2.png"), width = 2000, height = 1600, res = 300)
plotMA(res_21_vs_del19_D2, ylim=c(-5,5), main="L858R vs Del19")
dev.off()

pdf(("MA_res_WT_vs_del19_D2.pdf"), width = 6, height = 5)
plotMA(res_WT_vs_del19_D2, ylim=c(-5,5), main="Wild Type vs Del19")
dev.off()

png(("MA_res_WT_vs_del19_D2.png"), width = 2000, height = 1600, res = 300)
plotMA(res_WT_vs_del19_D2, ylim=c(-5,5), main="Wild Type vs Del19")
dev.off()

# Volcano plots
library(EnhancedVolcano)

top_genes_WT_D2 <- rownames(resAnnotate_WT_D2[order(resAnnotate_WT_D2$padj), ][1:50, ])

volcano_WT_D2 <- EnhancedVolcano(resAnnotate_WT_D2,
                lab = resAnnotate_WT_D2$gene_symbol,
                x = 'log2FoldChange',
                y = 'padj',
                selectLab = resAnnotate_WT_D2$gene_symbol[rownames(resAnnotate_WT_D2) %in% top_genes_WT_D2],
                title = 'Wild Type vs Exon 19 Deletion (Drug Resistance)')
print(volcano_WT_D2)

ggsave(("D2_Volcano.pdf"), volcano_WT_D2, width = 6, height = 5)
ggsave(("D2_Volcano.png"), volcano_WT_D2, width = 6, height = 5, dpi = 300)

# Heatmap of top differentially expressed genes - with values & genes
library(pheatmap)

keys <- rownames(resAnnotate_WT_D2)
values <- resAnnotate_WT_D2$gene_symbol

l <- list()
for (i in 1:length(keys)){
  l[keys[i]] <- values[i]
}
l
library(dplyr)

top_WT_heatmap <- resAnnotate_WT_D2[!is.na(resAnnotate_WT_D2$padj) & 
                                    !is.na(resAnnotate_WT_D2$baseMean) & 
                                    !is.na(resAnnotate_WT_D2$log2FoldChange) &
                                    (resAnnotate_WT_D2$padj < 0.05) & 
                                    (resAnnotate_WT_D2$baseMean > 50) & 
                                    (abs(resAnnotate_WT_D2$log2FoldChange) > 2),]
top_WT_heatmap <- top_WT_heatmap %>%
  as.data.frame() %>%
  head(50)  
top_WT_heatmap
top_WT_heatmap <- top_WT_heatmap[order(top_WT_heatmap$log2FoldChange, decreasing = TRUE),]

#get normalized count data from vst object
mat <- assay(vsdata_D2)[rownames(top_WT_heatmap), metadata_D2$Run]
colnames(mat) <- metadata_D2$Run
base_mean <- rowMeans(mat)
mat.scaled <- t(apply(mat, 1, scale)) #center and scale each column (Z-score) then transpose
colnames(mat.scaled)<-colnames(mat)

num_keep <- 25
#1 to num_keep len-num_keep to len
rows_keep <- c(seq(1:num_keep), seq((nrow(mat.scaled)-num_keep), nrow(mat.scaled)) )

l2_val <- as.matrix(top_WT_heatmap[rows_keep,]$log2FoldChange) #getting log2 value for each gene we are keeping
colnames(l2_val)<-"logFC"

mean <- as.matrix(top_WT_heatmap[rows_keep,]$baseMean) #getting mean value for each gene we are keeping
colnames(mean)<-"AveExpr"

library(ComplexHeatmap)
library(RColorBrewer)
library(circlize)

#maps values between b/w/r for min and max l2 values
col_logFC <- colorRamp2(c(min(l2_val),0, max(l2_val)), c("blue", "white", "red")) 

#maps between 0% quantile, and 75% quantile of mean values --- 0, 25, 50, 75, 100
col_AveExpr <- colorRamp2(c(quantile(mean)[1], quantile(mean)[4]), c("white", "red"))
col_zscore <- colorRamp2(c(-2, 0, 2), c("#4575b4", "white", "#d73027"))

ha <- HeatmapAnnotation(summary = anno_summary(gp = gpar(fill = 2), 
                                               height = unit(2, "cm")))

h1 <- Heatmap(mat.scaled[rows_keep,], 
              cluster_rows = TRUE,  # Enable clustering for better grouping
              column_labels = colnames(mat.scaled), 
              name="Z-score",
              col = col_zscore,  # Add color scheme
              cluster_columns = TRUE,
              show_row_names = FALSE,  # Clean up row names
              column_names_gp = gpar(fontsize = 9),
              heatmap_legend_param = list(title_gp = gpar(fontsize = 10)))

h2 <- Heatmap(l2_val, 
              row_labels = top_WT_heatmap$gene_symbol_D2[rows_keep],  # Use gene symbols
              cluster_rows = FALSE, 
              name="logFC", 
              col = col_logFC,
              width = unit(15, "mm"),  # Fixed width
              show_column_names = TRUE,
              row_names_gp = gpar(fontsize = 8),  # Smaller font for gene names
              column_names_gp = gpar(fontsize = 9),
              cell_fun = function(j, i, x, y, w, h, col) { 
                grid.text(round(l2_val[i, j], 1), x, y, gp = gpar(fontsize = 7))  # Smaller text
              })
h3 <- Heatmap(mean, 
              row_labels = top_WT_heatmap$gene_symbol[rows_keep],
              cluster_rows = FALSE, 
              name = "AveExpr", 
              col = col_AveExpr,
              width = unit(15, "mm"),  # Fixed width
              show_row_names = TRUE,  # Don't repeat gene names
              show_column_names = TRUE,
              column_names_gp = gpar(fontsize = 9),
              row_names_gp = gpar(fontsize = 7),
              cell_fun = function(j, i, x, y, w, h, col) { 
                grid.text(round(mean[i, j], 0), x, y, gp = gpar(fontsize = 7))  # Round to whole numbers
              })

h_D2<-h1+h2+h3
h_D2

pdf("AnnotatedHeatmap_D2.pdf", width = 10, height = 6)
draw(h_D2)
dev.off()

png("AnnotatedHeatmap_D2.png", width = 3000, height = 2000, res = 300)
draw(h_D2)
dev.off()

```
```{r}
#heatmap - for understanding experimental design
top_genes_heatmap_WT_D2 <- head(order(res_WT_vs_del19_D2$log2FoldChange, decreasing = TRUE), 50)
png("Top50_Heatmap_D2.png", width = 2000, height = 1600, res = 300)
pheatmap::pheatmap(assay(vsdata_D2)[top_genes_heatmap_WT_D2,], 
         annotation_col = as.data.frame(colData(dds_D2)[,c("tissue", "egfr_mutation")]), 
         scale = "row", 
         show_rownames = FALSE,
         main = "Top 50 DEGs: WT vs Exon 19 Deletion",
         cluster_cols = TRUE)
dev.off()
```


Pathway Analysis
```{r}
  
  #loading pathway analysis packages
  library(DOSE)
  library(enrichplot)
  library(clusterProfiler)
  
  # I'm separating upregulated and downregulated genes
  sig_genes_up_WT_D2 <- rownames(sig_WT_D2)[sig_WT_D2$log2FoldChange > 1]
  sig_genes_down_WT_D2 <- rownames(sig_WT_D2)[sig_WT_D2$log2FoldChange < -1]
  
  print(paste("Upregulated:", length(sig_genes_up_WT_D2)))
print(paste("Downregulated:", length(sig_genes_down_WT_D2)))
  
  # I'm converting ENSEMBL to ENTREZ for KEGG analysis
  entrez_genes_up_WT_D2 <- mapIds(org.Hs.eg.db, sig_genes_up_WT_D2, "ENTREZID", "ENSEMBL")
  entrez_genes_down_WT_D2 <- mapIds(org.Hs.eg.db, sig_genes_down_WT_D2, "ENTREZID", "ENSEMBL")
  entrez_genes_up_WT_D2 <- entrez_genes_up_WT_D2[!is.na(entrez_genes_up_WT_D2)]
  entrez_genes_down_WT_D2 <- entrez_genes_down_WT_D2[!is.na(entrez_genes_down_WT_D2)]
  
  # I'm running GO Analysis
  go_up_WT_D2 <- enrichGO(gene = sig_genes_up_WT_D2,
                          OrgDb = org.Hs.eg.db,
                          keyType = "ENSEMBL",
                          ont = "ALL",  # BP, CC, MF
                          pAdjustMethod = "BH",
                          pvalueCutoff = 0.05)
  as.data.frame(go_up_WT_D2)
  top_go_up_D2 <- go_up_WT_D2@result[1:10, ]
as.data.frame(top_go_up_D2)

  p1_D2 <- plot(barplot(go_up_WT_D2, showCategory = 15)+labs(title = "Upregulated GO Terms (WT vs Del19)"))
  ggsave("GO_WT_Upregulated_D2.pdf", p1_D2, width = 7, height = 6)
  ggsave("GO_WT_Upregulated_D2.png", p1_D2, width = 7, height = 6, dpi = 300)
  
  go_down_WT_D2 <- enrichGO(gene = sig_genes_down_WT_D2,
                            OrgDb = org.Hs.eg.db,
                            keyType = "ENSEMBL",
                            ont = "ALL",
                            pAdjustMethod = "BH",
                            pvalueCutoff = 0.05)
  as.data.frame(go_down_WT_D2)
  top_go_down_D2 <- go_down_WT_D2@result[1:10, ]
as.data.frame(top_go_down_D2)

  p2_D2 <- plot(barplot(go_down_WT_D2, showCategory = 15)+labs(title = "Downregulated GO Terms (WT vs Del19)"))
  ggsave("GO_WT_Downregulated_D2.pdf", p2_D2, width = 7, height = 6)
  ggsave("GO_WT_Downregulated_D2.png", p2_D2, width = 7, height = 6, dpi = 300)
  
  #running KEGG Pathway Analysis
  kegg_up_WT_D2 <- enrichKEGG(gene = entrez_genes_up_WT_D2,
                              organism = 'hsa',
                              pvalueCutoff = 0.05,
                              pAdjustMethod = "BH")
  as.data.frame(kegg_up_WT_D2)
top_kegg_up_D2 <- kegg_up_WT_D2@result[1:10, ]
as.data.frame(top_kegg_up_D2)

  p3_D2 <- plot(barplot(kegg_up_WT_D2, showCategory = 15)+labs(title = "Upregulated KEGG Pathways (WT vs Del19)"))
  ggsave("KEGG_WT_Upregulated_D2.pdf", p3_D2, width = 7, height = 6)
  ggsave("KEGG_WT_Upregulated_D2.png", p3_D2, width = 7, height = 6, dpi = 300)
  
  kegg_down_WT_D2 <- enrichKEGG(gene = entrez_genes_down_WT_D2,
                                organism = 'hsa',
                                pvalueCutoff = 0.05,
                                pAdjustMethod = "BH")
  
  as.data.frame(kegg_down_WT_D2)
top_kegg_up_D2 <- kegg_up_WT_D2@result[1:10, ]
as.data.frame(top_kegg_up_D2)

 p4_D2 <- plot(barplot(kegg_down_WT_D2, showCategory = 15)+labs(title = "Downregulated KEGG Pathways (WT vs Del19)"))
  ggsave("KEGG_WT_Downregulated_D2.pdf", p4_D2, width = 7, height = 6)
  ggsave("KEGG_WT_Downregulated_D2.png", p4_D2, width = 7, height = 6, dpi = 300)
```



```{r}
#loading GSEA packages 
library(fgsea)
library(msigdbr)


#preparing ranked gene list for GSEA 
gene_ranks_WT_D2 <- res_WT_vs_del19_D2$log2FoldChange
names(gene_ranks_WT_D2) <- rownames(res_WT_vs_del19_D2)
gene_ranks_WT_D2 <- gene_ranks_WT_D2[!is.na(gene_ranks_WT_D2)]
gene_ranks_WT_D2 <- sort(gene_ranks_WT_D2, decreasing = TRUE)

#getting Hallmark pathways for GSEA 

gse_D2 <- gseGO(gene_ranks_WT_D2,
      ont = "BP",
      keyType = "ENSEMBL",
      OrgDb = "org.Hs.eg.db",
      eps = 1e-300)
as.data.frame(gse_D2)

gse_plot_D2 <- gseaplot(gse_D2, geneSetID = 1)
gse_plot_D2

ggsave("GSEA_pathway_D2.pdf", gse_plot_D2, width = 7, height = 6)
ggsave("GSEA_pathway_D2.png", gse_plot_D2, width = 7, height = 6, dpi = 300)


hallmark_sets_D2 <- msigdbr(species = "Homo sapiens", category = "H")
hallmark_list_D2 <- split(hallmark_sets_D2$ensembl_gene, hallmark_sets_D2$gs_name)

#running GSEA analysis 
fgsea_results_WT_D2 <- fgsea(pathways = hallmark_list_D2,
                            stats = gene_ranks_WT_D2,
                            minSize = 15,
                            maxSize = 500)

print(head(fgsea_results_WT_D2[order(pval), ], 10))

#plotting top 3 pathways
print("Top 3 GSEA pathways:")
top_pathways_D2 <- head(fgsea_results_WT_D2[order(pval), ], 3)
print(top_pathways_D2[, c("pathway", "pval", "NES")])

for(i in 1:3) {
  pathway_name <- top_pathways_D2$pathway[i]
  pathways_D2 <- plotEnrichment(hallmark_list_D2[[pathway_name]], gene_ranks_WT_D2) + 
    labs(title = paste("Pathway", i, ":", pathway_name))
  print(pathways_D2)
  ggsave("Top3_GSEA_D2.pdf", pathways_D2, width = 7, height = 6)
  ggsave("Top3_GSEA_D2.png", pathways_D2, width = 7, height = 6, dpi = 300)
}

```
 
```{r}
# Summary statistics table for D2
summary_stats_D2 <- data.frame(
  Metric = c("Total Samples", "Genes Analyzed", "Significant DEGs - WT vs Deletion 19", "Upregulated - WT vs Deletion 19", "Downregulated - WT vs Deletion 19"),
  Value = c(ncol(counts_D2), 
            nrow(dds_D2),
            nrow(sig_WT_D2),
            length(sig_genes_up_WT_D2),
            length(sig_genes_down_WT_D2))
)

summary_stats_D2

# Top genes table
top_genes_table_D2 <- data.frame(
  Gene_Symbol = gene_symbols_D2[rownames(head(sig_WT_D2[order(sig_WT_D2$padj),], 20))],
  ENSEMBL_ID = rownames(head(sig_WT_D2[order(sig_WT_D2$padj),], 20)),
  Log2FC = round(head(sig_WT_D2[order(sig_WT_D2$padj),], 20)$log2FoldChange, 3),
  Padj = format(head(sig_WT_D2[order(sig_WT_D2$padj),], 20)$padj, scientific = TRUE, digits = 3),
  Direction = ifelse(head(sig_WT_D2[order(sig_WT_D2$padj),], 20)$log2FoldChange > 0, "Up", "Down")
)

print("Top 20 Significant Genes: WT vs Deletion 19")
print(top_genes_table_D2)


```

```{r}
# Export main results
write.csv(as.data.frame(res_WT_vs_del19_D2), "D2_WT_annotated_results.csv", row.names = TRUE)
write.csv(sig_WT_D2, "D2_WT_significant_genes.csv", row.names = TRUE)
write.csv(summary_stats_D2, "D2_summary_statistics.csv", row.names = FALSE)
write.csv(top_genes_table_D2, "D2_top20_genes.csv", row.names = FALSE)

# Export normalized data
write.csv(res_annotated_D2, "D2_results.csv")
write.csv(counts(dds_D2, normalized=TRUE), "D2_normalized_counts.csv", row.names = TRUE)
write.csv(vst_final_D2, "D2_vst_data.csv", row.names = TRUE)

# Export pathway results
if(nrow(go_up_WT_D2@result) > 0) {
  write.csv(go_up_WT_D2@result, "D2_GO_upregulated.csv", row.names = FALSE)
}
if(nrow(go_down_WT_D2@result) > 0) {
  write.csv(go_down_WT_D2@result, "D2_GO_downregulated.csv", row.names = FALSE)
}
if(nrow(kegg_up_WT_D2@result) > 0) {
  write.csv(kegg_up_WT_D2@result, "D2_KEGG_upregulated.csv", row.names = FALSE)
}
if(nrow(kegg_down_WT_D2@result) > 0) {
  write.csv(kegg_down_WT_D2@result, "D2_KEGG_downregulated.csv", row.names = FALSE)
}

# Remove the leadingEdge column (which contains lists)
fgsea_results_clean_D2 <- fgsea_results_WT_D2 %>%
  select(-leadingEdge)

write.csv(fgsea_results_clean_D2, "D2_GSEA_hallmarks.csv", row.names = FALSE)

```



