Ejecuta este bloque solo la primera vez. Si ya tienes los paquetes instalados, puedes saltarlo.
install.packages(c("tidyverse", "pheatmap", "ggrepel", "BiocManager"))
BiocManager::install(c(
"DESeq2", "AnnotationDbi", "org.Hs.eg.db", "clusterProfiler", "ReactomePA"
), ask = FALSE, update = FALSE)
library(DESeq2)
library(tidyverse)
library(pheatmap)
library(ggrepel)
library(AnnotationDbi)
library(org.Hs.eg.db)
library(clusterProfiler)
library(ReactomePA)
Guarda este Rmd en la misma carpeta donde estan:
count_matrix_GSE142279(1).csvcolData_GSE142279(1).csvcount_file <- "C:/Users/LETIZIA/Documents/rstudio/count_matrix_GSE142279.csv"
coldata_file <- "C:/Users/LETIZIA/Documents/rstudio/colData_GSE142279.csv"
counts_raw <- read.csv(count_file, row.names = 1, check.names = FALSE)
coldata <- read.csv(coldata_file, row.names = 1, check.names = FALSE)
# Verificacion basica
dim(counts_raw)
## [1] 39376 40
dim(coldata)
## [1] 40 1
table(coldata$condition)
##
## colorectal_cancer control
## 20 20
head(counts_raw[, 1:4])
head(coldata)
# Asegurar que el orden de columnas de counts coincida con filas de coldata
stopifnot(all(colnames(counts_raw) %in% rownames(coldata)))
coldata <- coldata[colnames(counts_raw), , drop = FALSE]
stopifnot(all(colnames(counts_raw) == rownames(coldata)))
# Definir control como referencia
coldata$condition <- factor(coldata$condition, levels = c("control", "colorectal_cancer"))
table(coldata$condition)
##
## control colorectal_cancer
## 20 20
# Redondear por seguridad y convertir a matriz
counts <- round(as.matrix(counts_raw))
storage.mode(counts) <- "integer"
La comparacion queda: colorectal_cancer vs
control.
dds <- DESeqDataSetFromMatrix(
countData = counts,
colData = coldata,
design = ~ condition
)
# Filtrar genes con muy pocos conteos
dds <- dds[rowSums(counts(dds)) >= 10, ]
dds <- DESeq(dds)
res <- results(dds, contrast = c("condition", "colorectal_cancer", "control"))
res <- lfcShrink(dds, contrast = c("condition", "colorectal_cancer", "control"), res = res, type = "ashr")
res_df <- as.data.frame(res) %>%
rownames_to_column("ENTREZID") %>%
arrange(padj)
summary(res)
##
## out of 31508 with nonzero total read count
## adjusted p-value < 0.1
## LFC > 0 (up) : 7185, 23%
## LFC < 0 (down) : 7639, 24%
## outliers [1] : 0, 0%
## low counts [2] : 1230, 3.9%
## (mean count < 0)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
head(res_df)
annotation <- AnnotationDbi::select(
org.Hs.eg.db,
keys = res_df$ENTREZID,
keytype = "ENTREZID",
columns = c("SYMBOL", "GENENAME")
) %>%
distinct(ENTREZID, .keep_all = TRUE)
res_annot <- res_df %>%
left_join(annotation, by = "ENTREZID") %>%
relocate(SYMBOL, GENENAME, .after = ENTREZID)
head(res_annot)
Criterios usados:
padj < 0.05abs(log2FoldChange) >= 1alpha <- 0.05
lfc_cutoff <- 1
res_annot <- res_annot %>%
mutate(
regulation = case_when(
!is.na(padj) & padj < alpha & log2FoldChange >= lfc_cutoff ~ "Upregulated",
!is.na(padj) & padj < alpha & log2FoldChange <= -lfc_cutoff ~ "Downregulated",
TRUE ~ "Not significant"
)
)
table(res_annot$regulation)
##
## Downregulated Not significant Upregulated
## 3943 24789 2784
up_genes <- res_annot %>% filter(regulation == "Upregulated")
down_genes <- res_annot %>% filter(regulation == "Downregulated")
deg_genes <- res_annot %>% filter(regulation != "Not significant")
head(up_genes)
head(down_genes)
write.csv(res_annot, "GSE142279_DESeq2_all_results.csv", row.names = FALSE)
write.csv(up_genes, "GSE142279_upregulated_genes.csv", row.names = FALSE)
write.csv(down_genes, "GSE142279_downregulated_genes.csv", row.names = FALSE)
write.csv(deg_genes, "GSE142279_all_DEGs.csv", row.names = FALSE)
# Listas sencillas para compartir con el grupo de cancer gastrico o pegar en MolBioTools
writeLines(na.omit(up_genes$SYMBOL), "GSE142279_upregulated_SYMBOL.txt")
writeLines(na.omit(down_genes$SYMBOL), "GSE142279_downregulated_SYMBOL.txt")
writeLines(na.omit(deg_genes$SYMBOL), "GSE142279_all_DEGs_SYMBOL.txt")
vsd <- vst(dds, blind = FALSE)
pca_data <- plotPCA(vsd, intgroup = "condition", returnData = TRUE)
percent_var <- round(100 * attr(pca_data, "percentVar"))
ggplot(pca_data, aes(PC1, PC2, color = condition)) +
geom_point(size = 3) +
labs(
title = "PCA - GSE142279",
x = paste0("PC1: ", percent_var[1], "% varianza"),
y = paste0("PC2: ", percent_var[2], "% varianza")
) +
theme_minimal()
volcano_df <- res_annot %>%
mutate(
neg_log10_padj = -log10(padj),
label = ifelse(regulation != "Not significant" & !is.na(SYMBOL), SYMBOL, NA)
)
top_labels <- volcano_df %>%
filter(regulation != "Not significant") %>%
arrange(padj) %>%
slice_head(n = 20)
ggplot(volcano_df, aes(x = log2FoldChange, y = neg_log10_padj, color = regulation)) +
geom_point(alpha = 0.7, size = 1.5) +
geom_vline(xintercept = c(-lfc_cutoff, lfc_cutoff), linetype = "dashed") +
geom_hline(yintercept = -log10(alpha), linetype = "dashed") +
geom_text_repel(data = top_labels, aes(label = SYMBOL), max.overlaps = 20, size = 3) +
labs(
title = "Volcano plot - Cancer colorrectal vs control",
x = "log2 Fold Change",
y = "-log10 adjusted p-value"
) +
theme_minimal()
top_deg_ids <- deg_genes %>%
arrange(padj) %>%
slice_head(n = 50) %>%
pull(ENTREZID)
mat <- assay(vsd)[top_deg_ids, ]
mat <- mat - rowMeans(mat)
annotation_col <- as.data.frame(colData(vsd)[, "condition", drop = FALSE])
pheatmap(
mat,
annotation_col = annotation_col,
show_rownames = FALSE,
show_colnames = FALSE,
main = "Top 50 DEGs - GSE142279"
)
up_entrez <- up_genes %>% pull(ENTREZID) %>% na.omit() %>% unique()
down_entrez <- down_genes %>% pull(ENTREZID) %>% na.omit() %>% unique()
up_go <- enrichGO(
gene = up_entrez,
OrgDb = org.Hs.eg.db,
keyType = "ENTREZID",
ont = "BP",
pAdjustMethod = "BH",
pvalueCutoff = 0.05,
qvalueCutoff = 0.2,
readable = TRUE
)
down_go <- enrichGO(
gene = down_entrez,
OrgDb = org.Hs.eg.db,
keyType = "ENTREZID",
ont = "BP",
pAdjustMethod = "BH",
pvalueCutoff = 0.05,
qvalueCutoff = 0.2,
readable = TRUE
)
head(as.data.frame(up_go))
head(as.data.frame(down_go))
dotplot(up_go, showCategory = 15) + ggtitle("GO BP - Upregulated genes")
dotplot(down_go, showCategory = 15) + ggtitle("GO BP - Downregulated genes")
up_reactome <- enrichPathway(
gene = up_entrez,
organism = "human",
pvalueCutoff = 0.05,
readable = TRUE
)
down_reactome <- enrichPathway(
gene = down_entrez,
organism = "human",
pvalueCutoff = 0.05,
readable = TRUE
)
head(as.data.frame(up_reactome))
head(as.data.frame(down_reactome))
dotplot(up_reactome, showCategory = 15) + ggtitle("Reactome - Upregulated genes")
dotplot(down_reactome, showCategory = 15) + ggtitle("Reactome - Downregulated genes")
Entregar al grupo de cancer gastrico estos tres archivos:
GSE142279_upregulated_SYMBOL.txtGSE142279_downregulated_SYMBOL.txtGSE142279_all_DEGs_SYMBOL.txtCuando recibas las listas del grupo de cancer gastrico, usa https://molbiotools.com/listcompare.php para hallar las intersecciones. Luego puedes hacer enriquecimiento web en Enrichr, DAVID o ShinyGO con los genes compartidos.
Incluye: