1 1. Instalacion de paquetes

Ejecuta este bloque solo la primera vez. Si ya tienes los paquetes instalados, puedes saltarlo.

install.packages(c("tidyverse", "pheatmap", "ggrepel", "BiocManager"))

BiocManager::install(c(
  "DESeq2", "AnnotationDbi", "org.Hs.eg.db", "clusterProfiler", "ReactomePA"
), ask = FALSE, update = FALSE)

2 2. Cargar librerias

library(DESeq2)
library(tidyverse)
library(pheatmap)
library(ggrepel)
library(AnnotationDbi)
library(org.Hs.eg.db)
library(clusterProfiler)
library(ReactomePA)

3 3. Cargar archivos

Guarda este Rmd en la misma carpeta donde estan:

count_file <- "C:/Users/LETIZIA/Documents/rstudio/count_matrix_GSE142279.csv"
coldata_file <- "C:/Users/LETIZIA/Documents/rstudio/colData_GSE142279.csv"

counts_raw <- read.csv(count_file, row.names = 1, check.names = FALSE)
coldata <- read.csv(coldata_file, row.names = 1, check.names = FALSE)

# Verificacion basica
dim(counts_raw)
## [1] 39376    40
dim(coldata)
## [1] 40  1
table(coldata$condition)
## 
## colorectal_cancer           control 
##                20                20
head(counts_raw[, 1:4])
head(coldata)

4 4. Ordenar muestras y preparar metadata

# Asegurar que el orden de columnas de counts coincida con filas de coldata
stopifnot(all(colnames(counts_raw) %in% rownames(coldata)))
coldata <- coldata[colnames(counts_raw), , drop = FALSE]
stopifnot(all(colnames(counts_raw) == rownames(coldata)))

# Definir control como referencia
coldata$condition <- factor(coldata$condition, levels = c("control", "colorectal_cancer"))
table(coldata$condition)
## 
##           control colorectal_cancer 
##                20                20
# Redondear por seguridad y convertir a matriz
counts <- round(as.matrix(counts_raw))
storage.mode(counts) <- "integer"

5 5. Analisis de expresion diferencial con DESeq2

La comparacion queda: colorectal_cancer vs control.

dds <- DESeqDataSetFromMatrix(
  countData = counts,
  colData = coldata,
  design = ~ condition
)

# Filtrar genes con muy pocos conteos
dds <- dds[rowSums(counts(dds)) >= 10, ]

dds <- DESeq(dds)
res <- results(dds, contrast = c("condition", "colorectal_cancer", "control"))
res <- lfcShrink(dds, contrast = c("condition", "colorectal_cancer", "control"), res = res, type = "ashr")

res_df <- as.data.frame(res) %>%
  rownames_to_column("ENTREZID") %>%
  arrange(padj)

summary(res)
## 
## out of 31508 with nonzero total read count
## adjusted p-value < 0.1
## LFC > 0 (up)       : 7185, 23%
## LFC < 0 (down)     : 7639, 24%
## outliers [1]       : 0, 0%
## low counts [2]     : 1230, 3.9%
## (mean count < 0)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
head(res_df)

6 6. Anotar genes

annotation <- AnnotationDbi::select(
  org.Hs.eg.db,
  keys = res_df$ENTREZID,
  keytype = "ENTREZID",
  columns = c("SYMBOL", "GENENAME")
) %>%
  distinct(ENTREZID, .keep_all = TRUE)

res_annot <- res_df %>%
  left_join(annotation, by = "ENTREZID") %>%
  relocate(SYMBOL, GENENAME, .after = ENTREZID)

head(res_annot)

7 7. Clasificar DEGs: upregulated y downregulated

Criterios usados:

alpha <- 0.05
lfc_cutoff <- 1

res_annot <- res_annot %>%
  mutate(
    regulation = case_when(
      !is.na(padj) & padj < alpha & log2FoldChange >= lfc_cutoff ~ "Upregulated",
      !is.na(padj) & padj < alpha & log2FoldChange <= -lfc_cutoff ~ "Downregulated",
      TRUE ~ "Not significant"
    )
  )

table(res_annot$regulation)
## 
##   Downregulated Not significant     Upregulated 
##            3943           24789            2784
up_genes <- res_annot %>% filter(regulation == "Upregulated")
down_genes <- res_annot %>% filter(regulation == "Downregulated")
deg_genes <- res_annot %>% filter(regulation != "Not significant")

head(up_genes)
head(down_genes)

8 8. Guardar resultados

write.csv(res_annot, "GSE142279_DESeq2_all_results.csv", row.names = FALSE)
write.csv(up_genes, "GSE142279_upregulated_genes.csv", row.names = FALSE)
write.csv(down_genes, "GSE142279_downregulated_genes.csv", row.names = FALSE)
write.csv(deg_genes, "GSE142279_all_DEGs.csv", row.names = FALSE)

# Listas sencillas para compartir con el grupo de cancer gastrico o pegar en MolBioTools
writeLines(na.omit(up_genes$SYMBOL), "GSE142279_upregulated_SYMBOL.txt")
writeLines(na.omit(down_genes$SYMBOL), "GSE142279_downregulated_SYMBOL.txt")
writeLines(na.omit(deg_genes$SYMBOL), "GSE142279_all_DEGs_SYMBOL.txt")

9 9. PCA

vsd <- vst(dds, blind = FALSE)

pca_data <- plotPCA(vsd, intgroup = "condition", returnData = TRUE)
percent_var <- round(100 * attr(pca_data, "percentVar"))

ggplot(pca_data, aes(PC1, PC2, color = condition)) +
  geom_point(size = 3) +
  labs(
    title = "PCA - GSE142279",
    x = paste0("PC1: ", percent_var[1], "% varianza"),
    y = paste0("PC2: ", percent_var[2], "% varianza")
  ) +
  theme_minimal()

10 10. Volcano plot

volcano_df <- res_annot %>%
  mutate(
    neg_log10_padj = -log10(padj),
    label = ifelse(regulation != "Not significant" & !is.na(SYMBOL), SYMBOL, NA)
  )

top_labels <- volcano_df %>%
  filter(regulation != "Not significant") %>%
  arrange(padj) %>%
  slice_head(n = 20)

ggplot(volcano_df, aes(x = log2FoldChange, y = neg_log10_padj, color = regulation)) +
  geom_point(alpha = 0.7, size = 1.5) +
  geom_vline(xintercept = c(-lfc_cutoff, lfc_cutoff), linetype = "dashed") +
  geom_hline(yintercept = -log10(alpha), linetype = "dashed") +
  geom_text_repel(data = top_labels, aes(label = SYMBOL), max.overlaps = 20, size = 3) +
  labs(
    title = "Volcano plot - Cancer colorrectal vs control",
    x = "log2 Fold Change",
    y = "-log10 adjusted p-value"
  ) +
  theme_minimal()

11 11. Heatmap de genes diferencialmente expresados

top_deg_ids <- deg_genes %>%
  arrange(padj) %>%
  slice_head(n = 50) %>%
  pull(ENTREZID)

mat <- assay(vsd)[top_deg_ids, ]
mat <- mat - rowMeans(mat)

annotation_col <- as.data.frame(colData(vsd)[, "condition", drop = FALSE])

pheatmap(
  mat,
  annotation_col = annotation_col,
  show_rownames = FALSE,
  show_colnames = FALSE,
  main = "Top 50 DEGs - GSE142279"
)

12 12. Enriquecimiento funcional GO

up_entrez <- up_genes %>% pull(ENTREZID) %>% na.omit() %>% unique()
down_entrez <- down_genes %>% pull(ENTREZID) %>% na.omit() %>% unique()

up_go <- enrichGO(
  gene = up_entrez,
  OrgDb = org.Hs.eg.db,
  keyType = "ENTREZID",
  ont = "BP",
  pAdjustMethod = "BH",
  pvalueCutoff = 0.05,
  qvalueCutoff = 0.2,
  readable = TRUE
)

down_go <- enrichGO(
  gene = down_entrez,
  OrgDb = org.Hs.eg.db,
  keyType = "ENTREZID",
  ont = "BP",
  pAdjustMethod = "BH",
  pvalueCutoff = 0.05,
  qvalueCutoff = 0.2,
  readable = TRUE
)

head(as.data.frame(up_go))
head(as.data.frame(down_go))
dotplot(up_go, showCategory = 15) + ggtitle("GO BP - Upregulated genes")

dotplot(down_go, showCategory = 15) + ggtitle("GO BP - Downregulated genes")

13 13. Enriquecimiento Reactome

up_reactome <- enrichPathway(
  gene = up_entrez,
  organism = "human",
  pvalueCutoff = 0.05,
  readable = TRUE
)

down_reactome <- enrichPathway(
  gene = down_entrez,
  organism = "human",
  pvalueCutoff = 0.05,
  readable = TRUE
)

head(as.data.frame(up_reactome))
head(as.data.frame(down_reactome))
dotplot(up_reactome, showCategory = 15) + ggtitle("Reactome - Upregulated genes")

dotplot(down_reactome, showCategory = 15) + ggtitle("Reactome - Downregulated genes")

14 14. Archivos para comparar con el grupo de cancer gastrico

Entregar al grupo de cancer gastrico estos tres archivos:

Cuando recibas las listas del grupo de cancer gastrico, usa https://molbiotools.com/listcompare.php para hallar las intersecciones. Luego puedes hacer enriquecimiento web en Enrichr, DAVID o ShinyGO con los genes compartidos.

15 15. Ideas para la infografia

Incluye:

  1. Pregunta biologica: comparacion cancer colorrectal vs control.
  2. Dataset: GSE142279, 20 controles y 20 cancer colorrectal.
  3. Metodo: DESeq2, padj < 0.05 y abs(log2FC) >= 1.
  4. Resultados: numero de genes upregulated y downregulated.
  5. Figuras: PCA, volcano plot y heatmap.
  6. Interpretacion: procesos/vias enriquecidas en genes upregulated y downregulated.
  7. Comparacion con cancer gastrico: genes comunes y vias compartidas.