1 1. Instalacion de paquetes
2 2. Cargar librerias
3 3. Cargar archivos
4 4. Ordenar muestras y preparar metadata
5 5. Analisis de expresion diferencial con DESeq2
6 6. Anotar genes
7 7. Clasificar DEGs: upregulated y downregulated
8 8. Guardar resultados
9 9. PCA
10 10. Volcano plot
11 11. Heatmap de genes diferencialmente expresados
12 12. Enriquecimiento funcional GO
13 13. Enriquecimiento Reactome
14 14. Archivos para comparar con el grupo de cancer gastrico
15 15. Ideas para la infografia

1 1. Instalacion de paquetes

Ejecuta este bloque solo la primera vez. Si ya tienes los paquetes instalados, puedes saltarlo.

install.packages(c("tidyverse", "pheatmap", "ggrepel", "BiocManager"))

BiocManager::install(c(
  "DESeq2", "AnnotationDbi", "org.Hs.eg.db", "clusterProfiler", "ReactomePA"
), ask = FALSE, update = FALSE)

2 2. Cargar librerias

library(DESeq2)
library(tidyverse)
library(pheatmap)
library(ggrepel)
library(AnnotationDbi)
library(org.Hs.eg.db)
library(clusterProfiler)
library(ReactomePA)

3 3. Cargar archivos

Guarda este Rmd en la misma carpeta donde estan:

count_matrix_GSE142279(1).csv
colData_GSE142279(1).csv

count_file <- "C:/Users/LETIZIA/Documents/rstudio/count_matrix_GSE142279.csv"
coldata_file <- "C:/Users/LETIZIA/Documents/rstudio/colData_GSE142279.csv"

counts_raw <- read.csv(count_file, row.names = 1, check.names = FALSE)
coldata <- read.csv(coldata_file, row.names = 1, check.names = FALSE)

# Verificacion basica
dim(counts_raw)

## [1] 39376    40

dim(coldata)

## [1] 40  1

table(coldata$condition)

## 
## colorectal_cancer           control 
##                20                20

head(counts_raw[, 1:4])

head(coldata)

4 4. Ordenar muestras y preparar metadata

# Asegurar que el orden de columnas de counts coincida con filas de coldata
stopifnot(all(colnames(counts_raw) %in% rownames(coldata)))
coldata <- coldata[colnames(counts_raw), , drop = FALSE]
stopifnot(all(colnames(counts_raw) == rownames(coldata)))

# Definir control como referencia
coldata$condition <- factor(coldata$condition, levels = c("control", "colorectal_cancer"))
table(coldata$condition)

## 
##           control colorectal_cancer 
##                20                20

# Redondear por seguridad y convertir a matriz
counts <- round(as.matrix(counts_raw))
storage.mode(counts) <- "integer"

5 5. Analisis de expresion diferencial con DESeq2

La comparacion queda: colorectal_cancer vs control.

dds <- DESeqDataSetFromMatrix(
  countData = counts,
  colData = coldata,
  design = ~ condition
)

# Filtrar genes con muy pocos conteos
dds <- dds[rowSums(counts(dds)) >= 10, ]

dds <- DESeq(dds)
res <- results(dds, contrast = c("condition", "colorectal_cancer", "control"))
res <- lfcShrink(dds, contrast = c("condition", "colorectal_cancer", "control"), res = res, type = "ashr")

res_df <- as.data.frame(res) %>%
  rownames_to_column("ENTREZID") %>%
  arrange(padj)

summary(res)

## 
## out of 31508 with nonzero total read count
## adjusted p-value < 0.1
## LFC > 0 (up)       : 7185, 23%
## LFC < 0 (down)     : 7639, 24%
## outliers [1]       : 0, 0%
## low counts [2]     : 1230, 3.9%
## (mean count < 0)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results

head(res_df)

6 6. Anotar genes

annotation <- AnnotationDbi::select(
  org.Hs.eg.db,
  keys = res_df$ENTREZID,
  keytype = "ENTREZID",
  columns = c("SYMBOL", "GENENAME")
) %>%
  distinct(ENTREZID, .keep_all = TRUE)

res_annot <- res_df %>%
  left_join(annotation, by = "ENTREZID") %>%
  relocate(SYMBOL, GENENAME, .after = ENTREZID)

head(res_annot)

7 7. Clasificar DEGs: upregulated y downregulated

Criterios usados:

Gen significativo: padj < 0.05
Cambio biologicamente relevante: abs(log2FoldChange) >= 1
Upregulated: mayor expresion en cancer colorrectal que en control
Downregulated: menor expresion en cancer colorrectal que en control

alpha <- 0.05
lfc_cutoff <- 1

res_annot <- res_annot %>%
  mutate(
    regulation = case_when(
      !is.na(padj) & padj < alpha & log2FoldChange >= lfc_cutoff ~ "Upregulated",
      !is.na(padj) & padj < alpha & log2FoldChange <= -lfc_cutoff ~ "Downregulated",
      TRUE ~ "Not significant"
    )
  )

table(res_annot$regulation)

## 
##   Downregulated Not significant     Upregulated 
##            3943           24789            2784

up_genes <- res_annot %>% filter(regulation == "Upregulated")
down_genes <- res_annot %>% filter(regulation == "Downregulated")
deg_genes <- res_annot %>% filter(regulation != "Not significant")

head(up_genes)

head(down_genes)

8 8. Guardar resultados

write.csv(res_annot, "GSE142279_DESeq2_all_results.csv", row.names = FALSE)
write.csv(up_genes, "GSE142279_upregulated_genes.csv", row.names = FALSE)
write.csv(down_genes, "GSE142279_downregulated_genes.csv", row.names = FALSE)
write.csv(deg_genes, "GSE142279_all_DEGs.csv", row.names = FALSE)

# Listas sencillas para compartir con el grupo de cancer gastrico o pegar en MolBioTools
writeLines(na.omit(up_genes$SYMBOL), "GSE142279_upregulated_SYMBOL.txt")
writeLines(na.omit(down_genes$SYMBOL), "GSE142279_downregulated_SYMBOL.txt")
writeLines(na.omit(deg_genes$SYMBOL), "GSE142279_all_DEGs_SYMBOL.txt")

9 9. PCA

vsd <- vst(dds, blind = FALSE)

pca_data <- plotPCA(vsd, intgroup = "condition", returnData = TRUE)
percent_var <- round(100 * attr(pca_data, "percentVar"))

ggplot(pca_data, aes(PC1, PC2, color = condition)) +
  geom_point(size = 3) +
  labs(
    title = "PCA - GSE142279",
    x = paste0("PC1: ", percent_var[1], "% varianza"),
    y = paste0("PC2: ", percent_var[2], "% varianza")
  ) +
  theme_minimal()

10 10. Volcano plot

volcano_df <- res_annot %>%
  mutate(
    neg_log10_padj = -log10(padj),
    label = ifelse(regulation != "Not significant" & !is.na(SYMBOL), SYMBOL, NA)
  )

top_labels <- volcano_df %>%
  filter(regulation != "Not significant") %>%
  arrange(padj) %>%
  slice_head(n = 20)

ggplot(volcano_df, aes(x = log2FoldChange, y = neg_log10_padj, color = regulation)) +
  geom_point(alpha = 0.7, size = 1.5) +
  geom_vline(xintercept = c(-lfc_cutoff, lfc_cutoff), linetype = "dashed") +
  geom_hline(yintercept = -log10(alpha), linetype = "dashed") +
  geom_text_repel(data = top_labels, aes(label = SYMBOL), max.overlaps = 20, size = 3) +
  labs(
    title = "Volcano plot - Cancer colorrectal vs control",
    x = "log2 Fold Change",
    y = "-log10 adjusted p-value"
  ) +
  theme_minimal()

11 11. Heatmap de genes diferencialmente expresados

top_deg_ids <- deg_genes %>%
  arrange(padj) %>%
  slice_head(n = 50) %>%
  pull(ENTREZID)

mat <- assay(vsd)[top_deg_ids, ]
mat <- mat - rowMeans(mat)

annotation_col <- as.data.frame(colData(vsd)[, "condition", drop = FALSE])

pheatmap(
  mat,
  annotation_col = annotation_col,
  show_rownames = FALSE,
  show_colnames = FALSE,
  main = "Top 50 DEGs - GSE142279"
)

12 12. Enriquecimiento funcional GO

up_entrez <- up_genes %>% pull(ENTREZID) %>% na.omit() %>% unique()
down_entrez <- down_genes %>% pull(ENTREZID) %>% na.omit() %>% unique()

up_go <- enrichGO(
  gene = up_entrez,
  OrgDb = org.Hs.eg.db,
  keyType = "ENTREZID",
  ont = "BP",
  pAdjustMethod = "BH",
  pvalueCutoff = 0.05,
  qvalueCutoff = 0.2,
  readable = TRUE
)

down_go <- enrichGO(
  gene = down_entrez,
  OrgDb = org.Hs.eg.db,
  keyType = "ENTREZID",
  ont = "BP",
  pAdjustMethod = "BH",
  pvalueCutoff = 0.05,
  qvalueCutoff = 0.2,
  readable = TRUE
)

head(as.data.frame(up_go))

head(as.data.frame(down_go))

dotplot(up_go, showCategory = 15) + ggtitle("GO BP - Upregulated genes")

dotplot(down_go, showCategory = 15) + ggtitle("GO BP - Downregulated genes")

13 13. Enriquecimiento Reactome

up_reactome <- enrichPathway(
  gene = up_entrez,
  organism = "human",
  pvalueCutoff = 0.05,
  readable = TRUE
)

down_reactome <- enrichPathway(
  gene = down_entrez,
  organism = "human",
  pvalueCutoff = 0.05,
  readable = TRUE
)

head(as.data.frame(up_reactome))

head(as.data.frame(down_reactome))

dotplot(up_reactome, showCategory = 15) + ggtitle("Reactome - Upregulated genes")

dotplot(down_reactome, showCategory = 15) + ggtitle("Reactome - Downregulated genes")

14 14. Archivos para comparar con el grupo de cancer gastrico

Entregar al grupo de cancer gastrico estos tres archivos:

GSE142279_upregulated_SYMBOL.txt
GSE142279_downregulated_SYMBOL.txt
GSE142279_all_DEGs_SYMBOL.txt

Cuando recibas las listas del grupo de cancer gastrico, usa https://molbiotools.com/listcompare.php para hallar las intersecciones. Luego puedes hacer enriquecimiento web en Enrichr, DAVID o ShinyGO con los genes compartidos.

15 15. Ideas para la infografia

Incluye:

Pregunta biologica: comparacion cancer colorrectal vs control.
Dataset: GSE142279, 20 controles y 20 cancer colorrectal.
Metodo: DESeq2, padj < 0.05 y abs(log2FC) >= 1.
Resultados: numero de genes upregulated y downregulated.
Figuras: PCA, volcano plot y heatmap.
Interpretacion: procesos/vias enriquecidas en genes upregulated y downregulated.
Comparacion con cancer gastrico: genes comunes y vias compartidas.

Grupo 4 - RNA-Seq GSE142279: Cancer colorrectal

Letizia, Nicolas, Natalia, Juan Camilo

2026-04-30