require(knitr)
require(ggplot2)
require(dplyr)
require(tidyr)
require(rlang)
require(ggpubr)
require(gridExtra)
require(devtools)
require(matrixStats)
require(ConsensusTME)
require(pheatmap)
require(nortest)
require(car)
setwd(params$wd)
load("CRC_filt.RData")

Clasificación y exploración inicial de los datos

# Figura 2A
# Gráfico de barras para la distribución de las muestras en los distintos grupos

# jpeg("Figura2.jpeg")

clin %>%
  select(msi_imputed, cms, dataset, stage, group) %>%
  # Formato ancho
  gather(key = "variable", value = "value") %>%
  # Agrupación por variable y valor para contar frecuencias
  group_by(variable, value) %>%
  # Resumen de los datos / conteo
  summarise(count = n(), .groups = "drop") %>%
  # Agrupación por variable para calcular porcentajes dentro de cada grupo
  group_by(variable) %>%
  # Calcular porcentaje y ordenar factores
  mutate(percent = count / sum(count) * 100,
         variable = factor(variable, levels = c("msi_imputed", "cms", "stage", "group", "dataset")),
         value = factor(value, levels = unique(value))) %>%
  # Gráfico de barras
  ggplot(aes(x = variable, y = percent, fill = value)) +
  geom_bar(stat = "identity") +
  # Personalizar colores, leyenda, ejes y título
  scale_fill_manual(values = c("#FFFACD", "#FFD700", "#FFA07A", "#FF6347","#FFB6C1", "#FF6281", "#FF69B4", "#FF1493", "#C71585", "#DDA0DD", "#5F9EA0", "#B0E0E6", 
                               "#B0C4DE", "#ADD8E6", "#006400", "#9ACD32", "#4682B4", "#4169E1"),
                    guide = guide_legend(keyheight = 0.8, keywidth = 0.8)) +
  scale_x_discrete(labels = c("msi_imputed" = "MSI/MSS", "cms" = "CMS", "stage" = "Estadío", "group" = "Grupo", "dataset" = "Conjunto de Datos")) +
  labs(title = "Distribución porcentual de las muestras",
       x = "",
       y = "Porcentaje (%)",
       fill = "") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# dev.off()
# Filtrar la matriz de expresión por los genes que contienen "HLA"
ext <- t(ex)
hla <- as.data.frame(ext[, grep("HLA-", colnames(ext))])
hla <- hla[complete.cases(hla), ]
# Añadir la columna ID al dataframe
hla$ID <- rownames(hla)

# Sustituir - por _ en los genes
colnames(hla) <- gsub("-", "_", colnames(hla))
colnames(hla)
 [1] "HLA_DOB"  "HLA_E"    "HLA_DPB1" "HLA_DQA1" "HLA_DMB"  "HLA_DRB1"
 [7] "HLA_F"    "HLA_DOA"  "HLA_B"    "HLA_C"    "HLA_DRA"  "HLA_DQB1"
[13] "HLA_DRB5" "HLA_G"    "HLA_DPA1" "HLA_DQA2" "HLA_A"    "HLA_DMA" 
[19] "HLA_DPB2" "ID"      
# Unir dataframes de datos clínicos y expresión de HLA por la columna ID
aux <- merge(hla, clin, by = "ID")

# Cambiar nombres de las columnas
colnames(aux)[colnames(aux) == "msi_imputed"] <- "MSI_MSS"
colnames(aux)[colnames(aux) == "cms"] <- "CMS"
colnames(aux)[colnames(aux) == "group"] <- "Grupo"
colnames(aux)[colnames(aux) == "stage"] <- "Estadío"
colnames(aux)[colnames(aux) == "dataset"] <- "Dataset"

Análisis descriptivo y comparación de HLA-I y HLA-II

# Medias HLA para cada gen/grupo

# Listas de genes y grupos con los que trabajaré
genes <- c("HLA_A", "HLA_B", "HLA_C", "HLA_E", "HLA_F", "HLA_G", "HLA_DPA1", 
           "HLA_DPB1", "HLA_DPB2", "HLA_DQA1", "HLA_DQA2", "HLA_DQB1", "HLA_DRA", 
           "HLA_DRB1", "HLA_DRB5", "HLA_DOA", "HLA_DOB", "HLA_DMA", "HLA_DMB")
grupos <- c("MSI", "MSS", "CMS1", "CMS2", "CMS3", "CMS4", "CMS_NA", "II", "III",
            "Stage II, MSI", "Stage II, MSS", "Stage III, MSS", "Stage III, MSI",
            "CLX", "GSE13294", "GSE14333", "GSE17536", "GSE39582", "TCGA")

# Crear dataframe vacío para almacenar las medias
medias <- data.frame(matrix(ncol = length(genes), nrow = length(grupos)))
colnames(medias) <- genes
rownames(medias) <- grupos

# Función para calcular la media de cada gen en los diferentes grupos
summary_groups <- function(dataset, gen) {
  result <- list(
    MSI_MSS = dataset %>%
      group_by(MSI_MSS) %>%
      summarise(mean = mean(!!sym(gen))),
    # Gestionar NAs en CMS
    CMS = dataset %>%
      group_by(CMS) %>%
      summarise(mean = mean(!!sym(gen), na.rm = TRUE)),
    
    Estadío = dataset %>%
      group_by(Estadío) %>%
      summarise(mean = mean(!!sym(gen))),
    
    Grupo = dataset %>%
      group_by(Grupo) %>%
      summarise(mean = mean(!!sym(gen))),
    
    Dataset = dataset %>%
      group_by(Dataset) %>%
      summarise(mean = mean(!!sym(gen)))
  )
  
  means_values <- unlist(lapply(result, function(x) x$mean))
}

# Aplicar la función en bucle a cada gen HLA
for (i in 1:length(genes)) {
  gen <- genes[i]
  means_values <- summary_groups(aux, gen)
  medias[, i] <- means_values
}

# write.csv(round(medias,2), "medias.csv", row.names = TRUE)
# Normalidad 

sapply(aux[, genes], function(x) tapply(x, aux$MSI_MSS, function(g) lillie.test(g)$p.value))
           HLA_A        HLA_B        HLA_C     HLA_E        HLA_F        HLA_G
MSI 3.066634e-08 1.203296e-09 2.026948e-07 0.2283573 0.0004550275 0.0009773251
MSS 3.612352e-10 2.933637e-07 2.032924e-02 0.1901726 0.0001154771 0.0510795467
     HLA_DPA1    HLA_DPB1     HLA_DPB2     HLA_DQA1     HLA_DQA2     HLA_DQB1
MSI 0.0530483 0.007059809 3.603106e-03 6.502251e-04 0.1566153971 1.315557e-02
MSS 0.2888034 0.682543336 7.518458e-05 3.064773e-09 0.0003692047 5.708433e-07
         HLA_DRA   HLA_DRB1     HLA_DRB5      HLA_DOA     HLA_DOB    HLA_DMA
MSI 1.443613e-03 0.06232875 7.465578e-03 3.091942e-05 0.067113780 0.44246025
MSS 1.973161e-05 0.16890279 9.775493e-05 1.669563e-05 0.001989744 0.02308267
        HLA_DMB
MSI 0.087214095
MSS 0.001833143
sapply(aux[, genes], function(x) tapply(x, aux$Estadío, function(g) lillie.test(g)$p.value))
           HLA_A        HLA_B       HLA_C     HLA_E       HLA_F       HLA_G
II  1.225427e-16 1.271567e-11 0.000177941 0.2023203 0.001178098 0.078816748
III 1.160353e-01 1.296040e-02 0.066025059 0.6693122 0.484455912 0.006492417
     HLA_DPA1  HLA_DPB1     HLA_DPB2     HLA_DQA1    HLA_DQA2    HLA_DQB1
II  0.8209178 0.4512720 4.817419e-09 4.119761e-05 0.009657786 0.002589807
III 0.7062662 0.6433521 7.326587e-02 5.704059e-07 0.204551682 0.013148893
         HLA_DRA  HLA_DRB1     HLA_DRB5      HLA_DOA      HLA_DOB      HLA_DMA
II  0.0004857543 0.2593195 1.866009e-06 2.100874e-09 1.450688e-05 6.704813e-05
III 0.0354632806 0.9361247 5.545616e-03 9.677200e-03 1.229612e-01 2.232885e-02
        HLA_DMB
II  0.001490145
III 0.035982025
sapply(aux[, genes], function(x) tapply(x, aux$Grupo, function(g) lillie.test(g)$p.value))
                      HLA_A        HLA_B        HLA_C     HLA_E        HLA_F
Stage II, MSI  4.080332e-07 1.851513e-09 2.169721e-07 0.5348830 0.0003028999
Stage II, MSS  1.283128e-10 3.565077e-08 4.776726e-05 0.3628411 0.0033233221
Stage III, MSI 4.181368e-01 2.538514e-01 5.046414e-01 0.5512281 0.5331002274
Stage III, MSS 2.119283e-01 3.247516e-02 7.374305e-02 0.7007075 0.0916603722
                    HLA_G  HLA_DPA1    HLA_DPB1     HLA_DPB2     HLA_DQA1
Stage II, MSI  0.01881932 0.3205664 0.317888609 0.0018550216 7.286964e-02
Stage II, MSS  0.34083777 0.5259529 0.675124922 0.0001024912 3.255213e-04
Stage III, MSI 0.05810077 0.1207921 0.003648765 0.2972270054 6.897370e-03
Stage III, MSS 0.03648526 0.7232945 0.694119925 0.0638658092 3.685802e-06
                  HLA_DQA2     HLA_DQB1      HLA_DRA   HLA_DRB1     HLA_DRB5
Stage II, MSI  0.434733470 0.0078764664 1.846492e-02 0.30132740 0.0547344832
Stage II, MSS  0.000992465 0.0003500413 3.247153e-03 0.27283497 0.0000047213
Stage III, MSI 0.060050278 0.0267040452 6.410727e-05 0.08475466 0.0550592876
Stage III, MSS 0.079895402 0.0047985771 4.486172e-03 0.69922942 0.0244555191
                    HLA_DOA     HLA_DOB     HLA_DMA    HLA_DMB
Stage II, MSI  7.992422e-05 0.012549694 0.668478458 0.16011643
Stage II, MSS  2.161703e-06 0.002386683 0.001628236 0.01230053
Stage III, MSI 1.576862e-01 0.269682621 0.591995791 0.12081174
Stage III, MSS 7.584833e-02 0.157800753 0.491511353 0.24079062
sapply(aux[, genes], function(x) tapply(x, aux$CMS, function(g) lillie.test(g)$p.value))
            HLA_A        HLA_B        HLA_C      HLA_E        HLA_F
CMS1 7.122690e-07 6.206065e-08 3.992279e-06 0.03858435 0.0004192949
CMS2 2.387462e-03 3.754498e-02 5.029994e-01 0.85818361 0.0367717454
CMS3 1.630757e-10 1.274167e-02 2.834644e-01 0.78115287 0.5684743685
CMS4 5.224773e-02 2.306417e-03 2.003745e-01 0.21581476 0.1035650182
            HLA_G   HLA_DPA1    HLA_DPB1     HLA_DPB2     HLA_DQA1     HLA_DQA2
CMS1 0.0001532326 0.01134752 0.001008081 0.0284953181 3.298449e-05 2.737951e-01
CMS2 0.1072791131 0.41055425 0.593734908 0.0592733323 4.242202e-04 4.779283e-05
CMS3 0.7120072565 0.11792748 0.492627817 0.0503281914 7.443973e-02 3.070700e-01
CMS4 0.1339335567 0.19312608 0.044405869 0.0002891225 8.319083e-03 5.910872e-01
         HLA_DQB1      HLA_DRA   HLA_DRB1   HLA_DRB5     HLA_DOA    HLA_DOB
CMS1 1.195923e-02 0.0004205063 0.06103669 0.22986092 0.005307294 0.11540619
CMS2 8.129524e-05 0.1850794334 0.68234760 0.00175403 0.001263572 0.05334843
CMS3 4.890220e-01 0.0177216328 0.79270127 0.58058795 0.156870194 0.25150044
CMS4 1.139983e-01 0.5935144401 0.67082272 0.06081642 0.004831622 0.65882559
        HLA_DMA    HLA_DMB
CMS1 0.08809516 0.02505854
CMS2 0.01530076 0.42044963
CMS3 0.03734524 0.66174243
CMS4 0.36261993 0.16630600
# Homocedasticidad

sapply(aux[, genes], function(x) leveneTest(x ~ aux$MSI_MSS)$`Pr(>F)`[1])
       HLA_A        HLA_B        HLA_C        HLA_E        HLA_F        HLA_G 
5.534649e-04 2.458906e-04 1.271250e-02 4.324152e-01 1.277413e-05 4.682859e-03 
    HLA_DPA1     HLA_DPB1     HLA_DPB2     HLA_DQA1     HLA_DQA2     HLA_DQB1 
6.063271e-02 1.237921e-01 3.337868e-02 1.659092e-05 9.136438e-02 6.543415e-03 
     HLA_DRA     HLA_DRB1     HLA_DRB5      HLA_DOA      HLA_DOB      HLA_DMA 
8.240220e-01 2.325025e-02 1.877575e-02 6.492622e-04 8.330950e-04 2.389576e-07 
     HLA_DMB 
2.286031e-04 
sapply(aux[, genes], function(x) leveneTest(x ~ aux$Estadío)$`Pr(>F)`[1])
       HLA_A        HLA_B        HLA_C        HLA_E        HLA_F        HLA_G 
0.5525974279 0.3430180378 0.1711205050 0.9273933496 0.0685203054 0.7309293701 
    HLA_DPA1     HLA_DPB1     HLA_DPB2     HLA_DQA1     HLA_DQA2     HLA_DQB1 
0.1621512945 0.5886737976 0.7625293652 0.0003269449 0.0392597247 0.4268537192 
     HLA_DRA     HLA_DRB1     HLA_DRB5      HLA_DOA      HLA_DOB      HLA_DMA 
0.5873365337 0.7457327339 0.8083488016 0.4731056352 0.7497481797 0.0341973951 
     HLA_DMB 
0.3520702900 
sapply(aux[, genes], function(x) leveneTest(x ~ aux$Grupo)$`Pr(>F)`[1])
       HLA_A        HLA_B        HLA_C        HLA_E        HLA_F        HLA_G 
5.826855e-03 1.524839e-03 3.685053e-02 8.500460e-01 6.548883e-05 2.109130e-02 
    HLA_DPA1     HLA_DPB1     HLA_DPB2     HLA_DQA1     HLA_DQA2     HLA_DQB1 
3.379405e-02 5.841866e-02 1.336806e-01 1.426941e-06 1.843726e-02 5.772216e-04 
     HLA_DRA     HLA_DRB1     HLA_DRB5      HLA_DOA      HLA_DOB      HLA_DMA 
7.691261e-02 8.386636e-03 2.060704e-02 1.883003e-03 2.183706e-02 1.163259e-06 
     HLA_DMB 
8.509403e-05 
sapply(aux[, genes], function(x) leveneTest(x ~ aux$CMS)$`Pr(>F)`[1])
       HLA_A        HLA_B        HLA_C        HLA_E        HLA_F        HLA_G 
4.484204e-06 7.395708e-03 2.325005e-02 4.577231e-02 2.594631e-04 2.683716e-02 
    HLA_DPA1     HLA_DPB1     HLA_DPB2     HLA_DQA1     HLA_DQA2     HLA_DQB1 
9.057584e-04 6.814962e-05 3.624965e-06 1.545190e-10 4.056387e-03 1.038567e-04 
     HLA_DRA     HLA_DRB1     HLA_DRB5      HLA_DOA      HLA_DOB      HLA_DMA 
5.048597e-03 2.677954e-04 3.526353e-06 9.753256e-15 3.199983e-08 8.960718e-13 
     HLA_DMB 
1.533476e-12 
# Tabla 1
# Kruskal para cada gen/grupo

# Grupos de interés
grupos_ag <- c("MSI_MSS", "CMS", "Estadío", "Grupo", "Dataset")

# Dataframe para guardar resultados
kruskal_sig <- data.frame(matrix(ncol = length(genes), nrow = 5))
colnames(kruskal_sig) <- genes
rownames(kruskal_sig) <- grupos_ag

# Función kruskal
kruskal <- function(dataset, gen){
  result <- list(
    # Aplicar el test de kruskal a can gen ~ grupo
    # Guardar el p-valor
    kruskal.test(as.formula(paste(gen, "~ MSI_MSS")), dataset)$p.value,
    kruskal.test(as.formula(paste(gen, "~ CMS")), dataset)$p.value,
    kruskal.test(as.formula(paste(gen, "~ Estadío")), dataset)$p.value,
    kruskal.test(as.formula(paste(gen, "~ Grupo")), dataset)$p.value,
    kruskal.test(as.formula(paste(gen, "~ Dataset")), dataset)$p.value
  )
  # Convertir el resultado en vector
  result <- unlist(result)
}

# Aplicar la función en bucle a cada gen
# Ir añadiendo los resultados a la tabla
for (i in 1:length(genes)) {
  gen <- genes[i]
  p_values <- kruskal(aux, gen)
 kruskal_sig[, i] <- p_values
}

# Sustituir los valores de la tabla por NS, *, ** o *** 
 kruskal_sig <- apply(kruskal_sig, c(1, 2), function(p)  {
    if (p > 0.05) {
      return("NS")
    } else if (p <= 0.05 && p > 0.01) {
      return("*") 
    } else if (p <= 0.01 && p > 0.001) {
      return("**")  
    } else {
      return("***")
    }
  })
 
as.data.frame(kruskal_sig)
        HLA_A HLA_B HLA_C HLA_E HLA_F HLA_G HLA_DPA1 HLA_DPB1 HLA_DPB2 HLA_DQA1
MSI_MSS     *   ***   ***   ***   ***   ***      ***      ***      ***      ***
CMS       ***   ***   ***   ***   ***   ***      ***      ***      ***      ***
Estadío    NS    NS    NS    NS    NS    NS       NS       NS       NS       NS
Grupo      NS    **   ***   ***   ***   ***      ***      ***      ***      ***
Dataset    NS    NS    NS    NS    NS    NS       NS       NS        *       NS
        HLA_DQA2 HLA_DQB1 HLA_DRA HLA_DRB1 HLA_DRB5 HLA_DOA HLA_DOB HLA_DMA
MSI_MSS      ***      ***     ***      ***      ***     ***     ***     ***
CMS          ***      ***     ***      ***      ***     ***     ***     ***
Estadío       NS       NS      NS       NS       NS      NS      NS      NS
Grupo        ***      ***     ***      ***      ***     ***     ***     ***
Dataset       NS       NS      NS       NS       NS      NS      NS      NS
        HLA_DMB
MSI_MSS     ***
CMS         ***
Estadío      NS
Grupo       ***
Dataset      NS
# write.csv(t(as.data.frame(kruskal_sig)), "kruskal.csv", row.names = TRUE)
# Función comparación medias 2 a 2

comparaciones_significativas <- function(dataset, gen, grupo) {
  # Obtener los niveles del grupo
  dataset <- dataset %>% filter(!is.na(.data[[grupo]]), !is.na(.data[[gen]]))
  niveles_grupo <- unique(dataset[[grupo]])
  # Generar todas las combinaciones posibles de pares de niveles
  pares_niveles <- combn(niveles_grupo, 2, simplify = FALSE)
  # Inicializar la lista para almacenar comparaciones significativas
  comparisons_ <- list()
  # Realizar la comparación para cada par de niveles
  for (par in pares_niveles) {
    # Filtrar el dataset para solo considerar los dos niveles comparados
    dataset_filtrado <- dataset %>% filter(.data[[grupo]] %in% par)
    # Realizar la comparación
    resultado <- 
      compare_means(as.formula(paste(gen, "~", grupo)), dataset_filtrado)
    # Verificar si la comparación es significativa
    if (any(resultado$p.adj < 0.05)) {
      # Si es significativa, añadir a la lista
      comparisons_[[length(comparisons_) + 1]] <- par
    }
  }
  # Crear el dataset comparisons_ que se añade al Environment
  nombre_variable <- paste0("comparisons_", gen, "_", grupo)
  assign(nombre_variable, comparisons_, envir = .GlobalEnv)
}

# Bucle para generar las comparaciones significativos para cada gen/grupo
for (gen in genes){
  for (g in grupos_ag){
  comparaciones_significativas(aux, gen, g)
  }
}

# Los pares de compraciones significativas que se han generado los uso en el boxplot para indicar para que parejas de valores quiero mostrar el p-valor
# Figuras 6 y 7
# Gráficas boxplot con comparación de medias

# Función boxplot
box_plot <- function(dataset, gen, grupo, my_comparisons) {
  # Filtrar los datos nulos
  ggplot(dataset %>% filter(!is.na(.data[[gen]]), !is.na(.data[[grupo]])),
         # Definir función y tipo de gráfico
         aes_string(x = grupo, y = gen)) + 
    geom_boxplot(color = "black", outlier.shape = NA) +
    geom_jitter(position = position_jitter(0.35), size = 1, 
                aes_string(color = grupo, shape = grupo)) +
    # Definir etiquetas de los ejes
    labs(y = paste0("log2(", gen, ")"), x = grupo) +
    # Diseño del gráfico
    theme_classic() +
    scale_color_manual(values = c("grey80", "black", "grey60", "grey30", "grey80", "black"), name = NULL) +
    scale_shape_manual(values = c(15, 16, 17, 18, 19, 20), name = NULL) +
    theme(plot.title = element_text(hjust = 0.5, size = 10, face = "bold"),
          panel.spacing = unit(1, "lines"),
          panel.grid.major = element_blank(),
          panel.grid.minor = element_blank(),
          panel.background = element_rect(colour = "white",
                                          fill = NA, size = 0.3),
          strip.background = element_rect(colour = "white", fill = "white")) +
    # Muestro la comparación de medias para las parejas preestablecidas
    stat_compare_means(comparisons = my_comparisons, size = 3.2) +
    stat_compare_means(method = "kruskal.test", label = "p.format",
                       size = 3.5, label.x = Inf, label.y = Inf,
                       hjust = 1, vjust = 1)
}

# Bucle para generar el gráfico para cada gen/grupo
for (gen in genes) {
  for (g in grupos_ag){
    comparisons_variable <- paste0("comparisons_", gen, "_", g)
    print(box_plot(aux, gen, g, get(comparisons_variable)))
  }
}

Inferencia de infiltración inmune

# Inferencia de la infiltración

# install.packages("devtools")
# devtools::install_github("cansysbio/ConsensusTME", force = TRUE)
consensus <- consensusTMEAnalysis(ex, cancer = "COAD", statMethod = "gsva")
Producing ConsensusTME Estimates Using The Following Parameters:
 Statistical Framework: "gsva"
 Gene Sets For Cancer Type: "COAD"
 Sample Size: 1079
consensus <- t(consensus)
consensus <- as.data.frame(consensus)
head(consensus)
           B_cells Cytotoxic_cells Dendritic_cells Eosinophils Macrophages
A2004_T  0.3846003       0.7574401      0.67385739   0.4963582   0.7454817
A2027_T -0.4675443      -0.7212254     -0.57181756  -0.5615878  -0.6001180
A2050_T -0.3017326      -0.3475906     -0.15946193  -0.2075509  -0.2973827
A2096_T -0.4533999      -0.5976738     -0.63175002  -0.6214394  -0.5710004
B2012_T -0.1011278       0.3055700     -0.01241025  -0.1185626  -0.1088708
B2035_T  0.1910087       0.5850891      0.46836995   0.4463827   0.5767133
         Mast_cells   NK_cells Neutrophils T_cells_CD4 T_cells_CD8
A2004_T  0.83617145  0.7196722  0.42038227   0.6765331   0.7075507
A2027_T -0.36807097 -0.3982661 -0.54971682  -0.4959341  -0.5454722
A2050_T -0.46779110 -0.2792276 -0.33565943  -0.2626395  -0.1483148
A2096_T -0.60028582 -0.5119227 -0.47687918  -0.6102899  -0.5970780
B2012_T  0.08368983  0.3687848 -0.09216623   0.2187569   0.2185119
B2035_T  0.40402857  0.4921594  0.63712235   0.4993724   0.4944613
        T_cells_gamma_delta T_regulatory_cells Macrophages_M1 Macrophages_M2
A2004_T          0.70307239         0.68086769     0.75651814      0.6721188
A2027_T         -0.63603256        -0.60666275    -0.65631656     -0.6724274
A2050_T         -0.02407792        -0.08748057    -0.09344834     -0.3061692
A2096_T         -0.61223525        -0.61304529    -0.62665252     -0.6257849
B2012_T          0.33289496         0.23500475     0.02736597     -0.2901599
B2035_T          0.56691249         0.55422403     0.63236466      0.5851547
        Endothelial Fibroblasts  Monocytes Plasma_cells Immune_Score
A2004_T   0.6324214   0.6673454  0.5419472   0.48748390   0.60808841
A2027_T  -0.6840517  -0.6200350 -0.5185551  -0.31564065  -0.52447000
A2050_T  -0.7017832  -0.5607488 -0.3403292  -0.41580284  -0.25857291
A2096_T  -0.5785607  -0.5363877 -0.5519135  -0.44922115  -0.54652336
B2012_T  -0.2501814  -0.6014534 -0.2075248   0.08238409   0.08995631
B2035_T  -0.3318157   0.5212461  0.5666761  -0.01377423   0.45110253
# Nuevo conjunto de datos uniendo esta información al anterior
consensus <- consensus %>%
  tibble::rownames_to_column(var = "ID")

merged_data <- aux %>%
  inner_join(consensus, by = "ID")
# Figura 10
# Boxplots para infiltración
variables <- c("B_cells", "Cytotoxic_cells", "Dendritic_cells", "Eosinophils",
               "Macrophages", "Mast_cells", "NK_cells", "Neutrophils",
               "T_cells_CD4", "T_cells_CD8", "T_cells_gamma_delta",
               "T_regulatory_cells", "Macrophages_M1", "Macrophages_M2",
               "Endothelial", "Fibroblasts", "Monocytes", "Plasma_cells")

# Bucle para generar los gráficos para cada grupo
for (g in grupos_ag) {
  data <- merged_data
  # Gestionar los valores faltantes de CMS
  if (g == "CMS") {
    data <- data %>% drop_na()  
  }
  # Función del gráfico
  plots <- lapply(variables, function(var) {
  ggplot(data, aes(x = .data[[g]], y = .data[[var]]))  + 
      # Tipo de gráfico y diseño
    geom_boxplot(color = "black", outlier.shape = NA) + 
    labs(y = paste(var), x = "") +
    theme_classic() +
    theme(plot.title = element_text(hjust = 0.5, size = 6, face = "bold"),
          axis.text.x = element_text(size = 5),
          axis.text.y = element_text(size = 6),
          axis.title.y = element_text(size = 8),
          panel.spacing = unit(1, "lines"),
          panel.grid.major = element_blank(),
          panel.grid.minor = element_blank(),
          panel.background = element_rect(colour = "white",
                                          fill = NA, size = 0.3),
          strip.background = element_rect(colour = "white", fill = "white")) +
      # Añadir comparaciones de medias
    stat_compare_means(method = "kruskal.test", label = "p.format", size = 3, 
                       label.x = Inf, label.y = Inf, hjust = 1, vjust = 1) 
})

grid.arrange(grobs = plots[1:9], ncol = 3, heights = rep(3, 3)) 
grid.arrange(grobs = plots[10:18], ncol = 3, heights = rep(3, 3)) 

}

# Figura 12
# Heatmap

# Genes y células de interés
genes <- c("HLA_A", "HLA_B", "HLA_C", "HLA_E", "HLA_F", "HLA_G", "HLA_DPA1", 
           "HLA_DPB1", "HLA_DPB2", "HLA_DQA1", "HLA_DQA2", "HLA_DQB1", "HLA_DRA", 
           "HLA_DRB1", "HLA_DRB5", "HLA_DOA", "HLA_DOB", "HLA_DMA", "HLA_DMB")

cell_types <- c("B_cells", "Cytotoxic_cells", "Dendritic_cells", "Eosinophils",
                "Macrophages", "Mast_cells", "NK_cells", "Neutrophils",
                "T_cells_CD4", "T_cells_CD8", "T_cells_gamma_delta",
                "T_regulatory_cells", "Macrophages_M1", "Macrophages_M2", 
                "Endothelial", "Fibroblasts", "Monocytes", "Plasma_cells")

# Crear las anotaciones
genes_data <- merged_data[, genes]
cells_data <- merged_data[, cell_types]
group_annotation <- 
  data.frame(Grupo = rep(c("II/MSS", "II/MSI","III/MSS", "III/MSI"), each = 19))
group_annotation$Gen <- rep(genes, times=4)
rownames(group_annotation) <-
  paste(group_annotation$Grupo, group_annotation$Gen, sep = "_")
labels <- group_annotation$Gen
group_annotation <- subset(group_annotation, select = -Gen)

# Matriz de correlación por grupos
cor_list <- list()
for (group in unique(merged_data$Grupo)){
  group_data <- merged_data[merged_data$Grupo == group,]
  genes_data_group <- group_data[, genes]
  cells_data_group <- group_data[, cell_types]
  
  cor_matrix_group <- cor(cells_data_group, genes_data_group, method="spearman")
  cor_list[[group]] <- cor_matrix_group
}

cor_matrix_all <- do.call(cbind, cor_list)
colnames(cor_matrix_all) <- rownames(group_annotation)


# Definir colores para los grupos
group_colors <- list(Grupo = c("II/MSS" = "#9E0142", 
                               "II/MSI" = "#5E4FA2", 
                               "III/MSS"= "#3288BD", 
                               "III/MSI"= "#CEB944"))
# Crear el heatmap
pheatmap(cor_matrix_all, 
         annotation_col = group_annotation, 
         annotation_colors = group_colors,
         labels_col = labels,
         main = "Heatmap HLA e infiltracion",
         color = colorRampPalette(c("blue", "white", "red"))(100),
         show_colnames = T, show_rownames = T, fontsize = 8,
         fontsize_legend = 4, border_color = NA)

# Nuevo conjunto de datos incluyendo la expresión de NCR1 y genes HLA clase I
# Divido la expresión en "Baja" y "Alta" dependiendo si es mayor o menor que la media

merged_dataNK <- merged_data %>%
  select(ID, NK_cells, MSI_MSS, CMS, Estadío, Grupo, Dataset, HLA_A, HLA_B,
         HLA_C, HLA_E, HLA_F, HLA_G, HLA_DPA1, HLA_DPB1, HLA_DPB2, HLA_DQA1,
         HLA_DQA2, HLA_DQB1, HLA_DRA, HLA_DRB1, HLA_DRB5, HLA_DOA,
         HLA_DOB, HLA_DMA, HLA_DMB)

merged_dataNK <- merged_dataNK %>%
  mutate(
    HLA_Ae = ifelse(HLA_A > mean(HLA_A, na.rm = TRUE), "Alta", "Baja"),
    HLA_Be = ifelse(HLA_B > mean(HLA_B, na.rm = TRUE), "Alta", "Baja"),
    HLA_Ce = ifelse(HLA_C > mean(HLA_C, na.rm = TRUE), "Alta", "Baja"),
    HLA_Ee = ifelse(HLA_E > mean(HLA_E, na.rm = TRUE), "Alta", "Baja"),
    HLA_Fe = ifelse(HLA_F > mean(HLA_F, na.rm = TRUE), "Alta", "Baja"),
    HLA_Ge = ifelse(HLA_G > mean(HLA_G, na.rm = TRUE), "Alta", "Baja"),
    HLA_DPA1e = ifelse(HLA_DPA1 > mean(HLA_DPA1, na.rm = TRUE), "Alta", "Baja"),
    HLA_DPB1e = ifelse(HLA_DPB1 > mean(HLA_DPB1, na.rm = TRUE), "Alta", "Baja"),
    HLA_DPB2e = ifelse(HLA_DPB2 > mean(HLA_DPB2, na.rm = TRUE), "Alta", "Baja"),
    HLA_DQA1e = ifelse(HLA_DQA1 > mean(HLA_DQA1, na.rm = TRUE), "Alta", "Baja"),
    HLA_DQA2e = ifelse(HLA_DQA2 > mean(HLA_DQA2, na.rm = TRUE), "Alta", "Baja"),
    HLA_DQB1e = ifelse(HLA_DQB1 > mean(HLA_DQB1, na.rm = TRUE), "Alta", "Baja"),
    HLA_DRAe  = ifelse(HLA_DRA  > mean(HLA_DRA,  na.rm = TRUE), "Alta", "Baja"),
    HLA_DRB1e = ifelse(HLA_DRB1 > mean(HLA_DRB1, na.rm = TRUE), "Alta", "Baja"),
    HLA_DRB5e = ifelse(HLA_DRB5 > mean(HLA_DRB5, na.rm = TRUE), "Alta", "Baja"),
    HLA_DOAe  = ifelse(HLA_DOA  > mean(HLA_DOA,  na.rm = TRUE), "Alta", "Baja"),
    HLA_DOBe  = ifelse(HLA_DOB  > mean(HLA_DOB,  na.rm = TRUE), "Alta", "Baja"),
    HLA_DMAe  = ifelse(HLA_DMA  > mean(HLA_DMA,  na.rm = TRUE), "Alta", "Baja"),
    HLA_DMBe  = ifelse(HLA_DMB  > mean(HLA_DMB,  na.rm = TRUE), "Alta", "Baja")
  )

NK <- data.frame(NCR1 = log2(ext[, "NCR1"]), ID = rownames(ext))

merged_dataNK <- merge(merged_dataNK, NK, by = "ID")
head(merged_dataNK)
       ID   NK_cells MSI_MSS  CMS Estadío         Grupo Dataset    HLA_A
1 A2004_T  0.7196722     MSS CMS4      II Stage II, MSS     CLX 13.63571
2 A2027_T -0.3982661     MSS CMS3      II Stage II, MSS     CLX 12.21874
3 A2050_T -0.2792276     MSS CMS2      II Stage II, MSS     CLX 13.03213
4 A2096_T -0.5119227     MSS CMS3      II Stage II, MSS     CLX 11.23669
5 B2012_T  0.3687848     MSS CMS3      II Stage II, MSS     CLX 11.46315
6 B2035_T  0.4921594     MSS CMS1      II Stage II, MSS     CLX 12.94721
     HLA_B    HLA_C     HLA_E     HLA_F     HLA_G  HLA_DPA1  HLA_DPB1 HLA_DPB2
1 13.23626 13.51612 10.891565 10.197729  9.433319 10.910054 11.474009 2.947866
2 12.09741 10.54568  9.553953  8.674593  9.184430  5.722075  8.253218 3.455365
3 12.75953 10.52239 10.531914  8.403979  9.055393  9.228111 10.353839 2.881734
4 10.57240 10.43151  8.799970  7.939218  8.607552  6.982154  8.927037 4.043707
5 12.48574 10.41478 10.439635 10.294109  9.292815  9.765838  8.348707 2.728756
6 13.59113 13.45148 10.906514  9.669588 10.563523 11.130406  9.839938 3.283621
  HLA_DQA1 HLA_DQA2  HLA_DQB1   HLA_DRA  HLA_DRB1 HLA_DRB5  HLA_DOA  HLA_DOB
1 7.525320 7.324660 10.475698 12.771633 11.516642 6.783054 8.248665 5.156801
2 6.433657 7.468530  4.411973  8.232406  5.384877 5.697542 4.706607 4.427860
3 6.109228 7.712724  8.297106 10.749566  9.491044 6.065309 5.974780 4.455425
4 6.103023 7.925025  7.513132  8.694044  7.166937 5.912588 5.076255 4.140544
5 5.826041 8.144257  9.524114 12.084706 10.263096 6.589685 5.339500 4.696205
6 6.507400 8.571823  5.184729 12.427694 11.266601 7.091718 6.775576 5.060046
    HLA_DMA   HLA_DMB HLA_Ae HLA_Be HLA_Ce HLA_Ee HLA_Fe HLA_Ge HLA_DPA1e
1 10.299078 10.292908   Alta   Alta   Alta   Alta   Alta   Baja      Alta
2  6.222437  6.430098   Baja   Baja   Baja   Baja   Baja   Baja      Baja
3  8.319735  8.251854   Alta   Alta   Baja   Baja   Baja   Baja      Alta
4  6.262953  6.751396   Baja   Baja   Baja   Baja   Baja   Baja      Baja
5  9.469835  9.138292   Baja   Baja   Baja   Baja   Alta   Baja      Alta
6 10.649061  9.397652   Alta   Alta   Alta   Alta   Alta   Alta      Alta
  HLA_DPB1e HLA_DPB2e HLA_DQA1e HLA_DQA2e HLA_DQB1e HLA_DRAe HLA_DRB1e
1      Alta      Baja      Alta      Baja      Alta     Alta      Alta
2      Baja      Alta      Alta      Baja      Baja     Baja      Baja
3      Alta      Baja      Baja      Baja      Alta     Baja      Alta
4      Baja      Alta      Baja      Alta      Alta     Baja      Baja
5      Baja      Baja      Baja      Alta      Alta     Alta      Alta
6      Alta      Alta      Alta      Alta      Baja     Alta      Alta
  HLA_DRB5e HLA_DOAe HLA_DOBe HLA_DMAe HLA_DMBe     NCR1
1      Alta     Alta     Alta     Alta     Alta 2.091668
2      Baja     Baja     Baja     Baja     Baja 1.384394
3      Baja     Baja     Baja     Baja     Alta 1.592652
4      Baja     Baja     Baja     Baja     Baja 1.467244
5      Baja     Baja     Baja     Alta     Alta 1.333561
6      Alta     Alta     Alta     Alta     Alta 1.777775
# Figura 14A
# Función para comparar la expresión de NCR1 para un gen específico en función de las categorías "Baja" y "Alta" mediante boxplot para cada Grupo 

NK_HLA <- function(dataset, gen){
  ggplot(dataset, aes(x = .data[[gen]], y = NCR1)) + 
  geom_boxplot(outlier.shape = NA, alpha = 0.6) + 
  geom_jitter(width = 0.2, height = 0, alpha = 0.5) +
  facet_wrap(~ Grupo) +
  stat_compare_means(method = "wilcox.test", label = "p.format", size = 3,
                     label.x = Inf, label.y = Inf, hjust = 1, vjust = 1) +
  theme_minimal() + 
  labs(
    title = paste("Comparación de la expresión de NCR1 entre las categorías de",
                  substr(gen, 1, nchar(gen) - 1)),
    x = NULL,
    y = "log2(NCR1)"
  ) +
  theme(legend.position = "none")
}

exp_HLA <- c("HLA_Ae", "HLA_Be", "HLA_Ce", "HLA_Ee", "HLA_Fe", "HLA_Ge",
             "HLA_DPA1e", "HLA_DPB1e", "HLA_DPB2e", "HLA_DQA1e", "HLA_DQA2e", 
             "HLA_DQB1e", "HLA_DRAe", "HLA_DRB1e", "HLA_DRB5e", "HLA_DOAe", 
             "HLA_DOBe", "HLA_DMAe", "HLA_DMBe")

for (gen in exp_HLA){
  print(NK_HLA(merged_dataNK, gen))
}

# Figura 14B
# Función para comparar la infiltración de NK para un gen específico en función de las categorías "Baja" y "Alta" mediante boxplot para cada Grupo 

NK_HLA <- function(dataset, gen){
  ggplot(dataset, aes(x = .data[[gen]], y = NK_cells)) + 
  geom_boxplot(outlier.shape = NA, alpha = 0.6) + 
  geom_jitter(width = 0.2, height = 0, alpha = 0.5) +
  facet_wrap(~ Grupo) +
  stat_compare_means(method = "wilcox.test", label = "p.format", size = 3,
                     label.x = Inf, label.y = Inf, hjust = 1, vjust = 1) +
  theme_minimal() + 
  labs(
    title = paste("Comparación de la expresión de NK_cells entre las categorías de", 
                  substr(gen, 1, nchar(gen) - 1)),
    x = NULL,
    y = "NK_cells"
  ) +
  theme(legend.position = "none")
}

for (gen in exp_HLA){
  print(NK_HLA(merged_dataNK, gen))
}