Anexo 2: Script cohorte TCGA

require(knitr)
require(ggplot2)
require(dplyr)
require(tidyr)
require(rlang)
require(ggpubr)
require(gridExtra)
require(devtools)
require(matrixStats)
require(ConsensusTME)
require(pheatmap)
require(caret)
require(class)
require(limma)
require(CMScaller)
require(openxlsx)
require(survival)
require(readxl)
require(xfun)
require(survminer)

setwd(params$wd)
load("tcga.coad.prim.RData")

Clasificación y exploración inicial de los datos

clin <- clin.tcga.coad

# Resumen variables clínicas
for (var in names(clin)[-c(1, 14, 15)]) {
  cat("\n###", var, "###\n")
  if (is.numeric(clin.tcga.coad[[var]])) {
    print(summary(clin.tcga.coad[[var]]))
  } else {
    print(table(clin.tcga.coad[[var]], useNA = "ifany"))
  }
}


### additional_studies ###

         TCGAFPPP 
     443       10 

### tumor_tissue_site ###

      Colon 
    1   452 

### histological_type ###

                                       Colon Adenocarcinoma 
                            5                           386 
Colon Mucinous Adenocarcinoma 
                           62 

### other_dx ###

                                              No 
                                             391 
                                             Yes 
                                              61 
Yes, History of Synchronous/Bilateral Malignancy 
                                               1 

### gender ###

FEMALE   MALE 
   214    239 

### vital_status ###

<NA> 
 453 

### days_to_birth ###
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
 -32873  -28472  -25243  -24660  -21526  -11391       2 

### days_to_last_known_alive ###
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
    0.0   429.8  1974.0  1928.1  3424.0  3920.0     443 

### days_to_death ###
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
    0.0    75.5   290.0   573.5   750.5  3042.0     398 

### days_to_last_followup ###
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
    0.0   457.0   761.0   946.4  1117.5  4270.0     106 

### race_list ###

                                 AMERICAN INDIAN OR ALASKA NATIVE 
                             172                                1 
                           ASIAN        BLACK OR AFRICAN AMERICAN 
                              11                               59 
                           WHITE 
                             210 

### tissue_source_site ###

 3L  4N  4T  5M  A6  AA  AD  AM  AU  AY  AZ  CA  CK  CM  D5  DM  F4  G4  NH  QG 
  1   1   1   3  51 172  13   2   2  10  20  10  12  37  31  25  16  27   9   5 
 QL  RU  SS  T9  WS 
  1   1   1   1   1 

### history_of_neoadjuvant_treatment ###

 No Yes 
450   3 

### informed_consent_verified ###

YES 
453 

### icd_o_3_site ###

C18.0 C18.2 C18.3 C18.4 C18.5 C18.6 C18.7 C18.9 C19.9 
   85   103    10    21     5    17   110    96     6 

### icd_o_3_histology ###

8010/3 8140/3 8255/3 8260/3 8480/3 8560/3 8574/3 
     1    384      1      2     63      1      1 

### icd_10 ###

C18.0 C18.2 C18.3 C18.4 C18.5 C18.6 C18.7 C18.9   C19 
   85   103    10    21     5    17   110    96     6 

### tissue_prospective_collection_indicator ###

     NO YES 
 12 279 162 

### tissue_retrospective_collection_indicator ###

     NO YES 
 12 162 279 

### days_to_initial_pathologic_diagnosis ###
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
      0       0       0       0       0       0 

### age_at_initial_pathologic_diagnosis ###
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  31.00   58.00   69.00   67.05   77.00   90.00 

### year_of_initial_pathologic_diagnosis ###
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
   1998    2007    2009    2008    2010    2013 

### person_neoplasm_cancer_status ###

           TUMOR FREE WITH TUMOR 
        52        197        204 

### ethnicity ###

                           HISPANIC OR LATINO NOT HISPANIC OR LATINO 
                   182                      4                    267 

### weight ###
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
  34.00   65.00   80.00   80.86   93.00  175.30     204 

### height ###
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
   80.3   162.0   170.0   168.4   176.0   193.0     221 

### day_of_form_completion ###
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
   1.00   15.00   18.00   16.85   19.00   31.00      25 

### month_of_form_completion ###
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  1.000   5.000   6.000   5.852   7.000  12.000 

### year_of_form_completion ###
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
   2010    2010    2011    2011    2011    2015 

### residual_tumor ###

     R0  R1  R2  RX 
 73 327   4  25  24 

### anatomic_neoplasm_subdivision ###

                            Ascending Colon                 Cecum 
                   20                    87                   107 
     Descending Colon       Hepatic Flexure Rectosigmoid Junction 
                   20                    26                     1 
        Sigmoid Colon       Splenic Flexure      Transverse Colon 
                  147                     7                    38 

### primary_lymph_node_presentation_assessment ###

     NO YES 
 10  15 428 

### lymph_node_examined_count ###
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
   0.00   14.00   20.00   22.46   27.50  109.00      22 

### number_of_lymphnodes_positive_by_he ###
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
  0.000   0.000   0.000   2.068   2.000  50.000      26 

### number_of_lymphnodes_positive_by_ihc ###
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
 0.0000  0.0000  0.0000  0.7083  0.0000 12.0000     405 

### preoperative_pretreatment_cea_level ###
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
   0.00    1.70    3.00   37.69    7.55 2614.00     168 

### non_nodal_tumor_deposits ###

     NO YES 
230 186  37 

### circumferential_resection_margin ###
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
   0.00    4.00   20.00   29.72   40.00  165.00     384 

### venous_invasion ###

     NO YES 
 59 298  96 

### lymphatic_invasion ###

     NO YES 
 42 247 164 

### perineural_invasion_present ###

     NO YES 
274 133  46 

### microsatellite_instability ###

     NO YES 
362  80  11 

### number_of_loci_tested ###
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
  0.000   5.000   5.000   4.925   5.000  10.000     400 

### number_of_abnormal_loci ###
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
 0.0000  0.0000  0.0000  0.7924  0.0000  9.0000     400 

### kras_gene_analysis_performed ###

     NO YES 
 36 369  48 

### kras_mutation_found ###

     NO YES 
407  24  22 

### kras_mutation_codon ###
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
  12.00   12.00   12.00   12.19   12.00   13.00     432 

### braf_gene_analysis_performed ###

     NO YES 
 49 376  28 

### braf_gene_analysis_result ###

         Abnormal   Normal 
     425        3       25 

### synchronous_colon_cancer_present ###

     NO YES 
 44 385  24 

### history_of_colon_polyps ###

     NO YES 
 68 248 137 

### colon_polyps_present ###

     NO YES 
229 144  80 

### loss_expression_of_mismatch_repair_proteins_by_ihc ###

     NO YES 
114 283  56 

### loss_expression_of_mismatch_repair_proteins_by_ihc_results ###

                                                                         
                                                                     395 
                                            MLH1-ExpressedMSH2-Expressed 
                                                                       1 
                              MLH1-ExpressedMSH2-ExpressedMSH6-Expressed 
                                                                       1 
                MLH1-ExpressedMSH2-ExpressedPMS2-ExpressedMSH6-Expressed 
                                                                      27 
            MLH1-ExpressedMSH2-ExpressedPMS2-Not ExpressedMSH6-Expressed 
                                                                       3 
        MLH1-ExpressedMSH2-ExpressedPMS2-Not ExpressedMSH6-Not Expressed 
                                                                       1 
                      MLH1-ExpressedMSH2-Not ExpressedMSH6-Not Expressed 
                                                                       1 
        MLH1-ExpressedMSH2-Not ExpressedPMS2-ExpressedMSH6-Not Expressed 
                                                                       1 
                                            MLH1-ExpressedPMS2-Expressed 
                                                                       1 
        MLH1-Not ExpressedMSH2-ExpressedPMS2-Not ExpressedMSH6-Expressed 
                                                                       6 
                  MLH1-Not ExpressedMSH2-Not ExpressedMSH6-Not Expressed 
                                                                       1 
                  MLH1-Not ExpressedMSH2-Not ExpressedPMS2-Not Expressed 
                                                                       1 
MLH1-Not ExpressedMSH2-Not ExpressedPMS2-Not ExpressedMSH6-Not Expressed 
                                                                      14 

### number_of_first_degree_relatives_with_cancer_diagnosis ###
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
 0.0000  0.0000  0.0000  0.1825  0.0000  3.0000      64 

### radiation_therapy ###

     NO 
419  34 

### postoperative_rx_tx ###

     NO YES 
419  16  18 

### primary_therapy_outcome_success ###

                            Complete Remission/Response 
                        426                          17 
 Partial Remission/Response         Progressive Disease 
                          3                           6 
             Stable Disease 
                          1 

### has_new_tumor_events_information ###

 NO YES 
364  89 

### has_drugs_information ###

 NO YES 
302 151 

### has_radiations_information ###

 NO YES 
438  15 

### has_follow_ups_information ###

 NO YES 
 36 417 

### project ###

TCGA-COAD 
      453 

### stage_event_system_version ###

    2nd 5th 6th 7th 
 53   1  63 204 132 

### stage_event_clinical_stage ###

<NA> 
 453 

### stage_event_pathologic_stage ###

              Stage I   Stage IA   Stage II  Stage IIA  Stage IIB  Stage IIC 
        11         74          1         30        136         10          1 
 Stage III Stage IIIA Stage IIIB Stage IIIC   Stage IV  Stage IVA  Stage IVB 
        19          8         58         41         46         16          2 

### stage_event_tnm_categories ###

     T1N0    T1N0M0    T1N0MX   T1N1bM0      T2N0    T2N0M0    T2N0MX   T2N1aM0 
        1         8         1         1         1        59         7         1 
   T2N1M0    T2N1MX   T2N2aM0      T3N0    T3N0M0    T3N0M1   T3N0M1a   T3N0M1b 
        6         2         1         2       151         4         1         1 
   T3N0MX      T3N1   T3N1aM0   T3N1aM1  T3N1aM1a   T3N1aMX   T3N1bM0   T3N1bMX 
       15         1         6         1         2         2         7         2 
  T3N1cM0    T3N1M0    T3N1M1    T3N1MX   T3N2aM0   T3N2aM1   T3N2aMX     T3N2b 
        1        34        12         5         3         1         1         1 
  T3N2bM0  T3N2bM1b   T3N2bMX    T3N2M0    T3N2M1    T3N2MX   T4aN0M0  T4aN0M1a 
        3         1         3        29        17         3         1         1 
  T4aN0MX  T4aN1aM0 T4aN1aM1a  T4aN1bM0 T4aN1bM1a T4aN1cM1a   T4aN1M0   T4aN1M1 
        1         1         1         1         1         1         1         1 
  T4aN1MX  T4aN2aMX    T4aN2b  T4aN2bM0  T4aN2bM1 T4aN2bM1a  T4aN2bMX   T4aN2M1 
        1         2         1         1         1         1         1         1 
  T4bN0M0   T4bN0MX  T4bN1aMX T4bN1bM1b  T4bN1bMX   T4bN1M1   T4bN2M0    T4N0M0 
        2         1         1         1         1         1         2         8 
   T4N0M1  T4N1bM1a    T4N1M0    T4N1M1    T4N2M0    T4N2M1   TisN0MX 
        1         1         4         3         1         9         1 

### stage_event_psa ###

<NA> 
 453 

### stage_event_gleason_grading ###

<NA> 
 453 

### stage_event_ann_arbor ###

<NA> 
 453 

### stage_event_serum_markers ###

<NA> 
 453 

### stage_event_igcccg_stage ###

<NA> 
 453 

### stage_event_masaoka_stage ###

<NA> 
 453

# Modelo de clasificación de pacientes según el estado MSI/MSS: "up"
# Filtrar pacientes con estado MSI/MSS conocido
clin_known <- clin %>% 
  filter(microsatellite_instability %in% c("YES", "NO"))

# Submatriz de expresión para esos pacientes
expr_known <- 
  ex.tcga.coad[, colnames(ex.tcga.coad) %in% clin_known$bcr_patient_barcode]
expr_known <- 
  expr_known[, match(clin_known$bcr_patient_barcode, colnames(expr_known))]

# Seleccionar los 50 genes mas significativos, menor p-valor
group <- factor(clin_known$microsatellite_instability, levels = c("NO", "YES"))
design <- model.matrix(~ group)
fit <- lmFit(expr_known, design)
fit <- eBayes(fit)

top_genes <- topTable(fit, number = 50, sort.by = "p") %>% rownames()

X_train <- t(expr_known[rownames(expr_known) %in% top_genes, ])
y_train <- group

# Entrenar modelo kNN sampling = "up"

set.seed(2025)

ctrl <- trainControl(method = "cv", number = 10, sampling = "up")
modelo_knn <- train(
  X_train, y_train,
  method = "knn",
  tuneGrid = data.frame(k = 5),
  trControl = ctrl
)
modelo_knn

k-Nearest Neighbors 

91 samples
50 predictors
 2 classes: 'NO', 'YES' 

No pre-processing
Resampling: Cross-Validated (10 fold) 
Summary of sample sizes: 81, 82, 82, 82, 82, 82, ... 
Addtional sampling using up-sampling

Resampling results:

  Accuracy   Kappa
  0.9666667  0.84 

Tuning parameter 'k' was held constant at a value of 5

# Aplicar modelo

clin_missing <- clin %>%
  filter(!(microsatellite_instability %in% c("YES", "NO")))

expr_missing <- 
  ex.tcga.coad[, colnames(ex.tcga.coad) %in% clin_missing$bcr_patient_barcode]
expr_missing <- expr_missing[rownames(expr_missing) %in% top_genes, ]
X_missing <- t(expr_missing)

# Sensibilidad y especificidad

predicciones_up <- predict(modelo_knn, newdata = X_train)
cm_up <- confusionMatrix(predicciones_up, y_train, positive = "YES")
cm_up

Confusion Matrix and Statistics

          Reference
Prediction NO YES
       NO  78   0
       YES  2  11
                                          
               Accuracy : 0.978           
                 95% CI : (0.9229, 0.9973)
    No Information Rate : 0.8791          
    P-Value [Acc > NIR] : 0.0007365       
                                          
                  Kappa : 0.9041          
                                          
 Mcnemar's Test P-Value : 0.4795001       
                                          
            Sensitivity : 1.0000          
            Specificity : 0.9750          
         Pos Pred Value : 0.8462          
         Neg Pred Value : 1.0000          
             Prevalence : 0.1209          
         Detection Rate : 0.1209          
   Detection Prevalence : 0.1429          
      Balanced Accuracy : 0.9875          
                                          
       'Positive' Class : YES

# Predicción
predicted_labels_up <- as.character(predict(modelo_knn, X_missing))

# Modelo de clasificación de pacientes según el estado MSI/MSS: "down"
# Figura 3A
# Entrenar modelo kNN sampling = "down"

set.seed(2025)

ctrl <- trainControl(method = "cv", number = 10, sampling = "down")
modelo_knn <- train(
  X_train, y_train,
  method = "knn",
  tuneGrid = data.frame(k = 5),
  trControl = ctrl
)
modelo_knn

k-Nearest Neighbors 

91 samples
50 predictors
 2 classes: 'NO', 'YES' 

No pre-processing
Resampling: Cross-Validated (10 fold) 
Summary of sample sizes: 81, 82, 82, 82, 82, 82, ... 
Addtional sampling using down-sampling

Resampling results:

  Accuracy   Kappa    
  0.9666667  0.7608696

Tuning parameter 'k' was held constant at a value of 5

# Sensibilidad y especificidad

predicciones_down <- predict(modelo_knn, newdata = X_train)
cm_down <- confusionMatrix(predicciones_down, y_train, positive = "YES")
cm_down

Confusion Matrix and Statistics

          Reference
Prediction NO YES
       NO  80   1
       YES  0  10
                                          
               Accuracy : 0.989           
                 95% CI : (0.9403, 0.9997)
    No Information Rate : 0.8791          
    P-Value [Acc > NIR] : 0.0001094       
                                          
                  Kappa : 0.9462          
                                          
 Mcnemar's Test P-Value : 1.0000000       
                                          
            Sensitivity : 0.9091          
            Specificity : 1.0000          
         Pos Pred Value : 1.0000          
         Neg Pred Value : 0.9877          
             Prevalence : 0.1209          
         Detection Rate : 0.1099          
   Detection Prevalence : 0.1099          
      Balanced Accuracy : 0.9545          
                                          
       'Positive' Class : YES

# Predicción
predicted_labels_down <- as.character(predict(modelo_knn, X_missing))

# Figura 3C
# sampling = "up"

# Combinar datos de expresión
X_all <- rbind(X_train, X_missing)

# Obtener etiquetas combinadas
labels_all <- c(as.character(y_train), predicted_labels_up)

# Indicar si es conocido o predicho
known_status <- c(rep("Conocido", length(y_train)),
                  rep("Predecido", length(predicted_labels_up)))

# PCA
pca <- prcomp(X_all, scale. = TRUE)
pca_df <- data.frame(
  PC1 = pca$x[, 1],
  PC2 = pca$x[, 2],
  MSI_MSS = labels_all,
  Valor = known_status
)

# Convertir a factores con niveles bien definidos
pca_df$MSI_MSS <- factor(pca_df$MSI_MSS, levels = c("NO", "YES"))
pca_df$Valor <- factor(pca_df$Valor, levels = c("Conocido", "Predecido"))

# Visualización
library(ggplot2)

ggplot(pca_df, aes(x = PC1, y = PC2, color = MSI_MSS, shape = Valor)) +
  geom_point(size = 3, alpha = 0.8) +
  scale_color_manual(values = c("NO" = "lightblue", "YES" = "lightpink")) +
  scale_shape_manual(values = c("Conocido" = 17, "Predecido" = 16)) +
  theme_minimal() +
  labs(
    title = "Sampling = up",
    x = "PC1",
    y = "PC2",
    color = "MSI/MSS",
    shape = "Valor MSI/MSS"
  )

# Figura 3B
# sampling = "down"

# Combinar datos de expresión
X_all <- rbind(X_train, X_missing)

# Obtener etiquetas combinadas
labels_all <- c(as.character(y_train), predicted_labels_down)

# Indicar si es conocido o predicho
known_status <- c(rep("Conocido", length(y_train)),
                  rep("Predecido", length(predicted_labels_down)))

# PCA
pca <- prcomp(X_all, scale. = TRUE)
pca_df <- data.frame(
  PC1 = pca$x[, 1],
  PC2 = pca$x[, 2],
  MSI_MSS = labels_all,
  Valor = known_status
)

# Convertir a factores con niveles bien definidos
pca_df$MSI_MSS <- factor(pca_df$MSI_MSS, levels = c("NO", "YES"))
pca_df$Valor <- factor(pca_df$Valor, levels = c("Conocido", "Predecido"))

# Visualización
library(ggplot2)

ggplot(pca_df, aes(x = PC1, y = PC2, color = MSI_MSS, shape = Valor)) +
  geom_point(size = 3, alpha = 0.8) +
  scale_color_manual(values = c("NO" = "lightblue", "YES" = "lightpink")) +
  scale_shape_manual(values = c("Conocido" = 17, "Predecido" = 16)) +
  theme_minimal() +
  labs(
    title = "Sampling = down",
    x = "PC1",
    y = "PC2",
    color = "MSI/MSS",
    shape = "Valor MSI/MSS"
  )

# Guardar predicciones del modelo con sampling = "down"
clin$microsatellite_instability_pred<- ifelse(
  clin$bcr_patient_barcode %in% rownames(X_missing), predicted_labels_down,
  clin$microsatellite_instability
)
clin$microsatellite_instability_pred[clin$microsatellite_instability_pred == 2] <- "NO"
clin$microsatellite_instability_pred[clin$microsatellite_instability_pred == 3] <- "YES"

# Creación variable Estadío (I, II, II, IV)
clin <- clin %>%
  mutate(Estadío = case_when(
    stage_event_pathologic_stage %in% c("Stage I", "Stage IA") ~ "I",
    stage_event_pathologic_stage %in% c("Stage II", "Stage IIA", "Stage IIB", "Stage IIC") ~ "II",
    stage_event_pathologic_stage %in% c("Stage III", "Stage IIIA", "Stage IIIB", "Stage IIIC") ~ "III",
    stage_event_pathologic_stage %in% c("Stage IV", "Stage IVA", "Stage IVB") ~ "IV",
    stage_event_pathologic_stage ==  "" ~ NA_character_))

# Creación variable Grupo
clin <- clin %>%
   mutate(Grupo = case_when(
    Estadío != "" & microsatellite_instability_pred == "YES" ~ paste(Estadío, "MSI"),
    Estadío != "" & microsatellite_instability_pred == "NO"  ~ paste(Estadío, "MSS")))

# Figura 4
# Inferencia subtipos CMS
# https://github.com/peterawe/CMScaller
# devtools::install_github("Lothelab/CMScaller")

ex.tcga.coad2 <- 
  replaceGeneId(ex.tcga.coad, id.in = "symbol", id.out = "entrez")
cms_result <- CMScaller(ex.tcga.coad2, RNAseq = TRUE)


CMS1 CMS2 CMS3 CMS4 <NA> 
  78  121   66  139   49

clin$CMS <- 
  cms_result$prediction[match(clin$bcr_patient_barcode, rownames(cms_result))]

# Figura 5A
# Gráfico quesitos género

genero_table <- table(clin$gender)
percent_labels <- 
  paste0(c("Femenino", "Masculino"), "\n",
         round(100 * genero_table / sum(genero_table), 1), "%")
pie(genero_table,
    main = "Género",
    labels = percent_labels,
    col = rainbow(2),
    cex = 0.9,
    cex.main = 0.9)

# Figura 5B
# Gráfico quesitos raza
raza_table <- table(clin$race_list)
percent_labels <- 
  paste0(c("America, India, Alaska", "Asia", "Negro o África", "Blanco"), "\n",
                         round(100 * raza_table / sum(raza_table), 1), "%")
pie(table(clin$race_list),
    main = "Raza",
    labels = percent_labels,
    col = rainbow(5),
    cex = 0.9,
    cex.main = 0.9)

# Figura 5D
# Gráfico quesitos edad diagnóstico

edad_cut <- cut(clin$age_at_initial_pathologic_diagnosis, 
                breaks = 5, 
                include.lowest = TRUE)
edad_table <- table(edad_cut)

percent_labels <- paste0(names(edad_table),
                         "\n", round(100 * edad_table / sum(edad_table), 1),
                         "%")
pie(edad_table,
    labels = percent_labels,
    main = "Edad de diagnóstico",
    col = rainbow(5),
    cex = 0.9,
    cex.main = 0.9)

# Figura 5C
# Gráfico quesitos edad

edad <- -clin$days_to_birth / 365
edad_cut <- cut(edad, 
                breaks = 5, 
                include.lowest = TRUE)
edad_table <- table(edad_cut)

percent_labels <- 
  paste0(names(edad_table), "\n",
         round(100 * edad_table / sum(edad_table), 1), "%")
pie(edad_table,
    labels = percent_labels,
    main = "Edad",
    col = rainbow(5),
    cex = 0.9,
    cex.main = 0.9)

Análisis descriptivo y comparación de HLA-I y HLA-II

# Filtrar la matriz de expresión por los genes que contienen "HLA"
hla <- 
  as.data.frame(t(ex.tcga.coad)[, rownames(ex.tcga.coad) %in% c("HLA-A", "HLA-B", "HLA-C", "HLA-E", "HLA-F", "HLA-G", 
                                                                "HLA-DPA1", "HLA-DPB1", "HLA-DPB2", "HLA-DQA1", "HLA-DQA2",
                                                                "HLA-DQB1", "HLA-DRA", "HLA-DRB1", "HLA-DRB5", "HLA-DOA", 
                                                                "HLA-DOB", "HLA-DMA", "HLA-DMB")])
colnames(hla) <- gsub("-", "_", colnames(hla))

# Añadir la columna ID al dataframe
colnames(clin)[colnames(clin) == "bcr_patient_barcode"] <- "ID"
hla$ID <- rownames(hla)

# Unir dataframes de datos clínicos y expresión de HLA por la columna ID
aux <- merge(hla, clin, by = "ID")

# Cambiar nombres columnas
colnames(aux)[colnames(aux) == "microsatellite_instability_pred"] <- "MSI_MSS"
aux$MSI_MSS[aux$MSI_MSS == "NO"] <- "MSS"
aux$MSI_MSS[aux$MSI_MSS == "YES"] <- "MSI"

# Figura 2B
# Gráfico de barras para la distribución de las muestras en los distintos grupos

aux %>%
  select(MSI_MSS, CMS, Estadío, Grupo) %>%
  gather(key = "variable", value = "value") %>%
  bind_rows(tibble(variable = "Dataset", value = "TCGA")) %>% 
  # Agrupación por variable y valor para contar frecuencias
  group_by(variable, value) %>%
  # Resumen de los datos / conteo
  summarise(count = n(), .groups = "drop") %>%
  # Agrupación por variable para calcular porcentajes dentro de cada grupo
  group_by(variable) %>%
  # Calcular porcentaje y ordenar factores
  mutate(percent = count / sum(count) * 100,
         variable = factor(variable, levels = c("MSI_MSS", "CMS", "Estadío", "Grupo", "Dataset")),
         value = factor(value, levels = unique(value))) %>%
  # Gráfico de barras
  ggplot(aes(x = variable, y = percent, fill = value)) +
  geom_bar(stat = "identity") +
  # Personalizar colores, leyenda, ejes y título
  scale_fill_manual(values = c("#FFFACD", "#FFD700", "#FFA07A", "#FF6347", "#FFB6C1","#FF6281", "#FF69B4", "#FF1493","#DDA0DD", "#191970", "#4169E1", "#1E90FF",
                               "#6495ED", "#4682B4", "#87CEEB", "#87CEFA", "#00BFFF", "#006400", "#9ACD32" ),
                    guide = guide_legend(keyheight = 0.8, keywidth = 0.8)) +
  scale_x_discrete(labels = c("MSI_MSS" = "MSI/MSS",
                              "Dataset" = "Conjunto de Datos")) +
  labs(title = "Distribución porcentual de las muestras",
       x = "",
       y = "Porcentaje (%)",
       fill = "") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Medias HLA para cada gen/grupo

# Listas de genes y grupos con los que trabajaré
genes <- c("HLA_A", "HLA_B", "HLA_C", "HLA_E", "HLA_F", "HLA_G", "HLA_DPA1", 
           "HLA_DPB1", "HLA_DPB2", "HLA_DQA1", "HLA_DQB1", "HLA_DRA", 
           "HLA_DRB1", "HLA_DRB5", "HLA_DOA", "HLA_DOB", "HLA_DMA", "HLA_DMB")
grupos <- c("MSI", "MSS", "CMS1", "CMS2", "CMS3", "CMS4", "CMS_NA",  "I", "II",
            "III", "IV","Estadío_NA", "I MSI", "I MSS", "II MSI", "II MSS",
            "III MSI", "III MSS", "IV MSI", "IV MSS", "Grupo_NA")

# Crear dataframe vacío para almacenar las medias
medias <- data.frame(matrix(ncol = length(genes), nrow = length(grupos)))
colnames(medias) <- genes
rownames(medias) <- grupos

# Función para calcular la media de cada gen en los diferentes grupos
summary_groups <- function(dataset, gen) {
  result <- list(
    MSI_MSS = dataset %>%
      group_by(MSI_MSS) %>%
      summarise(mean = mean(!!sym(gen))),
    # Gestionar NAs en CMS
    CMS = dataset %>%
      group_by(CMS) %>%
      summarise(mean = mean(!!sym(gen), na.rm = TRUE)),
    
    Estadío = dataset %>%
      group_by(Estadío) %>%
      summarise(mean = mean(!!sym(gen), na.rm = TRUE)),
    
    Grupo = dataset %>%
      group_by(Grupo) %>%
      summarise(mean = mean(!!sym(gen), na.rm = TRUE))
  )
  
  means_values <- unlist(lapply(result, function(x) x$mean))
}

# Aplicar la función en bucle a cada gen HLA
for (i in 1:length(genes)) {
  gen <- genes[i]
  means_values <- summary_groups(aux, gen)
  medias[, i] <- means_values
}

# write.csv(round(medias,2), "medias.csv", row.names = TRUE)

# Tabla 2
# Kruskal para cada gen/grupo

# Grupos de interés
grupos_ag <- c("MSI_MSS", "CMS", "Estadío", "Grupo")

# Dataframe para guardar resultados
kruskal_sig <- data.frame(matrix(ncol = length(genes), nrow = 4))
colnames(kruskal_sig) <- genes
rownames(kruskal_sig) <- grupos_ag

# Función kruskal
kruskal <- function(dataset, gen){
  result <- list(
    # Aplicar el test de kruskal a can gen ~ grupo
    # Guardar el p-valor
    kruskal.test(as.formula(paste(gen, "~ MSI_MSS")), dataset)$p.value,
    kruskal.test(as.formula(paste(gen, "~ CMS")), dataset)$p.value,
    kruskal.test(as.formula(paste(gen, "~ Estadío")), dataset)$p.value,
    kruskal.test(as.formula(paste(gen, "~ Grupo")), dataset)$p.value
  )
  # Convertir el resultado en vector
  result <- unlist(result)
}

# Aplicar la función en bucle a cada gen
# Ir añadiendo los resultados a la tabla
for (i in 1:length(genes)) {
  gen <- genes[i]
  p_values <- kruskal(aux, gen)
 kruskal_sig[, i] <- p_values
}

# Sustituir los valores de la tabla por NS, *, ** o *** 
 kruskal_sig <- apply(kruskal_sig, c(1, 2), function(p)  {
    if (p > 0.05) {
      return("NS")
    } else if (p <= 0.05 && p > 0.01) {
      return("*") 
    } else if (p <= 0.01 && p > 0.001) {
      return("**")  
    } else {
      return("***")
    }
  })
 
as.data.frame(kruskal_sig)

        HLA_A HLA_B HLA_C HLA_E HLA_F HLA_G HLA_DPA1 HLA_DPB1 HLA_DPB2 HLA_DQA1
MSI_MSS    NS    NS    NS    NS    NS    NS        *       NS       NS       NS
CMS         *   ***   ***   ***   ***    NS      ***      ***      ***      ***
Estadío    NS    NS    NS    NS    NS    NS       NS       NS       NS       NS
Grupo      NS    NS    NS    NS    NS    NS        *        *       NS        *
        HLA_DQB1 HLA_DRA HLA_DRB1 HLA_DRB5 HLA_DOA HLA_DOB HLA_DMA HLA_DMB
MSI_MSS       NS       *       **        *      NS      NS       *      NS
CMS          ***     ***      ***      ***     ***     ***     ***     ***
Estadío       NS      **        *        *      NS      NS       *       *
Grupo          *     ***      ***       **      NS      NS      **      **

# write.csv(t(as.data.frame(kruskal_sig)), "kruskal.csv", row.names = TRUE)

# Función comparación medias 2 a 2

comparaciones_significativas <- function(dataset, gen, grupo) {
  # Obtener los niveles del grupo
  dataset <- dataset %>% filter(!is.na(.data[[grupo]]), !is.na(.data[[gen]]))
  niveles_grupo <- unique(dataset[[grupo]])
  # Generar todas las combinaciones posibles de pares de niveles
  pares_niveles <- combn(niveles_grupo, 2, simplify = FALSE)
  # Inicializar la lista para almacenar comparaciones significativas
  comparisons_ <- list()
  # Realizar la comparación para cada par de niveles
  for (par in pares_niveles) {
    # Filtrar el dataset para solo considerar los dos niveles comparados
    dataset_filtrado <- dataset %>% filter(.data[[grupo]] %in% par)
    # Realizar la comparación
    resultado <- compare_means(as.formula(paste(gen, "~", grupo)),
                               dataset_filtrado)
    # Verificar si la comparación es significativa
    if (any(resultado$p.adj < 0.05)) {
      # Si es significativa, añadir a la lista
      comparisons_[[length(comparisons_) + 1]] <- par
    }
  }
  # Crear el dataset comparisons_ que se añade al Enviroment
  nombre_variable <- paste0("comparisons_", gen, "_", grupo)
  assign(nombre_variable, comparisons_, envir = .GlobalEnv)
}

# Bucle para generar las comparaciones significativos para cada gen/grupo
for (gen in genes){
  for (g in grupos_ag){
  comparaciones_significativas(aux, gen, g)
  }
}

# Los pares de compraciones significativas que se han generado los uso en el boxplot para indicar para que parejas de valores quiero mostrar el p-valor

# Figura 8 Y 9
# Gráficas boxplot con comparación de medias

# Función boxplot
box_plot <- function(dataset, gen, grupo, my_comparisons) {
  # Filtrar los datos nulos
  ggplot(dataset %>% filter(!is.na(.data[[gen]]), !is.na(.data[[grupo]])),
         # Definir función y tipo de gráfico
         aes_string(x = grupo, y = gen)) + 
    geom_boxplot(color = "black", outlier.shape = NA) +
    geom_jitter(position = position_jitter(0.35), size = 1,
                aes_string(color = grupo, shape = grupo)) +
    # Definir etiquetas de los ejes
    labs(y = paste0("log2(", gen, ")"), x = grupo) +
    # Diseño del gráfico
    theme_classic() +
    scale_color_manual(values = c("grey80", "black", "grey60", "grey30",
                                  "grey80", "black", "grey60", "grey30"),
                       name = NULL) +
    scale_shape_manual(values = c(15, 16, 17, 18, 19, 20, 21, 22), name = NULL) +
    theme(plot.title = element_text(hjust = 0.5, size = 10, face = "bold"),
          panel.spacing = unit(1, "lines"),
          panel.grid.major = element_blank(),
          panel.grid.minor = element_blank(),
          panel.background = element_rect(colour = "white", fill = NA,
                                          size = 0.3),
          strip.background = element_rect(colour = "white", fill = "white")) +
    # Muestro la comparación de medias para las parejas preestablecidas
    stat_compare_means(comparisons = my_comparisons, size = 3.2) +
    stat_compare_means(method = "kruskal.test", label = "p.format", size = 3.5,
                       label.x = Inf, label.y = Inf, hjust = 1, vjust = 1)
}

# Bucle para generar el gráfico para cada gen/grupo
for (gen in genes) {
  for (g in grupos_ag){
    # comparisons_ depende del gen 
    comparisons_variable <- paste0("comparisons_", gen, "_", g)
    print(box_plot(aux, gen, g, get(comparisons_variable)))
  }
}

Inferencia de infiltración inmune

# Inferencia de la infiltración

# install.packages("devtools")
# devtools::install_github("cansysbio/ConsensusTME", force = TRUE)
consensus <- consensusTMEAnalysis(as.matrix(ex.tcga.coad), cancer = "COAD",
                                  statMethod = "gsva")

Producing ConsensusTME Estimates Using The Following Parameters:
 Statistical Framework: "gsva"
 Gene Sets For Cancer Type: "COAD"
 Sample Size: 453

consensus <- t(consensus)
consensus <- as.data.frame(consensus)
head(consensus)

                B_cells Cytotoxic_cells Dendritic_cells Eosinophils Macrophages
TCGA-AA-3854 -0.4803717      -0.5349437      -0.5303677  -0.5495252  -0.5019391
TCGA-QG-A5YW -0.3260441      -0.5005492      -0.3621965  -0.4450782  -0.4844821
TCGA-AA-3966  0.1942019       0.1978167       0.5275622   0.4570357   0.5420140
TCGA-G4-6626 -0.6913118      -0.7068806      -0.7105103  -0.7488931  -0.7322518
TCGA-CM-5868 -0.7431985      -0.7839818      -0.6869500  -0.7214234  -0.5813385
TCGA-AA-3973 -0.5550090      -0.5680799      -0.6612286  -0.7171094  -0.6431824
             Mast_cells   NK_cells Neutrophils T_cells_CD4 T_cells_CD8
TCGA-AA-3854 -0.6313005 -0.4848965  -0.3555158  -0.5997161  -0.5869838
TCGA-QG-A5YW -0.1522690 -0.3438767  -0.5377830  -0.2895701  -0.2996378
TCGA-AA-3966  0.4806968  0.2144912   0.6500641   0.3058948   0.2905591
TCGA-G4-6626 -0.6512359 -0.5892358  -0.7614573  -0.7293006  -0.6862090
TCGA-CM-5868 -0.6452313 -0.7665153  -0.7517783  -0.7225120  -0.7403492
TCGA-AA-3973 -0.5458429 -0.5851976  -0.7099543  -0.6548409  -0.6271639
             T_cells_gamma_delta T_regulatory_cells Macrophages_M1
TCGA-AA-3854          -0.5718952         -0.5460151     -0.4425823
TCGA-QG-A5YW          -0.1880826         -0.2298948     -0.3375183
TCGA-AA-3966           0.4239998          0.4604758      0.5720187
TCGA-G4-6626          -0.7090368         -0.7413858     -0.7003253
TCGA-CM-5868          -0.7823149         -0.8340382     -0.6285408
TCGA-AA-3973          -0.6164323         -0.7002066     -0.6732833
             Macrophages_M2 Endothelial Fibroblasts  Monocytes Plasma_cells
TCGA-AA-3854     -0.4736311 -0.71668517 -0.71237113 -0.4892901 -0.580307575
TCGA-QG-A5YW     -0.4408496 -0.14728208 -0.51466407 -0.5180534 -0.330513404
TCGA-AA-3966      0.5833930  0.01898697 -0.02941950  0.5634607 -0.009847754
TCGA-G4-6626     -0.7395050 -0.67590699 -0.85830728 -0.7476222 -0.678981009
TCGA-CM-5868     -0.6108191 -0.70355192 -0.07720652 -0.7016416 -0.650268303
TCGA-AA-3973     -0.5669731 -0.57851843 -0.46387035 -0.6760106 -0.564966974
             Immune_Score
TCGA-AA-3854   -0.5210926
TCGA-QG-A5YW   -0.4000719
TCGA-AA-3966    0.3972211
TCGA-G4-6626   -0.6967276
TCGA-CM-5868   -0.6749153
TCGA-AA-3973   -0.6314876

# Nuevo conjunto de datos uniendo esta información al anterior
consensus <- consensus %>%
  tibble::rownames_to_column(var = "ID")

merged_data <- aux %>%
  inner_join(consensus, by = "ID")

write.xlsx(merged_data, file = "merged_data.xlsx")

# Figura 11
# Boxplots para infiltración
grupos_ag <- c("MSI_MSS", "CMS", "Estadío", "Grupo")

variables <- c("B_cells", "Cytotoxic_cells", "Dendritic_cells", "Eosinophils",
               "Macrophages", "Mast_cells", "NK_cells", "Neutrophils",
               "T_cells_CD4", "T_cells_CD8", "T_cells_gamma_delta",
               "T_regulatory_cells", "Macrophages_M1", "Macrophages_M2",
               "Endothelial", "Fibroblasts", "Monocytes", "Plasma_cells")

# Bucle para generar los gráficos para cada grupo
for (g in grupos_ag) {
  data <- merged_data
  # Gestionar los valores faltantes de CMS
  if (g %in% c("CMS", "Estadío", "MSI_MSS", "Grupo")) {
    data <- data %>% filter(.data[[g]] != "") %>% drop_na(.data[[g]])  
  }
  # Función del gráfico
  plots <- lapply(variables, function(var) {
  ggplot(data, aes(x = .data[[g]], y = .data[[var]]))  + 
      # Tipo de gráfico y diseño
    geom_boxplot(color = "black", outlier.shape = NA) + 
    labs(y = paste(var), x = "") +
    theme_classic() +
    theme(plot.title = element_text(hjust = 0.5, size = 6, face = "bold"),
          axis.text.x = element_text(size = 5),
          axis.text.y = element_text(size = 6),
          axis.title.y = element_text(size = 8),
          panel.spacing = unit(1, "lines"),
          panel.grid.major = element_blank(),
          panel.grid.minor = element_blank(),
          panel.background = element_rect(colour = "white",
                                          fill = NA, size = 0.3),
          strip.background = element_rect(colour = "white", fill = "white")) +
      # Añadir comparaciones de medias
    stat_compare_means(method = "kruskal.test", label = "p.format", size = 3,
                       label.x = Inf, label.y = Inf, hjust = 1, vjust = 1) 
})

grid.arrange(grobs = plots[1:9], ncol = 3, heights = rep(3, 3)) 
grid.arrange(grobs = plots[10:18], ncol = 3, heights = rep(3, 3)) 

}

# Figura 13
# Heatmap

# Genes y células de interés
genes <- c("HLA_A", "HLA_B", "HLA_C", "HLA_E", "HLA_F", "HLA_G", "HLA_DPA1", 
           "HLA_DPB1", "HLA_DPB2", "HLA_DQA1", "HLA_DQB1", "HLA_DRA", 
           "HLA_DRB1", "HLA_DRB5", "HLA_DOA", "HLA_DOB", "HLA_DMA", "HLA_DMB")
cell_types <- c("B_cells", "Cytotoxic_cells", "Dendritic_cells", "Eosinophils",
                "Macrophages", "Mast_cells", "NK_cells", "Neutrophils",
                "T_cells_CD4", "T_cells_CD8", "T_cells_gamma_delta",
                "T_regulatory_cells", "Macrophages_M1","Macrophages_M2", 
                "Endothelial", "Fibroblasts", "Monocytes", "Plasma_cells")

# Crear las anotaciones
merged_data_filt <-
  merged_data[!is.na(merged_data$Grupo) & merged_data$Grupo != "IV MSI",]
genes_data <- merged_data_filt[, genes] 
cells_data <- merged_data_filt[, cell_types]
group_annotation <- 
  data.frame(Grupo = rep(c("I MSI", "I MSS", "II MSI", "II MSS", "III MSI", "III MSS", "IV MSS"),
                         each = length(genes)))
group_annotation$Gen <- rep(genes, times=7)
rownames(group_annotation) <- paste(group_annotation$Grupo,
                                    group_annotation$Gen, sep = "_")
labels <- group_annotation$Gen
group_annotation <- subset(group_annotation, select = -Gen)

# Matriz de correlación por grupos
cor_list <- list()
for (group in unique(merged_data_filt$Grupo)){
  group_data <- merged_data_filt[merged_data_filt$Grupo == group,]
  genes_data_group <- group_data[, genes]
  cells_data_group <- group_data[, cell_types]
  cor_matrix_group <- cor(cells_data_group, genes_data_group, method="spearman")
  cor_list[[group]] <- cor_matrix_group
  
}

cor_matrix_all <- do.call(cbind, cor_list)
cor_matrix_all[is.na(cor_matrix_all)] <- 0
colnames(cor_matrix_all) <- rownames(group_annotation)

# Definir colores para los grupos
group_colors <- list(Grupo = c("II MSS" = "#9E0142", 
                               "II MSI" = "#5E4FA2", 
                               "III MSS"= "#3288BD", 
                               "III MSI"= "#CEB944",
                               "I MSS" = "#F46D43",
                               "I MSI" = "#67C2A4",
                               "IV MSS"= "#E6F598"))
# Crear el heatmap
pheatmap(cor_matrix_all, 
         annotation_col = group_annotation, 
         annotation_colors = group_colors,
         labels_col = labels,
         main = "Heatmap HLA e infiltracion",
         color = colorRampPalette(c("blue", "white", "red"))(100),
         show_colnames = T, show_rownames = T, fontsize = 8,
         fontsize_legend = 4, border_color = NA)

# Nuevo conjunto de datos incluyendo la expresión de NCR1
# Divido la expresión en "Baja" y "Alta" dependiendo si es mayor o menor que la media

merged_dataNK <- merged_data %>%
  select(ID, NK_cells, MSI_MSS, CMS, Estadío, Grupo, HLA_A, HLA_B, HLA_C, HLA_E,
         HLA_F, HLA_G, HLA_DPA1, HLA_DPB1, HLA_DPB2, HLA_DQA1, HLA_DQB1, HLA_DRA,
         HLA_DRB1, HLA_DRB5, HLA_DOA, HLA_DOB, HLA_DMA, HLA_DMB)

merged_dataNK <- merged_dataNK %>%
    filter(!is.na(Grupo))
  
merged_dataNK <- merged_dataNK %>%
  mutate(
    HLA_Ae = ifelse(HLA_A > mean(HLA_A, na.rm = TRUE), "Alta", "Baja"),
    HLA_Be = ifelse(HLA_B > mean(HLA_B, na.rm = TRUE), "Alta", "Baja"),
    HLA_Ce = ifelse(HLA_C > mean(HLA_C, na.rm = TRUE), "Alta", "Baja"),
    HLA_Ee = ifelse(HLA_E > mean(HLA_E, na.rm = TRUE), "Alta", "Baja"),
    HLA_Fe = ifelse(HLA_F > mean(HLA_F, na.rm = TRUE), "Alta", "Baja"),
    HLA_Ge = ifelse(HLA_G > mean(HLA_G, na.rm = TRUE), "Alta", "Baja"),
    HLA_DPA1e = ifelse(HLA_DPA1 > mean(HLA_DPA1, na.rm = TRUE), "Alta", "Baja"),
    HLA_DPB1e = ifelse(HLA_DPB1 > mean(HLA_DPB1, na.rm = TRUE), "Alta", "Baja"),
    HLA_DPB2e = ifelse(HLA_DPB2 > mean(HLA_DPB2, na.rm = TRUE), "Alta", "Baja"),
    HLA_DQA1e = ifelse(HLA_DQA1 > mean(HLA_DQA1, na.rm = TRUE), "Alta", "Baja"),
    HLA_DQB1e = ifelse(HLA_DQB1 > mean(HLA_DQB1, na.rm = TRUE), "Alta", "Baja"),
    HLA_DRAe  = ifelse(HLA_DRA  > mean(HLA_DRA,  na.rm = TRUE), "Alta", "Baja"),
    HLA_DRB1e = ifelse(HLA_DRB1 > mean(HLA_DRB1, na.rm = TRUE), "Alta", "Baja"),
    HLA_DRB5e = ifelse(HLA_DRB5 > mean(HLA_DRB5, na.rm = TRUE), "Alta", "Baja"),
    HLA_DOAe  = ifelse(HLA_DOA  > mean(HLA_DOA,  na.rm = TRUE), "Alta", "Baja"),
    HLA_DOBe  = ifelse(HLA_DOB  > mean(HLA_DOB,  na.rm = TRUE), "Alta", "Baja"),
    HLA_DMAe  = ifelse(HLA_DMA  > mean(HLA_DMA,  na.rm = TRUE), "Alta", "Baja"),
    HLA_DMBe  = ifelse(HLA_DMB  > mean(HLA_DMB,  na.rm = TRUE), "Alta", "Baja")
  )

NK <- as.data.frame(t(ex.tcga.coad["NCR1", , drop = FALSE]))
NK$ID <- rownames(NK)
merged_dataNK <- merge(merged_dataNK, NK, by = "ID")
head(merged_dataNK)

            ID   NK_cells MSI_MSS  CMS Estadío   Grupo     HLA_A     HLA_B
1 TCGA-3L-AA1B -0.6359357     MSS CMS2       I   I MSS  9.422095  9.423582
2 TCGA-4N-A93T -0.6773970     MSS <NA>     III III MSS 10.087300 10.467855
3 TCGA-4T-AA8H -0.6143092     MSS CMS3      II  II MSS  9.515499  9.616729
4 TCGA-5M-AAT4 -0.7421081     MSS <NA>      IV  IV MSS  9.738799  9.741454
5 TCGA-5M-AAT6  0.3561739     MSS CMS4      IV  IV MSS 11.092478 11.579548
6 TCGA-5M-AATE -0.6622639     MSS CMS2      II  II MSS  9.898062  9.582778
      HLA_C    HLA_E    HLA_F    HLA_G HLA_DPA1 HLA_DPB1  HLA_DPB2 HLA_DQA1
1  9.276550 8.952226 4.825786 2.128095 5.720875 5.421493 0.3545650 4.680516
2 10.042796 9.472137 6.607422 2.737795 3.511506 3.611692 0.2461040 2.505180
3  8.800078 8.621431 4.894027 2.380009 2.540424 3.134747 0.4168397 3.026942
4  9.339448 8.983008 4.889235 2.767273 3.658440 3.322952 0.1180939 2.941989
5 10.896130 9.903491 7.151282 4.115008 7.628497 7.211774 2.3664201 5.949196
6  9.335188 8.616671 5.209192 3.176099 4.565591 3.974722 0.2412300 2.868667
  HLA_DQB1   HLA_DRA  HLA_DRB1 HLA_DRB5  HLA_DOA   HLA_DOB  HLA_DMA  HLA_DMB
1 5.665441  9.022051  8.437177 5.927275 3.316638 1.7563402 4.740857 3.289510
2 4.438073  8.056338  8.498138 6.859332 1.465295 1.2936059 5.042167 2.043240
3 2.896872  7.404167  6.608737 4.248558 1.256467 0.6273267 4.425493 1.646209
4 2.304365  6.963469  6.344530 3.971838 1.658737 0.7304008 5.028591 2.781360
5 7.789211 11.311419 10.803370 7.820975 4.998687 2.7762725 7.712894 5.156441
6 3.955024  7.663813  6.797785 4.956647 1.971442 0.1991226 4.486515 2.160920
  HLA_Ae HLA_Be HLA_Ce HLA_Ee HLA_Fe HLA_Ge HLA_DPA1e HLA_DPB1e HLA_DPB2e
1   Baja   Baja   Baja   Baja   Baja   Baja      Baja      Baja      Baja
2   Baja   Baja   Baja   Baja   Alta   Baja      Baja      Baja      Baja
3   Baja   Baja   Baja   Baja   Baja   Baja      Baja      Baja      Baja
4   Baja   Baja   Baja   Baja   Baja   Baja      Baja      Baja      Baja
5   Alta   Alta   Alta   Alta   Alta   Alta      Alta      Alta      Alta
6   Baja   Baja   Baja   Baja   Baja   Baja      Baja      Baja      Baja
  HLA_DQA1e HLA_DQB1e HLA_DRAe HLA_DRB1e HLA_DRB5e HLA_DOAe HLA_DOBe HLA_DMAe
1      Alta      Alta     Baja      Baja      Baja     Alta     Alta     Baja
2      Baja      Baja     Baja      Baja      Baja     Baja     Baja     Baja
3      Baja      Baja     Baja      Baja      Baja     Baja     Baja     Baja
4      Baja      Baja     Baja      Baja      Baja     Baja     Baja     Baja
5      Alta      Alta     Alta      Alta      Alta     Alta     Alta     Alta
6      Baja      Baja     Baja      Baja      Baja     Baja     Baja     Baja
  HLA_DMBe      NCR1
1     Baja 0.1604039
2     Baja 0.0000000
3     Baja 0.1908045
4     Baja 0.1003049
5     Alta 0.6564506
6     Baja 0.0000000

# Figura 15A, C y E
# Función para comparar la expresión de NCR1 para un gen específico en función de las categorías "Baja" y "Alta" mediante boxplot para cada Grupo 

NK_HLA <- function(dataset, gen){
  ggplot(dataset, aes(x = .data[[gen]], y = NCR1)) + 
  geom_boxplot(outlier.shape = NA, alpha = 0.6) + 
  geom_jitter(width = 0.2, height = 0, alpha = 0.5) +
  facet_wrap(~ Grupo) +
  stat_compare_means(method = "wilcox.test", label = "p.format", size = 3,
                     label.x = Inf, label.y = Inf, hjust = 1, vjust = 1) +
  theme_minimal() + 
  labs(
    title = paste("Comparación de la expresión de NCR1 entre las categorías de",
                  substr(gen, 1, nchar(gen) - 1)),
    x = NULL,
    y = "log2(NCR1)"
  ) +
  theme(legend.position = "none")
}

exp_HLA <- c("HLA_Ae", "HLA_Be", "HLA_Ce", "HLA_Ee", "HLA_Fe", "HLA_Ge",
             "HLA_DPA1e", "HLA_DPB1e", "HLA_DPB2e", "HLA_DQA1e", 
             "HLA_DQB1e", "HLA_DRAe", "HLA_DRB1e", "HLA_DRB5e", "HLA_DOAe", 
             "HLA_DOBe", "HLA_DMAe", "HLA_DMBe")

for (gen in exp_HLA){
  print(NK_HLA(merged_dataNK, gen))
}

# Figura 15B, D y F
# Función para comparar la infiltración de NK para un gen específico en función de las categorías "Baja" y "Alta" mediante boxplot para cada Grupo 

NK_HLA <- function(dataset, gen){
  ggplot(dataset, aes(x = .data[[gen]], y = NK_cells)) + 
  geom_boxplot(outlier.shape = NA, alpha = 0.6) + 
  geom_jitter(width = 0.2, height = 0, alpha = 0.5) +
  facet_wrap(~ Grupo) +
  stat_compare_means(method = "wilcox.test", label = "p.format", size = 3,
                     label.x = Inf, label.y = Inf, hjust = 1, vjust = 1) +
  theme_minimal() + 
  labs(
    title = paste("Comparación de la expresión de NK_cells
                  entre las categorías de",
                  substr(gen, 1, nchar(gen) - 1)),
    x = NULL,
    y = "NK_cells"
  ) +
  theme(legend.position = "none")
}

for (gen in exp_HLA){
  print(NK_HLA(merged_dataNK, gen))
}

Análisis de supervivencia

merged_data <- read_excel("merged_data.xlsx")

# Variables edad, evento, tiempo
clin <-  merged_data %>%
  mutate(
    # Edad
    edad = abs(days_to_birth) / 365.25,
    # Evento: 1 si hay una fecha de muerte, 0 si no
    evento = ifelse(!is.na(days_to_death), 1, 0),
    
    # Tiempo: usar días hasta la muerte si hay, si no usar seguimiento
    tiempo = ifelse(!is.na(days_to_death), days_to_death,
                  ifelse(!is.na(days_to_last_followup), days_to_last_followup,
                         days_to_last_known_alive))
  )

clin <- clin %>% drop_na(tiempo)

# Modelos de Cox simples

genes <- c("HLA_A", "HLA_B", "HLA_C", "HLA_E", "HLA_F", "HLA_G",
             "HLA_DPA1", "HLA_DPB1", "HLA_DPB2", "HLA_DQA1",
             "HLA_DQB1", "HLA_DRA", "HLA_DRB1", "HLA_DRB5", "HLA_DOA",
             "HLA_DOB", "HLA_DMA", "HLA_DMB")

for (gene in genes) {
  # Modelo de Cox para cada gen como variable continua
  formula <- as.formula(paste("Surv(tiempo, evento) ~", gene))
  cox_model <- coxph(formula, data = clin)
  
  # Mostrar resumen del modelo
  cat("Modelo de Cox para", gene, "\n")
  print(summary(cox_model))
  cat("\n--------------------------------\n")
}

Modelo de Cox para HLA_A 
Call:
coxph(formula = formula, data = clin)

  n= 391, number of events= 55 

         coef exp(coef) se(coef)      z Pr(>|z|)
HLA_A -0.1453    0.8648   0.1533 -0.948    0.343

      exp(coef) exp(-coef) lower .95 upper .95
HLA_A    0.8648      1.156    0.6404     1.168

Concordance= 0.536  (se = 0.038 )
Likelihood ratio test= 0.88  on 1 df,   p=0.3
Wald test            = 0.9  on 1 df,   p=0.3
Score (logrank) test = 0.9  on 1 df,   p=0.3


--------------------------------
Modelo de Cox para HLA_B 
Call:
coxph(formula = formula, data = clin)

  n= 391, number of events= 55 

         coef exp(coef) se(coef)      z Pr(>|z|)
HLA_B -0.1734    0.8408   0.1299 -1.335    0.182

      exp(coef) exp(-coef) lower .95 upper .95
HLA_B    0.8408      1.189    0.6518     1.085

Concordance= 0.566  (se = 0.041 )
Likelihood ratio test= 1.69  on 1 df,   p=0.2
Wald test            = 1.78  on 1 df,   p=0.2
Score (logrank) test = 1.78  on 1 df,   p=0.2


--------------------------------
Modelo de Cox para HLA_C 
Call:
coxph(formula = formula, data = clin)

  n= 391, number of events= 55 

         coef exp(coef) se(coef)      z Pr(>|z|)  
HLA_C -0.3310    0.7182   0.1531 -2.162   0.0306 *
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

      exp(coef) exp(-coef) lower .95 upper .95
HLA_C    0.7182      1.392     0.532    0.9695

Concordance= 0.566  (se = 0.04 )
Likelihood ratio test= 4.61  on 1 df,   p=0.03
Wald test            = 4.67  on 1 df,   p=0.03
Score (logrank) test = 4.68  on 1 df,   p=0.03


--------------------------------
Modelo de Cox para HLA_E 
Call:
coxph(formula = formula, data = clin)

  n= 391, number of events= 55 

         coef exp(coef) se(coef)      z Pr(>|z|)
HLA_E -0.2731    0.7610   0.2025 -1.349    0.177

      exp(coef) exp(-coef) lower .95 upper .95
HLA_E     0.761      1.314    0.5118     1.132

Concordance= 0.565  (se = 0.04 )
Likelihood ratio test= 1.8  on 1 df,   p=0.2
Wald test            = 1.82  on 1 df,   p=0.2
Score (logrank) test = 1.82  on 1 df,   p=0.2


--------------------------------
Modelo de Cox para HLA_F 
Call:
coxph(formula = formula, data = clin)

  n= 391, number of events= 55 

         coef exp(coef) se(coef)      z Pr(>|z|)
HLA_F -0.1124    0.8937   0.1114 -1.009    0.313

      exp(coef) exp(-coef) lower .95 upper .95
HLA_F    0.8937      1.119    0.7185     1.112

Concordance= 0.542  (se = 0.042 )
Likelihood ratio test= 0.99  on 1 df,   p=0.3
Wald test            = 1.02  on 1 df,   p=0.3
Score (logrank) test = 1.02  on 1 df,   p=0.3


--------------------------------
Modelo de Cox para HLA_G 
Call:
coxph(formula = formula, data = clin)

  n= 391, number of events= 55 

         coef exp(coef) se(coef)      z Pr(>|z|)
HLA_G -0.0530    0.9484   0.1181 -0.449    0.654

      exp(coef) exp(-coef) lower .95 upper .95
HLA_G    0.9484      1.054    0.7524     1.195

Concordance= 0.49  (se = 0.039 )
Likelihood ratio test= 0.2  on 1 df,   p=0.7
Wald test            = 0.2  on 1 df,   p=0.7
Score (logrank) test = 0.2  on 1 df,   p=0.7


--------------------------------
Modelo de Cox para HLA_DPA1 
Call:
coxph(formula = formula, data = clin)

  n= 391, number of events= 55 

             coef exp(coef) se(coef)      z Pr(>|z|)  
HLA_DPA1 -0.14713   0.86318  0.08806 -1.671   0.0948 .
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

         exp(coef) exp(-coef) lower .95 upper .95
HLA_DPA1    0.8632      1.159    0.7263     1.026

Concordance= 0.586  (se = 0.044 )
Likelihood ratio test= 2.78  on 1 df,   p=0.1
Wald test            = 2.79  on 1 df,   p=0.09
Score (logrank) test = 2.8  on 1 df,   p=0.09


--------------------------------
Modelo de Cox para HLA_DPB1 
Call:
coxph(formula = formula, data = clin)

  n= 391, number of events= 55 

            coef exp(coef) se(coef)      z Pr(>|z|)  
HLA_DPB1 -0.2182    0.8040   0.1007 -2.167   0.0302 *
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

         exp(coef) exp(-coef) lower .95 upper .95
HLA_DPB1     0.804      1.244      0.66    0.9793

Concordance= 0.604  (se = 0.042 )
Likelihood ratio test= 4.71  on 1 df,   p=0.03
Wald test            = 4.7  on 1 df,   p=0.03
Score (logrank) test = 4.71  on 1 df,   p=0.03


--------------------------------
Modelo de Cox para HLA_DPB2 
Call:
coxph(formula = formula, data = clin)

  n= 391, number of events= 55 

            coef exp(coef) se(coef)      z Pr(>|z|)  
HLA_DPB2 -0.4841    0.6162   0.2206 -2.195   0.0282 *
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

         exp(coef) exp(-coef) lower .95 upper .95
HLA_DPB2    0.6162      1.623    0.3999    0.9495

Concordance= 0.628  (se = 0.041 )
Likelihood ratio test= 5.92  on 1 df,   p=0.01
Wald test            = 4.82  on 1 df,   p=0.03
Score (logrank) test = 4.94  on 1 df,   p=0.03


--------------------------------
Modelo de Cox para HLA_DQA1 
Call:
coxph(formula = formula, data = clin)

  n= 391, number of events= 55 

             coef exp(coef) se(coef)      z Pr(>|z|)
HLA_DQA1 -0.13665   0.87227  0.09303 -1.469    0.142

         exp(coef) exp(-coef) lower .95 upper .95
HLA_DQA1    0.8723      1.146    0.7269     1.047

Concordance= 0.584  (se = 0.046 )
Likelihood ratio test= 2.18  on 1 df,   p=0.1
Wald test            = 2.16  on 1 df,   p=0.1
Score (logrank) test = 2.16  on 1 df,   p=0.1


--------------------------------
Modelo de Cox para HLA_DQB1 
Call:
coxph(formula = formula, data = clin)

  n= 391, number of events= 55 

             coef exp(coef) se(coef)      z Pr(>|z|)
HLA_DQB1 -0.07489   0.92784  0.08618 -0.869    0.385

         exp(coef) exp(-coef) lower .95 upper .95
HLA_DQB1    0.9278      1.078    0.7836     1.099

Concordance= 0.537  (se = 0.045 )
Likelihood ratio test= 0.76  on 1 df,   p=0.4
Wald test            = 0.76  on 1 df,   p=0.4
Score (logrank) test = 0.76  on 1 df,   p=0.4


--------------------------------
Modelo de Cox para HLA_DRA 
Call:
coxph(formula = formula, data = clin)

  n= 391, number of events= 55 

            coef exp(coef) se(coef)      z Pr(>|z|)  
HLA_DRA -0.19709   0.82112  0.08723 -2.259   0.0239 *
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

        exp(coef) exp(-coef) lower .95 upper .95
HLA_DRA    0.8211      1.218    0.6921    0.9742

Concordance= 0.607  (se = 0.042 )
Likelihood ratio test= 5.07  on 1 df,   p=0.02
Wald test            = 5.11  on 1 df,   p=0.02
Score (logrank) test = 5.13  on 1 df,   p=0.02


--------------------------------
Modelo de Cox para HLA_DRB1 
Call:
coxph(formula = formula, data = clin)

  n= 391, number of events= 55 

             coef exp(coef) se(coef)      z Pr(>|z|)  
HLA_DRB1 -0.17850   0.83653  0.09225 -1.935    0.053 .
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

         exp(coef) exp(-coef) lower .95 upper .95
HLA_DRB1    0.8365      1.195    0.6982     1.002

Concordance= 0.591  (se = 0.043 )
Likelihood ratio test= 3.7  on 1 df,   p=0.05
Wald test            = 3.74  on 1 df,   p=0.05
Score (logrank) test = 3.74  on 1 df,   p=0.05


--------------------------------
Modelo de Cox para HLA_DRB5 
Call:
coxph(formula = formula, data = clin)

  n= 391, number of events= 55 

             coef exp(coef) se(coef)      z Pr(>|z|)  
HLA_DRB5 -0.20288   0.81637  0.07945 -2.553   0.0107 *
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

         exp(coef) exp(-coef) lower .95 upper .95
HLA_DRB5    0.8164      1.225    0.6986    0.9539

Concordance= 0.627  (se = 0.04 )
Likelihood ratio test= 6.66  on 1 df,   p=0.01
Wald test            = 6.52  on 1 df,   p=0.01
Score (logrank) test = 6.52  on 1 df,   p=0.01


--------------------------------
Modelo de Cox para HLA_DOA 
Call:
coxph(formula = formula, data = clin)

  n= 391, number of events= 55 

           coef exp(coef) se(coef)      z Pr(>|z|)
HLA_DOA -0.1393    0.8699   0.1052 -1.325    0.185

        exp(coef) exp(-coef) lower .95 upper .95
HLA_DOA    0.8699       1.15    0.7079     1.069

Concordance= 0.58  (se = 0.046 )
Likelihood ratio test= 1.83  on 1 df,   p=0.2
Wald test            = 1.75  on 1 df,   p=0.2
Score (logrank) test = 1.75  on 1 df,   p=0.2


--------------------------------
Modelo de Cox para HLA_DOB 
Call:
coxph(formula = formula, data = clin)

  n= 391, number of events= 55 

          coef exp(coef) se(coef)     z Pr(>|z|)
HLA_DOB 0.1198    1.1273   0.1817 0.659     0.51

        exp(coef) exp(-coef) lower .95 upper .95
HLA_DOB     1.127     0.8871    0.7896     1.609

Concordance= 0.491  (se = 0.045 )
Likelihood ratio test= 0.43  on 1 df,   p=0.5
Wald test            = 0.43  on 1 df,   p=0.5
Score (logrank) test = 0.43  on 1 df,   p=0.5


--------------------------------
Modelo de Cox para HLA_DMA 
Call:
coxph(formula = formula, data = clin)

  n= 391, number of events= 55 

           coef exp(coef) se(coef)      z Pr(>|z|)
HLA_DMA -0.1590    0.8530   0.1222 -1.301    0.193

        exp(coef) exp(-coef) lower .95 upper .95
HLA_DMA     0.853      1.172    0.6714     1.084

Concordance= 0.578  (se = 0.042 )
Likelihood ratio test= 1.74  on 1 df,   p=0.2
Wald test            = 1.69  on 1 df,   p=0.2
Score (logrank) test = 1.69  on 1 df,   p=0.2


--------------------------------
Modelo de Cox para HLA_DMB 
Call:
coxph(formula = formula, data = clin)

  n= 391, number of events= 55 

           coef exp(coef) se(coef)     z Pr(>|z|)  
HLA_DMB -0.1994    0.8192   0.1146 -1.74   0.0818 .
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

        exp(coef) exp(-coef) lower .95 upper .95
HLA_DMB    0.8192      1.221    0.6544     1.025

Concordance= 0.581  (se = 0.042 )
Likelihood ratio test= 3.09  on 1 df,   p=0.08
Wald test            = 3.03  on 1 df,   p=0.08
Score (logrank) test = 3.04  on 1 df,   p=0.08


--------------------------------

# Modelos de Cox con interacción

for (gene in genes) {
  # Modelo de Cox para cada gen como variable continua
  formula <- as.formula(paste("Surv(tiempo, evento) ~", gene,"*NK_cells"))
  cox_model <- coxph(formula, data = clin)
  
  # Mostrar resumen del modelo
  cat("Modelo de Cox para", gene, "\n")
  print(summary(cox_model))
  cat("\n--------------------------------\n")
}

Modelo de Cox para HLA_A 
Call:
coxph(formula = formula, data = clin)

  n= 391, number of events= 55 

                  coef exp(coef) se(coef)      z Pr(>|z|)
HLA_A          -0.2869    0.7506   0.2543 -1.128    0.259
NK_cells        4.1974   66.5158   5.2173  0.805    0.421
HLA_A:NK_cells -0.4364    0.6464   0.5045 -0.865    0.387

               exp(coef) exp(-coef) lower .95 upper .95
HLA_A             0.7506    1.33223   0.45604 1.235e+00
NK_cells         66.5158    0.01503   0.00241 1.836e+06
HLA_A:NK_cells    0.6464    1.54712   0.24046 1.737e+00

Concordance= 0.537  (se = 0.041 )
Likelihood ratio test= 2.2  on 3 df,   p=0.5
Wald test            = 1.89  on 3 df,   p=0.6
Score (logrank) test = 1.9  on 3 df,   p=0.6


--------------------------------
Modelo de Cox para HLA_B 
Call:
coxph(formula = formula, data = clin)

  n= 391, number of events= 55 

                   coef exp(coef) se(coef)      z Pr(>|z|)
HLA_B          -0.18770   0.82886  0.24273 -0.773    0.439
NK_cells        0.67969   1.97326  4.55260  0.149    0.881
HLA_B:NK_cells -0.08643   0.91720  0.42107 -0.205    0.837

               exp(coef) exp(-coef) lower .95 upper .95
HLA_B             0.8289     1.2065  0.515079     1.334
NK_cells          1.9733     0.5068  0.000263 14803.563
HLA_B:NK_cells    0.9172     1.0903  0.401837     2.094

Concordance= 0.562  (se = 0.044 )
Likelihood ratio test= 2.04  on 3 df,   p=0.6
Wald test            = 2.01  on 3 df,   p=0.6
Score (logrank) test = 2.04  on 3 df,   p=0.6


--------------------------------
Modelo de Cox para HLA_C 
Call:
coxph(formula = formula, data = clin)

  n= 391, number of events= 55 

                  coef exp(coef) se(coef)      z Pr(>|z|)
HLA_C          -0.3357    0.7148   0.2710 -1.239    0.215
NK_cells        0.1653    1.1798   4.9792  0.033    0.974
HLA_C:NK_cells -0.0175    0.9827   0.4653 -0.038    0.970

               exp(coef) exp(-coef) lower .95 upper .95
HLA_C             0.7148     1.3989 4.202e-01     1.216
NK_cells          1.1798     0.8476 6.816e-05 20421.270
HLA_C:NK_cells    0.9827     1.0177 3.948e-01     2.446

Concordance= 0.566  (se = 0.04 )
Likelihood ratio test= 4.61  on 3 df,   p=0.2
Wald test            = 4.65  on 3 df,   p=0.2
Score (logrank) test = 4.72  on 3 df,   p=0.2


--------------------------------
Modelo de Cox para HLA_E 
Call:
coxph(formula = formula, data = clin)

  n= 391, number of events= 55 

                   coef exp(coef) se(coef)      z Pr(>|z|)
HLA_E          -0.12700   0.88073  0.38501 -0.330    0.741
NK_cells       -2.33390   0.09692  6.07498 -0.384    0.701
HLA_E:NK_cells  0.22079   1.24706  0.61137  0.361    0.718

               exp(coef) exp(-coef) lower .95 upper .95
HLA_E            0.88073     1.1354 4.141e-01     1.873
NK_cells         0.09692    10.3181 6.537e-07 14369.173
HLA_E:NK_cells   1.24706     0.8019 3.763e-01     4.133

Concordance= 0.575  (se = 0.043 )
Likelihood ratio test= 2.01  on 3 df,   p=0.6
Wald test            = 2.1  on 3 df,   p=0.6
Score (logrank) test = 2.11  on 3 df,   p=0.5


--------------------------------
Modelo de Cox para HLA_F 
Call:
coxph(formula = formula, data = clin)

  n= 391, number of events= 55 

                   coef exp(coef) se(coef)      z Pr(>|z|)
HLA_F          -0.12185   0.88528  0.22274 -0.547    0.584
NK_cells        0.22179   1.24831  2.40061  0.092    0.926
HLA_F:NK_cells -0.08198   0.92129  0.37743 -0.217    0.828

               exp(coef) exp(-coef) lower .95 upper .95
HLA_F             0.8853     1.1296    0.5721      1.37
NK_cells          1.2483     0.8011    0.0113    137.95
HLA_F:NK_cells    0.9213     1.0854    0.4397      1.93

Concordance= 0.547  (se = 0.045 )
Likelihood ratio test= 1.42  on 3 df,   p=0.7
Wald test            = 1.35  on 3 df,   p=0.7
Score (logrank) test = 1.37  on 3 df,   p=0.7


--------------------------------
Modelo de Cox para HLA_G 
Call:
coxph(formula = formula, data = clin)

  n= 391, number of events= 55 

                  coef exp(coef) se(coef)      z Pr(>|z|)
HLA_G          -0.1246    0.8828   0.2002 -0.623    0.534
NK_cells        0.5024    1.6527   1.4401  0.349    0.727
HLA_G:NK_cells -0.2557    0.7744   0.3958 -0.646    0.518

               exp(coef) exp(-coef) lower .95 upper .95
HLA_G             0.8828     1.1327   0.59631     1.307
NK_cells          1.6527     0.6051   0.09827    27.795
HLA_G:NK_cells    0.7744     1.2914   0.35651     1.682

Concordance= 0.587  (se = 0.046 )
Likelihood ratio test= 1.42  on 3 df,   p=0.7
Wald test            = 1.21  on 3 df,   p=0.8
Score (logrank) test = 1.23  on 3 df,   p=0.7


--------------------------------
Modelo de Cox para HLA_DPA1 
Call:
coxph(formula = formula, data = clin)

  n= 391, number of events= 55 

                      coef exp(coef) se(coef)      z Pr(>|z|)  
HLA_DPA1           0.08895   1.09303  0.19099  0.466   0.6414  
NK_cells          -3.12032   0.04414  1.87412 -1.665   0.0959 .
HLA_DPA1:NK_cells  0.46347   1.58959  0.23154  2.002   0.0453 *
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

                  exp(coef) exp(-coef) lower .95 upper .95
HLA_DPA1            1.09303     0.9149  0.751726     1.589
NK_cells            0.04414    22.6537  0.001121     1.738
HLA_DPA1:NK_cells   1.58959     0.6291  1.009713     2.502

Concordance= 0.617  (se = 0.043 )
Likelihood ratio test= 6.54  on 3 df,   p=0.09
Wald test            = 7.11  on 3 df,   p=0.07
Score (logrank) test = 7.29  on 3 df,   p=0.06


--------------------------------
Modelo de Cox para HLA_DPB1 
Call:
coxph(formula = formula, data = clin)

  n= 391, number of events= 55 

                     coef exp(coef) se(coef)      z Pr(>|z|)
HLA_DPB1          -0.1126    0.8935   0.2179 -0.517    0.605
NK_cells          -1.8247    0.1613   1.9328 -0.944    0.345
HLA_DPB1:NK_cells  0.3724    1.4512   0.2647  1.407    0.160

                  exp(coef) exp(-coef) lower .95 upper .95
HLA_DPB1             0.8935     1.1192   0.58294     1.370
NK_cells             0.1613     6.2010   0.00365     7.124
HLA_DPB1:NK_cells    1.4512     0.6891   0.86376     2.438

Concordance= 0.629  (se = 0.04 )
Likelihood ratio test= 7.49  on 3 df,   p=0.06
Wald test            = 8.08  on 3 df,   p=0.04
Score (logrank) test = 8.24  on 3 df,   p=0.04


--------------------------------
Modelo de Cox para HLA_DPB2 
Call:
coxph(formula = formula, data = clin)

  n= 391, number of events= 55 

                     coef exp(coef) se(coef)      z Pr(>|z|)   
HLA_DPB2          -0.4428    0.6422   0.2558 -1.731  0.08349 . 
NK_cells          -0.7127    0.4903   0.6418 -1.110  0.26681   
HLA_DPB2:NK_cells  1.1805    3.2561   0.4570  2.583  0.00979 **
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

                  exp(coef) exp(-coef) lower .95 upper .95
HLA_DPB2             0.6422     1.5571    0.3890     1.060
NK_cells             0.4903     2.0395    0.1394     1.725
HLA_DPB2:NK_cells    3.2561     0.3071    1.3295     7.975

Concordance= 0.65  (se = 0.038 )
Likelihood ratio test= 12.13  on 3 df,   p=0.007
Wald test            = 9.84  on 3 df,   p=0.02
Score (logrank) test = 10.05  on 3 df,   p=0.02


--------------------------------
Modelo de Cox para HLA_DQA1 
Call:
coxph(formula = formula, data = clin)

  n= 391, number of events= 55 

                      coef exp(coef) se(coef)      z Pr(>|z|)  
HLA_DQA1           0.08199   1.08544  0.17615  0.465   0.6416  
NK_cells          -2.46253   0.08522  1.41026 -1.746   0.0808 .
HLA_DQA1:NK_cells  0.45773   1.58048  0.20637  2.218   0.0266 *
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

                  exp(coef) exp(-coef) lower .95 upper .95
HLA_DQA1            1.08544     0.9213  0.768542     1.533
NK_cells            0.08522    11.7345  0.005372     1.352
HLA_DQA1:NK_cells   1.58048     0.6327  1.054702     2.368

Concordance= 0.633  (se = 0.042 )
Likelihood ratio test= 6.36  on 3 df,   p=0.1
Wald test            = 6.9  on 3 df,   p=0.08
Score (logrank) test = 7.07  on 3 df,   p=0.07


--------------------------------
Modelo de Cox para HLA_DQB1 
Call:
coxph(formula = formula, data = clin)

  n= 391, number of events= 55 

                      coef exp(coef) se(coef)      z Pr(>|z|)  
HLA_DQB1           0.21953   1.24549  0.16832  1.304   0.1922  
NK_cells          -3.33412   0.03565  1.59110 -2.095   0.0361 *
HLA_DQB1:NK_cells  0.48711   1.62761  0.22258  2.188   0.0286 *
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

                  exp(coef) exp(-coef) lower .95 upper .95
HLA_DQB1            1.24549     0.8029  0.895494     1.732
NK_cells            0.03565    28.0537  0.001576     0.806
HLA_DQB1:NK_cells   1.62761     0.6144  1.052175     2.518

Concordance= 0.598  (se = 0.044 )
Likelihood ratio test= 5.46  on 3 df,   p=0.1
Wald test            = 5.58  on 3 df,   p=0.1
Score (logrank) test = 5.66  on 3 df,   p=0.1


--------------------------------
Modelo de Cox para HLA_DRA 
Call:
coxph(formula = formula, data = clin)

  n= 391, number of events= 55 

                    coef exp(coef) se(coef)      z Pr(>|z|)
HLA_DRA          -0.1741    0.8402   0.2070 -0.841    0.400
NK_cells         -1.6298    0.1960   2.8682 -0.568    0.570
HLA_DRA:NK_cells  0.2172    1.2426   0.2521  0.861    0.389

                 exp(coef) exp(-coef) lower .95 upper .95
HLA_DRA             0.8402     1.1902 0.5600076     1.261
NK_cells            0.1960     5.1030 0.0007091    54.153
HLA_DRA:NK_cells    1.2426     0.8048 0.7580497     2.037

Concordance= 0.635  (se = 0.04 )
Likelihood ratio test= 7.03  on 3 df,   p=0.07
Wald test            = 7.61  on 3 df,   p=0.05
Score (logrank) test = 7.74  on 3 df,   p=0.05


--------------------------------
Modelo de Cox para HLA_DRB1 
Call:
coxph(formula = formula, data = clin)

  n= 391, number of events= 55 

                      coef exp(coef) se(coef)      z Pr(>|z|)
HLA_DRB1          -0.09347   0.91077  0.20267 -0.461    0.645
NK_cells          -1.98856   0.13689  2.66458 -0.746    0.455
HLA_DRB1:NK_cells  0.23644   1.26673  0.25615  0.923    0.356

                  exp(coef) exp(-coef) lower .95 upper .95
HLA_DRB1             0.9108     1.0980 0.6121977     1.355
NK_cells             0.1369     7.3050 0.0007384    25.379
HLA_DRB1:NK_cells    1.2667     0.7894 0.7667461     2.093

Concordance= 0.609  (se = 0.042 )
Likelihood ratio test= 4.84  on 3 df,   p=0.2
Wald test            = 5.3  on 3 df,   p=0.2
Score (logrank) test = 5.38  on 3 df,   p=0.1


--------------------------------
Modelo de Cox para HLA_DRB5 
Call:
coxph(formula = formula, data = clin)

  n= 391, number of events= 55 

                       coef exp(coef)  se(coef)      z Pr(>|z|)  
HLA_DRB5          -0.267407  0.765362  0.150752 -1.774   0.0761 .
NK_cells           0.540251  1.716437  1.752262  0.308   0.7578  
HLA_DRB5:NK_cells  0.002244  1.002246  0.224416  0.010   0.9920  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

                  exp(coef) exp(-coef) lower .95 upper .95
HLA_DRB5             0.7654     1.3066   0.56957     1.028
NK_cells             1.7164     0.5826   0.05535    53.230
HLA_DRB5:NK_cells    1.0022     0.9978   0.64558     1.556

Concordance= 0.631  (se = 0.039 )
Likelihood ratio test= 7.59  on 3 df,   p=0.06
Wald test            = 7.52  on 3 df,   p=0.06
Score (logrank) test = 7.63  on 3 df,   p=0.05


--------------------------------
Modelo de Cox para HLA_DOA 
Call:
coxph(formula = formula, data = clin)

  n= 391, number of events= 55 

                     coef exp(coef) se(coef)      z Pr(>|z|)  
HLA_DOA          -0.00804   0.99199  0.15570 -0.052   0.9588  
NK_cells         -1.58218   0.20553  0.96877 -1.633   0.1024  
HLA_DOA:NK_cells  0.43095   1.53871  0.18104  2.380   0.0173 *
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

                 exp(coef) exp(-coef) lower .95 upper .95
HLA_DOA             0.9920     1.0081   0.73110     1.346
NK_cells            0.2055     4.8656   0.03078     1.372
HLA_DOA:NK_cells    1.5387     0.6499   1.07909     2.194

Concordance= 0.604  (se = 0.047 )
Likelihood ratio test= 6.21  on 3 df,   p=0.1
Wald test            = 6.7  on 3 df,   p=0.08
Score (logrank) test = 6.88  on 3 df,   p=0.08


--------------------------------
Modelo de Cox para HLA_DOB 
Call:
coxph(formula = formula, data = clin)

  n= 391, number of events= 55 

                     coef exp(coef) se(coef)      z Pr(>|z|)   
HLA_DOB           0.76873   2.15701  0.26284  2.925  0.00345 **
NK_cells         -3.05793   0.04699  0.99866 -3.062  0.00220 **
HLA_DOB:NK_cells  1.17788   3.24750  0.38317  3.074  0.00211 **
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

                 exp(coef) exp(-coef) lower .95 upper .95
HLA_DOB            2.15701     0.4636  1.288627    3.6106
NK_cells           0.04699    21.2834  0.006636    0.3327
HLA_DOB:NK_cells   3.24750     0.3079  1.532478    6.8818

Concordance= 0.659  (se = 0.041 )
Likelihood ratio test= 10.41  on 3 df,   p=0.02
Wald test            = 11.32  on 3 df,   p=0.01
Score (logrank) test = 11.25  on 3 df,   p=0.01


--------------------------------
Modelo de Cox para HLA_DMA 
Call:
coxph(formula = formula, data = clin)

  n= 391, number of events= 55 

                     coef exp(coef) se(coef)      z Pr(>|z|)  
HLA_DMA           0.03141   1.03191  0.20902  0.150   0.8805  
NK_cells         -2.71653   0.06610  1.81435 -1.497   0.1343  
HLA_DMA:NK_cells  0.43871   1.55071  0.26634  1.647   0.0995 .
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

                 exp(coef) exp(-coef) lower .95 upper .95
HLA_DMA             1.0319     0.9691  0.685055     1.554
NK_cells            0.0661    15.1277  0.001887     2.315
HLA_DMA:NK_cells    1.5507     0.6449  0.920079     2.614

Concordance= 0.587  (se = 0.045 )
Likelihood ratio test= 4.11  on 3 df,   p=0.3
Wald test            = 4.32  on 3 df,   p=0.2
Score (logrank) test = 4.38  on 3 df,   p=0.2


--------------------------------
Modelo de Cox para HLA_DMB 
Call:
coxph(formula = formula, data = clin)

  n= 391, number of events= 55 

                     coef exp(coef) se(coef)      z Pr(>|z|)  
HLA_DMB          -0.01482   0.98529  0.24508 -0.060   0.9518  
NK_cells         -1.82214   0.16168  1.57340 -1.158   0.2468  
HLA_DMB:NK_cells  0.47928   1.61490  0.29065  1.649   0.0991 .
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

                 exp(coef) exp(-coef) lower .95 upper .95
HLA_DMB             0.9853     1.0149  0.609466     1.593
NK_cells            0.1617     6.1851  0.007402     3.531
HLA_DMB:NK_cells    1.6149     0.6192  0.913585     2.855

Concordance= 0.612  (se = 0.041 )
Likelihood ratio test= 5.92  on 3 df,   p=0.1
Wald test            = 6.3  on 3 df,   p=0.1
Score (logrank) test = 6.43  on 3 df,   p=0.09


--------------------------------

# Estratificar por mediana

clin <- clin %>%
  mutate(
    HLA_Ae = ifelse(HLA_A > median(HLA_A, na.rm = TRUE), "Alta", "Baja"),
    HLA_Be = ifelse(HLA_B > median(HLA_B, na.rm = TRUE), "Alta", "Baja"),
    HLA_Ce = ifelse(HLA_C > median(HLA_C, na.rm = TRUE), "Alta", "Baja"),
    HLA_Ee = ifelse(HLA_E > median(HLA_E, na.rm = TRUE), "Alta", "Baja"),
    HLA_Fe = ifelse(HLA_F > median(HLA_F, na.rm = TRUE), "Alta", "Baja"),
    HLA_Ge = ifelse(HLA_G > median(HLA_G, na.rm = TRUE), "Alta", "Baja"),
    HLA_DPA1e = ifelse(HLA_DPA1 > median(HLA_DPA1, na.rm = TRUE), "Alta", "Baja"),
    HLA_DPB1e = ifelse(HLA_DPB1 > median(HLA_DPB1, na.rm = TRUE), "Alta", "Baja"),
    HLA_DPB2e = ifelse(HLA_DPB2 > median(HLA_DPB2, na.rm = TRUE), "Alta", "Baja"),
    HLA_DQA1e = ifelse(HLA_DQA1 > median(HLA_DQA1, na.rm = TRUE), "Alta", "Baja"),
    HLA_DQB1e = ifelse(HLA_DQB1 > median(HLA_DQB1, na.rm = TRUE), "Alta", "Baja"),
    HLA_DRAe  = ifelse(HLA_DRA  > median(HLA_DRA,  na.rm = TRUE), "Alta", "Baja"),
    HLA_DRB1e = ifelse(HLA_DRB1 > median(HLA_DRB1, na.rm = TRUE), "Alta", "Baja"),
    HLA_DRB5e = ifelse(HLA_DRB5 > median(HLA_DRB5, na.rm = TRUE), "Alta", "Baja"),
    HLA_DOAe  = ifelse(HLA_DOA  > median(HLA_DOA,  na.rm = TRUE), "Alta", "Baja"),
    HLA_DOBe  = ifelse(HLA_DOB  > median(HLA_DOB,  na.rm = TRUE), "Alta", "Baja"),
    HLA_DMAe  = ifelse(HLA_DMA  > median(HLA_DMA,  na.rm = TRUE), "Alta", "Baja"),
    HLA_DMBe  = ifelse(HLA_DMB  > median(HLA_DMB,  na.rm = TRUE), "Alta", "Baja")
  )

# Curvas Kaplan-Meier

genes <- c("HLA_Ae", "HLA_Be", "HLA_Ce", "HLA_Ee", "HLA_Fe", "HLA_Ge",
             "HLA_DPA1e", "HLA_DPB1e", "HLA_DPB2e", "HLA_DQA1e", 
             "HLA_DQB1e", "HLA_DRAe", "HLA_DRB1e", "HLA_DRB5e", "HLA_DOAe", 
             "HLA_DOBe", "HLA_DMAe", "HLA_DMBe")

for (gene in genes) {

  # Ajuste del modelo de supervivencia
  fit <- survfit(as.formula(paste("Surv(tiempo, evento) ~", gene)), data = clin)
  # Gráfico
  plot <- ggsurvplot(fit,
                     data = clin,
                     pval = TRUE,
                     ggtheme = theme_minimal(),
                     title = "Curvas supervivencia estimador Kaplan-Meier",
                     subtitle = gene,
                     legend.labs = c("Alta", "Baja"),
                     risk.table.y.text.col = TRUE,
                     risk.table.y.text = FALSE)
  print(plot)
}