require(knitr)
require(ggplot2)
require(dplyr)
require(tidyr)
require(rlang)
require(ggpubr)
require(gridExtra)
require(devtools)
require(matrixStats)
require(ConsensusTME)
require(pheatmap)
require(nortest)
require(car)
setwd(params$wd)
load("CRC_filt.RData")
Clasificación y exploración inicial de los datos
# Figura 2A
# Gráfico de barras para la distribución de las muestras en los distintos grupos
# jpeg("Figura2.jpeg")
clin %>%
select(msi_imputed, cms, dataset, stage, group) %>%
# Formato ancho
gather(key = "variable", value = "value") %>%
# Agrupación por variable y valor para contar frecuencias
group_by(variable, value) %>%
# Resumen de los datos / conteo
summarise(count = n(), .groups = "drop") %>%
# Agrupación por variable para calcular porcentajes dentro de cada grupo
group_by(variable) %>%
# Calcular porcentaje y ordenar factores
mutate(percent = count / sum(count) * 100,
variable = factor(variable, levels = c("msi_imputed", "cms", "stage", "group", "dataset")),
value = factor(value, levels = unique(value))) %>%
# Gráfico de barras
ggplot(aes(x = variable, y = percent, fill = value)) +
geom_bar(stat = "identity") +
# Personalizar colores, leyenda, ejes y tÃtulo
scale_fill_manual(values = c("#FFFACD", "#FFD700", "#FFA07A", "#FF6347","#FFB6C1", "#FF6281", "#FF69B4", "#FF1493", "#C71585", "#DDA0DD", "#5F9EA0", "#B0E0E6",
"#B0C4DE", "#ADD8E6", "#006400", "#9ACD32", "#4682B4", "#4169E1"),
guide = guide_legend(keyheight = 0.8, keywidth = 0.8)) +
scale_x_discrete(labels = c("msi_imputed" = "MSI/MSS", "cms" = "CMS", "stage" = "EstadÃo", "group" = "Grupo", "dataset" = "Conjunto de Datos")) +
labs(title = "Distribución porcentual de las muestras",
x = "",
y = "Porcentaje (%)",
fill = "") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))

# dev.off()
# Filtrar la matriz de expresión por los genes que contienen "HLA"
ext <- t(ex)
hla <- as.data.frame(ext[, grep("HLA-", colnames(ext))])
hla <- hla[complete.cases(hla), ]
# Añadir la columna ID al dataframe
hla$ID <- rownames(hla)
# Sustituir - por _ en los genes
colnames(hla) <- gsub("-", "_", colnames(hla))
colnames(hla)
[1] "HLA_DOB" "HLA_E" "HLA_DPB1" "HLA_DQA1" "HLA_DMB" "HLA_DRB1"
[7] "HLA_F" "HLA_DOA" "HLA_B" "HLA_C" "HLA_DRA" "HLA_DQB1"
[13] "HLA_DRB5" "HLA_G" "HLA_DPA1" "HLA_DQA2" "HLA_A" "HLA_DMA"
[19] "HLA_DPB2" "ID"
# Unir dataframes de datos clÃnicos y expresión de HLA por la columna ID
aux <- merge(hla, clin, by = "ID")
# Cambiar nombres de las columnas
colnames(aux)[colnames(aux) == "msi_imputed"] <- "MSI_MSS"
colnames(aux)[colnames(aux) == "cms"] <- "CMS"
colnames(aux)[colnames(aux) == "group"] <- "Grupo"
colnames(aux)[colnames(aux) == "stage"] <- "EstadÃo"
colnames(aux)[colnames(aux) == "dataset"] <- "Dataset"
Análisis descriptivo y comparación de HLA-I y HLA-II
# Medias HLA para cada gen/grupo
# Listas de genes y grupos con los que trabajaré
genes <- c("HLA_A", "HLA_B", "HLA_C", "HLA_E", "HLA_F", "HLA_G", "HLA_DPA1",
"HLA_DPB1", "HLA_DPB2", "HLA_DQA1", "HLA_DQA2", "HLA_DQB1", "HLA_DRA",
"HLA_DRB1", "HLA_DRB5", "HLA_DOA", "HLA_DOB", "HLA_DMA", "HLA_DMB")
grupos <- c("MSI", "MSS", "CMS1", "CMS2", "CMS3", "CMS4", "CMS_NA", "II", "III",
"Stage II, MSI", "Stage II, MSS", "Stage III, MSS", "Stage III, MSI",
"CLX", "GSE13294", "GSE14333", "GSE17536", "GSE39582", "TCGA")
# Crear dataframe vacÃo para almacenar las medias
medias <- data.frame(matrix(ncol = length(genes), nrow = length(grupos)))
colnames(medias) <- genes
rownames(medias) <- grupos
# Función para calcular la media de cada gen en los diferentes grupos
summary_groups <- function(dataset, gen) {
result <- list(
MSI_MSS = dataset %>%
group_by(MSI_MSS) %>%
summarise(mean = mean(!!sym(gen))),
# Gestionar NAs en CMS
CMS = dataset %>%
group_by(CMS) %>%
summarise(mean = mean(!!sym(gen), na.rm = TRUE)),
EstadÃo = dataset %>%
group_by(EstadÃo) %>%
summarise(mean = mean(!!sym(gen))),
Grupo = dataset %>%
group_by(Grupo) %>%
summarise(mean = mean(!!sym(gen))),
Dataset = dataset %>%
group_by(Dataset) %>%
summarise(mean = mean(!!sym(gen)))
)
means_values <- unlist(lapply(result, function(x) x$mean))
}
# Aplicar la función en bucle a cada gen HLA
for (i in 1:length(genes)) {
gen <- genes[i]
means_values <- summary_groups(aux, gen)
medias[, i] <- means_values
}
# write.csv(round(medias,2), "medias.csv", row.names = TRUE)
# Normalidad
sapply(aux[, genes], function(x) tapply(x, aux$MSI_MSS, function(g) lillie.test(g)$p.value))
HLA_A HLA_B HLA_C HLA_E HLA_F HLA_G
MSI 3.066634e-08 1.203296e-09 2.026948e-07 0.2283573 0.0004550275 0.0009773251
MSS 3.612352e-10 2.933637e-07 2.032924e-02 0.1901726 0.0001154771 0.0510795467
HLA_DPA1 HLA_DPB1 HLA_DPB2 HLA_DQA1 HLA_DQA2 HLA_DQB1
MSI 0.0530483 0.007059809 3.603106e-03 6.502251e-04 0.1566153971 1.315557e-02
MSS 0.2888034 0.682543336 7.518458e-05 3.064773e-09 0.0003692047 5.708433e-07
HLA_DRA HLA_DRB1 HLA_DRB5 HLA_DOA HLA_DOB HLA_DMA
MSI 1.443613e-03 0.06232875 7.465578e-03 3.091942e-05 0.067113780 0.44246025
MSS 1.973161e-05 0.16890279 9.775493e-05 1.669563e-05 0.001989744 0.02308267
HLA_DMB
MSI 0.087214095
MSS 0.001833143
sapply(aux[, genes], function(x) tapply(x, aux$EstadÃo, function(g) lillie.test(g)$p.value))
HLA_A HLA_B HLA_C HLA_E HLA_F HLA_G
II 1.225427e-16 1.271567e-11 0.000177941 0.2023203 0.001178098 0.078816748
III 1.160353e-01 1.296040e-02 0.066025059 0.6693122 0.484455912 0.006492417
HLA_DPA1 HLA_DPB1 HLA_DPB2 HLA_DQA1 HLA_DQA2 HLA_DQB1
II 0.8209178 0.4512720 4.817419e-09 4.119761e-05 0.009657786 0.002589807
III 0.7062662 0.6433521 7.326587e-02 5.704059e-07 0.204551682 0.013148893
HLA_DRA HLA_DRB1 HLA_DRB5 HLA_DOA HLA_DOB HLA_DMA
II 0.0004857543 0.2593195 1.866009e-06 2.100874e-09 1.450688e-05 6.704813e-05
III 0.0354632806 0.9361247 5.545616e-03 9.677200e-03 1.229612e-01 2.232885e-02
HLA_DMB
II 0.001490145
III 0.035982025
sapply(aux[, genes], function(x) tapply(x, aux$Grupo, function(g) lillie.test(g)$p.value))
HLA_A HLA_B HLA_C HLA_E HLA_F
Stage II, MSI 4.080332e-07 1.851513e-09 2.169721e-07 0.5348830 0.0003028999
Stage II, MSS 1.283128e-10 3.565077e-08 4.776726e-05 0.3628411 0.0033233221
Stage III, MSI 4.181368e-01 2.538514e-01 5.046414e-01 0.5512281 0.5331002274
Stage III, MSS 2.119283e-01 3.247516e-02 7.374305e-02 0.7007075 0.0916603722
HLA_G HLA_DPA1 HLA_DPB1 HLA_DPB2 HLA_DQA1
Stage II, MSI 0.01881932 0.3205664 0.317888609 0.0018550216 7.286964e-02
Stage II, MSS 0.34083777 0.5259529 0.675124922 0.0001024912 3.255213e-04
Stage III, MSI 0.05810077 0.1207921 0.003648765 0.2972270054 6.897370e-03
Stage III, MSS 0.03648526 0.7232945 0.694119925 0.0638658092 3.685802e-06
HLA_DQA2 HLA_DQB1 HLA_DRA HLA_DRB1 HLA_DRB5
Stage II, MSI 0.434733470 0.0078764664 1.846492e-02 0.30132740 0.0547344832
Stage II, MSS 0.000992465 0.0003500413 3.247153e-03 0.27283497 0.0000047213
Stage III, MSI 0.060050278 0.0267040452 6.410727e-05 0.08475466 0.0550592876
Stage III, MSS 0.079895402 0.0047985771 4.486172e-03 0.69922942 0.0244555191
HLA_DOA HLA_DOB HLA_DMA HLA_DMB
Stage II, MSI 7.992422e-05 0.012549694 0.668478458 0.16011643
Stage II, MSS 2.161703e-06 0.002386683 0.001628236 0.01230053
Stage III, MSI 1.576862e-01 0.269682621 0.591995791 0.12081174
Stage III, MSS 7.584833e-02 0.157800753 0.491511353 0.24079062
sapply(aux[, genes], function(x) tapply(x, aux$CMS, function(g) lillie.test(g)$p.value))
HLA_A HLA_B HLA_C HLA_E HLA_F
CMS1 7.122690e-07 6.206065e-08 3.992279e-06 0.03858435 0.0004192949
CMS2 2.387462e-03 3.754498e-02 5.029994e-01 0.85818361 0.0367717454
CMS3 1.630757e-10 1.274167e-02 2.834644e-01 0.78115287 0.5684743685
CMS4 5.224773e-02 2.306417e-03 2.003745e-01 0.21581476 0.1035650182
HLA_G HLA_DPA1 HLA_DPB1 HLA_DPB2 HLA_DQA1 HLA_DQA2
CMS1 0.0001532326 0.01134752 0.001008081 0.0284953181 3.298449e-05 2.737951e-01
CMS2 0.1072791131 0.41055425 0.593734908 0.0592733323 4.242202e-04 4.779283e-05
CMS3 0.7120072565 0.11792748 0.492627817 0.0503281914 7.443973e-02 3.070700e-01
CMS4 0.1339335567 0.19312608 0.044405869 0.0002891225 8.319083e-03 5.910872e-01
HLA_DQB1 HLA_DRA HLA_DRB1 HLA_DRB5 HLA_DOA HLA_DOB
CMS1 1.195923e-02 0.0004205063 0.06103669 0.22986092 0.005307294 0.11540619
CMS2 8.129524e-05 0.1850794334 0.68234760 0.00175403 0.001263572 0.05334843
CMS3 4.890220e-01 0.0177216328 0.79270127 0.58058795 0.156870194 0.25150044
CMS4 1.139983e-01 0.5935144401 0.67082272 0.06081642 0.004831622 0.65882559
HLA_DMA HLA_DMB
CMS1 0.08809516 0.02505854
CMS2 0.01530076 0.42044963
CMS3 0.03734524 0.66174243
CMS4 0.36261993 0.16630600
# Homocedasticidad
sapply(aux[, genes], function(x) leveneTest(x ~ aux$MSI_MSS)$`Pr(>F)`[1])
HLA_A HLA_B HLA_C HLA_E HLA_F HLA_G
5.534649e-04 2.458906e-04 1.271250e-02 4.324152e-01 1.277413e-05 4.682859e-03
HLA_DPA1 HLA_DPB1 HLA_DPB2 HLA_DQA1 HLA_DQA2 HLA_DQB1
6.063271e-02 1.237921e-01 3.337868e-02 1.659092e-05 9.136438e-02 6.543415e-03
HLA_DRA HLA_DRB1 HLA_DRB5 HLA_DOA HLA_DOB HLA_DMA
8.240220e-01 2.325025e-02 1.877575e-02 6.492622e-04 8.330950e-04 2.389576e-07
HLA_DMB
2.286031e-04
sapply(aux[, genes], function(x) leveneTest(x ~ aux$EstadÃo)$`Pr(>F)`[1])
HLA_A HLA_B HLA_C HLA_E HLA_F HLA_G
0.5525974279 0.3430180378 0.1711205050 0.9273933496 0.0685203054 0.7309293701
HLA_DPA1 HLA_DPB1 HLA_DPB2 HLA_DQA1 HLA_DQA2 HLA_DQB1
0.1621512945 0.5886737976 0.7625293652 0.0003269449 0.0392597247 0.4268537192
HLA_DRA HLA_DRB1 HLA_DRB5 HLA_DOA HLA_DOB HLA_DMA
0.5873365337 0.7457327339 0.8083488016 0.4731056352 0.7497481797 0.0341973951
HLA_DMB
0.3520702900
sapply(aux[, genes], function(x) leveneTest(x ~ aux$Grupo)$`Pr(>F)`[1])
HLA_A HLA_B HLA_C HLA_E HLA_F HLA_G
5.826855e-03 1.524839e-03 3.685053e-02 8.500460e-01 6.548883e-05 2.109130e-02
HLA_DPA1 HLA_DPB1 HLA_DPB2 HLA_DQA1 HLA_DQA2 HLA_DQB1
3.379405e-02 5.841866e-02 1.336806e-01 1.426941e-06 1.843726e-02 5.772216e-04
HLA_DRA HLA_DRB1 HLA_DRB5 HLA_DOA HLA_DOB HLA_DMA
7.691261e-02 8.386636e-03 2.060704e-02 1.883003e-03 2.183706e-02 1.163259e-06
HLA_DMB
8.509403e-05
sapply(aux[, genes], function(x) leveneTest(x ~ aux$CMS)$`Pr(>F)`[1])
HLA_A HLA_B HLA_C HLA_E HLA_F HLA_G
4.484204e-06 7.395708e-03 2.325005e-02 4.577231e-02 2.594631e-04 2.683716e-02
HLA_DPA1 HLA_DPB1 HLA_DPB2 HLA_DQA1 HLA_DQA2 HLA_DQB1
9.057584e-04 6.814962e-05 3.624965e-06 1.545190e-10 4.056387e-03 1.038567e-04
HLA_DRA HLA_DRB1 HLA_DRB5 HLA_DOA HLA_DOB HLA_DMA
5.048597e-03 2.677954e-04 3.526353e-06 9.753256e-15 3.199983e-08 8.960718e-13
HLA_DMB
1.533476e-12
# Tabla 1
# Kruskal para cada gen/grupo
# Grupos de interés
grupos_ag <- c("MSI_MSS", "CMS", "EstadÃo", "Grupo", "Dataset")
# Dataframe para guardar resultados
kruskal_sig <- data.frame(matrix(ncol = length(genes), nrow = 5))
colnames(kruskal_sig) <- genes
rownames(kruskal_sig) <- grupos_ag
# Función kruskal
kruskal <- function(dataset, gen){
result <- list(
# Aplicar el test de kruskal a can gen ~ grupo
# Guardar el p-valor
kruskal.test(as.formula(paste(gen, "~ MSI_MSS")), dataset)$p.value,
kruskal.test(as.formula(paste(gen, "~ CMS")), dataset)$p.value,
kruskal.test(as.formula(paste(gen, "~ EstadÃo")), dataset)$p.value,
kruskal.test(as.formula(paste(gen, "~ Grupo")), dataset)$p.value,
kruskal.test(as.formula(paste(gen, "~ Dataset")), dataset)$p.value
)
# Convertir el resultado en vector
result <- unlist(result)
}
# Aplicar la función en bucle a cada gen
# Ir añadiendo los resultados a la tabla
for (i in 1:length(genes)) {
gen <- genes[i]
p_values <- kruskal(aux, gen)
kruskal_sig[, i] <- p_values
}
# Sustituir los valores de la tabla por NS, *, ** o ***
kruskal_sig <- apply(kruskal_sig, c(1, 2), function(p) {
if (p > 0.05) {
return("NS")
} else if (p <= 0.05 && p > 0.01) {
return("*")
} else if (p <= 0.01 && p > 0.001) {
return("**")
} else {
return("***")
}
})
as.data.frame(kruskal_sig)
HLA_A HLA_B HLA_C HLA_E HLA_F HLA_G HLA_DPA1 HLA_DPB1 HLA_DPB2 HLA_DQA1
MSI_MSS * *** *** *** *** *** *** *** *** ***
CMS *** *** *** *** *** *** *** *** *** ***
EstadÃo NS NS NS NS NS NS NS NS NS NS
Grupo NS ** *** *** *** *** *** *** *** ***
Dataset NS NS NS NS NS NS NS NS * NS
HLA_DQA2 HLA_DQB1 HLA_DRA HLA_DRB1 HLA_DRB5 HLA_DOA HLA_DOB HLA_DMA
MSI_MSS *** *** *** *** *** *** *** ***
CMS *** *** *** *** *** *** *** ***
EstadÃo NS NS NS NS NS NS NS NS
Grupo *** *** *** *** *** *** *** ***
Dataset NS NS NS NS NS NS NS NS
HLA_DMB
MSI_MSS ***
CMS ***
EstadÃo NS
Grupo ***
Dataset NS
# write.csv(t(as.data.frame(kruskal_sig)), "kruskal.csv", row.names = TRUE)
# Función comparación medias 2 a 2
comparaciones_significativas <- function(dataset, gen, grupo) {
# Obtener los niveles del grupo
dataset <- dataset %>% filter(!is.na(.data[[grupo]]), !is.na(.data[[gen]]))
niveles_grupo <- unique(dataset[[grupo]])
# Generar todas las combinaciones posibles de pares de niveles
pares_niveles <- combn(niveles_grupo, 2, simplify = FALSE)
# Inicializar la lista para almacenar comparaciones significativas
comparisons_ <- list()
# Realizar la comparación para cada par de niveles
for (par in pares_niveles) {
# Filtrar el dataset para solo considerar los dos niveles comparados
dataset_filtrado <- dataset %>% filter(.data[[grupo]] %in% par)
# Realizar la comparación
resultado <-
compare_means(as.formula(paste(gen, "~", grupo)), dataset_filtrado)
# Verificar si la comparación es significativa
if (any(resultado$p.adj < 0.05)) {
# Si es significativa, añadir a la lista
comparisons_[[length(comparisons_) + 1]] <- par
}
}
# Crear el dataset comparisons_ que se añade al Environment
nombre_variable <- paste0("comparisons_", gen, "_", grupo)
assign(nombre_variable, comparisons_, envir = .GlobalEnv)
}
# Bucle para generar las comparaciones significativos para cada gen/grupo
for (gen in genes){
for (g in grupos_ag){
comparaciones_significativas(aux, gen, g)
}
}
# Los pares de compraciones significativas que se han generado los uso en el boxplot para indicar para que parejas de valores quiero mostrar el p-valor
# Figuras 6 y 7
# Gráficas boxplot con comparación de medias
# Función boxplot
box_plot <- function(dataset, gen, grupo, my_comparisons) {
# Filtrar los datos nulos
ggplot(dataset %>% filter(!is.na(.data[[gen]]), !is.na(.data[[grupo]])),
# Definir función y tipo de gráfico
aes_string(x = grupo, y = gen)) +
geom_boxplot(color = "black", outlier.shape = NA) +
geom_jitter(position = position_jitter(0.35), size = 1,
aes_string(color = grupo, shape = grupo)) +
# Definir etiquetas de los ejes
labs(y = paste0("log2(", gen, ")"), x = grupo) +
# Diseño del gráfico
theme_classic() +
scale_color_manual(values = c("grey80", "black", "grey60", "grey30", "grey80", "black"), name = NULL) +
scale_shape_manual(values = c(15, 16, 17, 18, 19, 20), name = NULL) +
theme(plot.title = element_text(hjust = 0.5, size = 10, face = "bold"),
panel.spacing = unit(1, "lines"),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.background = element_rect(colour = "white",
fill = NA, size = 0.3),
strip.background = element_rect(colour = "white", fill = "white")) +
# Muestro la comparación de medias para las parejas preestablecidas
stat_compare_means(comparisons = my_comparisons, size = 3.2) +
stat_compare_means(method = "kruskal.test", label = "p.format",
size = 3.5, label.x = Inf, label.y = Inf,
hjust = 1, vjust = 1)
}
# Bucle para generar el gráfico para cada gen/grupo
for (gen in genes) {
for (g in grupos_ag){
comparisons_variable <- paste0("comparisons_", gen, "_", g)
print(box_plot(aux, gen, g, get(comparisons_variable)))
}
}































































































Inferencia de infiltración inmune
# Inferencia de la infiltración
# install.packages("devtools")
# devtools::install_github("cansysbio/ConsensusTME", force = TRUE)
consensus <- consensusTMEAnalysis(ex, cancer = "COAD", statMethod = "gsva")
Producing ConsensusTME Estimates Using The Following Parameters:
Statistical Framework: "gsva"
Gene Sets For Cancer Type: "COAD"
Sample Size: 1079
consensus <- t(consensus)
consensus <- as.data.frame(consensus)
head(consensus)
B_cells Cytotoxic_cells Dendritic_cells Eosinophils Macrophages
A2004_T 0.3846003 0.7574401 0.67385739 0.4963582 0.7454817
A2027_T -0.4675443 -0.7212254 -0.57181756 -0.5615878 -0.6001180
A2050_T -0.3017326 -0.3475906 -0.15946193 -0.2075509 -0.2973827
A2096_T -0.4533999 -0.5976738 -0.63175002 -0.6214394 -0.5710004
B2012_T -0.1011278 0.3055700 -0.01241025 -0.1185626 -0.1088708
B2035_T 0.1910087 0.5850891 0.46836995 0.4463827 0.5767133
Mast_cells NK_cells Neutrophils T_cells_CD4 T_cells_CD8
A2004_T 0.83617145 0.7196722 0.42038227 0.6765331 0.7075507
A2027_T -0.36807097 -0.3982661 -0.54971682 -0.4959341 -0.5454722
A2050_T -0.46779110 -0.2792276 -0.33565943 -0.2626395 -0.1483148
A2096_T -0.60028582 -0.5119227 -0.47687918 -0.6102899 -0.5970780
B2012_T 0.08368983 0.3687848 -0.09216623 0.2187569 0.2185119
B2035_T 0.40402857 0.4921594 0.63712235 0.4993724 0.4944613
T_cells_gamma_delta T_regulatory_cells Macrophages_M1 Macrophages_M2
A2004_T 0.70307239 0.68086769 0.75651814 0.6721188
A2027_T -0.63603256 -0.60666275 -0.65631656 -0.6724274
A2050_T -0.02407792 -0.08748057 -0.09344834 -0.3061692
A2096_T -0.61223525 -0.61304529 -0.62665252 -0.6257849
B2012_T 0.33289496 0.23500475 0.02736597 -0.2901599
B2035_T 0.56691249 0.55422403 0.63236466 0.5851547
Endothelial Fibroblasts Monocytes Plasma_cells Immune_Score
A2004_T 0.6324214 0.6673454 0.5419472 0.48748390 0.60808841
A2027_T -0.6840517 -0.6200350 -0.5185551 -0.31564065 -0.52447000
A2050_T -0.7017832 -0.5607488 -0.3403292 -0.41580284 -0.25857291
A2096_T -0.5785607 -0.5363877 -0.5519135 -0.44922115 -0.54652336
B2012_T -0.2501814 -0.6014534 -0.2075248 0.08238409 0.08995631
B2035_T -0.3318157 0.5212461 0.5666761 -0.01377423 0.45110253
# Nuevo conjunto de datos uniendo esta información al anterior
consensus <- consensus %>%
tibble::rownames_to_column(var = "ID")
merged_data <- aux %>%
inner_join(consensus, by = "ID")
# Figura 10
# Boxplots para infiltración
variables <- c("B_cells", "Cytotoxic_cells", "Dendritic_cells", "Eosinophils",
"Macrophages", "Mast_cells", "NK_cells", "Neutrophils",
"T_cells_CD4", "T_cells_CD8", "T_cells_gamma_delta",
"T_regulatory_cells", "Macrophages_M1", "Macrophages_M2",
"Endothelial", "Fibroblasts", "Monocytes", "Plasma_cells")
# Bucle para generar los gráficos para cada grupo
for (g in grupos_ag) {
data <- merged_data
# Gestionar los valores faltantes de CMS
if (g == "CMS") {
data <- data %>% drop_na()
}
# Función del gráfico
plots <- lapply(variables, function(var) {
ggplot(data, aes(x = .data[[g]], y = .data[[var]])) +
# Tipo de gráfico y diseño
geom_boxplot(color = "black", outlier.shape = NA) +
labs(y = paste(var), x = "") +
theme_classic() +
theme(plot.title = element_text(hjust = 0.5, size = 6, face = "bold"),
axis.text.x = element_text(size = 5),
axis.text.y = element_text(size = 6),
axis.title.y = element_text(size = 8),
panel.spacing = unit(1, "lines"),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.background = element_rect(colour = "white",
fill = NA, size = 0.3),
strip.background = element_rect(colour = "white", fill = "white")) +
# Añadir comparaciones de medias
stat_compare_means(method = "kruskal.test", label = "p.format", size = 3,
label.x = Inf, label.y = Inf, hjust = 1, vjust = 1)
})
grid.arrange(grobs = plots[1:9], ncol = 3, heights = rep(3, 3))
grid.arrange(grobs = plots[10:18], ncol = 3, heights = rep(3, 3))
}










# Figura 12
# Heatmap
# Genes y células de interés
genes <- c("HLA_A", "HLA_B", "HLA_C", "HLA_E", "HLA_F", "HLA_G", "HLA_DPA1",
"HLA_DPB1", "HLA_DPB2", "HLA_DQA1", "HLA_DQA2", "HLA_DQB1", "HLA_DRA",
"HLA_DRB1", "HLA_DRB5", "HLA_DOA", "HLA_DOB", "HLA_DMA", "HLA_DMB")
cell_types <- c("B_cells", "Cytotoxic_cells", "Dendritic_cells", "Eosinophils",
"Macrophages", "Mast_cells", "NK_cells", "Neutrophils",
"T_cells_CD4", "T_cells_CD8", "T_cells_gamma_delta",
"T_regulatory_cells", "Macrophages_M1", "Macrophages_M2",
"Endothelial", "Fibroblasts", "Monocytes", "Plasma_cells")
# Crear las anotaciones
genes_data <- merged_data[, genes]
cells_data <- merged_data[, cell_types]
group_annotation <-
data.frame(Grupo = rep(c("II/MSS", "II/MSI","III/MSS", "III/MSI"), each = 19))
group_annotation$Gen <- rep(genes, times=4)
rownames(group_annotation) <-
paste(group_annotation$Grupo, group_annotation$Gen, sep = "_")
labels <- group_annotation$Gen
group_annotation <- subset(group_annotation, select = -Gen)
# Matriz de correlación por grupos
cor_list <- list()
for (group in unique(merged_data$Grupo)){
group_data <- merged_data[merged_data$Grupo == group,]
genes_data_group <- group_data[, genes]
cells_data_group <- group_data[, cell_types]
cor_matrix_group <- cor(cells_data_group, genes_data_group, method="spearman")
cor_list[[group]] <- cor_matrix_group
}
cor_matrix_all <- do.call(cbind, cor_list)
colnames(cor_matrix_all) <- rownames(group_annotation)
# Definir colores para los grupos
group_colors <- list(Grupo = c("II/MSS" = "#9E0142",
"II/MSI" = "#5E4FA2",
"III/MSS"= "#3288BD",
"III/MSI"= "#CEB944"))
# Crear el heatmap
pheatmap(cor_matrix_all,
annotation_col = group_annotation,
annotation_colors = group_colors,
labels_col = labels,
main = "Heatmap HLA e infiltracion",
color = colorRampPalette(c("blue", "white", "red"))(100),
show_colnames = T, show_rownames = T, fontsize = 8,
fontsize_legend = 4, border_color = NA)

# Nuevo conjunto de datos incluyendo la expresión de NCR1 y genes HLA clase I
# Divido la expresión en "Baja" y "Alta" dependiendo si es mayor o menor que la media
merged_dataNK <- merged_data %>%
select(ID, NK_cells, MSI_MSS, CMS, EstadÃo, Grupo, Dataset, HLA_A, HLA_B,
HLA_C, HLA_E, HLA_F, HLA_G, HLA_DPA1, HLA_DPB1, HLA_DPB2, HLA_DQA1,
HLA_DQA2, HLA_DQB1, HLA_DRA, HLA_DRB1, HLA_DRB5, HLA_DOA,
HLA_DOB, HLA_DMA, HLA_DMB)
merged_dataNK <- merged_dataNK %>%
mutate(
HLA_Ae = ifelse(HLA_A > mean(HLA_A, na.rm = TRUE), "Alta", "Baja"),
HLA_Be = ifelse(HLA_B > mean(HLA_B, na.rm = TRUE), "Alta", "Baja"),
HLA_Ce = ifelse(HLA_C > mean(HLA_C, na.rm = TRUE), "Alta", "Baja"),
HLA_Ee = ifelse(HLA_E > mean(HLA_E, na.rm = TRUE), "Alta", "Baja"),
HLA_Fe = ifelse(HLA_F > mean(HLA_F, na.rm = TRUE), "Alta", "Baja"),
HLA_Ge = ifelse(HLA_G > mean(HLA_G, na.rm = TRUE), "Alta", "Baja"),
HLA_DPA1e = ifelse(HLA_DPA1 > mean(HLA_DPA1, na.rm = TRUE), "Alta", "Baja"),
HLA_DPB1e = ifelse(HLA_DPB1 > mean(HLA_DPB1, na.rm = TRUE), "Alta", "Baja"),
HLA_DPB2e = ifelse(HLA_DPB2 > mean(HLA_DPB2, na.rm = TRUE), "Alta", "Baja"),
HLA_DQA1e = ifelse(HLA_DQA1 > mean(HLA_DQA1, na.rm = TRUE), "Alta", "Baja"),
HLA_DQA2e = ifelse(HLA_DQA2 > mean(HLA_DQA2, na.rm = TRUE), "Alta", "Baja"),
HLA_DQB1e = ifelse(HLA_DQB1 > mean(HLA_DQB1, na.rm = TRUE), "Alta", "Baja"),
HLA_DRAe = ifelse(HLA_DRA > mean(HLA_DRA, na.rm = TRUE), "Alta", "Baja"),
HLA_DRB1e = ifelse(HLA_DRB1 > mean(HLA_DRB1, na.rm = TRUE), "Alta", "Baja"),
HLA_DRB5e = ifelse(HLA_DRB5 > mean(HLA_DRB5, na.rm = TRUE), "Alta", "Baja"),
HLA_DOAe = ifelse(HLA_DOA > mean(HLA_DOA, na.rm = TRUE), "Alta", "Baja"),
HLA_DOBe = ifelse(HLA_DOB > mean(HLA_DOB, na.rm = TRUE), "Alta", "Baja"),
HLA_DMAe = ifelse(HLA_DMA > mean(HLA_DMA, na.rm = TRUE), "Alta", "Baja"),
HLA_DMBe = ifelse(HLA_DMB > mean(HLA_DMB, na.rm = TRUE), "Alta", "Baja")
)
NK <- data.frame(NCR1 = log2(ext[, "NCR1"]), ID = rownames(ext))
merged_dataNK <- merge(merged_dataNK, NK, by = "ID")
head(merged_dataNK)
ID NK_cells MSI_MSS CMS EstadÃo Grupo Dataset HLA_A
1 A2004_T 0.7196722 MSS CMS4 II Stage II, MSS CLX 13.63571
2 A2027_T -0.3982661 MSS CMS3 II Stage II, MSS CLX 12.21874
3 A2050_T -0.2792276 MSS CMS2 II Stage II, MSS CLX 13.03213
4 A2096_T -0.5119227 MSS CMS3 II Stage II, MSS CLX 11.23669
5 B2012_T 0.3687848 MSS CMS3 II Stage II, MSS CLX 11.46315
6 B2035_T 0.4921594 MSS CMS1 II Stage II, MSS CLX 12.94721
HLA_B HLA_C HLA_E HLA_F HLA_G HLA_DPA1 HLA_DPB1 HLA_DPB2
1 13.23626 13.51612 10.891565 10.197729 9.433319 10.910054 11.474009 2.947866
2 12.09741 10.54568 9.553953 8.674593 9.184430 5.722075 8.253218 3.455365
3 12.75953 10.52239 10.531914 8.403979 9.055393 9.228111 10.353839 2.881734
4 10.57240 10.43151 8.799970 7.939218 8.607552 6.982154 8.927037 4.043707
5 12.48574 10.41478 10.439635 10.294109 9.292815 9.765838 8.348707 2.728756
6 13.59113 13.45148 10.906514 9.669588 10.563523 11.130406 9.839938 3.283621
HLA_DQA1 HLA_DQA2 HLA_DQB1 HLA_DRA HLA_DRB1 HLA_DRB5 HLA_DOA HLA_DOB
1 7.525320 7.324660 10.475698 12.771633 11.516642 6.783054 8.248665 5.156801
2 6.433657 7.468530 4.411973 8.232406 5.384877 5.697542 4.706607 4.427860
3 6.109228 7.712724 8.297106 10.749566 9.491044 6.065309 5.974780 4.455425
4 6.103023 7.925025 7.513132 8.694044 7.166937 5.912588 5.076255 4.140544
5 5.826041 8.144257 9.524114 12.084706 10.263096 6.589685 5.339500 4.696205
6 6.507400 8.571823 5.184729 12.427694 11.266601 7.091718 6.775576 5.060046
HLA_DMA HLA_DMB HLA_Ae HLA_Be HLA_Ce HLA_Ee HLA_Fe HLA_Ge HLA_DPA1e
1 10.299078 10.292908 Alta Alta Alta Alta Alta Baja Alta
2 6.222437 6.430098 Baja Baja Baja Baja Baja Baja Baja
3 8.319735 8.251854 Alta Alta Baja Baja Baja Baja Alta
4 6.262953 6.751396 Baja Baja Baja Baja Baja Baja Baja
5 9.469835 9.138292 Baja Baja Baja Baja Alta Baja Alta
6 10.649061 9.397652 Alta Alta Alta Alta Alta Alta Alta
HLA_DPB1e HLA_DPB2e HLA_DQA1e HLA_DQA2e HLA_DQB1e HLA_DRAe HLA_DRB1e
1 Alta Baja Alta Baja Alta Alta Alta
2 Baja Alta Alta Baja Baja Baja Baja
3 Alta Baja Baja Baja Alta Baja Alta
4 Baja Alta Baja Alta Alta Baja Baja
5 Baja Baja Baja Alta Alta Alta Alta
6 Alta Alta Alta Alta Baja Alta Alta
HLA_DRB5e HLA_DOAe HLA_DOBe HLA_DMAe HLA_DMBe NCR1
1 Alta Alta Alta Alta Alta 2.091668
2 Baja Baja Baja Baja Baja 1.384394
3 Baja Baja Baja Baja Alta 1.592652
4 Baja Baja Baja Baja Baja 1.467244
5 Baja Baja Baja Alta Alta 1.333561
6 Alta Alta Alta Alta Alta 1.777775
# Figura 14A
# Función para comparar la expresión de NCR1 para un gen especÃfico en función de las categorÃas "Baja" y "Alta" mediante boxplot para cada Grupo
NK_HLA <- function(dataset, gen){
ggplot(dataset, aes(x = .data[[gen]], y = NCR1)) +
geom_boxplot(outlier.shape = NA, alpha = 0.6) +
geom_jitter(width = 0.2, height = 0, alpha = 0.5) +
facet_wrap(~ Grupo) +
stat_compare_means(method = "wilcox.test", label = "p.format", size = 3,
label.x = Inf, label.y = Inf, hjust = 1, vjust = 1) +
theme_minimal() +
labs(
title = paste("Comparación de la expresión de NCR1 entre las categorÃas de",
substr(gen, 1, nchar(gen) - 1)),
x = NULL,
y = "log2(NCR1)"
) +
theme(legend.position = "none")
}
exp_HLA <- c("HLA_Ae", "HLA_Be", "HLA_Ce", "HLA_Ee", "HLA_Fe", "HLA_Ge",
"HLA_DPA1e", "HLA_DPB1e", "HLA_DPB2e", "HLA_DQA1e", "HLA_DQA2e",
"HLA_DQB1e", "HLA_DRAe", "HLA_DRB1e", "HLA_DRB5e", "HLA_DOAe",
"HLA_DOBe", "HLA_DMAe", "HLA_DMBe")
for (gen in exp_HLA){
print(NK_HLA(merged_dataNK, gen))
}



















# Figura 14B
# Función para comparar la infiltración de NK para un gen especÃfico en función de las categorÃas "Baja" y "Alta" mediante boxplot para cada Grupo
NK_HLA <- function(dataset, gen){
ggplot(dataset, aes(x = .data[[gen]], y = NK_cells)) +
geom_boxplot(outlier.shape = NA, alpha = 0.6) +
geom_jitter(width = 0.2, height = 0, alpha = 0.5) +
facet_wrap(~ Grupo) +
stat_compare_means(method = "wilcox.test", label = "p.format", size = 3,
label.x = Inf, label.y = Inf, hjust = 1, vjust = 1) +
theme_minimal() +
labs(
title = paste("Comparación de la expresión de NK_cells entre las categorÃas de",
substr(gen, 1, nchar(gen) - 1)),
x = NULL,
y = "NK_cells"
) +
theme(legend.position = "none")
}
for (gen in exp_HLA){
print(NK_HLA(merged_dataNK, gen))
}


















