---
title: "Evaluación del módulo Bioestadística"
subtitle: "Procesamiento y Simulación de Datos Biológicos"
author: "JESUS AMADO MORINIGO SOSA"
date: "Curso 2025-2026"
urlcolor: blue
output:
  html_document:
    toc: no
---

0. Lectura de los datos

# Load the three input tables from the data/ directory. Text columns are kept
# as character vectors rather than being converted to factors.
metadatos <- read.csv(
  file = "data/metadatos.csv",
  stringsAsFactors = FALSE
)
farmacos <- read.csv(
  file = "data/farmacos.csv",
  stringsAsFactors = FALSE
)
datos_clinicos <- read.csv(
  file = "data/datos_clinicos.csv",
  stringsAsFactors = FALSE
)

# Report how many rows and columns each of the loaded tables has.
tablas_leidas <- list(datos_clinicos, farmacos, metadatos)
kable(
  data.frame(
    fichero = c("datos_clinicos", "farmacos", "metadatos"),
    filas = vapply(tablas_leidas, nrow, integer(1)),
    columnas = vapply(tablas_leidas, ncol, integer(1))
  )
)
fichero filas columnas
datos_clinicos 1888 4
farmacos 249 2
metadatos 249 4

1. Análisis descriptivo

# Attach the drug table and then the metadata table to every clinical record,
# matching rows on the shared ID column. Records without a match are kept.
df <- left_join(
  left_join(datos_clinicos, farmacos, by = "ID"),
  metadatos,
  by = "ID"
)

# Keep a single metadata row per ID (the first one found) so that every ID
# contributes once to the averages below.
meta_unicos <- distinct(metadatos, ID, .keep_all = TRUE)

# Mean of the Age and Weight columns, ignoring missing values.
kable(
  data.frame(
    Edad_media = mean(meta_unicos$Age, na.rm = TRUE),
    Peso_medio = mean(meta_unicos$Weight, na.rm = TRUE)
  ),
  digits = 2
)
Edad_media Peso_medio
12.73 26.12
# Minimum, quartiles, mean and maximum of the Dias, Volumen and Metastasis
# columns of the joined table.
variables_clinicas <- select(df, Dias, Volumen, Metastasis)
kable(summary(variables_clinicas))
Dias Volumen Metastasis
Min. : 0.0 Min. :22.05 Min. :0.000
1st Qu.: 5.0 1st Qu.:45.00 1st Qu.:0.000
Median :20.0 Median :48.95 Median :1.000
Mean :19.6 Mean :50.45 Mean :1.023
3rd Qu.:30.0 3rd Qu.:56.32 3rd Qu.:2.000
Max. :45.0 Max. :78.57 Max. :4.000

Ratones por sexo

# Count distinct IDs per value of Sex: first reduce the joined table to one
# row per (ID, Sex) pair, then tally the pairs.
sexo_por_raton <- distinct(df, ID, Sex)
kable(count(sexo_por_raton, Sex))
Sex n
Female 124
Male 125

Ratones por fármaco

# Count distinct IDs per drug and list the drugs from most to fewest IDs.
farmaco_por_raton <- distinct(df, ID, Farmaco)
kable(
  farmaco_por_raton %>%
    count(Farmaco) %>%
    arrange(desc(n))
)
Farmaco n
Capomulin 25
Ceftamin 25
Infubinol 25
Ketapril 25
Naftisol 25
Placebo 25
Propriva 25
Ramicane 25
Zoniferol 25
Stelasyn 24

Ratones macho por fármaco

# Spellings of the Sex column that are treated as "male".
etiquetas_macho <- c("Male", "M", "macho", "Macho")

# One row per (ID, Farmaco, Sex), restricted to males, then tallied per drug.
kable(
  df %>%
    distinct(ID, Farmaco, Sex) %>%
    filter(Sex %in% etiquetas_macho) %>%
    count(Farmaco)
)
Farmaco n
Capomulin 12
Ceftamin 12
Infubinol 13
Ketapril 16
Naftisol 12
Placebo 12
Propriva 13
Ramicane 16
Stelasyn 9
Zoniferol 10

Volumen final por fármaco

# For every ID keep the record taken on its largest Dias value (missing days
# are ignored when finding the maximum). If several records share that day,
# only the first one is kept, so the result has at most one row per ID.
en_ultimo_dia <- filter(group_by(df, ID), Dias == max(Dias, na.rm = TRUE))
ultimo <- distinct(ungroup(en_ultimo_dia), ID, .keep_all = TRUE)

# Per-drug descriptive statistics of Volumen in the last-day table: group
# size, range, median, mean, standard deviation and variance.
volumen_por_farmaco <- summarise(
  group_by(ultimo, Farmaco),
  n = n(),
  min = min(Volumen),
  mediana = median(Volumen),
  media = mean(Volumen),
  max = max(Volumen),
  sd = sd(Volumen),
  var = var(Volumen),
  .groups = "drop"
)

# Drugs are listed from the lowest to the highest mean volume.
resumen_vol <- arrange(volumen_por_farmaco, media)

kable(resumen_vol, digits = 2)
Farmaco n min mediana media max sd var
Ramicane 25 22.05 36.56 36.19 45.22 5.67 32.17
Capomulin 25 23.34 38.13 36.67 47.69 5.72 32.66
Propriva 25 45.00 55.84 56.74 72.46 8.33 69.35
Ceftamin 25 45.00 59.85 57.75 68.92 8.37 69.98
Infubinol 25 36.32 60.17 58.18 72.23 8.60 74.01
Zoniferol 25 45.00 61.84 59.18 73.32 8.77 76.86
Placebo 25 45.00 62.03 60.51 73.21 8.87 78.76
Stelasyn 24 45.00 62.19 61.00 75.12 9.50 90.33
Naftisol 25 45.00 63.28 61.21 76.67 10.30 106.03
Ketapril 25 45.00 64.49 62.81 78.57 9.95 98.92
# Boxplot of Volumen per drug in the last-day table, with the drugs ordered by
# their median Volumen and the axes flipped so drug names read horizontally.
# The drug axis is labelled explicitly; otherwise ggplot2 would use the whole
# reorder(...) expression as the axis title.
ggplot(ultimo, aes(x = reorder(Farmaco, Volumen, FUN = median), y = Volumen)) +
  geom_boxplot() +
  coord_flip() +
  labs(x = "Fármaco")

Variación del volumen (desde 45)

# Reference volume subtracted from every last-day volume (the section heading
# describes the variation as measured "from 45").
volumen_referencia <- 45
ultimo <- mutate(ultimo, Variacion = Volumen - volumen_referencia)

# Mean and standard deviation of that variation for each drug, listed from the
# smallest to the largest mean variation.
variacion_por_farmaco <- summarise(
  group_by(ultimo, Farmaco),
  media_variacion = mean(Variacion),
  sd_variacion = sd(Variacion),
  .groups = "drop"
)
kable(arrange(variacion_por_farmaco, media_variacion), digits = 2)
Farmaco media_variacion sd_variacion
Ramicane -8.81 5.67
Capomulin -8.33 5.72
Propriva 11.74 8.33
Ceftamin 12.75 8.37
Infubinol 13.18 8.60
Zoniferol 14.18 8.77
Placebo 15.51 8.87
Stelasyn 16.00 9.50
Naftisol 16.21 10.30
Ketapril 17.81 9.95

Metástasis final por fármaco

# Mean, median and standard deviation of Metastasis per drug in the last-day
# table, listed from the lowest to the highest mean.
metastasis_por_farmaco <- summarise(
  group_by(ultimo, Farmaco),
  media = mean(Metastasis),
  mediana = median(Metastasis),
  sd = sd(Metastasis),
  .groups = "drop"
)
kable(arrange(metastasis_por_farmaco, media), digits = 2)
Farmaco media mediana sd
Ramicane 1.20 1 0.87
Capomulin 1.28 1 0.98
Stelasyn 1.46 1 1.06
Propriva 1.56 1 1.36
Infubinol 1.60 1 1.22
Ceftamin 1.72 1 1.43
Ketapril 1.92 2 1.66
Naftisol 2.00 2 1.35
Zoniferol 2.00 2 1.50
Placebo 2.08 2 1.50
# Boxplot of Metastasis per drug in the last-day table, with the drugs ordered
# by their median Metastasis and the axes flipped so drug names read
# horizontally. The drug axis is labelled explicitly; otherwise ggplot2 would
# use the whole reorder(...) expression as the axis title.
ggplot(ultimo, aes(x = reorder(Farmaco, Metastasis, FUN = median), y = Metastasis)) +
  geom_boxplot() +
  coord_flip() +
  labs(x = "Fármaco")

2. Subconjunto de 4 fármacos

# The four drugs retained for the remaining sections.
subfarm <- c("Capomulin", "Infubinol", "Ketapril", "Placebo")

# Last-day records restricted to those four drugs.
ultimo4 <- filter(ultimo, Farmaco %in% subfarm)

# Number of records per drug in the subset.
kable(count(ultimo4, Farmaco))
Farmaco n
Capomulin 25
Infubinol 25
Ketapril 25
Placebo 25
# Mean and standard deviation of Volumen and of Metastasis for each drug in
# the four-drug subset.
resumen_subconjunto <- summarise(
  group_by(ultimo4, Farmaco),
  media_vol = mean(Volumen),
  sd_vol = sd(Volumen),
  media_met = mean(Metastasis),
  sd_met = sd(Metastasis),
  .groups = "drop"
)
kable(resumen_subconjunto, digits = 2)
Farmaco media_vol sd_vol media_met sd_met
Capomulin 36.67 5.72 1.28 0.98
Infubinol 58.18 8.60 1.60 1.22
Ketapril 62.81 9.95 1.92 1.66
Placebo 60.51 8.87 2.08 1.50
# Per-drug boxplots for the four-drug subset: Volumen in p1 and Metastasis
# in p2.
p1 <- ggplot(data = ultimo4, mapping = aes(x = Farmaco, y = Volumen)) +
  geom_boxplot()
p2 <- ggplot(data = ultimo4, mapping = aes(x = Farmaco, y = Metastasis)) +
  geom_boxplot()

# Draw both plots side by side in a single row.
grid.arrange(p1, p2, ncol = 2)

3. ANOVA

# One-way ANOVA of Volumen on Farmaco over the four-drug subset.
modelo_aov <- aov(Volumen ~ Farmaco, data = ultimo4)

# ANOVA table. The p-value column is formatted with format.pval() so that a
# very small p-value is shown as "<1e-04" instead of being rounded to an
# exact 0, which is not a value a p-value can take.
tabla_aov <- as.data.frame(anova(modelo_aov))
tabla_aov[["Pr(>F)"]] <- format.pval(
  tabla_aov[["Pr(>F)"]],
  digits = 3,
  eps = 1e-4
)
kable(tabla_aov, digits = 4)
Df Sum Sq Mean Sq F value Pr(>F)
Farmaco 3 10915.314 3638.4380 51.1816 0
Residuals 96 6824.529 71.0888 NA NA
# Tukey HSD pairwise comparisons between drugs for the fitted ANOVA model.
# Adjusted p-values are formatted with format.pval() so that values below
# 1e-06 are shown as "<1e-06" instead of being rounded to 0.000000.
tabla_tukey <- as.data.frame(TukeyHSD(modelo_aov)$Farmaco)
tabla_tukey[["p adj"]] <- format.pval(
  tabla_tukey[["p adj"]],
  digits = 3,
  eps = 1e-6
)
kable(tabla_tukey, digits = 6)
diff lwr upr p adj
Infubinol-Capomulin 21.510678 15.275454 27.745903 0.000000
Ketapril-Capomulin 26.138624 19.903399 32.373848 0.000000
Placebo-Capomulin 23.840847 17.605622 30.076071 0.000000
Ketapril-Infubinol 4.627946 -1.607279 10.863170 0.218170
Placebo-Infubinol 2.330168 -3.905056 8.565393 0.762787
Placebo-Ketapril -2.297777 -8.533002 3.937447 0.770412

4. Correlación y Regresión

# Mean Volumen per drug in the four-drug subset, sorted in increasing order.
medias4 <- arrange(
  summarise(
    group_by(ultimo4, Farmaco),
    media = mean(Volumen),
    .groups = "drop"
  ),
  media
)

# The drug with the lowest mean Volumen is the first row after sorting.
mejor <- medias4[["Farmaco"]][1]

# Records of the subset that belong to that drug.
df_best <- filter(ultimo4, Farmaco == mejor)

# Correlation test between Weight and Volumen for the selected drug
# (cor.test defaults, i.e. Pearson's product-moment correlation).
cor_res <- cor.test(df_best$Weight, df_best$Volumen)

# Report the coefficient and its p-value. The p-value is formatted with
# format.pval() so that a very small value is shown as "<1e-06" instead of
# being rounded to an exact 0.
kable(data.frame(
  Farmaco = mejor,
  r = unname(cor_res$estimate),
  p_value = format.pval(cor_res$p.value, digits = 3, eps = 1e-6)
), digits = 6)
Farmaco r p_value
Capomulin 0.876706 0
# Simple linear regression of Volumen on Weight for the selected drug.
modelo_lm <- lm(Volumen ~ Weight, data = df_best)

# Coefficient table. The p-value column is formatted with format.pval() so
# that values below 1e-06 are shown as "<1e-06" instead of 0.000000.
tabla_coef <- as.data.frame(summary(modelo_lm)$coefficients)
tabla_coef[["Pr(>|t|)"]] <- format.pval(
  tabla_coef[["Pr(>|t|)"]],
  digits = 3,
  eps = 1e-6
)
kable(tabla_coef, digits = 6)
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1.448147 4.068291 0.355960 0.725115
Weight 1.750468 0.200265 8.740754 0.000000
# Scatter plot of Volumen against Weight for the selected drug, with the
# fitted straight line and its confidence band. The formula is stated
# explicitly so geom_smooth() does not print its "using formula" message.
ggplot(df_best, aes(x = Weight, y = Volumen)) +
  geom_point() +
  geom_smooth(method = "lm", formula = y ~ x, se = TRUE)