#DISEÑO EN MEDIDAS REPETIDAS

#Primeros diseños donde el tiempo juega un papel en el modelado, los anteriores no tomaban esta variable.

#REPEATED MEASURES ANOVA IN R

LINK: https://www.datanovia.com/en/lessons/repeated-measures-anova-in-r/

#Factor - intrasujetos= Tiempo

#Factor - entresuejtos = FSCA, FSBA, FCCA, FCBA.

#Medidas repetidas en: una vía, dos vías y tres vías.

#MEDIDAS REPETIDAS DE UNA VÍA Antes se llamaba #PREPARACIÓN DE LOS DATOS #UNA VIA: Cuado hay un solo factor y solo está el TIEMPO #id:Tiempo se convierte como factor no como número

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(tidyr)
# install.packages("datarium")
data("selfesteem", package = "datarium")

# Todos los rendimientos se colocaron en un solo formato largo en vez del ancho (todo en una sola columna).
#Los primeros son de t1, los siguientes son de t2 y los últimos son
#Función gather.

datos = selfesteem

# De formato ancho a largo
datos = datos %>%
gather(key = "tiempo", 
         value  = "rto",
         t1, t2, t3) %>%
mutate_at(vars(id, tiempo), as.factor)
View(datos)

boxplot(datos)

#Una vía = cuando hay un solo factor y es precisamente el tiempo. No tratamiento. No bloque. Una sola respuesta. #Cortes en diferentes tiempos (tiempos equidistances) t1 = 30 días despues de la siembra. t2 = 60 días despues de la siembra. t3 = 90 días despues de la siembr

# Resumen estadistico
datos%>%
  group_by(tiempo)%>%
  summarise(media = mean (rto), 
            desv = sd (rto),
            n = n (),
            cv =100*desv/media)

## # A tibble: 3 × 5
##   tiempo media  desv     n    cv
##   <fct>  <dbl> <dbl> <int> <dbl>
## 1 t1      3.14 0.552    10  17.6
## 2 t2      4.93 0.863    10  17.5
## 3 t3      7.64 1.14     10  15.0

#NOTA: Coeficiente varianzacion <20% se asocian a datos con comprotamiento normal, es decir, es bueno. (POCA VARIABILIDAD) 
  #SI son superiores a 20% se observa heterogeneidad.

boxplot(datos$rto~ datos$tiempo)

#DETECCIÓN OUTLIER

#install.packages(rstatix)
library(rstatix)

## 
## Attaching package: 'rstatix'

## The following object is masked from 'package:stats':
## 
##     filter

datos %>%
  group_by(tiempo) %>%
  identify_outliers(rto)

## # A tibble: 2 × 5
##   tiempo id      rto is.outlier is.extreme
##   <fct>  <fct> <dbl> <lgl>      <lgl>     
## 1 t1     6      2.05 TRUE       FALSE     
## 2 t2     2      6.91 TRUE       FALSE

#PROBANDO NORMALIDAD
datos %>%
  group_by(tiempo) %>%
  shapiro_test(rto)

## # A tibble: 3 × 4
##   tiempo variable statistic     p
##   <fct>  <chr>        <dbl> <dbl>
## 1 t1     rto          0.967 0.859
## 2 t2     rto          0.876 0.117
## 3 t3     rto          0.923 0.380

#Aparecen p-valores, se le hace prueba de normalidad a cada tiempo (TODOS LOS DATOS EN CADA TIEMPO SON NORMALES) si se tiene un tiempo con datos anormales se pone en la nota y pueden ser causados por datos atipicos

#SUPUESTO DE ESPERICIDAD (IGUALDAD DE VARIANZAS) #Compara las varianzas entre tiempos

res.aov <-anova_test(data = datos, dv = rto, wid = id, within=tiempo)
#ESFERICIDAD
res.aov$`Mauchly's Test for Sphericity`

##   Effect     W     p p<.05
## 1 tiempo 0.551 0.092

get_anova_table(res.aov)

## ANOVA Table (type III tests)
## 
##   Effect DFn DFd      F        p p<.05   ges
## 1 tiempo   2  18 55.469 2.01e-08     * 0.829

#No hay problema de varianzas desiguales. #SI LA PRUEBA DE MAUCHLY— da con varianzas desiguales se debe usar otra metodologia (buscar cual es)

#ges (tamaño del efecto) #Se rechaza H0, es decir, el aceite que se produce en los tres tiempos es diferente. Es decir, el tiempo de corte tiene efecto en la abundancia en el aceite. EL corte tres da más aceite. #no se usa tukey sino bonferroni

#se usa el p-valor ajustado (p.adj)
datos %>%
  pairwise_t_test(
    rto ~ tiempo,
    paired = TRUE,
    p.adjust.method 
    = "bonferroni")

## # A tibble: 3 × 10
##   .y.   group1 group2    n1    n2 statistic    df           p p.adj p.adj.signif
## * <chr> <chr>  <chr>  <int> <int>     <dbl> <dbl>       <dbl> <dbl> <chr>       
## 1 rto   t1     t2        10    10     -4.97     9 0.000772     2e-3 **          
## 2 rto   t1     t3        10    10    -13.2      9 0.000000334  1e-6 ****        
## 3 rto   t2     t3        10    10     -4.87     9 0.000886     3e-3 **

res.aov <- anova_test(data = datos,
                      dv = rto,
                      wid = id,
                      within = tiempo)

# Esfericidad
res.aov$`Mauchly's Test for Sphericity`

##   Effect     W     p p<.05
## 1 tiempo 0.551 0.092

get_anova_table(res.aov)

## ANOVA Table (type III tests)
## 
##   Effect DFn DFd      F        p p<.05   ges
## 1 tiempo   2  18 55.469 2.01e-08     * 0.829

#NOTA: El p-valor que se usa es el ajustado (todos se rechazan) El rendimie.nto del t1 y t2 son diferentes en cuanto a rendimiento.

#se usa el p-valor ajustado (p.adj)

datos %>%
  pairwise_t_test(
    rto ~ tiempo,
    paired = TRUE,
    p.adjust.method = 
      "bonferroni")

## # A tibble: 3 × 10
##   .y.   group1 group2    n1    n2 statistic    df           p p.adj p.adj.signif
## * <chr> <chr>  <chr>  <int> <int>     <dbl> <dbl>       <dbl> <dbl> <chr>       
## 1 rto   t1     t2        10    10     -4.97     9 0.000772     2e-3 **          
## 2 rto   t1     t3        10    10    -13.2      9 0.000000334  1e-6 ****        
## 3 rto   t2     t3        10    10     -4.87     9 0.000886     3e-3 **

#Resultado: Todos los tiempo son diferentes en rendimiento. El mejor es el t3. Los peores con el t1 y t2.

#DOS VÍAS: Tiempo y un factor Se aplicó un fertilizante. Interesados en saber si la fertilización ayudó y el tiempo influyó.

data("selfesteem2", package = "datarium")

datos2 = selfesteem2

datos2$treatment = gl(2,12,24, c('con fert', 'sin fert'))

#convertir formato largo

datos2 = datos2 %>% 
  gather(key='tiempo', value = 'rto',
         t1,t2,t3)
View(datos2)

#Resumen estadistico

datos2 %>%
  group_by(treatment, tiempo) %>%
  summarise(media = mean(rto),
            desv = sd(rto),
            n = n(),
            cv = 100*desv/media)

## `summarise()` has grouped output by 'treatment'. You can override using the
## `.groups` argument.

## # A tibble: 6 × 6
## # Groups:   treatment [2]
##   treatment tiempo media  desv     n    cv
##   <fct>     <chr>  <dbl> <dbl> <int> <dbl>
## 1 con fert  t1      88    8.08    12  9.18
## 2 con fert  t2      83.8 10.2     12 12.2 
## 3 con fert  t3      78.7 10.5     12 13.4 
## 4 sin fert  t1      87.6  7.62    12  8.70
## 5 sin fert  t2      87.8  7.42    12  8.45
## 6 sin fert  t3      87.7  8.14    12  9.28

Coeficientes de variancion <20%, son homogeneos los datos. El que tiene mejor aceite es el que tiene la media más alta.

#Visualización

library(ggplot2)

ggplot(datos2)+
  aes(tiempo, rto, fill=treatment)+
  geom_boxplot()

Con fertilizante los t1 y t2 parecen iguales, si el cuadro es más grande (tiene mayor variabilidad). Lo que muestra el boxplot es que fertilizar o no fertilizar da lo mismo en el rendimiento.

Analizarlos outliner

datos2 %>%
  group_by(treatment, tiempo)%>%
  identify_outliers(rto)

## [1] treatment  tiempo     id         rto        is.outlier is.extreme
## <0 rows> (or 0-length row.names)

#REVISIÓN SUPUESTOS

datos2 %>%
group_by(treatment, tiempo) %>%
  shapiro_test(rto)

## # A tibble: 6 × 5
##   treatment tiempo variable statistic      p
##   <fct>     <chr>  <chr>        <dbl>  <dbl>
## 1 con fert  t1     rto          0.828 0.0200
## 2 con fert  t2     rto          0.868 0.0618
## 3 con fert  t3     rto          0.887 0.107 
## 4 sin fert  t1     rto          0.919 0.279 
## 5 sin fert  t2     rto          0.923 0.316 
## 6 sin fert  t3     rto          0.886 0.104

Se hace sobre los datos de aceite y no de los residuales.

#ANALISIS DE VARIANZA

res.aov <- anova_test(
  data = datos2,
  dv = rto,
  wid = id,
  within = c(treatment,
             tiempo)
  )
get_anova_table(res.aov)

## ANOVA Table (type III tests)
## 
##             Effect  DFn   DFd      F        p p<.05   ges
## 1        treatment 1.00 11.00 15.541 2.00e-03     * 0.059
## 2           tiempo 1.31 14.37 27.369 5.03e-05     * 0.049
## 3 treatment:tiempo 2.00 22.00 30.424 4.63e-07     * 0.050

Nota: El primer valor que se revisa es la interaccion, <5% si hay interacción, por tanto, no se mira ni tiempo ni tratamiento. NO SE DEBEN HACER COMPARACIONES CUANDO HAY INTERACCION

#PROCEDIMIENTO PARA HACER CUANDO HAY INTERACCIÓN #GRAFICO DE INTERACCIÓN

interaction.plot(datos2$tiempo,
                 datos2$treatment,
                 datos2$rto)

res.aov$`Mauchly's Test for Sphericity`

##             Effect     W     p p<.05
## 1           tiempo 0.469 0.023     *
## 2 treatment:tiempo 0.616 0.089

get_anova_table(res.aov)

## ANOVA Table (type III tests)
## 
##             Effect  DFn   DFd      F        p p<.05   ges
## 1        treatment 1.00 11.00 15.541 2.00e-03     * 0.059
## 2           tiempo 1.31 14.37 27.369 5.03e-05     * 0.049
## 3 treatment:tiempo 2.00 22.00 30.424 4.63e-07     * 0.050

datos2 %>% 
  group_by(tiempo, treatment) %>% 
  summarise(mean_rto = mean(rto)) %>% 
  ggplot()+
  aes(tiempo, mean_rto,
      color=treatment,
      group=treatment)+
  geom_point(size=5)+
  geom_line(linewidth=3)

## `summarise()` has grouped output by 'tiempo'. You can override using the
## `.groups` argument.

##NOTA:SI LAS LINEAS SE CRUZAN HAY INTERACCION. Se observa una diferencia en el momento t2 y t3 con fertilizante y en el t1 no hya diferencias, por tanto, hay interaccion. Si no hubiese interaccion, serian iguales las lineas (constante)

##Medidas repetidadas en 3 vías

library(rstatix)
library(datarium)

set.seed(123)
data("weightloss", package = "datarium")

datos3 = weightloss %>%
  rename(FQ = diet,
         FO = exercises) %>%
pivot_longer(c(t1,t2,t3),
              names_to = 'tiempo',
              values_to = 'rto')
datos3

## # A tibble: 144 × 5
##    id    FQ    FO    tiempo   rto
##    <fct> <fct> <fct> <chr>  <dbl>
##  1 1     no    no    t1      10.4
##  2 1     no    no    t2      13.2
##  3 1     no    no    t3      11.6
##  4 2     no    no    t1      11.6
##  5 2     no    no    t2      10.7
##  6 2     no    no    t3      13.2
##  7 3     no    no    t1      11.4
##  8 3     no    no    t2      11.1
##  9 3     no    no    t3      11.4
## 10 4     no    no    t1      11.1
## # ℹ 134 more rows

table(datos3$FQ, datos3$FO, datos3$tiempo)

## , ,  = t1
## 
##      
##       no yes
##   no  12  12
##   yes 12  12
## 
## , ,  = t2
## 
##      
##       no yes
##   no  12  12
##   yes 12  12
## 
## , ,  = t3
## 
##      
##       no yes
##   no  12  12
##   yes 12  12

PARTE DESCRIPTIVA ANALITICA (Tabular)

datos3 %>%
  group_by(FQ,FO, tiempo) %>%
  get_summary_stats(rto, type = "mean_sd")

## # A tibble: 12 × 7
##    FQ    FO    tiempo variable     n  mean    sd
##    <fct> <fct> <chr>  <fct>    <dbl> <dbl> <dbl>
##  1 no    no    t1     rto         12  10.9 0.868
##  2 no    no    t2     rto         12  11.6 1.30 
##  3 no    no    t3     rto         12  11.4 0.935
##  4 no    yes   t1     rto         12  10.8 1.27 
##  5 no    yes   t2     rto         12  13.4 1.01 
##  6 no    yes   t3     rto         12  16.8 1.53 
##  7 yes   no    t1     rto         12  11.7 0.938
##  8 yes   no    t2     rto         12  12.4 1.42 
##  9 yes   no    t3     rto         12  13.8 1.43 
## 10 yes   yes   t1     rto         12  11.4 1.09 
## 11 yes   yes   t2     rto         12  13.2 1.22 
## 12 yes   yes   t3     rto         12  14.7 0.625

#PIPE: %>% AGRUPAR (Entre los factores (FQ, FO, tiempo)) 
#Se pueden ver todas las combinaciones entre los factores

#NOTA: Cuando se aplica solamente fertilizante organico en el tiempo 3 se tiene mayor rendimiento. Por otro lado, el rendimiento más bajo se tiene cuando se aplica fertilizante organico en el tiempo 1.

PARTE DESCRIPTIVA VISUALIZACIÓN

library (ggpubr)
ggboxplot(
  datos3, x = "FQ", y = "rto",
  color = "tiempo",palette = 'jco',
  facet.by = "FO", short.panel.labs = FALSE
)

#NOTA: El mejor tratamiento organico en el tiempo 3 es mejor que el tratamiento químico. tiempo

##IDENTIFICACIÓN DE DATOS ATÍPICOS

datos3 %>%
  group_by(FQ, FO, tiempo) %>%
  identify_outliers(rto)

## # A tibble: 5 × 7
##   FQ    FO    tiempo id      rto is.outlier is.extreme
##   <fct> <fct> <chr>  <fct> <dbl> <lgl>      <lgl>     
## 1 no    no    t3     2      13.2 TRUE       FALSE     
## 2 yes   no    t1     1      10.2 TRUE       FALSE     
## 3 yes   no    t1     3      13.2 TRUE       FALSE     
## 4 yes   no    t1     4      10.2 TRUE       FALSE     
## 5 yes   no    t2     10     15.3 TRUE       FALSE

#NOTA: Como no tiene ningún “true” en el lado extremo (is.extreme), no hay datos atípicos. Si apareciera un extremo se diría “Este dato podría afectar la estabilidad de todos los resultados”

#PRUEBA NORMALIDAD

ggplot(datos3)+
aes(rto)+
  geom_density()

ggplot(datos3)+
aes(rto, fill = tiempo )+
  geom_density(alpha=0.5)

##PRUEBA DE NORMALIDAD A TODOS LOS DATOS EN TODOS LOS TIEMPOS

#Se hacen varias pruebas de normalidad. 
1.

## [1] 1

shapiro.test(datos$rto)

## 
##  Shapiro-Wilk normality test
## 
## data:  datos$rto
## W = 0.94886, p-value = 0.1576

2.

## [1] 2

library(nortest)
lillie.test(datos3$rto)

## 
##  Lilliefors (Kolmogorov-Smirnov) normality test
## 
## data:  datos3$rto
## D = 0.10013, p-value = 0.001229

3.

## [1] 3

cvm.test(datos3$rto)

## 
##  Cramer-von Mises normality test
## 
## data:  datos3$rto
## W = 0.23031, p-value = 0.002205

4.

## [1] 4

sf.test(datos3$rto)

## 
##  Shapiro-Francia normality test
## 
## data:  datos3$rto
## W = 0.96406, p-value = 0.001295

5.

## [1] 5

ad.test(datos3$rto)

## 
##  Anderson-Darling normality test
## 
## data:  datos3$rto
## A = 1.432, p-value = 0.001021

#PRUEBA DE SIMETRIA

library(lawstat)
symmetry.test(datos3$rto)

## 
##  m-out-of-n bootstrap symmetry test by Miao, Gel, and Gastwirth (2006)
## 
## data:  datos3$rto
## Test statistic = 2.2399, p-value = 0.032
## alternative hypothesis: the distribution is asymmetric.
## sample estimates:
## bootstrap optimal m 
##                  19

#NOTA: Esta prueba indica que tiene datos simetricos cuando se usa el 1%. Aún sin normalidad, se puede conformar al haber simetria.

#PARTE INFERENCIAL - ANALISIS DE VARIANZA

rest.aov =  anova_test(
  data = datos3, dv = rto, wid = id,
  within = c(FQ, FO, tiempo)
  )

  get_anova_table(rest.aov)

## ANOVA Table (type III tests)
## 
##         Effect  DFn   DFd       F        p p<.05   ges
## 1           FQ 1.00 11.00   6.021 3.20e-02     * 0.028
## 2           FO 1.00 11.00  58.928 9.65e-06     * 0.284
## 3       tiempo 2.00 22.00 110.942 3.22e-12     * 0.541
## 4        FQ:FO 1.00 11.00  75.356 2.98e-06     * 0.157
## 5    FQ:tiempo 1.38 15.17   0.603 5.01e-01       0.013
## 6    FO:tiempo 2.00 22.00  20.826 8.41e-06     * 0.274
## 7 FQ:FO:tiempo 2.00 22.00  14.246 1.07e-04     * 0.147

##NOTA: Hay triple interacción (1.07e-04), por esta razón, no se puede interpretar los otros resultados del aov. Esto quiere decir, que una fertilización es mejor en un tiempo y es la peor en otro. No se tiene un mejor tratamiento. #Aquí se mira el p-valor

#GRAFICO DE INTERACCIÓN #Cuandp hay interacción se hacen los graficos correspondientes a cada fertilizante

interaction.plot(datos3$tiempo,
            datos3$FQ,
            datos3$rto)

interaction.plot(datos3$tiempo,
            datos3$FO,
            datos3$rto)

#NOTA: En el tiempo 3 son iguales en el fertilizante quimico, sin embargo, en el fertilizante organico tuvo peores rendimientos en el t1 y mejor en el t3.

#Con presencia de interacción NO SE HACE COMPARACIONES DE MEDIA

#TABLA DE MEDIAS

tbl = tapply(datos3$rto,list(datos3$FQ,
                      datos3$FO,
                      datos3$tiempo),
      mean)
addmargins(tbl, FUN = mean)

## Margins computed over dimensions
## in the following order:
## 1: 
## 2: 
## 3:

## , , t1
## 
##            no      yes     mean
## no   10.90917 10.79417 10.85167
## yes  11.74250 11.39333 11.56792
## mean 11.32583 11.09375 11.20979
## 
## , , t2
## 
##            no      yes     mean
## no   11.56583 13.42083 12.49333
## yes  12.41583 13.22500 12.82042
## mean 11.99083 13.32292 12.65688
## 
## , , t3
## 
##            no      yes     mean
## no   11.45000 16.81750 14.13375
## yes  13.78667 14.65500 14.22083
## mean 12.61833 15.73625 14.17729
## 
## , , mean
## 
##            no      yes     mean
## no   11.30833 13.67750 12.49292
## yes  12.64833 13.09111 12.86972
## mean 11.97833 13.38431 12.68132

#NOTA: #Izq: FQ #Dere: FO #En el t1 se tiene que casi no se nota la interacción, porque al identificar el valor más grande de FO (11.32) y FQ (11.56) se unen en un punto(11.74),por otro lado, en el t2, donde el mayor es FQ y FO, donde al unirse la columna de los margenes no da lo mismo que el mayor de los valores dentro del cuerpo de la tabla. #Por tanto, se podría decir que los tiempo t2 y t3 son los que causan la interacción y el más grande ocurre en el t3 (16.81) donde se aplica si aplica el FO y no aplica el FQ.

#IMPORTANTE Si no se tiene ningun tipo de interacción se , puede analizar en el aov (F1, F2 Y t) haciendo al comparación de medias (post-hock y tukey o bonferroni). Estableciendo cual es el mejor del factor 1 y cuales el mejor del factor 2 y cual es mejor en el tiempo.

#En un caso hipotetico donde la interacción involucre el F1 y el t, como no se toma el f2 se puede interpretar el anova (prueba de comparaciones) para ese factor y no el F1 ni el tiempo.

Clase 2/06/23

Natalia Deaquiz

2023-06-02

Analizarlos outliner

Se hace sobre los datos de aceite y no de los residuales.