Estadística descriptiva

Author

Luis La Cruz & German Chacón

library(readxl)
library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.1     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.1
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(egg)
Loading required package: gridExtra

Attaching package: 'gridExtra'

The following object is masked from 'package:dplyr':

    combine
library(tidyverse)
library(ggplot2)
library(ggpmisc)
Loading required package: ggpp
Registered S3 methods overwritten by 'ggpp':
  method                  from   
  heightDetails.titleGrob ggplot2
  widthDetails.titleGrob  ggplot2

Attaching package: 'ggpp'

The following object is masked from 'package:ggplot2':

    annotate

Registered S3 method overwritten by 'ggpmisc':
  method                  from   
  as.character.polynomial polynom
library(broom)
library(ggplot2)
library(patchwork)
library(egg)
library(ggpubr)

Attaching package: 'ggpubr'

The following objects are masked from 'package:ggpp':

    as_npc, as_npcx, as_npcy

The following object is masked from 'package:egg':

    ggarrange
library(readxl)
library(tidyverse)
library(egg)
library(tidyverse)
library(ggplot2)
library(dplyr)

dat_clean=read.csv("dat_clean_modified_zscore_especies.csv")

dat_clean$Banda <- factor(dat_clean$Banda,
  levels = c("35-45","45-90","90-170","170-260"),labels = c("35-45","45-90","90-170","170-260"))

dat_clean=dat_clean %>%
  mutate(Value_linear = 10^(Value/10))  #%>%
  #filter(!is.na(Value_linear))

unique(dat_clean$Class)
[1] "Anchoveta"    "Plancton"     "Vinciguerria" "Salpas"       "Múnida"      
#library(explore)
#explore(dat_clean)

descriptiva_clean=dat_clean %>% 
  group_by(Class) %>%
  get_summary_stats(type = "common")  %>%
  dplyr::filter(variable=="Value")

dclass1=descriptiva_clean%>% 
  select(-variable,-iqr,-mean,-sd,-se,-ci)

library(dplyr)
library(magrittr)  # Para el operador %>%

Attaching package: 'magrittr'
The following object is masked from 'package:purrr':

    set_names
The following object is masked from 'package:tidyr':

    extract
library(seewave)

Attaching package: 'seewave'
The following object is masked from 'package:lubridate':

    duration
The following object is masked from 'package:readr':

    spec
# Calcular la media en dB por grupo
descriptiva_clean2 <- dat_clean %>%
  group_by(Class) %>%
  mutate(mean_dB = meandB(Value), sd_dB=sddB(Value, level="SPL"),
    sd_lineal=sd(Value_linear))  # Calcular la media en dB usando seewave::meandB()

summary_table <- descriptiva_clean2 %>%
  group_by(Class) %>%
  summarise(mean_dB = mean(mean_dB),mean_sd_dB = mean(sd_dB),mean_sd_lineal = mean(sd_lineal))  # Calcular el promedio de mean_dB por grupo

# Mostrar la tabla resumen
d2=summary_table %>%
  select(mean_dB, mean_sd_dB)

Tabla_D=cbind(dclass1,d2)%>%
  rename(Especie="Class",Mínimo="min",Máximo="max",Mediana="median",Media="mean_dB", sd="mean_sd_dB")%>%
  mutate_if(is.numeric, ~round(., 2))

Tabla_D
       Especie      n Mínimo Máximo Mediana  Media    sd
1    Anchoveta 207204 -83.66 -22.36  -55.35 -45.51 11.45
2       Múnida  13462 -77.92 -38.57  -58.66 -54.64  6.80
3     Plancton  54934 -89.94 -40.77  -66.25 -59.95  8.96
4       Salpas   2675 -73.67 -59.05  -66.45 -65.72  2.46
5 Vinciguerria 155306 -89.28 -42.55  -66.01 -58.33  9.96
descriptiva_clean3=dat_clean %>% 
  group_by(Class,Banda) %>%
  get_summary_stats(type = "common") %>%
  dplyr::filter(variable=="Value")

d3=descriptiva_clean3%>% 
  select(-variable,-iqr,-mean,-sd,-se,-ci)%>%
   dplyr::rename(Especie="Class",Mínimo="min",Máximo="max",Mediana="median")%>%
  mutate_if(is.numeric, ~round(., 2))
library(dplyr)
library(magrittr)  # Para el operador %>%
library(seewave)

# Calcular la media en dB por grupo
descriptiva_clean4 <- dat_clean %>%
  group_by(Class, Banda) %>%
  mutate(mean_dB = meandB(Value), sd_dB=sddB(Value, level="SPL"),
    sd_lineal=sd(Value_linear))  # Calcular la media en dB usando 

summary_table4 <- descriptiva_clean4 %>%
  group_by(Class, Banda) %>%
  summarise(mean_dB = mean(mean_dB),mean_sd_dB = mean(sd_dB),mean_sd_lineal = mean(sd_lineal))  # Calcular el promedio de mean_dB por grupo
`summarise()` has grouped output by 'Class'. You can override using the
`.groups` argument.
# Mostrar la tabla resumen
d4=summary_table4 %>%
  select(mean_dB, mean_sd_dB)
Adding missing grouping variables: `Class`
Tabla_D=cbind(d3,d4)


Tabla_D%>%
  select(-Class)%>%
 rename(Media="mean_dB", sd="mean_sd_dB")%>%
  mutate_if(is.numeric, ~round(., 2))
        Especie   Banda     n Mínimo Máximo Mediana  Media    sd
1     Anchoveta   35-45 10135 -82.51 -26.47  -48.56 -41.68  8.87
2     Anchoveta   45-90 35034 -82.76 -22.50  -53.72 -44.12 11.61
3     Anchoveta  90-170 69232 -81.97 -22.36  -54.90 -44.96 11.72
4     Anchoveta 170-260 92803 -83.66 -23.86  -56.98 -47.73 10.57
5        Múnida   35-45   636 -76.16 -52.27  -62.81 -61.45  4.27
6        Múnida   45-90  2438 -77.92 -39.95  -64.08 -59.97  7.31
7        Múnida  90-170  3922 -75.24 -38.57  -60.52 -55.79  6.87
8        Múnida 170-260  6466 -69.51 -39.38  -55.99 -52.93  5.57
9      Plancton   35-45  2507 -89.94 -45.26  -68.20 -63.15  7.98
10     Plancton   45-90  9659 -87.04 -41.45  -69.87 -63.77  9.02
11     Plancton  90-170 17148 -87.89 -40.77  -65.75 -59.20  9.11
12     Plancton 170-260 25620 -87.92 -41.08  -65.02 -59.36  8.38
13       Salpas   35-45    87 -68.16 -59.08  -61.72 -61.65  1.99
14       Salpas   45-90   465 -73.67 -59.07  -68.01 -67.41  2.90
15       Salpas  90-170   781 -69.08 -59.05  -66.42 -65.62  2.00
16       Salpas 170-260  1342 -70.61 -59.24  -66.11 -65.72  2.02
17 Vinciguerria   35-45  7209 -87.93 -42.56  -63.50 -56.71  8.74
18 Vinciguerria   45-90 27786 -89.28 -42.55  -67.88 -58.41 11.25
19 Vinciguerria  90-170 46489 -83.68 -42.55  -67.39 -58.44 10.81
20 Vinciguerria 170-260 73822 -82.09 -42.55  -65.04 -58.43  9.12
ggplot(dat_clean, aes(y = Value, x=as.numeric(Frequency)))+
  geom_line(alpha=0.5, aes(color=Class),lwd=1, show.legend = F)+
  geom_smooth(size=1,method = "gam", color="black", show.legend = F)+
  theme_presentation() +
  xlab("Frecuencia (kHz)") +
  ylab("Sv (dB)")+
  scale_x_continuous(breaks = c(38,70,90,120,170,200,260))+
  geom_vline(xintercept = c(38,45,90,170,260),linetype = c("dashed"),color="gray")+
  geom_hline(yintercept = c(-80,-70,-60,-50,-40,-30,-20),linetype = c("dashed"),color="gray")+
  scale_color_viridis_d(option = "C")+
  theme(legend.title=element_blank())+
  facet_wrap(~Class,nrow = 3)

######################################################

ggplot(dat_clean, aes(y = Value, x=as.numeric(Frequency)))+
  geom_smooth(size=1,method = "gam", aes(color=Class), show.legend = F)+
  theme_presentation() +
  xlab("Frecuencia (kHz)") +
  ylab("Sv (dB)")+
  scale_x_continuous(breaks = c(38,70,90,120,170,200,260))+
  geom_vline(xintercept = c(38,45,90,170,260),linetype = c("dashed"),color="gray")+
  geom_hline(yintercept = c(-80,-70,-60,-50,-40,-30,-20),linetype = c("dashed"),color="gray")+
  scale_color_viridis_d()+
  theme(legend.title=element_blank())+
  facet_wrap(~Class,nrow = 3)

######################################################


ggplot(dat_clean, aes(y = Value, x=as.numeric(Frequency)))+
  geom_line(alpha=0.5, aes(color=Class),lwd=1, show.legend = F)+
  geom_smooth(size=1,method = "lm", color="black", show.legend = F)+
  stat_regline_equation()+
  theme_presentation() +
  xlab("Frecuencia (kHz)") +
  ylab("Sv (dB)")+
  scale_x_continuous(breaks = c(38,70,90,120,170,200,260))+
  geom_vline(xintercept = c(38,45,90,170,260),linetype = c("dashed"),color="gray")+
  geom_hline(yintercept = c(-80,-70,-60,-50,-40,-30,-20),linetype = c("dashed"),color="gray")+
  scale_color_viridis_d()+
  theme(legend.title=element_blank())+
  facet_wrap(~Class,nrow = 3)

######################################################

ggplot(dat_clean, aes(y = Value, x=as.numeric(Frequency)))+
  geom_smooth(size=1,method = "lm", aes(color=Class),show.legend = F)+
  stat_regline_equation()+
  theme_presentation() +
  xlab("Frecuencia (kHz)") +
  ylab("Sv (dB)")+
  scale_x_continuous(breaks = c(38,70,90,120,170,200,260))+
  geom_vline(xintercept = c(38,45,90,170,260),linetype = c("dashed"),color="gray")+
  geom_hline(yintercept = c(-80,-70,-60,-50,-40,-30,-20),linetype = c("dashed"),color="gray")+
  scale_color_viridis_d()+
  theme(legend.title=element_blank())+
  facet_wrap(~Class,nrow = 3)

w=ggplot(dat_clean, aes(y = Value, x=as.numeric(Frequency),color=Class) )+
  #geom_line(alpha=0.1, aes(color=group),lwd=1, show.legend = T)+
  geom_smooth(size=1.5,method = "lm", show.legend = T)+
  scale_y_continuous(limits = c(-65,-45))+
  #stat_regline_equation()+
  theme_presentation() +
  xlab("Frecuencia (kHz)") +
  ylab("Sv (dB)")+
  scale_x_continuous(breaks = c(38,70,90,120,170,200,260))+
  geom_vline(xintercept = c(38,45,90,170,260),linetype = c("dashed"),color="gray")+
  geom_hline(yintercept = c(-70,-60,-50,-40),linetype = c("dashed"),color="gray")+
  scale_color_viridis_d()+
  theme(legend.title=element_blank())+
   theme(panel.grid.major.y = element_line(color = "gray", linetype = "dashed"))+
       theme(panel.grid.major.x = element_line(color = "gray", linetype = "dashed"))



k=ggplot(dat_clean, aes(y = Value, x=as.numeric(Frequency),color=Class) )+
  #geom_line(alpha=0.1, aes(color=group),lwd=1, show.legend = T)+
  geom_smooth(size=1.5,method = "gam", show.legend = F)+
  #stat_regline_equation()+
  theme_presentation() +
  xlab("Frecuencia (kHz)") +
  ylab("Sv (dB)")+
  scale_y_continuous(limits = c(-65,-45))+
  scale_x_continuous(breaks = c(38,70,90,120,170,200,260))+
  geom_vline(xintercept = c(38,45,90,170,260),linetype = c("dashed"),color="gray")+
  geom_hline(yintercept = c(-70,-60,-50,-40),linetype = c("dashed"),color="gray")+
  scale_color_viridis_d()+
  theme(legend.title=element_blank())+
   theme(panel.grid.major.y = element_line(color = "gray", linetype = "dashed"))+
     theme(panel.grid.major.x = element_line(color = "gray", linetype = "dashed"))


library(cowplot)

# Crear las gráficas w y k (código que proporcionaste)

# Obtener la leyenda de una de las gráficas (por ejemplo, w)
legend_w <- get_legend(w)

# Combinar las dos gráficas y agregar la leyenda
combined_plot <- plot_grid(
  k + theme(legend.position = "none"),  # Ocultar la leyenda de la gráfica w
  w + theme(legend.position = "none"),  # Ocultar la leyenda de la gráfica k
  legend_w,
  ncol = 3, rel_heights = c(1, 1, 1),rel_widths = c(1,1,0.35),  # Ajustar las alturas relativas
  labels = c("(a)", "(b)", ""),  # Etiquetas de enumeración
  align = "h"  # Alinear horizontalmente las partes
)

# Ajustar el tamaño de la leyenda
combined_plot <- combined_plot + theme(
  legend.text = element_text(size = 19),  # Tamaño del texto de la leyenda
  legend.title = element_text(size = 14)  # Tamaño del título de la leyenda
)

# Imprimir la figura combinada
print(combined_plot)

ggplot(dat_clean, aes(x = Value, group = Class, color = Class))+
    stat_ecdf(size=1.5,pad = T) +
  theme_presentation() +
  scale_x_continuous(breaks = c(-90,-80,-70,-60,-50,-40,-30,-20))+
  scale_color_viridis_d()+
  xlab("Sv (dB)") +
  ylab("ECDF")+
  ggtitle("Prueba K-S") +
  theme(legend.position = "right")+
  theme(panel.grid.major.y = element_line(color = "gray", linetype = "dashed"))+
  theme(panel.grid.major.x = element_line(color = "gray", linetype = "dashed"),
    legend.title=element_blank())

#library(coin)

#oneway_test(Value ~ Class, data = dat_clean)
# Realizar la prueba K-S para todas las combinaciones de grupos
# Asegúrate de que dat_clean esté definido correctamente
# dat_clean <- ...

# Asegúrate de que dat_clean esté definido correctamente
# dat_clean <- ...

# Crear un data frame para almacenar los resultados
results <- data.frame(group1 = character(),
                      group2 = character(),
                      D_value = numeric(),
                      p_value = numeric(),
                      stringsAsFactors = FALSE)

# Obtener grupos únicos
groups <- unique(dat_clean$Class)

# Realizar la prueba K-S para todas las combinaciones de grupos
for (i in 1:(length(groups) - 1)) {
  for (j in (i + 1):length(groups)) {
    group1 <- groups[i]
    group2 <- groups[j]
    
    # Subconjuntos de datos para los dos grupos
    data_group1 <- dat_clean$Value[dat_clean$Class == group1]
    data_group2 <- dat_clean$Value[dat_clean$Class == group2]
    
    # Realizar la prueba K-S
    test_result <- ks.test(data_group1, data_group2)
    
    # Almacenar resultados en el data frame results
    results <- rbind(results, data.frame(
      group1 = group1,
      group2 = group2,
      D_value = round(test_result$statistic, 2),
      p_value = round(test_result$p.value, 2)
    ))
  }
}

# Mostrar el resultado final con las modificaciones requeridas
print(results)
         group1       group2 D_value p_value
D     Anchoveta     Plancton    0.51       0
D1    Anchoveta Vinciguerria    0.50       0
D2    Anchoveta       Salpas    0.75       0
D3    Anchoveta       Múnida    0.25       0
D4     Plancton Vinciguerria    0.08       0
D5     Plancton       Salpas    0.27       0
D6     Plancton       Múnida    0.41       0
D7 Vinciguerria       Salpas    0.27       0
D8 Vinciguerria       Múnida    0.41       0
D9       Salpas       Múnida    0.68       0
library(ggplot2)

ggplot(dat_clean)+
  geom_boxplot(alpha=0.5,size=0.75, aes(fill=Class,y = Value, x=Class), show.legend = F)+
  theme_presentation(base_size = 16) +
  ylab("Sv (dB)")+
  xlab("Especie")+
  scale_fill_viridis_d()+
  theme(legend.position = "top")+
  theme(panel.grid.major.y = element_line(color = "gray", linetype = "dashed"))

library(dplyr)
library(rstatix)
library(ggplot2)


df_wilcox <- dat_clean %>%
  pairwise_wilcox_test(Value ~ Class) %>%
  add_y_position(step.increase = 0.2)




library(ggplot2)

ggplot(dat_clean)+
  geom_boxplot(alpha=0.5,size=0.75, aes(fill=Class,y = Value, x=Class), show.legend = F)+
  theme_presentation(base_size = 16) +
  ylab("Sv (dB)")+
  xlab("Especie")+
  scale_fill_viridis_d()+
  theme(legend.position = "none")+
  theme(panel.grid.major.y = element_line(color = "gray", linetype = "dashed"))+
  stat_kruskal_test(aes(y = Value, x=Class, group=Class),p.adjust.method = "bonferroni",label.x = 1,label.y = 55)+#label = "as_detailed_italic"
   stat_pvalue_manual(df_wilcox,color ="group1",step.group.by="group1",tip.length = 0,step.increase = 0.02)+
  scale_color_viridis_d(name = "Especie")

library(ggplot2)
library(dplyr)

# Realizar la prueba de Kruskal-Wallis
kw_test <- kruskal.test(Value_linear ~ Class, data = dat_clean)

# Realizar las pruebas de Wilcoxon pairwise con corrección de Bonferroni
pw_test <- pairwise.wilcox.test(dat_clean$Value_linear, dat_clean$Class,
                                p.adjust.method = "bonferroni")


kw_test 

    Kruskal-Wallis rank sum test

data:  Value_linear by Class
Kruskal-Wallis chi-squared = 139112, df = 4, p-value < 2.2e-16
pw_test

    Pairwise comparisons using Wilcoxon rank sum test with continuity correction 

data:  dat_clean$Value_linear and dat_clean$Class 

             Anchoveta Múnida  Plancton Salpas 
Múnida       < 2e-16   -       -        -      
Plancton     < 2e-16   < 2e-16 -        -      
Salpas       < 2e-16   < 2e-16 1        -      
Vinciguerria < 2e-16   < 2e-16 < 2e-16  4.6e-05

P value adjustment method: bonferroni 
ggplot(dat_clean)+
  geom_boxplot(alpha=0.5,size=0.75, aes(fill=Class,y = Value, x=Class), show.legend = F)+
  theme_presentation(base_size = 14) +
  ylab("Sv (dB)")+
  xlab("Especie")+
  scale_fill_viridis_d()+
  theme(legend.position = "top")+ 
  theme(panel.grid.major.y = element_line(color = "gray", linetype = "dashed"))+
  facet_wrap(~Banda)

df_wilcox_banda <- dat_clean %>%
  group_by(Banda)%>%
  pairwise_wilcox_test(Value ~ Class) %>%
  add_y_position(step.increase = 0.2)

df_wilcox_banda 
# A tibble: 40 × 12
   Banda .y.   group1    group2          n1    n2 statistic         p     p.adj
   <fct> <chr> <chr>     <chr>        <int> <int>     <dbl>     <dbl>     <dbl>
 1 35-45 Value Anchoveta Múnida       10135   636  6005753  5.20e-293 4.16e-292
 2 35-45 Value Anchoveta Plancton     10135  2507 24355309  0         0        
 3 35-45 Value Anchoveta Salpas       10135    87   819016  2.66e- 43 1.33e- 42
 4 35-45 Value Anchoveta Vinciguerria 10135  7209 65264036. 0         0        
 5 35-45 Value Múnida    Plancton       636  2507  1152750. 9.08e- 68 5.45e- 67
 6 35-45 Value Múnida    Salpas         636    87    23116. 1.3 e-  2 3.8 e-  2
 7 35-45 Value Múnida    Vinciguerria   636  7209  2207930  1.23e-  1 1.23e-  1
 8 35-45 Value Plancton  Salpas        2507    87    42818  5.19e- 22 2.08e- 21
 9 35-45 Value Plancton  Vinciguerria  2507  7209  5017376. 5.01e-242 3.51e-241
10 35-45 Value Salpas    Vinciguerria    87  7209   357660. 2.4 e-  2 4.8 e-  2
# ℹ 30 more rows
# ℹ 3 more variables: p.adj.signif <chr>, y.position <dbl>, groups <named list>
ggplot(dat_clean)+
  geom_boxplot(alpha=0.5,size=0.75, aes(fill=Class,y = Value, x=Class), show.legend = F)+
  theme_presentation(base_size = 14) +
  ylab("Sv (dB)")+
  xlab("Especie")+
  scale_fill_viridis_d()+
  theme(legend.position = "none")+ 
  theme(panel.grid.major.y = element_line(color = "gray", linetype = "dashed"))+

    stat_kruskal_test(aes(y = Value, x=Class,group = Class),p.adjust.method = "bonferroni",label.x.npc = 0.5,label.y.npc = 0.5)+#label = "as_detailed_italic"
   stat_pvalue_manual(df_wilcox_banda,color ="group1",step.group.by="group1",tip.length = 0,step.increase = 0.02)+
  scale_color_viridis_d()+
    facet_wrap(~Banda)

ggplot(dat_clean)+
  geom_boxplot(alpha=0.5,size=0.75, aes(fill=Class,y = Value, x=Banda), show.legend = F)+
  theme_presentation(base_size = 15) +
  ylab("Sv (dB)")+ 
  xlab("Frecuencia (kHz)")+
  scale_fill_viridis_d()+
  theme(legend.position = "top")+ 
  theme(panel.grid.major.y = element_line(color = "gray", linetype = "dashed"))+
  facet_wrap(~Class,scale="free")