Objetivo 2: Datos y limpieza

Author

Luis La Cruz & German Chacón

Published

September 2, 2024

library(readxl)
library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.1     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.1
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(egg)
Cargando paquete requerido: gridExtra

Adjuntando el paquete: 'gridExtra'

The following object is masked from 'package:dplyr':

    combine
library(tidyverse)
library(ggplot2)
library(ggpmisc)
Cargando paquete requerido: ggpp
Registered S3 methods overwritten by 'ggpp':
  method                  from   
  heightDetails.titleGrob ggplot2
  widthDetails.titleGrob  ggplot2

Adjuntando el paquete: 'ggpp'

The following object is masked from 'package:ggplot2':

    annotate
library(broom)
library(ggplot2)
library(patchwork)
library(egg)
library(ggpubr)

Adjuntando el paquete: 'ggpubr'

The following objects are masked from 'package:ggpp':

    as_npc, as_npcx, as_npcy

The following object is masked from 'package:egg':

    ggarrange
library(readxl)
library(tidyverse)
library(egg)
library(tidyverse)
# Read the complete data sheet of the thesis workbook.
datos_sv <- read_excel("Datos_FM_Tesis_moda.xlsx", sheet = "Total_data")

# Wide -> long: columns 14 to 285 become (Frequency, Value) pairs, where
# Frequency holds the former column name.
# NOTE(review): the selection is positional; confirm the sheet layout is stable.
datos_sv_longer <- datos_sv %>%
  pivot_longer(
    cols = 14:285,
    names_to = "Frequency",
    values_to = "Value"
  )

# Depth classes of the school, as left-closed intervals [a, b).
datos_sv_longer$Range <- cut(
  as.numeric(datos_sv_longer$Depth_school),
  breaks = c(0, 5, 10, 15, 20, 25, 50, 100, 150, 200, 250, 500),
  include.lowest = FALSE,
  right = FALSE,
  labels = c("0-5", "5-10", "10-15", "15-20", "20-25", "25-50", "50-100",
             "100-150", "150-200", "200-250", "250-500")
)

# Frequency comes from the column names (character), so convert it once and
# reuse the same break points for both band labellings.
frecuencia_num <- as.numeric(datos_sv_longer$Frequency)
cortes_banda <- c(0, 18.1, 45, 90, 170, 270)

# Short band code (b0 ... b4).
datos_sv_longer$Banda_name <- cut(
  frecuencia_num,
  breaks = cortes_banda,
  include.lowest = FALSE,
  right = FALSE,
  labels = c("b0", "b1", "b2", "b3", "b4")
)

# Human-readable band label.
datos_sv_longer$Banda <- cut(
  frecuencia_num,
  breaks = cortes_banda,
  include.lowest = FALSE,
  right = FALSE,
  labels = c("18", "35-45", "45-90", "90-170", "170-260")
)

# Keep only the Sv records. filter() drops rows whose Data is NA, whereas
# `df[cond, ]` would have inserted an all-NA row for each of them.
Data_Sv <- datos_sv_longer %>%
  filter(Data == "Sv")

Datos Sv y Std por especies

# Restrict the Sv records to the classes analysed in this report.
clases_de_interes <- c(
  "Anchoveta", "Múnida", "Vinciguerria", "Plancton", "Salpas", "Otros"
)
Data_Sv <- filter(Data_Sv, Class %in% clases_de_interes)

#Data_Std=datos_sv_longer[datos_sv_longer$Data=="Std",]
#Data_Std <- Data_Std %>%
#filter(Class %in% c("Anchoveta", "Múnida","Vinciguerria","Plancton","Salpas","Otros"))

Datos Sv y Std “Anchoveta x tallas”

# Anchoveta size modes kept for the analysis (Subclass_n values), in
# ascending length order, and the short labels used for the `group` factor.
niveles_moda <- c(
  "Moda 3.5 cm", "Moda 4 cm", "Moda 5 cm", "Moda 7.5 cm", "Moda 10.5 cm",
  "Moda 11 cm", "Moda 12 cm", "Moda 12.5 cm", "Moda 13.5 cm"
)
etiquetas_moda <- c("3.5", "4", "5", "7.5", "10.5", "11", "12", "12.5", "13.5")

# Frequency bands kept as factor levels; any other band label becomes NA.
niveles_banda <- c("35-45", "45-90", "90-170", "170-260")

dat <- Data_Sv %>%
  filter(Class == "Anchoveta", Subclass_n %in% niveles_moda) %>%
  mutate(
    # Size mode as an ordered-by-length factor with short labels
    group = factor(Subclass_n, levels = niveles_moda, labels = etiquetas_moda),
    Banda = factor(Banda, levels = niveles_banda, labels = niveles_banda),
    # Sv converted from dB to the linear domain
    Value_linear = 10^(Value / 10)
  )

################

# dat_std=Data_Std[Data_Std$Class=="Anchoveta",]
# 
# dat_std <-dat_std %>%
#   filter(Subclass_n %in% c("Moda 3.5 cm", "Moda 4 cm", "Moda 5 cm", "Moda 7.5 cm","Moda 10.5 cm", "Moda 11 cm", "Moda 12 cm","Moda 12.5 cm","Moda 13 cm","Moda 13.5 cm"))
# 
# dat_std$group <- factor(dat_std$Subclass_n,      # Reordering group factor levels
#                          levels = c("Moda 3.5 cm", "Moda 4 cm", "Moda 5 cm", "Moda 7.5 cm","Moda 10.5 cm", "Moda 11 cm", "Moda 12 cm","Moda 12.5 cm","Moda 13 cm","Moda 13.5 cm"))

Plot densidad “sin filtrar”

# Number of observations in each size-mode group.
observaciones <- dat %>%
  group_by(group) %>%
  summarise(n = n())

# Density of Sv per size mode (one facet per group) with the group's number
# of observations printed in the bottom-right corner of each facet.
# The geom_text() layer is now chained to the plot with `+`; it used to be a
# detached expression, so the counts never appeared on the figure and the
# layer object was printed instead. `linewidth` replaces the `size` aesthetic
# for lines, which has been deprecated since ggplot2 3.4.0.
ggplot(dat) +
  geom_density(aes(x = Value, fill = group), alpha = 1, linewidth = 0.75) +
  theme_presentation(base_size = 16) +
  facet_wrap(facets = "group", ncol = 3, strip.position = "top") +
  coord_cartesian(expand = FALSE) +
  geom_text(
    data = observaciones, aes(label = paste("n =", n)),
    x = Inf, y = -Inf, hjust = 1, vjust = 0, size = 5, color = "black"
  )
# Note from the previous render: stat_density() removed 19303 rows whose
# Value was non-finite, so those rows do not contribute to the curves.

Filtro manual

# Manual rejection list, one entry per (size mode, catch) pair.
#   group   : size-mode label (level of dat$group)
#   catch   : N_Catch_Year the rule is restricted to; NULL = any catch
#   schools : Detect_school values to reject; NULL = every detection of
#             that group/catch
# This table replaces a chain of one filter() call per rejected detection
# (one of which was duplicated); the set of rejected rows is unchanged.
exclusiones_manuales <- list(
  list(group = "3.5", catch = "C074_Y1802-04", schools = NULL),
  list(
    group = "3.5", catch = "C078_Y1802-04",
    schools = c(
      "School 24 200 kHz @ 4.5m(dB)",
      "School 16 200 kHz @ 4.1m(dB)",
      "School 18 200 kHz @ 4.0m(dB)",
      "School 1 200 kHz @ 4.5m(dB)",
      "School 1 200 kHz @ 8.3m(dB)",
      "School 23 200 kHz @ 5.1m(dB)",
      "School 6 200 kHz @ 4.0m(dB)",
      "School 22 200 kHz @ 4.0m(dB)",
      "School 25 200 kHz @ 4.1m(dB)",
      "School 12 200 kHz @ 4.5m(dB)",
      "School 26 200 kHz @ 4.0m(dB)"
    )
  ),
  list(
    group = "4", catch = "C116_Y2202-04",
    schools = c(
      "Otros3 1 200 kHz @ 7.4m(dB)",
      "School 111 200 kHz @ 15.7m(dB)",
      "School 113 200 kHz @ 8.9m(dB)",
      "School 114 200 kHz @ 8.0m(dB)",
      "School 115 200 kHz @ 7.1m(dB)",
      "School 116 200 kHz @ 6.6m(dB)",
      "School 95 200 kHz @ 8.8m(dB)",
      "School 100 200 kHz @ 13.0m(dB)",
      "School 91 200 kHz @ 8.4m(dB)",
      "School 92 200 kHz @ 7.4m(dB)",
      "School 109 200 kHz @ 11.9m(dB)",
      "School 107 200 kHz @ 16.1m(dB)",
      "School 97 200 kHz @ 10.1m(dB)",
      "School 74 200 kHz @ 8.3m(dB)",
      "School 81 200 kHz @ 7.7m(dB)",
      "School 99 200 kHz @ 16.4m(dB)"
    )
  ),
  # Group 5 rules are not restricted to a catch.
  list(
    group = "5", catch = NULL,
    schools = c(
      "School 12 200 kHz @ 5.3m(dB)",
      "School 1 200 kHz @ 4.7m(dB)",
      "School 2 200 kHz @ 4.5m(dB)",
      "School 17 200 kHz @ 5.2m(dB)"
    )
  ),
  list(
    group = "7.5", catch = "C004_Y2202-04",
    schools = c(
      "School 17 38 kHz @ 4.8m(dB)",
      "School 13 38 kHz @ 5.2m(dB)",
      "School 11 38 kHz @ 7.3m(dB)",
      "School 29 38 kHz @ 9.8m(dB)",
      "School 12 38 kHz @ 7.3m(dB)",
      "School 32 38 kHz @ 9.9m(dB)",
      "School 5 38 kHz @ 7.2m(dB)",
      "School 31 38 kHz @ 10.5m(dB)",
      "School 2 38 kHz @ 8.7m(dB)",
      "School 7 38 kHz @ 10.4m(dB)"
    )
  ),
  list(
    group = "7.5", catch = "C075_Y2202-04",
    schools = c(
      "School 11 38 kHz @ 7.3m(dB)",
      "School 18 38 kHz @ 10.6m(dB)",
      "School 29 38 kHz @ 9.8m(dB)",
      "School 26 38 kHz @ 14.1m(dB)",
      "School 16 38 kHz @ 12.4m(dB)",
      "School 12 38 kHz @ 7.3m(dB)",
      "School 7 38 kHz @ 10.4m(dB)",
      "School 17 38 kHz @ 4.8m(dB)",
      "School 13 38 kHz @ 5.2m(dB)",
      "School 14 38 kHz @ 8.5m(dB)"
    )
  ),
  list(
    group = "10.5", catch = "C110_Y2202-04",
    schools = c(
      "User defined 74 200 kHz @ 14.8m(dB)",
      "User defined 79 200 kHz @ 15.4m(dB)",
      "User defined 92 200 kHz @ 12.6m(dB)",
      "User defined 93 200 kHz @ 12.4m(dB)",
      "User defined 94 200 kHz @ 12.8m(dB)",
      "School 62 200 kHz @ 7.0m(dB)",
      "School 14 200 kHz @ 5.7m(dB)",
      "School 6 200 kHz @ 6.9m(dB)",
      "School 7 200 kHz @ 9.2m(dB)",
      "School 36 200 kHz @ 12.9m(dB)",
      "School 52 200 kHz @ 9.5m(dB)",
      "School 53 200 kHz @ 8.5m(dB)",
      "School 54 200 kHz @ 7.3m(dB)",
      "School 55 200 kHz @ 7.7m(dB)",
      "School 28 200 kHz @ 7.5m(dB)",
      "School 31 200 kHz @ 8.9m(dB)",
      "School 47 200 kHz @ 15.9m(dB)"
    )
  ),
  list(
    group = "12", catch = "C104_Y2202-04",
    schools = c(
      "User defined 14 38 kHz @ 32.7m(dB)",
      "User defined 5 38 kHz @ 33.5m(dB)"
    )
  ),
  # The whole catch is rejected; this supersedes the earlier per-school rules
  # for "School 4 38 kHz @ 9.6m(dB)" and "School 6 38 kHz @ 8.6m(dB)".
  list(group = "12.5", catch = "C101_Y2202-04", schools = NULL),
  list(
    group = "12.5", catch = "C012_Y2109-11",
    schools = c(
      "School 1 38 kHz @ 6.8m(dB)",
      "School 17 38 kHz @ 5.3m(dB)",
      "School 20 38 kHz @ 6.5m(dB)",
      "School 6 38 kHz @ 8.3m(dB)"
    )
  ),
  list(group = "13.5", catch = "C012_Y1802-04", schools = NULL)
)

# Logical mask (TRUE / FALSE / NA) of the rows of `data` hit by one rule.
# Comparisons deliberately use `==` rather than `%in%` so that rows with a
# missing N_Catch_Year / Detect_school are treated exactly as they were by
# the chained filter(!(...)) calls (an NA condition drops the row).
coincide_con_exclusion <- function(data, regla) {
  coincide <- data$group == regla$group
  if (!is.null(regla$catch)) {
    coincide <- coincide & data$N_Catch_Year == regla$catch
  }
  if (is.null(regla$schools)) {
    return(coincide)
  }
  cardumen_listado <- Reduce(
    `|`,
    lapply(regla$schools, function(cardumen) data$Detect_school == cardumen)
  )
  coincide & cardumen_listado
}

# A row is rejected as soon as any rule matches it.
rechazado_manualmente <- Reduce(
  `|`,
  lapply(exclusiones_manuales, function(regla) coincide_con_exclusion(dat, regla))
)

# filter() keeps only rows whose condition is TRUE, so rejected rows and rows
# with an undetermined (NA) status are both removed, as before.
dat_filter_one_by_one <- dat %>%
  filter(!rechazado_manualmente)
# Density of Sv for the 3.5 cm mode after the manual filter, one curve per
# detected school and one facet per group / catch / band combination. Used
# to inspect individual schools interactively.
# `data =` is spelled out (it was reached through partial matching of `dat=`)
# and `linewidth` replaces the deprecated `size` aesthetic for lines.
plot_gg <- ggplot(
  data = dat_filter_one_by_one[dat_filter_one_by_one$group == "3.5", ]
) +
  geom_density(
    aes(x = Value, color = Detect_school),
    alpha = 1, linewidth = 0.75, show.legend = FALSE
  ) +
  theme_presentation(base_size = 12) +
  facet_wrap(
    facets = group ~ N_Catch_Year + Banda,
    scales = "free_y", ncol = 1, strip.position = "left"
  )
  

library(plotly)

Adjuntando el paquete: 'plotly'
The following object is masked from 'package:ggplot2':

    last_plot
The following object is masked from 'package:stats':

    filter
The following object is masked from 'package:graphics':

    layout
# Interactive version of plot_gg; the tooltip shows the school identifier.
ggplotly(plot_gg, tooltip = c("Detect_school"))

# Number of observations per size-mode group after the manual filter.
observaciones <- dat_filter_one_by_one %>%
  group_by(group) %>%
  summarise(n = n())

# Density of Sv per size mode (one facet per group, stacked in one column)
# with a label showing the number of observations of each group.
# `linewidth` replaces the `size` aesthetic for lines (deprecated since
# ggplot2 3.4.0); `size = 5` in geom_label() is the text size and is kept.
ggplot(dat_filter_one_by_one) +
  geom_density(aes(x = Value, fill = group), alpha = 1, linewidth = 0.75) +
  theme_presentation(base_size = 24) +
  facet_wrap(facets = "group", ncol = 1, strip.position = "right") +
  coord_cartesian(expand = FALSE) +
  geom_label(
    data = observaciones, aes(label = paste("n =", n)),
    x = Inf, y = -Inf, hjust = 2, vjust = -1, size = 5,
    fill = "white", color = "black"
  )

Limpieza de datos sobre Sv

Z-score Modificado

# Remove outliers within each group using the modified Z-score.
#
# For every group the score of an observation is
#   |x - median(x)| / (1.4826 * MAD(x))
# where MAD is the raw median absolute deviation (constant = 1).
#
# Arguments:
#   data       Data frame or tibble to filter.
#   group_var  Unquoted name of the grouping column.
#   value_var  Unquoted name of the numeric column to screen.
#   threshold  Largest score that is kept (default 3.5).
#
# Returns:
#   The rows of `data` whose score is <= threshold, with the original columns
#   and without grouping. Rows whose value is NA are dropped, because their
#   score is NA.
#
# Edge case: when a group's MAD is 0 (more than half of its values are
# identical) the score of a value equal to the median is defined as 0, so
# those rows are kept. Previously 0 / 0 produced NaN and the whole group was
# silently discarded. Values that differ from the median still get an
# infinite score and are removed.
filter_outliers_modified_zscore <- function(data, group_var, value_var, threshold = 3.5) {
  data %>%
    group_by({{ group_var }}) %>%
    mutate(
      median_value = median({{ value_var }}, na.rm = TRUE),
      mad_value = mad({{ value_var }}, constant = 1, na.rm = TRUE),
      abs_deviation = abs({{ value_var }} - median_value),
      modified_z_score = if_else(
        abs_deviation == 0,
        0,
        abs_deviation / (mad_value * 1.4826)
      )
    ) %>%
    filter(modified_z_score <= threshold) %>%
    # Drop the grouping so later verbs are not applied per group by accident.
    ungroup() %>%
    select(-median_value, -mad_value, -abs_deviation, -modified_z_score)
}

# Screen the manually filtered data for outliers in Value, separately for
# each size mode (group), with the modified Z-score filter.
dat_clean_modified_zscore <- filter_outliers_modified_zscore(
  dat_filter_one_by_one,
  group_var = group,
  value_var = Value,
  threshold = 3.5
)

Box plot

dat_clean_modified_zscore

library(ggplot2)

# Build the Sv box plot by anchoveta size mode.
#
# Arguments:
#   data       Data with the columns `group` (size mode) and `Value` (Sv).
#   base_size  Base font size passed to theme_presentation().
#
# Returns: a ggplot object; one box per size mode, filled with a fixed colour
# per mode, with dashed horizontal grid lines.
# `linewidth` replaces the `size` argument (deprecated for lines since
# ggplot2 3.4.0).
boxplot_por_moda <- function(data, base_size) {
  ggplot(data) +
    geom_boxplot(
      aes(x = group, y = Value, fill = group),
      alpha = 0.5, linewidth = 0.75, show.legend = FALSE
    ) +
    theme_presentation(base_size = base_size) +
    ylab("Sv (dB)") +
    # Alternatives tried earlier: scale_fill_brewer(palette = "RdYlBu"),
    # scale_fill_viridis_d(option = "C")
    scale_fill_manual(
      name = "Anchoveta",
      values = c("#5f5f5f", "#0000ff", "#000080", "#00bf00", "#ffff00",
                 "#ff8000", "#ff00bf", "#ff0000", "#a6533c")
    ) +
    scale_x_discrete(
      name = "Longitud (cm)",
      labels = c("3.5", "4", "5", "7.5", "10.5", "11", "12", "12.5", "13.5")
    ) +
    theme(
      legend.position = "top",
      panel.grid.major.y = element_line(color = "gray", linetype = "dashed")
    )
}

# All bands pooled.
Figura01 <- boxplot_por_moda(dat_clean_modified_zscore, base_size = 15)

# Same plot split into one panel per frequency band.
Figura02 <- boxplot_por_moda(dat_clean_modified_zscore, base_size = 12) +
  facet_wrap(~Banda)






# Write a figure as a 6 x 5 inch PNG at 1000 dpi into the Objetivo02 figure
# folder. Both box plots share every export setting except the file name.
guardar_boxplot <- function(figura, nombre_archivo) {
  ggsave(
    filename = nombre_archivo,
    plot = figura,
    device = "png",
    path = "F:/Tesis abordo/Tesis abordo/Figuras/Objetivo02/",
    width = 6,    # inches
    height = 5,   # inches
    dpi = 1000    # dots per inch
  )
}

guardar_boxplot(Figura01, "Boxplot_FM_global_modas.png")
guardar_boxplot(Figura02, "Boxplot_FM_modas.png")
# Density of Sv per size mode after outlier removal, one facet per group with
# a free y scale. `linewidth` replaces the `size` aesthetic for lines
# (deprecated since ggplot2 3.4.0).
ggplot(dat_clean_modified_zscore) +
  geom_density(aes(x = Value, fill = group), alpha = 0.5, linewidth = 0.75) +
  theme_presentation(base_size = 12) +
  facet_wrap(
    facets = "group", scales = "free_y", ncol = 1, strip.position = "right"
  ) +
  scale_fill_manual(
    name = "Anchoveta (LT, cm)",
    values = c("#5f5f5f", "#0000ff", "#000080", "#00bf00", "#ffff00",
               "#ff8000", "#ff00bf", "#ff0000", "#a6533c")
  )

#write.csv(dat_std_clean,"dat_clean_std_anchoveta.csv")

# Export the cleaned anchoveta Sv data to the working directory.
write.csv(dat_clean_modified_zscore, "dat_clean_modified_zscore_anchoveta.csv")