Objetivo 1: Datos y limpieza

Author

Luis La Cruz & German Chacón

Published

September 2, 2024

library(readxl)
library(tidyverse)

── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.1     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.1
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(egg)

Cargando paquete requerido: gridExtra

Adjuntando el paquete: 'gridExtra'

The following object is masked from 'package:dplyr':

    combine

library(tidyverse)
library(ggplot2)
library(ggpmisc)

Cargando paquete requerido: ggpp
Registered S3 methods overwritten by 'ggpp':
  method                  from   
  heightDetails.titleGrob ggplot2
  widthDetails.titleGrob  ggplot2

Adjuntando el paquete: 'ggpp'

The following object is masked from 'package:ggplot2':

    annotate

library(broom)
library(ggplot2)
library(patchwork)
library(egg)
library(ggpubr)


Adjuntando el paquete: 'ggpubr'

The following objects are masked from 'package:ggpp':

    as_npc, as_npcx, as_npcy

The following object is masked from 'package:egg':

    ggarrange

library(readxl)
library(tidyverse)
library(egg)
library(tidyverse)
library(dplyr)

# Load the complete data set from the "Total_data" sheet of the workbook.
datos_sv <- read_excel("Datos_FM_Tesis_moda.xlsx", sheet = "Total_data")

# Reshape to long format: columns 14 to 285 are stacked into one
# Frequency (former column name) / Value pair per row.
datos_sv_longer <- datos_sv %>%
  pivot_longer(
    cols = 14:285,
    names_to = "Frequency",
    values_to = "Value"
  )

# Band edges shared by Banda_name and Banda so that both labellings always
# describe exactly the same intervals.
band_breaks <- c(0, 18.1, 45, 90, 170, 270)

# Bin school depth and frequency into left-closed intervals [a, b).
# FALSE is spelled out because the shorthand F is an ordinary, reassignable
# variable.
datos_sv_longer <- datos_sv_longer %>%
  mutate(
    Range = cut(
      as.numeric(Depth_school),
      breaks = c(0, 5, 10, 15, 20, 25, 50, 100, 150, 200, 250, 500),
      include.lowest = FALSE,
      right = FALSE,
      labels = c(
        "0-5", "5-10", "10-15", "15-20", "20-25", "25-50",
        "50-100", "100-150", "150-200", "200-250", "250-500"
      )
    ),
    Banda_name = cut(
      as.numeric(Frequency),
      breaks = band_breaks,
      include.lowest = FALSE,
      right = FALSE,
      labels = c("b0", "b1", "b2", "b3", "b4")
    ),
    Banda = cut(
      as.numeric(Frequency),
      breaks = band_breaks,
      include.lowest = FALSE,
      right = FALSE,
      labels = c("18", "35-45", "45-90", "90-170", "170-260")
    )
  )

# Keep only the rows whose Data field equals "Sv". filter() drops rows where
# Data is NA, whereas logical `[` indexing would turn them into all-NA rows.
Data_Sv <- datos_sv_longer %>%
  dplyr::filter(Data == "Sv")

Datos acústicos por especies

# Restrict the Sv records to the classes analysed in this section.
Data_Sv <- dplyr::filter(
  Data_Sv,
  Class %in% c("Múnida", "Vinciguerria", "Plancton", "Salpas", "Otros")
)

#Data_Std=datos_sv_longer[datos_sv_longer$Data=="Std",]
#Data_Std <- Data_Std %>%
#filter(Class %in% c("Anchoveta", "Múnida","Vinciguerria","Plancton","Salpas","Otros"))


# Working alias used by the steps that follow.
dat <- Data_Sv

Plot de densidad “sin filtrar”

# Density of the Sv values for each class before any cleaning, one panel per
# class stacked in a single column.
# `linewidth` is used for the outline because ggplot2 deprecated `size` for
# line widths in version 3.4.0.
ggplot(dat) +
  geom_density(aes(x = Value, fill = Class), alpha = 0.5, linewidth = 0.75) +
  theme_presentation(base_size = 12) +
  scale_fill_viridis_d() +
  xlab("Sv (dB)") +
  ylab("Densidad") +
  facet_wrap(
    facets = "Class",
    scales = "free_y",
    ncol = 1,
    strip.position = "right"
  )

# Working copy of the unfiltered data; it is reassigned further down when the
# per-catch exclusions are applied.
dat_filter_one_by_one=dat

# List the column names of the data.
names(dat)

 [1] "ID_esp3"       "Detect_school" "Year"          "Survey"       
 [5] "Event"         "N_Catch"       "Data"          "N_Catch_Year" 
 [9] "Class_n"       "Class"         "Sub_class"     "Subclass_n"   
[13] "18"            "Mean_b0"       "b1"            "b2"           
[17] "b3"            "b4"            "b1_b2"         "b1_b3"        
[21] "b1_b4"         "b2_b3"         "b2_b4"         "b3_b4"        
[25] "b3b4"          "b2b3"          "b1b2"          "b1b2b3b4"     
[29] "Pendiente_b1"  "Intercepto_b1" "Pendiente_b2"  "Intercepto_b2"
[33] "Pendiente_b3"  "Intercepto_b3" "Pendiente_b4"  "Intercepto_b4"
[37] "Depth_school"  "Pendiente_b"   "Intercepto_b"  "Clase_modal"  
[41] "Frequency"     "Value"         "Range"         "Banda_name"   
[45] "Banda"

# Distinct values of Class present in dat.
unique(dat$Class)

[1] "Plancton"     "Vinciguerria" "Salpas"       "Múnida"

# Distinct values of Banda present in dat (unused factor levels are still listed).
unique(dat$Banda)

[1] 35-45   45-90   90-170  170-260
Levels: 18 35-45 45-90 90-170 170-260

# Rows of the Vinciguerria class only.
# NOTE(review): this object is named `filter`, the same as the dplyr verb;
# calls to filter() still resolve to the function, but the name is easy to misread.
filter <- dplyr::filter(dat, Class == "Vinciguerria")

# Catch/year identifiers that contain Vinciguerria records.
unique(filter[["N_Catch_Year"]])

[1] "C019_Y2109-11" "C028_Y2109-11" "N002_Y2202-04" "N003_Y2202-04"
[5] "N004_Y2202-04"

# Remove individual catches (and one specific detected school) by hand.
# Each condition passed to filter() must hold for a row to be kept, which is
# equivalent to applying the exclusions one after another.
dat_filter_one_by_one <- dplyr::filter(
  dat,
  !(Class == "Múnida" & N_Catch_Year == "N002_Y2011-12"),
  !(Class == "Múnida" & N_Catch_Year == "C112_Y2202-04" &
    Detect_school == "Munida 23 200 kHz @ 21.6m(dB)"),
  !(Class == "Plancton" & N_Catch_Year == "N004_Y2202-04"),
  !(Class == "Plancton" & N_Catch_Year == "N001_Y2205-05"),
  !(Class == "Plancton" & N_Catch_Year == "C019_Y1802-04")
)

  #filter(!(Class == "Vinciguerria" & N_Catch_Year == "C028_Y2109-11"))

# Sv density for each detected school of Vinciguerria, one panel per
# catch/year. The plot is stored in plot_gg rather than printed.
# The `data` argument is spelled out in full (no partial matching), `linewidth`
# replaces the deprecated `size`, and FALSE replaces the reassignable F.
plot_gg <- ggplot(
  data = dat_filter_one_by_one[dat_filter_one_by_one$Class == "Vinciguerria", ]
) +
  geom_density(
    aes(x = Value, color = Detect_school),
    alpha = 1,
    linewidth = 0.75,
    show.legend = FALSE
  ) +
  theme_presentation(base_size = 12) +
  facet_wrap(
    facets = "N_Catch_Year",
    scales = "free_y",
    ncol = 3,
    strip.position = "left"
  )
  

library(plotly)


Adjuntando el paquete: 'plotly'

The following object is masked from 'package:ggplot2':

    last_plot

The following object is masked from 'package:stats':

    filter

The following object is masked from 'package:graphics':

    layout

#ggplotly(plot_gg, tooltip = c("Detect_school"))



# Sv value against frequency for the Vinciguerria records, one panel per
# catch/year. The `data` argument is named in full instead of relying on
# partial argument matching of `dat`.
ggplot(
  data = dat_filter_one_by_one[dat_filter_one_by_one$Class == "Vinciguerria", ]
) +
  geom_line(aes(x = as.numeric(Frequency), y = Value, color = N_Catch_Year)) +
  facet_wrap(
    facets = "N_Catch_Year",
    scales = "free_y",
    ncol = 3,
    strip.position = "left"
  )

Limpieza de datos aplicada a los valores acústicos (Sv)

Z-score Modificado

Definir una función para filtrar valores atípicos por grupo usando el Z-Score modificado

# Remove outliers within each group using the modified Z-score.
#
# For every group the score of a value x is
#   |x - median| / (1.4826 * MAD)
# where MAD is the raw median absolute deviation (constant = 1).
#
# Arguments:
#   data       Data frame or tibble.
#   group_var  Unquoted name of the grouping column.
#   value_var  Unquoted name of the numeric column to screen.
#   threshold  Largest score that is still kept (default 3.5).
#
# Returns:
#   The rows of `data` whose score is <= threshold, with the original columns
#   and without grouping. Rows whose value is NA are dropped because their
#   score is NA.
filter_outliers_modified_zscore <- function(data, group_var, value_var, threshold = 3.5) {
  grouped <- dplyr::group_by(data, {{ group_var }})

  scored <- dplyr::mutate(
    grouped,
    median_value = median({{ value_var }}, na.rm = TRUE),
    mad_value = mad({{ value_var }}, constant = 1, na.rm = TRUE),
    # A value equal to the group median scores exactly 0. Computing it through
    # the ratio would give 0 / 0 = NaN whenever the group's MAD is 0, and the
    # filter below would then discard the whole group.
    modified_z_score = ifelse(
      {{ value_var }} == median_value,
      0,
      abs({{ value_var }} - median_value) / (mad_value * 1.4826)
    )
  )

  kept <- dplyr::filter(scored, modified_z_score <= threshold)

  # Drop the grouping so that later verbs are not silently applied per group.
  kept <- dplyr::ungroup(kept)

  dplyr::select(kept, -median_value, -mad_value, -modified_z_score)
}

# Apply the modified Z-score outlier filter to Value, separately for each Class.

# Continue from the manually filtered data.
dat <- dat_filter_one_by_one

dat_clean_modified_zscore <- filter_outliers_modified_zscore(
  dat,
  group_var = Class,
  value_var = Value,
  threshold = 3.5
)

# Density of the Sv values for each class after outlier removal, one panel per
# class stacked in a single column.
# `linewidth` is used for the outline because ggplot2 deprecated `size` for
# line widths in version 3.4.0.
ggplot(dat_clean_modified_zscore) +
  geom_density(aes(x = Value, fill = Class), alpha = 0.5, linewidth = 0.75) +
  theme_presentation(base_size = 12) +
  scale_fill_viridis_d(name = "Especie") +
  xlab("Sv (dB)") +
  ylab("Densidad") +
  facet_wrap(
    facets = "Class",
    scales = "free_y",
    ncol = 1,
    strip.position = "right"
  )

Box plot utilizando los datos limpios:

library(ggplot2)

# Box plot of the cleaned Sv values for each class.
# `linewidth` replaces `size` (deprecated for line widths since ggplot2 3.4.0)
# and FALSE replaces the reassignable shorthand F.
ggplot(dat_clean_modified_zscore) +
  geom_boxplot(
    aes(x = Class, y = Value, fill = Class),
    alpha = 0.5,
    linewidth = 0.75,
    show.legend = FALSE
  ) +
  theme_presentation(base_size = 15) +
  ylab("Sv (dB)") +
  scale_fill_viridis_d() +
  theme(legend.position = "top") +
  theme(panel.grid.major.y = element_line(color = "gray", linetype = "dashed"))

# Box plot of the cleaned Sv values for each class, one panel per band.
# `linewidth` replaces `size` (deprecated for line widths since ggplot2 3.4.0)
# and FALSE replaces the reassignable shorthand F.
ggplot(dat_clean_modified_zscore) +
  geom_boxplot(
    aes(x = Class, y = Value, fill = Class),
    alpha = 0.5,
    linewidth = 0.75,
    show.legend = FALSE
  ) +
  theme_presentation(base_size = 12) +
  ylab("Sv (dB)") +
  scale_fill_viridis_d() +
  theme(legend.position = "top") +
  theme(panel.grid.major.y = element_line(color = "gray", linetype = "dashed")) +
  facet_wrap(~Banda)

# Box plot of the cleaned Sv values for each band, one panel per class.
# `linewidth` replaces `size` (deprecated for line widths since ggplot2 3.4.0)
# and FALSE replaces the reassignable shorthand F.
ggplot(dat_clean_modified_zscore) +
  geom_boxplot(
    aes(x = Banda, y = Value, fill = Class),
    alpha = 0.5,
    linewidth = 0.75,
    show.legend = FALSE
  ) +
  theme_presentation(base_size = 12) +
  ylab("Sv (dB)") +
  xlab("Frecuencia (kHz)") +
  scale_fill_viridis_d() +
  theme(legend.position = "top") +
  theme(panel.grid.major.y = element_line(color = "gray", linetype = "dashed")) +
  facet_wrap(~Class)

# Reduced table for the cleaned classes: Frequency becomes numeric, `group`
# is a copy of Class, and only the columns needed later are kept.
sp <- dat_clean_modified_zscore %>%
  mutate(Frequency = as.numeric(Frequency), group = Class) %>%
  select(
    Frequency, Value, Banda, Depth_school,
    Class, Detect_school, N_Catch_Year, group
  )

# Column names of the full cleaned table, for reference.
names(dat_clean_modified_zscore)

 [1] "ID_esp3"       "Detect_school" "Year"          "Survey"       
 [5] "Event"         "N_Catch"       "Data"          "N_Catch_Year" 
 [9] "Class_n"       "Class"         "Sub_class"     "Subclass_n"   
[13] "18"            "Mean_b0"       "b1"            "b2"           
[17] "b3"            "b4"            "b1_b2"         "b1_b3"        
[21] "b1_b4"         "b2_b3"         "b2_b4"         "b3_b4"        
[25] "b3b4"          "b2b3"          "b1b2"          "b1b2b3b4"     
[29] "Pendiente_b1"  "Intercepto_b1" "Pendiente_b2"  "Intercepto_b2"
[33] "Pendiente_b3"  "Intercepto_b3" "Pendiente_b4"  "Intercepto_b4"
[37] "Depth_school"  "Pendiente_b"   "Intercepto_b"  "Clase_modal"  
[41] "Frequency"     "Value"         "Range"         "Banda_name"   
[45] "Banda"

###

# Read the anchoveta table from its CSV file.
dat_clean_anchoveta <- readr::read_csv(
  file = "dat_clean_modified_zscore_anchoveta.csv"
)

New names:
Rows: 116908 Columns: 48
── Column specification
──────────────────────────────────────────────────────── Delimiter: "," chr
(13): Detect_school, Survey, Event, Data, N_Catch_Year, Class_n, Class, ... dbl
(34): ...1, ID_esp3, Year, N_Catch, 18, b1, b2, b3, b4, b1_b2, b1_b3, b1... lgl
(1): Mean_b0
ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
Specify the column types or set `show_col_types = FALSE` to quiet this message.
• `` -> `...1`

# Reduced anchoveta table: `group` is stored as a factor and only the columns
# that sp also carries are kept.
anc <- dat_clean_anchoveta %>%
  mutate(group = as.factor(group)) %>%
  select(
    Frequency, Value, Banda, Depth_school,
    Class, Detect_school, group, N_Catch_Year
  )

# Confirm the resulting column set.
names(anc)

[1] "Frequency"     "Value"         "Banda"         "Depth_school" 
[5] "Class"         "Detect_school" "group"         "N_Catch_Year"

# Stack the rows of sp on top of the anchoveta rows (anc); both tables carry
# the same eight column names.
# NOTE(review): `group` is copied from Class in sp but converted to a factor in
# anc, so rbind() has to reconcile two column types — confirm the result is intended.
dat_clean_modified_zscore_plus=rbind(sp,anc)

# Inspect the combined set of group labels.
unique(dat_clean_modified_zscore_plus$group)

 [1] "Plancton"     "Vinciguerria" "Salpas"       "Múnida"       "13.5"        
 [6] "7.5"          "12.5"         "11"           "12"           "3.5"         
[11] "10.5"         "4"            "5"

# Save the combined table to CSV.
# NOTE(review): write.csv() writes row names by default, which adds an unnamed
# first column to the file — consider row.names = FALSE if no reader needs it.
write.csv(dat_clean_modified_zscore_plus,"dat_clean_modified_zscore_especies.csv")

# Sv value against frequency for the Vinciguerria records of the combined
# table, one panel per catch/year.
# The `+` before facet_wrap() is required: without it the facet specification
# is evaluated as a separate expression and never applied to the plot.
# The `data` argument is also named in full instead of relying on partial
# argument matching of `dat`.
ggplot(
  data = dat_clean_modified_zscore_plus[
    dat_clean_modified_zscore_plus$Class == "Vinciguerria",
  ]
) +
  geom_line(aes(x = as.numeric(Frequency), y = Value, color = N_Catch_Year)) +
  facet_wrap(
    facets = "N_Catch_Year",
    scales = "free_y",
    ncol = 3,
    strip.position = "left"
  )

<ggproto object: Class FacetWrap, Facet, gg>
    compute_layout: function
    draw_back: function
    draw_front: function
    draw_labels: function
    draw_panels: function
    finish_data: function
    init_scales: function
    map_data: function
    params: list
    setup_data: function
    setup_params: function
    shrink: TRUE
    train_scales: function
    vars: function
    super:  <ggproto object: Class FacetWrap, Facet, gg>