# ===================================================
# CÓDIGO FINAL SIN NINGÚN ERROR – CLASIFICACIÓN
# ===================================================

# Work from the project root so that the relative CSV path below (and the
# relative output paths used when saving figures) resolve against it.
setwd("/cloud/project")

# Semicolon-separated file with "." as the decimal mark. read.csv2() already
# defaults to header = TRUE and sep = ";", so only `dec` has to be overridden.
datos <- read.csv2("Sedimentos Marinos.csv", dec = ".")

# Install any missing packages (only needed on the first run).
# requireNamespace() merely checks availability without attaching anything;
# the library() calls below do the attaching and stop with an error if a
# package still cannot be loaded.
paquetes <- c("dplyr", "ggplot2", "forcats", "scales", "kableExtra")
for (paquete in paquetes) {
  if (!requireNamespace(paquete, quietly = TRUE)) {
    install.packages(paquete)
  }
}

# Attach the packages used by the rest of the script.
library(dplyr)
library(ggplot2)
library(forcats)
library(scales)
library(kableExtra)

# Frequency table of the sediment classification.
#   ni = absolute frequency, hi = relative frequency in percent (3 decimals).
# count() groups on the exact string, so labels that differ only in blanks
# would end up as separate rows that look identical once rendered.
# Whitespace is therefore normalised first: runs of whitespace collapse to a
# single space and leading/trailing blanks are dropped.
# NOTE(review): presumably whitespace is what splits the repeated labels in
# the raw data -- confirm against the CSV.
tabla <- datos %>%
  mutate(
    CLASSIFICATION = trimws(
      gsub("[[:space:]]+", " ", as.character(CLASSIFICATION))
    )
  ) %>%
  count(CLASSIFICATION, name = "ni") %>%
  mutate(
    hi = round(ni / sum(ni) * 100, 3),
    Clasificación = CLASSIFICATION
  ) %>%
  select(Clasificación, ni, hi) %>%
  arrange(desc(ni))

# Append the TOTAL row (sum of all absolute frequencies, 100 %).
tabla <- bind_rows(
  tabla,
  data.frame(
    Clasificación = "TOTAL",
    ni = sum(tabla$ni),
    hi = 100
  )
)

# Styled table: TOTAL row (last row) in green, the most frequent categories
# (up to six) in yellow.
fila_total <- nrow(tabla)
# Derive the highlighted rows from the table size so the yellow band never
# reaches the TOTAL row and never indexes past the end of a short table.
filas_top <- seq_len(max(0, min(6, fila_total - 1)))

tabla_html <- kable(
  tabla,
  col.names = c("Clasificación", "ni", "hi (%)"),
  digits = 3
) %>%
  kable_styling(
    full_width = FALSE,
    font_size = 15,
    bootstrap_options = c("striped", "hover")
  ) %>%
  row_spec(fila_total, bold = TRUE, background = "#2E7D32", color = "white")

if (length(filas_top) > 0) {
  tabla_html <- tabla_html %>%
    row_spec(filas_top, background = "#FFEB3B", color = "black")
}

# Top-level expression so the table is rendered exactly as the original
# pipeline was.
tabla_html
## Clasificación ni hi (%)
## SAND 3059 21.535
## SAND 2114 14.882
## GRAVELLY SEDIMENT 1499 10.553
## SAND 1150 8.096
## CLAYEY SILT 1074 7.561
## CLAYEY SILT 831 5.850
## SAND SILT CLAY 698 4.914
## SILTY SAND 601 4.231
## GRAVEL 596 4.196
## SILTY SAND 530 3.731
## SILTY CLAY 373 2.626
## SANDY SILT 325 2.288
## SANDY SILT 242 1.704
## SILTY CLAY 211 1.485
## BOULDERS 143 1.007
## SILTY SAND 142 1.000
## SILT 116 0.817
## SILT 114 0.803
## CLAY 106 0.746
## GRAVEL > 10% 66 0.465
## CLAY 40 0.282
## CLAYEY SAND 32 0.225
## CLAYEY SAND 30 0.211
## 28 0.197
## SAND 27 0.190
## BEDROCK 21 0.148
## SAND 14 0.099
## SANDY CLAY 8 0.056
## SANDY SILT 5 0.035
## MUD 4 0.028
## GRAVEL 3 0.021
## MUDDY SAND 1 0.007
## SANDY CLAY 1 0.007
## SILT 1 0.007
## TOTAL 14205 100.000
# Plot data: every category row, without the TOTAL summary row.
g <- tabla %>% filter(Clasificación != "TOTAL")

# 1. Horizontal bar chart (reads well with many categories).
# Each plot is stored in its own object so it can be saved explicitly below.
p_barras <- ggplot(g, aes(x = fct_reorder(Clasificación, ni), y = ni)) +
  geom_col(fill = "#1976D2", width = 0.75) +
  geom_text(
    aes(label = paste(ni, "(", hi, "%)")),
    hjust = -0.1, size = 4.5, fontface = "bold"
  ) +
  coord_flip() +
  labs(
    title = "Distribución de la Clasificación de Sedimentos Marinos",
    subtitle = paste("Total de muestras =", sum(g$ni)),
    x = "Clasificación", y = "Frecuencia absoluta (ni)"
  ) +
  theme_minimal(base_size = 14) +
  theme(
    plot.title = element_text(face = "bold", hjust = 0.5),
    plot.subtitle = element_text(hjust = 0.5)
  )
print(p_barras)

# 2. Pie chart: the 8 most frequent categories plus one pooled "Otros" slice.
# Rows are split by position in the sorted table so that every category lands
# in exactly one group. slice_max()/slice_min() keep ties by default, so a tie
# at the cut-off would place the same rows in both groups and count them twice.
g_ordenado <- g %>% arrange(desc(ni))
top8 <- g_ordenado %>% slice_head(n = 8)
resto <- g_ordenado %>% filter(row_number() > 8)
otros <- resto %>%
  summarise(
    ni = sum(ni),
    hi = round(sum(hi), 3),
    Clasificación = "Otros"
  )
# Only add the pooled slice when there is something left to pool.
tarta <- bind_rows(top8, if (nrow(resto) > 0) otros)

p_tarta <- ggplot(tarta, aes(x = "", y = ni, fill = Clasificación)) +
  # `linewidth` (not `size`) sets the width of the white slice borders in
  # ggplot2 >= 3.4.0; `size` was being ignored with a warning.
  geom_col(width = 0.5, color = "white", linewidth = 1.2) +
  coord_polar("y") +
  geom_text(
    aes(label = paste0(hi, "%")),
    position = position_stack(vjust = 0.5),
    color = "white", size = 6, fontface = "bold"
  ) +
  labs(title = "Proporción de Clasificación (Top 8 + Otros)") +
  theme_void() +
  scale_fill_brewer(palette = "Set3") +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold", size = 16),
    legend.position = "right"
  )
print(p_tarta)

# 3. Pareto chart: bars for ni, line for the cumulative percentage.
# The cumulative percentage is computed from the counts rather than by adding
# up the already-rounded `hi` values, so rounding error does not accumulate.
pareto <- g %>%
  arrange(desc(ni)) %>%
  mutate(Hi_acum = round(cumsum(ni) / sum(ni) * 100, 3))

# Factor that maps 0-100 % onto the primary (count) axis: 100 % is drawn at
# the height of the tallest bar. The secondary axis applies the inverse.
escala <- max(pareto$ni) / 100

p_pareto <- ggplot(pareto, aes(x = reorder(Clasificación, -ni))) +
  geom_col(aes(y = ni), fill = "#8E24AA") +
  geom_line(
    aes(y = Hi_acum * escala, group = 1),
    color = "#E91E63", linewidth = 2
  ) +
  geom_point(aes(y = Hi_acum * escala), color = "#E91E63", size = 4) +
  geom_text(
    aes(y = Hi_acum * escala, label = paste0(Hi_acum, "%")),
    vjust = -1, color = "#E91E63", fontface = "bold"
  ) +
  scale_y_continuous(
    name = "Frecuencia absoluta",
    sec.axis = sec_axis(~ . / escala, name = "Porcentaje acumulado (%)")
  ) +
  labs(title = "Diagrama de Pareto – Clasificación", x = "") +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    axis.title.y.right = element_text(color = "#E91E63"),
    axis.text.y.right = element_text(color = "#E91E63")
  )
print(p_pareto)

# Save the figures to disk (optional).
# `plot` is passed explicitly: without it ggsave() writes the most recent plot,
# so all three files would contain the Pareto chart.
ggsave("01_barras_clasificacion.png", plot = p_barras,
       width = 14, height = 9, dpi = 300)
ggsave("02_tarta_clasificacion.png", plot = p_tarta,
       width = 10, height = 10, dpi = 300)
ggsave("03_pareto_clasificacion.png", plot = p_pareto,
       width = 14, height = 8, dpi = 300)