# =====================================================
# ANÁLISIS ESTADÍSTICO DE ARCILLA - VERSIÓN LIMPIA
# Grupo 3
# =====================================================

# 1. Cargar datos
datos <- read.csv("ESTADISTICA/dataset_geologico_limpio_80.csv",
                  header = TRUE, sep = ",", dec = ".", stringsAsFactors = FALSE)

cat("✅ Columnas cargadas:", ncol(datos), "\n")   # debe decir 58
## ✅ Columnas cargadas: 58
# 2. Variable Arcilla (columna correcta según tu archivo)
arcilla_raw <- as.numeric(gsub("[^0-9.-]", "", datos$CLAY_PCT))

arcilla <- na.omit(arcilla_raw)
arcilla <- arcilla[arcilla >= 0 & arcilla <= 100]

cat("Valores válidos de arcilla:", length(arcilla), "\n\n")
## Valores válidos de arcilla: 27240
# 3. Cálculos básicos
n <- length(arcilla)
minimo <- min(arcilla)
maximo <- max(arcilla)
R <- maximo - minimo
k <- max(1, floor(1 + 3.3 * log10(n)))
A <- R / k

Li <- round(seq(minimo, maximo - A + 1e-6, by = A), 2)
Ls <- round(Li + A, 2)
Ls[length(Ls)] <- maximo
MC <- round((Li + Ls)/2, 2)

# Frecuencias
ni <- numeric(length(Li))
for(i in 1:length(Li)){
  if(i == length(Li)){
    ni[i] <- sum(arcilla >= Li[i] & arcilla <= Ls[i])
  } else {
    ni[i] <- sum(arcilla >= Li[i] & arcilla < Ls[i])
  }
}

hi <- round(ni / sum(ni) * 100, 3)

Niasc <- cumsum(ni)
Nidsc <- rev(cumsum(rev(ni)))
Hiasc <- round(cumsum(hi), 2)
Hidsc <- round(rev(cumsum(rev(hi))), 2)

# 4. Tabla de Distribución (bonita)
library(gt)
library(dplyr)
## 
## Adjuntando el paquete: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
TDFarcilla <- data.frame(Li, Ls, MC, ni, hi, Niasc, Nidsc, Hiasc, Hidsc)

fila_total <- data.frame(Li = "TOTAL", Ls = "", MC = "", 
                         ni = sum(ni), hi = round(sum(hi),2),
                         Niasc = "", Nidsc = "", Hiasc = "", Hidsc = "")

TDFarcilla_p <- rbind(TDFarcilla, fila_total)

tabla_arcilla_p <- TDFarcilla_p %>%
  gt() %>%
  tab_header(title = md("**Tabla Nº 1**"),
             subtitle = md("Tabla de distribución de la arcilla de los sedimentos marinos")) %>%
  tab_source_note(md("Autor: Grupo 3")) %>%
  tab_options(
    table.border.top.color = "black",
    table.border.bottom.color = "black",
    column_labels.border.bottom.color = "black",
    column_labels.border.bottom.width = px(2),
    row.striping.include_table_body = TRUE,
    table_body.hlines.color = "gray"
  )

tabla_arcilla_p
Tabla Nº 1
Tabla de distribución de la arcilla de los sedimentos marinos
Li Ls MC ni hi Niasc Nidsc Hiasc Hidsc
0 6.31 3.15 14156 51.981 14156 27233 51.98 100
6.31 12.62 9.46 2706 9.936 16862 13077 61.92 48.02
12.63 18.94 15.79 2536 9.312 19398 10371 71.23 38.08
18.94 25.25 22.09 1725 6.334 21123 7835 77.56 28.77
25.26 31.57 28.41 1437 5.277 22560 6110 82.84 22.44
31.57 37.88 34.73 1325 4.865 23885 4673 87.7 17.16
37.88 44.19 41.03 1037 3.808 24922 3348 91.51 12.29
44.2 50.51 47.36 680 2.497 25602 2311 94.01 8.49
50.51 56.82 53.66 457 1.678 26059 1631 95.69 5.99
56.83 63.14 59.98 371 1.362 26430 1174 97.05 4.31
63.14 69.45 66.3 298 1.094 26728 803 98.14 2.95
69.45 75.76 72.61 250 0.918 26978 505 99.06 1.85
75.77 82.08 78.92 162 0.595 27140 255 99.66 0.94
82.08 88.39 85.24 74 0.272 27214 93 99.93 0.34
88.4 94.71 91.56 19 0.070 27233 19 100 0.07
TOTAL 27233 100.000
Autor: Grupo 3
# 5. Gráficos
library(moments)
colores <- gray.colors(length(ni), start = 0.3, end = 0.9)

hist(arcilla, breaks = k, col = colores,
     main = "Gráfica Nº2: Distribución de la arcilla",
     xlab = "Arcilla (%)", ylab = "Frecuencia")

boxplot(arcilla, horizontal = TRUE, col = "lightblue",
        main = "Gráfica Nº6: Boxplot de la arcilla", xlab = "Arcilla (%)")

# 6. Indicadores estadísticos
x <- mean(arcilla)
Me <- median(arcilla)
sd_val <- sd(arcilla)
CV <- round((sd_val / x) * 100, 2)
As <- round(skewness(arcilla), 2)
K  <- round(kurtosis(arcilla), 2)

TablaIndicadores <- data.frame(
  Variable = "Arcilla (%)",
  Mínimo = round(minimo,2), 
  Máximo = round(maximo,2), 
  Media = round(x,2),
  Mediana = round(Me,2), 
  `Desv. Est.` = round(sd_val,2), 
  `CV (%)` = CV,
  Asimetría = As, 
  Curtosis = K
)

library(knitr)
library(kableExtra)
## 
## Adjuntando el paquete: 'kableExtra'
## The following object is masked from 'package:dplyr':
## 
##     group_rows
kable(TablaIndicadores, format = "markdown", align = "c",
      caption = "Tabla N°3: Indicadores estadísticos de la variable arcilla") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"))
Tabla N°3: Indicadores estadísticos de la variable arcilla
Variable Mínimo Máximo Media Mediana Desv..Est. CV…. Asimetría Curtosis
Arcilla (%) 0 94.71 14.21 5.37 18.39 129.46 1.55 4.89
# 7. Outliers
outliers <- boxplot.stats(arcilla)$out
num_outliers <- length(outliers)
min_out <- ifelse(num_outliers > 0, round(min(outliers),2), NA)
max_out <- ifelse(num_outliers > 0, round(max(outliers),2), NA)

TablaOutliers <- data.frame(
  "Cantidad de outliers" = num_outliers,
  "Mínimo" = min_out,
  "Máximo" = max_out
)

kable(TablaOutliers, format = "markdown", align = "c",
      caption = "Tabla N°4: Outliers de la variable arcilla") %>%
  kable_styling(bootstrap_options = c("striped", "hover"))
Tabla N°4: Outliers de la variable arcilla
Cantidad.de.outliers Mínimo Máximo
1275 55.43 94.71