## Load data

library(gt)
library(dplyr)

# First read: check.names = FALSE leaves the CSV header names untouched
# (header, sep and dec are simply the read.csv defaults).
database <- read.csv("database-_1_.csv", check.names = FALSE)
# NOTE(review): with unmangled header names this column only exists if the CSV
# header is literally "Lost.Commodity.Costs"; otherwise `raw_dates` is NULL.
# The name also suggests dates although a cost column is read -- confirm.
raw_dates <- database$Lost.Commodity.Costs

# Second read: default name checking, so the column is reachable as
# `Lost.Commodity.Costs`. This is the data frame used by the rest of the script.
datos <- read.csv("database-_1_.csv")

# Variable under study, with missing values removed.
variable_interes <- datos$Lost.Commodity.Costs
costos <- na.omit(variable_interes)
# ---- Frequency distribution table for `costos` (Sturges' rule) ----
# Input : `costos`, the NA-free cost vector defined earlier in the script.
# Output: k, min_val, max_val, R_val, A, the class limits/marks (Li_num,
#         Ls_num, MC_num), the frequency vectors and the data frame TDFCostos.
stopifnot(length(costos) > 0)

# Number of classes: Sturges' rule truncated to a whole number.
k <- floor(1 + 3.322 * log10(length(costos)))

min_val <- min(costos)
max_val <- max(costos)
R_val <- max_val - min_val # range of the data
A <- R_val / k             # class width

# Exactly k lower limits, generated arithmetically. This replaces a seq() with
# a floating-point end point plus corrective appends, which could add a
# spurious extra class when rounding made the last upper limit fall just
# below max_val.
Li_num <- min_val + A * (seq_len(k) - 1)
Ls_num <- Li_num + A
MC_num <- (Li_num + Ls_num) / 2

# Absolute frequencies. findInterval() assigns each cost to the class j with
# Li_num[j] <= cost < Li_num[j + 1]; everything at or above the last lower
# limit (max_val included) lands in class k. Each observation is therefore
# counted exactly once, with no gaps or overlaps between neighbouring classes
# and no arbitrary upper bound for the last class.
ni <- as.numeric(tabulate(findInterval(costos, Li_num), nbins = k))

# Relative (%) and cumulative frequencies, ascending and descending.
hi <- ni / sum(ni) * 100
Niasc <- cumsum(ni)
Nidsc <- rev(cumsum(rev(ni)))
Hiasc <- round(cumsum(hi), 2)
Hidsc <- round(rev(cumsum(rev(hi))), 2)

TDFCostos <- data.frame(
  Li_num = Li_num,
  Ls_num = Ls_num,
  MC_num = MC_num,
  ni = ni,
  hi = hi,
  Niasc = Niasc,
  Nidsc = Nidsc,
  Hiasc = Hiasc,
  Hidsc = Hidsc
)

# Render TDFCostos as a gt table: header, human-readable column labels,
# two-decimal class limits/marks and percentage columns with a "%" suffix.
tabla1_sturges <- gt(TDFCostos)

tabla1_sturges <- tab_header(
  tabla1_sturges,
  title = md("*Tabla 1: Distribución de Frecuencias*"),
  subtitle = md("**Variable: Lost Commodity Costs**")
)

tabla1_sturges <- cols_label(
  tabla1_sturges,
  Li_num = "Desde ($)",
  Ls_num = "Hasta ($)",
  MC_num = "Marca Clase",
  ni = "Frec. Abs.",
  hi = "Frec. Rel. %",
  Niasc = "Ni Asc.",
  Nidsc = "Ni Desc.",
  Hiasc = "Hi Asc. %",
  Hidsc = "Hi Desc. %"
)

# Columns are selected by name so they cannot be confused with the global
# vectors that share the same identifiers.
tabla1_sturges <- fmt_number(
  tabla1_sturges,
  columns = c("Li_num", "Ls_num", "MC_num"),
  decimals = 2
)
tabla1_sturges <- fmt_number(
  tabla1_sturges,
  columns = c("hi", "Hiasc", "Hidsc"),
  decimals = 2,
  pattern = "{x}%"
)

tabla1_sturges
# Tabla 1: Distribución de Frecuencias
# Variable: Lost Commodity Costs
# Desde ($) Hasta ($) Marca Clase Frec. Abs. Frec. Rel. % Ni Asc. Ni Desc. Hi Asc. % Hi Desc. %
# 0.00 118,153.25 59,076.62 2737 98.07% 2737 2791 98.07% 100.00%
# 118,153.25 236,306.50 177,229.88 30 1.07% 2767 54 99.14% 1.93%
# 236,306.50 354,459.75 295,383.12 10 0.36% 2777 24 99.50% 0.86%
# 354,459.75 472,613.00 413,536.38 1 0.04% 2778 14 99.53% 0.50%
# 472,613.00 590,766.25 531,689.62 3 0.11% 2781 13 99.64% 0.47%
# 590,766.25 708,919.50 649,842.88 3 0.11% 2784 10 99.75% 0.36%
# 708,919.50 827,072.75 767,996.12 2 0.07% 2786 7 99.82% 0.25%
# 827,072.75 945,226.00 886,149.38 0 0.00% 2786 5 99.82% 0.18%
# 945,226.00 1,063,379.25 1,004,302.62 3 0.11% 2789 5 99.93% 0.18%
# 1,063,379.25 1,181,532.50 1,122,455.88 1 0.04% 2790 2 99.96% 0.07%
# 1,181,532.50 1,299,685.75 1,240,609.12 0 0.00% 2790 1 99.96% 0.04%
# 1,299,685.75 1,417,839.00 1,358,762.38 1 0.04% 2791 1 100.00% 0.04%
# Histogram of counts (ni) for the costs that do not exceed one class width.
library(ggplot2)
library(dplyr)
library(scales)

# Class width recomputed from the raw column. Note that `k` is left unfloored
# here; it is only floored at the point where the width is derived.
costos <- na.omit(datos$Lost.Commodity.Costs)
k <- 1 + 3.322 * log10(length(costos))
R <- diff(range(costos))
A <- R / floor(k)

# Keep only non-missing costs up to the width of a single class.
limit_zoom <- A
datos_zoom <- filter(
  datos,
  !is.na(Lost.Commodity.Costs),
  Lost.Commodity.Costs <= limit_zoom
)

p_zoom <- ggplot(data = datos_zoom, mapping = aes(x = Lost.Commodity.Costs)) +
  geom_histogram(
    bins = 30,
    fill = "steelblue",
    color = "white",
    alpha = 0.8
  ) +
  scale_x_continuous(labels = scales::dollar_format(prefix = "$")) +
  # Bars start exactly on the x axis; 5% head-room above the tallest bar.
  scale_y_continuous(expand = expansion(mult = c(0, 0.05))) +
  labs(
    title = "Gráfica 1: Distribución de costos generales",
    x = "Costo ($)",
    y = "Cantidad"
  ) +
  theme_classic() +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold"),
    plot.subtitle = element_text(hjust = 0.5, color = "gray40"),
    axis.text.x = element_text(angle = 45, hjust = 1)
  )

print(p_zoom)

# Histogram of counts (ni) restricted to costs between 0 and 2000.
library(ggplot2)
library(dplyr)
library(scales)

# Non-missing costs inside the closed interval [0, 2000].
datos_zoom_extremo <- filter(
  datos,
  !is.na(Lost.Commodity.Costs),
  Lost.Commodity.Costs >= 0,
  Lost.Commodity.Costs <= 2000
)

p_zoom_extremo <- ggplot(
  data = datos_zoom_extremo,
  mapping = aes(x = Lost.Commodity.Costs)
)
p_zoom_extremo <- p_zoom_extremo +
  geom_histogram(bins = 40, fill = "steelblue", color = "white", alpha = 0.8)

# Dollar-formatted x axis with a tick every 200; bars sit on the x axis.
p_zoom_extremo <- p_zoom_extremo +
  scale_x_continuous(
    labels = scales::dollar_format(prefix = "$"),
    breaks = seq(0, 2000, by = 200)
  ) +
  scale_y_continuous(expand = expansion(mult = c(0, 0.05)))

p_zoom_extremo <- p_zoom_extremo +
  labs(
    title = "Gráfica 2: Distribución de Costos menores",
    x = "Costo ($)",
    y = "Cantidad"
  ) +
  theme_classic() +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold"),
    plot.subtitle = element_text(hjust = 0.5, color = "gray40"),
    axis.text.x = element_text(angle = 45, hjust = 1)
  )

print(p_zoom_extremo)

# Bar chart of the relative frequency (hi, in %) of every class in TDFCostos.
library(ggplot2)
library(scales)

p_hi <- ggplot(data = TDFCostos, mapping = aes(x = MC_num, y = hi))

# One bar per class, centred on the class mark; width = A makes each bar span
# its whole class interval.
p_hi <- p_hi +
  geom_col(
    fill = "steelblue",
    color = "black",
    alpha = 0.8,
    width = A,
    linewidth = 0.5
  )

# x axis: one dollar-formatted tick per class mark.
# y axis: fixed 0-100 range, labelled as percentages.
p_hi <- p_hi +
  scale_x_continuous(
    breaks = TDFCostos$MC_num,
    labels = scales::dollar_format(prefix = "$", big.mark = ",")
  ) +
  scale_y_continuous(
    limits = c(0, 100),
    expand = expansion(mult = c(0, 0.05)),
    labels = function(x) paste0(x, "%")
  )

p_hi <- p_hi +
  labs(
    title = "Gráfica 3: Porcentaje de Ocurrencia por Costo",
    subtitle = "Distribución Relativa Global",
    x = "Costo ($)",
    y = "Porcentaje (%)"
  ) +
  theme_classic() +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold", size = 13),
    plot.subtitle = element_text(hjust = 0.5, color = "gray40"),
    axis.text.x = element_text(angle = 45, hjust = 1, color = "black"),
    axis.text.y = element_text(color = "black"),
    axis.line = element_line(linewidth = 0.5, color = "black")
  )

print(p_hi)

# Histogram of relative frequency (%) restricted to costs between 0 and 2000.
library(ggplot2)
library(dplyr)
library(scales)

# Overwrites `datos_zoom` with the non-missing costs inside [0, 2000].
datos_zoom <- filter(
  datos,
  !is.na(Lost.Commodity.Costs),
  Lost.Commodity.Costs >= 0,
  Lost.Commodity.Costs <= 2000
)

p_hi_zoom <- ggplot(data = datos_zoom, mapping = aes(x = Lost.Commodity.Costs)) +
  # Bar height = share of the filtered rows that falls in each bin, in percent.
  geom_histogram(
    mapping = aes(y = after_stat(count) / sum(after_stat(count)) * 100),
    bins = 40,
    fill = "steelblue",
    color = "black",
    alpha = 0.8
  ) +
  scale_x_continuous(
    labels = scales::dollar_format(prefix = "$"),
    breaks = seq(0, 2000, by = 200)
  ) +
  # 10% head-room above the tallest bar.
  scale_y_continuous(
    labels = function(x) paste0(round(x, 1), "%"),
    expand = expansion(mult = c(0, 0.1))
  ) +
  labs(
    title = "Gráfico 4: Porcentaje de Ocurrencia",
    x = "Costo ($)",
    y = "Porcentaje (%)"
  ) +
  theme_classic() +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold"),
    plot.subtitle = element_text(hjust = 0.5, color = "gray40"),
    axis.text.x = element_text(angle = 45, hjust = 1)
  )

print(p_hi_zoom)

# Combined ogives (ascending and descending cumulative counts, Ni).
library(ggplot2)
library(dplyr)
library(scales)

# Ascending ogive: starts at (min_val, 0) and then plots the cumulative count
# at each class upper limit.
datos_asc <- with(
  TDFCostos,
  data.frame(
    x = c(min_val, Ls_num),
    y = c(0, Niasc),
    Tipo = "Ascendente"
  )
)

# Descending ogive: remaining count at each class lower limit, closing at
# (max_val, 0).
datos_dsc <- with(
  TDFCostos,
  data.frame(
    x = c(Li_num, max_val),
    y = c(Nidsc, 0),
    Tipo = "Descendente"
  )
)

datos_ojivas_plot <- rbind(datos_asc, datos_dsc)

p_ojiva_cruzada <- ggplot(
  data = datos_ojivas_plot,
  mapping = aes(x = x, y = y, color = Tipo, linetype = Tipo)
)
p_ojiva_cruzada <- p_ojiva_cruzada +
  geom_line(linewidth = 0.8) +
  geom_point(size = 2)

# Axis formatting plus a fixed colour / line type per curve.
p_ojiva_cruzada <- p_ojiva_cruzada +
  scale_x_continuous(
    labels = scales::dollar_format(prefix = "$", big.mark = ","),
    breaks = scales::pretty_breaks(n = 5)
  ) +
  scale_color_manual(
    values = c("Ascendente" = "black", "Descendente" = "blue")
  ) +
  scale_linetype_manual(
    values = c("Ascendente" = "longdash", "Descendente" = "solid")
  )

# NULL legend titles leave a single, untitled legend for both aesthetics.
p_ojiva_cruzada <- p_ojiva_cruzada +
  labs(
    title = "Gráfica 5: Ojivas por cantidad",
    x = "Costo ($)",
    y = "Cantidad Acumulada",
    color = NULL,
    linetype = NULL
  ) +
  theme_bw() +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold", size = 14),
    # NOTE(review): a numeric legend.position is reported as deprecated by
    # recent ggplot2 releases (legend.position.inside) -- confirm against the
    # installed version before changing it.
    legend.position = c(0.85, 0.5),
    legend.background = element_rect(color = "black", fill = "white"),
    axis.text = element_text(color = "black")
  )

print(p_ojiva_cruzada)

# Box plot of the costs, drawn without outliers.
variable_box <- na.omit(datos$Lost.Commodity.Costs)

# Fifth element of boxplot.stats()$stats is the upper whisker end; the custom
# axis ticks cover the span from 0 up to that value.
limite_visible <- boxplot.stats(variable_box)$stats[5]
puntos_eje <- pretty(c(0, limite_visible))

# The default x axis is suppressed (xaxt = "n") so that a comma-formatted one
# can be drawn afterwards.
boxplot(
  variable_box,
  horizontal = TRUE,
  col = "skyblue",
  border = "gray30",
  medcol = "red",
  boxwex = 0.6,
  outline = FALSE,
  main = "Gráfica 6: Distribución de Costos ",
  xlab = "Costo ($)",
  xaxt = "n"
)

axis(
  side = 1,
  at = puntos_eje,
  labels = format(puntos_eje, big.mark = ",", scientific = FALSE),
  las = 1
)
# Vertical dotted guide lines only (ny = NA disables the horizontal ones).
grid(nx = NULL, ny = NA, col = "lightgray", lty = "dotted", lwd = 1)

# Summary indicators for Lost Commodity Costs, printed as a one-row table.
library(e1071)
library(knitr)

# Formats a number with thousands separators, prefixed by "$ ".
formato_dolares <- function(valor) {
  paste("$", format(valor, big.mark = ","))
}

variable_costos <- na.omit(datos$Lost.Commodity.Costs)

# Position indicators.
ri <- min(variable_costos)
rs <- max(variable_costos)
mediana <- median(variable_costos)
media_aritmetica <- mean(variable_costos)

# Mode: the most frequent value (first one in case of ties).
# NOTE(review): `t` masks base::t() as a variable name; kept because later
# code may rely on it -- consider renaming.
t <- table(variable_costos)
Mo <- as.numeric(names(which.max(t)))

# Dispersion and shape indicators.
desviacion_estandar <- sd(variable_costos)
coeficiente_variabilidad <- (desviacion_estandar / media_aritmetica) * 100
As <- skewness(variable_costos)
curtosis_val <- kurtosis(variable_costos)

Variable <- "Lost Commodity Costs"
S_texto <- formato_dolares(round(desviacion_estandar, 2))

# Columns are named at construction time; check.names = FALSE keeps the
# accented / spaced headers exactly as written.
Tabla_indicadores <- data.frame(
  "Variable" = Variable,
  "Mínimo" = formato_dolares(ri),
  "Máximo" = formato_dolares(rs),
  "x (Media)" = formato_dolares(round(media_aritmetica, 2)),
  "Me (Mediana)" = formato_dolares(mediana),
  "Mo (Moda)" = formato_dolares(Mo),
  "S (Desv.)" = S_texto,
  "Cv (%)" = paste(round(coeficiente_variabilidad, 2), "%"),
  "As" = round(As, 2),
  "K" = round(curtosis_val, 2),
  check.names = FALSE
)

kable(Tabla_indicadores, format = "markdown", caption = "Tabla No. 1: Indicadores estadísticos de la variable Lost Commodity Costs.")
# Tabla No. 1: Indicadores estadísticos de la variable Lost Commodity Costs.
# Variable Mínimo Máximo x (Media) Me (Mediana) Mo (Moda) S (Desv.) Cv (%) As K
# Lost Commodity Costs $ 0 $ 1,417,839 $ 9,805.3 $ 100 $ 0 $ 63,840.75 651.08 % 12.92 203.68
# Outlier summary: values flagged by boxplot.stats(), their count and range.
# The lines starting with "##" are the output recorded from a previous run.
stats_outliers <- boxplot.stats(variable_costos)$out
num_outliers <- length(stats_outliers)

# With no outliers both extremes stay NA and are reported as "Ninguno".
if (num_outliers > 0) {
  minimooutliers <- min(stats_outliers)
  maximooutliers <- max(stats_outliers)
} else {
  minimooutliers <- NA
  maximooutliers <- NA
}

cat("\nAnálisis de Outliers:\n")
## 
## Análisis de Outliers:
cat("Número de valores atípicos:", num_outliers, "\n")
## Número de valores atípicos: 495
cat(
  "Mínimo Outlier:",
  if (is.na(minimooutliers)) "Ninguno" else paste("$", format(minimooutliers, big.mark=",")),
  "\n"
)
## Mínimo Outlier: $ 1,750
cat(
  "Máximo Outlier:",
  if (is.na(maximooutliers)) "Ninguno" else paste("$", format(maximooutliers, big.mark=",")),
  "\n"
)
## Máximo Outlier: $ 1,417,839
# La distribución de los costos presenta una fuerte asimetría positiva y es leptocúrtica, concentrando los datos en cero. El alto coeficiente de variabilidad (651%) confirma que la media no es representativa, ya que se ve sesgada por valores atípicos extremos, evidenciando que los accidentes costosos son anomalías dentro de la muestra.