## Load data
# Read the raw CSV. Header row, comma separator and "." decimal mark are
# read.csv()'s defaults, so only check.names needs to be spelled out: with
# check.names = FALSE the column names are kept exactly as written in the file.
database <- read.csv(file = "database-_1_.csv", check.names = FALSE)
# Pull the cost column out of the raw table.
# NOTE(review): because names are not sanitised above, this lookup only finds
# a column whose header matches "Lost.Commodity.Costs" (dots included) —
# verify against the CSV header. The variable holds costs, not dates.
raw_dates <- database$Lost.Commodity.Costs
library(gt)
library(dplyr)
# ---- Frequency distribution table (Sturges' rule) ----
# Builds `TDFCostos`, one row per class interval of Lost.Commodity.Costs, with
# absolute/relative frequencies and their ascending/descending cumulatives.
# Objects left in the workspace for later sections: datos, variable_interes,
# costos, k, min_val, max_val, R_val, A, Li_num, Ls_num, MC_num, ni, hi,
# Niasc, Nidsc, Hiasc, Hidsc, TDFCostos.

# Re-read the CSV with the default check.names = TRUE so the cost column can be
# addressed as `Lost.Commodity.Costs`.
datos <- read.csv("database-_1_.csv")
variable_interes <- datos$Lost.Commodity.Costs
costos <- na.omit(variable_interes)

# Fail with a clear message instead of letting log10(0) = -Inf propagate into
# the class computation below.
if (length(costos) == 0) {
  stop("Lost.Commodity.Costs has no non-missing values", call. = FALSE)
}

# Number of classes by Sturges' rule, truncated to a whole number.
k <- floor(1 + (3.322 * log10(length(costos))))
min_val <- min(costos)
max_val <- max(costos)
R_val <- max_val - min_val
A <- R_val / k

# k + 1 equally spaced class boundaries from min_val to max_val. Asking for a
# fixed number of points (length.out) always yields exactly k classes; stepping
# with `by = A` could produce one class too few or too many through
# floating-point accumulation.
limites <- seq(from = min_val, to = max_val, length.out = k + 1)
Li_num <- limites[-(k + 1)] # lower limits
Ls_num <- limites[-1]       # upper limits
MC_num <- (Li_num + Ls_num) / 2

# Classes are [Li, Ls) except the last one, which is closed on the right so
# that max_val is counted (rightmost.closed = TRUE). pmin() keeps any value
# landing on/after the final boundary inside class k.
clase <- pmin(findInterval(costos, limites, rightmost.closed = TRUE), k)
ni <- as.numeric(tabulate(clase, nbins = k))

# Relative frequency (%) and cumulative counts/percentages in both directions.
hi <- ni / sum(ni) * 100
Niasc <- cumsum(ni)
Nidsc <- rev(cumsum(rev(ni)))
Hiasc <- round(cumsum(hi), 2)
Hidsc <- round(rev(cumsum(rev(hi))), 2)

TDFCostos <- data.frame(
  Li_num = Li_num,
  Ls_num = Ls_num,
  MC_num = MC_num,
  ni = ni,
  hi = hi,
  Niasc = Niasc,
  Nidsc = Nidsc,
  Hiasc = Hiasc,
  Hidsc = Hidsc
)
# Render the frequency table with gt, one formatting step at a time:
# header, column labels, then number formats.
tabla1_sturges <- gt(TDFCostos)
tabla1_sturges <- tab_header(
  tabla1_sturges,
  title = md("*Tabla 1: Distribución de Frecuencias*"),
  subtitle = md("**Variable: Lost Commodity Costs**")
)
tabla1_sturges <- cols_label(
  tabla1_sturges,
  Li_num = "Desde ($)",
  Ls_num = "Hasta ($)",
  MC_num = "Marca Clase",
  ni = "Frec. Abs.",
  hi = "Frec. Rel. %",
  Niasc = "Ni Asc.",
  Nidsc = "Ni Desc.",
  Hiasc = "Hi Asc. %",
  Hidsc = "Hi Desc. %"
)
# Class limits and class marks: two decimals.
tabla1_sturges <- fmt_number(
  tabla1_sturges,
  columns = c(Li_num, Ls_num, MC_num),
  decimals = 2
)
# Percentage columns: two decimals followed by a "%" sign.
tabla1_sturges <- fmt_number(
  tabla1_sturges,
  columns = c(hi, Hiasc, Hidsc),
  decimals = 2,
  pattern = "{x}%"
)
# Auto-print the finished table.
tabla1_sturges
## Tabla 1: Distribución de Frecuencias
## Variable: Lost Commodity Costs
##
## | Desde ($) | Hasta ($) | Marca Clase | Frec. Abs. | Frec. Rel. % | Ni Asc. | Ni Desc. | Hi Asc. % | Hi Desc. % |
## |---|---|---|---|---|---|---|---|---|
## | 0.00 | 118,153.25 | 59,076.62 | 2737 | 98.07% | 2737 | 2791 | 98.07% | 100.00% |
## | 118,153.25 | 236,306.50 | 177,229.88 | 30 | 1.07% | 2767 | 54 | 99.14% | 1.93% |
## | 236,306.50 | 354,459.75 | 295,383.12 | 10 | 0.36% | 2777 | 24 | 99.50% | 0.86% |
## | 354,459.75 | 472,613.00 | 413,536.38 | 1 | 0.04% | 2778 | 14 | 99.53% | 0.50% |
## | 472,613.00 | 590,766.25 | 531,689.62 | 3 | 0.11% | 2781 | 13 | 99.64% | 0.47% |
## | 590,766.25 | 708,919.50 | 649,842.88 | 3 | 0.11% | 2784 | 10 | 99.75% | 0.36% |
## | 708,919.50 | 827,072.75 | 767,996.12 | 2 | 0.07% | 2786 | 7 | 99.82% | 0.25% |
## | 827,072.75 | 945,226.00 | 886,149.38 | 0 | 0.00% | 2786 | 5 | 99.82% | 0.18% |
## | 945,226.00 | 1,063,379.25 | 1,004,302.62 | 3 | 0.11% | 2789 | 5 | 99.93% | 0.18% |
## | 1,063,379.25 | 1,181,532.50 | 1,122,455.88 | 1 | 0.04% | 2790 | 2 | 99.96% | 0.07% |
## | 1,181,532.50 | 1,299,685.75 | 1,240,609.12 | 0 | 0.00% | 2790 | 1 | 99.96% | 0.04% |
## | 1,299,685.75 | 1,417,839.00 | 1,358,762.38 | 1 | 0.04% | 2791 | 1 | 100.00% | 0.04% |
# Histogram of absolute frequencies (ni), restricted to the first class width
library(ggplot2)
library(dplyr)
library(scales)

# Recompute the class width A from the non-missing costs. Note that `k` is
# stored unrounded here; only the divisor is floored.
costos <- na.omit(datos$Lost.Commodity.Costs)
k <- 1 + (3.322 * log10(length(costos)))
R <- max(costos) - min(costos)
A <- R / floor(k)
limit_zoom <- A

# Keep rows with a known cost no larger than one class width.
datos_zoom <- dplyr::filter(
  datos,
  !is.na(Lost.Commodity.Costs),
  Lost.Commodity.Costs <= limit_zoom
)

p_zoom <- ggplot(data = datos_zoom, mapping = aes(x = Lost.Commodity.Costs)) +
  geom_histogram(
    bins = 30,
    fill = "steelblue",
    color = "white",
    alpha = 0.8
  ) +
  # Dollar-formatted x axis; bars sit on the axis with 5% headroom above.
  scale_x_continuous(labels = scales::dollar_format(prefix = "$")) +
  scale_y_continuous(expand = expansion(mult = c(0, 0.05))) +
  labs(
    title = "Gráfica 1: Distribución de costos generales",
    x = "Costo ($)",
    y = "Cantidad"
  ) +
  theme_classic() +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold"),
    plot.subtitle = element_text(hjust = 0.5, color = "gray40"),
    axis.text.x = element_text(angle = 45, hjust = 1)
  )

print(p_zoom)

# Histogram of absolute frequencies (ni) for costs between $0 and $2,000
library(ggplot2)
library(dplyr)
library(scales)

# Keep rows with a known cost inside the [0, 2000] window.
datos_zoom_extremo <- dplyr::filter(
  datos,
  !is.na(Lost.Commodity.Costs),
  Lost.Commodity.Costs >= 0,
  Lost.Commodity.Costs <= 2000
)

p_zoom_extremo <- ggplot(
  data = datos_zoom_extremo,
  mapping = aes(x = Lost.Commodity.Costs)
) +
  geom_histogram(
    bins = 40,
    fill = "steelblue",
    color = "white",
    alpha = 0.8
  ) +
  # Tick every $200 across the window, labelled as dollars.
  scale_x_continuous(
    breaks = seq(0, 2000, by = 200),
    labels = scales::dollar_format(prefix = "$")
  ) +
  scale_y_continuous(expand = expansion(mult = c(0, 0.05))) +
  labs(
    title = "Gráfica 2: Distribución de Costos menores",
    x = "Costo ($)",
    y = "Cantidad"
  ) +
  theme_classic() +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold"),
    plot.subtitle = element_text(hjust = 0.5, color = "gray40"),
    axis.text.x = element_text(angle = 45, hjust = 1)
  )

print(p_zoom_extremo)

# Bar chart of relative frequencies (hi) over all classes of the table
library(ggplot2)
library(scales)

p_hi <- ggplot(data = TDFCostos, mapping = aes(x = MC_num, y = hi)) +
  # One bar per class, centred on the class mark; width = A makes each bar
  # span its whole class interval.
  geom_col(
    width = A,
    fill = "steelblue",
    color = "black",
    alpha = 0.8,
    linewidth = 0.5
  ) +
  # Ticks at the class marks, shown as dollar amounts.
  scale_x_continuous(
    breaks = TDFCostos$MC_num,
    labels = scales::dollar_format(prefix = "$", big.mark = ",")
  ) +
  # Fixed 0-100 percentage axis with a "%" suffix on every label.
  scale_y_continuous(
    limits = c(0, 100),
    expand = expansion(mult = c(0, 0.05)),
    labels = function(x) paste0(x, "%")
  ) +
  labs(
    title = "Gráfica 3: Porcentaje de Ocurrencia por Costo",
    subtitle = "Distribución Relativa Global",
    x = "Costo ($)",
    y = "Porcentaje (%)"
  ) +
  theme_classic() +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold", size = 13),
    plot.subtitle = element_text(hjust = 0.5, color = "gray40"),
    axis.text.x = element_text(angle = 45, hjust = 1, color = "black"),
    axis.text.y = element_text(color = "black"),
    axis.line = element_line(linewidth = 0.5, color = "black")
  )

print(p_hi)

# Histogram of relative frequencies (hi) for costs between $0 and $2,000
library(ggplot2)
library(dplyr)
library(scales)

# Overwrites `datos_zoom` with the rows whose known cost lies in [0, 2000].
datos_zoom <- dplyr::filter(
  datos,
  !is.na(Lost.Commodity.Costs),
  Lost.Commodity.Costs >= 0,
  Lost.Commodity.Costs <= 2000
)

p_hi_zoom <- ggplot(data = datos_zoom, mapping = aes(x = Lost.Commodity.Costs)) +
  geom_histogram(
    # Bar height = share of the filtered rows in each bin, as a percentage.
    mapping = aes(y = after_stat(count / sum(count) * 100)),
    bins = 40,
    fill = "steelblue",
    color = "black",
    alpha = 0.8
  ) +
  scale_x_continuous(
    breaks = seq(0, 2000, by = 200),
    labels = scales::dollar_format(prefix = "$")
  ) +
  scale_y_continuous(
    labels = function(x) paste0(round(x, 1), "%"),
    expand = expansion(mult = c(0, 0.1)) # a little headroom above the bars
  ) +
  labs(
    title = "Gráfico 4: Porcentaje de Ocurrencia",
    x = "Costo ($)",
    y = "Porcentaje (%)"
  ) +
  theme_classic() +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold"),
    plot.subtitle = element_text(hjust = 0.5, color = "gray40"),
    axis.text.x = element_text(angle = 45, hjust = 1)
  )

print(p_hi_zoom)

# Combined ogives of the cumulative absolute frequency (Ni)
library(ggplot2)
library(dplyr)
library(scales)

# Ascending ogive: starts at (min_val, 0) and reaches each cumulative count at
# the upper limit of its class.
datos_asc <- data.frame(
  x = c(min_val, TDFCostos$Ls_num),
  y = c(0, TDFCostos$Niasc),
  Tipo = "Ascendente"
)
# Descending ogive: each remaining count is placed at the lower limit of its
# class and the curve ends at (max_val, 0).
datos_dsc <- data.frame(
  x = c(TDFCostos$Li_num, max_val),
  y = c(TDFCostos$Nidsc, 0),
  Tipo = "Descendente"
)
# Stack both curves; `Tipo` tells them apart in the plot.
datos_ojivas_plot <- rbind(datos_asc, datos_dsc)

p_ojiva_cruzada <- ggplot(
  data = datos_ojivas_plot,
  mapping = aes(x = x, y = y, color = Tipo, linetype = Tipo)
) +
  geom_line(linewidth = 0.8) +
  geom_point(size = 2) +
  scale_x_continuous(
    breaks = scales::pretty_breaks(n = 5),
    labels = scales::dollar_format(prefix = "$", big.mark = ",")
  ) +
  # Fixed colour and line type per curve.
  scale_color_manual(
    values = c("Ascendente" = "black", "Descendente" = "blue")
  ) +
  scale_linetype_manual(
    values = c("Ascendente" = "longdash", "Descendente" = "solid")
  ) +
  # NULL legend titles leave a single untitled legend for both aesthetics.
  labs(
    title = "Gráfica 5: Ojivas por cantidad",
    x = "Costo ($)",
    y = "Cantidad Acumulada",
    color = NULL,
    linetype = NULL
  ) +
  theme_bw() +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold", size = 14),
    # NOTE(review): a numeric legend.position is deprecated in recent ggplot2
    # releases in favour of legend.position.inside — confirm the installed
    # version before changing it.
    legend.position = c(0.85, 0.5),
    legend.background = element_rect(color = "black", fill = "white"),
    axis.text = element_text(color = "black")
  )

print(p_ojiva_cruzada)

# Box plot of the costs (outliers hidden)
variable_box <- na.omit(datos$Lost.Commodity.Costs)

# Fifth element of boxplot.stats()$stats is the upper whisker end; it bounds
# the custom x-axis ticks computed here.
limite_visible <- boxplot.stats(variable_box)$stats[5]
puntos_eje <- pretty(c(0, limite_visible))

# Horizontal box with the default x axis suppressed (xaxt = "n") so it can be
# redrawn below with thousands separators.
boxplot(
  variable_box,
  horizontal = TRUE,
  outline = FALSE,
  col = "skyblue",
  border = "gray30",
  medcol = "red",
  boxwex = 0.6,
  main = "Gráfica 6: Distribución de Costos ",
  xlab = "Costo ($)",
  xaxt = "n"
)
axis(
  1,
  at = puntos_eje,
  labels = format(puntos_eje, big.mark = ",", scientific = FALSE),
  las = 1
)
# Vertical dotted guide lines only (ny = NA disables the horizontal ones).
grid(nx = NULL, ny = NA, col = "lightgray", lty = "dotted", lwd = 1)

# Summary indicators for Lost.Commodity.Costs
library(e1071)
library(knitr)

# Prefix a value with "$ " after adding thousands separators.
formato_dolar <- function(valor) {
  paste("$", format(valor, big.mark = ","))
}

variable_costos <- na.omit(datos$Lost.Commodity.Costs)

# Location: range limits, median and mean.
ri <- min(variable_costos)
rs <- max(variable_costos)
mediana <- median(variable_costos)
media_aritmetica <- mean(variable_costos)

# Mode: the most frequent value (the first one when several tie).
t <- table(variable_costos)
Mo <- as.numeric(names(t)[which.max(t)])

# Spread and shape: standard deviation, coefficient of variation (%),
# skewness and kurtosis (the last two from e1071).
desviacion_estandar <- sd(variable_costos)
coeficiente_variabilidad <- (desviacion_estandar / media_aritmetica) * 100
As <- skewness(variable_costos)
curtosis_val <- kurtosis(variable_costos)

Variable <- "Lost Commodity Costs"
S_texto <- formato_dolar(round(desviacion_estandar, 2))

# One-row table with display-ready text; check.names = FALSE keeps the
# headers exactly as typed.
Tabla_indicadores <- data.frame(
  "Variable" = Variable,
  "Mínimo" = formato_dolar(ri),
  "Máximo" = formato_dolar(rs),
  "x (Media)" = formato_dolar(round(media_aritmetica, 2)),
  "Me (Mediana)" = formato_dolar(mediana),
  "Mo (Moda)" = formato_dolar(Mo),
  "S (Desv.)" = S_texto,
  "Cv (%)" = paste(round(coeficiente_variabilidad, 2), "%"),
  "As" = round(As, 2),
  "K" = round(curtosis_val, 2),
  check.names = FALSE
)

kable(
  Tabla_indicadores,
  format = "markdown",
  caption = "Tabla No. 1: Indicadores estadísticos de la variable Lost Commodity Costs."
)
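## Tabla No. 1: Indicadores estadísticos de la variable Lost Commodity Costs.
##
## | Variable | Mínimo | Máximo | x (Media) | Me (Mediana) | Mo (Moda) | S (Desv.) | Cv (%) | As | K |
## |---|---|---|---|---|---|---|---|---|---|
## | Lost Commodity Costs | $ 0 | $ 1,417,839 | $ 9,805.3 | $ 100 | $ 0 | $ 63,840.75 | 651.08 % | 12.92 | 203.68 |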
# Outlier summary: values flagged by boxplot.stats() on the cost vector.
stats_outliers <- boxplot.stats(variable_costos)$out
num_outliers <- length(stats_outliers)

# Smallest and largest flagged value, or NA when nothing was flagged.
if (num_outliers > 0) {
  minimooutliers <- min(stats_outliers)
  maximooutliers <- max(stats_outliers)
} else {
  minimooutliers <- NA
  maximooutliers <- NA
}

# Text shown for each extreme: a dollar amount, or "Ninguno" when missing.
texto_minimo <- if (is.na(minimooutliers)) {
  "Ninguno"
} else {
  paste("$", format(minimooutliers, big.mark = ","))
}
texto_maximo <- if (is.na(maximooutliers)) {
  "Ninguno"
} else {
  paste("$", format(maximooutliers, big.mark = ","))
}

cat("\nAnálisis de Outliers:\n")
cat("Número de valores atípicos:", num_outliers, "\n")
cat("Mínimo Outlier:", texto_minimo, "\n")
cat("Máximo Outlier:", texto_maximo, "\n")
# Output recorded from a previous run:
##
## Análisis de Outliers:
## Número de valores atípicos: 495
## Mínimo Outlier: $ 1,750
## Máximo Outlier: $ 1,417,839
# La distribución de los costos presenta una fuerte asimetría positiva y es leptocúrtica, concentrando los datos en cero. El alto coeficiente de variabilidad (651%) confirma que la media no es representativa, ya que se ve sesgada por valores atípicos extremos, evidenciando que los accidentes costosos son anomalías dentro de la muestra.