1. Preparación

1.1 Librerías

##==============================================================###
##                UNIVERSIDAD CENTRAL DEL ECUADOR                ###
##Facultad de Ingeniería en Geología, Minas, Petróleos y Ambiental###
##Materia: Estadística — Docente: Ing. Christian Mejía E. MSc.     ###
##==============================================================###

suppressPackageStartupMessages({
  library(dplyr)
  library(ggplot2)
  library(ggfortify)
  library(tidyverse)
  library(fdth)
  library(lattice)
  library(MASS)
  library(PASWR)
  library(magick)
  library(readxl)
  library(plotly)
  library(psych)
  library(car)
  library(ggpmisc)
  library(scatterplot3d)
  library(corrplot)
  library(GGally)
  library(RSNNS)
})

1.2 Carga y preparación de datos

#############################
#### Directorio de trabajo ##
#############################

# Point the session at the folder that holds the dataset.
# NOTE(review): this absolute path only exists on the author's machine; the
# call is kept so that any later relative paths keep resolving as before.
setwd("D:/Personal/Escritorio/ESTADISTICA JIM/Kaggle")

# Read the semicolon-separated CSV (decimal mark ".", ISO-8859-1 encoded).
# `TRUE` is written out in full: `T` is an ordinary variable that can be
# reassigned, whereas `TRUE` is a reserved word.
Datos <- read.csv(
  "market_pipe_thickness_loss_dataset_LIMPIO.csv",
  header = TRUE,
  sep = ";",
  dec = ".",
  fileEncoding = "ISO-8859-1"
)

# Drop every row that contains at least one missing value.
Datos <- na.omit(Datos)

# Print the structure (column types and first values) of the cleaned data.
str(Datos)
## 'data.frame':    1000 obs. of  11 variables:
##  $ Pipe_Size       : int  800 800 400 1500 1500 600 200 300 150 800 ...
##  $ Thickness       : num  15.5 22 12.1 38.7 24.3 ...
##  $ Material        : chr  "Carbon Steel" "PVC" "Carbon Steel" "Carbon Steel" ...
##  $ Grade           : chr  "ASTM A333 Grade 6" "ASTM A106 Grade B" "API 5L X52" "API 5L X42" ...
##  $ Max_Pressure    : int  300 150 2500 1500 1500 600 1500 900 300 150 ...
##  $ Temperature     : num  84.9 14.1 0.6 52.7 11.7 67.3 89.6 40.8 3.2 11.6 ...
##  $ Corrosion_Impact: num  16.04 7.38 2.12 5.58 12.29 ...
##  $ Thickness_Loss  : num  4.91 7.32 6.32 6.2 8.58 5.21 5.86 3.02 2.47 0.53 ...
##  $ Material_Loss   : num  31.7 33.3 52.5 16 35.3 ...
##  $ Time            : int  2 4 7 19 20 11 6 21 19 1 ...
##  $ Condition       : chr  "Moderate" "Critical" "Critical" "Critical" ...
# List the column names of the imported data frame.
names(Datos)
##  [1] "Pipe_Size"        "Thickness"        "Material"         "Grade"           
##  [5] "Max_Pressure"     "Temperature"      "Corrosion_Impact" "Thickness_Loss"  
##  [9] "Material_Loss"    "Time"             "Condition"
# Normalise Material_Loss with RSNNS::normalizeData(), type "0_1".
# The returned object is kept in `normalizacion` as in the original script.
normalizacion <- normalizeData(Datos$Material_Loss, type = "0_1")

# Store the normalised values as a plain numeric vector. A single
# as.numeric() call is enough; the original converted the same values twice.
Datos$Material_Loss_norm <- as.numeric(normalizacion)

# Quick overview of the data frame, including the new column.
dplyr::glimpse(Datos)
## Rows: 1,000
## Columns: 12
## $ Pipe_Size          <int> 800, 800, 400, 1500, 1500, 600, 200, 300, 150, 800,…
## $ Thickness          <dbl> 15.48, 22.00, 12.05, 38.72, 24.32, 16.75, 9.94, 13.…
## $ Material           <chr> "Carbon Steel", "PVC", "Carbon Steel", "Carbon Stee…
## $ Grade              <chr> "ASTM A333 Grade 6", "ASTM A106 Grade B", "API 5L X…
## $ Max_Pressure       <int> 300, 150, 2500, 1500, 1500, 600, 1500, 900, 300, 15…
## $ Temperature        <dbl> 84.9, 14.1, 0.6, 52.7, 11.7, 67.3, 89.6, 40.8, 3.2,…
## $ Corrosion_Impact   <dbl> 16.04, 7.38, 2.12, 5.58, 12.29, 2.06, 1.34, 5.57, 1…
## $ Thickness_Loss     <dbl> 4.91, 7.32, 6.32, 6.20, 8.58, 5.21, 5.86, 3.02, 2.4…
## $ Material_Loss      <dbl> 31.72, 33.27, 52.45, 16.01, 35.28, 31.10, 58.95, 21…
## $ Time               <int> 2, 4, 7, 19, 20, 11, 6, 21, 19, 1, 13, 6, 6, 13, 7,…
## $ Condition          <chr> "Moderate", "Critical", "Critical", "Critical", "Cr…
## $ Material_Loss_norm <dbl> 0.0992876644, 0.1041516302, 0.1643392852, 0.0499890…

2. Análisis exploratorio por variables

2.1 Variables cualitativas

2.1.1 Material

# Frequency table for Material: absolute count (n), relative frequency (hi),
# percentage, and a ready-made "n (xx%)" label reused by the bar chart.
TDF_Material <- mutate(
  count(Datos, Material),
  hi = round(n / sum(n), 4),
  Porcentaje = round(hi * 100, 2),
  Etiqueta = paste0(n, " (", Porcentaje, "%)")
)

# Render the table.
knitr::kable(
  TDF_Material,
  caption = "Tabla de frecuencias — Material"
)
Tabla de frecuencias — Material
Material n hi Porcentaje Etiqueta
Carbon Steel 210 0.210 21.0 210 (21%)
Fiberglass 219 0.219 21.9 219 (21.9%)
HDPE 184 0.184 18.4 184 (18.4%)
PVC 186 0.186 18.6 186 (18.6%)
Stainless Steel 201 0.201 20.1 201 (20.1%)
# Bar chart: Material (absolute frequency, labelled with n and %).
# Bars are ordered by decreasing count; the fill vector has one colour per
# row of TDF_Material.
ggplot(data = TDF_Material, mapping = aes(x = reorder(Material, -n), y = n)) +
  geom_col(
    fill = c("red", "blue", "green", "yellow", "orange"),
    colour = "black"
  ) +
  geom_text(aes(label = Etiqueta), size = 3.4, vjust = -0.35) +
  theme(axis.text.x = element_text(hjust = 1, angle = 25)) +
  labs(
    caption = "Jim Acuña y Davis Piguave",
    y = "Frecuencia (n)",
    x = "Material",
    title = "Distribución del material"
  ) +
  # Extra headroom so the labels above the tallest bar are not clipped.
  expand_limits(y = max(TDF_Material$n) * 1.12)

# Pie chart: Material (percentage).
# A single stacked bar drawn in polar coordinates.
ggplot(data = TDF_Material, mapping = aes(x = "", y = Porcentaje, fill = Material)) +
  geom_col(width = 1, colour = "black") +
  coord_polar(theta = "y") +
  theme_void() +
  geom_text(
    aes(label = paste0(Porcentaje, "%")),
    colour = "black",
    position = position_stack(vjust = 0.5)
  ) +
  labs(title = "Distribución del material (porcentaje)")

Interpretación breve.
La variable Material está bastante equilibrada: Fiberglass (21.9%), Carbon Steel (21.0%), Stainless Steel (20.1%), PVC (18.6%) y HDPE (18.4%). Esto sugiere que el dataset no está dominado por un solo material, lo cual es positivo para las comparaciones y para evitar sesgos fuertes por la sobrerrepresentación de una categoría.

2.1.2 Grado del material (Grade)

# Frequency table for Grade: absolute count (n), relative frequency (hi),
# percentage, and a ready-made "n (xx%)" label reused by the bar chart.
TDF_Grade <- mutate(
  count(Datos, Grade),
  hi = round(n / sum(n), 4),
  Porcentaje = round(hi * 100, 2),
  Etiqueta = paste0(n, " (", Porcentaje, "%)")
)

# Render the table.
knitr::kable(
  TDF_Grade,
  caption = "Tabla de frecuencias — Grado del material (Grade)"
)
Tabla de frecuencias — Grado del material (Grade)
Grade n hi Porcentaje Etiqueta
API 5L X42 186 0.186 18.6 186 (18.6%)
API 5L X52 191 0.191 19.1 191 (19.1%)
API 5L X65 183 0.183 18.3 183 (18.3%)
ASTM A106 Grade B 212 0.212 21.2 212 (21.2%)
ASTM A333 Grade 6 228 0.228 22.8 228 (22.8%)
# Bar chart: Grade (absolute frequency, labelled with n and %).
# Bars are ordered by decreasing count; the fill vector has one colour per
# row of TDF_Grade.
ggplot(data = TDF_Grade, mapping = aes(x = reorder(Grade, -n), y = n)) +
  geom_col(
    fill = c("red", "blue", "green", "yellow", "orange"),
    colour = "black"
  ) +
  geom_text(aes(label = Etiqueta), size = 3.2, vjust = -0.35) +
  theme(axis.text.x = element_text(hjust = 1, angle = 25)) +
  labs(
    caption = "Jim Acuña y Davis Piguave",
    y = "Frecuencia (n)",
    x = "Grado (Grade)",
    title = "Distribución del grado del material (Grade)"
  ) +
  # Extra headroom so the labels above the tallest bar are not clipped.
  expand_limits(y = max(TDF_Grade$n) * 1.12)

# Pie chart: Grade (percentage).
# A single stacked bar drawn in polar coordinates.
ggplot(data = TDF_Grade, mapping = aes(x = "", y = Porcentaje, fill = Grade)) +
  geom_col(width = 1, colour = "black") +
  coord_polar(theta = "y") +
  theme_void() +
  geom_text(
    aes(label = paste0(Porcentaje, "%")),
    colour = "black",
    position = position_stack(vjust = 0.5)
  ) +
  labs(title = "Distribución del grado del material (porcentaje)")

Interpretación breve.
Grade también presenta una distribución relativamente balanceada, con un leve predominio de ASTM A333 Grade 6 (22.8%) y ASTM A106 Grade B (21.2%), mientras que API 5L X52 (19.1%), API 5L X42 (18.6%) y API 5L X65 (18.3%) quedan cerca. En términos descriptivos, esto permite evaluar el comportamiento del deterioro sin que el predominio de un solo grado reduzca la fiabilidad del análisis.

2.1.3 Condición (Condition)

# Frequency table for Condition: absolute count (n), relative frequency (hi),
# percentage, and a ready-made "n (xx%)" label reused by the bar chart.
TDF_Condition <- mutate(
  count(Datos, Condition),
  hi = round(n / sum(n), 4),
  Porcentaje = round(hi * 100, 2),
  Etiqueta = paste0(n, " (", Porcentaje, "%)")
)

# Render the table.
knitr::kable(
  TDF_Condition,
  caption = "Tabla de frecuencias — Condición (Condition)"
)
Tabla de frecuencias — Condición (Condition)
Condition n hi Porcentaje Etiqueta
Critical 487 0.487 48.7 487 (48.7%)
Moderate 299 0.299 29.9 299 (29.9%)
Normal 214 0.214 21.4 214 (21.4%)
# Bar chart: Condition (absolute frequency, labelled with n and %).
# Bars are ordered by decreasing count; the fill vector has one colour per
# row of TDF_Condition.
ggplot(data = TDF_Condition, mapping = aes(x = reorder(Condition, -n), y = n)) +
  geom_col(fill = c("red", "blue", "green"), colour = "black") +
  geom_text(aes(label = Etiqueta), size = 3.4, vjust = -0.35) +
  labs(
    caption = "Jim Acuña y Davis Piguave",
    y = "Frecuencia (n)",
    x = "Condición",
    title = "Distribución de la condición (Condition)"
  ) +
  # Extra headroom so the labels above the tallest bar are not clipped.
  expand_limits(y = max(TDF_Condition$n) * 1.12)

# Pie chart: Condition (percentage).
# A single stacked bar drawn in polar coordinates.
ggplot(data = TDF_Condition, mapping = aes(x = "", y = Porcentaje, fill = Condition)) +
  geom_col(width = 1, colour = "black") +
  coord_polar(theta = "y") +
  theme_void() +
  geom_text(
    aes(label = paste0(Porcentaje, "%")),
    colour = "black",
    position = position_stack(vjust = 0.5)
  ) +
  labs(title = "Distribución de la condición (porcentaje)")

Interpretación breve.
En Condition sí hay un desbalance claro: Critical representa el 48.7%, Moderate el 29.9% y Normal el 21.4%. Esto indica que el dataset está orientado hacia condiciones más severas, por lo que los promedios globales de deterioro tenderán a reflejar un entorno de mayor riesgo.

2.1.4 Presión máxima (Max_Pressure)

# Frequency table for Max_Pressure, treated as a discrete variable:
# absolute count (n), relative frequency (hi), percentage, and a ready-made
# "n (xx%)" label reused by the bar chart.
TDF_Max_Pressure <- mutate(
  count(Datos, Max_Pressure),
  hi = round(n / sum(n), 4),
  Porcentaje = round(hi * 100, 2),
  Etiqueta = paste0(n, " (", Porcentaje, "%)")
)

# Render the table.
knitr::kable(
  TDF_Max_Pressure,
  caption = "Tabla de frecuencias — Presión máxima (Max_Pressure)"
)
Tabla de frecuencias — Presión máxima (Max_Pressure)
Max_Pressure n hi Porcentaje Etiqueta
150 168 0.168 16.8 168 (16.8%)
300 161 0.161 16.1 161 (16.1%)
600 163 0.163 16.3 163 (16.3%)
900 172 0.172 17.2 172 (17.2%)
1500 162 0.162 16.2 162 (16.2%)
2500 174 0.174 17.4 174 (17.4%)
# Bar chart: Max_Pressure (absolute frequency, labelled with n and %).
# Bars are ordered by decreasing count, not by pressure value; the fill
# vector has one colour per row of TDF_Max_Pressure.
ggplot(
  data = TDF_Max_Pressure,
  mapping = aes(x = reorder(Max_Pressure, -n), y = n)
) +
  geom_col(
    fill = c("red", "blue", "green", "yellow", "orange", "purple"),
    colour = "black"
  ) +
  geom_text(aes(label = Etiqueta), size = 3.2, vjust = -0.35) +
  labs(
    caption = "Jim Acuña y Davis Piguave",
    y = "Frecuencia (n)",
    x = "Presión máxima",
    title = "Distribución de la presión máxima (Max_Pressure)"
  ) +
  # Extra headroom so the labels above the tallest bar are not clipped.
  expand_limits(y = max(TDF_Max_Pressure$n) * 1.12)

# Pie chart: Max_Pressure (percentage).
# The numeric pressure is converted to a factor so each value gets its own
# discrete fill colour.
ggplot(
  data = TDF_Max_Pressure,
  mapping = aes(x = "", y = Porcentaje, fill = as.factor(Max_Pressure))
) +
  geom_col(width = 1, colour = "black") +
  coord_polar(theta = "y") +
  theme_void() +
  geom_text(
    aes(label = paste0(Porcentaje, "%")),
    colour = "black",
    position = position_stack(vjust = 0.5)
  ) +
  labs(fill = "Max_Pressure", title = "Distribución de la presión máxima (porcentaje)")

Interpretación breve.
Max_Pressure toma seis valores discretos (150, 300, 600, 900, 1500 y 2500) con frecuencias muy similares: 2500 (17.4%), 900 (17.2%), 150 (16.8%), 600 (16.3%), 1500 (16.2%) y 300 (16.1%). La distribución es prácticamente uniforme, por lo que ningún nivel de presión domina la muestra y las comparaciones entre niveles resultan razonablemente equilibradas.


2.2 Variables cuantitativas

2.2.1 Pérdida de espesor (Thickness_Loss)

## Thickness_Loss: grouped frequency table
## ---------------------------------------
# Working copy of the variable without missing values, and its size.
x <- Datos$Thickness_Loss
x <- x[!is.na(x)]
n <- length(x)

# Target number of classes (Sturges' rule).
k <- ceiling(1 + 3.322 * log10(n))

# Class limits: "pretty" round values covering the observed range.
# `breaks` is reused by the histogram that follows.
breaks <- pretty(range(x), n = k)

# Assign each observation to a right-closed interval; the first interval is
# also closed on the left so the minimum is not dropped.
clase <- cut(x, breaks = breaks, include.lowest = TRUE, right = TRUE)
tabla <- as.data.frame(table(clase))
names(tabla) <- c("Intervalo", "fi")

# Relative (hi) and cumulative absolute (Fi) frequencies.
tabla$hi <- round(tabla$fi / n, 4)
tabla$Fi <- cumsum(tabla$fi)

# Cumulative relative frequency (Hi), taken from the exact cumulative counts.
# Summing the already-rounded `hi` values would accumulate rounding error and
# could leave the last class slightly off 1.
tabla$Hi <- round(tabla$Fi / n, 4)

# Class mark (xi): midpoint of each interval.
tabla$xi <- (head(breaks, -1) + tail(breaks, -1)) / 2

# Final table with the columns in presentation order.
TDF_Thickness_Loss <- tabla[, c("Intervalo", "xi", "fi", "hi", "Fi", "Hi")]
knitr::kable(TDF_Thickness_Loss, caption = "Tabla de distribución de frecuencias — Pérdida de espesor (Thickness_Loss)")
Tabla de distribución de frecuencias — Pérdida de espesor (Thickness_Loss)
Intervalo xi fi hi Fi Hi
[0,1] 0.5 124 0.124 124 0.124
(1,2] 1.5 90 0.090 214 0.214
(2,3] 2.5 100 0.100 314 0.314
(3,4] 3.5 88 0.088 402 0.402
(4,5] 4.5 111 0.111 513 0.513
(5,6] 5.5 101 0.101 614 0.614
(6,7] 6.5 91 0.091 705 0.705
(7,8] 7.5 105 0.105 810 0.810
(8,9] 8.5 105 0.105 915 0.915
(9,10] 9.5 85 0.085 1000 1.000
## Histogram: Thickness_Loss (class counts printed above each bar)
# Uses the same `breaks` as the frequency table, and overlays a frequency
# polygon computed on those bins.

Histo_Thickness_loss <- ggplot(data = Datos, mapping = aes(x = Thickness_Loss)) +
  geom_histogram(breaks = breaks, fill = "orange", colour = "black") +
  geom_text(
    aes(label = after_stat(count)),
    stat = "bin",
    breaks = breaks,
    size = 3.2,
    vjust = -0.35
  ) +
  geom_freqpoly(breaks = breaks, colour = "black", linewidth = 1) +
  theme(
    plot.caption = element_text(face = "italic", hjust = 0.5),
    plot.title = element_text(face = "bold", hjust = 0.5, size = rel(1.15))
  ) +
  labs(
    caption = "Jim Acuña y Davis Piguave",
    y = "Frecuencia (n)",
    x = "Pérdida de espesor",
    title = "Histograma de pérdida de espesor (Thickness_Loss)"
  )

Histo_Thickness_loss

## Ogives: Thickness_Loss
## ----------------------
# Start from the frequency table and add the descending cumulative columns.
ojiva_TL <- TDF_Thickness_Loss

# Descending cumulative frequency (absolute): counts accumulated from the
# last class back to the first.
ojiva_TL$F_desc <- rev(cumsum(rev(ojiva_TL$fi)))

# Descending cumulative frequency (relative).
ojiva_TL$H_desc <- round(ojiva_TL$F_desc / sum(ojiva_TL$fi), 4)

# Absolute ogive: ascending curve (Fi) in blue, descending (F_desc) in red,
# both plotted against the class mark.
Ojiva_TL_abs <- ggplot(data = ojiva_TL, mapping = aes(x = xi)) +
  geom_line(aes(y = Fi), colour = "blue", linewidth = 1) +
  geom_point(aes(y = Fi), colour = "blue", size = 2.5) +
  geom_text(aes(y = Fi, label = Fi), colour = "blue", size = 3, vjust = -0.7) +
  geom_line(aes(y = F_desc), colour = "red", linewidth = 1) +
  geom_point(aes(y = F_desc), colour = "red", size = 2.5) +
  geom_text(aes(y = F_desc, label = F_desc), colour = "red", size = 3, vjust = 1.3) +
  theme(
    plot.caption = element_text(face = "italic", hjust = 0.5),
    plot.title = element_text(face = "bold", hjust = 0.5, size = rel(1.1))
  ) +
  labs(
    caption = "Jim Acuña y Davis Piguave",
    y = "Frecuencia acumulada (Ni)",
    x = "Marca de clase (xi)",
    title = "Ojiva absoluta — Pérdida de espesor (Thickness_Loss)"
  ) +
  # Make the y axis span from 0 to the total count.
  expand_limits(y = c(0, sum(ojiva_TL$fi)))

Ojiva_TL_abs

# Relative ogive: ascending curve (Hi) in green, descending (H_desc) in red,
# with point labels and y axis shown as whole percentages.
Ojiva_TL_rel <- ggplot(data = ojiva_TL, mapping = aes(x = xi)) +
  geom_line(aes(y = Hi), colour = "green3", linewidth = 1) +
  geom_point(aes(y = Hi), colour = "green3", size = 2.5) +
  geom_text(
    aes(y = Hi, label = paste0(round(Hi * 100, 0), "%")),
    colour = "green4", size = 3, vjust = -0.7
  ) +
  geom_line(aes(y = H_desc), colour = "red", linewidth = 1) +
  geom_point(aes(y = H_desc), colour = "red", size = 2.5) +
  geom_text(
    aes(y = H_desc, label = paste0(round(H_desc * 100, 0), "%")),
    colour = "red", size = 3, vjust = 1.3
  ) +
  theme(
    plot.caption = element_text(face = "italic", hjust = 0.5),
    plot.title = element_text(face = "bold", hjust = 0.5, size = rel(1.1))
  ) +
  labs(
    caption = "Jim Acuña y Davis Piguave",
    y = "Frecuencia acumulada (Hi)",
    x = "Marca de clase (xi)",
    title = "Ojiva relativa — Pérdida de espesor (Thickness_Loss)"
  ) +
  scale_y_continuous(labels = function(prop) paste0(round(prop * 100, 0), "%")) +
  # Make the y axis span the full 0-1 range.
  expand_limits(y = c(0, 1))

Ojiva_TL_rel

## Box-and-whisker plot (with labels): Thickness_Loss
## --------------------------------------------------

# Quartiles used only for the text labels; the boxplot computes its own.
q1 <- quantile(Datos$Thickness_Loss, probs = 0.25, na.rm = TRUE)
med <- median(Datos$Thickness_Loss, na.rm = TRUE)
q3 <- quantile(Datos$Thickness_Loss, probs = 0.75, na.rm = TRUE)

# Horizontal boxplot on a single empty category. The error-bar layer adds
# caps to the whisker ends.
Caja_Thickness_loss <- ggplot(data = Datos, mapping = aes(x = Thickness_Loss, y = "")) +
  stat_boxplot(geom = "errorbar", width = 0.3) +
  geom_boxplot(
    fill = "pink", colour = "black",
    outlier.shape = 1, outlier.colour = "black"
  ) +
  annotate("text", label = paste0("Q1=", round(q1, 3)), x = q1, y = 1.12, size = 3.2) +
  annotate("text", label = paste0("Mediana=", round(med, 3)), x = med, y = 1.18, size = 3.2, fontface = "bold") +
  annotate("text", label = paste0("Q3=", round(q3, 3)), x = q3, y = 1.12, size = 3.2) +
  theme(
    plot.caption = element_text(face = "italic", hjust = 0.5),
    plot.title = element_text(face = "bold", hjust = 0.5, size = rel(1.1))
  ) +
  labs(
    caption = "Jim Acuña y Davis Piguave",
    y = "",
    x = "Pérdida de espesor",
    title = "Diagrama de caja y bigotes — Pérdida de espesor (Thickness_Loss)"
  )

Caja_Thickness_loss

Interpretación breve.
Thickness_Loss se distribuye en todo su rango (0.01 a 9.99) con tendencia casi simétrica: media ≈ 4.89 y mediana ≈ 4.92. En el boxplot se esperaría poca presencia de outliers, lo que sugiere una variación amplia pero controlada dentro del rango típico del dataset.

2.2.2 Pérdida de material normalizada (Material_Loss_norm)

## Material_Loss_norm: grouped frequency table
## -------------------------------------------

# Working copy of the variable without missing values, and its size.
x <- Datos$Material_Loss_norm
x <- x[!is.na(x)]
n <- length(x)

# Class limits: ten equal-width classes from 0 to 1.
# `breaks` and `x` are reused by the histogram that follows.
breaks <- seq(0, 1, by = 0.1)

# Assign each observation to a right-closed interval; the first interval is
# also closed on the left so values equal to 0 are not dropped.
clase <- cut(x, breaks = breaks, include.lowest = TRUE, right = TRUE)
tabla <- as.data.frame(table(clase))
names(tabla) <- c("Intervalo", "fi")

# Relative (hi) and cumulative absolute (Fi) frequencies.
tabla$hi <- round(tabla$fi / n, 4)
tabla$Fi <- cumsum(tabla$fi)

# Cumulative relative frequency (Hi), taken from the exact cumulative counts.
# Summing the already-rounded `hi` values would accumulate rounding error and
# could leave the last class slightly off 1.
tabla$Hi <- round(tabla$Fi / n, 4)

# Class mark (xi): midpoint of each interval.
tabla$xi <- (head(breaks, -1) + tail(breaks, -1)) / 2

# Final table with the columns in presentation order.
TDF_Material_Loss_norm <- tabla[, c("Intervalo", "xi", "fi", "hi", "Fi", "Hi")]
knitr::kable(TDF_Material_Loss_norm, caption = "Tabla de distribución de frecuencias — Pérdida de material normalizada (Material_Loss_norm)")
Tabla de distribución de frecuencias — Pérdida de material normalizada (Material_Loss_norm)
Intervalo xi fi hi Fi Hi
[0,0.1] 0.05 503 0.503 503 0.503
(0.1,0.2] 0.15 264 0.264 767 0.767
(0.2,0.3] 0.25 101 0.101 868 0.868
(0.3,0.4] 0.35 61 0.061 929 0.929
(0.4,0.5] 0.45 32 0.032 961 0.961
(0.5,0.6] 0.55 21 0.021 982 0.982
(0.6,0.7] 0.65 10 0.010 992 0.992
(0.7,0.8] 0.75 5 0.005 997 0.997
(0.8,0.9] 0.85 0 0.000 997 0.997
(0.9,1] 0.95 3 0.003 1000 1.000
# Histogram: Material_Loss_norm (class counts printed above each bar).
# Built from the NA-free vector `x` and the same `breaks` as the frequency
# table, with a frequency polygon overlaid on those bins.
Histo_Material_Loss_norm <- ggplot(
  data = data.frame(Material_Loss_norm = x),
  mapping = aes(x = Material_Loss_norm)
) +
  geom_histogram(breaks = breaks, fill = "red", colour = "black") +
  geom_text(
    aes(label = after_stat(count)),
    stat = "bin",
    breaks = breaks,
    size = 3.2,
    vjust = -0.35
  ) +
  geom_freqpoly(breaks = breaks, colour = "black", linewidth = 1) +
  theme(
    plot.caption = element_text(face = "italic", hjust = 0.5),
    plot.title = element_text(face = "bold", hjust = 0.5, size = rel(1.15))
  ) +
  labs(
    caption = "Jim Acuña y Davis Piguave",
    y = "Frecuencia (n)",
    x = "Pérdida de material",
    title = "Histograma de pérdida de material normalizada (Material_Loss_norm)"
  )

Histo_Material_Loss_norm

# Ogives: Material_Loss_norm.
# Start from the frequency table and add the descending cumulative
# frequencies (absolute and relative), accumulated from the last class back.
ojiva_MLN <- TDF_Material_Loss_norm
ojiva_MLN$F_desc <- rev(cumsum(rev(ojiva_MLN$fi)))
ojiva_MLN$H_desc <- round(ojiva_MLN$F_desc / sum(ojiva_MLN$fi), 4)

# Absolute ogive: ascending curve (Fi) in blue, descending (F_desc) in red,
# both plotted against the class mark.
Ojiva_MLN_abs <- ggplot(data = ojiva_MLN, mapping = aes(x = xi)) +
  geom_line(aes(y = Fi), colour = "blue", linewidth = 1) +
  geom_point(aes(y = Fi), colour = "blue", size = 2.5) +
  geom_text(aes(y = Fi, label = Fi), colour = "blue", size = 3, vjust = -0.7) +
  geom_line(aes(y = F_desc), colour = "red", linewidth = 1) +
  geom_point(aes(y = F_desc), colour = "red", size = 2.5) +
  geom_text(aes(y = F_desc, label = F_desc), colour = "red", size = 3, vjust = 1.3) +
  theme(
    plot.caption = element_text(face = "italic", hjust = 0.5),
    plot.title = element_text(face = "bold", hjust = 0.5, size = rel(1.1))
  ) +
  labs(
    caption = "Jim Acuña y Davis Piguave",
    y = "Frecuencia acumulada (Ni)",
    x = "Marca de clase (xi)",
    title = "Ojiva absoluta — Pérdida de material normalizada (Material_Loss_norm)"
  ) +
  # Make the y axis span from 0 to the total count.
  expand_limits(y = c(0, sum(ojiva_MLN$fi)))

Ojiva_MLN_abs

# Relative ogive: ascending curve (Hi) in green, descending (H_desc) in red,
# with point labels and y axis shown as whole percentages.
Ojiva_MLN_rel <- ggplot(data = ojiva_MLN, mapping = aes(x = xi)) +
  geom_line(aes(y = Hi), colour = "green3", linewidth = 1) +
  geom_point(aes(y = Hi), colour = "green3", size = 2.5) +
  geom_text(
    aes(y = Hi, label = paste0(round(Hi * 100, 0), "%")),
    colour = "green4", size = 3, vjust = -0.7
  ) +
  geom_line(aes(y = H_desc), colour = "red", linewidth = 1) +
  geom_point(aes(y = H_desc), colour = "red", size = 2.5) +
  geom_text(
    aes(y = H_desc, label = paste0(round(H_desc * 100, 0), "%")),
    colour = "red", size = 3, vjust = 1.3
  ) +
  theme(
    plot.caption = element_text(face = "italic", hjust = 0.5),
    plot.title = element_text(face = "bold", hjust = 0.5, size = rel(1.1))
  ) +
  labs(
    caption = "Jim Acuña y Davis Piguave",
    y = "Frecuencia acumulada (Hi)",
    x = "Marca de clase (xi)",
    title = "Ojiva relativa — Pérdida de material normalizada (Material_Loss_norm)"
  ) +
  scale_y_continuous(labels = function(prop) paste0(round(prop * 100, 0), "%")) +
  # Make the y axis span the full 0-1 range.
  expand_limits(y = c(0, 1))

Ojiva_MLN_rel

# Box-and-whisker plot (with labels): Material_Loss_norm.
# Quartiles used only for the text labels; the boxplot computes its own.
q1 <- quantile(Datos$Material_Loss_norm, probs = 0.25, na.rm = TRUE)
med <- median(Datos$Material_Loss_norm, na.rm = TRUE)
q3 <- quantile(Datos$Material_Loss_norm, probs = 0.75, na.rm = TRUE)

# Horizontal boxplot on a single empty category. The error-bar layer adds
# caps to the whisker ends.
Caja_Material_Loss_norm <- ggplot(data = Datos, mapping = aes(x = Material_Loss_norm, y = "")) +
  stat_boxplot(geom = "errorbar", width = 0.3) +
  geom_boxplot(
    fill = "pink", colour = "black",
    outlier.shape = 1, outlier.colour = "black"
  ) +
  annotate("text", label = paste0("Q1=", round(q1, 3)), x = q1, y = 1.12, size = 3.2) +
  annotate("text", label = paste0("Mediana=", round(med, 3)), x = med, y = 1.18, size = 3.2, fontface = "bold") +
  annotate("text", label = paste0("Q3=", round(q3, 3)), x = q3, y = 1.12, size = 3.2) +
  theme(
    plot.caption = element_text(face = "italic", hjust = 0.5),
    plot.title = element_text(face = "bold", hjust = 0.5, size = rel(1.1))
  ) +
  labs(
    caption = "Jim Acuña y Davis Piguave",
    y = "",
    x = "Pérdida de material",
    title = "Diagrama de caja y bigotes — Pérdida de material normalizada (Material_Loss_norm)"
  )

Caja_Material_Loss_norm

Interpretación breve.
La normalización conserva la forma de la distribución original, pero en escala 0–1, lo que facilita la comparación visual y la lectura rápida. La variable Material_Loss es la que muestra el patrón estadístico más “crítico”: presenta un sesgo fuerte a la derecha y una cola pesada (curtosis alta), lo cual se evidencia en la diferencia entre la media (46.75) y la mediana (31.66). Se identifican outliers altos, que indican un subconjunto de casos severos que inflan la media. Por lo tanto, la interpretación descriptiva de esta variable debe basarse en la mediana y los percentiles, porque el promedio queda fuertemente afectado por las pérdidas extremas. La versión normalizada, Material_Loss_norm, conserva exactamente la forma estadística de Material_Loss (misma asimetría y misma cola), pero en escala 0–1, lo cual facilita la comparación y la interpretación visual.

2.2.3 Impacto de corrosión (Corrosion_Impact)

## Corrosion_Impact: grouped frequency table
## -----------------------------------------
# Working copy of the variable without missing values, and its size.
x <- Datos$Corrosion_Impact
x <- x[!is.na(x)]
n <- length(x)

# Target number of classes (Sturges' rule).
k <- ceiling(1 + 3.322 * log10(n))

# Class limits: "pretty" round values covering the observed range.
# `breaks` is reused by the histogram that follows.
breaks <- pretty(range(x), n = k)

# Assign each observation to a right-closed interval; the first interval is
# also closed on the left so the minimum is not dropped.
clase <- cut(x, breaks = breaks, include.lowest = TRUE, right = TRUE)
tabla <- as.data.frame(table(clase))
names(tabla) <- c("Intervalo", "fi")

# Relative (hi) and cumulative absolute (Fi) frequencies.
tabla$hi <- round(tabla$fi / n, 4)
tabla$Fi <- cumsum(tabla$fi)

# Cumulative relative frequency (Hi), taken from the exact cumulative counts.
# Summing the already-rounded `hi` values would accumulate rounding error and
# could leave the last class slightly off 1.
tabla$Hi <- round(tabla$Fi / n, 4)

# Class mark (xi): midpoint of each interval.
tabla$xi <- (head(breaks, -1) + tail(breaks, -1)) / 2

# Final table with the columns in presentation order.
TDF_Corrosion_Impact <- tabla[, c("Intervalo", "xi", "fi", "hi", "Fi", "Hi")]
knitr::kable(TDF_Corrosion_Impact, caption = "Tabla de distribución de frecuencias — Impacto de corrosión (Corrosion_Impact)")
Tabla de distribución de frecuencias — Impacto de corrosión (Corrosion_Impact)
Intervalo xi fi hi Fi Hi
[0,2] 1 102 0.102 102 0.102
(2,4] 3 124 0.124 226 0.226
(4,6] 5 99 0.099 325 0.325
(6,8] 7 87 0.087 412 0.412
(8,10] 9 105 0.105 517 0.517
(10,12] 11 92 0.092 609 0.609
(12,14] 13 105 0.105 714 0.714
(14,16] 15 90 0.090 804 0.804
(16,18] 17 99 0.099 903 0.903
(18,20] 19 97 0.097 1000 1.000
# Histogram: Corrosion_Impact (class counts printed above each bar).
# Uses the same `breaks` as the frequency table, and overlays a frequency
# polygon computed on those bins.
Histo_Corrosion_Impact <- ggplot(data = Datos, mapping = aes(x = Corrosion_Impact)) +
  geom_histogram(breaks = breaks, fill = "purple", colour = "black") +
  geom_text(
    aes(label = after_stat(count)),
    stat = "bin",
    breaks = breaks,
    size = 3.2,
    vjust = -0.35
  ) +
  geom_freqpoly(breaks = breaks, colour = "black", linewidth = 1) +
  theme(
    plot.caption = element_text(face = "italic", hjust = 0.5),
    plot.title = element_text(face = "bold", hjust = 0.5, size = rel(1.15))
  ) +
  labs(
    caption = "Jim Acuña y Davis Piguave",
    y = "Frecuencia (n)",
    x = "Impacto de corrosión",
    title = "Histograma de impacto de corrosión (Corrosion_Impact)"
  )

Histo_Corrosion_Impact

# Ogives: Corrosion_Impact.
# Start from the frequency table and add the descending cumulative
# frequencies (absolute and relative), accumulated from the last class back.
ojiva_CI <- TDF_Corrosion_Impact
ojiva_CI$F_desc <- rev(cumsum(rev(ojiva_CI$fi)))
ojiva_CI$H_desc <- round(ojiva_CI$F_desc / sum(ojiva_CI$fi), 4)

# Absolute ogive: ascending curve (Fi) in blue, descending (F_desc) in red,
# both plotted against the class mark.
Ojiva_CI_abs <- ggplot(data = ojiva_CI, mapping = aes(x = xi)) +
  geom_line(aes(y = Fi), colour = "blue", linewidth = 1) +
  geom_point(aes(y = Fi), colour = "blue", size = 2.5) +
  geom_text(aes(y = Fi, label = Fi), colour = "blue", size = 3, vjust = -0.7) +
  geom_line(aes(y = F_desc), colour = "red", linewidth = 1) +
  geom_point(aes(y = F_desc), colour = "red", size = 2.5) +
  geom_text(aes(y = F_desc, label = F_desc), colour = "red", size = 3, vjust = 1.3) +
  theme(
    plot.caption = element_text(face = "italic", hjust = 0.5),
    plot.title = element_text(face = "bold", hjust = 0.5, size = rel(1.1))
  ) +
  labs(
    caption = "Jim Acuña y Davis Piguave",
    y = "Frecuencia acumulada (Ni)",
    x = "Marca de clase (xi)",
    title = "Ojiva absoluta — Impacto de corrosión (Corrosion_Impact)"
  ) +
  # Make the y axis span from 0 to the total count.
  expand_limits(y = c(0, sum(ojiva_CI$fi)))

Ojiva_CI_abs

# Relative ogive: ascending curve (Hi) in green, descending (H_desc) in red,
# with point labels and y axis shown as whole percentages.
Ojiva_CI_rel <- ggplot(data = ojiva_CI, mapping = aes(x = xi)) +
  geom_line(aes(y = Hi), colour = "green3", linewidth = 1) +
  geom_point(aes(y = Hi), colour = "green3", size = 2.5) +
  geom_text(
    aes(y = Hi, label = paste0(round(Hi * 100, 0), "%")),
    colour = "green4", size = 3, vjust = -0.7
  ) +
  geom_line(aes(y = H_desc), colour = "red", linewidth = 1) +
  geom_point(aes(y = H_desc), colour = "red", size = 2.5) +
  geom_text(
    aes(y = H_desc, label = paste0(round(H_desc * 100, 0), "%")),
    colour = "red", size = 3, vjust = 1.3
  ) +
  theme(
    plot.caption = element_text(face = "italic", hjust = 0.5),
    plot.title = element_text(face = "bold", hjust = 0.5, size = rel(1.1))
  ) +
  labs(
    caption = "Jim Acuña y Davis Piguave",
    y = "Frecuencia acumulada (Hi)",
    x = "Marca de clase (xi)",
    title = "Ojiva relativa — Impacto de corrosión (Corrosion_Impact)"
  ) +
  scale_y_continuous(labels = function(prop) paste0(round(prop * 100, 0), "%")) +
  # Make the y axis span the full 0-1 range.
  expand_limits(y = c(0, 1))

Ojiva_CI_rel

# Box-and-whisker plot (with labels): Corrosion_Impact.
# Quartiles used only for the text labels; the boxplot computes its own.
q1 <- quantile(Datos$Corrosion_Impact, probs = 0.25, na.rm = TRUE)
med <- median(Datos$Corrosion_Impact, na.rm = TRUE)
q3 <- quantile(Datos$Corrosion_Impact, probs = 0.75, na.rm = TRUE)

# Horizontal boxplot on a single empty category. The error-bar layer adds
# caps to the whisker ends.
Caja_Corrosion_Impact <- ggplot(data = Datos, mapping = aes(x = Corrosion_Impact, y = "")) +
  stat_boxplot(geom = "errorbar", width = 0.3) +
  geom_boxplot(
    fill = "pink", colour = "black",
    outlier.shape = 1, outlier.colour = "black"
  ) +
  annotate("text", label = paste0("Q1=", round(q1, 3)), x = q1, y = 1.12, size = 3.2) +
  annotate("text", label = paste0("Mediana=", round(med, 3)), x = med, y = 1.18, size = 3.2, fontface = "bold") +
  annotate("text", label = paste0("Q3=", round(q3, 3)), x = q3, y = 1.12, size = 3.2) +
  theme(
    plot.caption = element_text(face = "italic", hjust = 0.5),
    plot.title = element_text(face = "bold", hjust = 0.5, size = rel(1.1))
  ) +
  labs(
    caption = "Jim Acuña y Davis Piguave",
    y = "",
    x = "Impacto de corrosión",
    title = "Diagrama de caja y bigotes — Impacto de corrosión (Corrosion_Impact)"
  )

Caja_Corrosion_Impact

Interpretación breve.
Corrosion_Impact presenta un comportamiento prácticamente simétrico: media 9.75 y mediana 9.72, no aparecen outliers. Esto sugiere que el impacto de corrosión está distribuido de forma extendida a lo largo de su dominio, sin valores atípicos que distorsionen el resumen central.