1 Carga de librerías y datos

##============================================================== ###
##                UNIVERSIDAD CENTRAL DEL ECUADOR                ###
##Facultad de Ingeniería en Geología,Minas, Petróleos y Ambiental ##
##===============================================================###
##Materia:Estadistica ======== Docente:Ing.Christian Mejía .E MSc.##
##===============================================================###
##   Utilización del Programa R-Studio en Estadistica             ##
###=============================================================####
##Expuesto- Proyecto Semestral                                    ##          
##===============================================================###
#Tema: Proyecto de Estadística en R                               ##
##===============================================================###
#Elaborado por: Jim Acuña y Davis Piguave                         ##
##===============================================================###

########################
###### Librerias #######
########################

library(dplyr)
library(ggplot2)
library(ggfortify)
library(tidyverse)
library(fdth)
library(lattice)
library(MASS)
library(PASWR)
library(magick)
library(readxl)
library(plotly)
library(psych)
library(car)
library(ggpmisc)
library(scatterplot3d)
library(corrplot)
library(GGally)
library(RSNNS)
library(broom)
#############################
#### Directorio de trabajo ##
#############################

setwd("D:/Personal/Escritorio/ESTADISTICA JIM/Kaggle")
Datos<-read.csv("market_pipe_thickness_loss_dataset_LIMPIO.csv", header = T, sep=";",dec =".", fileEncoding = "ISO-8859-1")
Datos <- na.omit(Datos)
str(Datos)
## 'data.frame':    1000 obs. of  11 variables:
##  $ Pipe_Size       : int  800 800 400 1500 1500 600 200 300 150 800 ...
##  $ Thickness       : num  15.5 22 12.1 38.7 24.3 ...
##  $ Material        : chr  "Carbon Steel" "PVC" "Carbon Steel" "Carbon Steel" ...
##  $ Grade           : chr  "ASTM A333 Grade 6" "ASTM A106 Grade B" "API 5L X52" "API 5L X42" ...
##  $ Max_Pressure    : int  300 150 2500 1500 1500 600 1500 900 300 150 ...
##  $ Temperature     : num  84.9 14.1 0.6 52.7 11.7 67.3 89.6 40.8 3.2 11.6 ...
##  $ Corrosion_Impact: num  16.04 7.38 2.12 5.58 12.29 ...
##  $ Thickness_Loss  : num  4.91 7.32 6.32 6.2 8.58 5.21 5.86 3.02 2.47 0.53 ...
##  $ Material_Loss   : num  31.7 33.3 52.5 16 35.3 ...
##  $ Time            : int  2 4 7 19 20 11 6 21 19 1 ...
##  $ Condition       : chr  "Moderate" "Critical" "Critical" "Critical" ...
names(Datos)
##  [1] "Pipe_Size"        "Thickness"        "Material"         "Grade"           
##  [5] "Max_Pressure"     "Temperature"      "Corrosion_Impact" "Thickness_Loss"  
##  [9] "Material_Loss"    "Time"             "Condition"
#View(Datos)
#Normalizacion de Variable Material Loss#
normalizacion<-normalizeData(Datos$Material_Loss,type ="0_1")
Datos$Material_Loss_norm<-as.data.frame(normalizacion)
summary(Datos$Material_Loss)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.08   15.66   31.66   46.75   61.03  318.75
summary(Datos$Material_Loss_norm)
##        V1         
##  Min.   :0.00000  
##  1st Qu.:0.04891  
##  Median :0.09910  
##  Mean   :0.14644  
##  3rd Qu.:0.19128  
##  Max.   :1.00000
#view(Datos)

######################################################################
# =========================================================
# ETAPA 3: MODELOS DE PROBABILIDAD
# =========================================================

#####################################
"ETAPA 3: Modelos de Probabilidad"
## [1] "ETAPA 3: Modelos de Probabilidad"
#####################################

# =========================================================
# 0) Preparación de variables
# =========================================================

# Variable A: Thickness_Loss  
xA <- Datos$Thickness_Loss
xA <- xA[is.finite(xA)]

# Variable B: Material_Loss   
xB <- Datos$Material_Loss
xB <- xB[is.finite(xB)]
xB <- xB[xB > 0]  # necesario para lognormal/gamma/exp


# =========================================================
# 1) ANÁLISIS GRÁFICO
# =========================================================

#####################################
"1) Análisis gráfico - Histogramas"
## [1] "1) Análisis gráfico - Histogramas"
#####################################

ggplot(data.frame(xA), aes(x = xA)) +
  geom_histogram(bins = 30, fill = "grey80", color = "grey30") +
  labs(title = "Histograma: Thickness_Loss", x = "Thickness_Loss", y = "Frecuencia")

ggplot(data.frame(xB), aes(x = xB)) +
  geom_histogram(bins = 30, fill = "grey80", color = "grey30") +
  labs(title = "Histograma: Material_Loss", x = "Material_Loss", y = "Frecuencia")

# Extra visual: en log suele verse “más normal” si es lognormal
ggplot(data.frame(xBlog = log(xB)), aes(x = xBlog)) +
  geom_histogram(bins = 30, fill = "grey80", color = "grey30") +
  labs(title = "Histograma: log(Material_Loss)", x = "log(Material_Loss)", y = "Frecuencia")

# =========================================================
# 2) CONJETURAR EL MODELO
# =========================================================

# Conjetura A (Thickness_Loss)
aA <- min(xA); bA <- max(xA)
aA; bA
## [1] 0.01
## [1] 9.99
# Conjetura B (Material_Loss)
meanB <- mean(xB); varB <- var(xB); sdB <- sd(xB)
meanB; varB; sdB
## [1] 46.74756
## [1] 2171.798
## [1] 46.60255
# =========================================================
# 3) PRUEBAS DE BONDAD Y AJUSTE (Pearson X^2 y Chi-cuadrado)
# =========================================================

# ---------------------------------------------------------
# 3A) Thickness_Loss ~ Uniforme(a,b)
# ---------------------------------------------------------

#####################################
"3A) GOF Thickness_Loss ~ Uniforme(a,b)"
## [1] "3A) GOF Thickness_Loss ~ Uniforme(a,b)"
#####################################

# (i) Frecuencias observadas con hist()
kA <- 20  # número de intervalos
hA <- hist(xA, breaks = kA, plot = FALSE)

O_A <- hA$counts
brA <- hA$breaks
a_intA <- brA[-length(brA)]
b_intA <- brA[-1]
nA <- length(xA)

# (ii) Probabilidades esperadas por intervalo bajo Uniforme(aA,bA)
P_A <- punif(b_intA, min = aA, max = bA) - punif(a_intA, min = aA, max = bA)
P_A[P_A < 0] <- 0
P_A <- P_A / sum(P_A)

# (iii) Frecuencias esperadas
E_A <- nA * P_A

# Revisión rápida de condición E>=5 (si no, reduce kA)
min(E_A)
## [1] 49.0982
# (iv) Pearson X^2
X2_A <- sum((O_A - E_A)^2 / E_A)

# df ajustado por parámetros estimados (Uniforme tiene 2: a y b)
dfA_adj <- length(O_A) - 1 - 2
pA_adj  <- pchisq(X2_A, df = dfA_adj, lower.tail = FALSE)

X2_A
## [1] 28.46914
dfA_adj
## [1] 17
pA_adj
## [1] 0.03974618
# (v) Chi-cuadrado con chisq.test
chiA <- suppressWarnings(chisq.test(O_A, p = P_A, rescale.p = TRUE))
chiA
## 
##  Chi-squared test for given probabilities
## 
## data:  O_A
## X-squared = 28.469, df = 19, p-value = 0.07481
# (vi) Visual: histograma + curva uniforme
ggplot(data.frame(xA), aes(x = xA)) +
  geom_histogram(aes(y = after_stat(density)), bins = 30, fill = "grey80", color = "grey30") +
  stat_function(fun = dunif, args = list(min = aA, max = bA), linewidth = 1) +
  labs(title = "Thickness_Loss: Ajuste Uniforme(a,b)", x = "Thickness_Loss", y = "Densidad")

# ---------------------------------------------------------
# 3B) Material_Loss: comparar Normal vs Lognormal vs Gamma vs Exponencial
# ---------------------------------------------------------

kB <- 20
hB <- hist(xB, breaks = kB, plot = FALSE)

O_B <- hB$counts
brB <- hB$breaks
a_intB <- brB[-length(brB)]
b_intB <- brB[-1]
nB <- length(xB)

# ---------- Modelo 1: Normal(mu, sd) ----------
muN <- mean(xB)
sdN <- sd(xB)

P_N <- pnorm(b_intB, mean = muN, sd = sdN) - pnorm(a_intB, mean = muN, sd = sdN)
P_N[P_N < 0] <- 0
P_N <- P_N / sum(P_N)
E_N <- nB * P_N

X2_N <- sum((O_B - E_N)^2 / E_N)
dfN_adj <- length(O_B) - 1 - 2
pN_adj  <- pchisq(X2_N, df = dfN_adj, lower.tail = FALSE)
chiN <- suppressWarnings(chisq.test(O_B, p = P_N, rescale.p = TRUE))

# ---------- Modelo 2: Lognormal(meanlog, sdlog) ----------
meanlogL <- mean(log(xB))
sdlogL   <- sd(log(xB))

P_LN <- plnorm(b_intB, meanlog = meanlogL, sdlog = sdlogL) - plnorm(a_intB, meanlog = meanlogL, sdlog = sdlogL)
P_LN[P_LN < 0] <- 0
P_LN <- P_LN / sum(P_LN)
E_LN <- nB * P_LN

X2_LN <- sum((O_B - E_LN)^2 / E_LN)
dfLN_adj <- length(O_B) - 1 - 2
pLN_adj  <- pchisq(X2_LN, df = dfLN_adj, lower.tail = FALSE)
chiLN <- suppressWarnings(chisq.test(O_B, p = P_LN, rescale.p = TRUE))

# ---------- Modelo 3: Gamma  ----------
# mean = shape/rate; var = shape/rate^2  => shape=(mean^2)/var ; rate=mean/var
shapeG <- (mean(xB)^2) / var(xB)
rateG  <- mean(xB) / var(xB)

P_G <- pgamma(b_intB, shape = shapeG, rate = rateG) - pgamma(a_intB, shape = shapeG, rate = rateG)
P_G[P_G < 0] <- 0
P_G <- P_G / sum(P_G)
E_G <- nB * P_G

X2_G <- sum((O_B - E_G)^2 / E_G)
dfG_adj <- length(O_B) - 1 - 2
pG_adj  <- pchisq(X2_G, df = dfG_adj, lower.tail = FALSE)
chiG <- suppressWarnings(chisq.test(O_B, p = P_G, rescale.p = TRUE))

# ---------- Modelo 4: Exponencial ----------
rateE <- 1 / mean(xB)

P_E <- pexp(b_intB, rate = rateE) - pexp(a_intB, rate = rateE)
P_E[P_E < 0] <- 0
P_E <- P_E / sum(P_E)
E_E <- nB * P_E

X2_E <- sum((O_B - E_E)^2 / E_E)
dfE_adj <- length(O_B) - 1 - 1
pE_adj  <- pchisq(X2_E, df = dfE_adj, lower.tail = FALSE)
chiE <- suppressWarnings(chisq.test(O_B, p = P_E, rescale.p = TRUE))

# ---------- Tabla comparativa ----------
tabla_B <- data.frame(
  Modelo = c("Normal", "Lognormal", "Gamma", "Exponencial"),
  X2_Pearson = c(X2_N, X2_LN, X2_G, X2_E),
  df_adj = c(dfN_adj, dfLN_adj, dfG_adj, dfE_adj),
  p_adj = c(pN_adj, pLN_adj, pG_adj, pE_adj),
  p_chisq_R = c(chiN$p.value, chiLN$p.value, chiG$p.value, chiE$p.value)
)
tabla_B <- tabla_B[order(-tabla_B$p_adj), ]
tabla_B
# Visual del mejor modelo
mejor_modelo_B <- as.character(tabla_B$Modelo[1])
mejor_modelo_B
## [1] "Exponencial"
ggplot(data.frame(xB), aes(x = xB)) +
  geom_histogram(aes(y = after_stat(density)), bins = 30, fill = "grey80", color = "grey30") +
  labs(title = paste("Material_Loss + curva teórica:", mejor_modelo_B),
       x = "Material_Loss", y = "Densidad")

# Agregar curva según el ganador
if(mejor_modelo_B == "Normal"){
  ggplot(data.frame(xB), aes(x = xB)) +
    geom_histogram(aes(y = after_stat(density)), bins = 30, fill = "grey80", color = "grey30") +
    stat_function(fun = dnorm, args = list(mean = muN, sd = sdN), linewidth = 1) +
    labs(title = "Material_Loss: Ajuste Normal", x = "Material_Loss", y = "Densidad")
}
if(mejor_modelo_B == "Lognormal"){
  ggplot(data.frame(xB), aes(x = xB)) +
    geom_histogram(aes(y = after_stat(density)), bins = 30, fill = "grey80", color = "grey30") +
    stat_function(fun = dlnorm, args = list(meanlog = meanlogL, sdlog = sdlogL), linewidth = 1) +
    labs(title = "Material_Loss: Ajuste Lognormal", x = "Material_Loss", y = "Densidad")
}
if(mejor_modelo_B == "Gamma"){
  ggplot(data.frame(xB), aes(x = xB)) +
    geom_histogram(aes(y = after_stat(density)), bins = 30, fill = "grey80", color = "grey30") +
    stat_function(fun = dgamma, args = list(shape = shapeG, rate = rateG), linewidth = 1) +
    labs(title = "Material_Loss: Ajuste Gamma", x = "Material_Loss", y = "Densidad")
}
if(mejor_modelo_B == "Exponencial"){
  ggplot(data.frame(xB), aes(x = xB)) +
    geom_histogram(aes(y = after_stat(density)), bins = 30, fill = "grey80", color = "grey30") +
    stat_function(fun = dexp, args = list(rate = rateE), linewidth = 1) +
    labs(title = "Material_Loss: Ajuste Exponencial", x = "Material_Loss", y = "Densidad")
}

# =========================================================
# 4) OPTIMIZACIÓN Y AJUSTE DE ESCALA
# =========================================================

#####################################
"4) Optimización: outliers + bins + escala"
## [1] "4) Optimización: outliers + bins + escala"
#####################################

# 4.1 Filtrado de outliers por IQR (Material_Loss)
Q1 <- quantile(xB, 0.25)
Q3 <- quantile(xB, 0.75)
IQRv <- Q3 - Q1
LI <- Q1 - 1.5*IQRv
LS <- Q3 + 1.5*IQRv

xB_f <- xB[xB >= LI & xB <= LS]
length(xB); length(xB_f)
## [1] 1000
## [1] 930
# 4.2 Reducir intervalos (para aumentar E>=5)
kB2 <- 10
hB2 <- hist(xB_f, breaks = kB2, plot = FALSE)

O_B2 <- hB2$counts
brB2 <- hB2$breaks
a2 <- brB2[-length(brB2)]
b2 <- brB2[-1]
nB2 <- length(xB_f)

# Re-estimar parámetros
muN2 <- mean(xB_f); sdN2 <- sd(xB_f)
meanlogL2 <- mean(log(xB_f)); sdlogL2 <- sd(log(xB_f))
shapeG2 <- (mean(xB_f)^2) / var(xB_f)
rateG2  <- mean(xB_f) / var(xB_f)
rateE2 <- 1 / mean(xB_f)

# =========================================================
# 5) REEVALUACIÓN (bondad y ajuste tras optimización)
# =========================================================

# Normal (filtrado)
P_N2 <- pnorm(b2, mean = muN2, sd = sdN2) - pnorm(a2, mean = muN2, sd = sdN2)
P_N2[P_N2 < 0] <- 0; P_N2 <- P_N2 / sum(P_N2)
E_N2 <- nB2 * P_N2
X2_N2 <- sum((O_B2 - E_N2)^2 / E_N2)
dfN2 <- length(O_B2) - 1 - 2
pN2  <- pchisq(X2_N2, df = dfN2, lower.tail = FALSE)

# Lognormal (filtrado)
P_LN2 <- plnorm(b2, meanlog = meanlogL2, sdlog = sdlogL2) - plnorm(a2, meanlog = meanlogL2, sdlog = sdlogL2)
P_LN2[P_LN2 < 0] <- 0; P_LN2 <- P_LN2 / sum(P_LN2)
E_LN2 <- nB2 * P_LN2
X2_LN2 <- sum((O_B2 - E_LN2)^2 / E_LN2)
dfLN2 <- length(O_B2) - 1 - 2
pLN2  <- pchisq(X2_LN2, df = dfLN2, lower.tail = FALSE)

# Gamma (filtrado)
P_G2 <- pgamma(b2, shape = shapeG2, rate = rateG2) - pgamma(a2, shape = shapeG2, rate = rateG2)
P_G2[P_G2 < 0] <- 0; P_G2 <- P_G2 / sum(P_G2)
E_G2 <- nB2 * P_G2
X2_G2 <- sum((O_B2 - E_G2)^2 / E_G2)
dfG2 <- length(O_B2) - 1 - 2
pG2  <- pchisq(X2_G2, df = dfG2, lower.tail = FALSE)

# Exponencial (filtrado)
P_E2 <- pexp(b2, rate = rateE2) - pexp(a2, rate = rateE2)
P_E2[P_E2 < 0] <- 0; P_E2 <- P_E2 / sum(P_E2)
E_E2 <- nB2 * P_E2
X2_E2 <- sum((O_B2 - E_E2)^2 / E_E2)
dfE2 <- length(O_B2) - 1 - 1
pE2  <- pchisq(X2_E2, df = dfE2, lower.tail = FALSE)

tabla_B2 <- data.frame(
  Modelo = c("Normal", "Lognormal", "Gamma", "Exponencial"),
  X2_Pearson = c(X2_N2, X2_LN2, X2_G2, X2_E2),
  df_adj = c(dfN2, dfLN2, dfG2, dfE2),
  p_adj = c(pN2, pLN2, pG2, pE2)
)
tabla_B2 <- tabla_B2[order(-tabla_B2$p_adj), ]
tabla_B2
mejor_B2 <- as.character(tabla_B2$Modelo[1])
mejor_B2
## [1] "Exponencial"
# =========================================================
# 6) CÁLCULO DE PROBABILIDADES
# =========================================================

# --- Variable A (Uniforme): Thickness_Loss ---
# Pregunta A1: P(Thickness_Loss > 8)
pA1 <- 1 - punif(8, min = aA, max = bA)
pA1
## [1] 0.1993988
# Pregunta A2: P(3 < Thickness_Loss < 5)
pA2 <- punif(5, min = aA, max = bA) - punif(3, min = aA, max = bA)
pA2
## [1] 0.2004008
# --- Variable B  ---
# B1: P(Material_Loss > 80)
# B2: P(20 < Material_Loss < 60)

if(mejor_B2 == "Normal"){
  pB1 <- 1 - pnorm(80, mean = muN2, sd = sdN2)
  pB2 <- pnorm(60, mean = muN2, sd = sdN2) - pnorm(20, mean = muN2, sd = sdN2)
  pB1; pB2
}

if(mejor_B2 == "Lognormal"){
  pB1 <- 1 - plnorm(80, meanlog = meanlogL2, sdlog = sdlogL2)
  pB2 <- plnorm(60, meanlog = meanlogL2, sdlog = sdlogL2) - plnorm(20, meanlog = meanlogL2, sdlog = sdlogL2)
  pB1; pB2
}

if(mejor_B2 == "Gamma"){
  pB1 <- 1 - pgamma(80, shape = shapeG2, rate = rateG2)
  pB2 <- pgamma(60, shape = shapeG2, rate = rateG2) - pgamma(20, shape = shapeG2, rate = rateG2)
  pB1; pB2
}

if(mejor_B2 == "Exponencial"){
  pB1 <- 1 - pexp(80, rate = rateE2)
  pB2 <- pexp(60, rate = rateE2) - pexp(20, rate = rateE2)
  pB1; pB2
}
## [1] 0.3848328
# =========================================================
# 7) TOMA DE DECISIONES
# =========================================================

# Umbral de riesgo
umbral_riesgo <- 0.20

# Decisión para Thickness_Loss
decision_A <- ifelse(pA1 > umbral_riesgo,
                     "RIESGO ALTO (Thickness_Loss > 8): priorizar inspección y mitigación",
                     "RIESGO CONTROLADO: mantener monitoreo rutinario")
decision_A
## [1] "RIESGO CONTROLADO: mantener monitoreo rutinario"
# Decisión para Material_Loss
if(exists("pB1")){
  decision_B <- ifelse(pB1 > umbral_riesgo,
                       "RIESGO ALTO (Material_Loss > 80): plan de mantenimiento preventivo inmediato",
                       "RIESGO CONTROLADO: seguimiento y mantenimiento programado")
  decision_B
}
## [1] "RIESGO CONTROLADO: seguimiento y mantenimiento programado"