Carga de librerías y
datos
##============================================================== ###
## UNIVERSIDAD CENTRAL DEL ECUADOR ###
##Facultad de Ingeniería en Geología,Minas, Petróleos y Ambiental ##
##===============================================================###
##Materia:Estadistica ======== Docente:Ing.Christian Mejía .E MSc.##
##===============================================================###
## Utilización del Programa R-Studio en Estadistica ##
###=============================================================####
##Expuesto- Proyecto Semestral ##
##===============================================================###
#Tema: Proyecto de Estadística en R ##
##===============================================================###
#Elaborado por: Jim Acuña y Davis Piguave ##
##===============================================================###
########################
###### Librerias #######
########################
library(dplyr)
library(ggplot2)
library(ggfortify)
library(tidyverse)
library(fdth)
library(lattice)
library(MASS)
library(PASWR)
library(magick)
library(readxl)
library(plotly)
library(psych)
library(car)
library(ggpmisc)
library(scatterplot3d)
library(corrplot)
library(GGally)
library(RSNNS)
library(broom)
#############################
#### Directorio de trabajo ##
#############################
setwd("D:/Personal/Escritorio/ESTADISTICA JIM/Kaggle")
Datos<-read.csv("market_pipe_thickness_loss_dataset_LIMPIO.csv", header = T, sep=";",dec =".", fileEncoding = "ISO-8859-1")
Datos <- na.omit(Datos)
str(Datos)
## 'data.frame': 1000 obs. of 11 variables:
## $ Pipe_Size : int 800 800 400 1500 1500 600 200 300 150 800 ...
## $ Thickness : num 15.5 22 12.1 38.7 24.3 ...
## $ Material : chr "Carbon Steel" "PVC" "Carbon Steel" "Carbon Steel" ...
## $ Grade : chr "ASTM A333 Grade 6" "ASTM A106 Grade B" "API 5L X52" "API 5L X42" ...
## $ Max_Pressure : int 300 150 2500 1500 1500 600 1500 900 300 150 ...
## $ Temperature : num 84.9 14.1 0.6 52.7 11.7 67.3 89.6 40.8 3.2 11.6 ...
## $ Corrosion_Impact: num 16.04 7.38 2.12 5.58 12.29 ...
## $ Thickness_Loss : num 4.91 7.32 6.32 6.2 8.58 5.21 5.86 3.02 2.47 0.53 ...
## $ Material_Loss : num 31.7 33.3 52.5 16 35.3 ...
## $ Time : int 2 4 7 19 20 11 6 21 19 1 ...
## $ Condition : chr "Moderate" "Critical" "Critical" "Critical" ...
names(Datos)
## [1] "Pipe_Size" "Thickness" "Material" "Grade"
## [5] "Max_Pressure" "Temperature" "Corrosion_Impact" "Thickness_Loss"
## [9] "Material_Loss" "Time" "Condition"
#View(Datos)
#Normalizacion de Variable Material Loss#
normalizacion<-normalizeData(Datos$Material_Loss,type ="0_1")
Datos$Material_Loss_norm<-as.data.frame(normalizacion)
summary(Datos$Material_Loss)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.08 15.66 31.66 46.75 61.03 318.75
summary(Datos$Material_Loss_norm)
## V1
## Min. :0.00000
## 1st Qu.:0.04891
## Median :0.09910
## Mean :0.14644
## 3rd Qu.:0.19128
## Max. :1.00000
#view(Datos)
######################################################################
# =========================================================
# ETAPA 3: MODELOS DE PROBABILIDAD
# =========================================================
#####################################
"ETAPA 3: Modelos de Probabilidad"
## [1] "ETAPA 3: Modelos de Probabilidad"
#####################################
# =========================================================
# 0) Preparación de variables
# =========================================================
# Variable A: Thickness_Loss
xA <- Datos$Thickness_Loss
xA <- xA[is.finite(xA)]
# Variable B: Material_Loss
xB <- Datos$Material_Loss
xB <- xB[is.finite(xB)]
xB <- xB[xB > 0] # necesario para lognormal/gamma/exp
# =========================================================
# 1) ANÁLISIS GRÁFICO
# =========================================================
#####################################
"1) Análisis gráfico - Histogramas"
## [1] "1) Análisis gráfico - Histogramas"
#####################################
ggplot(data.frame(xA), aes(x = xA)) +
geom_histogram(bins = 30, fill = "grey80", color = "grey30") +
labs(title = "Histograma: Thickness_Loss", x = "Thickness_Loss", y = "Frecuencia")

ggplot(data.frame(xB), aes(x = xB)) +
geom_histogram(bins = 30, fill = "grey80", color = "grey30") +
labs(title = "Histograma: Material_Loss", x = "Material_Loss", y = "Frecuencia")

# Extra visual: en log suele verse “más normal” si es lognormal
ggplot(data.frame(xBlog = log(xB)), aes(x = xBlog)) +
geom_histogram(bins = 30, fill = "grey80", color = "grey30") +
labs(title = "Histograma: log(Material_Loss)", x = "log(Material_Loss)", y = "Frecuencia")

# =========================================================
# 2) CONJETURAR EL MODELO
# =========================================================
# Conjetura A (Thickness_Loss)
aA <- min(xA); bA <- max(xA)
aA; bA
## [1] 0.01
## [1] 9.99
# Conjetura B (Material_Loss)
meanB <- mean(xB); varB <- var(xB); sdB <- sd(xB)
meanB; varB; sdB
## [1] 46.74756
## [1] 2171.798
## [1] 46.60255
# =========================================================
# 3) PRUEBAS DE BONDAD Y AJUSTE (Pearson X^2 y Chi-cuadrado)
# =========================================================
# ---------------------------------------------------------
# 3A) Thickness_Loss ~ Uniforme(a,b)
# ---------------------------------------------------------
#####################################
"3A) GOF Thickness_Loss ~ Uniforme(a,b)"
## [1] "3A) GOF Thickness_Loss ~ Uniforme(a,b)"
#####################################
# (i) Frecuencias observadas con hist()
kA <- 20 # número de intervalos
hA <- hist(xA, breaks = kA, plot = FALSE)
O_A <- hA$counts
brA <- hA$breaks
a_intA <- brA[-length(brA)]
b_intA <- brA[-1]
nA <- length(xA)
# (ii) Probabilidades esperadas por intervalo bajo Uniforme(aA,bA)
P_A <- punif(b_intA, min = aA, max = bA) - punif(a_intA, min = aA, max = bA)
P_A[P_A < 0] <- 0
P_A <- P_A / sum(P_A)
# (iii) Frecuencias esperadas
E_A <- nA * P_A
# Revisión rápida de condición E>=5 (si no, reduce kA)
min(E_A)
## [1] 49.0982
# (iv) Pearson X^2
X2_A <- sum((O_A - E_A)^2 / E_A)
# df ajustado por parámetros estimados (Uniforme tiene 2: a y b)
dfA_adj <- length(O_A) - 1 - 2
pA_adj <- pchisq(X2_A, df = dfA_adj, lower.tail = FALSE)
X2_A
## [1] 28.46914
dfA_adj
## [1] 17
pA_adj
## [1] 0.03974618
# (v) Chi-cuadrado con chisq.test
chiA <- suppressWarnings(chisq.test(O_A, p = P_A, rescale.p = TRUE))
chiA
##
## Chi-squared test for given probabilities
##
## data: O_A
## X-squared = 28.469, df = 19, p-value = 0.07481
# (vi) Visual: histograma + curva uniforme
ggplot(data.frame(xA), aes(x = xA)) +
geom_histogram(aes(y = after_stat(density)), bins = 30, fill = "grey80", color = "grey30") +
stat_function(fun = dunif, args = list(min = aA, max = bA), linewidth = 1) +
labs(title = "Thickness_Loss: Ajuste Uniforme(a,b)", x = "Thickness_Loss", y = "Densidad")

# ---------------------------------------------------------
# 3B) Material_Loss: comparar Normal vs Lognormal vs Gamma vs Exponencial
# ---------------------------------------------------------
kB <- 20
hB <- hist(xB, breaks = kB, plot = FALSE)
O_B <- hB$counts
brB <- hB$breaks
a_intB <- brB[-length(brB)]
b_intB <- brB[-1]
nB <- length(xB)
# ---------- Modelo 1: Normal(mu, sd) ----------
muN <- mean(xB)
sdN <- sd(xB)
P_N <- pnorm(b_intB, mean = muN, sd = sdN) - pnorm(a_intB, mean = muN, sd = sdN)
P_N[P_N < 0] <- 0
P_N <- P_N / sum(P_N)
E_N <- nB * P_N
X2_N <- sum((O_B - E_N)^2 / E_N)
dfN_adj <- length(O_B) - 1 - 2
pN_adj <- pchisq(X2_N, df = dfN_adj, lower.tail = FALSE)
chiN <- suppressWarnings(chisq.test(O_B, p = P_N, rescale.p = TRUE))
# ---------- Modelo 2: Lognormal(meanlog, sdlog) ----------
meanlogL <- mean(log(xB))
sdlogL <- sd(log(xB))
P_LN <- plnorm(b_intB, meanlog = meanlogL, sdlog = sdlogL) - plnorm(a_intB, meanlog = meanlogL, sdlog = sdlogL)
P_LN[P_LN < 0] <- 0
P_LN <- P_LN / sum(P_LN)
E_LN <- nB * P_LN
X2_LN <- sum((O_B - E_LN)^2 / E_LN)
dfLN_adj <- length(O_B) - 1 - 2
pLN_adj <- pchisq(X2_LN, df = dfLN_adj, lower.tail = FALSE)
chiLN <- suppressWarnings(chisq.test(O_B, p = P_LN, rescale.p = TRUE))
# ---------- Modelo 3: Gamma ----------
# mean = shape/rate; var = shape/rate^2 => shape=(mean^2)/var ; rate=mean/var
shapeG <- (mean(xB)^2) / var(xB)
rateG <- mean(xB) / var(xB)
P_G <- pgamma(b_intB, shape = shapeG, rate = rateG) - pgamma(a_intB, shape = shapeG, rate = rateG)
P_G[P_G < 0] <- 0
P_G <- P_G / sum(P_G)
E_G <- nB * P_G
X2_G <- sum((O_B - E_G)^2 / E_G)
dfG_adj <- length(O_B) - 1 - 2
pG_adj <- pchisq(X2_G, df = dfG_adj, lower.tail = FALSE)
chiG <- suppressWarnings(chisq.test(O_B, p = P_G, rescale.p = TRUE))
# ---------- Modelo 4: Exponencial ----------
rateE <- 1 / mean(xB)
P_E <- pexp(b_intB, rate = rateE) - pexp(a_intB, rate = rateE)
P_E[P_E < 0] <- 0
P_E <- P_E / sum(P_E)
E_E <- nB * P_E
X2_E <- sum((O_B - E_E)^2 / E_E)
dfE_adj <- length(O_B) - 1 - 1
pE_adj <- pchisq(X2_E, df = dfE_adj, lower.tail = FALSE)
chiE <- suppressWarnings(chisq.test(O_B, p = P_E, rescale.p = TRUE))
# ---------- Tabla comparativa ----------
tabla_B <- data.frame(
Modelo = c("Normal", "Lognormal", "Gamma", "Exponencial"),
X2_Pearson = c(X2_N, X2_LN, X2_G, X2_E),
df_adj = c(dfN_adj, dfLN_adj, dfG_adj, dfE_adj),
p_adj = c(pN_adj, pLN_adj, pG_adj, pE_adj),
p_chisq_R = c(chiN$p.value, chiLN$p.value, chiG$p.value, chiE$p.value)
)
tabla_B <- tabla_B[order(-tabla_B$p_adj), ]
tabla_B
# Visual del mejor modelo
mejor_modelo_B <- as.character(tabla_B$Modelo[1])
mejor_modelo_B
## [1] "Exponencial"
ggplot(data.frame(xB), aes(x = xB)) +
geom_histogram(aes(y = after_stat(density)), bins = 30, fill = "grey80", color = "grey30") +
labs(title = paste("Material_Loss + curva teórica:", mejor_modelo_B),
x = "Material_Loss", y = "Densidad")

# Agregar curva según el ganador
if(mejor_modelo_B == "Normal"){
ggplot(data.frame(xB), aes(x = xB)) +
geom_histogram(aes(y = after_stat(density)), bins = 30, fill = "grey80", color = "grey30") +
stat_function(fun = dnorm, args = list(mean = muN, sd = sdN), linewidth = 1) +
labs(title = "Material_Loss: Ajuste Normal", x = "Material_Loss", y = "Densidad")
}
if(mejor_modelo_B == "Lognormal"){
ggplot(data.frame(xB), aes(x = xB)) +
geom_histogram(aes(y = after_stat(density)), bins = 30, fill = "grey80", color = "grey30") +
stat_function(fun = dlnorm, args = list(meanlog = meanlogL, sdlog = sdlogL), linewidth = 1) +
labs(title = "Material_Loss: Ajuste Lognormal", x = "Material_Loss", y = "Densidad")
}
if(mejor_modelo_B == "Gamma"){
ggplot(data.frame(xB), aes(x = xB)) +
geom_histogram(aes(y = after_stat(density)), bins = 30, fill = "grey80", color = "grey30") +
stat_function(fun = dgamma, args = list(shape = shapeG, rate = rateG), linewidth = 1) +
labs(title = "Material_Loss: Ajuste Gamma", x = "Material_Loss", y = "Densidad")
}
if(mejor_modelo_B == "Exponencial"){
ggplot(data.frame(xB), aes(x = xB)) +
geom_histogram(aes(y = after_stat(density)), bins = 30, fill = "grey80", color = "grey30") +
stat_function(fun = dexp, args = list(rate = rateE), linewidth = 1) +
labs(title = "Material_Loss: Ajuste Exponencial", x = "Material_Loss", y = "Densidad")
}

# =========================================================
# 4) OPTIMIZACIÓN Y AJUSTE DE ESCALA
# =========================================================
#####################################
"4) Optimización: outliers + bins + escala"
## [1] "4) Optimización: outliers + bins + escala"
#####################################
# 4.1 Filtrado de outliers por IQR (Material_Loss)
Q1 <- quantile(xB, 0.25)
Q3 <- quantile(xB, 0.75)
IQRv <- Q3 - Q1
LI <- Q1 - 1.5*IQRv
LS <- Q3 + 1.5*IQRv
xB_f <- xB[xB >= LI & xB <= LS]
length(xB); length(xB_f)
## [1] 1000
## [1] 930
# 4.2 Reducir intervalos (para aumentar E>=5)
kB2 <- 10
hB2 <- hist(xB_f, breaks = kB2, plot = FALSE)
O_B2 <- hB2$counts
brB2 <- hB2$breaks
a2 <- brB2[-length(brB2)]
b2 <- brB2[-1]
nB2 <- length(xB_f)
# Re-estimar parámetros
muN2 <- mean(xB_f); sdN2 <- sd(xB_f)
meanlogL2 <- mean(log(xB_f)); sdlogL2 <- sd(log(xB_f))
shapeG2 <- (mean(xB_f)^2) / var(xB_f)
rateG2 <- mean(xB_f) / var(xB_f)
rateE2 <- 1 / mean(xB_f)
# =========================================================
# 5) REEVALUACIÓN (bondad y ajuste tras optimización)
# =========================================================
# Normal (filtrado)
P_N2 <- pnorm(b2, mean = muN2, sd = sdN2) - pnorm(a2, mean = muN2, sd = sdN2)
P_N2[P_N2 < 0] <- 0; P_N2 <- P_N2 / sum(P_N2)
E_N2 <- nB2 * P_N2
X2_N2 <- sum((O_B2 - E_N2)^2 / E_N2)
dfN2 <- length(O_B2) - 1 - 2
pN2 <- pchisq(X2_N2, df = dfN2, lower.tail = FALSE)
# Lognormal (filtrado)
P_LN2 <- plnorm(b2, meanlog = meanlogL2, sdlog = sdlogL2) - plnorm(a2, meanlog = meanlogL2, sdlog = sdlogL2)
P_LN2[P_LN2 < 0] <- 0; P_LN2 <- P_LN2 / sum(P_LN2)
E_LN2 <- nB2 * P_LN2
X2_LN2 <- sum((O_B2 - E_LN2)^2 / E_LN2)
dfLN2 <- length(O_B2) - 1 - 2
pLN2 <- pchisq(X2_LN2, df = dfLN2, lower.tail = FALSE)
# Gamma (filtrado)
P_G2 <- pgamma(b2, shape = shapeG2, rate = rateG2) - pgamma(a2, shape = shapeG2, rate = rateG2)
P_G2[P_G2 < 0] <- 0; P_G2 <- P_G2 / sum(P_G2)
E_G2 <- nB2 * P_G2
X2_G2 <- sum((O_B2 - E_G2)^2 / E_G2)
dfG2 <- length(O_B2) - 1 - 2
pG2 <- pchisq(X2_G2, df = dfG2, lower.tail = FALSE)
# Exponencial (filtrado)
P_E2 <- pexp(b2, rate = rateE2) - pexp(a2, rate = rateE2)
P_E2[P_E2 < 0] <- 0; P_E2 <- P_E2 / sum(P_E2)
E_E2 <- nB2 * P_E2
X2_E2 <- sum((O_B2 - E_E2)^2 / E_E2)
dfE2 <- length(O_B2) - 1 - 1
pE2 <- pchisq(X2_E2, df = dfE2, lower.tail = FALSE)
tabla_B2 <- data.frame(
Modelo = c("Normal", "Lognormal", "Gamma", "Exponencial"),
X2_Pearson = c(X2_N2, X2_LN2, X2_G2, X2_E2),
df_adj = c(dfN2, dfLN2, dfG2, dfE2),
p_adj = c(pN2, pLN2, pG2, pE2)
)
tabla_B2 <- tabla_B2[order(-tabla_B2$p_adj), ]
tabla_B2
mejor_B2 <- as.character(tabla_B2$Modelo[1])
mejor_B2
## [1] "Exponencial"
# =========================================================
# 6) CÁLCULO DE PROBABILIDADES
# =========================================================
# --- Variable A (Uniforme): Thickness_Loss ---
# Pregunta A1: P(Thickness_Loss > 8)
pA1 <- 1 - punif(8, min = aA, max = bA)
pA1
## [1] 0.1993988
# Pregunta A2: P(3 < Thickness_Loss < 5)
pA2 <- punif(5, min = aA, max = bA) - punif(3, min = aA, max = bA)
pA2
## [1] 0.2004008
# --- Variable B ---
# B1: P(Material_Loss > 80)
# B2: P(20 < Material_Loss < 60)
if(mejor_B2 == "Normal"){
pB1 <- 1 - pnorm(80, mean = muN2, sd = sdN2)
pB2 <- pnorm(60, mean = muN2, sd = sdN2) - pnorm(20, mean = muN2, sd = sdN2)
pB1; pB2
}
if(mejor_B2 == "Lognormal"){
pB1 <- 1 - plnorm(80, meanlog = meanlogL2, sdlog = sdlogL2)
pB2 <- plnorm(60, meanlog = meanlogL2, sdlog = sdlogL2) - plnorm(20, meanlog = meanlogL2, sdlog = sdlogL2)
pB1; pB2
}
if(mejor_B2 == "Gamma"){
pB1 <- 1 - pgamma(80, shape = shapeG2, rate = rateG2)
pB2 <- pgamma(60, shape = shapeG2, rate = rateG2) - pgamma(20, shape = shapeG2, rate = rateG2)
pB1; pB2
}
if(mejor_B2 == "Exponencial"){
pB1 <- 1 - pexp(80, rate = rateE2)
pB2 <- pexp(60, rate = rateE2) - pexp(20, rate = rateE2)
pB1; pB2
}
## [1] 0.3848328
# =========================================================
# 7) TOMA DE DECISIONES
# =========================================================
# Umbral de riesgo
umbral_riesgo <- 0.20
# Decisión para Thickness_Loss
decision_A <- ifelse(pA1 > umbral_riesgo,
"RIESGO ALTO (Thickness_Loss > 8): priorizar inspección y mitigación",
"RIESGO CONTROLADO: mantener monitoreo rutinario")
decision_A
## [1] "RIESGO CONTROLADO: mantener monitoreo rutinario"
# Decisión para Material_Loss
if(exists("pB1")){
decision_B <- ifelse(pB1 > umbral_riesgo,
"RIESGO ALTO (Material_Loss > 80): plan de mantenimiento preventivo inmediato",
"RIESGO CONTROLADO: seguimiento y mantenimiento programado")
decision_B
}
## [1] "RIESGO CONTROLADO: seguimiento y mantenimiento programado"