##============================================================== ###
## UNIVERSIDAD CENTRAL DEL ECUADOR ###
##Facultad de Ingeniería en Geología,Minas, Petróleos y Ambiental ##
##===============================================================###
##Materia:Estadistica ======== Docente:Ing.Christian Mejía .E MSc.##
##===============================================================###
## Utilización del Programa R-Studio en Estadistica ##
###=============================================================####
##Expuesto- Proyecto Semestral ##
##===============================================================###
#Tema: Proyecto de Estadística en R ##
##===============================================================###
#Elaborado por: Jim Acuña ##
##===============================================================###
########################
###### Librerias #######
########################
library(dplyr)
library(ggplot2)
library(ggfortify)
library(tidyverse)
library(fdth)
library(lattice)
library(MASS)
library(PASWR)
library(magick)
library(readxl)
library(plotly)
library(psych)
library(car)
library(ggpmisc)
library(scatterplot3d)
library(corrplot)
library(GGally)
library(RSNNS)
library(broom)
#############################
#### Directorio de trabajo ##
#############################
setwd("D:/Personal/Escritorio/ESTADISTICA JIM/Kaggle")
Datos<-read.csv("market_pipe_thickness_loss_dataset_LIMPIO.csv", header = T, sep=";",dec =".", fileEncoding = "ISO-8859-1")
Datos <- na.omit(Datos)
str(Datos)
## 'data.frame': 1000 obs. of 11 variables:
## $ Pipe_Size : int 800 800 400 1500 1500 600 200 300 150 800 ...
## $ Thickness : num 15.5 22 12.1 38.7 24.3 ...
## $ Material : chr "Carbon Steel" "PVC" "Carbon Steel" "Carbon Steel" ...
## $ Grade : chr "ASTM A333 Grade 6" "ASTM A106 Grade B" "API 5L X52" "API 5L X42" ...
## $ Max_Pressure : int 300 150 2500 1500 1500 600 1500 900 300 150 ...
## $ Temperature : num 84.9 14.1 0.6 52.7 11.7 67.3 89.6 40.8 3.2 11.6 ...
## $ Corrosion_Impact: num 16.04 7.38 2.12 5.58 12.29 ...
## $ Thickness_Loss : num 4.91 7.32 6.32 6.2 8.58 5.21 5.86 3.02 2.47 0.53 ...
## $ Material_Loss : num 31.7 33.3 52.5 16 35.3 ...
## $ Time : int 2 4 7 19 20 11 6 21 19 1 ...
## $ Condition : chr "Moderate" "Critical" "Critical" "Critical" ...
names(Datos)
## [1] "Pipe_Size" "Thickness" "Material" "Grade"
## [5] "Max_Pressure" "Temperature" "Corrosion_Impact" "Thickness_Loss"
## [9] "Material_Loss" "Time" "Condition"
#View(Datos)
#Normalizacion de Variable Material Loss#
normalizacion<-normalizeData(Datos$Material_Loss,type ="0_1")
Datos$Material_Loss_norm<-as.data.frame(normalizacion)
summary(Datos$Material_Loss)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.08 15.66 31.66 46.75 61.03 318.75
summary(Datos$Material_Loss_norm)
## V1
## Min. :0.00000
## 1st Qu.:0.04891
## Median :0.09910
## Mean :0.14644
## 3rd Qu.:0.19128
## Max. :1.00000
#view(Datos)
######################################################################
# =========================================================
# VARIABLES DERIVADAS (Feature Engineering)
# =========================================================
Datos <- Datos %>%
mutate(
# Evitar divisiones por 0 o negativos
Time_pos = ifelse(Time > 0, Time, NA_real_),
Thick_pos = ifelse(Thickness > 0, Thickness, NA_real_),
Size_pos = ifelse(Pipe_Size > 0, Pipe_Size, NA_real_),
# 1) Variables “fuertes” típicas: tasa y pérdidas relativas
Loss_Rate = Thickness_Loss / Time_pos, # mm/año
Loss_Rel = Thickness_Loss / Thick_pos, # fracción del espesor original
Rem_Thick = Thick_pos - Thickness_Loss, # espesor remanente
Rem_Rel = Rem_Thick / Thick_pos, # % remanente
# 2) Índices compuestos
Exposure_Index = Time_pos * Corrosion_Impact, # exposición acumulada
Stress_Index = (Max_Pressure * Size_pos) / Thick_pos, # índice de carga
Env_Index = Corrosion_Impact * Temperature, # severidad ambiental
Load_Exposure = Stress_Index * Exposure_Index, # efecto combinado
# 3) Transformaciones (para relaciones no lineales)
log_Time = log1p(Time),
log_Loss = log1p(Thickness_Loss),
log_Stress = log1p(Stress_Index)
)
# =========================================================
# ETAPA 2: REGRESIONES
# =========================================================
# Asegurar tipos de datos
if (is.data.frame(Datos$Material_Loss_norm)) Datos$Material_Loss_norm <- Datos$Material_Loss_norm[[1]]
Datos$Material <- as.factor(Datos$Material)
Datos$Grade <- as.factor(Datos$Grade)
Datos$Condition <- as.factor(Datos$Condition)
PipeEDA <- Datos %>%
dplyr::select(Pipe_Size, Thickness, Max_Pressure, Temperature,
Corrosion_Impact, Time, Thickness_Loss, Material_Loss_norm) %>%
na.omit()
# Correlaciones
cor_mat <- cor(PipeEDA, use = "complete.obs", method = "pearson")
corrplot::corrplot(cor_mat, method = "color", type = "upper", tl.cex = 0.8)
psych::pairs.panels(PipeEDA)
## Correlacion entre variables Derivadas
PipeFE <- Datos %>%
dplyr::select(Thickness_Loss, Loss_Rate, Loss_Rel, Exposure_Index,
Stress_Index, Env_Index, Load_Exposure, log_Time, log_Loss, log_Stress) %>%
na.omit()
cor_mat_fe <- cor(PipeFE, use="complete.obs")
corrplot::corrplot(cor_mat_fe, method="color", type="upper", tl.cex=0.8)
psych::pairs.panels(PipeFE)
#####################################
# Regresion Lineal (y = mx + b)
#####################################
RLineal <- ggplot(data = Datos, aes(x = Thickness, y = Pipe_Size )) +
geom_point(shape = 7, col = "salmon") +
geom_smooth(method = lm, se = FALSE, formula = y ~ x) +
stat_poly_eq(
formula = y ~ x,
aes(label = paste(..eq.label.., ..rr.label.., sep = "~~~")),
parse = TRUE,
eq.with.lhs = "hat(Y)~`=`~"
) +
labs(
title = paste("Graf: Dispersion de Diametro vs Espesor de la tuberia"),
x = "Espesor (mm)",
y = "Diametro de la tuberia (mm)",
caption = "Jim Acuña"
) +
theme(
plot.title = element_text(size = rel(1.1), hjust = 0.5, face = "bold"),
plot.caption = element_text(hjust = 0.5, face = "italic")
)
RLineal
# Conjeturar el modelo matematico
x <- Datos$Thickness
y <- Datos$Pipe_Size
RegresionL <- lm(y ~ x)
RegresionL
##
## Call:
## lm(formula = y ~ x)
##
## Coefficients:
## (Intercept) x
## -91.43 38.16
# Ecuación (coeficientes)
RegresionL$coefficients
## (Intercept) x
## -91.43373 38.16422
# Resumen del modelo
summary(RegresionL)
##
## Call:
## lm(formula = y ~ x)
##
## Residuals:
## Min 1Q Median 3Q Max
## -432.88 -107.86 -24.37 59.92 784.26
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -91.4337 10.7627 -8.495 <2e-16 ***
## x 38.1642 0.5599 68.164 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 186.7 on 998 degrees of freedom
## Multiple R-squared: 0.8232, Adjusted R-squared: 0.823
## F-statistic: 4646 on 1 and 998 DF, p-value: < 2.2e-16
# Indicadores (como tu ejemplo)
Covarianza <- cov(x, y, use = "complete.obs")
Covarianza
## [1] 4246.451
CoefPearson <- cor(x, y, use = "pairwise.complete.obs", method = "pearson")
CoefPearson <- round(CoefPearson, 4)
CoefPearson
## [1] 0.9073
coefDeterminacion <- round((CoefPearson^2) * 100, 2)
coefDeterminacion
## [1] 82.32
###############################################
# Regresion Polinomial (y = a + bx + cx^2)
###############################################
RPolinom <- ggplot(data = Datos, aes(x = Corrosion_Impact, y = Thickness_Loss)) +
geom_point(shape = 7, col = "salmon") +
geom_smooth(method = lm, se = FALSE, formula = y ~ x + I(x^2)) +
stat_poly_eq(
formula = y ~ x + I(x^2),
aes(label = paste(..eq.label.., ..rr.label.., sep = "~~~")),
parse = TRUE,
eq.with.lhs = "hat(Y)~`=`~"
) +
labs(
title = paste("Graf: Dispersion de Thickness_Loss vs Corrosion_Impact (Polinomial)"),
x = "Corrosion_Impact (%)",
y = "Thickness_Loss (mm)",
caption = "Jim Douglas"
) +
theme(
plot.title = element_text(size = rel(1.1), hjust = 0.5, face = "bold"),
plot.caption = element_text(hjust = 0.5, face = "italic")
)
RPolinom
# Modelo
x <- Datos$Corrosion_Impact
y <- Datos$Thickness_Loss
xcuad <- x^2
RegresionPOL <- lm(y ~ x + xcuad)
RegresionPOL
##
## Call:
## lm(formula = y ~ x + xcuad)
##
## Coefficients:
## (Intercept) x xcuad
## 4.7966979 0.0190813 -0.0007482
summary(RegresionPOL)
##
## Call:
## lm(formula = y ~ x + xcuad)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.8872 -2.5458 0.0126 2.5845 5.1243
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.7966979 0.2680739 17.893 <2e-16 ***
## x 0.0190813 0.0634655 0.301 0.764
## xcuad -0.0007482 0.0031036 -0.241 0.810
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.904 on 997 degrees of freedom
## Multiple R-squared: 0.0001313, Adjusted R-squared: -0.001874
## F-statistic: 0.06548 on 2 and 997 DF, p-value: 0.9366
# Extraer coeficientes (a,b,c)
a <- RegresionPOL$coefficients[1]
b <- RegresionPOL$coefficients[2]
c <- RegresionPOL$coefficients[3]
a; b; c
## (Intercept)
## 4.796698
## x
## 0.01908131
## xcuad
## -0.0007482059
# Indicadores
yhat <- fitted(RegresionPOL)
CoefPearson <- round(cor(y, yhat, use="complete.obs"), 4) # mejor que cor(y, x+x^2)
CoefPearson
## [1] 0.0115
coefDeterminacion <- round((CoefPearson^2) * 100, 2)
coefDeterminacion
## [1] 0.01
###############################################
# Regresion Multiple (y = b0 + b1x1 + b2x2 + ...)
###############################################
y <- Datos$Thickness_Loss
x1 <- Datos$Time
x2 <- Datos$Corrosion_Impact
x3 <- Datos$Max_Pressure
datosMult <- data.frame(y, x1, x2, x3)
datosMult <- na.omit(datosMult)
# Diagrama de dispersion
plot(datosMult)
# Coeficiente de correlacion (matriz)
cor(datosMult)
## y x1 x2 x3
## y 1.000000000 -0.01260024 0.008547516 0.02602478
## x1 -0.012600243 1.00000000 -0.024244183 0.06199863
## x2 0.008547516 -0.02424418 1.000000000 -0.02462798
## x3 0.026024785 0.06199863 -0.024627979 1.00000000
# 3D (solo 2 predictores a la vez)
Reg3D <- scatterplot3d(x1, x2, y, angle = 55,
pch = 16, color = "blue",
xlab = "Time (años)",
ylab = "Corrosion_Impact (%)",
zlab = "Thickness_Loss (mm)",
main = "Diagrama 3D: Thickness_Loss vs Time y Corrosion_Impact")
# Modelo multiple (con 3 predictores)
RegresionM <- lm(y ~ x1 + x2 + x3, data = datosMult)
RegresionM
##
## Call:
## lm(formula = y ~ x1 + x2 + x3, data = datosMult)
##
## Coefficients:
## (Intercept) x1 x2 x3
## 4.820e+00 -5.719e-03 4.426e-03 9.678e-05
summary(RegresionM)
##
## Call:
## lm(formula = y ~ x1 + x2 + x3, data = datosMult)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.0032 -2.5531 0.0079 2.5565 5.2083
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.820e+00 2.697e-01 17.875 <2e-16 ***
## x1 -5.719e-03 1.290e-02 -0.443 0.658
## x2 4.426e-03 1.580e-02 0.280 0.780
## x3 9.678e-05 1.133e-04 0.854 0.393
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.904 on 996 degrees of freedom
## Multiple R-squared: 0.0009588, Adjusted R-squared: -0.00205
## F-statistic: 0.3186 on 3 and 996 DF, p-value: 0.8119
# Plano 3D solo para el modelo con 2 predictores (x1 y x2)
RegresionM_2 <- lm(y ~ x1 + x2, data = datosMult)
Reg3D$plane3d(RegresionM_2, col = "red")
# Indicadores
yhat <- fitted(RegresionM)
CoefPearson <- round(cor(datosMult$y, yhat), 4)
CoefPearson
## [1] 0.031
coefDeterminacion <- round((CoefPearson^2) * 100, 2)
coefDeterminacion
## [1] 0.1
x <- Datos$Time
y <- Datos$Corrosion_Impact
df <- na.omit(data.frame(x,y))
df <- df[df$x > 0, ] # requerido
RegLog_B <- lm(y ~ log(x), data=df)
summary(RegLog_B)
##
## Call:
## lm(formula = y ~ log(x), data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.9550 -5.2721 -0.0363 5.0788 10.2694
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 10.0850 0.5564 18.124 <2e-16 ***
## log(x) -0.1461 0.2264 -0.646 0.519
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.819 on 998 degrees of freedom
## Multiple R-squared: 0.0004173, Adjusted R-squared: -0.0005842
## F-statistic: 0.4167 on 1 and 998 DF, p-value: 0.5187
ggplot(df, aes(x=x, y=y)) +
geom_point(shape=7, col="salmon") +
stat_smooth(method="lm", formula = y ~ log(x), se=FALSE) +
labs(title="Regresión Logarítmica: Corrosion_Impact vs Time",
x="Time (years)", y="Corrosion_Impact", caption="Jim Douglas")
x <- Datos$Time
y <- Datos$Corrosion_Impact
df <- na.omit(data.frame(x,y))
df <- df[df$y > 0, ] # requerido
RegExp_B_lin <- lm(log(y) ~ x, data=df)
a <- exp(coef(RegExp_B_lin)[1]); b <- coef(RegExp_B_lin)[2]
df$yhat <- a * exp(b * df$x)
R2_exp_B <- 1 - sum((df$y - df$yhat)^2) / sum((df$y - mean(df$y))^2)
R2_exp_B
## [1] -0.2016574
ggplot(df, aes(x=x, y=y)) +
geom_point(shape=7, col="salmon") +
geom_line(aes(y=df$yhat), linewidth=1) +
labs(title="Regresión Exponencial: Corrosion_Impact vs Time",
x="Time (years)", y="Corrosion_Impact", caption="Jim Douglas")
x <- Datos$Time
y <- Datos$Corrosion_Impact
df <- na.omit(data.frame(x,y))
df <- df[df$x > 0 & df$y > 0, ] # requerido
RegPot_B_lin <- lm(log(y) ~ log(x), data=df)
a <- exp(coef(RegPot_B_lin)[1]); b <- coef(RegPot_B_lin)[2]
df$yhat <- a * (df$x^b)
R2_pot_B <- 1 - sum((df$y - df$yhat)^2) / sum((df$y - mean(df$y))^2)
R2_pot_B
## [1] -0.2015753
ggplot(df, aes(x=x, y=y)) +
geom_point(shape=7, col="salmon") +
geom_line(aes(y=df$yhat), linewidth=1) +
labs(title="Regresión Potencial: Corrosion_Impact vs Time",
x="Time (years)", y="Corrosion_Impact", caption="Jim Douglas")
######################################
# Regresiones complementarias
######################################
#####################################
"Variables derivadas (para regresiones complementarias)"
## [1] "Variables derivadas (para regresiones complementarias)"
#####################################
# Crear variables derivadas
if(!("Stress_Index" %in% names(Datos))){
Datos$Stress_Index <- (Datos$Max_Pressure * Datos$Pipe_Size) / Datos$Thickness
}
if(!("Exposure_Index" %in% names(Datos))){
Datos$Exposure_Index <- Datos$Time * Datos$Corrosion_Impact
}
if(!("Env_Index" %in% names(Datos))){
Datos$Env_Index <- Datos$Corrosion_Impact * Datos$Temperature
}
#####################################
# "Regresión Complementaria: Thickness_Loss vs Stress_Index"
#####################################
RComp1 <- ggplot(data=Datos, aes(x=Stress_Index, y=Thickness_Loss)) +
geom_point(shape=7, col="salmon") +
geom_smooth(method=lm, se=FALSE, formula = y ~ x) +
stat_poly_eq(formula = y ~ x,
aes(label = paste(..eq.label.., ..rr.label.., sep="~~~")),
parse = TRUE, eq.with.lhs = "hat(Y)~`=`~") +
labs(title=paste("Graf: Dispersión de Thickness_Loss vs Stress_Index"),
x="Stress_Index",
y="Thickness_Loss (mm)",
caption="Jim Acuña") +
theme(plot.title = element_text(size=rel(1.1), hjust=0.5, face="bold"),
plot.caption = element_text(hjust=0.5))
RComp1
# --- x <- ; y <- ; lm() ---
x <- Datos$Stress_Index
y <- Datos$Thickness_Loss
df <- na.omit(data.frame(x,y))
x <- df$x; y <- df$y
RegComp1 <- lm(y ~ x)
RegComp1
##
## Call:
## lm(formula = y ~ x)
##
## Coefficients:
## (Intercept) x
## 4.786e+00 3.496e-06
# --- Coeficientes ---
RegComp1$coefficients
## (Intercept) x
## 4.785891e+00 3.496481e-06
# --- Summary ---
summary(RegComp1)
##
## Call:
## lm(formula = y ~ x)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.9740 -2.5586 0.0374 2.5401 5.1847
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.786e+00 1.331e-01 35.962 <2e-16 ***
## x 3.496e-06 3.357e-06 1.041 0.298
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.901 on 998 degrees of freedom
## Multiple R-squared: 0.001086, Adjusted R-squared: 8.472e-05
## F-statistic: 1.085 on 1 and 998 DF, p-value: 0.2979
# --- cov / cor / %R2 ---
Covarianza <- cov(x, y)
Covarianza
## [1] 2613.286
CoefPearson <- cor(x, y, use="pairwise.complete.obs")
CoefPearson <- round(CoefPearson,4)
CoefPearson
## [1] 0.0329
coefDeterminacion <- CoefPearson^2 * 100
coefDeterminacion <- round(coefDeterminacion,2)
coefDeterminacion
## [1] 0.11
#####################################
"Regresión Complementaria: Thickness_Loss vs Exposure_Index"
## [1] "Regresión Complementaria: Thickness_Loss vs Exposure_Index"
#####################################
# --- Gráfico (ggplot + stat_poly_eq) ---
RComp2 <- ggplot(data=Datos, aes(x=Exposure_Index, y=Thickness_Loss)) +
geom_point(shape=7, col="salmon") +
geom_smooth(method=lm, se=FALSE, formula = y ~ x) +
stat_poly_eq(formula = y ~ x,
aes(label = paste(..eq.label.., ..rr.label.., sep="~~~")),
parse = TRUE, eq.with.lhs = "hat(Y)~`=`~") +
labs(title=paste("Graf: Dispersión de Thickness_Loss vs Exposure_Index"),
x="Exposure_Index (Time * Corrosion_Impact)",
y="Thickness_Loss (mm)",
caption="Jim Acuña") +
theme(plot.title = element_text(size=rel(1.1), hjust=0.5, face="bold"),
plot.caption = element_text(hjust=0.5))
RComp2
# --- x <- ; y <- ; lm() ---
x <- Datos$Exposure_Index
y <- Datos$Thickness_Loss
df <- na.omit(data.frame(x,y))
x <- df$x; y <- df$y
RegComp2 <- lm(y ~ x)
RegComp2
##
## Call:
## lm(formula = y ~ x)
##
## Coefficients:
## (Intercept) x
## 4.9313616 -0.0003596
# --- Coeficientes ---
RegComp2$coefficients
## (Intercept) x
## 4.9313615785 -0.0003595942
# --- Summary ---
summary(RegComp2)
##
## Call:
## lm(formula = y ~ x)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.8713 -2.5479 0.0366 2.5738 5.1490
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.9313616 0.1414714 34.858 <2e-16 ***
## x -0.0003596 0.0008591 -0.419 0.676
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.902 on 998 degrees of freedom
## Multiple R-squared: 0.0001755, Adjusted R-squared: -0.0008263
## F-statistic: 0.1752 on 1 and 998 DF, p-value: 0.6756
# --- cov / cor / %R2 ---
Covarianza <- cov(x, y)
Covarianza
## [1] -4.107866
CoefPearson <- cor(x, y, use="pairwise.complete.obs")
CoefPearson <- round(CoefPearson,4)
CoefPearson
## [1] -0.0132
coefDeterminacion <- CoefPearson^2 * 100
coefDeterminacion <- round(coefDeterminacion,2)
coefDeterminacion
## [1] 0.02
#####################################
"Regresión Complementaria: Material_Loss_norm vs Thickness_Loss"
## [1] "Regresión Complementaria: Material_Loss_norm vs Thickness_Loss"
#####################################
# --- Gráfico (ggplot + stat_poly_eq) ---
RComp3 <- ggplot(data=Datos, aes(x=Thickness_Loss, y=Material_Loss_norm)) +
geom_point(shape=7, col="salmon") +
geom_smooth(method=lm, se=FALSE, formula = y ~ x) +
stat_poly_eq(formula = y ~ x,
aes(label = paste(..eq.label.., ..rr.label.., sep="~~~")),
parse = TRUE, eq.with.lhs = "hat(Y)~`=`~") +
labs(title=paste("Graf: Dispersión de Material_Loss_norm vs Thickness_Loss"),
x="Thickness_Loss (mm)",
y="Material_Loss_norm (0-1)",
caption="Jim Acuña") +
theme(plot.title = element_text(size=rel(1.1), hjust=0.5, face="bold"),
plot.caption = element_text(hjust=0.5))
RComp3
# --- x <- ; y <- ; lm() ---
x <- Datos$Thickness_Loss
y <- Datos$Material_Loss_norm
df <- na.omit(data.frame(x,y))
x <- df$x; y <- df$y
RegComp3 <- lm(y ~ x)
RegComp3
##
## Call:
## lm(formula = y ~ x)
##
## Coefficients:
## (Intercept) x
## 0.004357 0.029079
# --- Coeficientes ---
RegComp3$coefficients
## (Intercept) x
## 0.004356924 0.029078825
# --- Summary ---
summary(RegComp3)
##
## Call:
## lm(formula = y ~ x)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.22660 -0.07032 -0.01436 0.03441 0.71387
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.004357 0.007406 0.588 0.556
## x 0.029079 0.001303 22.310 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1195 on 998 degrees of freedom
## Multiple R-squared: 0.3328, Adjusted R-squared: 0.3321
## F-statistic: 497.7 on 1 and 998 DF, p-value: < 2.2e-16
# --- cov / cor / %R2 ---
Covarianza <- cov(x, y)
Covarianza
## [1] 0.2447423
CoefPearson <- cor(x, y, use="pairwise.complete.obs")
CoefPearson <- round(CoefPearson,4)
CoefPearson
## [1] 0.5769
coefDeterminacion <- CoefPearson^2 * 100
coefDeterminacion <- round(coefDeterminacion,2)
coefDeterminacion
## [1] 33.28
#####################################
"Regresión Complementaria: Thickness_Loss vs Env_Index"
## [1] "Regresión Complementaria: Thickness_Loss vs Env_Index"
#####################################
# --- Gráfico (ggplot + stat_poly_eq) ---
RComp4 <- ggplot(data=Datos, aes(x=Env_Index, y=Thickness_Loss)) +
geom_point(shape=7, col="salmon") +
geom_smooth(method=lm, se=FALSE, formula = y ~ x) +
stat_poly_eq(formula = y ~ x,
aes(label = paste(..eq.label.., ..rr.label.., sep="~~~")),
parse = TRUE, eq.with.lhs = "hat(Y)~`=`~") +
labs(title=paste("Graf: Dispersión de Thickness_Loss vs Env_Index"),
x="Env_Index (Corrosion_Impact * Temperature)",
y="Thickness_Loss (mm)",
caption="Jim Acuña") +
theme(plot.title = element_text(size=rel(1.1), hjust=0.5, face="bold"),
plot.caption = element_text(hjust=0.5))
RComp4
# --- x <- ; y <- ; lm() ---
x <- Datos$Env_Index
y <- Datos$Thickness_Loss
df <- na.omit(data.frame(x,y))
x <- df$x; y <- df$y
RegComp4 <- lm(y ~ x)
RegComp4
##
## Call:
## lm(formula = y ~ x)
##
## Coefficients:
## (Intercept) x
## 4.8333197 0.0001285
# --- Coeficientes ---
RegComp4$coefficients
## (Intercept) x
## 4.8333196740 0.0001285222
# --- Summary ---
summary(RegComp4)
##
## Call:
## lm(formula = y ~ x)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.9177 -2.5600 0.0298 2.5652 5.1715
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.8333197 0.1169535 41.327 <2e-16 ***
## x 0.0001285 0.0001759 0.731 0.465
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.902 on 998 degrees of freedom
## Multiple R-squared: 0.0005347, Adjusted R-squared: -0.0004668
## F-statistic: 0.5339 on 1 and 998 DF, p-value: 0.4652
# --- cov / cor / %R2 ---
Covarianza <- cov(x, y)
Covarianza
## [1] 35.01276
CoefPearson <- cor(x, y, use="pairwise.complete.obs")
CoefPearson <- round(CoefPearson,4)
CoefPearson
## [1] 0.0231
coefDeterminacion <- CoefPearson^2 * 100
coefDeterminacion <- round(coefDeterminacion,2)
coefDeterminacion
## [1] 0.05
# 0) Data de modelado
Dmod <- Datos %>%
mutate(
Material = as.factor(Material),
Condition = as.factor(Condition),
Grade = as.factor(Grade)
) %>%
dplyr::select(
Thickness_Loss, log_Loss, Time, log_Time, Temperature, Corrosion_Impact,
Pipe_Size, Thickness, Max_Pressure,
Stress_Index, log_Stress,
Material, Condition, Grade
) %>% na.omit()
# Split simple train/test (para mostrar que sí “predice” algo)
set.seed(123)
id <- sample(seq_len(nrow(Dmod)), size = floor(0.7 * nrow(Dmod)))
train <- Dmod[id, ]
test <- Dmod[-id, ]
rmse <- function(y, yhat) sqrt(mean((y - yhat)^2, na.rm = TRUE))
# ---------------------------------------------------------
# M1) Múltiple clásica
# ---------------------------------------------------------
M1 <- lm(log_Loss ~ log1p(Time) + Corrosion_Impact + Temperature + Max_Pressure +
Material + Condition, data=train)
# ---------------------------------------------------------
# M2) Modelo mejorado (ingeniería): interacción + Stress_Index
# ---------------------------------------------------------
M2 <- lm(log_Loss ~ log1p(Time) * Corrosion_Impact + log1p(Stress_Index) +
Temperature + Material + Condition, data=train)
# ---------------------------------------------------------
# M3) Alternativo con componentes de Stress
# ---------------------------------------------------------
M3 <- lm(log_Loss ~ log1p(Time) * Corrosion_Impact + log1p(Max_Pressure) +
log1p(Pipe_Size) + log1p(Thickness) + Temperature +
Material + Condition, data=train)
# ---------------------------------------------------------
# M4) Robusta
# ---------------------------------------------------------
M4 <- MASS::rlm(log_Loss ~ log1p(Time) * Corrosion_Impact + log1p(Stress_Index) +
Temperature + Material + Condition, data=train)
# =========================
# 1) Resúmenes
# =========================
summary(M1); summary(M2); summary(M3)
##
## Call:
## lm(formula = log_Loss ~ log1p(Time) + Corrosion_Impact + Temperature +
## Max_Pressure + Material + Condition, data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.56136 -0.16734 0.01634 0.16569 0.48141
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.162e+00 3.990e-02 54.199 <2e-16 ***
## log1p(Time) -1.022e-02 1.201e-02 -0.851 0.3951
## Corrosion_Impact -8.616e-04 1.405e-03 -0.613 0.5400
## Temperature 1.124e-04 2.005e-04 0.561 0.5752
## Max_Pressure 7.807e-06 1.009e-05 0.774 0.4394
## MaterialFiberglass -4.138e-02 2.549e-02 -1.623 0.1050
## MaterialHDPE -1.975e-02 2.621e-02 -0.753 0.4514
## MaterialPVC -2.828e-02 2.626e-02 -1.077 0.2818
## MaterialStainless Steel -5.259e-02 2.613e-02 -2.013 0.0445 *
## ConditionModerate -6.064e-01 1.899e-02 -31.938 <2e-16 ***
## ConditionNormal -1.519e+00 2.169e-02 -70.016 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2165 on 689 degrees of freedom
## Multiple R-squared: 0.8801, Adjusted R-squared: 0.8784
## F-statistic: 505.8 on 10 and 689 DF, p-value: < 2.2e-16
##
## Call:
## lm(formula = log_Loss ~ log1p(Time) * Corrosion_Impact + log1p(Stress_Index) +
## Temperature + Material + Condition, data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.56660 -0.16834 0.01774 0.16481 0.50488
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.0179118 0.0977065 20.653 <2e-16 ***
## log1p(Time) 0.0091330 0.0224564 0.407 0.6844
## Corrosion_Impact 0.0039890 0.0049066 0.813 0.4165
## log1p(Stress_Index) 0.0108243 0.0077821 1.391 0.1647
## Temperature 0.0001075 0.0002004 0.537 0.5916
## MaterialFiberglass -0.0392402 0.0255051 -1.539 0.1244
## MaterialHDPE -0.0178163 0.0261678 -0.681 0.4962
## MaterialPVC -0.0280653 0.0263011 -1.067 0.2863
## MaterialStainless Steel -0.0529848 0.0260884 -2.031 0.0426 *
## ConditionModerate -0.6068276 0.0189713 -31.987 <2e-16 ***
## ConditionNormal -1.5183967 0.0216765 -70.048 <2e-16 ***
## log1p(Time):Corrosion_Impact -0.0020473 0.0019852 -1.031 0.3028
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2163 on 688 degrees of freedom
## Multiple R-squared: 0.8805, Adjusted R-squared: 0.8786
## F-statistic: 460.9 on 11 and 688 DF, p-value: < 2.2e-16
##
## Call:
## lm(formula = log_Loss ~ log1p(Time) * Corrosion_Impact + log1p(Max_Pressure) +
## log1p(Pipe_Size) + log1p(Thickness) + Temperature + Material +
## Condition, data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.58156 -0.16992 0.01777 0.16607 0.50704
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.0127880 0.0998046 20.167 <2e-16 ***
## log1p(Time) 0.0098662 0.0224852 0.439 0.6610
## Corrosion_Impact 0.0040414 0.0049104 0.823 0.4108
## log1p(Max_Pressure) 0.0096244 0.0086226 1.116 0.2647
## log1p(Pipe_Size) -0.0012329 0.0226516 -0.054 0.9566
## log1p(Thickness) 0.0202268 0.0366227 0.552 0.5809
## Temperature 0.0001028 0.0002005 0.513 0.6083
## MaterialFiberglass -0.0405477 0.0255414 -1.588 0.1129
## MaterialHDPE -0.0180557 0.0262423 -0.688 0.4917
## MaterialPVC -0.0283363 0.0263269 -1.076 0.2822
## MaterialStainless Steel -0.0525271 0.0261973 -2.005 0.0453 *
## ConditionModerate -0.6064869 0.0190471 -31.841 <2e-16 ***
## ConditionNormal -1.5190478 0.0217242 -69.924 <2e-16 ***
## log1p(Time):Corrosion_Impact -0.0020390 0.0019865 -1.026 0.3050
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2164 on 686 degrees of freedom
## Multiple R-squared: 0.8807, Adjusted R-squared: 0.8785
## F-statistic: 389.7 on 13 and 686 DF, p-value: < 2.2e-16
# M4 no da summary igual a lm, pero sirve como “respaldo”
M4
## Call:
## rlm(formula = log_Loss ~ log1p(Time) * Corrosion_Impact + log1p(Stress_Index) +
## Temperature + Material + Condition, data = train)
## Converged in 6 iterations
##
## Coefficients:
## (Intercept) log1p(Time)
## 2.0002855006 0.0097759861
## Corrosion_Impact log1p(Stress_Index)
## 0.0041885391 0.0118986456
## Temperature MaterialFiberglass
## 0.0001198373 -0.0369308278
## MaterialHDPE MaterialPVC
## -0.0196815060 -0.0240642554
## MaterialStainless Steel ConditionModerate
## -0.0491227065 -0.6046579019
## ConditionNormal log1p(Time):Corrosion_Impact
## -1.5047231083 -0.0019870049
##
## Degrees of freedom: 700 total; 688 residual
## Scale estimate: 0.249
# =========================
# 2) VIF (colinealidad)
# =========================
vif(M1)
## GVIF Df GVIF^(1/(2*Df))
## log1p(Time) 1.012590 1 1.006276
## Corrosion_Impact 1.011106 1 1.005538
## Temperature 1.005053 1 1.002523
## Max_Pressure 1.015165 1 1.007554
## Material 1.048247 4 1.005907
## Condition 1.026648 2 1.006597
vif(M2)
## GVIF Df GVIF^(1/(2*Df))
## log1p(Time) 3.548530 1 1.883754
## Corrosion_Impact 12.351735 1 3.514504
## log1p(Stress_Index) 1.013115 1 1.006536
## Temperature 1.005309 1 1.002651
## Material 1.059740 4 1.007279
## Condition 1.027705 2 1.006855
## log1p(Time):Corrosion_Impact 14.745812 1 3.840028
vif(M3)
## GVIF Df GVIF^(1/(2*Df))
## log1p(Time) 3.554422 1 1.885317
## Corrosion_Impact 12.360141 1 3.515699
## log1p(Max_Pressure) 1.021260 1 1.010574
## log1p(Pipe_Size) 7.721539 1 2.778766
## log1p(Thickness) 7.710039 1 2.776696
## Temperature 1.005759 1 1.002875
## Material 1.077986 4 1.009431
## Condition 1.036947 2 1.009111
## log1p(Time):Corrosion_Impact 14.752440 1 3.840890
# =========================
# 3) Comparación con RMSE
# =========================
pred_M1 <- predict(M1, newdata=test)
pred_M2 <- predict(M2, newdata=test)
pred_M3 <- predict(M3, newdata=test)
tab_comp <- dplyr::bind_rows(
broom::glance(M1) %>% mutate(Modelo="M1: log(TL) clásicos",
RMSE_train=rmse(train$log_Loss, fitted(M1)),
RMSE_test =rmse(test$log_Loss, pred_M1)),
broom::glance(M2) %>% mutate(Modelo="M2: log(TL) interacción+stress",
RMSE_train=rmse(train$log_Loss, fitted(M2)),
RMSE_test =rmse(test$log_Loss, pred_M2)),
broom::glance(M3) %>% mutate(Modelo="M3: log(TL) componentes",
RMSE_train=rmse(train$log_Loss, fitted(M3)),
RMSE_test =rmse(test$log_Loss, pred_M3))
) %>%
dplyr::select(Modelo, r.squared, adj.r.squared, AIC, RMSE_train, RMSE_test)
tab_comp
# =========================
# 4) Diagnósticos
# =========================
par(mfrow=c(2,2)); plot(M2); par(mfrow=c(1,1))
# Gráfico real vs pred (muy presentable)
test$pred_M2 <- pred_M2
ggplot(test, aes(x=log_Loss, y=pred_M2)) +
geom_point(alpha=0.4) +
geom_smooth(method="lm", se=FALSE) +
labs(title="Validación: Real vs Predicho (M2)",
x="log(1 + Thickness_Loss) real",
y="log(1 + Thickness_Loss) predicho")