1 Carga de librerías y datos

##============================================================== ###
##                UNIVERSIDAD CENTRAL DEL ECUADOR                ###
##Facultad de Ingeniería en Geología,Minas, Petróleos y Ambiental ##
##===============================================================###
##Materia:Estadistica ======== Docente:Ing.Christian Mejía .E MSc.##
##===============================================================###
##   Utilización del Programa R-Studio en Estadistica             ##
###=============================================================####
##Expuesto- Proyecto Semestral                                    ##          
##===============================================================###
#Tema: Proyecto de Estadística en R                               ##
##===============================================================###
#Elaborado por: Jim Acuña                                         ##               
##===============================================================###

########################
###### Librerias #######
########################

library(dplyr)
library(ggplot2)
library(ggfortify)
library(tidyverse)
library(fdth)
library(lattice)
library(MASS)
library(PASWR)
library(magick)
library(readxl)
library(plotly)
library(psych)
library(car)
library(ggpmisc)
library(scatterplot3d)
library(corrplot)
library(GGally)
library(RSNNS)
library(broom)
#############################
#### Directorio de trabajo ##
#############################

setwd("D:/Personal/Escritorio/ESTADISTICA JIM/Kaggle")
Datos<-read.csv("market_pipe_thickness_loss_dataset_LIMPIO.csv", header = T, sep=";",dec =".", fileEncoding = "ISO-8859-1")
Datos <- na.omit(Datos)
str(Datos)
## 'data.frame':    1000 obs. of  11 variables:
##  $ Pipe_Size       : int  800 800 400 1500 1500 600 200 300 150 800 ...
##  $ Thickness       : num  15.5 22 12.1 38.7 24.3 ...
##  $ Material        : chr  "Carbon Steel" "PVC" "Carbon Steel" "Carbon Steel" ...
##  $ Grade           : chr  "ASTM A333 Grade 6" "ASTM A106 Grade B" "API 5L X52" "API 5L X42" ...
##  $ Max_Pressure    : int  300 150 2500 1500 1500 600 1500 900 300 150 ...
##  $ Temperature     : num  84.9 14.1 0.6 52.7 11.7 67.3 89.6 40.8 3.2 11.6 ...
##  $ Corrosion_Impact: num  16.04 7.38 2.12 5.58 12.29 ...
##  $ Thickness_Loss  : num  4.91 7.32 6.32 6.2 8.58 5.21 5.86 3.02 2.47 0.53 ...
##  $ Material_Loss   : num  31.7 33.3 52.5 16 35.3 ...
##  $ Time            : int  2 4 7 19 20 11 6 21 19 1 ...
##  $ Condition       : chr  "Moderate" "Critical" "Critical" "Critical" ...
names(Datos)
##  [1] "Pipe_Size"        "Thickness"        "Material"         "Grade"           
##  [5] "Max_Pressure"     "Temperature"      "Corrosion_Impact" "Thickness_Loss"  
##  [9] "Material_Loss"    "Time"             "Condition"
#View(Datos)
#Normalizacion de Variable Material Loss#
normalizacion<-normalizeData(Datos$Material_Loss,type ="0_1")
Datos$Material_Loss_norm<-as.data.frame(normalizacion)
summary(Datos$Material_Loss)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.08   15.66   31.66   46.75   61.03  318.75
summary(Datos$Material_Loss_norm)
##        V1         
##  Min.   :0.00000  
##  1st Qu.:0.04891  
##  Median :0.09910  
##  Mean   :0.14644  
##  3rd Qu.:0.19128  
##  Max.   :1.00000
#view(Datos)

######################################################################

2 Variables derivadas

# =========================================================
# VARIABLES DERIVADAS (Feature Engineering)
# =========================================================

Datos <- Datos %>%
  mutate(
    # Evitar divisiones por 0 o negativos
    Time_pos      = ifelse(Time > 0, Time, NA_real_),
    Thick_pos     = ifelse(Thickness > 0, Thickness, NA_real_),
    Size_pos      = ifelse(Pipe_Size > 0, Pipe_Size, NA_real_),

    # 1) Variables “fuertes” típicas: tasa y pérdidas relativas
    Loss_Rate     = Thickness_Loss / Time_pos,          # mm/año
    Loss_Rel      = Thickness_Loss / Thick_pos,         # fracción del espesor original
    Rem_Thick     = Thick_pos - Thickness_Loss,         # espesor remanente
    Rem_Rel       = Rem_Thick / Thick_pos,              # % remanente

    # 2) Índices compuestos
    Exposure_Index    = Time_pos * Corrosion_Impact,                # exposición acumulada
    Stress_Index      = (Max_Pressure * Size_pos) / Thick_pos,      # índice de carga
    Env_Index         = Corrosion_Impact * Temperature,             # severidad ambiental
    Load_Exposure     = Stress_Index * Exposure_Index,              # efecto combinado

    # 3) Transformaciones (para relaciones no lineales)
    log_Time      = log1p(Time),
    log_Loss      = log1p(Thickness_Loss),
    log_Stress    = log1p(Stress_Index)
  )

3 ETAPA 2: Regresiones

3.1 Correlacion entre variables base

# =========================================================
# ETAPA 2: REGRESIONES
# =========================================================

# Asegurar tipos de datos
if (is.data.frame(Datos$Material_Loss_norm)) Datos$Material_Loss_norm <- Datos$Material_Loss_norm[[1]]

Datos$Material   <- as.factor(Datos$Material)
Datos$Grade      <- as.factor(Datos$Grade)
Datos$Condition  <- as.factor(Datos$Condition)

PipeEDA <- Datos %>%
  dplyr::select(Pipe_Size, Thickness, Max_Pressure, Temperature,
                Corrosion_Impact, Time, Thickness_Loss, Material_Loss_norm) %>%
  na.omit()

# Correlaciones
cor_mat <- cor(PipeEDA, use = "complete.obs", method = "pearson")
corrplot::corrplot(cor_mat, method = "color", type = "upper", tl.cex = 0.8)

psych::pairs.panels(PipeEDA)

## Correlacion entre variables Derivadas

PipeFE <- Datos %>%
  dplyr::select(Thickness_Loss, Loss_Rate, Loss_Rel, Exposure_Index,
                Stress_Index, Env_Index, Load_Exposure, log_Time, log_Loss, log_Stress) %>%
  na.omit()

cor_mat_fe <- cor(PipeFE, use="complete.obs")
corrplot::corrplot(cor_mat_fe, method="color", type="upper", tl.cex=0.8)

psych::pairs.panels(PipeFE)

4 REGRESION LINEAL: y = a + bx

#####################################
# Regresion Lineal (y = mx + b)
#####################################

RLineal <- ggplot(data = Datos, aes(x = Thickness, y = Pipe_Size )) +
  geom_point(shape = 7, col = "salmon") +
  geom_smooth(method = lm, se = FALSE, formula = y ~ x) +
  stat_poly_eq(
    formula = y ~ x,
    aes(label = paste(..eq.label.., ..rr.label.., sep = "~~~")),
    parse = TRUE,
    eq.with.lhs = "hat(Y)~`=`~"
  ) +
  labs(
    title = paste("Graf: Dispersion de Diametro vs Espesor de la tuberia"),
    x = "Espesor (mm)",
    y = "Diametro de la tuberia (mm)",
    caption = "Jim Acuña"
  ) +
  theme(
    plot.title = element_text(size = rel(1.1), hjust = 0.5, face = "bold"),
    plot.caption = element_text(hjust = 0.5, face = "italic")
  )

RLineal

# Conjeturar el modelo matematico
x <- Datos$Thickness
y <- Datos$Pipe_Size
RegresionL <- lm(y ~ x)
RegresionL
## 
## Call:
## lm(formula = y ~ x)
## 
## Coefficients:
## (Intercept)            x  
##      -91.43        38.16
# Ecuación (coeficientes)
RegresionL$coefficients
## (Intercept)           x 
##   -91.43373    38.16422
# Resumen del modelo
summary(RegresionL)
## 
## Call:
## lm(formula = y ~ x)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -432.88 -107.86  -24.37   59.92  784.26 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -91.4337    10.7627  -8.495   <2e-16 ***
## x            38.1642     0.5599  68.164   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 186.7 on 998 degrees of freedom
## Multiple R-squared:  0.8232, Adjusted R-squared:  0.823 
## F-statistic:  4646 on 1 and 998 DF,  p-value: < 2.2e-16
# Indicadores (como tu ejemplo)
Covarianza <- cov(x, y, use = "complete.obs")
Covarianza
## [1] 4246.451
CoefPearson <- cor(x, y, use = "pairwise.complete.obs", method = "pearson")
CoefPearson <- round(CoefPearson, 4)
CoefPearson
## [1] 0.9073
coefDeterminacion <- round((CoefPearson^2) * 100, 2)
coefDeterminacion
## [1] 82.32

5 REGRESION POLINOMIAL: y = a + bx + cx^2

###############################################
# Regresion Polinomial (y = a + bx + cx^2)
###############################################

RPolinom <- ggplot(data = Datos, aes(x = Corrosion_Impact, y = Thickness_Loss)) +
  geom_point(shape = 7, col = "salmon") +
  geom_smooth(method = lm, se = FALSE, formula = y ~ x + I(x^2)) +
  stat_poly_eq(
    formula = y ~ x + I(x^2),
    aes(label = paste(..eq.label.., ..rr.label.., sep = "~~~")),
    parse = TRUE,
    eq.with.lhs = "hat(Y)~`=`~"
  ) +
  labs(
    title = paste("Graf: Dispersion de Thickness_Loss vs Corrosion_Impact (Polinomial)"),
    x = "Corrosion_Impact (%)",
    y = "Thickness_Loss (mm)",
    caption = "Jim Douglas"
  ) +
  theme(
    plot.title = element_text(size = rel(1.1), hjust = 0.5, face = "bold"),
    plot.caption = element_text(hjust = 0.5, face = "italic")
  )

RPolinom

# Modelo
x <- Datos$Corrosion_Impact
y <- Datos$Thickness_Loss
xcuad <- x^2

RegresionPOL <- lm(y ~ x + xcuad)
RegresionPOL
## 
## Call:
## lm(formula = y ~ x + xcuad)
## 
## Coefficients:
## (Intercept)            x        xcuad  
##   4.7966979    0.0190813   -0.0007482
summary(RegresionPOL)
## 
## Call:
## lm(formula = y ~ x + xcuad)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.8872 -2.5458  0.0126  2.5845  5.1243 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  4.7966979  0.2680739  17.893   <2e-16 ***
## x            0.0190813  0.0634655   0.301    0.764    
## xcuad       -0.0007482  0.0031036  -0.241    0.810    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.904 on 997 degrees of freedom
## Multiple R-squared:  0.0001313,  Adjusted R-squared:  -0.001874 
## F-statistic: 0.06548 on 2 and 997 DF,  p-value: 0.9366
# Extraer coeficientes (a,b,c)
a <- RegresionPOL$coefficients[1]
b <- RegresionPOL$coefficients[2]
c <- RegresionPOL$coefficients[3]
a; b; c
## (Intercept) 
##    4.796698
##          x 
## 0.01908131
##         xcuad 
## -0.0007482059
# Indicadores
yhat <- fitted(RegresionPOL)
CoefPearson <- round(cor(y, yhat, use="complete.obs"), 4)   # mejor que cor(y, x+x^2)
CoefPearson
## [1] 0.0115
coefDeterminacion <- round((CoefPearson^2) * 100, 2)
coefDeterminacion
## [1] 0.01

6 Regresion Multiple (y = b0 + b1x1 + b2x2 + …)

###############################################
# Regresion Multiple (y = b0 + b1x1 + b2x2 + ...)
###############################################

y  <- Datos$Thickness_Loss
x1 <- Datos$Time
x2 <- Datos$Corrosion_Impact
x3 <- Datos$Max_Pressure

datosMult <- data.frame(y, x1, x2, x3)
datosMult <- na.omit(datosMult)

# Diagrama de dispersion
plot(datosMult)

# Coeficiente de correlacion (matriz)
cor(datosMult)
##               y          x1           x2          x3
## y   1.000000000 -0.01260024  0.008547516  0.02602478
## x1 -0.012600243  1.00000000 -0.024244183  0.06199863
## x2  0.008547516 -0.02424418  1.000000000 -0.02462798
## x3  0.026024785  0.06199863 -0.024627979  1.00000000
# 3D (solo 2 predictores a la vez)
Reg3D <- scatterplot3d(x1, x2, y, angle = 55,
                       pch = 16, color = "blue",
                       xlab = "Time (años)",
                       ylab = "Corrosion_Impact (%)",
                       zlab = "Thickness_Loss (mm)",
                       main = "Diagrama 3D: Thickness_Loss vs Time y Corrosion_Impact")

# Modelo multiple (con 3 predictores)
RegresionM <- lm(y ~ x1 + x2 + x3, data = datosMult)
RegresionM
## 
## Call:
## lm(formula = y ~ x1 + x2 + x3, data = datosMult)
## 
## Coefficients:
## (Intercept)           x1           x2           x3  
##   4.820e+00   -5.719e-03    4.426e-03    9.678e-05
summary(RegresionM)
## 
## Call:
## lm(formula = y ~ x1 + x2 + x3, data = datosMult)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.0032 -2.5531  0.0079  2.5565  5.2083 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  4.820e+00  2.697e-01  17.875   <2e-16 ***
## x1          -5.719e-03  1.290e-02  -0.443    0.658    
## x2           4.426e-03  1.580e-02   0.280    0.780    
## x3           9.678e-05  1.133e-04   0.854    0.393    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.904 on 996 degrees of freedom
## Multiple R-squared:  0.0009588,  Adjusted R-squared:  -0.00205 
## F-statistic: 0.3186 on 3 and 996 DF,  p-value: 0.8119
# Plano 3D solo para el modelo con 2 predictores (x1 y x2)
RegresionM_2 <- lm(y ~ x1 + x2, data = datosMult)
Reg3D$plane3d(RegresionM_2, col = "red")

# Indicadores
yhat <- fitted(RegresionM)
CoefPearson <- round(cor(datosMult$y, yhat), 4)
CoefPearson
## [1] 0.031
coefDeterminacion <- round((CoefPearson^2) * 100, 2)
coefDeterminacion
## [1] 0.1

7 logaritmica

x <- Datos$Time
y <- Datos$Corrosion_Impact
df <- na.omit(data.frame(x,y))
df <- df[df$x > 0, ]  # requerido

RegLog_B <- lm(y ~ log(x), data=df)
summary(RegLog_B)
## 
## Call:
## lm(formula = y ~ log(x), data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -9.9550 -5.2721 -0.0363  5.0788 10.2694 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  10.0850     0.5564  18.124   <2e-16 ***
## log(x)       -0.1461     0.2264  -0.646    0.519    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.819 on 998 degrees of freedom
## Multiple R-squared:  0.0004173,  Adjusted R-squared:  -0.0005842 
## F-statistic: 0.4167 on 1 and 998 DF,  p-value: 0.5187
ggplot(df, aes(x=x, y=y)) +
  geom_point(shape=7, col="salmon") +
  stat_smooth(method="lm", formula = y ~ log(x), se=FALSE) +
  labs(title="Regresión Logarítmica: Corrosion_Impact vs Time",
       x="Time (years)", y="Corrosion_Impact", caption="Jim Douglas")

8 Exponencial

x <- Datos$Time
y <- Datos$Corrosion_Impact
df <- na.omit(data.frame(x,y))
df <- df[df$y > 0, ]  # requerido

RegExp_B_lin <- lm(log(y) ~ x, data=df)
a <- exp(coef(RegExp_B_lin)[1]); b <- coef(RegExp_B_lin)[2]
df$yhat <- a * exp(b * df$x)

R2_exp_B <- 1 - sum((df$y - df$yhat)^2) / sum((df$y - mean(df$y))^2)
R2_exp_B
## [1] -0.2016574
ggplot(df, aes(x=x, y=y)) +
  geom_point(shape=7, col="salmon") +
  geom_line(aes(y=df$yhat), linewidth=1) +
  labs(title="Regresión Exponencial: Corrosion_Impact vs Time",
       x="Time (years)", y="Corrosion_Impact", caption="Jim Douglas")

9 Potencial

x <- Datos$Time
y <- Datos$Corrosion_Impact
df <- na.omit(data.frame(x,y))
df <- df[df$x > 0 & df$y > 0, ]  # requerido

RegPot_B_lin <- lm(log(y) ~ log(x), data=df)
a <- exp(coef(RegPot_B_lin)[1]); b <- coef(RegPot_B_lin)[2]
df$yhat <- a * (df$x^b)

R2_pot_B <- 1 - sum((df$y - df$yhat)^2) / sum((df$y - mean(df$y))^2)
R2_pot_B
## [1] -0.2015753
ggplot(df, aes(x=x, y=y)) +
  geom_point(shape=7, col="salmon") +
  geom_line(aes(y=df$yhat), linewidth=1) +
  labs(title="Regresión Potencial: Corrosion_Impact vs Time",
       x="Time (years)", y="Corrosion_Impact", caption="Jim Douglas")

10 Regresiones complementarias

######################################
# Regresiones complementarias
######################################

#####################################
"Variables derivadas (para regresiones complementarias)"
## [1] "Variables derivadas (para regresiones complementarias)"
#####################################

# Crear variables derivadas

if(!("Stress_Index" %in% names(Datos))){
Datos$Stress_Index <- (Datos$Max_Pressure * Datos$Pipe_Size) / Datos$Thickness
}

if(!("Exposure_Index" %in% names(Datos))){
Datos$Exposure_Index <- Datos$Time * Datos$Corrosion_Impact
}

if(!("Env_Index" %in% names(Datos))){
Datos$Env_Index <- Datos$Corrosion_Impact * Datos$Temperature
}

#####################################
# "Regresión Complementaria: Thickness_Loss vs Stress_Index"
#####################################

RComp1 <- ggplot(data=Datos, aes(x=Stress_Index, y=Thickness_Loss)) +
geom_point(shape=7, col="salmon") +
geom_smooth(method=lm, se=FALSE, formula = y ~ x) +
stat_poly_eq(formula = y ~ x,
aes(label = paste(..eq.label.., ..rr.label.., sep="~~~")),
parse = TRUE, eq.with.lhs = "hat(Y)~`=`~") +
labs(title=paste("Graf: Dispersión de Thickness_Loss vs Stress_Index"),
x="Stress_Index",
y="Thickness_Loss (mm)",
caption="Jim Acuña") +
theme(plot.title = element_text(size=rel(1.1), hjust=0.5, face="bold"),
plot.caption = element_text(hjust=0.5))
RComp1

# --- x <- ; y <- ; lm() ---

x <- Datos$Stress_Index
y <- Datos$Thickness_Loss
df <- na.omit(data.frame(x,y))
x <- df$x; y <- df$y

RegComp1 <- lm(y ~ x)
RegComp1
## 
## Call:
## lm(formula = y ~ x)
## 
## Coefficients:
## (Intercept)            x  
##   4.786e+00    3.496e-06
# --- Coeficientes ---

RegComp1$coefficients
##  (Intercept)            x 
## 4.785891e+00 3.496481e-06
# --- Summary ---

summary(RegComp1)
## 
## Call:
## lm(formula = y ~ x)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.9740 -2.5586  0.0374  2.5401  5.1847 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 4.786e+00  1.331e-01  35.962   <2e-16 ***
## x           3.496e-06  3.357e-06   1.041    0.298    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.901 on 998 degrees of freedom
## Multiple R-squared:  0.001086,   Adjusted R-squared:  8.472e-05 
## F-statistic: 1.085 on 1 and 998 DF,  p-value: 0.2979
# --- cov / cor / %R2 ---

Covarianza <- cov(x, y)
Covarianza
## [1] 2613.286
CoefPearson <- cor(x, y, use="pairwise.complete.obs")
CoefPearson <- round(CoefPearson,4)
CoefPearson
## [1] 0.0329
coefDeterminacion <- CoefPearson^2 * 100
coefDeterminacion <- round(coefDeterminacion,2)
coefDeterminacion
## [1] 0.11
#####################################
"Regresión Complementaria: Thickness_Loss vs Exposure_Index"
## [1] "Regresión Complementaria: Thickness_Loss vs Exposure_Index"
#####################################

# --- Gráfico (ggplot + stat_poly_eq) ---

RComp2 <- ggplot(data=Datos, aes(x=Exposure_Index, y=Thickness_Loss)) +
geom_point(shape=7, col="salmon") +
geom_smooth(method=lm, se=FALSE, formula = y ~ x) +
stat_poly_eq(formula = y ~ x,
aes(label = paste(..eq.label.., ..rr.label.., sep="~~~")),
parse = TRUE, eq.with.lhs = "hat(Y)~`=`~") +
labs(title=paste("Graf: Dispersión de Thickness_Loss vs Exposure_Index"),
x="Exposure_Index (Time * Corrosion_Impact)",
y="Thickness_Loss (mm)",
caption="Jim Acuña") +
theme(plot.title = element_text(size=rel(1.1), hjust=0.5, face="bold"),
plot.caption = element_text(hjust=0.5))
RComp2

# --- x <- ; y <- ; lm() ---

x <- Datos$Exposure_Index
y <- Datos$Thickness_Loss
df <- na.omit(data.frame(x,y))
x <- df$x; y <- df$y

RegComp2 <- lm(y ~ x)
RegComp2
## 
## Call:
## lm(formula = y ~ x)
## 
## Coefficients:
## (Intercept)            x  
##   4.9313616   -0.0003596
# --- Coeficientes ---

RegComp2$coefficients
##   (Intercept)             x 
##  4.9313615785 -0.0003595942
# --- Summary ---

summary(RegComp2)
## 
## Call:
## lm(formula = y ~ x)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.8713 -2.5479  0.0366  2.5738  5.1490 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  4.9313616  0.1414714  34.858   <2e-16 ***
## x           -0.0003596  0.0008591  -0.419    0.676    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.902 on 998 degrees of freedom
## Multiple R-squared:  0.0001755,  Adjusted R-squared:  -0.0008263 
## F-statistic: 0.1752 on 1 and 998 DF,  p-value: 0.6756
# --- cov / cor / %R2 ---

Covarianza <- cov(x, y)
Covarianza
## [1] -4.107866
CoefPearson <- cor(x, y, use="pairwise.complete.obs")
CoefPearson <- round(CoefPearson,4)
CoefPearson
## [1] -0.0132
coefDeterminacion <- CoefPearson^2 * 100
coefDeterminacion <- round(coefDeterminacion,2)
coefDeterminacion
## [1] 0.02
#####################################
"Regresión Complementaria: Material_Loss_norm vs Thickness_Loss"
## [1] "Regresión Complementaria: Material_Loss_norm vs Thickness_Loss"
#####################################

# --- Gráfico (ggplot + stat_poly_eq) ---

RComp3 <- ggplot(data=Datos, aes(x=Thickness_Loss, y=Material_Loss_norm)) +
geom_point(shape=7, col="salmon") +
geom_smooth(method=lm, se=FALSE, formula = y ~ x) +
stat_poly_eq(formula = y ~ x,
aes(label = paste(..eq.label.., ..rr.label.., sep="~~~")),
parse = TRUE, eq.with.lhs = "hat(Y)~`=`~") +
labs(title=paste("Graf: Dispersión de Material_Loss_norm vs Thickness_Loss"),
x="Thickness_Loss (mm)",
y="Material_Loss_norm (0-1)",
caption="Jim Acuña") +
theme(plot.title = element_text(size=rel(1.1), hjust=0.5, face="bold"),
plot.caption = element_text(hjust=0.5))
RComp3

# --- x <- ; y <- ; lm() ---

x <- Datos$Thickness_Loss
y <- Datos$Material_Loss_norm
df <- na.omit(data.frame(x,y))
x <- df$x; y <- df$y

RegComp3 <- lm(y ~ x)
RegComp3
## 
## Call:
## lm(formula = y ~ x)
## 
## Coefficients:
## (Intercept)            x  
##    0.004357     0.029079
# --- Coeficientes ---

RegComp3$coefficients
## (Intercept)           x 
## 0.004356924 0.029078825
# --- Summary ---

summary(RegComp3)
## 
## Call:
## lm(formula = y ~ x)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.22660 -0.07032 -0.01436  0.03441  0.71387 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 0.004357   0.007406   0.588    0.556    
## x           0.029079   0.001303  22.310   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1195 on 998 degrees of freedom
## Multiple R-squared:  0.3328, Adjusted R-squared:  0.3321 
## F-statistic: 497.7 on 1 and 998 DF,  p-value: < 2.2e-16
# --- cov / cor / %R2 ---

Covarianza <- cov(x, y)
Covarianza
## [1] 0.2447423
CoefPearson <- cor(x, y, use="pairwise.complete.obs")
CoefPearson <- round(CoefPearson,4)
CoefPearson
## [1] 0.5769
coefDeterminacion <- CoefPearson^2 * 100
coefDeterminacion <- round(coefDeterminacion,2)
coefDeterminacion
## [1] 33.28
#####################################
"Regresión Complementaria: Thickness_Loss vs Env_Index"
## [1] "Regresión Complementaria: Thickness_Loss vs Env_Index"
#####################################

# --- Gráfico (ggplot + stat_poly_eq) ---

RComp4 <- ggplot(data=Datos, aes(x=Env_Index, y=Thickness_Loss)) +
geom_point(shape=7, col="salmon") +
geom_smooth(method=lm, se=FALSE, formula = y ~ x) +
stat_poly_eq(formula = y ~ x,
aes(label = paste(..eq.label.., ..rr.label.., sep="~~~")),
parse = TRUE, eq.with.lhs = "hat(Y)~`=`~") +
labs(title=paste("Graf: Dispersión de Thickness_Loss vs Env_Index"),
x="Env_Index (Corrosion_Impact * Temperature)",
y="Thickness_Loss (mm)",
caption="Jim Acuña") +
theme(plot.title = element_text(size=rel(1.1), hjust=0.5, face="bold"),
plot.caption = element_text(hjust=0.5))
RComp4

# --- x <- ; y <- ; lm() ---

x <- Datos$Env_Index
y <- Datos$Thickness_Loss
df <- na.omit(data.frame(x,y))
x <- df$x; y <- df$y

RegComp4 <- lm(y ~ x)
RegComp4
## 
## Call:
## lm(formula = y ~ x)
## 
## Coefficients:
## (Intercept)            x  
##   4.8333197    0.0001285
# --- Coeficientes ---

RegComp4$coefficients
##  (Intercept)            x 
## 4.8333196740 0.0001285222
# --- Summary ---

summary(RegComp4)
## 
## Call:
## lm(formula = y ~ x)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.9177 -2.5600  0.0298  2.5652  5.1715 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 4.8333197  0.1169535  41.327   <2e-16 ***
## x           0.0001285  0.0001759   0.731    0.465    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.902 on 998 degrees of freedom
## Multiple R-squared:  0.0005347,  Adjusted R-squared:  -0.0004668 
## F-statistic: 0.5339 on 1 and 998 DF,  p-value: 0.4652
# --- cov / cor / %R2 ---

Covarianza <- cov(x, y)
Covarianza
## [1] 35.01276
CoefPearson <- cor(x, y, use="pairwise.complete.obs")
CoefPearson <- round(CoefPearson,4)
CoefPearson
## [1] 0.0231
coefDeterminacion <- CoefPearson^2 * 100
coefDeterminacion <- round(coefDeterminacion,2)
coefDeterminacion
## [1] 0.05
# 0) Data de modelado
Dmod <- Datos %>%
  mutate(
    Material  = as.factor(Material),
    Condition = as.factor(Condition),
    Grade     = as.factor(Grade)
  ) %>%
  dplyr::select(
    Thickness_Loss, log_Loss, Time, log_Time, Temperature, Corrosion_Impact,
    Pipe_Size, Thickness, Max_Pressure,
    Stress_Index, log_Stress,
    Material, Condition, Grade
  ) %>% na.omit()

# Split simple train/test (para mostrar que sí “predice” algo)
set.seed(123)
id <- sample(seq_len(nrow(Dmod)), size = floor(0.7 * nrow(Dmod)))
train <- Dmod[id, ]
test  <- Dmod[-id, ]

rmse <- function(y, yhat) sqrt(mean((y - yhat)^2, na.rm = TRUE))

# ---------------------------------------------------------
# M1) Múltiple clásica
# ---------------------------------------------------------
M1 <- lm(log_Loss ~ log1p(Time) + Corrosion_Impact + Temperature + Max_Pressure +
           Material + Condition, data=train)

# ---------------------------------------------------------
# M2) Modelo mejorado (ingeniería): interacción + Stress_Index
# ---------------------------------------------------------
M2 <- lm(log_Loss ~ log1p(Time) * Corrosion_Impact + log1p(Stress_Index) +
           Temperature + Material + Condition, data=train)

# ---------------------------------------------------------
# M3) Alternativo con componentes de Stress
# ---------------------------------------------------------
M3 <- lm(log_Loss ~ log1p(Time) * Corrosion_Impact + log1p(Max_Pressure) +
           log1p(Pipe_Size) + log1p(Thickness) + Temperature +
           Material + Condition, data=train)

# ---------------------------------------------------------
# M4) Robusta 
# ---------------------------------------------------------
M4 <- MASS::rlm(log_Loss ~ log1p(Time) * Corrosion_Impact + log1p(Stress_Index) +
                  Temperature + Material + Condition, data=train)

# =========================
# 1) Resúmenes 
# =========================
summary(M1); summary(M2); summary(M3)
## 
## Call:
## lm(formula = log_Loss ~ log1p(Time) + Corrosion_Impact + Temperature + 
##     Max_Pressure + Material + Condition, data = train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.56136 -0.16734  0.01634  0.16569  0.48141 
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              2.162e+00  3.990e-02  54.199   <2e-16 ***
## log1p(Time)             -1.022e-02  1.201e-02  -0.851   0.3951    
## Corrosion_Impact        -8.616e-04  1.405e-03  -0.613   0.5400    
## Temperature              1.124e-04  2.005e-04   0.561   0.5752    
## Max_Pressure             7.807e-06  1.009e-05   0.774   0.4394    
## MaterialFiberglass      -4.138e-02  2.549e-02  -1.623   0.1050    
## MaterialHDPE            -1.975e-02  2.621e-02  -0.753   0.4514    
## MaterialPVC             -2.828e-02  2.626e-02  -1.077   0.2818    
## MaterialStainless Steel -5.259e-02  2.613e-02  -2.013   0.0445 *  
## ConditionModerate       -6.064e-01  1.899e-02 -31.938   <2e-16 ***
## ConditionNormal         -1.519e+00  2.169e-02 -70.016   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2165 on 689 degrees of freedom
## Multiple R-squared:  0.8801, Adjusted R-squared:  0.8784 
## F-statistic: 505.8 on 10 and 689 DF,  p-value: < 2.2e-16
## 
## Call:
## lm(formula = log_Loss ~ log1p(Time) * Corrosion_Impact + log1p(Stress_Index) + 
##     Temperature + Material + Condition, data = train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.56660 -0.16834  0.01774  0.16481  0.50488 
## 
## Coefficients:
##                                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                   2.0179118  0.0977065  20.653   <2e-16 ***
## log1p(Time)                   0.0091330  0.0224564   0.407   0.6844    
## Corrosion_Impact              0.0039890  0.0049066   0.813   0.4165    
## log1p(Stress_Index)           0.0108243  0.0077821   1.391   0.1647    
## Temperature                   0.0001075  0.0002004   0.537   0.5916    
## MaterialFiberglass           -0.0392402  0.0255051  -1.539   0.1244    
## MaterialHDPE                 -0.0178163  0.0261678  -0.681   0.4962    
## MaterialPVC                  -0.0280653  0.0263011  -1.067   0.2863    
## MaterialStainless Steel      -0.0529848  0.0260884  -2.031   0.0426 *  
## ConditionModerate            -0.6068276  0.0189713 -31.987   <2e-16 ***
## ConditionNormal              -1.5183967  0.0216765 -70.048   <2e-16 ***
## log1p(Time):Corrosion_Impact -0.0020473  0.0019852  -1.031   0.3028    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2163 on 688 degrees of freedom
## Multiple R-squared:  0.8805, Adjusted R-squared:  0.8786 
## F-statistic: 460.9 on 11 and 688 DF,  p-value: < 2.2e-16
## 
## Call:
## lm(formula = log_Loss ~ log1p(Time) * Corrosion_Impact + log1p(Max_Pressure) + 
##     log1p(Pipe_Size) + log1p(Thickness) + Temperature + Material + 
##     Condition, data = train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.58156 -0.16992  0.01777  0.16607  0.50704 
## 
## Coefficients:
##                                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                   2.0127880  0.0998046  20.167   <2e-16 ***
## log1p(Time)                   0.0098662  0.0224852   0.439   0.6610    
## Corrosion_Impact              0.0040414  0.0049104   0.823   0.4108    
## log1p(Max_Pressure)           0.0096244  0.0086226   1.116   0.2647    
## log1p(Pipe_Size)             -0.0012329  0.0226516  -0.054   0.9566    
## log1p(Thickness)              0.0202268  0.0366227   0.552   0.5809    
## Temperature                   0.0001028  0.0002005   0.513   0.6083    
## MaterialFiberglass           -0.0405477  0.0255414  -1.588   0.1129    
## MaterialHDPE                 -0.0180557  0.0262423  -0.688   0.4917    
## MaterialPVC                  -0.0283363  0.0263269  -1.076   0.2822    
## MaterialStainless Steel      -0.0525271  0.0261973  -2.005   0.0453 *  
## ConditionModerate            -0.6064869  0.0190471 -31.841   <2e-16 ***
## ConditionNormal              -1.5190478  0.0217242 -69.924   <2e-16 ***
## log1p(Time):Corrosion_Impact -0.0020390  0.0019865  -1.026   0.3050    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2164 on 686 degrees of freedom
## Multiple R-squared:  0.8807, Adjusted R-squared:  0.8785 
## F-statistic: 389.7 on 13 and 686 DF,  p-value: < 2.2e-16
# M4 no da summary igual a lm, pero sirve como “respaldo”
M4
## Call:
## rlm(formula = log_Loss ~ log1p(Time) * Corrosion_Impact + log1p(Stress_Index) + 
##     Temperature + Material + Condition, data = train)
## Converged in 6 iterations
## 
## Coefficients:
##                  (Intercept)                  log1p(Time) 
##                 2.0002855006                 0.0097759861 
##             Corrosion_Impact          log1p(Stress_Index) 
##                 0.0041885391                 0.0118986456 
##                  Temperature           MaterialFiberglass 
##                 0.0001198373                -0.0369308278 
##                 MaterialHDPE                  MaterialPVC 
##                -0.0196815060                -0.0240642554 
##      MaterialStainless Steel            ConditionModerate 
##                -0.0491227065                -0.6046579019 
##              ConditionNormal log1p(Time):Corrosion_Impact 
##                -1.5047231083                -0.0019870049 
## 
## Degrees of freedom: 700 total; 688 residual
## Scale estimate: 0.249
# =========================
# 2) VIF (colinealidad)
# =========================
vif(M1)
##                      GVIF Df GVIF^(1/(2*Df))
## log1p(Time)      1.012590  1        1.006276
## Corrosion_Impact 1.011106  1        1.005538
## Temperature      1.005053  1        1.002523
## Max_Pressure     1.015165  1        1.007554
## Material         1.048247  4        1.005907
## Condition        1.026648  2        1.006597
vif(M2)
##                                   GVIF Df GVIF^(1/(2*Df))
## log1p(Time)                   3.548530  1        1.883754
## Corrosion_Impact             12.351735  1        3.514504
## log1p(Stress_Index)           1.013115  1        1.006536
## Temperature                   1.005309  1        1.002651
## Material                      1.059740  4        1.007279
## Condition                     1.027705  2        1.006855
## log1p(Time):Corrosion_Impact 14.745812  1        3.840028
vif(M3)
##                                   GVIF Df GVIF^(1/(2*Df))
## log1p(Time)                   3.554422  1        1.885317
## Corrosion_Impact             12.360141  1        3.515699
## log1p(Max_Pressure)           1.021260  1        1.010574
## log1p(Pipe_Size)              7.721539  1        2.778766
## log1p(Thickness)              7.710039  1        2.776696
## Temperature                   1.005759  1        1.002875
## Material                      1.077986  4        1.009431
## Condition                     1.036947  2        1.009111
## log1p(Time):Corrosion_Impact 14.752440  1        3.840890
# =========================
# 3) Comparación con RMSE
# =========================

pred_M1 <- predict(M1, newdata=test)
pred_M2 <- predict(M2, newdata=test)
pred_M3 <- predict(M3, newdata=test)

tab_comp <- dplyr::bind_rows(
  broom::glance(M1) %>% mutate(Modelo="M1: log(TL) clásicos",
                               RMSE_train=rmse(train$log_Loss, fitted(M1)),
                               RMSE_test =rmse(test$log_Loss, pred_M1)),
  broom::glance(M2) %>% mutate(Modelo="M2: log(TL) interacción+stress",
                               RMSE_train=rmse(train$log_Loss, fitted(M2)),
                               RMSE_test =rmse(test$log_Loss, pred_M2)),
  broom::glance(M3) %>% mutate(Modelo="M3: log(TL) componentes",
                               RMSE_train=rmse(train$log_Loss, fitted(M3)),
                               RMSE_test =rmse(test$log_Loss, pred_M3))
) %>%
  dplyr::select(Modelo, r.squared, adj.r.squared, AIC, RMSE_train, RMSE_test)

tab_comp
# =========================
# 4) Diagnósticos
# =========================
par(mfrow=c(2,2)); plot(M2); par(mfrow=c(1,1))

# Gráfico real vs pred (muy presentable)
test$pred_M2 <- pred_M2
ggplot(test, aes(x=log_Loss, y=pred_M2)) +
  geom_point(alpha=0.4) +
  geom_smooth(method="lm", se=FALSE) +
  labs(title="Validación: Real vs Predicho (M2)",
       x="log(1 + Thickness_Loss) real",
       y="log(1 + Thickness_Loss) predicho")