En esta etapa se realiza un análisis de regresiones sobre un conjunto de datos de tuberías, con énfasis en:
lm()).ruta_archivo <- "market_pipe_thickness_loss_dataset_LIMPIO.xlsx"
df_raw <- read_excel(ruta_archivo) |> clean_names()
df_raw |> glimpse()
## Rows: 1,000
## Columns: 12
## $ pipe_size <dbl> 800, 800, 400, 1500, 1500, 600, 200, 300, 150, 800,…
## $ thickness <dbl> 15.48, 22.00, 12.05, 38.72, 24.32, 16.75, 9.94, 13.…
## $ material <chr> "Carbon Steel", "PVC", "Carbon Steel", "Carbon Stee…
## $ grade <chr> "ASTM A333 Grade 6", "ASTM A106 Grade B", "API 5L X…
## $ max_pressure <dbl> 300, 150, 2500, 1500, 1500, 600, 1500, 900, 300, 15…
## $ temperature <dbl> 84.9, 14.1, 0.6, 52.7, 11.7, 67.3, 89.6, 40.8, 3.2,…
## $ corrosion_impact <dbl> 16.04, 7.38, 2.12, 5.58, 12.29, 2.06, 1.34, 5.57, 1…
## $ thickness_loss <dbl> 4.91, 7.32, 6.32, 6.20, 8.58, 5.21, 5.86, 3.02, 2.4…
## $ material_loss <dbl> 31.72, 33.27, 52.45, 16.01, 35.28, 31.10, 58.95, 21…
## $ material_loss_norm <dbl> 0.0992876644, 0.1041516302, 0.1643392852, 0.0499890…
## $ time <dbl> 2, 4, 7, 19, 20, 11, 6, 21, 19, 1, 13, 6, 6, 13, 7,…
## $ condition <chr> "Moderate", "Critical", "Critical", "Critical", "Cr…
# Tipos de variables
tipos <- tibble(
variable = names(df_raw),
variable_es = pretty_var(variable),
clase = map_chr(df_raw, ~ paste(class(.x), collapse = ", "))
)
kbl2(tipos, caption = "Tipos de variables")
| variable | variable_es | clase |
|---|---|---|
| pipe_size | Diámetro de tubería | numeric |
| thickness | Espesor inicial (mm) | numeric |
| material | Material | character |
| grade | Grado | character |
| max_pressure | Presión máxima (psi) | numeric |
| temperature | Temperatura (°C) | numeric |
| corrosion_impact | Impacto de corrosión (%) | numeric |
| thickness_loss | Pérdida de espesor (mm) | numeric |
| material_loss | Pérdida de material (%) | numeric |
| material_loss_norm | Pérdida de material (normalizada) | numeric |
| time | Tiempo de servicio (años) | numeric |
| condition | Condición operacional | character |
# Conteo de N/A por variable
na_por_var <- tibble(
variable = names(df_raw),
variable_es = pretty_var(variable),
na = map_int(df_raw, ~ sum(is.na(.x)))
) |> arrange(desc(na))
kbl2(na_por_var, caption = "Valores N/A por variable")
| variable | variable_es | na |
|---|---|---|
| pipe_size | Diámetro de tubería | 0 |
| thickness | Espesor inicial (mm) | 0 |
| material | Material | 0 |
| grade | Grado | 0 |
| max_pressure | Presión máxima (psi) | 0 |
| temperature | Temperatura (°C) | 0 |
| corrosion_impact | Impacto de corrosión (%) | 0 |
| thickness_loss | Pérdida de espesor (mm) | 0 |
| material_loss | Pérdida de material (%) | 0 |
| material_loss_norm | Pérdida de material (normalizada) | 0 |
| time | Tiempo de servicio (años) | 0 |
| condition | Condición operacional | 0 |
# Duplicados (filas idénticas)
n_duplicadas <- df_raw |> duplicated() |> sum()
n_duplicadas
## [1] 0
summary(df_raw)
## pipe_size thickness material grade
## Min. : 50 Min. : 3.000 Length:1000 Length:1000
## 1st Qu.: 150 1st Qu.: 7.357 Class :character Class :character
## Median : 300 Median :12.930 Mode :character Mode :character
## Mean : 522 Mean :16.074
## 3rd Qu.: 800 3rd Qu.:23.027
## Max. :1500 Max. :49.530
## max_pressure temperature corrosion_impact thickness_loss
## Min. : 150 Min. :-50.00 Min. : 0.000 Min. :0.010
## 1st Qu.: 300 1st Qu.: 13.40 1st Qu.: 4.492 1st Qu.:2.365
## Median : 900 Median : 41.20 Median : 9.720 Median :4.915
## Mean :1004 Mean : 42.60 Mean : 9.746 Mean :4.886
## 3rd Qu.:1500 3rd Qu.: 69.15 3rd Qu.:14.832 3rd Qu.:7.433
## Max. :2500 Max. :149.70 Max. :20.000 Max. :9.990
## material_loss material_loss_norm time condition
## Min. : 0.08 Min. :0.00000 Min. : 1.00 Length:1000
## 1st Qu.: 15.66 1st Qu.:0.04891 1st Qu.: 7.00 Class :character
## Median : 31.66 Median :0.09910 Median :13.00 Mode :character
## Mean : 46.75 Mean :0.14644 Mean :12.96
## 3rd Qu.: 61.03 3rd Qu.:0.19128 3rd Qu.:19.00
## Max. :318.75 Max. :1.00000 Max. :25.00
# Separar numéricas y categóricas (robusto: evita conflictos tipo MASS::select)
vars_num <- names(df_raw)[vapply(df_raw, is.numeric, logical(1))]
vars_cat <- names(df_raw)[vapply(df_raw, function(x) is.factor(x) || is.character(x), logical(1))]
vars_num
## [1] "pipe_size" "thickness" "max_pressure"
## [4] "temperature" "corrosion_impact" "thickness_loss"
## [7] "material_loss" "material_loss_norm" "time"
vars_cat
## [1] "material" "grade" "condition"
# Resumen numérico ordenado
df_num <- df_raw[, vars_num, drop = FALSE]
res_num <- df_num |>
tidyr::pivot_longer(cols = dplyr::everything(),
names_to = "variable",
values_to = "valor") |>
dplyr::group_by(variable) |>
dplyr::summarise(
n = sum(!is.na(valor)),
`NA` = sum(is.na(valor)),
media = mean(valor, na.rm = TRUE),
sd = stats::sd(valor, na.rm = TRUE),
min = min(valor, na.rm = TRUE),
p25 = stats::quantile(valor, 0.25, na.rm = TRUE),
mediana = stats::median(valor, na.rm = TRUE),
p75 = stats::quantile(valor, 0.75, na.rm = TRUE),
max = max(valor, na.rm = TRUE),
.groups = "drop"
) |>
dplyr::arrange(dplyr::desc(sd)) |>
dplyr::mutate(variable_es = pretty_var(variable)) |>
dplyr::select(variable, variable_es, dplyr::everything())
kbl2(res_num, caption = "Resumen estadístico de variables numéricas")
| variable | variable_es | n | NA | media | sd | min | p25 | mediana | p75 | max |
|---|---|---|---|---|---|---|---|---|---|---|
| max_pressure | Presión máxima (psi) | 1000 | 0 | 1004.1000000 | 812.8430992 | 150.00 | 300.0000000 | 900.0000000 | 1500.0000000 | 2500.00 |
| pipe_size | Diámetro de tubería | 1000 | 0 | 522.0000000 | 443.7035867 | 50.00 | 150.0000000 | 300.0000000 | 800.0000000 | 1500.00 |
| material_loss | Pérdida de material (%) | 1000 | 0 | 46.7475600 | 46.6025532 | 0.08 | 15.6650000 | 31.6600000 | 61.0350000 | 318.75 |
| temperature | Temperatura (°C) | 1000 | 0 | 42.5956000 | 41.1270711 | -50.00 | 13.4000000 | 41.2000000 | 69.1500000 | 149.70 |
| thickness | Espesor inicial (mm) | 1000 | 0 | 16.0735300 | 10.5483579 | 3.00 | 7.3575000 | 12.9300000 | 23.0275000 | 49.53 |
| time | Tiempo de servicio (años) | 1000 | 0 | 12.9610000 | 7.1359934 | 1.00 | 7.0000000 | 13.0000000 | 19.0000000 | 25.00 |
| corrosion_impact | Impacto de corrosión (%) | 1000 | 0 | 9.7459900 | 5.8174916 | 0.00 | 4.4925000 | 9.7200000 | 14.8325000 | 20.00 |
| thickness_loss | Pérdida de espesor (mm) | 1000 | 0 | 4.8863000 | 2.9011226 | 0.01 | 2.3650000 | 4.9150000 | 7.4325000 | 9.99 |
| material_loss_norm | Pérdida de material (normalizada) | 1000 | 0 | 0.1464448 | 0.1462408 | 0.00 | 0.0489064 | 0.0990994 | 0.1912794 | 1.00 |
| material | n |
|---|---|
| Fiberglass | 219 |
| Carbon Steel | 210 |
| Stainless Steel | 201 |
| PVC | 186 |
| HDPE | 184 |
| grade | n |
|---|---|
| ASTM A333 Grade 6 | 228 |
| ASTM A106 Grade B | 212 |
| API 5L X52 | 191 |
| API 5L X42 | 186 |
| API 5L X65 | 183 |
| condition | n |
|---|---|
| Critical | 487 |
| Moderate | 299 |
| Normal | 214 |
En muchos casos, las variables originales no muestran correlaciones claras. Por ello, se crean índices derivados (pérdidas relativas, exposición, esfuerzo, etc.) que suelen mejorar la interpretación y el ajuste.
eps <- 1e-9
# Asegurar que las variables clave sean numéricas
vars_num <- intersect(
c("thickness","thickness_loss","time","material_loss","max_pressure","temperature","corrosion_impact","pipe_size"),
names(df_raw)
)
df <- df_raw |>
mutate(across(all_of(vars_num), ~ suppressWarnings(as.numeric(.x)))) |>
mutate(
material_loss_norm = if ("material_loss_norm" %in% names(df_raw)) as.numeric(material_loss_norm)
else scales::rescale(material_loss, to = c(0, 1)),
# ====== Variables derivadas ======
remaining_thickness = pmax(thickness - thickness_loss, 0),
loss_rel = thickness_loss / pmax(thickness, eps), # pérdida relativa
loss_rate = thickness_loss / pmax(time, eps), # pérdida por año
material_loss_rate = material_loss / pmax(time, eps),
pressure_thickness = max_pressure / pmax(thickness, eps), # carga de presión sobre pared
pressure_size = max_pressure / pmax(pipe_size, eps),
temp_pressure = temperature * max_pressure,
corrosion_pressure = corrosion_impact * max_pressure,
# Índice compuesto de exposición: presión + temp + corrosión + tiempo
exposure_index = as.numeric(scale(max_pressure)) +
as.numeric(scale(temperature)) +
as.numeric(scale(corrosion_impact)) +
as.numeric(scale(time)),
# Un índice de "esfuerzo" (heurístico) que crece con presión y diámetro, y cae con espesor
stress_index = (max_pressure * pipe_size) / pmax(thickness^2, eps),
load_exposure = exposure_index * loss_rate
) |>
# ====== Variables adicionales para modelos ======
mutate(
corr_time = corrosion_impact * time,
exposure_rate = exposure_index / pmax(time, eps)
) |>
# ====== Transformaciones de escala (para no linealidad) ======
mutate(
log_time = log1p(pmax(time, 0)),
log_pressure = log1p(pmax(max_pressure, 0)),
log_stress = log1p(pmax(stress_index, 0)),
log_temp = log1p(pmax(temperature, 0)),
log_corr = log1p(pmax(corrosion_impact, 0)),
inv_thickness = 1 / pmax(thickness, eps),
inv_sqrt_thickness = 1 / sqrt(pmax(thickness, eps)),
inv_time = 1 / pmax(time, eps),
inv_sqrt_time = 1 / sqrt(pmax(time, eps)),
sqrt_pressure = sqrt(pmax(max_pressure, 0)),
sqrt_corr = sqrt(pmax(corrosion_impact, 0))
)
glimpse(df)
## Rows: 1,000
## Columns: 36
## $ pipe_size <dbl> 800, 800, 400, 1500, 1500, 600, 200, 300, 150, 800…
## $ thickness <dbl> 15.48, 22.00, 12.05, 38.72, 24.32, 16.75, 9.94, 13…
## $ material <chr> "Carbon Steel", "PVC", "Carbon Steel", "Carbon Ste…
## $ grade <chr> "ASTM A333 Grade 6", "ASTM A106 Grade B", "API 5L …
## $ max_pressure <dbl> 300, 150, 2500, 1500, 1500, 600, 1500, 900, 300, 1…
## $ temperature <dbl> 84.9, 14.1, 0.6, 52.7, 11.7, 67.3, 89.6, 40.8, 3.2…
## $ corrosion_impact <dbl> 16.04, 7.38, 2.12, 5.58, 12.29, 2.06, 1.34, 5.57, …
## $ thickness_loss <dbl> 4.91, 7.32, 6.32, 6.20, 8.58, 5.21, 5.86, 3.02, 2.…
## $ material_loss <dbl> 31.72, 33.27, 52.45, 16.01, 35.28, 31.10, 58.95, 2…
## $ material_loss_norm <dbl> 0.0992876644, 0.1041516302, 0.1643392852, 0.049989…
## $ time <dbl> 2, 4, 7, 19, 20, 11, 6, 21, 19, 1, 13, 6, 6, 13, 7…
## $ condition <chr> "Moderate", "Critical", "Critical", "Critical", "C…
## $ remaining_thickness <dbl> 10.57, 14.68, 5.73, 32.52, 15.74, 11.54, 4.08, 10.…
## $ loss_rel <dbl> 0.317183463, 0.332727273, 0.524481328, 0.160123967…
## $ loss_rate <dbl> 2.455000000, 1.830000000, 0.902857143, 0.326315789…
## $ material_loss_rate <dbl> 15.86000000, 8.31750000, 7.49285714, 0.84263158, 1…
## $ pressure_thickness <dbl> 19.379845, 6.818182, 207.468880, 38.739669, 61.677…
## $ pressure_size <dbl> 0.3750, 0.1875, 6.2500, 1.0000, 1.0000, 1.0000, 7.…
## $ temp_pressure <dbl> 25470, 2115, 1500, 79050, 17550, 40380, 134400, 36…
## $ corrosion_pressure <dbl> 4812, 1107, 5300, 8370, 18435, 1236, 2010, 5013, 4…
## $ exposure_index <dbl> -0.29169689, -3.40607299, -1.32700287, 0.98592688,…
## $ stress_index <dbl> 1001.5424, 247.9339, 6886.9338, 1500.7620, 3804.13…
## $ load_exposure <dbl> -0.71611587, -6.23311357, -1.19809402, 0.32172351,…
## $ corr_time <dbl> 32.08, 29.52, 14.84, 106.02, 245.80, 22.66, 8.04, …
## $ exposure_rate <dbl> -0.145848447, -0.851518247, -0.189571838, 0.051890…
## $ log_time <dbl> 1.0986123, 1.6094379, 2.0794415, 2.9957323, 3.0445…
## $ log_pressure <dbl> 5.707110, 5.017280, 7.824446, 7.313887, 7.313887, …
## $ log_stress <dbl> 6.910294, 5.517187, 8.837526, 7.314394, 8.244105, …
## $ log_temp <dbl> 4.4531838, 2.7146947, 0.4700036, 3.9834130, 2.5416…
## $ log_corr <dbl> 2.8355635, 2.1258479, 1.1378330, 1.8840347, 2.5870…
## $ inv_thickness <dbl> 0.06459948, 0.04545455, 0.08298755, 0.02582645, 0.…
## $ inv_sqrt_thickness <dbl> 0.2541643, 0.2132007, 0.2880756, 0.1607061, 0.2027…
## $ inv_time <dbl> 0.50000000, 0.25000000, 0.14285714, 0.05263158, 0.…
## $ inv_sqrt_time <dbl> 0.7071068, 0.5000000, 0.3779645, 0.2294157, 0.2236…
## $ sqrt_pressure <dbl> 17.32051, 12.24745, 50.00000, 38.72983, 38.72983, …
## $ sqrt_corr <dbl> 4.004997, 2.716616, 1.456022, 2.362202, 3.505710, …
Este bloque da una visión global de posibles relaciones (para sugerir regresiones) usando:
Importante: algunas variables derivadas están definidas por fórmula (por ejemplo
load_exposure = exposure_index * loss_rate), así que pueden producir correlaciones altas por construcción.
vars_base <- c("thickness","thickness_loss","time","max_pressure","temperature","corrosion_impact","pipe_size","material_loss")
vars_deriv <- c(
"remaining_thickness","loss_rel","loss_rate","material_loss_rate",
"pressure_thickness","pressure_size","temp_pressure","corrosion_pressure",
"exposure_index","stress_index","load_exposure",
"corr_time","exposure_rate",
"log_time","log_pressure","log_stress","log_temp","log_corr",
"inv_thickness","inv_sqrt_thickness","inv_time","inv_sqrt_time",
"sqrt_pressure","sqrt_corr",
"material_loss_norm"
)
vars_all <- intersect(c(vars_base, vars_deriv), names(df))
df_num <- df |>
dplyr::select(all_of(vars_all)) |>
dplyr::select(where(is.numeric)) |>
tidyr::drop_na()
M <- cor(df_num, use = "pairwise.complete.obs")
colnames(M) <- pretty_var(colnames(M))
rownames(M) <- pretty_var(rownames(M))
corrplot::corrplot(
M,
method = "color",
type = "upper",
order = "hclust",
tl.col = "black",
tl.cex = 0.6,
addCoef.col = "black",
number.cex = 0.4
)
mtext(caption_autor, side = 1, line = 4, adj = 1, cex = 0.8, font = 3)
vars_small <- intersect(c(
"thickness","thickness_loss","time","max_pressure","temperature","corrosion_impact",
"loss_rate","load_exposure","exposure_index","stress_index","inv_sqrt_thickness","exposure_rate"
), names(df))
if (requireNamespace("GGally", quietly = TRUE)) {
df_plot <- df |>
dplyr::select(all_of(vars_small)) |>
tidyr::drop_na() |>
dplyr::rename_with(pretty_var)
GGally::ggpairs(
df_plot,
progress = FALSE
) + ggplot2::labs(caption = caption_autor)
} else {
cat("Paquete 'GGally' no está instalado. Instala ('GGally')\n")
}
Después de revisar el corrplot y la dispersión, se seleccionaron 3 pares que permiten un análisis bivariado claro (según forma del gráfico y métricas):
Nota metodológica:
remaining_thickness,loss_rate,load_exposure,exposure_rateycorr_timeincluyen variables derivadas por construcción; por eso se debe interpretar con cautela (un \(R^2\) alto puede venir de la fórmula).
pearson_manual <- function(x, y) {
x <- as.numeric(x); y <- as.numeric(y)
ok <- is.finite(x) & is.finite(y)
x <- x[ok]; y <- y[ok]
n <- length(x)
Sx <- sum(x); Sy <- sum(y); Sxy <- sum(x*y); Sx2 <- sum(x^2); Sy2 <- sum(y^2)
r <- (n*Sxy - Sx*Sy) / sqrt((n*Sx2 - Sx^2) * (n*Sy2 - Sy^2))
b <- (n*Sxy - Sx*Sy) / (n*Sx2 - Sx^2)
a <- mean(y) - b*mean(x)
tibble::tibble(n, `Σx`=Sx, `Σy`=Sy, `Σxy`=Sxy, `Σx²`=Sx2, `Σy²`=Sy2, r=r, `R²=r²`=r^2, a=a, b=b)
}
rmse <- function(y, yhat) sqrt(mean((y - yhat)^2))
mae <- function(y, yhat) mean(abs(y - yhat))
En un análisis de regresión no basta con reportar el coeficiente de determinación \(R^2\). También se deben revisar diagnósticos que ayudan a evaluar si el modelo es estable e interpretable:
El VIF (Factor de Inflación de la Varianza)
cuantifica multicolinealidad entre predictores en una
regresión múltiple.
En términos simples, mide cuánto se “infla” la varianza (incertidumbre)
de un coeficiente \(\beta_j\) cuando el
predictor \(x_j\) está correlacionado
con los demás.
¿Por qué importa? Si existe multicolinealidad alta, el modelo puede mantener un buen \(R^2\) pero los coeficientes (signos y magnitudes) pueden cambiar mucho con pequeñas variaciones de datos, afectando la interpretación.
La distancia de Cook identifica observaciones influyentes: puntos que, por ser atípicos (residuos grandes) y/o tener alto apalancamiento (leverage), pueden cambiar notablemente los coeficientes del modelo al incluirlos o excluirlos.
Un umbral práctico muy usado es:
\[ D_i > \frac{4}{n} \]
donde \(n\) es el tamaño de muestra.
¿Por qué importa? Detectar puntos influyentes permite: - justificar técnicamente si un dato es válido (condición extrema real) o un error, - comparar el ajuste antes y después de retirar influyentes, - evitar conclusiones engañosas por pocos puntos dominantes.
d <- df |> dplyr::select(x = thickness, y = remaining_thickness, material) |> tidyr::drop_na()
n <- nrow(d)
# Gráfico EDA + conjetura (recta)
p0 <- ggplot2::ggplot(d, ggplot2::aes(x = x, y = y)) +
ggplot2::geom_point(alpha = 0.55) +
ggplot2::geom_smooth(method = "lm", se = TRUE) +
ggplot2::labs(
title = "EDA: Espesor remanente vs Espesor inicial",
subtitle = "Conjetura: relación lineal (espesor remanente = espesor − pérdida)",
x = pretty_var("thickness"),
y = pretty_var("remaining_thickness"),
caption = caption_autor
) +
ggplot2::theme_minimal()
# Mostrar ecuación y R² sobre la gráfica
if (requireNamespace("ggpmisc", quietly = TRUE)) {
p0 <- p0 +
ggpmisc::stat_poly_eq(
formula = y ~ x,
aes(label = paste(..eq.label.., ..rr.label.., sep = "~~~")),
parse = TRUE,
label.x.npc = "left",
label.y.npc = "top",
size = 3.4
)
}
p0
# Cálculo “a mano” (Pearson + recta lineal)
manual0 <- pearson_manual(d$x, d$y)
kbl2(manual0, digits = 5,
caption = "Cálculo: Σ, r de Pearson, R²=r² y recta lineal (a + b·x)")
| n | Σx | Σy | Σxy | Σx² | Σy² | r | R²=r² | a | b |
|---|---|---|---|---|---|---|---|---|---|
| 1000 | 16073.53 | 11455.13 | 290758.4 | 369515 | 240737 | 0.96647 | 0.93406 | -3.96442 | 0.95931 |
# Ajuste formal lineal
mod0 <- stats::lm(y ~ x, data = d)
sum0 <- summary(mod0)
sum0
##
## Call:
## stats::lm(formula = y ~ x, data = d)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.3249 -2.1497 -0.1333 2.1424 5.6696
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.964423 0.155096 -25.56 <2e-16 ***
## x 0.959313 0.008068 118.90 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.69 on 998 degrees of freedom
## Multiple R-squared: 0.9341, Adjusted R-squared: 0.934
## F-statistic: 1.414e+04 on 1 and 998 DF, p-value: < 2.2e-16
# Métricas
yhat0 <- predict(mod0, newdata = d)
met0 <- tibble::tibble(
Modelo = "Lineal: remaining_thickness ~ thickness",
n = n,
R2 = sum0$r.squared,
R2_aj = sum0$adj.r.squared,
RMSE = rmse(d$y, yhat0),
MAE = mae(d$y, yhat0),
AIC = AIC(mod0)
)
kbl2(met0, digits = 5, caption = "Métricas del modelo 0")
| Modelo | n | R2 | R2_aj | RMSE | MAE | AIC |
|---|---|---|---|---|---|---|
| Lineal: remaining_thickness ~ thickness | 1000 | 0.93406 | 0.93399 | 2.6873 | 2.27202 | 4820.953 |
# Coeficientes (ecuación)
coef0 <- coef(mod0)
coef_tab0 <- tibble::tibble(Termino = names(coef0), Coef = as.numeric(coef0))
kbl2(coef_tab0, digits = 6, caption = "Coeficientes del modelo 0")
| Termino | Coef |
|---|---|
| (Intercept) | -3.964423 |
| x | 0.959313 |
# Predicción (x0 = percentil 75 de thickness)
x0 <- as.numeric(stats::quantile(d$x, 0.75))
pred0 <- predict(mod0, newdata = tibble::tibble(x = x0), interval = "prediction")
pred0
## fit lwr upr
## 1 18.12617 12.84369 23.40865
# Optimización simple: retirar influyentes por Cook y reevaluar
umbral <- 4 / n
idx_inf <- which(stats::cooks.distance(mod0) > umbral)
# Gráfico: Diagnóstico de Cook
#plot(stats::cooks.distance(mod0),
# type = "h",
# main = paste0("Diagnóstico: Distancia de Cook (", "Modelo 0", ")"),
# ylab = "Distancia de Cook")
#abline(h = umbral, lty = 2)
#mtext(caption_autor, side = 1, line = 4, adj = 1, cex = 0.8, font = 3)
if (length(idx_inf) > 0) {
d2 <- d[-idx_inf, , drop = FALSE]
mod0b <- stats::lm(y ~ x, data = d2)
yhat0b <- predict(mod0b, newdata = d2)
met0b <- tibble::tibble(
Escenario = c("Original", "Sin influyentes (Cook)"),
n = c(nrow(d), nrow(d2)),
R2 = c(summary(mod0)$r.squared, summary(mod0b)$r.squared),
RMSE = c(rmse(d$y, yhat0), rmse(d2$y, yhat0b)),
MAE = c(mae(d$y, yhat0), mae(d2$y, yhat0b))
)
kbl2(met0b, digits = 5, caption = "Reevaluación modelo 0 (Cook)")
}
| Escenario | n | R2 | RMSE | MAE |
|---|---|---|---|---|
| Original | 1000 | 0.93406 | 2.68730 | 2.27202 |
| Sin influyentes (Cook) | 959 | 0.92533 | 2.57931 | 2.17610 |
¿Cuál sería el espesor remanente estimado si se propone un espesor inicial de X?
m0 <- coef(mod0)[["x"]]
b0 <- coef(mod0)[["(Intercept)"]]
# Usamos como X propuesto el percentil 75 del espesor inicial
x_prop0 <- round(as.numeric(stats::quantile(d$x, 0.75)), 4)
# Cálculo directo con la ecuación: ŷ = b + m·x
y_est0 <- m0*x_prop0 + b0
y_est0
## [1] 18.12617
# Porcentaje de confianza
r0 <- suppressWarnings(stats::cor(d$y, yhat0, use = "complete.obs"))
conf0 <- 100 * (r0^2)
conf0_txt <- ifelse(is.finite(conf0), sprintf("%.2f", conf0), "N/A")
# Ecuación
eq0 <- paste0("ŷ = ", sprintf("%.4f", b0), ifelse(m0 >= 0, " + ", " - "), sprintf("%.4f", abs(m0)), "·x")
CONCLUSIONES (Modelo 0): Entre remaining_thickness (espesor remanente) y thickness (espesor inicial) existe una relación aproximadamente lineal. La ecuación estimada es ŷ = -3.9644 + 0.9593·x. Podemos afirmar con un nivel de confianza del 93.41% que bajo las condiciones del conjunto de datos, si se propone thickness = 23.0275, se espera remaining_thickness ≈ 18.1262 (aprox.).
d <- df |> dplyr::select(x = load_exposure, y = loss_rate, material) |> tidyr::drop_na()
n <- nrow(d)
ggplot2::ggplot(d, ggplot2::aes(x = x, y = y)) +
ggplot2::geom_point(alpha = 0.55) +
ggplot2::geom_smooth(method = "loess", se = TRUE) +
ggplot2::labs(
title = "EDA: Tasa de pérdida vs Carga de exposición",
subtitle = "Conjetura: curvatura marcada → polinomio",
x = pretty_var("load_exposure"),
y = pretty_var("loss_rate"),
caption = caption_autor
) +
ggplot2::theme_minimal()
# Cálculo Pearson + recta base
manual1 <- pearson_manual(d$x, d$y)
kbl2(manual1, digits = 5, caption = "Cálculo: Σ, r de Pearson, R²=r²")
| n | Σx | Σy | Σxy | Σx² | Σy² | r | R²=r² | a | b |
|---|---|---|---|---|---|---|---|---|---|
| 1000 | -681.3953 | 756.1608 | -3028.532 | 10566.45 | 2156.801 | -0.62808 | 0.39449 | 0.58664 | -0.24879 |
# Ajuste polinómico grado 4 (con centrado para estabilidad numérica)
mx <- mean(d$x)
d <- d |> dplyr::mutate(xc = x - mx)
mod1 <- stats::lm(y ~ poly(xc, 4, raw = TRUE), data = d)
sum1 <- summary(mod1)
# Métricas
yhat1 <- predict(mod1, newdata = d)
met1 <- tibble::tibble(
Modelo = "Polinómico grado 4 (x centrada)",
n = n,
R2 = sum1$r.squared,
R2_aj = sum1$adj.r.squared,
RMSE = rmse(d$y, yhat1),
MAE = mae(d$y, yhat1),
AIC = AIC(mod1)
)
kbl2(met1, digits = 5, caption = "Métricas del modelo 1")
| Modelo | n | R2 | R2_aj | RMSE | MAE | AIC |
|---|---|---|---|---|---|---|
| Polinómico grado 4 (x centrada) | 1000 | 0.60857 | 0.607 | 0.78767 | 0.42789 | 2372.52 |
# Ecuación (en términos de xc = x - mean(x))
coef1 <- coef(mod1)
coef_tab <- tibble::tibble(
Termino = names(coef1),
Coef = as.numeric(coef1)
)
kbl2(coef_tab, digits = 6, caption = "Coeficientes del polinomio (sobre xc)")
| Termino | Coef |
|---|---|
| (Intercept) | 0.591657 |
| poly(xc, 4, raw = TRUE)1 | -0.150763 |
| poly(xc, 4, raw = TRUE)2 | 0.026507 |
| poly(xc, 4, raw = TRUE)3 | 0.000615 |
| poly(xc, 4, raw = TRUE)4 | -0.000008 |
# Gráfico con curva polinómica
p1 <- ggplot2::ggplot(d, ggplot2::aes(x = x, y = y)) +
ggplot2::geom_point(alpha = 0.55) +
ggplot2::stat_smooth(method = "lm", formula = y ~ poly(I(x - mx), 4, raw = TRUE), se = TRUE) +
ggplot2::labs(
title = "Ajuste polinómico grado 4",
subtitle = "Curva estimada sobre x centrada",
x = pretty_var("load_exposure"),
y = pretty_var("loss_rate"),
caption = caption_autor
) +
ggplot2::theme_minimal()
p1
# Predicción (x0 = percentil 75)
x0 <- as.numeric(stats::quantile(d$x, 0.75))
pred1 <- predict(mod1, newdata = tibble::tibble(xc = x0 - mx), interval = "prediction")
pred1
## fit lwr upr
## 1 0.4662758 -1.084194 2.016745
# Optimización simple: retirar influyentes por Cook y reevaluar
umbral <- 4 / n
idx_inf <- which(stats::cooks.distance(mod1) > umbral)
# Gráfico: Diagnóstico de Cook
#plot(stats::cooks.distance(mod1),
# type = "h",
# main = paste0("Diagnóstico: Distancia de Cook (", "Modelo 1", ")"),
# ylab = "Distancia de Cook")
#abline(h = umbral, lty = 2)
#mtext(caption_autor, side = 1, line = 4, adj = 1, cex = 0.8, font = 3)
if (length(idx_inf) > 0) {
d2 <- d[-idx_inf, , drop = FALSE]
mod1b <- stats::lm(y ~ poly(xc, 4, raw = TRUE), data = d2)
yhat1b <- predict(mod1b, newdata = d2)
met1b <- tibble::tibble(
Escenario = c("Original", "Sin influyentes (Cook)"),
n = c(nrow(d), nrow(d2)),
R2 = c(summary(mod1)$r.squared, summary(mod1b)$r.squared),
RMSE = c(rmse(d$y, yhat1), rmse(d2$y, yhat1b)),
MAE = c(mae(d$y, yhat1), mae(d2$y, yhat1b))
)
kbl2(met1b, digits = 5, caption = "Reevaluación modelo 1 (Cook)")
}
| Escenario | n | R2 | RMSE | MAE |
|---|---|---|---|---|
| Original | 1000 | 0.60857 | 0.78767 | 0.42789 |
| Sin influyentes (Cook) | 945 | 0.53494 | 0.37587 | 0.25378 |
¿Cuál sería el loss_rate estimado si se propone un load_exposure de X?
coefs1 <- coef(mod1)
x_prop1 <- round(as.numeric(stats::quantile(d$x, 0.75)), 4)
xc_prop1 <- x_prop1 - mx
# Cálculo directo con la ecuación polinómica sobre xc = x - media(x)
# ŷ = a0 + a1·xc + a2·xc^2 + a3·xc^3 + a4·xc^4
pows1 <- 0:4
y_est1 <- sum(coefs1 * (xc_prop1^pows1))
y_est1
## [1] 0.4662728
r1 <- suppressWarnings(stats::cor(d$y, yhat1, use = "complete.obs"))
conf1 <- 100 * (r1^2)
conf1_txt <- ifelse(is.finite(conf1), sprintf("%.2f", conf1), "N/A")
conf1_res <- round(100-conf1,4)
conf1_res_txt <- ifelse(is.finite(conf1_res), sprintf("%.2f", conf1_res), "N/A")
fmt_term <- function(cc, lab) {
if (!is.finite(cc)) return("")
paste0(ifelse(cc >= 0, " + ", " - "), sprintf("%.4f", abs(cc)), lab)
}
eq1 <- paste0(
"ŷ = ", sprintf("%.4f", coefs1[1]),
fmt_term(coefs1[2], "·xc"),
fmt_term(coefs1[3], "·xc^2"),
fmt_term(coefs1[4], "·xc^3"),
fmt_term(coefs1[5], "·xc^4"),
", con xc = x - ", sprintf("%.4f", mx)
)
CONCLUSIONES (Modelo 1): Entre loss_rate y load_exposure se observa una relación no lineal, por lo que se ajustó un polinomio de grado 4. La ecuación (en términos de la variable centrada) es ŷ = 0.5917 - 0.1508·xc + 0.0265·xc^2 + 0.0006·xc^3 - 0.0000·xc^4, con xc = x - -0.6814. Podemos afirmar con un nivel de confianza del 60.86%. que bajo las condiciones del conjunto de datos, si se propone load_exposure = 0.3368, se estima loss_rate ≈ 0.4663 (aprox.). Por tanto, el 39.14%** se explica con otras variables.
d <- df |> dplyr::select(x = corr_time, y = exposure_rate, material) |> tidyr::drop_na()
n <- nrow(d)
ggplot2::ggplot(d, ggplot2::aes(x = x, y = y)) +
ggplot2::geom_point(alpha = 0.55) +
ggplot2::geom_smooth(method = "loess", se = TRUE) +
ggplot2::labs(
title = "EDA: Tasa de exposición vs Corrosión×tiempo",
subtitle = "Conjetura: curvatura compleja → polinomio grado 5",
x = paste0(pretty_var("corr_time"), " (corrosion_impact × time)"),
y = paste0(pretty_var("exposure_rate"), " (exposure_index / time)"),
caption = caption_autor
) +
ggplot2::theme_minimal()
# Cálculo Pearson + recta base
manual2 <- pearson_manual(d$x, d$y)
kbl2(manual2, digits = 5, caption = "Cálculo: Σ, r de Pearson, R²=r²")
| n | Σx | Σy | Σxy | Σx² | Σy² | r | R²=r² | a | b |
|---|---|---|---|---|---|---|---|---|---|
| 1000 | 125312.3 | -144.8231 | 5456.908 | 27115370 | 372.5743 | 0.37264 | 0.13886 | -0.40402 | 0.00207 |
# Ajuste polinómico grado 5 con centrado
mx <- mean(d$x)
d <- d |> dplyr::mutate(xc = x - mx)
mod2 <- stats::lm(y ~ poly(xc, 5, raw = TRUE), data = d)
sum2 <- summary(mod2)
# Métricas
yhat2 <- predict(mod2, newdata = d)
met2 <- tibble::tibble(
Modelo = "Polinómico grado 5 (x centrada)",
n = n,
R2 = sum2$r.squared,
R2_aj = sum2$adj.r.squared,
RMSE = rmse(d$y, yhat2),
MAE = mae(d$y, yhat2),
AIC = AIC(mod2)
)
kbl2(met2, digits = 5, caption = "Métricas del modelo 2")
| Modelo | n | R2 | R2_aj | RMSE | MAE | AIC |
|---|---|---|---|---|---|---|
| Polinómico grado 5 (x centrada) | 1000 | 0.34527 | 0.34198 | 0.47979 | 0.25524 | 1383.083 |
coef2 <- coef(mod2)
coef_tab2 <- tibble::tibble(Termino = names(coef2), Coef = as.numeric(coef2))
kbl2(coef_tab2, digits = 6, caption = "Coeficientes del polinomio (sobre xc)")
| Termino | Coef |
|---|---|
| (Intercept) | 0.032329 |
| poly(xc, 5, raw = TRUE)1 | -0.001789 |
| poly(xc, 5, raw = TRUE)2 | 0.000001 |
| poly(xc, 5, raw = TRUE)3 | 0.000000 |
| poly(xc, 5, raw = TRUE)4 | 0.000000 |
| poly(xc, 5, raw = TRUE)5 | 0.000000 |
# Curva ajustada
p2 <- ggplot2::ggplot(d, ggplot2::aes(x = x, y = y)) +
ggplot2::geom_point(alpha = 0.55) +
ggplot2::stat_smooth(method = "lm", formula = y ~ poly(I(x - mx), 5, raw = TRUE), se = TRUE) +
ggplot2::labs(
title = "Ajuste polinómico grado 5",
subtitle = "Curva estimada sobre x centrada",
x = pretty_var("corr_time"),
y = pretty_var("exposure_rate"),
caption = caption_autor
) +
ggplot2::theme_minimal()
p2
# Predicción (x0 = percentil 75)
x0 <- as.numeric(stats::quantile(d$x, 0.75))
pred2 <- predict(mod2, newdata = tibble::tibble(xc = x0 - mx), interval = "prediction")
pred2
## fit lwr upr
## 1 -0.005268388 -0.9515 0.9409632
# Cook (reevaluación)
umbral <- 4 / n
idx_inf <- which(stats::cooks.distance(mod2) > umbral)
# Gráfico: Diagnóstico de Cook
#plot(stats::cooks.distance(mod2),
# type = "h",
# main = paste0("Diagnóstico: Distancia de Cook (", "Modelo 2", ")"),
# ylab = "Distancia de Cook")
#abline(h = umbral, lty = 2)
#mtext(caption_autor, side = 1, line = 4, adj = 1, cex = 0.8, font = 3)
if (length(idx_inf) > 0) {
d2 <- d[-idx_inf, , drop = FALSE]
mod2b <- stats::lm(y ~ poly(xc, 5, raw = TRUE), data = d2)
yhat2b <- predict(mod2b, newdata = d2)
met2b <- tibble::tibble(
Escenario = c("Original", "Sin influyentes (Cook)"),
n = c(nrow(d), nrow(d2)),
R2 = c(summary(mod2)$r.squared, summary(mod2b)$r.squared),
RMSE = c(rmse(d$y, yhat2), rmse(d2$y, yhat2b)),
MAE = c(mae(d$y, yhat2), mae(d2$y, yhat2b))
)
kbl2(met2b, digits = 5, caption = "Reevaluación modelo 2 (Cook)")
}
| Escenario | n | R2 | RMSE | MAE |
|---|---|---|---|---|
| Original | 1000 | 0.34527 | 0.47979 | 0.25524 |
| Sin influyentes (Cook) | 956 | 0.42803 | 0.27200 | 0.17629 |
¿Cuál sería el exposure_rate estimado si se propone un corr_time de X?
coefs2 <- coef(mod2)
x_prop2 <- round(as.numeric(stats::quantile(d$x, 0.75)), 4)
xc_prop2 <- x_prop2 - mx
# ŷ = a0 + a1·xc + a2·xc^2 + a3·xc^3 + a4·xc^4 + a5·xc^5
pows2 <- 0:5
y_est2 <- sum(coefs2 * (xc_prop2^pows2))
y_est2
## [1] -0.005268388
r2 <- suppressWarnings(stats::cor(d$y, yhat2, use = "complete.obs"))
conf2 <- 100 * (r2^2)
conf2_txt <- ifelse(is.finite(conf2), sprintf("%.2f", conf2), "N/A")
conf2_res <- 100-conf2
conf2_res_txt <- ifelse(is.finite(conf2_res), sprintf("%.2f", conf2_res), "N/A")
fmt_term <- function(cc, lab) {
if (!is.finite(cc)) return("")
paste0(ifelse(cc >= 0, " + ", " - "), sprintf("%.4f", abs(cc)), lab)
}
eq2 <- paste0(
"ŷ = ", sprintf("%.4f", coefs2[1]),
fmt_term(coefs2[2], "·xc"),
fmt_term(coefs2[3], "·xc^2"),
fmt_term(coefs2[4], "·xc^3"),
fmt_term(coefs2[5], "·xc^4"),
fmt_term(coefs2[6], "·xc^5"),
", con xc = x - ", sprintf("%.4f", mx)
)
CONCLUSIONES (Modelo 2): Entre exposure_rate y corr_time se observa una relación no lineal con curvatura compleja, por lo que se ajustó un polinomio de grado 5. La ecuación (en términos de la variable centrada) es ŷ = 0.0323 - 0.0018·xc + 0.0000·xc^2 + 0.0000·xc^3 - 0.0000·xc^4 + 0.0000·xc^5, con xc = x - 125.3123. Podemos afirmar con un nivel de confianza del 34.53%. que bajo las condiciones del conjunto de datos, si se propone corr_time = 197.79, se estima exposure_rate ≈ -0.0053 (aprox.). Por tanto, el 65.47% ** se explica con otras variables.**
A continuación se incluyen dos modelos múltiples:
Nota:
inv_sqrt_thickness = 1/sqrt(thickness)captura no linealidad simple asociada al espesor.
exposure_indexes un índice compuesto estandarizado (presión + temperatura + corrosión + tiempo), ystress_indexes un índice heurístico (presión·diámetro / espesor²).
Por lo tanto, estos modelos deben interpretarse como modelos exploratorios.
d1 <- df |>
dplyr::select(thickness_loss, material_loss, inv_sqrt_thickness) |>
tidyr::drop_na() |>
dplyr::rename(
y = thickness_loss,
x1 = material_loss,
x2 = inv_sqrt_thickness
)
mod_m1 <- stats::lm(y ~ x1 + x2, data = d1)
sum_m1 <- summary(mod_m1)
sum_m1
##
## Call:
## stats::lm(formula = y ~ x1 + x2, data = d1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.3969 -0.8566 0.1762 1.1335 3.8696
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.210946 0.179408 45.77 <2e-16 ***
## x1 0.067318 0.001539 43.73 <2e-16 ***
## x2 -21.937171 0.712367 -30.80 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.698 on 997 degrees of freedom
## Multiple R-squared: 0.658, Adjusted R-squared: 0.6574
## F-statistic: 959.3 on 2 and 997 DF, p-value: < 2.2e-16
# Métricas
yhat_m1 <- predict(mod_m1, newdata = d1)
met_m1 <- tibble::tibble(
Modelo = "M3: thickness_loss ~ material_loss + inv_sqrt_thickness",
n = nrow(d1),
R2 = sum_m1$r.squared,
R2_aj = sum_m1$adj.r.squared,
RMSE = sqrt(mean((d1$y - yhat_m1)^2)),
MAE = mean(abs(d1$y - yhat_m1)),
AIC = AIC(mod_m1)
)
kbl2(met_m1, digits = 5, caption = "Métricas del modelo M3")
| Modelo | n | R2 | R2_aj | RMSE | MAE | AIC |
|---|---|---|---|---|---|---|
| M3: thickness_loss ~ material_loss + inv_sqrt_thickness | 1000 | 0.65804 | 0.65735 | 1.69566 | 1.29208 | 3902.017 |
# VIF (multicolinealidad)
if (requireNamespace("car", quietly = TRUE)) {
vif_raw <- tryCatch(car::vif(mod_m1), error = function(e) {
cat("No se pudo calcular VIF M3: ", e$message, "\n"); NA
})
print(vif_raw)
# Convertir a VIF numérico
if (is.matrix(vif_raw)) {
# Para factores: usar GVIF^(1/(2*Df)) como equivalente comparable
vif_val <- vif_raw[, "GVIF"]^(1/(2*vif_raw[, "Df"]))
names(vif_val) <- rownames(vif_raw)
} else {
vif_val <- vif_raw
}
vif_tbl <- tibble::tibble(
Termino = names(vif_val),
VIF = as.numeric(vif_val),
Termino_es = dplyr::recode(
Termino,
x1 = pretty_var("material_loss"),
x2 = pretty_var("inv_sqrt_thickness"),
.default = Termino
)
)
kbl2(vif_tbl |> dplyr::select(Predictor = Termino_es, VIF), digits = 3,
caption = "VIF (multicolinealidad) — Modelo M3")
# ggplot2::ggplot(vif_tbl, ggplot2::aes(x = reorder(Termino_es, VIF), y = VIF)) +
# ggplot2::geom_col() +
# ggplot2::coord_flip() +
# ggplot2::geom_hline(yintercept = 5, lty = 2) +
# ggplot2::geom_hline(yintercept = 10, lty = 3) +
# ggplot2::labs(
# title = "VIF — Modelo M3",
# x = "Predictor",
# y = "VIF",
# caption = caption_autor
# )
} else {
cat("Paquete 'car' no está instalado. Instala 'car' para calcular VIF.\n")
}
## x1 x2
## 1.782542 1.782542
| Predictor | VIF |
|---|---|
| Pérdida de material (%) | 1.783 |
| 1 / √(espesor) | 1.783 |
# Diagnóstico: Distancia de Cook
umbral_cook_m1 <- 4 / nrow(d1)
#plot(stats::cooks.distance(mod_m1),
# type = "h",
# main = "Diagnóstico: Distancia de Cook (M3)",
# ylab = "Distancia de Cook")
#abline(h = umbral_cook_m1, lty = 2)
#mtext(caption_autor, side = 1, line = 4, adj = 1, cex = 0.8, font = 3)
# Diagnósticos clásicos (residuos, normalidad, leverage)
op <- par(mfrow = c(2,2), oma = c(0,0,2,0))
plot(mod_m1)
mtext(caption_autor, outer = TRUE, side = 3, line = 0, adj = 1, cex = 0.8, font = 3)
par(op)
# Predicción puntual + intervalo (medianas)
x1_0 <- median(d1$x1)
x2_0 <- median(d1$x2)
predict(mod_m1, newdata = tibble::tibble(x1 = x1_0, x2 = x2_0), interval = "prediction")
## fit lwr upr
## 1 4.241522 0.9072067 7.575837
¿Cuál sería thickness_loss estimado para valores típicos de los predictores?
coefs_m1 <- coef(mod_m1)
x1_prop_m1 <- median(d1$x1)
x2_prop_m1 <- median(d1$x2)
# ŷ = b0 + b1·x1 + b2·x2
y_est_m1 <- coefs_m1[1] + coefs_m1[2]*x1_prop_m1 + coefs_m1[3]*x2_prop_m1
y_est_m1
## (Intercept)
## 4.241522
r_m1 <- suppressWarnings(stats::cor(d1$y, yhat_m1, use = "complete.obs"))
conf_m1 <- 100 * (r_m1^2)
conf_m1_txt <- ifelse(is.finite(conf_m1), sprintf("%.2f", conf_m1), "N/A")
eq_m1 <- paste0(
"ŷ = ", sprintf("%.4f", coefs_m1[1]),
ifelse(coefs_m1[2] >= 0, " + ", " - "), sprintf("%.4f", abs(coefs_m1[2])), "·x1",
ifelse(coefs_m1[3] >= 0, " + ", " - "), sprintf("%.4f", abs(coefs_m1[3])), "·x2"
)
CONCLUSIONES (M3): La pérdida de espesor thickness_loss (Pérdida de espesor) se explica mediante una combinación lineal de material_loss (x1) e inv_sqrt_thickness (x2). La ecuación estimada es ŷ = 8.2109 + 0.0673·x1 - 21.9372·x2. Podemos afirmar con un nivel de confianza del 65.80%. que para valores típicos x1 = 31.66 y x2 = 0.2781, se estima thickness_loss ≈ 4.2415 (aprox.). Además, se comparó el plano lineal con una superficie cuadrática (incluye curvatura e interacción). En esa comparación, el R² mejora, lo que aporta mayor confianza en el ajuste cuando la relación no es estrictamente lineal (ver sección 3D).
###############################################
# Regresión Múltiple (y = b0 + b1x1 + b2x2 + ...)
# M1: y = Thickness_Loss ; x1 = Material_Loss ; x2 = Inv_Sqrt_Thickness
###############################################
datosMult <- d1 |>
dplyr::select(y, x1, x2) |>
tidyr::drop_na()
# Etiquetas
datosMult_plot <- datosMult
colnames(datosMult_plot) <- c(
pretty_var("thickness_loss"),
pretty_var("material_loss"),
pretty_var("inv_sqrt_thickness")
)
if (ncol(datosMult_plot) >= 2) {
pairs(datosMult_plot, main = "Matriz de dispersión: variables del modelo múltiple (M3)")
mtext(caption_autor, side = 1, line = 4, adj = 1, cex = 0.8, font = 3)
} else {
cat("No se pudo graficar pairs(): se requieren al menos 2 columnas.\n")
}
cor_mult <- cor(datosMult, use = "pairwise.complete.obs")
colnames(cor_mult) <- colnames(datosMult_plot)
rownames(cor_mult) <- colnames(datosMult_plot)
corrplot::corrplot(
cor_mult,
method = "color",
type = "upper",
tl.col = "black",
addCoef.col = "black",
number.cex = 0.65
)
mtext(caption_autor, side = 1, line = 4, adj = 1, cex = 0.8, font = 3)
Reg3D <- scatterplot3d(
datosMult$x1, datosMult$x2, datosMult$y, angle = 55,
pch = 16, color = "blue",
xlab = pretty_var("material_loss"),
ylab = pretty_var("inv_sqrt_thickness"),
zlab = pretty_var("thickness_loss"),
main = "Diagrama 3D (M3): Pérdida de espesor vs Pérdida de material e 1/√(espesor)"
)
RegresionM1 <-
mtext(caption_autor, side = 1, line = 4, adj = 1, cex = 0.8, font = 3)
RegresionM1 <- lm(y ~ x1 + x2, data = datosMult)
RegresionM1
##
## Call:
## lm(formula = y ~ x1 + x2, data = datosMult)
##
## Coefficients:
## (Intercept) x1 x2
## 8.21095 0.06732 -21.93717
summary(RegresionM1)
##
## Call:
## lm(formula = y ~ x1 + x2, data = datosMult)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.3969 -0.8566 0.1762 1.1335 3.8696
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.210946 0.179408 45.77 <2e-16 ***
## x1 0.067318 0.001539 43.73 <2e-16 ***
## x2 -21.937171 0.712367 -30.80 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.698 on 997 degrees of freedom
## Multiple R-squared: 0.658, Adjusted R-squared: 0.6574
## F-statistic: 959.3 on 2 and 997 DF, p-value: < 2.2e-16
# Plano 3D (modelo con 2 predictores)
Reg3D$plane3d(RegresionM1, col = "red")
# Indicadores
yhat <- fitted(RegresionM1)
CoefPearson <- round(cor(datosMult$y, yhat), 4)
CoefPearson
## [1] 0.8112
coefDeterminacion <- round((CoefPearson^2) * 100, 2)
coefDeterminacion
## [1] 65.8
# Importante: en plotly::add_surface(), la matriz Z debe estar alineada con:
# - filas = valores de y (aquí x2)
# - columnas = valores de x (aquí x1)
# Modelo “superficie” (respuesta cuadrática)
mod_m1_surf <- stats::lm(y ~ x1 + x2 + I(x1^2) + I(x2^2) + I(x1*x2), data = d1)
sum_m1_surf <- summary(mod_m1_surf)
# Comparación rápida M3 vs Superficie (AIC/RMSE)
yhat_lin <- predict(mod_m1, newdata = d1)
yhat_surf <- predict(mod_m1_surf, newdata = d1)
comp_m1 <- tibble::tibble(
Modelo = c("M3 lineal (plano)", "M3 superficie cuadrática"),
R2 = c(summary(mod_m1)$r.squared, sum_m1_surf$r.squared),
R2_aj = c(summary(mod_m1)$adj.r.squared, sum_m1_surf$adj.r.squared),
RMSE = c(sqrt(mean((d1$y - yhat_lin)^2)), sqrt(mean((d1$y - yhat_surf)^2))),
MAE = c(mean(abs(d1$y - yhat_lin)), mean(abs(d1$y - yhat_surf))),
AIC = c(AIC(mod_m1), AIC(mod_m1_surf))
) |>
dplyr::arrange(RMSE)
kbl2(comp_m1, digits = 5, caption = "Comparación: plano vs superficie (M3)")
| Modelo | R2 | R2_aj | RMSE | MAE | AIC |
|---|---|---|---|---|---|
| M3 superficie cuadrática | 0.91142 | 0.91097 | 0.86303 | 0.62392 | 2557.271 |
| M3 lineal (plano) | 0.65804 | 0.65735 | 1.69566 | 1.29208 | 3902.017 |
# Elegir el mejor (menor RMSE)
mod_best_m1 <- if (comp_m1$Modelo[1] == "M3 superficie cuadrática") mod_m1_surf else mod_m1
if (requireNamespace("plotly", quietly = TRUE)) {
# Malla para superficie
x1v <- seq(min(d1$x1), max(d1$x1), length.out = 50)
x2v <- seq(min(d1$x2), max(d1$x2), length.out = 50)
grid <- expand.grid(x1 = x1v, x2 = x2v)
grid$pred <- predict(mod_best_m1, newdata = grid)
# Construcción robusta de Z: filas=x2, columnas=x1
Z_df <- grid |>
dplyr::arrange(x2, x1) |>
tidyr::pivot_wider(names_from = x1, values_from = pred)
Z <- as.matrix(dplyr::select(Z_df, -x2))
plotly::plot_ly() |>
plotly::add_markers(
data = d1, x = ~x1, y = ~x2, z = ~y,
marker = list(size = 2),
name = "Datos"
) |>
plotly::add_surface(
x = x1v, y = x2v, z = Z,
opacity = 0.55,
name = "Superficie ajustada"
) |>
plotly::layout(
title = paste0("Modelo M3 3D: Pérdida de espesor (superficie ajustada) — ", comp_m1$Modelo[1]),
scene = list(
xaxis = list(title = pretty_var("material_loss")),
yaxis = list(title = pretty_var("inv_sqrt_thickness")),
zaxis = list(title = pretty_var("thickness_loss"))
),
annotations = list(
list(
text = caption_autor,
x = 1, y = 0,
xref = "paper", yref = "paper",
xanchor = "right", yanchor = "bottom",
showarrow = FALSE,
font = list(size = 12)
)
)
)
} else {
cat("Instala 'plotly' para ver la superficie 3D.\n")
}
d2 <- df |>
dplyr::select(loss_rate, exposure_index, stress_index, log_time) |>
tidyr::drop_na() |>
dplyr::rename(
y = loss_rate,
x1 = exposure_index,
x2 = stress_index,
x3 = log_time
)
p1 <- ggplot2::ggplot(d2, ggplot2::aes(x = x1, y = y)) +
ggplot2::geom_point(alpha = 0.5) +
ggplot2::geom_smooth(method = "lm", se = TRUE) +
ggplot2::labs(title = "Tasa de pérdida vs Índice de exposición", x = pretty_var("exposure_index"), y = pretty_var("loss_rate"), caption = caption_autor) +
ggplot2::theme_minimal()
p2 <- ggplot2::ggplot(d2, ggplot2::aes(x = x2, y = y)) +
ggplot2::geom_point(alpha = 0.5) +
ggplot2::geom_smooth(method = "lm", se = TRUE) +
ggplot2::labs(title = "Tasa de pérdida vs Índice de esfuerzo", x = pretty_var("stress_index"), y = pretty_var("loss_rate"), caption = caption_autor) +
ggplot2::theme_minimal()
p3 <- ggplot2::ggplot(d2, ggplot2::aes(x = x3, y = y)) +
ggplot2::geom_point(alpha = 0.5) +
ggplot2::geom_smooth(method = "lm", se = TRUE) +
ggplot2::labs(title = "Tasa de pérdida vs log(1+tiempo)", x = pretty_var("log_time"), y = pretty_var("loss_rate"), caption = caption_autor) +
ggplot2::theme_minimal()
p1; p2; p3
mod_m2 <- stats::lm(y ~ x1 + x2 + x3, data = d2)
sum_m2 <- summary(mod_m2)
sum_m2
##
## Call:
## stats::lm(formula = y ~ x1 + x2 + x3, data = d2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.5353 -0.3442 0.0407 0.2889 6.6625
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.147e+00 1.305e-01 31.775 < 2e-16 ***
## x1 6.746e-02 1.804e-02 3.738 0.000196 ***
## x2 -1.768e-05 1.207e-05 -1.465 0.143226
## x3 -1.367e+00 4.876e-02 -28.042 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9049 on 996 degrees of freedom
## Multiple R-squared: 0.4854, Adjusted R-squared: 0.4839
## F-statistic: 313.2 on 3 and 996 DF, p-value: < 2.2e-16
# Métricas
yhat_m2 <- predict(mod_m2, newdata = d2)
met_m2 <- tibble::tibble(
Modelo = "M4: loss_rate ~ exposure_index + stress_index + log_time",
n = nrow(d2),
R2 = sum_m2$r.squared,
R2_aj = sum_m2$adj.r.squared,
RMSE = sqrt(mean((d2$y - yhat_m2)^2)),
MAE = mean(abs(d2$y - yhat_m2)),
AIC = AIC(mod_m2)
)
kbl2(met_m2, digits = 5, caption = "Métricas del modelo M4")
| Modelo | n | R2 | R2_aj | RMSE | MAE | AIC |
|---|---|---|---|---|---|---|
| M4: loss_rate ~ exposure_index + stress_index + log_time | 1000 | 0.48543 | 0.48388 | 0.90311 | 0.51631 | 2644.055 |
# VIF (multicolinealidad)
if (requireNamespace("car", quietly = TRUE)) {
vif_raw <- tryCatch(car::vif(mod_m2), error = function(e) {
cat("No se pudo calcular VIF M4: ", e$message, "\n"); NA
})
print(vif_raw)
if (is.matrix(vif_raw)) {
vif_val <- vif_raw[, "GVIF"]^(1/(2*vif_raw[, "Df"]))
names(vif_val) <- rownames(vif_raw)
} else {
vif_val <- vif_raw
}
vif_tbl <- tibble::tibble(
Termino = names(vif_val),
VIF = as.numeric(vif_val),
Termino_es = dplyr::recode(
Termino,
x1 = pretty_var("exposure_index"),
x2 = pretty_var("stress_index"),
x3 = pretty_var("log_time"),
.default = Termino
)
)
kbl2(vif_tbl |> dplyr::select(Predictor = Termino_es, VIF), digits = 3,
caption = "VIF (multicolinealidad) — Modelo M4")
# ggplot2::ggplot(vif_tbl, ggplot2::aes(x = reorder(Termino_es, VIF), y = VIF)) +
# ggplot2::geom_col() +
# ggplot2::coord_flip() +
# ggplot2::geom_hline(yintercept = 5, lty = 2) +
# ggplot2::geom_hline(yintercept = 10, lty = 3) +
# ggplot2::labs(
# title = "VIF — Modelo M4",
# x = "Predictor",
# y = "VIF",
# caption = caption_autor
# )
} else {
cat("Paquete 'car' no está instalado. Instala 'car' para calcular VIF.\n")
}
## x1 x2 x3
## 1.579505 1.201788 1.355368
| Predictor | VIF |
|---|---|
| Índice de exposición (estandarizado) | 1.580 |
| Índice de esfuerzo | 1.202 |
| log(1 + tiempo) | 1.355 |
# Diagnóstico: Distancia de Cook
umbral_cook_m2 <- 4 / nrow(d2)
#plot(stats::cooks.distance(mod_m2),
# type = "h",
# main = "Diagnóstico: Distancia de Cook (M4)",
# ylab = "Distancia de Cook")
#abline(h = umbral_cook_m2, lty = 2)
#mtext(caption_autor, side = 1, line = 4, adj = 1, cex = 0.8, font = 3)
# Diagnósticos clásicos (residuos, normalidad, leverage)
op <- par(mfrow = c(2,2), oma = c(0,0,2,0))
plot(mod_m2)
mtext(caption_autor, outer = TRUE, side = 3, line = 0, adj = 1, cex = 0.8, font = 3)
par(op)
# Predicción puntual + intervalo (medianas)
x1_0 <- median(d2$x1)
x2_0 <- median(d2$x2)
x3_0 <- median(d2$x3)
predict(mod_m2, newdata = tibble::tibble(x1 = x1_0, x2 = x2_0, x3 = x3_0), interval = "prediction")
## fit lwr upr
## 1 0.5096725 -1.267181 2.286526
¿Cuál sería loss_rate estimado para valores típicos de los predictores?
coefs_m2 <- coef(mod_m2)
x1_prop_m2 <- median(d2$x1)
x2_prop_m2 <- median(d2$x2)
x3_prop_m2 <- median(d2$x3)
# ŷ = b0 + b1·x1 + b2·x2 + b3·x3
y_est_m2 <- coefs_m2[1] + coefs_m2[2]*x1_prop_m2 + coefs_m2[3]*x2_prop_m2 + coefs_m2[4]*x3_prop_m2
y_est_m2
## (Intercept)
## 0.5096725
r_m2 <- suppressWarnings(stats::cor(d2$y, yhat_m2, use = "complete.obs"))
conf_m2 <- 100 * (r_m2^2)
conf_m2_txt <- ifelse(is.finite(conf_m2), sprintf("%.2f", conf_m2), "N/A")
eq_m2 <- paste0(
"ŷ = ", sprintf("%.4f", coefs_m2[1]),
ifelse(coefs_m2[2] >= 0, " + ", " - "), sprintf("%.4f", abs(coefs_m2[2])), "·x1",
ifelse(coefs_m2[3] >= 0, " + ", " - "), sprintf("%.4f", abs(coefs_m2[3])), "·x2",
ifelse(coefs_m2[4] >= 0, " + ", " - "), sprintf("%.4f", abs(coefs_m2[4])), "·x3"
)
CONCLUSIONES (M4): La variable loss_rate (tasa de pérdida) se modela como una combinación lineal de exposure_index (x1), stress_index (x2) y log_time (x3). La ecuación estimada es ŷ = 4.1474 + 0.0675·x1 - 0.0000·x2 - 1.3673·x3. Podemos afirmar con una confianza del 48.54% que para valores típicos x1 = -0.0598, x2 = 1433.3103 y x3 = 2.6391, se estima loss_rate ≈ 0.5097 (aprox.). Además, al comparar el ajuste como plano lineal frente a una superficie cuadrática (con términos cuadráticos e interacción), el R² aumenta, lo cual sugiere curvatura y mejora la capacidad explicativa del modelo (ver sección 3D). Nota: No extrapolar fuera del rango observado.
Se muestra:
stress_index (x2), log_time (x3),
loss_rate (y)exposure_index (x1)exposure_index
fijando (mediana)# En M4 hay 3 predictores: exposure_index (x1), stress_index (x2), log_time (x3).
# Para visualizar una superficie 3D debemos fijar uno de ellos (aquí fijamos x1 en la mediana).
# OJO: en plotly::add_surface(), Z debe quedar con filas=y (aquí x3) y columnas=x (aquí x2).
# Modelo “superficie” en (x2, x3) + exposure_index lineal
mod_m2_surf <- stats::lm(y ~ x1 + x2 + x3 + I(x2^2) + I(x3^2) + I(x2*x3), data = d2)
sum_m2_surf <- summary(mod_m2_surf)
# Comparación rápida M4 lineal vs superficie
yhat_m2_lin <- predict(mod_m2, newdata = d2)
yhat_m2_surf <- predict(mod_m2_surf, newdata = d2)
comp_m2 <- tibble::tibble(
Modelo = c("M4 lineal", "M4 superficie (x2,x3 cuadrático)"),
R2 = c(summary(mod_m2)$r.squared, sum_m2_surf$r.squared),
R2_aj = c(summary(mod_m2)$adj.r.squared, sum_m2_surf$adj.r.squared),
RMSE = c(sqrt(mean((d2$y - yhat_m2_lin)^2)), sqrt(mean((d2$y - yhat_m2_surf)^2))),
MAE = c(mean(abs(d2$y - yhat_m2_lin)), mean(abs(d2$y - yhat_m2_surf))),
AIC = c(AIC(mod_m2), AIC(mod_m2_surf))
) |>
dplyr::arrange(RMSE)
kbl2(comp_m2, digits = 5, caption = "Comparación: lineal vs superficie (M4)")
| Modelo | R2 | R2_aj | RMSE | MAE | AIC |
|---|---|---|---|---|---|
| M4 superficie (x2,x3 cuadrático) | 0.63681 | 0.63462 | 0.75872 | 0.41419 | 2301.640 |
| M4 lineal | 0.48543 | 0.48388 | 0.90311 | 0.51631 | 2644.055 |
mod_best_m2 <- if (comp_m2$Modelo[1] == "M4 superficie (x2,x3 cuadrático)") mod_m2_surf else mod_m2
if (requireNamespace("plotly", quietly = TRUE)) {
x1_fix <- median(d2$x1)
x2v <- seq(min(d2$x2), max(d2$x2), length.out = 60)
x3v <- seq(min(d2$x3), max(d2$x3), length.out = 60)
# malla sobre (x2, x3) con x1 fijo
grid <- expand.grid(x2 = x2v, x3 = x3v)
grid$x1 <- x1_fix
grid$pred <- predict(mod_best_m2, newdata = grid)
# Construcción robusta de Z: filas=x3, columnas=x2
Z_df <- grid |>
dplyr::arrange(x3, x2) |>
tidyr::pivot_wider(names_from = x2, values_from = pred)
Z <- as.matrix(dplyr::select(Z_df, -x3))
plotly::plot_ly() |>
plotly::add_markers(
data = d2,
x = ~x2, y = ~x3, z = ~y,
color = ~x1,
marker = list(size = 2),
name = "Datos (color = exposure_index)"
) |>
plotly::add_surface(
x = x2v, y = x3v, z = Z,
opacity = 0.55,
name = "Superficie ajustada (x1 fijo)"
) |>
plotly::layout(
title = paste0("M4 3D: Tasa de pérdida (superficie ajustada) — ", comp_m2$Modelo[1], " | x1 fijo = mediana"),
scene = list(
xaxis = list(title = pretty_var("stress_index")),
yaxis = list(title = pretty_var("log_time")),
zaxis = list(title = pretty_var("loss_rate"))
),
annotations = list(
list(
text = caption_autor,
x = 1, y = 0,
xref = "paper", yref = "paper",
xanchor = "right", yanchor = "bottom",
showarrow = FALSE,
font = list(size = 12)
)
)
)
} else {
cat("Instala 'plotly' para ver la visualización 3D.\n")
}