#—1. LIBRERIAS E IMPORTACION DE DATOS–
library(ggplot2) # Gráficos
library(lmtest) # Pruebas de hipótesis
library(car) # VIF y gráficos
library(tidyverse)
library(glmnet) # Para Ridge y Lasso
library(corrplot) # Para correlaciones
#—IMPORTANCION DE DATOS—
datos <- read.csv("C:/Users/BJ/Downloads/BDprimerexamenconcretocsv.csv")
View(datos)
#—ESTRUCTURA DE BD
str(datos)
'data.frame': 1030 obs. of 9 variables:
$ cement : num 540 540 332 332 199 ...
$ slag : num 0 0 142 142 132 ...
$ ash : num 0 0 0 0 0 0 0 0 0 0 ...
$ water : num 162 162 228 228 192 228 228 228 228 228 ...
$ superplastic: num 2.5 2.5 0 0 0 0 0 0 0 0 ...
$ coarseagg : num 1040 1055 932 932 978 ...
$ fineagg : num 676 676 594 594 826 ...
$ age : int 28 28 270 365 360 90 365 28 28 28 ...
$ strength : num 80 61.9 40.3 41 44.3 ...
head(datos)
#—b) Integridad de los datos—
sum(duplicated(datos))
[1] 25
summary(datos)
cement slag ash water superplastic coarseagg fineagg
Min. :102.0 Min. : 0.0 Min. : 0.00 Min. :121.8 Min. : 0.000 Min. : 801.0 Min. :594.0
1st Qu.:192.4 1st Qu.: 0.0 1st Qu.: 0.00 1st Qu.:164.9 1st Qu.: 0.000 1st Qu.: 932.0 1st Qu.:731.0
Median :272.9 Median : 22.0 Median : 0.00 Median :185.0 Median : 6.400 Median : 968.0 Median :779.5
Mean :281.2 Mean : 73.9 Mean : 54.19 Mean :181.6 Mean : 6.205 Mean : 972.9 Mean :773.6
3rd Qu.:350.0 3rd Qu.:142.9 3rd Qu.:118.30 3rd Qu.:192.0 3rd Qu.:10.200 3rd Qu.:1029.4 3rd Qu.:824.0
Max. :540.0 Max. :359.4 Max. :200.10 Max. :247.0 Max. :32.200 Max. :1145.0 Max. :992.6
age strength
Min. : 1.00 Min. : 2.33
1st Qu.: 7.00 1st Qu.:23.71
Median : 28.00 Median :34.45
Mean : 45.66 Mean :35.82
3rd Qu.: 56.00 3rd Qu.:46.13
Max. :365.00 Max. :82.60
#—a) Distribución de la variable respuesta—
ggplot(datos, aes(x=strength)) +
geom_histogram(bins=30, fill="darkblue", col="white", aes(y=..density..)) +
geom_density(col="red", lwd=1) +
ggtitle("Distribución de la Resistencia (Strength)") +
theme_minimal()
#—b) Distribución de variables predictoras continuas–
datos %>%
gather(key="variable", value="valor", -strength) %>%
ggplot(aes(x=valor)) +
geom_histogram(fill="steelblue", col="white", bins=30) +
facet_wrap(~variable, scales="free") +
theme_minimal() +
ggtitle("Histogramas de Variables Predictoras")
#—c) Análisis de variables cualitativas—
No hay variables cualitativas todas son cuantitativas
#—d) Identificación de valores atípicos (Boxplots)—
datos %>%
gather(key="variable", value="valor", -strength) %>%
ggplot(aes(x=factor(0), y=valor)) +
geom_boxplot(fill="orange", alpha=0.5) +
facet_wrap(~variable, scales="free") +
theme_minimal() +
ggtitle("Diagramas de Caja (Detección de Outliers)")
#—e) Evaluación de correlaciones—
M <- cor(datos)
corrplot(M, method="color", addCoef.col = "black", type="upper", diag=FALSE)
pairs(strength ~ cement + slag + ash + water + superplastic + coarseagg + fineagg + age, data = datos)
#—-b) Partición del conjunto de datos (75% - 25%)—-
set.seed(123)
n <- nrow(datos)
train_index <- sample(1:n, size = 0.75 * n)
train_data <- datos[train_index, ]
test_data <- datos[-train_index, ]
x_train <- model.matrix(strength ~ ., train_data)[,-1]
y_train <- train_data$strength
x_test <- model.matrix(strength ~ ., test_data)[,-1]
y_test <- test_data$strength
#—c) Estimación y entrenamiento—
#—-i. Modelo de Regresión Lineal Múltiple (OLS)—-
modelo_ols <- lm(strength ~ ., data = train_data)
summary(modelo_ols)
Call:
lm(formula = strength ~ ., data = train_data)
Residuals:
Min 1Q Median 3Q Max
-30.775 -6.206 0.779 6.710 35.059
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -36.665934 29.252994 -1.253 0.210441
cement 0.119007 0.009468 12.569 < 2e-16 ***
slag 0.103612 0.011214 9.239 < 2e-16 ***
ash 0.085797 0.014096 6.087 1.82e-09 ***
water -0.111204 0.044035 -2.525 0.011760 *
superplastic 0.406345 0.105975 3.834 0.000136 ***
coarseagg 0.022639 0.010381 2.181 0.029502 *
fineagg 0.022074 0.011854 1.862 0.062965 .
age 0.120509 0.006562 18.364 < 2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 10.12 on 763 degrees of freedom
Multiple R-squared: 0.6216, Adjusted R-squared: 0.6177
F-statistic: 156.7 on 8 and 763 DF, p-value: < 2.2e-16
#—-ii. Cresta Modelo (Alfa = 0)—-
cv_ridge <- cv.glmnet(x_train, y_train, alpha = 0)
best_lambda_ridge <- cv_ridge$lambda.min
modelo_ridge <- glmnet(x_train, y_train, alpha = 0, lambda = best_lambda_ridge)
coef(modelo_ridge)
9 x 1 sparse Matrix of class "dgCMatrix"
s0
(Intercept) 56.187040084
cement 0.081439762
slag 0.059882777
ash 0.032706292
water -0.206067115
superplastic 0.462355544
coarseagg -0.007191762
fineagg -0.016592102
age 0.112037463
#—-iii. Modelo LASSO (Alfa = 1)—-
cv_lasso <- cv.glmnet(x_train, y_train, alpha = 1)
best_lambda_lasso <- cv_lasso$lambda.min
modelo_lasso <- glmnet(x_train, y_train, alpha = 1, lambda = best_lambda_lasso)
coef(modelo_lasso)
9 x 1 sparse Matrix of class "dgCMatrix"
s0
(Intercept) -28.28742386
cement 0.11625840
slag 0.10034618
ash 0.08192857
water -0.12135401
superplastic 0.40319696
coarseagg 0.01987501
fineagg 0.01872564
age 0.12022522
#—-iv. Árbol de regresión—-
modelo_arbol <- rpart(strength ~ ., data = train_data, method = "anova")
rpart.plot(modelo_arbol, type = 3, digits = 3, fallen.leaves = TRUE,
main = "Árbol de Regresión para Resistencia")
plot(modelo_ols, which=2)
#—d) Verificación de supuestos(solo para regresion multiple)—
#–Normalidad de residuos–
ks.test(modelo_ols$residuals, "pnorm", mean(modelo_ols$residuals), sd(modelo_ols$residuals))
Asymptotic one-sample Kolmogorov-Smirnov test
data: modelo_ols$residuals
D = 0.043113, p-value = 0.1134
alternative hypothesis: two-sided
#–Homocedasticidad (Breusch-Pagan)–
bptest(modelo_ols)
studentized Breusch-Pagan test
data: modelo_ols
BP = 104.46, df = 8, p-value < 2.2e-16
#–Multicolinealidad (VIF)–
vif(modelo_ols)
cement slag ash water superplastic coarseagg fineagg age
7.369890 7.157315 6.192801 6.464365 2.875256 4.766997 6.876622 1.077122
#–Autocorrelación–
dwtest(modelo_ols)
Durbin-Watson test
data: modelo_ols
DW = 1.9906, p-value = 0.4464
alternative hypothesis: true autocorrelation is greater than 0
#—e) Evaluación del desempeño (Tabla Comparativa)—
x_train <- model.matrix(strength ~ ., data = train_data)[,-1]
y_train <- train_data$strength
# Para el conjunto de PRUEBA
x_test <- model.matrix(strength ~ ., data = test_data)[,-1]
y_test <- test_data$strength
#—MODELO LASSO (Alfa = 1)—
# 1. Predicciones
pred_ols <- predict(modelo_ols, newdata = test_data)
pred_ridge <- predict(modelo_ridge, newx = x_test)
pred_lasso <- predict(modelo_lasso, newx = x_test)
pred_arbol <- predict(modelo_arbol, newdata = test_data)
# 2. Función para calcular métricas
calcular_metricas <- function(real, pred, modelo_obj = NULL, k=NULL) {
sse <- sum((real - pred)^2)
sst <- sum((real - mean(real))^2)
r2 <- 1 - sse/sst
n <- length(real)
if(is.null(k)) k <- 8 # Num variables aprox
r2_adj <- 1 - (1 - r2) * ((n - 1) / (n - k - 1))
mse <- mean((real - pred)^2)
rmse <- sqrt(mse)
# AIC/BIC (Aproximación para comparación)
aic <- n * log(mse) + 2 * k
bic <- n * log(mse) + k * log(n)
return(c(R2 = r2, R2_Adj = r2_adj, MSE = mse, RMSE = rmse, AIC = aic, BIC = bic))
}
# 3. Crear Tabla
metricas_ols <- calcular_metricas(test_data$strength, pred_ols)
metricas_ridge <- calcular_metricas(test_data$strength, pred_ridge)
metricas_lasso <- calcular_metricas(test_data$strength, pred_lasso)
metricas_arbol <- calcular_metricas(test_data$strength, pred_arbol)
tabla_final <- rbind(OLS = metricas_ols,
Ridge = metricas_ridge,
Lasso = metricas_lasso,
Arbol = metricas_arbol)
print(round(tabla_final, 3))
R2 R2_Adj MSE RMSE AIC BIC
OLS 0.592 0.579 127.183 11.278 1266.172 1294.596
Ridge 0.583 0.570 129.984 11.401 1271.792 1300.216
Lasso 0.592 0.579 127.173 11.277 1266.151 1294.574
Arbol 0.723 0.714 86.506 9.301 1166.736 1195.160
#—a) Comparación Valores Reales vs Predichos (Mejor Modelo)—
pred_lasso <- predict(modelo_lasso, newx = x_test)
resultados <- data.frame(
Real = test_data$strength,
Predicho = as.vector(pred_lasso)
)
print(head(resultados))
Real Predicho
1 79.99 52.53561
2 40.27 59.10646
3 43.70 71.28369
4 45.85 20.84406
5 28.02 23.05739
6 47.81 21.44872
ggplot(resultados, aes(x=Real, y=Predicho)) +
geom_point(color="blue", alpha=0.6) +
geom_abline(intercept=0, slope=1, color="red", linetype="dashed") +
ggtitle("Validación: Valores Reales vs Predichos (Lasso)") +
theme_minimal()