lm()mi_regresion <- function(df) {
x <- df[[1]]
y <- df[[2]]
n <- length(x)
beta1 <- (sum(x) * sum(y) - n * sum(x * y)) / ((sum(x)^2) - n * sum(x^2))
beta0 <- (sum(y) - beta1 * sum(x)) / n
y_hat <- beta0 + beta1 * x
residuos <- y - y_hat
r2 <- sum((y_hat - mean(y))^2) / sum((y - mean(y))^2)
r <- sqrt(r2)
grafico <- ggplot(data = df, aes(x = x, y = y)) +
geom_point() +
geom_abline(intercept = beta0, slope = beta1, color = "blue") +
ggtitle("Regresión Lineal sin lm()")
return(list(
betas = c(beta0, beta1),
R2 = r2,
r = r,
residuos = residuos,
grafico = grafico
))
}
lm()datos <- read_excel("dataset.xlsx")
# Crear un subconjunto simple de prueba
sub_df <- datos[, c("X1", "Y1")]
# Aplicar nuestra función personalizada
modelo_manual <- mi_regresion(sub_df)
modelo_manual$grafico
# Aplicar modelo con lm()
modelo_lm <- lm(Y1 ~ X1, data = datos)
summary(modelo_lm)
##
## Call:
## lm(formula = Y1 ~ X1, data = datos)
##
## Residuals:
## Min 1Q Median 3Q Max
## -19.569 -6.332 -1.028 3.393 19.259
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -23.053 2.081 -11.08 <2e-16 ***
## X1 59.359 2.698 22.00 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.904 on 766 degrees of freedom
## Multiple R-squared: 0.3872, Adjusted R-squared: 0.3864
## F-statistic: 484 on 1 and 766 DF, p-value: < 2.2e-16
# Gráfica usando ggplot para comparación
ggplot(datos, aes(x = X1, y = Y1)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE, color = "red") +
ggtitle("Regresión Lineal con lm()")
## `geom_smooth()` using formula = 'y ~ x'
# Comparar métricas
r2_lm <- summary(modelo_lm)$r.squared
r_manual <- modelo_manual$r
r2_manual <- modelo_manual$R2
cat("R2 (manual):", r2_manual, "\n")
## R2 (manual): 0.3872224
cat("R2 (lm):", r2_lm, "\n")
## R2 (lm): 0.3872224
cat("Correlación manual:", r_manual, "\n")
## Correlación manual: 0.6222719
cat("Correlación lm:", sqrt(r2_lm), "\n")
## Correlación lm: 0.6222719
datos <- read_excel("dataset.xlsx")
summary(datos)
## X1 X2 X3 X4
## Min. :0.6200 Min. :514.5 Min. :245.0 Min. :110.2
## 1st Qu.:0.6825 1st Qu.:606.4 1st Qu.:294.0 1st Qu.:140.9
## Median :0.7500 Median :673.8 Median :318.5 Median :183.8
## Mean :0.7642 Mean :671.7 Mean :318.5 Mean :176.6
## 3rd Qu.:0.8300 3rd Qu.:741.1 3rd Qu.:343.0 3rd Qu.:220.5
## Max. :0.9800 Max. :808.5 Max. :416.5 Max. :220.5
## X5 X6 X7 X8 Y1
## Min. :3.50 Min. :2.00 Min. :0.0000 Min. :0.000 Min. : 6.01
## 1st Qu.:3.50 1st Qu.:2.75 1st Qu.:0.1000 1st Qu.:1.750 1st Qu.:12.99
## Median :5.25 Median :3.50 Median :0.2500 Median :3.000 Median :18.95
## Mean :5.25 Mean :3.50 Mean :0.2344 Mean :2.812 Mean :22.31
## 3rd Qu.:7.00 3rd Qu.:4.25 3rd Qu.:0.4000 3rd Qu.:4.000 3rd Qu.:31.67
## Max. :7.00 Max. :5.00 Max. :0.4000 Max. :5.000 Max. :43.10
## Y2
## Min. :10.90
## 1st Qu.:15.62
## Median :22.08
## Mean :24.59
## 3rd Qu.:33.13
## Max. :48.03
lapply(names(datos), function(var) {
ggplot(datos, aes_string(x = var)) + geom_density() + ggtitle(paste("Densidad de", var))
})
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## [[1]]
##
## [[2]]
##
## [[3]]
##
## [[4]]
##
## [[5]]
##
## [[6]]
##
## [[7]]
##
## [[8]]
##
## [[9]]
##
## [[10]]
ggcorr(datos, label = TRUE)
Los comentarios se deben redactar luego de observar el resumen y las gráficas.
vars <- setdiff(names(datos), c("Y1", "Y2"))
for (var in vars) {
print(ggplot(datos, aes_string(x = var, y = "Y1")) + geom_point() + ggtitle(paste(var, "vs Y1")))
print(ggplot(datos, aes_string(x = var, y = "Y2")) + geom_point() + ggtitle(paste(var, "vs Y2")))
}
set.seed(123)
particion <- createDataPartition(datos$Y1, p = 0.7, list = FALSE)
dataTrain <- datos[particion, ]
dataValidation <- datos[-particion, ]
trainControlctrl <- trainControl(method = "repeatedcv", number = 10, repeats = 10)
formulas <- list(
Y1 ~ X1 + X2 + X3 + X4 + X5 + X6 + X7 + X8,
Y1 ~ X1 + X3 + X5 + X7,
Y1 ~ X2 + X4 + X6 + X8,
Y1 ~ X1 + X5 + X2 + X6,
Y1 ~ X1 + X4 + X7 # Modelo propuesto
)
modelos <- lapply(formulas, function(f) {
train(f, data = dataTrain, method = "lm", trControl = ctrl)
})
rmse_vals <- sapply(modelos, function(m) min(m$results$RMSE))
orden <- order(rmse_vals)
modelos_ordenados <- modelos[orden]
Champion <- train(formulas[[orden[1]]], data = dataTrain, method = "lm")
Challenger <- train(formulas[[orden[2]]], data = dataTrain, method = "lm")
preds <- predict(Champion, newdata = dataValidation)
rmse_val <- RMSE(preds, dataValidation$Y1)
r2_val <- R2(preds, dataValidation$Y1)
rmse_val
## [1] 2.930979
r2_val
## [1] 0.9150631
ggplot() +
geom_density(aes(x = dataTrain$Y1), color = "blue") +
geom_density(aes(x = dataValidation$Y1), color = "red") +
ggtitle("Densidad Y1: Train (azul) vs Validation (rojo)")
ggplot(data.frame(obs = dataValidation$Y1, pred = preds), aes(x = obs, y = pred)) +
geom_point() +
ggtitle("Predicciones vs Observaciones")