Ejercicio 1: Función de regresión sin lm()

mi_regresion <- function(df) {
  x <- df[[1]]
  y <- df[[2]]
  n <- length(x)

  beta1 <- (sum(x) * sum(y) - n * sum(x * y)) / ((sum(x)^2) - n * sum(x^2))
  beta0 <- (sum(y) - beta1 * sum(x)) / n

  y_hat <- beta0 + beta1 * x
  residuos <- y - y_hat

  r2 <- sum((y_hat - mean(y))^2) / sum((y - mean(y))^2)
  r <- sqrt(r2)

  grafico <- ggplot(data = df, aes(x = x, y = y)) +
    geom_point() +
    geom_abline(intercept = beta0, slope = beta1, color = "blue") +
    ggtitle("Regresión Lineal sin lm()")

  return(list(
    betas = c(beta0, beta1),
    R2 = r2,
    r = r,
    residuos = residuos,
    grafico = grafico
  ))
}

Comparación con modelo usando lm()

datos <- read_excel("dataset.xlsx")

# Crear un subconjunto simple de prueba
sub_df <- datos[, c("X1", "Y1")]

# Aplicar nuestra función personalizada
modelo_manual <- mi_regresion(sub_df)
modelo_manual$grafico

# Aplicar modelo con lm()
modelo_lm <- lm(Y1 ~ X1, data = datos)
summary(modelo_lm)
## 
## Call:
## lm(formula = Y1 ~ X1, data = datos)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -19.569  -6.332  -1.028   3.393  19.259 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -23.053      2.081  -11.08   <2e-16 ***
## X1            59.359      2.698   22.00   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.904 on 766 degrees of freedom
## Multiple R-squared:  0.3872, Adjusted R-squared:  0.3864 
## F-statistic:   484 on 1 and 766 DF,  p-value: < 2.2e-16
# Gráfica usando ggplot para comparación
ggplot(datos, aes(x = X1, y = Y1)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE, color = "red") +
  ggtitle("Regresión Lineal con lm()")
## `geom_smooth()` using formula = 'y ~ x'

# Comparar métricas
r2_lm <- summary(modelo_lm)$r.squared
r_manual <- modelo_manual$r
r2_manual <- modelo_manual$R2

cat("R2 (manual):", r2_manual, "\n")
## R2 (manual): 0.3872224
cat("R2 (lm):", r2_lm, "\n")
## R2 (lm): 0.3872224
cat("Correlación manual:", r_manual, "\n")
## Correlación manual: 0.6222719
cat("Correlación lm:", sqrt(r2_lm), "\n")
## Correlación lm: 0.6222719

Ejercicio 2: Análisis del dataset

1. Leer los datos

datos <- read_excel("dataset.xlsx")
summary(datos)
##        X1               X2              X3              X4       
##  Min.   :0.6200   Min.   :514.5   Min.   :245.0   Min.   :110.2  
##  1st Qu.:0.6825   1st Qu.:606.4   1st Qu.:294.0   1st Qu.:140.9  
##  Median :0.7500   Median :673.8   Median :318.5   Median :183.8  
##  Mean   :0.7642   Mean   :671.7   Mean   :318.5   Mean   :176.6  
##  3rd Qu.:0.8300   3rd Qu.:741.1   3rd Qu.:343.0   3rd Qu.:220.5  
##  Max.   :0.9800   Max.   :808.5   Max.   :416.5   Max.   :220.5  
##        X5             X6             X7               X8              Y1       
##  Min.   :3.50   Min.   :2.00   Min.   :0.0000   Min.   :0.000   Min.   : 6.01  
##  1st Qu.:3.50   1st Qu.:2.75   1st Qu.:0.1000   1st Qu.:1.750   1st Qu.:12.99  
##  Median :5.25   Median :3.50   Median :0.2500   Median :3.000   Median :18.95  
##  Mean   :5.25   Mean   :3.50   Mean   :0.2344   Mean   :2.812   Mean   :22.31  
##  3rd Qu.:7.00   3rd Qu.:4.25   3rd Qu.:0.4000   3rd Qu.:4.000   3rd Qu.:31.67  
##  Max.   :7.00   Max.   :5.00   Max.   :0.4000   Max.   :5.000   Max.   :43.10  
##        Y2       
##  Min.   :10.90  
##  1st Qu.:15.62  
##  Median :22.08  
##  Mean   :24.59  
##  3rd Qu.:33.13  
##  Max.   :48.03

2. Gráficas de densidad

lapply(names(datos), function(var) {
  ggplot(datos, aes_string(x = var)) + geom_density() + ggtitle(paste("Densidad de", var))
})
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## [[1]]

## 
## [[2]]

## 
## [[3]]

## 
## [[4]]

## 
## [[5]]

## 
## [[6]]

## 
## [[7]]

## 
## [[8]]

## 
## [[9]]

## 
## [[10]]

3. Matriz de correlación

ggcorr(datos, label = TRUE)

4. Comentarios sobre el análisis

Los comentarios se deben redactar luego de observar el resumen y las gráficas.

5. Scatter plots contra Y1 y Y2

vars <- setdiff(names(datos), c("Y1", "Y2"))
for (var in vars) {
  print(ggplot(datos, aes_string(x = var, y = "Y1")) + geom_point() + ggtitle(paste(var, "vs Y1")))
  print(ggplot(datos, aes_string(x = var, y = "Y2")) + geom_point() + ggtitle(paste(var, "vs Y2")))
}

6. Partición del dataset

set.seed(123)
particion <- createDataPartition(datos$Y1, p = 0.7, list = FALSE)
dataTrain <- datos[particion, ]
dataValidation <- datos[-particion, ]

7. Configuración del trainControl

ctrl <- trainControl(method = "repeatedcv", number = 10, repeats = 10)

8. Entrenamiento de modelos

formulas <- list(
  Y1 ~ X1 + X2 + X3 + X4 + X5 + X6 + X7 + X8,
  Y1 ~ X1 + X3 + X5 + X7,
  Y1 ~ X2 + X4 + X6 + X8,
  Y1 ~ X1 + X5 + X2 + X6,
  Y1 ~ X1 + X4 + X7 # Modelo propuesto
)

modelos <- lapply(formulas, function(f) {
  train(f, data = dataTrain, method = "lm", trControl = ctrl)
})

9. Ordenar modelos por RMSE

rmse_vals <- sapply(modelos, function(m) min(m$results$RMSE))
orden <- order(rmse_vals)
modelos_ordenados <- modelos[orden]

10. Entrenar Champion y Challenger

Champion <- train(formulas[[orden[1]]], data = dataTrain, method = "lm")
Challenger <- train(formulas[[orden[2]]], data = dataTrain, method = "lm")

11. Validación final

preds <- predict(Champion, newdata = dataValidation)
rmse_val <- RMSE(preds, dataValidation$Y1)
r2_val <- R2(preds, dataValidation$Y1)

rmse_val
## [1] 2.930979
r2_val
## [1] 0.9150631

Densidad de Y1 en train y validación

ggplot() +
  geom_density(aes(x = dataTrain$Y1), color = "blue") +
  geom_density(aes(x = dataValidation$Y1), color = "red") +
  ggtitle("Densidad Y1: Train (azul) vs Validation (rojo)")

Predicciones vs Observaciones

ggplot(data.frame(obs = dataValidation$Y1, pred = preds), aes(x = obs, y = pred)) +
  geom_point() +
  ggtitle("Predicciones vs Observaciones")