library(tidyverse)
library(leaps)
library(DataExplorer)
library(caret)
library(ISLR)
library(ggvis)
library(glmnet)
library(gam)
library(gamlss)
library(PerformanceAnalytics)
# Read the cat data set
catdat <- read.csv(file = 'cat_data.csv', header = TRUE)
RNGkind(sample.kind="Rounding")
set.seed(4268)
train.ind = sample(1:nrow(catdat), 0.5 * nrow(catdat))
catdat.train = catdat[train.ind, ]
catdat.test = catdat[-train.ind, ]
Use best subset selection to identify a satisfactory model that uses a subset of the variables. Justify any choices you make. Report the selected variables and the test MSE.
# Maximum number of predictors (excluding the intercept)
nvars <- (model.matrix(birds ~ ., catdat.train) %>% ncol()) - 1
# Apply best subset selection to the training data
regfit.full <- regsubsets(birds ~ ., data = catdat.train, nvmax = nvars)
reg.summary <- summary(regfit.full)
reg.summary
## Subset selection object
## Call: regsubsets.formula(birds ~ ., data = catdat.train, nvmax = nvars)
## 17 Variables (and intercept)
## Forced in Forced out
## sex FALSE FALSE
## weight FALSE FALSE
## dryfood FALSE FALSE
## wetfood FALSE FALSE
## age FALSE FALSE
## owner.income FALSE FALSE
## daily.playtime FALSE FALSE
## fellow.cats FALSE FALSE
## owner.age FALSE FALSE
## house.area FALSE FALSE
## children.13 FALSE FALSE
## urban FALSE FALSE
## bell FALSE FALSE
## dogs FALSE FALSE
## daily.outdoortime FALSE FALSE
## daily.catnip FALSE FALSE
## neutered FALSE FALSE
## 1 subsets of each size up to 17
## Selection Algorithm: exhaustive
## sex weight dryfood wetfood age owner.income daily.playtime
## 1 ( 1 ) " " " " " " " " " " " " " "
## 2 ( 1 ) " " " " " " " " " " " " "*"
## 3 ( 1 ) " " " " " " " " " " " " "*"
## 4 ( 1 ) " " " " " " "*" " " " " "*"
## 5 ( 1 ) " " " " " " "*" " " " " "*"
## 6 ( 1 ) " " " " " " "*" " " " " "*"
## 7 ( 1 ) " " " " "*" "*" " " " " "*"
## 8 ( 1 ) " " "*" " " "*" " " " " "*"
## 9 ( 1 ) " " "*" "*" "*" " " " " "*"
## 10 ( 1 ) " " "*" "*" "*" " " " " "*"
## 11 ( 1 ) " " "*" "*" "*" " " " " "*"
## 12 ( 1 ) " " "*" "*" "*" " " " " "*"
## 13 ( 1 ) "*" "*" "*" "*" " " " " "*"
## 14 ( 1 ) "*" "*" "*" "*" "*" " " "*"
## 15 ( 1 ) "*" "*" "*" "*" "*" " " "*"
## 16 ( 1 ) "*" "*" "*" "*" "*" "*" "*"
## 17 ( 1 ) "*" "*" "*" "*" "*" "*" "*"
## fellow.cats owner.age house.area children.13 urban bell dogs
## 1 ( 1 ) " " " " " " " " " " " " " "
## 2 ( 1 ) " " " " " " " " " " " " " "
## 3 ( 1 ) " " " " " " " " " " "*" " "
## 4 ( 1 ) " " " " " " " " " " "*" " "
## 5 ( 1 ) " " " " " " " " "*" "*" " "
## 6 ( 1 ) " " " " " " "*" "*" "*" " "
## 7 ( 1 ) " " " " " " "*" "*" "*" " "
## 8 ( 1 ) " " " " " " "*" "*" "*" " "
## 9 ( 1 ) " " " " " " "*" "*" "*" " "
## 10 ( 1 ) "*" " " " " "*" "*" "*" " "
## 11 ( 1 ) "*" "*" " " "*" "*" "*" " "
## 12 ( 1 ) " " "*" "*" "*" "*" "*" "*"
## 13 ( 1 ) " " "*" "*" "*" "*" "*" "*"
## 14 ( 1 ) " " "*" "*" "*" "*" "*" "*"
## 15 ( 1 ) " " "*" "*" "*" "*" "*" "*"
## 16 ( 1 ) " " "*" "*" "*" "*" "*" "*"
## 17 ( 1 ) "*" "*" "*" "*" "*" "*" "*"
## daily.outdoortime daily.catnip neutered
## 1 ( 1 ) "*" " " " "
## 2 ( 1 ) "*" " " " "
## 3 ( 1 ) "*" " " " "
## 4 ( 1 ) "*" " " " "
## 5 ( 1 ) "*" " " " "
## 6 ( 1 ) "*" " " " "
## 7 ( 1 ) "*" " " " "
## 8 ( 1 ) "*" " " "*"
## 9 ( 1 ) "*" " " "*"
## 10 ( 1 ) "*" " " "*"
## 11 ( 1 ) "*" " " "*"
## 12 ( 1 ) "*" " " "*"
## 13 ( 1 ) "*" " " "*"
## 14 ( 1 ) "*" " " "*"
## 15 ( 1 ) "*" "*" "*"
## 16 ( 1 ) "*" "*" "*"
## 17 ( 1 ) "*" "*" "*"
# Design matrix for the test set (regsubsets provides no predict method)
test.mat <- model.matrix(birds ~ ., data = catdat.test)
# R^2 for each subset size
reg.summary$rsq
## [1] 0.7466277 0.8617156 0.9264257 0.9378112 0.9433293 0.9459755 0.9470128
## [8] 0.9479709 0.9484982 0.9486288 0.9487164 0.9487697 0.9487846 0.9487892
## [15] 0.9487895 0.9487896 0.9487896
# Plot of R^2 against subset size
rsq <- as.data.frame(reg.summary$rsq)
names(rsq) <- "R2"
rsq %>%
  ggvis(x = ~c(1:nrow(rsq)), y = ~R2) %>%
  layer_points(fill = ~R2) %>%
  add_axis("y", title = "R2") %>%
  add_axis("x", title = "Number of variables")
# Comparative plots of RSS, adjusted R^2, Cp, and BIC
par(mfrow = c(2, 2))
plot(reg.summary$rss, xlab = "Number of variables", ylab = "RSS", type = "l")
plot(reg.summary$adjr2, xlab = "Number of variables", ylab = "Adjusted R2", type = "l")
np.r2 <- which.max(reg.summary$adjr2) # optimal size under this criterion
points(np.r2, reg.summary$adjr2[np.r2], col = "red", cex = 2, pch = 20)
plot(reg.summary$cp, xlab = "Number of variables", ylab = "Cp", type = "l")
np.cp <- which.min(reg.summary$cp)
points(np.cp, reg.summary$cp[np.cp], col = "red", cex = 2, pch = 20)
plot(reg.summary$bic, xlab = "Number of variables", ylab = "BIC", type = "l")
np.bic <- which.min(reg.summary$bic)
points(np.bic, reg.summary$bic[np.bic], col = "red", cex = 2, pch = 20)
# Variables selected for the best model of each size, under each criterion
# Criterion: R^2
plot(regfit.full, scale = "r2")
# Criterion: adjusted R^2
plot(regfit.full, scale = "adjr2")
# Criterion: Mallows' Cp
plot(regfit.full, scale = "Cp")
# Criterion: Bayesian information criterion
plot(regfit.full, scale = "bic")
# Coefficients of the best model according to BIC
coefi <- coef(regfit.full, np.bic); coefi
## (Intercept) wetfood daily.playtime children.13
## 110.788002 -10.491222 -10.278148 4.652786
## urban bell daily.outdoortime
## -11.788021 -53.048301 1.123833
# Predict by multiplying the design matrix by the coefficient vector
pred <- test.mat[, names(coefi)] %*% coefi
# Test MSE
MSE.test <- mean((catdat.test$birds - pred)^2); MSE.test
## [1] 350.9913
## Model selection using the validation-set (hold-out) scheme
# Compute the validation-set error for each subset size
val.errors <- rep(NA, nvars)
# Iterate over each size i
for (i in 1:nvars) {
  # Extract the coefficient vector of the best model with i predictors
  coefi <- coef(regfit.full, id = i)
  # Predict by multiplying the design matrix by the coefficient vector
  pred <- test.mat[, names(coefi)] %*% coefi
  # Test MSE
  val.errors[i] <- mean((catdat.test$birds - pred)^2)
}
val.errors
## [1] 1180.4898 821.5044 485.3275 416.8986 367.9665 350.9913 352.7766
## [8] 329.1987 332.7839 333.6023 335.2497 337.1138 338.1776 338.8596
## [15] 338.9740 338.9917 338.9965
# Find the model size with the smallest validation MSE
# (named best.size rather than `min`, to avoid masking base::min)
best.size <- which.min(val.errors)
best.size
## [1] 8
# Minimum MSE
val.errors[best.size]
## [1] 329.1987
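The design-matrix multiplication used above is the standard workaround for the missing predict method; it can be wrapped once in a small helper so later code can call predict() directly. A sketch (predict.regsubsets is our own helper name, not part of leaps):
# Hypothetical helper: an S3 predict method for regsubsets objects
predict.regsubsets <- function(object, newdata, id, ...) {
  form  <- as.formula(object$call[[2]])  # recover the original model formula
  mat   <- model.matrix(form, newdata)   # design matrix for the new data
  coefi <- coef(object, id = id)         # best model with `id` predictors
  drop(mat[, names(coefi)] %*% coefi)
}
# e.g. test-set predictions of the 8-variable model:
# predict(regfit.full, catdat.test, id = 8)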
Conclusions:
Under the validation-set approach, the best model has 8 predictors and a test MSE of 329.1987, slightly below the 6-variable model chosen by BIC (test MSE 350.9913, which selects wetfood, daily.playtime, children.13, urban, bell, and daily.outdoortime).
Now use lasso regression on the same data set. Explain how you choose λ. Report the non-zero coefficients and the test MSE.
# Setting up the data structures
x.train <- model.matrix(birds ~ ., data = catdat.train)[, -1]
y.train <- catdat.train$birds
x.test <- model.matrix(birds ~ ., data = catdat.test)[, -1]
y.test <- catdat.test$birds
# Grid of lambda values (from 10^10 down to 10^-2)
grid <- 10^seq(10, -2, length = 100)
# Fit the model on the training data
lasso.mod <- glmnet(x.train, y.train, alpha = 1, lambda = grid)
plot(lasso.mod)
# Cross-validation to select the value of lambda
set.seed(1)
# 10-fold CV (the cv.glmnet default) on the training data
cv.out <- cv.glmnet(x.train, y.train, alpha = 1)
plot(cv.out)
# Select the lambda value that minimizes the mean CV error
bestlam <- cv.out$lambda.min
bestlam
## [1] 0.8338972
# Use the selected lambda to predict on the test data
lasso.pred <- predict(lasso.mod, s = bestlam, newx = x.test)
# Test MSE
MSE.test.lasso <- mean((lasso.pred - y.test)^2)
MSE.test.lasso
## [1] 338.1803
# Fit the final model on the full data set
# (glmnet expects a numeric matrix, so build it with model.matrix)
x.full <- model.matrix(birds ~ ., data = catdat)[, -1]
out <- glmnet(x.full, catdat$birds, alpha = 1, lambda = grid)
lasso.coef <- predict(out, type = "coefficients", s = bestlam)[1:18, ]
lasso.coef
## (Intercept) sex weight dryfood
## 110.208685314 0.000000000 -2.995802389 -0.338370867
## wetfood age owner.income daily.playtime
## -6.926685964 -0.001240092 0.000000000 -9.869753084
## fellow.cats owner.age house.area children.13
## 0.000000000 0.000000000 0.016487044 4.579185954
## urban bell dogs daily.outdoortime
## -10.770542017 -49.190749015 0.000000000 1.117772385
## daily.catnip neutered
## 0.000000000 -5.296325431
# Non-zero coefficients
coef.non.zero <- lasso.coef[lasso.coef != 0]; coef.non.zero
## (Intercept) weight dryfood wetfood
## 110.208685314 -2.995802389 -0.338370867 -6.926685964
## age daily.playtime house.area children.13
## -0.001240092 -9.869753084 0.016487044 4.579185954
## urban bell daily.outdoortime neutered
## -10.770542017 -49.190749015 1.117772385 -5.296325431
Conclusions:
λ is chosen by cross-validation on the training data with k = 10 folds (the cv.glmnet default): for each candidate λ, the MSE is computed on every held-out fold, the 10 fold errors are averaged, and the λ with the smallest mean CV error (lambda.min) is selected as the optimal value.
The non-zero coefficients are weight, dryfood, wetfood, age, daily.playtime, house.area, children.13, urban, bell, daily.outdoortime, and neutered; the remaining six predictors (sex, owner.income, fellow.cats, owner.age, dogs, daily.catnip) are shrunk exactly to zero.
The test MSE is 338.1803.
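Besides lambda.min, cv.glmnet also reports lambda.1se, the largest λ whose mean CV error lies within one standard error of the minimum; checking it is a common way to trade a little accuracy for extra sparsity. A sketch reusing cv.out from above:
# lambda.min minimizes the mean CV error; lambda.1se is a more conservative,
# sparser choice within one standard error of that minimum
c(lambda.min = cv.out$lambda.min, lambda.1se = cv.out$lambda.1se)
coef(cv.out, s = "lambda.1se")  # coefficients under the sparser choice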
For lasso regression, what happens when λ → ∞? What happens when λ = 0?
Answer:
As λ tends to infinity, the penalty on the coefficients becomes arbitrarily large and all slope coefficients are driven to 0, leaving an intercept-only model, since the penalty is not applied to the intercept.
When λ is 0 there is no penalty on the coefficients at all, so the result is the usual ordinary-least-squares linear regression.
These answers follow the Statistical Learning textbook: see the Lagrangian form of the lasso in equation (3.52) on page 68, where the impact of λ on the objective can be read off directly.
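For reference, the Lagrangian form of the lasso referred to above is

$$\hat{\beta}^{\text{lasso}} = \underset{\beta}{\arg\min} \left\{ \sum_{i=1}^{n} \Big( y_i - \beta_0 - \sum_{j=1}^{p} x_{ij}\beta_j \Big)^2 + \lambda \sum_{j=1}^{p} |\beta_j| \right\},$$

so as λ grows the penalty term dominates and forces every slope toward 0, while λ = 0 leaves the ordinary least-squares criterion. Both limits can be checked numerically on the training data; a quick sketch:
# lambda very large: every slope is shrunk to exactly zero
coef(glmnet(x.train, y.train, alpha = 1, lambda = 1e10))
# lambda = 0: the lasso fit should agree with OLS up to numerical tolerance
cbind(lasso = as.numeric(coef(glmnet(x.train, y.train, alpha = 1, lambda = 0))),
      ols   = coef(lm(y.train ~ x.train)))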
Now check whether the test MSE is actually better for the models in a) and b), compared with i) an intercept-only model, and ii) a multiple linear regression using all covariates.
# Intercept-only model
lm.cons <- lm(formula = birds ~ 1, data = catdat.train)
summary(lm.cons)
##
## Call:
## lm(formula = birds ~ 1, data = catdat.train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -164.35 -55.89 -12.75 38.97 276.62
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 167.710 5.099 32.89 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 76.65 on 225 degrees of freedom
# Predictions
pred.cons <- predict(object = lm.cons, newdata = catdat.test[, -1])
# MSE
MSE.cons <- mean((catdat.test$birds - pred.cons)^2); MSE.cons
## [1] 5049.779
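The intercept-only fit predicts the training mean of birds for every observation, which is why the estimated intercept equals that mean; a quick check:
# The fitted intercept is just the training-sample mean of the response
mean(catdat.train$birds)  # ~167.71, matching the coefficient above
# Hence the test MSE is the mean squared deviation of the test responses
# from the training mean
mean((catdat.test$birds - mean(catdat.train$birds))^2)  # ~5049.78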
Conclusion:
The intercept-only model yields a test MSE of 5049.78, an order of magnitude worse than the models in a) and b).
# Full model
lm.full <- lm(formula = birds ~ ., data = catdat.train)
summary(lm.full)
##
## Call:
## lm(formula = birds ~ ., data = catdat.train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -37.509 -10.968 -2.753 11.156 72.152
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.105e+02 2.916e+01 3.791 0.000197 ***
## sex 6.576e-01 2.668e+00 0.246 0.805587
## weight -2.269e+00 1.463e+00 -1.551 0.122459
## dryfood -3.026e+00 2.081e+00 -1.454 0.147513
## wetfood -8.377e+00 2.099e+00 -3.991 9.10e-05 ***
## age 4.890e-02 3.583e-01 0.136 0.891570
## owner.income 1.460e-07 1.263e-05 0.012 0.990791
## daily.playtime -1.024e+01 5.429e-01 -18.866 < 2e-16 ***
## fellow.cats -1.112e-02 8.811e+00 -0.001 0.998994
## owner.age 1.157e-01 1.944e-01 0.595 0.552220
## house.area 4.751e-02 1.143e-01 0.416 0.678122
## children.13 4.834e+00 6.277e+00 0.770 0.442136
## urban -1.115e+01 2.570e+00 -4.338 2.24e-05 ***
## bell -5.292e+01 3.282e+00 -16.123 < 2e-16 ***
## dogs -2.904e+00 7.405e+00 -0.392 0.695323
## daily.outdoortime 1.125e+00 2.092e-01 5.377 2.03e-07 ***
## daily.catnip -3.368e-02 8.736e-01 -0.039 0.969283
## neutered -5.933e+00 3.482e+00 -1.704 0.089885 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 18.04 on 208 degrees of freedom
## Multiple R-squared: 0.9488, Adjusted R-squared: 0.9446
## F-statistic: 226.7 on 17 and 208 DF, p-value: < 2.2e-16
# Predictions
pred.full <- predict(object = lm.full, newdata = catdat.test[, -1])
# MSE
MSE.full <- mean((catdat.test$birds - pred.full)^2); MSE.full
## [1] 338.9965
Conclusion:
The full linear model attains a test MSE of 338.9965, essentially the same as the lasso and best-subset models.
Present all the test MSE values from best subset selection, lasso regression, intercept-only linear regression, and ordinary linear regression in a table. Explain what you see. Does it match what you would have expected?
# Best subsets (using BIC)
MSE.test
## [1] 350.9913
# Lasso regression
MSE.test.lasso
## [1] 338.1803
# Linear regression with the intercept only
MSE.cons
## [1] 5049.779
# Linear regression with all predictors
MSE.full
## [1] 338.9965
MSE.agr <-
  data.frame(Metodo = c('Subconjuntos', 'Lasso', 'RL.Inter', 'RL.Full'),
             MSE = c(MSE.test, MSE.test.lasso, MSE.cons, MSE.full))
MSE.agr
## Metodo MSE
## 1 Subconjuntos 350.9913
## 2 Lasso 338.1803
## 3 RL.Inter 5049.7793
## 4 RL.Full 338.9965
Conclusions:
Best subset selection, lasso regression, and the full linear regression all give relatively low and very similar test MSEs.
The intercept-only regression yields an extremely high MSE compared with the other methods. This is expected: it predicts the same value (the training mean) for every observation, ignoring all covariate information, which is unrealistic.
The College data set from the ISLR library contains statistics on demographic characteristics, tuition, etc., for a large number of US colleges from the 1995 issue of US News and World Report. Split the data into two parts: randomly select 600 observations and assign them to a training sample, and the rest to a test sample (use 281117 as the seed value for the selection).
data("College")
data.set <- tibble(College)
# Structure of the data set
str(data.set)
## tibble[,18] [777 x 18] (S3: tbl_df/tbl/data.frame)
## $ Private : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 2 2 2 2 2 ...
## $ Apps : num [1:777] 1660 2186 1428 417 193 ...
## $ Accept : num [1:777] 1232 1924 1097 349 146 ...
## $ Enroll : num [1:777] 721 512 336 137 55 158 103 489 227 172 ...
## $ Top10perc : num [1:777] 23 16 22 60 16 38 17 37 30 21 ...
## $ Top25perc : num [1:777] 52 29 50 89 44 62 45 68 63 44 ...
## $ F.Undergrad: num [1:777] 2885 2683 1036 510 249 ...
## $ P.Undergrad: num [1:777] 537 1227 99 63 869 ...
## $ Outstate : num [1:777] 7440 12280 11250 12960 7560 ...
## $ Room.Board : num [1:777] 3300 6450 3750 5450 4120 ...
## $ Books : num [1:777] 450 750 400 450 800 500 500 450 300 660 ...
## $ Personal : num [1:777] 2200 1500 1165 875 1500 ...
## $ PhD : num [1:777] 70 29 53 92 76 67 90 89 79 40 ...
## $ Terminal : num [1:777] 78 30 66 97 72 73 93 100 84 41 ...
## $ S.F.Ratio : num [1:777] 18.1 12.2 12.9 7.7 11.9 9.4 11.5 13.7 11.3 11.5 ...
## $ perc.alumni: num [1:777] 12 16 30 37 2 11 26 37 23 15 ...
## $ Expend : num [1:777] 7041 10527 8735 19016 10922 ...
## $ Grad.Rate : num [1:777] 60 56 54 59 15 55 63 73 80 52 ...
# Class proportions
table.class <- table(data.set$Private); table.class
##
## No Yes
## 212 565
prop.table(table.class)
##
## No Yes
## 0.2728443 0.7271557
# Structure of the data set (dplyr view)
dplyr::glimpse(data.set)
## Rows: 777
## Columns: 18
## $ Private <fct> Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes~
## $ Apps <dbl> 1660, 2186, 1428, 417, 193, 587, 353, 1899, 1038, 582, 173~
## $ Accept <dbl> 1232, 1924, 1097, 349, 146, 479, 340, 1720, 839, 498, 1425~
## $ Enroll <dbl> 721, 512, 336, 137, 55, 158, 103, 489, 227, 172, 472, 484,~
## $ Top10perc <dbl> 23, 16, 22, 60, 16, 38, 17, 37, 30, 21, 37, 44, 38, 44, 23~
## $ Top25perc <dbl> 52, 29, 50, 89, 44, 62, 45, 68, 63, 44, 75, 77, 64, 73, 46~
## $ F.Undergrad <dbl> 2885, 2683, 1036, 510, 249, 678, 416, 1594, 973, 799, 1830~
## $ P.Undergrad <dbl> 537, 1227, 99, 63, 869, 41, 230, 32, 306, 78, 110, 44, 638~
## $ Outstate <dbl> 7440, 12280, 11250, 12960, 7560, 13500, 13290, 13868, 1559~
## $ Room.Board <dbl> 3300, 6450, 3750, 5450, 4120, 3335, 5720, 4826, 4400, 3380~
## $ Books <dbl> 450, 750, 400, 450, 800, 500, 500, 450, 300, 660, 500, 400~
## $ Personal <dbl> 2200, 1500, 1165, 875, 1500, 675, 1500, 850, 500, 1800, 60~
## $ PhD <dbl> 70, 29, 53, 92, 76, 67, 90, 89, 79, 40, 82, 73, 60, 79, 36~
## $ Terminal <dbl> 78, 30, 66, 97, 72, 73, 93, 100, 84, 41, 88, 91, 84, 87, 6~
## $ S.F.Ratio <dbl> 18.1, 12.2, 12.9, 7.7, 11.9, 9.4, 11.5, 13.7, 11.3, 11.5, ~
## $ perc.alumni <dbl> 12, 16, 30, 37, 2, 11, 26, 37, 23, 15, 31, 41, 21, 32, 26,~
## $ Expend <dbl> 7041, 10527, 8735, 19016, 10922, 9727, 8861, 11487, 11644,~
## $ Grad.Rate <dbl> 60, 56, 54, 59, 15, 55, 63, 73, 80, 52, 73, 76, 74, 68, 55~
skimr::skim_without_charts(data.set)
| Name | data.set |
| Number of rows | 777 |
| Number of columns | 18 |
| _______________________ | |
| Column type frequency: | |
| factor | 1 |
| numeric | 17 |
| ________________________ | |
| Group variables | None |
Variable type: factor
| skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
|---|---|---|---|---|---|
| Private | 0 | 1 | FALSE | 2 | Yes: 565, No: 212 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 |
|---|---|---|---|---|---|---|---|---|---|
| Apps | 0 | 1 | 3001.64 | 3870.20 | 81.0 | 776.0 | 1558.0 | 3624.0 | 48094.0 |
| Accept | 0 | 1 | 2018.80 | 2451.11 | 72.0 | 604.0 | 1110.0 | 2424.0 | 26330.0 |
| Enroll | 0 | 1 | 779.97 | 929.18 | 35.0 | 242.0 | 434.0 | 902.0 | 6392.0 |
| Top10perc | 0 | 1 | 27.56 | 17.64 | 1.0 | 15.0 | 23.0 | 35.0 | 96.0 |
| Top25perc | 0 | 1 | 55.80 | 19.80 | 9.0 | 41.0 | 54.0 | 69.0 | 100.0 |
| F.Undergrad | 0 | 1 | 3699.91 | 4850.42 | 139.0 | 992.0 | 1707.0 | 4005.0 | 31643.0 |
| P.Undergrad | 0 | 1 | 855.30 | 1522.43 | 1.0 | 95.0 | 353.0 | 967.0 | 21836.0 |
| Outstate | 0 | 1 | 10440.67 | 4023.02 | 2340.0 | 7320.0 | 9990.0 | 12925.0 | 21700.0 |
| Room.Board | 0 | 1 | 4357.53 | 1096.70 | 1780.0 | 3597.0 | 4200.0 | 5050.0 | 8124.0 |
| Books | 0 | 1 | 549.38 | 165.11 | 96.0 | 470.0 | 500.0 | 600.0 | 2340.0 |
| Personal | 0 | 1 | 1340.64 | 677.07 | 250.0 | 850.0 | 1200.0 | 1700.0 | 6800.0 |
| PhD | 0 | 1 | 72.66 | 16.33 | 8.0 | 62.0 | 75.0 | 85.0 | 103.0 |
| Terminal | 0 | 1 | 79.70 | 14.72 | 24.0 | 71.0 | 82.0 | 92.0 | 100.0 |
| S.F.Ratio | 0 | 1 | 14.09 | 3.96 | 2.5 | 11.5 | 13.6 | 16.5 | 39.8 |
| perc.alumni | 0 | 1 | 22.74 | 12.39 | 0.0 | 13.0 | 21.0 | 31.0 | 64.0 |
| Expend | 0 | 1 | 9660.17 | 5221.77 | 3186.0 | 6751.0 | 8377.0 | 10830.0 | 56233.0 |
| Grad.Rate | 0 | 1 | 65.46 | 17.18 | 10.0 | 53.0 | 65.0 | 78.0 | 118.0 |
# Overall evaluation of the data set
DataExplorer::plot_intro(data.set)
# Evaluation of the categorical variable
data.set %>%
  DataExplorer::plot_bar(
    ggtheme = ggplot2::theme_minimal(base_family = 'serif'))
# Evaluation of the numeric variables
DataExplorer::plot_histogram(
  data = data.set,
  ggtheme = ggplot2::theme_minimal(base_family = 'serif'))
# QQ plots for the numeric variables
DataExplorer::plot_qq(
  data = data.set,
  ggtheme = ggplot2::theme_minimal(base_family = 'serif'))
# # Outstate
# plotly::plot_ly() %>%
#   plotly::add_histogram(x = ~Outstate, data = data.set) %>%
#   plotly::layout(title = 'Distribution of Outstate')
RNGkind(sample.kind = "Rounding")
set.seed(281117)
filas <- sample(1:777, 600, replace = FALSE)
data.train <- data.set[filas, ]  # training sample
data.test <- data.set[-filas, ]  # test sample
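Since Private is imbalanced (roughly 73% "Yes"), a stratified split would keep the class proportions similar in both samples. A sketch using caret (already loaded), shown only as an alternative and not used below:
# Stratified alternative: sample within the levels of Private so both
# samples keep roughly the 73/27 split observed above
set.seed(281117)
idx <- caret::createDataPartition(data.set$Private, p = 600/777, list = FALSE)[, 1]
strat.train <- data.set[idx, ]
strat.test  <- data.set[-idx, ]
prop.table(table(strat.train$Private))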
Using the training sample, take Outstate as the response variable and the rest as predictors, and fit the following models:
# Model 1: Gaussian GLM with identity link (multiple linear regression)
mod1 <- glm(Outstate ~ .,
            family = gaussian(link = identity), data = data.train)
summary(mod1)
##
## Call:
## glm(formula = Outstate ~ ., family = gaussian(link = identity),
## data = data.train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -7789.7 -1265.9 -42.5 1261.7 9722.1
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.856e+03 8.788e+02 -2.112 0.035110 *
## PrivateYes 2.479e+03 2.844e+02 8.717 < 2e-16 ***
## Apps -2.985e-01 7.268e-02 -4.107 4.58e-05 ***
## Accept 7.887e-01 1.397e-01 5.646 2.57e-08 ***
## Enroll -6.937e-01 4.007e-01 -1.731 0.083941 .
## Top10perc 2.099e+01 1.246e+01 1.685 0.092456 .
## Top25perc -1.050e+01 9.652e+00 -1.088 0.276990
## F.Undergrad -5.458e-02 6.893e-02 -0.792 0.428804
## P.Undergrad 2.765e-02 6.510e-02 0.425 0.671147
## Room.Board 8.460e-01 9.802e-02 8.631 < 2e-16 ***
## Books -5.275e-01 4.829e-01 -1.092 0.275184
## Personal -2.162e-01 1.313e-01 -1.647 0.100076
## PhD 1.011e+01 9.654e+00 1.047 0.295317
## Terminal 3.299e+01 1.068e+01 3.089 0.002101 **
## S.F.Ratio -5.490e+01 3.013e+01 -1.822 0.068988 .
## perc.alumni 3.688e+01 8.541e+00 4.317 1.85e-05 ***
## Expend 2.507e-01 2.941e-02 8.523 < 2e-16 ***
## Grad.Rate 2.325e+01 6.294e+00 3.694 0.000241 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 3718008)
##
## Null deviance: 9398382681 on 599 degrees of freedom
## Residual deviance: 2163880821 on 582 degrees of freedom
## AIC: 10800
##
## Number of Fisher Scoring iterations: 2
We select the variables using the hybrid (forward-and-backward) stepwise method, and use them for the GAM models.
regfit.swd <-
regsubsets(Outstate ~ ., data = data.train, nvmax = 17, method = "seqrep")
summary(regfit.swd)
## Subset selection object
## Call: regsubsets.formula(Outstate ~ ., data = data.train, nvmax = 17,
## method = "seqrep")
## 17 Variables (and intercept)
## Forced in Forced out
## PrivateYes FALSE FALSE
## Apps FALSE FALSE
## Accept FALSE FALSE
## Enroll FALSE FALSE
## Top10perc FALSE FALSE
## Top25perc FALSE FALSE
## F.Undergrad FALSE FALSE
## P.Undergrad FALSE FALSE
## Room.Board FALSE FALSE
## Books FALSE FALSE
## Personal FALSE FALSE
## PhD FALSE FALSE
## Terminal FALSE FALSE
## S.F.Ratio FALSE FALSE
## perc.alumni FALSE FALSE
## Expend FALSE FALSE
## Grad.Rate FALSE FALSE
## 1 subsets of each size up to 17
## Selection Algorithm: 'sequential replacement'
## PrivateYes Apps Accept Enroll Top10perc Top25perc F.Undergrad
## 1 ( 1 ) " " " " " " " " " " " " " "
## 2 ( 1 ) "*" "*" " " " " " " " " " "
## 3 ( 1 ) "*" " " " " " " " " " " " "
## 4 ( 1 ) "*" " " " " " " " " " " " "
## 5 ( 1 ) "*" " " " " " " " " " " " "
## 6 ( 1 ) "*" " " " " " " " " " " " "
## 7 ( 1 ) "*" " " " " " " " " " " " "
## 8 ( 1 ) "*" " " "*" " " " " " " "*"
## 9 ( 1 ) "*" "*" "*" "*" " " " " " "
## 10 ( 1 ) "*" "*" "*" "*" " " " " " "
## 11 ( 1 ) "*" "*" "*" "*" " " " " " "
## 12 ( 1 ) "*" "*" "*" "*" " " " " " "
## 13 ( 1 ) "*" "*" "*" "*" "*" " " " "
## 14 ( 1 ) "*" "*" "*" "*" "*" "*" " "
## 15 ( 1 ) "*" "*" "*" "*" "*" "*" "*"
## 16 ( 1 ) "*" "*" "*" "*" "*" "*" "*"
## 17 ( 1 ) "*" "*" "*" "*" "*" "*" "*"
## P.Undergrad Room.Board Books Personal PhD Terminal S.F.Ratio
## 1 ( 1 ) " " " " " " " " " " " " " "
## 2 ( 1 ) " " " " " " " " " " " " " "
## 3 ( 1 ) " " "*" " " " " " " " " " "
## 4 ( 1 ) " " "*" " " " " " " "*" " "
## 5 ( 1 ) " " "*" " " " " " " "*" " "
## 6 ( 1 ) " " "*" " " " " " " "*" " "
## 7 ( 1 ) " " "*" " " "*" " " "*" " "
## 8 ( 1 ) " " "*" " " " " " " "*" " "
## 9 ( 1 ) " " "*" " " " " " " "*" " "
## 10 ( 1 ) " " "*" " " "*" " " "*" " "
## 11 ( 1 ) " " "*" " " "*" " " "*" "*"
## 12 ( 1 ) " " "*" " " "*" "*" "*" "*"
## 13 ( 1 ) " " "*" "*" "*" " " "*" "*"
## 14 ( 1 ) " " "*" "*" "*" " " "*" "*"
## 15 ( 1 ) "*" "*" "*" "*" "*" "*" "*"
## 16 ( 1 ) "*" "*" "*" "*" "*" "*" "*"
## 17 ( 1 ) "*" "*" "*" "*" "*" "*" "*"
## perc.alumni Expend Grad.Rate
## 1 ( 1 ) " " "*" " "
## 2 ( 1 ) " " " " " "
## 3 ( 1 ) " " "*" " "
## 4 ( 1 ) " " "*" " "
## 5 ( 1 ) "*" "*" " "
## 6 ( 1 ) "*" "*" "*"
## 7 ( 1 ) "*" "*" "*"
## 8 ( 1 ) "*" "*" "*"
## 9 ( 1 ) "*" "*" "*"
## 10 ( 1 ) "*" "*" "*"
## 11 ( 1 ) "*" "*" "*"
## 12 ( 1 ) "*" "*" "*"
## 13 ( 1 ) "*" "*" "*"
## 14 ( 1 ) "*" "*" "*"
## 15 ( 1 ) "*" " " " "
## 16 ( 1 ) "*" "*" " "
## 17 ( 1 ) "*" "*" "*"
# Best model according to Cp
plot(regfit.swd, scale = "Cp")
cp.low <- which.min(summary(regfit.swd)$cp)
plot(summary(regfit.swd)$cp, xlab = "Number of variables", ylab = "Cp", type = "l")
points(cp.low, summary(regfit.swd)$cp[cp.low], col = "red", cex = 2, pch = 20)
# Selected variables:
vars.selec <- coef(regfit.swd, cp.low); vars.selec
## (Intercept) PrivateYes Apps Accept Enroll
## -2367.5159474 2550.2735822 -0.2648657 0.7168814 -0.8580567
## Room.Board Personal PhD Terminal S.F.Ratio
## 0.8287475 -0.2476054 14.7421943 29.2619243 -58.1057200
## perc.alumni Expend Grad.Rate
## 38.2784421 0.2670621 24.6978797
# Model 2: smoothing-spline GAM with df = 1 (effectively linear terms)
mod2 <-
  gam(Outstate ~
        s(Apps, df = 1) + s(Accept, df = 1) + s(Enroll, df = 1) + s(Room.Board, df = 1) +
        s(Personal, df = 1) + s(PhD, df = 1) + s(Terminal, df = 1) + s(S.F.Ratio, df = 1) +
        s(perc.alumni, df = 1) + s(Expend, df = 1) + s(Grad.Rate, df = 1) + Private,
      data = data.train)
summary(mod2)
##
## Call: gam(formula = Outstate ~ s(Apps, df = 1) + s(Accept, df = 1) +
## s(Enroll, df = 1) + s(Room.Board, df = 1) + s(Personal, df = 1) +
## s(PhD, df = 1) + s(Terminal, df = 1) + s(S.F.Ratio, df = 1) +
## s(perc.alumni, df = 1) + s(Expend, df = 1) + s(Grad.Rate,
## df = 1) + Private, data = data.train)
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -7834.27 -1264.70 -15.78 1255.21 9596.64
##
## (Dispersion Parameter for gaussian family taken to be 3716956)
##
## Null Deviance: 9398382681 on 599 degrees of freedom
## Residual Deviance: 2181845640 on 586.998 degrees of freedom
## AIC: 10794.63
##
## Number of Local Scoring Iterations: NA
##
## Anova for Parametric Effects
## Df Sum Sq Mean Sq F value Pr(>F)
## s(Apps, df = 1) 1 30615387 30615387 8.2367 0.004253 **
## s(Accept, df = 1) 1 310457410 310457410 83.5246 < 2.2e-16 ***
## s(Enroll, df = 1) 1 849781177 849781177 228.6229 < 2.2e-16 ***
## s(Room.Board, df = 1) 1 3142881257 3142881257 845.5525 < 2.2e-16 ***
## s(Personal, df = 1) 1 116195677 116195677 31.2610 3.459e-08 ***
## s(PhD, df = 1) 1 471089910 471089910 126.7408 < 2.2e-16 ***
## s(Terminal, df = 1) 1 55848514 55848514 15.0253 0.000118 ***
## s(S.F.Ratio, df = 1) 1 964131574 964131574 259.3874 < 2.2e-16 ***
## s(perc.alumni, df = 1) 1 415234445 415234445 111.7136 < 2.2e-16 ***
## s(Expend, df = 1) 1 427454173 427454173 115.0011 < 2.2e-16 ***
## s(Grad.Rate, df = 1) 1 120672404 120672404 32.4654 1.921e-08 ***
## Private 1 312020967 312020967 83.9453 < 2.2e-16 ***
## Residuals 587 2181845640 3716956
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Anova for Nonparametric Effects
## Npar Df Npar F Pr(F)
## (Intercept)
## s(Apps, df = 1) 0 8.088 0.0029194 **
## s(Accept, df = 1) 0 9.187 0.0011829 **
## s(Enroll, df = 1) 0 0.602 0.0006359 ***
## s(Room.Board, df = 1) 0 11.471 3.097e-05 ***
## s(Personal, df = 1) 0 3.893 0.0002018 ***
## s(PhD, df = 1) 0 4.097 2.959e-05 ***
## s(Terminal, df = 1) 0 119.481 9.370e-07 ***
## s(S.F.Ratio, df = 1) 0 3.626 3.463e-05 ***
## s(perc.alumni, df = 1) 0 0.062 1.898e-05 ***
## s(Expend, df = 1) 0 68.875 0.0003162 ***
## s(Grad.Rate, df = 1) 0 1.583 1.070e-05 ***
## Private
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Model 3: smoothing-spline GAM with df = 4 (Room.Board kept linear, df = 1)
mod3 <-
  gam(Outstate ~
        s(Apps, df = 4) + s(Accept, df = 4) + s(Enroll, df = 4) + s(Room.Board, df = 1) +
        s(Personal, df = 4) + s(PhD, df = 4) + s(Terminal, df = 4) + s(S.F.Ratio, df = 4) +
        s(perc.alumni, df = 4) + s(Expend, df = 4) + s(Grad.Rate, df = 4) + Private,
      data = data.train)
summary(mod3)
##
## Call: gam(formula = Outstate ~ s(Apps, df = 4) + s(Accept, df = 4) +
## s(Enroll, df = 4) + s(Room.Board, df = 1) + s(Personal, df = 4) +
## s(PhD, df = 4) + s(Terminal, df = 4) + s(S.F.Ratio, df = 4) +
## s(perc.alumni, df = 4) + s(Expend, df = 4) + s(Grad.Rate,
## df = 4) + Private, data = data.train)
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -4871.37 -1096.65 53.34 1167.29 7109.48
##
## (Dispersion Parameter for gaussian family taken to be 2963610)
##
## Null Deviance: 9398382681 on 599 degrees of freedom
## Residual Deviance: 1650730199 on 556.9998 degrees of freedom
## AIC: 10687.26
##
## Number of Local Scoring Iterations: NA
##
## Anova for Parametric Effects
## Df Sum Sq Mean Sq F value Pr(>F)
## s(Apps, df = 4) 1 34851347 34851347 11.760 0.0006500 ***
## s(Accept, df = 4) 1 470724409 470724409 158.835 < 2.2e-16 ***
## s(Enroll, df = 4) 1 1059892615 1059892615 357.636 < 2.2e-16 ***
## s(Room.Board, df = 1) 1 2412781512 2412781512 814.136 < 2.2e-16 ***
## s(Personal, df = 4) 1 94158212 94158212 31.771 2.758e-08 ***
## s(PhD, df = 4) 1 289779100 289779100 97.779 < 2.2e-16 ***
## s(Terminal, df = 4) 1 40850322 40850322 13.784 0.0002257 ***
## s(S.F.Ratio, df = 4) 1 846272581 846272581 285.555 < 2.2e-16 ***
## s(perc.alumni, df = 4) 1 394017331 394017331 132.952 < 2.2e-16 ***
## s(Expend, df = 4) 1 651751406 651751406 219.918 < 2.2e-16 ***
## s(Grad.Rate, df = 4) 1 112932033 112932033 38.106 1.291e-09 ***
## Private 1 292292567 292292567 98.627 < 2.2e-16 ***
## Residuals 557 1650730199 2963610
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Anova for Nonparametric Effects
## Npar Df Npar F Pr(F)
## (Intercept)
## s(Apps, df = 4) 3 2.8999 0.034476 *
## s(Accept, df = 4) 3 9.0330 7.547e-06 ***
## s(Enroll, df = 4) 3 1.4738 0.220669
## s(Room.Board, df = 1) 0 5.0246 3.363e-05 ***
## s(Personal, df = 4) 3 3.9220 0.008667 **
## s(PhD, df = 4) 3 2.2117 0.085724 .
## s(Terminal, df = 4) 3 4.1167 0.006648 **
## s(S.F.Ratio, df = 4) 3 3.5929 0.013554 *
## s(perc.alumni, df = 4) 3 1.2754 0.281969
## s(Expend, df = 4) 3 24.3006 8.660e-15 ***
## s(Grad.Rate, df = 4) 3 3.0965 0.026492 *
## Private
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Model 4: local-regression (loess) GAM with span = 0.25
mod4 <-
  gam::gam(Outstate ~
             gam::lo(Apps, span = 0.25) + gam::lo(Accept, span = 0.25) +
             gam::lo(Enroll, span = 0.25) + gam::lo(Room.Board, span = 0.25) +
             gam::lo(Personal, span = 0.25) + gam::lo(PhD, span = 0.25) +
             gam::lo(Terminal, span = 0.25) + gam::lo(S.F.Ratio, span = 0.25) +
             gam::lo(perc.alumni, span = 0.25) + gam::lo(Expend, span = 0.25) +
             gam::lo(Grad.Rate, span = 0.25) + Private,
           data = data.train)
summary(mod4)
##
## Call: gam::gam(formula = Outstate ~ gam::lo(Apps, span = 0.25) + gam::lo(Accept,
## span = 0.25) + gam::lo(Enroll, span = 0.25) + gam::lo(Room.Board,
## span = 0.25) + gam::lo(Personal, span = 0.25) + gam::lo(PhD,
## span = 0.25) + gam::lo(Terminal, span = 0.25) + gam::lo(S.F.Ratio,
## span = 0.25) + gam::lo(perc.alumni, span = 0.25) + gam::lo(Expend,
## span = 0.25) + gam::lo(Grad.Rate, span = 0.25) + Private,
## data = data.train)
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -7834.73 -1264.37 -15.72 1255.29 9596.71
##
## (Dispersion Parameter for gaussian family taken to be 3717090)
##
## Null Deviance: 9398382681 on 599 degrees of freedom
## Residual Deviance: 2181932055 on 587 degrees of freedom
## AIC: 10794.65
##
## Number of Local Scoring Iterations: 2
##
## Anova for Parametric Effects
## Df Sum Sq Mean Sq F value Pr(>F)
## gam::lo(Apps, span = 0.25) 1 30615220 30615220 8.2363 0.004254
## gam::lo(Accept, span = 0.25) 1 310430369 310430369 83.5143 < 2.2e-16
## gam::lo(Enroll, span = 0.25) 1 849632446 849632446 228.5746 < 2.2e-16
## gam::lo(Room.Board, span = 0.25) 1 3143091690 3143091690 845.5785 < 2.2e-16
## gam::lo(Personal, span = 0.25) 1 116193505 116193505 31.2593 3.462e-08
## gam::lo(PhD, span = 0.25) 1 471134652 471134652 126.7482 < 2.2e-16
## gam::lo(Terminal, span = 0.25) 1 55851986 55851986 15.0257 0.000118
## gam::lo(S.F.Ratio, span = 0.25) 1 964145212 964145212 259.3817 < 2.2e-16
## gam::lo(perc.alumni, span = 0.25) 1 415209278 415209278 111.7028 < 2.2e-16
## gam::lo(Expend, span = 0.25) 1 427469376 427469376 115.0011 < 2.2e-16
## gam::lo(Grad.Rate, span = 0.25) 1 120683655 120683655 32.4672 1.919e-08
## Private 1 311993237 311993237 83.9348 < 2.2e-16
## Residuals 587 2181932055 3717090
##
## gam::lo(Apps, span = 0.25) **
## gam::lo(Accept, span = 0.25) ***
## gam::lo(Enroll, span = 0.25) ***
## gam::lo(Room.Board, span = 0.25) ***
## gam::lo(Personal, span = 0.25) ***
## gam::lo(PhD, span = 0.25) ***
## gam::lo(Terminal, span = 0.25) ***
## gam::lo(S.F.Ratio, span = 0.25) ***
## gam::lo(perc.alumni, span = 0.25) ***
## gam::lo(Expend, span = 0.25) ***
## gam::lo(Grad.Rate, span = 0.25) ***
## Private ***
## Residuals
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Model 5: local-regression (loess) GAM with span = 0.75
mod5 <-
  gam::gam(Outstate ~
             gam::lo(Apps, span = 0.75) + gam::lo(Accept, span = 0.75) +
             gam::lo(Enroll, span = 0.75) + gam::lo(Room.Board, span = 0.75) +
             gam::lo(Personal, span = 0.75) + gam::lo(PhD, span = 0.75) +
             gam::lo(Terminal, span = 0.75) + gam::lo(S.F.Ratio, span = 0.75) +
             gam::lo(perc.alumni, span = 0.75) + gam::lo(Expend, span = 0.75) +
             gam::lo(Grad.Rate, span = 0.75) + Private,
           data = data.train)
summary(mod5)
##
## Call: gam::gam(formula = Outstate ~ gam::lo(Apps, span = 0.75) + gam::lo(Accept,
## span = 0.75) + gam::lo(Enroll, span = 0.75) + gam::lo(Room.Board,
## span = 0.75) + gam::lo(Personal, span = 0.75) + gam::lo(PhD,
## span = 0.75) + gam::lo(Terminal, span = 0.75) + gam::lo(S.F.Ratio,
## span = 0.75) + gam::lo(perc.alumni, span = 0.75) + gam::lo(Expend,
## span = 0.75) + gam::lo(Grad.Rate, span = 0.75) + Private,
## data = data.train)
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -7834.73 -1264.37 -15.72 1255.29 9596.71
##
## (Dispersion Parameter for gaussian family taken to be 3717090)
##
## Null Deviance: 9398382681 on 599 degrees of freedom
## Residual Deviance: 2181932055 on 587 degrees of freedom
## AIC: 10794.65
##
## Number of Local Scoring Iterations: 2
##
## Anova for Parametric Effects
## Df Sum Sq Mean Sq F value Pr(>F)
## gam::lo(Apps, span = 0.75) 1 30615220 30615220 8.2363 0.004254
## gam::lo(Accept, span = 0.75) 1 310430369 310430369 83.5143 < 2.2e-16
## gam::lo(Enroll, span = 0.75) 1 849632446 849632446 228.5746 < 2.2e-16
## gam::lo(Room.Board, span = 0.75) 1 3143091690 3143091690 845.5785 < 2.2e-16
## gam::lo(Personal, span = 0.75) 1 116193505 116193505 31.2593 3.462e-08
## gam::lo(PhD, span = 0.75) 1 471134652 471134652 126.7482 < 2.2e-16
## gam::lo(Terminal, span = 0.75) 1 55851986 55851986 15.0257 0.000118
## gam::lo(S.F.Ratio, span = 0.75) 1 964145212 964145212 259.3817 < 2.2e-16
## gam::lo(perc.alumni, span = 0.75) 1 415209278 415209278 111.7028 < 2.2e-16
## gam::lo(Expend, span = 0.75) 1 427469376 427469376 115.0011 < 2.2e-16
## gam::lo(Grad.Rate, span = 0.75) 1 120683655 120683655 32.4672 1.919e-08
## Private 1 311993237 311993237 83.9348 < 2.2e-16
## Residuals 587 2181932055 3717090
##
## gam::lo(Apps, span = 0.75) **
## gam::lo(Accept, span = 0.75) ***
## gam::lo(Enroll, span = 0.75) ***
## gam::lo(Room.Board, span = 0.75) ***
## gam::lo(Personal, span = 0.75) ***
## gam::lo(PhD, span = 0.75) ***
## gam::lo(Terminal, span = 0.75) ***
## gam::lo(S.F.Ratio, span = 0.75) ***
## gam::lo(perc.alumni, span = 0.75) ***
## gam::lo(Expend, span = 0.75) ***
## gam::lo(Grad.Rate, span = 0.75) ***
## Private ***
## Residuals
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
AIC(mod1, mod2, mod3, mod4, mod5)
## df AIC
## mod1 19 10799.67
## mod2 14 10794.63
## mod3 14 10687.26
## mod4 14 10794.65
## mod5 14 10794.65
Comment on your findings and compare the results, selecting a model using the AIC as the criterion.
Model 3 shows that including smoothing splines with df = 4 yields significant effects: taking a spline for each of the 12 stepwise-selected variables adds something over model 1.
According to the ANOVA for parametric effects, all variables have significant (linear) effects, while the ANOVA for nonparametric effects indicates additional nonlinear effects for Apps, Accept, Room.Board, Personal, Terminal, S.F.Ratio, Expend, and Grad.Rate.
Model 1 is the baseline model; it has the highest Akaike information criterion (AIC) of all the fitted models and only takes linear effects into consideration.
Models 4 and 5 suggest that local regression is not needed to model the behavior of Outstate: their AIC is essentially identical to that of model 2, so replacing linear terms with loess smoothers contributes nothing significant.
Using the AIC, the smallest value is obtained for model 3 (the GAM with df = 4 smoothing splines), so it is the most appropriate model among those fitted above.
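Only df = 1 and df = 4 were actually tried above; a small sketch that scans a few df values and compares the fits by AIC (fit.df is a hypothetical helper that refits the same terms at each df):
# Refit the smoothing-spline GAM for several df values and compare by AIC
fit.df <- function(d) {
  gam(Outstate ~ s(Apps, df = d) + s(Accept, df = d) + s(Enroll, df = d) +
        s(Room.Board, df = d) + s(Personal, df = d) + s(PhD, df = d) +
        s(Terminal, df = d) + s(S.F.Ratio, df = d) + s(perc.alumni, df = d) +
        s(Expend, df = d) + s(Grad.Rate, df = d) + Private,
      data = data.train)
}
aic.by.df <- sapply(2:6, function(d) AIC(fit.df(d)))
names(aic.by.df) <- paste0("df=", 2:6)
aic.by.df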
Evaluate the fitted models on the test sample using the MSE. Explain the results and compare them with those from part (a).
# Test-sample evaluation: predictions and MSE for each model
pred1 <- predict(object = mod1, newdata = data.test)
mse.mod1 <- Metrics::mse(data.test$Outstate, pred1)
pred2 <- predict(object = mod2, newdata = data.test)
mse.mod2 <- Metrics::mse(data.test$Outstate, pred2)
pred3 <- predict(object = mod3, newdata = data.test)
mse.mod3 <- Metrics::mse(data.test$Outstate, pred3)
pred4 <- predict(object = mod4, newdata = data.test)
mse.mod4 <- Metrics::mse(data.test$Outstate, pred4)
pred5 <- predict(object = mod5, newdata = data.test)
mse.mod5 <- Metrics::mse(data.test$Outstate, pred5)
# Table with all the MSE values
MSE.table <-
  data.frame(Metodo = c('Mod1', 'Mod2', 'Mod3', 'Mod4', 'Mod5'),
             MSE = c(mse.mod1, mse.mod2, mse.mod3, mse.mod4, mse.mod5))
MSE.table
## Metodo MSE
## 1 Mod1 4530463
## 2 Mod2 4660060
## 3 Mod3 3803011
## 4 Mod4 4660308
## 5 Mod5 4660308
Conclusions:
Model 3 (df = 4 smoothing splines) also attains the lowest test MSE (about 3.80 million), consistent with its AIC ranking in (a). The remaining GAMs and the linear model all sit around 4.5-4.7 million, so the df = 4 spline fit is the only one that clearly improves out-of-sample prediction.
For which variables (if any) is there evidence of a nonlinear relationship with the response variable? Support your answer using plots and statistical tests.
# Diagnostic plots for the selected GAM (mod3)
plot(mod3)
PerformanceAnalytics::chart.Correlation(data.set[, -1], method = 'spearman')
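A direct statistical test for nonlinearity is to compare the linear GAM (mod2, df = 1) with the smoothing-spline GAM (mod3, df = 4) as nested models, alongside the per-term plots; a sketch:
# Nested-model F test: do the extra nonlinear df in mod3 help over mod2?
anova(mod2, mod3, test = "F")
# Term-by-term picture: fitted smooths with pointwise standard-error bands
par(mfrow = c(3, 4))
plot(mod3, se = TRUE, col = "blue")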
Conclusions:
From the ANOVA for nonparametric effects of model 3, there is evidence of a nonlinear relationship with Outstate for Apps, Accept, Room.Board, Personal, Terminal, S.F.Ratio, Expend, and Grad.Rate (p < 0.05), while Enroll, PhD, and perc.alumni show no significant nonlinear component. The term plots and the Spearman correlation chart point in the same direction.