library(ISLR)
library(glmnet)
## Loading required package: Matrix
## Loaded glmnet 4.1-10
# Load the Hitters data set (provided by the ISLR package attached above).
data("Hitters")
# Fix the random seed so the random steps further down (the sample() split and
# the cross-validation in cv.glmnet()) are reproducible.
set.seed(123)
Check the number of observations (rows) and variables (columns)
# Rows (observations) and columns (variables) of the raw data.
dim(Hitters)
## [1] 322 20
Check how many rows have missing data (sum(is.na())). Identify which column(s) contain missing values.
# Total number of missing cells across the whole data frame.
sum(is.na(Hitters))
## [1] 59
# Number of rows that contain at least one missing value. complete.cases()
# flags rows with no NA, so negating it and summing counts the incomplete rows.
# (nrow(is.na(Hitters)) would only return the total row count.)
sum(!complete.cases(Hitters))
## [1] 59
# Missing-value count per column, to see which variable(s) hold the NAs.
colSums(is.na(Hitters))
## AtBat Hits HmRun Runs RBI Walks Years CAtBat
## 0 0 0 0 0 0 0 0
## CHits CHmRun CRuns CRBI CWalks League Division PutOuts
## 0 0 0 0 0 0 0 0
## Assists Errors Salary NewLeague
## 0 0 59 0
Action: Remove rows with missing values
Create a new dataset ‘Hitters_clean’ using na.omit()
Verify dimensions again to ensure rows were removed
# Keep only the rows without any missing value.
Hitters_clean <- na.omit(Hitters)
# Re-check the dimensions to confirm the incomplete rows are gone.
dim(Hitters_clean)
## [1] 263 20
Create a histogram of the ‘Salary’ variable. Observation: Does it look normal or skewed? (Write comment below)
# Histogram of Salary to inspect the shape of the response distribution.
hist(Hitters_clean$Salary, main = "Salary Distribution", xlab = "Salary")
The distribution is not normal; it is strongly positively skewed (skewed to the right), with most of the salaries concentrated in the lower range and a long tail toward higher salaries.
Split ‘Hitters_clean’ into Training (70%) and Testing (30%)
Create index for training data
Create ‘train_data’ and ‘test_data’
# Randomly draw 70% of the row indices (rounded to a whole number of rows) for
# the training set. seq_len() is used instead of 1:n because 1:0 would yield
# c(1, 0) rather than an empty index vector.
Hitters_index <- sample(
  seq_len(nrow(Hitters_clean)),
  size = round(0.7 * nrow(Hitters_clean))
)
# Training set: the sampled rows. Test set: every row that was not sampled.
Hitters_Train <- Hitters_clean[Hitters_index, ]
Hitters_Test <- Hitters_clean[-Hitters_index, ]
Fit a full linear regression model on ‘train_data’ using all predictors
Display the summary of the model
# Linear regression of Salary on all remaining columns of the training set.
full_model <- lm(Salary ~ ., data = Hitters_Train)
# Coefficient table and overall fit statistics for the full model.
summary(full_model)
##
## Call:
## lm(formula = Salary ~ ., data = Hitters_Train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -828.0 -169.1 -11.2 134.6 1757.9
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 236.5886 107.6923 2.197 0.02943 *
## AtBat -2.0079 0.8134 -2.469 0.01458 *
## Hits 6.3735 3.2080 1.987 0.04861 *
## HmRun -3.4758 7.4955 -0.464 0.64347
## Runs -2.9202 3.8616 -0.756 0.45061
## RBI 2.1402 3.2835 0.652 0.51543
## Walks 7.2022 2.4394 2.953 0.00361 **
## Years -16.6985 15.3867 -1.085 0.27940
## CAtBat -0.2203 0.1648 -1.336 0.18336
## CHits 0.2019 0.8722 0.232 0.81721
## CHmRun 1.1003 2.0708 0.531 0.59589
## CRuns 2.2317 0.9931 2.247 0.02596 *
## CRBI 0.4078 0.9501 0.429 0.66831
## CWalks -0.9651 0.4245 -2.273 0.02430 *
## LeagueN 11.3278 105.7700 0.107 0.91484
## DivisionW -119.3533 49.7837 -2.397 0.01763 *
## PutOuts 0.2173 0.0930 2.337 0.02066 *
## Assists 0.6353 0.2898 2.192 0.02979 *
## Errors -10.2234 5.7896 -1.766 0.07929 .
## NewLeagueN 85.9137 106.3435 0.808 0.42033
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 315.4 on 164 degrees of freedom
## Multiple R-squared: 0.6234, Adjusted R-squared: 0.5798
## F-statistic: 14.29 on 19 and 164 DF, p-value: < 2.2e-16
Perform stepwise selection using step() with direction = “both” or “backward”
Save the final model as ‘lm_step_aic’
# Stepwise selection by AIC starting from the full model. With
# direction = "both", each step may drop a term or re-add a dropped one.
lm_step_aic <- step(full_model, direction = "both")
## Start: AIC=2136.23
## Salary ~ AtBat + Hits + HmRun + Runs + RBI + Walks + Years +
## CAtBat + CHits + CHmRun + CRuns + CRBI + CWalks + League +
## Division + PutOuts + Assists + Errors + NewLeague
##
## Df Sum of Sq RSS AIC
## - League 1 1141 16314124 2134.2
## - CHits 1 5331 16318315 2134.3
## - CRBI 1 18327 16331310 2134.4
## - HmRun 1 21389 16334373 2134.5
## - CHmRun 1 28085 16341068 2134.5
## - RBI 1 42262 16355246 2134.7
## - Runs 1 56881 16369864 2134.9
## - NewLeague 1 64922 16377905 2135.0
## - Years 1 117153 16430136 2135.5
## - CAtBat 1 177576 16490559 2136.2
## <none> 16312983 2136.2
## - Errors 1 310159 16623143 2137.7
## - Hits 1 392630 16705613 2138.6
## - Assists 1 477940 16790923 2139.5
## - CRuns 1 502331 16815314 2139.8
## - CWalks 1 514117 16827100 2139.9
## - PutOuts 1 543154 16856137 2140.2
## - Division 1 571720 16884704 2140.6
## - AtBat 1 606233 16919217 2140.9
## - Walks 1 867111 17180094 2143.8
##
## Step: AIC=2134.24
## Salary ~ AtBat + Hits + HmRun + Runs + RBI + Walks + Years +
## CAtBat + CHits + CHmRun + CRuns + CRBI + CWalks + Division +
## PutOuts + Assists + Errors + NewLeague
##
## Df Sum of Sq RSS AIC
## - CHits 1 4996 16319121 2132.3
## - CRBI 1 18992 16333116 2132.4
## - HmRun 1 20881 16335005 2132.5
## - CHmRun 1 27759 16341884 2132.6
## - RBI 1 41745 16355869 2132.7
## - Runs 1 57457 16371581 2132.9
## - Years 1 118958 16433082 2133.6
## - CAtBat 1 176459 16490583 2134.2
## <none> 16314124 2134.2
## - Errors 1 309024 16623148 2135.7
## + League 1 1141 16312983 2136.2
## - NewLeague 1 362431 16676556 2136.3
## - Hits 1 399424 16713548 2136.7
## - Assists 1 480702 16794827 2137.6
## - CRuns 1 503846 16817970 2137.8
## - CWalks 1 515328 16829452 2138.0
## - PutOuts 1 543475 16857599 2138.3
## - Division 1 585192 16899316 2138.7
## - AtBat 1 619320 16933445 2139.1
## - Walks 1 866978 17181102 2141.8
##
## Step: AIC=2132.3
## Salary ~ AtBat + Hits + HmRun + Runs + RBI + Walks + Years +
## CAtBat + CHmRun + CRuns + CRBI + CWalks + Division + PutOuts +
## Assists + Errors + NewLeague
##
## Df Sum of Sq RSS AIC
## - HmRun 1 16905 16336026 2130.5
## - CHmRun 1 25826 16344947 2130.6
## - RBI 1 37322 16356443 2130.7
## - CRBI 1 62193 16381314 2131.0
## - Runs 1 94145 16413266 2131.3
## - Years 1 117713 16436833 2131.6
## <none> 16319121 2132.3
## - CAtBat 1 279627 16598748 2133.4
## - Errors 1 304238 16623359 2133.7
## + CHits 1 4996 16314124 2134.2
## + League 1 806 16318315 2134.3
## - NewLeague 1 362717 16681838 2134.3
## - Assists 1 476229 16795350 2135.6
## - PutOuts 1 548374 16867494 2136.4
## - Division 1 595322 16914443 2136.9
## - Hits 1 675471 16994591 2137.8
## - AtBat 1 748906 17068027 2138.6
## - Walks 1 1004833 17323954 2141.3
## - CWalks 1 1004927 17324047 2141.3
## - CRuns 1 1363813 17682934 2145.1
##
## Step: AIC=2130.49
## Salary ~ AtBat + Hits + Runs + RBI + Walks + Years + CAtBat +
## CHmRun + CRuns + CRBI + CWalks + Division + PutOuts + Assists +
## Errors + NewLeague
##
## Df Sum of Sq RSS AIC
## - CHmRun 1 12959 16348985 2128.6
## - RBI 1 20825 16356852 2128.7
## - CRBI 1 103493 16439519 2129.7
## - Years 1 112035 16448061 2129.7
## - Runs 1 139858 16475884 2130.1
## <none> 16336026 2130.5
## - Errors 1 303422 16639448 2131.9
## - CAtBat 1 327154 16663180 2132.1
## + HmRun 1 16905 16319121 2132.3
## + CHits 1 1021 16335005 2132.5
## + League 1 537 16335490 2132.5
## - NewLeague 1 367598 16703624 2132.6
## - Assists 1 498834 16834860 2134.0
## - PutOuts 1 534469 16870495 2134.4
## - Division 1 631601 16967627 2135.5
## - AtBat 1 741076 17077102 2136.7
## - Hits 1 806608 17142634 2137.3
## - CWalks 1 1030647 17366673 2139.7
## - Walks 1 1082394 17418420 2140.3
## - CRuns 1 1413550 17749576 2143.8
##
## Step: AIC=2128.63
## Salary ~ AtBat + Hits + Runs + RBI + Walks + Years + CAtBat +
## CRuns + CRBI + CWalks + Division + PutOuts + Assists + Errors +
## NewLeague
##
## Df Sum of Sq RSS AIC
## - RBI 1 22754 16371739 2126.9
## - Years 1 104941 16453926 2127.8
## - Runs 1 131110 16480095 2128.1
## <none> 16348985 2128.6
## - Errors 1 303907 16652892 2130.0
## + CHmRun 1 12959 16336026 2130.5
## + HmRun 1 4038 16344947 2130.6
## + CHits 1 2345 16346640 2130.6
## + League 1 827 16348158 2130.6
## - NewLeague 1 364917 16713902 2130.7
## - Assists 1 493812 16842797 2132.1
## - PutOuts 1 525656 16874641 2132.5
## - CRBI 1 555650 16904635 2132.8
## - CAtBat 1 586729 16935714 2133.1
## - Division 1 627367 16976352 2133.6
## - AtBat 1 754211 17103196 2134.9
## - Hits 1 842706 17191691 2135.9
## - CWalks 1 1029162 17378147 2137.9
## - Walks 1 1076916 17425901 2138.4
## - CRuns 1 1530189 17879174 2143.1
##
## Step: AIC=2126.89
## Salary ~ AtBat + Hits + Runs + Walks + Years + CAtBat + CRuns +
## CRBI + CWalks + Division + PutOuts + Assists + Errors + NewLeague
##
## Df Sum of Sq RSS AIC
## - Years 1 101239 16472978 2126.0
## - Runs 1 112932 16484671 2126.2
## <none> 16371739 2126.9
## - Errors 1 284935 16656675 2128.1
## + RBI 1 22754 16348985 2128.6
## + CHmRun 1 14888 16356852 2128.7
## - NewLeague 1 352905 16724644 2128.8
## + HmRun 1 3944 16367795 2128.8
## + CHits 1 3335 16368404 2128.8
## + League 1 898 16370841 2128.9
## - Assists 1 472437 16844177 2130.1
## - PutOuts 1 522951 16894690 2130.7
## - CAtBat 1 599049 16970788 2131.5
## - Division 1 657207 17028946 2132.1
## - AtBat 1 736119 17107858 2133.0
## - Hits 1 850593 17222333 2134.2
## - CRBI 1 1000696 17372436 2135.8
## - CWalks 1 1033802 17405542 2136.2
## - Walks 1 1080275 17452014 2136.7
## - CRuns 1 1533465 17905205 2141.4
##
## Step: AIC=2126.02
## Salary ~ AtBat + Hits + Runs + Walks + CAtBat + CRuns + CRBI +
## CWalks + Division + PutOuts + Assists + Errors + NewLeague
##
## Df Sum of Sq RSS AIC
## - Runs 1 120538 16593516 2125.4
## <none> 16472978 2126.0
## + Years 1 101239 16371739 2126.9
## - Errors 1 278006 16750984 2127.1
## + RBI 1 19052 16453926 2127.8
## + CHmRun 1 7168 16465810 2127.9
## + HmRun 1 3176 16469802 2128.0
## + League 1 2369 16470610 2128.0
## + CHits 1 1255 16471724 2128.0
## - NewLeague 1 391771 16864749 2128.3
## - Assists 1 531656 17004634 2129.9
## - PutOuts 1 546577 17019555 2130.0
## - Division 1 622345 17095323 2130.8
## - AtBat 1 701954 17174932 2131.7
## - Hits 1 875012 17347990 2133.6
## - CRBI 1 1076310 17549288 2135.7
## - Walks 1 1094610 17567588 2135.9
## - CWalks 1 1131525 17604504 2136.2
## - CAtBat 1 1525973 17998951 2140.3
## - CRuns 1 1960284 18433262 2144.7
##
## Step: AIC=2125.36
## Salary ~ AtBat + Hits + Walks + CAtBat + CRuns + CRBI + CWalks +
## Division + PutOuts + Assists + Errors + NewLeague
##
## Df Sum of Sq RSS AIC
## <none> 16593516 2125.4
## + Runs 1 120538 16472978 2126.0
## + Years 1 108845 16484671 2126.2
## - Errors 1 265808 16859324 2126.3
## + CHits 1 11950 16581566 2127.2
## + HmRun 1 8470 16585046 2127.3
## + RBI 1 2716 16590800 2127.3
## + League 1 1598 16591918 2127.3
## + CHmRun 1 959 16592558 2127.3
## - NewLeague 1 462408 17055924 2128.4
## - PutOuts 1 576802 17170318 2129.7
## - Division 1 609581 17203098 2130.0
## - Assists 1 625244 17218761 2130.2
## - Hits 1 754748 17348264 2131.6
## - AtBat 1 856640 17450157 2132.6
## - Walks 1 982030 17575546 2133.9
## - CWalks 1 1058866 17652383 2134.8
## - CRBI 1 1259648 17853164 2136.8
## - CAtBat 1 1415014 18008531 2138.4
## - CRuns 1 1926711 18520227 2143.6
# Coefficient table and fit statistics for the model selected by step().
summary(lm_step_aic)
##
## Call:
## lm(formula = Salary ~ AtBat + Hits + Walks + CAtBat + CRuns +
## CRBI + CWalks + Division + PutOuts + Assists + Errors + NewLeague,
## data = Hitters_Train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -797.0 -171.8 -10.1 127.8 1765.4
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 172.19676 83.25669 2.068 0.040121 *
## AtBat -2.00031 0.67324 -2.971 0.003394 **
## Hits 5.93052 2.12649 2.789 0.005889 **
## Walks 6.42584 2.01994 3.181 0.001742 **
## CAtBat -0.25730 0.06738 -3.819 0.000187 ***
## CRuns 2.30925 0.51824 4.456 1.51e-05 ***
## CRBI 1.05082 0.29166 3.603 0.000412 ***
## CWalks -1.01174 0.30628 -3.303 0.001164 **
## DivisionW -120.14999 47.93791 -2.506 0.013132 *
## PutOuts 0.22148 0.09084 2.438 0.015790 *
## Assists 0.67724 0.26680 2.538 0.012030 *
## Errors -9.26152 5.59590 -1.655 0.099747 .
## NewLeagueN 106.50038 48.78764 2.183 0.030403 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 311.5 on 171 degrees of freedom
## Multiple R-squared: 0.6169, Adjusted R-squared: 0.59
## F-statistic: 22.95 on 12 and 171 DF, p-value: < 2.2e-16
# Number of fitted coefficients, minus one so the intercept is not counted.
length(coef(lm_step_aic)) - 1
## [1] 12
Question: How many predictors were left in the final AIC model?
Comment: There are 12 predictors left.
Create the input matrix ‘x’ (predictors) and vector ‘y’ (response) from ‘train_data’. Hint: use model.matrix() for x to handle categorical variables automatically
# Predictor matrix for glmnet: model.matrix() turns the factor columns into
# dummy columns; [, -1] drops the first (intercept) column.
x <- model.matrix(Salary ~ ., data = Hitters_Train)[, -1]
# Response vector aligned with the rows of x.
y <- Hitters_Train[["Salary"]]
Use cv.glmnet() to find optimal lambda
# Cross-validated glmnet fit on the training data; alpha = 1 selects the
# LASSO penalty.
Hitters_lasso <- cv.glmnet(x, y, alpha = 1)
# Lambda value with the lowest cross-validated error.
Hitters_lasso$lambda.min
## [1] 3.216343
Report the value of lambda.min
## [1] 3.216343
Fit the final LASSO model using lambda.min
Display the coefficients using coef()
# Refit the LASSO on the training data at the single lambda chosen by
# cross-validation.
# NOTE(review): the glmnet documentation advises against supplying a single
# lambda value; coef(Hitters_lasso, s = "lambda.min") reads the coefficients
# off the cross-validated path instead. Confirm whether the refit is required.
Hitters_lasso_final <- glmnet(x, y, alpha = 1, lambda = Hitters_lasso$lambda.min)
# Coefficients of the refit; entries printed as "." are exactly zero.
coef(Hitters_lasso_final)
## 20 x 1 sparse Matrix of class "dgCMatrix"
## s0
## (Intercept) 207.8575346
## AtBat -1.5163170
## Hits 4.4833440
## HmRun -3.8302523
## Runs .
## RBI 1.2041422
## Walks 5.9093856
## Years -26.6539318
## CAtBat .
## CHits .
## CHmRun 1.6096210
## CRuns 1.1849423
## CRBI 0.1229401
## CWalks -0.6825704
## LeagueN 10.5114166
## DivisionW -131.1153295
## PutOuts 0.1882778
## Assists 0.3445808
## Errors -6.5273150
## NewLeagueN 71.3468791
Question: Which variables were shrunk exactly to zero?
Comment: The Runs, CAtBat and CHits variables have been shrunk exactly to zero, meaning that they do not meaningfully improve our model's predictive ability.
# Coefficients at lambda.min from the cross-validated fit, converted from a
# sparse matrix to an ordinary one-column matrix.
Hitters_lasso_min <- as.matrix(coef(Hitters_lasso, s = "lambda.min"))
# Names of the terms whose coefficient is exactly zero.
rownames(Hitters_lasso_min)[Hitters_lasso_min[, 1] == 0]
## [1] "Runs" "CAtBat" "CHits"
1. Make predictions on ‘test_data’ using ‘lm_step_aic’
# Predicted salaries for the held-out test rows from the stepwise AIC model.
Hitters_pred_aic <- predict(lm_step_aic, newdata = Hitters_Test)
Hint: newx argument in predict() for LASSO needs a matrix
# Test-set predictor matrix, built the same way as the training matrix x
# (dummy-coded factors, intercept column dropped).
x_test <- model.matrix(Salary ~ ., data = Hitters_Test)[, -1]
# LASSO predictions for the test rows at the cross-validated lambda.min.
pred_lasso <- predict(Hitters_lasso, newx = x_test, s = "lambda.min")
MSE = mean((Actual - Predicted)^2)
# Out-of-sample MSE of the stepwise AIC model: mean squared difference between
# the observed test salaries and the model's predictions.
mse_aic <- mean((Hitters_Test$Salary - Hitters_pred_aic)^2)
mse_aic
## [1] 124511.4
# Same metric for the LASSO predictions.
mse_lasso <- mean((Hitters_Test$Salary - pred_lasso)^2)
mse_lasso
## [1] 122374.9
Conclusion: Based on OOS MSE, which model performed better?
Comment: While close, the out-of-sample MSE of the LASSO model (122374.9) is slightly smaller than that of the stepwise AIC model (124511.4), indicating that the LASSO model performed better.