library("ggplot2")
library("tidyverse")
## ── Attaching packages ───────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ tibble 2.1.3 ✓ dplyr 0.8.3
## ✓ tidyr 1.0.2 ✓ stringr 1.4.0
## ✓ readr 1.3.1 ✓ forcats 0.4.0
## ✓ purrr 0.3.3
## ── Conflicts ──────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library("dplyr")
library("magrittr")
##
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
##
## set_names
## The following object is masked from 'package:tidyr':
##
## extract
# LOGIT MODEL
install.packages("titanic", repos = "http://cran.us.r-project.org")
##
## The downloaded binary packages are in
## /var/folders/1m/pcxtsxrd3ld0k9h22g50195r0000gn/T//Rtmpghr2Ii/downloaded_packages
library(tidyverse)
library(titanic)
logit_t=glm(formula=Survived ~ Pclass+Sex, family=binomial(link="logit"), data=titanic_train)
# linear model
lm_t=lm(Survived ~ Fare, data=titanic_train)
ggplot(titanic_train, aes(x=Fare, y=Survived)) + geom_point() +
stat_smooth(method="lm", se=TRUE)
## `geom_smooth()` using formula 'y ~ x'
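# Note one drawback of the linear probability model before moving on:
# nothing constrains its fitted values to [0, 1]. A quick check (fitted
# "probabilities" above 1 are possible here, given how skewed Fare is):
range(predict(lm_t))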
summary(logit_t) #see the regression results
##
## Call:
## glm(formula = Survived ~ Pclass + Sex, family = binomial(link = "logit"),
## data = titanic_train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.2030 -0.7036 -0.4519 0.6719 2.1599
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 3.2946 0.2974 11.077 <2e-16 ***
## Pclass -0.9606 0.1061 -9.057 <2e-16 ***
## Sexmale -2.6434 0.1838 -14.380 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1186.7 on 890 degrees of freedom
## Residual deviance: 827.2 on 888 degrees of freedom
## AIC: 833.2
##
## Number of Fisher Scoring iterations: 4
ggplot(titanic_train, aes(x=Fare, y=Survived)) + geom_point() +
stat_smooth(method="glm", method.args=list(family=binomial(link="logit")), se=TRUE)
## `geom_smooth()` using formula 'y ~ x'
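# To turn the fitted log-odds into a survival probability, apply the
# inverse logit (plogis). A minimal sketch for a hypothetical first-class
# female passenger; predict() with type = "response" should agree:
newpass <- data.frame(Pclass = 1, Sex = "female")
plogis(coef(logit_t)["(Intercept)"] + coef(logit_t)["Pclass"] * 1) # Sexmale dummy = 0
predict(logit_t, newdata = newpass, type = "response")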
probit_t=glm(formula=Survived ~ Fare, family=binomial(link="probit"), data=titanic_train)
summary(probit_t) # see the results
##
## Call:
## glm(formula = Survived ~ Fare, family = binomial(link = "probit"),
## data = titanic_train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.4601 -0.8920 -0.8601 1.3527 1.5811
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.563592 0.056440 -9.986 < 2e-16 ***
## Fare 0.008453 0.001242 6.804 1.02e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1186.7 on 890 degrees of freedom
## Residual deviance: 1119.9 on 889 degrees of freedom
## AIC: 1123.9
##
## Number of Fisher Scoring iterations: 6
ggplot(titanic_train, aes(x=Fare, y=Survived)) + geom_point() +
stat_smooth(method="glm", method.args=list(family=binomial(link="probit")), se=TRUE)
## `geom_smooth()` using formula 'y ~ x'
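# The probit model works the same way, except the inverse link is the
# standard normal CDF (pnorm). A sketch for a hypothetical fare of 50:
pnorm(coef(probit_t)["(Intercept)"] + coef(probit_t)["Fare"] * 50)
predict(probit_t, newdata = data.frame(Fare = 50), type = "response")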
install.packages("nnet", repos = "http://cran.us.r-project.org")
##
## The downloaded binary packages are in
## /var/folders/1m/pcxtsxrd3ld0k9h22g50195r0000gn/T//Rtmpghr2Ii/downloaded_packages
install.packages("Ecdat", repos = "http://cran.us.r-project.org")
##
## The downloaded binary packages are in
## /var/folders/1m/pcxtsxrd3ld0k9h22g50195r0000gn/T//Rtmpghr2Ii/downloaded_packages
library(nnet)
library(Ecdat)
## Loading required package: Ecfun
##
## Attaching package: 'Ecfun'
## The following object is masked from 'package:base':
##
## sign
##
## Attaching package: 'Ecdat'
## The following object is masked from 'package:datasets':
##
## Orange
Y <- Yogurt
M <- multinom(choice ~., data = Y) #the model
## # weights: 44 (30 variable)
## initial value 3343.741999
## iter 10 value 2740.831153
## iter 20 value 2593.985140
## iter 30 value 2563.707966
## final value 2562.990614
## converged
summary(M)
## Call:
## multinom(formula = choice ~ ., data = Y)
##
## Coefficients:
## (Intercept) id feat.yoplait feat.dannon feat.hiland
## dannon -0.03520968 0.0001402729 -0.3401250 -0.4537247 -0.06022368
## hiland 4.14419228 -0.0005795527 -0.5042782 -0.5157286 1.07464455
## weight 1.18249094 0.0048663666 -0.2611470 -0.7810408 -1.23737832
## feat.weight price.yoplait price.dannon price.hiland price.weight
## dannon 0.4865494 0.3675842 -0.7044580 0.1846033 0.13864682
## hiland 1.0134997 0.1305399 -0.4733090 -0.7054197 -0.06976759
## weight 1.1985995 0.4751706 -0.5396113 -0.3682489 -0.05816486
##
## Std. Errors:
## (Intercept) id feat.yoplait feat.dannon feat.hiland feat.weight
## dannon 0.8822363 0.001870004 0.2239925 0.3140816 0.3276537 0.3276952
## hiland 2.0291901 0.004573466 0.7435141 0.8224360 0.4603284 0.6936778
## weight 1.0029899 0.002120281 0.2688138 0.3821215 0.3936001 0.3360577
## price.yoplait price.dannon price.hiland price.weight
## dannon 0.03461712 0.06710173 0.07237536 0.08089368
## hiland 0.06907950 0.15048557 0.15430184 0.16553123
## weight 0.04614420 0.07464468 0.08106812 0.08519369
##
## Residual Deviance: 5125.981
## AIC: 5185.981
exp(coef(M)) #odds ratios relative to the baseline category (yoplait)
## (Intercept) id feat.yoplait feat.dannon feat.hiland feat.weight
## dannon 0.965403 1.0001403 0.7116813 0.6352576 0.9415539 1.626694
## hiland 63.066661 0.9994206 0.6039413 0.5970654 2.9289516 2.755227
## weight 3.262491 1.0048782 0.7701677 0.4579292 0.2901439 3.315470
## price.yoplait price.dannon price.hiland price.weight
## dannon 1.444241 0.4943765 1.2027413 1.1487183
## hiland 1.139443 0.6229375 0.4939013 0.9326105
## weight 1.608289 0.5829748 0.6919450 0.9434944
head(fitted(M)) #probabilities of picking each category for each observation
## yoplait dannon hiland weight
## 1 0.3245838 0.5091098 0.014125195 0.1521812
## 2 0.6018923 0.2850152 0.009748865 0.1033436
## 3 0.5813145 0.3033498 0.010775186 0.1045605
## 4 0.5813145 0.3033498 0.010775186 0.1045605
## 5 0.4133581 0.2930154 0.023418517 0.2702081
## 6 0.4915570 0.2899564 0.027613810 0.1908728
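# predict() on a multinom object returns the most probable brand for each
# observation by default, so a sketch of an in-sample confusion table is:
table(predicted = predict(M), observed = Y$choice)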
Y$choice <- relevel(Y$choice, ref = "dannon") #set new baseline
# FULL MODEL
M <- multinom(choice ~., data = Y)
## # weights: 44 (30 variable)
## initial value 3343.741999
## iter 10 value 2723.422944
## iter 20 value 2610.939896
## iter 30 value 2563.450128
## final value 2562.990625
## converged
summary(M)
## Call:
## multinom(formula = choice ~ ., data = Y)
##
## Coefficients:
## (Intercept) id feat.yoplait feat.dannon feat.hiland
## yoplait 0.03458607 -0.0001407410 0.34004481 0.45379140 0.06035367
## hiland 4.17701865 -0.0007153663 -0.16441842 -0.06136236 1.13495466
## weight 1.21648919 0.0047271381 0.07897677 -0.32719222 -1.17703765
## feat.weight price.yoplait price.dannon price.hiland price.weight
## yoplait -0.4864858 -0.3675775 0.7044753 -0.1845759 -0.1386124
## hiland 0.5276242 -0.2369168 0.2316330 -0.8900015 -0.2088435
## weight 0.7120903 0.1076158 0.1648777 -0.5528200 -0.1967649
##
## Std. Errors:
## (Intercept) id feat.yoplait feat.dannon feat.hiland
## yoplait 0.8822345 0.001870002 0.2239905 0.3140810 0.3276568
## hiland 2.0045405 0.004559142 0.7531383 0.8017917 0.4601638
## weight 0.8945313 0.001968840 0.2807492 0.3197073 0.3823698
## feat.weight price.yoplait price.dannon price.hiland price.weight
## yoplait 0.3276950 0.03461661 0.06710192 0.07237505 0.08089485
## hiland 0.6865561 0.07314327 0.14532253 0.15387341 0.16449378
## weight 0.2871956 0.04632864 0.05918325 0.07819600 0.07641287
##
## Residual Deviance: 5125.981
## AIC: 5185.981
z <- summary(M)$coefficients/summary(M)$standard.errors #Wald test for p values
p <- (1 - pnorm(abs(z), 0, 1)) * 2
p
## (Intercept) id feat.yoplait feat.dannon feat.hiland feat.weight
## yoplait 0.9687287 0.94000587 0.1289838 0.1485077 0.853858262 0.13765801
## hiland 0.0371803 0.87531728 0.8271868 0.9389962 0.013647324 0.44218515
## weight 0.1738559 0.01635142 0.7784748 0.3061132 0.002082057 0.01315811
## price.yoplait price.dannon price.hiland price.weight
## yoplait 0.000000000 0.000000000 1.076396e-02 0.08662261
## hiland 0.001199163 0.110953132 7.295151e-09 0.20422249
## weight 0.020185676 0.005338194 1.552980e-12 0.01002335
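# The same z/p computation recurs for every model below, so a small helper
# (a convenience sketch, not part of nnet) keeps it in one place:
wald_p <- function(m) {
  s <- summary(m)
  z <- s$coefficients / s$standard.errors
  (1 - pnorm(abs(z))) * 2
}
# e.g., wald_p(M) reproduces the matrix p shown above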
# MODEL 1
M1 <- multinom(choice ~ price.yoplait + price.dannon + price.hiland + price.weight, data = Y) #price model
## # weights: 24 (15 variable)
## initial value 3343.741999
## iter 10 value 2696.572347
## iter 20 value 2590.714860
## final value 2590.678971
## converged
summary(M1)
## Call:
## multinom(formula = choice ~ price.yoplait + price.dannon + price.hiland +
## price.weight, data = Y)
##
## Coefficients:
## (Intercept) price.yoplait price.dannon price.hiland price.weight
## yoplait -0.01730093 -0.37181637 0.6872791 -0.2021630 -0.09475388
## hiland 6.36906160 -0.24038641 0.2362219 -1.1644689 -0.29925903
## weight 1.11884075 0.08901697 0.1897998 -0.4480783 -0.22525465
##
## Std. Errors:
## (Intercept) price.yoplait price.dannon price.hiland price.weight
## yoplait 0.8186591 0.03417663 0.06291827 0.06846684 0.07733802
## hiland 1.6863041 0.07344800 0.13278123 0.11799583 0.14801776
## weight 0.8152734 0.04589639 0.05402714 0.07013378 0.07000950
##
## Residual Deviance: 5181.358
## AIC: 5211.358
z1 <- summary(M1)$coefficients/summary(M1)$standard.errors
p1 <- (1 - pnorm(abs(z1), 0, 1)) * 2
p1
## (Intercept) price.yoplait price.dannon price.hiland price.weight
## yoplait 0.9831393611 0.000000000 0.0000000000 3.149938e-03 0.220503123
## hiland 0.0001587699 0.001064581 0.0752347040 0.000000e+00 0.043199301
## weight 0.1699543794 0.052438052 0.0004430004 1.670737e-10 0.001293189
# MODEL 2
library(stargazer) #a package for making the output into a nice table
##
## Please cite as:
## Hlavac, Marek (2018). stargazer: Well-Formatted Regression and Summary Statistics Tables.
## R package version 5.2.2. https://CRAN.R-project.org/package=stargazer
M2 <- multinom(choice ~ feat.yoplait + feat.dannon + feat.hiland + feat.weight, data = Y) #feature model
## # weights: 24 (15 variable)
## initial value 3343.741999
## iter 10 value 2811.342546
## iter 20 value 2786.117255
## final value 2786.109276
## converged
summary(M2)
## Call:
## multinom(formula = choice ~ feat.yoplait + feat.dannon + feat.hiland +
## feat.weight, data = Y)
##
## Coefficients:
## (Intercept) feat.yoplait feat.dannon feat.hiland feat.weight
## yoplait -0.1936272 0.846540156 -0.6273882 -0.02145402 -0.1248708
## hiland -2.9226341 -0.021333436 -0.2961226 2.48442120 0.5246249
## weight -0.5541230 0.007649006 -0.5247896 -0.31481937 0.4915248
##
## Std. Errors:
## (Intercept) feat.yoplait feat.dannon feat.hiland feat.weight
## yoplait 0.05202093 0.2066922 0.2610760 0.2737980 0.2732667
## hiland 0.15475460 0.7416536 0.7374798 0.3258950 0.6225818
## weight 0.05788051 0.2740806 0.2866666 0.3354511 0.2567316
##
## Residual Deviance: 5572.219
## AIC: 5602.219
z2 <- summary(M2)$coefficients/summary(M2)$standard.errors
p2 <- (1 - pnorm(abs(z2), 0, 1)) * 2
p2
## (Intercept) feat.yoplait feat.dannon feat.hiland feat.weight
## yoplait 0.0001975709 4.209728e-05 0.01625733 9.375440e-01 0.6477029
## hiland 0.0000000000 9.770523e-01 0.68802757 2.464695e-14 0.3994185
## weight 0.0000000000 9.777356e-01 0.06715108 3.479898e-01 0.0555503
# MODEL 3
M3 <- multinom(choice ~ feat.weight + price.yoplait + price.dannon + price.hiland, data = Y)
## # weights: 24 (15 variable)
## initial value 3343.741999
## iter 10 value 2689.814698
## iter 20 value 2586.232922
## final value 2586.218053
## converged
summary(M3)
## Call:
## multinom(formula = choice ~ feat.weight + price.yoplait + price.dannon +
## price.hiland, data = Y)
##
## Coefficients:
## (Intercept) feat.weight price.yoplait price.dannon price.hiland
## yoplait -0.4868515 -0.3600374 -0.38592053 0.6645636 -0.1906188
## hiland 4.3842502 0.9043384 -0.24106611 0.2005119 -1.1886847
## weight -0.1938915 0.9480919 0.08262245 0.1532484 -0.4768760
##
## Std. Errors:
## (Intercept) feat.weight price.yoplait price.dannon price.hiland
## yoplait 0.6953676 0.3133467 0.03414217 0.06176640 0.06882024
## hiland 1.4367336 0.6610763 0.07445377 0.12879778 0.11938915
## weight 0.7029393 0.2679710 0.04484957 0.05149647 0.07105390
##
## Residual Deviance: 5172.436
## AIC: 5202.436
z3 <- summary(M3)$coefficients/summary(M3)$standard.errors
p3 <- (1 - pnorm(abs(z3), 0, 1)) * 2
p3
## (Intercept) feat.weight price.yoplait price.dannon price.hiland
## yoplait 0.483842694 0.2505532970 0.00000000 0.000000000 5.608959e-03
## hiland 0.002276706 0.1713186599 0.00120457 0.119518777 0.000000e+00
## weight 0.782678888 0.0004031105 0.06544402 0.002921295 1.926770e-11
stargazer(M3, type = "text", title = "M3")
##
## M3
## ===============================================
## Dependent variable:
## -----------------------------
## yoplait hiland weight
## (1) (2) (3)
## -----------------------------------------------
## feat.weight -0.360 0.904 0.948***
## (0.313) (0.661) (0.268)
##
## price.yoplait -0.386*** -0.241*** 0.083*
## (0.034) (0.074) (0.045)
##
## price.dannon 0.665*** 0.201 0.153***
## (0.062) (0.129) (0.051)
##
## price.hiland -0.191*** -1.189*** -0.477***
## (0.069) (0.119) (0.071)
##
## Constant -0.487 4.384*** -0.194
## (0.695) (1.437) (0.703)
##
## -----------------------------------------------
## Akaike Inf. Crit. 5,202.436 5,202.436 5,202.436
## ===============================================
## Note: *p<0.1; **p<0.05; ***p<0.01
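# With all four multinomial models still in the workspace, a quick
# side-by-side comparison on AIC (lower is better) is one line:
AIC(M, M1, M2, M3)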
####Model selection for MLB Salary
library(ISLR)
data("Hitters")
names(Hitters)
## [1] "AtBat" "Hits" "HmRun" "Runs" "RBI" "Walks"
## [7] "Years" "CAtBat" "CHits" "CHmRun" "CRuns" "CRBI"
## [13] "CWalks" "League" "Division" "PutOuts" "Assists" "Errors"
## [19] "Salary" "NewLeague"
dim(Hitters)
## [1] 322 20
#First we check for any missing data in the response, Salary. Then we remove the NAs. This process is called listwise deletion.
# How many data points are missing for Salary?
sum(is.na(Hitters$Salary))
## [1] 59
# We are going to remove the NAs
Hitters<-na.omit(Hitters)
dim(Hitters)
## [1] 263 20
sum(is.na(Hitters$Salary))
## [1] 0
#Classic Variable Selection Approaches
#Forward Selection
#Backward Selection
#Mixed Selection
#Best Subset Selection
#In order to demonstrate how the algorithms work, we only consider a small subset of predictors.
#Candidate Predictors:
#CRBI: Career runs batted in
#Hits: Number of hits in 1986
#Runs: Number of runs in 1986
#Forward Selection Algorithm
#Start with nothing
#Pick the variable with the lowest residual sum of squares (or smallest p-value)
#Add variables until a stopping rule is hit
#Let’s do this with our three variables.
#STEP 1: Use simple linear regression to select one variable to enter the model.
# Forward Selection
mod1<-lm(Salary~CRBI, data=Hitters)
summary(mod1)
##
## Call:
## lm(formula = Salary ~ CRBI, data = Hitters)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1099.27 -203.45 -97.43 146.37 1847.22
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 274.58039 32.85537 8.357 3.85e-15 ***
## CRBI 0.79095 0.07113 11.120 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 372.3 on 261 degrees of freedom
## Multiple R-squared: 0.3215, Adjusted R-squared: 0.3189
## F-statistic: 123.6 on 1 and 261 DF, p-value: < 2.2e-16
mod2<-lm(Salary~Hits, data=Hitters)
summary(mod2)
##
## Call:
## lm(formula = Salary ~ Hits, data = Hitters)
##
## Residuals:
## Min 1Q Median 3Q Max
## -893.99 -245.63 -59.08 181.12 2059.90
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 63.0488 64.9822 0.970 0.333
## Hits 4.3854 0.5561 7.886 8.53e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 406.2 on 261 degrees of freedom
## Multiple R-squared: 0.1924, Adjusted R-squared: 0.1893
## F-statistic: 62.19 on 1 and 261 DF, p-value: 8.531e-14
mod3<-lm(Salary~Runs, data=Hitters)
summary(mod3)
##
## Call:
## lm(formula = Salary ~ Runs, data = Hitters)
##
## Residuals:
## Min 1Q Median 3Q Max
## -714.8 -251.2 -55.9 193.6 1997.4
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 129.9292 59.9240 2.168 0.031 *
## Runs 7.4161 0.9923 7.474 1.18e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 410.2 on 261 degrees of freedom
## Multiple R-squared: 0.1763, Adjusted R-squared: 0.1731
## F-statistic: 55.86 on 1 and 261 DF, p-value: 1.18e-12
# First: keep CRBI (smallest p-val)
#STEP 2: Look for a second variable to add to the model.
mod12<-lm(Salary~CRBI+Hits, data=Hitters)
summary(mod12)
##
## Call:
## lm(formula = Salary ~ CRBI + Hits, data = Hitters)
##
## Residuals:
## Min 1Q Median 3Q Max
## -942.47 -183.72 -37.85 96.55 2167.16
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -47.95590 55.98245 -0.857 0.392
## CRBI 0.68990 0.06723 10.262 < 2e-16 ***
## Hits 3.30084 0.48177 6.851 5.28e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 343.3 on 260 degrees of freedom
## Multiple R-squared: 0.4252, Adjusted R-squared: 0.4208
## F-statistic: 96.17 on 2 and 260 DF, p-value: < 2.2e-16
mod13<-lm(Salary~CRBI+Runs, data=Hitters)
summary(mod13)
##
## Call:
## lm(formula = Salary ~ CRBI + Runs, data = Hitters)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1037.58 -194.51 -41.45 111.69 2125.83
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.40744 52.04504 -0.065 0.948
## CRBI 0.70114 0.06737 10.408 < 2e-16 ***
## Runs 5.61989 0.85295 6.589 2.45e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 345.3 on 260 degrees of freedom
## Multiple R-squared: 0.4185, Adjusted R-squared: 0.4141
## F-statistic: 93.57 on 2 and 260 DF, p-value: < 2.2e-16
# Second step: keep Hits
#STEP 3: Look for a third variable, but….OH NO! It's not significant! So we stop and keep the best two-variable model.
mod123<-lm(Salary~CRBI+Hits+Runs, data=Hitters)
summary(mod123)
##
## Call:
## lm(formula = Salary ~ CRBI + Hits + Runs, data = Hitters)
##
## Residuals:
## Min 1Q Median 3Q Max
## -969.16 -186.72 -41.07 100.55 2166.47
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -46.24811 56.01270 -0.826 0.4098
## CRBI 0.68948 0.06724 10.255 <2e-16 ***
## Hits 2.28191 1.14187 1.998 0.0467 *
## Runs 1.97827 2.00996 0.984 0.3259
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 343.3 on 259 degrees of freedom
## Multiple R-squared: 0.4274, Adjusted R-squared: 0.4207
## F-statistic: 64.43 on 3 and 259 DF, p-value: < 2.2e-16
# STOP! Because adding Runs is not significant
#Backward Selection Algorithm
#Start with the saturated model
#Remove the variable with the largest p-value
#Remove variables until a stopping rule is hit
#STEP 1: Start with a full model using all the variables. Take out any variables that are not significant.
# Backward
# Start with all vars and take away
mod123<-lm(Salary~CRBI+Hits+Runs, data=Hitters)
summary(mod123)
##
## Call:
## lm(formula = Salary ~ CRBI + Hits + Runs, data = Hitters)
##
## Residuals:
## Min 1Q Median 3Q Max
## -969.16 -186.72 -41.07 100.55 2166.47
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -46.24811 56.01270 -0.826 0.4098
## CRBI 0.68948 0.06724 10.255 <2e-16 ***
## Hits 2.28191 1.14187 1.998 0.0467 *
## Runs 1.97827 2.00996 0.984 0.3259
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 343.3 on 259 degrees of freedom
## Multiple R-squared: 0.4274, Adjusted R-squared: 0.4207
## F-statistic: 64.43 on 3 and 259 DF, p-value: < 2.2e-16
# First: Remove Runs
#STEP 2: Continue taking out variables until only significant variables remain. We can stop with the two-variable model.
mod12<-lm(Salary~CRBI+Hits, data=Hitters)
summary(mod12)
##
## Call:
## lm(formula = Salary ~ CRBI + Hits, data = Hitters)
##
## Residuals:
## Min 1Q Median 3Q Max
## -942.47 -183.72 -37.85 96.55 2167.16
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -47.95590 55.98245 -0.857 0.392
## CRBI 0.68990 0.06723 10.262 < 2e-16 ***
## Hits 3.30084 0.48177 6.851 5.28e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 343.3 on 260 degrees of freedom
## Multiple R-squared: 0.4252, Adjusted R-squared: 0.4208
## F-statistic: 96.17 on 2 and 260 DF, p-value: < 2.2e-16
# STOP! Now everything is significant
#Mixed Selection
#Start with no variables
#Add the variable with the best fit
#Continue adding variables, but if a variable's p-value becomes large as new variables enter, remove it from the model
#Go back and forth until every variable in the model has a sufficiently low p-value (a sketch using step() follows below)
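# Mixed selection can be automated with step(), though note that it adds
# and drops terms by AIC rather than by p-values. A minimal sketch with
# our three candidate predictors:
mod0 <- lm(Salary ~ 1, data = Hitters) # start from the empty model
step(mod0, scope = Salary ~ CRBI + Hits + Runs, direction = "both")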
#Ok, now for the shortcut!… We can use an R package to automate the other approaches too.
#The leaps package in R can be used for variable selection. For some reason, there can be problems with installing this package, but if you specify the repository it will work! (See the example in my code.)
install.packages("leaps", repo = 'https://mac.R-project.org')
##
## The downloaded binary packages are in
## /var/folders/1m/pcxtsxrd3ld0k9h22g50195r0000gn/T//Rtmpghr2Ii/downloaded_packages
library(leaps)
#By default, this package reports the best model of each size from one through eight variables.
#However, if you want every size up to the full model with all the predictors, use the nvmax argument.
#You must specify the type of selection you would like with the method argument.
# Forward
regfit.fwd<-regsubsets(Salary~., data=Hitters,
method="forward")
summary(regfit.fwd)
## Subset selection object
## Call: regsubsets.formula(Salary ~ ., data = Hitters, method = "forward")
## 19 Variables (and intercept)
## Forced in Forced out
## AtBat FALSE FALSE
## Hits FALSE FALSE
## HmRun FALSE FALSE
## Runs FALSE FALSE
## RBI FALSE FALSE
## Walks FALSE FALSE
## Years FALSE FALSE
## CAtBat FALSE FALSE
## CHits FALSE FALSE
## CHmRun FALSE FALSE
## CRuns FALSE FALSE
## CRBI FALSE FALSE
## CWalks FALSE FALSE
## LeagueN FALSE FALSE
## DivisionW FALSE FALSE
## PutOuts FALSE FALSE
## Assists FALSE FALSE
## Errors FALSE FALSE
## NewLeagueN FALSE FALSE
## 1 subsets of each size up to 8
## Selection Algorithm: forward
## AtBat Hits HmRun Runs RBI Walks Years CAtBat CHits CHmRun CRuns CRBI
## 1 ( 1 ) " " " " " " " " " " " " " " " " " " " " " " "*"
## 2 ( 1 ) " " "*" " " " " " " " " " " " " " " " " " " "*"
## 3 ( 1 ) " " "*" " " " " " " " " " " " " " " " " " " "*"
## 4 ( 1 ) " " "*" " " " " " " " " " " " " " " " " " " "*"
## 5 ( 1 ) "*" "*" " " " " " " " " " " " " " " " " " " "*"
## 6 ( 1 ) "*" "*" " " " " " " "*" " " " " " " " " " " "*"
## 7 ( 1 ) "*" "*" " " " " " " "*" " " " " " " " " " " "*"
## 8 ( 1 ) "*" "*" " " " " " " "*" " " " " " " " " "*" "*"
## CWalks LeagueN DivisionW PutOuts Assists Errors NewLeagueN
## 1 ( 1 ) " " " " " " " " " " " " " "
## 2 ( 1 ) " " " " " " " " " " " " " "
## 3 ( 1 ) " " " " " " "*" " " " " " "
## 4 ( 1 ) " " " " "*" "*" " " " " " "
## 5 ( 1 ) " " " " "*" "*" " " " " " "
## 6 ( 1 ) " " " " "*" "*" " " " " " "
## 7 ( 1 ) "*" " " "*" "*" " " " " " "
## 8 ( 1 ) "*" " " "*" "*" " " " " " "
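# leaps provides a coef() method, so the fitted coefficients of, say, the
# four-variable forward model can be pulled out directly:
coef(regfit.fwd, 4)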
# Backward
regfit.bwd<-regsubsets(Salary~., data=Hitters,
method="backward")
summary(regfit.bwd)
## Subset selection object
## Call: regsubsets.formula(Salary ~ ., data = Hitters, method = "backward")
## 19 Variables (and intercept)
## Forced in Forced out
## AtBat FALSE FALSE
## Hits FALSE FALSE
## HmRun FALSE FALSE
## Runs FALSE FALSE
## RBI FALSE FALSE
## Walks FALSE FALSE
## Years FALSE FALSE
## CAtBat FALSE FALSE
## CHits FALSE FALSE
## CHmRun FALSE FALSE
## CRuns FALSE FALSE
## CRBI FALSE FALSE
## CWalks FALSE FALSE
## LeagueN FALSE FALSE
## DivisionW FALSE FALSE
## PutOuts FALSE FALSE
## Assists FALSE FALSE
## Errors FALSE FALSE
## NewLeagueN FALSE FALSE
## 1 subsets of each size up to 8
## Selection Algorithm: backward
## AtBat Hits HmRun Runs RBI Walks Years CAtBat CHits CHmRun CRuns CRBI
## 1 ( 1 ) " " " " " " " " " " " " " " " " " " " " "*" " "
## 2 ( 1 ) " " "*" " " " " " " " " " " " " " " " " "*" " "
## 3 ( 1 ) " " "*" " " " " " " " " " " " " " " " " "*" " "
## 4 ( 1 ) "*" "*" " " " " " " " " " " " " " " " " "*" " "
## 5 ( 1 ) "*" "*" " " " " " " "*" " " " " " " " " "*" " "
## 6 ( 1 ) "*" "*" " " " " " " "*" " " " " " " " " "*" " "
## 7 ( 1 ) "*" "*" " " " " " " "*" " " " " " " " " "*" " "
## 8 ( 1 ) "*" "*" " " " " " " "*" " " " " " " " " "*" "*"
## CWalks LeagueN DivisionW PutOuts Assists Errors NewLeagueN
## 1 ( 1 ) " " " " " " " " " " " " " "
## 2 ( 1 ) " " " " " " " " " " " " " "
## 3 ( 1 ) " " " " " " "*" " " " " " "
## 4 ( 1 ) " " " " " " "*" " " " " " "
## 5 ( 1 ) " " " " " " "*" " " " " " "
## 6 ( 1 ) " " " " "*" "*" " " " " " "
## 7 ( 1 ) "*" " " "*" "*" " " " " " "
## 8 ( 1 ) "*" " " "*" "*" " " " " " "
#Best Subset Selection Algorithm
#Fit all the models for the specified number of predictors
#Select the model that is best (p-val, R-squared adj, Cp, BIC, AIC,…)
# Best Subset
regfit.full<-regsubsets(Salary~., data=Hitters)
summary(regfit.full)
## Subset selection object
## Call: regsubsets.formula(Salary ~ ., data = Hitters)
## 19 Variables (and intercept)
## Forced in Forced out
## AtBat FALSE FALSE
## Hits FALSE FALSE
## HmRun FALSE FALSE
## Runs FALSE FALSE
## RBI FALSE FALSE
## Walks FALSE FALSE
## Years FALSE FALSE
## CAtBat FALSE FALSE
## CHits FALSE FALSE
## CHmRun FALSE FALSE
## CRuns FALSE FALSE
## CRBI FALSE FALSE
## CWalks FALSE FALSE
## LeagueN FALSE FALSE
## DivisionW FALSE FALSE
## PutOuts FALSE FALSE
## Assists FALSE FALSE
## Errors FALSE FALSE
## NewLeagueN FALSE FALSE
## 1 subsets of each size up to 8
## Selection Algorithm: exhaustive
## AtBat Hits HmRun Runs RBI Walks Years CAtBat CHits CHmRun CRuns CRBI
## 1 ( 1 ) " " " " " " " " " " " " " " " " " " " " " " "*"
## 2 ( 1 ) " " "*" " " " " " " " " " " " " " " " " " " "*"
## 3 ( 1 ) " " "*" " " " " " " " " " " " " " " " " " " "*"
## 4 ( 1 ) " " "*" " " " " " " " " " " " " " " " " " " "*"
## 5 ( 1 ) "*" "*" " " " " " " " " " " " " " " " " " " "*"
## 6 ( 1 ) "*" "*" " " " " " " "*" " " " " " " " " " " "*"
## 7 ( 1 ) " " "*" " " " " " " "*" " " "*" "*" "*" " " " "
## 8 ( 1 ) "*" "*" " " " " " " "*" " " " " " " "*" "*" " "
## CWalks LeagueN DivisionW PutOuts Assists Errors NewLeagueN
## 1 ( 1 ) " " " " " " " " " " " " " "
## 2 ( 1 ) " " " " " " " " " " " " " "
## 3 ( 1 ) " " " " " " "*" " " " " " "
## 4 ( 1 ) " " " " "*" "*" " " " " " "
## 5 ( 1 ) " " " " "*" "*" " " " " " "
## 6 ( 1 ) " " " " "*" "*" " " " " " "
## 7 ( 1 ) " " " " "*" "*" " " " " " "
## 8 ( 1 ) "*" " " "*" "*" " " " " " "
regfit.full<-regsubsets(Salary~., data=Hitters, nvmax=19)
reg.summary<-summary(regfit.full)
reg.summary
## Subset selection object
## Call: regsubsets.formula(Salary ~ ., data = Hitters, nvmax = 19)
## 19 Variables (and intercept)
## Forced in Forced out
## AtBat FALSE FALSE
## Hits FALSE FALSE
## HmRun FALSE FALSE
## Runs FALSE FALSE
## RBI FALSE FALSE
## Walks FALSE FALSE
## Years FALSE FALSE
## CAtBat FALSE FALSE
## CHits FALSE FALSE
## CHmRun FALSE FALSE
## CRuns FALSE FALSE
## CRBI FALSE FALSE
## CWalks FALSE FALSE
## LeagueN FALSE FALSE
## DivisionW FALSE FALSE
## PutOuts FALSE FALSE
## Assists FALSE FALSE
## Errors FALSE FALSE
## NewLeagueN FALSE FALSE
## 1 subsets of each size up to 19
## Selection Algorithm: exhaustive
## AtBat Hits HmRun Runs RBI Walks Years CAtBat CHits CHmRun CRuns CRBI
## 1 ( 1 ) " " " " " " " " " " " " " " " " " " " " " " "*"
## 2 ( 1 ) " " "*" " " " " " " " " " " " " " " " " " " "*"
## 3 ( 1 ) " " "*" " " " " " " " " " " " " " " " " " " "*"
## 4 ( 1 ) " " "*" " " " " " " " " " " " " " " " " " " "*"
## 5 ( 1 ) "*" "*" " " " " " " " " " " " " " " " " " " "*"
## 6 ( 1 ) "*" "*" " " " " " " "*" " " " " " " " " " " "*"
## 7 ( 1 ) " " "*" " " " " " " "*" " " "*" "*" "*" " " " "
## 8 ( 1 ) "*" "*" " " " " " " "*" " " " " " " "*" "*" " "
## 9 ( 1 ) "*" "*" " " " " " " "*" " " "*" " " " " "*" "*"
## 10 ( 1 ) "*" "*" " " " " " " "*" " " "*" " " " " "*" "*"
## 11 ( 1 ) "*" "*" " " " " " " "*" " " "*" " " " " "*" "*"
## 12 ( 1 ) "*" "*" " " "*" " " "*" " " "*" " " " " "*" "*"
## 13 ( 1 ) "*" "*" " " "*" " " "*" " " "*" " " " " "*" "*"
## 14 ( 1 ) "*" "*" "*" "*" " " "*" " " "*" " " " " "*" "*"
## 15 ( 1 ) "*" "*" "*" "*" " " "*" " " "*" "*" " " "*" "*"
## 16 ( 1 ) "*" "*" "*" "*" "*" "*" " " "*" "*" " " "*" "*"
## 17 ( 1 ) "*" "*" "*" "*" "*" "*" " " "*" "*" " " "*" "*"
## 18 ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*" "*" " " "*" "*"
## 19 ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*"
## CWalks LeagueN DivisionW PutOuts Assists Errors NewLeagueN
## 1 ( 1 ) " " " " " " " " " " " " " "
## 2 ( 1 ) " " " " " " " " " " " " " "
## 3 ( 1 ) " " " " " " "*" " " " " " "
## 4 ( 1 ) " " " " "*" "*" " " " " " "
## 5 ( 1 ) " " " " "*" "*" " " " " " "
## 6 ( 1 ) " " " " "*" "*" " " " " " "
## 7 ( 1 ) " " " " "*" "*" " " " " " "
## 8 ( 1 ) "*" " " "*" "*" " " " " " "
## 9 ( 1 ) "*" " " "*" "*" " " " " " "
## 10 ( 1 ) "*" " " "*" "*" "*" " " " "
## 11 ( 1 ) "*" "*" "*" "*" "*" " " " "
## 12 ( 1 ) "*" "*" "*" "*" "*" " " " "
## 13 ( 1 ) "*" "*" "*" "*" "*" "*" " "
## 14 ( 1 ) "*" "*" "*" "*" "*" "*" " "
## 15 ( 1 ) "*" "*" "*" "*" "*" "*" " "
## 16 ( 1 ) "*" "*" "*" "*" "*" "*" " "
## 17 ( 1 ) "*" "*" "*" "*" "*" "*" "*"
## 18 ( 1 ) "*" "*" "*" "*" "*" "*" "*"
## 19 ( 1 ) "*" "*" "*" "*" "*" "*" "*"
#Look at the variables contained in the regsubsets object. We can learn a lot from the different model comparison metrics:
#R-squared
#R-squared Adjusted
#BIC
#Mallows Cp
#The goal of these model comparison metrics is to find the “best” model, balancing model fit against simplicity/interpretability.
names(reg.summary)
## [1] "which" "rsq" "rss" "adjr2" "cp" "bic" "outmat" "obj"
head(reg.summary)
## $which
## (Intercept) AtBat Hits HmRun Runs RBI Walks Years CAtBat CHits CHmRun
## 1 TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 2 TRUE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 3 TRUE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 4 TRUE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 5 TRUE TRUE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 6 TRUE TRUE TRUE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE
## 7 TRUE FALSE TRUE FALSE FALSE FALSE TRUE FALSE TRUE TRUE TRUE
## 8 TRUE TRUE TRUE FALSE FALSE FALSE TRUE FALSE FALSE FALSE TRUE
## 9 TRUE TRUE TRUE FALSE FALSE FALSE TRUE FALSE TRUE FALSE FALSE
## 10 TRUE TRUE TRUE FALSE FALSE FALSE TRUE FALSE TRUE FALSE FALSE
## 11 TRUE TRUE TRUE FALSE FALSE FALSE TRUE FALSE TRUE FALSE FALSE
## 12 TRUE TRUE TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE FALSE
## 13 TRUE TRUE TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE FALSE
## 14 TRUE TRUE TRUE TRUE TRUE FALSE TRUE FALSE TRUE FALSE FALSE
## 15 TRUE TRUE TRUE TRUE TRUE FALSE TRUE FALSE TRUE TRUE FALSE
## 16 TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE TRUE TRUE FALSE
## 17 TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE TRUE TRUE FALSE
## 18 TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE
## 19 TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## CRuns CRBI CWalks LeagueN DivisionW PutOuts Assists Errors NewLeagueN
## 1 FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 2 FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 3 FALSE TRUE FALSE FALSE FALSE TRUE FALSE FALSE FALSE
## 4 FALSE TRUE FALSE FALSE TRUE TRUE FALSE FALSE FALSE
## 5 FALSE TRUE FALSE FALSE TRUE TRUE FALSE FALSE FALSE
## 6 FALSE TRUE FALSE FALSE TRUE TRUE FALSE FALSE FALSE
## 7 FALSE FALSE FALSE FALSE TRUE TRUE FALSE FALSE FALSE
## 8 TRUE FALSE TRUE FALSE TRUE TRUE FALSE FALSE FALSE
## 9 TRUE TRUE TRUE FALSE TRUE TRUE FALSE FALSE FALSE
## 10 TRUE TRUE TRUE FALSE TRUE TRUE TRUE FALSE FALSE
## 11 TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE FALSE
## 12 TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE FALSE
## 13 TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE
## 14 TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE
## 15 TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE
## 16 TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE
## 17 TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## 18 TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## 19 TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
##
## $rsq
## [1] 0.3214501 0.4252237 0.4514294 0.4754067 0.4908036 0.5087146 0.5141227
## [8] 0.5285569 0.5346124 0.5404950 0.5426153 0.5436302 0.5444570 0.5452164
## [15] 0.5454692 0.5457656 0.5459518 0.5460945 0.5461159
##
## $rss
## [1] 36179679 30646560 29249297 27970852 27149899 26194904 25906548 25136930
## [9] 24814051 24500402 24387345 24333232 24289148 24248660 24235177 24219377
## [17] 24209447 24201837 24200700
##
## $adjr2
## [1] 0.3188503 0.4208024 0.4450753 0.4672734 0.4808971 0.4972001 0.5007849
## [8] 0.5137083 0.5180572 0.5222606 0.5225706 0.5217245 0.5206736 0.5195431
## [15] 0.5178661 0.5162219 0.5144464 0.5126097 0.5106270
##
## $cp
## [1] 104.281319 50.723090 38.693127 27.856220 21.613011 14.023870
## [7] 13.128474 7.400719 6.158685 5.009317 5.874113 7.330766
## [13] 8.888112 10.481576 12.346193 14.187546 16.087831 18.011425
## [19] 20.000000
##
## $bic
## [1] -90.84637 -128.92622 -135.62693 -141.80892 -144.07143 -147.91690
## [7] -145.25594 -147.61525 -145.44316 -143.21651 -138.86077 -133.87283
## [13] -128.77759 -123.64420 -118.21832 -112.81768 -107.35339 -101.86391
## [19] -96.30412
#R-squared
#Although the R-squared metric is useful in simple linear regression, it has some drawbacks for multiple linear regression.
#R-squared increases monotonically as more and more variables are added to the model, which is a problem because it does not penalize the increased complexity.
#We want to maximize this value!
# R-squared (monotonically increasing)
library(ggplot2)
reg.summary$rsq
## [1] 0.3214501 0.4252237 0.4514294 0.4754067 0.4908036 0.5087146 0.5141227
## [8] 0.5285569 0.5346124 0.5404950 0.5426153 0.5436302 0.5444570 0.5452164
## [15] 0.5454692 0.5457656 0.5459518 0.5460945 0.5461159
rsq_df<-data.frame(rsq=reg.summary$rsq,
size=1:19)
ggplot(rsq_df, aes(x=size, y=rsq))+
geom_point()+
geom_line()+
ggtitle("R-squared")
#R-squared Adjusted
#This is similar to R-squared but penalizes the number of predictors, p. We want to maximize this value!
adjrsq_df<-data.frame(rsq=c(reg.summary$rsq,reg.summary$adjr2),
type=c(rep("rsq",19),rep("rsq_adj",19)),
size=rep(1:19,2))
ggplot(adjrsq_df, aes(x=size, y=rsq, color=type))+
geom_point()+
geom_line()+
ggtitle("R-squared vs Adj R-squared")
#Mallows Cp and BIC
#Unlike the R-squared values, we want to minimize these metrics:
#Note that BIC gives a harsher penalty and thereby picks smaller models.
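# As a sanity check, Cp can be computed by hand as RSS_p/s2 - n + 2p, where
# s2 is the error variance estimate from the full model and p counts the
# coefficients including the intercept. A sketch:
n <- nrow(Hitters)                    # 263
s2 <- reg.summary$rss[19] / (n - 20)  # full model: 19 predictors + intercept
reg.summary$rss / s2 - n + 2 * (2:20) # should match reg.summary$cp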
# Let's look at the other metrics
metric_df<-data.frame(metric=c(reg.summary$rsq,
reg.summary$adjr2,
reg.summary$cp,
reg.summary$bic),
type=c(rep("rsq",19),
rep("rsq_adj",19),
rep("cp",19),
rep("bic",19)),
size=rep(1:19,4))
# Add vertical lines to show the best model
which.min(reg.summary$bic)
## [1] 6
which.max(reg.summary$rsq)
## [1] 19
which.max(reg.summary$adjr2)
## [1] 11
# note: in R >= 4.0, data.frame() no longer converts strings to factors, so
# use sort(unique(metric_df$type)) in place of levels(metric_df$type) there
vline.dat <- data.frame(type=levels(metric_df$type), vl=c(6, 10, 19, 11)) # alphabetical: bic=6, cp=10, rsq=19, rsq_adj=11
ggplot(metric_df, aes(x=size, y=metric, color=type))+
geom_point()+
geom_line()+
ggtitle("Model Comparison Metrics")+
geom_vline(aes(xintercept=vl), data=vline.dat, lty=2) +
facet_grid(type~., scales = "free_y")
#MLR Model Extensions
#Numeric Interactions
#We have looked at the Boston dataset to explore polynomial regression. We are going to use this dataset for Part I. You will need to load it into this lab session from the MASS library.
library(dplyr)
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:Ecdat':
##
## SP500
## The following object is masked from 'package:dplyr':
##
## select
library(ISLR)
names(Boston)
## [1] "crim" "zn" "indus" "chas" "nox" "rm" "age"
## [8] "dis" "rad" "tax" "ptratio" "black" "lstat" "medv"
head(Boston)
## crim zn indus chas nox rm age dis rad tax ptratio black lstat
## 1 0.00632 18 2.31 0 0.538 6.575 65.2 4.0900 1 296 15.3 396.90 4.98
## 2 0.02731 0 7.07 0 0.469 6.421 78.9 4.9671 2 242 17.8 396.90 9.14
## 3 0.02729 0 7.07 0 0.469 7.185 61.1 4.9671 2 242 17.8 392.83 4.03
## 4 0.03237 0 2.18 0 0.458 6.998 45.8 6.0622 3 222 18.7 394.63 2.94
## 5 0.06905 0 2.18 0 0.458 7.147 54.2 6.0622 3 222 18.7 396.90 5.33
## 6 0.02985 0 2.18 0 0.458 6.430 58.7 6.0622 3 222 18.7 394.12 5.21
## medv
## 1 24.0
## 2 21.6
## 3 34.7
## 4 33.4
## 5 36.2
## 6 28.7
#(In week 6) we looked at median home value as a function of lower status (lstat) and the proportion of older homes (age).
#When we fit a model with just the additive effects of lstat and age, these are known as the main effects.
modInt0<-lm(medv~lstat+age, data=Boston)
summary(modInt0)
##
## Call:
## lm(formula = medv ~ lstat + age, data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.981 -3.978 -1.283 1.968 23.158
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 33.22276 0.73085 45.458 < 2e-16 ***
## lstat -1.03207 0.04819 -21.416 < 2e-16 ***
## age 0.03454 0.01223 2.826 0.00491 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.173 on 503 degrees of freedom
## Multiple R-squared: 0.5513, Adjusted R-squared: 0.5495
## F-statistic: 309 on 2 and 503 DF, p-value: < 2.2e-16
#However, there could be a relationship between lstat and age, so we will now explore their interaction.
#A colon can be used to create a model with only the specified interaction. An asterisk can be used to create a model with all main effects and interactions.
#Exercise 1: Interaction
#First, create a model with only the lstat and age interaction.
modInt1<-lm(medv~lstat:age, data=Boston)
summary(modInt1)
##
## Call:
## lm(formula = medv ~ lstat:age, data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -13.347 -4.372 -1.534 1.914 27.193
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 30.1588631 0.4828240 62.46 <2e-16 ***
## lstat:age -0.0077146 0.0003799 -20.31 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.827 on 504 degrees of freedom
## Multiple R-squared: 0.4501, Adjusted R-squared: 0.449
## F-statistic: 412.4 on 1 and 504 DF, p-value: < 2.2e-16
#Exercise 2: Full Model
#Now create a model with all possible interactions between lstat and age.
modInt2<-lm(medv~lstat*age, data=Boston)
summary(modInt2)
##
## Call:
## lm(formula = medv ~ lstat * age, data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.806 -4.045 -1.333 2.085 27.552
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 36.0885359 1.4698355 24.553 < 2e-16 ***
## lstat -1.3921168 0.1674555 -8.313 8.78e-16 ***
## age -0.0007209 0.0198792 -0.036 0.9711
## lstat:age 0.0041560 0.0018518 2.244 0.0252 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.149 on 502 degrees of freedom
## Multiple R-squared: 0.5557, Adjusted R-squared: 0.5531
## F-statistic: 209.3 on 3 and 502 DF, p-value: < 2.2e-16
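# Because modInt0 is nested in modInt2, a partial F test checks whether the
# interaction improves the fit; its p-value should agree with the t test on
# lstat:age above (about 0.025):
anova(modInt0, modInt2)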
#Part II: Multicollinearity
#Let's change datasets again! We'll be using the Credit data set in the ISLR package.
#This is another simulated dataset, with 400 credit card customers.
#For this exercise we're only interested in four variables: balance, age, limit, and rating.
data(Credit)
names(Credit)
## [1] "ID" "Income" "Limit" "Rating" "Cards" "Age"
## [7] "Education" "Gender" "Student" "Married" "Ethnicity" "Balance"
creditTrim<-data.frame(balance=Credit$Balance,
limit=Credit$Limit,
rating=Credit$Rating,
age=Credit$Age)
#Multicollinearity occurs when there are linear relationships between explanatory variables. So let's look at a pairs plot and also the correlation matrix.
pairs(creditTrim)
cor(creditTrim)
## balance limit rating age
## balance 1.000000000 0.8616973 0.8636252 0.001835119
## limit 0.861697267 1.0000000 0.9968797 0.100887922
## rating 0.863625161 0.9968797 1.0000000 0.103164996
## age 0.001835119 0.1008879 0.1031650 1.000000000
#Multicollinearity inflates the variance of the coefficient estimates, which shrinks the corresponding test statistics and produces larger p-values.
#Consider the two models:
credMod1<-lm(balance~age+limit, creditTrim)
summary(credMod1)
##
## Call:
## lm(formula = balance ~ age + limit, data = creditTrim)
##
## Residuals:
## Min 1Q Median 3Q Max
## -696.84 -150.78 -13.01 126.68 755.56
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.734e+02 4.383e+01 -3.957 9.01e-05 ***
## age -2.291e+00 6.725e-01 -3.407 0.000723 ***
## limit 1.734e-01 5.026e-03 34.496 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 230.5 on 397 degrees of freedom
## Multiple R-squared: 0.7498, Adjusted R-squared: 0.7486
## F-statistic: 595 on 2 and 397 DF, p-value: < 2.2e-16
credMod2<-lm(balance~rating+limit, creditTrim)
summary(credMod2)
##
## Call:
## lm(formula = balance ~ rating + limit, data = creditTrim)
##
## Residuals:
## Min 1Q Median 3Q Max
## -707.8 -135.9 -9.5 124.0 817.6
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -377.53680 45.25418 -8.343 1.21e-15 ***
## rating 2.20167 0.95229 2.312 0.0213 *
## limit 0.02451 0.06383 0.384 0.7012
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 232.3 on 397 degrees of freedom
## Multiple R-squared: 0.7459, Adjusted R-squared: 0.7447
## F-statistic: 582.8 on 2 and 397 DF, p-value: < 2.2e-16
#One way to quantify the impact of multicollinearity is the VIF (variance inflation factor).
#A VIF close to 1 implies no correlation; the larger the VIF, the greater the multicollinearity.
library(DAAG)
## Loading required package: lattice
##
## Attaching package: 'DAAG'
## The following object is masked from 'package:MASS':
##
## hills
vif(lm(balance~age+rating+limit, creditTrim))
## age rating limit
## 1.0114 160.6700 160.5900
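# The VIF can also be computed by hand: VIF_j = 1/(1 - R_j^2), where R_j^2
# comes from regressing predictor j on the remaining predictors. A sketch
# for limit, which should land near the 160.59 reported above:
r2_limit <- summary(lm(limit ~ age + rating, data = creditTrim))$r.squared
1 / (1 - r2_limit)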