library("ggplot2") 
library("tidyverse")
## ── Attaching packages ───────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ tibble  2.1.3     ✓ dplyr   0.8.3
## ✓ tidyr   1.0.2     ✓ stringr 1.4.0
## ✓ readr   1.3.1     ✓ forcats 0.4.0
## ✓ purrr   0.3.3
## ── Conflicts ──────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library("dplyr")
library("magrittr")
## 
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
## 
##     set_names
## The following object is masked from 'package:tidyr':
## 
##     extract
# LOGIT MODEL
install.packages("titanic", repos = "http://cran.us.r-project.org")
library(tidyverse)
library(titanic)

logit_t=glm(formula=Survived ~ Pclass+Sex, family=binomial(link="logit"), data=titanic_train)

# linear model
lm_t=lm(Survived ~ Fare, data=titanic_train)

ggplot(titanic_train, aes(x=Fare, y=Survived)) + geom_point() + 
  stat_smooth(method="lm", se=TRUE)
## `geom_smooth()` using formula 'y ~ x'
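# A caveat worth checking (a quick sketch, not part of the original output):
# the linear probability model is not constrained to [0, 1], which is part
# of the motivation for the logit and probit links used below.
range(fitted(lm_t))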

summary(logit_t) #see the regression results
## 
## Call:
## glm(formula = Survived ~ Pclass + Sex, family = binomial(link = "logit"), 
##     data = titanic_train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.2030  -0.7036  -0.4519   0.6719   2.1599  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   3.2946     0.2974  11.077   <2e-16 ***
## Pclass       -0.9606     0.1061  -9.057   <2e-16 ***
## Sexmale      -2.6434     0.1838 -14.380   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1186.7  on 890  degrees of freedom
## Residual deviance:  827.2  on 888  degrees of freedom
## AIC: 833.2
## 
## Number of Fisher Scoring iterations: 4
ggplot(titanic_train, aes(x=Fare, y=Survived)) + geom_point() + 
  stat_smooth(method="glm", method.args=list(family="binomial"(link="logit")), se=TRUE)
## `geom_smooth()` using formula 'y ~ x'
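# Interpreting the logit fit (a brief sketch, not part of the original lab):
# exponentiating the coefficients gives odds ratios, and predict() with
# type = "response" returns fitted survival probabilities.
exp(coef(logit_t))
head(predict(logit_t, type = "response"))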

Probit model

probit_t=glm(formula=Survived ~ Fare, family=binomial(link="probit"), data=titanic_train)

summary(probit_t) # see the results
## 
## Call:
## glm(formula = Survived ~ Fare, family = binomial(link = "probit"), 
##     data = titanic_train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.4601  -0.8920  -0.8601   1.3527   1.5811  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -0.563592   0.056440  -9.986  < 2e-16 ***
## Fare         0.008453   0.001242   6.804 1.02e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1186.7  on 890  degrees of freedom
## Residual deviance: 1119.9  on 889  degrees of freedom
## AIC: 1123.9
## 
## Number of Fisher Scoring iterations: 6
ggplot(titanic_train, aes(x=Fare, y=Survived)) + geom_point() + 
  stat_smooth(method="glm", method.args=list(family="binomial"(link="probit")), se=TRUE)
## `geom_smooth()` using formula 'y ~ x'
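# As a quick sketch (not in the original code; the name logit_fare is new
# here), the probit and logit links can be compared on the same predictor
# via AIC. The two links usually fit binary data very similarly.
logit_fare <- glm(Survived ~ Fare, family = binomial(link = "logit"), data = titanic_train)
AIC(probit_t, logit_fare)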

Multinomial Logistic Regression model

install.packages("nnet", repos = "http://cran.us.r-project.org")
install.packages("Ecdat", repos = "http://cran.us.r-project.org")
library(nnet)
library(Ecdat)
## Loading required package: Ecfun
## 
## Attaching package: 'Ecfun'
## The following object is masked from 'package:base':
## 
##     sign
## 
## Attaching package: 'Ecdat'
## The following object is masked from 'package:datasets':
## 
##     Orange
Y <- Yogurt
M <- multinom(choice ~., data = Y) #the model
## # weights:  44 (30 variable)
## initial  value 3343.741999 
## iter  10 value 2740.831153
## iter  20 value 2593.985140
## iter  30 value 2563.707966
## final  value 2562.990614 
## converged
summary(M)
## Call:
## multinom(formula = choice ~ ., data = Y)
## 
## Coefficients:
##        (Intercept)            id feat.yoplait feat.dannon feat.hiland
## dannon -0.03520968  0.0001402729   -0.3401250  -0.4537247 -0.06022368
## hiland  4.14419228 -0.0005795527   -0.5042782  -0.5157286  1.07464455
## weight  1.18249094  0.0048663666   -0.2611470  -0.7810408 -1.23737832
##        feat.weight price.yoplait price.dannon price.hiland price.weight
## dannon   0.4865494     0.3675842   -0.7044580    0.1846033   0.13864682
## hiland   1.0134997     0.1305399   -0.4733090   -0.7054197  -0.06976759
## weight   1.1985995     0.4751706   -0.5396113   -0.3682489  -0.05816486
## 
## Std. Errors:
##        (Intercept)          id feat.yoplait feat.dannon feat.hiland feat.weight
## dannon   0.8822363 0.001870004    0.2239925   0.3140816   0.3276537   0.3276952
## hiland   2.0291901 0.004573466    0.7435141   0.8224360   0.4603284   0.6936778
## weight   1.0029899 0.002120281    0.2688138   0.3821215   0.3936001   0.3360577
##        price.yoplait price.dannon price.hiland price.weight
## dannon    0.03461712   0.06710173   0.07237536   0.08089368
## hiland    0.06907950   0.15048557   0.15430184   0.16553123
## weight    0.04614420   0.07464468   0.08106812   0.08519369
## 
## Residual Deviance: 5125.981 
## AIC: 5185.981
exp(coef(M)) #odds ratios (exponentiated coefficients) relative to the baseline
##        (Intercept)        id feat.yoplait feat.dannon feat.hiland feat.weight
## dannon    0.965403 1.0001403    0.7116813   0.6352576   0.9415539    1.626694
## hiland   63.066661 0.9994206    0.6039413   0.5970654   2.9289516    2.755227
## weight    3.262491 1.0048782    0.7701677   0.4579292   0.2901439    3.315470
##        price.yoplait price.dannon price.hiland price.weight
## dannon      1.444241    0.4943765    1.2027413    1.1487183
## hiland      1.139443    0.6229375    0.4939013    0.9326105
## weight      1.608289    0.5829748    0.6919450    0.9434944
head(fitted(M)) #probabilities of picking each category for each observation
##     yoplait    dannon      hiland    weight
## 1 0.3245838 0.5091098 0.014125195 0.1521812
## 2 0.6018923 0.2850152 0.009748865 0.1033436
## 3 0.5813145 0.3033498 0.010775186 0.1045605
## 4 0.5813145 0.3033498 0.010775186 0.1045605
## 5 0.4133581 0.2930154 0.023418517 0.2702081
## 6 0.4915570 0.2899564 0.027613810 0.1908728
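# A small sketch (not in the original code): predict() without a type
# argument returns the most likely brand for each observation, while
# type = "probs" reproduces the fitted probability matrix above.
head(predict(M))
head(predict(M, type = "probs"))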
Y$choice <- relevel(Y$choice, ref = "dannon") #set new baseline

# FULL MODEL
M <- multinom(choice ~., data = Y)
## # weights:  44 (30 variable)
## initial  value 3343.741999 
## iter  10 value 2723.422944
## iter  20 value 2610.939896
## iter  30 value 2563.450128
## final  value 2562.990625 
## converged
summary(M)
## Call:
## multinom(formula = choice ~ ., data = Y)
## 
## Coefficients:
##         (Intercept)            id feat.yoplait feat.dannon feat.hiland
## yoplait  0.03458607 -0.0001407410   0.34004481  0.45379140  0.06035367
## hiland   4.17701865 -0.0007153663  -0.16441842 -0.06136236  1.13495466
## weight   1.21648919  0.0047271381   0.07897677 -0.32719222 -1.17703765
##         feat.weight price.yoplait price.dannon price.hiland price.weight
## yoplait  -0.4864858    -0.3675775    0.7044753   -0.1845759   -0.1386124
## hiland    0.5276242    -0.2369168    0.2316330   -0.8900015   -0.2088435
## weight    0.7120903     0.1076158    0.1648777   -0.5528200   -0.1967649
## 
## Std. Errors:
##         (Intercept)          id feat.yoplait feat.dannon feat.hiland
## yoplait   0.8822345 0.001870002    0.2239905   0.3140810   0.3276568
## hiland    2.0045405 0.004559142    0.7531383   0.8017917   0.4601638
## weight    0.8945313 0.001968840    0.2807492   0.3197073   0.3823698
##         feat.weight price.yoplait price.dannon price.hiland price.weight
## yoplait   0.3276950    0.03461661   0.06710192   0.07237505   0.08089485
## hiland    0.6865561    0.07314327   0.14532253   0.15387341   0.16449378
## weight    0.2871956    0.04632864   0.05918325   0.07819600   0.07641287
## 
## Residual Deviance: 5125.981 
## AIC: 5185.981
z <- summary(M)$coefficients/summary(M)$standard.errors #Wald z statistics for p-values
p <- (1 - pnorm(abs(z), 0, 1)) * 2
p
##         (Intercept)         id feat.yoplait feat.dannon feat.hiland feat.weight
## yoplait   0.9687287 0.94000587    0.1289838   0.1485077 0.853858262  0.13765801
## hiland    0.0371803 0.87531728    0.8271868   0.9389962 0.013647324  0.44218515
## weight    0.1738559 0.01635142    0.7784748   0.3061132 0.002082057  0.01315811
##         price.yoplait price.dannon price.hiland price.weight
## yoplait   0.000000000  0.000000000 1.076396e-02   0.08662261
## hiland    0.001199163  0.110953132 7.295151e-09   0.20422249
## weight    0.020185676  0.005338194 1.552980e-12   0.01002335
# MODEL 1
M1 <- multinom(choice ~ price.yoplait + price.dannon + price.hiland + price.weight, data = Y) #price model
## # weights:  24 (15 variable)
## initial  value 3343.741999 
## iter  10 value 2696.572347
## iter  20 value 2590.714860
## final  value 2590.678971 
## converged
summary(M1)
## Call:
## multinom(formula = choice ~ price.yoplait + price.dannon + price.hiland + 
##     price.weight, data = Y)
## 
## Coefficients:
##         (Intercept) price.yoplait price.dannon price.hiland price.weight
## yoplait -0.01730093   -0.37181637    0.6872791   -0.2021630  -0.09475388
## hiland   6.36906160   -0.24038641    0.2362219   -1.1644689  -0.29925903
## weight   1.11884075    0.08901697    0.1897998   -0.4480783  -0.22525465
## 
## Std. Errors:
##         (Intercept) price.yoplait price.dannon price.hiland price.weight
## yoplait   0.8186591    0.03417663   0.06291827   0.06846684   0.07733802
## hiland    1.6863041    0.07344800   0.13278123   0.11799583   0.14801776
## weight    0.8152734    0.04589639   0.05402714   0.07013378   0.07000950
## 
## Residual Deviance: 5181.358 
## AIC: 5211.358
z1 <- summary(M1)$coefficients/summary(M1)$standard.errors 
p1 <- (1 - pnorm(abs(z1), 0, 1)) * 2
p1
##          (Intercept) price.yoplait price.dannon price.hiland price.weight
## yoplait 0.9831393611   0.000000000 0.0000000000 3.149938e-03  0.220503123
## hiland  0.0001587699   0.001064581 0.0752347040 0.000000e+00  0.043199301
## weight  0.1699543794   0.052438052 0.0004430004 1.670737e-10  0.001293189
# MODEL 2
library(stargazer) #a package for making the output into a nice table
## 
## Please cite as:
##  Hlavac, Marek (2018). stargazer: Well-Formatted Regression and Summary Statistics Tables.
##  R package version 5.2.2. https://CRAN.R-project.org/package=stargazer
M2 <- multinom(choice ~ feat.yoplait + feat.dannon + feat.hiland + feat.weight, data = Y) #feature model
## # weights:  24 (15 variable)
## initial  value 3343.741999 
## iter  10 value 2811.342546
## iter  20 value 2786.117255
## final  value 2786.109276 
## converged
summary(M2)
## Call:
## multinom(formula = choice ~ feat.yoplait + feat.dannon + feat.hiland + 
##     feat.weight, data = Y)
## 
## Coefficients:
##         (Intercept) feat.yoplait feat.dannon feat.hiland feat.weight
## yoplait  -0.1936272  0.846540156  -0.6273882 -0.02145402  -0.1248708
## hiland   -2.9226341 -0.021333436  -0.2961226  2.48442120   0.5246249
## weight   -0.5541230  0.007649006  -0.5247896 -0.31481937   0.4915248
## 
## Std. Errors:
##         (Intercept) feat.yoplait feat.dannon feat.hiland feat.weight
## yoplait  0.05202093    0.2066922   0.2610760   0.2737980   0.2732667
## hiland   0.15475460    0.7416536   0.7374798   0.3258950   0.6225818
## weight   0.05788051    0.2740806   0.2866666   0.3354511   0.2567316
## 
## Residual Deviance: 5572.219 
## AIC: 5602.219
z2 <- summary(M2)$coefficients/summary(M2)$standard.errors
p2 <- (1 - pnorm(abs(z2), 0, 1)) * 2
p2
##          (Intercept) feat.yoplait feat.dannon  feat.hiland feat.weight
## yoplait 0.0001975709 4.209728e-05  0.01625733 9.375440e-01   0.6477029
## hiland  0.0000000000 9.770523e-01  0.68802757 2.464695e-14   0.3994185
## weight  0.0000000000 9.777356e-01  0.06715108 3.479898e-01   0.0555503
# MODEL 3
M3 <- multinom(choice ~ feat.weight + price.yoplait + price.dannon + price.hiland, data = Y)
## # weights:  24 (15 variable)
## initial  value 3343.741999 
## iter  10 value 2689.814698
## iter  20 value 2586.232922
## final  value 2586.218053 
## converged
summary(M3)
## Call:
## multinom(formula = choice ~ feat.weight + price.yoplait + price.dannon + 
##     price.hiland, data = Y)
## 
## Coefficients:
##         (Intercept) feat.weight price.yoplait price.dannon price.hiland
## yoplait  -0.4868515  -0.3600374   -0.38592053    0.6645636   -0.1906188
## hiland    4.3842502   0.9043384   -0.24106611    0.2005119   -1.1886847
## weight   -0.1938915   0.9480919    0.08262245    0.1532484   -0.4768760
## 
## Std. Errors:
##         (Intercept) feat.weight price.yoplait price.dannon price.hiland
## yoplait   0.6953676   0.3133467    0.03414217   0.06176640   0.06882024
## hiland    1.4367336   0.6610763    0.07445377   0.12879778   0.11938915
## weight    0.7029393   0.2679710    0.04484957   0.05149647   0.07105390
## 
## Residual Deviance: 5172.436 
## AIC: 5202.436
z3 <- summary(M3)$coefficients/summary(M3)$standard.errors
p3 <- (1 - pnorm(abs(z3), 0, 1)) * 2
p3
##         (Intercept)  feat.weight price.yoplait price.dannon price.hiland
## yoplait 0.483842694 0.2505532970    0.00000000  0.000000000 5.608959e-03
## hiland  0.002276706 0.1713186599    0.00120457  0.119518777 0.000000e+00
## weight  0.782678888 0.0004031105    0.06544402  0.002921295 1.926770e-11
stargazer(M3, type = "text", title = "M3")
## 
## M3
## ===============================================
##                        Dependent variable:     
##                   -----------------------------
##                    yoplait   hiland    weight  
##                      (1)       (2)       (3)   
## -----------------------------------------------
## feat.weight        -0.360     0.904   0.948*** 
##                    (0.313)   (0.661)   (0.268) 
##                                                
## price.yoplait     -0.386*** -0.241***  0.083*  
##                    (0.034)   (0.074)   (0.045) 
##                                                
## price.dannon      0.665***    0.201   0.153*** 
##                    (0.062)   (0.129)   (0.051) 
##                                                
## price.hiland      -0.191*** -1.189*** -0.477***
##                    (0.069)   (0.119)   (0.071) 
##                                                
## Constant           -0.487   4.384***   -0.194  
##                    (0.695)   (1.437)   (0.703) 
##                                                
## -----------------------------------------------
## Akaike Inf. Crit. 5,202.436 5,202.436 5,202.436
## ===============================================
## Note:               *p<0.1; **p<0.05; ***p<0.01

Model Selection for MLB Salary

library(ISLR)
data("Hitters")
names(Hitters)
##  [1] "AtBat"     "Hits"      "HmRun"     "Runs"      "RBI"       "Walks"    
##  [7] "Years"     "CAtBat"    "CHits"     "CHmRun"    "CRuns"     "CRBI"     
## [13] "CWalks"    "League"    "Division"  "PutOuts"   "Assists"   "Errors"   
## [19] "Salary"    "NewLeague"
dim(Hitters)
## [1] 322  20
#First we check for any missing data in the response, which is Salary. Then we will remove the NAs. This process is called listwise deletion.

# How many data points are missing for Salary?
sum(is.na(Hitters$Salary))
## [1] 59
# We are going to remove the NAs
Hitters<-na.omit(Hitters)
dim(Hitters)
## [1] 263  20
sum(is.na(Hitters$Salary))
## [1] 0
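# An equivalent idiom (shown as an aside, not used above) is complete.cases();
# after na.omit() no incomplete rows should remain.
sum(!complete.cases(Hitters))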
#Classic Variable Selection Approaches
#Forward Selection
#Backward Selection
#Mixed Selection
#Best Subset Selection
#In order to demonstrate how the algorithms work, we only consider a small subset of predictors.

#Candidate Predictors:
  
#CRBI: Career runs batted in
#Hits: Number of hits in 1986
#Runs: Number of runs in 1986
#Forward Selection Algorithm
#Start with nothing
#Pick the variable with the lowest residual sum of squares (or p-value)
#Add variables until a stopping rule is hit
#Let’s do this with our three variables.

#STEP 1: Use simple linear regression to select one variable to enter the model.

# Forward Selection
mod1<-lm(Salary~CRBI, data=Hitters)
summary(mod1)
## 
## Call:
## lm(formula = Salary ~ CRBI, data = Hitters)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1099.27  -203.45   -97.43   146.37  1847.22 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 274.58039   32.85537   8.357 3.85e-15 ***
## CRBI          0.79095    0.07113  11.120  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 372.3 on 261 degrees of freedom
## Multiple R-squared:  0.3215, Adjusted R-squared:  0.3189 
## F-statistic: 123.6 on 1 and 261 DF,  p-value: < 2.2e-16
mod2<-lm(Salary~Hits, data=Hitters)
summary(mod2)
## 
## Call:
## lm(formula = Salary ~ Hits, data = Hitters)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -893.99 -245.63  -59.08  181.12 2059.90 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  63.0488    64.9822   0.970    0.333    
## Hits          4.3854     0.5561   7.886 8.53e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 406.2 on 261 degrees of freedom
## Multiple R-squared:  0.1924, Adjusted R-squared:  0.1893 
## F-statistic: 62.19 on 1 and 261 DF,  p-value: 8.531e-14
mod3<-lm(Salary~Runs, data=Hitters)
summary(mod3)
## 
## Call:
## lm(formula = Salary ~ Runs, data = Hitters)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -714.8 -251.2  -55.9  193.6 1997.4 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 129.9292    59.9240   2.168    0.031 *  
## Runs          7.4161     0.9923   7.474 1.18e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 410.2 on 261 degrees of freedom
## Multiple R-squared:  0.1763, Adjusted R-squared:  0.1731 
## F-statistic: 55.86 on 1 and 261 DF,  p-value: 1.18e-12
# First: keep CRBI (smallest p-val)

#STEP 2: Look for a second variable to add to the model.
mod12<-lm(Salary~CRBI+Hits, data=Hitters)
summary(mod12)
## 
## Call:
## lm(formula = Salary ~ CRBI + Hits, data = Hitters)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -942.47 -183.72  -37.85   96.55 2167.16 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -47.95590   55.98245  -0.857    0.392    
## CRBI          0.68990    0.06723  10.262  < 2e-16 ***
## Hits          3.30084    0.48177   6.851 5.28e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 343.3 on 260 degrees of freedom
## Multiple R-squared:  0.4252, Adjusted R-squared:  0.4208 
## F-statistic: 96.17 on 2 and 260 DF,  p-value: < 2.2e-16
mod13<-lm(Salary~CRBI+Runs, data=Hitters)
summary(mod13)
## 
## Call:
## lm(formula = Salary ~ CRBI + Runs, data = Hitters)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1037.58  -194.51   -41.45   111.69  2125.83 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -3.40744   52.04504  -0.065    0.948    
## CRBI         0.70114    0.06737  10.408  < 2e-16 ***
## Runs         5.61989    0.85295   6.589 2.45e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 345.3 on 260 degrees of freedom
## Multiple R-squared:  0.4185, Adjusted R-squared:  0.4141 
## F-statistic: 93.57 on 2 and 260 DF,  p-value: < 2.2e-16
# Second step: keep Hits
#STEP 3: Look for a third variable, but… OH NO! It's not significant! So we will stop and only use the best model with two variables.

mod123<-lm(Salary~CRBI+Hits+Runs, data=Hitters)
summary(mod123)
## 
## Call:
## lm(formula = Salary ~ CRBI + Hits + Runs, data = Hitters)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -969.16 -186.72  -41.07  100.55 2166.47 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -46.24811   56.01270  -0.826   0.4098    
## CRBI          0.68948    0.06724  10.255   <2e-16 ***
## Hits          2.28191    1.14187   1.998   0.0467 *  
## Runs          1.97827    2.00996   0.984   0.3259    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 343.3 on 259 degrees of freedom
## Multiple R-squared:  0.4274, Adjusted R-squared:  0.4207 
## F-statistic: 64.43 on 3 and 259 DF,  p-value: < 2.2e-16
# STOP! Because adding Runs is not significant

Backward Selection Algorithm

#Start with the saturated model
#Remove the variable with the largest p-value
#Remove variables until a stopping rule is hit
#STEP 1: Start with a full model using all the variables. Take out any variables that are not significant.

# Backward
# Start with all vars and take away
mod123<-lm(Salary~CRBI+Hits+Runs, data=Hitters)
summary(mod123)
## 
## Call:
## lm(formula = Salary ~ CRBI + Hits + Runs, data = Hitters)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -969.16 -186.72  -41.07  100.55 2166.47 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -46.24811   56.01270  -0.826   0.4098    
## CRBI          0.68948    0.06724  10.255   <2e-16 ***
## Hits          2.28191    1.14187   1.998   0.0467 *  
## Runs          1.97827    2.00996   0.984   0.3259    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 343.3 on 259 degrees of freedom
## Multiple R-squared:  0.4274, Adjusted R-squared:  0.4207 
## F-statistic: 64.43 on 3 and 259 DF,  p-value: < 2.2e-16
# First: Remove Runs

#STEP 2: Continue taking out variables until only significant variables remain. We can stop with the two-variable model.

mod12<-lm(Salary~CRBI+Hits, data=Hitters)
summary(mod12)
## 
## Call:
## lm(formula = Salary ~ CRBI + Hits, data = Hitters)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -942.47 -183.72  -37.85   96.55 2167.16 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -47.95590   55.98245  -0.857    0.392    
## CRBI          0.68990    0.06723  10.262  < 2e-16 ***
## Hits          3.30084    0.48177   6.851 5.28e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 343.3 on 260 degrees of freedom
## Multiple R-squared:  0.4252, Adjusted R-squared:  0.4208 
## F-statistic: 96.17 on 2 and 260 DF,  p-value: < 2.2e-16
# STOP! Now everything is significant 

#Mixed Selection
#Start with no variables
#Add the variable with the best fit
#Continue adding variables, but if a variable's p-value becomes large as new variables are added, remove it from the model
#Continue going back and forth until all variables in the model have sufficiently low p-values
#(A regsubsets-based sketch of mixed selection appears after the backward-selection results below.)
#Ok, now for the shortcut!… We can use an R package to automate this.
#The leaps package in R can be used for variable selection. For some reason, there can be problems with installing this package, but if you specify the repository it will work! (See the example in my code.)

install.packages("leaps", repo = 'https://mac.R-project.org')
library(leaps)
#This package defaults to reporting the best models with one through eight variables included in the model.

#However, if you want to know all of the possibilities up to the full model with all the predictors, you can do that too using the nvmax argument.
#You must specify what type of selection you would like using the method argument.

# Forward 
regfit.fwd<-regsubsets(Salary~., data=Hitters,
                       method="forward")
summary(regfit.fwd)
## Subset selection object
## Call: regsubsets.formula(Salary ~ ., data = Hitters, method = "forward")
## 19 Variables  (and intercept)
##            Forced in Forced out
## AtBat          FALSE      FALSE
## Hits           FALSE      FALSE
## HmRun          FALSE      FALSE
## Runs           FALSE      FALSE
## RBI            FALSE      FALSE
## Walks          FALSE      FALSE
## Years          FALSE      FALSE
## CAtBat         FALSE      FALSE
## CHits          FALSE      FALSE
## CHmRun         FALSE      FALSE
## CRuns          FALSE      FALSE
## CRBI           FALSE      FALSE
## CWalks         FALSE      FALSE
## LeagueN        FALSE      FALSE
## DivisionW      FALSE      FALSE
## PutOuts        FALSE      FALSE
## Assists        FALSE      FALSE
## Errors         FALSE      FALSE
## NewLeagueN     FALSE      FALSE
## 1 subsets of each size up to 8
## Selection Algorithm: forward
##          AtBat Hits HmRun Runs RBI Walks Years CAtBat CHits CHmRun CRuns CRBI
## 1  ( 1 ) " "   " "  " "   " "  " " " "   " "   " "    " "   " "    " "   "*" 
## 2  ( 1 ) " "   "*"  " "   " "  " " " "   " "   " "    " "   " "    " "   "*" 
## 3  ( 1 ) " "   "*"  " "   " "  " " " "   " "   " "    " "   " "    " "   "*" 
## 4  ( 1 ) " "   "*"  " "   " "  " " " "   " "   " "    " "   " "    " "   "*" 
## 5  ( 1 ) "*"   "*"  " "   " "  " " " "   " "   " "    " "   " "    " "   "*" 
## 6  ( 1 ) "*"   "*"  " "   " "  " " "*"   " "   " "    " "   " "    " "   "*" 
## 7  ( 1 ) "*"   "*"  " "   " "  " " "*"   " "   " "    " "   " "    " "   "*" 
## 8  ( 1 ) "*"   "*"  " "   " "  " " "*"   " "   " "    " "   " "    "*"   "*" 
##          CWalks LeagueN DivisionW PutOuts Assists Errors NewLeagueN
## 1  ( 1 ) " "    " "     " "       " "     " "     " "    " "       
## 2  ( 1 ) " "    " "     " "       " "     " "     " "    " "       
## 3  ( 1 ) " "    " "     " "       "*"     " "     " "    " "       
## 4  ( 1 ) " "    " "     "*"       "*"     " "     " "    " "       
## 5  ( 1 ) " "    " "     "*"       "*"     " "     " "    " "       
## 6  ( 1 ) " "    " "     "*"       "*"     " "     " "    " "       
## 7  ( 1 ) "*"    " "     "*"       "*"     " "     " "    " "       
## 8  ( 1 ) "*"    " "     "*"       "*"     " "     " "    " "
# Backward 
regfit.bwd<-regsubsets(Salary~., data=Hitters, 
                       method="backward")
summary(regfit.bwd)
## Subset selection object
## Call: regsubsets.formula(Salary ~ ., data = Hitters, method = "backward")
## 19 Variables  (and intercept)
##            Forced in Forced out
## AtBat          FALSE      FALSE
## Hits           FALSE      FALSE
## HmRun          FALSE      FALSE
## Runs           FALSE      FALSE
## RBI            FALSE      FALSE
## Walks          FALSE      FALSE
## Years          FALSE      FALSE
## CAtBat         FALSE      FALSE
## CHits          FALSE      FALSE
## CHmRun         FALSE      FALSE
## CRuns          FALSE      FALSE
## CRBI           FALSE      FALSE
## CWalks         FALSE      FALSE
## LeagueN        FALSE      FALSE
## DivisionW      FALSE      FALSE
## PutOuts        FALSE      FALSE
## Assists        FALSE      FALSE
## Errors         FALSE      FALSE
## NewLeagueN     FALSE      FALSE
## 1 subsets of each size up to 8
## Selection Algorithm: backward
##          AtBat Hits HmRun Runs RBI Walks Years CAtBat CHits CHmRun CRuns CRBI
## 1  ( 1 ) " "   " "  " "   " "  " " " "   " "   " "    " "   " "    "*"   " " 
## 2  ( 1 ) " "   "*"  " "   " "  " " " "   " "   " "    " "   " "    "*"   " " 
## 3  ( 1 ) " "   "*"  " "   " "  " " " "   " "   " "    " "   " "    "*"   " " 
## 4  ( 1 ) "*"   "*"  " "   " "  " " " "   " "   " "    " "   " "    "*"   " " 
## 5  ( 1 ) "*"   "*"  " "   " "  " " "*"   " "   " "    " "   " "    "*"   " " 
## 6  ( 1 ) "*"   "*"  " "   " "  " " "*"   " "   " "    " "   " "    "*"   " " 
## 7  ( 1 ) "*"   "*"  " "   " "  " " "*"   " "   " "    " "   " "    "*"   " " 
## 8  ( 1 ) "*"   "*"  " "   " "  " " "*"   " "   " "    " "   " "    "*"   "*" 
##          CWalks LeagueN DivisionW PutOuts Assists Errors NewLeagueN
## 1  ( 1 ) " "    " "     " "       " "     " "     " "    " "       
## 2  ( 1 ) " "    " "     " "       " "     " "     " "    " "       
## 3  ( 1 ) " "    " "     " "       "*"     " "     " "    " "       
## 4  ( 1 ) " "    " "     " "       "*"     " "     " "    " "       
## 5  ( 1 ) " "    " "     " "       "*"     " "     " "    " "       
## 6  ( 1 ) " "    " "     "*"       "*"     " "     " "    " "       
## 7  ( 1 ) "*"    " "     "*"       "*"     " "     " "    " "       
## 8  ( 1 ) "*"    " "     "*"       "*"     " "     " "    " "
#Best Subset Selection Algorithm
#Fit all the models for each specified number of predictors
#Select the best model by a chosen criterion (p-value, adjusted R-squared, Cp, BIC, AIC, …)
# Best Subset 
regfit.full<-regsubsets(Salary~., data=Hitters)
summary(regfit.full)
## Subset selection object
## Call: regsubsets.formula(Salary ~ ., data = Hitters)
## 19 Variables  (and intercept)
##            Forced in Forced out
## AtBat          FALSE      FALSE
## Hits           FALSE      FALSE
## HmRun          FALSE      FALSE
## Runs           FALSE      FALSE
## RBI            FALSE      FALSE
## Walks          FALSE      FALSE
## Years          FALSE      FALSE
## CAtBat         FALSE      FALSE
## CHits          FALSE      FALSE
## CHmRun         FALSE      FALSE
## CRuns          FALSE      FALSE
## CRBI           FALSE      FALSE
## CWalks         FALSE      FALSE
## LeagueN        FALSE      FALSE
## DivisionW      FALSE      FALSE
## PutOuts        FALSE      FALSE
## Assists        FALSE      FALSE
## Errors         FALSE      FALSE
## NewLeagueN     FALSE      FALSE
## 1 subsets of each size up to 8
## Selection Algorithm: exhaustive
##          AtBat Hits HmRun Runs RBI Walks Years CAtBat CHits CHmRun CRuns CRBI
## 1  ( 1 ) " "   " "  " "   " "  " " " "   " "   " "    " "   " "    " "   "*" 
## 2  ( 1 ) " "   "*"  " "   " "  " " " "   " "   " "    " "   " "    " "   "*" 
## 3  ( 1 ) " "   "*"  " "   " "  " " " "   " "   " "    " "   " "    " "   "*" 
## 4  ( 1 ) " "   "*"  " "   " "  " " " "   " "   " "    " "   " "    " "   "*" 
## 5  ( 1 ) "*"   "*"  " "   " "  " " " "   " "   " "    " "   " "    " "   "*" 
## 6  ( 1 ) "*"   "*"  " "   " "  " " "*"   " "   " "    " "   " "    " "   "*" 
## 7  ( 1 ) " "   "*"  " "   " "  " " "*"   " "   "*"    "*"   "*"    " "   " " 
## 8  ( 1 ) "*"   "*"  " "   " "  " " "*"   " "   " "    " "   "*"    "*"   " " 
##          CWalks LeagueN DivisionW PutOuts Assists Errors NewLeagueN
## 1  ( 1 ) " "    " "     " "       " "     " "     " "    " "       
## 2  ( 1 ) " "    " "     " "       " "     " "     " "    " "       
## 3  ( 1 ) " "    " "     " "       "*"     " "     " "    " "       
## 4  ( 1 ) " "    " "     "*"       "*"     " "     " "    " "       
## 5  ( 1 ) " "    " "     "*"       "*"     " "     " "    " "       
## 6  ( 1 ) " "    " "     "*"       "*"     " "     " "    " "       
## 7  ( 1 ) " "    " "     "*"       "*"     " "     " "    " "       
## 8  ( 1 ) "*"    " "     "*"       "*"     " "     " "    " "
regfit.full<-regsubsets(Salary~., data=Hitters, nvmax=19)
reg.summary<-summary(regfit.full)
reg.summary
## Subset selection object
## Call: regsubsets.formula(Salary ~ ., data = Hitters, nvmax = 19)
## 19 Variables  (and intercept)
##            Forced in Forced out
## AtBat          FALSE      FALSE
## Hits           FALSE      FALSE
## HmRun          FALSE      FALSE
## Runs           FALSE      FALSE
## RBI            FALSE      FALSE
## Walks          FALSE      FALSE
## Years          FALSE      FALSE
## CAtBat         FALSE      FALSE
## CHits          FALSE      FALSE
## CHmRun         FALSE      FALSE
## CRuns          FALSE      FALSE
## CRBI           FALSE      FALSE
## CWalks         FALSE      FALSE
## LeagueN        FALSE      FALSE
## DivisionW      FALSE      FALSE
## PutOuts        FALSE      FALSE
## Assists        FALSE      FALSE
## Errors         FALSE      FALSE
## NewLeagueN     FALSE      FALSE
## 1 subsets of each size up to 19
## Selection Algorithm: exhaustive
##           AtBat Hits HmRun Runs RBI Walks Years CAtBat CHits CHmRun CRuns CRBI
## 1  ( 1 )  " "   " "  " "   " "  " " " "   " "   " "    " "   " "    " "   "*" 
## 2  ( 1 )  " "   "*"  " "   " "  " " " "   " "   " "    " "   " "    " "   "*" 
## 3  ( 1 )  " "   "*"  " "   " "  " " " "   " "   " "    " "   " "    " "   "*" 
## 4  ( 1 )  " "   "*"  " "   " "  " " " "   " "   " "    " "   " "    " "   "*" 
## 5  ( 1 )  "*"   "*"  " "   " "  " " " "   " "   " "    " "   " "    " "   "*" 
## 6  ( 1 )  "*"   "*"  " "   " "  " " "*"   " "   " "    " "   " "    " "   "*" 
## 7  ( 1 )  " "   "*"  " "   " "  " " "*"   " "   "*"    "*"   "*"    " "   " " 
## 8  ( 1 )  "*"   "*"  " "   " "  " " "*"   " "   " "    " "   "*"    "*"   " " 
## 9  ( 1 )  "*"   "*"  " "   " "  " " "*"   " "   "*"    " "   " "    "*"   "*" 
## 10  ( 1 ) "*"   "*"  " "   " "  " " "*"   " "   "*"    " "   " "    "*"   "*" 
## 11  ( 1 ) "*"   "*"  " "   " "  " " "*"   " "   "*"    " "   " "    "*"   "*" 
## 12  ( 1 ) "*"   "*"  " "   "*"  " " "*"   " "   "*"    " "   " "    "*"   "*" 
## 13  ( 1 ) "*"   "*"  " "   "*"  " " "*"   " "   "*"    " "   " "    "*"   "*" 
## 14  ( 1 ) "*"   "*"  "*"   "*"  " " "*"   " "   "*"    " "   " "    "*"   "*" 
## 15  ( 1 ) "*"   "*"  "*"   "*"  " " "*"   " "   "*"    "*"   " "    "*"   "*" 
## 16  ( 1 ) "*"   "*"  "*"   "*"  "*" "*"   " "   "*"    "*"   " "    "*"   "*" 
## 17  ( 1 ) "*"   "*"  "*"   "*"  "*" "*"   " "   "*"    "*"   " "    "*"   "*" 
## 18  ( 1 ) "*"   "*"  "*"   "*"  "*" "*"   "*"   "*"    "*"   " "    "*"   "*" 
## 19  ( 1 ) "*"   "*"  "*"   "*"  "*" "*"   "*"   "*"    "*"   "*"    "*"   "*" 
##           CWalks LeagueN DivisionW PutOuts Assists Errors NewLeagueN
## 1  ( 1 )  " "    " "     " "       " "     " "     " "    " "       
## 2  ( 1 )  " "    " "     " "       " "     " "     " "    " "       
## 3  ( 1 )  " "    " "     " "       "*"     " "     " "    " "       
## 4  ( 1 )  " "    " "     "*"       "*"     " "     " "    " "       
## 5  ( 1 )  " "    " "     "*"       "*"     " "     " "    " "       
## 6  ( 1 )  " "    " "     "*"       "*"     " "     " "    " "       
## 7  ( 1 )  " "    " "     "*"       "*"     " "     " "    " "       
## 8  ( 1 )  "*"    " "     "*"       "*"     " "     " "    " "       
## 9  ( 1 )  "*"    " "     "*"       "*"     " "     " "    " "       
## 10  ( 1 ) "*"    " "     "*"       "*"     "*"     " "    " "       
## 11  ( 1 ) "*"    "*"     "*"       "*"     "*"     " "    " "       
## 12  ( 1 ) "*"    "*"     "*"       "*"     "*"     " "    " "       
## 13  ( 1 ) "*"    "*"     "*"       "*"     "*"     "*"    " "       
## 14  ( 1 ) "*"    "*"     "*"       "*"     "*"     "*"    " "       
## 15  ( 1 ) "*"    "*"     "*"       "*"     "*"     "*"    " "       
## 16  ( 1 ) "*"    "*"     "*"       "*"     "*"     "*"    " "       
## 17  ( 1 ) "*"    "*"     "*"       "*"     "*"     "*"    "*"       
## 18  ( 1 ) "*"    "*"     "*"       "*"     "*"     "*"    "*"       
## 19  ( 1 ) "*"    "*"     "*"       "*"     "*"     "*"    "*"

Model Comparison Metrics

#Look at the variables contained in the regsubsets object. We can learn a lot about the different model comparison metrics.

#R-squared
#R-squared Adjusted
#BIC
#Mallows Cp
#The goal of these model comparison metrics is to find the "best" model, one that balances model fit against simplicity/interpretability.

names(reg.summary)
## [1] "which"  "rsq"    "rss"    "adjr2"  "cp"     "bic"    "outmat" "obj"
head(reg.summary)
## $which
##    (Intercept) AtBat  Hits HmRun  Runs   RBI Walks Years CAtBat CHits CHmRun
## 1         TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  FALSE FALSE  FALSE
## 2         TRUE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE  FALSE FALSE  FALSE
## 3         TRUE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE  FALSE FALSE  FALSE
## 4         TRUE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE  FALSE FALSE  FALSE
## 5         TRUE  TRUE  TRUE FALSE FALSE FALSE FALSE FALSE  FALSE FALSE  FALSE
## 6         TRUE  TRUE  TRUE FALSE FALSE FALSE  TRUE FALSE  FALSE FALSE  FALSE
## 7         TRUE FALSE  TRUE FALSE FALSE FALSE  TRUE FALSE   TRUE  TRUE   TRUE
## 8         TRUE  TRUE  TRUE FALSE FALSE FALSE  TRUE FALSE  FALSE FALSE   TRUE
## 9         TRUE  TRUE  TRUE FALSE FALSE FALSE  TRUE FALSE   TRUE FALSE  FALSE
## 10        TRUE  TRUE  TRUE FALSE FALSE FALSE  TRUE FALSE   TRUE FALSE  FALSE
## 11        TRUE  TRUE  TRUE FALSE FALSE FALSE  TRUE FALSE   TRUE FALSE  FALSE
## 12        TRUE  TRUE  TRUE FALSE  TRUE FALSE  TRUE FALSE   TRUE FALSE  FALSE
## 13        TRUE  TRUE  TRUE FALSE  TRUE FALSE  TRUE FALSE   TRUE FALSE  FALSE
## 14        TRUE  TRUE  TRUE  TRUE  TRUE FALSE  TRUE FALSE   TRUE FALSE  FALSE
## 15        TRUE  TRUE  TRUE  TRUE  TRUE FALSE  TRUE FALSE   TRUE  TRUE  FALSE
## 16        TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE   TRUE  TRUE  FALSE
## 17        TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE   TRUE  TRUE  FALSE
## 18        TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE   TRUE  TRUE  FALSE
## 19        TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE   TRUE  TRUE   TRUE
##    CRuns  CRBI CWalks LeagueN DivisionW PutOuts Assists Errors NewLeagueN
## 1  FALSE  TRUE  FALSE   FALSE     FALSE   FALSE   FALSE  FALSE      FALSE
## 2  FALSE  TRUE  FALSE   FALSE     FALSE   FALSE   FALSE  FALSE      FALSE
## 3  FALSE  TRUE  FALSE   FALSE     FALSE    TRUE   FALSE  FALSE      FALSE
## 4  FALSE  TRUE  FALSE   FALSE      TRUE    TRUE   FALSE  FALSE      FALSE
## 5  FALSE  TRUE  FALSE   FALSE      TRUE    TRUE   FALSE  FALSE      FALSE
## 6  FALSE  TRUE  FALSE   FALSE      TRUE    TRUE   FALSE  FALSE      FALSE
## 7  FALSE FALSE  FALSE   FALSE      TRUE    TRUE   FALSE  FALSE      FALSE
## 8   TRUE FALSE   TRUE   FALSE      TRUE    TRUE   FALSE  FALSE      FALSE
## 9   TRUE  TRUE   TRUE   FALSE      TRUE    TRUE   FALSE  FALSE      FALSE
## 10  TRUE  TRUE   TRUE   FALSE      TRUE    TRUE    TRUE  FALSE      FALSE
## 11  TRUE  TRUE   TRUE    TRUE      TRUE    TRUE    TRUE  FALSE      FALSE
## 12  TRUE  TRUE   TRUE    TRUE      TRUE    TRUE    TRUE  FALSE      FALSE
## 13  TRUE  TRUE   TRUE    TRUE      TRUE    TRUE    TRUE   TRUE      FALSE
## 14  TRUE  TRUE   TRUE    TRUE      TRUE    TRUE    TRUE   TRUE      FALSE
## 15  TRUE  TRUE   TRUE    TRUE      TRUE    TRUE    TRUE   TRUE      FALSE
## 16  TRUE  TRUE   TRUE    TRUE      TRUE    TRUE    TRUE   TRUE      FALSE
## 17  TRUE  TRUE   TRUE    TRUE      TRUE    TRUE    TRUE   TRUE       TRUE
## 18  TRUE  TRUE   TRUE    TRUE      TRUE    TRUE    TRUE   TRUE       TRUE
## 19  TRUE  TRUE   TRUE    TRUE      TRUE    TRUE    TRUE   TRUE       TRUE
## 
## $rsq
##  [1] 0.3214501 0.4252237 0.4514294 0.4754067 0.4908036 0.5087146 0.5141227
##  [8] 0.5285569 0.5346124 0.5404950 0.5426153 0.5436302 0.5444570 0.5452164
## [15] 0.5454692 0.5457656 0.5459518 0.5460945 0.5461159
## 
## $rss
##  [1] 36179679 30646560 29249297 27970852 27149899 26194904 25906548 25136930
##  [9] 24814051 24500402 24387345 24333232 24289148 24248660 24235177 24219377
## [17] 24209447 24201837 24200700
## 
## $adjr2
##  [1] 0.3188503 0.4208024 0.4450753 0.4672734 0.4808971 0.4972001 0.5007849
##  [8] 0.5137083 0.5180572 0.5222606 0.5225706 0.5217245 0.5206736 0.5195431
## [15] 0.5178661 0.5162219 0.5144464 0.5126097 0.5106270
## 
## $cp
##  [1] 104.281319  50.723090  38.693127  27.856220  21.613011  14.023870
##  [7]  13.128474   7.400719   6.158685   5.009317   5.874113   7.330766
## [13]   8.888112  10.481576  12.346193  14.187546  16.087831  18.011425
## [19]  20.000000
## 
## $bic
##  [1]  -90.84637 -128.92622 -135.62693 -141.80892 -144.07143 -147.91690
##  [7] -145.25594 -147.61525 -145.44316 -143.21651 -138.86077 -133.87283
## [13] -128.77759 -123.64420 -118.21832 -112.81768 -107.35339 -101.86391
## [19]  -96.30412
#R-squared
#Although the R-squared metric is useful in simple linear regression, it has some drawbacks for multiple linear regression.
#R-squared monotonically increases as we add more and more variables to the model, which is not good because it does not account for the increased complexity.
#We want to maximize this value!

# R-squared (monotonically increasing)
library(ggplot2)
reg.summary$rsq
##  [1] 0.3214501 0.4252237 0.4514294 0.4754067 0.4908036 0.5087146 0.5141227
##  [8] 0.5285569 0.5346124 0.5404950 0.5426153 0.5436302 0.5444570 0.5452164
## [15] 0.5454692 0.5457656 0.5459518 0.5460945 0.5461159
rsq_df<-data.frame(rsq=reg.summary$rsq,
                   size=1:19)

ggplot(rsq_df, aes(x=size, y=rsq))+
  geom_point()+
  geom_line()+
  ggtitle("R-squared")

#R-squared Adjusted
#This is similar to R-squared but adds a penalty for the number of predictors, p. We want to maximize this value! (A hand computation of the penalty follows.)
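# The penalty can be checked by hand (a sketch, not in the original code):
# adjusted R^2 = 1 - (1 - R^2) * (n - 1) / (n - p - 1), here with n = 263
# observations and p = 2 predictors.
n <- nrow(Hitters)
1 - (1 - reg.summary$rsq[2]) * (n - 1) / (n - 2 - 1)   # matches reg.summary$adjr2[2]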

adjrsq_df<-data.frame(rsq=c(reg.summary$rsq,reg.summary$adjr2),
                      type=c(rep("rsq",19),rep("rsq_adj",19)),
                      size=rep(1:19,2))

ggplot(adjrsq_df, aes(x=size, y=rsq, color=type))+
  geom_point()+
  geom_line()+
  ggtitle("R-squared vs Adj R-squared")

#Mallows Cp and BIC
#Unlike the R-squared values, we want to minimize these metrics:

#Note that BIC gives a harsher penalty and thereby picks smaller models.
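# Mallows Cp can likewise be reproduced by hand (a sketch, not in the
# original code), using the full model's error variance estimate:
# Cp = RSS_p / sigma2_hat - n + 2 * (p + 1).
sigma2_hat <- reg.summary$rss[19] / (nrow(Hitters) - 19 - 1)
reg.summary$rss[1] / sigma2_hat - nrow(Hitters) + 2 * (1 + 1)   # ~104.3, matches reg.summary$cp[1]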
# Let's look at the other metrics
metric_df<-data.frame(metric=c(reg.summary$rsq,
                               reg.summary$adjr2,
                               reg.summary$cp, 
                               reg.summary$bic),
                      type=c(rep("rsq",19),
                             rep("rsq_adj",19),
                             rep("cp",19),
                             rep("bic",19)),
                      size=rep(1:19,4))

# Add vertical lines to show the best model
which.min(reg.summary$bic)
## [1] 6
which.max(reg.summary$rsq)
## [1] 19
which.max(reg.summary$adjr2)
## [1] 11
# Best size under each metric, in the facets' alphabetical order
# (bic = 6, cp = 10, rsq = 19, rsq_adj = 11; which.min(reg.summary$cp) is 10)
vline.dat <- data.frame(type=c("bic", "cp", "rsq", "rsq_adj"), vl=c(6, 10, 19, 11))


ggplot(metric_df, aes(x=size, y=metric, color=type))+
  geom_point()+
  geom_line()+
  ggtitle("Model Comparison Metrics")+
  geom_vline(aes(xintercept=vl), data=vline.dat, lty=2) +
  facet_grid(type~., scales = "free_y")
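# leaps also ships a built-in plot method for regsubsets objects (an aside,
# not in the original code): each row is a model, and the top row marks the
# variables in the best model under the chosen scale.
plot(regfit.full, scale = "bic")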

Critical Thinking

#MLR Model Extensions
#Numeric Interactions
#We have looked at the Boston dataset to explore polynomial regression. We are going to use this dataset for Part I. You will need to load it into this lab session from the MASS library.

library(dplyr)
library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:Ecdat':
## 
##     SP500
## The following object is masked from 'package:dplyr':
## 
##     select
library(ISLR)

names(Boston)
##  [1] "crim"    "zn"      "indus"   "chas"    "nox"     "rm"      "age"    
##  [8] "dis"     "rad"     "tax"     "ptratio" "black"   "lstat"   "medv"
head(Boston)
##      crim zn indus chas   nox    rm  age    dis rad tax ptratio  black lstat
## 1 0.00632 18  2.31    0 0.538 6.575 65.2 4.0900   1 296    15.3 396.90  4.98
## 2 0.02731  0  7.07    0 0.469 6.421 78.9 4.9671   2 242    17.8 396.90  9.14
## 3 0.02729  0  7.07    0 0.469 7.185 61.1 4.9671   2 242    17.8 392.83  4.03
## 4 0.03237  0  2.18    0 0.458 6.998 45.8 6.0622   3 222    18.7 394.63  2.94
## 5 0.06905  0  2.18    0 0.458 7.147 54.2 6.0622   3 222    18.7 396.90  5.33
## 6 0.02985  0  2.18    0 0.458 6.430 58.7 6.0622   3 222    18.7 394.12  5.21
##   medv
## 1 24.0
## 2 21.6
## 3 34.7
## 4 33.4
## 5 36.2
## 6 28.7
#(Week 6) We looked at median value as a function of lower status (lstat) and the proportion of older homes (age).
#When we fit a model with just the additive effects of lstat and age, these are known as the main effects.

modInt0<-lm(medv~lstat+age, data=Boston)
summary(modInt0)
## 
## Call:
## lm(formula = medv ~ lstat + age, data = Boston)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -15.981  -3.978  -1.283   1.968  23.158 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 33.22276    0.73085  45.458  < 2e-16 ***
## lstat       -1.03207    0.04819 -21.416  < 2e-16 ***
## age          0.03454    0.01223   2.826  0.00491 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.173 on 503 degrees of freedom
## Multiple R-squared:  0.5513, Adjusted R-squared:  0.5495 
## F-statistic:   309 on 2 and 503 DF,  p-value: < 2.2e-16
#However, there could be a relationship between lstat and age. We will now explore their interaction.

#A colon can be used to create a model with only that specified interaction. An asterisk can be used to create a model with all levels of interactions and the main effects (see the equivalence sketch below).
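# A quick equivalence sketch (not in the original code): the asterisk
# expands to the main effects plus the interaction, so these two formulas
# fit the same model.
all.equal(coef(lm(medv ~ lstat * age, data = Boston)),
          coef(lm(medv ~ lstat + age + lstat:age, data = Boston)))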


#Exercise 1: Interaction
#First, create a model with only the lstat and age interaction.

modInt1<-lm(medv~lstat:age, data=Boston)
summary(modInt1)
## 
## Call:
## lm(formula = medv ~ lstat:age, data = Boston)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -13.347  -4.372  -1.534   1.914  27.193 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 30.1588631  0.4828240   62.46   <2e-16 ***
## lstat:age   -0.0077146  0.0003799  -20.31   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.827 on 504 degrees of freedom
## Multiple R-squared:  0.4501, Adjusted R-squared:  0.449 
## F-statistic: 412.4 on 1 and 504 DF,  p-value: < 2.2e-16
#Exercise 2: Full Model
#Now create a model with all possible interactions between lstat and age.

modInt2<-lm(medv~lstat*age, data=Boston)
summary(modInt2)
## 
## Call:
## lm(formula = medv ~ lstat * age, data = Boston)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -15.806  -4.045  -1.333   2.085  27.552 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 36.0885359  1.4698355  24.553  < 2e-16 ***
## lstat       -1.3921168  0.1674555  -8.313 8.78e-16 ***
## age         -0.0007209  0.0198792  -0.036   0.9711    
## lstat:age    0.0041560  0.0018518   2.244   0.0252 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.149 on 502 degrees of freedom
## Multiple R-squared:  0.5557, Adjusted R-squared:  0.5531 
## F-statistic: 209.3 on 3 and 502 DF,  p-value: < 2.2e-16
#Part II: Multicollinearity
#Let's change datasets again! We'll be using the Credit data set in the ISLR package.
#This is another simulated dataset, describing 400 credit card customers.
#For this exercise we're only interested in four variables: balance, age, limit, and rating.

data(Credit)
names(Credit)
##  [1] "ID"        "Income"    "Limit"     "Rating"    "Cards"     "Age"      
##  [7] "Education" "Gender"    "Student"   "Married"   "Ethnicity" "Balance"
creditTrim<-data.frame(balance=Credit$Balance, 
                       limit=Credit$Limit, 
                       rating=Credit$Rating, 
                       age=Credit$Age)

#Multicollinearity occurs when there are linear relationships between explanatory variables. So let's look at a pairs plot and also the correlation matrix.

pairs(creditTrim)

cor(creditTrim)
##             balance     limit    rating         age
## balance 1.000000000 0.8616973 0.8636252 0.001835119
## limit   0.861697267 1.0000000 0.9968797 0.100887922
## rating  0.863625161 0.9968797 1.0000000 0.103164996
## age     0.001835119 0.1008879 0.1031650 1.000000000
#This causes the variances of our coefficient estimates to be overestimated, thereby decreasing the respective test statistics and causing larger p-values.

#Consider the two models:
  
credMod1<-lm(balance~age+limit, creditTrim)
summary(credMod1)
## 
## Call:
## lm(formula = balance ~ age + limit, data = creditTrim)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -696.84 -150.78  -13.01  126.68  755.56 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -1.734e+02  4.383e+01  -3.957 9.01e-05 ***
## age         -2.291e+00  6.725e-01  -3.407 0.000723 ***
## limit        1.734e-01  5.026e-03  34.496  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 230.5 on 397 degrees of freedom
## Multiple R-squared:  0.7498, Adjusted R-squared:  0.7486 
## F-statistic:   595 on 2 and 397 DF,  p-value: < 2.2e-16
credMod2<-lm(balance~rating+limit, creditTrim)
summary(credMod2)
## 
## Call:
## lm(formula = balance ~ rating + limit, data = creditTrim)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -707.8 -135.9   -9.5  124.0  817.6 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -377.53680   45.25418  -8.343 1.21e-15 ***
## rating         2.20167    0.95229   2.312   0.0213 *  
## limit          0.02451    0.06383   0.384   0.7012    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 232.3 on 397 degrees of freedom
## Multiple R-squared:  0.7459, Adjusted R-squared:  0.7447 
## F-statistic: 582.8 on 2 and 397 DF,  p-value: < 2.2e-16
#A way to quantify the impact of multicollinearity is to look at the VIF (variance inflation factor). 
#A VIF close to 1 would imply no correlation. The larger the VIF, the larger the amount of multicollinearity.


library(DAAG)
## Loading required package: lattice
## 
## Attaching package: 'DAAG'
## The following object is masked from 'package:MASS':
## 
##     hills
vif(lm(balance~age+rating+limit, creditTrim))
##      age   rating    limit 
##   1.0114 160.6700 160.5900
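# The VIF can be reproduced by hand (a sketch, not in the original code):
# regress one predictor on the others and apply VIF = 1 / (1 - R^2).
r2_limit <- summary(lm(limit ~ age + rating, data = creditTrim))$r.squared
1 / (1 - r2_limit)   # ~160, in line with the vif() output above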