RMD2.R

#RMD 2

#Model Building and Selection
#There are two methods to decide what predictors to use in a model
`2019` <- read.csv("~/Downloads/data_v2 2/yearly/2019.csv")
WRs <- subset(`2019`, Pos == "WR")
attach(WRs)

#Forward Selection:
#Being with the grand mean model
currentmod <- lm(FantasyPoints~1)

#Now consider all of the following one-predictor models
mod1 <- lm(FantasyPoints~Rec)
mod2 <- lm(FantasyPoints~ReceivingTD)
mod3 <- lm(FantasyPoints~ReceivingYds)
mod4 <- lm(FantasyPoints~Fumbles)
mod5 <- lm(FantasyPoints~Tgt)

#We now need to compare each model to the current model
#We are searching for the p values to see if the model is significant in comparison to our grand mean model
anova(currentmod,mod1)

## Analysis of Variance Table
## 
## Model 1: FantasyPoints ~ 1
## Model 2: FantasyPoints ~ Rec
##   Res.Df     RSS Df Sum of Sq      F    Pr(>F)    
## 1    216 1392828                                  
## 2    215   58561  1   1334267 4898.6 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

#Isolate the p values
pvals <- c(anova(currentmod,mod1)[6], + anova(currentmod,mod2)[6], + anova(currentmod,mod3)[6], + anova(currentmod,mod4)[6], + anova(currentmod,mod5)[6])
pvals

## $`Pr(>F)`
## [1]            NA 6.218387e-150
## 
## $`Pr(>F)`
## [1]           NA 3.709489e-79
## 
## $`Pr(>F)`
## [1]            NA 5.127645e-190
## 
## $`Pr(>F)`
## [1]        NA 0.0101707
## 
## $`Pr(>F)`
## [1]            NA 3.387067e-145

#We can see that all of our models are very significant except mod4. We can also see that our best model (smallest Pval) is mod3
currentmod <- mod3

#Now that we have completed one iteration of forward selection, we consider adding another predictor
mod6 <- lm(FantasyPoints~ReceivingYds+Rec)
mod7 <- lm(FantasyPoints~ReceivingYds+ReceivingTD)
mod8 <- lm(FantasyPoints~ReceivingYds+Fumbles)
mod9 <- lm(FantasyPoints~ReceivingYds+Tgt)

pvals1 <- c(anova(currentmod,mod6)[6], + anova(currentmod,mod7)[6], + anova(currentmod,mod8)[6], + anova(currentmod,mod9)[6])
pvals1

## $`Pr(>F)`
## [1]           NA 1.115432e-23
## 
## $`Pr(>F)`
## [1]           NA 4.169256e-24
## 
## $`Pr(>F)`
## [1]        NA 0.3853764
## 
## $`Pr(>F)`
## [1]           NA 2.478845e-13

#Our next best predictor is model 7 which includes Recieving TDs
#We should see if we need to include any more predictors

currentmod <- mod7
moda <- lm(FantasyPoints~ReceivingYds+ReceivingTD+Rec)
modb <- lm(FantasyPoints~ReceivingTD+ReceivingYds+Tgt)

pvals2 <- c(anova(currentmod,moda)[6], + anova(currentmod,modb)[6])
pvals2

## $`Pr(>F)`
## [1]           NA 9.399293e-80
## 
## $`Pr(>F)`
## [1]           NA 3.806118e-38

#We would then proceed to add Receptions and then targets. We stop adding when the new model ceases to be significant or we run out of predictors

#We can also use AIC and BIC to evaluate models. We consider a drop of 10 to be a significant change
AIC(mod1)

## [1] 1836.571

AIC(mod3)

## [1] 1650.286

AIC(mod7)

## [1] 1548.047

AIC(moda)

## [1] 1184.998

#We can clearly see that each model is significantly better than the model before
#and similarly with BIC

BIC(mod1)

## [1] 1846.711

BIC(mod3)

## [1] 1660.426

BIC(mod7)

## [1] 1561.567

BIC(moda)

## [1] 1201.897

#Another way to select predictors is by running stepAIC
library(MASS)
forwardmod <- stepAIC(moda, direction = "forward")

## Start:  AIC=567.18
## FantasyPoints ~ ReceivingYds + ReceivingTD + Rec

coef(forwardmod)

##  (Intercept) ReceivingYds  ReceivingTD          Rec 
##   0.21184195   0.09854928   5.82983044   1.05879999

#Or backwards
backwardmod <- stepAIC(moda, direction = "backward")

## Start:  AIC=567.18
## FantasyPoints ~ ReceivingYds + ReceivingTD + Rec
## 
##                Df Sum of Sq     RSS    AIC
## <none>                       2854.8 567.18
## - Rec           1     12497 15352.2 930.23
## - ReceivingTD   1     12638 15493.0 932.21
## - ReceivingYds  1     14765 17619.8 960.12

coef(backwardmod)

##  (Intercept) ReceivingYds  ReceivingTD          Rec 
##   0.21184195   0.09854928   5.82983044   1.05879999

#Distributions used in testing
#Chi Squared - used when comparing 2 categorical variables
chisq.test(Rec,ReceivingYds)

## Warning in chisq.test(Rec, ReceivingYds): Chi-squared approximation may be
## incorrect

## 
##  Pearson's Chi-squared test
## 
## data:  Rec and ReceivingYds
## X-squared = 15089, df = 13838, p-value = 1.351e-13

#Look for the test stat, df, and pvalue
#Remember H0 is that there is a relationship between variables

RMD2.R

nicholastourtillott

2021-02-25