Week 3

#Homework: https://www.dropbox.com/s/ds9fobjxrcawljn/LikelihoodHW.pdf?dl=0
#Notes from Thursday on Likelihood: https://www.dropbox.com/s/wdvpuxleqo5djd8/Transformation.pdf?dl=0
#Model Selection Notes: https://www.dropbox.com/s/wdvpuxleqo5djd8/Transformation.pdf?dl=0

library(MASS) #For StepAIC

#Model Building/Selection
#How to choose which predictors to include:
#start simple and add more predictors or start w/ a larger model and remove less useful preds
#first is called "Foreward selection," second is called "Backward selection"
#in general called "stepwise selection" --> iterative methods

#We need to select a criterion for adding/dropping predictors and threshold!
#could use of of the following - Ftest (w/ sig lev 0.05), LRT (sig lev), Wald Test (sig lev), AIC (difference), BIC (difference)

#FORWARD SELECTION
#Grand mean model is the model that only uses Beta0 to predict the dependent variable
#(HANDOUT) in foreward selection, start with grand mean model. If NONE of the 1-predictor models are better, tehn we'll stick with Grand Mean Model
#A model is "better" if the anova ftest gives a pvalue smaller than .01
#If many are better than GMM, find the one that is the most better (smallest pvalue) --> becomes the one to beat in the next iteration of FS.
#Find the most sig pred each time and add it
#STOP WHEN: run out of predictors to add or stop getting pvalues less than the significance level.

#BACKWARD SELECTION
#Start with a reasonably large model, not too complicated (may take a long time!)
#Find non-significant (least sig) pred and drop it.
#STOP WHEN:Run out of preds to drop or pvalues are all greater than the significance level.


#Forward Selection Practice from Handowut
data<- read.csv("http://www.cknudson.com/data/UN.csv")
newdata<-na.omit(data)
currentmod<- lm(lifeExpF~1, data=newdata)
mod1<-  lm(lifeExpF~region, data=newdata)
mod2<-  lm(lifeExpF~ppgdp, data=newdata)
mod3<-  lm(lifeExpF~pctUrban, data=newdata)
mod4<-  lm(lifeExpF~infantMortality, data=newdata)
mod5<- lm(lifeExpF~fertility, data=newdata)

pvals<- c(anova(currentmod, mod1)[6], anova(currentmod, mod2)[6], anova(currentmod, mod3)[6], anova(currentmod, mod4)[6],anova(currentmod, mod5)[6])
pvals

## $`Pr(>F)`
## [1]           NA 4.222081e-38
## 
## $`Pr(>F)`
## [1]           NA 3.673818e-17
## 
## $`Pr(>F)`
## [1]           NA 6.337908e-20
## 
## $`Pr(>F)`
## [1]           NA 8.225623e-87
## 
## $`Pr(>F)`
## [1]           NA 5.719746e-49

#Mod 4 had the best ftest pvalue, so it becomes the current mod
currentmod<- mod4
mod11<-  lm(lifeExpF~region+ infantMortality, data=newdata)
mod12<-  lm(lifeExpF~group+ infantMortality, data=newdata)
mod13<-  lm(lifeExpF~pctUrban+ infantMortality, data=newdata)
mod15<- lm(lifeExpF~fertility+ infantMortality, data=newdata)

pvals<- c(anova(currentmod, mod11)[6], anova(currentmod, mod12)[6], anova(currentmod, mod13)[6],anova(currentmod, mod15)[6])
pvals

## $`Pr(>F)`
## [1]           NA 3.543629e-07
## 
## $`Pr(>F)`
## [1]          NA 3.85907e-09
## 
## $`Pr(>F)`
## [1]         NA 0.07870966
## 
## $`Pr(>F)`
## [1]         NA 0.08196719

currentmod<- mod12

mod21<-  lm(lifeExpF~region+group+ infantMortality, data=newdata)
mod23<-  lm(lifeExpF~pctUrban+ group+infantMortality, data=newdata)
mod25<- lm(lifeExpF~fertility+ group+infantMortality, data=newdata)

pvals<- c(anova(currentmod, mod21)[6], anova(currentmod, mod23)[6],anova(currentmod, mod25)[6])
pvals

## $`Pr(>F)`
## [1]          NA 0.004085691
## 
## $`Pr(>F)`
## [1]         NA 0.08777467
## 
## $`Pr(>F)`
## [1]        NA 0.2436129

currentmod<-mod21
moda<-  lm(lifeExpF~pctUrban+region+group+ infantMortality, data=newdata)
modb<-  lm(lifeExpF~fertility+ region+group+ infantMortality, data=newdata)

pvals<- c(anova(currentmod, moda)[6], anova(currentmod, modb)[6])
pvals

## $`Pr(>F)`
## [1]        NA 0.2734193
## 
## $`Pr(>F)`
## [1]        NA 0.8308651

#Neither pvalue is significant at the 0.25 level so we leave our model as mod21

#could use AIC by checking for the lowest AIC value for 1-pred models, then keep adding as long as AIC is dropping by at least [your criteria ie 10]

#StepAIC
#Need library(MASS)
 forwardmod<- stepAIC(mod23, direction = "forward", scope = list(upper = ~infantMortality+group+region+pctUrban, lower= ~1))

## Start:  AIC=467.24
## lifeExpF ~ pctUrban + group + infantMortality
## 
##          Df Sum of Sq    RSS    AIC
## + region  5    166.03 1896.8 461.05
## <none>                2062.8 467.24
## 
## Step:  AIC=461.05
## lifeExpF ~ pctUrban + group + infantMortality + region

backwardmod<- stepAIC(mod23, direction = "backward")

## Start:  AIC=467.24
## lifeExpF ~ pctUrban + group + infantMortality
## 
##                   Df Sum of Sq    RSS    AIC
## <none>                         2062.8 467.24
## - pctUrban         1      32.3 2095.1 468.24
## - group            2     467.4 2530.2 502.66
## - infantMortality  1    3738.6 5801.4 664.81

#check to see if they produce the same model, they dont always have to:
all.equal(coef(forwardmod), coef(backwardmod))

## [1] "Numeric: lengths (11, 5) differ"

#Powerpoint with review of Ftest,T-Test, and Chi-Squared tests
#https://uofstthomasmn-my.sharepoint.com/:p:/g/personal/knud8583_stthomas_edu/EeJdegMjowhCuB-7O92R-pUBHHbqPBS83p7giqQ__4sT8A?e=LobEFa

Week 3

Cheyanne Simpson

2/21/2021