#Homework: https://www.dropbox.com/s/ds9fobjxrcawljn/LikelihoodHW.pdf?dl=0
#Notes from Thursday on Likelihood: https://www.dropbox.com/s/wdvpuxleqo5djd8/Transformation.pdf?dl=0
#Model Selection Notes: https://www.dropbox.com/s/wdvpuxleqo5djd8/Transformation.pdf?dl=0
library(MASS) #For StepAIC
#Model Building/Selection
#How to choose which predictors to include:
#start simple and add more predictors or start w/ a larger model and remove less useful preds
#first is called "Foreward selection," second is called "Backward selection"
#in general called "stepwise selection" --> iterative methods
#We need to select a criterion for adding/dropping predictors and threshold!
#could use of of the following - Ftest (w/ sig lev 0.05), LRT (sig lev), Wald Test (sig lev), AIC (difference), BIC (difference)
#FORWARD SELECTION
#Grand mean model is the model that only uses Beta0 to predict the dependent variable
#(HANDOUT) in foreward selection, start with grand mean model. If NONE of the 1-predictor models are better, tehn we'll stick with Grand Mean Model
#A model is "better" if the anova ftest gives a pvalue smaller than .01
#If many are better than GMM, find the one that is the most better (smallest pvalue) --> becomes the one to beat in the next iteration of FS.
#Find the most sig pred each time and add it
#STOP WHEN: run out of predictors to add or stop getting pvalues less than the significance level.
#BACKWARD SELECTION
#Start with a reasonably large model, not too complicated (may take a long time!)
#Find non-significant (least sig) pred and drop it.
#STOP WHEN:Run out of preds to drop or pvalues are all greater than the significance level.
#Forward Selection Practice from Handowut
data<- read.csv("http://www.cknudson.com/data/UN.csv")
newdata<-na.omit(data)
currentmod<- lm(lifeExpF~1, data=newdata)
mod1<- lm(lifeExpF~region, data=newdata)
mod2<- lm(lifeExpF~ppgdp, data=newdata)
mod3<- lm(lifeExpF~pctUrban, data=newdata)
mod4<- lm(lifeExpF~infantMortality, data=newdata)
mod5<- lm(lifeExpF~fertility, data=newdata)
pvals<- c(anova(currentmod, mod1)[6], anova(currentmod, mod2)[6], anova(currentmod, mod3)[6], anova(currentmod, mod4)[6],anova(currentmod, mod5)[6])
pvals
## $`Pr(>F)`
## [1] NA 4.222081e-38
##
## $`Pr(>F)`
## [1] NA 3.673818e-17
##
## $`Pr(>F)`
## [1] NA 6.337908e-20
##
## $`Pr(>F)`
## [1] NA 8.225623e-87
##
## $`Pr(>F)`
## [1] NA 5.719746e-49
#Mod 4 had the best ftest pvalue, so it becomes the current mod
currentmod<- mod4
mod11<- lm(lifeExpF~region+ infantMortality, data=newdata)
mod12<- lm(lifeExpF~group+ infantMortality, data=newdata)
mod13<- lm(lifeExpF~pctUrban+ infantMortality, data=newdata)
mod15<- lm(lifeExpF~fertility+ infantMortality, data=newdata)
pvals<- c(anova(currentmod, mod11)[6], anova(currentmod, mod12)[6], anova(currentmod, mod13)[6],anova(currentmod, mod15)[6])
pvals
## $`Pr(>F)`
## [1] NA 3.543629e-07
##
## $`Pr(>F)`
## [1] NA 3.85907e-09
##
## $`Pr(>F)`
## [1] NA 0.07870966
##
## $`Pr(>F)`
## [1] NA 0.08196719
currentmod<- mod12
mod21<- lm(lifeExpF~region+group+ infantMortality, data=newdata)
mod23<- lm(lifeExpF~pctUrban+ group+infantMortality, data=newdata)
mod25<- lm(lifeExpF~fertility+ group+infantMortality, data=newdata)
pvals<- c(anova(currentmod, mod21)[6], anova(currentmod, mod23)[6],anova(currentmod, mod25)[6])
pvals
## $`Pr(>F)`
## [1] NA 0.004085691
##
## $`Pr(>F)`
## [1] NA 0.08777467
##
## $`Pr(>F)`
## [1] NA 0.2436129
currentmod<-mod21
moda<- lm(lifeExpF~pctUrban+region+group+ infantMortality, data=newdata)
modb<- lm(lifeExpF~fertility+ region+group+ infantMortality, data=newdata)
pvals<- c(anova(currentmod, moda)[6], anova(currentmod, modb)[6])
pvals
## $`Pr(>F)`
## [1] NA 0.2734193
##
## $`Pr(>F)`
## [1] NA 0.8308651
#Neither pvalue is significant at the 0.25 level so we leave our model as mod21
#could use AIC by checking for the lowest AIC value for 1-pred models, then keep adding as long as AIC is dropping by at least [your criteria ie 10]
#StepAIC
#Need library(MASS)
forwardmod<- stepAIC(mod23, direction = "forward", scope = list(upper = ~infantMortality+group+region+pctUrban, lower= ~1))
## Start: AIC=467.24
## lifeExpF ~ pctUrban + group + infantMortality
##
## Df Sum of Sq RSS AIC
## + region 5 166.03 1896.8 461.05
## <none> 2062.8 467.24
##
## Step: AIC=461.05
## lifeExpF ~ pctUrban + group + infantMortality + region
backwardmod<- stepAIC(mod23, direction = "backward")
## Start: AIC=467.24
## lifeExpF ~ pctUrban + group + infantMortality
##
## Df Sum of Sq RSS AIC
## <none> 2062.8 467.24
## - pctUrban 1 32.3 2095.1 468.24
## - group 2 467.4 2530.2 502.66
## - infantMortality 1 3738.6 5801.4 664.81
#check to see if they produce the same model, they dont always have to:
all.equal(coef(forwardmod), coef(backwardmod))
## [1] "Numeric: lengths (11, 5) differ"
#Powerpoint with review of Ftest,T-Test, and Chi-Squared tests
#https://uofstthomasmn-my.sharepoint.com/:p:/g/personal/knud8583_stthomas_edu/EeJdegMjowhCuB-7O92R-pUBHHbqPBS83p7giqQ__4sT8A?e=LobEFa