#Model building/selection

#How to choose which predictors to include
  #Grand mean model
    #start simple and add more predictors (forward selection) or start with many predictors and remove less useful ones (backward selection)
    #forward and backward are stepwise (iterative methods)

#We need to select a criterion for adding/dropping predictors and threshold
  #F-test with significance level 0.05
  #LRT with significance level 0.05
  #Wald test with significance level 0.05
  #AIC with a difference of 10
  #BIC with a difference of 10


#In the handout (ModelSelection), using F-test with significance level of 0.01
#FORWARD SELECTION: start with the grand mean model. If NONE of the 1-predictor models are better, then we'll stick with the grand mean model
#A model is "better" if the anova f-test gives a p-value  smaller than 0.01
#The anova with the smallest p-value involves the model with infant mortality rate, so now our current model is the one with infant mortality rate
#Keep going. find the most significant predictor each time and add it
#Stop when: run out of predictors to add OR stop getting p-values less than the significance level

#BACKWARD SELECTION: start with a model with many predictors
#Find a non-significant predictor (p-value>significance level) and drop it
#Keep going. Find the least significant predictor each time and drop it.
#Stop when: run out of predictors to drop OR p-values are all greater than the significance level (all predictors are significant)

#----------------------practice------------------------------

library(alr4) #applied linear regression package
newdata = na.omit(UN)

currentmod = mod0 = lm(lifeExpF ~ 1, data=newdata)
mod1 = lm(lifeExpF ~ region, data = newdata)
mod2 = lm(lifeExpF ~ ppgdp, data=newdata)
mod3 = lm(lifeExpF ~ pctUrban, data=newdata)
mod4 = lm(lifeExpF ~ infantMortality, data=newdata)
mod5 = lm(lifeExpF ~ fertility, data=newdata)

#compare each model to the current model
anova(currentmod, mod1) #compares current mod to the first model
## Analysis of Variance Table
## 
## Model 1: lifeExpF ~ 1
## Model 2: lifeExpF ~ region
##   Res.Df     RSS Df Sum of Sq      F    Pr(>F)    
## 1    192 19906.4                                  
## 2    186  7278.9  6     12628 53.779 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#isolate p-values from anovas
pvals<-c(anova(currentmod,mod1)[6],
+ anova(currentmod,mod2)[6],
+ anova(currentmod,mod3)[6],
+ anova(currentmod,mod4)[6],
+ anova(currentmod,mod5)[6])
pvals #mod4 has the smallest p-value at 8.225623e-87
## $`Pr(>F)`
## [1]           NA 4.222081e-38
## 
## $`Pr(>F)`
## [1]           NA 3.673818e-17
## 
## $`Pr(>F)`
## [1]           NA 6.337908e-20
## 
## $`Pr(>F)`
## [1]           NA 8.225623e-87
## 
## $`Pr(>F)`
## [1]           NA 5.719746e-49
#this is also less than 0.01, so we know mod4 is better than currentmod
#Therefore, mod4 replaces the grand-mean model as our currentmod
currentmod = mod4
#this is one iteration of forward selection
#Now we consider adding another predictor (in addition to infantMortality, which was the predictor in mod4)

mod12 = lm(lifeExpF ~ infantMortality + region, data=newdata)
mod13 = lm(lifeExpF ~ infantMortality + group , data=newdata)
mod14 = lm(lifeExpF ~ infantMortality + fertility, data=newdata)
mod15 = lm(lifeExpF ~ infantMortality + pctUrban, data=newdata)
#p-values from comparing the models against the current model
pvals<-c(
+ anova(currentmod,mod12)[6],
+ anova(currentmod,mod13)[6],
+ anova(currentmod,mod14)[6],
+ anova(currentmod,mod15)[6])
pvals
## $`Pr(>F)`
## [1]           NA 3.543629e-07
## 
## $`Pr(>F)`
## [1]          NA 3.85907e-09
## 
## $`Pr(>F)`
## [1]         NA 0.08196719
## 
## $`Pr(>F)`
## [1]         NA 0.07870966
#mod13 has the smalles p-value (3.85907e-09) and is smaller than 0.01, so mod13 is now the best model
#our two predictors are infantMortality and group

currentmod = mod13

#we now consider adding a third predictor
moda = lm(lifeExpF ~ infantMortality+group +region, data=newdata)
modb = lm(lifeExpF ~ infantMortality+group+ fertility,data=newdata)
modc = lm(lifeExpF ~ infantMortality+group+pctUrban, data=newdata)

pvals<-c(
+ anova(currentmod,moda)[6],
+ anova(currentmod,modb)[6],
+ anova(currentmod,modc)[6])
pvals
## $`Pr(>F)`
## [1]          NA 0.004085691
## 
## $`Pr(>F)`
## [1]        NA 0.2436129
## 
## $`Pr(>F)`
## [1]         NA 0.08777467
#moda has the smallest p-value (0.004085691) so we add region to the model

currentmod = moda

modd = lm(lifeExpF ~ infantMortality + group + region + fertility, data=newdata)
mode = lm(lifeExpF ~ infantMortality + group + region + ppgdp, data=newdata)
modf = lm(lifeExpF ~ infantMortality + group + region + pctUrban, data=newdata)

pvals<-c(
+ anova(currentmod,modd)[6],
+ anova(currentmod,mode)[6],
+ anova(currentmod,modf)[6])
pvals
## $`Pr(>F)`
## [1]        NA 0.8308651
## 
## $`Pr(>F)`
## [1]         NA 0.01419862
## 
## $`Pr(>F)`
## [1]        NA 0.2734193
#none of these p-values are below 0.01, so we stop at moda


#AIC for forward selection
AIC(mod0)
## [1] 1446.478
AIC(mod1)
## [1] 1264.308
AIC(mod2)
## [1] 1376.609
AIC(mod3)
## [1] 1363.88
AIC(mod4)
## [1] 1053.52
AIC(mod5)
## [1] 1229.366
#the lowest AIC among the one-predictor models is from mod5 (1229.366), which has fertility as its sole predictor
#we would choose mod5 over the other one-predictor models AND over the grand-mean model

#BIC for forward selection
BIC(mod0)
## [1] 1453.004
BIC(mod1)
## [1] 1290.41
BIC(mod2)
## [1] 1386.397
BIC(mod3)
## [1] 1373.669
BIC(mod4)
## [1] 1063.308
BIC(mod5)
## [1] 1239.154
#the lowest BIC among the one-predictor models is from mod4 (1063.308), which has infantMortality as its sole predictor


#stepAIC
library(MASS)
forwardmod <- stepAIC(modc, direction = "forward")
## Start:  AIC=467.24
## lifeExpF ~ infantMortality + group + pctUrban
backwardmod <- stepAIC(modc, direction = "backward")
## Start:  AIC=467.24
## lifeExpF ~ infantMortality + group + pctUrban
## 
##                   Df Sum of Sq    RSS    AIC
## <none>                         2062.8 467.24
## - pctUrban         1      32.3 2095.1 468.24
## - group            2     467.4 2530.2 502.66
## - infantMortality  1    3738.6 5801.4 664.81
#forward and backward selection can end up at different models

BIGmod <- lm(lifeExpF ~ infantMortality * group * pctUrban, data = UN)
forwardmod <- stepAIC(BIGmod, direction = "forward")
## Start:  AIC=459.16
## lifeExpF ~ infantMortality * group * pctUrban
backwardmod <- stepAIC(BIGmod, direction = "backward")
## Start:  AIC=459.16
## lifeExpF ~ infantMortality * group * pctUrban
## 
##                                  Df Sum of Sq    RSS    AIC
## - infantMortality:group:pctUrban  2     6.221 1846.0 455.81
## <none>                                        1839.8 459.16
## 
## Step:  AIC=455.81
## lifeExpF ~ infantMortality + group + pctUrban + infantMortality:group + 
##     infantMortality:pctUrban + group:pctUrban
## 
##                            Df Sum of Sq    RSS    AIC
## <none>                                  1846.0 455.81
## - infantMortality:group     2    44.111 1890.2 456.37
## - group:pctUrban            2   132.098 1978.1 465.15
## - infantMortality:pctUrban  1   181.645 2027.7 471.93
#mods stay the same because adding/removing predictors increases AIC
#T-test
  #T distribution looks more normal as df -> infinity
  #H0: (1 sample) true mean = some value, (2 sample) the two means are equal
  #HA: (1 sample) true mean =/=, <, > the value, (2 sample) the two means are not equal
  #Test stat(2 sample): T = (mean1 - mean2) / (sd of differences / sqrt(n))
  #Test stat (1 sample): T = (xbar - M) / (s / sqrt(n))
  #P-value: use the T distribution with n-1 (>2 df)
    #in R: pt(q, df, ncp, lower.tail = TRUE, log.p = FALSE)
  #Wald test uses t-distribution
  #t.test function

#chi-square test
  #assumptions: 
    #goodness of fit test (involving 1 categorical variable)
      #independence in observations
      #expected frequencies of each cell should be 5 (if there's 2 categories)
      #expected frequencies should all be 1, no more than 20% should be > 5 (if there's more than 2 categories)
    #test of association (2 categorical variables)
      #independence in observations
      #expected frequencies should all be 1, no more than 20% should be > 5
  #test statistic
    #H0: there is no association between the two chosen variables
    #HA: there is an association between the two chosen variables
    #chi-square test statistic = sum((observed - expected)^2 / expected)
    #df = (# of rows - 1) * (# of columns - 1)
    #expected value = (row total * column total) / grand total
    #tells us how much difference exists between observed counts vs expected counts if no relationship exists
    #we aim for a small value to indicate high correlation and a relationship
  #results: chi-square distributions are all right skewed
    #the more dfs, the more it will look like a normal distribution
  #interpretation: an insignificant p-value should be interpreted as there is no association between the groups being compared. The number of cases observed are approximately equal to the number of expected cases
    #significant p-value means there is an association between the groups
  #in R: chisq.test(table)
    #gives chi-square value, df, and p
  #likelihood ratio test uses chi-square distribution

#F-test
  #H0: there is no difference between the variances
  #HA: there is a difference in variances
  #df: df numerator, df denominator
  #can use to compare 2 nested models
    #a significant p-value means that some of the predictors have a significant relationship with the response variable
    #large p-value means that added predictors don't add much to model