#Model building/selection
#How to choose which predictors to include
#Grand mean model: the intercept-only model (no predictors), the simplest possible starting point
#start simple and add more predictors (forward selection) or start with many predictors and remove less useful ones (backward selection)
#forward and backward are stepwise (iterative methods)
#We need to select a criterion for adding/dropping predictors and a threshold for it
#F-test with significance level 0.05
#LRT with significance level 0.05
#Wald test with significance level 0.05
#AIC with a difference of 10
#BIC with a difference of 10
#In the handout (ModelSelection), we use an F-test with a significance level of 0.01
#FORWARD SELECTION: start with the grand mean model. If NONE of the 1-predictor models are better, then we'll stick with the grand mean model
#A model is "better" if the anova f-test gives a p-value smaller than 0.01
#The anova with the smallest p-value involves the model with infant mortality rate, so now our current model is the one with infant mortality rate
#Keep going. Find the most significant predictor each time and add it
#Stop when: run out of predictors to add OR stop getting p-values less than the significance level
#BACKWARD SELECTION: start with a model with many predictors
#Find a non-significant predictor (p-value>significance level) and drop it
#Keep going. Find the least significant predictor each time and drop it.
#Stop when: run out of predictors to drop OR p-values are all greater than the significance level (all predictors are significant)
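#A compact sketch of the whole forward-selection loop (not from the handout, which does the
#iterations by hand below); it uses the same UN data that the practice section loads, and the
#object names (d, fit, full) are just illustrative
library(alr4)
d <- na.omit(UN)
fit <- lm(lifeExpF ~ 1, data = d)                                   #start from the grand mean model
full <- ~ region + ppgdp + pctUrban + infantMortality + fertility  #candidate predictors
repeat {
  cand <- add1(fit, scope = full, test = "F")                  #F-test for every candidate addition
  cand <- cand[-1, ]                                           #drop the <none> row
  if (nrow(cand) == 0 || min(cand$"Pr(>F)", na.rm = TRUE) >= 0.01) break  #nothing left or nothing significant
  best <- rownames(cand)[which.min(cand$"Pr(>F)")]             #most significant candidate
  fit <- update(fit, as.formula(paste(". ~ . +", best)))       #add it and repeat
}
formula(fit)  #the model the loop stops at; it may differ slightly from the hand-run version below,
              #whose candidate sets change between steps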
#----------------------practice------------------------------
library(alr4) #applied linear regression package
newdata = na.omit(UN)
currentmod = mod0 = lm(lifeExpF ~ 1, data=newdata)
mod1 = lm(lifeExpF ~ region, data = newdata)
mod2 = lm(lifeExpF ~ ppgdp, data=newdata)
mod3 = lm(lifeExpF ~ pctUrban, data=newdata)
mod4 = lm(lifeExpF ~ infantMortality, data=newdata)
mod5 = lm(lifeExpF ~ fertility, data=newdata)
#compare each model to the current model
anova(currentmod, mod1) #compares current mod to the first model
## Analysis of Variance Table
##
## Model 1: lifeExpF ~ 1
## Model 2: lifeExpF ~ region
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 192 19906.4
## 2 186 7278.9 6 12628 53.779 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#isolate p-values from anovas
pvals <- c(anova(currentmod, mod1)[6],
           anova(currentmod, mod2)[6],
           anova(currentmod, mod3)[6],
           anova(currentmod, mod4)[6],
           anova(currentmod, mod5)[6])
pvals #mod4 has the smallest p-value at 8.225623e-87
## $`Pr(>F)`
## [1] NA 4.222081e-38
##
## $`Pr(>F)`
## [1] NA 3.673818e-17
##
## $`Pr(>F)`
## [1] NA 6.337908e-20
##
## $`Pr(>F)`
## [1] NA 8.225623e-87
##
## $`Pr(>F)`
## [1] NA 5.719746e-49
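#the same five p-values can be pulled out in one line (a sketch, not in the handout):
sapply(list(mod1, mod2, mod3, mod4, mod5),
       function(m) anova(currentmod, m)$"Pr(>F)"[2])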
#mod4's p-value is not only the smallest but also less than 0.01, so mod4 is better than currentmod
#Therefore, mod4 replaces the grand-mean model as our currentmod
currentmod = mod4
#this is one iteration of forward selection
#Now we consider adding another predictor (in addition to infantMortality, which was the predictor in mod4)
mod12 = lm(lifeExpF ~ infantMortality + region, data=newdata)
mod13 = lm(lifeExpF ~ infantMortality + group, data=newdata)
mod14 = lm(lifeExpF ~ infantMortality + fertility, data=newdata)
mod15 = lm(lifeExpF ~ infantMortality + pctUrban, data=newdata)
#p-values from comparing the models against the current model
pvals <- c(anova(currentmod, mod12)[6],
           anova(currentmod, mod13)[6],
           anova(currentmod, mod14)[6],
           anova(currentmod, mod15)[6])
pvals
## $`Pr(>F)`
## [1] NA 3.543629e-07
##
## $`Pr(>F)`
## [1] NA 3.85907e-09
##
## $`Pr(>F)`
## [1] NA 0.08196719
##
## $`Pr(>F)`
## [1] NA 0.07870966
#mod13 has the smallest p-value (3.85907e-09), which is below 0.01, so mod13 becomes our current model
#our two predictors are infantMortality and group
currentmod = mod13
#we now consider adding a third predictor
moda = lm(lifeExpF ~ infantMortality + group + region, data=newdata)
modb = lm(lifeExpF ~ infantMortality + group + fertility, data=newdata)
modc = lm(lifeExpF ~ infantMortality + group + pctUrban, data=newdata)
pvals <- c(anova(currentmod, moda)[6],
           anova(currentmod, modb)[6],
           anova(currentmod, modc)[6])
pvals
## $`Pr(>F)`
## [1] NA 0.004085691
##
## $`Pr(>F)`
## [1] NA 0.2436129
##
## $`Pr(>F)`
## [1] NA 0.08777467
#moda has the smallest p-value (0.004085691), which is below 0.01, so we add region to the model
currentmod = moda
modd = lm(lifeExpF ~ infantMortality + group + region + fertility, data=newdata)
mode = lm(lifeExpF ~ infantMortality + group + region + ppgdp, data=newdata)
modf = lm(lifeExpF ~ infantMortality + group + region + pctUrban, data=newdata)
pvals <- c(anova(currentmod, modd)[6],
           anova(currentmod, mode)[6],
           anova(currentmod, modf)[6])
pvals
## $`Pr(>F)`
## [1] NA 0.8308651
##
## $`Pr(>F)`
## [1] NA 0.01419862
##
## $`Pr(>F)`
## [1] NA 0.2734193
#none of these p-values are below 0.01, so we stop at moda
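#a quick check (a sketch, output not shown in the handout): drop1() runs the F-test for removing
#each term from the current model, so we can see whether every term is still significant at 0.01
drop1(currentmod, test = "F")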
#AIC for forward selection
AIC(mod0)
## [1] 1446.478
AIC(mod1)
## [1] 1264.308
AIC(mod2)
## [1] 1376.609
AIC(mod3)
## [1] 1363.88
AIC(mod4)
## [1] 1053.52
AIC(mod5)
## [1] 1229.366
#the lowest AIC among the one-predictor models is from mod4 (1053.52), which has infantMortality as its sole predictor
#we would choose mod4 over the other one-predictor models AND over the grand-mean model
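#AIC is -2*logLik plus 2 per estimated parameter; a quick check by hand (not from the handout):
ll <- logLik(mod4)
-2 * as.numeric(ll) + 2 * attr(ll, "df")   #should reproduce AIC(mod4) = 1053.52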
#BIC for forward selection
BIC(mod0)
## [1] 1453.004
BIC(mod1)
## [1] 1290.41
BIC(mod2)
## [1] 1386.397
BIC(mod3)
## [1] 1373.669
BIC(mod4)
## [1] 1063.308
BIC(mod5)
## [1] 1239.154
#the lowest BIC among the one-predictor models is from mod4 (1063.308), which has infantMortality as its sole predictor
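#BIC is the same as AIC but with a penalty of log(n) per parameter instead of 2; a sketch (not from the handout):
AIC(mod4, k = log(nobs(mod4)))   #should reproduce BIC(mod4) = 1063.308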
#stepAIC
library(MASS)
forwardmod <- stepAIC(modc, direction = "forward")
## Start: AIC=467.24
## lifeExpF ~ infantMortality + group + pctUrban
backwardmod <- stepAIC(modc, direction = "backward")
## Start: AIC=467.24
## lifeExpF ~ infantMortality + group + pctUrban
##
## Df Sum of Sq RSS AIC
## <none> 2062.8 467.24
## - pctUrban 1 32.3 2095.1 468.24
## - group 2 467.4 2530.2 502.66
## - infantMortality 1 3738.6 5801.4 664.81
#forward and backward selection can end up at different models
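#note (a sketch, not from the handout): with direction = "forward" and no scope argument,
#stepAIC has nothing it is allowed to add, so it stops at the starting model (as it did above for modc)
#a real forward search needs a scope; forwardmod2 is just an illustrative name:
forwardmod2 <- stepAIC(mod0, direction = "forward",
                       scope = list(lower = ~ 1,
                                    upper = ~ region + ppgdp + pctUrban + infantMortality + fertility))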
BIGmod <- lm(lifeExpF ~ infantMortality * group * pctUrban, data = UN)
forwardmod <- stepAIC(BIGmod, direction = "forward")
## Start: AIC=459.16
## lifeExpF ~ infantMortality * group * pctUrban
backwardmod <- stepAIC(BIGmod, direction = "backward")
## Start: AIC=459.16
## lifeExpF ~ infantMortality * group * pctUrban
##
## Df Sum of Sq RSS AIC
## - infantMortality:group:pctUrban 2 6.221 1846.0 455.81
## <none> 1839.8 459.16
##
## Step: AIC=455.81
## lifeExpF ~ infantMortality + group + pctUrban + infantMortality:group +
## infantMortality:pctUrban + group:pctUrban
##
## Df Sum of Sq RSS AIC
## <none> 1846.0 455.81
## - infantMortality:group 2 44.111 1890.2 456.37
## - group:pctUrban 2 132.098 1978.1 465.15
## - infantMortality:pctUrban 1 181.645 2027.7 471.93
#stepAIC stops once no single addition or removal lowers the AIC: for modc nothing changes,
#and for BIGmod backward selection drops only the three-way interaction before stopping
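#the model each search ends at can be inspected directly (a sketch, output not shown in the handout):
formula(backwardmod)   #final formula after the backward search
AIC(backwardmod)       #its AIC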
#T-test
#T distribution looks more normal as df -> infinity
#H0: (1 sample) true mean = some value, (2 sample) the two means are equal
#HA: (1 sample) true mean =/=, <, > the value, (2 sample) the two means are not equal
#Test stat (2 sample, paired): T = (mean of differences) / (sd of differences / sqrt(n)); for two independent samples, T = (mean1 - mean2) / sqrt(s1^2/n1 + s2^2/n2)
#Test stat (1 sample): T = (xbar - M) / (s / sqrt(n))
#P-value: use the T distribution with n - 1 degrees of freedom
#in R: pt(q, df, ncp, lower.tail = TRUE, log.p = FALSE)
#Wald test uses t-distribution
#t.test function
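#example (a sketch, not from the handout): one-sample t-test of whether the mean of lifeExpF is 70
#(70 is just an illustrative value)
t.test(newdata$lifeExpF, mu = 70)
#the same p-value by hand from the test statistic and pt():
tstat <- (mean(newdata$lifeExpF) - 70) / (sd(newdata$lifeExpF) / sqrt(nrow(newdata)))
2 * pt(-abs(tstat), df = nrow(newdata) - 1)   #two-sided p-value; matches t.test() above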
#chi-square test
#assumptions:
#goodness of fit test (involving 1 categorical variable)
#independence in observations
#expected frequency of each cell should be at least 5 (if there are only 2 categories)
#expected frequencies should all be at least 1, and no more than 20% should be below 5 (if there are more than 2 categories)
#test of association (2 categorical variables)
#independence in observations
#expected frequencies should all be at least 1, and no more than 20% should be below 5
#test statistic
#H0: there is no association between the two chosen variables
#HA: there is an association between the two chosen variables
#chi-square test statistic = sum((observed - expected)^2 / expected)
#df = (# of rows - 1) * (# of columns - 1)
#expected value = (row total * column total) / grand total
#tells us how much difference exists between observed counts vs expected counts if no relationship exists
#a large test statistic (and hence a small p-value) means the observed counts are far from the expected counts, which is evidence of an association
#results: chi-square distributions are all right skewed
#the more dfs, the more it will look like a normal distribution
#interpretation: a non-significant p-value means there is no evidence of an association between the groups being compared; the observed counts are approximately what we would expect under no association
#a significant p-value is evidence of an association between the groups
#in R: chisq.test(table)
#gives chi-square value, df, and p
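#example (a sketch, not from the handout): test of association between group and region in the UN data
tab <- table(newdata$group, newdata$region)
chisq.test(tab)             #chi-square statistic, df = (rows - 1) * (columns - 1), and p-value
chisq.test(tab)$expected    #expected counts = row total * column total / grand total
#chisq.test() warns when expected counts are small, i.e. when the assumptions above are in doubt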
#likelihood ratio test uses chi-square distribution
#F-test
#H0: there is no difference between the variances
#HA: there is a difference in variances
#df: df numerator, df denominator
#can be used to compare 2 nested models (this is the test used in the forward/backward selection above)
#a significant p-value means at least one of the added predictors has a significant relationship with the response variable
#a large p-value means the added predictors don't add much to the model
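#example (a sketch, not from the handout): comparing two variances with var.test();
#the simulated data here is purely illustrative
set.seed(1)
var.test(rnorm(50, sd = 1), rnorm(50, sd = 2))   #F = ratio of sample variances, df = (n1 - 1, n2 - 1)
#for two nested regression models, anova(smallermod, biggermod) gives the F-test used in the selection steps above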