### Title: Stats & Methods Lab 4 Practice Script
### Author: Kyle M. Lang
### Created: 2018-09-24
### Modified: 2020-09-24
### ###
### Overview ###
### ###
## You will practice fitting MLR models with categorical predictor variables.
## You will need the built-in R datasets "bfi" (from the psych package) and "BMI" (from the wec) package.
### ###
### Tasks / Questions ###
### ###
## Function to automatically fix effects-coded names:
fix_ec_names <- function(x) {
tmp <- contrasts(x)
colnames(contrasts(x)) <- rownames(tmp)[rowSums(tmp) > 0]
x
}
knitr::opts_chunk$set(echo = T, comment = NA, warning = F, message = F)
packages <- c('dplyr', 'wec', 'psych', 'MOTE')
installed_packages <- packages %in% rownames(installed.packages())
if (any(installed_packages == FALSE)) {
install.packages(packages[!installed_packages])
}
options(scipen = 999)
invisible(lapply(packages, library, character.only = TRUE))
data('bfi', 'BMI')
source(file.path(dirname(getwd()), 'code', 'studentFunctions.R'))
You may ignore any missing data, for the purposes of these exercises (although you should never do so in a real data analysis).
# help(bfi)
# gender: Males = 1, Females =2
# education: 1 = HS, 2 = finished HS, 3 = some college, 4 = college graduate, 5 = graduate degree
bfi$gender <- factor(bfi$gender,
levels = c(1, 2),
labels = c("male", "female"))
bfi$education <- factor(bfi$education,
levels = c(1, 2, 3, 4, 5),
labels = c("HS", "finished HS", "some college", "college graduate", "graduate degree"))
gender_edu_tbl <- table(bfi$gender, bfi$education)
num <- gender_edu_tbl['female','graduate degree']
cat(num, 'women in this sample have graduate degrees.')
266 women in this sample have graduate degrees.
male_tbl <- gender_edu_tbl['male',]
male_most_edu <- male_tbl[male_tbl == max(male_tbl)]
cat('"', names(male_most_edu), '" is the most frequently reported level of educational attainment among men in this sample.', sep = '')
"some college" is the most frequently reported level of educational attainment among men in this sample.
cat('"education" factor has', nlevels(BMI$education), 'levels.')
"education" factor has 3 levels.
cat('"', levels(BMI$sex)[1], '" is the reference level of the "sex" factor', sep = '')
"male" is the reference level of the "sex" factor
cat('"', levels(BMI$education)[1], '" is the reference level of the "education" factor', sep = '')
"lowest" is the reference level of the "education" factor
BMI$sex <- relevel(BMI$sex, ref = "male")
BMI$education <- relevel(BMI$education, ref = "highest")
reg3a <- lm(BMI ~ sex + education, data = BMI)
sum_reg3a <- summary(reg3a)
sum_reg3a
Call:
lm(formula = BMI ~ sex + education, data = BMI)
Residuals:
Min 1Q Median 3Q Max
-8.2780 -2.6173 -0.4563 1.9877 15.5884
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 24.5456 0.1263 194.379 < 0.0000000000000002 ***
sexfemale -0.4986 0.1305 -3.820 0.000136 ***
educationlowest 1.8564 0.1783 10.412 < 0.0000000000000002 ***
educationmiddle 0.7139 0.1472 4.851 0.00000129 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 3.744 on 3310 degrees of freedom
Multiple R-squared: 0.03565, Adjusted R-squared: 0.03477
F-statistic: 40.78 on 3 and 3310 DF, p-value: < 0.00000000000000022
sex_pval <- sum_reg3a$coefficients['sexfemale','Pr(>|t|)']
sex_f <- sum_reg3a$coefficients['sexfemale','Estimate']
if (sex_pval <= 0.05) {
sex_signi <- 'a'
} else {
sex_signi <- ' NOT'
}
cat('There is ', sex_signi, ' significant effect [t-statistic = ', MOTE::apa(sex_f, decimals = 4), ', p-value = ', MOTE::apa(sex_pval, decimals = 4), '] (at alpha = 0.05) of "sex" on "BMI" after controlling for "education".',
sep = '')
There is a significant effect [t-statistic = -0.4986, p-value = 0.0001] (at alpha = 0.05) of "sex" on "BMI" after controlling for "education".
male_highest_est <- sum_reg3a$coefficients['(Intercept)','Estimate']
cat(MOTE::apa(male_highest_est, decimals = 4), 'is the expected BMI for males in the highest education group.')
24.5456 is the expected BMI for males in the highest education group.
BMI$centered_BMI <- scale(BMI$BMI)
reg2a <- lm(centered_BMI ~ education - 1, data = BMI)
sum_reg2a <- summary.cellMeans(reg2a)
sum_reg2a
Call:
lm(formula = centered_BMI ~ education - 1, data = BMI)
Residuals:
Min 1Q Median 3Q Max
-2.2366 -0.6861 -0.1107 0.5337 4.0253
Coefficients:
Estimate Std. Error t value Pr(>|t|)
educationhighest -0.17887404 0.02845492 -6.286 0.000000000367796303 ***
educationlowest 0.30682881 0.03726291 8.234 0.000000000000000257 ***
educationmiddle -0.00003825 0.02613445 -0.001 0.999
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.9845 on 3311 degrees of freedom
Multiple R-squared: 0.03139, Adjusted R-squared: 0.03081
F-statistic: 53.66 on 2 and 3311 DF, p-value: < 0.00000000000000022
edu_pval <- pf(sum_reg2a$fstatistic[1], sum_reg2a$fstatistic[2], sum_reg2a$fstatistic[3], lower.tail = FALSE)
edu_f <- sum_reg2a$fstatistic[1]
if (edu_pval <= 0.05) {
edu_signi <- 'a'
} else {
edu_signi <- ' NOT'
}
cat('There is ', edu_signi, ' significant effect [F-statistic = ', MOTE::apa(edu_f, decimals = 4), ', p-value = ', MOTE::apa(edu_pval, decimals = 4), '] (at alpha = 0.05) of "education" on "BMI".',
sep = '')
There is a significant effect [F-statistic = 53.6590, p-value = 0.0000] (at alpha = 0.05) of "education" on "BMI".
cat('F-statistic = ', MOTE::apa(edu_f, decimals = 4),
sep = '')
F-statistic = 53.6590
var <- 'educationlowest'
edu_pval <- sum_reg2a$coefficients[var,'Pr(>|t|)']
edu_t <- sum_reg2a$coefficients[var,'t value']
edu_est <- sum_reg2a$coefficients[var,'Estimate']
if (edu_pval <= 0.05) {
edu_signi <- ''
} else {
edu_signi <- ' NOT'
}
cat('The mean BMI level in the ', var, ' group is', edu_signi, ' significantly different from 25 [estimated slope = ', MOTE::apa(edu_est, decimals = 4), ', t-statistic = ', MOTE::apa(edu_t, decimals = 4), ', p-value = ', MOTE::apa(edu_pval, decimals = 4), '] (at alpha = 0.05) of "education" on "BMI".',
sep = '')
The mean BMI level in the educationlowest group is significantly different from 25 [estimated slope = 0.3068, t-statistic = 8.2342, p-value = 0.0000] (at alpha = 0.05) of "education" on "BMI".
var <- 'educationmiddle'
edu_pval <- sum_reg2a$coefficients[var,'Pr(>|t|)']
edu_t <- sum_reg2a$coefficients[var,'t value']
edu_est <- sum_reg2a$coefficients[var,'Estimate']
if (edu_pval <= 0.05) {
edu_signi <- ''
} else {
edu_signi <- ' NOT'
}
cat('The mean BMI level in the ', var, ' group is', edu_signi, ' significantly different from 25 [estimated slope = ', MOTE::apa(edu_est, decimals = 4), ', t-statistic = ', MOTE::apa(edu_t, decimals = 4), ', p-value = ', MOTE::apa(edu_pval, decimals = 4), '] (at alpha = 0.05) of "education" on "BMI".',
sep = '')
The mean BMI level in the educationmiddle group is NOT significantly different from 25 [estimated slope = 0.0000, t-statistic = -0.0015, p-value = 0.9988] (at alpha = 0.05) of "education" on "BMI".
var <- 'educationhighest'
edu_pval <- sum_reg2a$coefficients[var,'Pr(>|t|)']
edu_t <- sum_reg2a$coefficients[var,'t value']
edu_est <- sum_reg2a$coefficients[var,'Estimate']
if (edu_pval <= 0.05) {
edu_signi <- ''
} else {
edu_signi <- ' NOT'
}
cat('The mean BMI level in the ', var, ' group is', edu_signi, ' significantly different from 25 [estimated slope = ', MOTE::apa(edu_est, decimals = 4), ', t-statistic = ', MOTE::apa(edu_t, decimals = 4), ', p-value = ', MOTE::apa(edu_pval, decimals = 4), '] (at alpha = 0.05) of "education" on "BMI".',
sep = '')
The mean BMI level in the educationhighest group is significantly different from 25 [estimated slope = -0.1789, t-statistic = -6.2862, p-value = 0.0000] (at alpha = 0.05) of "education" on "BMI".
contrasts(BMI$childless)
yes
no 0
yes 1
BMI$education_uwc <- BMI$education
contrasts(BMI$education_uwc) <- contr.sum(levels(BMI$education_uwc))
BMI$education_uwc <- fix_ec_names(BMI$education_uwc)
contrasts(BMI$education_uwc)
highest lowest
highest 1 0
lowest 0 1
middle -1 -1
uwc_reg1 <- lm(BMI ~ education_uwc + childless, data = BMI)
sum_uwc_reg1 <- summary(uwc_reg1)
sum_uwc_reg1
Call:
lm(formula = BMI ~ education_uwc + childless, data = BMI)
Residuals:
Min 1Q Median 3Q Max
-8.7947 -2.5500 -0.4678 1.9613 16.2403
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 25.57683 0.07909 323.395 < 0.0000000000000002 ***
education_uwchighest -0.73465 0.09170 -8.012 0.00000000000000155 ***
education_uwclowest 0.84868 0.10603 8.004 0.00000000000000165 ***
childlessyes -1.44709 0.13901 -10.410 < 0.0000000000000002 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 3.692 on 3310 degrees of freedom
Multiple R-squared: 0.0621, Adjusted R-squared: 0.06125
F-statistic: 73.05 on 3 and 3310 DF, p-value: < 0.00000000000000022
change_omitted <- function(x) relevel(x, ref = levels(x)[nlevels(x)])
BMI$education_uwc2 <- change_omitted(BMI$education)
contrasts(BMI$education_uwc2) <- contr.sum(nlevels(BMI$education_uwc2))
BMI$education_uwc2 <- fix_ec_names(BMI$education_uwc2)
contrasts(BMI$education_uwc2)
middle highest
middle 1 0
highest 0 1
lowest -1 -1
uwc_reg2 <- lm(BMI ~ education_uwc2 + childless, data = BMI)
sum_uwc_reg2 <- summary(uwc_reg2)
sum_uwc_reg2
Call:
lm(formula = BMI ~ education_uwc2 + childless, data = BMI)
Residuals:
Min 1Q Median 3Q Max
-8.7947 -2.5500 -0.4678 1.9613 16.2403
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 25.57683 0.07909 323.395 < 0.0000000000000002 ***
education_uwc2middle -0.11403 0.08790 -1.297 0.195
education_uwc2highest -0.73465 0.09170 -8.012 0.00000000000000155 ***
childlessyes -1.44709 0.13901 -10.410 < 0.0000000000000002 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 3.692 on 3310 degrees of freedom
Multiple R-squared: 0.0621, Adjusted R-squared: 0.06125
F-statistic: 73.05 on 3 and 3310 DF, p-value: < 0.00000000000000022
child_est <- sum_uwc_reg2$coefficients['(Intercept)','Estimate']
cat(MOTE::apa(child_est, decimals = 4), 'is the expected BMI (averaged across education groups) for people with children.')
25.5768 is the expected BMI (averaged across education groups) for people with children.
if ( 'education_uwc2highest' %in% rownames(sum_uwc_reg2$coefficients) ) {
eduhigh_est <- sum_uwc_reg2$coefficients['education_uwc2highest','Estimate']
} else if ( 'education_uwchighest' %in% rownames(sum_uwc_reg1$coefficients) ) {
eduhigh_est <- sum_uwc_reg1$coefficients['education_uwchighest','Estimate']
}
cat(MOTE::apa(eduhigh_est, decimals = 4), 'is the expected difference in BMI between the most highly educated group and the average BMI across education groups, after controlling for childlessness.')
-0.7347 is the expected difference in BMI between the most highly educated group and the average BMI across education groups, after controlling for childlessness.
if ( 'education_uwc2highest' %in% rownames(sum_uwc_reg2$coefficients) ) {
var <- 'education_uwc2highest'
df <- sum_uwc_reg2
} else if ( 'education_uwchighest' %in% rownames(sum_uwc_reg1$coefficients) ) {
var <- 'education_uwchighest'
df <- sum_uwc_reg1
}
edu_pval <- df$coefficients[var,'Pr(>|t|)']
edu_t <- df$coefficients[var,'t value']
edu_est <- df$coefficients[var,'Estimate']
if (edu_pval <= 0.05) {
edu_signi <- ''
} else {
edu_signi <- ' NOT'
}
cat('The difference reported in (3b) is', edu_signi, ' different from zero [estimated slope = ', MOTE::apa(edu_est, decimals = 4), ', t-statistic = ', MOTE::apa(edu_t, decimals = 4), ', p-value = ', MOTE::apa(edu_pval, decimals = 4), '] (at alpha = 0.05).',
sep = '')
The difference reported in (3b) is different from zero [estimated slope = -0.7347, t-statistic = -8.0117, p-value = 0.0000] (at alpha = 0.05).
if ( 'education_uwc2middle' %in% rownames(sum_uwc_reg2$coefficients) ) {
var <- 'education_uwc2middle'
df <- sum_uwc_reg2
} else if ( 'education_uwcmiddle' %in% rownames(sum_uwc_reg1$coefficients) ) {
var <- 'education_uwcmiddle'
df <- sum_uwc_reg1
}
edumiddle_est <- df$coefficients[var,'Estimate']
cat(MOTE::apa(edumiddle_est, decimals = 4), 'is the expected difference in BMI between the middle education group and the average BMI across education groups, after controlling for childlessness.')
-0.1140 is the expected difference in BMI between the middle education group and the average BMI across education groups, after controlling for childlessness.
if ( 'education_uwc2middle' %in% rownames(sum_uwc_reg2$coefficients) ) {
var <- 'education_uwc2middle'
df <- sum_uwc_reg2
} else if ( 'education_uwcmiddle' %in% rownames(sum_uwc_reg1$coefficients) ) {
var <- 'education_uwcmiddle'
df <- sum_uwc_reg1
}
edu_pval <- df$coefficients[var,'Pr(>|t|)']
edu_t <- df$coefficients[var,'t value']
edu_est <- df$coefficients[var,'Estimate']
if (edu_pval <= 0.05) {
edu_signi <- ''
} else {
edu_signi <- ' NOT'
}
cat('The difference reported in (3d) is', edu_signi, ' significantly different from zero [estimated slope = ', MOTE::apa(edu_est, decimals = 4), ', t-statistic = ', MOTE::apa(edu_t, decimals = 4), ', p-value = ', MOTE::apa(edu_pval, decimals = 4), '] (at alpha = 0.05).',
sep = '')
The difference reported in (3d) is NOT significantly different from zero [estimated slope = -0.1140, t-statistic = -1.2973, p-value = 0.1946] (at alpha = 0.05).
contrasts(BMI$sex)
female
male 0
female 1
BMI$education_wc <- BMI$education
contrasts(BMI$education_wc) <- contr.wec(BMI$education_wc, omitted = "highest")
BMI$education_wc <- fix_ec_names(BMI$education_wc)
contrasts(BMI$education_wc)
lowest middle
highest -0.5831245 -1.185464
lowest 1.0000000 0.000000
middle 0.0000000 1.000000
wc_reg1 <- lm(BMI ~ sex + education_wc, data = BMI)
sum_wc_reg1 <- summary(wc_reg1)
sum_wc_reg1
Call:
lm(formula = BMI ~ sex + education_wc, data = BMI)
Residuals:
Min 1Q Median 3Q Max
-8.2780 -2.6173 -0.4563 1.9877 15.5884
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 25.24231 0.09485 266.140 < 0.0000000000000002 ***
sexfemale -0.49864 0.13052 -3.820 0.000136 ***
education_wclowest 1.15972 0.12592 9.210 < 0.0000000000000002 ***
education_wcmiddle 0.01721 0.07529 0.229 0.819185
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 3.744 on 3310 degrees of freedom
Multiple R-squared: 0.03565, Adjusted R-squared: 0.03477
F-statistic: 40.78 on 3 and 3310 DF, p-value: < 0.00000000000000022
BMI$education_wc2 <- BMI$education
contrasts(BMI$education_wc2) <- contr.wec(BMI$education_wc, omitted = "middle")
BMI$education_wc2 <- fix_ec_names(BMI$education_wc2)
contrasts(BMI$education_wc2)
highest lowest
highest 1.0000000 0.0000000
lowest 0.0000000 1.0000000
middle -0.8435518 -0.4918957
wc_reg2 <- lm(BMI ~ sex + education_wc2, data = BMI)
sum_wc_reg2 <- summary(wc_reg2)
sum_wc_reg2
Call:
lm(formula = BMI ~ sex + education_wc2, data = BMI)
Residuals:
Min 1Q Median 3Q Max
-8.2780 -2.6173 -0.4563 1.9877 15.5884
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 25.24231 0.09485 266.140 < 0.0000000000000002 ***
sexfemale -0.49864 0.13052 -3.820 0.000136 ***
education_wc2highest -0.69666 0.08657 -8.047 0.00000000000000117 ***
education_wc2lowest 1.15972 0.12592 9.210 < 0.0000000000000002 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 3.744 on 3310 degrees of freedom
Multiple R-squared: 0.03565, Adjusted R-squared: 0.03477
F-statistic: 40.78 on 3 and 3310 DF, p-value: < 0.00000000000000022
female_est <- sum_wc_reg1$coefficients['(Intercept)','Estimate'] + sum_wc_reg1$coefficients['sexfemale','Estimate']
cat(MOTE::apa(female_est, decimals = 4), 'is the average BMI for females.')
24.7437 is the average BMI for females.
edu_est <- sum_wc_reg2$coefficients['education_wc2lowest','Estimate']
cat(MOTE::apa(edu_est, decimals = 4), 'is the expected difference in BMI between the least educated group and the average BMI, after controlling for sex.')
1.1597 is the expected difference in BMI between the least educated group and the average BMI, after controlling for sex.
var <- 'education_wc2lowest'
edu_pval <- sum_wc_reg2$coefficients[var,'Pr(>|t|)']
edu_t <- sum_wc_reg2$coefficients[var,'t value']
edu_est <- sum_wc_reg2$coefficients[var,'Estimate']
if (edu_pval <= 0.01) {
edu_signi <- ''
} else {
edu_signi <- ' NOT'
}
cat('The difference reported in (3b) is', edu_signi, ' significantly different from zero [estimated slope = ', MOTE::apa(edu_est, decimals = 4), ', t-statistic = ', MOTE::apa(edu_t, decimals = 4), ', p-value = ', MOTE::apa(edu_pval, decimals = 4), '] (at alpha = 0.01).',
sep = '')
The difference reported in (3b) is significantly different from zero [estimated slope = 1.1597, t-statistic = 9.2100, p-value = 0.0000] (at alpha = 0.01).
edu_est <- sum_wc_reg2$coefficients['education_wc2highest','Estimate']
cat(MOTE::apa(edu_est, decimals = 4), 'is the expected difference in BMI between the most highly educated group and the average BMI, after controlling for sex.')
-0.6967 is the expected difference in BMI between the most highly educated group and the average BMI, after controlling for sex.
var <- 'education_wc2highest'
edu_pval <- sum_wc_reg2$coefficients[var,'Pr(>|t|)']
edu_t <- sum_wc_reg2$coefficients[var,'t value']
edu_est <- sum_wc_reg2$coefficients[var,'Estimate']
if (edu_pval <= 0.01) {
edu_signi <- ''
} else {
edu_signi <- ' NOT'
}
cat('The difference reported in (3d) is', edu_signi, ' significantly different from zero [estimated slope = ', MOTE::apa(edu_est, decimals = 4), ', t-statistic = ', MOTE::apa(edu_t, decimals = 4), ', p-value = ', MOTE::apa(edu_pval, decimals = 4), '] (at alpha = 0.01).',
sep = '')
The difference reported in (3d) is significantly different from zero [estimated slope = -0.6967, t-statistic = -8.0471, p-value = 0.0000] (at alpha = 0.01).
wc_reg4a <- lm(BMI ~ sex, data = BMI)
sum_wc_reg4a <- summary(wc_reg4a)
sum_wc_reg4a
Call:
lm(formula = BMI ~ sex, data = BMI)
Residuals:
Min 1Q Median 3Q Max
-7.9293 -2.7112 -0.4974 1.9697 15.0702
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 25.23544 0.09626 262.149 < 0.0000000000000002 ***
sexfemale -0.48566 0.13236 -3.669 0.000247 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 3.803 on 3312 degrees of freedom
Multiple R-squared: 0.004049, Adjusted R-squared: 0.003748
F-statistic: 13.46 on 1 and 3312 DF, p-value: 0.000247
aov_wc_reg4a_2 <- anova(wc_reg4a, wc_reg2)
aov_wc_reg4a_2
Analysis of Variance Table
Model 1: BMI ~ sex
Model 2: BMI ~ sex + education_wc2
Res.Df RSS Df Sum of Sq F Pr(>F)
1 3312 47909
2 3310 46389 2 1520 54.229 < 0.00000000000000022 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
edu_pval <- aov_wc_reg4a_2['Pr(>F)'][!is.na(aov_wc_reg4a_2['Pr(>F)'])]
edu_f <- aov_wc_reg4a_2['F'][!is.na(aov_wc_reg4a_2['F'])]
if (edu_pval <= 0.05) {
edu_signi <- ''
} else {
edu_signi <- ' NOT'
}
cat('Education level DOES', edu_signi, ' explain a significant proportion of variance in BMI, above and beyond sex [F-statistic = ', MOTE::apa(edu_f, decimals = 4), ', p-value = ', MOTE::apa(edu_pval, decimals = 4), '] (at alpha = 0.05).',
sep = '')
Education level DOES explain a significant proportion of variance in BMI, above and beyond sex [F-statistic = 54.2286, p-value = 0.0000] (at alpha = 0.05).
cat('F-statistic = ', MOTE::apa(edu_f, decimals = 4),
sep = '')
F-statistic = 54.2286