#install.packages("afex")
#install.packages("emmeans")
#install.packages("ggbeeswarm")
#install.packages("expss")
library(psych) # for the describe() command
library(ggplot2) # to visualize our results
library(expss) # for the cross_cases() command
library(car) # for the leveneTest() command
library(afex) # to run the ANOVA
library(ggbeeswarm) # to run plot results
library(emmeans) # for posthoc tests
# For HW, import the project dataset you cleaned previously this will be the dataset you'll use throughout the rest of the semester
d <- read.csv(file="projectdata.csv", header=T)
# new code! this adds a column with a number for each row. It will make it easier if we need to drop outliers later
d$row_id <- 1:nrow(d)
I predict that there will be a significant difference in people’s Satisfaction with Life based on their level of Education (having a high school diploma or less; having completed some college, but no longer in college; and having completed a bachelor’s degree).
describe(d)
## vars n mean sd median trimmed mad min max range
## ResponseID* 1 3126 1563.50 902.54 1563.50 1563.50 1158.65 1.0 3126 3125.0
## sibling* 2 3126 1.10 0.29 1.00 1.00 0.00 1.0 2 1.0
## edu* 3 3126 2.51 1.25 2.00 2.18 0.00 1.0 7 6.0
## npi 4 3126 0.28 0.31 0.15 0.24 0.23 0.0 1 1.0
## swb 5 3126 4.47 1.32 4.67 4.53 1.48 1.0 7 6.0
## moa_maturity 6 3126 3.59 0.43 3.67 3.65 0.49 1.0 4 3.0
## efficacy 7 3126 3.13 0.45 3.10 3.13 0.44 1.1 4 2.9
## row_id 8 3126 1563.50 902.54 1563.50 1563.50 1158.65 1.0 3126 3125.0
## skew kurtosis se
## ResponseID* 0.00 -1.20 16.14
## sibling* 2.74 5.52 0.01
## edu* 2.19 3.69 0.02
## npi 0.94 -0.69 0.01
## swb -0.36 -0.46 0.02
## moa_maturity -1.20 1.87 0.01
## efficacy -0.24 0.45 0.01
## row_id 0.00 -1.20 16.14
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.5.3
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
##
## recode
## The following objects are masked from 'package:expss':
##
## compute, contains, na_if, recode, vars, where
## The following objects are masked from 'package:maditr':
##
## between, coalesce, first, last
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(forcats)
## Warning: package 'forcats' was built under R version 4.5.3
d <- subset(d, !(edu %in% c("2 Currently in college","4 Complete 2 year College degree","6 Currently in graduate education","7 Completed some graduate degree")))
d <- droplevels(d)
levels(d$edu)
## NULL
# you only need to check the variables you're using in the current analysis
str(d)
## 'data.frame': 226 obs. of 8 variables:
## $ ResponseID : chr "R_2TGbiBXmAtxywsD" "R_2Quh0h3wxTjZjKP" "R_1gTNDGWsqikPuEX" "R_8en7eLsUe2Mk0Qp" ...
## $ sibling : chr "at least one sibling" "at least one sibling" "only child" "at least one sibling" ...
## $ edu : chr "5 Completed Bachelors Degree" "5 Completed Bachelors Degree" "5 Completed Bachelors Degree" "5 Completed Bachelors Degree" ...
## $ npi : num 0.154 0.615 0 0.231 0.615 ...
## $ swb : num 4.17 3.67 5.33 5.67 3.33 ...
## $ moa_maturity: num 3.33 3.67 4 1.33 3.67 ...
## $ efficacy : num 3.4 2.3 2.9 3.1 2.9 3 3.8 2.7 3.3 3 ...
## $ row_id : int 2 7 16 31 56 57 58 59 62 64 ...
# make our categorical variables of interest "factors"
# because we'll use our newly created row ID variable for this analysis, so make sure it's coded as a factor, too.
d$edu <- as.factor(d$edu)
d$row_id<- as.factor(d$row_id)
# check that our categorical variables of interest are now factors
str(d)
## 'data.frame': 226 obs. of 8 variables:
## $ ResponseID : chr "R_2TGbiBXmAtxywsD" "R_2Quh0h3wxTjZjKP" "R_1gTNDGWsqikPuEX" "R_8en7eLsUe2Mk0Qp" ...
## $ sibling : chr "at least one sibling" "at least one sibling" "only child" "at least one sibling" ...
## $ edu : Factor w/ 3 levels "1 High school diploma or less, and NO COLLEGE",..: 3 3 3 3 3 3 3 3 3 1 ...
## $ npi : num 0.154 0.615 0 0.231 0.615 ...
## $ swb : num 4.17 3.67 5.33 5.67 3.33 ...
## $ moa_maturity: num 3.33 3.67 4 1.33 3.67 ...
## $ efficacy : num 3.4 2.3 2.9 3.1 2.9 3 3.8 2.7 3.3 3 ...
## $ row_id : Factor w/ 226 levels "2","7","16","31",..: 1 2 3 4 5 6 7 8 9 10 ...
# check our DV skew and kurtosis
describe(d$swb)
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 226 4.34 1.43 4.5 4.39 1.48 1 7 6 -0.3 -0.51 0.09
# we'll use the describeBy() command to view our DV's skew and kurtosis across our IVs' levels
describeBy(d$swb, group = d$edu)
##
## Descriptive statistics by group
## group: 1 High school diploma or less, and NO COLLEGE
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 55 4.07 1.54 4.17 4.15 1.73 1 6.67 5.67 -0.35 -0.86 0.21
## ------------------------------------------------------------
## group: 3 Completed some college, but no longer in college
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 34 3.99 1.54 4.25 4.02 1.36 1 7 6 -0.31 -0.69 0.26
## ------------------------------------------------------------
## group: 5 Completed Bachelors Degree
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 137 4.54 1.32 4.67 4.57 1.48 1.5 7 5.5 -0.12 -0.72 0.11
# also use histograms to examine your continuous variable
hist(d$swb)
table(d$edu)
##
## 1 High school diploma or less, and NO COLLEGE
## 55
## 3 Completed some college, but no longer in college
## 34
## 5 Completed Bachelors Degree
## 137
# REMEMBER your test's level of POWER is determined by your SMALLEST subsample
# use the lm() command to run the regression
# formula is y~x, where y is our DV, x is our IV
reg_model <- lm(swb~edu, data = d)
# Cook's distance
plot(reg_model, 4)
# Residuals VS Leverage
plot(reg_model, 5)
# use the leveneTest() command from the car package to test homogeneity of variance
# uses the 'formula' setup: formula is y~x, where y is our DV and x is our IV
leveneTest(swb~edu, data = d)
## Levene's Test for Homogeneity of Variance (center = median)
## Df F value Pr(>F)
## group 2 0.9614 0.384
## 223
My cell sizes are very unbalanced between the Education Level group levels. A small sample size for one of the levels of our variable (Completed some college, but no longer in college, n = 34) limits my power and increases my Type II error rate.
Levene’s test was insignificant for my three-level Education Level variable with the One-Way ANOVA. YAY!🎉
No outliers were found.
aov_model <- aov_ez(data = d,
id = "ResponseID",
between = c("edu"),
dv = "swb",
anova_table = list(es = "pes"))
## Contrasts set to contr.sum for the following variables: edu
nice(aov_model)
## Anova Table (Type 3 tests)
##
## Response: swb
## Effect df MSE F pes p.value
## 1 edu 2, 223 1.99 3.42 * .030 .035
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '+' 0.1 ' ' 1
ANOVA Effect Size [partial eta-squared] cutoffs from Cohen (1988): * η^2 < 0.01 indicates a trivial effect * η^2 >= 0.01 indicates a small effect * η^2 >= 0.06 indicates a medium effect * η^2 >= 0.14 indicates a large effect
afex_plot(aov_model, x = "edu")
Remember: We ONLY run posthoc IF the ANOVA test is SIGNIFICANT! E.g., only run the posthoc tests on Education Level if there is a main effect for Education Level type
emmeans(aov_model, specs="edu")
## edu emmean SE df lower.CL
## 1 High school diploma or less, and NO COLLEGE 4.07 0.190 223 3.70
## 3 Completed some college, but no longer in college 3.99 0.242 223 3.51
## 5 Completed Bachelors Degree 4.54 0.120 223 4.30
## upper.CL
## 4.44
## 4.46
## 4.77
##
## Confidence level used: 0.95
pairs(emmeans(aov_model, specs="edu", adjust="tukey"))
## contrast
## 1 High school diploma or less, and NO COLLEGE - 3 Completed some college, but no longer in college
## 1 High school diploma or less, and NO COLLEGE - 5 Completed Bachelors Degree
## 3 Completed some college, but no longer in college - 5 Completed Bachelors Degree
## estimate SE df t.ratio p.value
## 0.0844 0.308 223 0.274 0.9594
## -0.4668 0.225 223 -2.074 0.0976
## -0.5512 0.270 223 -2.040 0.1050
##
## P value adjustment: tukey method for comparing a family of 3 estimates
To test my hypothesis that there will be a significant difference in people’s Satisfaction with Life based on their level of Education (having a high school diploma or less; having completed some college, but no longer in college; and having completed a bachelor’s degree), I used a one-way ANOVA. My data for Education Level originally had seven levels, so I dropped four levels to run my one-way ANOVA test. My data was unbalanced, with many more people having completed a Bachelors Degree participating in my survey (n = 137) than who Completed some college, but are no longer in college (n = 34) or participants with a High school diploma or less, and NO COLLEGE (n = 55). This significantly reduces the power of my test and increases the chances of a Type II error.
I found a significant effect of level of Education, F(2, 223) = 3.42, p =.035, ηp2 = .030 (small effect; Cohen, 1988). Posthoc tests using Tukey’s HSD adjustment revealed that participants who have a High school diploma or less, and NO COLLEGE (M =4.07, SE = .19) reported more Satisfaction with Life than those who Completed some college, but are no longer in college (M = 3.99, SE = .24) but less Satisfaction with Life than those who Completed a Bachelors Degree (M = 4.54, SE = .12); participants who Completed a Bachelors Degree reported the highest amount of Satisfaction with Life overall (see Figure 1 for a comparison).
References
Cohen J. (1988). Statistical Power Analysis for the Behavioral Sciences. New York, NY: Routledge Academic.