1 Loading Libraries

#install.packages("afex")
#install.packages("emmeans")
#install.packages("ggbeeswarm")
#install.packages("expss")

library(psych) # for the describe() command
library(ggplot2) # to visualize our results
library(expss) # for the cross_cases() command
library(car) # for the leveneTest() command
library(afex) # to run the ANOVA 
library(ggbeeswarm) # to run plot results
library(emmeans) # for posthoc tests

2 Importing Data

# For HW, import the project dataset you cleaned previously this will be the dataset you'll use throughout the rest of the semester

d <- read.csv(file="projectdata.csv", header=T)


# new code! this adds a column with a number for each row. It will make it easier if we need to drop outliers later
d$row_id <- 1:nrow(d)

3 One-Way ANOVA

3.1 State Your Hypothesis

I predict that there will be a significant difference in people’s Satisfaction with Life based on their level of Education (having a high school diploma or less; having completed some college, but no longer in college; and having completed a bachelor’s degree).

3.2 Droping Unused Levels

describe(d)
##              vars    n    mean     sd  median trimmed     mad min  max  range
## ResponseID*     1 3126 1563.50 902.54 1563.50 1563.50 1158.65 1.0 3126 3125.0
## sibling*        2 3126    1.10   0.29    1.00    1.00    0.00 1.0    2    1.0
## edu*            3 3126    2.51   1.25    2.00    2.18    0.00 1.0    7    6.0
## npi             4 3126    0.28   0.31    0.15    0.24    0.23 0.0    1    1.0
## swb             5 3126    4.47   1.32    4.67    4.53    1.48 1.0    7    6.0
## moa_maturity    6 3126    3.59   0.43    3.67    3.65    0.49 1.0    4    3.0
## efficacy        7 3126    3.13   0.45    3.10    3.13    0.44 1.1    4    2.9
## row_id          8 3126 1563.50 902.54 1563.50 1563.50 1158.65 1.0 3126 3125.0
##               skew kurtosis    se
## ResponseID*   0.00    -1.20 16.14
## sibling*      2.74     5.52  0.01
## edu*          2.19     3.69  0.02
## npi           0.94    -0.69  0.01
## swb          -0.36    -0.46  0.02
## moa_maturity -1.20     1.87  0.01
## efficacy     -0.24     0.45  0.01
## row_id        0.00    -1.20 16.14
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.5.3
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
## 
##     recode
## The following objects are masked from 'package:expss':
## 
##     compute, contains, na_if, recode, vars, where
## The following objects are masked from 'package:maditr':
## 
##     between, coalesce, first, last
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(forcats)
## Warning: package 'forcats' was built under R version 4.5.3
d <- subset(d, !(edu %in% c("2 Currently in college","4 Complete 2 year College degree","6 Currently in graduate education","7 Completed some graduate degree"))) 

d <- droplevels(d)
  
levels(d$edu)
## NULL

3.3 Check Your Variables

# you only need to check the variables you're using in the current analysis

str(d)
## 'data.frame':    226 obs. of  8 variables:
##  $ ResponseID  : chr  "R_2TGbiBXmAtxywsD" "R_2Quh0h3wxTjZjKP" "R_1gTNDGWsqikPuEX" "R_8en7eLsUe2Mk0Qp" ...
##  $ sibling     : chr  "at least one sibling" "at least one sibling" "only child" "at least one sibling" ...
##  $ edu         : chr  "5 Completed Bachelors Degree" "5 Completed Bachelors Degree" "5 Completed Bachelors Degree" "5 Completed Bachelors Degree" ...
##  $ npi         : num  0.154 0.615 0 0.231 0.615 ...
##  $ swb         : num  4.17 3.67 5.33 5.67 3.33 ...
##  $ moa_maturity: num  3.33 3.67 4 1.33 3.67 ...
##  $ efficacy    : num  3.4 2.3 2.9 3.1 2.9 3 3.8 2.7 3.3 3 ...
##  $ row_id      : int  2 7 16 31 56 57 58 59 62 64 ...
# make our categorical variables of interest "factors"
# because we'll use our newly created row ID variable for this analysis, so make sure it's coded as a factor, too.
d$edu <- as.factor(d$edu) 
d$row_id<- as.factor(d$row_id)


# check that our categorical variables of interest are now factors
str(d)
## 'data.frame':    226 obs. of  8 variables:
##  $ ResponseID  : chr  "R_2TGbiBXmAtxywsD" "R_2Quh0h3wxTjZjKP" "R_1gTNDGWsqikPuEX" "R_8en7eLsUe2Mk0Qp" ...
##  $ sibling     : chr  "at least one sibling" "at least one sibling" "only child" "at least one sibling" ...
##  $ edu         : Factor w/ 3 levels "1 High school diploma or less, and NO COLLEGE",..: 3 3 3 3 3 3 3 3 3 1 ...
##  $ npi         : num  0.154 0.615 0 0.231 0.615 ...
##  $ swb         : num  4.17 3.67 5.33 5.67 3.33 ...
##  $ moa_maturity: num  3.33 3.67 4 1.33 3.67 ...
##  $ efficacy    : num  3.4 2.3 2.9 3.1 2.9 3 3.8 2.7 3.3 3 ...
##  $ row_id      : Factor w/ 226 levels "2","7","16","31",..: 1 2 3 4 5 6 7 8 9 10 ...
# check our DV skew and kurtosis
describe(d$swb)
##    vars   n mean   sd median trimmed  mad min max range skew kurtosis   se
## X1    1 226 4.34 1.43    4.5    4.39 1.48   1   7     6 -0.3    -0.51 0.09
# we'll use the describeBy() command to view our DV's skew and kurtosis across our IVs' levels
describeBy(d$swb, group = d$edu)
## 
##  Descriptive statistics by group 
## group: 1 High school diploma or less, and NO COLLEGE
##    vars  n mean   sd median trimmed  mad min  max range  skew kurtosis   se
## X1    1 55 4.07 1.54   4.17    4.15 1.73   1 6.67  5.67 -0.35    -0.86 0.21
## ------------------------------------------------------------ 
## group: 3 Completed some college, but no longer in college
##    vars  n mean   sd median trimmed  mad min max range  skew kurtosis   se
## X1    1 34 3.99 1.54   4.25    4.02 1.36   1   7     6 -0.31    -0.69 0.26
## ------------------------------------------------------------ 
## group: 5 Completed Bachelors Degree
##    vars   n mean   sd median trimmed  mad min max range  skew kurtosis   se
## X1    1 137 4.54 1.32   4.67    4.57 1.48 1.5   7   5.5 -0.12    -0.72 0.11
# also use histograms to examine your continuous variable
hist(d$swb)

3.4 Check Your Assumptions

3.4.1 ANOVA Assumptions

  • DV should be normally distributed across levels of the IV (we checked previously using “describeBy” function)
  • All levels of the IVs should have an equal number of cases and there should be no empty cells. Cells with low numbers decrease the power of the test (which increases chance of Type II error)
  • Homogeneity of variance should be confirmed (using Levene’s Test)
  • Outliers should be identified and removed – we will actually remove them this time!
  • If you have confirmed everything above, the sampling distribution should be normal.

3.4.2 Check levels of IVs

table(d$edu)
## 
##      1 High school diploma or less, and NO COLLEGE 
##                                                 55 
## 3 Completed some college, but no longer in college 
##                                                 34 
##                       5 Completed Bachelors Degree 
##                                                137
# REMEMBER your test's level of POWER is determined by your SMALLEST subsample

3.4.3 Check for outliers using Cook’s distance and Residuals VS Leverage plot

3.4.3.1 Run a Regression to get both outlier plots

# use the lm() command to run the regression
# formula is y~x, where y is our DV, x is our IV

reg_model <- lm(swb~edu, data = d) 

3.4.3.2 Check for outliers

# Cook's distance
plot(reg_model, 4)

# Residuals VS Leverage
plot(reg_model, 5)

3.4.4 Check homogeneity of variance

# use the leveneTest() command from the car package to test homogeneity of variance
# uses the 'formula' setup: formula is y~x, where y is our DV and x is our IV 
leveneTest(swb~edu, data = d)
## Levene's Test for Homogeneity of Variance (center = median)
##        Df F value Pr(>F)
## group   2  0.9614  0.384
##       223

3.4.5 Issues with My Data

My cell sizes are very unbalanced between the Education Level group levels. A small sample size for one of the levels of our variable (Completed some college, but no longer in college, n = 34) limits my power and increases my Type II error rate.

Levene’s test was insignificant for my three-level Education Level variable with the One-Way ANOVA. YAY!🎉

No outliers were found.

3.5 Run a One-Way ANOVA

aov_model <- aov_ez(data = d,
                    id = "ResponseID",
                    between = c("edu"),
                    dv = "swb",
                    anova_table = list(es = "pes"))
## Contrasts set to contr.sum for the following variables: edu

3.6 View One-Way Output

nice(aov_model)
## Anova Table (Type 3 tests)
## 
## Response: swb
##   Effect     df  MSE      F  pes p.value
## 1    edu 2, 223 1.99 3.42 * .030    .035
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '+' 0.1 ' ' 1

ANOVA Effect Size [partial eta-squared] cutoffs from Cohen (1988): * η^2 < 0.01 indicates a trivial effect * η^2 >= 0.01 indicates a small effect * η^2 >= 0.06 indicates a medium effect * η^2 >= 0.14 indicates a large effect

3.7 Visualize One-Way Results

afex_plot(aov_model, x = "edu")

3.8 Run One-Way Posthoc Tests

Remember: We ONLY run posthoc IF the ANOVA test is SIGNIFICANT! E.g., only run the posthoc tests on Education Level if there is a main effect for Education Level type

emmeans(aov_model, specs="edu")
##  edu                                                emmean    SE  df lower.CL
##  1 High school diploma or less, and NO COLLEGE        4.07 0.190 223     3.70
##  3 Completed some college, but no longer in college   3.99 0.242 223     3.51
##  5 Completed Bachelors Degree                         4.54 0.120 223     4.30
##  upper.CL
##      4.44
##      4.46
##      4.77
## 
## Confidence level used: 0.95
pairs(emmeans(aov_model, specs="edu", adjust="tukey"))
##  contrast                                                                                          
##  1 High school diploma or less, and NO COLLEGE - 3 Completed some college, but no longer in college
##  1 High school diploma or less, and NO COLLEGE - 5 Completed Bachelors Degree                      
##  3 Completed some college, but no longer in college - 5 Completed Bachelors Degree                 
##  estimate    SE  df t.ratio p.value
##    0.0844 0.308 223   0.274  0.9594
##   -0.4668 0.225 223  -2.074  0.0976
##   -0.5512 0.270 223  -2.040  0.1050
## 
## P value adjustment: tukey method for comparing a family of 3 estimates

3.9 Write Up One-Way ANOVA Results

To test my hypothesis that there will be a significant difference in people’s Satisfaction with Life based on their level of Education (having a high school diploma or less; having completed some college, but no longer in college; and having completed a bachelor’s degree), I used a one-way ANOVA. My data for Education Level originally had seven levels, so I dropped four levels to run my one-way ANOVA test. My data was unbalanced, with many more people having completed a Bachelors Degree participating in my survey (n = 137) than who Completed some college, but are no longer in college (n = 34) or participants with a High school diploma or less, and NO COLLEGE (n = 55). This significantly reduces the power of my test and increases the chances of a Type II error.

I found a significant effect of level of Education, F(2, 223) = 3.42, p =.035, ηp2 = .030 (small effect; Cohen, 1988). Posthoc tests using Tukey’s HSD adjustment revealed that participants who have a High school diploma or less, and NO COLLEGE (M =4.07, SE = .19) reported more Satisfaction with Life than those who Completed some college, but are no longer in college (M = 3.99, SE = .24) but less Satisfaction with Life than those who Completed a Bachelors Degree (M = 4.54, SE = .12); participants who Completed a Bachelors Degree reported the highest amount of Satisfaction with Life overall (see Figure 1 for a comparison).

References

Cohen J. (1988). Statistical Power Analysis for the Behavioral Sciences. New York, NY: Routledge Academic.