library(afex) # to run the ANOVA and plot results
library(psych) # for the describe() command
library(ggplot2) # to visualize our results
library(expss) # for the cross_cases() command
library(car) # for the leveneTest() command
library(emmeans) # for posthoc tests
ANOVA Lab
Loading Libraries
Importing Data
<- read.csv(file="Data/mydata.csv", header=T)
d #
# # new code! this adds a column with a number for each row. it acts as an identifier for our participations
$row_id <- 1:nrow(d) d
State Your Hypothesis - PART OF YOUR WRITEUP
One-Way ANOVA: People who have no college education will be lower in self efficacy than than those with a degree or who are pursuing graduate education. # Check Your Assumptions
ANOVA Assumptions
- Independence of observations (confirmed by data report)
- All levels of the IVs should have equal number of cases (ideally; in the real world, this varies) and there should be no empty cells. Cells with low numbers increase chance of Type II error. (we will check this below)
- Homogeneity of variance should be assured (we will check this below)
- Outliers should be identified and removed (we will check this below)
- DV should be normally distributed for each level of the IV (we will check this below)
Check levels of IVs
# # for a one-way ANOVA
# # COMMENT THIS SECTION OUR FROM THE HW IF YOU DO NOT USE IT
table(d$edu, useNA = "always")
1 High school diploma or less, and NO COLLEGE
57
2 Currently in college
2544
3 Completed some college, but no longer in college
35
4 Complete 2 year College degree
181
5 Completed Bachelors Degree
138
6 Currently in graduate education
135
7 Completed some graduate degree
59
<NA>
0
# # if you need to recode
# # to drop levels from your variable
# # this subsets the data and says that any participant who is coded as 'LEVEL BAD' should be removed
# # if you don't need this for the homework, comment it out (add a # at the beginning of the line)
#d <- subset(d, pet != "cat and dog")
#d <- subset(d, pet != "fish")
#d <- subset(d, pet != "multiple types of pet")
#
# # to combine levels
# # this says that where any participant is coded as 'LEVEL BAD' it should be replaced by 'LEVEL GOOD'
# # you can repeat this as needed, changing 'LEVEL BAD' if you have multiple levels that you want to combine into a single level
# # if you don't need this for the homework, comment it out (add a # at the beginning of the line)
$edu_rc[d$edu == "1 High school diploma or less, and NO COLLEGE"] <- "No Degree"
d$edu_rc[d$edu == "2 Currently in college"] <- "No Degree"
d$edu_rc[d$edu == "3 Completed some college, but no longer in college"] <- "No Degree"
d$edu_rc[d$edu == "4 Completed 2 year College degree"] <- "Degree"
d$edu_rc[d$edu == "5 Completed Bachelors Degree"] <- "Degree"
d$edu_rc[d$edu == "6 Currently in graduate education"] <- "Graduate Education"
d$edu_rc[d$edu == "7 Completed some graduate degree"] <- "Graduate Education"
d<- subset(d, edu_rc != "<NA>")
d
# # preview your changes and make sure everything is correct
table(d$edu_rc, useNA = "always")
Degree Graduate Education No Degree <NA>
138 194 2636 0
# #
#
# # check your variable types
str(d)
'data.frame': 2968 obs. of 8 variables:
$ mindful : num 2.4 1.8 2.2 2.2 3.2 ...
$ socmeduse: int 47 23 34 35 37 13 37 43 37 29 ...
$ npi : num 0.6923 0.1538 0.0769 0.0769 0.7692 ...
$ efficacy : num 3.4 3.4 2.2 2.8 3 2.4 2.3 3 3 3.7 ...
$ gender : chr "f" "m" "m" "f" ...
$ edu : chr "2 Currently in college" "5 Completed Bachelors Degree" "2 Currently in college" "2 Currently in college" ...
$ row_id : int 1 2 3 4 5 6 7 8 9 10 ...
$ edu_rc : chr "No Degree" "Degree" "No Degree" "No Degree" ...
# # make sure that your IV is recognized as a factor by R
$edu <- as.factor(d$edu) d
Check homogeneity of variance
# # use the leveneTest() command from the car package to test homogeneity of variance
# # uses the 'formula' setup: formula is y~x1*x2, where y is our DV and x1 is our first IV and x2 is our second IV- FOR LEVENE"S TEST, NONE SIGNFICANCE IS GOOD
# # for a one-way ANOVA
# # COMMENT THIS SECTION OUT FROM THE HW IF YOU DO NOT USE IT
leveneTest(efficacy~edu, data = d)
Levene's Test for Homogeneity of Variance (center = median)
Df F value Pr(>F)
group 5 2.8165 0.01526 *
2962
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Check for outliers using Cook’s distance and Residuals vs Leverage plot
Run a Regression
# # use this commented out section only if you need to remove outliers
# # to drop a single outlier, remove the # at the beginning of the line and use this code:
# # d <- subset(d, row_id!=c(1108))
#
# # to drop multiple outliers, remove the # at the beginning of the line and use this code:
# # d <- subset(d, row_id!=c(1108) & row_id!=c(602))
#
# use the lm() command to run the regression
# # formula is y~x1*x2 + c, where y is our DV, x1 is our first IV, x2 is our second IV, and c is our covariate
#
# # for a one-way ANOVA
# # COMMENT THIS SECTION OUR FROM THE HW IF YOU DO NOT USE IT
<- lm(efficacy ~ edu, data = d) #for one-way reg_model
Check for outliers
# # for a one-way ANOVA
# # COMMENT THIS SECTION OUR FROM THE HW IF YOU DO NOT USE IT
# # Cook's distance - cutoff is .5 not .05!!!!
plot(reg_model, 4)
#
# # Residuals vs Leverage-- look for wobble in the red line or any participants outside the dasked lines for Cook's distance
plot(reg_model, 5)
Check Your Variables
# # we'll use the describeBy() command to view skew and kurtosis across our IVs and make sure the DV is normally distributed across all of the levels
describeBy(d$efficacy, group = d$edu_rc)
Descriptive statistics by group
group: Degree
vars n mean sd median trimmed mad min max range skew kurtosis se
X1 1 138 3.21 0.43 3.2 3.21 0.44 1.6 4 2.4 -0.32 0.78 0.04
------------------------------------------------------------
group: Graduate Education
vars n mean sd median trimmed mad min max range skew kurtosis se
X1 1 194 3.19 0.47 3.1 3.19 0.44 1.7 4 2.3 -0.11 -0.34 0.03
------------------------------------------------------------
group: No Degree
vars n mean sd median trimmed mad min max range skew kurtosis se
X1 1 2636 3.11 0.45 3.1 3.12 0.44 1.1 4 2.9 -0.26 0.58 0.01
Issues with My Data - PART OF YOUR WRITEUP
For our One-Way ANOVA, we dropped any participants who did not answer, or were NA. We also compiled our education levels into three: no degree, college degree, and higher graduate education. After performing levene’s test, we did find significance at 0.015, meaning we reject the null hypothesis of homogeneity. This means that our independent variable may not be equal across all levels of our independent variable, and we need take this into consideration. Skew and kurtosis were okay, within the bounds, for all of our levels of our dependent variable. There were no outliers.
Run an ANOVA
# # for a one-way ANOVA
# # COMMENT THIS SECTION OUR FROM THE HW IF YOU DO NOT USE IT
<- aov_ez(data = d,
aov_model id = "row_id",
between = c("edu_rc"),
dv = "efficacy",
anova_table = list(es = "pes"))
Converting to factor: edu_rc
Contrasts set to contr.sum for the following variables: edu_rc
View Output
Effect size cutoffs from Cohen (1988):
- η2 = 0.01 indicates a small effect
- η2 = 0.06 indicates a medium effect
- η2 = 0.14 indicates a large effect
# # for a one-way ANOVA
# # COMMENT THIS SECTION OUR FROM THE HW IF YOU DO NOT USE IT
nice(aov_model)
Anova Table (Type 3 tests)
Response: efficacy
Effect df MSE F pes p.value
1 edu_rc 2, 2965 0.20 5.17 ** .003 .006
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '+' 0.1 ' ' 1
Visualize Results
# # for a one-way ANOVA
# # COMMENT THIS SECTION OUR FROM THE HW IF YOU DO NOT USE IT
afex_plot(aov_model, x = "edu_rc")
Run Posthoc Tests
Only run posthocs if the test is significant! E.g., only run the posthoc tests on gender if there is a main effect for gender.
# # for a one-way ANOVA
# # COMMENT THIS SECTION OUR FROM THE HW IF YOU DO NOT USE IT
emmeans(aov_model, specs="edu_rc", adjust="tukey")
Note: adjust = "tukey" was changed to "sidak"
because "tukey" is only appropriate for one set of pairwise comparisons
edu_rc emmean SE df lower.CL upper.CL
Degree 3.21 0.03800 2965 3.11 3.30
Graduate Education 3.19 0.03200 2965 3.11 3.27
No Degree 3.11 0.00869 2965 3.09 3.13
Confidence level used: 0.95
Conf-level adjustment: sidak method for 3 estimates
pairs(emmeans(aov_model, specs="edu_rc", adjust="tukey"))
contrast estimate SE df t.ratio p.value
Degree - Graduate Education 0.0164 0.0497 2965 0.330 0.9416
Degree - No Degree 0.0927 0.0389 2965 2.379 0.0458
Graduate Education - No Degree 0.0763 0.0332 2965 2.298 0.0562
P value adjustment: tukey method for comparing a family of 3 estimates
Write Up Results
One-Way ANOVA
We believed that those with lower education levels would have lower self efficacy. While preparing to run a one-way ANOVA, we dropped participants who did not answer. We also did not have homogeneity, but showed heterogeneity. After performing our ANOVA, we analyzed our results. The ANOVA did show significance across education levels with the p-value at .006. However, the effect size was small at .003. This means that the practical significance may be very limited. Our research would benefit from a different kind of test. After running posthoc tests, we were able to better understand the differences. The marginal means show that those who reported having a college degree had the highest self-efficacy, closely followed by graduate educations, then a little more behind were those with no degree. Tukey’s showed that there was not a significant difference between those with a degree and graduate education while those with a degree report significantly higher efficacy than those without. Graduate education and no degree is right above the cut off of significance. Figure 1 shows these differences.
F(2,2965) = 5.117, p = .006, ηp2 < .01
References
Cohen J. (1988). Statistical Power Analysis for the Behavioral Sciences. New York, NY: Routledge Academic.