Hello everyone! Here is the code I talked about during out workshop today. You can reach out to me with questions regarding this code or anything else R/stats related: hlperkin@purdue.edu. Enjoy!
###############################################################################
# load libraries ----------------------------------------------------------
library(ggplot2) #needed for all but the most basic plots
library(tidyverse) #used to reformat data from wide to long
library(Rmisc) #used to help create some of the plots
library("tm") #used to get text data into R for wordcloud
library("wordcloud") #used to create the wordcloud
library(psych) #to create correlation matrix
library(corrplot) #for correlation plots
library(DT) #for interactive tables
library(afex) #for ANOVAs
library(lmtest) #for regressions
library(ggfortify) #for regressions
library(car) #for regressions
library(sjPlot) #for regressions
###############################################################################
###############################################################################
# import data -------------------------------------------------------------
data <- read.csv(file="Data/final_merged.csv", header = T)
data <- subset(data, select=c(1,3,4,6,7,8))
data_mf <- subset(data, gender_rc == "M" | gender_rc == "F")
data_long <- gather(data, variable, value, salg_pre:vbs_post, factor_key = T)
data_long_mf <- subset(data_long, gender_rc == "M" | gender_rc == "F")
###############################################################################
###############################################################################
# interactive tables ------------------------------------------------------
summary <- describe(data_mf[2:5]) #generates summary statistics from dataframe
summary2 <- subset(summary, select=c(n, mean, sd, min, max, skew, kurtosis, se)) #subsets to variables we're interested in examining for univariate normality
# use the datatable() command to create the table, and the format() commands to style it
datatable(summary2, colnames = c("N","Mean","SD","Min","Max","Skewness","Kurotsis","SE"), rownames = c("Pre-Test SALG","Pre-Test VBS","Post-Test SALG","Post-Test VBS")) %>%
formatRound(columns = c(2:8), digits = 2) %>%
formatStyle('skew', backgroundColor = styleInterval(cuts = c(-1,1), values = c('yellow','white','yellow'))) %>%
formatStyle('kurtosis', backgroundColor = styleInterval(cuts = c(-1,1), values = c('yellow','white','yellow')))
###############################################################################
###############################################################################
# correlation plots -------------------------------------------------------
# create a correlation matrix using the corr.test() command from the psych package
corrout <- corr.test(data_mf[2:5], use="pairwise", adjust="holm")
# plot it using variation on corrplot() command for corrplot package
corrplot.mixed(corrout$r, p.mat = corrout$p,
lower = "number", upper = "circle",
lower.col = "black",
insig = "pch", pch.col = "red")
###############################################################################
###############################################################################
# plot pre/post comparisons -----------------------------------------------
# reformat data -----------------------------------------------------------
data_long_mf$variable <- as.character(data_long_mf$variable)
data_long_mf_salg <- separate(data = data_long_mf, variable, c("varname","time"), sep = "_")
data_long_mf_salg <- subset(data_long_mf_salg, varname == "salg")
# fit the model
fit1 <- aov_ez(data = data_long_mf_salg, id = "id", dv = "value", between = c("gender_rc"), within = c("time"))
# examine the equality of variance using levene's test. if more than two levels of within-subjects variable check sphericity using mauchly's test
test_levene(fit1)
## Levene's Test for Homogeneity of Variance (center = center)
## Df F value Pr(>F)
## group 1 1.0688 0.3065
## 47
# levene's test not significant, can interpret results
summary(fit1)
##
## Univariate Type III Repeated-Measures ANOVA Assuming Sphericity
##
## Sum Sq num Df Error SS den Df F value Pr(>F)
## (Intercept) 968.29 1 46.249 47 984.0038 < 2e-16 ***
## gender_rc 0.09 1 46.249 47 0.0961 0.75797
## time 1.28 1 13.481 47 4.4518 0.04022 *
## gender_rc:time 0.59 1 13.481 47 2.0705 0.15680
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# plot interaction using afex_plot() command
afex_plot(fit1, x = "time", trace = "gender_rc",
legend_title = "Gender",
factor_levels = list(time = c("Pre-Semester","Post-Semester"))) +
labs(caption = "Figure 1: Comparison of SALG scores at beginning and ending of semester") + xlab("SALG Scores") + ylab("Value")
###############################################################################
###############################################################################
# plot moderated regression with baseline control -------------------------
# recode gender as a factor - not required, just good practice
data_mf$gender_rc <- as.factor(data_mf$gender_rc)
# fit the model
fit2 <- lm(data = data_mf, formula = salg_post ~ salg_pre*gender_rc + vbs_pre*gender_rc)
# test for independence of residuals
dwtest(fit2)
##
## Durbin-Watson test
##
## data: fit2
## DW = 1.8974, p-value = 0.3078
## alternative hypothesis: true autocorrelation is greater than 0
# examine homoscedasticity (spread-location) and residuals (qq-plot)
autoplot(fit2)
# examine multicollinearity (vif is only for non-categorical variables)
vif(fit2)
## salg_pre gender_rc vbs_pre salg_pre:gender_rc
## 1.374278 127.065509 1.774711 40.084473
## gender_rc:vbs_pre
## 65.503190
# everything is ok, so it's safe to interpret results
summary(fit2)
##
## Call:
## lm(formula = salg_post ~ salg_pre * gender_rc + vbs_pre * gender_rc,
## data = data_mf)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.92870 -0.44815 0.02545 0.48005 1.60951
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.90354 1.03529 1.839 0.072878 .
## salg_pre 0.57081 0.15835 3.605 0.000807 ***
## gender_rcM -3.31663 2.53917 -1.306 0.198434
## vbs_pre -0.02483 0.29463 -0.084 0.933219
## salg_pre:gender_rcM 0.37860 0.40346 0.938 0.353284
## gender_rcM:vbs_pre 0.51032 0.52327 0.975 0.334893
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7123 on 43 degrees of freedom
## (59 observations deleted due to missingness)
## Multiple R-squared: 0.3309, Adjusted R-squared: 0.2531
## F-statistic: 4.253 on 5 and 43 DF, p-value: 0.003136
# plot model using plot_model() command from sjPlot package
plot_model(model = fit2, type = "int")
## [[1]]
##
## [[2]]
rplot <- plot_model(model = fit2, type = "int") #save plots as an object
rplot_salg <- rplot[[1]] #extract plots into their own objects
rplot_vbs <- rplot[[2]]
# refine plots using ggplot2 package
rplotcaption2 <- str_wrap("Figure 1: Relationship between pre- and post-test VBS and SALG scores, as moderated by gender and controlling for pre-test SALG", 50)
rplot_vbs +
labs(caption = rplotcaption2, color = "Gender") + xlab("VBS") + ylab("Value") +
ylim(0,6)
# fit a second model without pre-test SALG data
fit3 <- lm(data = data_mf, formula = salg_post ~ vbs_pre*gender_rc)
#plot second model
rplot <- plot_model(model = fit3, type = "int")
rplotcaption3 <- str_wrap("Figure 2: Relationship between pre- and post-test VBS and SALG scores, as moderated by gender and without controlling for pre-test SALG", 50)
rplot +
labs(caption = rplotcaption3, color = "Gender") + xlab("VBS") + ylab("Value") +
ylim(0,6)