Quantitative Workshop #6: Visualizations

Hello everyone! Here is the code I talked about during out workshop today. You can reach out to me with questions regarding this code or anything else R/stats related: hlperkin@purdue.edu. Enjoy!

###############################################################################
# load libraries ----------------------------------------------------------

library(ggplot2) #needed for all but the most basic plots
library(tidyverse) #used to reformat data from wide to long
library(Rmisc) #used to help create some of the plots
library("tm") #used to get text data into R for wordcloud
library("wordcloud") #used to create the wordcloud
library(psych) #to create correlation matrix
library(corrplot) #for correlation plots
library(DT) #for interactive tables
library(afex) #for ANOVAs
library(lmtest) #for regressions
library(ggfortify) #for regressions
library(car) #for regressions
library(sjPlot) #for regressions
###############################################################################

###############################################################################
# import data -------------------------------------------------------------

data <- read.csv(file="Data/final_merged.csv", header = T)
data <- subset(data, select=c(1,3,4,6,7,8))
data_mf <- subset(data, gender_rc == "M" | gender_rc == "F")
data_long <- gather(data, variable, value, salg_pre:vbs_post, factor_key = T)
data_long_mf <- subset(data_long, gender_rc == "M" | gender_rc == "F")
###############################################################################

###############################################################################
# interactive tables ------------------------------------------------------

summary <- describe(data_mf[2:5]) #generates summary statistics from dataframe
summary2 <- subset(summary, select=c(n, mean, sd, min, max, skew, kurtosis, se)) #subsets to variables we're interested in examining for univariate normality

# use the datatable() command to create the table, and the format() commands to style it
datatable(summary2, colnames = c("N","Mean","SD","Min","Max","Skewness","Kurotsis","SE"), rownames = c("Pre-Test SALG","Pre-Test VBS","Post-Test SALG","Post-Test VBS")) %>%
  formatRound(columns = c(2:8), digits = 2) %>%
  formatStyle('skew', backgroundColor = styleInterval(cuts = c(-1,1), values = c('yellow','white','yellow'))) %>%
  formatStyle('kurtosis', backgroundColor = styleInterval(cuts = c(-1,1), values = c('yellow','white','yellow')))

###############################################################################

###############################################################################
# correlation plots -------------------------------------------------------

# create a correlation matrix using the corr.test() command from the psych package
corrout <- corr.test(data_mf[2:5], use="pairwise", adjust="holm")

# plot it using variation on corrplot() command for corrplot package
corrplot.mixed(corrout$r, p.mat = corrout$p,
               lower = "number", upper = "circle",
               lower.col = "black",
               insig = "pch", pch.col = "red")

###############################################################################

###############################################################################
# plot pre/post comparisons -----------------------------------------------

# reformat data -----------------------------------------------------------
data_long_mf$variable <- as.character(data_long_mf$variable)
data_long_mf_salg <- separate(data = data_long_mf, variable, c("varname","time"), sep = "_")
data_long_mf_salg <- subset(data_long_mf_salg, varname == "salg")

# fit the model
fit1 <- aov_ez(data = data_long_mf_salg, id = "id", dv = "value", between = c("gender_rc"), within = c("time"))

# examine the equality of variance using levene's test. if more than two levels of within-subjects variable check sphericity using mauchly's test
test_levene(fit1)

## Levene's Test for Homogeneity of Variance (center = center)
##       Df F value Pr(>F)
## group  1  1.0688 0.3065
##       47

# levene's test not significant, can interpret results
summary(fit1)

## 
## Univariate Type III Repeated-Measures ANOVA Assuming Sphericity
## 
##                Sum Sq num Df Error SS den Df  F value  Pr(>F)    
## (Intercept)    968.29      1   46.249     47 984.0038 < 2e-16 ***
## gender_rc        0.09      1   46.249     47   0.0961 0.75797    
## time             1.28      1   13.481     47   4.4518 0.04022 *  
## gender_rc:time   0.59      1   13.481     47   2.0705 0.15680    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# plot interaction using afex_plot() command
afex_plot(fit1, x = "time", trace = "gender_rc",
                   legend_title = "Gender",
          factor_levels = list(time = c("Pre-Semester","Post-Semester"))) +
          labs(caption = "Figure 1: Comparison of SALG scores at beginning and ending of semester") + xlab("SALG Scores") + ylab("Value")

###############################################################################

###############################################################################
# plot moderated regression with baseline control -------------------------

# recode gender as a factor - not required, just good practice
data_mf$gender_rc <- as.factor(data_mf$gender_rc)

# fit the model
fit2 <- lm(data = data_mf, formula = salg_post ~ salg_pre*gender_rc + vbs_pre*gender_rc)

# test for independence of residuals
dwtest(fit2)

## 
##  Durbin-Watson test
## 
## data:  fit2
## DW = 1.8974, p-value = 0.3078
## alternative hypothesis: true autocorrelation is greater than 0

# examine homoscedasticity (spread-location) and residuals (qq-plot)
autoplot(fit2)

# examine multicollinearity (vif is only for non-categorical variables)
vif(fit2)

##           salg_pre          gender_rc            vbs_pre salg_pre:gender_rc 
##           1.374278         127.065509           1.774711          40.084473 
##  gender_rc:vbs_pre 
##          65.503190

# everything is ok, so it's safe to interpret results
summary(fit2)

## 
## Call:
## lm(formula = salg_post ~ salg_pre * gender_rc + vbs_pre * gender_rc, 
##     data = data_mf)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.92870 -0.44815  0.02545  0.48005  1.60951 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          1.90354    1.03529   1.839 0.072878 .  
## salg_pre             0.57081    0.15835   3.605 0.000807 ***
## gender_rcM          -3.31663    2.53917  -1.306 0.198434    
## vbs_pre             -0.02483    0.29463  -0.084 0.933219    
## salg_pre:gender_rcM  0.37860    0.40346   0.938 0.353284    
## gender_rcM:vbs_pre   0.51032    0.52327   0.975 0.334893    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7123 on 43 degrees of freedom
##   (59 observations deleted due to missingness)
## Multiple R-squared:  0.3309, Adjusted R-squared:  0.2531 
## F-statistic: 4.253 on 5 and 43 DF,  p-value: 0.003136

# plot model using plot_model() command from sjPlot package
plot_model(model = fit2, type = "int")

## [[1]]

## 
## [[2]]

rplot <- plot_model(model = fit2, type = "int") #save plots as an object
rplot_salg <- rplot[[1]] #extract plots into their own objects
rplot_vbs <- rplot[[2]]

# refine plots using ggplot2 package
rplotcaption2 <- str_wrap("Figure 1: Relationship between pre- and post-test VBS and SALG scores, as moderated by gender and controlling for pre-test SALG", 50)

rplot_vbs +
  labs(caption = rplotcaption2, color = "Gender") + xlab("VBS") + ylab("Value") +
  ylim(0,6)

# fit a second model without pre-test SALG data
fit3 <- lm(data = data_mf, formula = salg_post ~ vbs_pre*gender_rc)

#plot second model
rplot <- plot_model(model = fit3, type = "int")
rplotcaption3 <- str_wrap("Figure 2: Relationship between pre- and post-test VBS and SALG scores, as moderated by gender and without controlling for pre-test SALG", 50)
rplot +
  labs(caption = rplotcaption3, color = "Gender") + xlab("VBS") + ylab("Value") +
  ylim(0,6)

Quantitative Workshop #6: Visualizations

Heather Perkins

6/26/2021