library(semPlot)
library(lavaan)
## This is lavaan 0.6-17
## lavaan is FREE software! Please report any bugs.
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(qgraph)
library(magrittr)
# install.packages(c("semPlot", "lavaan", "dplyr", "qgraph", "magrittr"))
#First Input data
# set directory
# you will need to tell R where to find the data
#below is the location of data for me, so change to be relevant for you
#setwd("C:\Users\klmcdonald2\OneDrive - The University of Alabama\Documents\SEMsp25")
#create r data table
#name dataset "ex1sem"
# read data with col names
ex1sem<- readxl::read_xlsx("./hw1_data rev.xlsx")
# define missing values
ex1sem[ex1sem == -99] <- NA #setting missing values
print(ex1sem)
## # A tibble: 20 × 4
## HW1 HW2 HW3 FINAL
## <dbl> <dbl> <dbl> <dbl>
## 1 73 80 75 82
## 2 93 95 93 96
## 3 89 91 90 90
## 4 56 98 75 80
## 5 73 66 70 77
## 6 53 46 55 55
## 7 69 74 77 70
## 8 47 56 90 70
## 9 87 79 90 92
## 10 79 70 88 85
## 11 69 75 73 70
## 12 70 65 74 72
## 13 78 95 91 88
## 14 79 80 73 80
## 15 70 73 85 82
## 16 93 62 82 90
## 17 78 75 68 76
## 18 62 90 93 90
## 19 88 92 86 91
## 20 95 82 88 93
# get means for variables in data frame mydata
# excluding missing values
sapply(ex1sem, mean, na.rm=TRUE)
## HW1 HW2 HW3 FINAL
## 75.05 77.20 80.80 81.45
# mean,median,25th and 75th quartiles,min,max
summary(ex1sem)
## HW1 HW2 HW3 FINAL
## Min. :47.00 Min. :46.00 Min. :55.00 Min. :55.00
## 1st Qu.:69.00 1st Qu.:69.00 1st Qu.:73.75 1st Qu.:75.00
## Median :75.50 Median :77.00 Median :83.50 Median :82.00
## Mean :75.05 Mean :77.20 Mean :80.80 Mean :81.45
## 3rd Qu.:87.25 3rd Qu.:90.25 3rd Qu.:90.00 3rd Qu.:90.00
## Max. :95.00 Max. :98.00 Max. :93.00 Max. :96.00
#install psych package
# can get basic descriptives
library(psych)
##
## Attaching package: 'psych'
## The following object is masked from 'package:lavaan':
##
## cor2cov
describeFast(ex1sem)
##
## Number of observations = 20 of which 20 are complete cases. Number of variables = 4 of which 0 are numeric and 0 are factors
##
## To list the items and their counts, print with short = FALSE
describe(ex1sem,na.rm=TRUE, skew = TRUE, ranges = TRUE,
type=3,check=TRUE,fast=TRUE,quant=NULL,IQR=FALSE,omit=FALSE,data=NULL)
## vars n mean sd median min max range skew kurtosis se
## HW1 1 20 75.05 13.64 75.5 47 95 48 -0.32 -0.87 3.05
## HW2 2 20 77.20 13.95 77.0 46 98 52 -0.34 -0.70 3.12
## HW3 3 20 80.80 10.26 83.5 55 93 38 -0.66 -0.39 2.29
## FINAL 4 20 81.45 10.41 82.0 55 96 41 -0.68 -0.23 2.33
# model statements and request summary statistics
#install lavaan package if you haven't already
#library activates the package
library (lavaan)
# agg regressed on gender
model.1 <- ('FINAL ~ HW1+HW2+HW3
HW1 ~~ HW2
HW1 ~~ HW3
HW2 ~~ HW3')
fit <-sem(model.1,data=ex1sem, meanstructure=TRUE)
print(summary (fit, standardized=TRUE, rsquare= TRUE) )
## lavaan 0.6.17 ended normally after 70 iterations
##
## Estimator ML
## Optimization method NLMINB
## Number of model parameters 14
##
## Number of observations 20
##
## Model Test User Model:
##
## Test statistic 0.000
## Degrees of freedom 0
##
## Parameter Estimates:
##
## Standard errors Standard
## Information Expected
## Information saturated (h1) model Structured
##
## Regressions:
## Estimate Std.Err z-value P(>|z|) Std.lv Std.all
## FINAL ~
## HW1 0.353 0.059 6.040 0.000 0.353 0.463
## HW2 0.205 0.062 3.328 0.001 0.205 0.275
## HW3 0.467 0.084 5.555 0.000 0.467 0.460
##
## Covariances:
## Estimate Std.Err z-value P(>|z|) Std.lv Std.all
## HW1 ~~
## HW2 71.440 43.471 1.643 0.100 71.440 0.395
## HW3 52.960 32.008 1.655 0.098 52.960 0.398
## HW2 ~~
## HW3 71.740 34.390 2.086 0.037 71.740 0.527
##
## Intercepts:
## Estimate Std.Err z-value P(>|z|) Std.lv Std.all
## .FINAL 1.327 5.979 0.222 0.824 1.327 0.131
## HW1 75.050 2.973 25.246 0.000 75.050 5.645
## HW2 77.200 3.041 25.386 0.000 77.200 5.676
## HW3 80.800 2.237 36.124 0.000 80.800 8.078
##
## Variances:
## Estimate Std.Err z-value P(>|z|) Std.lv Std.all
## .FINAL 9.609 3.039 3.162 0.002 9.609 0.093
## HW1 176.748 55.892 3.162 0.002 176.748 1.000
## HW2 184.960 58.489 3.162 0.002 184.960 1.000
## HW3 100.060 31.642 3.162 0.002 100.060 1.000
##
## R-Square:
## Estimate
## FINAL 0.907
# The Std.all column contains the fully standardized estimates
# the Std.lv column contains the estimates standardized on the
# variances of latent only (not relevant if no latent variables, as is case here)
# ==============================================================================
# Helper Function: Add Significance Visuals (Flexible)
# ------------------------------------------------------------------------------
# Arguments:
# type: "std" (Standardized) or "est" (Unstandardized)
# digits: Number of decimal places (default = 2)
# ==============================================================================
add_sig_visuals <- function(plot_object, fit_object, type = "std", digits = 2) {
# 1. Process Parameters
params <- parameterEstimates(fit_object, standardized = TRUE) %>%
filter(op %in% c("~", "~~", "=~")) %>%
mutate(
# Select value based on user input ("est" or "std")
val_to_show = if(type == "est") est else std.all,
# Assign significance stars
stars = case_when(
pvalue < 0.001 ~ "***",
pvalue < 0.01 ~ "**",
pvalue < 0.05 ~ "*",
TRUE ~ ""
),
# Define line type: Solid (1) if significant, Dashed (2) otherwise
target_lty = ifelse(pvalue < 0.05, 1, 2),
# Format label: Dynamic rounding + stars
new_label = paste0(sprintf(paste0("%.", digits, "f"), val_to_show), stars)
)
# 2. Map Nodes
node_names <- plot_object$graphAttributes$Nodes$names
# 3. Update Edges
num_edges <- length(plot_object$Edgelist$from)
for(i in 1:num_edges) {
from_node <- node_names[plot_object$Edgelist$from[i]]
to_node <- node_names[plot_object$Edgelist$to[i]]
match_row <- params %>%
filter((lhs == from_node & rhs == to_node) |
(lhs == to_node & rhs == from_node))
if(nrow(match_row) > 0) {
plot_object$graphAttributes$Edges$lty[i] <- match_row$target_lty[1]
plot_object$graphAttributes$Edges$labels[i] <- match_row$new_label[1]
}
}
return(plot_object)
}
# Unstandardized plot
print(semPaths(fit,
whatLabels = "est",
layout = "tree",
rotation = 2,
style = "ram",
exoVar = FALSE,
residuals = TRUE,
intercepts = FALSE,
curvature = 2,
sizeMan = 8,
sizeMan2 = 5,
edge.label.cex = 0.9,
mar = c(4, 5, 4, 6),
nCharNodes = 0,
edge.color = "black",
fade = F,DoNotPlot = TRUE)%>%
add_sig_visuals(fit,type="est") %>%
plot())
## $mar
## [1] 0 0 0 0
##
## $bg
## [1] "white"
# standardized plot
p <- semPaths(fit,
whatLabels = "std",
layout = "tree",
rotation = 2,
style = "ram",
exoVar = FALSE,
residuals = TRUE,
intercepts = FALSE,
curvature = 2,
sizeMan = 8,
sizeMan2 = 5,
edge.label.cex = 0.9,
mar = c(4, 5, 4, 6),
nCharNodes = 0,
edge.color = "black",
fade = F,DoNotPlot = TRUE)%>%
add_sig_visuals(fit) %>%
plot()
print(p)
## $mar
## [1] 0 0 0 0
##
## $bg
## [1] "white"
To examine the hypothesis that final exam performance varies as a function of homework scores, a path analysis was conducted to examine how HW1, HW2, and HW3 were uniquely related to the FINAL exam score. The analysis significantly predicted 90.7% of the variance in final exam scores (\(R^2 = .91\), \(p < .001\)). HW1 scores were positively related to final exam performance (\(b = .35\), \(\beta = .463\), \(SE = .059\), \(p < .001\)), as were HW2 scores (\(b = .21\), \(\beta = .275\), \(SE = .062\), \(p = .001\)) and HW3 scores (\(b = .47\), \(\beta = .460\), \(SE = .084\), \(p < .001\)), such that higher scores on homework assignments were associated with higher grades on the final exam.