Osborne & Suddick Children’s Longitudinal Vocabulary Data

Load Vocabulary Data

The data file is available at: https://docs.google.com/document/d/17bGg1gwmfbiygIhWI45n0N1ECs3JGxNJJ-OoJVU4y0Q/edit?usp=sharing

Read in Osborne & Suddick Vocabulary Data

# Read in the data
Osborne <- read.table("osbornesuddick.txt", header = TRUE)
long <- reshape(Osborne, 
             varying = c("wisc_1", "wisc_2", "wisc_3", "wisc_4"), 
             v.names = "score",
             timevar = "time", 
             times = c(0, 1, 2, 3), 
             new.row.names = 1:1000,
             direction = "long")

long.sort <- long[order(long$subno),]
long.sort[1:10,]
##     subno moth_ed time score id
## 1       1     9.5    0 22.13  1
## 205     1     9.5    1 24.97  1
## 409     1     9.5    2 41.76  1
## 613     1     9.5    3 49.91  1
## 2       2     5.5    0  9.17  2
## 206     2     5.5    1 13.91  2
## 410     2     5.5    2 20.10  2
## 614     2     5.5    3 39.10  2
## 3       3    14.0    0 30.03  3
## 207     3    14.0    1 39.27  3
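The same long reshape can also be done with tidyr; a short sketch, assuming the same column names as above:

# Equivalent long reshape using tidyr::pivot_longer
library(tidyr)
long_tidy <- pivot_longer(Osborne, starts_with("wisc_"),
                          names_to = "occasion", values_to = "score")
long_tidy$time <- as.integer(sub("wisc_", "", long_tidy$occasion)) - 1
head(long_tidy[order(long_tidy$subno, long_tidy$time), ], 10)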

Spaghetti Plot of Data (Not in the book)

It is helpful to look at the patterns of individual trajectories across all measurement occasions to gain insight into how growth unfolds in the data. Unfortunately, for large datasets, as you can see below, the resulting plot looks like a solid mass. The Galton squeeze diagram (Campbell & Kenny, 1999, p. 163) addresses this by sorting the data on the values of one measurement occasion and plotting subgroup means. This can be done for any measurement occasion, or even for the average across all measurement occasions; a quintile-mean implementation appears below.

Uses of Spaghetti Plots

Spaghetti plots are useful for determining:

  - whether regression to the mean (RTM) is responsible for longitudinal differences, and
  - whether a different functional form of change exists for higher- versus lower-scoring individuals relative to a measurement occasion.

Forward squeeze (pre → post) and reverse squeeze (post → pre) can be used to emphasize that RTM is bidirectional when variables are imperfectly correlated.

Squeeze plots are also a diagnostic in quasi-experiments/program evaluation. When groups are selected for extreme pretest scores (or via matching), squeeze diagrams help distinguish artifactual “improvement” from real treatment effects.
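As a quick simulated illustration (a sketch with artificial scores, not the Osborne data): with two standard-normal variables correlated 0.6, the top pretest quintile is less extreme at posttest, and the top posttest quintile was less extreme at pretest.

# Bidirectional regression to the mean with simulated scores (r = 0.6)
set.seed(1)
pre  <- rnorm(1000)
post <- 0.6 * pre + sqrt(1 - 0.6^2) * rnorm(1000)
top_pre  <- pre  > quantile(pre,  0.8)
top_post <- post > quantile(post, 0.8)
c(pre = mean(pre[top_pre]),   post = mean(post[top_pre]))   # forward squeeze
c(post = mean(post[top_post]), pre = mean(pre[top_post]))   # reverse squeeze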

Decisions based on Spaghetti Plots

  1. If regression to the mean effects are thought to be present, this constitutes an alternative explanation for a growth model.
  2. The researcher may wish to explore whether some observed variable (such as age-at-testing, sex, family of origin, or SES) indicates different trajectories, in which case a multi-group SEM growth model may be more appropriate (see the sketch after this list).
  3. If different functional forms exist as a function of level, but no single manifest variable or suite of variables appears responsible for the differential trajectories, the researcher may turn to a finite mixture model.
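For decision 2, here is a minimal multi-group growth sketch, assuming a hypothetical median split of moth_ed as the grouping variable (illustrative only, not an analysis from the book):

# Hypothetical two-group linear growth model; med_grp is an illustrative split
Osborne$med_grp <- ifelse(Osborne$moth_ed >= median(Osborne$moth_ed), "high", "low")
lin <- '
  i =~ 1*wisc_1 + 1*wisc_2 + 1*wisc_3 + 1*wisc_4
  s =~ 0*wisc_1 + 1*wisc_2 + 2*wisc_3 + 3*wisc_4
'
fit_mg <- lavaan::growth(lin, data = Osborne, group = "med_grp")
# Constrain the growth-factor means equal across groups and compare
fit_eq <- lavaan::growth(lin, data = Osborne, group = "med_grp",
                         group.equal = "means")
anova(fit_mg, fit_eq)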
#Plots can be done in either lattice or ggplot

#Here's lattice
library(lattice)
xyplot(score ~ time, groups = subno,
       data = long.sort,
       type = "l", xlab = "Time", ylab = "Score",
       main = "Lattice Spaghetti Plot",
       scales = list(x = list(at = c(0, 1, 2, 3))))

#Here's ggplot
library(ggplot2)
library(svglite)
library(cowplot)
ggplot(long, aes(x = time, y = score, group = subno)) +
  geom_line() +
  geom_smooth(method = lm, se = FALSE, formula = y ~ x) +
  background_grid(major = c("xy"),
                  minor = c("none"),
                  size.major = 0.2,
                  colour.major = "grey90",
                  colour.minor = "grey98") +
  labs(
    title = "Ggplot Spaghetti Plot of Scores Over Time",
    subtitle = "Individual trajectories",
    x = "Time",
    y = "Score"
  )

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)

# helper: quintile-mean spaghetti plot with SD error bars + horizontal jitter
make_group_mean_plot <- function(data, sort_by, n_groups = 5,
                                 times = c(0, 1, 2, 3),
                                 jitter_width = 0.06) {
  stopifnot(sort_by %in% c("wisc_1", "wisc_2", "wisc_3", "wisc_4"))

  # Put participants into rank groups (quintiles) by the chosen occasion
  dat_grp <- data %>% mutate(.grp = ntile(.data[[sort_by]], n_groups))

  # Long + group summary stats at each time: mean and SD
  long_stats <- dat_grp %>%
    pivot_longer(starts_with("wisc_"), names_to = "occasion", values_to = "score") %>%
    mutate(time = times[match(occasion,
                              c("wisc_1", "wisc_2", "wisc_3", "wisc_4"))]) %>%
    group_by(.grp, time) %>%
    summarise(
      mean_score = mean(score, na.rm = TRUE),
      sd_score   = sd(score,   na.rm = TRUE),
      n          = sum(!is.na(score)),
      .groups = "drop"
    )

  # Horizontal jitter per group (keeps points, bars, and lines aligned within a group but offset between groups)
  offsets <- seq(-(n_groups - 1) / 2, (n_groups - 1) / 2, length.out = n_groups) * jitter_width
  long_stats <- long_stats %>% mutate(x_jit = time + offsets[.grp])

  title_txt <- paste0("Spaghetti of Group Means +/- SD (sorted by ", sort_by, ")")

  ggplot(long_stats, aes(x = x_jit, y = mean_score, group = .grp, color = factor(.grp))) +
    geom_line(linewidth = 1, alpha = 0.9) +
    geom_point(size = 2) +
    geom_errorbar(aes(ymin = mean_score - sd_score,
                      ymax = mean_score + sd_score),
                  width = 0.06) +
    scale_x_continuous(breaks = times, labels = times) +
    labs(x = "Time", y = "Mean score",
         color = paste("Quintile of", sort_by),
         title = title_txt) +
    theme_minimal(base_size = 13) +
    theme(legend.position = "right")
}

# (1) Sorted by first occasion (wisc_1)
plot_by_w1 <- make_group_mean_plot(Osborne, sort_by = "wisc_1")
print(plot_by_w1)

# (2) Sorted by last occasion (wisc_4)
plot_by_w4 <- make_group_mean_plot(Osborne, sort_by = "wisc_4")
print(plot_by_w4)

Scores Predicted from Factor Scores (Figure 17.1)

# Renaming the variables for simplicity's sake in the graphs
Osborne <- Osborne %>%
  rename(
    t1 = wisc_1,
    t2 = wisc_2,
    t3 = wisc_3,
    t4 = wisc_4
  )
library(lavaan)
## This is lavaan 0.6-19
## lavaan is FREE software! Please report any bugs.
library(ggplot2)
library(dplyr)
library(tidyr)
library(cowplot)

# --- Fit the model
FCSI <- '
  i =~ 1*t1 + 1*t2 + 1*t3 + 1*t4
  s =~ NA*t1 + L1*t1 + L2*t2 + L3*t3 + L4*t4

  s ~~ 1*s  
  i ~~ var_i*i
  i ~~ 0*s

  t1 ~~ var1*t1
  t2 ~~ var2*t2
  t3 ~~ var3*t3
  t4 ~~ var4*t4

  t1 ~ 0*1
  t2 ~ 0*1
  t3 ~ 0*1
  t4 ~ 0*1

  i ~ 1
  s ~ 1
'
FCSIfit <- lavaan(FCSI, data = Osborne)

# --- Get factor scores
fscores <- lavPredict(FCSIfit) %>% as.data.frame()  # columns: i, s

# Merge with observed data
dat <- cbind(Osborne, fscores)

# --- Long version for plotting
long_dat <- dat %>%
  select(i, s, t1, t2, t3, t4) %>%
  pivot_longer(cols = starts_with("t"),
               names_to = "occasion", values_to = "score")

# --- Plot set 1: Intercept factor vs observed
p_intercept <- ggplot(long_dat, aes(x = i, y = score, color = occasion)) +
  geom_point(alpha = 0.5) +
  geom_smooth(method = "lm", se = FALSE, linewidth = 1.2) +
  facet_wrap(~ occasion, scales = "free_y") +
  labs(title = "Observed Scores Predicted by Intercept Factor",
       x = "Intercept Factor Score", y = "Observed Score") +
  theme_minimal(base_size = 13) +
  theme(legend.position = "none")
# --- Plot set 2: Slope factor vs observed
p_slope <- ggplot(long_dat, aes(x = s, y = score, color = occasion)) +
  geom_point(alpha = 0.5) +
  geom_smooth(method = "lm", se = FALSE, linewidth = 1.2) +
  facet_wrap(~ occasion, scales = "free_y") +
  labs(title = "Observed Scores Predicted by Slope Factor",
       x = "Slope Factor Score", y = "Observed Score") +
  theme_minimal(base_size = 13) +
  theme(legend.position = "none")

# Show them
print(p_intercept)
## `geom_smooth()` using formula = 'y ~ x'

print(p_slope)
## `geom_smooth()` using formula = 'y ~ x'

Actual and Predicted Scores for a Growth Model (Figure 17.2)

library(lavaan)
FCSI <- '
  i =~ 1*t1 + 1*t2 + 1*t3 + 1*t4
  s =~ NA*t1 + L1*t1 + L2*t2 + L3*t3 + L4*t4

  # fix the slope variance to 1 (identifies the freed loadings)
  s ~~ 1*s  

  i ~~ var_i*i
  i ~~ 0*s

  t1 ~~ var1*t1
  t2 ~~ var2*t2
  t3 ~~ var3*t3
  t4 ~~ var4*t4

  t1 ~ 0*1
  t2 ~ 0*1
  t3 ~ 0*1
  t4 ~ 0*1

  i ~ 1
  s ~ 1
'
FCSIfit <- lavaan(FCSI, data=Osborne)
summary(FCSIfit, standardized=TRUE)
## lavaan 0.6-19 ended normally after 66 iterations
## 
##   Estimator                                         ML
##   Optimization method                           NLMINB
##   Number of model parameters                        11
## 
##   Number of observations                           204
## 
## Model Test User Model:
##                                                       
##   Test statistic                                 7.713
##   Degrees of freedom                                 3
##   P-value (Chi-square)                           0.052
## 
## Parameter Estimates:
## 
##   Standard errors                             Standard
##   Information                                 Expected
##   Information saturated (h1) model          Structured
## 
## Latent Variables:
##                    Estimate  Std.Err  z-value  P(>|z|)   Std.lv  Std.all
##   i =~                                                                  
##     t1                1.000                               4.328    0.686
##     t2                1.000                               4.328    0.610
##     t3                1.000                               4.328    0.536
##     t4                1.000                               4.328    0.432
##   s =~                                                                  
##     t1        (L1)    3.591    0.749    4.796    0.000    3.591    0.569
##     t2        (L2)    4.821    0.694    6.950    0.000    4.821    0.680
##     t3        (L3)    6.300    0.661    9.531    0.000    6.300    0.780
##     t4        (L4)    8.108    0.676   11.996    0.000    8.108    0.809
## 
## Covariances:
##                    Estimate  Std.Err  z-value  P(>|z|)   Std.lv  Std.all
##   i ~~                                                                  
##     s                 0.000                               0.000    0.000
## 
## Intercepts:
##                    Estimate  Std.Err  z-value  P(>|z|)   Std.lv  Std.all
##    .t1                0.000                               0.000    0.000
##    .t2                0.000                               0.000    0.000
##    .t3                0.000                               0.000    0.000
##    .t4                0.000                               0.000    0.000
##     i                -3.952    6.384   -0.619    0.536   -0.913   -0.913
##     s                 6.332    0.718    8.817    0.000    6.332    6.332
## 
## Variances:
##                    Estimate  Std.Err  z-value  P(>|z|)   Std.lv  Std.all
##     s                 1.000                               1.000    1.000
##     i       (var_)   18.731    6.179    3.031    0.002    1.000    1.000
##    .t1      (var1)    8.211    1.424    5.767    0.000    8.211    0.206
##    .t2      (var2)    8.348    1.103    7.567    0.000    8.348    0.166
##    .t3      (var3)    6.896    1.154    5.976    0.000    6.896    0.106
##    .t4      (var4)   15.863    2.675    5.930    0.000   15.863    0.158
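As a quick check on these estimates, the model-implied occasion means are E[t_j] = E[i] + L_j * E[s]; a short sketch comparing them with the observed means (coefficient names follow the labels in the model above):

# Model-implied occasion means versus observed means
est     <- coef(FCSIfit)
L       <- est[c("L1", "L2", "L3", "L4")]
implied <- est["i~1"] + L * est["s~1"]
round(implied, 2)
colMeans(Osborne[, c("t1", "t2", "t3", "t4")])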
library(influence.SEM)
## Warning: package 'influence.SEM' was built under R version 4.5.1
fitres <- sem.fitres(FCSIfit)
head(fitres)
##      t1    t2    t3    t4    hat.t1   hat.t2   hat.t3   hat.t4       e.t1
## 1 22.13 24.97 41.76 49.91 20.897858 29.07450 38.90908 50.92114  1.2321417
## 2  9.17 13.91 20.10 39.10  9.592033 15.74597 23.14771 32.18827 -0.4220331
## 3 30.03 39.27 40.65 63.95 26.021901 35.09202 46.00124 59.32588  4.0080990
## 4 27.93 29.03 44.06 53.19 23.502709 32.12785 42.50185 55.17278  4.4272910
## 5 27.93 41.12 53.27 67.59 30.101441 39.94185 51.77754 66.23379 -2.1714413
## 6 12.25 17.95 32.50 39.51 14.349934 21.36129 29.79430 40.09448 -2.0999336
##        e.t2      e.t3       e.t4
## 1 -4.104503  2.850922 -1.0111450
## 2 -1.835971 -3.047706  6.9117265
## 3  4.177978 -5.351237  4.6241171
## 4 -3.097846  1.558150 -1.9827840
## 5  1.178151  1.492463  1.3562123
## 6 -3.411292  2.705698 -0.5844812
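The hat.* columns are case-level predicted scores. Assuming sem.fitres() builds them from the factor scores as i + L_j*s (an assumption about its internals, not something documented here), they can be reproduced by hand:

# Plausibility check: reconstruct hat.t1 from factor scores and the L1 loading
eta <- as.data.frame(lavPredict(FCSIfit))
est <- coef(FCSIfit)
head(cbind(manual = eta$i + est["L1"] * eta$s, sem.fitres = fitres$hat.t1))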
library(ggplot2)
library(cowplot)

plots <- list()

for (var in c("t1", "t2", "t3", "t4")) {
  predicted <- paste0("hat.", var)
  
  p <- ggplot(fitres, aes(x = .data[[predicted]], y = .data[[var]])) +
    geom_point(alpha = 0.6, color = "black") +
    geom_smooth(method = "loess", se = FALSE, color = "black", linewidth = 1.2, span = 0.5) +
    labs(
      title = paste("Predicted vs Actual", var),
      x = paste("Predicted", var),
      y = paste("Actual", var)
    ) +
    theme_minimal(base_size = 13) +
    theme(plot.title = element_text(hjust = 0.5))
  
  plots[[var]] <- p
}
# Arrange the 4 plots into a 2x2 grid
combined <- plot_grid(
  plots$t1, plots$t2,
  plots$t3, plots$t4,
  labels = c("A", "B", "C", "D"),  # optional subplot labels
  ncol = 2
)
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
print(combined)

Delta Chi-Square (Figure 17.3), Likelihood Distance, and Delta Chi-Square versus Likelihood Distance (Figure 17.4)

library(influence.SEM)
fitres <- sem.fitres(FCSIfit)
#Delta Chi
Dchi <- Deltachi(FCSI, data=Osborne)
head(Dchi)
## [1] -0.005102748 -1.534880394  1.439814358 -0.207210189 -0.258932661
## [6]  0.534237557
Dchi_df <- as.data.frame(Dchi)
Dchi_df <- Dchi_df %>%
  mutate(Obs = row_number())
library(ggplot2)


ggplot(Dchi_df, aes(x = Obs, y = Dchi)) +
  geom_point() +
  geom_line() +
  labs(title = "Figure 17.3 Delta Chi by Observation",
       x = "Observation", y = "Delta Chi Value") +
  scale_y_continuous(breaks = seq(from = -1.6, to = 1.6, by = 0.2)) +
  theme_minimal()

# Cases with largest impact on chi-square
sort(Dchi, decreasing=TRUE)[1:5]
## [1] 1.6448782 1.4398144 1.0072240 0.9012845 0.8212425
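The sorted values above lose their case indices; order() recovers them (the same idiom applies to the LD and gCD vectors below):

# Indices of the five cases with the largest Delta Chi values
order(Dchi, decreasing = TRUE)[1:5]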
#Likelihood distance
LD <- Likedist(FCSI, data=Osborne)
head(LD)
## [1] -0.04736678  0.10404429  0.25827830 -0.03033972  0.04330706 -0.04157347
# Identify large likelihood distances
sort(LD, decreasing=TRUE)[1:5]
## [1] 0.9250000 0.3047206 0.2929142 0.2810829 0.2582783
library(ggplot2)

# Combine Dchi and LD into a data frame
diagnostics <- data.frame(Dchi = Dchi, LD = LD)

# Plot Dchi vs LD
ggplot(diagnostics, aes(x = LD, y = Dchi)) +
  geom_point() +
  labs(
    title = "Figure 17.4 Delta Chi-Square vs Likelihood Distance",
    x = "Likelihood Distance (LD)",
    y = "Delta Chi-Square (Dchi)"
  ) +
  theme_minimal()
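To see which observations occupy the extreme corner of Figure 17.4, a sketch that labels the five largest likelihood distances:

# Annotated version of Figure 17.4: label the top-5 LD cases
diagnostics$Obs <- seq_len(nrow(diagnostics))
top5 <- diagnostics[order(diagnostics$LD, decreasing = TRUE)[1:5], ]
ggplot(diagnostics, aes(x = LD, y = Dchi)) +
  geom_point() +
  geom_text(data = top5, aes(label = Obs), vjust = -0.6, size = 3) +
  labs(title = "Figure 17.4 (annotated)",
       x = "Likelihood Distance (LD)",
       y = "Delta Chi-Square (Dchi)") +
  theme_minimal()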

print(LD)
##   [1] -0.0473667804  0.1040442927  0.2582782978 -0.0303397168  0.0433070633
##   [6] -0.0415734654  0.0426755355  0.0111560242 -0.0509867130 -0.0291998732
##  [11]  0.0310122059 -0.0293330241  0.0415975749  0.0920153169  0.0026045857
##  [16] -0.0039212041  0.0658798486  0.0009090326  0.0041352423 -0.0279721104
##  [21] -0.0088672622  0.1791359179  0.0015018644 -0.0038480226  0.0548692866
##  [26]  0.0216406699  0.0607361096  0.1170574174 -0.0024325877  0.0043838953
##  [31] -0.0277413984  0.0046272355  0.0113307787 -0.0435689572 -0.0252830420
##  [36]  0.0322026191  0.0243629341  0.0237467133 -0.0213574649 -0.0115643219
##  [41] -0.0001031683  0.0312855323  0.0315553368 -0.0401444996  0.0078378686
##  [46] -0.0179579135 -0.0356908061  0.0418365127 -0.0023933599 -0.0064861671
##  [51]  0.0360684587  0.0219532360  0.0488390362 -0.0054438825 -0.0255294984
##  [56]  0.0154268449  0.0304230127  0.0108516071 -0.0192108189 -0.0213916468
##  [61] -0.0108868035 -0.0018412737  0.0402445657  0.1061490432 -0.0068365624
##  [66] -0.0375175243 -0.0316579580  0.0074971510  0.0032954054  0.0024558282
##  [71]  0.0580057027  0.0164461748  0.0186760295  0.0163279973  0.0092382829
##  [76]  0.0186958195  0.0422671181 -0.0125876532 -0.0099109049  0.0374509643
##  [81]  0.0200053309 -0.0312947680  0.0037486140 -0.0125400777 -0.0038609404
##  [86]  0.0200250393 -0.0299106059  0.2810829318  0.0322785721  0.0240735935
##  [91]  0.0581887942  0.0713127304  0.0700059916  0.0796602806  0.0312772152
##  [96]  0.0216353667  0.0754596314  0.0277952782  0.1679018395  0.0273264827
## [101] -0.0286600379 -0.0387945026  0.0079736505 -0.0012857239  0.0431700089
## [106]  0.0025627289 -0.0132523067  0.0135204660  0.0313305859 -0.0426323568
## [111] -0.0346311252 -0.0099647427 -0.0505048599 -0.0150236601  0.0042681784
## [116]  0.1253917479  0.0422623618  0.0313724093 -0.0003570950  0.9249999972
## [121] -0.0191562735  0.0170334103  0.0121588656 -0.0096714809  0.2160895658
## [126]  0.0029818200  0.2101949086  0.0833741515 -0.0124102775 -0.0165454937
## [131]  0.0478604451  0.0121564974  0.1243401279 -0.0509481935  0.1297324542
## [136] -0.0088268027  0.0422671031  0.0820699744  0.0438263213  0.0982832808
## [141] -0.0432729391  0.0088707949  0.1706977014  0.1162218949 -0.0280574646
## [146]  0.1938111856  0.0613750838 -0.0543526220  0.0555533621  0.0076544814
## [151] -0.0091457563  0.0734748181  0.0353834079  0.0373444578 -0.0414394489
## [156]  0.0146454040  0.0687156922  0.0237802020  0.1814262130  0.0158067048
## [161] -0.0002150501 -0.0069273237  0.0233820226  0.0354376504  0.0059591027
## [166]  0.0031397821 -0.0028367496 -0.0134720737  0.0897303570 -0.0519023422
## [171]  0.1840947956  0.0643016564  0.1328404142  0.0147573401  0.1275711989
## [176] -0.0175480014 -0.0364903619  0.0230614089  0.0130658267  0.0615952441
## [181]  0.0557800302  0.0303438244 -0.0186455780  0.3047205567 -0.0479207291
## [186]  0.1729446302  0.0367499472  0.0105511783  0.0963754902  0.0591708161
## [191]  0.0405602598  0.0114126934 -0.0543111896 -0.0372855436  0.0543512007
## [196]  0.0357045918  0.0434242222  0.0849158302  0.2364314183  0.1098836287
## [201]  0.0209442775 -0.0158966677  0.0807072213  0.2929142063
LDdf <- data.frame(LD)
# Decile cut-points at each occasion, useful for judging where flagged cases fall
quantile(Osborne$t1, probs = seq(0.1, 0.9, by = 0.1))
##    10%    20%    30%    40%    50%    60%    70%    80%    90% 
## 11.358 13.046 15.196 16.954 18.150 20.290 21.900 24.526 27.190
quantile(Osborne$t2, probs = seq(0.1, 0.9, by = 0.1))
##    10%    20%    30%    40%    50%    60%    70%    80%    90% 
## 17.876 20.034 21.466 23.552 25.375 28.624 31.092 33.190 36.248
quantile(Osborne$t3, probs = seq(0.1, 0.9, by = 0.1))
##    10%    20%    30%    40%    50%    60%    70%    80%    90% 
## 26.774 29.380 31.833 34.112 35.645 37.280 39.710 43.190 45.543
quantile(Osborne$t4, probs = seq(0.1, 0.9, by = 0.1))
##    10%    20%    30%    40%    50%    60%    70%    80%    90% 
## 34.683 38.632 40.917 43.988 47.475 50.448 53.669 56.770 60.555

Generalized Cook’s Distance (gCD)

#Generalized Cook's distance
gCD <- genCookDist(FCSI, data=Osborne)
head(gCD)
## [1] 0.03206096 0.08574387 0.18691677 0.04050668 0.08240841 0.02328056
# Top generalized Cook distances
sort(gCD, decreasing=TRUE)[1:5]
## [1] 0.9850733 0.5085786 0.3363826 0.2591383 0.2528644
gCD_df<-data.frame(gCD)
gCD_df <- gCD_df %>%
  mutate(Obs = row_number())
ggplot(gCD_df, aes(x = Obs, y = gCD)) +
  geom_point() +
  labs(
    title = "Figure 17.5 Generalized Cook's Distance",
    x = "Observation",
    y = "Generalized Cook's Distance"
  ) +
  theme_minimal()

Influence on Individual Parameters

library(lavaan)
library(influence.SEM)
library(dplyr)
library(tidyr)
library(ggplot2)

#Parameter Influence
FCSI <- '
  i =~ 1*t1 + 1*t2 + 1*t3 + 1*t4
  s =~ NA*t1 + t2 + t3 + t4

  # fix the slope variance to 1 (identifies the freed loadings)
  s ~~ 1*s

  i ~~ var_i*i
  i ~~ 0*s

  t1 ~~ var1*t1
  t2 ~~ var2*t2
  t3 ~~ var3*t3
  t4 ~~ var4*t4

  t1 ~ 0*1
  t2 ~ 0*1
  t3 ~ 0*1
  t4 ~ 0*1

  i ~ 1
  s ~ 1
'


FCSIfit <- sem(FCSI, data=Osborne)
summary(FCSIfit, standardized=TRUE)
## lavaan 0.6-19 ended normally after 66 iterations
## 
##   Estimator                                         ML
##   Optimization method                           NLMINB
##   Number of model parameters                        11
## 
##   Number of observations                           204
## 
## Model Test User Model:
##                                                       
##   Test statistic                                 7.713
##   Degrees of freedom                                 3
##   P-value (Chi-square)                           0.052
## 
## Parameter Estimates:
## 
##   Standard errors                             Standard
##   Information                                 Expected
##   Information saturated (h1) model          Structured
## 
## Latent Variables:
##                    Estimate  Std.Err  z-value  P(>|z|)   Std.lv  Std.all
##   i =~                                                                  
##     t1                1.000                               4.328    0.686
##     t2                1.000                               4.328    0.610
##     t3                1.000                               4.328    0.536
##     t4                1.000                               4.328    0.432
##   s =~                                                                  
##     t1                3.591    0.749    4.796    0.000    3.591    0.569
##     t2                4.821    0.694    6.950    0.000    4.821    0.680
##     t3                6.300    0.661    9.531    0.000    6.300    0.780
##     t4                8.108    0.676   11.996    0.000    8.108    0.809
## 
## Covariances:
##                    Estimate  Std.Err  z-value  P(>|z|)   Std.lv  Std.all
##   i ~~                                                                  
##     s                 0.000                               0.000    0.000
## 
## Intercepts:
##                    Estimate  Std.Err  z-value  P(>|z|)   Std.lv  Std.all
##    .t1                0.000                               0.000    0.000
##    .t2                0.000                               0.000    0.000
##    .t3                0.000                               0.000    0.000
##    .t4                0.000                               0.000    0.000
##     i                -3.952    6.384   -0.619    0.536   -0.913   -0.913
##     s                 6.332    0.718    8.817    0.000    6.332    6.332
## 
## Variances:
##                    Estimate  Std.Err  z-value  P(>|z|)   Std.lv  Std.all
##     s                 1.000                               1.000    1.000
##     i       (var_)   18.731    6.179    3.031    0.002    1.000    1.000
##    .t1      (var1)    8.211    1.424    5.767    0.000    8.211    0.206
##    .t2      (var2)    8.348    1.103    7.567    0.000    8.348    0.166
##    .t3      (var3)    6.896    1.154    5.976    0.000    6.896    0.106
##    .t4      (var4)   15.863    2.675    5.930    0.000   15.863    0.158
parameterEstimates(FCSIfit)
##    lhs op rhs label    est    se      z pvalue ci.lower ci.upper
## 1    i =~  t1        1.000 0.000     NA     NA    1.000    1.000
## 2    i =~  t2        1.000 0.000     NA     NA    1.000    1.000
## 3    i =~  t3        1.000 0.000     NA     NA    1.000    1.000
## 4    i =~  t4        1.000 0.000     NA     NA    1.000    1.000
## 5    s =~  t1        3.591 0.749  4.796  0.000    2.123    5.058
## 6    s =~  t2        4.821 0.694  6.950  0.000    3.461    6.180
## 7    s =~  t3        6.300 0.661  9.531  0.000    5.004    7.595
## 8    s =~  t4        8.108 0.676 11.996  0.000    6.783    9.432
## 9    s ~~   s        1.000 0.000     NA     NA    1.000    1.000
## 10   i ~~   i var_i 18.731 6.179  3.031  0.002    6.620   30.842
## 11   i ~~   s        0.000 0.000     NA     NA    0.000    0.000
## 12  t1 ~~  t1  var1  8.211 1.424  5.767  0.000    5.420   11.001
## 13  t2 ~~  t2  var2  8.348 1.103  7.567  0.000    6.186   10.510
## 14  t3 ~~  t3  var3  6.896 1.154  5.976  0.000    4.634    9.157
## 15  t4 ~~  t4  var4 15.863 2.675  5.930  0.000   10.620   21.107
## 16  t1 ~1            0.000 0.000     NA     NA    0.000    0.000
## 17  t2 ~1            0.000 0.000     NA     NA    0.000    0.000
## 18  t3 ~1            0.000 0.000     NA     NA    0.000    0.000
## 19  t4 ~1            0.000 0.000     NA     NA    0.000    0.000
## 20   i ~1           -3.952 6.384 -0.619  0.536  -16.466    8.561
## 21   s ~1            6.332 0.718  8.817  0.000    4.925    7.740
PAR <- c("s=~t1","s=~t2","s=~t3","s=~t4","i~1","s~1")

LY <- parinfluence(PAR,FCSI,Osborne,cook=TRUE)
str(LY) 
## List of 2
##  $ gCD  : num [1:204, 1:6] 1.71e-05 1.82e-03 6.47e-04 2.69e-03 6.73e-03 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : NULL
##   .. ..$ : chr [1:6] "s=~t1" "s=~t2" "s=~t3" "s=~t4" ...
##  $ Dparm: num [1:204, 1:6] 0.00413 0.04261 0.02543 0.05188 0.08206 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : NULL
##   .. ..$ : chr [1:6] "s=~t1" "s=~t2" "s=~t3" "s=~t4" ...
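One way to screen these matrices, using the conventional 4/n Cook's-distance cutoff as a heuristic (an assumption on our part; influence.SEM does not prescribe a cutoff):

# Flag cases whose gCD on any tracked parameter exceeds 4/n
cutoff  <- 4 / nrow(Osborne)
flagged <- which(apply(LY$gCD, 1, max) > cutoff)
head(flagged)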
LY_df<-data.frame(LY)
LY_df <- LY_df %>%
  mutate(Obs = row_number())
names(LY_df)
##  [1] "gCD.s..t1"   "gCD.s..t2"   "gCD.s..t3"   "gCD.s..t4"   "gCD.i.1"    
##  [6] "gCD.s.1"     "Dparm.s..t1" "Dparm.s..t2" "Dparm.s..t3" "Dparm.s..t4"
## [11] "Dparm.i.1"   "Dparm.s.1"   "Obs"
library(ggplot2)

library(dplyr)
library(tidyr)
library(ggplot2)

# If LY_df doesn't yet have slope scores, add them (pick your fit object)
if (!"s" %in% names(LY_df)) {
  s_scores <- if (exists("FCSIfit")) as.numeric(lavPredict(FCSIfit)[, "s"])
  else if (exists("fit"))  as.numeric(lavPredict(fit)[, "s"])
  else stop("Need a lavaan fit (FCSIfit or fit) to compute slope scores.")
  # If LY_df has an Obs index, join by Obs; else bind in row order
  if ("Obs" %in% names(LY_df) && length(s_scores) == nrow(LY_df)) {
    LY_df <- LY_df |> mutate(s = s_scores)
  } else {
    LY_df$s <- s_scores
  }
}

# To avoid any name clashes, rename 's' -> 'slope'
names(LY_df)[names(LY_df) == "s"] <- "slope"

# Build long frame with safe, string-based selection
want <- c("Obs", "slope", "Dparm.s..t1", "Dparm.s..t2", "Dparm.s..t3", "Dparm.s..t4")
long_s <- LY_df |>
  select(all_of(want)) |>
  pivot_longer(
    cols = c("Dparm.s..t1","Dparm.s..t2","Dparm.s..t3","Dparm.s..t4"),
    names_to = "param",
    values_to = "influence"
  ) |>
  mutate(occasion = dplyr::recode(param,
                                  "Dparm.s..t1"="t1",
                                  "Dparm.s..t2"="t2",
                                  "Dparm.s..t3"="t3",
                                  "Dparm.s..t4"="t4"))

# Fences per facet
bounds <- long_s |>
  group_by(occasion) |>
  summarise(
    med   = median(influence, na.rm = TRUE),
    iqr   = IQR(influence, na.rm = TRUE),
    lower = med - 1.5 * iqr,
    upper = med + 1.5 * iqr,
    .groups = "drop"
  )

# Plot (note the .data pronoun + 'slope' column)
ggplot(long_s, aes(x = .data[["slope"]], y = .data[["influence"]])) +
  geom_point(alpha = 0.6) +
  geom_smooth(method = "loess", se = FALSE, span = 0.5) +
  geom_hline(data = bounds, aes(yintercept = lower), linetype = "dashed", color = "red") +
  geom_hline(data = bounds, aes(yintercept = upper), linetype = "dashed", color = "red") +
  facet_wrap(~ occasion, ncol = 2, scales = "free_y") +
  labs(
    title = "Case Influence on Slope Loadings (per occasion)",
    x = "Slope Factor Score (s)",
    y = "Influence (Dparm)"
  ) +
  theme_minimal(base_size = 12)
## `geom_smooth()` using formula = 'y ~ x'

Factor Mean Influence and gCD

library(lavaan)
library(influence.SEM)
library(dplyr)
library(tidyr)
library(ggplot2)

# --- Fit the model ---
FCSI <- '
  i =~ 1*t1 + 1*t2 + 1*t3 + 1*t4
  s =~ NA*t1 + t2 + t3 + t4

  s ~~ 1*s  
  i ~~ var_i*i
  i ~~ 0*s

  t1 ~~ var1*t1
  t2 ~~ var2*t2
  t3 ~~ var3*t3
  t4 ~~ var4*t4

  t1 ~ 0*1
  t2 ~ 0*1
  t3 ~ 0*1
  t4 ~ 0*1

  i ~ 1
  s ~ 1
'
FCSIfit <- sem(FCSI, data = Osborne)

# --- Influence for mean parameters only ---
PAR <- c("i~1","s~1")
LY  <- parinfluence(PAR, FCSI, Osborne, cook = TRUE)

# Pull the Dparm contributions; the columns are "i~1" and "s~1"
Dparm <- as.data.frame(LY$Dparm)
Dparm$Obs <- seq_len(nrow(Dparm))

# Factor scores to use on x-axes
scores <- lavPredict(FCSIfit)
Dparm$i_score <- as.numeric(scores[, "i"])
Dparm$s_score <- as.numeric(scores[, "s"])

# Make sure we have the expected columns; a fallback rename is easy to add if needed
target_cols <- intersect(c("i~1","s~1"), names(Dparm))
stopifnot(length(target_cols) == 2)

# Long form; pair each mean’s influence with the matching factor score
long_means <- Dparm |>
  pivot_longer(cols = all_of(target_cols),
               names_to = "param",
               values_to = "influence") |>
  mutate(score = if_else(param == "i~1", i_score, s_score))

# Robust fences per facet (median +/- 1.5*IQR; classical Tukey fences use quartiles)
bounds <- long_means |>
  group_by(param) |>
  summarise(
    med   = median(influence, na.rm = TRUE),
    iqr   = IQR(influence, na.rm = TRUE),
    lower = med - 1.5 * iqr,
    upper = med + 1.5 * iqr,
    .groups = "drop"
  )
library(ggh4x)
## Warning: package 'ggh4x' was built under R version 4.5.1
ggplot(long_means, aes(x = score, y = influence)) +
  geom_point(alpha = 0.6) +
  geom_smooth(method = "loess", se = FALSE, span = 0.5) +
  geom_hline(data = bounds, aes(yintercept = lower), linetype = "dashed", color = "red") +
  geom_hline(data = bounds, aes(yintercept = upper), linetype = "dashed", color = "red") +
  facet_wrap(~ param, ncol = 2, scales = "free_x") +
  ggh4x::facetted_pos_scales(
    x = list(param == "s~1" ~ scale_x_continuous(limits = c(2, 10), expand = expansion(mult = 0)))
  ) +
  labs(
    title = "Case Influence on Factor Means",
    x = "Matching Factor Score (i or s)",
    y = "Influence (Dparm)"
  ) +
  theme_minimal(base_size = 12)
## `geom_smooth()` using formula = 'y ~ x'
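Before moving on to gCD, it can help to list the cases with the largest displacement of the slope mean (an exploratory listing; column names as constructed above):

# Cases with the largest absolute influence on the slope mean (s~1)
head(Dparm[order(abs(Dparm[["s~1"]]), decreasing = TRUE),
           c("Obs", "i_score", "s_score", "s~1")])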

Generalized Cook's Distance for Factor Means

PAR <- c("i~1","s~1")
LY  <- parinfluence(PAR, FCSI, Osborne, cook = TRUE)

# gCD matrix -> data frame; expect columns named "i~1" and "s~1"
gcd_df <- as.data.frame(LY$gCD)

# Fallback rename if your installation uses sanitized names
if (!all(c("i~1","s~1") %in% names(gcd_df))) {
  alt_i <- intersect(c("gCD.i..1","gCD.i_1","gCD.i1","gCD.i.1","i..1","i_1","i1","i.1"), names(gcd_df))
  alt_s <- intersect(c("gCD.s..1","gCD.s_1","gCD.s1","gCD.s.1","s..1","s_1","s1","s.1"), names(gcd_df))
  if (length(alt_i)) names(gcd_df)[match(alt_i[1], names(gcd_df))] <- "i~1"
  if (length(alt_s)) names(gcd_df)[match(alt_s[1], names(gcd_df))] <- "s~1"
}
stopifnot(all(c("i~1","s~1") %in% names(gcd_df)))

# Factor scores for x-axes
scores <- lavPredict(FCSIfit)
gcd_df$Obs      <- seq_len(nrow(gcd_df))
gcd_df$i_score  <- as.numeric(scores[, "i"])
gcd_df$s_score  <- as.numeric(scores[, "s"])

# Long form pairing each mean’s gCD with the matching factor score
long_gcd <- gcd_df |>
  pivot_longer(cols = c("i~1","s~1"),
               names_to = "param",
               values_to = "gCD") |>
  mutate(score = if_else(param == "i~1", i_score, s_score))

# Robust fences (median +/- 1.5*IQR), optional visual guides
bounds <- long_gcd |>
  group_by(param) |>
  summarise(
    med   = median(gCD, na.rm = TRUE),
    iqr   = IQR(gCD, na.rm = TRUE),
    lower = med - 1.5 * iqr,
    upper = med + 1.5 * iqr,
    .groups = "drop"
  )

# Plot: per-facet x scales; force slope-mean (s~1) x-range to 2..10
ggplot(long_gcd, aes(x = score, y = gCD)) +
  geom_point(alpha = 0.6) +
  geom_smooth(method = "loess", se = FALSE, span = 0.5) +
  geom_hline(data = bounds, aes(yintercept = lower), linetype = "dashed", color = "red") +
  geom_hline(data = bounds, aes(yintercept = upper), linetype = "dashed", color = "red") +
  facet_wrap(~ param, ncol = 2, scales = "free_x") +
  ggh4x::facetted_pos_scales(
    x = list(param == "s~1" ~ scale_x_continuous(limits = c(2, 10), expand = expansion(mult = 0)))
  ) +
  labs(
    title = "Generalized Cook's Distance for Factor Means",
    x = "Matching Factor Score (i or s)",
    y = "gCD"
  ) +
  theme_minimal(base_size = 12)
## `geom_smooth()` using formula = 'y ~ x'

References

Campbell, D. T., & Kenny, D. A. (1999). A primer on regression artifacts. Guilford Press.