EDA

Intro

library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.3     ✔ readr     2.1.4
✔ forcats   1.0.0     ✔ stringr   1.5.0
✔ ggplot2   3.4.3     ✔ tibble    3.2.1
✔ lubridate 1.9.2     ✔ tidyr     1.3.0
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(haven)
library(rstatix)

Attaching package: 'rstatix'

The following object is masked from 'package:stats':

    filter
library(naniar)
library(pls)

Attaching package: 'pls'

The following object is masked from 'package:stats':

    loadings
data <- read_sav("MomOff2011Sept1.sav")

# subset of interest variables 
complete_case <- data %>%
  select(contains("stress") | contains("Delin")) %>%
  select(-"stress.r") %>%
  drop_na() 

Missing Data

n_mis_col <- 
  data %>%
  is.na() %>% 
  colSums() %>% 
  sort(decreasing = TRUE)

perc_mis_col <-  round(( n_mis_col / nrow(data) ) * 100, 2)

incomplete_case <- data %>%
  select(contains("stress") | contains("Delin")) %>%
  select(-"stress.r")

vis_miss(incomplete_case)

Correlations

cor_mat <- cor_mat(complete_case, method = "spearman") 

cor_pmat <- cor_pmat(complete_case, method = "spearman")

cor_mat %>%
  pull_lower_triangle() %>%
  cor_plot(label = TRUE, significant.level = 0.05, method = "square")

# Correlation with sum 

sum_stress <- complete_case %>%
  mutate(sum_stress = rowSums(select(., c(1:9)))) 

cor_test(sum_stress, `DelinOverall#8v`, method = "spearman", data=sum_stress)
# A tibble: 1 × 6
  var1       var2              cor    statistic     p method  
  <chr>      <chr>           <dbl>        <dbl> <dbl> <chr>   
1 sum_stress DelinOverall#8v 0.018 13822568404. 0.241 Spearman

Various Analyses

linreg <- lm(complete_case$'DelinOverall#8v' ~ ., data=complete_case[-c(10:11)])
summary(linreg)

Call:
lm(formula = complete_case$"DelinOverall#8v" ~ ., data = complete_case[-c(10:11)])

Residuals:
    Min      1Q  Median      3Q     Max 
 -42.96  -20.46  -17.13   -4.13 2158.15 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)  20.9643     0.9104  23.028   <2e-16 ***
stress1      -2.1758     2.7249  -0.798    0.425    
stress2       2.2010     3.7895   0.581    0.561    
stress3      -0.5911     3.0356  -0.195    0.846    
stress4       0.8926     2.4950   0.358    0.721    
stress5       0.4336     2.2348   0.194    0.846    
stress6      -1.9445     2.4445  -0.795    0.426    
stress7       1.7235     1.4862   1.160    0.246    
stress8       1.2808     1.4675   0.873    0.383    
stress9      -0.8949     1.1741  -0.762    0.446    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 59.42 on 4377 degrees of freedom
Multiple R-squared:  0.001043,  Adjusted R-squared:  -0.001011 
F-statistic: 0.5079 on 9 and 4377 DF,  p-value: 0.8699
set.seed (999)

pcr_model <- 
  pcr(complete_case$'DelinOverall#8v' ~ ., data=complete_case[-c(10:11)],
      scale = TRUE, validation = "CV")
summary(pcr_model)
Data:   X dimension: 4387 9 
    Y dimension: 4387 1
Fit method: svdpc
Number of components considered: 9

VALIDATION: RMSEP
Cross-validated using 10 random segments.
       (Intercept)  1 comps  2 comps  3 comps  4 comps  5 comps  6 comps
CV           59.39     59.4    59.41    59.42    59.41    59.42    59.42
adjCV        59.39     59.4    59.41    59.41    59.41    59.42    59.42
       7 comps  8 comps  9 comps
CV       59.42    59.42    59.43
adjCV    59.42    59.42    59.42

TRAINING: % variance explained
                                  1 comps   2 comps   3 comps  4 comps
X                                65.58379  78.35970  84.24691  88.3909
complete_case$"DelinOverall#8v"   0.01659   0.03347   0.04197   0.0616
                                  5 comps   6 comps   7 comps   8 comps
X                                91.73596  94.46373  96.84973  98.83718
complete_case$"DelinOverall#8v"   0.07349   0.08511   0.08718   0.09465
                                  9 comps
X                                100.0000
complete_case$"DelinOverall#8v"    0.1043
# Indicates that there is principal component that summarises the stress vars. 
which.min( MSEP( pcr_model )$val[1,1, ] ) - 1
(Intercept) 
          0 
factanal(scale(complete_case[1:9]), factors = 1)

Call:
factanal(x = scale(complete_case[1:9]), factors = 1)

Uniquenesses:
stress1 stress2 stress3 stress4 stress5 stress6 stress7 stress8 stress9 
  0.350   0.206   0.200   0.244   0.296   0.259   0.570   0.633   0.730 

Loadings:
        Factor1
stress1 0.806  
stress2 0.891  
stress3 0.894  
stress4 0.869  
stress5 0.839  
stress6 0.861  
stress7 0.655  
stress8 0.606  
stress9 0.520  

               Factor1
SS loadings      5.511
Proportion Var   0.612

Test of the hypothesis that 1 factor is sufficient.
The chi square statistic is 5749.92 on 27 degrees of freedom.
The p-value is 0 
factanal(scale(complete_case[1:9]), factors = 2)

Call:
factanal(x = scale(complete_case[1:9]), factors = 2)

Uniquenesses:
stress1 stress2 stress3 stress4 stress5 stress6 stress7 stress8 stress9 
  0.325   0.178   0.176   0.254   0.309   0.267   0.370   0.258   0.393 

Loadings:
        Factor1 Factor2
stress1 0.782   0.252  
stress2 0.858   0.292  
stress3 0.863   0.281  
stress4 0.794   0.341  
stress5 0.768   0.319  
stress6 0.744   0.424  
stress7 0.405   0.682  
stress8 0.298   0.808  
stress9 0.229   0.745  

               Factor1 Factor2
SS loadings      4.171   2.298
Proportion Var   0.463   0.255
Cumulative Var   0.463   0.719

Test of the hypothesis that 2 factors are sufficient.
The chi square statistic is 2633.28 on 19 degrees of freedom.
The p-value is 0 
factanal(scale(complete_case[1:9]), factors = 3)

Call:
factanal(x = scale(complete_case[1:9]), factors = 3)

Uniquenesses:
stress1 stress2 stress3 stress4 stress5 stress6 stress7 stress8 stress9 
  0.269   0.005   0.203   0.194   0.203   0.229   0.371   0.255   0.379 

Loadings:
        Factor1 Factor2 Factor3
stress1 0.387   0.262   0.716  
stress2 0.415   0.284   0.862  
stress3 0.611   0.274   0.590  
stress4 0.750   0.305   0.387  
stress5 0.773   0.268   0.357  
stress6 0.679   0.392   0.395  
stress7 0.361   0.668   0.230  
stress8 0.236   0.804   0.207  
stress9 0.160   0.751   0.174  

               Factor1 Factor2 Factor3
SS loadings      2.529   2.199   2.162
Proportion Var   0.281   0.244   0.240
Cumulative Var   0.281   0.525   0.766

Test of the hypothesis that 3 factors are sufficient.
The chi square statistic is 337.85 on 12 degrees of freedom.
The p-value is 5.12e-65 

Many people experienced 0 stress overall, interesting!

stress_sum <- complete_case %>%
  mutate(rowsums = rowSums(select(., c(1:10)))) %>%
  dplyr::select(rowsums) %>%
  unlist() %>%
  as.vector()

sum(stress_sum == 0) / length(stress_sum) * 100
[1] 94.43811

Various visuals

average_row <- complete_case %>%
  mutate(measurement = row_number()) %>%
  mutate(rowsums = rowSums(select(., c(1:10)))) %>%
  dplyr::filter(rowsums != 0) %>%
  colMeans() 

average_row['measurement'] <- 99999

complete_case %>%
  mutate(measurement = row_number()) %>%
  mutate(rowsums = rowSums(select(., c(1:10)))) %>%
  dplyr::filter(rowsums != 0) %>%
  dplyr::slice_sample(n = 5) %>%
  rbind(., average_row) %>%
  rename_with(., ~ gsub("stress", "", .x, fixed = TRUE)) %>%
  pivot_longer(!c('DelinOverall#8v', measurement, rowsums), names_to = 'month', values_to = 'stress') %>%
  mutate(month = as.numeric(month)) %>%
  ggplot(data=., aes(x=month, y=stress, group=measurement)) + 
  geom_line() + 
  scale_x_continuous(breaks = 1:10) + 
  scale_y_continuous(breaks = c(0, 5, 10)) +
  facet_grid(rows = vars(measurement))

summary(complete_case$`DelinOverall#8v`)
     Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
   0.0000    0.6667    3.8333   21.1109   16.8333 2179.1111 
quantile(complete_case$`DelinOverall#8v`, 0.95)
95% 
100 
ggplot(data=complete_case, aes(x = `DelinOverall#8v`)) + 
  geom_histogram(color = "white") + 
  xlim(0,100) + 
  ylim(0, 1e3)
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Warning: Removed 220 rows containing non-finite values (`stat_bin()`).
Warning: Removed 2 rows containing missing values (`geom_bar()`).

sum(complete_case$`DelinOverall#8v` == 0) / length(complete_case$`DelinOverall#8v`) * 100
[1] 16.02462
# 16% of the data will be -Inf with log transform 
ggplot(data=complete_case, aes(x = `DelinOverall#8v`)) + 
  geom_histogram(color = "white") + 
  scale_x_log10()
Warning: Transformation introduced infinite values in continuous x-axis
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Warning: Removed 703 rows containing non-finite values (`stat_bin()`).