#Began by setting working directory, creating a chunk, and reading in data. 
data <- read.csv("Data_HW1.csv")
#Installing necessary packages and loading libraries that I'll need 
#install.packages("lavaan") 
#install.packages("semTools") 
#install.packages("semPlot")  

library(psych)
library(lavaan)
## This is lavaan 0.6-15
## lavaan is FREE software! Please report any bugs.
## 
## Attaching package: 'lavaan'
## The following object is masked from 'package:psych':
## 
##     cor2cov
library(semTools)
## 
## ###############################################################################
## This is semTools 0.5-6
## All users of R (or SEM) are invited to submit functions or ideas for functions.
## ###############################################################################
## 
## Attaching package: 'semTools'
## The following objects are masked from 'package:psych':
## 
##     reliability, skew
library(semPlot)
library(dplyr) 
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyverse)
## ── Attaching packages
## ───────────────────────────────────────
## tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0     ✔ purrr   0.3.5
## ✔ tibble  3.1.8     ✔ stringr 1.4.1
## ✔ tidyr   1.2.1     ✔ forcats 0.5.2
## ✔ readr   2.1.3     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ ggplot2::%+%()     masks psych::%+%()
## ✖ ggplot2::alpha()   masks psych::alpha()
## ✖ readr::clipboard() masks semTools::clipboard()
## ✖ dplyr::filter()    masks stats::filter()
## ✖ dplyr::lag()       masks stats::lag()
#Calculating descriptives for my data set, beginning with the gender of my participants, which is coded 1 = male, 2 = female.
Gender <- factor(data$sexrsp, levels = 1:2, labels = c("Male", "Female"))
summary(Gender) 
##   Male Female 
##   4637   5051
#Gender is fairly evenly distributed, with slightly more females (5051) than males (4637).
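#As an optional quick sketch, the same split expressed as proportions; from the counts above this should come out to roughly 48% male and 52% female.
round(prop.table(table(Gender)), 3)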
#Next I'll look at the age of my participants, using mutate() to create a new Age column because age is only provided in the data as a birth year.
data <- mutate(data, Age = 103 - byear) 
psych::describe(data$Age) 
##    vars    n  mean   sd median trimmed mad min max range skew kurtosis   se
## X1    1 9688 64.16 0.51     64    64.1   0  63  66     3  1.1     2.71 0.01
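#Since Age only spans 63 to 66 (see min/max above), a simple frequency table is a clearer summary than the mean and SD; a quick sketch:
table(data$Age)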
#Now I'll look at the education levels of my participants. 
library(tidyverse) #already attached above, so this reload is redundant but harmless
data$educ92 <- as.numeric(data$educ92)
psych::describe(data$educ92) 
##    vars    n  mean   sd median trimmed mad min max range skew kurtosis   se
## X1    1 8452 13.61 2.26     12   13.21   0  12  21     9 1.14     0.04 0.02
#The mean (13.61) is pulled up by the right skew (skew = 1.14) relative to the median of 12, so the median is probably the better value to report. 
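#A sketch of that idea: given the right skew, the quartiles summarize education more faithfully than the mean.
quantile(data$educ92, probs = c(0.25, 0.50, 0.75), na.rm = TRUE)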
#And finally I'll look at the nationality data, beginning with nationality on the father's side. I know from reviewing the codebook that this variable was coded with a ton of options, so I'll start by checking how R read the variable in.
glimpse(data$ge096fa) 
##  int [1:9688] NA 14 4 NA 2 14 14 NA 14 14 ...
#R read this in as an integer, so I'll change it to a factor since I know this variable has a ton of levels.
data$ge096fa <- as.factor(data$ge096fa)
summary(data$ge096fa) 
##    1    2    3    4    5    6    7    8    9   10   11   12   13   14   15   16 
##  633  108   52  463  549  166  128  182    6   82   81  231    4 3417  474  202 
##   17   18   19   20   22   23   24   25   26   27   28   29   30   31   32   33 
##   43   21   47    4   20   10   91    3    8   21   55    2    1    1    1    1 
##   34   35   36   37   38   39   40   41   42 NA's 
##    1    5   16   11   20    9   16   11    4 2488
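#The numeric codes are hard to read without the codebook labels (not reproduced here), so as a sketch I can at least pull the most common categories:
head(sort(table(data$ge096fa), decreasing = TRUE), 5)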
#Repeating this same process for the nationality on the mother's side
glimpse(data$ge095ma) 
##  int [1:9688] NA 4 1 NA 41 14 4 NA 14 14 ...
data$ge095ma <- as.factor(data$ge095ma)
summary(data$ge095ma)
##    1    2    3    4    5    6    7    8    9   10   11   12   13   14   15   16 
##  622   91   38  495  554  151  104  168   28   75   81  218   14 3106  527  192 
##   17   18   19   20   22   23   24   25   26   27   28   30   32   33   35   36 
##   48   31   59    3   50    7   80    5    8   32   36    1    1    1    2   16 
##   37   38   39   40   41   42   43   44   46   47 NA's 
##    8   58   26   27   32   11    3    1    1    1 2676
#Now that the descriptives are done I'll subset the data down to my identity items only
dataT <- data[grep("in501rer$", colnames(data)):grep("in507rer$", colnames(data))]
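#A name-based alternative with dplyr (a sketch; like the grep() approach above it assumes in501rer through in507rer sit in one contiguous block of columns):
dataT_alt <- dplyr::select(data, in501rer:in507rer)
all.equal(dataT, dataT_alt) #should report TRUE if that assumption holds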
#Renaming my variables so I can tell which is which when I'm analyzing the outputs
dataT <- dataT %>%
  rename(ID_work = in501rer,
         ID_relig = in502rer,
         ID_fam = in503rer,
         ID_volunt = in504rer,
         ID_union = in505rer,
         ID_political = in506rer,
         ID_nation = in507rer)
#Creating a correlation matrix
cor_matrix <- cor(dataT[, c("ID_work", "ID_relig", "ID_fam", "ID_volunt", "ID_union", "ID_political", "ID_nation")], use = "complete", method = "pearson")
print(cor_matrix) 
##                ID_work  ID_relig    ID_fam ID_volunt  ID_union ID_political
## ID_work      1.0000000 0.2792627 0.3178501 0.3187617 0.3245953    0.2274303
## ID_relig     0.2792627 1.0000000 0.3957867 0.4068193 0.3168857    0.2216130
## ID_fam       0.3178501 0.3957867 1.0000000 0.3038079 0.2239655    0.1543646
## ID_volunt    0.3187617 0.4068193 0.3038079 1.0000000 0.6495276    0.3410393
## ID_union     0.3245953 0.3168857 0.2239655 0.6495276 1.0000000    0.4479959
## ID_political 0.2274303 0.2216130 0.1543646 0.3410393 0.4479959    1.0000000
## ID_nation    0.2614723 0.2816966 0.2044126 0.3231119 0.3972244    0.5039401
##              ID_nation
## ID_work      0.2614723
## ID_relig     0.2816966
## ID_fam       0.2044126
## ID_volunt    0.3231119
## ID_union     0.3972244
## ID_political 0.5039401
## ID_nation    1.0000000
#What jumps out right away is that volunteerism and union/group membership have the strongest correlation in the matrix (.65), which makes sense as I would theoretically expect those identity elements to share similarity with one another. Nationality/ethnicity and political identity also correlate relatively strongly (.50), and there is a weaker but consistent set of correlations among religion, family, and volunteerism. 
#Interestingly, political identity and family correlate the weakest (.15). 
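#Because use = "complete" drops any row with a missing identity item, the matrix is based on listwise-complete cases only. As a sketch, counting them here should match the "Number of observations used" that lavaan reports below (6312).
sum(complete.cases(dataT))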
#Creating a covariance matrix
cov_matrix <- cov(dataT[, c("ID_work", "ID_relig", "ID_fam", "ID_volunt", "ID_union", "ID_political", "ID_nation")] ,use = "complete")
print(cov_matrix)
##                ID_work  ID_relig    ID_fam ID_volunt  ID_union ID_political
## ID_work      3.4519497 1.0836549 0.8668438 1.0220569 1.0973510    0.7619236
## ID_relig     1.0836549 4.3620590 1.2133700 1.4663040 1.2042580    0.8345877
## ID_fam       0.8668438 1.2133700 2.1546295 0.7695947 0.5981891    0.4085685
## ID_volunt    1.0220569 1.4663040 0.7695947 2.9781975 2.0396021    1.0612363
## ID_union     1.0973510 1.2042580 0.5981891 2.0396021 3.3108706    1.4698602
## ID_political 0.7619236 0.8345877 0.4085685 1.0612363 1.4698602    3.2513359
## ID_nation    0.9173476 1.1109728 0.5665912 1.0529450 1.3648441    1.7158757
##              ID_nation
## ID_work      0.9173476
## ID_relig     1.1109728
## ID_fam       0.5665912
## ID_volunt    1.0529450
## ID_union     1.3648441
## ID_political 1.7158757
## ID_nation    3.5657587
#What sticks out to me here is that nationality/ethnicity and political identity co-vary more strongly (1.72) than most of the other pairs; I saw the same relationship between those two variables in the correlation matrix. 
#The .65 correlation I saw above between volunteerism and union/group membership also stands out here (covariance 2.04), showing that those two variables co-vary together as well. 
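#The covariance and correlation matrices carry the same information up to scale, so as a quick sketch, rescaling the covariances should reproduce the correlation matrix above (within rounding).
all.equal(cov2cor(cov_matrix), cor_matrix)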
#Now I'm going to create a null model syntax for my variables 
model0 <- ' 
ID_work ~~ ID_work 
ID_relig ~~ ID_relig 
ID_fam ~~ ID_fam 
ID_volunt ~~ ID_volunt 
ID_union ~~ ID_union 
ID_political ~~ ID_political 
ID_nation ~~ ID_nation'  

#Fitting the null model 
fit0 <- cfa(model0, data = dataT)  
#view summary
summary(fit0, standardized = TRUE, fit.measures = TRUE)
## lavaan 0.6.15 ended normally after 9 iterations
## 
##   Estimator                                         ML
##   Optimization method                           NLMINB
##   Number of model parameters                         7
## 
##                                                   Used       Total
##   Number of observations                          6312        9688
## 
## Model Test User Model:
##                                                        
##   Test statistic                              11105.204
##   Degrees of freedom                                 21
##   P-value (Chi-square)                            0.000
## 
## Model Test Baseline Model:
## 
##   Test statistic                             11105.204
##   Degrees of freedom                                21
##   P-value                                        0.000
## 
## User Model versus Baseline Model:
## 
##   Comparative Fit Index (CFI)                    0.000
##   Tucker-Lewis Index (TLI)                      -0.000
## 
## Loglikelihood and Information Criteria:
## 
##   Loglikelihood user model (H0)             -88628.385
##   Loglikelihood unrestricted model (H1)     -83075.783
##                                                       
##   Akaike (AIC)                              177270.770
##   Bayesian (BIC)                            177318.022
##   Sample-size adjusted Bayesian (SABIC)     177295.778
## 
## Root Mean Square Error of Approximation:
## 
##   RMSEA                                          0.289
##   90 Percent confidence interval - lower         0.285
##   90 Percent confidence interval - upper         0.294
##   P-value H_0: RMSEA <= 0.050                    0.000
##   P-value H_0: RMSEA >= 0.080                    1.000
## 
## Standardized Root Mean Square Residual:
## 
##   SRMR                                           0.300
## 
## Parameter Estimates:
## 
##   Standard errors                             Standard
##   Information                                 Expected
##   Information saturated (h1) model          Structured
## 
## Variances:
##                    Estimate  Std.Err  z-value  P(>|z|)   Std.lv  Std.all
##     ID_work           3.451    0.061   56.178    0.000    3.451    1.000
##     ID_relig          4.361    0.078   56.178    0.000    4.361    1.000
##     ID_fam            2.154    0.038   56.178    0.000    2.154    1.000
##     ID_volunt         2.978    0.053   56.178    0.000    2.978    1.000
##     ID_union          3.310    0.059   56.178    0.000    3.310    1.000
##     ID_political      3.251    0.058   56.178    0.000    3.251    1.000
##     ID_nation         3.565    0.063   56.178    0.000    3.565    1.000
#This is the baseline I'll compare my other models against. As expected, the null model fits very poorly (RMSEA = 0.289, CFI = 0.000), 
#so it will not be my best-fitting model. 
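#A quick arithmetic check (a sketch) on the degrees of freedom above: with p = 7 observed variables there are p*(p + 1)/2 = 28 unique variances and covariances, and the null model estimates only the 7 variances, leaving 28 - 7 = 21 df.
p <- 7
p * (p + 1) / 2 - p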
#Installing and loading GAIPE 
#install.packages("GAIPE")
library(GAIPE)

#Putting in the values for my Null model so that I can find my confidence intervals 
rmsea <- 0.2894
df <- 21
N <- 6312
clevel <- 0.90

# Calculate the CI for the Null model RMSEA
ci <- CI.RMSEA(rmsea, df, N, clevel)
ci
## $Lower.CI
## [1] 0.2848927
## 
## $RMSEA
## [1] 0.2894
## 
## $Upper.CI
## [1] 0.2939333
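#As a cross-check (a sketch), lavaan's fitMeasures() reports the same 90% RMSEA interval shown in the summary above; it should agree closely with the GAIPE result.
fitMeasures(fit0, c("rmsea", "rmsea.ci.lower", "rmsea.ci.upper"))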
#Now I'm going to build my two-factor model. First, I'll write the model syntax, which defines my latent variables:
#F1 will represent the variables that are more intrinsically salient, and
#F2 will represent the variables that are more externally salient. 

CFA_ID<-
  'F1 =~ ID_work + ID_relig + ID_fam 
  F2 =~ ID_volunt + ID_union + ID_political + ID_nation'
#Now I'm going to fit the model, and plot it 
CFA_IDfit <- cfa(CFA_ID, data = dataT)
summary(CFA_IDfit, standardized = TRUE, fit.measures = TRUE)
## lavaan 0.6.15 ended normally after 37 iterations
## 
##   Estimator                                         ML
##   Optimization method                           NLMINB
##   Number of model parameters                        15
## 
##                                                   Used       Total
##   Number of observations                          6312        9688
## 
## Model Test User Model:
##                                                       
##   Test statistic                              1283.951
##   Degrees of freedom                                13
##   P-value (Chi-square)                           0.000
## 
## Model Test Baseline Model:
## 
##   Test statistic                             11105.204
##   Degrees of freedom                                21
##   P-value                                        0.000
## 
## User Model versus Baseline Model:
## 
##   Comparative Fit Index (CFI)                    0.885
##   Tucker-Lewis Index (TLI)                       0.815
## 
## Loglikelihood and Information Criteria:
## 
##   Loglikelihood user model (H0)             -83717.758
##   Loglikelihood unrestricted model (H1)     -83075.783
##                                                       
##   Akaike (AIC)                              167465.517
##   Bayesian (BIC)                            167566.770
##   Sample-size adjusted Bayesian (SABIC)     167519.104
## 
## Root Mean Square Error of Approximation:
## 
##   RMSEA                                          0.124
##   90 Percent confidence interval - lower         0.119
##   90 Percent confidence interval - upper         0.130
##   P-value H_0: RMSEA <= 0.050                    0.000
##   P-value H_0: RMSEA >= 0.080                    1.000
## 
## Standardized Root Mean Square Residual:
## 
##   SRMR                                           0.058
## 
## Parameter Estimates:
## 
##   Standard errors                             Standard
##   Information                                 Expected
##   Information saturated (h1) model          Structured
## 
## Latent Variables:
##                    Estimate  Std.Err  z-value  P(>|z|)   Std.lv  Std.all
##   F1 =~                                                                 
##     ID_work           1.000                               0.984    0.530
##     ID_relig          1.352    0.047   28.747    0.000    1.330    0.637
##     ID_fam            0.839    0.030   27.522    0.000    0.825    0.562
##   F2 =~                                                                 
##     ID_volunt         1.000                               1.315    0.762
##     ID_union          1.122    0.021   52.958    0.000    1.475    0.811
##     ID_political      0.754    0.019   39.452    0.000    0.991    0.550
##     ID_nation         0.755    0.020   37.750    0.000    0.992    0.526
## 
## Covariances:
##                    Estimate  Std.Err  z-value  P(>|z|)   Std.lv  Std.all
##   F1 ~~                                                                 
##     F2                0.896    0.035   25.726    0.000    0.693    0.693
## 
## Variances:
##                    Estimate  Std.Err  z-value  P(>|z|)   Std.lv  Std.all
##    .ID_work           2.484    0.054   45.658    0.000    2.484    0.720
##    .ID_relig          2.594    0.069   37.437    0.000    2.594    0.595
##    .ID_fam            1.474    0.034   43.616    0.000    1.474    0.684
##    .ID_volunt         1.249    0.034   36.875    0.000    1.249    0.419
##    .ID_union          1.134    0.037   30.436    0.000    1.134    0.342
##    .ID_political      2.269    0.045   50.640    0.000    2.269    0.698
##    .ID_nation         2.580    0.050   51.314    0.000    2.580    0.724
##     F1                0.968    0.053   18.210    0.000    1.000    1.000
##     F2                1.729    0.054   31.734    0.000    1.000    1.000
semPaths(CFA_IDfit)
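#Note on scaling: by default cfa() identifies each factor by fixing its first loading to 1, which is why ID_work and ID_volunt show 1.000 with no SE above. A statistically equivalent alternative (a sketch only, not used for the results in this write-up) is to fix the latent variances to 1 instead:
CFA_IDfit_std <- cfa(CFA_ID, data = dataT, std.lv = TRUE)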

# Then I'm running additional code to find my factor pattern and factor correlations. $lambda shows the standardized loadings (my pathways),
#$theta the residual variances, and $psi the factor correlation.
inspect(CFA_IDfit, what = "std") 
## $lambda
##                 F1    F2
## ID_work      0.530 0.000
## ID_relig     0.637 0.000
## ID_fam       0.562 0.000
## ID_volunt    0.000 0.762
## ID_union     0.000 0.811
## ID_political 0.000 0.550
## ID_nation    0.000 0.526
## 
## $theta
##              ID_wrk ID_rlg ID_fam ID_vln ID_unn ID_plt ID_ntn
## ID_work       0.720                                          
## ID_relig      0.000  0.595                                   
## ID_fam        0.000  0.000  0.684                            
## ID_volunt     0.000  0.000  0.000  0.419                     
## ID_union      0.000  0.000  0.000  0.000  0.342              
## ID_political  0.000  0.000  0.000  0.000  0.000  0.698       
## ID_nation     0.000  0.000  0.000  0.000  0.000  0.000  0.724
## 
## $psi
##       F1    F2
## F1 1.000      
## F2 0.693 1.000
#Right away it looks like F2 might be better served split between two latent variables: volunteerism/union hang together better, and political/nation hang together better. 
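#Just to make that idea concrete (a sketch only; I'm not fitting it in this assignment), a three-factor split could be written like this. Note that each two-indicator factor would be identified only through its correlations with the other factors:
CFA_ID3 <-
  'F1 =~ ID_work + ID_relig + ID_fam
  F2a =~ ID_volunt + ID_union
  F2b =~ ID_political + ID_nation'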
#Building my one-factor model and re-writing the model syntax. F3 tests whether identity is better explained 
#by all of these variables loading on a single factor 
CFA_ID2<-
  'F3 =~ ID_work + ID_relig + ID_fam + ID_volunt + ID_union + ID_political + ID_nation'
#Now I'm going to fit the model, and plot it 
CFA_ID2fit <- cfa(CFA_ID2, data = dataT)
summary(CFA_ID2fit, standardized = TRUE, fit.measures = TRUE)
## lavaan 0.6.15 ended normally after 33 iterations
## 
##   Estimator                                         ML
##   Optimization method                           NLMINB
##   Number of model parameters                        14
## 
##                                                   Used       Total
##   Number of observations                          6312        9688
## 
## Model Test User Model:
##                                                       
##   Test statistic                              1858.735
##   Degrees of freedom                                14
##   P-value (Chi-square)                           0.000
## 
## Model Test Baseline Model:
## 
##   Test statistic                             11105.204
##   Degrees of freedom                                21
##   P-value                                        0.000
## 
## User Model versus Baseline Model:
## 
##   Comparative Fit Index (CFI)                    0.834
##   Tucker-Lewis Index (TLI)                       0.750
## 
## Loglikelihood and Information Criteria:
## 
##   Loglikelihood user model (H0)             -84005.150
##   Loglikelihood unrestricted model (H1)     -83075.783
##                                                       
##   Akaike (AIC)                              168038.301
##   Bayesian (BIC)                            168132.804
##   Sample-size adjusted Bayesian (SABIC)     168088.315
## 
## Root Mean Square Error of Approximation:
## 
##   RMSEA                                          0.144
##   90 Percent confidence interval - lower         0.139
##   90 Percent confidence interval - upper         0.150
##   P-value H_0: RMSEA <= 0.050                    0.000
##   P-value H_0: RMSEA >= 0.080                    1.000
## 
## Standardized Root Mean Square Residual:
## 
##   SRMR                                           0.072
## 
## Parameter Estimates:
## 
##   Standard errors                             Standard
##   Information                                 Expected
##   Information saturated (h1) model          Structured
## 
## Latent Variables:
##                    Estimate  Std.Err  z-value  P(>|z|)   Std.lv  Std.all
##   F3 =~                                                                 
##     ID_work           1.000                               0.840    0.452
##     ID_relig          1.241    0.046   26.990    0.000    1.042    0.499
##     ID_fam            0.689    0.030   23.317    0.000    0.579    0.394
##     ID_volunt         1.563    0.048   32.499    0.000    1.312    0.760
##     ID_union          1.693    0.052   32.716    0.000    1.421    0.781
##     ID_political      1.170    0.041   28.303    0.000    0.982    0.545
##     ID_nation         1.198    0.043   27.973    0.000    1.006    0.533
## 
## Variances:
##                    Estimate  Std.Err  z-value  P(>|z|)   Std.lv  Std.all
##    .ID_work           2.746    0.052   52.954    0.000    2.746    0.796
##    .ID_relig          3.276    0.063   51.999    0.000    3.276    0.751
##    .ID_fam            1.820    0.034   53.875    0.000    1.820    0.845
##    .ID_volunt         1.256    0.033   38.064    0.000    1.256    0.422
##    .ID_union          1.291    0.036   35.663    0.000    1.291    0.390
##    .ID_political      2.286    0.045   50.833    0.000    2.286    0.703
##    .ID_nation         2.554    0.050   51.167    0.000    2.554    0.716
##     F3                0.705    0.041   17.178    0.000    1.000    1.000
semPaths(CFA_ID2fit) 

#I can see right away that the fit is worse here. RMSEA increased to 0.144 and both CFI and TLI dropped (0.834 and 0.750)
#Running additional code to find my factor pattern and factor correlations.
inspect(CFA_ID2fit, what = "std") 
## $lambda
##                 F3
## ID_work      0.452
## ID_relig     0.499
## ID_fam       0.394
## ID_volunt    0.760
## ID_union     0.781
## ID_political 0.545
## ID_nation    0.533
## 
## $theta
##              ID_wrk ID_rlg ID_fam ID_vln ID_unn ID_plt ID_ntn
## ID_work       0.796                                          
## ID_relig      0.000  0.751                                   
## ID_fam        0.000  0.000  0.845                            
## ID_volunt     0.000  0.000  0.000  0.422                     
## ID_union      0.000  0.000  0.000  0.000  0.390              
## ID_political  0.000  0.000  0.000  0.000  0.000  0.703       
## ID_nation     0.000  0.000  0.000  0.000  0.000  0.000  0.716
## 
## $psi
##    F3
## F3  1
#Here the union variable has the highest loading (0.781), with volunteerism close behind (0.760). 
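#Because the one-factor model is nested in the two-factor model (it is the special case where the factor correlation is fixed to 1) and both were fit to the same 6312 cases, a likelihood-ratio test can compare them directly; a sketch:
lavTestLRT(CFA_IDfit, CFA_ID2fit)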
#Creating an alternate model that drops nationality/ethnicity from my model completely. This variable had a 
#strong correlation with political identity (.50) in my matrices. 

Alt_Model <-
  'F1 =~ ID_work + ID_relig + ID_fam 
  F2 =~ ID_volunt + ID_union + ID_political'
# Fitting my alternative model and plotting it

Alt_Model_fit <- cfa(Alt_Model, data = dataT)
summary(Alt_Model_fit, standardized = TRUE, fit.measures = TRUE)
## lavaan 0.6.15 ended normally after 39 iterations
## 
##   Estimator                                         ML
##   Optimization method                           NLMINB
##   Number of model parameters                        13
## 
##                                                   Used       Total
##   Number of observations                          6318        9688
## 
## Model Test User Model:
##                                                       
##   Test statistic                               424.107
##   Degrees of freedom                                 8
##   P-value (Chi-square)                           0.000
## 
## Model Test Baseline Model:
## 
##   Test statistic                              8723.780
##   Degrees of freedom                                15
##   P-value                                        0.000
## 
## User Model versus Baseline Model:
## 
##   Comparative Fit Index (CFI)                    0.952
##   Tucker-Lewis Index (TLI)                       0.910
## 
## Loglikelihood and Information Criteria:
## 
##   Loglikelihood user model (H0)             -71589.550
##   Loglikelihood unrestricted model (H1)     -71377.496
##                                                       
##   Akaike (AIC)                              143205.100
##   Bayesian (BIC)                            143292.865
##   Sample-size adjusted Bayesian (SABIC)     143251.554
## 
## Root Mean Square Error of Approximation:
## 
##   RMSEA                                          0.091
##   90 Percent confidence interval - lower         0.083
##   90 Percent confidence interval - upper         0.098
##   P-value H_0: RMSEA <= 0.050                    0.000
##   P-value H_0: RMSEA >= 0.080                    0.992
## 
## Standardized Root Mean Square Residual:
## 
##   SRMR                                           0.037
## 
## Parameter Estimates:
## 
##   Standard errors                             Standard
##   Information                                 Expected
##   Information saturated (h1) model          Structured
## 
## Latent Variables:
##                    Estimate  Std.Err  z-value  P(>|z|)   Std.lv  Std.all
##   F1 =~                                                                 
##     ID_work           1.000                               0.972    0.523
##     ID_relig          1.374    0.048   28.351    0.000    1.335    0.639
##     ID_fam            0.856    0.031   27.289    0.000    0.832    0.567
##   F2 =~                                                                 
##     ID_volunt         1.000                               1.379    0.799
##     ID_union          1.073    0.022   49.246    0.000    1.479    0.813
##     ID_political      0.647    0.018   35.501    0.000    0.891    0.494
## 
## Covariances:
##                    Estimate  Std.Err  z-value  P(>|z|)   Std.lv  Std.all
##   F1 ~~                                                                 
##     F2                0.909    0.036   25.328    0.000    0.678    0.678
## 
## Variances:
##                    Estimate  Std.Err  z-value  P(>|z|)   Std.lv  Std.all
##    .ID_work           2.505    0.055   45.843    0.000    2.505    0.726
##    .ID_relig          2.580    0.070   36.887    0.000    2.580    0.591
##    .ID_fam            1.462    0.034   43.064    0.000    1.462    0.679
##    .ID_volunt         1.079    0.037   28.797    0.000    1.079    0.362
##    .ID_union          1.125    0.042   26.776    0.000    1.125    0.340
##    .ID_political      2.460    0.047   51.978    0.000    2.460    0.756
##     F1                0.945    0.053   17.919    0.000    1.000    1.000
##     F2                1.901    0.059   32.231    0.000    1.000    1.000
semPaths(Alt_Model_fit) 

#I immediately see an improved fit: RMSEA fell to 0.091 and CFI rose to 0.952 (TLI 0.910)
#Running additional code to find my factor pattern and factor correlations.
inspect(Alt_Model_fit, what = "std") 
## $lambda
##                 F1    F2
## ID_work      0.523 0.000
## ID_relig     0.639 0.000
## ID_fam       0.567 0.000
## ID_volunt    0.000 0.799
## ID_union     0.000 0.813
## ID_political 0.000 0.494
## 
## $theta
##              ID_wrk ID_rlg ID_fam ID_vln ID_unn ID_plt
## ID_work       0.726                                   
## ID_relig      0.000  0.591                            
## ID_fam        0.000  0.000  0.679                     
## ID_volunt     0.000  0.000  0.000  0.362              
## ID_union      0.000  0.000  0.000  0.000  0.340       
## ID_political  0.000  0.000  0.000  0.000  0.000  0.756
## 
## $psi
##       F1    F2
## F1 1.000      
## F2 0.678 1.000
#Union is still sticking out, with the highest loading on F2 (0.813)
#Expected var/cov matrix for my alternative model with ethnicity/nationality dropped 

SMat_Alt_Model <- fitted(Alt_Model_fit) 
SMat_Alt_Model
## $cov
##              ID_wrk ID_rlg ID_fam ID_vln ID_unn ID_plt
## ID_work       3.449                                   
## ID_relig      1.298  4.364                            
## ID_fam        0.809  1.111  2.154                     
## ID_volunt     0.909  1.249  0.778  2.980              
## ID_union      0.975  1.339  0.835  2.039  3.312       
## ID_political  0.588  0.807  0.503  1.229  1.318  3.254
#I'm still seeing that union and volunteerism co-vary strongly together
#Residual matrix for my alternative model with ethnicity/nationality dropped 

RMat_Alt_Model <- resid(Alt_Model_fit) 
RMat_Alt_Model 
## $type
## [1] "raw"
## 
## $cov
##              ID_wrk ID_rlg ID_fam ID_vln ID_unn ID_plt
## ID_work       0.000                                   
## ID_relig     -0.215  0.000                            
## ID_fam        0.057  0.104  0.000                     
## ID_volunt     0.113  0.220 -0.008  0.000              
## ID_union      0.123 -0.135 -0.237  0.002  0.000       
## ID_political  0.175  0.030 -0.092 -0.165  0.152  0.000
#And here they stick out to me again: the residual for volunteerism and union is 0.002, which would typically be good, but I'm suspicious of it because these two variables have shown strong covariance and correlation throughout
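#The raw residuals above are on the items' original scales, so as a sketch the correlation-metric residuals can be easier to judge (values near zero mean the model reproduces that pair of items well):
resid(Alt_Model_fit, type = "cor")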
#I want to test to see what happens if I drop union/group membership as well from my model 

Alt_Model2 <-
  'F1 =~ ID_work + ID_relig + ID_fam
  F2 =~ ID_volunt + ID_political'
# Fitting my second alternative model and plotting it

Alt_Model2_fit <- cfa(Alt_Model2, data = dataT)
summary(Alt_Model2_fit, standardized = TRUE, fit.measures = TRUE)
## lavaan 0.6.15 ended normally after 40 iterations
## 
##   Estimator                                         ML
##   Optimization method                           NLMINB
##   Number of model parameters                        11
## 
##                                                   Used       Total
##   Number of observations                          6370        9688
## 
## Model Test User Model:
##                                                       
##   Test statistic                               133.487
##   Degrees of freedom                                 4
##   P-value (Chi-square)                           0.000
## 
## Model Test Baseline Model:
## 
##   Test statistic                              4521.566
##   Degrees of freedom                                10
##   P-value                                        0.000
## 
## User Model versus Baseline Model:
## 
##   Comparative Fit Index (CFI)                    0.971
##   Tucker-Lewis Index (TLI)                       0.928
## 
## Loglikelihood and Information Criteria:
## 
##   Loglikelihood user model (H0)             -61326.441
##   Loglikelihood unrestricted model (H1)     -61259.697
##                                                       
##   Akaike (AIC)                              122674.881
##   Bayesian (BIC)                            122749.234
##   Sample-size adjusted Bayesian (SABIC)     122714.279
## 
## Root Mean Square Error of Approximation:
## 
##   RMSEA                                          0.071
##   90 Percent confidence interval - lower         0.061
##   90 Percent confidence interval - upper         0.082
##   P-value H_0: RMSEA <= 0.050                    0.000
##   P-value H_0: RMSEA >= 0.080                    0.090
## 
## Standardized Root Mean Square Residual:
## 
##   SRMR                                           0.027
## 
## Parameter Estimates:
## 
##   Standard errors                             Standard
##   Information                                 Expected
##   Information saturated (h1) model          Structured
## 
## Latent Variables:
##                    Estimate  Std.Err  z-value  P(>|z|)   Std.lv  Std.all
##   F1 =~                                                                 
##     ID_work           1.000                               0.932    0.502
##     ID_relig          1.467    0.052   28.173    0.000    1.368    0.655
##     ID_fam            0.903    0.033   27.125    0.000    0.842    0.574
##   F2 =~                                                                 
##     ID_volunt         1.000                               1.324    0.767
##     ID_political      0.608    0.027   22.232    0.000    0.804    0.445
## 
## Covariances:
##                    Estimate  Std.Err  z-value  P(>|z|)   Std.lv  Std.all
##   F1 ~~                                                                 
##     F2                0.958    0.037   25.612    0.000    0.776    0.776
## 
## Variances:
##                    Estimate  Std.Err  z-value  P(>|z|)   Std.lv  Std.all
##    .ID_work           2.582    0.054   47.424    0.000    2.582    0.748
##    .ID_relig          2.491    0.069   35.846    0.000    2.491    0.571
##    .ID_fam            1.444    0.034   43.070    0.000    1.444    0.671
##    .ID_volunt         1.227    0.076   16.166    0.000    1.227    0.412
##    .ID_political      2.622    0.054   48.865    0.000    2.622    0.802
##     F1                0.869    0.050   17.327    0.000    1.000    1.000
##     F2                1.752    0.087   20.096    0.000    1.000    1.000
semPaths(Alt_Model2_fit) 

#Even better! RMSEA is now 0.071 and my CFI is at 0.971 with TLI not too far behind at 0.928.
#Running my final factor pattern and factor correlations.
inspect(Alt_Model2_fit, what = "std") 
## $lambda
##                 F1    F2
## ID_work      0.502 0.000
## ID_relig     0.655 0.000
## ID_fam       0.574 0.000
## ID_volunt    0.000 0.767
## ID_political 0.000 0.445
## 
## $theta
##              ID_wrk ID_rlg ID_fam ID_vln ID_plt
## ID_work       0.748                            
## ID_relig      0.000  0.571                     
## ID_fam        0.000  0.000  0.671              
## ID_volunt     0.000  0.000  0.000  0.412       
## ID_political  0.000  0.000  0.000  0.000  0.802
## 
## $psi
##       F1    F2
## F1 1.000      
## F2 0.776 1.000
#I like how my variables hang together for F1; those loadings look okay (0.50-0.66). But the two F2 loadings are even more spread apart now (0.767 vs. 0.445) and the factor correlation rose to 0.776, so this is still not the ideal model for this data.
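#To wrap up, a compact side-by-side of the fit indices for everything fit above (a sketch; keep in mind the two drop models use fewer items, so their chi-square based indices are not directly comparable to the seven-item models):
sapply(list(null = fit0, two_factor = CFA_IDfit, one_factor = CFA_ID2fit,
            drop_nation = Alt_Model_fit, drop_nation_union = Alt_Model2_fit),
       fitMeasures, fit.measures = c("cfi", "tli", "rmsea", "srmr"))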