So, with this version, I picked up from where my last update ended, and did the LPA analysis with everything I had last time, and added population from census tracts, and later the natural log of population. The model selection recommended 4 classes, which doesn’t completely make sense to me, so I ran the analysis with both 3 classes and 4 classes.

Right now, I’m not getting very much separation of classes from the analysis. hopefully tomorrow we can discuss ways I might be able to get more distinction of classes, and what the 4 classes might represent.

Also, I intentionally left the code displayed vice writing out the differences as I think the brevity is a little easier to understand.

Old PCA with population added in

##         eigenvalue percentage of variance
## comp 1  4.81506649             30.0941656
## comp 2  3.00051718             18.7532324
## comp 3  2.23288942             13.9555589
## comp 4  1.71631177             10.7269486
## comp 5  0.99122966              6.1951854
## comp 6  0.82148934              5.1343084
## comp 7  0.51880250              3.2425156
## comp 8  0.50620590              3.1637869
## comp 9  0.37114965              2.3196853
## comp 10 0.25769565              1.6105978
## comp 11 0.23594615              1.4746635
## comp 12 0.14859537              0.9287210
## comp 13 0.11836782              0.7397989
## comp 14 0.11274452              0.7046532
## comp 15 0.08217834              0.5136146
## comp 16 0.07081024              0.4425640

##            Dim.1      Dim.2       Dim.3       Dim.4        Dim.5
## y00_04 0.5082115  0.6794351 -0.09453642  0.01066100 -0.013505930
## y05_09 0.5530741  0.7721438 -0.09976778 -0.08208166 -0.013288453
## y10_14 0.5973760  0.7307166 -0.07993783 -0.08829118 -0.004722731
## y15_19 0.5243002  0.7488211 -0.07217826 -0.09042962  0.007533299
## pnhw00 0.7191840 -0.3749509  0.01603900 -0.08124349  0.010183651
## pnhw12 0.7777672 -0.4194049  0.03417118 -0.04690623 -0.012993279

## Compare tidyLPA solutions:
## 
##  Model Classes AIC      BIC      Warnings
##  1     1       19038.51 19079.48         
##  1     2       19049.29 19115.86 Warning 
##  1     3       18358.81 18451.00         
##  1     4       18335.94 18453.73         
##  6     1       19050.51 19122.21         
##  6     2       17292.71 17441.23         
##  6     3       16775.44 17000.77         
##  6     4       16593.65 16895.80         
## 
## Best model according to AIC is Model 6 with 4 classes.
## Best model according to BIC is Model 6 with 4 classes.
## 
## An analytic hierarchy process, based on the fit indices AIC, AWE, BIC, CLC, and KIC (Akogul & Erisoglu, 2017), suggests the best solution is Model 6 with 4 classes.
y1 <- pca_orig_dim %>% 
  dplyr::select(dim1,dim2,dim3,dim4) %>% 
  single_imputation() %>% 
  scale() %>% 
  estimate_profiles(3,
                    variances = "varying",
                    covariances = "varying") %>% 
  plot_profiles()

y2<- pca_orig_dim %>% 
  dplyr::select(dim1,dim2,dim3,dim4) %>% 
  single_imputation() %>% 
  scale() %>% 
  estimate_profiles(3,package = "MplusAutomation",
                    variances = "varying",
                    covariances = "varying") %>% 
  plot_profiles()

y3 <- pca_orig_dim %>% 
  dplyr::select(dim1,dim2,dim3,dim4) %>% 
  single_imputation() %>% 
  scale() %>% 
  estimate_profiles(4,
                    variances = "varying",
                    covariances = "varying") %>% 
  plot_profiles()

y4<- pca_orig_dim %>% 
  dplyr::select(dim1,dim2,dim3,dim4) %>% 
  single_imputation() %>% 
  scale() %>% 
  estimate_profiles(4,package = "MplusAutomation",
                    variances = "varying",
                    covariances = "varying") %>% 
  plot_profiles()

Using natural logarithms of tract population

#summary(dat_orig)
#bpsw[is.na(bpsw)[,4:22],] = 0
dat_spl <- dat_orig[,4:19]
dat_spl[is.na(dat_spl)] = 0
dat_comb <- cbind(dat_orig[,1:3],dat_spl)
dat_origln <- dat_comb %>% 
  mutate(
    pop00ln = log(pop00 + 0.0001),
    pop12ln = log(pop12 + 0.0001),
    pop19ln = log(pop19 + 0.0001)
  )
#dat_origln <- dat_origln[!is.infinite(rowSums(dat_origln)),]

#### troubleshooting
#summary(dat_comb)
#summary(dat_origln$pop00ln)
#print(dat_origln)



pca_origln <- PCA(dat_origln[,c("y00_04","y05_09","y10_14","y15_19", "pnhw00", "pnhw12", "pnhw19", "pedu00","pedu12","pedu19","mhhinc00","mhhinc12","mhhinc19", "pop00ln","pop12ln","pop19ln")],
           scale.unit = T)

#summary(pca)
#eigenvalues
eigenval_origln <- pca_origln$eig
eigvar_origln <- as.data.frame(eigenval_origln[,1:2])
print(eigvar_origln)
##         eigenvalue percentage of variance
## comp 1  4.81246383             30.0778989
## comp 2  2.97418799             18.5886749
## comp 3  2.29323024             14.3326890
## comp 4  1.65770238             10.3606399
## comp 5  0.90452136              5.6532585
## comp 6  0.78705599              4.9191000
## comp 7  0.54836202              3.4272626
## comp 8  0.50120577              3.1325361
## comp 9  0.38283320              2.3927075
## comp 10 0.30926878              1.9329299
## comp 11 0.23157371              1.4473357
## comp 12 0.16540408              1.0337755
## comp 13 0.15311328              0.9569580
## comp 14 0.11462541              0.7164088
## comp 15 0.08396484              0.5247803
## comp 16 0.08048711              0.5030445
#Scree plot
fviz_screeplot(pca_origln,ncp = 10)

# examining first four dimensions
var_orig <- get_pca_var(pca_origln)
#head(var_origln$coord)
corrplot::corrplot(var_orig$coord)

#stripping out the top four dimensions and adding them to dat_orig

pca_orig_dim <- as.data.frame(pca_orig$ind$coord[,1:4])
pca_orig_dim <- cbind(dat_orig,pca_orig_dim)

pca_orig_dim <- pca_orig_dim %>% 
  mutate(
    dim1 = Dim.1,
    dim2 = Dim.2,
    dim3 = Dim.3,
    dim4 = Dim.4
  )

#comparing models
## comparing models
pca_orig_dim <- as.data.frame(pca_orig_dim)
pca_orig_dim%>%
    dplyr::select(dim1,dim2,dim3, dim4) %>%
    single_imputation() %>%
    estimate_profiles(1:4, 
                      variances = c("equal", "varying"),
                      covariances = c("zero", "varying")
                      ) %>%
    compare_solutions(statistics = c("AIC", "BIC"))
## Warning: 
## One or more analyses resulted in warnings! Examine these analyses carefully: model_1_class_2

## Warning: 
## One or more analyses resulted in warnings! Examine these analyses carefully: model_1_class_2
## Compare tidyLPA solutions:
## 
##  Model Classes AIC      BIC      Warnings
##  1     1       19038.51 19079.48         
##  1     2       19049.29 19115.86 Warning 
##  1     3       18358.81 18451.00         
##  1     4       18335.94 18453.73         
##  6     1       19050.51 19122.21         
##  6     2       17292.71 17441.23         
##  6     3       16775.44 17000.77         
##  6     4       16593.65 16895.80         
## 
## Best model according to AIC is Model 6 with 4 classes.
## Best model according to BIC is Model 6 with 4 classes.
## 
## An analytic hierarchy process, based on the fit indices AIC, AWE, BIC, CLC, and KIC (Akogul & Erisoglu, 2017), suggests the best solution is Model 6 with 4 classes.

LPA

y5 <- pca_orig_dim %>% 
  dplyr::select(dim1,dim2,dim3,dim4) %>% 
  single_imputation() %>% 
  scale() %>% 
  estimate_profiles(3,
                    variances = "varying",
                    covariances = "varying") %>% 
  plot_profiles()

y6<- pca_orig_dim %>% 
  dplyr::select(dim1,dim2,dim3,dim4) %>% 
  single_imputation() %>% 
  scale() %>% 
  estimate_profiles(3,package = "MplusAutomation",
                    variances = "varying",
                    covariances = "varying") %>% 
  plot_profiles()

y7 <- pca_orig_dim %>% 
  dplyr::select(dim1,dim2,dim3,dim4) %>% 
  single_imputation() %>% 
  scale() %>% 
  estimate_profiles(4,
                    variances = "varying",
                    covariances = "varying") %>% 
  plot_profiles()

y8<- pca_orig_dim %>% 
  dplyr::select(dim1,dim2,dim3,dim4) %>% 
  single_imputation() %>% 
  scale() %>% 
  estimate_profiles(4,package = "MplusAutomation",
                    variances = "varying",
                    covariances = "varying") %>% 
  plot_profiles()