This document is used to test the sensitivity of our Livability to the methods: geographically weighted principal component analysis (GWPCA), spatial principle analysis (sPCA), and two aspatial methods principle component analysis(PCA), the Bayesian factor. analysis (BFA). Then, we compare the correlation Livability ranking. Potentially, we can go with the GWPCA and the sBFA (from Matlab).

Next step: With the chosen method, we will conduction the regional analysis.

1 Set Up: Load packages

We need to load all of the packages that r requires for this exercise.

 # Load libraries
packages.wanted <- c("robustbase","mvoutlier","RColorBrewer","GWmodel","raster","tmap")
for (package in packages.wanted) require(package,character.only=TRUE)


library(reshape2) 
#sPCA
library(adegenet)
library(spdep) 
library(adespatial)
library(splancs)
library(readxl)

#The tigris R package obtaining and using Census geographic data sets.
library(tigris)  

library(ggplot2)
library(sf)
library(spdep)

# visualizes geographic data on an interactive, zoomable map.
library(mapview)
library(viridis) 
 
# Load the GWPCA functions....
library(GWmodel)
 
library(dplyr)
library(tidyr)

# Scree Plot
library(stats)


# Bayesian packages
library(blavaan)
library(lavaan)
library(future)
plan("multicore")
library(bayesplot)

library(data.table) 
library(vegan)
library(psych)
library(rcompanion)

library(patchwork)  # For combining ggplot objects easily
 
# library(rgdal)

rm(packages.wanted,package)
library(dplyr)

2 Data

2.1 Read Data

Now, set up the data set used for this practice. All the data are stadndardize (Z-scores).

# Read the QoL data, might be updated
QoL_Livability <- read_excel("Data/Matlab Output Data/Summary of Ranks_Final.xlsx") %>% 
  dplyr::select(-P2, -P1) %>% 
  rename(Livability_Rank = Original,
         Income_Rank = Rank_Income )


# UICs
UrbanInfluenceCodes2013 <- read_excel("Data/UrbanInfluenceCodes2013.xls", 
    sheet = "Urban Influence Codes 2013") %>% 
 dplyr::select(FIPS, UIC_2013, Description) %>% 
  mutate(FIPS = as.numeric(FIPS),
         UIC = factor(paste(UIC_2013, Description, sep = " - ")))

Merge all the data for analysis.

QoL<- QoL_Livability %>%
  left_join(ct, by = "FIPS") %>%
  left_join(UrbanInfluenceCodes2013 , by = "FIPS")   %>%
  distinct(FIPS, .keep_all = TRUE)  # Keep one record per FIPS and remove duplicates

str(QoL)

## tibble [3,036 × 29] (S3: tbl_df/tbl/data.frame)
##  $ Livability_Rank    : num [1:3036] 1399 826 1419 2416 2711 ...
##  $ FIPS               : num [1:3036] 1001 1003 1005 1007 1009 ...
##  $ PerCap_Income_19_23: num [1:3036] 0.117 0.635 -1.111 -1.045 -0.517 ...
##  $ Population         : num [1:3036] 59285 239945 24757 22152 59292 ...
##  $ Income_Rank        : num [1:3036] 1166 595 2780 2725 2084 ...
##  $ In_migration       : num [1:3036] -0.125 -0.5942 -0.0391 0.1407 -1.0343 ...
##  $ Out_migration      : num [1:3036] 1.59715 -0.85933 -0.07824 -0.00716 -0.61798 ...
##  $ HouseTenure        : num [1:3036] -0.06989 1.17359 -0.00708 -0.38903 -0.77979 ...
##  $ Commuting          : num [1:3036] 1.677 -0.648 -0.332 1.834 1.751 ...
##  $ Homeownership      : num [1:3036] 0.22 0.531 -0.66 0.489 0.765 ...
##  $ FamilyWithKids6_17 : num [1:3036] 1.14 -0.681 -0.542 -0.299 0.591 ...
##  $ Business_Startup   : num [1:3036] 0.2414 0.9013 -0.0393 0.8166 1.0365 ...
##  $ BirthRate          : num [1:3036] -0.652 -0.809 -0.258 0.479 -0.737 ...
##  $ STATEFP            : chr [1:3036] "01" "01" "01" "01" ...
##  $ COUNTYFP           : chr [1:3036] "001" "003" "005" "007" ...
##  $ COUNTYNS           : chr [1:3036] "00161526" "00161527" "00161528" "00161529" ...
##  $ AFFGEOID           : chr [1:3036] "0500000US01001" "0500000US01003" "0500000US01005" "0500000US01007" ...
##  $ GEOID              : chr [1:3036] "01001" "01003" "01005" "01007" ...
##  $ NAME               : chr [1:3036] "Autauga" "Baldwin" "Barbour" "Bibb" ...
##  $ NAMELSAD           : chr [1:3036] "Autauga County" "Baldwin County" "Barbour County" "Bibb County" ...
##  $ STUSPS             : chr [1:3036] "AL" "AL" "AL" "AL" ...
##  $ STATE_NAME         : chr [1:3036] "Alabama" "Alabama" "Alabama" "Alabama" ...
##  $ LSAD               : chr [1:3036] "06" "06" "06" "06" ...
##  $ ALAND              : num [1:3036] 1.54e+09 4.12e+09 2.29e+09 1.61e+09 1.67e+09 ...
##  $ AWATER             : num [1:3036] 2.57e+07 1.13e+09 5.05e+07 9.57e+06 1.49e+07 ...
##  $ geometry           :sfc_GEOMETRY of length 3036; first list element: List of 1
##   ..$ :List of 1
##   .. ..$ : num [1:368, 1:2] 845503 845570 845549 845766 846042 ...
##   ..- attr(*, "class")= chr [1:3] "XY" "MULTIPOLYGON" "sfg"
##  $ UIC_2013           : num [1:3036] 2 2 6 1 1 6 6 2 5 6 ...
##  $ Description        : chr [1:3036] "Small-in a metro area with fewer than 1 million residents" "Small-in a metro area with fewer than 1 million residents" "Noncore adjacent to a small metro with town of at least 2,500 residents" "Large-in a metro area with at least 1 million residents or more" ...
##  $ UIC                : Factor w/ 12 levels "1 - Large-in a metro area with at least 1 million residents or more",..: 5 5 9 1 1 9 9 5 8 9 ...

rm(ct, Matlab_subset,  QoL_Livability, UrbanInfluenceCodes2013 )

# Check new column names
# colnames(QoL)

2.2 Summary statistics of the revealed preferences

# Call inm y function
source("Functions/custom_summary.R")

QoL_RP <- QoL %>% dplyr::select("In_migration", "Out_migration", "HouseTenure", "Business_Startup", "Commuting", "FamilyWithKids6_17",  "Homeownership", "BirthRate")


# Applying the custom summary function to your QoL data frame
summary_table <- custom_summary(QoL_RP)

print(summary_table)

##             Variable Minimum Mean Maximum Sde Observations
## 1       In_migration   -1.88    0   10.82   1         3036
## 2      Out_migration   -2.09    0   18.60   1         3036
## 3        HouseTenure   -4.33    0    6.09   1         3036
## 4   Business_Startup   -3.88    0    8.88   1         3036
## 5          Commuting   -1.96    0    2.80   1         3036
## 6 FamilyWithKids6_17   -5.90    0    4.06   1         3036
## 7      Homeownership   -6.26    0    2.88   1         3036
## 8          BirthRate   -2.46    0   11.46   1         3036

2.3 Correlation of variables

A significant high correlation between in-migration and out-migration and a negative correlation between housing tenure and home ownership. The insiginificant correlation are corossed as figure shows.

source("Functions/corrplot2.R")

corrplot2(
  data = QoL_RP,
  method = "pearson",
  sig.level = 0.05,
  order = "original",
  diag = FALSE,
  type = "upper",
  tl.srt = 75
)

## corrplot 0.95 loaded

3 PCA

Adjust the weights and reduce the dimensions here using PCA. Fromt he Scree plot below, we will keep the first 5 PCs.

# Running PCA
pca <- princomp(QoL_RP[,], cor = TRUE, scores = TRUE,fix_sign = TRUE)

# Diagnosis
# Extracting the standard deviation of each principal component
std_dev <- pca$sdev

# Eigenvalues (standard deviations squared)
eigen_values <- std_dev^2

# Proportion of variance explained by each component
prop_variance <- (std_dev^2) / sum(std_dev^2)

# Ensure the plotting window has enough space for secondary axis labels
par(mar = c(5, 4, 4, 5) + 0.1)  # Adjust the right margin to give more space

# Plotting the primary axis
plot(prop_variance, xlab = "Principal Component", ylab = "Proportion of Variance Explained",
     type = "b", pch = 19, main = "Scree Plot", ylim = c(0, max(prop_variance) + 0.05))

# Mapping secondary axis correctly
max_ratio <- max(eigen_values) / max(prop_variance)
axis_positions <- seq(0, max(prop_variance), length.out = length(eigen_values))

# Adding secondary axis on the right
axis(side = 4, at = axis_positions,
     labels = round(axis_positions * max_ratio, 1), las = 1)

# Label the secondary axis with "Eigenvalues"
mtext("Eigenvalues", side = 4, line = 3)  # Adjust 'line' to position the label properly

Here, we show the loading for those revealed preference variables. Normally, we will interpret each PC and give it a meaning. But for the index value, we leave it for now. If we go with the PCA, then we will need to circulate back to it and have a second thought.

# Load necessary libraries
pca$loadings

## 
## Loadings:
##                    Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6 Comp.7 Comp.8
## In_migration        0.392  0.512  0.122                0.445  0.514  0.315
## Out_migration       0.274  0.435  0.525  0.178 -0.115 -0.603 -0.182 -0.137
## HouseTenure         0.561        -0.232                0.235 -0.129 -0.740
## Business_Startup           0.449 -0.630 -0.350  0.293 -0.413         0.149
## Commuting          -0.299  0.477        -0.196 -0.546  0.358 -0.465       
## FamilyWithKids6_17  0.285 -0.267        -0.435 -0.682 -0.251  0.337       
## Homeownership      -0.534  0.214  0.102                       0.587 -0.548
## BirthRate                         0.491 -0.771  0.364  0.126 -0.111       
## 
##                Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6 Comp.7 Comp.8
## SS loadings     1.000  1.000  1.000  1.000  1.000  1.000  1.000  1.000
## Proportion Var  0.125  0.125  0.125  0.125  0.125  0.125  0.125  0.125
## Cumulative Var  0.125  0.250  0.375  0.500  0.625  0.750  0.875  1.000

3.1 PCA Maps - PCs

I chose the first five PC based on the eigenvalues.

QoL$pc1 <- pca$scores[, 1]
QoL$pc2 <- pca$scores[, 2]
QoL$pc3 <- pca$scores[, 3]
QoL$pc4 <- pca$scores[, 4]
QoL$pc5 <- pca$scores[, 5]
#  
 
# # Check if 'QoL$geometry' is a spatial object or convert it to one if needed
# ggplot(data = QoL$geometry) +
#  geom_sf(aes(fill = QoL$pc1)) +
#   scale_fill_viridis_c(option = "P", begin = 0, end = 1, name = "PC1 Score") +
#   labs(title = "QoL-PC1 Scores" ) +
#   theme_void()
# 
# # Check if 'QoL$geometry' is a spatial object or convert it to one if needed
# ggplot(data = QoL$geometry) +
#  geom_sf(aes(fill = QoL$pc2)) +
#   scale_fill_viridis_c(option = "P", begin = 0, end = 1, name = "PC2 Score") +
#   labs(title = "QoL-PC2 Scores" ) +
#   theme_void()

library(ggplot2)
library(sf)  # Load the sf package for spatial data

# I wrote this functuon for convinentince. will move it in the final version
plot_pcs <- function(data, pca_result, num_pcs = 5) {
  # Loop through the specified number of PCs and create plots
  for (i in 1:num_pcs) {
    pc_name <- paste0("pc", i)
    
    # Dynamically create column names and assign PCA scores
    data$pc_name <- pca_result$scores[, i]  # Extract PC scores and assign to data
    
    # Generate the plot
    plot <- ggplot(data$geometry) +
      geom_sf(aes_string(fill = data$pc_name), show.legend = TRUE) +
      scale_fill_gradient2(low = "darkgreen", 
                       high ="darkred" , 
                       mid = "white", 
                       midpoint = median(data$pc_name, na.rm = TRUE),
                       na.value = "grey",
                       name = pc_name)   +
      labs(title = paste("Map of", pc_name)) +
      theme_void()
    
    # Print the plot
    print(plot)
  }
}
  
  

# Assuming 'pca' is your PCA result from prcomp or similar
# Assuming 'QoL' is your sf object with geometry data
plot_pcs(QoL, pca, num_pcs = 5)  # Change num_pcs to plot a different number of components

3.2 PCA Maps - Composite index map

I use the first five PC to construct the Index. Then, I convert the index to the ranking for comparisons.

calculate_pca_index_5pcs <- function(data, pca_result) {
  # Check if the PCA result has at least 5 components
  if (ncol(pca_result$scores) < 5) {
    stop("The PCA result does not have enough components.")
  }
  
  # Extract standard deviations and calculate the proportion of variance explained
  std_dev <- pca_result$sdev
  prop_variance <- (std_dev^2) / sum(std_dev^2)
  
  # Sum the variance explained by the first five components for normalization
  sum_five_variance <- sum(prop_variance[1:5])
  
  # Weight the PCA scores by the explained variance of each component, normalized by the sum of the first five
  data$pca_index <- data$pc1 * prop_variance[1] / sum_five_variance +
                    data$pc2 * prop_variance[2] / sum_five_variance +
                    data$pc3 * prop_variance[3] / sum_five_variance +
                    data$pc4 * prop_variance[4] / sum_five_variance +
                    data$pc5 * prop_variance[5] / sum_five_variance
  # Convert PCA index to ranks
  data$pca_rank <- rank(-data$pca_index)  # Negative sign for descending order
  
  return(data)
}
  
 

plot_pca_index_map <- function(data, title = "PCA Index Map") {
  # Ensure data is an sf object
  if (!inherits(data, "sf")) {
    data <- st_as_sf(data)
  }
  
  # Define the custom color palette
  col_func <- colorRampPalette(c( "#447733", "#77DD77",  "#FFFFFF","#EE9988","#BB4444"))
  
  # Plot the PCA rank
  pic_pca <- ggplot(data) +
    geom_sf(aes(fill = pca_rank)) +
    scale_fill_gradientn(colors = col_func(100), name = "PCA Rank", 
                         limits = c(min(data$pca_rank, na.rm = TRUE), max(data$pca_rank, na.rm = TRUE)),
                         oob = scales::oob_squish) +
    labs(title = title) +
    theme_void()

  return(pic_pca)
}
 

# Assuming 'pca' is your PCA result and 'QoL' is your dataset including PCA scores
QoL <- calculate_pca_index_5pcs(QoL, pca)

# Plot and save the PCA rank map
pic_pca <- plot_pca_index_map(QoL)
print(pic_pca)

# Save the plot
ggsave("Output/Pics/QoL_Map_pca.png", plot = pic_pca, width = 10, height = 5, units = "in", dpi = 300)

3.3 PCA Variable Selection

In this section, I show the importance of each variable to the final index.

pca$prop_variance <- (pca$sdev^2) / sum(pca$sdev^2)  # Proportion of variance explained by each component


# Sum the variance explained by the first five components
sum_first_five_variance <- sum(pca$prop_variance[1:5])

# Sum the variance explained by the first eight components
sum_variance <- sum(pca$prop_variance[1:8])


# Calculate the proportion of total variance (from the first eight components) explained by the first five
PTV_PCA <- sum_first_five_variance / sum_variance


# Weight the PCA scores by the explained variance of each component
# Normalized by the sum of the variance explained by the first five components
VS_PCA <- (pca$loadings[,1] * (pca$prop_variance[1]/sum_first_five_variance) +
           pca$loadings[,2] * (pca$prop_variance[2]/sum_first_five_variance) +
           pca$loadings[,3] * (pca$prop_variance[3]/sum_first_five_variance) +
           pca$loadings[,4] * (pca$prop_variance[4]/sum_first_five_variance) +
           pca$loadings[,5] * (pca$prop_variance[5]/sum_first_five_variance)) %>% 
  round(2)


# Show results of PCA loadings
VS_PCA

##       In_migration      Out_migration        HouseTenure   Business_Startup 
##               0.28               0.28               0.15              -0.02 
##          Commuting FamilyWithKids6_17      Homeownership          BirthRate 
##              -0.12              -0.13              -0.14               0.00

4 Geographically Weighted PCA

Consider the moving windows to calculate the PCA. Each location will have different loading matrix to show the heterogeneity. In every location, the loading matrix and the PCs are different.

# Calculate centroids of the geometry
ct_centroids <- QoL$geometry %>%
  st_geometry() %>%
  st_centroid()

# 'gw.dist' needs the coordinates in a specific format
d <- gw.dist(dp.locat = st_coordinates(ct_centroids), longlat = TRUE)

# Convert eh QoL data into SpatialPolygonsDataFrame for GWPCA analysis
spdf <- st_as_sf(QoL) %>% as(  "Spatial")

The chunk below takes a long time to run. I preselected the bandwith (bw=2320) based on the optimization algorithm.

# # Choose the bandwidth=2320
# bw.gw.pca <- bw.gwpca(spdf,
#                       vars = c("In_migration", "Out_migration", "HouseTenure", "Business_Startup", "Commuting", "FamilyWithKids6_17",  "Homeownership", "BirthRate"),
#                       k = 5,
#                       robust = FALSE,
#                       adaptive = TRUE)


# 2951 is the results from the code above, the optimal bandwidth
pca.gw <- gwpca(spdf,  
                vars = c("In_migration", "Out_migration", "HouseTenure", "Business_Startup", "Commuting", "FamilyWithKids6_17",  "Homeownership", "BirthRate"),
                bw = 2320 , k = 5, dMat = d, adaptive = TRUE,scores = T)

4.1 Sign!!!

Pay attention to this operation of the sign here. This step is not necessary for the method itself but we make this decision based on the conclusion. Here we preset the sign of the In-migration has the positive sign in the First Principle Component. This move is to ensure the comparability of the index. However, we should be aware that an important spatial structure information has been disguised under this operation.

pca.gw_back <- pca.gw 
# # I might need to double check the sign of the loading

# Loop through each of the 3100 observations
for (i in 1:3036) {
  # Check if the loading for the first variable in the first principal component is negative
  if (pca.gw$loadings[i, 1, 1] < 0  ) {
    # Flip all loadings in the principal component
    pca.gw$loadings[i, , ] <- (-1) * pca.gw$loadings[i, , ]
    # Set the flag to TRUE indicating the flip has been performed
  }
 
}

# pca.gw$loadings[, ,1 ]
# summary(pca.gw$loadings[, ,1 ])

# function for calculation pproportion of variance
prop.var <- function(gwpca.obj, n.components) {
            return((rowSums(gwpca.obj$var[, 1:n.components]) /rowSums(gwpca.obj$var)) * 100)
}

# First 5 PC in GWPCA
QoL$var.gwpca <- prop.var(pca.gw, 5)

summary(QoL$var.gwpca)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   82.06   82.41   82.97   82.84   83.17   83.60

4.2 GWPCA - The dominant Variable in each PCs

Circulae back if we decide to use GWPCA. More stories explored here.

# Convert SpatialPolygonsDataFrame back to an sf object
spdf_sf <- st_as_sf(spdf) #st_as_sf(QoL)  

# Iterate over the first 5 principal components
for (i in 1:5) {
  # Extract the loadings for the i-th component
  local.loadings <- pca.gw$loadings[, , i]  # loadings are stored in 3D array format
  lead.item <- colnames(local.loadings)[max.col(abs(local.loadings))]

  # Add the lead item as a new column in the spdf_sf data
  spdf_sf$lead.item <- lead.item

  # Plotting the spatial distribution of the lead item for the i-th component
  plot_title <- paste("Spatial Distribution of Lead Item Scores: PC", i)

  p <- ggplot(data = spdf_sf) +
    geom_sf(aes(fill = as.factor(lead.item)), show.legend = TRUE) +  # Convert to factor for categorical coloring
    scale_fill_viridis_d(option = "C", name = "Lead Item Score") +  # Using discrete color scale
    labs(title = plot_title,
         caption = "Data source: Your Data") +
    theme_void()

  # Print the plot
  print(p)
}

4.3 GWPCA Maps - PCs

To map the first few principal component score from a GWPCA

4.4 GWPCA Maps - Composite Index

# Sum the variance explained by the first five components for normalization
sum_five_variance <- sum(pca.gw$SDF@data[c("Comp.1_PV", "Comp.2_PV", "Comp.3_PV", "Comp.4_PV", "Comp.5_PV")])

# Weight the PCA scores by the explained variance of each component, normalized by the sum of the first five
QoL$gwpca_index <- (QoL$gwpca_pc1 * pca.gw$SDF@data$Comp.1_PV +
                    QoL$gwpca_pc2 * pca.gw$SDF@data$Comp.2_PV + 
                    QoL$gwpca_pc3 * pca.gw$SDF@data$Comp.3_PV + 
                    QoL$gwpca_pc4 * pca.gw$SDF@data$Comp.4_PV +
                    QoL$gwpca_pc5 * pca.gw$SDF@data$Comp.5_PV) / pca.gw$SDF@data$local_CP

# Ensure 'QoL$geometry' is recognized as an sf object for plotting
if (!inherits(QoL, "sf")) {
  QoL <- sf::st_as_sf(QoL)
}

# Plotting the PCA index
pic_gwpca <- ggplot(data = QoL) +
  geom_sf(aes(fill = gwpca_index)) +
   scale_fill_gradient2(low = "darkgreen", 
                       high ="darkred" , 
                       mid = "white", 
                       midpoint = median(QoL$gwpca_index, na.rm = TRUE),
                       na.value = "grey",
                       name = "GWPCA Index") +
  labs(title = "QoL-PCA Index Scores") +
  theme_void()

# Display the map
print(pic_gwpca)

# Save the map to a file
ggsave("Output/Pics/QoL_Map_gwpca.png", plot = pic_gwpca, width = 10, height = 5, units = "in", dpi = 300)

4.5 GWPCA Ranking

col_func <- colorRampPalette(c( "#447733", "#77DD77",  "#FFFFFF","#EE9988","#BB4444"))
  
QoL$gwpca_rank <- rank(-QoL$gwpca_index )

pic_gwpca_rank  <- ggplot(data = QoL$geometry) +
 geom_sf(aes(fill = QoL$gwpca_rank)) +
  scale_fill_gradientn(colors = col_func(100), name = "GWPCA Rank", 
                         limits = c(min(QoL$gwpca_rank, na.rm = TRUE), max(QoL$gwpca_rank, na.rm = TRUE)),
                         oob = scales::oob_squish)  +
  theme_void()
 

pic_gwpca_rank

ggsave("Output/Pics/QoL_Map_gwpca_rank.png", plot = pic_gwpca_rank , width = 10, height = 5, units = "in", dpi = 300)

4.6 GWPCA - Variabel selection

PTV_GWPCA <- pca.gw$SDF@data$local_CP 

# Weight the PCA.GW scores by the explained variance of each component
# Normalized by the sum of the variance explained by the first two components
# each component i has its own matrix of loadings
VS_GWPCA <-  (pca.gw$loadings[,,1]*  pca.gw$SDF@data$Comp.1_PV +
                    pca.gw$loadings[,,2]*  pca.gw$SDF@data$Comp.2_PV + 
                    pca.gw$loadings[,,3]*  pca.gw$SDF@data$Comp.3_PV + 
                    pca.gw$loadings[,,4]*  pca.gw$SDF@data$Comp.4_PV +
                    pca.gw$loadings[,,5]*  pca.gw$SDF@data$Comp.5_PV) /  pca.gw$SDF@data$local_CP   %>% round(2)


 


# VS_GWPCA is your matrix or data frame containing numerical data
VS_GWPCA <- colMeans(VS_GWPCA, na.rm = TRUE)  # na.rm = TRUE to handle any NA values safely

VS_GWPCA

##       In_migration      Out_migration        HouseTenure   Business_Startup 
##         0.14826131         0.09107879         0.20311630         0.04212529 
##          Commuting FamilyWithKids6_17      Homeownership          BirthRate 
##        -0.07913866         0.10027053        -0.18181539        -0.01185513

5 Spatial PCA

5.1 Spatial weight matrix

Spatial matrix constructed but we utilized the connection matrix in the spatial PCA function. This can be used later for robustness check.

# Check if QoL is already an sf object
if (!inherits(QoL, "sf")) {
  # 'geometry' is the column with spatial data
  QoL <- st_as_sf(QoL, sf_column_name = "geometry")
}

# QoL is properly an sf object and ready to use
# Using poly2nb to create neighbor relationships based on shared boundaries
neighbors <- poly2nb(QoL ) #
weights <- nb2listw(neighbors, style = "B", zero.policy = TRUE)

# Convert listw to a square matrix of weights
weight_matrix <- listw2mat(weights)

# Check the dimensions to ensure it's square
dim(weight_matrix)

## [1] 3036 3036

5.2 Centriod of the polygons

We need to construct the coordinates to calculate the spatial correlation of counties. We utilizaed the geometry information to calculate the centriods.

# Convert geometries to centroids
QoL <- QoL %>%
  mutate(centroid = st_centroid(geometry))

# Now extract coordinates from centroids
QoL <- QoL %>%
  mutate(x = st_coordinates(centroid)[,1],
         y = st_coordinates(centroid)[,2])

5.3 Remove NAs and change the format of the data

# Convert QoL to a data frame if it's not already one
QoL <- as.data.frame(QoL)

# Ensure specified columns are numeric and remove rows with NAs
QoL <- QoL %>%
  mutate(across(c("In_migration", "Out_migration", "HouseTenure", "Business_Startup", 
                  "Commuting", "FamilyWithKids6_17", "Homeownership", "BirthRate"), as.numeric)) %>%
  na.omit()  # This will remove any rows that have NA values in any column

5.4 Analysis

This is the code for Spatial PCA.

# centriod
xy_matrix <- cbind(QoL$x, QoL$y)

# spatial PCA
spca_results <- spca(QoL_RP[,] , xy = xy_matrix,  center = FALSE,
                     scale = FALSE, scannf = FALSE,
                      nfposi = 3, nfnega = 1,
                      type = 3, ask = FALSE,
                     #k = 5, # the number of the neighbor
                      plot.nb = TRUE, edit.nb = FALSE,
                      truenames = TRUE,
                      d1 = NULL, d2 = NULL,
                      a = NULL, dmin = NULL,
                      axis = 1,
                     #matWeight = weight_matrix,
                     useLag = TRUE)

5.4.1 Spatial Struture

We can see the first postive egeinvalue is larger than the rest. There is no local structure in the data.

pc <- length(spca_results$eig)
# barplot(spca_results$eig,main="Eigenvalues of sPCA", col=rep(c("red","grey"),c(1,pc)))

# If there is any local spatial structure
barplot(spca_results$eig, main="A variant of the plot\n of sPCA eigenvalues",
        col=spectral(length(spca_results$eig)))
legend("topright", fill=spectral(2), leg=c("Global structures", "Local structures"))
abline(h=0,col="grey")

5.4.2 Decomposition

Here we decompose th e object value var(x) I(x) into of the Spatial autocorrelation and the variance. We can see the first principle component shows enough representation of both sptial structure and the explaination of the variance.

screeplot(spca_results)

5.4.3 Contribution of Specific Variabels to the QoL

myLoadings <- spca_results$c1[,1]^2
names(myLoadings) <- rownames(spca_results$c1)

#to identify the 5% of alleles with the greatest contributions to the first global structure in mySpca, we need
loadingplot(myLoadings, threshold=quantile(myLoadings, 0.90), xlab="Variables", ylab="Weight of the variable", main="Contribution of variables \n to the first sPCA axis")

myLoadings <- spca_results$c1[,2]^2
names(myLoadings) <- rownames(spca_results$c1)

#to identify the 5% of alleles with the greatest contributions to the first global structure in mySpca, we need
loadingplot(myLoadings, threshold=quantile(myLoadings, 0.90), xlab="Variables", ylab="Weight of the variable", main="Contribution of variables \n to the second sPCA axis")

myLoadings <- spca_results$c1[,3]^2
names(myLoadings) <- rownames(spca_results$c1)

#to identify the 5% of alleles with the greatest contributions to the first global structure in mySpca, we need
loadingplot(myLoadings, threshold=quantile(myLoadings, 0.90), xlab="Variables", ylab="Weight of the variable", main="Contribution of variables \n to the third sPCA axis")

5.4.4 SPCA Maps - Map the first few global scores

#colorplot(spca_results,cex=1,main="colorplot of mySpca, first global score")


# Ensure merge them with scores, here I just map the first PC
QoL <- QoL %>%
  mutate(sPCA_PC1 = spca_results$ls[,1], sPCA_PC2 = spca_results$ls[,2] )

# # Perform the left join to merge data
# QoL <- QoL %>%
#   left_join(ct %>% select(FIPS, geometry), by = "FIPS")


if (!inherits(QoL, "sf")) {
    QoL <- st_as_sf(QoL) #,  crs = 4326
}

# Create a map using ggplot2 with colors based on PC1
ggplot(QoL) +
  geom_sf(aes(fill = sPCA_PC1), size = 0.5) +  # Use fill for color based on PC1
   scale_fill_viridis_c(option = "C", begin = 0, end = 1, name = "PC1 Score")  +
  labs(title = "sPCA- QoL 1st PC Scores") +
    theme_void()

ggplot(QoL) +
  geom_sf(aes(fill = sPCA_PC2), size = 0.5) +  # Use fill for color based on PC2
   scale_fill_viridis_c(option = "C", begin = 0, end = 1, name = "PC2 Score")  +
  labs(title = "sPCA- QoL 2nd PC Scores") +
    theme_void()

### SPCA Maps- Composite Index

# the eigenvalues are stored correctly in spca_results$eig
var1 <- spca_results$eig[1]  # Eigenvalue for the first principal component
var2 <- spca_results$eig[2]  # Eigenvalue for the second principal component

# Calculate the sPCA index using each component's specific eigenvalue
QoL$sPCA_index <- spca_results$ls[,1] * var1 / (var1 + var2) +
                  spca_results$ls[,2] * var2 / (var1 + var2)


# if (!inherits(QoL, "sf")) {
#     QoL <- st_as_sf(QoL,  crs = 4326)
# }
# Plotting the sPCA index
pic_sPCA <- ggplot(QoL ) +
  geom_sf(aes(fill = sPCA_index), size = 0.5) +  # Use fill for color based on sPCA index
     scale_fill_gradient2(low = "darkgreen", 
                       high ="darkred" , 
                       mid = "white", 
                       midpoint = median(QoL$sPCA_index, na.rm = TRUE),
                       na.value = "grey",
                       name = "sPCA Index") +
  # labs(title = "sPCA - sPCA Index Scores") +
  theme_void()

pic_sPCA

ggsave("Output/Pics/QoL_Map_spca.png", plot = pic_sPCA   , width = 10, height = 5, units = "in", dpi = 300)

5.4.5 SPCA Ranking

col_func <- colorRampPalette(c( "#447733", "#77DD77",  "#FFFFFF","#EE9988","#BB4444"))


QoL$spca_rank <- rank(QoL$sPCA_index ) # here as the sign of the loadingis differetn from other method, i flips

pic_sPCA_rank <- ggplot(data = QoL$geometry) +
 geom_sf(aes(fill = QoL$spca_rank)) +
  scale_fill_gradientn(colors = col_func(100), name = "sPCA Rank", 
                         limits = c(min(QoL$gwpca_rank, na.rm = TRUE), max(QoL$gwpca_rank, na.rm = TRUE)),
                         oob = scales::oob_squish)  +
  theme_void()


pic_sPCA_rank

ggsave("../03 Output/Pics/QoL_Map_spca_rank.png", plot =pic_sPCA_rank  , width = 10, height = 5, units = "in", dpi = 300)

5.4.6 SPCA Variable Selection

# Eigenvalues
var1 <- spca_results$eig[1]  # Eigenvalue for the first principal component
var2 <- spca_results$eig[2]  # Eigenvalue for the second principal component

# Sum of the eigenvalues for the first two and first six components
sum_2_sPC <- spca_results$eig[1] + spca_results$eig[2]
sum_sPC <- sum(spca_results$eig[1:6])


# Weighted loadings
VS_sPCA <- (spca_results$c1[,1]* var1 / sum_2_sPC +  # Loading for the first component weighted
           spca_results$c1[,2]* var2 / sum_2_sPC ) %>% round(2)   # Loading for the second component weighted


names(VS_sPCA) <- c("In_migration", "Out_migration", "HouseTenure", "Business_Startup", 
                  "Commuting", "FamilyWithKids6_17", "Homeownership", "BirthRate")

# Proportion of total variance explained by the first two components
PTV_sPCA <- sum_2_sPC / sum_sPC %>% round(2)

VS_sPCA

##       In_migration      Out_migration        HouseTenure   Business_Startup 
##              -0.32              -0.22              -0.38              -0.47 
##          Commuting FamilyWithKids6_17      Homeownership          BirthRate 
##               0.09              -0.06               0.13               0.01

PTV_sPCA

## [1] 0.603066

5.4.7 Other Summary

The loading for different components.

spca_results$c1

The PC values.

dim(spca_results$li)

## [1] 3036    4

head(spca_results$li)

The spatial lag term of the PC.

dim(spca_results$ls)

## [1] 3036    4

head(spca_results$ls)

Projection of the sPCA to the PCA

spca_results$as

rm(myGtest,neighbors ,pic_sPCA,pic_sPCA_rank,weight_matrix,weights,xy_matrix,myLoadings,pc,sum_2_sPC,sum_sPC, var1, var2)

6 Bayesian Factor Analysis

We try the Bayesian Factor Anslsys.

# Model fit
QoL.model <- ' QoL  =~  In_migration+ Out_migration + HouseTenure + Business_Startup + Commuting  + FamilyWithKids6_17  + Homeownership + BirthRate
              QoL~~1*QoL '

fit <- blavaan(QoL.model, data = QoL,
               dp = dpriors(lambda = "normal(1,1)"), #prior for loading
               auto.var = TRUE, auto.fix.first = FALSE,
               auto.cov.lv.x = TRUE,save.lvs=TRUE,std.lv=TRUE)

cbind(1:length(coef(fit)), coef(fit))



# convergence diagnostics
blavInspect(fit, "rhat")  ## want close to 1.000
blavInspect(fit, "neff")  ## want large values
plot(fit)  ## want a hairy caterpillar

plot(fit, 1:6)  ## the numbers specify which parameters to plot, here I just

summary(fit)
coef(fit)
fitMeasures(fit)

6.1 Bayesian Variable Selection

coefficients <- coef(fit)

# Correctly define loadings and unique variances based on the given coefficients
loadings <- c(
  In_migration = 0.464, 
  Out_migration = 0.221, 
  HouseTenure = 0.906,
  Business_Startup = 0.091,  # Updated from your input
  Commuting = -0.333,
  FamilyWithKids6_17 = 0.335,
  Homeownership = -0.701,  # Assuming negative based on your coefficients
  BirthRate = -0.006
)

unique_variances <- c(
  In_migration = 0.786, 
  Out_migration = 0.952, 
  HouseTenure = 0.181,  # Updated based on your coefficients for variances
  Business_Startup = 0.992,  # Updated based on your coefficients for variances
  Commuting = 0.890,
  FamilyWithKids6_17 = 0.888,
  Homeownership = 0.510,  # Updated from your coefficients
  BirthRate = 1.001
)

# Calculate R-squared for each variable
VS_Bayes<- sapply(names(loadings), function(var) {
  lambda <- loadings[var]
  psi <- unique_variances[var]
  round( sqrt((lambda^2) / (lambda^2 + psi) ) ,2)
})

names(VS_Bayes) <-  c("In_migration", "Out_migration", "HouseTenure", "Business_Startup", 
                  "Commuting", "FamilyWithKids6_17", "Homeownership", "BirthRate")
 
VS_Bayes

##       In_migration      Out_migration        HouseTenure   Business_Startup 
##               0.46               0.22               0.91               0.09 
##          Commuting FamilyWithKids6_17      Homeownership          BirthRate 
##               0.33               0.33               0.70               0.01

# latent variable samples
lvs <- blavInspect(fit, "lvs")
lvs <- do.call("rbind", lvs) ## convert to matrix


# Calculate the column means of lvs
QoL$BayeScores <- colMeans(lvs)

# The ranking of counties
QoL$BayeRank <- rank(-QoL$BayeScores, ties.method = "min")

write.csv(lvs, "Output/lvs.csv", row.names = TRUE)

6.2 Bayesian Maps

# Ensure QoL is an sf object; if it's not, convert it
if (!inherits(QoL, "sf")) {
  QoL  <- st_as_sf(QoL)
}

# Plotting the map with Bayesian Scores
pic_baye <- ggplot(data = QoL ) +
  geom_sf(aes(fill = BayeScores), size = 0.5) +
  labs(title = "Bayesian Scores by County",
       fill = "Factor Scores") +
  theme_void() +
scale_fill_gradient2(low = "darkgreen", 
                       high ="darkred" , 
                       mid = "white", 
                       midpoint = median(QoL$BayeScores, na.rm = TRUE),
                       na.value = "grey",
                       name = "Baye Factor Score") 

pic_baye

ggsave("Output/Pics/QoL_Map_baye.png", plot = pic_baye   , width = 10, height = 5, units = "in", dpi = 300)
 



col_func <- colorRampPalette(c( "#447733", "#77DD77",  "#FFFFFF","#EE9988","#BB4444"))
  
 
pic_baye_rank  <- ggplot(data = QoL ) +
 geom_sf(aes(fill =  BayeRank)) +
  scale_fill_gradientn(colors = col_func(100), name = "Baye FA Rank", 
                         limits = c(min(QoL$gwpca_rank, na.rm = TRUE), max(QoL$gwpca_rank, na.rm = TRUE)),
                         oob = scales::oob_squish)  +
  theme_void()+  labs(title = "QoL - Bayesian Scores  Ranking by County",
       fill = "Ranking")
 
 
pic_baye_rank

ggsave("Output/Pics/QoL_Map_baye_rank.png", plot = pic_baye_rank   , width = 10, height = 5, units = "in", dpi = 300)

–> –> –> –>

–>

–> –>

–>

Spatial Indices of QoL Working in Progress

Yang Cheng

2025-02-26