This document is used to test the sensitivity of our Livability to the methods: geographically weighted principal component analysis (GWPCA), spatial principle analysis (sPCA), and two aspatial methods principle component analysis(PCA), the Bayesian factor. analysis (BFA). Then, we compare the correlation Livability ranking. Potentially, we can go with the GWPCA and the sBFA (from Matlab).
Next step: With the chosen method, we will conduction the regional analysis.
We need to load all of the packages that r requires for this exercise.
# Load libraries
packages.wanted <- c("robustbase","mvoutlier","RColorBrewer","GWmodel","raster","tmap")
for (package in packages.wanted) require(package,character.only=TRUE)
library(reshape2)
#sPCA
library(adegenet)
library(spdep)
library(adespatial)
library(splancs)
library(readxl)
#The tigris R package obtaining and using Census geographic data sets.
library(tigris)
library(ggplot2)
library(sf)
library(spdep)
# visualizes geographic data on an interactive, zoomable map.
library(mapview)
library(viridis)
# Load the GWPCA functions....
library(GWmodel)
library(dplyr)
library(tidyr)
# Scree Plot
library(stats)
# Bayesian packages
library(blavaan)
library(lavaan)
library(future)
plan("multicore")
library(bayesplot)
library(data.table)
library(vegan)
library(psych)
library(rcompanion)
library(patchwork) # For combining ggplot objects easily
# library(rgdal)
rm(packages.wanted,package)
library(dplyr)
Now, set up the data set used for this practice. All the data are stadndardize (Z-scores).
# Read the QoL data, might be updated
QoL_Livability <- read_excel("Data/Matlab Output Data/Summary of Ranks_Final.xlsx") %>%
dplyr::select(-P2, -P1) %>%
rename(Livability_Rank = Original,
Income_Rank = Rank_Income )
# UICs
UrbanInfluenceCodes2013 <- read_excel("Data/UrbanInfluenceCodes2013.xls",
sheet = "Urban Influence Codes 2013") %>%
dplyr::select(FIPS, UIC_2013, Description) %>%
mutate(FIPS = as.numeric(FIPS),
UIC = factor(paste(UIC_2013, Description, sep = " - ")))
Merge all the data for analysis.
QoL<- QoL_Livability %>%
left_join(ct, by = "FIPS") %>%
left_join(UrbanInfluenceCodes2013 , by = "FIPS") %>%
distinct(FIPS, .keep_all = TRUE) # Keep one record per FIPS and remove duplicates
str(QoL)
## tibble [3,036 × 29] (S3: tbl_df/tbl/data.frame)
## $ Livability_Rank : num [1:3036] 1399 826 1419 2416 2711 ...
## $ FIPS : num [1:3036] 1001 1003 1005 1007 1009 ...
## $ PerCap_Income_19_23: num [1:3036] 0.117 0.635 -1.111 -1.045 -0.517 ...
## $ Population : num [1:3036] 59285 239945 24757 22152 59292 ...
## $ Income_Rank : num [1:3036] 1166 595 2780 2725 2084 ...
## $ In_migration : num [1:3036] -0.125 -0.5942 -0.0391 0.1407 -1.0343 ...
## $ Out_migration : num [1:3036] 1.59715 -0.85933 -0.07824 -0.00716 -0.61798 ...
## $ HouseTenure : num [1:3036] -0.06989 1.17359 -0.00708 -0.38903 -0.77979 ...
## $ Commuting : num [1:3036] 1.677 -0.648 -0.332 1.834 1.751 ...
## $ Homeownership : num [1:3036] 0.22 0.531 -0.66 0.489 0.765 ...
## $ FamilyWithKids6_17 : num [1:3036] 1.14 -0.681 -0.542 -0.299 0.591 ...
## $ Business_Startup : num [1:3036] 0.2414 0.9013 -0.0393 0.8166 1.0365 ...
## $ BirthRate : num [1:3036] -0.652 -0.809 -0.258 0.479 -0.737 ...
## $ STATEFP : chr [1:3036] "01" "01" "01" "01" ...
## $ COUNTYFP : chr [1:3036] "001" "003" "005" "007" ...
## $ COUNTYNS : chr [1:3036] "00161526" "00161527" "00161528" "00161529" ...
## $ AFFGEOID : chr [1:3036] "0500000US01001" "0500000US01003" "0500000US01005" "0500000US01007" ...
## $ GEOID : chr [1:3036] "01001" "01003" "01005" "01007" ...
## $ NAME : chr [1:3036] "Autauga" "Baldwin" "Barbour" "Bibb" ...
## $ NAMELSAD : chr [1:3036] "Autauga County" "Baldwin County" "Barbour County" "Bibb County" ...
## $ STUSPS : chr [1:3036] "AL" "AL" "AL" "AL" ...
## $ STATE_NAME : chr [1:3036] "Alabama" "Alabama" "Alabama" "Alabama" ...
## $ LSAD : chr [1:3036] "06" "06" "06" "06" ...
## $ ALAND : num [1:3036] 1.54e+09 4.12e+09 2.29e+09 1.61e+09 1.67e+09 ...
## $ AWATER : num [1:3036] 2.57e+07 1.13e+09 5.05e+07 9.57e+06 1.49e+07 ...
## $ geometry :sfc_GEOMETRY of length 3036; first list element: List of 1
## ..$ :List of 1
## .. ..$ : num [1:368, 1:2] 845503 845570 845549 845766 846042 ...
## ..- attr(*, "class")= chr [1:3] "XY" "MULTIPOLYGON" "sfg"
## $ UIC_2013 : num [1:3036] 2 2 6 1 1 6 6 2 5 6 ...
## $ Description : chr [1:3036] "Small-in a metro area with fewer than 1 million residents" "Small-in a metro area with fewer than 1 million residents" "Noncore adjacent to a small metro with town of at least 2,500 residents" "Large-in a metro area with at least 1 million residents or more" ...
## $ UIC : Factor w/ 12 levels "1 - Large-in a metro area with at least 1 million residents or more",..: 5 5 9 1 1 9 9 5 8 9 ...
rm(ct, Matlab_subset, QoL_Livability, UrbanInfluenceCodes2013 )
# Check new column names
# colnames(QoL)
# Call inm y function
source("Functions/custom_summary.R")
QoL_RP <- QoL %>% dplyr::select("In_migration", "Out_migration", "HouseTenure", "Business_Startup", "Commuting", "FamilyWithKids6_17", "Homeownership", "BirthRate")
# Applying the custom summary function to your QoL data frame
summary_table <- custom_summary(QoL_RP)
print(summary_table)
## Variable Minimum Mean Maximum Sde Observations
## 1 In_migration -1.88 0 10.82 1 3036
## 2 Out_migration -2.09 0 18.60 1 3036
## 3 HouseTenure -4.33 0 6.09 1 3036
## 4 Business_Startup -3.88 0 8.88 1 3036
## 5 Commuting -1.96 0 2.80 1 3036
## 6 FamilyWithKids6_17 -5.90 0 4.06 1 3036
## 7 Homeownership -6.26 0 2.88 1 3036
## 8 BirthRate -2.46 0 11.46 1 3036
A significant high correlation between in-migration and out-migration and a negative correlation between housing tenure and home ownership. The insiginificant correlation are corossed as figure shows.
source("Functions/corrplot2.R")
corrplot2(
data = QoL_RP,
method = "pearson",
sig.level = 0.05,
order = "original",
diag = FALSE,
type = "upper",
tl.srt = 75
)
## corrplot 0.95 loaded
Adjust the weights and reduce the dimensions here using PCA. Fromt he Scree plot below, we will keep the first 5 PCs.
# Running PCA
pca <- princomp(QoL_RP[,], cor = TRUE, scores = TRUE,fix_sign = TRUE)
# Diagnosis
# Extracting the standard deviation of each principal component
std_dev <- pca$sdev
# Eigenvalues (standard deviations squared)
eigen_values <- std_dev^2
# Proportion of variance explained by each component
prop_variance <- (std_dev^2) / sum(std_dev^2)
# Ensure the plotting window has enough space for secondary axis labels
par(mar = c(5, 4, 4, 5) + 0.1) # Adjust the right margin to give more space
# Plotting the primary axis
plot(prop_variance, xlab = "Principal Component", ylab = "Proportion of Variance Explained",
type = "b", pch = 19, main = "Scree Plot", ylim = c(0, max(prop_variance) + 0.05))
# Mapping secondary axis correctly
max_ratio <- max(eigen_values) / max(prop_variance)
axis_positions <- seq(0, max(prop_variance), length.out = length(eigen_values))
# Adding secondary axis on the right
axis(side = 4, at = axis_positions,
labels = round(axis_positions * max_ratio, 1), las = 1)
# Label the secondary axis with "Eigenvalues"
mtext("Eigenvalues", side = 4, line = 3) # Adjust 'line' to position the label properly
Here, we show the loading for those revealed preference variables. Normally, we will interpret each PC and give it a meaning. But for the index value, we leave it for now. If we go with the PCA, then we will need to circulate back to it and have a second thought.
# Load necessary libraries
pca$loadings
##
## Loadings:
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6 Comp.7 Comp.8
## In_migration 0.392 0.512 0.122 0.445 0.514 0.315
## Out_migration 0.274 0.435 0.525 0.178 -0.115 -0.603 -0.182 -0.137
## HouseTenure 0.561 -0.232 0.235 -0.129 -0.740
## Business_Startup 0.449 -0.630 -0.350 0.293 -0.413 0.149
## Commuting -0.299 0.477 -0.196 -0.546 0.358 -0.465
## FamilyWithKids6_17 0.285 -0.267 -0.435 -0.682 -0.251 0.337
## Homeownership -0.534 0.214 0.102 0.587 -0.548
## BirthRate 0.491 -0.771 0.364 0.126 -0.111
##
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6 Comp.7 Comp.8
## SS loadings 1.000 1.000 1.000 1.000 1.000 1.000 1.000 1.000
## Proportion Var 0.125 0.125 0.125 0.125 0.125 0.125 0.125 0.125
## Cumulative Var 0.125 0.250 0.375 0.500 0.625 0.750 0.875 1.000
I chose the first five PC based on the eigenvalues.
QoL$pc1 <- pca$scores[, 1]
QoL$pc2 <- pca$scores[, 2]
QoL$pc3 <- pca$scores[, 3]
QoL$pc4 <- pca$scores[, 4]
QoL$pc5 <- pca$scores[, 5]
#
# # Check if 'QoL$geometry' is a spatial object or convert it to one if needed
# ggplot(data = QoL$geometry) +
# geom_sf(aes(fill = QoL$pc1)) +
# scale_fill_viridis_c(option = "P", begin = 0, end = 1, name = "PC1 Score") +
# labs(title = "QoL-PC1 Scores" ) +
# theme_void()
#
# # Check if 'QoL$geometry' is a spatial object or convert it to one if needed
# ggplot(data = QoL$geometry) +
# geom_sf(aes(fill = QoL$pc2)) +
# scale_fill_viridis_c(option = "P", begin = 0, end = 1, name = "PC2 Score") +
# labs(title = "QoL-PC2 Scores" ) +
# theme_void()
library(ggplot2)
library(sf) # Load the sf package for spatial data
# I wrote this functuon for convinentince. will move it in the final version
plot_pcs <- function(data, pca_result, num_pcs = 5) {
# Loop through the specified number of PCs and create plots
for (i in 1:num_pcs) {
pc_name <- paste0("pc", i)
# Dynamically create column names and assign PCA scores
data$pc_name <- pca_result$scores[, i] # Extract PC scores and assign to data
# Generate the plot
plot <- ggplot(data$geometry) +
geom_sf(aes_string(fill = data$pc_name), show.legend = TRUE) +
scale_fill_gradient2(low = "darkgreen",
high ="darkred" ,
mid = "white",
midpoint = median(data$pc_name, na.rm = TRUE),
na.value = "grey",
name = pc_name) +
labs(title = paste("Map of", pc_name)) +
theme_void()
# Print the plot
print(plot)
}
}
# Assuming 'pca' is your PCA result from prcomp or similar
# Assuming 'QoL' is your sf object with geometry data
plot_pcs(QoL, pca, num_pcs = 5) # Change num_pcs to plot a different number of components
I use the first five PC to construct the Index. Then, I convert the index to the ranking for comparisons.
calculate_pca_index_5pcs <- function(data, pca_result) {
# Check if the PCA result has at least 5 components
if (ncol(pca_result$scores) < 5) {
stop("The PCA result does not have enough components.")
}
# Extract standard deviations and calculate the proportion of variance explained
std_dev <- pca_result$sdev
prop_variance <- (std_dev^2) / sum(std_dev^2)
# Sum the variance explained by the first five components for normalization
sum_five_variance <- sum(prop_variance[1:5])
# Weight the PCA scores by the explained variance of each component, normalized by the sum of the first five
data$pca_index <- data$pc1 * prop_variance[1] / sum_five_variance +
data$pc2 * prop_variance[2] / sum_five_variance +
data$pc3 * prop_variance[3] / sum_five_variance +
data$pc4 * prop_variance[4] / sum_five_variance +
data$pc5 * prop_variance[5] / sum_five_variance
# Convert PCA index to ranks
data$pca_rank <- rank(-data$pca_index) # Negative sign for descending order
return(data)
}
plot_pca_index_map <- function(data, title = "PCA Index Map") {
# Ensure data is an sf object
if (!inherits(data, "sf")) {
data <- st_as_sf(data)
}
# Define the custom color palette
col_func <- colorRampPalette(c( "#447733", "#77DD77", "#FFFFFF","#EE9988","#BB4444"))
# Plot the PCA rank
pic_pca <- ggplot(data) +
geom_sf(aes(fill = pca_rank)) +
scale_fill_gradientn(colors = col_func(100), name = "PCA Rank",
limits = c(min(data$pca_rank, na.rm = TRUE), max(data$pca_rank, na.rm = TRUE)),
oob = scales::oob_squish) +
labs(title = title) +
theme_void()
return(pic_pca)
}
# Assuming 'pca' is your PCA result and 'QoL' is your dataset including PCA scores
QoL <- calculate_pca_index_5pcs(QoL, pca)
# Plot and save the PCA rank map
pic_pca <- plot_pca_index_map(QoL)
print(pic_pca)
# Save the plot
ggsave("Output/Pics/QoL_Map_pca.png", plot = pic_pca, width = 10, height = 5, units = "in", dpi = 300)
In this section, I show the importance of each variable to the final index.
pca$prop_variance <- (pca$sdev^2) / sum(pca$sdev^2) # Proportion of variance explained by each component
# Sum the variance explained by the first five components
sum_first_five_variance <- sum(pca$prop_variance[1:5])
# Sum the variance explained by the first eight components
sum_variance <- sum(pca$prop_variance[1:8])
# Calculate the proportion of total variance (from the first eight components) explained by the first five
PTV_PCA <- sum_first_five_variance / sum_variance
# Weight the PCA scores by the explained variance of each component
# Normalized by the sum of the variance explained by the first five components
VS_PCA <- (pca$loadings[,1] * (pca$prop_variance[1]/sum_first_five_variance) +
pca$loadings[,2] * (pca$prop_variance[2]/sum_first_five_variance) +
pca$loadings[,3] * (pca$prop_variance[3]/sum_first_five_variance) +
pca$loadings[,4] * (pca$prop_variance[4]/sum_first_five_variance) +
pca$loadings[,5] * (pca$prop_variance[5]/sum_first_five_variance)) %>%
round(2)
# Show results of PCA loadings
VS_PCA
## In_migration Out_migration HouseTenure Business_Startup
## 0.28 0.28 0.15 -0.02
## Commuting FamilyWithKids6_17 Homeownership BirthRate
## -0.12 -0.13 -0.14 0.00
Consider the moving windows to calculate the PCA. Each location will have different loading matrix to show the heterogeneity. In every location, the loading matrix and the PCs are different.
# Calculate centroids of the geometry
ct_centroids <- QoL$geometry %>%
st_geometry() %>%
st_centroid()
# 'gw.dist' needs the coordinates in a specific format
d <- gw.dist(dp.locat = st_coordinates(ct_centroids), longlat = TRUE)
# Convert eh QoL data into SpatialPolygonsDataFrame for GWPCA analysis
spdf <- st_as_sf(QoL) %>% as( "Spatial")
The chunk below takes a long time to run. I preselected the bandwith (bw=2320) based on the optimization algorithm.
# # Choose the bandwidth=2320
# bw.gw.pca <- bw.gwpca(spdf,
# vars = c("In_migration", "Out_migration", "HouseTenure", "Business_Startup", "Commuting", "FamilyWithKids6_17", "Homeownership", "BirthRate"),
# k = 5,
# robust = FALSE,
# adaptive = TRUE)
# 2951 is the results from the code above, the optimal bandwidth
pca.gw <- gwpca(spdf,
vars = c("In_migration", "Out_migration", "HouseTenure", "Business_Startup", "Commuting", "FamilyWithKids6_17", "Homeownership", "BirthRate"),
bw = 2320 , k = 5, dMat = d, adaptive = TRUE,scores = T)
Pay attention to this operation of the sign here. This step is not necessary for the method itself but we make this decision based on the conclusion. Here we preset the sign of the In-migration has the positive sign in the First Principle Component. This move is to ensure the comparability of the index. However, we should be aware that an important spatial structure information has been disguised under this operation.
pca.gw_back <- pca.gw
# # I might need to double check the sign of the loading
# Loop through each of the 3100 observations
for (i in 1:3036) {
# Check if the loading for the first variable in the first principal component is negative
if (pca.gw$loadings[i, 1, 1] < 0 ) {
# Flip all loadings in the principal component
pca.gw$loadings[i, , ] <- (-1) * pca.gw$loadings[i, , ]
# Set the flag to TRUE indicating the flip has been performed
}
}
# pca.gw$loadings[, ,1 ]
# summary(pca.gw$loadings[, ,1 ])
# function for calculation pproportion of variance
prop.var <- function(gwpca.obj, n.components) {
return((rowSums(gwpca.obj$var[, 1:n.components]) /rowSums(gwpca.obj$var)) * 100)
}
# First 5 PC in GWPCA
QoL$var.gwpca <- prop.var(pca.gw, 5)
summary(QoL$var.gwpca)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 82.06 82.41 82.97 82.84 83.17 83.60
Circulae back if we decide to use GWPCA. More stories explored here.
# Convert SpatialPolygonsDataFrame back to an sf object
spdf_sf <- st_as_sf(spdf) #st_as_sf(QoL)
# Iterate over the first 5 principal components
for (i in 1:5) {
# Extract the loadings for the i-th component
local.loadings <- pca.gw$loadings[, , i] # loadings are stored in 3D array format
lead.item <- colnames(local.loadings)[max.col(abs(local.loadings))]
# Add the lead item as a new column in the spdf_sf data
spdf_sf$lead.item <- lead.item
# Plotting the spatial distribution of the lead item for the i-th component
plot_title <- paste("Spatial Distribution of Lead Item Scores: PC", i)
p <- ggplot(data = spdf_sf) +
geom_sf(aes(fill = as.factor(lead.item)), show.legend = TRUE) + # Convert to factor for categorical coloring
scale_fill_viridis_d(option = "C", name = "Lead Item Score") + # Using discrete color scale
labs(title = plot_title,
caption = "Data source: Your Data") +
theme_void()
# Print the plot
print(p)
}
To map the first few principal component score from a GWPCA
# Sum the variance explained by the first five components for normalization
sum_five_variance <- sum(pca.gw$SDF@data[c("Comp.1_PV", "Comp.2_PV", "Comp.3_PV", "Comp.4_PV", "Comp.5_PV")])
# Weight the PCA scores by the explained variance of each component, normalized by the sum of the first five
QoL$gwpca_index <- (QoL$gwpca_pc1 * pca.gw$SDF@data$Comp.1_PV +
QoL$gwpca_pc2 * pca.gw$SDF@data$Comp.2_PV +
QoL$gwpca_pc3 * pca.gw$SDF@data$Comp.3_PV +
QoL$gwpca_pc4 * pca.gw$SDF@data$Comp.4_PV +
QoL$gwpca_pc5 * pca.gw$SDF@data$Comp.5_PV) / pca.gw$SDF@data$local_CP
# Ensure 'QoL$geometry' is recognized as an sf object for plotting
if (!inherits(QoL, "sf")) {
QoL <- sf::st_as_sf(QoL)
}
# Plotting the PCA index
pic_gwpca <- ggplot(data = QoL) +
geom_sf(aes(fill = gwpca_index)) +
scale_fill_gradient2(low = "darkgreen",
high ="darkred" ,
mid = "white",
midpoint = median(QoL$gwpca_index, na.rm = TRUE),
na.value = "grey",
name = "GWPCA Index") +
labs(title = "QoL-PCA Index Scores") +
theme_void()
# Display the map
print(pic_gwpca)
# Save the map to a file
ggsave("Output/Pics/QoL_Map_gwpca.png", plot = pic_gwpca, width = 10, height = 5, units = "in", dpi = 300)
col_func <- colorRampPalette(c( "#447733", "#77DD77", "#FFFFFF","#EE9988","#BB4444"))
QoL$gwpca_rank <- rank(-QoL$gwpca_index )
pic_gwpca_rank <- ggplot(data = QoL$geometry) +
geom_sf(aes(fill = QoL$gwpca_rank)) +
scale_fill_gradientn(colors = col_func(100), name = "GWPCA Rank",
limits = c(min(QoL$gwpca_rank, na.rm = TRUE), max(QoL$gwpca_rank, na.rm = TRUE)),
oob = scales::oob_squish) +
theme_void()
pic_gwpca_rank
ggsave("Output/Pics/QoL_Map_gwpca_rank.png", plot = pic_gwpca_rank , width = 10, height = 5, units = "in", dpi = 300)
PTV_GWPCA <- pca.gw$SDF@data$local_CP
# Weight the PCA.GW scores by the explained variance of each component
# Normalized by the sum of the variance explained by the first two components
# each component i has its own matrix of loadings
VS_GWPCA <- (pca.gw$loadings[,,1]* pca.gw$SDF@data$Comp.1_PV +
pca.gw$loadings[,,2]* pca.gw$SDF@data$Comp.2_PV +
pca.gw$loadings[,,3]* pca.gw$SDF@data$Comp.3_PV +
pca.gw$loadings[,,4]* pca.gw$SDF@data$Comp.4_PV +
pca.gw$loadings[,,5]* pca.gw$SDF@data$Comp.5_PV) / pca.gw$SDF@data$local_CP %>% round(2)
# VS_GWPCA is your matrix or data frame containing numerical data
VS_GWPCA <- colMeans(VS_GWPCA, na.rm = TRUE) # na.rm = TRUE to handle any NA values safely
VS_GWPCA
## In_migration Out_migration HouseTenure Business_Startup
## 0.14826131 0.09107879 0.20311630 0.04212529
## Commuting FamilyWithKids6_17 Homeownership BirthRate
## -0.07913866 0.10027053 -0.18181539 -0.01185513
Spatial matrix constructed but we utilized the connection matrix in the spatial PCA function. This can be used later for robustness check.
# Check if QoL is already an sf object
if (!inherits(QoL, "sf")) {
# 'geometry' is the column with spatial data
QoL <- st_as_sf(QoL, sf_column_name = "geometry")
}
# QoL is properly an sf object and ready to use
# Using poly2nb to create neighbor relationships based on shared boundaries
neighbors <- poly2nb(QoL ) #
weights <- nb2listw(neighbors, style = "B", zero.policy = TRUE)
# Convert listw to a square matrix of weights
weight_matrix <- listw2mat(weights)
# Check the dimensions to ensure it's square
dim(weight_matrix)
## [1] 3036 3036
We need to construct the coordinates to calculate the spatial correlation of counties. We utilizaed the geometry information to calculate the centriods.
# Convert geometries to centroids
QoL <- QoL %>%
mutate(centroid = st_centroid(geometry))
# Now extract coordinates from centroids
QoL <- QoL %>%
mutate(x = st_coordinates(centroid)[,1],
y = st_coordinates(centroid)[,2])
# Convert QoL to a data frame if it's not already one
QoL <- as.data.frame(QoL)
# Ensure specified columns are numeric and remove rows with NAs
QoL <- QoL %>%
mutate(across(c("In_migration", "Out_migration", "HouseTenure", "Business_Startup",
"Commuting", "FamilyWithKids6_17", "Homeownership", "BirthRate"), as.numeric)) %>%
na.omit() # This will remove any rows that have NA values in any column
This is the code for Spatial PCA.
# centriod
xy_matrix <- cbind(QoL$x, QoL$y)
# spatial PCA
spca_results <- spca(QoL_RP[,] , xy = xy_matrix, center = FALSE,
scale = FALSE, scannf = FALSE,
nfposi = 3, nfnega = 1,
type = 3, ask = FALSE,
#k = 5, # the number of the neighbor
plot.nb = TRUE, edit.nb = FALSE,
truenames = TRUE,
d1 = NULL, d2 = NULL,
a = NULL, dmin = NULL,
axis = 1,
#matWeight = weight_matrix,
useLag = TRUE)
We can see the first postive egeinvalue is larger than the rest. There is no local structure in the data.
pc <- length(spca_results$eig)
# barplot(spca_results$eig,main="Eigenvalues of sPCA", col=rep(c("red","grey"),c(1,pc)))
# If there is any local spatial structure
barplot(spca_results$eig, main="A variant of the plot\n of sPCA eigenvalues",
col=spectral(length(spca_results$eig)))
legend("topright", fill=spectral(2), leg=c("Global structures", "Local structures"))
abline(h=0,col="grey")
Here we decompose th e object value var(x) I(x) into of the Spatial autocorrelation and the variance. We can see the first principle component shows enough representation of both sptial structure and the explaination of the variance.
screeplot(spca_results)
myLoadings <- spca_results$c1[,1]^2
names(myLoadings) <- rownames(spca_results$c1)
#to identify the 5% of alleles with the greatest contributions to the first global structure in mySpca, we need
loadingplot(myLoadings, threshold=quantile(myLoadings, 0.90), xlab="Variables", ylab="Weight of the variable", main="Contribution of variables \n to the first sPCA axis")
myLoadings <- spca_results$c1[,2]^2
names(myLoadings) <- rownames(spca_results$c1)
#to identify the 5% of alleles with the greatest contributions to the first global structure in mySpca, we need
loadingplot(myLoadings, threshold=quantile(myLoadings, 0.90), xlab="Variables", ylab="Weight of the variable", main="Contribution of variables \n to the second sPCA axis")
myLoadings <- spca_results$c1[,3]^2
names(myLoadings) <- rownames(spca_results$c1)
#to identify the 5% of alleles with the greatest contributions to the first global structure in mySpca, we need
loadingplot(myLoadings, threshold=quantile(myLoadings, 0.90), xlab="Variables", ylab="Weight of the variable", main="Contribution of variables \n to the third sPCA axis")
#colorplot(spca_results,cex=1,main="colorplot of mySpca, first global score")
# Ensure merge them with scores, here I just map the first PC
QoL <- QoL %>%
mutate(sPCA_PC1 = spca_results$ls[,1], sPCA_PC2 = spca_results$ls[,2] )
# # Perform the left join to merge data
# QoL <- QoL %>%
# left_join(ct %>% select(FIPS, geometry), by = "FIPS")
if (!inherits(QoL, "sf")) {
QoL <- st_as_sf(QoL) #, crs = 4326
}
# Create a map using ggplot2 with colors based on PC1
ggplot(QoL) +
geom_sf(aes(fill = sPCA_PC1), size = 0.5) + # Use fill for color based on PC1
scale_fill_viridis_c(option = "C", begin = 0, end = 1, name = "PC1 Score") +
labs(title = "sPCA- QoL 1st PC Scores") +
theme_void()
ggplot(QoL) +
geom_sf(aes(fill = sPCA_PC2), size = 0.5) + # Use fill for color based on PC2
scale_fill_viridis_c(option = "C", begin = 0, end = 1, name = "PC2 Score") +
labs(title = "sPCA- QoL 2nd PC Scores") +
theme_void()
### SPCA Maps- Composite Index
# the eigenvalues are stored correctly in spca_results$eig
var1 <- spca_results$eig[1] # Eigenvalue for the first principal component
var2 <- spca_results$eig[2] # Eigenvalue for the second principal component
# Calculate the sPCA index using each component's specific eigenvalue
QoL$sPCA_index <- spca_results$ls[,1] * var1 / (var1 + var2) +
spca_results$ls[,2] * var2 / (var1 + var2)
# if (!inherits(QoL, "sf")) {
# QoL <- st_as_sf(QoL, crs = 4326)
# }
# Plotting the sPCA index
pic_sPCA <- ggplot(QoL ) +
geom_sf(aes(fill = sPCA_index), size = 0.5) + # Use fill for color based on sPCA index
scale_fill_gradient2(low = "darkgreen",
high ="darkred" ,
mid = "white",
midpoint = median(QoL$sPCA_index, na.rm = TRUE),
na.value = "grey",
name = "sPCA Index") +
# labs(title = "sPCA - sPCA Index Scores") +
theme_void()
pic_sPCA
ggsave("Output/Pics/QoL_Map_spca.png", plot = pic_sPCA , width = 10, height = 5, units = "in", dpi = 300)
col_func <- colorRampPalette(c( "#447733", "#77DD77", "#FFFFFF","#EE9988","#BB4444"))
QoL$spca_rank <- rank(QoL$sPCA_index ) # here as the sign of the loadingis differetn from other method, i flips
pic_sPCA_rank <- ggplot(data = QoL$geometry) +
geom_sf(aes(fill = QoL$spca_rank)) +
scale_fill_gradientn(colors = col_func(100), name = "sPCA Rank",
limits = c(min(QoL$gwpca_rank, na.rm = TRUE), max(QoL$gwpca_rank, na.rm = TRUE)),
oob = scales::oob_squish) +
theme_void()
pic_sPCA_rank
ggsave("../03 Output/Pics/QoL_Map_spca_rank.png", plot =pic_sPCA_rank , width = 10, height = 5, units = "in", dpi = 300)
# Eigenvalues
var1 <- spca_results$eig[1] # Eigenvalue for the first principal component
var2 <- spca_results$eig[2] # Eigenvalue for the second principal component
# Sum of the eigenvalues for the first two and first six components
sum_2_sPC <- spca_results$eig[1] + spca_results$eig[2]
sum_sPC <- sum(spca_results$eig[1:6])
# Weighted loadings
VS_sPCA <- (spca_results$c1[,1]* var1 / sum_2_sPC + # Loading for the first component weighted
spca_results$c1[,2]* var2 / sum_2_sPC ) %>% round(2) # Loading for the second component weighted
names(VS_sPCA) <- c("In_migration", "Out_migration", "HouseTenure", "Business_Startup",
"Commuting", "FamilyWithKids6_17", "Homeownership", "BirthRate")
# Proportion of total variance explained by the first two components
PTV_sPCA <- sum_2_sPC / sum_sPC %>% round(2)
VS_sPCA
## In_migration Out_migration HouseTenure Business_Startup
## -0.32 -0.22 -0.38 -0.47
## Commuting FamilyWithKids6_17 Homeownership BirthRate
## 0.09 -0.06 0.13 0.01
PTV_sPCA
## [1] 0.603066
The loading for different components.
spca_results$c1
The PC values.
dim(spca_results$li)
## [1] 3036 4
head(spca_results$li)
The spatial lag term of the PC.
dim(spca_results$ls)
## [1] 3036 4
head(spca_results$ls)
Projection of the sPCA to the PCA
spca_results$as
rm(myGtest,neighbors ,pic_sPCA,pic_sPCA_rank,weight_matrix,weights,xy_matrix,myLoadings,pc,sum_2_sPC,sum_sPC, var1, var2)
We try the Bayesian Factor Anslsys.
# Model fit
QoL.model <- ' QoL =~ In_migration+ Out_migration + HouseTenure + Business_Startup + Commuting + FamilyWithKids6_17 + Homeownership + BirthRate
QoL~~1*QoL '
fit <- blavaan(QoL.model, data = QoL,
dp = dpriors(lambda = "normal(1,1)"), #prior for loading
auto.var = TRUE, auto.fix.first = FALSE,
auto.cov.lv.x = TRUE,save.lvs=TRUE,std.lv=TRUE)
cbind(1:length(coef(fit)), coef(fit))
# convergence diagnostics
blavInspect(fit, "rhat") ## want close to 1.000
blavInspect(fit, "neff") ## want large values
plot(fit) ## want a hairy caterpillar
plot(fit, 1:6) ## the numbers specify which parameters to plot, here I just
summary(fit)
coef(fit)
fitMeasures(fit)
coefficients <- coef(fit)
# Correctly define loadings and unique variances based on the given coefficients
loadings <- c(
In_migration = 0.464,
Out_migration = 0.221,
HouseTenure = 0.906,
Business_Startup = 0.091, # Updated from your input
Commuting = -0.333,
FamilyWithKids6_17 = 0.335,
Homeownership = -0.701, # Assuming negative based on your coefficients
BirthRate = -0.006
)
unique_variances <- c(
In_migration = 0.786,
Out_migration = 0.952,
HouseTenure = 0.181, # Updated based on your coefficients for variances
Business_Startup = 0.992, # Updated based on your coefficients for variances
Commuting = 0.890,
FamilyWithKids6_17 = 0.888,
Homeownership = 0.510, # Updated from your coefficients
BirthRate = 1.001
)
# Calculate R-squared for each variable
VS_Bayes<- sapply(names(loadings), function(var) {
lambda <- loadings[var]
psi <- unique_variances[var]
round( sqrt((lambda^2) / (lambda^2 + psi) ) ,2)
})
names(VS_Bayes) <- c("In_migration", "Out_migration", "HouseTenure", "Business_Startup",
"Commuting", "FamilyWithKids6_17", "Homeownership", "BirthRate")
VS_Bayes
## In_migration Out_migration HouseTenure Business_Startup
## 0.46 0.22 0.91 0.09
## Commuting FamilyWithKids6_17 Homeownership BirthRate
## 0.33 0.33 0.70 0.01
# latent variable samples
lvs <- blavInspect(fit, "lvs")
lvs <- do.call("rbind", lvs) ## convert to matrix
# Calculate the column means of lvs
QoL$BayeScores <- colMeans(lvs)
# The ranking of counties
QoL$BayeRank <- rank(-QoL$BayeScores, ties.method = "min")
write.csv(lvs, "Output/lvs.csv", row.names = TRUE)
# Ensure QoL is an sf object; if it's not, convert it
if (!inherits(QoL, "sf")) {
QoL <- st_as_sf(QoL)
}
# Plotting the map with Bayesian Scores
pic_baye <- ggplot(data = QoL ) +
geom_sf(aes(fill = BayeScores), size = 0.5) +
labs(title = "Bayesian Scores by County",
fill = "Factor Scores") +
theme_void() +
scale_fill_gradient2(low = "darkgreen",
high ="darkred" ,
mid = "white",
midpoint = median(QoL$BayeScores, na.rm = TRUE),
na.value = "grey",
name = "Baye Factor Score")
pic_baye
ggsave("Output/Pics/QoL_Map_baye.png", plot = pic_baye , width = 10, height = 5, units = "in", dpi = 300)
col_func <- colorRampPalette(c( "#447733", "#77DD77", "#FFFFFF","#EE9988","#BB4444"))
pic_baye_rank <- ggplot(data = QoL ) +
geom_sf(aes(fill = BayeRank)) +
scale_fill_gradientn(colors = col_func(100), name = "Baye FA Rank",
limits = c(min(QoL$gwpca_rank, na.rm = TRUE), max(QoL$gwpca_rank, na.rm = TRUE)),
oob = scales::oob_squish) +
theme_void()+ labs(title = "QoL - Bayesian Scores Ranking by County",
fill = "Ranking")
pic_baye_rank
ggsave("Output/Pics/QoL_Map_baye_rank.png", plot = pic_baye_rank , width = 10, height = 5, units = "in", dpi = 300)
–> –> –> –>
–>
–> –>
–> –>
–> –>
–>