Use of data from IPUMS PMA is subject to conditions including that users should cite the data appropriately. Use command `ipums_conditions()` for more details.
#read in spatial datalibrary(sf)
Linking to GEOS 3.13.1, GDAL 3.10.2, PROJ 9.5.1; sf_use_s2() is TRUE
UG_PMA_GPS<-read_sf("C:/Users/Rebecca/Downloads/PMA_UG_GPS_v2_06July2022/UGANDA/PMA_UG_GPS_v2_06July2022.csv")#read in boundary dataUG_boundary<-read_sf("C:/Users/Rebecca/Downloads/geouggen/geouggen.shp")UG_sub_boundary<-read_sf("C:/Users/Rebecca/Downloads/uganda-adm-boundaries/UGANDA BOUNDARIES SHAPEFILES AS OF 17 08 2018/COUNTIES_2018_UTM_36N.shp")
#do the same for medicial decision-makinglibrary(janitor)pmaug2022wgps<-pmaug2022wgps%>%mutate(HHDECMED=as.factor(HHDECMED))%>%mutate(decidemedical=recode(HHDECMED, '1'="respondent" ,'2'="husband/partner", '3'="someone else",.default =NA_character_))tabyl(pmaug2022wgps, decidemedical)
decidemeddichnum n percent
0 2297 0.7414461
1 801 0.2585539
pmaug2022groupeddecidemedweighted<-pmaug2022dropNA%>%group_by(EAID)%>%summarise(decidemedrateweighted=weighted.mean(decidemeddichnum, w=FQWEIGHT, strata=STRATA, na.rm=TRUE))#join with the othersspatialpmaugfull<-left_join(spatialpmaug, pmaug2022groupeddecidemedweighted)
Joining with `by = join_by(EAID)`
Now let’s check to see if these variables are correlated. There is a significant negative correlation between births per woman and a higher score of fertility autonomy.
Pearson's product-moment correlation
data: spatialpmaugfull$decidemedrateweighted and spatialpmaugfull$fertilityautonomymean
t = 1.263, df = 138, p-value = 0.2087
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
-0.0600731 0.2680452
sample estimates:
cor
0.1068955
Now let’s map these distributions out.
library(sf)#merge GPS data and decision-making datamapdata<-merge(spatialpmaugfull, UG_PMA_GPS_2022, by=c("EAID"))#convert to a spatial framemapdata <-st_as_sf(mapdata, coords =c("GPSLONG", "GPSLAT"), crs=4326)
ggplot() +geom_sf(data = UG_sub_boundary, color ="darkgrey", fill =NA, linewidth =0.4) +geom_sf(data = UG_boundary, color ="black", fill =NA, linewidth =0.4) +geom_sf(data = mapdata, aes(color = meanbirthsperwoman), size =5, alpha = .8) +scale_color_viridis_c(option ="magma", direction =-1, name ="Number of Children Ever Born") +ggtitle("PMA Enumeration Area Average Number of Children Ever Born, Uganda 2022")
ggplot() +geom_sf(data = UG_sub_boundary, color ="darkgrey", fill =NA, linewidth =0.4) +geom_sf(data = UG_boundary, color ="black", fill =NA, linewidth =0.4)+geom_sf(data = mapdata, aes(color = decidemedrateweighted), size =5, alpha = .8) +scale_color_viridis_c(option ="magma", direction =-1, name ="Involved in medicial decisions") +ggtitle("PMA Enumeration Area Medical Decision-making Rate, Uganda 2022")
ggplot() +geom_sf(data = UG_sub_boundary, color ="darkgrey", fill =NA, linewidth =0.4) +geom_sf(data = UG_boundary, color ="black", fill =NA, linewidth =0.4)+geom_sf(data = mapdata, aes(color = fertilityautonomymean), size =5, alpha = .8) +scale_color_viridis_c(option ="magma", direction =-1, name ="Fertility autonomy scale") +ggtitle("PMA Enumeration Area Fertility Autonomy Rate, Uganda 2022")
The maps showcase: 1) overall low involvement in women’s medical decision-making; 2) varied total births per woman and varied levels of fertility autonomy.
library(dplyr)# Select relevant columns and remove missing valuescluster_data <- mapdata %>%select(decidemedrateweighted, meanbirthsperwoman, fertilityautonomymean, geometry) %>%na.omit() # Extract and z-score numeric columns for clusteringnumeric_data<- cluster_data%>%st_drop_geometry() %>%# Removes the geometry columnselect(decidemedrateweighted, meanbirthsperwoman, fertilityautonomymean) %>%na.omit() %>%scale() # Z-score standardization# Run k-meansset.seed(123)kmeans_result <-kmeans(numeric_data, centers =3)# Add cluster labels back to original sf objectcluster_data$cluster <-as.factor(kmeans_result$cluster)# Elbow method to find optimal number of clusterswcss <-vector()for (i in1:10) { kmeans_model <-kmeans(numeric_data, centers = i) wcss[i] <- kmeans_model$tot.withinss}plot(1:10, wcss, type ="b", pch =19, frame =FALSE,xlab ="Number of clusters K",ylab ="Total within-clusters sum of squares")
#recode cluster labels for mapcluster_data<- cluster_data %>%mutate(cluster_level =case_when( cluster ==1~'low fertility- high fertility autonomy- mid involvement in medical decision-making', cluster ==2~'high fertility- mid fertility autonomy-low involvement in medical decision-making', cluster ==3~'low fertility- mid fertility autonomy- high involvement in medical decision-making' ))ggplot(data = cluster_data) +geom_sf(data = UG_sub_boundary, color ="darkgrey", fill =NA, linewidth =0.4) +geom_sf(aes(color = cluster_level), size =5, alpha =0.5) +geom_sf(data = UG_boundary, color ="black", fill =NA, linewidth =0.4) +scale_color_brewer(palette ="Set1", name ="Cluster") +ggtitle("Clusters of Medical Decision-making, Fertility Autonomy, and Number of Births per Woman, Uganda 2022")
Next, I perform ANOVA/ MANOVA testing to see if the clusters are significantly different from each other. These tests are significant.
# Drop geometry for analysisdf<- cluster_data %>%st_drop_geometry()# ANOVA for medical decision-making rateanova_decision <-aov(decidemedrateweighted~ cluster, data = df)summary(anova_decision)
Df Sum Sq Mean Sq F value Pr(>F)
cluster 2 2.441 1.2204 78.12 <2e-16 ***
Residuals 137 2.140 0.0156
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# ANOVA for fertility rateanova_fertility <-aov(meanbirthsperwoman ~ cluster, data = df)summary(anova_fertility)
Df Sum Sq Mean Sq F value Pr(>F)
cluster 2 51.82 25.91 74.13 <2e-16 ***
Residuals 137 47.89 0.35
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Multivariate ANOVAmanova_result <-manova(cbind(decidemedrateweighted, meanbirthsperwoman) ~ cluster, data = df)summary(manova_result)
Now we will begin testing for univariate clustering.
First, we calculate Moran’s I. Medical decision-making and total births per woman show significant univariate spatial clustering.
#univariate for medical decision makinglibrary(spdep)
Warning: package 'spdep' was built under R version 4.5.1
Loading required package: spData
Warning: package 'spData' was built under R version 4.5.1
To access larger datasets in this package, install the spDataLarge
package with: `install.packages('spDataLarge',
repos='https://nowosad.github.io/drat/', type='source')`
library(sf)# Transform to UTM Zone 36Nmapdata_proj <-st_transform(mapdata, crs =32636)# Use centroids for coordinatescoords_proj <-st_coordinates(st_centroid(mapdata_proj))# Create neighbors within 50 kmnb <-dnearneigh(coords_proj, 0, 50000)
Warning in dnearneigh(coords_proj, 0, 50000): neighbour object has 13
sub-graphs
lw <-nb2listw(nb, style ="W", zero.policy =TRUE)# Check column existence and run Moran's I (univariate example)moran_result <-moran.mc(mapdata_proj$decidemedrateweighted, lw, nsim =499)print(moran_result)
Monte-Carlo simulation of Moran I
data: mapdata_proj$decidemedrateweighted
weights: lw
number of simulations + 1: 500
statistic = 0.11413, observed rank = 485, p-value = 0.03
alternative hypothesis: greater
#univariate for fertilitylibrary(spdep)library(sf)# Transform to UTM Zone 36Nmapdata_proj <-st_transform(mapdata, crs =32636)# Use centroids for coordinatescoords_proj <-st_coordinates(st_centroid(mapdata_proj))# Create neighbors within 50 kmnb <-dnearneigh(coords_proj, 0, 50000)
Warning in dnearneigh(coords_proj, 0, 50000): neighbour object has 13
sub-graphs
lw <-nb2listw(nb, style ="W", zero.policy =TRUE)# Check column existence and run Moran's I (univariate example)moran_result <-moran.mc(mapdata_proj$meanbirthsperwoman, lw, nsim =499)print(moran_result)
Monte-Carlo simulation of Moran I
data: mapdata_proj$meanbirthsperwoman
weights: lw
number of simulations + 1: 500
statistic = 0.22011, observed rank = 500, p-value = 0.002
alternative hypothesis: greater
#univariate for fertility autonomylibrary(spdep)library(sf)# Transform to UTM Zone 36Nmapdata_proj <-st_transform(mapdata, crs =32636)# Use centroids for coordinatescoords_proj <-st_coordinates(st_centroid(mapdata_proj))# Create neighbors within 100 kmnb <-dnearneigh(coords_proj, 0, 50000)
Warning in dnearneigh(coords_proj, 0, 50000): neighbour object has 13
sub-graphs
lw <-nb2listw(nb, style ="W", zero.policy =TRUE)# Check column existence and run Moran's I (univariate example)moran_result <-moran.mc(mapdata_proj$fertilityautonomymean, lw, nsim =499)print(moran_result)
Monte-Carlo simulation of Moran I
data: mapdata_proj$fertilityautonomymean
weights: lw
number of simulations + 1: 500
statistic = 0.064198, observed rank = 427, p-value = 0.146
alternative hypothesis: greater
# Run Getis-Ord Gi* for fertilityrategi_fertility <-localG(mapdata_proj$meanbirthsperwoman, lw)# Run Getis-Ord Gi* for decidemedrategi_decision <-localG(mapdata_proj$decidemedrateweighted, lw)# Run Getis-Ord Gi* for MCPuserategi_fertaut <-localG(mapdata_proj$fertilityautonomymean, lw)
# Add Gi* results as new columnsmapdata$gi_fertility <- gi_fertilitymapdata$gi_decision <- gi_decisionmapdata$gi_fertaut <- gi_fertautmapdata$gi_fertility <-as.numeric(gi_fertility)mapdata$gi_decision <-as.numeric(gi_decision)mapdata$gi_fertaut <-as.numeric(gi_fertaut)
We can visualize the hot and cold spots using this approach.
scale_color_viridis_c(option ="D", name ="Gi* Score")
<ScaleContinuous>
Range:
Limits: 0 -- 1
ggplot(data = mapdata) +geom_sf(data = UG_sub_boundary, color ="darkgrey", fill =NA, linewidth =0.4) +geom_sf(aes(color = gi_fertility), size =5, alpha =0.5) +geom_sf(data = UG_boundary, color ="black", fill =NA, linewidth =0.4) +scale_color_gradient2(low ="blue", mid ="white", high ="red", midpoint =0, name ="Gi* Score") +ggtitle("Cluster Analysis of Births per Woman, Uganda 2022")
ggplot(data = mapdata) +geom_sf(data = UG_sub_boundary, color ="darkgrey", fill =NA, linewidth =0.4) +geom_sf(aes(color = gi_fertaut), size =5, alpha =0.5) +geom_sf(data = UG_boundary, color ="black", fill =NA, linewidth =0.4) +scale_color_gradient2(low ="blue", mid ="white", high ="red", midpoint =0, name ="Gi* Score") +ggtitle("Cluster Analysis of Fertility Autonomy, Uganda 2022")
ggplot(data = mapdata) +geom_sf(data = UG_sub_boundary, color ="darkgrey", fill =NA, linewidth =0.4) +geom_sf(aes(color = gi_decision), size =5, alpha =0.5) +geom_sf(data = UG_boundary, color ="black", fill =NA, linewidth =0.4) +scale_color_gradient2(low ="blue", mid ="white", high ="red", midpoint =0, name ="Gi* Score") +ggtitle("Cluster Analysis of Medical Decision-making, Uganda 2022")
Now let’s consider if these gi scores cluster together. We can use the k-means procedure again.
#scale the variables# Normalize variables (e.g., z-scores)mapdata$decide_z <-scale(mapdata$gi_decision)mapdata$fertaut_z <-scale(mapdata$gi_fertaut)mapdata$fert_z <-scale(mapdata$gi_fertility)library(dplyr)library(sf) # since you're working with sf objects# Drop geometry and keep only the z-scored variablesgi_data_scaled <- mapdata %>%st_drop_geometry() %>%select(decide_z, fertaut_z, fert_z) %>%mutate(across(everything(), as.numeric))# Remove rows with NA, NaN, or Infgi_data_scaled <- gi_data_scaled %>%filter(if_all(everything(), ~!is.na(.) &!is.infinite(.) &!is.nan(.)))# Check againsummary(gi_data_scaled)
decide_z fertaut_z fert_z
Min. :-2.1511 Min. :-2.69672 Min. :-2.1503
1st Qu.:-0.6461 1st Qu.:-0.72928 1st Qu.:-0.0925
Median :-0.1344 Median : 0.04732 Median : 0.3615
Mean : 0.0000 Mean : 0.00000 Mean : 0.0000
3rd Qu.: 0.5952 3rd Qu.: 0.67110 3rd Qu.: 0.6431
Max. : 2.7545 Max. : 1.73915 Max. : 1.3479
# Now run kmeansset.seed(123)kmeans_result <-kmeans(gi_data_scaled, centers =3, nstart =25)# Add cluster assignments back to your sf objectmapdata$cluster <-NAmapdata$cluster[as.numeric(rownames(gi_data_scaled))] <- kmeans_result$cluster# Print cluster means directly from kmeans resultkmeans_result$centers