In the following task I am going to use Principal Component Analysis (PCA) to perform a variable reduction in order to look at the health index at the county level. I am using Area Resource Files, in combination with Compressed Mortality Files to do the Principal Component Analysis. The variables used to develop the index are, Rural Population, Medicare Encrollment, Health Insurance (under 65 years old), unemployment rate, and food stamp or SNAP beneficiary.
library(readr)
library(stargazer)
##
## Please cite as:
## Hlavac, Marek (2018). stargazer: Well-Formatted Regression and Summary Statistics Tables.
## R package version 5.2.1. https://CRAN.R-project.org/package=stargazer
library(survey)
## Loading required package: grid
## Loading required package: Matrix
## Loading required package: survival
##
## Attaching package: 'survey'
## The following object is masked from 'package:graphics':
##
## dotchart
library(survival)
library(cmprsk)
library(questionr)
library(haven)
library(tidyr)
##
## Attaching package: 'tidyr'
## The following object is masked from 'package:Matrix':
##
## expand
library(pander)
library(knitr)
library(ggplot2)
library(tidycensus)
library(acs)
## Loading required package: stringr
## Loading required package: XML
##
## Attaching package: 'acs'
## The following object is masked from 'package:base':
##
## apply
library(lmtest)
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(sandwich)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:acs':
##
## combine
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
load("C://Users/munta/Google Drive/ACS/cmf06_10_age_sex_std.Rdata")
load("C://Users/munta/Google Drive/ACS/arf1516.Rdata")
dim(cmf)
## [1] 3147 7
dim(arf15)
## [1] 3230 6921
## To install your API key for use in future sessions, run this function with `install = TRUE`.
#Gini coefficent for income inquality
gini<-get_acs(geography = "county", variables = "B19083_001", year = 2016 )
## Please note: `get_acs()` now defaults to a year or endyear of 2016.
mydata<-merge(x = arf15, y=cmf, by.x="f00004", by.y="County.Code")
library(tigris)
## To enable
## caching of data, set `options(tigris_use_cache = TRUE)` in your R script or .Rprofile.
##
## Attaching package: 'tigris'
## The following object is masked from 'package:graphics':
##
## plot
## Warning: '*' is not a valid FIPS code or state name/abbreviation
usco<-counties(state = "*", year = 2010,cb = T )
## Warning: '*' is not a valid FIPS code or state name/abbreviation
library(sf)
## Linking to GEOS 3.6.1, GDAL 2.2.0, proj.4 4.9.3
library(ggplot2)
usco<- st_as_sf(usco)
usco<-mutate(usco, geoid=paste(STATEFP, COUNTYFP, sep=""))
usco<-inner_join(usco, y=mydata, by=c("geoid"="f00004") )
usco<-inner_join(usco,y=gini, c("geoid"="GEOID") )
usco<-mutate(usco,pblack=100*( ( f1391005+ f1391105)/ f1198405),
phisp=100*( (f1392005+ f1392105)/ f1198405),
pwht=100*( ( f1390805+f1390905)/f1198405),
prural = 100*( f1367900/ f0453000 ),
unemp= f0679505,
pmedienroll= f1325405/f1198405,
psnap= f1408405/f1198405,
phltins= f1475010/ f0453010,
popdensity=f1198405/(as.numeric(CENSUSAREA)/1000000),
gini=estimate )
usco<- filter(usco, !STATEFP %in% c("02", "15", "72"), is.na(Age.Adjusted.Rate)==F&is.na(pblack)==F&is.na(prural)==F&is.na(pmedienroll)==F&is.na(phltins)==F&is.na(psnap)==F)
pchlthindex <- prcomp(~prural+unemp+pmedienroll+psnap+phltins, center=T, scale=T, data=usco, retx=T)
pchlthindex
## Standard deviations (1, .., p=5):
## [1] 1.3553676 1.1788503 0.9558361 0.7257425 0.5770318
##
## Rotation (n x k) = (5 x 5):
## PC1 PC2 PC3 PC4 PC5
## prural -0.2814473 0.6320380 -0.2745066 0.6506140 0.1505422
## unemp -0.5842397 -0.2080674 0.4034358 0.2628991 -0.6192702
## pmedienroll -0.2170194 0.6868331 0.1713002 -0.6452456 -0.1883541
## psnap -0.6316040 -0.2040555 0.1748363 -0.1651046 0.7082447
## phltins -0.3652813 -0.2094222 -0.8378390 -0.2529507 -0.2382305
Interm of loadings for PC1, we see negative associations with everything, which can be interpreted as an index for health, since all the variables are loading in the same direction. For PC2, there’s a variation in the loadings, some positive and some negative. Rural Population and Medicare Enrollment return large coefficients.
summary(pchlthindex)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5
## Standard deviation 1.3554 1.1789 0.9558 0.7257 0.57703
## Proportion of Variance 0.3674 0.2779 0.1827 0.1053 0.06659
## Cumulative Proportion 0.3674 0.6453 0.8281 0.9334 1.00000
PC1 is explaining around 37% variation, whereas PC2 is explaining 27% variation.
screeplot(pchlthindex, type = "l", main = "Scree Plot")
abline(h=1)
The first and second eigenvalues are more than, which suggests there are 2 real components among these 5 variables.
pchlthindex$rotation
## PC1 PC2 PC3 PC4 PC5
## prural -0.2814473 0.6320380 -0.2745066 0.6506140 0.1505422
## unemp -0.5842397 -0.2080674 0.4034358 0.2628991 -0.6192702
## pmedienroll -0.2170194 0.6868331 0.1713002 -0.6452456 -0.1883541
## psnap -0.6316040 -0.2040555 0.1748363 -0.1651046 0.7082447
## phltins -0.3652813 -0.2094222 -0.8378390 -0.2529507 -0.2382305
hist(pchlthindex$x[,1])
hist(pchlthindex$x[,2])
The two histograms show fairly symmetrical disbution around the mean.
cor(pchlthindex$x[,1:2])
## PC1 PC2
## PC1 1.000000e+00 -9.860855e-16
## PC2 -9.860855e-16 1.000000e+00
#scores<-data.frame(pchlthindex$x)
#scores$name<-rownames(pchlthindex$x)
#mydata$name<-rownames(mydata)
#mydata<-merge(, scores, by.x="name", by.y="name", all.x=F)
#tail(names(mydata), 20)
usco<-select(usco, "prural", "unemp", "pmedienroll", "psnap", "phltins")
usco$pc1<-pchlthindex$x[,1]
The correlation between the first two componenets do not show proimising outcome.
summary(usco$phltins)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.03037 0.11660 0.14765 0.15004 0.17895 0.34666
test<-as.data.frame(usco)
round(cor(test[, c("prural", "unemp", "pmedienroll", "psnap", "phltins")], method = "spearman"), 3)
## prural unemp pmedienroll psnap phltins
## prural 1.000 0.062 0.449 0.049 0.128
## unemp 0.062 1.000 0.078 0.651 0.165
## pmedienroll 0.449 0.078 1.000 0.126 -0.076
## psnap 0.049 0.651 0.126 1.000 0.376
## phltins 0.128 0.165 -0.076 0.376 1.000
The rural population show perfect correlation.
#round(cor(test[, c("prural", "unemp", "pmedienroll", "psnap", "phltins", "PC1", "PC2")], method = "spearman"), 3)