In the following task I am going to use Principal Component Analysis (PCA) to perform a variable reduction in order to look at the health index at the county level. I am using Area Resource Files, in combination with Compressed Mortality Files to do the Principal Component Analysis. The variables used to develop the index are, Rural Population, Medicare Encrollment, Health Insurance (under 65 years old), unemployment rate, and food stamp or SNAP beneficiary.

Loading Libraries

library(readr)
library(stargazer)
## 
## Please cite as:
##  Hlavac, Marek (2018). stargazer: Well-Formatted Regression and Summary Statistics Tables.
##  R package version 5.2.1. https://CRAN.R-project.org/package=stargazer
library(survey)
## Loading required package: grid
## Loading required package: Matrix
## Loading required package: survival
## 
## Attaching package: 'survey'
## The following object is masked from 'package:graphics':
## 
##     dotchart
library(survival)
library(cmprsk)
library(questionr)
library(haven)
library(tidyr)
## 
## Attaching package: 'tidyr'
## The following object is masked from 'package:Matrix':
## 
##     expand
library(pander)
library(knitr)
library(ggplot2)
library(tidycensus)
library(acs)
## Loading required package: stringr
## Loading required package: XML
## 
## Attaching package: 'acs'
## The following object is masked from 'package:base':
## 
##     apply
library(lmtest)
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(sandwich)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:acs':
## 
##     combine
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
load("C://Users/munta/Google Drive/ACS/cmf06_10_age_sex_std.Rdata")
load("C://Users/munta/Google Drive/ACS/arf1516.Rdata")

dim(cmf)
## [1] 3147    7
dim(arf15)
## [1] 3230 6921
## To install your API key for use in future sessions, run this function with `install = TRUE`.

Merging Data

#Gini coefficent for income inquality
gini<-get_acs(geography = "county", variables = "B19083_001", year = 2016 )
## Please note: `get_acs()` now defaults to a year or endyear of 2016.
mydata<-merge(x = arf15, y=cmf, by.x="f00004", by.y="County.Code")
library(tigris)
## To enable 
## caching of data, set `options(tigris_use_cache = TRUE)` in your R script or .Rprofile.
## 
## Attaching package: 'tigris'
## The following object is masked from 'package:graphics':
## 
##     plot
## Warning: '*' is not a valid FIPS code or state name/abbreviation
usco<-counties(state = "*",  year = 2010,cb = T )
## Warning: '*' is not a valid FIPS code or state name/abbreviation
library(sf)
## Linking to GEOS 3.6.1, GDAL 2.2.0, proj.4 4.9.3
library(ggplot2)

Selecting Variables

usco<- st_as_sf(usco)
usco<-mutate(usco, geoid=paste(STATEFP, COUNTYFP, sep=""))
usco<-inner_join(usco, y=mydata, by=c("geoid"="f00004") ) 
usco<-inner_join(usco,y=gini, c("geoid"="GEOID") )

usco<-mutate(usco,pblack=100*( (   f1391005+ f1391105)/  f1198405),
             phisp=100*( (f1392005+ f1392105)/ f1198405),
             pwht=100*( (  f1390805+f1390905)/f1198405),
             prural = 100*( f1367900/ f0453000 ),
             unemp= f0679505,
             pmedienroll= f1325405/f1198405,
             psnap=  f1408405/f1198405,
             phltins= f1475010/  f0453010,
             popdensity=f1198405/(as.numeric(CENSUSAREA)/1000000),
             gini=estimate )
usco<-  filter(usco, !STATEFP %in% c("02", "15", "72"), is.na(Age.Adjusted.Rate)==F&is.na(pblack)==F&is.na(prural)==F&is.na(pmedienroll)==F&is.na(phltins)==F&is.na(psnap)==F)

Creating the Index

pchlthindex <- prcomp(~prural+unemp+pmedienroll+psnap+phltins, center=T, scale=T, data=usco, retx=T)

pchlthindex
## Standard deviations (1, .., p=5):
## [1] 1.3553676 1.1788503 0.9558361 0.7257425 0.5770318
## 
## Rotation (n x k) = (5 x 5):
##                    PC1        PC2        PC3        PC4        PC5
## prural      -0.2814473  0.6320380 -0.2745066  0.6506140  0.1505422
## unemp       -0.5842397 -0.2080674  0.4034358  0.2628991 -0.6192702
## pmedienroll -0.2170194  0.6868331  0.1713002 -0.6452456 -0.1883541
## psnap       -0.6316040 -0.2040555  0.1748363 -0.1651046  0.7082447
## phltins     -0.3652813 -0.2094222 -0.8378390 -0.2529507 -0.2382305

Interm of loadings for PC1, we see negative associations with everything, which can be interpreted as an index for health, since all the variables are loading in the same direction. For PC2, there’s a variation in the loadings, some positive and some negative. Rural Population and Medicare Enrollment return large coefficients.

Summary Statistics

summary(pchlthindex)
## Importance of components:
##                           PC1    PC2    PC3    PC4     PC5
## Standard deviation     1.3554 1.1789 0.9558 0.7257 0.57703
## Proportion of Variance 0.3674 0.2779 0.1827 0.1053 0.06659
## Cumulative Proportion  0.3674 0.6453 0.8281 0.9334 1.00000

PC1 is explaining around 37% variation, whereas PC2 is explaining 27% variation.

Scree Plot

screeplot(pchlthindex, type = "l", main = "Scree Plot")
abline(h=1)

The first and second eigenvalues are more than, which suggests there are 2 real components among these 5 variables.

pchlthindex$rotation
##                    PC1        PC2        PC3        PC4        PC5
## prural      -0.2814473  0.6320380 -0.2745066  0.6506140  0.1505422
## unemp       -0.5842397 -0.2080674  0.4034358  0.2628991 -0.6192702
## pmedienroll -0.2170194  0.6868331  0.1713002 -0.6452456 -0.1883541
## psnap       -0.6316040 -0.2040555  0.1748363 -0.1651046  0.7082447
## phltins     -0.3652813 -0.2094222 -0.8378390 -0.2529507 -0.2382305

Histogram

hist(pchlthindex$x[,1])

hist(pchlthindex$x[,2])

The two histograms show fairly symmetrical disbution around the mean.

Correlation Matrix

cor(pchlthindex$x[,1:2])
##               PC1           PC2
## PC1  1.000000e+00 -9.860855e-16
## PC2 -9.860855e-16  1.000000e+00
#scores<-data.frame(pchlthindex$x)
#scores$name<-rownames(pchlthindex$x)
#mydata$name<-rownames(mydata)
#mydata<-merge(, scores, by.x="name", by.y="name", all.x=F)
#tail(names(mydata), 20)

usco<-select(usco, "prural", "unemp", "pmedienroll", "psnap", "phltins")

usco$pc1<-pchlthindex$x[,1]

The correlation between the first two componenets do not show proimising outcome.

summary(usco$phltins)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.03037 0.11660 0.14765 0.15004 0.17895 0.34666
test<-as.data.frame(usco)
round(cor(test[, c("prural", "unemp", "pmedienroll", "psnap", "phltins")], method = "spearman"), 3)
##             prural unemp pmedienroll psnap phltins
## prural       1.000 0.062       0.449 0.049   0.128
## unemp        0.062 1.000       0.078 0.651   0.165
## pmedienroll  0.449 0.078       1.000 0.126  -0.076
## psnap        0.049 0.651       0.126 1.000   0.376
## phltins      0.128 0.165      -0.076 0.376   1.000

The rural population show perfect correlation.

correlations among the original variables and the components

#round(cor(test[, c("prural", "unemp", "pmedienroll", "psnap", "phltins", "PC1", "PC2")], method = "spearman"), 3)