Principal Components Analysis

This example illustrates the use of the method of Principal Components to form an index of overall health using data from the 2011 CDC Behavioral Risk Factor Surveillance System (BRFSS) SMART county data. Link.

Principal Components is a mathematical technique (not a statistical one!) based off the eigenvalue decomposition/singular value decomposition of a data matrix (or correlation/covariance matrix) Link. This technique re-represents a complicated data set (set of variables) in a simpler form, where the information in the original variables is now represented by fewer components, which represent the majority (we hope!!) of the variance in the original data. This is known as a variable reduction strategy. It is also used commonly to form indices in the behavioral/social sciences. It is closely related to factor analysis Link, and indeed the initial solution of factor analysis is often the exact same as the PCA solution. PCA preserves the orthogonal nature of the components, while factor analysis can violate this assumption.

library(car)
library(foreign)
library(survey)

## Loading required package: grid
## 
## Attaching package: 'survey'
## 
## The following object is masked from 'package:graphics':
## 
##     dotchart

library(ggplot2)
#load brfss
load("~/Google Drive/dem7283/data/brfss_11.Rdata")
nams<-names(brfss_11)
newnames<-gsub("_", "", nams)
names(brfss_11)<-tolower(newnames)

brfss_11$statefip<-sprintf("%02d", brfss_11$state )
brfss_11$cofip<-sprintf("%03d", brfss_11$cnty )
brfss_11$cofips<-paste(brfss_11$statefip, brfss_11$cofip, sep="")

brfss_11$healthydays<-ifelse(brfss_11$physhlth %in%c(77,99), NA,ifelse(brfss_11$physhlth==88,0, brfss_11$physhlth))
brfss_11$healthymdays<-ifelse(brfss_11$menthlth %in%c(77,99), NA,ifelse(brfss_11$menthlth==88,0, brfss_11$menthlth))
#insurance
brfss_11$ins<-ifelse(brfss_11$hlthpln1==1,1,0)
#smoking currently
brfss_11$smoke<-recode(brfss_11$smoker3, recodes="1:2=1; 3:4=0; else=NA")
#low physical activity
brfss_11$lowact<-recode(brfss_11$pacat, recodes="1:2=0; 3:4=1; else=NA")
#high blood pressure
brfss_11$hbp<-recode(brfss_11$bphigh4, recodes="1=1; 2:4=0; else=NA")
#high cholesterol
brfss_11$hc<-recode(brfss_11$toldhi2, recodes="1=1; 2=0; else=NA")
#bmi
brfss_11$bmi<-ifelse(is.na(brfss_11$bmi5)==T, NA, brfss_11$bmi5/100)
#poor or fair health
brfss_11$badhealth<-ifelse(brfss_11$genhlth %in% c(4,5),1,0)

#race
brfss_11$black<-recode(brfss_11$racegr2, recodes="2=1; 9=NA; else=0")
brfss_11$white<-recode(brfss_11$racegr2, recodes="1=1; 9=NA; else=0")
brfss_11$other<-recode(brfss_11$racegr2, recodes="3:4=1; 9=NA; else=0")
brfss_11$hispanic<-recode(brfss_11$racegr2, recodes="5=1; 9=NA; else=0")

#have a personal doctor?
brfss_11$doc<-recode(brfss_11$persdoc2, recodes="1:2=1; 3=0; else=NA")

#needed care in last year but couldn't get it because of cost
brfss_11$medacc<-recode(brfss_11$medcost, recodes="1=1;2=0;else=NA")

#education level
brfss_11$educ<-recode(brfss_11$educa, recodes="1:2='0Prim'; 3='1somehs'; 4='2hsgrad'; 5='3somecol'; 6='4colgrad';9=NA", as.factor.result=T)
brfss_11$educ<-relevel(brfss_11$educ, ref='2hsgrad')

#employment
brfss_11$employ2<-recode(brfss_11$employ , recodes="1:2='Employed'; 2:6='nilf'; 7='retired'; 8='unable'; else=NA", as.factor.result=T)
brfss_11$employ2<-relevel(brfss_11$employ2, ref='Employed')

#marital status
brfss_11$marst<-recode(brfss_11$marital, recodes="1='married'; 2='divorced'; 3='widowed'; 4='separated'; 5='nm';6='cohab'; else=NA", as.factor.result=T)
brfss_11$marst<-relevel(brfss_11$marst, ref='married')

#income
brfss_11$inc<-ifelse(brfss_11$incomg==9, NA, brfss_11$incomg)

#Age cut into intervals
brfss_11$agec<-cut(brfss_11$age, breaks=c(0,24,39,59,79,99))

Analysis

Now we prepare to fit the survey corrected model. Here I just get the names of the variables that i will use, this keeps the size of things down, in terms of memory used

vars<-names(brfss_11)[c(6, 94, 189:214)]
brfss_11.sub<-subset(brfss_11[,vars], complete.cases(brfss_11[,vars]))

We use the prcomp function to do the pca, we specify our variables we want in our index using a formula, the name of the data, we also specify R to z-score the data (center=T removes the mean, scale=T divides by the standard deviation for each variable), and the retx=T will calculate the PC scores for each individual for each component.

brfss.pc<-prcomp(~healthydays+healthymdays+ins+smoke+lowact+hbp+hc+bmi+badhealth,data=brfss_11.sub, center=T, scale=T, retx=T)

#Screeplot
screeplot(brfss.pc, type = "l")
abline(h=1)

#Request some basic summaries of the PCA analysis option(digits=3)
summary(brfss.pc)

## Importance of components:
##                           PC1    PC2    PC3    PC4    PC5     PC6     PC7
## Standard deviation     1.4779 1.1414 1.0052 0.9780 0.9506 0.89007 0.86265
## Proportion of Variance 0.2427 0.1447 0.1123 0.1063 0.1004 0.08803 0.08269
## Cumulative Proportion  0.2427 0.3874 0.4997 0.6060 0.7064 0.79442 0.87710
##                            PC8     PC9
## Standard deviation     0.80991 0.67092
## Proportion of Variance 0.07288 0.05001
## Cumulative Proportion  0.94999 1.00000

brfss.pc$rotation

##                      PC1         PC2        PC3         PC4          PC5
## healthydays   0.49586186 -0.17302620  0.3367237  0.07567725 -0.145617801
## healthymdays  0.37076261 -0.33223889  0.1775998 -0.08138014 -0.123082107
## ins          -0.07400871  0.34806062  0.7016404  0.16323720  0.498553838
## smoke         0.16696851 -0.42293294 -0.2474312 -0.43075440  0.588282711
## lowact        0.26997643 -0.03628241 -0.2953424  0.59919239  0.534623098
## hbp           0.31606072  0.49207524 -0.1344724 -0.24521803 -0.005651674
## hc            0.25651099  0.44493736 -0.0659056 -0.49026896  0.168529640
## bmi           0.28828488  0.32989314 -0.3819836  0.33807082 -0.177164889
## badhealth     0.51148081 -0.09699275  0.2144507  0.04044195 -0.152873601
##                     PC6         PC7         PC8          PC9
## healthydays   0.1148932  0.20433897 -0.15789678  0.709917089
## healthymdays -0.2184482 -0.68992425  0.40164812 -0.117484004
## ins          -0.3169186 -0.04147157 -0.02766382 -0.054336260
## smoke        -0.3709821  0.21490265 -0.12400014  0.034846293
## lowact        0.3940754 -0.10773033  0.16355926 -0.007933627
## hbp          -0.0786751  0.33740216  0.67586145  0.053872627
## hc            0.3797617 -0.41304502 -0.38461366  0.019251283
## bmi          -0.6171397 -0.12327034 -0.34384770  0.046727419
## badhealth     0.1202057  0.35085505 -0.21887631 -0.687407649

#then I plot the first 2 components
hist(brfss.pc$x[,1])

hist(brfss.pc$x[,2])

The first two components account for 38.8% of the variation in the input variables, that isn’t bad. Also, we see 3 eigenvalues of at least 1, which suggests there are 3 real components among these 9 variables (remember, we are looking for eigenvalues > 1 when using z-scored data).

In terms of the loadings for PC1, we see positive associations with everything but insurace status. We may interpret this component as an index for overall health status, since all health variables are loading in the same direction.

For PC2 we see a mixture of loadings, some positive, some negative, and the only two variables that aren’t loading heavily are low physical activity (lowact) and fair/poor self rated health (badhealth). We may interpret this components as more of a metabolic physical health status, since the postive loadings are on the cardio-pulmonary and body mass variables (hpb, hc and bmi), with the exception of insurance coverage, while the more negative loadings are on more ot the health behaviors variables.

Of course, interpreting components is more art than science…

Next, I calculate the correlation between the first 2 components to show they are orthogonal, i.e. correlation == 0

cor(brfss.pc$x[,1:2])

##              PC1          PC2
## PC1 1.000000e+00 2.363684e-13
## PC2 2.363684e-13 1.000000e+00

brfss_11.sub$pc1<-brfss.pc$x[,1]
brfss_11.sub$pc2<-brfss.pc$x[,2]

Here we examine correlation among our health variables

round(cor(brfss_11.sub[,c(9:17)]), 3)

##              healthydays healthymdays    ins  smoke lowact    hbp    hc
## healthydays        1.000        0.343 -0.023  0.106  0.173  0.160 0.115
## healthymdays       0.343        1.000 -0.081  0.161  0.106  0.054 0.067
## ins               -0.023       -0.081  1.000 -0.109 -0.033  0.032 0.037
## smoke              0.106        0.161 -0.109  1.000  0.083 -0.001 0.010
## lowact             0.173        0.106 -0.033  0.083  1.000  0.081 0.061
## hbp                0.160        0.054  0.032 -0.001  0.081  1.000 0.288
## hc                 0.115        0.067  0.037  0.010  0.061  0.288 1.000
## bmi                0.131        0.093 -0.031 -0.024  0.157  0.240 0.132
## badhealth          0.539        0.286 -0.057  0.116  0.180  0.220 0.151
##                 bmi badhealth
## healthydays   0.131     0.539
## healthymdays  0.093     0.286
## ins          -0.031    -0.057
## smoke        -0.024     0.116
## lowact        0.157     0.180
## hbp           0.240     0.220
## hc            0.132     0.151
## bmi           1.000     0.179
## badhealth     0.179     1.000

Sometimes it’s easier to look at the correlations among the original variables and the components

round(cor(brfss_11.sub[,c(9:17, 29:30)]),3)

##              healthydays healthymdays    ins  smoke lowact    hbp    hc
## healthydays        1.000        0.343 -0.023  0.106  0.173  0.160 0.115
## healthymdays       0.343        1.000 -0.081  0.161  0.106  0.054 0.067
## ins               -0.023       -0.081  1.000 -0.109 -0.033  0.032 0.037
## smoke              0.106        0.161 -0.109  1.000  0.083 -0.001 0.010
## lowact             0.173        0.106 -0.033  0.083  1.000  0.081 0.061
## hbp                0.160        0.054  0.032 -0.001  0.081  1.000 0.288
## hc                 0.115        0.067  0.037  0.010  0.061  0.288 1.000
## bmi                0.131        0.093 -0.031 -0.024  0.157  0.240 0.132
## badhealth          0.539        0.286 -0.057  0.116  0.180  0.220 0.151
## pc1                0.733        0.548 -0.109  0.247  0.399  0.467 0.379
## pc2               -0.197       -0.379  0.397 -0.483 -0.041  0.562 0.508
##                 bmi badhealth    pc1    pc2
## healthydays   0.131     0.539  0.733 -0.197
## healthymdays  0.093     0.286  0.548 -0.379
## ins          -0.031    -0.057 -0.109  0.397
## smoke        -0.024     0.116  0.247 -0.483
## lowact        0.157     0.180  0.399 -0.041
## hbp           0.240     0.220  0.467  0.562
## hc            0.132     0.151  0.379  0.508
## bmi           1.000     0.179  0.426  0.377
## badhealth     0.179     1.000  0.756 -0.111
## pc1           0.426     0.756  1.000  0.000
## pc2           0.377    -0.111  0.000  1.000

#Make the survey design object
options(survey.lonely.psu = "adjust")
brfss_11.sub<-brfss_11.sub[complete.cases(brfss_11.sub),]
des<-svydesign(ids=~psu, strata=~ststr, weights=~cntywt, data=brfss_11.sub , nest=T)

#means of the variables in the index
svymean(~healthydays+healthymdays+ins+smoke+lowact+hbp+hc+bmi+badhealth, des, estimate.only=T )

##                  mean     SE
## healthydays   3.69151 0.0415
## healthymdays  3.60397 0.0445
## ins           0.88491 0.0021
## smoke         0.15836 0.0020
## lowact        0.43998 0.0027
## hbp           0.33094 0.0025
## hc            0.36971 0.0026
## bmi          27.63198 0.0329
## badhealth     0.15746 0.0021

The first analysis will look at variation in my health index across age, and work status:

boxplot(pc1~agec, data=brfss_11.sub)

boxplot(pc1~educ, data=brfss_11.sub)

The age plot is nice, and really shows that older ages have higher values for our health index variable, which confirms our interpretation that higher values of the index are “not good”

Now we do some hypothesis testing. This model will examine variation in my health index across age, education, race and two healthcare access variables:

fit.1<-svyglm(pc1~agec+educ+(black+hispanic+other)+medacc+inc, des, family=gaussian)
summary(fit.1)

## 
## Call:
## svyglm(formula = pc1 ~ agec + educ + (black + hispanic + other) + 
##     medacc + inc, des, family = gaussian)
## 
## Survey design:
## svydesign(ids = ~psu, strata = ~ststr, weights = ~cntywt, data = brfss_11.sub, 
##     nest = T)
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   0.128003   0.044378   2.884  0.00392 ** 
## agec(24,39]   0.466486   0.036767  12.687  < 2e-16 ***
## agec(39,59]   0.883167   0.035182  25.103  < 2e-16 ***
## agec(59,79]   1.103483   0.035645  30.957  < 2e-16 ***
## agec(79,99]   0.925618   0.044485  20.807  < 2e-16 ***
## educ0Prim     0.409117   0.069447   5.891 3.84e-09 ***
## educ1somehs   0.311931   0.044890   6.949 3.70e-12 ***
## educ3somecol -0.122772   0.020330  -6.039 1.56e-09 ***
## educ4colgrad -0.423639   0.018695 -22.661  < 2e-16 ***
## black         0.200932   0.024880   8.076 6.74e-16 ***
## hispanic     -0.047580   0.027949  -1.702  0.08868 .  
## other        -0.010314   0.028714  -0.359  0.71945    
## medacc        0.715557   0.028817  24.831  < 2e-16 ***
## inc          -0.236278   0.006974 -33.882  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 1.670394)
## 
## Number of Fisher Scoring iterations: 2

vif(fit.1)

##              GVIF Df GVIF^(1/(2*Df))
## agec     1.233225  4        1.026550
## educ     1.363604  4        1.039528
## black    1.047207  1        1.023331
## hispanic 1.148245  1        1.071562
## other    1.045773  1        1.022631
## medacc   1.102283  1        1.049897
## inc      1.490942  1        1.221041

#use survey procedure to do pca
svy.pc<-svyprcomp(formula=~healthydays+healthymdays+ins+smoke+lowact+hbp+hc+bmi+badhealth, center=T, scale.=T,scores=T, des)

brfss_11.sub$pcs1<-svy.pc$x[,1]
brfss_11.sub$pcs2<-svy.pc$x[,2]

#Just look at the non-survey pc1 vs the survey pc1
ggplot(brfss_11.sub, aes(x=pc1, y=pcs1))+geom_point(alpha=.05)+theme_gray()

ggplot(brfss_11.sub, aes(x=pc1, y=pcs1)) + stat_binhex()

Quick Factor analysis

In case you were wondering about how to do basic factor analysis, here you go:

brfss.fa<-svyfactanal(~healthydays+healthymdays+ins+smoke+lowact+hbp+hc+bmi+badhealth, factors=3 , design=des, n="effective", rotation="varimax")
brfss.fa

## 
## Call:
## svyfactanal(~healthydays + healthymdays + ins + smoke + lowact +     hbp + hc + bmi + badhealth, factors = 3, design = des, n = "effective",     rotation = "varimax")
## 
## Uniquenesses:
##  healthydays healthymdays          ins        smoke       lowact 
##        0.178        0.770        0.913        0.894        0.934 
##          hbp           hc          bmi    badhealth 
##        0.530        0.814        0.852        0.591 
## 
## Loadings:
##              Factor1 Factor2 Factor3
## healthydays   0.896   0.135         
## healthymdays  0.357           0.304 
## ins                          -0.293 
## smoke                         0.312 
## lowact        0.145   0.132   0.165 
## hbp           0.103   0.672         
## hc                    0.421         
## bmi                   0.362         
## badhealth     0.501   0.311   0.247 
## 
##                Factor1 Factor2 Factor3
## SS loadings      1.237   0.903   0.384
## Proportion Var   0.137   0.100   0.043
## Cumulative Var   0.137   0.238   0.280
## 
## Test of the hypothesis that 3 factors are sufficient.
## The chi square statistic is 236.9 on 12 degrees of freedom.
## The p-value is 7.32e-44

brfss.fa$loadings

## 
## Loadings:
##              Factor1 Factor2 Factor3
## healthydays   0.896   0.135         
## healthymdays  0.357           0.304 
## ins                          -0.293 
## smoke                         0.312 
## lowact        0.145   0.132   0.165 
## hbp           0.103   0.672         
## hc                    0.421         
## bmi                   0.362         
## badhealth     0.501   0.311   0.247 
## 
##                Factor1 Factor2 Factor3
## SS loadings      1.237   0.903   0.384
## Proportion Var   0.137   0.100   0.043
## Cumulative Var   0.137   0.238   0.280

x<-scale(cbind(brfss_11.sub[,c("healthydays", "healthymdays", "ins", "smoke", "lowact", "hbp", "hc", "bmi", "badhealth")]))

brfss.fact.scores <- x%*%solve(cor(x))%*%loadings(brfss.fa)


#then I plot the first 2 factors
hist(brfss.fact.scores[,1])

hist(brfss.fact.scores[,2])

brfss_11.sub$fa1<-brfss.fact.scores[,1]
brfss_11.sub$fa2<-brfss.fact.scores[,2]

ggplot(brfss_11.sub, aes(x=pcs1, y=fa1))+geom_point(alpha=.05)+theme_gray()

ggplot(brfss_11.sub, aes(x=pcs1, y=fa1)) + stat_binhex()

cor(brfss_11.sub[c("pcs1", "fa1")])

##           pcs1       fa1
## pcs1 1.0000000 0.2408964
## fa1  0.2408964 1.0000000

#ggplot(brfss_11.sub, aes(x=fa1, y=fa2))+geom_point(alpha=.05)+theme_gray()

So we get a slightly cleaner health index with the first factor than with the PCA, in the FA, we see that the first factor is more of a “general health index”, with the key variables being the #’s of days and the self rated health, while the second factor corresponds very well with what we saw from the PCA. Often times, factor analysis will often give cleaner looking results than PCA, because you can monkey with the initial solution.

Principal Components Analysis

coreysparks

March 16, 2015

Analysis

Quick Factor analysis