library(car)
library(stargazer)
library(survey)
library(ggplot2)
library(pander)
library(dplyr)
library(knitr)
load(url("https://github.com/coreysparks/data/blob/master/brfss_2017.Rdata?raw=true"))

What is your latent construct?

In this assignment i will use Principal Component Analysis (PCA) to perform a variable reduction to look at chronic health condition index. The variables to develop the index are skin caneer, chronic obstructive pulmonary disease, arthritis, dispressive disorder, and kidney disease.

Recode variables

#Healthy days
brfss_17$healthdays<-Recode(brfss_17$physhlth, recodes = "88=0; 77=NA; 99=NA")

#Healthy mental health days
brfss_17$healthmdays<-Recode(brfss_17$menthlth, recodes = "88=0; 77=NA; 99=NA")

brfss_17$badhealth<-Recode(brfss_17$genhlth, recodes="4:5=1; 1:3=0; else=NA")
#race/ethnicity
brfss_17$black<-Recode(brfss_17$racegr3, recodes="2=1; 9=NA; else=0")
brfss_17$white<-Recode(brfss_17$racegr3, recodes="1=1; 9=NA; else=0")
brfss_17$other<-Recode(brfss_17$racegr3, recodes="3:4=1; 9=NA; else=0")
brfss_17$hispanic<-Recode(brfss_17$racegr3, recodes="5=1; 9=NA; else=0")

brfss_17$race_eth<-Recode(brfss_17$racegr3, 
recodes="1='nhwhite'; 2='nh black'; 3='nh other';4='nh multirace'; 5='hispanic'; else=NA",
as.factor = T)
brfss_17$race_eth<-relevel(brfss_17$race_eth, ref = "nhwhite")

#insurance
brfss_17$ins<-Recode(brfss_17$hlthpln1, recodes ="7:9=NA; 1=1;2=0")

#income grouping
brfss_17$inc<-ifelse(brfss_17$incomg==9, NA, brfss_17$incomg)

#education level
brfss_17$educ<-Recode(brfss_17$educa,
recodes="1:2='0Prim'; 3='1somehs'; 4='2hsgrad'; 5='3somecol'; 6='4colgrad';9=NA",
as.factor=T)
brfss_17$educ<-relevel(brfss_17$educ, ref='2hsgrad')

#employment
brfss_17$employ<-Recode(brfss_17$employ1,
recodes="1:2='Employed'; 2:6='nilf'; 7='retired'; 8='unable'; else=NA",
as.factor=T)
brfss_17$employ<-relevel(brfss_17$employ, ref='Employed')

#marital status
brfss_17$marst<-Recode(brfss_17$marital,
recodes="1='married'; 2='divorced'; 3='widowed'; 4='separated'; 5='nm';6='cohab'; else=NA",
as.factor=T)
brfss_17$marst<-relevel(brfss_17$marst, ref='married')

#Age cut into intervals
brfss_17$agec<-cut(brfss_17$age80, breaks=c(0,24,39,59,79,99))

#BMI, in the brfss_17a the bmi variable has 2 implied decimal places,
#so we must divide by 100 to get real bmi's

brfss_17$bmi<-brfss_17$bmi5/100

#smoking currently
brfss_17$smoke<-Recode(brfss_17$smoker3, 
recodes="1:2=1; 3:4=0; else=NA")


brfss_17$skcancer<-Recode(brfss_17$chcscncr, recodes = "1= 1; 2= 0; else=NA")
brfss_17$kidney<-Recode(brfss_17$chckidny, recodes = "1= 1; 2= 0; else=NA")
brfss_17$chobpuld<-Recode(brfss_17$chccopd1, recodes = "1= 1; 2= 0; else=NA")
brfss_17$arthritis<-Recode(brfss_17$havarth3, recodes = "1= 1; 2= 0; else=NA")
brfss_17$depdisorder<-Recode(brfss_17$addepev2, recodes = "1= 1; 2:4=0; else=NA")

Analysis

wadt<-brfss_17%>%
  filter(complete.cases(mmsawt, ststr, agec, race_eth,educ,healthdays, healthmdays, smoke, bmi, badhealth,ins, skcancer, kidney, chobpuld, arthritis, depdisorder))%>%
  select(mmsawt, ststr, agec, race_eth,educ,healthdays, healthmdays, smoke, bmi, badhealth,ins, skcancer, kidney, chobpuld, arthritis, depdisorder)%>%
  mutate_at(vars(skcancer, kidney, chobpuld, smoke, bmi, badhealth,ins,arthritis, depdisorder), scale)
brfss.pc<-princomp(~skcancer + kidney + chobpuld + arthritis + depdisorder,
                   data=wadt,
                   scores=T)

#Screeplot
screeplot(brfss.pc,
          type = "l",
          main = "Scree Plot")
abline(h=1)

## Summary Statistics

#Request some basic summaries of the PCA analysis option(digits=3)
summary(brfss.pc)
## Importance of components:
##                           Comp.1    Comp.2    Comp.3    Comp.4    Comp.5
## Standard deviation     1.1942948 1.0050161 0.9622971 0.9253068 0.8839503
## Proportion of Variance 0.2852695 0.2020125 0.1852041 0.1712394 0.1562744
## Cumulative Proportion  0.2852695 0.4872820 0.6724861 0.8437256 1.0000000

The first two eigenvalues account for 48.7% of the variation in the input variables. That isn’t bad. Also, there are 3 eigenvalues of at least 1, which suggests there are 3 real components among these variables (remember, we are looking for eigenvalues > 1 when using z-scored data).

Examine eignevectors/loading vectors

loadings(brfss.pc )
## 
## Loadings:
##             Comp.1 Comp.2 Comp.3 Comp.4 Comp.5
## skcancer     0.269  0.827  0.304  0.167  0.351
## kidney       0.385  0.144 -0.901         0.102
## chobpuld     0.503 -0.192  0.140 -0.770  0.312
## arthritis    0.574         0.174        -0.794
## depdisorder  0.443 -0.503  0.214  0.605  0.373
## 
##                Comp.1 Comp.2 Comp.3 Comp.4 Comp.5
## SS loadings       1.0    1.0    1.0    1.0    1.0
## Proportion Var    0.2    0.2    0.2    0.2    0.2
## Cumulative Var    0.2    0.4    0.6    0.8    1.0

The loading for PC1 shows a positive association. This component may be interpreted as an index of the presence of chronic health condition, since all the health variables are loading in the same direction.

For PC2 we see a mixture of loading, some positive, some negative, and a missing. The only one variables that has a loading with large coefficients is skin cancer.

Components Plotting

#then I plot the first 2 components

scores<-data.frame(brfss.pc$scores)
hist(scores$Comp.1)

hist(scores$Comp.2)

There is no correlation between the first two components.

Correlation Matrix

Next, I calculate the correlation between the first 2 components to show they are orthogonal, i.e. correlation == 0

cor(scores[,1:2])
##               Comp.1        Comp.2
## Comp.1  1.000000e+00 -8.575574e-13
## Comp.2 -8.575574e-13  1.000000e+00
brfss_17c<-cbind(wadt, scores)
names(brfss_17c)
##  [1] "mmsawt"      "ststr"       "agec"        "race_eth"    "educ"       
##  [6] "healthdays"  "healthmdays" "smoke"       "bmi"         "badhealth"  
## [11] "ins"         "skcancer"    "kidney"      "chobpuld"    "arthritis"  
## [16] "depdisorder" "Comp.1"      "Comp.2"      "Comp.3"      "Comp.4"     
## [21] "Comp.5"

Here we examine correlation among chronic health condition variables

names(brfss_17c)
##  [1] "mmsawt"      "ststr"       "agec"        "race_eth"    "educ"       
##  [6] "healthdays"  "healthmdays" "smoke"       "bmi"         "badhealth"  
## [11] "ins"         "skcancer"    "kidney"      "chobpuld"    "arthritis"  
## [16] "depdisorder" "Comp.1"      "Comp.2"      "Comp.3"      "Comp.4"     
## [21] "Comp.5"
round(cor(brfss_17c[,c(-1:-5, -6:-11,  -17:-21)]), 3)
##             skcancer kidney chobpuld arthritis depdisorder
## skcancer       1.000  0.056    0.047     0.122      -0.001
## kidney         0.056  1.000    0.095     0.123       0.070
## chobpuld       0.047  0.095    1.000     0.183       0.135
## arthritis      0.122  0.123    0.183     1.000       0.164
## depdisorder   -0.001  0.070    0.135     0.164       1.000

Sometimes it’s easier to look at the correlations among the original variables and the first 2 components

round(cor(brfss_17c[,c(-1:-5, -6:-11, -18:-21)]),3)
##             skcancer kidney chobpuld arthritis depdisorder Comp.1
## skcancer       1.000  0.056    0.047     0.122      -0.001  0.321
## kidney         0.056  1.000    0.095     0.123       0.070  0.460
## chobpuld       0.047  0.095    1.000     0.183       0.135  0.600
## arthritis      0.122  0.123    0.183     1.000       0.164  0.686
## depdisorder   -0.001  0.070    0.135     0.164       1.000  0.529
## Comp.1         0.321  0.460    0.600     0.686       0.529  1.000

This allows us to see the correlations among the original chronic health condition variables and the new components. This is important for interpreting the direction of the component. In this case, the PC1 suggests that lower PC1 scores have better chronic health condition, because PC1 has positive correlations with all of the chronic health condition variables.

4. If deemed appropriate, conduct some testing of your index/components/latent variables.

Using the Principal Components

Next, I will use the PC’s we just generated in an analysis

#Make the survey design object
options(survey.lonely.psu = "adjust")
brfss_17c$pc1q<-cut(brfss_17c$Comp.1,
                    breaks = unique(quantile(brfss_17c$Comp.1,
                                      probs = c(0,.25,.5,.75,1) ,
                                      na.rm=T)), 
                    include.lowest=T)
des<-svydesign(ids=~1,
               strata=~ststr,
               weights=~mmsawt,
               data=brfss_17c)

The first analysis will look at variation in the chronic health condition index across age, and education level:

library(ggplot2)
ggplot(aes(x=agec, y=Comp.1),
       data=brfss_17c)+
  geom_boxplot()

brfss_17c$educ<-factor(brfss_17c$educ, levels(brfss_17c$educ)[c(1,3,2,4,5)])

ggplot(aes(x=educ, y=Comp.1),
       data=brfss_17c)+
  geom_boxplot()

The age plot is nice, and really shows that older ages have higher values for chronic health condition index variable, which confirms the interpretation that higher values of the index are “not good”

Hypothesis Testing

The model will examine variation in the chronic health condition index across age, education, and race variables:

fit.1<-svyglm(Comp.1~agec+educ+(race_eth),
              des,
              family=gaussian)
summary(fit.1)
## 
## Call:
## svyglm(formula = Comp.1 ~ agec + educ + (race_eth), design = des, 
##     family = gaussian)
## 
## Survey design:
## svydesign(ids = ~1, strata = ~ststr, weights = ~mmsawt, data = brfss_17c)
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          -0.488340   0.013450 -36.308  < 2e-16 ***
## agec(24,39]           0.132607   0.012590  10.533  < 2e-16 ***
## agec(39,59]           0.440154   0.014101  31.213  < 2e-16 ***
## agec(59,79]           0.912206   0.016713  54.582  < 2e-16 ***
## agec(79,99]           1.041658   0.028131  37.029  < 2e-16 ***
## educ0Prim             0.031548   0.033763   0.934  0.35010    
## educ1somehs           0.241942   0.027280   8.869  < 2e-16 ***
## educ3somecol          0.009935   0.014040   0.708  0.47916    
## educ4colgrad         -0.255626   0.012154 -21.032  < 2e-16 ***
## race_ethhispanic     -0.313233   0.014048 -22.297  < 2e-16 ***
## race_ethnh black     -0.127186   0.017490  -7.272 3.56e-13 ***
## race_ethnh multirace  0.107350   0.033242   3.229  0.00124 ** 
## race_ethnh other     -0.222793   0.022237 -10.019  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 1.046589)
## 
## Number of Fisher Scoring iterations: 2

So, overall, the index increases across age, and decreases with higher education. Hispanics, NH black, and other race/ethnic groups have lower indices on average than whites, while NH multiple race respondents have higher values.