library(car)
library(stargazer)
library(survey)
library(ggplot2)
library(pander)
library(dplyr)
library(knitr)
load(url("https://github.com/coreysparks/data/blob/master/brfss_2017.Rdata?raw=true"))
#Healthy days
brfss_17$healthdays<-Recode(brfss_17$physhlth, recodes = "88=0; 77=NA; 99=NA")
#Healthy mental health days
brfss_17$healthmdays<-Recode(brfss_17$menthlth, recodes = "88=0; 77=NA; 99=NA")
brfss_17$badhealth<-Recode(brfss_17$genhlth, recodes="4:5=1; 1:3=0; else=NA")
#race/ethnicity
brfss_17$black<-Recode(brfss_17$racegr3, recodes="2=1; 9=NA; else=0")
brfss_17$white<-Recode(brfss_17$racegr3, recodes="1=1; 9=NA; else=0")
brfss_17$other<-Recode(brfss_17$racegr3, recodes="3:4=1; 9=NA; else=0")
brfss_17$hispanic<-Recode(brfss_17$racegr3, recodes="5=1; 9=NA; else=0")
brfss_17$race_eth<-Recode(brfss_17$racegr3,
recodes="1='nhwhite'; 2='nh black'; 3='nh other';4='nh multirace'; 5='hispanic'; else=NA",
as.factor = T)
brfss_17$race_eth<-relevel(brfss_17$race_eth, ref = "nhwhite")
#insurance
brfss_17$ins<-Recode(brfss_17$hlthpln1, recodes ="7:9=NA; 1=1;2=0")
#income grouping
brfss_17$inc<-ifelse(brfss_17$incomg==9, NA, brfss_17$incomg)
#education level
brfss_17$educ<-Recode(brfss_17$educa,
recodes="1:2='0Prim'; 3='1somehs'; 4='2hsgrad'; 5='3somecol'; 6='4colgrad';9=NA",
as.factor=T)
brfss_17$educ<-relevel(brfss_17$educ, ref='2hsgrad')
#employment
brfss_17$employ<-Recode(brfss_17$employ1,
recodes="1:2='Employed'; 2:6='nilf'; 7='retired'; 8='unable'; else=NA",
as.factor=T)
brfss_17$employ<-relevel(brfss_17$employ, ref='Employed')
#marital status
brfss_17$marst<-Recode(brfss_17$marital,
recodes="1='married'; 2='divorced'; 3='widowed'; 4='separated'; 5='nm';6='cohab'; else=NA",
as.factor=T)
brfss_17$marst<-relevel(brfss_17$marst, ref='married')
#Age cut into intervals
brfss_17$agec<-cut(brfss_17$age80, breaks=c(0,24,39,59,79,99))
#BMI, in the brfss_17a the bmi variable has 2 implied decimal places,
#so we must divide by 100 to get real bmi's
brfss_17$bmi<-brfss_17$bmi5/100
#smoking currently
brfss_17$smoke<-Recode(brfss_17$smoker3,
recodes="1:2=1; 3:4=0; else=NA")
brfss_17$skcancer<-Recode(brfss_17$chcscncr, recodes = "1= 1; 2= 0; else=NA")
brfss_17$kidney<-Recode(brfss_17$chckidny, recodes = "1= 1; 2= 0; else=NA")
brfss_17$chobpuld<-Recode(brfss_17$chccopd1, recodes = "1= 1; 2= 0; else=NA")
brfss_17$arthritis<-Recode(brfss_17$havarth3, recodes = "1= 1; 2= 0; else=NA")
brfss_17$depdisorder<-Recode(brfss_17$addepev2, recodes = "1= 1; 2:4=0; else=NA")
wadt<-brfss_17%>%
filter(complete.cases(mmsawt, ststr, agec, race_eth,educ,healthdays, healthmdays, smoke, bmi, badhealth,ins, skcancer, kidney, chobpuld, arthritis, depdisorder))%>%
select(mmsawt, ststr, agec, race_eth,educ,healthdays, healthmdays, smoke, bmi, badhealth,ins, skcancer, kidney, chobpuld, arthritis, depdisorder)%>%
mutate_at(vars(skcancer, kidney, chobpuld, smoke, bmi, badhealth,ins,arthritis, depdisorder), scale)
brfss.pc<-princomp(~skcancer + kidney + chobpuld + arthritis + depdisorder,
data=wadt,
scores=T)
#Screeplot
screeplot(brfss.pc,
type = "l",
main = "Scree Plot")
abline(h=1)
## Summary Statistics
#Request some basic summaries of the PCA analysis option(digits=3)
summary(brfss.pc)
## Importance of components:
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5
## Standard deviation 1.1942948 1.0050161 0.9622971 0.9253068 0.8839503
## Proportion of Variance 0.2852695 0.2020125 0.1852041 0.1712394 0.1562744
## Cumulative Proportion 0.2852695 0.4872820 0.6724861 0.8437256 1.0000000
loadings(brfss.pc )
##
## Loadings:
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5
## skcancer 0.269 0.827 0.304 0.167 0.351
## kidney 0.385 0.144 -0.901 0.102
## chobpuld 0.503 -0.192 0.140 -0.770 0.312
## arthritis 0.574 0.174 -0.794
## depdisorder 0.443 -0.503 0.214 0.605 0.373
##
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5
## SS loadings 1.0 1.0 1.0 1.0 1.0
## Proportion Var 0.2 0.2 0.2 0.2 0.2
## Cumulative Var 0.2 0.4 0.6 0.8 1.0
For PC2 we see a mixture of loading, some positive, some negative, and a missing. The only one variables that has a loading with large coefficients is skin cancer.
#then I plot the first 2 components
scores<-data.frame(brfss.pc$scores)
hist(scores$Comp.1)
hist(scores$Comp.2)
Next, I calculate the correlation between the first 2 components to show they are orthogonal, i.e. correlation == 0
cor(scores[,1:2])
## Comp.1 Comp.2
## Comp.1 1.000000e+00 -8.575574e-13
## Comp.2 -8.575574e-13 1.000000e+00
brfss_17c<-cbind(wadt, scores)
names(brfss_17c)
## [1] "mmsawt" "ststr" "agec" "race_eth" "educ"
## [6] "healthdays" "healthmdays" "smoke" "bmi" "badhealth"
## [11] "ins" "skcancer" "kidney" "chobpuld" "arthritis"
## [16] "depdisorder" "Comp.1" "Comp.2" "Comp.3" "Comp.4"
## [21] "Comp.5"
Here we examine correlation among chronic health condition variables
names(brfss_17c)
## [1] "mmsawt" "ststr" "agec" "race_eth" "educ"
## [6] "healthdays" "healthmdays" "smoke" "bmi" "badhealth"
## [11] "ins" "skcancer" "kidney" "chobpuld" "arthritis"
## [16] "depdisorder" "Comp.1" "Comp.2" "Comp.3" "Comp.4"
## [21] "Comp.5"
round(cor(brfss_17c[,c(-1:-5, -6:-11, -17:-21)]), 3)
## skcancer kidney chobpuld arthritis depdisorder
## skcancer 1.000 0.056 0.047 0.122 -0.001
## kidney 0.056 1.000 0.095 0.123 0.070
## chobpuld 0.047 0.095 1.000 0.183 0.135
## arthritis 0.122 0.123 0.183 1.000 0.164
## depdisorder -0.001 0.070 0.135 0.164 1.000
Sometimes it’s easier to look at the correlations among the original variables and the first 2 components
round(cor(brfss_17c[,c(-1:-5, -6:-11, -18:-21)]),3)
## skcancer kidney chobpuld arthritis depdisorder Comp.1
## skcancer 1.000 0.056 0.047 0.122 -0.001 0.321
## kidney 0.056 1.000 0.095 0.123 0.070 0.460
## chobpuld 0.047 0.095 1.000 0.183 0.135 0.600
## arthritis 0.122 0.123 0.183 1.000 0.164 0.686
## depdisorder -0.001 0.070 0.135 0.164 1.000 0.529
## Comp.1 0.321 0.460 0.600 0.686 0.529 1.000
Next, I will use the PC’s we just generated in an analysis
#Make the survey design object
options(survey.lonely.psu = "adjust")
brfss_17c$pc1q<-cut(brfss_17c$Comp.1,
breaks = unique(quantile(brfss_17c$Comp.1,
probs = c(0,.25,.5,.75,1) ,
na.rm=T)),
include.lowest=T)
des<-svydesign(ids=~1,
strata=~ststr,
weights=~mmsawt,
data=brfss_17c)
The first analysis will look at variation in the chronic health condition index across age, and education level:
library(ggplot2)
ggplot(aes(x=agec, y=Comp.1),
data=brfss_17c)+
geom_boxplot()
brfss_17c$educ<-factor(brfss_17c$educ, levels(brfss_17c$educ)[c(1,3,2,4,5)])
ggplot(aes(x=educ, y=Comp.1),
data=brfss_17c)+
geom_boxplot()
The model will examine variation in the chronic health condition index across age, education, and race variables:
fit.1<-svyglm(Comp.1~agec+educ+(race_eth),
des,
family=gaussian)
summary(fit.1)
##
## Call:
## svyglm(formula = Comp.1 ~ agec + educ + (race_eth), design = des,
## family = gaussian)
##
## Survey design:
## svydesign(ids = ~1, strata = ~ststr, weights = ~mmsawt, data = brfss_17c)
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.488340 0.013450 -36.308 < 2e-16 ***
## agec(24,39] 0.132607 0.012590 10.533 < 2e-16 ***
## agec(39,59] 0.440154 0.014101 31.213 < 2e-16 ***
## agec(59,79] 0.912206 0.016713 54.582 < 2e-16 ***
## agec(79,99] 1.041658 0.028131 37.029 < 2e-16 ***
## educ0Prim 0.031548 0.033763 0.934 0.35010
## educ1somehs 0.241942 0.027280 8.869 < 2e-16 ***
## educ3somecol 0.009935 0.014040 0.708 0.47916
## educ4colgrad -0.255626 0.012154 -21.032 < 2e-16 ***
## race_ethhispanic -0.313233 0.014048 -22.297 < 2e-16 ***
## race_ethnh black -0.127186 0.017490 -7.272 3.56e-13 ***
## race_ethnh multirace 0.107350 0.033242 3.229 0.00124 **
## race_ethnh other -0.222793 0.022237 -10.019 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 1.046589)
##
## Number of Fisher Scoring iterations: 2