#Load data for analysis#sub-setting and Re-codding variables for analysis purposes
brfss <- readRDS("brfss_177.rds")
# Cleaning the variable names for space, underscore & Uppercase Characters
renam<-names(brfss)
newnames<-tolower(gsub(pattern = "_",replacement = "",x = renam))
names(brfss)<-newnames
homewk3 <- brfss %>%
dplyr::select(state,llcpwt,dispcode,seqno,psu,bmi5cat,racegr3,educa,ststr,smoke100,exerany2,sex,ageg,menthlth,smokday2,sleptim1,genhlth,physhlth ) %>%
mutate( smokers=ifelse(smoke100==1 & smokday2%in%c(1,2),3,
ifelse(smoke100==1 & smokday2==3, 2,
ifelse(smoke100==2, 1, NA)))) %>%
mutate(bmi = car::Recode(bmi5cat,recodes="1=2;2=1;3=3; 4=4;else=NA" ),
educa = car::Recode(educa,recodes="1:2='<HS';3:4='HS';5:6='>HS';else=NA", as.factor=T ),
racegr3 = car::Recode(racegr3,recodes="1='NH-White';2='NH-Black';3:4='Other';5='Hispanic';else=NA",as.factor=T ),
ageg = car::Recode(ageg,recodes="1='18-24';2='25-30';3='35-44';4='45-54';5='55-64';else='65+'", as.factor=T ),
Physicala = car::Recode(exerany2,recodes="1='Yes';2='No';else=NA",as.factor=T ), sex = car::Recode(sex,recodes="1='Male';2='Female';else=NA",as.factor=T ), physhlth = car::Recode(physhlth,recodes= "88=1; 77=NA; 99=NA" ),
genhlth = car::Recode(genhlth, recodes="4:5=2; 1:3=1; else=NA"),
sleepdurat=ifelse(sleptim1>=7 & !sleptim1%in%c(77,99), 1,
ifelse(sleptim1<7, 2 , NA ) ),
mentald=ifelse(menthlth<31, 2,
ifelse(menthlth==88, 1,NA)))
#Data containing only variables used in the analysis
Final_data<-homewk3%>%
filter(complete.cases(state,llcpwt,dispcode,seqno,psu,smokers,bmi,educa,ageg,exerany2, sleepdurat,sex,racegr3,mentald,ststr,genhlth,physhlth))%>%
select(state,llcpwt,dispcode,seqno,psu,smokers,bmi,educa,ageg,exerany2, sleepdurat,sex,racegr3,mentald,ststr,genhlth,physhlth)%>%
mutate_at(vars(smokers, exerany2, genhlth, physhlth, bmi,mentald), scale)
Sleep Adqeuacy Index
The latent construct for this analysis is that lower values of the sleep adequacy index will mean that such individuals enjoy adequate sleep(an average sleep of more than 7 hours daily)
brfss.pc<-princomp(~physhlth+genhlth+exerany2+bmi+genhlth+mentald+smokers,
data=Final_data,
scores=T)
#Screeplot
screeplot(brfss.pc,
type = "l",
main = "Scree Plot")
abline(h=1)
#Request some basic summaries of the PCA analysis option(digits=3)
summary(brfss.pc)
## Importance of components:
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5
## Standard deviation 1.3592092 1.0177354 0.9827464 0.9301822 0.9057536
## Proportion of Variance 0.3079153 0.1726348 0.1609688 0.1442098 0.1367347
## Cumulative Proportion 0.3079153 0.4805501 0.6415189 0.7857287 0.9224634
## Comp.6
## Standard deviation 0.6820623
## Proportion of Variance 0.0775366
## Cumulative Proportion 1.0000000
The first two components account for 48% of the variation in the input variable. Considering that the eigenvalues must be greater than 1 when using a z-scored data, Only first two components satified the condition.
loadings(brfss.pc )
##
## Loadings:
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6
## physhlth 0.581 0.179 0.241 0.260 0.705
## genhlth 0.585 0.158 0.200 0.303 -0.707
## exerany2 0.323 -0.424 -0.430 0.383 -0.619
## bmi 0.173 -0.762 0.272 -0.554
## mentald 0.333 0.445 0.315 -0.446 -0.626
## smokers 0.273 0.178 -0.765 -0.499 0.242
##
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6
## SS loadings 1.000 1.000 1.000 1.000 1.000 1.000
## Proportion Var 0.167 0.167 0.167 0.167 0.167 0.167
## Cumulative Var 0.167 0.333 0.500 0.667 0.833 1.000
The loading for the first principal Component shows a positive association. This component can be interepreted as index of sleep adequacy. All the variables loaded in the same direction. However,
the pricipal components 2 has a mixture of loadings, some positive, some negative, and missings. The only variable that has a loading with large coefficients is mental health condition.
#then I plot the first 2 components
scores<-data.frame(brfss.pc$scores)
hist(scores$Comp.1)
hist(scores$Comp.2)
Here, the correlation between the first 2 components should show they are orthogonal, i.e. correlation == 0
cor(scores[,1:2])
## Comp.1 Comp.2
## Comp.1 1.000000e+00 1.342456e-14
## Comp.2 1.342456e-14 1.000000e+00
brfss_17c<-cbind(Final_data, scores)
names(brfss_17c)
## [1] "state" "llcpwt" "dispcode" "seqno" "psu"
## [6] "smokers" "bmi" "educa" "ageg" "exerany2"
## [11] "sleepdurat" "sex" "racegr3" "mentald" "ststr"
## [16] "genhlth" "physhlth" "Comp.1" "Comp.2" "Comp.3"
## [21] "Comp.4" "Comp.5" "Comp.6"
Here, I examined correlation among our Sleep adequacy variables
names(brfss_17c)
## [1] "state" "llcpwt" "dispcode" "seqno" "psu"
## [6] "smokers" "bmi" "educa" "ageg" "exerany2"
## [11] "sleepdurat" "sex" "racegr3" "mentald" "ststr"
## [16] "genhlth" "physhlth" "Comp.1" "Comp.2" "Comp.3"
## [21] "Comp.4" "Comp.5" "Comp.6"
round(cor(brfss_17c[,c(-1:-5, -8:-9, -11:-13, -15, -18:-23)]), 3)
## smokers bmi exerany2 mentald genhlth physhlth
## smokers 1.000 0.001 0.114 0.086 0.150 0.135
## bmi 0.001 1.000 0.101 0.010 0.111 0.078
## exerany2 0.114 0.101 1.000 0.042 0.183 0.179
## mentald 0.086 0.010 0.042 1.000 0.200 0.218
## genhlth 0.150 0.111 0.183 0.200 1.000 0.533
## physhlth 0.135 0.078 0.179 0.218 0.533 1.000
Here, i looked at the correlations among the original variables and the first component
round(cor(brfss_17c[,c(-1:-5, -8:-9, -11:-13, -15, -19:-23)]), 3)
## smokers bmi exerany2 mentald genhlth physhlth Comp.1
## smokers 1.000 0.001 0.114 0.086 0.150 0.135 0.372
## bmi 0.001 1.000 0.101 0.010 0.111 0.078 0.235
## exerany2 0.114 0.101 1.000 0.042 0.183 0.179 0.439
## mentald 0.086 0.010 0.042 1.000 0.200 0.218 0.452
## genhlth 0.150 0.111 0.183 0.200 1.000 0.533 0.795
## physhlth 0.135 0.078 0.179 0.218 0.533 1.000 0.790
## Comp.1 0.372 0.235 0.439 0.452 0.795 0.790 1.000
This allows us to see the correlations among the original sleep adequacy variables and the new component. This is important for interpreting the direction of the component. The PC1 suggests that individuals with lower PC1 scores have adequate sleep hours daily, because PC1 has positive correlations with all of the sleep adequacy variables.
Next, I will use the PC’s we just generated in an analysis
#Make the survey design object
options(survey.lonely.psu = "adjust")
brfss_17c$pc1q<-cut(brfss_17c$Comp.1,
breaks = quantile(brfss_17c$Comp.1,
probs = c(0,.25,.5,.75,1) ,
na.rm=T),
include.lowest=T)
des<-svydesign(ids=~1,
strata=~ststr,
weights=~llcpwt,
data=brfss_17c)
The first analysis will look at variation in the sleep adequacy index across age, and education level:
library(ggplot2)
ggplot(aes(x=ageg, y=Comp.1),
data=brfss_17c)+
geom_boxplot()
brfss_17c$educa<-factor(brfss_17c$educa, levels(brfss_17c$educ)[c(1,3,2,4,5)])
ggplot(aes(x=educa, y=Comp.1),
data=brfss_17c)+
geom_boxplot()
The age plot shows that older ages have higher values for our Sleep adequacy index variable, which confirms the interpretation that the higher values of the index are “not good”
Now, i did some hypothesis testing. The model will examine variation in the sleep adequacy index across age, education, race:
fit.1<-svyglm(Comp.1~ageg+educa+(racegr3),
des,
family=gaussian)
summary(fit.1)
##
## Call:
## svyglm(formula = Comp.1 ~ ageg + educa + (racegr3), design = des,
## family = gaussian)
##
## Survey design:
## svydesign(ids = ~1, strata = ~ststr, weights = ~llcpwt, data = brfss_17c)
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.22263 0.08350 2.666 0.007675 **
## ageg25-30 0.25599 0.03871 6.613 3.80e-11 ***
## ageg35-44 0.31143 0.03777 8.245 < 2e-16 ***
## ageg45-54 0.51686 0.04047 12.772 < 2e-16 ***
## ageg55-64 0.63801 0.03888 16.409 < 2e-16 ***
## ageg65+ 0.49302 0.03496 14.103 < 2e-16 ***
## educa>HS -0.85743 0.08040 -10.664 < 2e-16 ***
## educaHS -0.34153 0.08161 -4.185 2.86e-05 ***
## racegr3NH-Black 0.17886 0.05324 3.359 0.000782 ***
## racegr3NH-White 0.06584 0.03336 1.974 0.048438 *
## racegr3Other 0.20025 0.05327 3.759 0.000171 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 1.773233)
##
## Number of Fisher Scoring iterations: 2
So, overall, the index increases across age except for age group 65+, and decreases with higher education. Blacks and White race/ethnic groups have lower indices on average than other, while NH blacks and Other race respondents have higher values than White.