DEM 7283 - Homewk 7 - Principal Component Analysis

#Load data for analysis#sub-setting and Re-codding variables for analysis purposes

brfss <- readRDS("brfss_177.rds")
# Cleaning the  variable names for space, underscore & Uppercase Characters
renam<-names(brfss)
newnames<-tolower(gsub(pattern = "_",replacement =  "",x =  renam))
names(brfss)<-newnames


homewk3 <- brfss %>% 
  dplyr::select(state,llcpwt,dispcode,seqno,psu,bmi5cat,racegr3,educa,ststr,smoke100,exerany2,sex,ageg,menthlth,smokday2,sleptim1,genhlth,physhlth ) %>%  
  mutate( smokers=ifelse(smoke100==1 & smokday2%in%c(1,2),3,
                 ifelse(smoke100==1 & smokday2==3, 2, 
                ifelse(smoke100==2, 1, NA)))) %>% 
  
mutate(bmi = car::Recode(bmi5cat,recodes="1=2;2=1;3=3; 4=4;else=NA" ),
       educa = car::Recode(educa,recodes="1:2='<HS';3:4='HS';5:6='>HS';else=NA", as.factor=T ),
       racegr3 = car::Recode(racegr3,recodes="1='NH-White';2='NH-Black';3:4='Other';5='Hispanic';else=NA",as.factor=T ), 
       ageg = car::Recode(ageg,recodes="1='18-24';2='25-30';3='35-44';4='45-54';5='55-64';else='65+'", as.factor=T ), 
       Physicala = car::Recode(exerany2,recodes="1='Yes';2='No';else=NA",as.factor=T ), sex = car::Recode(sex,recodes="1='Male';2='Female';else=NA",as.factor=T ), physhlth = car::Recode(physhlth,recodes= "88=1; 77=NA; 99=NA" ),
       genhlth = car::Recode(genhlth, recodes="4:5=2; 1:3=1; else=NA"),
        sleepdurat=ifelse(sleptim1>=7 & !sleptim1%in%c(77,99), 1,
          ifelse(sleptim1<7, 2 , NA ) ),
        mentald=ifelse(menthlth<31, 2,
                       ifelse(menthlth==88, 1,NA))) 
 
#Data containing only variables used in the analysis
Final_data<-homewk3%>%
  filter(complete.cases(state,llcpwt,dispcode,seqno,psu,smokers,bmi,educa,ageg,exerany2, sleepdurat,sex,racegr3,mentald,ststr,genhlth,physhlth))%>%
  select(state,llcpwt,dispcode,seqno,psu,smokers,bmi,educa,ageg,exerany2, sleepdurat,sex,racegr3,mentald,ststr,genhlth,physhlth)%>%
  mutate_at(vars(smokers, exerany2, genhlth, physhlth, bmi,mentald), scale)

Latent Construct

Sleep Adqeuacy Index

The latent construct for this analysis is that lower values of the sleep adequacy index will mean that such individuals enjoy adequate sleep(an average sleep of more than 7 hours daily)

brfss.pc<-princomp(~physhlth+genhlth+exerany2+bmi+genhlth+mentald+smokers,
                   data=Final_data,
                   scores=T)

#Screeplot
screeplot(brfss.pc,
          type = "l",
          main = "Scree Plot")
abline(h=1)

Summary Statistics

#Request some basic summaries of the PCA analysis option(digits=3)
summary(brfss.pc)

## Importance of components:
##                           Comp.1    Comp.2    Comp.3    Comp.4    Comp.5
## Standard deviation     1.3592092 1.0177354 0.9827464 0.9301822 0.9057536
## Proportion of Variance 0.3079153 0.1726348 0.1609688 0.1442098 0.1367347
## Cumulative Proportion  0.3079153 0.4805501 0.6415189 0.7857287 0.9224634
##                           Comp.6
## Standard deviation     0.6820623
## Proportion of Variance 0.0775366
## Cumulative Proportion  1.0000000

The first two components account for 48% of the variation in the input variable. Considering that the eigenvalues must be greater than 1 when using a z-scored data, Only first two components satified the condition.

Load the Vectors

loadings(brfss.pc )

## 
## Loadings:
##          Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6
## physhlth  0.581         0.179  0.241  0.260  0.705
## genhlth   0.585         0.158  0.200  0.303 -0.707
## exerany2  0.323 -0.424 -0.430  0.383 -0.619       
## bmi       0.173 -0.762  0.272 -0.554              
## mentald   0.333  0.445  0.315 -0.446 -0.626       
## smokers   0.273  0.178 -0.765 -0.499  0.242       
## 
##                Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6
## SS loadings     1.000  1.000  1.000  1.000  1.000  1.000
## Proportion Var  0.167  0.167  0.167  0.167  0.167  0.167
## Cumulative Var  0.167  0.333  0.500  0.667  0.833  1.000

The loading for the first principal Component shows a positive association. This component can be interepreted as index of sleep adequacy. All the variables loaded in the same direction. However,

the pricipal components 2 has a mixture of loadings, some positive, some negative, and missings. The only variable that has a loading with large coefficients is mental health condition.

Components Plotting

#then I plot the first 2 components

scores<-data.frame(brfss.pc$scores)
hist(scores$Comp.1)

hist(scores$Comp.2)

Correlation Assumption

Here, the correlation between the first 2 components should show they are orthogonal, i.e. correlation == 0

cor(scores[,1:2])

##              Comp.1       Comp.2
## Comp.1 1.000000e+00 1.342456e-14
## Comp.2 1.342456e-14 1.000000e+00

brfss_17c<-cbind(Final_data, scores)
names(brfss_17c)

##  [1] "state"      "llcpwt"     "dispcode"   "seqno"      "psu"       
##  [6] "smokers"    "bmi"        "educa"      "ageg"       "exerany2"  
## [11] "sleepdurat" "sex"        "racegr3"    "mentald"    "ststr"     
## [16] "genhlth"    "physhlth"   "Comp.1"     "Comp.2"     "Comp.3"    
## [21] "Comp.4"     "Comp.5"     "Comp.6"

Here, I examined correlation among our Sleep adequacy variables

names(brfss_17c)

##  [1] "state"      "llcpwt"     "dispcode"   "seqno"      "psu"       
##  [6] "smokers"    "bmi"        "educa"      "ageg"       "exerany2"  
## [11] "sleepdurat" "sex"        "racegr3"    "mentald"    "ststr"     
## [16] "genhlth"    "physhlth"   "Comp.1"     "Comp.2"     "Comp.3"    
## [21] "Comp.4"     "Comp.5"     "Comp.6"

round(cor(brfss_17c[,c(-1:-5, -8:-9, -11:-13, -15,  -18:-23)]), 3)

##          smokers   bmi exerany2 mentald genhlth physhlth
## smokers    1.000 0.001    0.114   0.086   0.150    0.135
## bmi        0.001 1.000    0.101   0.010   0.111    0.078
## exerany2   0.114 0.101    1.000   0.042   0.183    0.179
## mentald    0.086 0.010    0.042   1.000   0.200    0.218
## genhlth    0.150 0.111    0.183   0.200   1.000    0.533
## physhlth   0.135 0.078    0.179   0.218   0.533    1.000

Here, i looked at the correlations among the original variables and the first component

round(cor(brfss_17c[,c(-1:-5, -8:-9, -11:-13, -15,  -19:-23)]), 3)

##          smokers   bmi exerany2 mentald genhlth physhlth Comp.1
## smokers    1.000 0.001    0.114   0.086   0.150    0.135  0.372
## bmi        0.001 1.000    0.101   0.010   0.111    0.078  0.235
## exerany2   0.114 0.101    1.000   0.042   0.183    0.179  0.439
## mentald    0.086 0.010    0.042   1.000   0.200    0.218  0.452
## genhlth    0.150 0.111    0.183   0.200   1.000    0.533  0.795
## physhlth   0.135 0.078    0.179   0.218   0.533    1.000  0.790
## Comp.1     0.372 0.235    0.439   0.452   0.795    0.790  1.000

This allows us to see the correlations among the original sleep adequacy variables and the new component. This is important for interpreting the direction of the component. The PC1 suggests that individuals with lower PC1 scores have adequate sleep hours daily, because PC1 has positive correlations with all of the sleep adequacy variables.

Using the Principal Components

Next, I will use the PC’s we just generated in an analysis

#Make the survey design object
options(survey.lonely.psu = "adjust")
brfss_17c$pc1q<-cut(brfss_17c$Comp.1,
                    breaks = quantile(brfss_17c$Comp.1,
                                      probs = c(0,.25,.5,.75,1) ,
                                      na.rm=T), 
                    include.lowest=T)
des<-svydesign(ids=~1,
               strata=~ststr,
               weights=~llcpwt,
               data=brfss_17c)

The first analysis will look at variation in the sleep adequacy index across age, and education level:

library(ggplot2)
ggplot(aes(x=ageg, y=Comp.1),
       data=brfss_17c)+
  geom_boxplot()

brfss_17c$educa<-factor(brfss_17c$educa, levels(brfss_17c$educ)[c(1,3,2,4,5)])

ggplot(aes(x=educa, y=Comp.1),
       data=brfss_17c)+
  geom_boxplot()

The age plot shows that older ages have higher values for our Sleep adequacy index variable, which confirms the interpretation that the higher values of the index are “not good”

Hypothesis Testing

Now, i did some hypothesis testing. The model will examine variation in the sleep adequacy index across age, education, race:

fit.1<-svyglm(Comp.1~ageg+educa+(racegr3),
              des,
              family=gaussian)
summary(fit.1)

## 
## Call:
## svyglm(formula = Comp.1 ~ ageg + educa + (racegr3), design = des, 
##     family = gaussian)
## 
## Survey design:
## svydesign(ids = ~1, strata = ~ststr, weights = ~llcpwt, data = brfss_17c)
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      0.22263    0.08350   2.666 0.007675 ** 
## ageg25-30        0.25599    0.03871   6.613 3.80e-11 ***
## ageg35-44        0.31143    0.03777   8.245  < 2e-16 ***
## ageg45-54        0.51686    0.04047  12.772  < 2e-16 ***
## ageg55-64        0.63801    0.03888  16.409  < 2e-16 ***
## ageg65+          0.49302    0.03496  14.103  < 2e-16 ***
## educa>HS        -0.85743    0.08040 -10.664  < 2e-16 ***
## educaHS         -0.34153    0.08161  -4.185 2.86e-05 ***
## racegr3NH-Black  0.17886    0.05324   3.359 0.000782 ***
## racegr3NH-White  0.06584    0.03336   1.974 0.048438 *  
## racegr3Other     0.20025    0.05327   3.759 0.000171 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 1.773233)
## 
## Number of Fisher Scoring iterations: 2

So, overall, the index increases across age except for age group 65+, and decreases with higher education. Blacks and White race/ethnic groups have lower indices on average than other, while NH blacks and Other race respondents have higher values than White.