DEM 7283 Homework 5

DEM 7283

Professor C. Sparks

I am hypothesizing that there may be a latent construct in my dataset that is representative of sleep health (i.e. underlying sleep disturbances and/or disorders) manifested by short sleep duration, unintentional falling asleep during the day, poor concentration, high stress, and mental health or ‘unhealthy’/excessive behaviors such as binge drinking or smoking that may be associated with a sort of ‘self-medication’ in response to a level of emotional/mental instability.

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(car)

## Loading required package: carData

## 
## Attaching package: 'car'

## The following object is masked from 'package:dplyr':
## 
##     recode

#library(stargazer)
#library(haven)
#library(pander)
mydata<-haven::read_xpt("~/Documents/1A_DEM Doctorate/DEM 7283/R/LLCP2017.xpt")
#names(mydata)

Clean up names

newnames<-tolower(gsub(pattern = "_", replacement = "", x= names(mydata))) 
names(mydata)<-newnames

Recode variables

#recode sex variable to male or female
mydata$male<-Recode(mydata$sex, recodes="1='male'; 2='0female'; else=NA")

#recode 5 level race/ethnicity 
mydata <- mydata %>%
  mutate(racethlvl = ifelse(racegr3 == 1, "0nh white", 
                                ifelse(racegr3== 2, "nh black",
                                ifelse(racegr3 == 3, "nh other/multi",
                                ifelse(racegr3 == 4, "nh other/multi",
                                ifelse(racegr3 == 5, "hispanic", NA))))))

#individual dummy variables
mydata$black<-Recode(mydata$racegr3, recodes="2='nh black';9=NA; else='0non black'")
mydata$white<-Recode(mydata$racegr3, recodes="1='nh white'; 9=NA; else='0non white'")
mydata$other<-Recode(mydata$racegr3, recodes="3:4='nh other/multi'; 9=NA; else='0non other/multi'")
mydata$hispanic<-Recode(mydata$racegr3, recodes="5='hispanic'; 9=NA; else='0non hispanic'")


#recode marital status
mydata <- mydata %>%
  mutate(maritallvl = ifelse(marital == 1, "Married", 
                                ifelse(marital == 2, "PrevMarried",
                                ifelse(marital == 3, "PrevMarried",
                                ifelse(marital == 4, "PrevMarried",
                                ifelse(marital == 5, "0NevMarried", NA))))))
#dummy variable married
mydata$married<-Recode(mydata$marital, recodes="1='married'; 2:5='0no'; else=NA")

#recode education
mydata <- mydata %>%
  mutate(educlvl = ifelse(educag == 1, "lessHS", 
                                ifelse(educag == 2, "HS",
                                ifelse(educag == 3, "Some Col",
                                ifelse(educag == 4, "Col Grad", NA)))))
#recode laborforce etc
mydata$laborforce<-Recode(mydata$employ1, recodes="1:4='labforce'; 5:8='0no'; else=NA")

#recode employment
mydata$employyes<-Recode(mydata$employ1, recodes="1:2='employed'; 3:4='0no';  else=NA")

#dummy variable education
mydata$col <- Recode(mydata$educag, recodes="1:2='0LessCol'; 3:4='Col'; else=NA")

#recode the number of hours of sleep to binary
mydata$shortsleep<-Recode(mydata$sleptim1, recodes ="1:6=1; 6:24=0; else=NA")

#recode unintentionally fell asleep during the day
mydata$unintentslep<-Recode(mydata$slepday1, recodes ="1:14=1; 88=0; else=NA")

#difficulty concentrating or remembering
mydata$brainf<-Recode(mydata$decide, recodes ="1=1; 2=0; else=NA")

#recode mental health
mydata$mentpoor<-Recode(mydata$menthlth, recodes ="1:30=1; 88=0; else=NA")

#recode poor health (either mental or physcial)
mydata$poorhlthnew<-Recode(mydata$poorhlth, recodes ="1:30=1; 88=0; else=NA")

#recode last checkup
mydata <- mydata %>%
  mutate(recchecklvl = ifelse(checkup1 == 1, "<1 yr", 
                                ifelse(checkup1 == 2, "<2 yr",
                                ifelse(checkup1 == 3, "<5 yr",
                                ifelse(checkup1 == 4, "5+", NA)))))

mydata$histress<-Recode(mydata$sdhstres, recodes="1:3=0; 4:5=1; else=NA")

#dummy variable recent checkup (within 1 year)
mydata$reccheck<-Recode(mydata$checkup1, recodes ="1='recent'; 2:4='0no'; else=NA")

#outlierReplace(my_data, "brnk3ge5", which(my_data$drnk3ge5 <70), NA)

#recode binge drinking
mydata$bingedrink<-Recode(mydata$drnk3ge5, recodes= "88=0; 77=NA; 99=NA")

#recode every smoked
mydata$smoke<-Recode(mydata$smoker3, recodes= "1:2=1; 3:4=0; else=NA")

#mydata$cancertypes<-Recode(mydata$cncrdiff, recodes="7=NA; 9=NA")

#recode select illness
#mydata$hrdx <- Recode(mydata$crgvprb2, recode="8=1; 1:7=0; 9:76=0; else=NA")
#mydata$subst <- Recode(mydata$crgvprb2, recode="12=1; 1:11=0; 13:76=0; else=NA")
#mydata$injur <- Recode(mydata$crgvprb2, recode="13=1; 1:12=0; 14:76=0; else=NA")
mydata$mentanx <- Recode(mydata$crgvprb2, recode="10=1; 1:9=0; 11:76=0; else=NA")

Below I am exploring various principal components comprised of different combinations of variables

Using five variables plus 2 behavior-related behaviors (binge drinking and current smoking status)

brfss.pctest7<-prcomp(~shortsleep+unintentslep+brainf+histress+mentpoor+bingedrink+smoke, data=mydata, center=T, scale=T, retx=T)

summary(brfss.pctest7)

## Importance of components:
##                           PC1    PC2    PC3    PC4    PC5    PC6     PC7
## Standard deviation     1.3573 1.0465 0.9794 0.9578 0.9075 0.8466 0.80355
## Proportion of Variance 0.2632 0.1565 0.1370 0.1311 0.1177 0.1024 0.09224
## Cumulative Proportion  0.2632 0.4196 0.5566 0.6877 0.8054 0.9078 1.00000

Using five variables

brfss.pctest5<-prcomp(~shortsleep+unintentslep+brainf+histress+mentpoor, data=mydata, center=T, scale=T, retx=T)

summary(brfss.pctest5)

## Importance of components:
##                           PC1    PC2    PC3    PC4    PC5
## Standard deviation     1.3177 0.9841 0.9711 0.8417 0.8022
## Proportion of Variance 0.3473 0.1937 0.1886 0.1417 0.1287
## Cumulative Proportion  0.3473 0.5410 0.7296 0.8713 1.0000

Using four variables plus 2 behavior-related behaviors (binge drinking and current smoking status)

brfss.pctest4b2<-prcomp(~shortsleep+brainf+histress+mentpoor+bingedrink+smoke, data=mydata, center=T, scale=T, retx=T)

summary(brfss.pctest4b2)

## Importance of components:
##                           PC1    PC2    PC3    PC4    PC5    PC6
## Standard deviation     1.3426 1.0289 0.9650 0.9081 0.8493 0.8134
## Proportion of Variance 0.3004 0.1764 0.1552 0.1374 0.1202 0.1103
## Cumulative Proportion  0.3004 0.4769 0.6321 0.7695 0.8897 1.0000

Using four variables plus 1 behavior-related behavior (current smoking status)

brfss.pctest4b1<-prcomp(~shortsleep+brainf+histress+mentpoor+smoke, data=mydata, center=T, scale=T, retx=T)

summary(brfss.pctest4b1)

## Importance of components:
##                           PC1    PC2    PC3    PC4    PC5
## Standard deviation     1.3350 0.9770 0.9517 0.8413 0.8060
## Proportion of Variance 0.3564 0.1909 0.1812 0.1415 0.1299
## Cumulative Proportion  0.3564 0.5474 0.7285 0.8701 1.0000

Using four variables (no unintentional day sleeping)

brfss.pctest4a<-prcomp(~shortsleep+brainf+histress+mentpoor, data=mydata, center=T, scale=T, retx=T)

summary(brfss.pctest4a)

## Importance of components:
##                           PC1    PC2    PC3    PC4
## Standard deviation     1.3004 0.9721 0.8440 0.8073
## Proportion of Variance 0.4227 0.2363 0.1781 0.1629
## Cumulative Proportion  0.4227 0.6590 0.8371 1.0000

Using four variables

brfss.pctest4b<-prcomp(~shortsleep+unintentslep+brainf+histress, data=mydata, center=T, scale=T, retx=T)

summary(brfss.pctest4b)

## Importance of components:
##                           PC1    PC2    PC3    PC4
## Standard deviation     1.2147 0.9776 0.9597 0.8049
## Proportion of Variance 0.3689 0.2389 0.2303 0.1620
## Cumulative Proportion  0.3689 0.6078 0.8380 1.0000

Using three variables

brfss.pctest3<-prcomp(~shortsleep+brainf+histress, data=mydata, center=T, scale=T, retx=T)

summary(brfss.pctest3)

## Importance of components:
##                           PC1    PC2    PC3
## Standard deviation     1.1903 0.9620 0.8110
## Proportion of Variance 0.4723 0.3085 0.2192
## Cumulative Proportion  0.4723 0.7808 1.0000

Moving forward with the four variables plus two behavior-related variables as it explains the most (47.7%) with 2 Principal Components (2 eigenvalues greater than 1). Plus, the assignment detailed to use at least 5 variables.

brfss.pc<-prcomp(~shortsleep+brainf+histress+mentpoor+bingedrink+smoke, data=mydata, center=T, scale=T, retx=T)

#Screeplot
screeplot(brfss.pc, type = "l", main = "Scree Plot")
abline(h=1)

#Request some basic summaries of the PCA analysis option(digits=3)
summary(brfss.pc)

## Importance of components:
##                           PC1    PC2    PC3    PC4    PC5    PC6
## Standard deviation     1.3426 1.0289 0.9650 0.9081 0.8493 0.8134
## Proportion of Variance 0.3004 0.1764 0.1552 0.1374 0.1202 0.1103
## Cumulative Proportion  0.3004 0.4769 0.6321 0.7695 0.8897 1.0000

brfss.pc$rotation

##                   PC1         PC2          PC3         PC4         PC5
## shortsleep -0.2994983  0.01673559 -0.935319896  0.01123894  0.04506867
## brainf     -0.4927119  0.23218803  0.244705137  0.18599170 -0.55213883
## histress   -0.5229841  0.22902101  0.007635137 -0.10784354 -0.24494966
## mentpoor   -0.4712922  0.26367321  0.226890884 -0.25207168  0.74448930
## bingedrink -0.2479598 -0.71288596  0.069863076 -0.62441270 -0.16254684
## smoke      -0.3322995 -0.56180785  0.094220328  0.70726175  0.22896592
##                    PC6
## shortsleep  0.18176879
## brainf      0.55135548
## histress   -0.77611092
## mentpoor    0.19767698
## bingedrink  0.09547766
## smoke      -0.11147605

#then I plot the first 2 components
hist(brfss.pc$x[,1])

hist(brfss.pc$x[,2])

cor(brfss.pc$x[,1:2])

##              PC1          PC2
## PC1 1.000000e+00 5.247131e-15
## PC2 5.247131e-15 1.000000e+00

scores<-data.frame(brfss.pc$x)
scores$name<-rownames(brfss.pc$x)
mydata$name<-rownames(mydata)
mydata<-merge(mydata, scores, by.x="name", by.y="name", all.x=F)
tail(names(mydata), 20)

##  [1] "laborforce"   "employyes"    "col"          "shortsleep"  
##  [5] "unintentslep" "brainf"       "mentpoor"     "poorhlthnew" 
##  [9] "recchecklvl"  "histress"     "reccheck"     "bingedrink"  
## [13] "smoke"        "mentanx"      "PC1"          "PC2"         
## [17] "PC3"          "PC4"          "PC5"          "PC6"

This confirms that the PC’s are relatively orthogonal.

round(cor(mydata[,c("shortsleep","brainf","histress","mentpoor", "bingedrink", "smoke")], method = "spearman"), 3)

##            shortsleep brainf histress mentpoor bingedrink smoke
## shortsleep      1.000  0.107    0.177    0.107      0.073 0.088
## brainf          0.107  1.000    0.320    0.272      0.040 0.155
## histress        0.177  0.320    1.000    0.299      0.090 0.132
## mentpoor        0.107  0.272    0.299    1.000      0.118 0.107
## bingedrink      0.073  0.040    0.090    0.118      1.000 0.200
## smoke           0.088  0.155    0.132    0.107      0.200 1.000

round(cor(mydata[,c("shortsleep","brainf","histress","mentpoor", "bingedrink", "smoke", "PC1", "PC2")], method = "spearman"),3)

##            shortsleep brainf histress mentpoor bingedrink  smoke    PC1
## shortsleep      1.000  0.107    0.177    0.107      0.073  0.088 -0.453
## brainf          0.107  1.000    0.320    0.272      0.040  0.155 -0.422
## histress        0.177  0.320    1.000    0.299      0.090  0.132 -0.450
## mentpoor        0.107  0.272    0.299    1.000      0.118  0.107 -0.713
## bingedrink      0.073  0.040    0.090    0.118      1.000  0.200 -0.395
## smoke           0.088  0.155    0.132    0.107      0.200  1.000 -0.428
## PC1            -0.453 -0.422   -0.450   -0.713     -0.395 -0.428  1.000
## PC2             0.186  0.252    0.265    0.453     -0.486 -0.504 -0.202
##               PC2
## shortsleep  0.186
## brainf      0.252
## histress    0.265
## mentpoor    0.453
## bingedrink -0.486
## smoke      -0.504
## PC1        -0.202
## PC2         1.000

#Make the survey design object
library(survey)

## Loading required package: grid

## Loading required package: Matrix

## Loading required package: survival

## 
## Attaching package: 'survey'

## The following object is masked from 'package:graphics':
## 
##     dotchart

options(survey.lonely.psu = "adjust")

#PC1
mydata$pc1q<-cut(mydata$PC1, breaks = quantile(mydata$PC1, probs = c (0, .10, .20, .30, .40, .50, 1), na.rm=T))
des<-svydesign(ids=~1, weights=~llcpwt, data=mydata)

#PC1: means of the variables in the index, by binning the index into 2 groups
svyby( ~shortsleep+brainf+histress+mentpoor+bingedrink+smoke,~pc1q, des,FUN=svymean , na.rm=T)

##                                pc1q shortsleep     brainf   histress
## (-7.96,-1.89]         (-7.96,-1.89] 0.53037668 0.56179668 0.63376753
## (-1.89,-0.732]       (-1.89,-0.732] 0.67383664 0.08474485 0.08352343
## (-0.732,-0.0579]   (-0.732,-0.0579] 0.08462273 0.00000000 0.00000000
## (-0.0579,-0.0299] (-0.0579,-0.0299] 0.00000000 0.00000000 0.00000000
## (-0.0299,0.298]     (-0.0299,0.298] 0.98418295 0.00000000 0.00000000
## (0.298,0.973]         (0.298,0.973] 0.00000000 0.00000000 0.00000000
##                    mentpoor bingedrink     smoke se.shortsleep  se.brainf
## (-7.96,-1.89]     0.9041255  3.1062942 0.4122630   0.021798311 0.02150043
## (-1.89,-0.732]    0.7136519  2.1965618 0.3341017   0.019506975 0.01009419
## (-0.732,-0.0579]  0.7859561  1.3049077 0.1783635   0.008948098 0.00000000
## (-0.0579,-0.0299] 0.0000000  0.0000000 1.0000000   0.000000000 0.00000000
## (-0.0299,0.298]   0.0000000  0.6592666 0.0000000   0.004642791 0.00000000
## (0.298,0.973]     0.0000000  0.5031933 0.0000000   0.000000000 0.00000000
##                   se.histress se.mentpoor se.bingedrink   se.smoke
## (-7.96,-1.89]      0.02081290  0.01336373    0.27261777 0.02192682
## (-1.89,-0.732]     0.01032225  0.01796117    0.18223965 0.02030242
## (-0.732,-0.0579]   0.00000000  0.01379546    0.07679730 0.01302402
## (-0.0579,-0.0299]  0.00000000  0.00000000    0.00000000 0.00000000
## (-0.0299,0.298]    0.00000000  0.00000000    0.05845531 0.00000000
## (0.298,0.973]      0.00000000  0.00000000    0.02464344 0.00000000

#PC2
mydata$pc2q<-cut(mydata$PC2, breaks = quantile(mydata$PC2,probs = c(0,.25, .5, .75, 1) , na.rm=T))
des<-svydesign(ids=~1, weights=~llcpwt, data=mydata)

#PC2: means of the variables in the index, by quantile of the index
svyby( ~shortsleep+brainf+histress+mentpoor+bingedrink+smoke,~pc2q, des,FUN=svymean , na.rm=T)

##                            pc2q shortsleep       brainf     histress
## (-8.08,-0.0646] (-8.08,-0.0646] 0.28181156 0.0602283453 0.0654784682
## (-0.0646,0.128] (-0.0646,0.128] 0.05976522 0.0072307062 0.0008170017
## (0.128,0.174]     (0.128,0.174] 0.99664276 0.0008147114 0.0063002758
## (0.174,2.54]       (0.174,2.54] 0.32990210 0.2006021125 0.2316515307
##                     mentpoor bingedrink        smoke se.shortsleep
## (-8.08,-0.0646] 0.2723361197 3.42347230 0.4537405102   0.012270431
## (-0.0646,0.128] 0.0238452878 0.10331001 0.0070042533   0.005731035
## (0.128,0.174]   0.0008147114 0.02438639 0.0008147114   0.001960078
## (0.174,2.54]    0.9254416977 0.42087969 0.0325121665   0.012633677
##                    se.brainf  se.histress  se.mentpoor se.bingedrink
## (-8.08,-0.0646] 0.0069197851 0.0074064965 0.0123062750    0.12528754
## (-0.0646,0.128] 0.0022744508 0.0004365355 0.0037531011    0.01058008
## (0.128,0.174]   0.0005835184 0.0025306256 0.0005835184    0.01000360
## (0.174,2.54]    0.0110435500 0.0115513908 0.0062889030    0.02566967
##                     se.smoke
## (-8.08,-0.0646] 0.0133956324
## (-0.0646,0.128] 0.0022635030
## (0.128,0.174]   0.0005835184
## (0.174,2.54]    0.0055568563

Interpret parcing for PC index bins above

library(ggplot2)
ggplot(aes(x=racethlvl, y=PC1), data=mydata)+geom_boxplot()

ggplot(aes(x=racethlvl, y=PC2), data=mydata)+geom_boxplot()

Hispanic and White populations are associated with a higher PC1 values and so with better sleep health. Black and Other/Multi populations are associated with lower PC1 values and so worse sleep health. The Hispanic population in this dataset seems to fit the Hispanic Paradox.