library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
#library(stargazer)
#library(haven)
#library(pander)
mydata<-haven::read_xpt("~/Documents/1A_DEM Doctorate/DEM 7283/R/LLCP2017.xpt")
#names(mydata)
newnames<-tolower(gsub(pattern = "_", replacement = "", x= names(mydata)))
names(mydata)<-newnames
#recode sex variable to male or female
mydata$male<-Recode(mydata$sex, recodes="1='male'; 2='0female'; else=NA")
#recode 5 level race/ethnicity
mydata <- mydata %>%
mutate(racethlvl = ifelse(racegr3 == 1, "0nh white",
ifelse(racegr3== 2, "nh black",
ifelse(racegr3 == 3, "nh other/multi",
ifelse(racegr3 == 4, "nh other/multi",
ifelse(racegr3 == 5, "hispanic", NA))))))
#individual dummy variables
mydata$black<-Recode(mydata$racegr3, recodes="2='nh black';9=NA; else='0non black'")
mydata$white<-Recode(mydata$racegr3, recodes="1='nh white'; 9=NA; else='0non white'")
mydata$other<-Recode(mydata$racegr3, recodes="3:4='nh other/multi'; 9=NA; else='0non other/multi'")
mydata$hispanic<-Recode(mydata$racegr3, recodes="5='hispanic'; 9=NA; else='0non hispanic'")
#recode marital status
mydata <- mydata %>%
mutate(maritallvl = ifelse(marital == 1, "Married",
ifelse(marital == 2, "PrevMarried",
ifelse(marital == 3, "PrevMarried",
ifelse(marital == 4, "PrevMarried",
ifelse(marital == 5, "0NevMarried", NA))))))
#dummy variable married
mydata$married<-Recode(mydata$marital, recodes="1='married'; 2:5='0no'; else=NA")
#recode education
mydata <- mydata %>%
mutate(educlvl = ifelse(educag == 1, "lessHS",
ifelse(educag == 2, "HS",
ifelse(educag == 3, "Some Col",
ifelse(educag == 4, "Col Grad", NA)))))
#recode laborforce etc
mydata$laborforce<-Recode(mydata$employ1, recodes="1:4='labforce'; 5:8='0no'; else=NA")
#recode employment
mydata$employyes<-Recode(mydata$employ1, recodes="1:2='employed'; 3:4='0no'; else=NA")
#dummy variable education
mydata$col <- Recode(mydata$educag, recodes="1:2='0LessCol'; 3:4='Col'; else=NA")
#recode the number of hours of sleep to binary
mydata$shortsleep<-Recode(mydata$sleptim1, recodes ="1:6=1; 6:24=0; else=NA")
#recode unintentionally fell asleep during the day
mydata$unintentslep<-Recode(mydata$slepday1, recodes ="1:14=1; 88=0; else=NA")
#difficulty concentrating or remembering
mydata$brainf<-Recode(mydata$decide, recodes ="1=1; 2=0; else=NA")
#recode mental health
mydata$mentpoor<-Recode(mydata$menthlth, recodes ="1:30=1; 88=0; else=NA")
#recode poor health (either mental or physcial)
mydata$poorhlthnew<-Recode(mydata$poorhlth, recodes ="1:30=1; 88=0; else=NA")
#recode last checkup
mydata <- mydata %>%
mutate(recchecklvl = ifelse(checkup1 == 1, "<1 yr",
ifelse(checkup1 == 2, "<2 yr",
ifelse(checkup1 == 3, "<5 yr",
ifelse(checkup1 == 4, "5+", NA)))))
mydata$histress<-Recode(mydata$sdhstres, recodes="1:3=0; 4:5=1; else=NA")
#dummy variable recent checkup (within 1 year)
mydata$reccheck<-Recode(mydata$checkup1, recodes ="1='recent'; 2:4='0no'; else=NA")
#outlierReplace(my_data, "brnk3ge5", which(my_data$drnk3ge5 <70), NA)
#recode binge drinking
mydata$bingedrink<-Recode(mydata$drnk3ge5, recodes= "88=0; 77=NA; 99=NA")
#recode every smoked
mydata$smoke<-Recode(mydata$smoker3, recodes= "1:2=1; 3:4=0; else=NA")
#mydata$cancertypes<-Recode(mydata$cncrdiff, recodes="7=NA; 9=NA")
#recode select illness
#mydata$hrdx <- Recode(mydata$crgvprb2, recode="8=1; 1:7=0; 9:76=0; else=NA")
#mydata$subst <- Recode(mydata$crgvprb2, recode="12=1; 1:11=0; 13:76=0; else=NA")
#mydata$injur <- Recode(mydata$crgvprb2, recode="13=1; 1:12=0; 14:76=0; else=NA")
mydata$mentanx <- Recode(mydata$crgvprb2, recode="10=1; 1:9=0; 11:76=0; else=NA")
Using five variables plus 2 behavior-related behaviors (binge drinking and current smoking status)
brfss.pctest7<-prcomp(~shortsleep+unintentslep+brainf+histress+mentpoor+bingedrink+smoke, data=mydata, center=T, scale=T, retx=T)
summary(brfss.pctest7)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 1.3573 1.0465 0.9794 0.9578 0.9075 0.8466 0.80355
## Proportion of Variance 0.2632 0.1565 0.1370 0.1311 0.1177 0.1024 0.09224
## Cumulative Proportion 0.2632 0.4196 0.5566 0.6877 0.8054 0.9078 1.00000
Using five variables
brfss.pctest5<-prcomp(~shortsleep+unintentslep+brainf+histress+mentpoor, data=mydata, center=T, scale=T, retx=T)
summary(brfss.pctest5)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5
## Standard deviation 1.3177 0.9841 0.9711 0.8417 0.8022
## Proportion of Variance 0.3473 0.1937 0.1886 0.1417 0.1287
## Cumulative Proportion 0.3473 0.5410 0.7296 0.8713 1.0000
Using four variables plus 2 behavior-related behaviors (binge drinking and current smoking status)
brfss.pctest4b2<-prcomp(~shortsleep+brainf+histress+mentpoor+bingedrink+smoke, data=mydata, center=T, scale=T, retx=T)
summary(brfss.pctest4b2)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6
## Standard deviation 1.3426 1.0289 0.9650 0.9081 0.8493 0.8134
## Proportion of Variance 0.3004 0.1764 0.1552 0.1374 0.1202 0.1103
## Cumulative Proportion 0.3004 0.4769 0.6321 0.7695 0.8897 1.0000
Using four variables plus 1 behavior-related behavior (current smoking status)
brfss.pctest4b1<-prcomp(~shortsleep+brainf+histress+mentpoor+smoke, data=mydata, center=T, scale=T, retx=T)
summary(brfss.pctest4b1)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5
## Standard deviation 1.3350 0.9770 0.9517 0.8413 0.8060
## Proportion of Variance 0.3564 0.1909 0.1812 0.1415 0.1299
## Cumulative Proportion 0.3564 0.5474 0.7285 0.8701 1.0000
Using four variables (no unintentional day sleeping)
brfss.pctest4a<-prcomp(~shortsleep+brainf+histress+mentpoor, data=mydata, center=T, scale=T, retx=T)
summary(brfss.pctest4a)
## Importance of components:
## PC1 PC2 PC3 PC4
## Standard deviation 1.3004 0.9721 0.8440 0.8073
## Proportion of Variance 0.4227 0.2363 0.1781 0.1629
## Cumulative Proportion 0.4227 0.6590 0.8371 1.0000
Using four variables
brfss.pctest4b<-prcomp(~shortsleep+unintentslep+brainf+histress, data=mydata, center=T, scale=T, retx=T)
summary(brfss.pctest4b)
## Importance of components:
## PC1 PC2 PC3 PC4
## Standard deviation 1.2147 0.9776 0.9597 0.8049
## Proportion of Variance 0.3689 0.2389 0.2303 0.1620
## Cumulative Proportion 0.3689 0.6078 0.8380 1.0000
Using three variables
brfss.pctest3<-prcomp(~shortsleep+brainf+histress, data=mydata, center=T, scale=T, retx=T)
summary(brfss.pctest3)
## Importance of components:
## PC1 PC2 PC3
## Standard deviation 1.1903 0.9620 0.8110
## Proportion of Variance 0.4723 0.3085 0.2192
## Cumulative Proportion 0.4723 0.7808 1.0000
Moving forward with the four variables plus two behavior-related variables as it explains the most (47.7%) with 2 Principal Components (2 eigenvalues greater than 1). Plus, the assignment detailed to use at least 5 variables.
brfss.pc<-prcomp(~shortsleep+brainf+histress+mentpoor+bingedrink+smoke, data=mydata, center=T, scale=T, retx=T)
#Screeplot
screeplot(brfss.pc, type = "l", main = "Scree Plot")
abline(h=1)
#Request some basic summaries of the PCA analysis option(digits=3)
summary(brfss.pc)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6
## Standard deviation 1.3426 1.0289 0.9650 0.9081 0.8493 0.8134
## Proportion of Variance 0.3004 0.1764 0.1552 0.1374 0.1202 0.1103
## Cumulative Proportion 0.3004 0.4769 0.6321 0.7695 0.8897 1.0000
brfss.pc$rotation
## PC1 PC2 PC3 PC4 PC5
## shortsleep -0.2994983 0.01673559 -0.935319896 0.01123894 0.04506867
## brainf -0.4927119 0.23218803 0.244705137 0.18599170 -0.55213883
## histress -0.5229841 0.22902101 0.007635137 -0.10784354 -0.24494966
## mentpoor -0.4712922 0.26367321 0.226890884 -0.25207168 0.74448930
## bingedrink -0.2479598 -0.71288596 0.069863076 -0.62441270 -0.16254684
## smoke -0.3322995 -0.56180785 0.094220328 0.70726175 0.22896592
## PC6
## shortsleep 0.18176879
## brainf 0.55135548
## histress -0.77611092
## mentpoor 0.19767698
## bingedrink 0.09547766
## smoke -0.11147605
#then I plot the first 2 components
hist(brfss.pc$x[,1])
hist(brfss.pc$x[,2])
cor(brfss.pc$x[,1:2])
## PC1 PC2
## PC1 1.000000e+00 5.247131e-15
## PC2 5.247131e-15 1.000000e+00
scores<-data.frame(brfss.pc$x)
scores$name<-rownames(brfss.pc$x)
mydata$name<-rownames(mydata)
mydata<-merge(mydata, scores, by.x="name", by.y="name", all.x=F)
tail(names(mydata), 20)
## [1] "laborforce" "employyes" "col" "shortsleep"
## [5] "unintentslep" "brainf" "mentpoor" "poorhlthnew"
## [9] "recchecklvl" "histress" "reccheck" "bingedrink"
## [13] "smoke" "mentanx" "PC1" "PC2"
## [17] "PC3" "PC4" "PC5" "PC6"
This confirms that the PC’s are relatively orthogonal.
round(cor(mydata[,c("shortsleep","brainf","histress","mentpoor", "bingedrink", "smoke")], method = "spearman"), 3)
## shortsleep brainf histress mentpoor bingedrink smoke
## shortsleep 1.000 0.107 0.177 0.107 0.073 0.088
## brainf 0.107 1.000 0.320 0.272 0.040 0.155
## histress 0.177 0.320 1.000 0.299 0.090 0.132
## mentpoor 0.107 0.272 0.299 1.000 0.118 0.107
## bingedrink 0.073 0.040 0.090 0.118 1.000 0.200
## smoke 0.088 0.155 0.132 0.107 0.200 1.000
round(cor(mydata[,c("shortsleep","brainf","histress","mentpoor", "bingedrink", "smoke", "PC1", "PC2")], method = "spearman"),3)
## shortsleep brainf histress mentpoor bingedrink smoke PC1
## shortsleep 1.000 0.107 0.177 0.107 0.073 0.088 -0.453
## brainf 0.107 1.000 0.320 0.272 0.040 0.155 -0.422
## histress 0.177 0.320 1.000 0.299 0.090 0.132 -0.450
## mentpoor 0.107 0.272 0.299 1.000 0.118 0.107 -0.713
## bingedrink 0.073 0.040 0.090 0.118 1.000 0.200 -0.395
## smoke 0.088 0.155 0.132 0.107 0.200 1.000 -0.428
## PC1 -0.453 -0.422 -0.450 -0.713 -0.395 -0.428 1.000
## PC2 0.186 0.252 0.265 0.453 -0.486 -0.504 -0.202
## PC2
## shortsleep 0.186
## brainf 0.252
## histress 0.265
## mentpoor 0.453
## bingedrink -0.486
## smoke -0.504
## PC1 -0.202
## PC2 1.000
#Make the survey design object
library(survey)
## Loading required package: grid
## Loading required package: Matrix
## Loading required package: survival
##
## Attaching package: 'survey'
## The following object is masked from 'package:graphics':
##
## dotchart
options(survey.lonely.psu = "adjust")
#PC1
mydata$pc1q<-cut(mydata$PC1, breaks = quantile(mydata$PC1, probs = c (0, .10, .20, .30, .40, .50, 1), na.rm=T))
des<-svydesign(ids=~1, weights=~llcpwt, data=mydata)
#PC1: means of the variables in the index, by binning the index into 2 groups
svyby( ~shortsleep+brainf+histress+mentpoor+bingedrink+smoke,~pc1q, des,FUN=svymean , na.rm=T)
## pc1q shortsleep brainf histress
## (-7.96,-1.89] (-7.96,-1.89] 0.53037668 0.56179668 0.63376753
## (-1.89,-0.732] (-1.89,-0.732] 0.67383664 0.08474485 0.08352343
## (-0.732,-0.0579] (-0.732,-0.0579] 0.08462273 0.00000000 0.00000000
## (-0.0579,-0.0299] (-0.0579,-0.0299] 0.00000000 0.00000000 0.00000000
## (-0.0299,0.298] (-0.0299,0.298] 0.98418295 0.00000000 0.00000000
## (0.298,0.973] (0.298,0.973] 0.00000000 0.00000000 0.00000000
## mentpoor bingedrink smoke se.shortsleep se.brainf
## (-7.96,-1.89] 0.9041255 3.1062942 0.4122630 0.021798311 0.02150043
## (-1.89,-0.732] 0.7136519 2.1965618 0.3341017 0.019506975 0.01009419
## (-0.732,-0.0579] 0.7859561 1.3049077 0.1783635 0.008948098 0.00000000
## (-0.0579,-0.0299] 0.0000000 0.0000000 1.0000000 0.000000000 0.00000000
## (-0.0299,0.298] 0.0000000 0.6592666 0.0000000 0.004642791 0.00000000
## (0.298,0.973] 0.0000000 0.5031933 0.0000000 0.000000000 0.00000000
## se.histress se.mentpoor se.bingedrink se.smoke
## (-7.96,-1.89] 0.02081290 0.01336373 0.27261777 0.02192682
## (-1.89,-0.732] 0.01032225 0.01796117 0.18223965 0.02030242
## (-0.732,-0.0579] 0.00000000 0.01379546 0.07679730 0.01302402
## (-0.0579,-0.0299] 0.00000000 0.00000000 0.00000000 0.00000000
## (-0.0299,0.298] 0.00000000 0.00000000 0.05845531 0.00000000
## (0.298,0.973] 0.00000000 0.00000000 0.02464344 0.00000000
#PC2
mydata$pc2q<-cut(mydata$PC2, breaks = quantile(mydata$PC2,probs = c(0,.25, .5, .75, 1) , na.rm=T))
des<-svydesign(ids=~1, weights=~llcpwt, data=mydata)
#PC2: means of the variables in the index, by quantile of the index
svyby( ~shortsleep+brainf+histress+mentpoor+bingedrink+smoke,~pc2q, des,FUN=svymean , na.rm=T)
## pc2q shortsleep brainf histress
## (-8.08,-0.0646] (-8.08,-0.0646] 0.28181156 0.0602283453 0.0654784682
## (-0.0646,0.128] (-0.0646,0.128] 0.05976522 0.0072307062 0.0008170017
## (0.128,0.174] (0.128,0.174] 0.99664276 0.0008147114 0.0063002758
## (0.174,2.54] (0.174,2.54] 0.32990210 0.2006021125 0.2316515307
## mentpoor bingedrink smoke se.shortsleep
## (-8.08,-0.0646] 0.2723361197 3.42347230 0.4537405102 0.012270431
## (-0.0646,0.128] 0.0238452878 0.10331001 0.0070042533 0.005731035
## (0.128,0.174] 0.0008147114 0.02438639 0.0008147114 0.001960078
## (0.174,2.54] 0.9254416977 0.42087969 0.0325121665 0.012633677
## se.brainf se.histress se.mentpoor se.bingedrink
## (-8.08,-0.0646] 0.0069197851 0.0074064965 0.0123062750 0.12528754
## (-0.0646,0.128] 0.0022744508 0.0004365355 0.0037531011 0.01058008
## (0.128,0.174] 0.0005835184 0.0025306256 0.0005835184 0.01000360
## (0.174,2.54] 0.0110435500 0.0115513908 0.0062889030 0.02566967
## se.smoke
## (-8.08,-0.0646] 0.0133956324
## (-0.0646,0.128] 0.0022635030
## (0.128,0.174] 0.0005835184
## (0.174,2.54] 0.0055568563
library(ggplot2)
ggplot(aes(x=racethlvl, y=PC1), data=mydata)+geom_boxplot()
ggplot(aes(x=racethlvl, y=PC2), data=mydata)+geom_boxplot()
Hispanic and White populations are associated with a higher PC1 values and so with better sleep health. Black and Other/Multi populations are associated with lower PC1 values and so worse sleep health. The Hispanic population in this dataset seems to fit the Hispanic Paradox.
Differences are limited with PC2.
ggplot(aes(x=educlvl, y=PC1), data=mydata)+geom_boxplot()
ggplot(aes(x=educlvl, y=PC2), data=mydata)+geom_boxplot()
There is an education gradient associated with PC1 where the higher the educational attainment the higher the PC1 values. The population with less than high school education is associated with lower PC1 values. Therefore better sleep health is associated with greater educational attainment.
Differences are limited with PC2.
ggplot(aes(x=maritallvl, y=PC1), data=mydata)+geom_boxplot()
ggplot(aes(x=maritallvl, y=PC2), data=mydata)+geom_boxplot()
There, too, appears to be a marital status gradient. Married populations are associated with the highest PC1 values followed by Previously Married. Populations reporting to Never Married have the lowest PC1 values. Therefore, Never Married populations report have worse sleep health.
Differences are limited with PC2.
ggplot(aes(x=recchecklvl, y=PC1), data=mydata)+geom_boxplot()
ggplot(aes(x=recchecklvl, y=PC2), data=mydata)+geom_boxplot()
There is a gradient associated with the length of time since last medical check up where those that had the a check-up in the last year have higher PC1 values and those that reported not having a check-up in more than 5 years have lower PC1 values. Therefore, worse sleep health is associated with longer the duration since last medical check-up.
Differences are limited with PC2.
ggplot(aes(x=laborforce, y=PC1), data=mydata)+geom_boxplot()
ggplot(aes(x=laborforce, y=PC2), data=mydata)+geom_boxplot()
Compared to those not in the Labor Force, those in the Labor Force have lower PC1 values and so worse sleep health.
Differences are limited with PC2.
ggplot(aes(x=employyes, y=PC1), data=mydata)+geom_boxplot()
ggplot(aes(x=employyes, y=PC2), data=mydata)+geom_boxplot()
Compared to the Unemployed population, the Employed population are associated with higher PC1 values and so better sleep health.
Differences are limited with PC2.
fit.1<-svyglm(PC1~racethlvl+educlvl+maritallvl+recchecklvl+employyes, design=des, family=gaussian)
summary(fit.1)
##
## Call:
## svyglm(formula = PC1 ~ racethlvl + educlvl + maritallvl + recchecklvl +
## employyes, design = des, family = gaussian)
##
## Survey design:
## svydesign(ids = ~1, weights = ~llcpwt, data = mydata)
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.85008 0.16709 -5.088 3.75e-07 ***
## racethlvlhispanic 0.16496 0.17443 0.946 0.344332
## racethlvlnh black -0.37193 0.20256 -1.836 0.066382 .
## racethlvlnh other/multi -0.14889 0.15114 -0.985 0.324599
## educlvlHS -0.30279 0.06120 -4.947 7.74e-07 ***
## educlvllessHS -0.79027 0.26140 -3.023 0.002513 **
## educlvlSome Col -0.16468 0.04897 -3.363 0.000777 ***
## maritallvlMarried 0.67295 0.06987 9.631 < 2e-16 ***
## maritallvlPrevMarried 0.23308 0.09351 2.492 0.012715 *
## recchecklvl<2 yr -0.08383 0.06295 -1.332 0.182961
## recchecklvl<5 yr -0.25338 0.08666 -2.924 0.003471 **
## recchecklvl5+ -0.29206 0.10865 -2.688 0.007209 **
## employyesemployed 0.52692 0.15408 3.420 0.000631 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 1.859318)
##
## Number of Fisher Scoring iterations: 2
Regression analysis reiterates the patterns seen in bloxplots, however, race/ethnicity is not significantly associated with PC1. Compared to College Graduate, any educational attainment less than College degree is associaed with lower PC1 values. Less than High School education resulted in the greatest magnitude association. Compared to Never Married, both Married and Previously Married was associated with higher PC1 values. Married had the greatest magnitude. Longer period of time between a medical checkup (over 2 years) was associated with more negative PC1 values. Employed respondents were associated with higher PC1 values.
fit.2<-svyglm(PC2~racethlvl+educlvl+maritallvl+recchecklvl+employyes, design=des, family=gaussian)
summary(fit.2)
##
## Call:
## svyglm(formula = PC2 ~ racethlvl + educlvl + maritallvl + recchecklvl +
## employyes, design = des, family = gaussian)
##
## Survey design:
## svydesign(ids = ~1, weights = ~llcpwt, data = mydata)
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.19596 0.11594 1.690 0.091053 .
## racethlvlhispanic 0.04313 0.11688 0.369 0.712113
## racethlvlnh black 0.06061 0.10988 0.552 0.581236
## racethlvlnh other/multi 0.16818 0.07662 2.195 0.028203 *
## educlvlHS -0.47030 0.05715 -8.229 2.34e-16 ***
## educlvllessHS -0.54810 0.14959 -3.664 0.000251 ***
## educlvlSome Col -0.18438 0.03425 -5.383 7.65e-08 ***
## maritallvlMarried 0.14359 0.04903 2.929 0.003416 **
## maritallvlPrevMarried -0.02553 0.06804 -0.375 0.707471
## recchecklvl<2 yr -0.09205 0.04658 -1.976 0.048158 *
## recchecklvl<5 yr -0.22368 0.08545 -2.618 0.008876 **
## recchecklvl5+ -0.40405 0.08736 -4.625 3.83e-06 ***
## employyesemployed -0.11241 0.10192 -1.103 0.270096
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 1.133423)
##
## Number of Fisher Scoring iterations: 2
This regression analysis also reiterates the patterns seen in bloxplots, however, only other/multi race/ethnicity is significantly associated with PC2. Compared to College Graduate, any educational attainment less than College degree is associaed with lower PC2 values. Less than High School education resulted in the greatest magnitude association. Compared to Never Married, only Married was associated with higher PC2 values. Longer period of time between a medical checkup (over 1 years) was associated with more negative PC1 values. Employed status was not significantly associated with PC2