library(car)
## Loading required package: carData
library(stargazer)
##
## Please cite as:
## Hlavac, Marek (2018). stargazer: Well-Formatted Regression and Summary Statistics Tables.
## R package version 5.2.2. https://CRAN.R-project.org/package=stargazer
library(survey)
## Loading required package: grid
## Loading required package: Matrix
## Loading required package: survival
##
## Attaching package: 'survey'
## The following object is masked from 'package:graphics':
##
## dotchart
library(questionr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
##
## recode
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(forcats)
library(knitr)
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v readr 2.1.1
## v tibble 3.1.6 v purrr 0.3.4
## v tidyr 1.1.4 v stringr 1.4.0
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x tidyr::expand() masks Matrix::expand()
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
## x tidyr::pack() masks Matrix::pack()
## x dplyr::recode() masks car::recode()
## x purrr::some() masks car::some()
## x tidyr::unpack() masks Matrix::unpack()
library(factoextra)
## Warning: package 'factoextra' was built under R version 4.1.3
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(FactoMineR)
## Warning: package 'FactoMineR' was built under R version 4.1.3
brfss20sm <- readRDS("C:/Users/shahi/Dropbox/PC/Downloads/brfss20sm.rds")
names(brfss20sm) <- tolower(gsub(pattern = "_",replacement = "",x = names(brfss20sm)))
#For this homework, you are to use the technique of Principal Components Analysis (PCA) to perform a variable reduction of at least 5 variables.
#Age cut into intervals
brfss20sm$agec<-cut(brfss20sm$age80,
breaks=c(0,29,39,59,79,99))
brfss20sm$ageg<-factor(brfss20sm$ageg)
#depressive disorder
brfss20sm$depression<-Recode(brfss20sm$addepev3, recodes="1=1; 2=0; 7:9=NA")
#healthy mental health days
brfss20sm$healthmdays <- Recode (brfss20sm$menthlth,recodes = "88=0; 77=NA; 99=NA")
#smoking currently
brfss20sm$smoke <- Recode(brfss20sm$smoker3,
recodes= "1:2=1; 3:4=0; else=NA")
#employment
brfss20sm$employ<-Recode(brfss20sm$employ1,
recodes="1:2='Employed'; 2:6='nilf'; 7='retired'; 8='unable'; else=NA",
as.factor=T)
brfss20sm$employ<-relevel(brfss20sm$employ,
ref='Employed')
#marital status
brfss20sm$marst<-Recode(brfss20sm$marital,
recodes="1='married'; 2='divorced'; 3='widowed'; 4='separated'; 5='nm';6='cohab'; else=NA",
as.factor=T)
brfss20sm$marst<-relevel(brfss20sm$marst,
ref='married')
#insurance
brfss20sm$ins<-Recode(brfss20sm$hlthpln1,
recodes ="7:9=NA; 1=1;2=0")
brfss20sm$checkup <- Recode (brfss20sm$checkup1, recodes = "1:2 = 1; 3:4 =0; 8=0; else=NA")
brfss20sm2<-brfss20sm%>%
filter(complete.cases(mmsawt, ststr,ageg,employ,marst,smoke,healthmdays, depression,checkup, ins))%>%
select(mmsawt, ststr,ageg,employ,marst,smoke ,healthmdays, depression, checkup, ins)%>%
mutate_at(vars(smoke, healthmdays, depression,checkup,ins), scale)
brfss20sm.pc <- PCA(brfss20sm2[,c(6:10)],
scale.unit = T,
graph = F)
#Report the results of the PCA, being sure to include the eigenvalues and corresponding vectors. Interpret your component(s) if possible.
eigenvalues <- brfss20sm.pc$eig
head(eigenvalues[, 1:2])
## eigenvalue percentage of variance
## comp 1 1.4982341 29.96468
## comp 2 1.2466215 24.93243
## comp 3 0.8887812 17.77562
## comp 4 0.7759622 15.51924
## comp 5 0.5904011 11.80802
The first two components account for 55% of the variation in the input variables, that isn’t bad. Also, we see 2 eigenvalues of atleast 1, which suggests there are 2 real components among these variables (as we are looking for eigenvalues>1 when using z-scored data).
fviz_screeplot(brfss20sm.pc, ncp=10)
brfss20sm.pc$var
## $coord
## Dim.1 Dim.2 Dim.3 Dim.4 Dim.5
## smoke 0.4874229 -0.2253642 0.8435676 0.004843098 0.0004417606
## healthmdays 0.7715495 0.2489127 -0.2177360 0.038911768 0.5420616073
## depression 0.7393719 0.3559771 -0.1850393 -0.025196534 -0.5401251135
## checkup -0.2342560 0.7147714 0.2204425 -0.617204736 0.0684791024
## ins -0.2526218 0.7044552 0.2166369 0.626775906 0.0120643284
##
## $cor
## Dim.1 Dim.2 Dim.3 Dim.4 Dim.5
## smoke 0.4874229 -0.2253642 0.8435676 0.004843098 0.0004417606
## healthmdays 0.7715495 0.2489127 -0.2177360 0.038911768 0.5420616073
## depression 0.7393719 0.3559771 -0.1850393 -0.025196534 -0.5401251135
## checkup -0.2342560 0.7147714 0.2204425 -0.617204736 0.0684791024
## ins -0.2526218 0.7044552 0.2166369 0.626775906 0.0120643284
##
## $cos2
## Dim.1 Dim.2 Dim.3 Dim.4 Dim.5
## smoke 0.23758105 0.05078904 0.71160625 0.0000234556 1.951524e-07
## healthmdays 0.59528859 0.06195754 0.04740896 0.0015141257 2.938308e-01
## depression 0.54667077 0.12671968 0.03423955 0.0006348653 2.917351e-01
## checkup 0.05487586 0.51089818 0.04859488 0.3809416867 4.689387e-03
## ins 0.06381779 0.49625707 0.04693156 0.3928480370 1.455480e-04
##
## $contrib
## Dim.1 Dim.2 Dim.3 Dim.4 Dim.5
## smoke 15.857406 4.074135 80.065404 0.003022776 3.305422e-05
## healthmdays 39.732683 4.970036 5.334154 0.195128802 4.976800e+01
## depression 36.487675 10.165048 3.852416 0.081816532 4.941304e+01
## checkup 3.662703 40.982622 5.467587 49.092816798 7.942715e-01
## ins 4.259534 39.808159 5.280440 50.627215092 2.465240e-02
fviz_pca_var(brfss20sm.pc,
col.var = "contrib")+
theme_minimal()
fviz_pca_ind(brfss20sm.pc,
label= "none",
col.ind= "cos2") +
scale_color_gradient2(low= "blue",
mid= "white",
high= "red",
midpoint = .5) +
theme_minimal()
desc<- dimdesc(brfss20sm.pc)
desc$Dim.1
## $quanti
## correlation p.value
## healthmdays 0.7715495 0
## depression 0.7393719 0
## smoke 0.4874229 0
## checkup -0.2342560 0
## ins -0.2526218 0
##
## attr(,"class")
## [1] "condes" "list"
desc$Dim.2
## $quanti
## correlation p.value
## checkup 0.7147714 0
## ins 0.7044552 0
## depression 0.3559771 0
## healthmdays 0.2489127 0
## smoke -0.2253642 0
##
## attr(,"class")
## [1] "condes" "list"
brfss20sm2$pc1 <- brfss20sm.pc$ind$coord[, 1]
options(survey.lonely.psu = "adjust")
des<- svydesign( ids= ~1,
strata = ~ststr,
weights= ~mmsawt,
data= brfss20sm2)
library(ggplot2)
ggplot(aes(x=ageg, y=pc1, group=ageg),
data=brfss20sm2) +
geom_boxplot()
brfss20sm2$employ<- forcats:: fct_relevel(brfss20sm2$employ, .x= c("Employed", "nilf", "retired", "unable"))
## Warning: Outer names are only allowed for unnamed scalar atomic inputs
ggplot(aes(x=employ, y=pc1),
data=brfss20sm2) +
geom_boxplot()
#If deemed appropriate, conduct some testing of your index/components/latent variables:
Now we do some hypothesis testing. This model will examine variation in my mental health index across age, employment, marital status, and two healthcare access variables.
fit.1 <- svyglm(pc1~ factor(ageg) +employ + marst,
des,
family= gaussian)
summary(fit.1)
##
## Call:
## svyglm(formula = pc1 ~ factor(ageg) + employ + marst, design = des,
## family = gaussian)
##
## Survey design:
## svydesign(ids = ~1, strata = ~ststr, weights = ~mmsawt, data = brfss20sm2)
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.19663 0.03200 -6.145 8.01e-10 ***
## factor(ageg)2 0.16361 0.03098 5.281 1.29e-07 ***
## factor(ageg)3 0.11438 0.03297 3.469 0.000522 ***
## factor(ageg)4 -0.02885 0.03327 -0.867 0.385936
## factor(ageg)5 -0.10823 0.03378 -3.204 0.001354 **
## factor(ageg)6 -0.44118 0.03661 -12.051 < 2e-16 ***
## employnilf 0.24856 0.01983 12.534 < 2e-16 ***
## employretired 0.10873 0.02292 4.745 2.09e-06 ***
## employunable 1.02517 0.03789 27.055 < 2e-16 ***
## marstcohab 0.48376 0.03325 14.550 < 2e-16 ***
## marstdivorced 0.49582 0.02302 21.535 < 2e-16 ***
## marstnm 0.31648 0.02049 15.443 < 2e-16 ***
## marstseparated 0.59162 0.04496 13.159 < 2e-16 ***
## marstwidowed 0.27215 0.02634 10.333 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 1.380138)
##
## Number of Fisher Scoring iterations: 2
So, overall, the index decreases across age groups.