Homework PCA

#Load dataset and packages
library(car)

## Loading required package: carData

library(stargazer)

## 
## Please cite as:

##  Hlavac, Marek (2018). stargazer: Well-Formatted Regression and Summary Statistics Tables.

##  R package version 5.2.2. https://CRAN.R-project.org/package=stargazer

library(survey)

## Loading required package: grid

## Loading required package: Matrix

## Loading required package: survival

## 
## Attaching package: 'survey'

## The following object is masked from 'package:graphics':
## 
##     dotchart

library(questionr)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following object is masked from 'package:car':
## 
##     recode

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(foreign)
library(ggplot2)
brfss2017=read.xport("~/Desktop/DataFolder/brfss.XPT ")

#Clean data and rename variables
brfss2017$marital<-Recode(brfss2017$MARITAL,recodes="1='b) married';5='a) single';else=NA",as.factor=T)
brfss2017$marital2<-Recode(brfss2017$MARITAL,recodes="1=1;else=NA",as.factor=T)
brfss2017$sxorient<-Recode(brfss2017$SXORIENT,recodes="1=0;2=1;else=NA",as.factor=T)
brfss2017$sxorient2<-Recode(brfss2017$SXORIENT,recodes="2='Gay';else=NA")
brfss2017$genh<-Recode(brfss2017$GENHLTH,recodes="1:3=0;4:5=1;else=NA")
brfss2017$genh1<-Recode(brfss2017$GENHLTH,recodes="7=NA;9=NA",as.numeric=T)
brfss2017$choles<-Recode(brfss2017$TOLDHI2,recodes="1='Y';2='N';else=NA")
brfss2017$choles1<-Recode(brfss2017$TOLDHI2,recodes="7=NA;9=NA",as.numeric=T)
brfss2017$blpressure<-Recode(brfss2017$BPMEDS,recodes="1=1;2=0;else=NA",as.factor=T)
brfss2017$blpressure1<-Recode(brfss2017$BPMEDS,recodes="7=NA;9=NA",as.numeric=T)
brfss2017$heart<-Recode(brfss2017$CVDCRHD4,recodes="1='Y';2='N';else=NA",as.factor=T)
brfss2017$heart1<-Recode(brfss2017$CVDCRHD4,recodes="7=NA;9=NA",as.numeric=T)
brfss2017$stroke<-Recode(brfss2017$CVDSTRK3,recodes="1='Y';2='N';else=NA",as.factor=T)
brfss2017$stroke1<-Recode(brfss2017$CVDSTRK3,recodes="7=NA;9=NA",as.numeric=T)
brfss2017$chronic1<-Recode(brfss2017$CHCCOPD1,recodes="7=NA;9=NA",as.numeric=T)
brfss2017$depressed<-Recode(brfss2017$ADDEPEV2,recodes="1=0;2=1;else=NA",as.factor=T)
brfss2017$hlthinsur<-Recode(brfss2017$HLTHPLN1,recodes="1='Y';2='N';else=NA",as.factor=T)
brfss2017$hlthinsur1<-Recode(brfss2017$HLTHPLN1,recodes="7=NA;9=NA",as.numeric=T)
brfss2017$stress<-Recode(brfss2017$SDHSTRES,recodes="1='a-None';2='b-Little';3='c-Some';4='d-Most';5='e-Always';7=NA;9=NA;else=NA",as.numeric=T)
brfss2017$emosupport<-Recode(brfss2017$EMTSUPRT,recodes="1='a-Always';2='b-Usually';3='c-Sometimes';4='d-Rarely';5='e-Never';else=NA",as.factor=T)
brfss2017$emosupport1<-Recode(brfss2017$EMTSUPRT,recodes="7=NA;9=NA",as.numeric=T)
brfss2017$satisfed<-Recode(brfss2017$LSATISFY,recodes="1:2='a-Satisfied';3:4='b-Dissatisfied';else=NA",as.factor=T)
brfss2017$satisfed1<-Recode(brfss2017$LSATISFY,recodes="7=NA;9=NA",as.numeric=T)
brfss2017$poorh<-Recode(brfss2017$POORHLTH,recodes="88=0;77=NA;99=NA")
brfss2017$physh<-Recode(brfss2017$PHYSHLTH,recodes="88=0;77=NA;99=NA")
brfss2017$mental<-Recode(brfss2017$X_MENT14D,recodes="9=NA")
brfss2017$int2<-interaction(brfss2017$marital2,brfss2017$sxorient)
brfss2017$int<-interaction(brfss2017$marital,brfss2017$sxorient2)
brfss2017$ADSLEEP<-Recode(brfss2017$ADSLEEP,recodes="88=0;77=NA;99=NA")
brfss2017$stress1<-Recode(brfss2017$SDHSTRES,recodes="7=NA;9=NA",as.numeric=T)
brfss2017$depressed1<-Recode(brfss2017$ADDEPEV2,recodes="7=NA;9=NA",as.numeric=T)

#Prepare to fit the survey corrected model.
#The prcomp function is used to do a principal component analysis.
brfss.pc1<-prcomp(~emosupport1+satisfed1+stress1+depressed1+mental,data=brfss2017,center=T,scale=T,retx=T)
summary(brfss.pc1)

## Importance of components:
##                           PC1    PC2    PC3    PC4     PC5
## Standard deviation     1.5473 0.9643 0.7983 0.7529 0.68698
## Proportion of Variance 0.4788 0.1860 0.1275 0.1134 0.09439
## Cumulative Proportion  0.4788 0.6648 0.7922 0.9056 1.00000

#Plot the data
screeplot(brfss.pc1)

#Basic summary of the PCA analysis
summary(brfss.pc1)

## Importance of components:
##                           PC1    PC2    PC3    PC4     PC5
## Standard deviation     1.5473 0.9643 0.7983 0.7529 0.68698
## Proportion of Variance 0.4788 0.1860 0.1275 0.1134 0.09439
## Cumulative Proportion  0.4788 0.6648 0.7922 0.9056 1.00000

brfss.pc1$rotation

##                    PC1        PC2        PC3        PC4         PC5
## emosupport1 -0.3548964  0.7298637  0.3698690 -0.4518552 -0.01927042
## satisfed1   -0.4581031  0.3622700 -0.1909592  0.7855209  0.07344094
## stress1     -0.4903639 -0.1615365 -0.5293796 -0.2830594 -0.61080585
## depressed1   0.4198159  0.4700269 -0.7187757 -0.1691097  0.23998509
## mental      -0.4974817 -0.2983961 -0.1727722 -0.2646943  0.75070531

#Plot the first 2 components
hist(brfss.pc1$x[,1])

hist(brfss.pc1$x[,2])

#The first two components account for over 50% of the variation in the input variables.  There is one eigenvalue greater then 1 with another eigenvalue just under 1.  There is at least 1, possibly 2, real components among these 5 variables.

#Calculate the correlation between the first 2 components to show they are statistically independent. 
cor(brfss.pc1$x[,1:2])

##             PC1         PC2
## PC1 1.00000e+00 1.62524e-14
## PC2 1.62524e-14 1.00000e+00

#Merge data back to data frame
scores<-data.frame(brfss.pc1$x)
scores$name<-rownames(brfss.pc1$x)
brfss2017$name<-rownames(brfss2017)
brfss2017<-merge(brfss2017,scores,by.x="name",by.y = "name",all.x = F)
tail(names(brfss2017),20)

##  [1] "depressed"   "hlthinsur"   "hlthinsur1"  "stress"      "emosupport" 
##  [6] "emosupport1" "satisfed"    "satisfed1"   "poorh"       "physh"      
## [11] "mental"      "int2"        "int"         "stress1"     "depressed1" 
## [16] "PC1"         "PC2"         "PC3"         "PC4"         "PC5"

#Examine the correlation among variables
round(cor(brfss2017[,c("emosupport1","satisfed1","stress1","depressed1","mental")],method = "spearman"),3)

##             emosupport1 satisfed1 stress1 depressed1 mental
## emosupport1       1.000     0.424   0.298     -0.208  0.278
## satisfed1         0.424     1.000   0.352     -0.259  0.326
## stress1           0.298     0.352   1.000     -0.329  0.466
## depressed1       -0.208    -0.259  -0.329      1.000 -0.420
## mental            0.278     0.326   0.466     -0.420  1.000

#Correlation among the original variables and the components
round(cor(brfss2017[,c("emosupport1","satisfed1","stress1","depressed1","mental","PC1","PC2")],method = "spearman"),3)

##             emosupport1 satisfed1 stress1 depressed1 mental    PC1    PC2
## emosupport1       1.000     0.424   0.298     -0.208  0.278 -0.609  0.582
## satisfed1         0.424     1.000   0.352     -0.259  0.326 -0.695  0.393
## stress1           0.298     0.352   1.000     -0.329  0.466 -0.723 -0.184
## depressed1       -0.208    -0.259  -0.329      1.000 -0.420  0.566  0.452
## mental            0.278     0.326   0.466     -0.420  1.000 -0.692 -0.280
## PC1              -0.609    -0.695  -0.723      0.566 -0.692  1.000 -0.037
## PC2               0.582     0.393  -0.184      0.452 -0.280 -0.037  1.000

#Let's use the PCA...
# Make the survey design object
options(survey.lonely.psu = "adjust")
brfss2017$pc1q<-cut(brfss2017$PC1,breaks = quantile(brfss2017$PC1,probs = c(0,.25,.5,.75,1),na.rm = T))
des<-svydesign(ids=~1, strata=~X_STRWT, weights=~X_LLCPWT, data =brfss2017)

#means of the variables in the index, by quantile of the index
svyby(~emosupport1+satisfed1+stress1+depressed1+mental,~pc1q,des,FUN=svymean,na.rm=T)

##                          pc1q emosupport1 satisfed1  stress1 depressed1
## (-6.59,-0.762] (-6.59,-0.762]    2.424310  2.093545 3.038387   1.457749
## (-0.762,0.364] (-0.762,0.364]    1.886781  1.668608 2.018032   1.887669
## (0.364,1.12]     (0.364,1.12]    1.329728  1.406063 1.537065   1.966399
## (1.12,1.59]       (1.12,1.59]    1.126395  1.000000 1.000000   2.000000
##                  mental se.emosupport1 se.satisfed1 se.stress1
## (-6.59,-0.762] 2.175146    0.024546899   0.01356647 0.02453786
## (-0.762,0.364] 1.346791    0.024550264   0.01083412 0.01711585
## (0.364,1.12]   1.087028    0.013507714   0.01112799 0.01551463
## (1.12,1.59]    1.000000    0.007397394   0.00000000 0.00000000
##                se.depressed1   se.mental
## (-6.59,-0.762]   0.012233803 0.017808419
## (-0.762,0.364]   0.006505139 0.011887093
## (0.364,1.12]     0.003420035 0.006876961
## (1.12,1.59]      0.000000000 0.000000000

#Analysis of variation in the mental health index across marital status and sexual orientation.  
#In the first graph being married shows the likelihood of improved health over being single.
#The second graph sexual orientation shows the likelihood of improved health for straight people over gay people.
#The third graph is an interaction of marital and sxorient.  The graph shows the likelihood of improved health for married gay people over single gay people.
#The fourth graph is an interaction of marital and sxorient.  The graph shows the likelihood of improved health for married straight people over married gay people.

ggplot(aes(x=marital,y=PC1),data = brfss2017)+geom_boxplot()

ggplot(aes(x=sxorient,y=PC1),data = brfss2017)+geom_boxplot()

ggplot(aes(x=int,y=PC1),data = brfss2017)+geom_boxplot()

ggplot(aes(x=int2,y=PC1),data = brfss2017)+geom_boxplot()

#Hypothesis testing.  We will examine variation in the PCA index across marital status and sexual orientation.  We see statistical significance for improved health in married people over single people.  We see statistical significance for decreased health in gay people over straight people.  We see statistical significance for decreased health amond women over men.  
fit.1<-svyglm(PC1~marital+sxorient+SEX,des,family=gaussian)
summary(fit.1)

## 
## Call:
## svyglm(formula = PC1 ~ marital + sxorient + SEX, design = des, 
##     family = gaussian)
## 
## Survey design:
## svydesign(ids = ~1, strata = ~X_STRWT, weights = ~X_LLCPWT, data = brfss2017)
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       -0.09326    0.07960  -1.172   0.2414    
## maritalb) married  0.82240    0.05676  14.490  < 2e-16 ***
## sxorient1         -0.51335    0.21737  -2.362   0.0182 *  
## SEX               -0.32026    0.04222  -7.586 3.53e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 2.251229)
## 
## Number of Fisher Scoring iterations: 2

Homework PCA

Steven Weeks

3/9/2019