PCA

#Load packages
library(car)

## Loading required package: carData

library(stargazer)

## 
## Please cite as:

##  Hlavac, Marek (2018). stargazer: Well-Formatted Regression and Summary Statistics Tables.

##  R package version 5.2.2. https://CRAN.R-project.org/package=stargazer

library(survey)

## Loading required package: grid

## Loading required package: Matrix

## Loading required package: survival

## 
## Attaching package: 'survey'

## The following object is masked from 'package:graphics':
## 
##     dotchart

library(ggplot2)
library(pander)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following object is masked from 'package:car':
## 
##     recode

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(knitr)
library(haven)

#Load BRFSS
BRFSS19<-read_xpt("C:/Users/adolp/Desktop/Statistics 2/BRFSS/LLCP2019.XPT")

#Fix BRFSS names
nams<-names(BRFSS19)
head(BRFSS19, n=10)

## # A tibble: 10 x 342
##    `_STATE` FMONTH IDATE   IMONTH IDAY  IYEAR DISPCODE SEQNO     `_PSU` CTELENM1
##       <dbl>  <dbl> <chr>   <chr>  <chr> <chr>    <dbl> <chr>      <dbl>    <dbl>
##  1        1      1 011820~ 01     18    2019      1100 201900~   2.02e9        1
##  2        1      1 011320~ 01     13    2019      1100 201900~   2.02e9        1
##  3        1      1 011820~ 01     18    2019      1100 201900~   2.02e9        1
##  4        1      1 011820~ 01     18    2019      1200 201900~   2.02e9        1
##  5        1      1 010420~ 01     04    2019      1100 201900~   2.02e9        1
##  6        1      1 011820~ 01     18    2019      1200 201900~   2.02e9        1
##  7        1      1 010420~ 01     04    2019      1100 201900~   2.02e9        1
##  8        1      1 012320~ 01     23    2019      1100 201900~   2.02e9        1
##  9        1      1 012420~ 01     24    2019      1100 201900~   2.02e9        1
## 10        1      1 011320~ 01     13    2019      1100 201900~   2.02e9        1
## # ... with 332 more variables: PVTRESD1 <dbl>, COLGHOUS <dbl>, STATERE1 <dbl>,
## #   CELPHONE <dbl>, LADULT1 <dbl>, COLGSEX <dbl>, NUMADULT <dbl>,
## #   LANDSEX <dbl>, NUMMEN <dbl>, NUMWOMEN <dbl>, RESPSLCT <dbl>,
## #   SAFETIME <dbl>, CTELNUM1 <dbl>, CELLFON5 <dbl>, CADULT1 <dbl>,
## #   CELLSEX <dbl>, PVTRESD3 <dbl>, CCLGHOUS <dbl>, CSTATE1 <dbl>,
## #   LANDLINE <dbl>, HHADULT <dbl>, SEXVAR <dbl>, GENHLTH <dbl>, PHYSHLTH <dbl>,
## #   MENTHLTH <dbl>, POORHLTH <dbl>, HLTHPLN1 <dbl>, PERSDOC2 <dbl>,
## #   MEDCOST <dbl>, CHECKUP1 <dbl>, BPHIGH4 <dbl>, BPMEDS <dbl>, CHOLCHK2 <dbl>,
## #   TOLDHI2 <dbl>, CHOLMED2 <dbl>, CVDINFR4 <dbl>, CVDCRHD4 <dbl>,
## #   CVDSTRK3 <dbl>, ASTHMA3 <dbl>, ASTHNOW <dbl>, CHCSCNCR <dbl>,
## #   CHCOCNCR <dbl>, CHCCOPD2 <dbl>, ADDEPEV3 <dbl>, CHCKDNY2 <dbl>,
## #   DIABETE4 <dbl>, DIABAGE3 <dbl>, HAVARTH4 <dbl>, ARTHEXER <dbl>,
## #   ARTHEDU <dbl>, LMTJOIN3 <dbl>, ARTHDIS2 <dbl>, JOINPAI2 <dbl>,
## #   MARITAL <dbl>, EDUCA <dbl>, RENTHOM1 <dbl>, NUMHHOL3 <dbl>, NUMPHON3 <dbl>,
## #   CPDEMO1B <dbl>, VETERAN3 <dbl>, EMPLOY1 <dbl>, CHILDREN <dbl>,
## #   INCOME2 <dbl>, WEIGHT2 <dbl>, HEIGHT3 <dbl>, PREGNANT <dbl>, DEAF <dbl>,
## #   BLIND <dbl>, DECIDE <dbl>, DIFFWALK <dbl>, DIFFDRES <dbl>, DIFFALON <dbl>,
## #   SMOKE100 <dbl>, SMOKDAY2 <dbl>, STOPSMK2 <dbl>, LASTSMK2 <dbl>,
## #   USENOW3 <dbl>, ALCDAY5 <dbl>, AVEDRNK3 <dbl>, DRNK3GE5 <dbl>,
## #   MAXDRNKS <dbl>, EXERANY2 <dbl>, EXRACT11 <dbl>, EXEROFT1 <dbl>,
## #   EXERHMM1 <dbl>, EXRACT21 <dbl>, EXEROFT2 <dbl>, EXERHMM2 <dbl>,
## #   STRENGTH <dbl>, FRUIT2 <dbl>, FRUITJU2 <dbl>, FVGREEN1 <dbl>,
## #   FRENCHF1 <dbl>, POTATOE1 <dbl>, VEGETAB2 <dbl>, FLUSHOT7 <dbl>,
## #   FLSHTMY3 <dbl>, TETANUS1 <dbl>, PNEUVAC4 <dbl>, HIVTST7 <dbl>, ...

newnames<-tolower(gsub(pattern = "_",replacement =  "",x =  nams))
names(BRFSS19)<-newnames

#Recode variables
BRFSS19$Hrt<-Recode(BRFSS19$cvdinfr4, recodes ="7:9=NA; 1=1;2=0")
BRFSS19$Strk<-Recode(BRFSS19$cvdstrk3, recodes ="7:9=NA; 1=1;2=0")
BRFSS19$Cncr<-Recode(BRFSS19$chcocncr, recodes ="7:9=NA; 1=1;2=0")
BRFSS19$Air<-Recode(BRFSS19$chccopd2, recodes ="7:9=NA; 1=1;2=0")
BRFSS19$Dep<-Recode(BRFSS19$addepev3, recodes ="7:9=NA; 1=1;2=0")
BRFSS19$Kdny<-Recode(BRFSS19$chckdny2, recodes ="7:9=NA; 1=1;2=0")
BRFSS19$Gen<-Recode(BRFSS19$birthsex, recodes = "7:9=NA; 1=1;2=0")
BRFSS19$Hispanic<-Recode(BRFSS19$racegr3, recodes="5=1; 9=NA; else=0")

#SUbset and standardize variables
BRFSS19b<-BRFSS19%>%
  filter(complete.cases(llcpwt, ststr, Gen, Hispanic,Hrt, Strk, Cncr, Air, Dep, Kdny))%>%
  select(llcpwt, ststr, Gen, Hispanic,Hrt, Strk, Cncr, Air, Dep, Kdny)%>%
  mutate_at(vars(Hrt, Strk, Cncr, Air, Dep, Kdny), scale)

#Summary statistics of scaled variables
summary(BRFSS19b)

##      llcpwt             ststr             Gen            Hispanic      
##  Min.   :    1.82   Min.   :151011   Min.   :0.0000   Min.   :0.00000  
##  1st Qu.:  113.17   1st Qu.:272049   1st Qu.:0.0000   1st Qu.:0.00000  
##  Median :  250.13   Median :361151   Median :0.0000   Median :0.00000  
##  Mean   :  544.26   Mean   :348506   Mean   :0.4694   Mean   :0.06503  
##  3rd Qu.:  586.04   3rd Qu.:491081   3rd Qu.:1.0000   3rd Qu.:0.00000  
##  Max.   :22131.79   Max.   :502109   Max.   :1.0000   Max.   :1.00000  
##        Hrt.V1              Strk.V1             Cncr.V1      
##  Min.   :-0.226653   Min.   :-0.195770   Min.   :-0.315724  
##  1st Qu.:-0.226653   1st Qu.:-0.195770   1st Qu.:-0.315724  
##  Median :-0.226653   Median :-0.195770   Median :-0.315724  
##  Mean   : 0.000000   Mean   : 0.000000   Mean   : 0.000000  
##  3rd Qu.:-0.226653   3rd Qu.:-0.195770   3rd Qu.:-0.315724  
##  Max.   : 4.411967   Max.   : 5.107943   Max.   : 3.167270  
##        Air.V1               Dep.V1              Kdny.V1      
##  Min.   :-0.267777   Min.   :-0.4860826   Min.   :-0.185114  
##  1st Qu.:-0.267777   1st Qu.:-0.4860826   1st Qu.:-0.185114  
##  Median :-0.267777   Median :-0.4860826   Median :-0.185114  
##  Mean   : 0.000000   Mean   : 0.0000000   Mean   : 0.000000  
##  3rd Qu.:-0.267777   3rd Qu.:-0.4860826   3rd Qu.:-0.185114  
##  Max.   : 3.734393   Max.   : 2.0572303   Max.   : 5.401997

#PCA
brfss.pc<-princomp(~Hrt+ Strk+ Cncr+ Air+ Dep+ Kdny,
                   data=BRFSS19b,
                   scores=T)

#Screeplot
screeplot(brfss.pc,
          type = "l",
          main = "Scree Plot")
abline(h=1)

Six chronic health variables were selected to create a chronic health index.

#Summary statistics of components
summary(brfss.pc)

## Importance of components:
##                           Comp.1    Comp.2    Comp.3    Comp.4    Comp.5
## Standard deviation     1.2045106 1.0114657 0.9871635 0.9535046 0.9225369
## Proportion of Variance 0.2418115 0.1705132 0.1624179 0.1515310 0.1418480
## Cumulative Proportion  0.2418115 0.4123247 0.5747427 0.7262736 0.8681216
##                           Comp.6
## Standard deviation     0.8895266
## Proportion of Variance 0.1318784
## Cumulative Proportion  1.0000000

The first component accounts for 24% of the total variation.

loadings(brfss.pc )

## 
## Loadings:
##      Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6
## Hrt   0.493  0.317  0.353         0.190  0.703
## Strk  0.469  0.253  0.419  0.141 -0.493 -0.527
## Cncr  0.317  0.180 -0.743  0.518 -0.193       
## Air   0.467 -0.372         0.178  0.694 -0.360
## Dep   0.267 -0.802               -0.449  0.283
## Kdny  0.384  0.146 -0.382 -0.822              
## 
##                Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6
## SS loadings     1.000  1.000  1.000  1.000  1.000  1.000
## Proportion Var  0.167  0.167  0.167  0.167  0.167  0.167
## Cumulative Var  0.167  0.333  0.500  0.667  0.833  1.000

#plot the first 2 components
scores<-data.frame(brfss.pc$scores)
hist(scores$Comp.1)

hist(scores$Comp.2)

cor(scores[,1:2])

##              Comp.1       Comp.2
## Comp.1 1.000000e+00 1.421011e-14
## Comp.2 1.421011e-14 1.000000e+00

BRFSS19c<-cbind(BRFSS19b, scores)
names(BRFSS19c)

##  [1] "llcpwt"   "ststr"    "Gen"      "Hispanic" "Hrt"      "Strk"    
##  [7] "Cncr"     "Air"      "Dep"      "Kdny"     "Comp.1"   "Comp.2"  
## [13] "Comp.3"   "Comp.4"   "Comp.5"   "Comp.6"

names(BRFSS19c)

##  [1] "llcpwt"   "ststr"    "Gen"      "Hispanic" "Hrt"      "Strk"    
##  [7] "Cncr"     "Air"      "Dep"      "Kdny"     "Comp.1"   "Comp.2"  
## [13] "Comp.3"   "Comp.4"   "Comp.5"   "Comp.6"

round(cor(BRFSS19c[,c(-1:-4, -11:-17)]), 3)

##        Hrt  Strk  Cncr   Air   Dep  Kdny
## Hrt  1.000 0.193 0.068 0.134 0.025 0.107
## Strk 0.193 1.000 0.066 0.106 0.050 0.085
## Cncr 0.068 0.066 1.000 0.084 0.019 0.088
## Air  0.134 0.106 0.084 1.000 0.132 0.087
## Dep  0.025 0.050 0.019 0.132 1.000 0.044
## Kdny 0.107 0.085 0.088 0.087 0.044 1.000

round(cor(BRFSS19c[,c(-1:-5, -13:-17)]),3)

##         Strk  Cncr    Air    Dep  Kdny Comp.1 Comp.2
## Strk   1.000 0.066  0.106  0.050 0.085  0.564  0.256
## Cncr   0.066 1.000  0.084  0.019 0.088  0.381  0.182
## Air    0.106 0.084  1.000  0.132 0.087  0.563 -0.376
## Dep    0.050 0.019  0.132  1.000 0.044  0.321 -0.811
## Kdny   0.085 0.088  0.087  0.044 1.000  0.463  0.148
## Comp.1 0.564 0.381  0.563  0.321 0.463  1.000  0.000
## Comp.2 0.256 0.182 -0.376 -0.811 0.148  0.000  1.000

#Survey design object
options(survey.lonely.psu = "adjust")
BRFSS19c$pc1q<-cut(BRFSS19c$Comp.1,
                    breaks = unique(quantile(BRFSS19c$Comp.1,
                                      probs = c(0,.25,.5,.75,1) ,
                                      na.rm=T), 
                    include.lowest=T))
des<-svydesign(ids=~1,
               strata=~ststr,
               weights=~llcpwt,
               data=BRFSS19c)

fit.1<-svyglm(Comp.1~Gen+Hispanic,
              des,
              family=gaussian)
summary(fit.1)

## 
## Call:
## svyglm(formula = Comp.1 ~ Gen + Hispanic, design = des, family = gaussian)
## 
## Survey design:
## svydesign(ids = ~1, strata = ~ststr, weights = ~llcpwt, data = BRFSS19c)
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.03645    0.01064  -3.424 0.000617 ***
## Gen         -0.06012    0.01536  -3.915 9.04e-05 ***
## Hispanic    -0.15000    0.02435  -6.160 7.31e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 1.265712)
## 
## Number of Fisher Scoring iterations: 2

library(ggplot2)
ggplot(aes(as.factor(Hispanic), y=Comp.1),
       data=BRFSS19c)+
  geom_boxplot()

library(ggplot2)
ggplot(aes(as.factor(Gen), y=Comp.1),
       data=BRFSS19c)+
  geom_boxplot()

Six chronic health variables were selected (heart attack, stroke, cancer, COPD, depression, and Asthma) to create a chronic health index. I hypothesized that combined, these six variables had an underlying latent construct of chronic health disease or a chronic health index. The first component had positive loadings, accounting for 24% of the total variation with higher values on the component equating to chronic health issues. Component 1 was then analyzed across two factors: Gender (male = 1, women = 0) and Hispanic (Hispanic = 1, Non-Hispanic = 0). Results showed that the more respondents identified as a male, the less likely of chronic health diseases they had. Also, the more respondents identified as a Hispanics, the less likely of chronic health diseases they had.

PCA

Adolph Delgado

3/27/2021