Principle Component Analysis

library(car)
## Loading required package: carData
library(stargazer)
## 
## Please cite as:
##  Hlavac, Marek (2018). stargazer: Well-Formatted Regression and Summary Statistics Tables.
##  R package version 5.2.2. https://CRAN.R-project.org/package=stargazer
library(survey)
## Loading required package: grid
## Loading required package: Matrix
## Loading required package: survival
## 
## Attaching package: 'survey'
## The following object is masked from 'package:graphics':
## 
##     dotchart
library(ggplot2)
library(pander)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
## 
##     recode
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(knitr)
library(foreign)

Latent Construct

# High socioeconomic Status will be the latent construct used in this assignment. The variables used will be education(college), sex(male), race(white), marital status(married), and poverty status.
DEMO_J <- read.xport("C:/Users/maman/OneDrive/DEM Spring 2021/DEM 7283 Stats 2/NHANES data/DEMO_J.XPT", 
    NULL)
ALQ_J <-  read.xport("C:/Users/maman/OneDrive/DEM Spring 2021/DEM 7283 Stats 2/NHANES data/ALQ_J.XPT", 
    NULL)
BMX_J <-  read.xport("C:/Users/maman/OneDrive/DEM Spring 2021/DEM 7283 Stats 2/NHANES data/BMX_J.XPT", 
    NULL)
DIQ_J <-  read.xport("C:/Users/maman/OneDrive/DEM Spring 2021/DEM 7283 Stats 2/NHANES data/DIQ_J.XPT", 
    NULL)
SMQ_J <-  read.xport("C:/Users/maman/OneDrive/DEM Spring 2021/DEM 7283 Stats 2/NHANES data/SMQ_J.XPT", 
    NULL)
Master1 <- merge(DEMO_J, ALQ_J, all.x=TRUE)
Master2 <- merge(Master1, BMX_J, all.x=TRUE)
Master3 <- merge(Master2, DIQ_J, all.x=TRUE)
Master4 <- merge(Master3, SMQ_J, all.x=TRUE)

Recoding, filtering, and cleaning up the data

nams<-names(Master4)
head(nams, n=10)
##  [1] "SEQN"     "SDDSRVYR" "RIDSTATR" "RIAGENDR" "RIDAGEYR" "RIDAGEMN"
##  [7] "RIDRETH1" "RIDRETH3" "RIDEXMON" "RIDEXAGM"
newnames<-tolower(gsub(pattern = "_",replacement =  "",x =  nams))
names(Master4)<-newnames
#sex
Master4$sex<-as.factor(ifelse(Master4$riagendr==1,
                                "Male",
                                "Female"))
Master4$male<- Recode(Master4$riagendr, recodes= "1=1; else=0", as.numeric=T)
#race/ethnicity
Master4$hispanic <- Recode(Master4$ridreth3, recodes = "1=1; 2=1; else=0")
Master4$black<-Recode(Master4$ridreth3, recodes="4=1; else=0")
Master4$white<-Recode(Master4$ridreth3, recodes="3=1; else=0", as.numeric=T)
Master4$asian<-Recode(Master4$ridreth3, recodes="6=1; else=0")
Master4$other<-Recode(Master4$ridreth3, recodes="7=1; else=0")

Master4$race_eth<- Recode(Master4$ridreth3, recodes= "3= '1 white'; 1:2='2 hispanic'; 4= '3 black'; 6= '4 asian'; 7= '5 other' ", as.factor=T )

# at or below poverty threshold using income to poverty ratio
Master4$pov<-ifelse(Master4$indfmpir <= 1, "at or below poverty", 
                      ifelse(Master4$indfmpir > 1, "above poverty", NA))
Master4$pov2<- Recode(Master4$pov, recodes= "'at or below poverty'=1; 'above poverty'=0", as.numeric=T)

# education
Master4$educ <- Recode(Master4$dmdeduc2, recodes="1:2='1 lths'; 3='2 hs'; 4='3 some col'; 5='4 col grad'; 7:9=NA; else=NA", as.factor=T)
Master4$college <- Recode(Master4$dmdeduc2, recodes="1:4=0; 5=1; else=NA", as.numeric=T)

# marital status
Master4$marst <- Recode(Master4$dmdmartl, recodes="1='1 married'; 2='2 widowed'; 3='3 divorced'; 4='4 separated'; 5='5 nm'; 6='6 cohab'; else=NA", as.factor=T)
Master4$married <- Recode(Master4$dmdmartl, recodes="1=1; 2:6=0; else=NA", as.numeric=T)
# age
Master4$agec<-cut(Master4$ridageyr,
                   breaks=c(0,17,65,80))

Analysis

pca<-Master4%>%
  filter(complete.cases(wtint2yr, sdmvpsu, sdmvstra, ridageyr, agec, race_eth, educ, college, marst, pov, pov2, sex, hispanic, black, white, asian, other, male))%>%
  select(wtint2yr, sdmvpsu, sdmvstra, male, college, pov2, married, white)%>%
  mutate_at(vars(male, college, pov2, married, white), scale)

Eigenvalues

nhanes.pc<-princomp(~male+college+pov2+married+white,
                   data=pca,
                   scores=T)

#Screeplot
screeplot(nhanes.pc,
          type = "l",
          main = "Scree Plot")
abline(h=1)

#Request some basic summaries of the PCA analysis option(digits=3)
summary(nhanes.pc)
## Importance of components:
##                           Comp.1    Comp.2    Comp.3    Comp.4    Comp.5
## Standard deviation     1.1652188 1.0215507 0.9965926 0.9166050 0.8742372
## Proportion of Variance 0.2716038 0.2087569 0.1986810 0.1680681 0.1528901
## Cumulative Proportion  0.2716038 0.4803607 0.6790417 0.8471099 1.0000000

Examine eignevectors/loading vectors

loadings(nhanes.pc )
## 
## Loadings:
##         Comp.1 Comp.2 Comp.3 Comp.4 Comp.5
## male     0.197  0.594  0.667  0.400       
## college  0.507 -0.487         0.463  0.538
## pov2    -0.615         0.199 -0.160  0.746
## married  0.547         0.226 -0.772  0.224
## white    0.163  0.638 -0.680         0.317
## 
##                Comp.1 Comp.2 Comp.3 Comp.4 Comp.5
## SS loadings       1.0    1.0    1.0    1.0    1.0
## Proportion Var    0.2    0.2    0.2    0.2    0.2
## Cumulative Var    0.2    0.4    0.6    0.8    1.0
#then I plot the first 2 components

scores<-data.frame(nhanes.pc$scores)
hist(scores$Comp.1)

hist(scores$Comp.2)

cor(scores[,1:2])
##               Comp.1        Comp.2
## Comp.1  1.000000e+00 -1.150849e-14
## Comp.2 -1.150849e-14  1.000000e+00
cat<-cbind(pca, scores)
names(cat)
##  [1] "wtint2yr" "sdmvpsu"  "sdmvstra" "male"     "college"  "pov2"    
##  [7] "married"  "white"    "Comp.1"   "Comp.2"   "Comp.3"   "Comp.4"  
## [13] "Comp.5"
round(cor(cat[,c(-1:-5, -17:-27)]), 3)
##           pov2 married  white Comp.1 Comp.2 Comp.3 Comp.4 Comp.5
## pov2     1.000  -0.180 -0.089 -0.717  0.014  0.198 -0.147  0.652
## married -0.180   1.000  0.025  0.637  0.064  0.225 -0.708  0.196
## white   -0.089   0.025  1.000  0.190  0.652 -0.678  0.055  0.277
## Comp.1  -0.717   0.637  0.190  1.000  0.000  0.000  0.000  0.000
## Comp.2   0.014   0.064  0.652  0.000  1.000  0.000  0.000  0.000
## Comp.3   0.198   0.225 -0.678  0.000  0.000  1.000  0.000  0.000
## Comp.4  -0.147  -0.708  0.055  0.000  0.000  0.000  1.000  0.000
## Comp.5   0.652   0.196  0.277  0.000  0.000  0.000  0.000  1.000

Interpretation

# There are three eigenvalues of at least 1, suggesting that 3 real components exist among these variables. The first 3 components account for 68% of the variation in the input variables. All of the eigenvectors in comp1 are positive except for pov2. This may be because pov2 is coded as "1" being under the poverty line, the sign would be positive if coded differently. Since comp.1 is positively associated with being married and white, while negatively associated with being below the poverty rate, higher comp.1 scores have higher socioeconomic status.