library(car)
## Loading required package: carData
library(stargazer)
##
## Please cite as:
## Hlavac, Marek (2018). stargazer: Well-Formatted Regression and Summary Statistics Tables.
## R package version 5.2.2. https://CRAN.R-project.org/package=stargazer
library(survey)
## Loading required package: grid
## Loading required package: Matrix
## Loading required package: survival
##
## Attaching package: 'survey'
## The following object is masked from 'package:graphics':
##
## dotchart
library(ggplot2)
library(pander)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
##
## recode
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(knitr)
library(foreign)
Latent Construct
# High socioeconomic Status will be the latent construct used in this assignment. The variables used will be education(college), sex(male), race(white), marital status(married), and poverty status.
DEMO_J <- read.xport("C:/Users/maman/OneDrive/DEM Spring 2021/DEM 7283 Stats 2/NHANES data/DEMO_J.XPT",
NULL)
ALQ_J <- read.xport("C:/Users/maman/OneDrive/DEM Spring 2021/DEM 7283 Stats 2/NHANES data/ALQ_J.XPT",
NULL)
BMX_J <- read.xport("C:/Users/maman/OneDrive/DEM Spring 2021/DEM 7283 Stats 2/NHANES data/BMX_J.XPT",
NULL)
DIQ_J <- read.xport("C:/Users/maman/OneDrive/DEM Spring 2021/DEM 7283 Stats 2/NHANES data/DIQ_J.XPT",
NULL)
SMQ_J <- read.xport("C:/Users/maman/OneDrive/DEM Spring 2021/DEM 7283 Stats 2/NHANES data/SMQ_J.XPT",
NULL)
Master1 <- merge(DEMO_J, ALQ_J, all.x=TRUE)
Master2 <- merge(Master1, BMX_J, all.x=TRUE)
Master3 <- merge(Master2, DIQ_J, all.x=TRUE)
Master4 <- merge(Master3, SMQ_J, all.x=TRUE)
Recoding, filtering, and cleaning up the data
nams<-names(Master4)
head(nams, n=10)
## [1] "SEQN" "SDDSRVYR" "RIDSTATR" "RIAGENDR" "RIDAGEYR" "RIDAGEMN"
## [7] "RIDRETH1" "RIDRETH3" "RIDEXMON" "RIDEXAGM"
newnames<-tolower(gsub(pattern = "_",replacement = "",x = nams))
names(Master4)<-newnames
#sex
Master4$sex<-as.factor(ifelse(Master4$riagendr==1,
"Male",
"Female"))
Master4$male<- Recode(Master4$riagendr, recodes= "1=1; else=0", as.numeric=T)
#race/ethnicity
Master4$hispanic <- Recode(Master4$ridreth3, recodes = "1=1; 2=1; else=0")
Master4$black<-Recode(Master4$ridreth3, recodes="4=1; else=0")
Master4$white<-Recode(Master4$ridreth3, recodes="3=1; else=0", as.numeric=T)
Master4$asian<-Recode(Master4$ridreth3, recodes="6=1; else=0")
Master4$other<-Recode(Master4$ridreth3, recodes="7=1; else=0")
Master4$race_eth<- Recode(Master4$ridreth3, recodes= "3= '1 white'; 1:2='2 hispanic'; 4= '3 black'; 6= '4 asian'; 7= '5 other' ", as.factor=T )
# at or below poverty threshold using income to poverty ratio
Master4$pov<-ifelse(Master4$indfmpir <= 1, "at or below poverty",
ifelse(Master4$indfmpir > 1, "above poverty", NA))
Master4$pov2<- Recode(Master4$pov, recodes= "'at or below poverty'=1; 'above poverty'=0", as.numeric=T)
# education
Master4$educ <- Recode(Master4$dmdeduc2, recodes="1:2='1 lths'; 3='2 hs'; 4='3 some col'; 5='4 col grad'; 7:9=NA; else=NA", as.factor=T)
Master4$college <- Recode(Master4$dmdeduc2, recodes="1:4=0; 5=1; else=NA", as.numeric=T)
# marital status
Master4$marst <- Recode(Master4$dmdmartl, recodes="1='1 married'; 2='2 widowed'; 3='3 divorced'; 4='4 separated'; 5='5 nm'; 6='6 cohab'; else=NA", as.factor=T)
Master4$married <- Recode(Master4$dmdmartl, recodes="1=1; 2:6=0; else=NA", as.numeric=T)
# age
Master4$agec<-cut(Master4$ridageyr,
breaks=c(0,17,65,80))
Analysis
pca<-Master4%>%
filter(complete.cases(wtint2yr, sdmvpsu, sdmvstra, ridageyr, agec, race_eth, educ, college, marst, pov, pov2, sex, hispanic, black, white, asian, other, male))%>%
select(wtint2yr, sdmvpsu, sdmvstra, male, college, pov2, married, white)%>%
mutate_at(vars(male, college, pov2, married, white), scale)
Eigenvalues
nhanes.pc<-princomp(~male+college+pov2+married+white,
data=pca,
scores=T)
#Screeplot
screeplot(nhanes.pc,
type = "l",
main = "Scree Plot")
abline(h=1)
#Request some basic summaries of the PCA analysis option(digits=3)
summary(nhanes.pc)
## Importance of components:
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5
## Standard deviation 1.1652188 1.0215507 0.9965926 0.9166050 0.8742372
## Proportion of Variance 0.2716038 0.2087569 0.1986810 0.1680681 0.1528901
## Cumulative Proportion 0.2716038 0.4803607 0.6790417 0.8471099 1.0000000
Examine eignevectors/loading vectors
loadings(nhanes.pc )
##
## Loadings:
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5
## male 0.197 0.594 0.667 0.400
## college 0.507 -0.487 0.463 0.538
## pov2 -0.615 0.199 -0.160 0.746
## married 0.547 0.226 -0.772 0.224
## white 0.163 0.638 -0.680 0.317
##
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5
## SS loadings 1.0 1.0 1.0 1.0 1.0
## Proportion Var 0.2 0.2 0.2 0.2 0.2
## Cumulative Var 0.2 0.4 0.6 0.8 1.0
#then I plot the first 2 components
scores<-data.frame(nhanes.pc$scores)
hist(scores$Comp.1)
hist(scores$Comp.2)
cor(scores[,1:2])
## Comp.1 Comp.2
## Comp.1 1.000000e+00 -1.150849e-14
## Comp.2 -1.150849e-14 1.000000e+00
cat<-cbind(pca, scores)
names(cat)
## [1] "wtint2yr" "sdmvpsu" "sdmvstra" "male" "college" "pov2"
## [7] "married" "white" "Comp.1" "Comp.2" "Comp.3" "Comp.4"
## [13] "Comp.5"
round(cor(cat[,c(-1:-5, -17:-27)]), 3)
## pov2 married white Comp.1 Comp.2 Comp.3 Comp.4 Comp.5
## pov2 1.000 -0.180 -0.089 -0.717 0.014 0.198 -0.147 0.652
## married -0.180 1.000 0.025 0.637 0.064 0.225 -0.708 0.196
## white -0.089 0.025 1.000 0.190 0.652 -0.678 0.055 0.277
## Comp.1 -0.717 0.637 0.190 1.000 0.000 0.000 0.000 0.000
## Comp.2 0.014 0.064 0.652 0.000 1.000 0.000 0.000 0.000
## Comp.3 0.198 0.225 -0.678 0.000 0.000 1.000 0.000 0.000
## Comp.4 -0.147 -0.708 0.055 0.000 0.000 0.000 1.000 0.000
## Comp.5 0.652 0.196 0.277 0.000 0.000 0.000 0.000 1.000
Interpretation
# There are three eigenvalues of at least 1, suggesting that 3 real components exist among these variables. The first 3 components account for 68% of the variation in the input variables. All of the eigenvectors in comp1 are positive except for pov2. This may be because pov2 is coded as "1" being under the poverty line, the sign would be positive if coded differently. Since comp.1 is positively associated with being married and white, while negatively associated with being below the poverty rate, higher comp.1 scores have higher socioeconomic status.