library(car)
library(stargazer)
library(survey)
library(ggplot2)
library(pander)
library(dplyr)
library(knitr)
library(readr)
acs <- read_csv("C:/Users/drayr/OneDrive/Desktop/DEM Spring 2021/Stats II 7183/HW/HW7/usa_00006.csv")
##
## -- Column specification --------------------------------------------------------
## cols(
## YEAR = col_double(),
## CLUSTER = col_double(),
## STRATA = col_double(),
## PERWT = col_double(),
## EDUC = col_double(),
## EDUCD = col_double(),
## EMPSTAT = col_double(),
## EMPSTATD = col_double(),
## INCWAGE = col_double(),
## HWSEI = col_double(),
## PRESGL = col_double(),
## NPBOSS90 = col_double(),
## VETSTAT = col_double(),
## VETSTATD = col_double()
## )
#The names in the data are very ugly, so I make them less ugly
nams<-names(acs)
head(nams, n=10)
## [1] "YEAR" "CLUSTER" "STRATA" "PERWT" "EDUC" "EDUCD"
## [7] "EMPSTAT" "EMPSTATD" "INCWAGE" "HWSEI"
#we see some names are lower case, some are upper and some have a little _ in the first position. This is a nightmare.
newnames<-tolower(gsub(pattern = "_",replacement = "",x = nams))
names(acs)<-newnames
summary(acs)
## year cluster strata perwt
## Min. :2019 Min. :2.019e+12 Min. : 10001 Min. : 1.0
## 1st Qu.:2019 1st Qu.:2.019e+12 1st Qu.: 90001 1st Qu.: 49.0
## Median :2019 Median :2.019e+12 Median : 220013 Median : 77.0
## Mean :2019 Mean :2.019e+12 Mean : 463023 Mean : 101.3
## 3rd Qu.:2019 3rd Qu.:2.019e+12 3rd Qu.: 410336 3rd Qu.: 121.0
## Max. :2019 Max. :2.019e+12 Max. :7030155 Max. :2585.0
## educ educd empstat empstatd
## Min. : 0.000 Min. : 1.00 Min. :0.000 Min. : 0.00
## 1st Qu.: 5.000 1st Qu.: 50.00 1st Qu.:1.000 1st Qu.:10.00
## Median : 6.000 Median : 64.00 Median :1.000 Median :10.00
## Mean : 6.267 Mean : 65.11 Mean :1.512 Mean :15.15
## 3rd Qu.:10.000 3rd Qu.:101.00 3rd Qu.:3.000 3rd Qu.:30.00
## Max. :11.000 Max. :116.00 Max. :3.000 Max. :30.00
## incwage hwsei presgl npboss90
## Min. : 0 Min. : 0.00 Min. : 0.00 Min. : 5.0
## 1st Qu.: 0 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 40.0
## Median : 24000 Median :21.90 Median :26.40 Median : 82.4
## Mean :200329 Mean :22.30 Mean :24.48 Mean :443.6
## 3rd Qu.: 90000 3rd Qu.:38.51 3rd Qu.:47.80 3rd Qu.:999.9
## Max. :999999 Max. :80.53 Max. :81.50 Max. :999.9
## vetstat vetstatd
## Min. :0.0000 Min. : 0.000
## 1st Qu.:1.0000 1st Qu.:11.000
## Median :1.0000 Median :11.000
## Mean :0.8774 Mean : 9.551
## 3rd Qu.:1.0000 3rd Qu.:11.000
## Max. :2.0000 Max. :20.000
library(dplyr)
ds2<-acs%>%
mutate(
edattain=Recode(educ,
recodes="0:2='1'; 3:6='2'; 7:10='3'; 11='4';else=NA"),
emp = Recode(empstat,
recodes="2 ='0'; 1 ='1'; else = NA"),
vet = Recode(vetstat,
recodes="2 ='1'; 1 ='0'; else = NA")
)
ds3<-ds2%>%
filter(complete.cases(edattain,emp, vet, incwage,hwsei,presgl, npboss90, cluster,strata, perwt))%>%
select(edattain,emp, vet, incwage,hwsei,presgl, npboss90, cluster,strata, perwt )%>%
mutate_at(vars(edattain,emp, vet, incwage,hwsei,presgl, npboss90), scale)
#First we tell R our survey design
#options(survey.lonely.psu = "adjust")
#des<-svydesign(ids=~1, weights=~wtfinl, data =sub )
Variables considered for this PCA are Educational attainment, employment status, veteran status, income, socioeconomic index, occupational prestige score, occupational status score. I believe this can be a latent construct of human capital/occupational prestige.
cps.pc<-princomp(~edattain+emp+vet+incwage+hwsei+presgl+npboss90,
data=ds3,
scores=T)
#Screeplot
screeplot(cps.pc,
type = "l",
main = "Scree Plot")
abline(h=1)
Summary of eignevalues and variance explained
#Request some basic summaries of the PCA analysis option(digits=3)
summary(cps.pc)
## Importance of components:
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5
## Standard deviation 1.5958420 1.1178453 1.0005826 0.9194472 0.83074870
## Proportion of Variance 0.3638162 0.1785113 0.1430238 0.1207691 0.09859198
## Cumulative Proportion 0.3638162 0.5423275 0.6853512 0.8061203 0.90471228
## Comp.6 Comp.7
## Standard deviation 0.7267983 0.37252905
## Proportion of Variance 0.0754623 0.01982543
## Cumulative Proportion 0.9801746 1.00000000
The first 3 components account for 67% of the variation in the input variables. Also, we see 3 eigenvalues of at least 1, which suggests there are 4 real components among these variables (remember, we are looking for eigenvalues > 1 when using z-scored data).
Examine eignevectors/loading vectors
loadings(cps.pc )
##
## Loadings:
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6 Comp.7
## edattain 0.450 0.252 0.338 0.785
## emp 0.172 -0.608 -0.568 0.516 -0.110
## vet -0.995
## incwage 0.355 0.137 -0.644 -0.656
## hwsei 0.572 0.228 -0.301 -0.727
## presgl 0.556 0.288 -0.372 0.682
## npboss90 0.739 -0.348 0.431 -0.376
##
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6 Comp.7
## SS loadings 1.000 1.000 1.000 1.000 1.000 1.000 1.000
## Proportion Var 0.143 0.143 0.143 0.143 0.143 0.143 0.143
## Cumulative Var 0.143 0.286 0.429 0.571 0.714 0.857 1.000
In terms of the loadings for PC1, we see positive associations with everything except veteran status and npboss90. We may interpret this component as an index for occupational prestige, since all occupation variables are loading in the same direction.
Of course, interpreting components is more art than science…
#then I plot the first 2 components
scores<-data.frame(cps.pc$scores)
hist(scores$Comp.1)
hist(scores$Comp.2)
Next, I calculate the correlation between the first 2 components to show they are orthogonal, i.e. correlation == 0
cor(scores[,1:2])
## Comp.1 Comp.2
## Comp.1 1.000000e+00 3.815582e-12
## Comp.2 3.815582e-12 1.000000e+00
ds3c<-cbind(ds3, scores)
names(ds3c)
## [1] "edattain" "emp" "vet" "incwage" "hwsei" "presgl"
## [7] "npboss90" "cluster" "strata" "perwt" "Comp.1" "Comp.2"
## [13] "Comp.3" "Comp.4" "Comp.5" "Comp.6" "Comp.7"
Here we examine correlation among our occupational prestige variables
names(ds3c)
## [1] "edattain" "emp" "vet" "incwage" "hwsei" "presgl"
## [7] "npboss90" "cluster" "strata" "perwt" "Comp.1" "Comp.2"
## [13] "Comp.3" "Comp.4" "Comp.5" "Comp.6" "Comp.7"
round(cor(ds3c[,c( -8:-17)]), 3)
## edattain emp vet incwage hwsei presgl npboss90
## edattain 1.000 0.070 0.011 0.306 0.545 0.485 0.090
## emp 0.070 1.000 0.006 0.125 0.147 0.159 -0.250
## vet 0.011 0.006 1.000 0.029 0.018 0.007 -0.012
## incwage 0.306 0.125 0.029 1.000 0.371 0.325 0.047
## hwsei 0.545 0.147 0.018 0.371 1.000 0.856 -0.080
## presgl 0.485 0.159 0.007 0.325 0.856 1.000 -0.141
## npboss90 0.090 -0.250 -0.012 0.047 -0.080 -0.141 1.000
Sometimes it’s easier to look at the correlations among the original variables and the first 3 components
round(cor(ds3c[,c(-8:-10, -14:-17)]),3)
## edattain emp vet incwage hwsei presgl npboss90 Comp.1 Comp.2
## edattain 1.000 0.070 0.011 0.306 0.545 0.485 0.090 0.718 0.282
## emp 0.070 1.000 0.006 0.125 0.147 0.159 -0.250 0.275 -0.679
## vet 0.011 0.006 1.000 0.029 0.018 0.007 -0.012 0.032 -0.024
## incwage 0.306 0.125 0.029 1.000 0.371 0.325 0.047 0.567 0.153
## hwsei 0.545 0.147 0.018 0.371 1.000 0.856 -0.080 0.912 0.032
## presgl 0.485 0.159 0.007 0.325 0.856 1.000 -0.141 0.888 -0.042
## npboss90 0.090 -0.250 -0.012 0.047 -0.080 -0.141 1.000 -0.114 0.826
## Comp.1 0.718 0.275 0.032 0.567 0.912 0.888 -0.114 1.000 0.000
## Comp.2 0.282 -0.679 -0.024 0.153 0.032 -0.042 0.826 0.000 1.000
## Comp.3 0.015 0.002 -0.996 -0.080 0.027 0.045 -0.016 0.000 0.000
## Comp.3
## edattain 0.015
## emp 0.002
## vet -0.996
## incwage -0.080
## hwsei 0.027
## presgl 0.045
## npboss90 -0.016
## Comp.1 0.000
## Comp.2 0.000
## Comp.3 1.000
This allows us to see the correlations among the original occupational prestige variables and the new components. This is important for interpreting the direction of the component. In this case, the PC1 suggests that higher PC1 scores have better occupational prestige, because PC1 has positive correlations with all of the occupational prestige variables except npboss90.