Principal Components For this homework, you are to use the technique of Principal Components Analysis (PCA) to perform a variable reduction of at least 5 variables. If you have an idea for latent construct, state what you believe this is. Report the summary statistics and correlation matrix for your data Report the results of the PCA, being sure to include the eigenvalues and corresponding vectors. Interpret your component(s) if possible If deemed appropriate, conduct some testing of your index/components/latent variables.
hardtoget<-haven::read_xpt("/Users/christacrumrine/Downloads/LLCP2020.XPT ")
names(hardtoget)<-tolower(gsub(pattern = "_", replacement = "",x=names(hardtoget)))
hardtogetb<-hardtoget%>%
filter(complete.cases(llcpwt, ststr, marst, stroke, hvdiab, drink, smoked))%>%
select(llcpwt, ststr, gender, race, marst, stroke, hvdiab, drink, IADLcare, ADLcare, smoked)%>%
mutate_at(vars( stroke, hvdiab, drink, IADLcare, ADLcare), scale)
hardtoget.pc <-PCA(hardtogetb[,6:11],
scale.unit = T,
graph = F)
## Warning in PCA(hardtogetb[, 6:11], scale.unit = T, graph = F): Missing values
## are imputed by the mean of the variable: you should use the imputePCA function
## of the missMDA package
#Report the results of the PCA, being sure to include the eigenvalues and corresponding vectors. Interpret your component(s) if possible
eigenvalues <- hardtoget.pc$eig
head(eigenvalues[, 1:2])
## eigenvalue percentage of variance
## comp 1 1.3020997 21.70166
## comp 2 1.1083792 18.47299
## comp 3 1.0000000 16.66667
## comp 4 0.9570000 15.95000
## comp 5 0.9341063 15.56844
## comp 6 0.6984148 11.64025
fviz_screeplot(hardtoget.pc, ncp=10)
hardtoget.pc$var
## $coord
## Dim.1 Dim.2 Dim.3 Dim.4 Dim.5
## stroke -2.355482e-02 6.423210e-01 1.155517e-13 -1.919114e-01 7.416172e-01
## hvdiab 4.871519e-02 5.654530e-01 -1.706460e-13 7.954993e-01 -2.109791e-01
## drink 5.849169e-03 -6.129162e-01 -5.765369e-14 5.339944e-01 5.823545e-01
## IADLcare 8.055611e-01 -1.851281e-02 1.123509e-14 -4.526759e-02 1.346326e-02
## ADLcare 8.063553e-01 7.542398e-03 9.415966e-15 -1.231586e-02 1.673553e-02
## smoked -5.271278e-15 -1.292833e-14 1.000000e+00 1.893347e-13 -8.842888e-14
##
## $cor
## Dim.1 Dim.2 Dim.3 Dim.4 Dim.5
## stroke -2.355482e-02 6.423210e-01 1.155517e-13 -1.919114e-01 7.416172e-01
## hvdiab 4.871519e-02 5.654530e-01 -1.706460e-13 7.954993e-01 -2.109791e-01
## drink 5.849169e-03 -6.129162e-01 -5.765369e-14 5.339944e-01 5.823545e-01
## IADLcare 8.055611e-01 -1.851281e-02 1.123509e-14 -4.526759e-02 1.346326e-02
## ADLcare 8.063553e-01 7.542398e-03 9.415966e-15 -1.231586e-02 1.673553e-02
## smoked -5.271278e-15 -1.292833e-14 1.000000e+00 1.893347e-13 -8.842888e-14
##
## $cos2
## Dim.1 Dim.2 Dim.3 Dim.4 Dim.5
## stroke 5.548297e-04 4.125763e-01 1.335219e-26 3.683000e-02 5.499960e-01
## hvdiab 2.373170e-03 3.197371e-01 2.912005e-26 6.328191e-01 4.451217e-02
## drink 3.421278e-05 3.756663e-01 3.323949e-27 2.851501e-01 3.391368e-01
## IADLcare 6.489287e-01 3.427240e-04 1.262272e-28 2.049155e-03 1.812593e-04
## ADLcare 6.502088e-01 5.688776e-05 8.866042e-29 1.516803e-04 2.800779e-04
## smoked 2.778637e-29 1.671416e-28 1.000000e+00 3.584763e-26 7.819666e-27
##
## $contrib
## Dim.1 Dim.2 Dim.3 Dim.4 Dim.5
## stroke 4.261038e-02 3.722338e+01 1.335219e-24 3.848485e+00 5.887938e+01
## hvdiab 1.822572e-01 2.884726e+01 2.912005e-24 6.612530e+01 4.765214e+00
## drink 2.627509e-03 3.389330e+01 3.323949e-25 2.979624e+01 3.630602e+01
## IADLcare 4.983710e+01 3.092118e-02 1.262272e-26 2.141227e-01 1.940457e-02
## ADLcare 4.993541e+01 5.132518e-03 8.866042e-27 1.584956e-02 2.998351e-02
## smoked 2.133966e-27 1.507982e-26 1.000000e+02 3.745834e-24 8.371280e-25
According to the results I would remove smoked and stroke because of the negative results. This is telling me that there is no relationship between those and the other variables.
fviz_pca_var(hardtoget.pc,
col.var= "contrib")+
theme_minimal()
fviz_pca_ind(hardtoget.pc,
label= "none",
col.ind= "cos2") +
scale_color_gradient2(low= "blue",
mid= "white",
high= "red",
midpoint = .5) +
theme_minimal()
desc<- dimdesc(hardtoget.pc)
desc$Dim.1
## $quanti
## correlation p.value
## ADLcare 0.80635526 0.000000e+00
## IADLcare 0.80556110 0.000000e+00
## hvdiab 0.04871519 4.599201e-16
## stroke -0.02355482 8.665847e-05
##
## attr(,"class")
## [1] "condes" "list"
desc$Dim.2
## $quanti
## correlation p.value
## stroke 0.64232100 0.0000000
## hvdiab 0.56545299 0.0000000
## IADLcare -0.01851281 0.0020367
## drink -0.61291620 0.0000000
##
## attr(,"class")
## [1] "condes" "list"
hardtogetb$pc1 <- hardtoget.pc$ind$coord[, 1]
options(survey.lonely.psu = "adjust")
des<- svydesign( ids= ~1,
strata = ~ststr,
weights= ~llcpwt,
data= hardtogetb)
library(ggplot2)
ggplot(aes(x=race, y=pc1, group=race),
data=hardtogetb) +
geom_boxplot()
ggplot(aes(x=gender, y=pc1, group=gender),
data=hardtogetb) +
geom_boxplot()
ggplot(aes(x=marst, y=pc1),
data=hardtogetb) +
geom_boxplot()
#If deemed appropriate, conduct some testing of your index/components/latent variables.
fit.1 <- svyglm(pc1~ marst + race,
des,
family= gaussian)
summary(fit.1)
##
## Call:
## svyglm(formula = pc1 ~ marst + race, design = des, family = gaussian)
##
## Survey design:
## svydesign(ids = ~1, strata = ~ststr, weights = ~llcpwt, data = hardtogetb)
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.52637 0.23877 2.205 0.0275 *
## marstdivorced 0.02652 0.15055 0.176 0.8602
## marstmarried 0.01196 0.16326 0.073 0.9416
## marstnm -0.05977 0.17389 -0.344 0.7311
## marstseparated -0.08378 0.23136 -0.362 0.7173
## marstwidowed -1.21718 0.82101 -1.483 0.1383
## race77 -0.52421 0.20946 -2.503 0.0124 *
## race99 -0.24079 0.29157 -0.826 0.4089
## raceblack -0.25434 0.22989 -1.106 0.2686
## racenhwhite -0.41058 0.22278 -1.843 0.0654 .
## raceother -0.49809 0.31932 -1.560 0.1189
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 2.934168)
##
## Number of Fisher Scoring iterations: 2