library(ipumsr)
ddi <- read_ipums_ddi("nhis_00002.xml")
data <- read_ipums_micro(ddi)
## Use of data from IPUMS NHIS is subject to conditions including that users
## should cite the data appropriately. Use command `ipums_conditions()` for more
## details.
data<- haven::zap_labels(data)
names(data) <- tolower(gsub(pattern = "_",replacement =  "",x =  names(data)))

For this homework, you are to use the technique of Principal Components Analysis (PCA) to perform a variable reduction of at least 5 variables.

To get Adverse health conditions will use: Bad health, Smoking, Anxiety disorder, Medication for worry, Medication for depression, Medical Care cost

data <- data%>%
filter(age >=16 & age<=24)

data2 <- data%>%
filter(complete.cases(sampweight, strata, badhealth, medical_care_cost, healthinsurace_coverage,smoke_100cig, anxiety_disoreder, medication_for_worry,medication_for_depression, educ,opportunity_youth_cat_num, opportunity_youth_cat, urban_rural, familyincome,poverty2, sex ))%>%
  
  select(sampweight, strata, badhealth, medical_care_cost,smoke_100cig, anxiety_disoreder, medication_for_worry,medication_for_depression, healthinsurace_coverage, opportunity_youth_cat_num, opportunity_youth_cat, urban_rural, familyincome,poverty2, sex, educ, race_eth) %>%
 mutate_at(vars(badhealth, smoke_100cig, anxiety_disoreder, medication_for_worry, medication_for_depression, medical_care_cost, healthinsurace_coverage), scale)
data.pc <- PCA(data2 [,3:8],
               scale.unit =  T,
               graph= F
               )

#Report the results of the PCA, being sure to include the eigenvalues and corresponding vectors. Interpret your component(s) if possible

The first eigen value is greater than 1 which shows that the combination is also and it accounts for about 38% of the variation

eigenvalues <- data.pc$eig
head(eigenvalues[, 1:2])
##        eigenvalue percentage of variance
## comp 1  2.2525429              37.542382
## comp 2  1.1684750              19.474583
## comp 3  0.9099391              15.165652
## comp 4  0.8655742              14.426236
## comp 5  0.5515383               9.192305
## comp 6  0.2519305               4.198842
fviz_screeplot(data.pc, ncp=10)

Medication for worry and medication for anxiety are more strongly correlated compared to bad health status

data.pc$var
## $coord
##                               Dim.1        Dim.2       Dim.3         Dim.4
## badhealth                 0.2724535  0.469561847  0.83719070  0.0028584296
## medical_care_cost         0.1769015  0.657485703 -0.32032631  0.6561675976
## smoke_100cig              0.1914909  0.651268732 -0.31620470 -0.6593454043
## anxiety_disoreder         0.7763685  0.003097719 -0.01981307 -0.0048559042
## medication_for_worry      0.8822963 -0.222845478 -0.05320747  0.0157907643
## medication_for_depression 0.8539053 -0.204642437 -0.05685879 -0.0008891807
##                                 Dim.5
## badhealth                  0.06575331
## medical_care_cost          0.05678533
## smoke_100cig               0.06498133
## anxiety_disoreder         -0.62533879
## medication_for_worry       0.16896911
## medication_for_depression  0.34665330
## 
## $cor
##                               Dim.1        Dim.2       Dim.3         Dim.4
## badhealth                 0.2724535  0.469561847  0.83719070  0.0028584296
## medical_care_cost         0.1769015  0.657485703 -0.32032631  0.6561675976
## smoke_100cig              0.1914909  0.651268732 -0.31620470 -0.6593454043
## anxiety_disoreder         0.7763685  0.003097719 -0.01981307 -0.0048559042
## medication_for_worry      0.8822963 -0.222845478 -0.05320747  0.0157907643
## medication_for_depression 0.8539053 -0.204642437 -0.05685879 -0.0008891807
##                                 Dim.5
## badhealth                  0.06575331
## medical_care_cost          0.05678533
## smoke_100cig               0.06498133
## anxiety_disoreder         -0.62533879
## medication_for_worry       0.16896911
## medication_for_depression  0.34665330
## 
## $cos2
##                                Dim.1        Dim.2        Dim.3        Dim.4
## badhealth                 0.07423092 2.204883e-01 0.7008882721 8.170620e-06
## medical_care_cost         0.03129413 4.322874e-01 0.1026089461 4.305559e-01
## smoke_100cig              0.03666878 4.241510e-01 0.0999854110 4.347364e-01
## anxiety_disoreder         0.60274811 9.595861e-06 0.0003925579 2.357981e-05
## medication_for_worry      0.77844672 4.966011e-02 0.0028310352 2.493482e-04
## medication_for_depression 0.72915424 4.187853e-02 0.0032329220 7.906423e-07
##                                 Dim.5
## badhealth                 0.004323498
## medical_care_cost         0.003224574
## smoke_100cig              0.004222574
## anxiety_disoreder         0.391048608
## medication_for_worry      0.028550559
## medication_for_depression 0.120168510
## 
## $contrib
##                               Dim.1        Dim.2       Dim.3        Dim.4
## badhealth                  3.295428 1.886975e+01 77.02584030 9.439538e-04
## medical_care_cost          1.389280 3.699587e+01 11.27646247 4.974223e+01
## smoke_100cig               1.627884 3.629953e+01 10.98814263 5.022520e+01
## anxiety_disoreder         26.758563 8.212295e-04  0.04314111 2.724181e-03
## medication_for_worry      34.558575 4.249993e+00  0.31112358 2.880726e-02
## medication_for_depression 32.370271 3.584033e+00  0.35528992 9.134310e-05
##                                Dim.5
## badhealth                  0.7838980
## medical_care_cost          0.5846509
## smoke_100cig               0.7655994
## anxiety_disoreder         70.9014391
## medication_for_worry       5.1765323
## medication_for_depression 21.7878803
fviz_pca_var(data.pc,
             col.var= "contrib")+
  theme_minimal()

fviz_pca_ind(data.pc,
             label= "none",
             col.ind= "cos2") +
  scale_color_gradient2(low= "blue",
                        mid= "white",
                        high= "red",
                        midpoint = .5) +
  theme_minimal()

Report the summary statistics and correlation matrix for your data

They are all significant because they are less than 0.05

desc<- dimdesc(data.pc)
desc$Dim.1
## $quanti
##                           correlation      p.value
## medication_for_worry        0.8822963 0.000000e+00
## medication_for_depression   0.8539053 0.000000e+00
## anxiety_disoreder           0.7763685 0.000000e+00
## badhealth                   0.2724535 2.876779e-64
## smoke_100cig                0.1914909 4.954563e-32
## medical_care_cost           0.1769015 1.648817e-27
## 
## attr(,"class")
## [1] "condes" "list"
desc$Dim.2
## $quanti
##                           correlation       p.value
## medical_care_cost           0.6574857  0.000000e+00
## smoke_100cig                0.6512687  0.000000e+00
## badhealth                   0.4695618 3.201458e-203
## medication_for_depression  -0.2046424  1.958925e-36
## medication_for_worry       -0.2228455  4.752133e-43
## 
## attr(,"class")
## [1] "condes" "list"
data2$pc1 <- data.pc$ind$coord[, 1]

options(survey.lonely.psu = "adjust")
des<- svydesign( ids= ~1,
                 strata = ~strata,
                 weights= ~sampweight,
                 data= data2)
library(ggplot2)
ggplot(aes(x=opportunity_youth_cat, y=pc1, group=opportunity_youth_cat),
       data=data2) +
  geom_boxplot()

data2$educ<- forcats:: fct_relevel(data2$educ, .x= c("1Less than HS", "2hsgrad", "3More than HS" ))
## Warning: Outer names are only allowed for unnamed scalar atomic inputs
ggplot(aes(x=educ, y=pc1),
       data=data2) +
  geom_boxplot()

ggplot(aes(x=urban_rural, y=pc1),
       data=data2) +
  geom_boxplot()

ggplot(aes(x=sex, y=pc1),
       data=data2) +
  geom_boxplot()

data2$pc1 <- data.pc$ind$coord[, 1]

options(survey.lonely.psu = "adjust")
des<- svydesign( ids= ~1,
                 strata = ~strata,
                 weights= ~sampweight,
                 data= data2)

#If deemed appropriate, conduct some testing of your index/components/latent variables.

Opportunity youth status, gender and race ethnicity was significant while education was not

fit.1 <- svyglm(pc1~ factor(opportunity_youth_cat) +educ +  sex + (race_eth),
              des,
              family= gaussian)
summary(fit.1)
## 
## Call:
## svyglm(formula = pc1 ~ factor(opportunity_youth_cat) + educ + 
##     sex + (race_eth), design = des, family = gaussian)
## 
## Survey design:
## svydesign(ids = ~1, strata = ~strata, weights = ~sampweight, 
##     data = data2)
## 
## Coefficients:
##                                                    Estimate Std. Error t value
## (Intercept)                                         0.66964    0.11447   5.850
## factor(opportunity_youth_cat)Not opportunity youth -0.24141    0.08739  -2.762
## educ2hsgrad                                        -0.08300    0.08616  -0.963
## educ3More than HS                                  -0.07134    0.08606  -0.829
## sexMale                                            -0.47140    0.05268  -8.948
## race_ethotherminority                              -0.45384    0.05168  -8.782
##                                                    Pr(>|t|)    
## (Intercept)                                        5.35e-09 ***
## factor(opportunity_youth_cat)Not opportunity youth  0.00577 ** 
## educ2hsgrad                                         0.33540    
## educ3More than HS                                   0.40717    
## sexMale                                             < 2e-16 ***
## race_ethotherminority                               < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 1.976604)
## 
## Number of Fisher Scoring iterations: 2