PCA, Multinomial, Poisson

Author

Cristina Martinez

options(Ncores = 12)
library(tidyverse, quietly = T)
library(haven, quietly = T)
require(foreign)
require(nnet)
require(ggplot2)
require(reshape2)
gss <- haven::read_dta(
  unz("2021_stata.zip",
      filename = "GSS2021.dta")
)

Recode

library(tidylog, quietly = T)

Attaching package: 'tidylog'
The following objects are masked from 'package:dplyr':

    add_count, add_tally, anti_join, count, distinct, distinct_all,
    distinct_at, distinct_if, filter, filter_all, filter_at, filter_if,
    full_join, group_by, group_by_all, group_by_at, group_by_if,
    inner_join, left_join, mutate, mutate_all, mutate_at, mutate_if,
    relocate, rename, rename_all, rename_at, rename_if, rename_with,
    right_join, sample_frac, sample_n, select, select_all, select_at,
    select_if, semi_join, slice, slice_head, slice_max, slice_min,
    slice_sample, slice_tail, summarise, summarise_all, summarise_at,
    summarise_if, summarize, summarize_all, summarize_at, summarize_if,
    tally, top_frac, top_n, transmute, transmute_all, transmute_at,
    transmute_if, ungroup
The following objects are masked from 'package:tidyr':

    drop_na, fill, gather, pivot_longer, pivot_wider, replace_na,
    spread, uncount
The following object is masked from 'package:stats':

    filter
gss_sub0 <- gss %>%
  haven::zap_labels() %>%
  mutate(
    chldidel_fctr1 = factor(case_when(chldidel %in% c(0:1) ~ "1small (0-1)",
                               chldidel %in% c(2:3) ~ "2normative (2-3)",
                               chldidel %in% c(4:7) ~ "3large (4+)",
                               chldidel %in% c(8)   ~ "4As many")),
    chldidel_fctr1 = relevel(chldidel_fctr1, ref = "2normative (2-3)"),
    
    chldidel_fctr2 = factor(case_when(chldidel %in% c(0:1) ~ "1small (0-1)",
                               chldidel %in% c(2:3) ~ "2normative (2-3)",
                               chldidel %in% c(4:7) ~ "3large (4+)")),
    chldidel_fctr2 = relevel(chldidel_fctr2, ref = "2normative (2-3)"),
    chldidel_rc = if_else(chldidel == 8, NA_real_, chldidel),
    grngroup_rc = if_else(grngroup == 2, 0, grngroup),
    grnsign_rc = if_else(grnsign == 2, 0, grnsign),
    grnmoney_rc = if_else(grnmoney == 2, 0, grnmoney),
    grndemo_rc = if_else(grndemo == 2, 0, grndemo),
    # Reverse code
    grwthelp_rc = 6-grwthelp,
    grnexagg_rc = 6-grnexagg,
    grnprog_rc = 6-grnprog,
    naturdev_rc = 6-naturdev,
    impgrn_rc = 6-impgrn,
    grnecon_rc = 6-grnecon, 
    age_cat = case_when(
      age %in% c(18:29) ~ "18-29",
      age %in% c(30:39) ~ "30-39",
      age %in% c(40:49) ~ "40-49",
      age %in% c(50:64) ~ "50-64",
      age %in% c(65:89) ~ "65-89"
    ),
    race1 = if_else(is.na(racecen1) == F, 1, 0),
    race2 = if_else(is.na(racecen2) == F, 1, 0),
    race3 = if_else(is.na(racecen3) == F, 1, 0),
    race_sum = race1 + race2 + race3,
    race_two = if_else(race_sum > 1, 1, 0),
    raceth = case_when(
      race_two == 0 & racecen1 == 1 ~ "NH White",
      race_two == 0 & racecen1 == 2 ~ "NH Black",
      race_two == 0 & racecen1 == 3 ~ "AIAN",
      race_two == 0 & racecen1 %in% c(4:10) ~ "Asian",
      race_two == 0 & racecen1 %in% c(11:14) ~ "NHPI",
      race_two == 0 & racecen1 == 15 ~ "Other",
      race_two == 0 & racecen1 == 16 ~ "Hispanic",
      race_two ==1 ~ "Two or More"),
    race_d = if_else(raceth == 1, 1, 0),
    # grneffme_rc = case_when(grneffme %in% c(1:2) ~ "1Agree",
    #                         grneffme %in% c(3)   ~ "2Neutral",
    #                         grneffme %in% c(3:4) ~ "3Disagree"),
    educ = case_when(degree %in% 0 ~ "1Less than HS",
                     degree %in% 1 ~ "2High School",
                     degree %in% 2 ~ "3Associate's",
                     degree %in% 3 ~ "4Bachelor's",
                     degree %in% 4 ~ "5Graduate"),
    educ_d = if_else(educ > 2, 1, 0),
    sex_rc = if_else(sex == 1, "Male", "Female"),
    relitenv_rc = 5-relitenv
  ) %>%
  select(id, chldidel, chldidel_fctr1, chldidel_rc, age, age_cat, polviews, scigrn:recycle, impgrn:grnexagg, grncon, helpharm:nobuygrn,
         clmtcaus:watergen1, nukegen1, grngroup_rc:relitenv_rc)
mutate: converted 'educ' from double to character (43 fewer NA)
        new variable 'chldidel_fctr1' (factor) with 5 unique values and 33% NA
        new variable 'chldidel_fctr2' (factor) with 4 unique values and 54% NA
        new variable 'chldidel_rc' (double) with 9 unique values and 54% NA
        new variable 'grngroup_rc' (double) with 3 unique values and 55% NA
        new variable 'grnsign_rc' (double) with 3 unique values and 55% NA
        new variable 'grnmoney_rc' (double) with 3 unique values and 55% NA
        new variable 'grndemo_rc' (double) with 3 unique values and 55% NA
        new variable 'grwthelp_rc' (double) with 6 unique values and 56% NA
        new variable 'grnexagg_rc' (double) with 6 unique values and 56% NA
        new variable 'grnprog_rc' (double) with 6 unique values and 56% NA
        new variable 'naturdev_rc' (double) with 6 unique values and 56% NA
        new variable 'impgrn_rc' (double) with 6 unique values and 56% NA
        new variable 'grnecon_rc' (double) with 6 unique values and 55% NA
        new variable 'age_cat' (character) with 6 unique values and 8% NA
        new variable 'race1' (double) with 2 unique values and 0% NA
        new variable 'race2' (double) with 2 unique values and 0% NA
        new variable 'race3' (double) with 2 unique values and 0% NA
        new variable 'race_sum' (double) with 4 unique values and 0% NA
        new variable 'race_two' (double) with 2 unique values and 0% NA
        new variable 'raceth' (character) with 9 unique values and 1% NA
        new variable 'race_d' (double) with 2 unique values and 1% NA
        new variable 'educ_d' (double) with 3 unique values and 1% NA
        new variable 'sex_rc' (character) with 3 unique values and 2% NA
        new variable 'relitenv_rc' (double) with 5 unique values and 37% NA
select: dropped 701 variables (year, wrkslf, wrkgovt, occ10, prestg10, …)

Subset

# Filter data sets one that includes "as many children as you want and another that excludes that category
gss_sub1 <- gss_sub0 %>% 
  select(id, age, sex_rc, race_d, educ_d, chldidel_fctr1) %>%
  filter(complete.cases(.))
select: dropped 52 variables (chldidel, chldidel_rc, age_cat, polviews, scigrn, …)
filter: removed 1,614 rows (40%), 2,418 rows remaining
gss_sub2 <- gss_sub0 %>% 
  select(id, age, sex_rc, race_d, educ_d, chldidel_rc) %>%
  filter(complete.cases(.))
select: dropped 52 variables (chldidel, chldidel_fctr1, age_cat, polviews, scigrn, …)
filter: removed 2,338 rows (58%), 1,694 rows remaining
pca <- gss_sub0 %>%
  select(id, grnecon_rc, grnprog_rc, grwthelp_rc, ihlpgrn, impgrn_rc, grnexagg_rc, grneffme) %>%
  filter(complete.cases(.))
select: dropped 50 variables (chldidel, chldidel_fctr1, chldidel_rc, age, age_cat, …)
filter: removed 2,388 rows (59%), 1,644 rows remaining

Principal Component Analysis

Agree/Disagree response with loading values greater than .1

#pca1b <- pca1 %>% select(grnecon_rc, grnprog_rc, grwthelp_rc, ihlpgrn, impgrn_rc, grnexagg_rc, grneffme)

pc_test <- prcomp(formula = ~., data = pca[,-c(1)], center = T, scale. = T)
summary(pc_test)
Importance of components:
                          PC1    PC2    PC3    PC4     PC5     PC6     PC7
Standard deviation     1.7902 1.0262 0.8752 0.8362 0.78119 0.59518 0.55901
Proportion of Variance 0.4578 0.1504 0.1094 0.0999 0.08718 0.05061 0.04464
Cumulative Proportion  0.4578 0.6082 0.7177 0.8176 0.90475 0.95536 1.00000
biplot(pc_test, scale = 0)

pc_test_tibble <- pc_test$rotation %>%
  as_tibble(rownames = "predictor")

pc_test_tibble %>%
  select(predictor:PC3) %>%
  pivot_longer(PC1:PC3, names_to = "component", values_to = "value") %>%
  ggplot(aes(x = value, y = predictor)) +
  geom_col(fill = "darkgreen", color = "darkgreen", alpha = 0.5) +
  facet_wrap(~component) +
  labs(y = NULL, x = "Value") +
  theme_minimal()
select: dropped 4 variables (PC4, PC5, PC6, PC7)
pivot_longer: reorganized (PC1, PC2, PC3) into (component, value) [was 7x4, now 21x3]

scores<-data.frame(pc_test$x)
pca_join <- cbind(pca,scores) %>%
  select(id, PC1)
select: dropped 13 variables (grnecon_rc, grnprog_rc, grwthelp_rc, ihlpgrn, impgrn_rc, …)
# merge with analysis dataframe
gss_sub1 <- left_join(gss_sub1, pca_join, by = c("id" = "id"))
left_join: added one column (PC1)
           > rows only in x     891
           > rows only in y  (  117)
           > matched rows     1,527
           >                 =======
           > rows total       2,418
gss_sub2 <- left_join(gss_sub2, pca_join, by = c("id" = "id"))
left_join: added one column (PC1)
           > rows only in x     621
           > rows only in y  (  571)
           > matched rows     1,073
           >                 =======
           > rows total       1,694

Multinomial Regression with dummy variables

test <- multinom(chldidel_fctr1 ~ PC1 + sex_rc + race_d + educ_d + scale(age), data = gss_sub1)
# weights:  28 (18 variable)
initial  value 2116.871489 
iter  10 value 1488.702951
iter  20 value 1424.868755
final  value 1424.846097 
converged
summary(test)
Warning in sqrt(diag(vc)): NaNs produced
Call:
multinom(formula = chldidel_fctr1 ~ PC1 + sex_rc + race_d + educ_d + 
    scale(age), data = gss_sub1)

Coefficients:
             (Intercept)        PC1  sex_rcMale race_d     educ_d  scale(age)
1small (0-1)  -3.8687053  0.2874626 -0.25178468      0  0.9949492 -0.49610156
3large (4+)   -1.8184204 -0.2370428  0.05344611      0 -0.6511560 -0.17578455
4As many      -0.8515122 -0.0601730 -0.41973882      0  0.3432283 -0.08076576

Std. Errors:
             (Intercept)        PC1 sex_rcMale race_d    educ_d scale(age)
1small (0-1)   1.0198825 0.08631703  0.2794492    NaN 1.0268337 0.14541860
3large (4+)    0.3779090 0.06162215  0.2241608    NaN 0.3873315 0.11356737
4As many       0.2811257 0.03281668  0.1172944      0 0.2863948 0.05878833

Residual Deviance: 2849.692 
AIC: 2879.692 
exp(coef(test))
             (Intercept)       PC1 sex_rcMale race_d    educ_d scale(age)
1small (0-1)  0.02088539 1.3330407  0.7774121      1 2.7045870  0.6088998
3large (4+)   0.16228188 0.7889575  1.0549001      1 0.5214426  0.8387987
4As many      0.42676909 0.9416016  0.6572185      1 1.4094905  0.9224097
z <- summary(test)$coefficients/summary(test)$standard.errors
Warning in sqrt(diag(vc)): NaNs produced

Warning in sqrt(diag(vc)): NaNs produced
p <- (1 - pnorm(abs(z), 0, 1)) * 2
p
              (Intercept)          PC1   sex_rcMale race_d     educ_d
1small (0-1) 1.486671e-04 0.0008674900 0.3675864633    NaN 0.33257076
3large (4+)  1.495808e-06 0.0001197125 0.8115494938    NaN 0.09273692
4As many     2.454158e-03 0.0667119288 0.0003455644    NaN 0.23074407
              scale(age)
1small (0-1) 0.000645967
3large (4+)  0.121659907
4As many     0.169491340

Poisson Regression

fit1 <- glm(chldidel_rc~PC1 + sex_rc + race_d + educ_d + scale(age), data = gss_sub2, family = poisson)
summary(fit1)

Call:
glm(formula = chldidel_rc ~ PC1 + sex_rc + race_d + educ_d + 
    scale(age), family = poisson, data = gss_sub2)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.3682  -0.2791  -0.1671   0.3467   2.0614  

Coefficients: (1 not defined because of singularities)
             Estimate Std. Error z value Pr(>|z|)    
(Intercept)  0.992372   0.082294  12.059  < 2e-16 ***
PC1         -0.036827   0.011313  -3.255  0.00113 ** 
sex_rcMale  -0.001873   0.039806  -0.047  0.96247    
race_d             NA         NA      NA       NA    
educ_d      -0.133038   0.083803  -1.588  0.11240    
scale(age)   0.007374   0.020235   0.364  0.71554    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for poisson family taken to be 1)

    Null deviance: 349.57  on 1072  degrees of freedom
Residual deviance: 335.79  on 1068  degrees of freedom
  (621 observations deleted due to missingness)
AIC: 3236.4

Number of Fisher Scoring iterations: 4