Final Project

Author

Jingyi Yang

Prepare

library(haven)
library(dplyr)

Attaching package: 'dplyr'
The following objects are masked from 'package:stats':

    filter, lag
The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union
library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ forcats   1.0.0     ✔ readr     2.1.5
✔ ggplot2   3.5.2     ✔ stringr   1.5.1
✔ lubridate 1.9.4     ✔ tibble    3.2.1
✔ purrr     1.0.4     ✔ tidyr     1.3.1
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(semPlot)
library(lavaan) 
This is lavaan 0.6-19
lavaan is FREE software! Please report any bugs.
library(psych)

Attaching package: 'psych'

The following object is masked from 'package:lavaan':

    cor2cov

The following objects are masked from 'package:ggplot2':

    %+%, alpha
library(skimr) 
library(corrplot)
corrplot 0.95 loaded
library(patchwork) #Merge GGPlots together 
library(ggplot2) #Graphing
library(jtools) #Tabular Regression Results
library(descr) #Easy Frequency Tables  
library(stats) #Imports survey data 
library(ggeffects) #Predicted Probabilities from Regressions
library(nnet) #For multinomial models
library(MASS) #For ordered models 

Attaching package: 'MASS'

The following object is masked from 'package:patchwork':

    area

The following object is masked from 'package:dplyr':

    select
library(brant) #Test parallel regression assumption 
library(boot) #Create CIs for Multinomial Modeling

Attaching package: 'boot'

The following object is masked from 'package:psych':

    logit
library(cem) #Coarsened Exact Matching 
Loading required package: tcltk
Loading required package: lattice

Attaching package: 'lattice'

The following object is masked from 'package:boot':

    melanoma


How to use CEM? Type vignette("cem")
library(MatchIt)  #Coarsened Exact Matching 
library(stargazer)  #Regression Output

Please cite as: 

 Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
 R package version 5.2.3. https://CRAN.R-project.org/package=stargazer 
library(jtools)  #Regression Output
library(WeightIt) #Entropy Balancing
library(Hmisc) #General Modeling 

Attaching package: 'Hmisc'

The following object is masked from 'package:jtools':

    %nin%

The following object is masked from 'package:psych':

    describe

The following objects are masked from 'package:dplyr':

    src, summarize

The following objects are masked from 'package:base':

    format.pval, units
library(ebal) #Entropy Balancing
##
## ebal Package: Implements Entropy Balancing.

## See http://www.stanford.edu/~jhain/ for additional information.
library(survey) #Applying EB weights 
Loading required package: grid
Loading required package: Matrix

Attaching package: 'Matrix'

The following objects are masked from 'package:tidyr':

    expand, pack, unpack

Loading required package: survival

Attaching package: 'survival'

The following object is masked from 'package:boot':

    aml


Attaching package: 'survey'

The following object is masked from 'package:Hmisc':

    deff

The following object is masked from 'package:WeightIt':

    calibrate

The following object is masked from 'package:graphics':

    dotchart
library(gganimate)
Warning: package 'gganimate' was built under R version 4.5.2

Data Import

GSS2022 <- read_dta("~/研一下/DACSS 790Q/Final Project/2022_stata/2022/GSS2022.dta")
head(GSS2022)
# A tibble: 6 × 1,185
  year         id wrkstat hrs1        hrs2        evwork      wrkslf  occ10     
  <dbl+lbl> <dbl> <dbl+l> <dbl+lbl>   <dbl+lbl>   <dbl+lbl>   <dbl+l> <dbl+lbl> 
1 2022          1 1 [wor…    40       NA(i) [iap] NA(i) [iap] 2 [som…  430 [man…
2 2022          2 5 [ret… NA(i) [iap] NA(i) [iap]     1 [yes] 2 [som…   50 [mar…
3 2022          3 1 [wor…    52       NA(i) [iap] NA(i) [iap] 2 [som… 4610 [per…
4 2022          4 3 [wit… NA(i) [iap]    25       NA(i) [iap] 2 [som… 4120 [foo…
5 2022          5 8 [oth… NA(i) [iap] NA(i) [iap]     1 [yes] 2 [som… 7330 [ind…
6 2022          6 1 [wor…    50       NA(i) [iap] NA(i) [iap] 2 [som… 4610 [per…
# ℹ 1,177 more variables: prestg10 <dbl+lbl>, prestg105plus <dbl+lbl>,
#   indus10 <dbl+lbl>, marital <dbl+lbl>, martype <dbl+lbl>, divorce <dbl+lbl>,
#   widowed <dbl+lbl>, spwrksta <dbl+lbl>, sphrs1 <dbl+lbl>, sphrs2 <dbl+lbl>,
#   spevwork <dbl+lbl>, cowrksta <dbl+lbl>, coevwork <dbl+lbl>,
#   cohrs1 <dbl+lbl>, cohrs2 <dbl+lbl>, spwrkslf <dbl+lbl>, sppres80 <dbl+lbl>,
#   spocc10 <dbl+lbl>, sppres10 <dbl+lbl>, sppres105plus <dbl+lbl>,
#   spind10 <dbl+lbl>, coocc10 <dbl+lbl>, coind10 <dbl+lbl>, …

Setting data set

# Change age to age group! 
age_breaks <- c(18, 30, 50, 65, 100)

GSS2022$age_group <- cut(GSS2022$age, 
                      age_breaks, labels = c("18-29", "30-49", "50-64", "65+"), 
                      include.lowest = TRUE) 
                      
GSS2022<- GSS2022 %>% mutate(age_group= recode(age_group,"18-29"="1", "30-49"="2", "50-64"="3", "65+"="4"))

test_data <- GSS2022 %>% dplyr::select(c(wkvsfam, famvswk, physhlth, mntlhlth, marital,hompop_exp, age_group, degree, income, race, sex, wrkstat, wtssps))%>% mutate(wkvsfam= as.numeric(wkvsfam),famvswk= as.numeric(famvswk), physhlth= as.numeric(physhlth), mntlhlth= as.numeric(mntlhlth), marital= as.numeric(marital), hompop_exp= as.numeric(hompop_exp), age_group= as.numeric(age_group), degree= as.numeric(degree), income= as.numeric(income), race= as.numeric(race), sex= as.numeric(sex), wrkstat=as.numeric(wrkstat), wtssps=as.numeric(wtssps))

new_names <- c("job_interrupt_the_family", "family_interrupt_the_job", "physical_health_status", "mental_health_status", "marriage_status", "total_people_in_household", "age_group","education", "family_income", "race", "sex", "working_status", "weight" )

colnames(test_data) <- new_names #Apply new names to your data frame
skim(test_data)
Data summary
Name test_data
Number of rows 4149
Number of columns 13
_______________________
Column type frequency:
numeric 13
________________________
Group variables None

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
job_interrupt_the_family 1767 0.57 2.69 0.93 1.00 2.00 3.00 3.00 4.00 ▂▇▁▇▅
family_interrupt_the_job 1779 0.57 2.94 0.86 1.00 2.00 3.00 4.00 4.00 ▁▆▁▇▆
physical_health_status 1844 0.56 2.67 6.11 0.00 0.00 0.00 2.00 30.00 ▇▁▁▁▁
mental_health_status 1870 0.55 4.53 7.93 0.00 0.00 0.00 5.00 30.00 ▇▁▁▁▁
marriage_status 16 1.00 2.77 1.74 1.00 1.00 3.00 5.00 5.00 ▇▁▃▁▆
total_people_in_household 8 1.00 1.75 1.67 0.00 1.00 1.00 2.00 49.00 ▇▁▁▁▁
age_group 256 0.94 2.46 1.03 1.00 2.00 2.00 3.00 4.00 ▅▇▁▅▅
education 2 1.00 1.82 1.27 0.00 1.00 1.00 3.00 4.00 ▂▇▂▃▂
family_income 484 0.88 11.22 2.07 1.00 12.00 12.00 12.00 12.00 ▁▁▁▁▇
race 64 0.98 1.51 0.76 1.00 1.00 1.00 2.00 3.00 ▇▁▂▁▂
sex 23 0.99 1.54 0.50 1.00 1.00 2.00 2.00 2.00 ▇▁▁▁▇
working_status 9 1.00 3.07 2.34 1.00 1.00 2.00 5.00 8.00 ▇▁▃▁▂
weight 605 0.85 1.00 1.26 0.07 0.34 0.58 1.06 14.27 ▇▁▁▁▁
# Reverse Coding
final_data <- test_data %>%
  mutate(
    job_interrupt_the_family = case_when(
      job_interrupt_the_family == 1 ~ 4,
      job_interrupt_the_family == 2 ~ 3,
      job_interrupt_the_family == 3 ~ 2,
      job_interrupt_the_family == 4 ~ 1,
      job_interrupt_the_family == "NA"~ NA_real_),
    job_interrupt_the_family = labelled(job_interrupt_the_family, 
      c(`Never` = 1, `Rarely` = 2, `Sometimes` = 3, `Often` = 4))) %>% 
  mutate(family_interrupt_the_job = as.numeric(family_interrupt_the_job),
    family_interrupt_the_job = case_when(
      family_interrupt_the_job == 1 ~ 4,
      family_interrupt_the_job == 2 ~ 3,
      family_interrupt_the_job == 3 ~ 2,
      family_interrupt_the_job == 4 ~ 1,
      family_interrupt_the_job == "NA"~ NA_real_),
    family_interrupt_the_job = labelled(family_interrupt_the_job, 
      c(`Never` = 1, `Rarely` = 2, `Sometimes` = 3, `Often` = 4))) %>%
  mutate(marriage_status= case_when(
    marriage_status== 1 ~ 5, 
    marriage_status== 2 ~ 4,
    marriage_status== 3 ~ 3,
    marriage_status== 4 ~ 2,
    marriage_status== 5 ~ 1,
    marriage_status== "NA"~ NA_real_),
    marriage_status=labelled(marriage_status, c(`Never married`=1, `Separated`=2, `Divorced`=3, `Widowed`=4, `Married`=5))) %>%
  mutate(age_group= case_when(
    age_group== 1 ~ 1,
    age_group== 2 ~ 2, 
    age_group== 3 ~ 3,
    age_group== 4 ~ 4,
    age_group== "NA"~ NA_real_),
    age_group=labelled(age_group, c(`18-29 YEARS OLD`=1, `30-49 YEARS OLD`=2, `50-64 YEARS OLD`=3, `64 YEARS OLD OR OVER`=4)))%>% 
  mutate(education= case_when(
    education== 0 ~ 1,
    education== 1 ~ 2,
    education== 2 ~ 3, 
    education== 3 ~ 4, 
    education== 4 ~ 5,
    education== "NA"~ NA_real_),
    education= labelled(education, c(`Less THAN HIGH SCHOOL`= 1, `HIGH SCHOOL`= 2, `ASSOCIATE/JUNIOR COLLEGE`=3, `BACHELORS`=4, `GRADUATE`= 5)))%>%
  mutate(family_income= case_when(
    family_income== 1 ~ 1, 
    family_income== 2 ~ 2, 
    family_income== 3 ~ 3, 
    family_income== 4 ~ 4, 
    family_income== 5 ~ 5, 
    family_income== 6 ~ 6, 
    family_income== 7 ~ 7, 
    family_income== 8 ~ 8, 
    family_income== 9 ~ 9,
    family_income== 10 ~ 10, 
    family_income== 11 ~ 11, 
    family_income== 12 ~ 12,
    family_income== "NA"~ NA_real_),
    family_income= labelled(family_income, c(`UNDER$1,000`= 1,`$1,000 TO $2,999`= 2, `$3,000 TO $3,999`=3, `$4,000 TO $4,999`=4, `$5,000 TOlibra $5,999`= 5, `$6,000 TO $6,999`=6, `$7,000 TO $7,999`=7, `$8,000 TO $9,999`=8, `$10,000 TO $14,999`= 9, `$15,000 TO $19,999`= 10,`$20,000 TO $24,999`= 11,  `$25,000 OR MORE`= 12))) %>%
  mutate(race= case_when(
    race== 1 ~ 1, 
    race== 2 ~ 2,
    race== 3 ~ 3,
    race== "NA"~ NA_real_),
  race= labelled(race, c(`WHITE`= 1, `BLACK`= 2, `OTHER`=3))) %>%
  mutate(sex= case_when(
    sex== 1 ~ 1, 
    sex== 2 ~ 2,
    sex== "NA"~ NA_real_),
    sex= labelled(sex, c(`MALE`=1, `FEMALE`=2)))%>%
  mutate(total_people_in_household = case_when(
    total_people_in_household == 0 ~ 0,
    total_people_in_household == 1 ~ 1,
    total_people_in_household == 2 ~ 2,
    total_people_in_household == 3 ~ 3,
    total_people_in_household == 4 ~ 4,
    total_people_in_household == 5 ~ 5,
    total_people_in_household == 6 ~ 6,
    total_people_in_household == 7 ~ 7,
    total_people_in_household >= 8 ~ 8,
    total_people_in_household == "NA"~ NA_real_),
    total_people_in_household = labelled(total_people_in_household, 
      c(`NONE`=0, `1`=1, `2`=2, `3`=3, `4`=4, `5`=5, `6`=6, `7`=7, `8+`=8)))%>%
  mutate(working_status = case_when(
    working_status == 1 ~ 3,
    working_status == 2 ~ 2,
    working_status == 3 ~ 1,
    working_status == 4 ~ 1,
    working_status == 5 ~ 1,
    working_status == 6 ~ 1,
    working_status == 7 ~ 1,
    working_status == 8 ~ 1,
    working_status == "NA"~ NA_real_),
    working_status = labelled(working_status,
      c(`WORKING FULL TIME`=3, `WORKING PART TIME`=2,`OTHER`=1)))
head(final_data)
# A tibble: 6 × 13
  job_interrupt_the_family family_interrupt_the_job physical_health_status
  <dbl+lbl>                <dbl+lbl>                                 <dbl>
1  3 [Sometimes]            2 [Rarely]                                  30
2 NA                       NA                                           NA
3  2 [Rarely]               2 [Rarely]                                   0
4  4 [Often]                1 [Never]                                    4
5 NA                       NA                                           NA
6  4 [Often]                3 [Sometimes]                                0
# ℹ 10 more variables: mental_health_status <dbl>, marriage_status <dbl+lbl>,
#   total_people_in_household <dbl+lbl>, age_group <dbl+lbl>,
#   education <dbl+lbl>, family_income <dbl+lbl>, race <dbl+lbl>,
#   sex <dbl+lbl>, working_status <dbl+lbl>, weight <dbl>
str(final_data)
tibble [4,149 × 13] (S3: tbl_df/tbl/data.frame)
 $ job_interrupt_the_family : dbl+lbl [1:4149]  3, NA,  2,  4, NA,  4,  4,  2,  3, NA, NA,  1,  2,  ...
   ..@ labels: Named num [1:4] 1 2 3 4
   .. ..- attr(*, "names")= chr [1:4] "Never" "Rarely" "Sometimes" "Often"
 $ family_interrupt_the_job : dbl+lbl [1:4149]  2, NA,  2,  1, NA,  3,  2,  2,  4, NA, NA,  2,  2,  ...
   ..@ labels: Named num [1:4] 1 2 3 4
   .. ..- attr(*, "names")= chr [1:4] "Never" "Rarely" "Sometimes" "Often"
 $ physical_health_status   : num [1:4149] 30 NA 0 4 NA 0 2 0 5 NA ...
 $ mental_health_status     : num [1:4149] 15 NA 0 10 NA NA 14 0 5 NA ...
 $ marriage_status          : dbl+lbl [1:4149] 3, 5, 3, 1, 1, 1, 1, 5, 1, 1, 4, 1, 5, 1, 1, 3, 1, 5,...
   ..@ labels: Named num [1:5] 1 2 3 4 5
   .. ..- attr(*, "names")= chr [1:5] "Never married" "Separated" "Divorced" "Widowed" ...
 $ total_people_in_household: dbl+lbl [1:4149] 1, 1, 3, 1, 3, 1, 3, 2, 2, 1, 2, 1, 1, 1, 3, 1, 1, 3,...
   ..@ labels: Named num [1:9] 0 1 2 3 4 5 6 7 8
   .. ..- attr(*, "names")= chr [1:9] "NONE" "1" "2" "3" ...
 $ age_group                : dbl+lbl [1:4149]  4,  4,  3,  1,  3,  1,  1,  2,  2,  4,  3,  1,  2,  ...
   ..@ labels: Named num [1:4] 1 2 3 4
   .. ..- attr(*, "names")= chr [1:4] "18-29 YEARS OLD" "30-49 YEARS OLD" "50-64 YEARS OLD" "64 YEARS OLD OR OVER"
 $ education                : dbl+lbl [1:4149] 4, 5, 2, 4, 2, 2, 2, 5, 2, 2, 2, 4, 2, 2, 2, 2, 2, 4,...
   ..@ labels: Named num [1:5] 1 2 3 4 5
   .. ..- attr(*, "names")= chr [1:5] "Less THAN HIGH SCHOOL" "HIGH SCHOOL" "ASSOCIATE/JUNIOR COLLEGE" "BACHELORS" ...
 $ family_income            : dbl+lbl [1:4149] 12, NA, 12, 12, 12, 12, 12, 12, 11,  9, 12, 12, 12,  ...
   ..@ labels: Named num [1:12] 1 2 3 4 5 6 7 8 9 10 ...
   .. ..- attr(*, "names")= chr [1:12] "UNDER$1,000" "$1,000 TO $2,999" "$3,000 TO $3,999" "$4,000 TO $4,999" ...
 $ race                     : dbl+lbl [1:4149]  1,  1,  1,  1,  1,  1,  3,  1,  1, NA,  1,  1,  1,  ...
   ..@ labels: Named num [1:3] 1 2 3
   .. ..- attr(*, "names")= chr [1:3] "WHITE" "BLACK" "OTHER"
 $ sex                      : dbl+lbl [1:4149]  2,  1,  2,  2,  1,  1,  2,  1,  2,  2,  2,  2,  2,  ...
   ..@ labels: Named num [1:2] 1 2
   .. ..- attr(*, "names")= chr [1:2] "MALE" "FEMALE"
 $ working_status           : dbl+lbl [1:4149] 3, 1, 3, 1, 1, 3, 2, 3, 3, 1, 1, 2, 3, 2, 1, 1, 1, 2,...
   ..@ labels: Named num [1:3] 3 2 1
   .. ..- attr(*, "names")= chr [1:3] "WORKING FULL TIME" "WORKING PART TIME" "OTHER"
 $ weight                   : num [1:4149] 0.231 0.577 1.014 0.902 1.084 ...
skim(final_data)
Data summary
Name final_data
Number of rows 4149
Number of columns 13
_______________________
Column type frequency:
numeric 13
________________________
Group variables None

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
job_interrupt_the_family 1767 0.57 2.31 0.93 1.00 2.00 2.00 3.00 4.00 ▅▇▁▇▂
family_interrupt_the_job 1779 0.57 2.06 0.86 1.00 1.00 2.00 3.00 4.00 ▆▇▁▆▁
physical_health_status 1844 0.56 2.67 6.11 0.00 0.00 0.00 2.00 30.00 ▇▁▁▁▁
mental_health_status 1870 0.55 4.53 7.93 0.00 0.00 0.00 5.00 30.00 ▇▁▁▁▁
marriage_status 16 1.00 3.23 1.74 1.00 1.00 3.00 5.00 5.00 ▆▁▃▁▇
total_people_in_household 8 1.00 1.72 1.33 0.00 1.00 1.00 2.00 8.00 ▇▂▁▁▁
age_group 256 0.94 2.46 1.03 1.00 2.00 2.00 3.00 4.00 ▅▇▁▅▅
education 2 1.00 2.82 1.27 1.00 2.00 2.00 4.00 5.00 ▂▇▂▃▂
family_income 484 0.88 11.22 2.07 1.00 12.00 12.00 12.00 12.00 ▁▁▁▁▇
race 64 0.98 1.51 0.76 1.00 1.00 1.00 2.00 3.00 ▇▁▂▁▂
sex 23 0.99 1.54 0.50 1.00 1.00 2.00 2.00 2.00 ▇▁▁▁▇
working_status 9 1.00 2.02 0.95 1.00 1.00 2.00 3.00 3.00 ▇▁▂▁▇
weight 605 0.85 1.00 1.26 0.07 0.34 0.58 1.06 14.27 ▇▁▁▁▁
summary_df <- summarytools::dfSummary(final_data,
                                     varnumbers=FALSE,
                                     plain.ascii=FALSE,
                                     style="grid",
                                     graph.col = TRUE,
                                     valid.col=FALSE)

# Print the summary table and suppress warnings
print(summary_df,
      method="render",
      table.classes="table-condensed")

Data Frame Summary

final_data

Dimensions: 4149 x 13
Duplicates: 132
Variable Stats / Values Freqs (% of Valid) Graph Missing
job_interrupt_the_family [haven_labelled, vctrs_vctr, double]
1. [1] Never
2. [2] Rarely
3. [3] Sometimes
4. [4] Often
525 ( 22.0% )
826 ( 34.7% )
789 ( 33.1% )
242 ( 10.2% )
1767 (42.6%)
family_interrupt_the_job [haven_labelled, vctrs_vctr, double]
1. [1] Never
2. [2] Rarely
3. [3] Sometimes
4. [4] Often
704 ( 29.7% )
926 ( 39.1% )
632 ( 26.7% )
108 ( 4.6% )
1779 (42.9%)
physical_health_status [numeric]
Mean (sd) : 2.7 (6.1)
min ≤ med ≤ max:
0 ≤ 0 ≤ 30
IQR (CV) : 2 (2.3)
24 distinct values 1844 (44.4%)
mental_health_status [numeric]
Mean (sd) : 4.5 (7.9)
min ≤ med ≤ max:
0 ≤ 0 ≤ 30
IQR (CV) : 5 (1.8)
29 distinct values 1870 (45.1%)
marriage_status [haven_labelled, vctrs_vctr, double]
1. [1] Never married
2. [2] Separated
3. [3] Divorced
4. [4] Widowed
5. [5] Married
1333 ( 32.3% )
121 ( 2.9% )
669 ( 16.2% )
273 ( 6.6% )
1737 ( 42.0% )
16 (0.4%)
total_people_in_household [haven_labelled, vctrs_vctr, double]
1. [0] NONE
2. [1] 1
3. [2] 2
4. [3] 3
5. [4] 4
6. [5] 5
7. [6] 6
8. [7] 7
9. [8] 8+
40 ( 1.0% )
2727 ( 65.9% )
580 ( 14.0% )
314 ( 7.6% )
270 ( 6.5% )
108 ( 2.6% )
52 ( 1.3% )
24 ( 0.6% )
26 ( 0.6% )
8 (0.2%)
age_group [haven_labelled, vctrs_vctr, double]
1. [1] 18-29 YEARS OLD
2. [2] 30-49 YEARS OLD
3. [3] 50-64 YEARS OLD
4. [4] 64 YEARS OLD OR OVER
750 ( 19.3% )
1408 ( 36.2% )
924 ( 23.7% )
811 ( 20.8% )
256 (6.2%)
education [haven_labelled, vctrs_vctr, double]
1. [1] Less THAN HIGH SCHOOL
2. [2] HIGH SCHOOL
3. [3] ASSOCIATE/JUNIOR COLL
4. [4] BACHELORS
5. [5] GRADUATE
417 ( 10.1% )
1919 ( 46.3% )
367 ( 8.8% )
866 ( 20.9% )
578 ( 13.9% )
2 (0.0%)
family_income [haven_labelled, vctrs_vctr, double]
1. [1] UNDER$1,000
2. [2] $1,000 TO $2,999
3. [3] $3,000 TO $3,999
4. [4] $4,000 TO $4,999
5. [5] $5,000 TOlibra $5,999
6. [6] $6,000 TO $6,999
7. [7] $7,000 TO $7,999
8. [8] $8,000 TO $9,999
9. [9] $10,000 TO $14,999
10. [10] $15,000 TO $19,999
[ 2 others ]
53 ( 1.4% )
37 ( 1.0% )
19 ( 0.5% )
14 ( 0.4% )
20 ( 0.5% )
15 ( 0.4% )
18 ( 0.5% )
46 ( 1.3% )
215 ( 5.9% )
126 ( 3.4% )
3102 ( 84.6% )
484 (11.7%)
race [haven_labelled, vctrs_vctr, double]
1. [1] WHITE
2. [2] BLACK
3. [3] OTHER
2651 ( 64.9% )
775 ( 19.0% )
659 ( 16.1% )
64 (1.5%)
sex [haven_labelled, vctrs_vctr, double]
1. [1] MALE
2. [2] FEMALE
1910 ( 46.3% )
2216 ( 53.7% )
23 (0.6%)
working_status [haven_labelled, vctrs_vctr, double]
1. [3] WORKING FULL TIME
2. [2] WORKING PART TIME
3. [1] OTHER
1904 ( 46.0% )
423 ( 10.2% )
1813 ( 43.8% )
9 (0.2%)
weight [numeric]
Mean (sd) : 1 (1.3)
min ≤ med ≤ max:
0.1 ≤ 0.6 ≤ 14.3
IQR (CV) : 0.7 (1.3)
2269 distinct values 605 (14.6%)

Generated by summarytools 1.1.4 (R version 4.5.1)
2025-11-20

Separate two scales

data_physical_health<-final_data%>% dplyr::select(c(job_interrupt_the_family, family_interrupt_the_job, physical_health_status, marriage_status, total_people_in_household,  age_group,education, family_income, race, sex, working_status, weight)) %>% mutate(marriage_status= as.factor(marriage_status))%>% mutate(race= as.factor(race))%>% mutate(working_status= as.factor(working_status))


head(data_physical_health)
# A tibble: 6 × 12
  job_interrupt_the_family family_interrupt_the_job physical_health_status
  <dbl+lbl>                <dbl+lbl>                                 <dbl>
1  3 [Sometimes]            2 [Rarely]                                  30
2 NA                       NA                                           NA
3  2 [Rarely]               2 [Rarely]                                   0
4  4 [Often]                1 [Never]                                    4
5 NA                       NA                                           NA
6  4 [Often]                3 [Sometimes]                                0
# ℹ 9 more variables: marriage_status <fct>,
#   total_people_in_household <dbl+lbl>, age_group <dbl+lbl>,
#   education <dbl+lbl>, family_income <dbl+lbl>, race <fct>, sex <dbl+lbl>,
#   working_status <fct>, weight <dbl>
data_mental_health <- final_data%>% dplyr::select(c(job_interrupt_the_family, family_interrupt_the_job, mental_health_status,marriage_status, total_people_in_household,  age_group,education, family_income, race, sex, working_status, weight))%>% mutate(marriage_status= as.factor(marriage_status)) %>% mutate(race= as.factor(race)) %>% mutate(working_status= as.factor(working_status))

head(data_mental_health)
# A tibble: 6 × 12
  job_interrupt_the_family family_interrupt_the_job mental_health_status
  <dbl+lbl>                <dbl+lbl>                               <dbl>
1  3 [Sometimes]            2 [Rarely]                                15
2 NA                       NA                                         NA
3  2 [Rarely]               2 [Rarely]                                 0
4  4 [Often]                1 [Never]                                 10
5 NA                       NA                                         NA
6  4 [Often]                3 [Sometimes]                             NA
# ℹ 9 more variables: marriage_status <fct>,
#   total_people_in_household <dbl+lbl>, age_group <dbl+lbl>,
#   education <dbl+lbl>, family_income <dbl+lbl>, race <fct>, sex <dbl+lbl>,
#   working_status <fct>, weight <dbl>

Count Dependent Variable Tutorials

Negative-Binomial Model

nb_physical_health<- glm.nb(physical_health_status~job_interrupt_the_family+family_interrupt_the_job+ marriage_status+ total_people_in_household+ factor(age_group)+education+family_income+race+sex+working_status, data=data_physical_health, weights = data_physical_health$weight) 

summary(nb_physical_health)

Call:
glm.nb(formula = physical_health_status ~ job_interrupt_the_family + 
    family_interrupt_the_job + marriage_status + total_people_in_household + 
    factor(age_group) + education + family_income + race + sex + 
    working_status, data = data_physical_health, weights = data_physical_health$weight, 
    init.theta = 0.2066739091, link = log)

Coefficients:
                           Estimate Std. Error z value Pr(>|z|)    
(Intercept)                1.070116   0.611382   1.750 0.080063 .  
job_interrupt_the_family   0.046956   0.071397   0.658 0.510746    
family_interrupt_the_job   0.150010   0.076790   1.954 0.050758 .  
marriage_status2          -0.305971   0.466022  -0.657 0.511464    
marriage_status3          -0.114493   0.211822  -0.541 0.588841    
marriage_status4           0.419432   0.403927   1.038 0.299090    
marriage_status5          -0.532134   0.151580  -3.511 0.000447 ***
total_people_in_household  0.035484   0.037679   0.942 0.346327    
factor(age_group)2         0.420183   0.163890   2.564 0.010353 *  
factor(age_group)3         0.597448   0.188429   3.171 0.001521 ** 
factor(age_group)4         0.161875   0.293786   0.551 0.581636    
education                 -0.088090   0.048420  -1.819 0.068866 .  
family_income             -0.006117   0.041862  -0.146 0.883831    
race2                     -0.869784   0.182331  -4.770 1.84e-06 ***
race3                     -0.380102   0.184256  -2.063 0.039122 *  
sex                        0.480984   0.116805   4.118 3.82e-05 ***
working_status2           -0.704522   0.289847  -2.431 0.015071 *  
working_status3           -1.188900   0.263378  -4.514 6.36e-06 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for Negative Binomial(0.2067) family taken to be 1)

    Null deviance: 1431.4  on 1663  degrees of freedom
Residual deviance: 1322.8  on 1646  degrees of freedom
  (2485 observations deleted due to missingness)
AIC: 5950.4

Number of Fisher Scoring iterations: 1

              Theta:  0.2067 
          Std. Err.:  0.0108 

 2 x log-likelihood:  -5912.4340 
nb_mental_health<- glm.nb(mental_health_status~job_interrupt_the_family+family_interrupt_the_job+ marriage_status+ total_people_in_household+ factor(age_group)+education+family_income+race+sex+working_status, data=data_mental_health, weights = data_mental_health$weight) 

summary(nb_mental_health)

Call:
glm.nb(formula = mental_health_status ~ job_interrupt_the_family + 
    family_interrupt_the_job + marriage_status + total_people_in_household + 
    factor(age_group) + education + family_income + race + sex + 
    working_status, data = data_mental_health, weights = data_mental_health$weight, 
    init.theta = 0.2967852665, link = log)

Coefficients:
                          Estimate Std. Error z value Pr(>|z|)    
(Intercept)                1.86665    0.49971   3.735 0.000187 ***
job_interrupt_the_family   0.30549    0.06046   5.053 4.35e-07 ***
family_interrupt_the_job   0.14536    0.06508   2.234 0.025503 *  
marriage_status2          -0.67765    0.39241  -1.727 0.084186 .  
marriage_status3           0.02496    0.17657   0.141 0.887600    
marriage_status4          -0.01392    0.34390  -0.040 0.967709    
marriage_status5          -0.57128    0.12572  -4.544 5.52e-06 ***
total_people_in_household  0.04766    0.03118   1.528 0.126400    
factor(age_group)2        -0.25762    0.13466  -1.913 0.055720 .  
factor(age_group)3        -0.68757    0.15633  -4.398 1.09e-05 ***
factor(age_group)4        -1.42375    0.25834  -5.511 3.57e-08 ***
education                 -0.14462    0.04057  -3.565 0.000364 ***
family_income             -0.10208    0.03339  -3.057 0.002233 ** 
race2                     -0.78152    0.14905  -5.243 1.58e-07 ***
race3                     -0.60170    0.15473  -3.889 0.000101 ***
sex                        0.61570    0.09770   6.302 2.94e-10 ***
working_status2           -0.15969    0.24722  -0.646 0.518323    
working_status3           -0.12411    0.22416  -0.554 0.579828    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for Negative Binomial(0.2968) family taken to be 1)

    Null deviance: 1783.3  on 1655  degrees of freedom
Residual deviance: 1570.7  on 1638  degrees of freedom
  (2493 observations deleted due to missingness)
AIC: 7574.4

Number of Fisher Scoring iterations: 1

              Theta:  0.2968 
          Std. Err.:  0.0141 

 2 x log-likelihood:  -7536.4160 

Basic Poisson Model

pois_physical_health <- glm(physical_health_status~job_interrupt_the_family+family_interrupt_the_job+ marriage_status+ total_people_in_household+age_group+education+family_income+race+sex+working_status, data=data_physical_health, family = poisson, weights = data_physical_health$weight)
summary(pois_physical_health)

Call:
glm(formula = physical_health_status ~ job_interrupt_the_family + 
    family_interrupt_the_job + marriage_status + total_people_in_household + 
    age_group + education + family_income + race + sex + working_status, 
    family = poisson, data = data_physical_health, weights = data_physical_health$weight)

Coefficients:
                           Estimate Std. Error z value Pr(>|z|)    
(Intercept)                1.287148   0.153912   8.363  < 2e-16 ***
job_interrupt_the_family   0.034496   0.019110   1.805  0.07105 .  
family_interrupt_the_job   0.136809   0.020105   6.805 1.01e-11 ***
marriage_status2          -0.429247   0.147328  -2.914  0.00357 ** 
marriage_status3          -0.027718   0.051398  -0.539  0.58969    
marriage_status4           0.631936   0.077712   8.132 4.23e-16 ***
marriage_status5          -0.345389   0.040021  -8.630  < 2e-16 ***
total_people_in_household  0.053797   0.009884   5.443 5.24e-08 ***
age_group                  0.100197   0.020701   4.840 1.30e-06 ***
education                 -0.112181   0.013355  -8.400  < 2e-16 ***
family_income             -0.006640   0.010666  -0.623  0.53359    
race2                     -0.785702   0.061160 -12.847  < 2e-16 ***
race3                     -0.380280   0.055435  -6.860 6.89e-12 ***
sex                        0.368993   0.031925  11.558  < 2e-16 ***
working_status2           -0.720800   0.054434 -13.242  < 2e-16 ***
working_status3           -1.068241   0.047625 -22.430  < 2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for poisson family taken to be 1)

    Null deviance: 13049  on 1663  degrees of freedom
Residual deviance: 11681  on 1648  degrees of freedom
  (2485 observations deleted due to missingness)
AIC: 13900

Number of Fisher Scoring iterations: 6
pois_mental_health <- glm(mental_health_status~job_interrupt_the_family+family_interrupt_the_job+ marriage_status+ total_people_in_household+age_group+education+family_income+race+sex+working_status, data=data_mental_health, family = poisson, weights = data_mental_health$weight)
summary(pois_mental_health)

Call:
glm(formula = mental_health_status ~ job_interrupt_the_family + 
    family_interrupt_the_job + marriage_status + total_people_in_household + 
    age_group + education + family_income + race + sex + working_status, 
    family = poisson, data = data_mental_health, weights = data_mental_health$weight)

Coefficients:
                          Estimate Std. Error z value Pr(>|z|)    
(Intercept)                1.86773    0.10756  17.364  < 2e-16 ***
job_interrupt_the_family   0.29156    0.01455  20.034  < 2e-16 ***
family_interrupt_the_job   0.10048    0.01545   6.503 7.87e-11 ***
marriage_status2          -0.52643    0.11076  -4.753 2.01e-06 ***
marriage_status3           0.08600    0.04142   2.076   0.0379 *  
marriage_status4           0.55237    0.07107   7.772 7.70e-15 ***
marriage_status5          -0.26321    0.02902  -9.071  < 2e-16 ***
total_people_in_household  0.01614    0.00731   2.208   0.0272 *  
age_group                 -0.36941    0.01738 -21.260  < 2e-16 ***
education                 -0.12536    0.01030 -12.168  < 2e-16 ***
family_income             -0.07043    0.00620 -11.360  < 2e-16 ***
race2                     -0.52015    0.03946 -13.183  < 2e-16 ***
race3                     -0.42281    0.04238  -9.978  < 2e-16 ***
sex                        0.56890    0.02454  23.186  < 2e-16 ***
working_status2           -0.03299    0.05354  -0.616   0.5377    
working_status3           -0.09345    0.04815  -1.941   0.0523 .  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for poisson family taken to be 1)

    Null deviance: 16817  on 1655  degrees of freedom
Residual deviance: 13943  on 1640  degrees of freedom
  (2493 observations deleted due to missingness)
AIC: 17023

Number of Fisher Scoring iterations: 6

Quasi-Poisson Model

quasi_physical_health <-  glm(physical_health_status~job_interrupt_the_family+family_interrupt_the_job+ marriage_status+ total_people_in_household+age_group+education+family_income+race+sex+working_status,family = quasipoisson, data=data_physical_health, weights = data_physical_health$weight)
summary(quasi_physical_health)

Call:
glm(formula = physical_health_status ~ job_interrupt_the_family + 
    family_interrupt_the_job + marriage_status + total_people_in_household + 
    age_group + education + family_income + race + sex + working_status, 
    family = quasipoisson, data = data_physical_health, weights = data_physical_health$weight)

Coefficients:
                          Estimate Std. Error t value Pr(>|t|)    
(Intercept)                1.28715    0.53099   2.424 0.015455 *  
job_interrupt_the_family   0.03450    0.06593   0.523 0.600875    
family_interrupt_the_job   0.13681    0.06936   1.972 0.048729 *  
marriage_status2          -0.42925    0.50828  -0.845 0.398505    
marriage_status3          -0.02772    0.17732  -0.156 0.875802    
marriage_status4           0.63194    0.26810   2.357 0.018537 *  
marriage_status5          -0.34539    0.13807  -2.502 0.012462 *  
total_people_in_household  0.05380    0.03410   1.578 0.114836    
age_group                  0.10020    0.07142   1.403 0.160808    
education                 -0.11218    0.04607  -2.435 0.015006 *  
family_income             -0.00664    0.03680  -0.180 0.856825    
race2                     -0.78570    0.21100  -3.724 0.000203 ***
race3                     -0.38028    0.19125  -1.988 0.046933 *  
sex                        0.36899    0.11014   3.350 0.000826 ***
working_status2           -0.72080    0.18780  -3.838 0.000129 ***
working_status3           -1.06824    0.16430  -6.502 1.05e-10 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for quasipoisson family taken to be 11.90224)

    Null deviance: 13049  on 1663  degrees of freedom
Residual deviance: 11681  on 1648  degrees of freedom
  (2485 observations deleted due to missingness)
AIC: NA

Number of Fisher Scoring iterations: 6
quasi_mental_health <- glm(mental_health_status~job_interrupt_the_family+family_interrupt_the_job+ marriage_status+ total_people_in_household+age_group+education+family_income+race+sex+working_status, family = quasipoisson, data=data_mental_health, weights = data_mental_health$weight)
summary(quasi_mental_health)

Call:
glm(formula = mental_health_status ~ job_interrupt_the_family + 
    family_interrupt_the_job + marriage_status + total_people_in_household + 
    age_group + education + family_income + race + sex + working_status, 
    family = quasipoisson, data = data_mental_health, weights = data_mental_health$weight)

Coefficients:
                          Estimate Std. Error t value Pr(>|t|)    
(Intercept)                1.86773    0.36572   5.107 3.65e-07 ***
job_interrupt_the_family   0.29156    0.04948   5.892 4.61e-09 ***
family_interrupt_the_job   0.10048    0.05254   1.913 0.055968 .  
marriage_status2          -0.52643    0.37660  -1.398 0.162350    
marriage_status3           0.08600    0.14084   0.611 0.541569    
marriage_status4           0.55237    0.24163   2.286 0.022382 *  
marriage_status5          -0.26321    0.09866  -2.668 0.007709 ** 
total_people_in_household  0.01614    0.02485   0.649 0.516177    
age_group                 -0.36941    0.05908  -6.253 5.13e-10 ***
education                 -0.12536    0.03503  -3.579 0.000355 ***
family_income             -0.07043    0.02108  -3.341 0.000853 ***
race2                     -0.52015    0.13415  -3.877 0.000110 ***
race3                     -0.42281    0.14408  -2.935 0.003386 ** 
sex                        0.56889    0.08343   6.819 1.28e-11 ***
working_status2           -0.03299    0.18205  -0.181 0.856200    
working_status3           -0.09345    0.16372  -0.571 0.568228    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for quasipoisson family taken to be 11.56031)

    Null deviance: 16817  on 1655  degrees of freedom
Residual deviance: 13943  on 1640  degrees of freedom
  (2493 observations deleted due to missingness)
AIC: NA

Number of Fisher Scoring iterations: 6

Likelihood Ratio, AIC, BIC

#Create function to compare model fit stats
glm_fit <- function(model) {
  # Calculate Likelihood Ratio
  lr <- logLik(model)
  
  # Calculate AIC
  aic <- AIC(model)
  
  # Calculate BIC
  n <- nobs(model)
  p <- length(coef(model))
  bic <- -2 * logLik(model) + p * log(n)
  
  # Calculate Deviance
  deviance <- summary(model)$deviance
  
  # Return the metrics as a list
  metrics <- data.frame(Likelihood_Ratio = lr, AIC = aic, BIC = bic, Deviance = deviance, Coefficients = p)
  return(metrics)
}
glm_fit(pois_physical_health)
  Likelihood_Ratio      AIC      BIC Deviance Coefficients
1        -6934.083 13900.17 13986.84 11680.73           16
glm_fit(pois_mental_health)
  Likelihood_Ratio      AIC      BIC Deviance Coefficients
1         -8495.61 17023.22 17109.81 13943.12           16
glm_fit(quasi_physical_health)
  Likelihood_Ratio AIC BIC Deviance Coefficients
1               NA  NA  NA 11680.73           16
glm_fit(quasi_mental_health)
  Likelihood_Ratio AIC BIC Deviance Coefficients
1               NA  NA  NA 13943.12           16
glm_fit(nb_physical_health)
  Likelihood_Ratio      AIC      BIC Deviance Coefficients
1        -2956.217 5950.434 6045.939 1322.828           18
glm_fit(nb_mental_health)
  Likelihood_Ratio      AIC      BIC Deviance Coefficients
1        -3768.208 7574.416 7669.834  1570.67           18

Overall Table

stargazer(nb_physical_health, nb_mental_health, type="text",align=TRUE, dep.var.labels=c("Physical Health Status","Mental Health Status"), covariate.labels=c("Job interrupt the family", "Family interrupt the job", "Marriage status:Separated","Marriage status:Divorced", "Marriage status:Widowed","Marriage status:Married","Total people in household", "Age group:30-49 YEARS OLD","Age group:50-64 YEARS OLD", "Age group:64 YEARS OLD OR OVER", "Education", "Family income", "Race:BLACK", "Race:OTHER","Sex", "Working Status:WORKING PART TIME", "Working Status:WORKING FULL TIME" ))

============================================================================
                                             Dependent variable:            
                                 -------------------------------------------
                                 Physical Health Status Mental Health Status
                                          (1)                   (2)         
----------------------------------------------------------------------------
Job interrupt the family                 0.047                0.305***      
                                        (0.071)               (0.060)       
                                                                            
Family interrupt the job                 0.150*               0.145**       
                                        (0.077)               (0.065)       
                                                                            
Marriage status:Separated                -0.306               -0.678*       
                                        (0.466)               (0.392)       
                                                                            
Marriage status:Divorced                 -0.114                0.025        
                                        (0.212)               (0.177)       
                                                                            
Marriage status:Widowed                  0.419                 -0.014       
                                        (0.404)               (0.344)       
                                                                            
Marriage status:Married                -0.532***             -0.571***      
                                        (0.152)               (0.126)       
                                                                            
Total people in household                0.035                 0.048        
                                        (0.038)               (0.031)       
                                                                            
Age group:30-49 YEARS OLD               0.420**               -0.258*       
                                        (0.164)               (0.135)       
                                                                            
Age group:50-64 YEARS OLD               0.597***             -0.688***      
                                        (0.188)               (0.156)       
                                                                            
Age group:64 YEARS OLD OR OVER           0.162               -1.424***      
                                        (0.294)               (0.258)       
                                                                            
Education                               -0.088*              -0.145***      
                                        (0.048)               (0.041)       
                                                                            
Family income                            -0.006              -0.102***      
                                        (0.042)               (0.033)       
                                                                            
Race:BLACK                             -0.870***             -0.782***      
                                        (0.182)               (0.149)       
                                                                            
Race:OTHER                              -0.380**             -0.602***      
                                        (0.184)               (0.155)       
                                                                            
Sex                                     0.481***              0.616***      
                                        (0.117)               (0.098)       
                                                                            
Working Status:WORKING PART TIME        -0.705**               -0.160       
                                        (0.290)               (0.247)       
                                                                            
Working Status:WORKING FULL TIME       -1.189***               -0.124       
                                        (0.263)               (0.224)       
                                                                            
Constant                                 1.070*               1.867***      
                                        (0.611)               (0.500)       
                                                                            
----------------------------------------------------------------------------
Observations                             1,664                 1,656        
Log Likelihood                         -2,957.217            -3,769.208     
theta                               0.207*** (0.011)      0.297*** (0.014)  
Akaike Inf. Crit.                      5,950.434             7,574.416      
============================================================================
Note:                                            *p<0.1; **p<0.05; ***p<0.01

Prediction

physical_pred_j<-ggpredict(nb_physical_health, terms="job_interrupt_the_family")
print(physical_pred_j)
# Predicted counts of physical_health_status

job_interrupt_the_family | Predicted |      95% CI
--------------------------------------------------
                       1 |      6.63 | 3.66, 12.01
                       2 |      6.94 | 3.96, 12.16
                       3 |      7.28 | 4.16, 12.74
                       4 |      7.63 | 4.21, 13.81

Adjusted for:
*  family_interrupt_the_job =  2.11
*           marriage_status =     1
* total_people_in_household =  2.03
*                 age_group =     1
*                 education =  2.89
*             family_income = 11.65
*                      race =     1
*                       sex =  1.48
*            working_status =     1
physical_pred_f<-ggpredict(nb_physical_health, terms="family_interrupt_the_job")
print(physical_pred_f)
# Predicted counts of physical_health_status

family_interrupt_the_job | Predicted |      95% CI
--------------------------------------------------
                       1 |      5.98 | 3.37, 10.62
                       2 |      6.95 | 3.98, 12.11
                       3 |      8.07 | 4.53, 14.38
                       4 |      9.38 | 4.97, 17.69

Adjusted for:
*  job_interrupt_the_family =  2.36
*           marriage_status =     1
* total_people_in_household =  2.03
*                 age_group =     1
*                 education =  2.89
*             family_income = 11.65
*                      race =     1
*                       sex =  1.48
*            working_status =     1
mental_pred_j<-ggpredict(nb_mental_health, terms="job_interrupt_the_family")
print(mental_pred_j)
# Predicted counts of mental_health_status

job_interrupt_the_family | Predicted |      95% CI
--------------------------------------------------
                       1 |      6.53 | 3.95, 10.78
                       2 |      8.86 | 5.52, 14.22
                       3 |     12.02 | 7.48, 19.31
                       4 |     16.32 | 9.87, 26.98

Adjusted for:
*  family_interrupt_the_job =  2.10
*           marriage_status =     1
* total_people_in_household =  2.03
*                 age_group =     1
*                 education =  2.89
*             family_income = 11.66
*                      race =     1
*                       sex =  1.48
*            working_status =     1
mental_pred_f<-ggpredict(nb_mental_health, terms="family_interrupt_the_job")
print(mental_pred_f)
# Predicted counts of mental_health_status

family_interrupt_the_job | Predicted |      95% CI
--------------------------------------------------
                       1 |      8.39 | 5.16, 13.65
                       2 |      9.71 | 6.07, 15.53
                       3 |     11.23 | 6.89, 18.29
                       4 |     12.98 | 7.59, 22.21

Adjusted for:
*  job_interrupt_the_family =  2.35
*           marriage_status =     1
* total_people_in_household =  2.03
*                 age_group =     1
*                 education =  2.89
*             family_income = 11.66
*                      race =     1
*                       sex =  1.48
*            working_status =     1
custom_order <- c("Never", "Rarely"," Sometimes", "Often")

ggplot(physical_pred_j, aes(x = x, y = predicted, fill = as_factor(x))) +
  geom_bar(stat = "identity", position = "dodge", width = 0.7) +
  geom_errorbar(aes(ymin = conf.low, ymax = conf.high), width = 0.4, position = position_dodge(width = 0.7)) +
  theme_minimal(base_size = 13) +
  labs(x = "Job interrrupt family", y = "Predicted Count: Physical Health", 
       title = "Predicted Count for job interrupt family- Physical Health", 
       subtitle = "Results from Negative-Binomial Model")+
  scale_x_discrete(limits = custom_order)

ggplot(physical_pred_f, aes(x = x, y = predicted, fill = as_factor(x))) +
  geom_bar(stat = "identity", position = "dodge", width = 0.7) +
  geom_errorbar(aes(ymin = conf.low, ymax = conf.high), width = 0.4, position = position_dodge(width = 0.7)) +
  theme_minimal(base_size = 13) +
  labs(x = "Family interrrupt job", y = "Predicted Count: Physical Health", 
       title = "Predicted Count for family interrupt job- Physical Health", 
       subtitle = "Results from Negative-Binomial Model")+
  scale_x_discrete(limits = custom_order)

ggplot(mental_pred_j, aes(x = x, y = predicted, fill = as_factor(x))) +
  geom_bar(stat = "identity", position = "dodge", width = 0.7) +
  geom_errorbar(aes(ymin = conf.low, ymax = conf.high), width = 0.4, position = position_dodge(width = 0.7)) +
  theme_minimal(base_size = 13) +
  labs(x = "Work impact Family", y = "Predicted Count: Days of poor mental health", 
       title = "Predicted Days of Poor Mental Health by Work Impact Family", 
       subtitle = "Results from Negative-Binomial Model",
       fill = "Work impact Family")+
  scale_x_discrete(limits = custom_order)+
  theme(legend.position = "bottom" )

ggplot(mental_pred_f, aes(x = x, y = predicted, fill = as_factor(x))) +
  geom_bar(stat = "identity", position = "dodge", width = 0.7) +
  geom_errorbar(aes(ymin = conf.low, ymax = conf.high), width = 0.4, position = position_dodge(width = 0.7)) +
  theme_minimal(base_size = 13) +
  labs(x = "Family interrrupt job", y = "Predicted Count: Mental Health", 
       title = "Predicted Count for family interrupt job- Mental Health", 
       subtitle = "Results from Negative-Binomial Model")+
  scale_x_discrete(limits = custom_order)

data_physical_health <- data_physical_health %>% 
  mutate(job_interrupt_the_family = as.numeric(job_interrupt_the_family),
         family_interrupt_the_job = as.numeric(family_interrupt_the_job))

data_physical_health <- data_physical_health %>%
  mutate(job_interrupt_the_family = factor(job_interrupt_the_family,
                                           levels = c(1,2,3,4),
                                           labels = c("Never", "Rarely", "Sometimes", "Often"))) %>%
  mutate(family_interrupt_the_job = factor(family_interrupt_the_job,
                                           levels = c(1,2,3,4),
                                           labels = c("Never", "Rarely", "Sometimes", "Often"))) 

# Summarize the data (assuming CO2_sum is already created)
data_physical_health_sum <- data_physical_health |> group_by(job_interrupt_the_family) |>
   dplyr::summarise(mean = mean (physical_health_status, na.rm = TRUE),
            sd = sd(physical_health_status, na.rm = TRUE))
# Simulate hypothetical outcomes
set.seed(01002)  # For reproducibility
n_simulations <- 100 # Number of hypothetical outcomes

# Create a data frame with simulated outcomes for each Type
simulated_data <- data_physical_health_sum |>
  rowwise() |>
  do(data.frame(job_interrupt_the_family = .$job_interrupt_the_family,
                outcome = rnbinom(n_simulations, size = (.$mean^2 / .$sd^2), mu = .$mean),
                sim_id = 1:n_simulations))

# Create an animated plot using ggplot and gganimmated
g <- ggplot (simulated_data, aes(x = job_interrupt_the_family, y = outcome, fill = job_interrupt_the_family))+
  geom_col(data = data_physical_health_sum, aes(y = mean), alpha = 0.4, color = "black") +
  geom_point(aes(color = job_interrupt_the_family), size = 2, position = position_jitter(width = 0.1)) +
  labs (title = "Hypothetical Outcome Plot: Days of Poor Physical Health",
        subtitle = "Simulation {frame} of {nframes}",
        x = "Job Interrupts Family",
        y = "Simulated Days of Poor Physical Health") + 
  theme_minimal()+
  theme(legend.position = "none") +
  transition_manual(sim_id)

animate(g, nframes = n_simulations, fps =1)

# Summarize the data (assuming CO2_sum is already created)
data_physical_health_sum_1 <- data_physical_health |> group_by(family_interrupt_the_job) |>
   dplyr::summarise(mean = mean (physical_health_status, na.rm = TRUE),
            sd = sd(physical_health_status, na.rm = TRUE))
# Simulate hypothetical outcomes
set.seed(01002)  # For reproducibility
n_simulations <- 100 # Number of hypothetical outcomes

# Create a data frame with simulated outcomes for each Type
simulated_data_1 <- data_physical_health_sum_1 |>
  rowwise() |>
  do(data.frame(family_interrupt_the_job = .$family_interrupt_the_job,
                outcome = rnbinom(n_simulations, size = (.$mean^2 / .$sd^2), mu = .$mean),
                sim_id = 1:n_simulations))

# Create an animated plot using ggplot and gganimmated
g <- ggplot (simulated_data_1, aes(x = family_interrupt_the_job, y = outcome, fill = family_interrupt_the_job))+
  geom_col(data = data_physical_health_sum_1, aes(y = mean), alpha = 0.4, color = "black") +
  geom_point(aes(color = family_interrupt_the_job), size = 2, position = position_jitter(width = 0.1)) +
  labs (title = "Hypothetical Outcome Plot: Days of Poor Physical Health",
        subtitle = "Simulation {frame} of {nframes}",
        x = "Family Interrupts Job",
        y = "Simulated Days of Poor Physical Health") + 
  theme_minimal()+
  theme(legend.position = "none") +
  transition_manual(sim_id)

animate(g, nframes = n_simulations, fps =1)

data_mental_health <- data_mental_health %>% 
  mutate(job_interrupt_the_family = as.numeric(job_interrupt_the_family),
         family_interrupt_the_job = as.numeric(family_interrupt_the_job))

data_mental_health <- data_mental_health %>%
  mutate(job_interrupt_the_family = factor(job_interrupt_the_family,
                                           levels = c(1,2,3,4),
                                           labels = c("Never", "Rarely", "Sometimes", "Often"))) %>%
  mutate(family_interrupt_the_job = factor(family_interrupt_the_job,
                                           levels = c(1,2,3,4),
                                           labels = c("Never", "Rarely", "Sometimes", "Often"))) 

# Summarize the data (assuming CO2_sum is already created)
data_mental_health_sum <- data_mental_health |> group_by(job_interrupt_the_family) |>
   dplyr::summarise(mean = mean (mental_health_status, na.rm = TRUE),
            sd = sd(mental_health_status, na.rm = TRUE))
# Simulate hypothetical outcomes
set.seed(01002)  # For reproducibility
n_simulations <- 100 # Number of hypothetical outcomes

# Create a data frame with simulated outcomes for each Type
simulated_data_3 <- data_mental_health_sum |>
  rowwise() |>
  do(data.frame(job_interrupt_the_family = .$job_interrupt_the_family,
                outcome = rnbinom(n_simulations, size = (.$mean^2 / .$sd^2), mu = .$mean),
                sim_id = 1:n_simulations))

# Create an animated plot using ggplot and gganimmated
g <- ggplot (simulated_data_3, aes(x = job_interrupt_the_family, y = outcome, fill = job_interrupt_the_family))+
  geom_col(data = data_mental_health_sum, aes(y = mean), alpha = 0.4, color = "black") +
  geom_point(aes(color = job_interrupt_the_family), size = 2, position = position_jitter(width = 0.1)) +
  labs (title = "Hypothetical Outcome Plot: Days of Poor Mental Health",
        subtitle = "Simulation {frame} of {nframes}",
        x = "Job Interrupts Family",
        y = "Simulated Days of Poor Mental Health") + 
  theme_minimal()+
  theme(legend.position = "none") +
  transition_manual(sim_id)

animate(g, nframes = n_simulations, fps =1)

# Summarize the data (assuming CO2_sum is already created)
data_mental_health_sum_1 <- data_mental_health |> group_by(family_interrupt_the_job) |>
   dplyr::summarise(mean = mean (mental_health_status, na.rm = TRUE),
            sd = sd(mental_health_status, na.rm = TRUE))
# Simulate hypothetical outcomes
set.seed(01002)  # For reproducibility
n_simulations <- 100 # Number of hypothetical outcomes

# Create a data frame with simulated outcomes for each Type
simulated_data_4 <- data_mental_health_sum_1 |>
  rowwise() |>
  do(data.frame(family_interrupt_the_job = .$family_interrupt_the_job,
                outcome = rnbinom(n_simulations, size = (.$mean^2 / .$sd^2), mu = .$mean),
                sim_id = 1:n_simulations))

# Create an animated plot using ggplot and gganimmated
g <- ggplot (simulated_data_4, aes(x = family_interrupt_the_job, y = outcome, fill = family_interrupt_the_job))+
  geom_col(data = data_mental_health_sum_1, aes(y = mean), alpha = 0.4, color = "black") +
  geom_point(aes(color = family_interrupt_the_job), size = 2, position = position_jitter(width = 0.1)) +
  labs (title = "Hypothetical Outcome Plot: Days of Poor Mental Health",
        subtitle = "Simulation {frame} of {nframes}",
        x = "Family Interrupts Job",
        y = "Simulated Days of Poor Mental Health") + 
  theme_minimal()+
  theme(legend.position = "none") +
  transition_manual(sim_id)

animate(g, nframes = n_simulations, fps =1)