Final Project

Author

Jingyi Yang

Prepare

library(haven)
library(dplyr)


Attaching package: 'dplyr'

The following objects are masked from 'package:stats':

    filter, lag

The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union

library(tidyverse)

── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ forcats   1.0.0     ✔ readr     2.1.5
✔ ggplot2   3.5.2     ✔ stringr   1.5.1
✔ lubridate 1.9.4     ✔ tibble    3.2.1
✔ purrr     1.0.4     ✔ tidyr     1.3.1

── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(semPlot)
library(lavaan)

This is lavaan 0.6-19
lavaan is FREE software! Please report any bugs.

library(psych)


Attaching package: 'psych'

The following object is masked from 'package:lavaan':

    cor2cov

The following objects are masked from 'package:ggplot2':

    %+%, alpha

library(skimr) 
library(corrplot)

corrplot 0.95 loaded

library(patchwork) #Merge GGPlots together 
library(ggplot2) #Graphing
library(jtools) #Tabular Regression Results
library(descr) #Easy Frequency Tables  
library(stats) #Imports survey data 
library(ggeffects) #Predicted Probabilities from Regressions
library(nnet) #For multinomial models
library(MASS) #For ordered models


Attaching package: 'MASS'

The following object is masked from 'package:patchwork':

    area

The following object is masked from 'package:dplyr':

    select

library(brant) #Test parallel regression assumption 
library(boot) #Create CIs for Multinomial Modeling


Attaching package: 'boot'

The following object is masked from 'package:psych':

    logit

library(cem) #Coarsened Exact Matching

Loading required package: tcltk
Loading required package: lattice

Attaching package: 'lattice'

The following object is masked from 'package:boot':

    melanoma


How to use CEM? Type vignette("cem")

library(MatchIt)  #Coarsened Exact Matching 
library(stargazer)  #Regression Output


Please cite as: 

 Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
 R package version 5.2.3. https://CRAN.R-project.org/package=stargazer

library(jtools)  #Regression Output
library(WeightIt) #Entropy Balancing
library(Hmisc) #General Modeling


Attaching package: 'Hmisc'

The following object is masked from 'package:jtools':

    %nin%

The following object is masked from 'package:psych':

    describe

The following objects are masked from 'package:dplyr':

    src, summarize

The following objects are masked from 'package:base':

    format.pval, units

library(ebal) #Entropy Balancing

##
## ebal Package: Implements Entropy Balancing.

## See http://www.stanford.edu/~jhain/ for additional information.

library(survey) #Applying EB weights

Loading required package: grid
Loading required package: Matrix

Attaching package: 'Matrix'

The following objects are masked from 'package:tidyr':

    expand, pack, unpack

Loading required package: survival

Attaching package: 'survival'

The following object is masked from 'package:boot':

    aml


Attaching package: 'survey'

The following object is masked from 'package:Hmisc':

    deff

The following object is masked from 'package:WeightIt':

    calibrate

The following object is masked from 'package:graphics':

    dotchart

library(gganimate)

Warning: package 'gganimate' was built under R version 4.5.2

Data Import

GSS2022 <- read_dta("~/研一下/DACSS 790Q/Final Project/2022_stata/2022/GSS2022.dta")
head(GSS2022)

# A tibble: 6 × 1,185
  year         id wrkstat hrs1        hrs2        evwork      wrkslf  occ10     
  <dbl+lbl> <dbl> <dbl+l> <dbl+lbl>   <dbl+lbl>   <dbl+lbl>   <dbl+l> <dbl+lbl> 
1 2022          1 1 [wor…    40       NA(i) [iap] NA(i) [iap] 2 [som…  430 [man…
2 2022          2 5 [ret… NA(i) [iap] NA(i) [iap]     1 [yes] 2 [som…   50 [mar…
3 2022          3 1 [wor…    52       NA(i) [iap] NA(i) [iap] 2 [som… 4610 [per…
4 2022          4 3 [wit… NA(i) [iap]    25       NA(i) [iap] 2 [som… 4120 [foo…
5 2022          5 8 [oth… NA(i) [iap] NA(i) [iap]     1 [yes] 2 [som… 7330 [ind…
6 2022          6 1 [wor…    50       NA(i) [iap] NA(i) [iap] 2 [som… 4610 [per…
# ℹ 1,177 more variables: prestg10 <dbl+lbl>, prestg105plus <dbl+lbl>,
#   indus10 <dbl+lbl>, marital <dbl+lbl>, martype <dbl+lbl>, divorce <dbl+lbl>,
#   widowed <dbl+lbl>, spwrksta <dbl+lbl>, sphrs1 <dbl+lbl>, sphrs2 <dbl+lbl>,
#   spevwork <dbl+lbl>, cowrksta <dbl+lbl>, coevwork <dbl+lbl>,
#   cohrs1 <dbl+lbl>, cohrs2 <dbl+lbl>, spwrkslf <dbl+lbl>, sppres80 <dbl+lbl>,
#   spocc10 <dbl+lbl>, sppres10 <dbl+lbl>, sppres105plus <dbl+lbl>,
#   spind10 <dbl+lbl>, coocc10 <dbl+lbl>, coind10 <dbl+lbl>, …

Setting data set

# Change age to age group! 
age_breaks <- c(18, 30, 50, 65, 100)

GSS2022$age_group <- cut(GSS2022$age, 
                      age_breaks, labels = c("18-29", "30-49", "50-64", "65+"), 
                      include.lowest = TRUE) 
                      
GSS2022<- GSS2022 %>% mutate(age_group= recode(age_group,"18-29"="1", "30-49"="2", "50-64"="3", "65+"="4"))

test_data <- GSS2022 %>% dplyr::select(c(wkvsfam, famvswk, physhlth, mntlhlth, marital,hompop_exp, age_group, degree, income, race, sex, wrkstat, wtssps))%>% mutate(wkvsfam= as.numeric(wkvsfam),famvswk= as.numeric(famvswk), physhlth= as.numeric(physhlth), mntlhlth= as.numeric(mntlhlth), marital= as.numeric(marital), hompop_exp= as.numeric(hompop_exp), age_group= as.numeric(age_group), degree= as.numeric(degree), income= as.numeric(income), race= as.numeric(race), sex= as.numeric(sex), wrkstat=as.numeric(wrkstat), wtssps=as.numeric(wtssps))

new_names <- c("job_interrupt_the_family", "family_interrupt_the_job", "physical_health_status", "mental_health_status", "marriage_status", "total_people_in_household", "age_group","education", "family_income", "race", "sex", "working_status", "weight" )

colnames(test_data) <- new_names #Apply new names to your data frame
skim(test_data)

Data summary
Name	test_data
Number of rows	4149
Number of columns	13
_______________________
Column type frequency:
numeric	13
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
job_interrupt_the_family	1767	0.57	2.69	0.93	1.00	2.00	3.00	3.00	4.00	▂▇▁▇▅
family_interrupt_the_job	1779	0.57	2.94	0.86	1.00	2.00	3.00	4.00	4.00	▁▆▁▇▆
physical_health_status	1844	0.56	2.67	6.11	0.00	0.00	0.00	2.00	30.00	▇▁▁▁▁
mental_health_status	1870	0.55	4.53	7.93	0.00	0.00	0.00	5.00	30.00	▇▁▁▁▁
marriage_status	16	1.00	2.77	1.74	1.00	1.00	3.00	5.00	5.00	▇▁▃▁▆
total_people_in_household	8	1.00	1.75	1.67	0.00	1.00	1.00	2.00	49.00	▇▁▁▁▁
age_group	256	0.94	2.46	1.03	1.00	2.00	2.00	3.00	4.00	▅▇▁▅▅
education	2	1.00	1.82	1.27	0.00	1.00	1.00	3.00	4.00	▂▇▂▃▂
family_income	484	0.88	11.22	2.07	1.00	12.00	12.00	12.00	12.00	▁▁▁▁▇
race	64	0.98	1.51	0.76	1.00	1.00	1.00	2.00	3.00	▇▁▂▁▂
sex	23	0.99	1.54	0.50	1.00	1.00	2.00	2.00	2.00	▇▁▁▁▇
working_status	9	1.00	3.07	2.34	1.00	1.00	2.00	5.00	8.00	▇▁▃▁▂
weight	605	0.85	1.00	1.26	0.07	0.34	0.58	1.06	14.27	▇▁▁▁▁

# Reverse Coding
final_data <- test_data %>%
  mutate(
    job_interrupt_the_family = case_when(
      job_interrupt_the_family == 1 ~ 4,
      job_interrupt_the_family == 2 ~ 3,
      job_interrupt_the_family == 3 ~ 2,
      job_interrupt_the_family == 4 ~ 1,
      job_interrupt_the_family == "NA"~ NA_real_),
    job_interrupt_the_family = labelled(job_interrupt_the_family, 
      c(`Never` = 1, `Rarely` = 2, `Sometimes` = 3, `Often` = 4))) %>% 
  mutate(family_interrupt_the_job = as.numeric(family_interrupt_the_job),
    family_interrupt_the_job = case_when(
      family_interrupt_the_job == 1 ~ 4,
      family_interrupt_the_job == 2 ~ 3,
      family_interrupt_the_job == 3 ~ 2,
      family_interrupt_the_job == 4 ~ 1,
      family_interrupt_the_job == "NA"~ NA_real_),
    family_interrupt_the_job = labelled(family_interrupt_the_job, 
      c(`Never` = 1, `Rarely` = 2, `Sometimes` = 3, `Often` = 4))) %>%
  mutate(marriage_status= case_when(
    marriage_status== 1 ~ 5, 
    marriage_status== 2 ~ 4,
    marriage_status== 3 ~ 3,
    marriage_status== 4 ~ 2,
    marriage_status== 5 ~ 1,
    marriage_status== "NA"~ NA_real_),
    marriage_status=labelled(marriage_status, c(`Never married`=1, `Separated`=2, `Divorced`=3, `Widowed`=4, `Married`=5))) %>%
  mutate(age_group= case_when(
    age_group== 1 ~ 1,
    age_group== 2 ~ 2, 
    age_group== 3 ~ 3,
    age_group== 4 ~ 4,
    age_group== "NA"~ NA_real_),
    age_group=labelled(age_group, c(`18-29 YEARS OLD`=1, `30-49 YEARS OLD`=2, `50-64 YEARS OLD`=3, `64 YEARS OLD OR OVER`=4)))%>% 
  mutate(education= case_when(
    education== 0 ~ 1,
    education== 1 ~ 2,
    education== 2 ~ 3, 
    education== 3 ~ 4, 
    education== 4 ~ 5,
    education== "NA"~ NA_real_),
    education= labelled(education, c(`Less THAN HIGH SCHOOL`= 1, `HIGH SCHOOL`= 2, `ASSOCIATE/JUNIOR COLLEGE`=3, `BACHELORS`=4, `GRADUATE`= 5)))%>%
  mutate(family_income= case_when(
    family_income== 1 ~ 1, 
    family_income== 2 ~ 2, 
    family_income== 3 ~ 3, 
    family_income== 4 ~ 4, 
    family_income== 5 ~ 5, 
    family_income== 6 ~ 6, 
    family_income== 7 ~ 7, 
    family_income== 8 ~ 8, 
    family_income== 9 ~ 9,
    family_income== 10 ~ 10, 
    family_income== 11 ~ 11, 
    family_income== 12 ~ 12,
    family_income== "NA"~ NA_real_),
    family_income= labelled(family_income, c(`UNDER$1,000`= 1,`$1,000 TO $2,999`= 2, `$3,000 TO $3,999`=3, `$4,000 TO $4,999`=4, `$5,000 TOlibra $5,999`= 5, `$6,000 TO $6,999`=6, `$7,000 TO $7,999`=7, `$8,000 TO $9,999`=8, `$10,000 TO $14,999`= 9, `$15,000 TO $19,999`= 10,`$20,000 TO $24,999`= 11,  `$25,000 OR MORE`= 12))) %>%
  mutate(race= case_when(
    race== 1 ~ 1, 
    race== 2 ~ 2,
    race== 3 ~ 3,
    race== "NA"~ NA_real_),
  race= labelled(race, c(`WHITE`= 1, `BLACK`= 2, `OTHER`=3))) %>%
  mutate(sex= case_when(
    sex== 1 ~ 1, 
    sex== 2 ~ 2,
    sex== "NA"~ NA_real_),
    sex= labelled(sex, c(`MALE`=1, `FEMALE`=2)))%>%
  mutate(total_people_in_household = case_when(
    total_people_in_household == 0 ~ 0,
    total_people_in_household == 1 ~ 1,
    total_people_in_household == 2 ~ 2,
    total_people_in_household == 3 ~ 3,
    total_people_in_household == 4 ~ 4,
    total_people_in_household == 5 ~ 5,
    total_people_in_household == 6 ~ 6,
    total_people_in_household == 7 ~ 7,
    total_people_in_household >= 8 ~ 8,
    total_people_in_household == "NA"~ NA_real_),
    total_people_in_household = labelled(total_people_in_household, 
      c(`NONE`=0, `1`=1, `2`=2, `3`=3, `4`=4, `5`=5, `6`=6, `7`=7, `8+`=8)))%>%
  mutate(working_status = case_when(
    working_status == 1 ~ 3,
    working_status == 2 ~ 2,
    working_status == 3 ~ 1,
    working_status == 4 ~ 1,
    working_status == 5 ~ 1,
    working_status == 6 ~ 1,
    working_status == 7 ~ 1,
    working_status == 8 ~ 1,
    working_status == "NA"~ NA_real_),
    working_status = labelled(working_status,
      c(`WORKING FULL TIME`=3, `WORKING PART TIME`=2,`OTHER`=1)))
head(final_data)

# A tibble: 6 × 13
  job_interrupt_the_family family_interrupt_the_job physical_health_status
  <dbl+lbl>                <dbl+lbl>                                 <dbl>
1  3 [Sometimes]            2 [Rarely]                                  30
2 NA                       NA                                           NA
3  2 [Rarely]               2 [Rarely]                                   0
4  4 [Often]                1 [Never]                                    4
5 NA                       NA                                           NA
6  4 [Often]                3 [Sometimes]                                0
# ℹ 10 more variables: mental_health_status <dbl>, marriage_status <dbl+lbl>,
#   total_people_in_household <dbl+lbl>, age_group <dbl+lbl>,
#   education <dbl+lbl>, family_income <dbl+lbl>, race <dbl+lbl>,
#   sex <dbl+lbl>, working_status <dbl+lbl>, weight <dbl>

str(final_data)

tibble [4,149 × 13] (S3: tbl_df/tbl/data.frame)
 $ job_interrupt_the_family : dbl+lbl [1:4149]  3, NA,  2,  4, NA,  4,  4,  2,  3, NA, NA,  1,  2,  ...
   ..@ labels: Named num [1:4] 1 2 3 4
   .. ..- attr(*, "names")= chr [1:4] "Never" "Rarely" "Sometimes" "Often"
 $ family_interrupt_the_job : dbl+lbl [1:4149]  2, NA,  2,  1, NA,  3,  2,  2,  4, NA, NA,  2,  2,  ...
   ..@ labels: Named num [1:4] 1 2 3 4
   .. ..- attr(*, "names")= chr [1:4] "Never" "Rarely" "Sometimes" "Often"
 $ physical_health_status   : num [1:4149] 30 NA 0 4 NA 0 2 0 5 NA ...
 $ mental_health_status     : num [1:4149] 15 NA 0 10 NA NA 14 0 5 NA ...
 $ marriage_status          : dbl+lbl [1:4149] 3, 5, 3, 1, 1, 1, 1, 5, 1, 1, 4, 1, 5, 1, 1, 3, 1, 5,...
   ..@ labels: Named num [1:5] 1 2 3 4 5
   .. ..- attr(*, "names")= chr [1:5] "Never married" "Separated" "Divorced" "Widowed" ...
 $ total_people_in_household: dbl+lbl [1:4149] 1, 1, 3, 1, 3, 1, 3, 2, 2, 1, 2, 1, 1, 1, 3, 1, 1, 3,...
   ..@ labels: Named num [1:9] 0 1 2 3 4 5 6 7 8
   .. ..- attr(*, "names")= chr [1:9] "NONE" "1" "2" "3" ...
 $ age_group                : dbl+lbl [1:4149]  4,  4,  3,  1,  3,  1,  1,  2,  2,  4,  3,  1,  2,  ...
   ..@ labels: Named num [1:4] 1 2 3 4
   .. ..- attr(*, "names")= chr [1:4] "18-29 YEARS OLD" "30-49 YEARS OLD" "50-64 YEARS OLD" "64 YEARS OLD OR OVER"
 $ education                : dbl+lbl [1:4149] 4, 5, 2, 4, 2, 2, 2, 5, 2, 2, 2, 4, 2, 2, 2, 2, 2, 4,...
   ..@ labels: Named num [1:5] 1 2 3 4 5
   .. ..- attr(*, "names")= chr [1:5] "Less THAN HIGH SCHOOL" "HIGH SCHOOL" "ASSOCIATE/JUNIOR COLLEGE" "BACHELORS" ...
 $ family_income            : dbl+lbl [1:4149] 12, NA, 12, 12, 12, 12, 12, 12, 11,  9, 12, 12, 12,  ...
   ..@ labels: Named num [1:12] 1 2 3 4 5 6 7 8 9 10 ...
   .. ..- attr(*, "names")= chr [1:12] "UNDER$1,000" "$1,000 TO $2,999" "$3,000 TO $3,999" "$4,000 TO $4,999" ...
 $ race                     : dbl+lbl [1:4149]  1,  1,  1,  1,  1,  1,  3,  1,  1, NA,  1,  1,  1,  ...
   ..@ labels: Named num [1:3] 1 2 3
   .. ..- attr(*, "names")= chr [1:3] "WHITE" "BLACK" "OTHER"
 $ sex                      : dbl+lbl [1:4149]  2,  1,  2,  2,  1,  1,  2,  1,  2,  2,  2,  2,  2,  ...
   ..@ labels: Named num [1:2] 1 2
   .. ..- attr(*, "names")= chr [1:2] "MALE" "FEMALE"
 $ working_status           : dbl+lbl [1:4149] 3, 1, 3, 1, 1, 3, 2, 3, 3, 1, 1, 2, 3, 2, 1, 1, 1, 2,...
   ..@ labels: Named num [1:3] 3 2 1
   .. ..- attr(*, "names")= chr [1:3] "WORKING FULL TIME" "WORKING PART TIME" "OTHER"
 $ weight                   : num [1:4149] 0.231 0.577 1.014 0.902 1.084 ...

skim(final_data)

Data summary
Name	final_data
Number of rows	4149
Number of columns	13
_______________________
Column type frequency:
numeric	13
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
job_interrupt_the_family	1767	0.57	2.31	0.93	1.00	2.00	2.00	3.00	4.00	▅▇▁▇▂
family_interrupt_the_job	1779	0.57	2.06	0.86	1.00	1.00	2.00	3.00	4.00	▆▇▁▆▁
physical_health_status	1844	0.56	2.67	6.11	0.00	0.00	0.00	2.00	30.00	▇▁▁▁▁
mental_health_status	1870	0.55	4.53	7.93	0.00	0.00	0.00	5.00	30.00	▇▁▁▁▁
marriage_status	16	1.00	3.23	1.74	1.00	1.00	3.00	5.00	5.00	▆▁▃▁▇
total_people_in_household	8	1.00	1.72	1.33	0.00	1.00	1.00	2.00	8.00	▇▂▁▁▁
age_group	256	0.94	2.46	1.03	1.00	2.00	2.00	3.00	4.00	▅▇▁▅▅
education	2	1.00	2.82	1.27	1.00	2.00	2.00	4.00	5.00	▂▇▂▃▂
family_income	484	0.88	11.22	2.07	1.00	12.00	12.00	12.00	12.00	▁▁▁▁▇
race	64	0.98	1.51	0.76	1.00	1.00	1.00	2.00	3.00	▇▁▂▁▂
sex	23	0.99	1.54	0.50	1.00	1.00	2.00	2.00	2.00	▇▁▁▁▇
working_status	9	1.00	2.02	0.95	1.00	1.00	2.00	3.00	3.00	▇▁▂▁▇
weight	605	0.85	1.00	1.26	0.07	0.34	0.58	1.06	14.27	▇▁▁▁▁

summary_df <- summarytools::dfSummary(final_data,
                                     varnumbers=FALSE,
                                     plain.ascii=FALSE,
                                     style="grid",
                                     graph.col = TRUE,
                                     valid.col=FALSE)

# Print the summary table and suppress warnings
print(summary_df,
      method="render",
      table.classes="table-condensed")

Data Frame Summary

final_data

Dimensions: 4149 x 13
Duplicates: 132

job_interrupt_the_family [haven_labelled, vctrs_vctr, double]

1. [1] Never
2. [2] Rarely
3. [3] Sometimes
4. [4] Often

525	(	22.0%	)
826	(	34.7%	)
789	(	33.1%	)
242	(	10.2%	)

1767 (42.6%)

family_interrupt_the_job [haven_labelled, vctrs_vctr, double]

1. [1] Never
2. [2] Rarely
3. [3] Sometimes
4. [4] Often

704	(	29.7%	)
926	(	39.1%	)
632	(	26.7%	)
108	(	4.6%	)

1779 (42.9%)

physical_health_status [numeric]

Mean (sd) : 2.7 (6.1)
min ≤ med ≤ max:
0 ≤ 0 ≤ 30
IQR (CV) : 2 (2.3)

24 distinct values

1844 (44.4%)

mental_health_status [numeric]

Mean (sd) : 4.5 (7.9)
min ≤ med ≤ max:
0 ≤ 0 ≤ 30
IQR (CV) : 5 (1.8)

29 distinct values

1870 (45.1%)

marriage_status [haven_labelled, vctrs_vctr, double]

1. [1] Never married
2. [2] Separated
3. [3] Divorced
4. [4] Widowed
5. [5] Married

1333	(	32.3%	)
121	(	2.9%	)
669	(	16.2%	)
273	(	6.6%	)
1737	(	42.0%	)

16 (0.4%)

total_people_in_household [haven_labelled, vctrs_vctr, double]

1. [0] NONE
2. [1] 1
3. [2] 2
4. [3] 3
5. [4] 4
6. [5] 5
7. [6] 6
8. [7] 7
9. [8] 8+

40	(	1.0%	)
2727	(	65.9%	)
580	(	14.0%	)
314	(	7.6%	)
270	(	6.5%	)
108	(	2.6%	)
52	(	1.3%	)
24	(	0.6%	)
26	(	0.6%	)

8 (0.2%)

age_group [haven_labelled, vctrs_vctr, double]

1. [1] 18-29 YEARS OLD
2. [2] 30-49 YEARS OLD
3. [3] 50-64 YEARS OLD
4. [4] 64 YEARS OLD OR OVER

750	(	19.3%	)
1408	(	36.2%	)
924	(	23.7%	)
811	(	20.8%	)

256 (6.2%)

education [haven_labelled, vctrs_vctr, double]

1. [1] Less THAN HIGH SCHOOL
2. [2] HIGH SCHOOL
3. [3] ASSOCIATE/JUNIOR COLL
4. [4] BACHELORS
5. [5] GRADUATE

417	(	10.1%	)
1919	(	46.3%	)
367	(	8.8%	)
866	(	20.9%	)
578	(	13.9%	)

2 (0.0%)

family_income [haven_labelled, vctrs_vctr, double]

1. [1] UNDER$1,000
2. [2] $1,000 TO $2,999
3. [3] $3,000 TO $3,999
4. [4] $4,000 TO $4,999
5. [5] $5,000 TOlibra $5,999
6. [6] $6,000 TO $6,999
7. [7] $7,000 TO $7,999
8. [8] $8,000 TO $9,999
9. [9] $10,000 TO $14,999
10. [10] $15,000 TO $19,999
[ 2 others ]

53	(	1.4%	)
37	(	1.0%	)
19	(	0.5%	)
14	(	0.4%	)
20	(	0.5%	)
15	(	0.4%	)
18	(	0.5%	)
46	(	1.3%	)
215	(	5.9%	)
126	(	3.4%	)
3102	(	84.6%	)

484 (11.7%)

race [haven_labelled, vctrs_vctr, double]

1. [1] WHITE
2. [2] BLACK
3. [3] OTHER

2651	(	64.9%	)
775	(	19.0%	)
659	(	16.1%	)

64 (1.5%)

sex [haven_labelled, vctrs_vctr, double]

1. [1] MALE
2. [2] FEMALE

1910	(	46.3%	)
2216	(	53.7%	)

23 (0.6%)

working_status [haven_labelled, vctrs_vctr, double]

1. [3] WORKING FULL TIME
2. [2] WORKING PART TIME
3. [1] OTHER

1904	(	46.0%	)
423	(	10.2%	)
1813	(	43.8%	)

9 (0.2%)

weight [numeric]

Mean (sd) : 1 (1.3)
min ≤ med ≤ max:
0.1 ≤ 0.6 ≤ 14.3
IQR (CV) : 0.7 (1.3)

2269 distinct values

605 (14.6%)

Generated by summarytools 1.1.4 (R version 4.5.1)
2025-11-20

Separate two scales

data_physical_health<-final_data%>% dplyr::select(c(job_interrupt_the_family, family_interrupt_the_job, physical_health_status, marriage_status, total_people_in_household,  age_group,education, family_income, race, sex, working_status, weight)) %>% mutate(marriage_status= as.factor(marriage_status))%>% mutate(race= as.factor(race))%>% mutate(working_status= as.factor(working_status))


head(data_physical_health)

# A tibble: 6 × 12
  job_interrupt_the_family family_interrupt_the_job physical_health_status
  <dbl+lbl>                <dbl+lbl>                                 <dbl>
1  3 [Sometimes]            2 [Rarely]                                  30
2 NA                       NA                                           NA
3  2 [Rarely]               2 [Rarely]                                   0
4  4 [Often]                1 [Never]                                    4
5 NA                       NA                                           NA
6  4 [Often]                3 [Sometimes]                                0
# ℹ 9 more variables: marriage_status <fct>,
#   total_people_in_household <dbl+lbl>, age_group <dbl+lbl>,
#   education <dbl+lbl>, family_income <dbl+lbl>, race <fct>, sex <dbl+lbl>,
#   working_status <fct>, weight <dbl>

data_mental_health <- final_data%>% dplyr::select(c(job_interrupt_the_family, family_interrupt_the_job, mental_health_status,marriage_status, total_people_in_household,  age_group,education, family_income, race, sex, working_status, weight))%>% mutate(marriage_status= as.factor(marriage_status)) %>% mutate(race= as.factor(race)) %>% mutate(working_status= as.factor(working_status))

head(data_mental_health)

# A tibble: 6 × 12
  job_interrupt_the_family family_interrupt_the_job mental_health_status
  <dbl+lbl>                <dbl+lbl>                               <dbl>
1  3 [Sometimes]            2 [Rarely]                                15
2 NA                       NA                                         NA
3  2 [Rarely]               2 [Rarely]                                 0
4  4 [Often]                1 [Never]                                 10
5 NA                       NA                                         NA
6  4 [Often]                3 [Sometimes]                             NA
# ℹ 9 more variables: marriage_status <fct>,
#   total_people_in_household <dbl+lbl>, age_group <dbl+lbl>,
#   education <dbl+lbl>, family_income <dbl+lbl>, race <fct>, sex <dbl+lbl>,
#   working_status <fct>, weight <dbl>

Count Dependent Variable Tutorials

Negative-Binomial Model

nb_physical_health<- glm.nb(physical_health_status~job_interrupt_the_family+family_interrupt_the_job+ marriage_status+ total_people_in_household+ factor(age_group)+education+family_income+race+sex+working_status, data=data_physical_health, weights = data_physical_health$weight) 

summary(nb_physical_health)


Call:
glm.nb(formula = physical_health_status ~ job_interrupt_the_family + 
    family_interrupt_the_job + marriage_status + total_people_in_household + 
    factor(age_group) + education + family_income + race + sex + 
    working_status, data = data_physical_health, weights = data_physical_health$weight, 
    init.theta = 0.2066739091, link = log)

Coefficients:
                           Estimate Std. Error z value Pr(>|z|)    
(Intercept)                1.070116   0.611382   1.750 0.080063 .  
job_interrupt_the_family   0.046956   0.071397   0.658 0.510746    
family_interrupt_the_job   0.150010   0.076790   1.954 0.050758 .  
marriage_status2          -0.305971   0.466022  -0.657 0.511464    
marriage_status3          -0.114493   0.211822  -0.541 0.588841    
marriage_status4           0.419432   0.403927   1.038 0.299090    
marriage_status5          -0.532134   0.151580  -3.511 0.000447 ***
total_people_in_household  0.035484   0.037679   0.942 0.346327    
factor(age_group)2         0.420183   0.163890   2.564 0.010353 *  
factor(age_group)3         0.597448   0.188429   3.171 0.001521 ** 
factor(age_group)4         0.161875   0.293786   0.551 0.581636    
education                 -0.088090   0.048420  -1.819 0.068866 .  
family_income             -0.006117   0.041862  -0.146 0.883831    
race2                     -0.869784   0.182331  -4.770 1.84e-06 ***
race3                     -0.380102   0.184256  -2.063 0.039122 *  
sex                        0.480984   0.116805   4.118 3.82e-05 ***
working_status2           -0.704522   0.289847  -2.431 0.015071 *  
working_status3           -1.188900   0.263378  -4.514 6.36e-06 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for Negative Binomial(0.2067) family taken to be 1)

    Null deviance: 1431.4  on 1663  degrees of freedom
Residual deviance: 1322.8  on 1646  degrees of freedom
  (2485 observations deleted due to missingness)
AIC: 5950.4

Number of Fisher Scoring iterations: 1

              Theta:  0.2067 
          Std. Err.:  0.0108 

 2 x log-likelihood:  -5912.4340

nb_mental_health<- glm.nb(mental_health_status~job_interrupt_the_family+family_interrupt_the_job+ marriage_status+ total_people_in_household+ factor(age_group)+education+family_income+race+sex+working_status, data=data_mental_health, weights = data_mental_health$weight) 

summary(nb_mental_health)


Call:
glm.nb(formula = mental_health_status ~ job_interrupt_the_family + 
    family_interrupt_the_job + marriage_status + total_people_in_household + 
    factor(age_group) + education + family_income + race + sex + 
    working_status, data = data_mental_health, weights = data_mental_health$weight, 
    init.theta = 0.2967852665, link = log)

Coefficients:
                          Estimate Std. Error z value Pr(>|z|)    
(Intercept)                1.86665    0.49971   3.735 0.000187 ***
job_interrupt_the_family   0.30549    0.06046   5.053 4.35e-07 ***
family_interrupt_the_job   0.14536    0.06508   2.234 0.025503 *  
marriage_status2          -0.67765    0.39241  -1.727 0.084186 .  
marriage_status3           0.02496    0.17657   0.141 0.887600    
marriage_status4          -0.01392    0.34390  -0.040 0.967709    
marriage_status5          -0.57128    0.12572  -4.544 5.52e-06 ***
total_people_in_household  0.04766    0.03118   1.528 0.126400    
factor(age_group)2        -0.25762    0.13466  -1.913 0.055720 .  
factor(age_group)3        -0.68757    0.15633  -4.398 1.09e-05 ***
factor(age_group)4        -1.42375    0.25834  -5.511 3.57e-08 ***
education                 -0.14462    0.04057  -3.565 0.000364 ***
family_income             -0.10208    0.03339  -3.057 0.002233 ** 
race2                     -0.78152    0.14905  -5.243 1.58e-07 ***
race3                     -0.60170    0.15473  -3.889 0.000101 ***
sex                        0.61570    0.09770   6.302 2.94e-10 ***
working_status2           -0.15969    0.24722  -0.646 0.518323    
working_status3           -0.12411    0.22416  -0.554 0.579828    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for Negative Binomial(0.2968) family taken to be 1)

    Null deviance: 1783.3  on 1655  degrees of freedom
Residual deviance: 1570.7  on 1638  degrees of freedom
  (2493 observations deleted due to missingness)
AIC: 7574.4

Number of Fisher Scoring iterations: 1

              Theta:  0.2968 
          Std. Err.:  0.0141 

 2 x log-likelihood:  -7536.4160

Basic Poisson Model

pois_physical_health <- glm(physical_health_status~job_interrupt_the_family+family_interrupt_the_job+ marriage_status+ total_people_in_household+age_group+education+family_income+race+sex+working_status, data=data_physical_health, family = poisson, weights = data_physical_health$weight)
summary(pois_physical_health)


Call:
glm(formula = physical_health_status ~ job_interrupt_the_family + 
    family_interrupt_the_job + marriage_status + total_people_in_household + 
    age_group + education + family_income + race + sex + working_status, 
    family = poisson, data = data_physical_health, weights = data_physical_health$weight)

Coefficients:
                           Estimate Std. Error z value Pr(>|z|)    
(Intercept)                1.287148   0.153912   8.363  < 2e-16 ***
job_interrupt_the_family   0.034496   0.019110   1.805  0.07105 .  
family_interrupt_the_job   0.136809   0.020105   6.805 1.01e-11 ***
marriage_status2          -0.429247   0.147328  -2.914  0.00357 ** 
marriage_status3          -0.027718   0.051398  -0.539  0.58969    
marriage_status4           0.631936   0.077712   8.132 4.23e-16 ***
marriage_status5          -0.345389   0.040021  -8.630  < 2e-16 ***
total_people_in_household  0.053797   0.009884   5.443 5.24e-08 ***
age_group                  0.100197   0.020701   4.840 1.30e-06 ***
education                 -0.112181   0.013355  -8.400  < 2e-16 ***
family_income             -0.006640   0.010666  -0.623  0.53359    
race2                     -0.785702   0.061160 -12.847  < 2e-16 ***
race3                     -0.380280   0.055435  -6.860 6.89e-12 ***
sex                        0.368993   0.031925  11.558  < 2e-16 ***
working_status2           -0.720800   0.054434 -13.242  < 2e-16 ***
working_status3           -1.068241   0.047625 -22.430  < 2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for poisson family taken to be 1)

    Null deviance: 13049  on 1663  degrees of freedom
Residual deviance: 11681  on 1648  degrees of freedom
  (2485 observations deleted due to missingness)
AIC: 13900

Number of Fisher Scoring iterations: 6

pois_mental_health <- glm(mental_health_status~job_interrupt_the_family+family_interrupt_the_job+ marriage_status+ total_people_in_household+age_group+education+family_income+race+sex+working_status, data=data_mental_health, family = poisson, weights = data_mental_health$weight)
summary(pois_mental_health)


Call:
glm(formula = mental_health_status ~ job_interrupt_the_family + 
    family_interrupt_the_job + marriage_status + total_people_in_household + 
    age_group + education + family_income + race + sex + working_status, 
    family = poisson, data = data_mental_health, weights = data_mental_health$weight)

Coefficients:
                          Estimate Std. Error z value Pr(>|z|)    
(Intercept)                1.86773    0.10756  17.364  < 2e-16 ***
job_interrupt_the_family   0.29156    0.01455  20.034  < 2e-16 ***
family_interrupt_the_job   0.10048    0.01545   6.503 7.87e-11 ***
marriage_status2          -0.52643    0.11076  -4.753 2.01e-06 ***
marriage_status3           0.08600    0.04142   2.076   0.0379 *  
marriage_status4           0.55237    0.07107   7.772 7.70e-15 ***
marriage_status5          -0.26321    0.02902  -9.071  < 2e-16 ***
total_people_in_household  0.01614    0.00731   2.208   0.0272 *  
age_group                 -0.36941    0.01738 -21.260  < 2e-16 ***
education                 -0.12536    0.01030 -12.168  < 2e-16 ***
family_income             -0.07043    0.00620 -11.360  < 2e-16 ***
race2                     -0.52015    0.03946 -13.183  < 2e-16 ***
race3                     -0.42281    0.04238  -9.978  < 2e-16 ***
sex                        0.56890    0.02454  23.186  < 2e-16 ***
working_status2           -0.03299    0.05354  -0.616   0.5377    
working_status3           -0.09345    0.04815  -1.941   0.0523 .  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for poisson family taken to be 1)

    Null deviance: 16817  on 1655  degrees of freedom
Residual deviance: 13943  on 1640  degrees of freedom
  (2493 observations deleted due to missingness)
AIC: 17023

Number of Fisher Scoring iterations: 6

Quasi-Poisson Model

quasi_physical_health <-  glm(physical_health_status~job_interrupt_the_family+family_interrupt_the_job+ marriage_status+ total_people_in_household+age_group+education+family_income+race+sex+working_status,family = quasipoisson, data=data_physical_health, weights = data_physical_health$weight)
summary(quasi_physical_health)


Call:
glm(formula = physical_health_status ~ job_interrupt_the_family + 
    family_interrupt_the_job + marriage_status + total_people_in_household + 
    age_group + education + family_income + race + sex + working_status, 
    family = quasipoisson, data = data_physical_health, weights = data_physical_health$weight)

Coefficients:
                          Estimate Std. Error t value Pr(>|t|)    
(Intercept)                1.28715    0.53099   2.424 0.015455 *  
job_interrupt_the_family   0.03450    0.06593   0.523 0.600875    
family_interrupt_the_job   0.13681    0.06936   1.972 0.048729 *  
marriage_status2          -0.42925    0.50828  -0.845 0.398505    
marriage_status3          -0.02772    0.17732  -0.156 0.875802    
marriage_status4           0.63194    0.26810   2.357 0.018537 *  
marriage_status5          -0.34539    0.13807  -2.502 0.012462 *  
total_people_in_household  0.05380    0.03410   1.578 0.114836    
age_group                  0.10020    0.07142   1.403 0.160808    
education                 -0.11218    0.04607  -2.435 0.015006 *  
family_income             -0.00664    0.03680  -0.180 0.856825    
race2                     -0.78570    0.21100  -3.724 0.000203 ***
race3                     -0.38028    0.19125  -1.988 0.046933 *  
sex                        0.36899    0.11014   3.350 0.000826 ***
working_status2           -0.72080    0.18780  -3.838 0.000129 ***
working_status3           -1.06824    0.16430  -6.502 1.05e-10 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for quasipoisson family taken to be 11.90224)

    Null deviance: 13049  on 1663  degrees of freedom
Residual deviance: 11681  on 1648  degrees of freedom
  (2485 observations deleted due to missingness)
AIC: NA

Number of Fisher Scoring iterations: 6

quasi_mental_health <- glm(mental_health_status~job_interrupt_the_family+family_interrupt_the_job+ marriage_status+ total_people_in_household+age_group+education+family_income+race+sex+working_status, family = quasipoisson, data=data_mental_health, weights = data_mental_health$weight)
summary(quasi_mental_health)


Call:
glm(formula = mental_health_status ~ job_interrupt_the_family + 
    family_interrupt_the_job + marriage_status + total_people_in_household + 
    age_group + education + family_income + race + sex + working_status, 
    family = quasipoisson, data = data_mental_health, weights = data_mental_health$weight)

Coefficients:
                          Estimate Std. Error t value Pr(>|t|)    
(Intercept)                1.86773    0.36572   5.107 3.65e-07 ***
job_interrupt_the_family   0.29156    0.04948   5.892 4.61e-09 ***
family_interrupt_the_job   0.10048    0.05254   1.913 0.055968 .  
marriage_status2          -0.52643    0.37660  -1.398 0.162350    
marriage_status3           0.08600    0.14084   0.611 0.541569    
marriage_status4           0.55237    0.24163   2.286 0.022382 *  
marriage_status5          -0.26321    0.09866  -2.668 0.007709 ** 
total_people_in_household  0.01614    0.02485   0.649 0.516177    
age_group                 -0.36941    0.05908  -6.253 5.13e-10 ***
education                 -0.12536    0.03503  -3.579 0.000355 ***
family_income             -0.07043    0.02108  -3.341 0.000853 ***
race2                     -0.52015    0.13415  -3.877 0.000110 ***
race3                     -0.42281    0.14408  -2.935 0.003386 ** 
sex                        0.56889    0.08343   6.819 1.28e-11 ***
working_status2           -0.03299    0.18205  -0.181 0.856200    
working_status3           -0.09345    0.16372  -0.571 0.568228    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for quasipoisson family taken to be 11.56031)

    Null deviance: 16817  on 1655  degrees of freedom
Residual deviance: 13943  on 1640  degrees of freedom
  (2493 observations deleted due to missingness)
AIC: NA

Number of Fisher Scoring iterations: 6

Likelihood Ratio, AIC, BIC

#Create function to compare model fit stats
glm_fit <- function(model) {
  # Calculate Likelihood Ratio
  lr <- logLik(model)
  
  # Calculate AIC
  aic <- AIC(model)
  
  # Calculate BIC
  n <- nobs(model)
  p <- length(coef(model))
  bic <- -2 * logLik(model) + p * log(n)
  
  # Calculate Deviance
  deviance <- summary(model)$deviance
  
  # Return the metrics as a list
  metrics <- data.frame(Likelihood_Ratio = lr, AIC = aic, BIC = bic, Deviance = deviance, Coefficients = p)
  return(metrics)
}

glm_fit(pois_physical_health)

  Likelihood_Ratio      AIC      BIC Deviance Coefficients
1        -6934.083 13900.17 13986.84 11680.73           16

glm_fit(pois_mental_health)

  Likelihood_Ratio      AIC      BIC Deviance Coefficients
1         -8495.61 17023.22 17109.81 13943.12           16

glm_fit(quasi_physical_health)

  Likelihood_Ratio AIC BIC Deviance Coefficients
1               NA  NA  NA 11680.73           16

glm_fit(quasi_mental_health)

  Likelihood_Ratio AIC BIC Deviance Coefficients
1               NA  NA  NA 13943.12           16

glm_fit(nb_physical_health)

  Likelihood_Ratio      AIC      BIC Deviance Coefficients
1        -2956.217 5950.434 6045.939 1322.828           18

glm_fit(nb_mental_health)

  Likelihood_Ratio      AIC      BIC Deviance Coefficients
1        -3768.208 7574.416 7669.834  1570.67           18

Overall Table

stargazer(nb_physical_health, nb_mental_health, type="text",align=TRUE, dep.var.labels=c("Physical Health Status","Mental Health Status"), covariate.labels=c("Job interrupt the family", "Family interrupt the job", "Marriage status:Separated","Marriage status:Divorced", "Marriage status:Widowed","Marriage status:Married","Total people in household", "Age group：30-49 YEARS OLD","Age group：50-64 YEARS OLD", "Age group：64 YEARS OLD OR OVER", "Education", "Family income", "Race:BLACK", "Race:OTHER","Sex", "Working Status:WORKING PART TIME", "Working Status:WORKING FULL TIME" ))


============================================================================
                                             Dependent variable:            
                                 -------------------------------------------
                                 Physical Health Status Mental Health Status
                                          (1)                   (2)         
----------------------------------------------------------------------------
Job interrupt the family                 0.047                0.305***      
                                        (0.071)               (0.060)       
                                                                            
Family interrupt the job                 0.150*               0.145**       
                                        (0.077)               (0.065)       
                                                                            
Marriage status:Separated                -0.306               -0.678*       
                                        (0.466)               (0.392)       
                                                                            
Marriage status:Divorced                 -0.114                0.025        
                                        (0.212)               (0.177)       
                                                                            
Marriage status:Widowed                  0.419                 -0.014       
                                        (0.404)               (0.344)       
                                                                            
Marriage status:Married                -0.532***             -0.571***      
                                        (0.152)               (0.126)       
                                                                            
Total people in household                0.035                 0.048        
                                        (0.038)               (0.031)       
                                                                            
Age group：30-49 YEARS OLD               0.420**               -0.258*       
                                        (0.164)               (0.135)       
                                                                            
Age group：50-64 YEARS OLD               0.597***             -0.688***      
                                        (0.188)               (0.156)       
                                                                            
Age group：64 YEARS OLD OR OVER           0.162               -1.424***      
                                        (0.294)               (0.258)       
                                                                            
Education                               -0.088*              -0.145***      
                                        (0.048)               (0.041)       
                                                                            
Family income                            -0.006              -0.102***      
                                        (0.042)               (0.033)       
                                                                            
Race:BLACK                             -0.870***             -0.782***      
                                        (0.182)               (0.149)       
                                                                            
Race:OTHER                              -0.380**             -0.602***      
                                        (0.184)               (0.155)       
                                                                            
Sex                                     0.481***              0.616***      
                                        (0.117)               (0.098)       
                                                                            
Working Status:WORKING PART TIME        -0.705**               -0.160       
                                        (0.290)               (0.247)       
                                                                            
Working Status:WORKING FULL TIME       -1.189***               -0.124       
                                        (0.263)               (0.224)       
                                                                            
Constant                                 1.070*               1.867***      
                                        (0.611)               (0.500)       
                                                                            
----------------------------------------------------------------------------
Observations                             1,664                 1,656        
Log Likelihood                         -2,957.217            -3,769.208     
theta                               0.207*** (0.011)      0.297*** (0.014)  
Akaike Inf. Crit.                      5,950.434             7,574.416      
============================================================================
Note:                                            *p<0.1; **p<0.05; ***p<0.01

Prediction

physical_pred_j<-ggpredict(nb_physical_health, terms="job_interrupt_the_family")
print(physical_pred_j)

# Predicted counts of physical_health_status

job_interrupt_the_family | Predicted |      95% CI
--------------------------------------------------
                       1 |      6.63 | 3.66, 12.01
                       2 |      6.94 | 3.96, 12.16
                       3 |      7.28 | 4.16, 12.74
                       4 |      7.63 | 4.21, 13.81

Adjusted for:
*  family_interrupt_the_job =  2.11
*           marriage_status =     1
* total_people_in_household =  2.03
*                 age_group =     1
*                 education =  2.89
*             family_income = 11.65
*                      race =     1
*                       sex =  1.48
*            working_status =     1

physical_pred_f<-ggpredict(nb_physical_health, terms="family_interrupt_the_job")
print(physical_pred_f)

# Predicted counts of physical_health_status

family_interrupt_the_job | Predicted |      95% CI
--------------------------------------------------
                       1 |      5.98 | 3.37, 10.62
                       2 |      6.95 | 3.98, 12.11
                       3 |      8.07 | 4.53, 14.38
                       4 |      9.38 | 4.97, 17.69

Adjusted for:
*  job_interrupt_the_family =  2.36
*           marriage_status =     1
* total_people_in_household =  2.03
*                 age_group =     1
*                 education =  2.89
*             family_income = 11.65
*                      race =     1
*                       sex =  1.48
*            working_status =     1

mental_pred_j<-ggpredict(nb_mental_health, terms="job_interrupt_the_family")
print(mental_pred_j)

# Predicted counts of mental_health_status

job_interrupt_the_family | Predicted |      95% CI
--------------------------------------------------
                       1 |      6.53 | 3.95, 10.78
                       2 |      8.86 | 5.52, 14.22
                       3 |     12.02 | 7.48, 19.31
                       4 |     16.32 | 9.87, 26.98

Adjusted for:
*  family_interrupt_the_job =  2.10
*           marriage_status =     1
* total_people_in_household =  2.03
*                 age_group =     1
*                 education =  2.89
*             family_income = 11.66
*                      race =     1
*                       sex =  1.48
*            working_status =     1

mental_pred_f<-ggpredict(nb_mental_health, terms="family_interrupt_the_job")
print(mental_pred_f)

# Predicted counts of mental_health_status

family_interrupt_the_job | Predicted |      95% CI
--------------------------------------------------
                       1 |      8.39 | 5.16, 13.65
                       2 |      9.71 | 6.07, 15.53
                       3 |     11.23 | 6.89, 18.29
                       4 |     12.98 | 7.59, 22.21

Adjusted for:
*  job_interrupt_the_family =  2.35
*           marriage_status =     1
* total_people_in_household =  2.03
*                 age_group =     1
*                 education =  2.89
*             family_income = 11.66
*                      race =     1
*                       sex =  1.48
*            working_status =     1

custom_order <- c("Never", "Rarely"," Sometimes", "Often")

ggplot(physical_pred_j, aes(x = x, y = predicted, fill = as_factor(x))) +
  geom_bar(stat = "identity", position = "dodge", width = 0.7) +
  geom_errorbar(aes(ymin = conf.low, ymax = conf.high), width = 0.4, position = position_dodge(width = 0.7)) +
  theme_minimal(base_size = 13) +
  labs(x = "Job interrrupt family", y = "Predicted Count: Physical Health", 
       title = "Predicted Count for job interrupt family- Physical Health", 
       subtitle = "Results from Negative-Binomial Model")+
  scale_x_discrete(limits = custom_order)

ggplot(physical_pred_f, aes(x = x, y = predicted, fill = as_factor(x))) +
  geom_bar(stat = "identity", position = "dodge", width = 0.7) +
  geom_errorbar(aes(ymin = conf.low, ymax = conf.high), width = 0.4, position = position_dodge(width = 0.7)) +
  theme_minimal(base_size = 13) +
  labs(x = "Family interrrupt job", y = "Predicted Count: Physical Health", 
       title = "Predicted Count for family interrupt job- Physical Health", 
       subtitle = "Results from Negative-Binomial Model")+
  scale_x_discrete(limits = custom_order)

ggplot(mental_pred_j, aes(x = x, y = predicted, fill = as_factor(x))) +
  geom_bar(stat = "identity", position = "dodge", width = 0.7) +
  geom_errorbar(aes(ymin = conf.low, ymax = conf.high), width = 0.4, position = position_dodge(width = 0.7)) +
  theme_minimal(base_size = 13) +
  labs(x = "Work impact Family", y = "Predicted Count: Days of poor mental health", 
       title = "Predicted Days of Poor Mental Health by Work Impact Family", 
       subtitle = "Results from Negative-Binomial Model",
       fill = "Work impact Family")+
  scale_x_discrete(limits = custom_order)+
  theme(legend.position = "bottom" )

ggplot(mental_pred_f, aes(x = x, y = predicted, fill = as_factor(x))) +
  geom_bar(stat = "identity", position = "dodge", width = 0.7) +
  geom_errorbar(aes(ymin = conf.low, ymax = conf.high), width = 0.4, position = position_dodge(width = 0.7)) +
  theme_minimal(base_size = 13) +
  labs(x = "Family interrrupt job", y = "Predicted Count: Mental Health", 
       title = "Predicted Count for family interrupt job- Mental Health", 
       subtitle = "Results from Negative-Binomial Model")+
  scale_x_discrete(limits = custom_order)

data_physical_health <- data_physical_health %>% 
  mutate(job_interrupt_the_family = as.numeric(job_interrupt_the_family),
         family_interrupt_the_job = as.numeric(family_interrupt_the_job))

data_physical_health <- data_physical_health %>%
  mutate(job_interrupt_the_family = factor(job_interrupt_the_family,
                                           levels = c(1,2,3,4),
                                           labels = c("Never", "Rarely", "Sometimes", "Often"))) %>%
  mutate(family_interrupt_the_job = factor(family_interrupt_the_job,
                                           levels = c(1,2,3,4),
                                           labels = c("Never", "Rarely", "Sometimes", "Often"))) 

# Summarize the data (assuming CO2_sum is already created)
data_physical_health_sum <- data_physical_health |> group_by(job_interrupt_the_family) |>
   dplyr::summarise(mean = mean (physical_health_status, na.rm = TRUE),
            sd = sd(physical_health_status, na.rm = TRUE))
# Simulate hypothetical outcomes
set.seed(01002)  # For reproducibility
n_simulations <- 100 # Number of hypothetical outcomes

# Create a data frame with simulated outcomes for each Type
simulated_data <- data_physical_health_sum |>
  rowwise() |>
  do(data.frame(job_interrupt_the_family = .$job_interrupt_the_family,
                outcome = rnbinom(n_simulations, size = (.$mean^2 / .$sd^2), mu = .$mean),
                sim_id = 1:n_simulations))

# Create an animated plot using ggplot and gganimmated
g <- ggplot (simulated_data, aes(x = job_interrupt_the_family, y = outcome, fill = job_interrupt_the_family))+
  geom_col(data = data_physical_health_sum, aes(y = mean), alpha = 0.4, color = "black") +
  geom_point(aes(color = job_interrupt_the_family), size = 2, position = position_jitter(width = 0.1)) +
  labs (title = "Hypothetical Outcome Plot: Days of Poor Physical Health",
        subtitle = "Simulation {frame} of {nframes}",
        x = "Job Interrupts Family",
        y = "Simulated Days of Poor Physical Health") + 
  theme_minimal()+
  theme(legend.position = "none") +
  transition_manual(sim_id)

animate(g, nframes = n_simulations, fps =1)

# Summarize the data (assuming CO2_sum is already created)
data_physical_health_sum_1 <- data_physical_health |> group_by(family_interrupt_the_job) |>
   dplyr::summarise(mean = mean (physical_health_status, na.rm = TRUE),
            sd = sd(physical_health_status, na.rm = TRUE))
# Simulate hypothetical outcomes
set.seed(01002)  # For reproducibility
n_simulations <- 100 # Number of hypothetical outcomes

# Create a data frame with simulated outcomes for each Type
simulated_data_1 <- data_physical_health_sum_1 |>
  rowwise() |>
  do(data.frame(family_interrupt_the_job = .$family_interrupt_the_job,
                outcome = rnbinom(n_simulations, size = (.$mean^2 / .$sd^2), mu = .$mean),
                sim_id = 1:n_simulations))

# Create an animated plot using ggplot and gganimmated
g <- ggplot (simulated_data_1, aes(x = family_interrupt_the_job, y = outcome, fill = family_interrupt_the_job))+
  geom_col(data = data_physical_health_sum_1, aes(y = mean), alpha = 0.4, color = "black") +
  geom_point(aes(color = family_interrupt_the_job), size = 2, position = position_jitter(width = 0.1)) +
  labs (title = "Hypothetical Outcome Plot: Days of Poor Physical Health",
        subtitle = "Simulation {frame} of {nframes}",
        x = "Family Interrupts Job",
        y = "Simulated Days of Poor Physical Health") + 
  theme_minimal()+
  theme(legend.position = "none") +
  transition_manual(sim_id)

animate(g, nframes = n_simulations, fps =1)

data_mental_health <- data_mental_health %>% 
  mutate(job_interrupt_the_family = as.numeric(job_interrupt_the_family),
         family_interrupt_the_job = as.numeric(family_interrupt_the_job))

data_mental_health <- data_mental_health %>%
  mutate(job_interrupt_the_family = factor(job_interrupt_the_family,
                                           levels = c(1,2,3,4),
                                           labels = c("Never", "Rarely", "Sometimes", "Often"))) %>%
  mutate(family_interrupt_the_job = factor(family_interrupt_the_job,
                                           levels = c(1,2,3,4),
                                           labels = c("Never", "Rarely", "Sometimes", "Often"))) 

# Summarize the data (assuming CO2_sum is already created)
data_mental_health_sum <- data_mental_health |> group_by(job_interrupt_the_family) |>
   dplyr::summarise(mean = mean (mental_health_status, na.rm = TRUE),
            sd = sd(mental_health_status, na.rm = TRUE))
# Simulate hypothetical outcomes
set.seed(01002)  # For reproducibility
n_simulations <- 100 # Number of hypothetical outcomes

# Create a data frame with simulated outcomes for each Type
simulated_data_3 <- data_mental_health_sum |>
  rowwise() |>
  do(data.frame(job_interrupt_the_family = .$job_interrupt_the_family,
                outcome = rnbinom(n_simulations, size = (.$mean^2 / .$sd^2), mu = .$mean),
                sim_id = 1:n_simulations))

# Create an animated plot using ggplot and gganimmated
g <- ggplot (simulated_data_3, aes(x = job_interrupt_the_family, y = outcome, fill = job_interrupt_the_family))+
  geom_col(data = data_mental_health_sum, aes(y = mean), alpha = 0.4, color = "black") +
  geom_point(aes(color = job_interrupt_the_family), size = 2, position = position_jitter(width = 0.1)) +
  labs (title = "Hypothetical Outcome Plot: Days of Poor Mental Health",
        subtitle = "Simulation {frame} of {nframes}",
        x = "Job Interrupts Family",
        y = "Simulated Days of Poor Mental Health") + 
  theme_minimal()+
  theme(legend.position = "none") +
  transition_manual(sim_id)

animate(g, nframes = n_simulations, fps =1)

# Summarize the data (assuming CO2_sum is already created)
data_mental_health_sum_1 <- data_mental_health |> group_by(family_interrupt_the_job) |>
   dplyr::summarise(mean = mean (mental_health_status, na.rm = TRUE),
            sd = sd(mental_health_status, na.rm = TRUE))
# Simulate hypothetical outcomes
set.seed(01002)  # For reproducibility
n_simulations <- 100 # Number of hypothetical outcomes

# Create a data frame with simulated outcomes for each Type
simulated_data_4 <- data_mental_health_sum_1 |>
  rowwise() |>
  do(data.frame(family_interrupt_the_job = .$family_interrupt_the_job,
                outcome = rnbinom(n_simulations, size = (.$mean^2 / .$sd^2), mu = .$mean),
                sim_id = 1:n_simulations))

# Create an animated plot using ggplot and gganimmated
g <- ggplot (simulated_data_4, aes(x = family_interrupt_the_job, y = outcome, fill = family_interrupt_the_job))+
  geom_col(data = data_mental_health_sum_1, aes(y = mean), alpha = 0.4, color = "black") +
  geom_point(aes(color = family_interrupt_the_job), size = 2, position = position_jitter(width = 0.1)) +
  labs (title = "Hypothetical Outcome Plot: Days of Poor Mental Health",
        subtitle = "Simulation {frame} of {nframes}",
        x = "Family Interrupts Job",
        y = "Simulated Days of Poor Mental Health") + 
  theme_minimal()+
  theme(legend.position = "none") +
  transition_manual(sim_id)

animate(g, nframes = n_simulations, fps =1)