1. Load Data and clean up – CHFS 2019.
## load data
library(haven)
chfs2019_ind_202112 <- read_dta("chfs2019_ind_202112.dta")
chfs2019_hh_202112 <- read_dta("chfs2019_hh_202112.dta")
chfs2019_master_202112 <- read_dta("chfs2019_master_202112.dta")
## Individual data pick variables _ data_ind
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
data_ind <- chfs2019_ind_202112 %>% select(hhid, pline, a2001, a2003, a2005, a2011b, a2012, a2015, a2022, a2024, a2025b, a3132b, a3132d, f2001a, f6001a) ## Select relationship, gender, birth year,ethnicity, education level, party membership,Hukou, marital status, health, work, type of work, medical insurance, business_health_insurance
data_ind <- data_ind %>%
rename(relationship = a2001,gender = a2003, birth_year = a2005, ethnicity = a2011b, education = a2012, Party = a2015, Hukou = a2022, marital = a2024, health = a2025b, work = a3132b, work_type = a3132d, medical_insurance=f2001a, business_health_insurance=f6001a)## rename variables
summary(data_ind)
## hhid pline relationship gender
## Length:107008 Min. : 1.000 Min. : 1.00 Min. :1.000
## Class :character 1st Qu.: 1.000 1st Qu.: 1.00 1st Qu.:1.000
## Mode :character Median : 2.000 Median : 2.00 Median :1.000
## Mean : 4.199 Mean : 35.45 Mean :1.494
## 3rd Qu.: 4.000 3rd Qu.: 6.00 3rd Qu.:2.000
## Max. :39.000 Max. :7777.00 Max. :2.000
## NA's :7
## birth_year ethnicity education Party
## Min. :1902 Min. : 1.00 Min. :1.00 Min. :1.000
## 1st Qu.:1957 1st Qu.: 1.00 1st Qu.:2.00 1st Qu.:2.000
## Median :1972 Median : 1.00 Median :3.00 Median :2.000
## Mean :1975 Mean : 1.39 Mean :3.43 Mean :1.867
## 3rd Qu.:1993 3rd Qu.: 1.00 3rd Qu.:4.00 3rd Qu.:2.000
## Max. :2019 Max. :10.00 Max. :9.00 Max. :2.000
## NA's :51 NA's :44829 NA's :15989 NA's :29678
## Hukou marital health work
## Min. : 1.00 Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.: 1.00 1st Qu.:2.000 1st Qu.:2.000 1st Qu.:1.000
## Median : 1.00 Median :2.000 Median :2.000 Median :1.000
## Mean : 10.35 Mean :2.188 Mean :2.541 Mean :1.415
## 3rd Qu.: 2.00 3rd Qu.:2.000 3rd Qu.:3.000 3rd Qu.:2.000
## Max. :7777.00 Max. :6.000 Max. :5.000 Max. :2.000
## NA's :198 NA's :18010 NA's :57 NA's :17067
## work_type medical_insurance business_health_insurance
## Min. :1.00 Min. : 1.0 Length:107008
## 1st Qu.:1.00 1st Qu.: 2.0 Class :character
## Median :2.00 Median : 3.0 Mode :character
## Mean :3.75 Mean : 703.9
## 3rd Qu.:7.00 3rd Qu.: 3.0
## Max. :7.00 Max. :7788.0
## NA's :54523 NA's :248
## Master data pick variables _ data_master
data_master <- chfs2019_master_202112 %>% select(hhid, pline, rural, weight_hh, weight_ind, region, city_level, total_income, total_consump, total_asset, total_debt)
summary(data_master)
## hhid pline rural weight_hh
## Length:107008 Min. : 1.000 Min. :0.0000 Min. : 140
## Class :character 1st Qu.: 1.000 1st Qu.:0.0000 1st Qu.: 4637
## Mode :character Median : 2.000 Median :0.0000 Median : 8840
## Mean : 4.199 Mean :0.3797 Mean : 12477
## 3rd Qu.: 4.000 3rd Qu.:1.0000 3rd Qu.: 15983
## Max. :39.000 Max. :1.0000 Max. :122576
##
## weight_ind region city_level total_income
## Min. : 616.6 Min. :1.000 Min. :1.000 Min. :-5493190
## 1st Qu.: 4002.2 1st Qu.:1.000 1st Qu.:2.000 1st Qu.: 23616
## Median : 8664.7 Median :2.000 Median :3.000 Median : 58190
## Mean :12639.3 Mean :2.143 Mean :2.364 Mean : 92183
## 3rd Qu.:16792.4 3rd Qu.:3.000 3rd Qu.:3.000 3rd Qu.: 109454
## Max. :61153.5 Max. :4.000 Max. :3.000 Max. :12122418
## NA's :254
## total_consump total_asset total_debt
## Min. : 1188 Min. :0.000e+00 Min. : 0
## 1st Qu.: 35380 1st Qu.:1.577e+05 1st Qu.: 0
## Median : 61120 Median :4.350e+05 Median : 0
## Mean : 101760 Mean :1.187e+06 Mean : 84081
## 3rd Qu.: 102695 3rd Qu.:1.125e+06 3rd Qu.: 50000
## Max. :169546352 Max. :2.100e+09 Max. :40658344
## NA's :16794
## Household data pick variables _ data_hh
data_hh <- chfs2019_hh_202112 %>% select(hhid, c1001, e4001)
data_hh <- data_hh %>%
rename( house_nature = c1001, medical_liabilities = e4001 )
summary(data_hh)
## hhid house_nature medical_liabilities
## Length:34643 Min. :1.000 Min. :1.000
## Class :character 1st Qu.:1.000 1st Qu.:2.000
## Mode :character Median :1.000 Median :2.000
## Mean :1.214 Mean :1.954
## 3rd Qu.:1.000 3rd Qu.:2.000
## Max. :3.000 Max. :2.000
## NA's :25 NA's :29
## merge data
data_master$hhid <- as.character(data_master$hhid)
data_master$pline <- as.character(data_master$pline)
data_master$id = paste(data_master$hhid, data_master$pline, sep = "_")## generate a new unique variable to as identification
data_ind$hhid <- as.character(data_ind$hhid)
data_ind$pline <- as.character(data_ind$pline)
data_ind$id = paste(data_ind$hhid, data_ind$pline, sep = "_") ## generate a new unique variable to as identification
nrow(chfs2019_master_202112$pline)
## NULL
nrow(chfs2019_ind_202112$pline)
## NULL
nrow(chfs2019_hh_202112$hhid)
## NULL
#merging individual to master, M1
data_master_ind <- merge(data_ind, data_master, by="id", all =T) ## first merge, master and individual
data_master_ind <- data_master_ind %>%
rename(hhid = hhid.x, pline = pline.x ) ## rename variable
data_master_ind_hh <- merge(data_master_ind, data_hh, by="hhid", all=T) ## second merge, master, individual and household
## generate a new variable: Number of kids
data_master_ind_hh$number_kids <- data_master_ind_hh$relationship == 6 & data_master_ind_hh$birth_year < 2001
library(dplyr)
family_kids_number <- data_master_ind_hh %>%
group_by(hhid) %>%
summarise(number_kids_under18 = sum(number_kids))
data_master_ind_hh <- merge(data_master_ind_hh, family_kids_number, by="hhid")
data_master_ind_hh$number_kids_under18.x<- NULL
data_master_ind_hh$number_kids_under18.y <- NULL
summary(data_master_ind_hh$number_kids_under18)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.0000 0.0000 0.0000 0.5161 1.0000 4.0000 45
## Final data after filters
Data_total <- data_master_ind_hh %>%
filter(relationship==1 | relationship==2) %>%
filter(birth_year < 1999) %>%
filter(marital==2 | marital==3)
summary(Data_total)
## hhid id pline relationship
## Length:56116 Length:56116 Length:56116 Min. :1.000
## Class :character Class :character Class :character 1st Qu.:1.000
## Mode :character Mode :character Mode :character Median :1.000
## Mean :1.491
## 3rd Qu.:2.000
## Max. :2.000
##
## gender birth_year ethnicity education Party
## Min. :1.0 Min. :1903 Min. : 1.000 Min. :1.00 Min. :1.000
## 1st Qu.:1.0 1st Qu.:1954 1st Qu.: 1.000 1st Qu.:2.00 1st Qu.:2.000
## Median :2.0 Median :1964 Median : 1.000 Median :3.00 Median :2.000
## Mean :1.5 Mean :1964 Mean : 1.384 Mean :3.25 Mean :1.853
## 3rd Qu.:2.0 3rd Qu.:1973 3rd Qu.: 1.000 3rd Qu.:4.00 3rd Qu.:2.000
## Max. :2.0 Max. :1998 Max. :10.000 Max. :9.00 Max. :2.000
## NA's :98 NA's :41 NA's :2102
## Hukou marital health work
## Min. : 1.000 Min. :2.000 Min. :1.000 Min. :1.000
## 1st Qu.: 1.000 1st Qu.:2.000 1st Qu.:2.000 1st Qu.:1.000
## Median : 1.000 Median :2.000 Median :3.000 Median :1.000
## Mean : 8.786 Mean :2.004 Mean :2.768 Mean :1.371
## 3rd Qu.: 2.000 3rd Qu.:2.000 3rd Qu.:3.000 3rd Qu.:2.000
## Max. :7777.000 Max. :3.000 Max. :5.000 Max. :2.000
## NA's :60 NA's :10 NA's :14
## work_type medical_insurance business_health_insurance hhid.y
## Min. :1.000 Min. : 1.0 Length:56116 Length:56116
## 1st Qu.:1.000 1st Qu.: 1.0 Class :character Class :character
## Median :4.000 Median : 3.0 Mode :character Mode :character
## Mean :4.131 Mean : 459.9
## 3rd Qu.:7.000 3rd Qu.: 3.0
## Max. :7.000 Max. :7788.0
## NA's :20878 NA's :79
## pline.y rural weight_hh weight_ind
## Length:56116 Min. :0.0000 Min. : 140 Min. : 616.6
## Class :character 1st Qu.:0.0000 1st Qu.: 4847 1st Qu.: 3694.0
## Mode :character Median :0.0000 Median : 9161 Median : 7722.4
## Mean :0.3666 Mean : 12851 Mean :11172.0
## 3rd Qu.:1.0000 3rd Qu.: 16394 3rd Qu.:14703.1
## Max. :1.0000 Max. :122576 Max. :61153.5
##
## region city_level total_income total_consump
## Min. :1.00 Min. :1.000 Min. :-5493190 Min. : 1968
## 1st Qu.:1.00 1st Qu.:1.000 1st Qu.: 20590 1st Qu.: 31622
## Median :2.00 Median :3.000 Median : 54918 Median : 56040
## Mean :2.13 Mean :2.308 Mean : 86683 Mean : 86710
## 3rd Qu.:3.00 3rd Qu.:3.000 3rd Qu.: 103007 3rd Qu.: 95209
## Max. :4.00 Max. :3.000 Max. :12122418 Max. :169546352
## NA's :147
## total_asset total_debt house_nature medical_liabilities
## Min. :0.000e+00 Min. : 0 Min. :1.000 Min. :1.000
## 1st Qu.:1.486e+05 1st Qu.: 0 1st Qu.:1.000 1st Qu.:2.000
## Median :4.341e+05 Median : 0 Median :1.000 Median :2.000
## Mean :1.205e+06 Mean : 77493 Mean :1.183 Mean :1.955
## 3rd Qu.:1.141e+06 3rd Qu.: 35000 3rd Qu.:1.000 3rd Qu.:2.000
## Max. :2.100e+09 Max. :40658344 Max. :3.000 Max. :2.000
## NA's :9847 NA's :28 NA's :43
## number_kids number_kids_under18
## Mode :logical Min. :0.0000
## FALSE:56116 1st Qu.:0.0000
## Median :0.0000
## Mean :0.3856
## 3rd Qu.:1.0000
## Max. :4.0000
## NA's :2
## Filters: Family relationship is respondent or spouse, respondent is 1, spouse is 2, Age greater than 20 years, i.e. year of birth before 1999, Marital status married or cohabiting, married 2, cohabiting 3
Data_total$pline.y <- NULL
Data_total$hhid.y <- NULL
Data_total$business_health_insurance <- as.numeric(Data_total$business_health_insurance)
## Warning: NAs introduced by coercion
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
Data_total$Hukou <- car::recode(Data_total$Hukou, recodes = "7777=NA", as.numeric = T)
Data_total$medical_insurance <- car::recode(Data_total$medical_insurance, recodes = "7788=0", as.numeric = T)
Data_total$business_health_insurance <- car::recode(Data_total$business_health_insurance, recodes = "1=0; 2=1; 3=0; 7788=0", as.numeric = T)
Data_total$medical_liabilities <- car::recode(Data_total$medical_liabilities, recodes = "1=1; 2=0", as.numeric = T) ## recode variables
Data_total <- Data_total %>%
filter(!is.na(ethnicity)) %>%
filter(!is.na(education)) %>%
filter(!is.na(Party)) %>%
filter(!is.na(health)) %>%
filter(!is.na(Hukou)) %>%
filter(!is.na(work)) %>%
filter(!is.na(medical_insurance)) %>%
filter(!is.na(business_health_insurance)) %>%
filter(!is.na(total_income)) %>%
filter(!is.na(total_debt)) %>%
filter(!is.na(house_nature)) %>%
filter(!is.na(medical_liabilities)) %>%
filter(!is.na(number_kids_under18)) ## delete missing value
2. Descriptive Statistics – Educational Assortative Mating and
Health Outcomes
##Dependent variable: Self-assessed health
summary(Data_total$health)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 2.000 3.000 2.781 3.000 5.000
table(Data_total$health)
##
## 1 2 3 4 5
## 4686 11751 17641 7275 2167
Data_total$dv_health <- car::recode(Data_total$health, "1:2=1 ; 3:5=0",as.numeric = T)
table(Data_total$dv_health)
##
## 0 1
## 27083 16437
Data_total$dv_health <- car::recode(Data_total$dv_health,recodes = "1='Healthy';0='Unhealthy'",as.factor=T)
## Independent variable: Education Pairing
summary(Data_total$relationship)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 1.000 1.000 1.491 2.000 2.000
summary(Data_total$gender)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 1.000 1.000 1.499 2.000 2.000
summary(Data_total$education)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 2.000 3.000 3.128 4.000 9.000
table(Data_total$relationship)
##
## 1 2
## 22162 21358
table(Data_total$gender)
##
## 1 2
## 21801 21719
table(Data_total$education)
##
## 1 2 3 4 5 6 7 8 9
## 4365 12126 14833 5595 1623 2579 2156 212 31
Data_total$education_level <- car::recode(Data_total$education, "1:2=1; 3=2; 4:5=3; 6=4; 7:9=5",as.numeric = T)
table(Data_total$education_level)
##
## 1 2 3 4 5
## 16491 14833 7218 2579 2399
data_res_heterogomy <- filter(Data_total, relationship==1) ## Create new Data "data_res_heterogomy", Select family member relationship is 1, 1 means respondents.
data_spo_heterogomy <- filter(Data_total, relationship==2) ## Create new Data "data_spo_heterogomy", Select family member relationship is 2, 2 means spouse.
merge_data <- merge(data_res_heterogomy,data_spo_heterogomy, by="hhid") ## merge the first data to get independent variable
summary(merge_data)
## hhid id.x pline.x relationship.x
## Length:20985 Length:20985 Length:20985 Min. :1
## Class :character Class :character Class :character 1st Qu.:1
## Mode :character Mode :character Mode :character Median :1
## Mean :1
## 3rd Qu.:1
## Max. :1
##
## gender.x birth_year.x ethnicity.x education.x
## Min. :1.000 Min. :1927 Min. : 1.000 Min. :1.000
## 1st Qu.:1.000 1st Qu.:1955 1st Qu.: 1.000 1st Qu.:2.000
## Median :1.000 Median :1964 Median : 1.000 Median :3.000
## Mean :1.439 Mean :1965 Mean : 1.397 Mean :3.205
## 3rd Qu.:2.000 3rd Qu.:1973 3rd Qu.: 1.000 3rd Qu.:4.000
## Max. :2.000 Max. :1998 Max. :10.000 Max. :9.000
##
## Party.x Hukou.x marital.x health.x
## Min. :1.000 Min. :1.000 Min. :2.000 Min. :1.000
## 1st Qu.:2.000 1st Qu.:1.000 1st Qu.:2.000 1st Qu.:2.000
## Median :2.000 Median :1.000 Median :2.000 Median :3.000
## Mean :1.845 Mean :1.475 Mean :2.003 Mean :2.756
## 3rd Qu.:2.000 3rd Qu.:2.000 3rd Qu.:2.000 3rd Qu.:3.000
## Max. :2.000 Max. :4.000 Max. :3.000 Max. :5.000
##
## work.x work_type.x medical_insurance.x
## Min. :1.000 Min. :1.000 Min. :0.000
## 1st Qu.:1.000 1st Qu.:2.000 1st Qu.:1.000
## Median :1.000 Median :6.000 Median :3.000
## Mean :1.299 Mean :4.503 Mean :2.387
## 3rd Qu.:2.000 3rd Qu.:7.000 3rd Qu.:3.000
## Max. :2.000 Max. :7.000 Max. :5.000
## NA's :6318
## business_health_insurance.x rural.x weight_hh.x
## Min. :0.00000 Min. :0.0000 Min. : 140
## 1st Qu.:0.00000 1st Qu.:0.0000 1st Qu.: 5048
## Median :0.00000 Median :0.0000 Median : 9466
## Mean :0.03212 Mean :0.4245 Mean : 13511
## 3rd Qu.:0.00000 3rd Qu.:1.0000 3rd Qu.: 17560
## Max. :1.00000 Max. :1.0000 Max. :122576
##
## weight_ind.x region.x city_level.x total_income.x
## Min. : 616.6 Min. :1.000 Min. :1.000 Min. :-5493190
## 1st Qu.: 3726.5 1st Qu.:1.000 1st Qu.:2.000 1st Qu.: 18256
## Median : 7855.4 Median :2.000 Median :3.000 Median : 50195
## Mean :11408.1 Mean :2.087 Mean :2.385 Mean : 80946
## 3rd Qu.:15164.3 3rd Qu.:3.000 3rd Qu.:3.000 3rd Qu.: 98342
## Max. :61153.5 Max. :4.000 Max. :3.000 Max. :12122418
##
## total_consump.x total_asset.x total_debt.x house_nature.x
## Min. : 2026 Min. :0.000e+00 Min. : 0 Min. :1.000
## 1st Qu.: 29878 1st Qu.:1.382e+05 1st Qu.: 0 1st Qu.:1.000
## Median : 53600 Median :4.020e+05 Median : 0 Median :1.000
## Mean : 85748 Mean :1.136e+06 Mean : 71868 Mean :1.158
## 3rd Qu.: 91980 3rd Qu.:1.056e+06 3rd Qu.: 32000 3rd Qu.:1.000
## Max. :169546352 Max. :2.100e+09 Max. :14045000 Max. :3.000
##
## medical_liabilities.x number_kids.x number_kids_under18.x dv_health.x
## Min. :0.00000 Mode :logical Min. :0.0000 Healthy : 7957
## 1st Qu.:0.00000 FALSE:20985 1st Qu.:0.0000 Unhealthy:13028
## Median :0.00000 Median :0.0000
## Mean :0.05418 Mean :0.4167
## 3rd Qu.:0.00000 3rd Qu.:1.0000
## Max. :1.00000 Max. :4.0000
##
## education_level.x id.y pline.y relationship.y
## Min. :1.000 Length:20985 Length:20985 Min. :2
## 1st Qu.:1.000 Class :character Class :character 1st Qu.:2
## Median :2.000 Mode :character Mode :character Median :2
## Mean :2.121 Mean :2
## 3rd Qu.:3.000 3rd Qu.:2
## Max. :5.000 Max. :2
##
## gender.y birth_year.y ethnicity.y education.y Party.y
## Min. :1.000 Min. :1923 Min. : 1.0 Min. :1.000 Min. :1.000
## 1st Qu.:1.000 1st Qu.:1956 1st Qu.: 1.0 1st Qu.:2.000 1st Qu.:2.000
## Median :2.000 Median :1965 Median : 1.0 Median :3.000 Median :2.000
## Mean :1.561 Mean :1965 Mean : 1.4 Mean :3.032 Mean :1.903
## 3rd Qu.:2.000 3rd Qu.:1974 3rd Qu.: 1.0 3rd Qu.:4.000 3rd Qu.:2.000
## Max. :2.000 Max. :1998 Max. :10.0 Max. :9.000 Max. :2.000
##
## Hukou.y marital.y health.y work.y
## Min. :1.000 Min. :2.000 Min. :1.000 Min. :1.000
## 1st Qu.:1.000 1st Qu.:2.000 1st Qu.:2.000 1st Qu.:1.000
## Median :1.000 Median :2.000 Median :3.000 Median :1.000
## Mean :1.468 Mean :2.003 Mean :2.812 Mean :1.312
## 3rd Qu.:2.000 3rd Qu.:2.000 3rd Qu.:3.000 3rd Qu.:2.000
## Max. :4.000 Max. :3.000 Max. :5.000 Max. :2.000
##
## work_type.y medical_insurance.y business_health_insurance.y
## Min. :1.000 Min. :0.000 Min. :0.0000
## 1st Qu.:2.000 1st Qu.:1.000 1st Qu.:0.0000
## Median :4.000 Median :3.000 Median :0.0000
## Mean :4.246 Mean :2.381 Mean :0.0284
## 3rd Qu.:7.000 3rd Qu.:3.000 3rd Qu.:0.0000
## Max. :7.000 Max. :5.000 Max. :1.0000
## NA's :6583
## rural.y weight_hh.y weight_ind.y region.y
## Min. :0.0000 Min. : 140 Min. : 616.6 Min. :1.000
## 1st Qu.:0.0000 1st Qu.: 5048 1st Qu.: 3791.8 1st Qu.:1.000
## Median :0.0000 Median : 9466 Median : 7948.4 Median :2.000
## Mean :0.4245 Mean : 13511 Mean :11455.3 Mean :2.087
## 3rd Qu.:1.0000 3rd Qu.: 17560 3rd Qu.:15105.2 3rd Qu.:3.000
## Max. :1.0000 Max. :122576 Max. :61153.5 Max. :4.000
##
## city_level.y total_income.y total_consump.y total_asset.y
## Min. :1.000 Min. :-5493190 Min. : 2026 Min. :0.000e+00
## 1st Qu.:2.000 1st Qu.: 18256 1st Qu.: 29878 1st Qu.:1.382e+05
## Median :3.000 Median : 50195 Median : 53600 Median :4.020e+05
## Mean :2.385 Mean : 80946 Mean : 85748 Mean :1.136e+06
## 3rd Qu.:3.000 3rd Qu.: 98342 3rd Qu.: 91980 3rd Qu.:1.056e+06
## Max. :3.000 Max. :12122418 Max. :169546352 Max. :2.100e+09
##
## total_debt.y house_nature.y medical_liabilities.y number_kids.y
## Min. : 0 Min. :1.000 Min. :0.00000 Mode :logical
## 1st Qu.: 0 1st Qu.:1.000 1st Qu.:0.00000 FALSE:20985
## Median : 0 Median :1.000 Median :0.00000
## Mean : 71868 Mean :1.158 Mean :0.05418
## 3rd Qu.: 32000 3rd Qu.:1.000 3rd Qu.:0.00000
## Max. :14045000 Max. :3.000 Max. :1.00000
##
## number_kids_under18.y dv_health.y education_level.y
## Min. :0.0000 Healthy : 7838 Min. :1.000
## 1st Qu.:0.0000 Unhealthy:13147 1st Qu.:1.000
## Median :0.0000 Median :2.000
## Mean :0.4167 Mean :2.007
## 3rd Qu.:1.0000 3rd Qu.:3.000
## Max. :4.0000 Max. :5.000
##
## Educational Assortative Mating - A Series of Dummy Variables: Homogamy, Hypergamy, Hypogamy.
## 1. Educational Homogamy
library(dplyr)
library(tidyr)
merge_data <- merge_data %>%
mutate(edu_homogamy = case_when( education_level.x == education_level.y ~ 1,
TRUE ~ 0
))
## 2. Educational Hypergamy
merge_data <- merge_data %>%
mutate(edu_hypergamy = case_when( (education_level.x > education_level.y) & gender.x == 1 ~ 1,
TRUE ~ 0
))
## 3. Educational Hypogamy
merge_data <- merge_data %>%
mutate(edu_hypogamy = case_when( (education_level.x > education_level.y) & gender.x == 2 ~ 1,
TRUE ~ 0
))
merge <- merge_data %>% select(hhid, edu_homogamy, edu_hypergamy, edu_hypogamy)
Data_total <- merge(Data_total, merge, by="hhid")
#Control variable:
#Gender
table(Data_total$gender)
##
## 1 2
## 20985 20985
Data_total$gender <- car::recode(Data_total$gender,recodes = "1='Male';2='Female'",as.factor=T)
class(Data_total$gender)
## [1] "factor"
Data_total$gender <- as.factor(Data_total$gender)
# Age
Data_total$age <- 2019- Data_total$birth_year
summary(Data_total$age)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 21.00 46.00 54.00 54.06 64.00 96.00
## Birth Cohort
Data_total <- Data_total %>%
mutate(birth_cohort = cut(birth_year, breaks = c(1922, 1949, 1978, 1989, 1998), labels = c("Before 1949", "1950-1978", "1979-1989", "1990-1998")))
# Education Level
class(Data_total$education_level)
## [1] "numeric"
Data_total$education_level <- car::recode(Data_total$education_level,recodes = "1='Primary school or below';2='Junior high school'; 3='High school or technical school'; 4=' Junior college or vocational college'; 5='College or above'",as.factor=T)
Data_total$education_level <- as.factor(Data_total$education_level)
# Work
table(Data_total$work)
##
## 1 2
## 29130 12840
Data_total$work <- car::recode(Data_total$work, recodes = "1='Work' ; 2='No work'", as.factor=F)
Data_total$work <- as.factor(Data_total$work)
table(Data_total$work)
##
## No work Work
## 12840 29130
# Number of kids
table(Data_total$number_kids_under18)
##
## 0 1 2 3 4
## 27148 12456 2100 230 36
# Rural
table(Data_total$rural)
##
## 0 1
## 24154 17816
Data_total$rural <- car::recode(Data_total$rural,recodes = "0='Urban';1='Rural'",as.factor=T)
Data_total$rural <- as.factor(Data_total$rural)
# export the table of descriptive statistics
library(dplyr)
library(gtsummary)
library(haven)
Data_total <- Data_total %>%
mutate(
region = as_factor(region),
city_level = as_factor(city_level)
)
tb <- Data_total %>%
mutate(
dv_gender = interaction(dv_health, gender, sep = "_"),
number_kids_under18 = as.numeric(as.character(number_kids_under18))
) %>%
select(dv_gender, edu_homogamy,edu_hypergamy,edu_hypogamy, age, birth_cohort, education_level, work, number_kids_under18, rural) %>%
tbl_summary(
by = dv_gender,
type = list(
number_kids_under18 ~ "continuous"
),
statistic = list(all_continuous() ~"{mean} ({sd})",
all_categorical() ~ "{n} ({p})"),
digits = all_continuous() ~ 2,
label = list(
age ~ "Age",
birth_cohort ~ "Birth Cohort",
education_level ~ "Education",
work ~ "Work",
number_kids_under18 ~ "Number of kids under 18",
rural ~ "Rural"
)
)
tb
| Characteristic |
Healthy_Female, N = 7,472 |
Unhealthy_Female, N = 13,513 |
Healthy_Male, N = 8,323 |
Unhealthy_Male, N = 12,662 |
| edu_homogamy |
4,091 (55) |
7,659 (57) |
4,564 (55) |
7,186 (57) |
| edu_hypergamy |
1,410 (19) |
2,710 (20) |
1,615 (19) |
2,505 (20) |
| edu_hypogamy |
581 (7.8) |
844 (6.2) |
644 (7.7) |
781 (6.2) |
| Age |
48.98 (12.75) |
55.29 (11.32) |
51.66 (12.98) |
57.31 (11.43) |
| Birth Cohort |
|
|
|
|
| Â Â Â Â Before 1949 |
426 (5.7) |
1,352 (10) |
802 (9.6) |
1,891 (15) |
| Â Â Â Â 1950-1978 |
4,906 (66) |
10,708 (79) |
5,679 (68) |
9,762 (77) |
| Â Â Â Â 1979-1989 |
1,721 (23) |
1,273 (9.4) |
1,570 (19) |
921 (7.3) |
| Â Â Â Â 1990-1998 |
419 (5.6) |
180 (1.3) |
272 (3.3) |
88 (0.7) |
| Education |
|
|
|
|
| Â Â Â Â Junior college or vocational college |
690 (9.2) |
436 (3.2) |
754 (9.1) |
566 (4.5) |
| Â Â Â Â College or above |
638 (8.5) |
363 (2.7) |
812 (9.8) |
453 (3.6) |
| Â Â Â Â High school or technical school |
1,375 (18) |
1,573 (12) |
1,844 (22) |
2,147 (17) |
| Â Â Â Â Junior high school |
2,466 (33) |
3,933 (29) |
3,066 (37) |
4,901 (39) |
| Â Â Â Â Primary school or below |
2,303 (31) |
7,208 (53) |
1,847 (22) |
4,595 (36) |
| Work |
|
|
|
|
| Â Â Â Â No work |
2,496 (33) |
5,510 (41) |
1,389 (17) |
3,445 (27) |
| Â Â Â Â Work |
4,976 (67) |
8,003 (59) |
6,934 (83) |
9,217 (73) |
| Number of kids under 18 |
0.36 (0.57) |
0.45 (0.64) |
0.37 (0.58) |
0.45 (0.65) |
| Rural |
|
|
|
|
| Â Â Â Â Rural |
2,495 (33) |
6,413 (47) |
2,941 (35) |
5,967 (47) |
| Â Â Â Â Urban |
4,977 (67) |
7,100 (53) |
5,382 (65) |
6,695 (53) |
library(gt)
gt_tbl <- as_gt(tb)
gtsave(gt_tbl, "table.html")
3. Results of Logistics Models
## 1. Install The library packages
library(dplyr)
library(purrr)
##
## Attaching package: 'purrr'
## The following object is masked from 'package:car':
##
## some
library(broom)
library(stargazer)
##
## Please cite as:
## Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
## R package version 5.2.3. https://CRAN.R-project.org/package=stargazer
library(gtsummary)
## 2. Recode The Rural Variables
table(Data_total$rural)
##
## Rural Urban
## 17816 24154
Data_total$Rural <- car::recode(Data_total$rural,recodes = "0='Urban';1='Rural'",as.factor=T)
Data_total$education_level <- as.factor(Data_total$education_level)
Data_total$work <- as.factor(Data_total$work) # Change the variables from continuous to factor.
## 3. Log weight individual level (No used)
summary(Data_total$weight_ind)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 616.6 3749.1 7904.3 11431.8 15129.0 61153.5
Data_total$log_weight_ind <- log(Data_total$weight_ind + 1)
summary(Data_total$log_weight_ind)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 6.426 8.230 8.975 8.897 9.624 11.021
## 4. Separate the gender sample
data_male <- Data_total %>%
filter(gender=='Male')
data_female <- Data_total %>%
filter(gender=='Female')
## 5. Logistics Model 1a, Model 2a, Model 1b, Model 2b.
## For Male Sample
## Model_1_A
summary(Data_total$dv_health)
## Healthy Unhealthy
## 15795 26175
m1a <- glm(dv_health ~ edu_homogamy + age + education_level + work + number_kids_under18 + Rural , family = binomial, data = data_male)
summary((m1a))
##
## Call:
## glm(formula = dv_health ~ edu_homogamy + age + education_level +
## work + number_kids_under18 + Rural, family = binomial, data = data_male)
##
## Coefficients:
## Estimate Std. Error z value
## (Intercept) -0.869695 0.115730 -7.515
## edu_homogamy -0.093812 0.032083 -2.924
## age 0.023606 0.001442 16.372
## education_levelCollege or above -0.200442 0.083176 -2.410
## education_levelHigh school or technical school 0.191265 0.066480 2.877
## education_levelJunior high school 0.504081 0.063581 7.928
## education_levelPrimary school or below 0.792811 0.069988 11.328
## workWork -0.389226 0.041527 -9.373
## number_kids_under18 0.112648 0.024044 4.685
## RuralUrban -0.237055 0.033264 -7.126
## Pr(>|z|)
## (Intercept) 5.70e-14 ***
## edu_homogamy 0.00345 **
## age < 2e-16 ***
## education_levelCollege or above 0.01596 *
## education_levelHigh school or technical school 0.00401 **
## education_levelJunior high school 2.22e-15 ***
## education_levelPrimary school or below < 2e-16 ***
## workWork < 2e-16 ***
## number_kids_under18 2.80e-06 ***
## RuralUrban 1.03e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 28188 on 20984 degrees of freedom
## Residual deviance: 26566 on 20975 degrees of freedom
## AIC: 26586
##
## Number of Fisher Scoring iterations: 4
conf_int <- confint(m1a) # Calculate the confidence intervals
## Waiting for profiling to be done...
tidy_m1a <- tidy(m1a, conf.int = TRUE) # Convert to data frame, note that this requires using the 'broom' package
tidy_m1a$signif <- symnum(tidy_m1a$p.value, corr = FALSE, na = FALSE,
cutpoints = c(0, 0.001, 0.01, 0.05, 0.1, 1),
symbols = c("***", "**", "*", ".", " ")) # Create a new column for significance levels
## Model_2_A
m2a <- glm(dv_health ~ edu_homogamy + edu_hypogamy + age + education_level + work + number_kids_under18 + Rural, family = binomial, data = data_male)
summary((m2a))
##
## Call:
## glm(formula = dv_health ~ edu_homogamy + edu_hypogamy + age +
## education_level + work + number_kids_under18 + Rural, family = binomial,
## data = data_male)
##
## Coefficients:
## Estimate Std. Error z value
## (Intercept) -0.795678 0.117114 -6.794
## edu_homogamy -0.148739 0.034580 -4.301
## edu_hypogamy -0.269996 0.063047 -4.282
## age 0.022624 0.001461 15.490
## education_levelCollege or above -0.214763 0.083282 -2.579
## education_levelHigh school or technical school 0.196397 0.066577 2.950
## education_levelJunior high school 0.523464 0.063833 8.201
## education_levelPrimary school or below 0.844458 0.071110 11.875
## workWork -0.394719 0.041577 -9.494
## number_kids_under18 0.109129 0.024071 4.534
## RuralUrban -0.216853 0.033619 -6.450
## Pr(>|z|)
## (Intercept) 1.09e-11 ***
## edu_homogamy 1.70e-05 ***
## edu_hypogamy 1.85e-05 ***
## age < 2e-16 ***
## education_levelCollege or above 0.00992 **
## education_levelHigh school or technical school 0.00318 **
## education_levelJunior high school 2.39e-16 ***
## education_levelPrimary school or below < 2e-16 ***
## workWork < 2e-16 ***
## number_kids_under18 5.80e-06 ***
## RuralUrban 1.12e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 28188 on 20984 degrees of freedom
## Residual deviance: 26548 on 20974 degrees of freedom
## AIC: 26570
##
## Number of Fisher Scoring iterations: 4
conf_int <- confint(m2a) # Calculate the confidence intervals
## Waiting for profiling to be done...
tidy_m2a <- tidy(m2a, conf.int = TRUE) # Convert to data frame, note that this requires using the 'broom' package
tidy_m2a$signif <- symnum(tidy_m2a$p.value, corr = FALSE, na = FALSE,
cutpoints = c(0, 0.001, 0.01, 0.05, 0.1, 1),
symbols = c("***", "**", "*", ".", " ")) # Create a new column for significance levels
## For Female Sample
## Model_1_B
m1b <- glm(dv_health ~ edu_homogamy + age + education_level + work + number_kids_under18 + Rural , family = binomial, data = data_female)
summary((m1b))
##
## Call:
## glm(formula = dv_health ~ edu_homogamy + age + education_level +
## work + number_kids_under18 + Rural, family = binomial, data = data_female)
##
## Coefficients:
## Estimate Std. Error z value
## (Intercept) -1.385272 0.104317 -13.279
## edu_homogamy 0.061167 0.030754 1.989
## age 0.028645 0.001437 19.936
## education_levelCollege or above -0.004673 0.091979 -0.051
## education_levelHigh school or technical school 0.281330 0.073878 3.808
## education_levelJunior high school 0.570971 0.069638 8.199
## education_levelPrimary school or below 1.017151 0.072216 14.085
## workWork -0.193138 0.034060 -5.671
## number_kids_under18 0.100248 0.025062 4.000
## RuralUrban -0.209937 0.035052 -5.989
## Pr(>|z|)
## (Intercept) < 2e-16 ***
## edu_homogamy 0.04671 *
## age < 2e-16 ***
## education_levelCollege or above 0.95948
## education_levelHigh school or technical school 0.00014 ***
## education_levelJunior high school 2.42e-16 ***
## education_levelPrimary school or below < 2e-16 ***
## workWork 1.42e-08 ***
## number_kids_under18 6.33e-05 ***
## RuralUrban 2.11e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 27327 on 20984 degrees of freedom
## Residual deviance: 25356 on 20975 degrees of freedom
## AIC: 25376
##
## Number of Fisher Scoring iterations: 4
conf_int <- confint(m1b) # Calculate the confidence intervals
## Waiting for profiling to be done...
tidy_m1b <- tidy(m1b, conf.int = TRUE) # Convert to data frame, note that this requires using the 'broom' package
tidy_m1b$signif <- symnum(tidy_m1b$p.value, corr = FALSE, na = FALSE,
cutpoints = c(0, 0.001, 0.01, 0.05, 0.1, 1),
symbols = c("***", "**", "*", ".", " ")) # Create a new column for significance levels
# Model_2_B
m2b <- glm(dv_health ~ edu_homogamy + edu_hypogamy + age + education_level + work + number_kids_under18 + Rural, family = binomial, data = data_female)
conf_int <- confint(m2b) # Calculate the confidence intervals
## Waiting for profiling to be done...
tidy_m2b <- tidy(m2b, conf.int = TRUE) # Convert to data frame, note that this requires using the 'broom' package
tidy_m2b$signif <- symnum(tidy_m2b$p.value, corr = FALSE, na = FALSE,
cutpoints = c(0, 0.001, 0.01, 0.05, 0.1, 1),
symbols = c("***", "**", "*", ".", " ")) # Create a new column for significance levels
## 6. Logistics Model 1 to Model 4
## get the table of outcome
library(officer)
library(flextable)
##
## Attaching package: 'flextable'
## The following object is masked from 'package:purrr':
##
## compose
## The following objects are masked from 'package:gtsummary':
##
## as_flextable, continuous_summary
library(stargazer)
stargazer(m1a, m2a, m1b, m2b, type = "text",
model.names = TRUE, out = "models_table.tex",
star.cutoffs = c(0.05, 0.01, 0.001))
##
## ==============================================================================================
## Dependent variable:
## -----------------------------------------------
## dv_health
## logistic
## (1) (2) (3) (4)
## ----------------------------------------------------------------------------------------------
## edu_homogamy -0.094** -0.149*** 0.061* 0.135***
## (0.032) (0.035) (0.031) (0.032)
##
## edu_hypogamy -0.270*** 0.453***
## (0.063) (0.064)
##
## age 0.024*** 0.023*** 0.029*** 0.028***
## (0.001) (0.001) (0.001) (0.001)
##
## education_levelCollege or above -0.200* -0.215** -0.005 -0.008
## (0.083) (0.083) (0.092) (0.092)
##
## education_levelHigh school or technical school 0.191** 0.196** 0.281*** 0.304***
## (0.066) (0.067) (0.074) (0.074)
##
## education_levelJunior high school 0.504*** 0.523*** 0.571*** 0.639***
## (0.064) (0.064) (0.070) (0.071)
##
## education_levelPrimary school or below 0.793*** 0.844*** 1.017*** 1.125***
## (0.070) (0.071) (0.072) (0.074)
##
## workWork -0.389*** -0.395*** -0.193*** -0.199***
## (0.042) (0.042) (0.034) (0.034)
##
## number_kids_under18 0.113*** 0.109*** 0.100*** 0.098***
## (0.024) (0.024) (0.025) (0.025)
##
## RuralUrban -0.237*** -0.217*** -0.210*** -0.199***
## (0.033) (0.034) (0.035) (0.035)
##
## Constant -0.870*** -0.796*** -1.385*** -1.512***
## (0.116) (0.117) (0.104) (0.106)
##
## ----------------------------------------------------------------------------------------------
## Observations 20,985 20,985 20,985 20,985
## Log Likelihood -13,282.930 -13,273.810 -12,677.990 -12,652.900
## Akaike Inf. Crit. 26,585.870 26,569.620 25,375.980 25,327.810
## ==============================================================================================
## Note: *p<0.05; **p<0.01; ***p<0.001
library(broom)
library(dplyr)
library(ggplot2)
# get the coefficients
tidy_m1a <- tidy(m1a, conf.int = TRUE) %>% mutate(model = "Model_1A")
tidy_m2a <- tidy(m2a, conf.int = TRUE) %>% mutate(model = "Model_2A")
tidy_m1b <- tidy(m1b, conf.int = TRUE) %>% mutate(model = "Model_1B")
tidy_m2b <- tidy(m2b, conf.int = TRUE) %>% mutate(model = "Model_2B")
# merge
coefficients_combined <- bind_rows(tidy_m1a, tidy_m2a, tidy_m1b, tidy_m2b)
# create the plot
ggplot(coefficients_combined, aes(x = term, y = estimate, ymin = conf.low, ymax = conf.high, color = model)) +
geom_pointrange() +
geom_hline(yintercept = 0, linetype = "dashed") +
facet_wrap(~model, scales = "free_x") +
coord_flip() +
labs(x = "Term", y = "Coefficient Estimate", title = "The results for Model 1 and 2, by gender") +
theme_minimal() +
theme(legend.position = "bottom")

4. Birth Cohort
## 1. Install The library packages
library(dplyr)
library(purrr)
library(broom)
library(stargazer)
library(gtsummary)
## 2. Birth Cohort
Data_total <- Data_total %>%
mutate(birth_cohort = cut(birth_year, breaks = c(1922, 1949, 1978, 1989, 1998), labels = c("Before 1949", "1950-1978", "1979-1989", "1990-1998")))
summary(Data_total$birth_cohort)
## Before 1949 1950-1978 1979-1989 1990-1998
## 4471 31055 5485 959
## 3. Separate the gender sample
data_male <- Data_total %>%
filter(gender=='Male')
data_female <- Data_total %>%
filter(gender=='Female')
## 4. Logistics Model 1c, Model 2c, Model 1d, Model 2d.
## For Male Sample
## Model_1_c
m1c <- glm(dv_health ~ edu_homogamy + birth_cohort + education_level + work + number_kids_under18 + Rural , family = binomial, data = data_male)
summary((m1c))
##
## Call:
## glm(formula = dv_health ~ edu_homogamy + birth_cohort + education_level +
## work + number_kids_under18 + Rural, family = binomial, data = data_male)
##
## Coefficients:
## Estimate Std. Error z value
## (Intercept) 0.703310 0.081791 8.599
## edu_homogamy -0.108479 0.032145 -3.375
## birth_cohort1950-1978 0.007245 0.049339 0.147
## birth_cohort1979-1989 -0.692936 0.066233 -10.462
## birth_cohort1990-1998 -1.215081 0.134876 -9.009
## education_levelCollege or above -0.172979 0.083759 -2.065
## education_levelHigh school or technical school 0.194396 0.066775 2.911
## education_levelJunior high school 0.498913 0.063933 7.804
## education_levelPrimary school or below 0.865930 0.069796 12.407
## workWork -0.573542 0.039693 -14.450
## number_kids_under18 0.024689 0.024802 0.995
## RuralUrban -0.268865 0.033095 -8.124
## Pr(>|z|)
## (Intercept) < 2e-16 ***
## edu_homogamy 0.000739 ***
## birth_cohort1950-1978 0.883264
## birth_cohort1979-1989 < 2e-16 ***
## birth_cohort1990-1998 < 2e-16 ***
## education_levelCollege or above 0.038904 *
## education_levelHigh school or technical school 0.003600 **
## education_levelJunior high school 6.01e-15 ***
## education_levelPrimary school or below < 2e-16 ***
## workWork < 2e-16 ***
## number_kids_under18 0.319518
## RuralUrban 4.50e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 28188 on 20984 degrees of freedom
## Residual deviance: 26555 on 20973 degrees of freedom
## AIC: 26579
##
## Number of Fisher Scoring iterations: 4
conf_int <- confint(m1c) # Calculate the confidence intervals
## Waiting for profiling to be done...
tidy_m1c <- tidy(m1c, conf.int = TRUE) # Convert to data frame, note that this requires using the 'broom' package
tidy_m1c$signif <- symnum(tidy_m1c$p.value, corr = FALSE, na = FALSE,
cutpoints = c(0, 0.001, 0.01, 0.05, 0.1, 1),
symbols = c("***", "**", "*", ".", " ")) # Create a new column for significance levels
## Model_2_c
m2c <- glm(dv_health ~ edu_homogamy + edu_hypogamy + birth_cohort + education_level + work + number_kids_under18 + Rural , family = binomial, data = data_male)
summary((m2c))
##
## Call:
## glm(formula = dv_health ~ edu_homogamy + edu_hypogamy + birth_cohort +
## education_level + work + number_kids_under18 + Rural, family = binomial,
## data = data_male)
##
## Coefficients:
## Estimate Std. Error z value
## (Intercept) 0.70365 0.08188 8.594
## edu_homogamy -0.17167 0.03452 -4.974
## edu_hypogamy -0.32012 0.06319 -5.066
## birth_cohort1950-1978 0.02431 0.04948 0.491
## birth_cohort1979-1989 -0.65365 0.06672 -9.797
## birth_cohort1990-1998 -1.16365 0.13538 -8.596
## education_levelCollege or above -0.18945 0.08387 -2.259
## education_levelHigh school or technical school 0.19812 0.06689 2.962
## education_levelJunior high school 0.52012 0.06418 8.104
## education_levelPrimary school or below 0.92242 0.07079 13.031
## workWork -0.57332 0.03972 -14.434
## number_kids_under18 0.02227 0.02483 0.897
## RuralUrban -0.24338 0.03350 -7.265
## Pr(>|z|)
## (Intercept) < 2e-16 ***
## edu_homogamy 6.57e-07 ***
## edu_hypogamy 4.06e-07 ***
## birth_cohort1950-1978 0.62319
## birth_cohort1979-1989 < 2e-16 ***
## birth_cohort1990-1998 < 2e-16 ***
## education_levelCollege or above 0.02390 *
## education_levelHigh school or technical school 0.00306 **
## education_levelJunior high school 5.33e-16 ***
## education_levelPrimary school or below < 2e-16 ***
## workWork < 2e-16 ***
## number_kids_under18 0.36968
## RuralUrban 3.73e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 28188 on 20984 degrees of freedom
## Residual deviance: 26529 on 20972 degrees of freedom
## AIC: 26555
##
## Number of Fisher Scoring iterations: 4
conf_int <- confint(m2c) # Calculate the confidence intervals
## Waiting for profiling to be done...
tidy_m2c <- tidy(m2c, conf.int = TRUE) # Convert to data frame, note that this requires using the 'broom' package
tidy_m2c$signif <- symnum(tidy_m2c$p.value, corr = FALSE, na = FALSE,
cutpoints = c(0, 0.001, 0.01, 0.05, 0.1, 1),
symbols = c("***", "**", "*", ".", " ")) # Create a new column for significance levels
## For Female Sample
## Model_1_d
m1d <- glm(dv_health ~ edu_homogamy + birth_cohort + education_level + work + number_kids_under18 + Rural , family = binomial, data = data_female)
summary((m1d))
##
## Call:
## glm(formula = dv_health ~ edu_homogamy + birth_cohort + education_level +
## work + number_kids_under18 + Rural, family = binomial, data = data_female)
##
## Coefficients:
## Estimate Std. Error z value
## (Intercept) 0.451034 0.095844 4.706
## edu_homogamy 0.072787 0.030769 2.366
## birth_cohort1950-1978 -0.117934 0.060746 -1.941
## birth_cohort1979-1989 -0.829674 0.071727 -11.567
## birth_cohort1990-1998 -1.296671 0.109752 -11.815
## education_levelCollege or above 0.040308 0.092834 0.434
## education_levelHigh school or technical school 0.291481 0.074251 3.926
## education_levelJunior high school 0.567126 0.070090 8.091
## education_levelPrimary school or below 1.099658 0.071823 15.311
## workWork -0.316270 0.033337 -9.487
## number_kids_under18 -0.001476 0.025923 -0.057
## RuralUrban -0.229226 0.035024 -6.545
## Pr(>|z|)
## (Intercept) 2.53e-06 ***
## edu_homogamy 0.0180 *
## birth_cohort1950-1978 0.0522 .
## birth_cohort1979-1989 < 2e-16 ***
## birth_cohort1990-1998 < 2e-16 ***
## education_levelCollege or above 0.6642
## education_levelHigh school or technical school 8.65e-05 ***
## education_levelJunior high school 5.90e-16 ***
## education_levelPrimary school or below < 2e-16 ***
## workWork < 2e-16 ***
## number_kids_under18 0.9546
## RuralUrban 5.95e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 27327 on 20984 degrees of freedom
## Residual deviance: 25393 on 20973 degrees of freedom
## AIC: 25417
##
## Number of Fisher Scoring iterations: 4
conf_int <- confint(m1d) # Calculate the confidence intervals
## Waiting for profiling to be done...
tidy_m1d <- tidy(m1d, conf.int = TRUE) # Convert to data frame, note that this requires using the 'broom' package
tidy_m1d$signif <- symnum(tidy_m1d$p.value, corr = FALSE, na = FALSE,
cutpoints = c(0, 0.001, 0.01, 0.05, 0.1, 1),
symbols = c("***", "**", "*", ".", " ")) # Create a new column for significance levels
# Model_2_d
m2d <- glm(dv_health ~ edu_homogamy + edu_hypogamy + birth_cohort + education_level + work + number_kids_under18 + Rural , family = binomial, data = data_female)
conf_int <- confint(m2d) # Calculate the confidence intervals
## Waiting for profiling to be done...
tidy_m2d <- tidy(m2d, conf.int = TRUE) # Convert to data frame, note that this requires using the 'broom' package
tidy_m2d$signif <- symnum(tidy_m2d$p.value, corr = FALSE, na = FALSE,
cutpoints = c(0, 0.001, 0.01, 0.05, 0.1, 1),
symbols = c("***", "**", "*", ".", " ")) # Create a new column for significance levels
## 5. Logistics Model 1 to Model 4
## get the table of outcome
library(officer)
library(flextable)
library(stargazer)
stargazer(m1c, m2c, m1d, m2d, type = "text",
model.names = TRUE, out = "models_table.tex",
star.cutoffs = c(0.05, 0.01, 0.001))
##
## ==============================================================================================
## Dependent variable:
## -----------------------------------------------
## dv_health
## logistic
## (1) (2) (3) (4)
## ----------------------------------------------------------------------------------------------
## edu_homogamy -0.108*** -0.172*** 0.073* 0.148***
## (0.032) (0.035) (0.031) (0.032)
##
## edu_hypogamy -0.320*** 0.465***
## (0.063) (0.065)
##
## birth_cohort1950-1978 0.007 0.024 -0.118 -0.115
## (0.049) (0.049) (0.061) (0.061)
##
## birth_cohort1979-1989 -0.693*** -0.654*** -0.830*** -0.817***
## (0.066) (0.067) (0.072) (0.072)
##
## birth_cohort1990-1998 -1.215*** -1.164*** -1.297*** -1.291***
## (0.135) (0.135) (0.110) (0.110)
##
## education_levelCollege or above -0.173* -0.189* 0.040 0.035
## (0.084) (0.084) (0.093) (0.093)
##
## education_levelHigh school or technical school 0.194** 0.198** 0.291*** 0.314***
## (0.067) (0.067) (0.074) (0.075)
##
## education_levelJunior high school 0.499*** 0.520*** 0.567*** 0.636***
## (0.064) (0.064) (0.070) (0.071)
##
## education_levelPrimary school or below 0.866*** 0.922*** 1.100*** 1.208***
## (0.070) (0.071) (0.072) (0.074)
##
## workWork -0.574*** -0.573*** -0.316*** -0.322***
## (0.040) (0.040) (0.033) (0.033)
##
## number_kids_under18 0.025 0.022 -0.001 -0.003
## (0.025) (0.025) (0.026) (0.026)
##
## RuralUrban -0.269*** -0.243*** -0.229*** -0.218***
## (0.033) (0.034) (0.035) (0.035)
##
## Constant 0.703*** 0.704*** 0.451*** 0.298**
## (0.082) (0.082) (0.096) (0.098)
##
## ----------------------------------------------------------------------------------------------
## Observations 20,985 20,985 20,985 20,985
## Log Likelihood -13,277.300 -13,264.550 -12,696.590 -12,670.250
## Akaike Inf. Crit. 26,578.590 26,555.100 25,417.180 25,366.500
## ==============================================================================================
## Note: *p<0.05; **p<0.01; ***p<0.001
library(broom)
library(dplyr)
library(ggplot2)
# get the coefficients
tidy_m1c <- tidy(m1c, conf.int = TRUE) %>% mutate(model = "Model_1C")
tidy_m2c <- tidy(m2c, conf.int = TRUE) %>% mutate(model = "Model_2C")
tidy_m1d <- tidy(m1d, conf.int = TRUE) %>% mutate(model = "Model_1D")
tidy_m2d <- tidy(m2d, conf.int = TRUE) %>% mutate(model = "Model_2D")
# merge
coefficients_combined2<- bind_rows(tidy_m1c, tidy_m2c, tidy_m1d, tidy_m2d)
coefficients_combined2
# generate plot
ggplot(coefficients_combined2, aes(x = term, y = estimate, ymin = conf.low, ymax = conf.high, color = model)) +
geom_pointrange() +
geom_hline(yintercept = 0, linetype = "dashed") +
facet_wrap(~model, scales = "free_x") +
coord_flip() +
labs(x = "Term", y = "Coefficient Estimate", title = "The results for Model 1 and 2, by gender and cohort") +
theme_minimal() +
theme(legend.position = "bottom")

Calculate the odds of results
probability_change_ma1 <- (exp(0.08) - 1) * 100
probability_change_ma2_homogamy <- (exp(0.15) - 1) * 100
probability_change_ma2_hypogamy <- (exp(0.27) - 1) * 100
probability_change_mb1 <- (1- exp(-0.06)) * 100
probability_change_mb2_homogamy <- (1 - exp(-0.11)) * 100
probability_change_mb2_hypogamy <- (1 - exp(-0.45) ) * 100