1. Load Data and clean up – CHFS 2019.

## load data

library(haven)
chfs2019_ind_202112 <- read_dta("chfs2019_ind_202112.dta") 
chfs2019_hh_202112 <- read_dta("chfs2019_hh_202112.dta") 
chfs2019_master_202112 <- read_dta("chfs2019_master_202112.dta") 

## Individual data pick variables _ data_ind
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

data_ind <- chfs2019_ind_202112 %>% select(hhid, pline, a2001, a2003, a2005, a2011b, a2012, a2015, a2022, a2024, a2025b, a3132b, a3132d, f2001a, f6001a) ## Select relationship, gender, birth year,ethnicity, education level, party membership,Hukou, marital status, health, work, type of work, medical insurance, business_health_insurance

data_ind <- data_ind %>%
  rename(relationship = a2001,gender = a2003, birth_year = a2005, ethnicity = a2011b, education = a2012, Party = a2015, Hukou = a2022, marital = a2024, health = a2025b, work = a3132b, work_type = a3132d, medical_insurance=f2001a, business_health_insurance=f6001a)## rename variables

summary(data_ind)

##      hhid               pline         relationship         gender     
##  Length:107008      Min.   : 1.000   Min.   :   1.00   Min.   :1.000  
##  Class :character   1st Qu.: 1.000   1st Qu.:   1.00   1st Qu.:1.000  
##  Mode  :character   Median : 2.000   Median :   2.00   Median :1.000  
##                     Mean   : 4.199   Mean   :  35.45   Mean   :1.494  
##                     3rd Qu.: 4.000   3rd Qu.:   6.00   3rd Qu.:2.000  
##                     Max.   :39.000   Max.   :7777.00   Max.   :2.000  
##                                                        NA's   :7      
##    birth_year     ethnicity       education         Party      
##  Min.   :1902   Min.   : 1.00   Min.   :1.00    Min.   :1.000  
##  1st Qu.:1957   1st Qu.: 1.00   1st Qu.:2.00    1st Qu.:2.000  
##  Median :1972   Median : 1.00   Median :3.00    Median :2.000  
##  Mean   :1975   Mean   : 1.39   Mean   :3.43    Mean   :1.867  
##  3rd Qu.:1993   3rd Qu.: 1.00   3rd Qu.:4.00    3rd Qu.:2.000  
##  Max.   :2019   Max.   :10.00   Max.   :9.00    Max.   :2.000  
##  NA's   :51     NA's   :44829   NA's   :15989   NA's   :29678  
##      Hukou            marital          health           work      
##  Min.   :   1.00   Min.   :1.000   Min.   :1.000   Min.   :1.000  
##  1st Qu.:   1.00   1st Qu.:2.000   1st Qu.:2.000   1st Qu.:1.000  
##  Median :   1.00   Median :2.000   Median :2.000   Median :1.000  
##  Mean   :  10.35   Mean   :2.188   Mean   :2.541   Mean   :1.415  
##  3rd Qu.:   2.00   3rd Qu.:2.000   3rd Qu.:3.000   3rd Qu.:2.000  
##  Max.   :7777.00   Max.   :6.000   Max.   :5.000   Max.   :2.000  
##  NA's   :198       NA's   :18010   NA's   :57      NA's   :17067  
##    work_type     medical_insurance business_health_insurance
##  Min.   :1.00    Min.   :   1.0    Length:107008            
##  1st Qu.:1.00    1st Qu.:   2.0    Class :character         
##  Median :2.00    Median :   3.0    Mode  :character         
##  Mean   :3.75    Mean   : 703.9                             
##  3rd Qu.:7.00    3rd Qu.:   3.0                             
##  Max.   :7.00    Max.   :7788.0                             
##  NA's   :54523   NA's   :248

## Master data pick variables _ data_master

data_master <- chfs2019_master_202112 %>% select(hhid, pline, rural, weight_hh, weight_ind, region, city_level, total_income, total_consump, total_asset, total_debt) 

summary(data_master)

##      hhid               pline            rural          weight_hh     
##  Length:107008      Min.   : 1.000   Min.   :0.0000   Min.   :   140  
##  Class :character   1st Qu.: 1.000   1st Qu.:0.0000   1st Qu.:  4637  
##  Mode  :character   Median : 2.000   Median :0.0000   Median :  8840  
##                     Mean   : 4.199   Mean   :0.3797   Mean   : 12477  
##                     3rd Qu.: 4.000   3rd Qu.:1.0000   3rd Qu.: 15983  
##                     Max.   :39.000   Max.   :1.0000   Max.   :122576  
##                                                                       
##    weight_ind          region        city_level     total_income     
##  Min.   :  616.6   Min.   :1.000   Min.   :1.000   Min.   :-5493190  
##  1st Qu.: 4002.2   1st Qu.:1.000   1st Qu.:2.000   1st Qu.:   23616  
##  Median : 8664.7   Median :2.000   Median :3.000   Median :   58190  
##  Mean   :12639.3   Mean   :2.143   Mean   :2.364   Mean   :   92183  
##  3rd Qu.:16792.4   3rd Qu.:3.000   3rd Qu.:3.000   3rd Qu.:  109454  
##  Max.   :61153.5   Max.   :4.000   Max.   :3.000   Max.   :12122418  
##                                                    NA's   :254       
##  total_consump        total_asset          total_debt      
##  Min.   :     1188   Min.   :0.000e+00   Min.   :       0  
##  1st Qu.:    35380   1st Qu.:1.577e+05   1st Qu.:       0  
##  Median :    61120   Median :4.350e+05   Median :       0  
##  Mean   :   101760   Mean   :1.187e+06   Mean   :   84081  
##  3rd Qu.:   102695   3rd Qu.:1.125e+06   3rd Qu.:   50000  
##  Max.   :169546352   Max.   :2.100e+09   Max.   :40658344  
##                                          NA's   :16794

## Household data pick variables _ data_hh

data_hh <- chfs2019_hh_202112 %>% select(hhid, c1001, e4001) 

data_hh <- data_hh %>%
  rename( house_nature = c1001, medical_liabilities = e4001 )

summary(data_hh)

##      hhid            house_nature   medical_liabilities
##  Length:34643       Min.   :1.000   Min.   :1.000      
##  Class :character   1st Qu.:1.000   1st Qu.:2.000      
##  Mode  :character   Median :1.000   Median :2.000      
##                     Mean   :1.214   Mean   :1.954      
##                     3rd Qu.:1.000   3rd Qu.:2.000      
##                     Max.   :3.000   Max.   :2.000      
##                     NA's   :25      NA's   :29

## merge data


data_master$hhid <- as.character(data_master$hhid)
data_master$pline <- as.character(data_master$pline)

data_master$id = paste(data_master$hhid, data_master$pline, sep = "_")## generate a new unique variable to as identification


data_ind$hhid <- as.character(data_ind$hhid)
data_ind$pline <- as.character(data_ind$pline)

data_ind$id = paste(data_ind$hhid, data_ind$pline, sep = "_") ## generate a new unique variable to as identification

nrow(chfs2019_master_202112$pline)

## NULL

nrow(chfs2019_ind_202112$pline)

## NULL

nrow(chfs2019_hh_202112$hhid)

## NULL

#merging individual to master, M1
data_master_ind <- merge(data_ind, data_master, by="id", all =T) ## first merge, master and individual


data_master_ind <- data_master_ind %>%
  rename(hhid = hhid.x, pline = pline.x ) ## rename variable


data_master_ind_hh <- merge(data_master_ind, data_hh, by="hhid", all=T) ## second merge, master, individual and household

## generate a new variable: Number of kids 

data_master_ind_hh$number_kids <- data_master_ind_hh$relationship == 6 & data_master_ind_hh$birth_year < 2001

library(dplyr)
family_kids_number <- data_master_ind_hh %>% 
  group_by(hhid) %>% 
  summarise(number_kids_under18 = sum(number_kids))

data_master_ind_hh <- merge(data_master_ind_hh, family_kids_number, by="hhid")

data_master_ind_hh$number_kids_under18.x<- NULL
data_master_ind_hh$number_kids_under18.y <- NULL

summary(data_master_ind_hh$number_kids_under18)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##  0.0000  0.0000  0.0000  0.5161  1.0000  4.0000      45

## Final data after filters


Data_total <- data_master_ind_hh %>% 
  filter(relationship==1 | relationship==2) %>% 
  filter(birth_year < 1999) %>% 
  filter(marital==2 | marital==3) 

summary(Data_total)

##      hhid                id               pline            relationship  
##  Length:56116       Length:56116       Length:56116       Min.   :1.000  
##  Class :character   Class :character   Class :character   1st Qu.:1.000  
##  Mode  :character   Mode  :character   Mode  :character   Median :1.000  
##                                                           Mean   :1.491  
##                                                           3rd Qu.:2.000  
##                                                           Max.   :2.000  
##                                                                          
##      gender      birth_year     ethnicity        education        Party      
##  Min.   :1.0   Min.   :1903   Min.   : 1.000   Min.   :1.00   Min.   :1.000  
##  1st Qu.:1.0   1st Qu.:1954   1st Qu.: 1.000   1st Qu.:2.00   1st Qu.:2.000  
##  Median :2.0   Median :1964   Median : 1.000   Median :3.00   Median :2.000  
##  Mean   :1.5   Mean   :1964   Mean   : 1.384   Mean   :3.25   Mean   :1.853  
##  3rd Qu.:2.0   3rd Qu.:1973   3rd Qu.: 1.000   3rd Qu.:4.00   3rd Qu.:2.000  
##  Max.   :2.0   Max.   :1998   Max.   :10.000   Max.   :9.00   Max.   :2.000  
##                               NA's   :98       NA's   :41     NA's   :2102   
##      Hukou             marital          health           work      
##  Min.   :   1.000   Min.   :2.000   Min.   :1.000   Min.   :1.000  
##  1st Qu.:   1.000   1st Qu.:2.000   1st Qu.:2.000   1st Qu.:1.000  
##  Median :   1.000   Median :2.000   Median :3.000   Median :1.000  
##  Mean   :   8.786   Mean   :2.004   Mean   :2.768   Mean   :1.371  
##  3rd Qu.:   2.000   3rd Qu.:2.000   3rd Qu.:3.000   3rd Qu.:2.000  
##  Max.   :7777.000   Max.   :3.000   Max.   :5.000   Max.   :2.000  
##  NA's   :60                         NA's   :10      NA's   :14     
##    work_type     medical_insurance business_health_insurance    hhid.y         
##  Min.   :1.000   Min.   :   1.0    Length:56116              Length:56116      
##  1st Qu.:1.000   1st Qu.:   1.0    Class :character          Class :character  
##  Median :4.000   Median :   3.0    Mode  :character          Mode  :character  
##  Mean   :4.131   Mean   : 459.9                                                
##  3rd Qu.:7.000   3rd Qu.:   3.0                                                
##  Max.   :7.000   Max.   :7788.0                                                
##  NA's   :20878   NA's   :79                                                    
##    pline.y              rural          weight_hh        weight_ind     
##  Length:56116       Min.   :0.0000   Min.   :   140   Min.   :  616.6  
##  Class :character   1st Qu.:0.0000   1st Qu.:  4847   1st Qu.: 3694.0  
##  Mode  :character   Median :0.0000   Median :  9161   Median : 7722.4  
##                     Mean   :0.3666   Mean   : 12851   Mean   :11172.0  
##                     3rd Qu.:1.0000   3rd Qu.: 16394   3rd Qu.:14703.1  
##                     Max.   :1.0000   Max.   :122576   Max.   :61153.5  
##                                                                        
##      region       city_level     total_income      total_consump      
##  Min.   :1.00   Min.   :1.000   Min.   :-5493190   Min.   :     1968  
##  1st Qu.:1.00   1st Qu.:1.000   1st Qu.:   20590   1st Qu.:    31622  
##  Median :2.00   Median :3.000   Median :   54918   Median :    56040  
##  Mean   :2.13   Mean   :2.308   Mean   :   86683   Mean   :    86710  
##  3rd Qu.:3.00   3rd Qu.:3.000   3rd Qu.:  103007   3rd Qu.:    95209  
##  Max.   :4.00   Max.   :3.000   Max.   :12122418   Max.   :169546352  
##                                 NA's   :147                           
##   total_asset          total_debt        house_nature   medical_liabilities
##  Min.   :0.000e+00   Min.   :       0   Min.   :1.000   Min.   :1.000      
##  1st Qu.:1.486e+05   1st Qu.:       0   1st Qu.:1.000   1st Qu.:2.000      
##  Median :4.341e+05   Median :       0   Median :1.000   Median :2.000      
##  Mean   :1.205e+06   Mean   :   77493   Mean   :1.183   Mean   :1.955      
##  3rd Qu.:1.141e+06   3rd Qu.:   35000   3rd Qu.:1.000   3rd Qu.:2.000      
##  Max.   :2.100e+09   Max.   :40658344   Max.   :3.000   Max.   :2.000      
##                      NA's   :9847       NA's   :28      NA's   :43         
##  number_kids     number_kids_under18
##  Mode :logical   Min.   :0.0000     
##  FALSE:56116     1st Qu.:0.0000     
##                  Median :0.0000     
##                  Mean   :0.3856     
##                  3rd Qu.:1.0000     
##                  Max.   :4.0000     
##                  NA's   :2

  ## Filters: Family relationship is respondent or spouse, respondent is 1, spouse is 2, Age greater than 20 years, i.e. year of birth before 1999, Marital status married or cohabiting, married 2, cohabiting 3

Data_total$pline.y <- NULL
Data_total$hhid.y <- NULL

Data_total$business_health_insurance <- as.numeric(Data_total$business_health_insurance)

## Warning: NAs introduced by coercion

library(car)

## Loading required package: carData

## 
## Attaching package: 'car'

## The following object is masked from 'package:dplyr':
## 
##     recode

Data_total$Hukou <- car::recode(Data_total$Hukou, recodes = "7777=NA", as.numeric = T)
Data_total$medical_insurance <- car::recode(Data_total$medical_insurance, recodes = "7788=0", as.numeric = T)
Data_total$business_health_insurance <- car::recode(Data_total$business_health_insurance, recodes = "1=0; 2=1; 3=0; 7788=0", as.numeric = T)
Data_total$medical_liabilities <- car::recode(Data_total$medical_liabilities, recodes = "1=1; 2=0", as.numeric = T) ## recode variables

Data_total <- Data_total %>% 
   filter(!is.na(ethnicity)) %>% 
   filter(!is.na(education)) %>% 
   filter(!is.na(Party)) %>% 
   filter(!is.na(health)) %>% 
   filter(!is.na(Hukou)) %>% 
   filter(!is.na(work)) %>% 
   filter(!is.na(medical_insurance)) %>% 
   filter(!is.na(business_health_insurance)) %>% 
   filter(!is.na(total_income)) %>% 
   filter(!is.na(total_debt)) %>% 
   filter(!is.na(house_nature)) %>% 
   filter(!is.na(medical_liabilities)) %>% 
   filter(!is.na(number_kids_under18))  ## delete missing value

2. Descriptive Statistics – Educational Assortative Mating and Health Outcomes

##Dependent variable: Self-assessed health

summary(Data_total$health)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   2.000   3.000   2.781   3.000   5.000

table(Data_total$health)

## 
##     1     2     3     4     5 
##  4686 11751 17641  7275  2167

Data_total$dv_health <- car::recode(Data_total$health, "1:2=1 ; 3:5=0",as.numeric = T)

table(Data_total$dv_health)

## 
##     0     1 
## 27083 16437

Data_total$dv_health <- car::recode(Data_total$dv_health,recodes = "1='Healthy';0='Unhealthy'",as.factor=T)



## Independent variable: Education Pairing

summary(Data_total$relationship)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   1.000   1.000   1.491   2.000   2.000

summary(Data_total$gender)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   1.000   1.000   1.499   2.000   2.000

summary(Data_total$education)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   2.000   3.000   3.128   4.000   9.000

table(Data_total$relationship)

## 
##     1     2 
## 22162 21358

table(Data_total$gender)

## 
##     1     2 
## 21801 21719

table(Data_total$education)

## 
##     1     2     3     4     5     6     7     8     9 
##  4365 12126 14833  5595  1623  2579  2156   212    31

Data_total$education_level <- car::recode(Data_total$education, "1:2=1; 3=2; 4:5=3; 6=4; 7:9=5",as.numeric = T)
table(Data_total$education_level)

## 
##     1     2     3     4     5 
## 16491 14833  7218  2579  2399

data_res_heterogomy <- filter(Data_total, relationship==1) ## Create new Data "data_res_heterogomy", Select family member relationship is 1,  1 means respondents.
data_spo_heterogomy <- filter(Data_total, relationship==2) ## Create new Data "data_spo_heterogomy", Select family member relationship is 2,  2 means spouse.

merge_data <- merge(data_res_heterogomy,data_spo_heterogomy, by="hhid") ## merge the first data to get independent variable
summary(merge_data)

##      hhid               id.x             pline.x          relationship.x
##  Length:20985       Length:20985       Length:20985       Min.   :1     
##  Class :character   Class :character   Class :character   1st Qu.:1     
##  Mode  :character   Mode  :character   Mode  :character   Median :1     
##                                                           Mean   :1     
##                                                           3rd Qu.:1     
##                                                           Max.   :1     
##                                                                         
##     gender.x      birth_year.x   ethnicity.x      education.x   
##  Min.   :1.000   Min.   :1927   Min.   : 1.000   Min.   :1.000  
##  1st Qu.:1.000   1st Qu.:1955   1st Qu.: 1.000   1st Qu.:2.000  
##  Median :1.000   Median :1964   Median : 1.000   Median :3.000  
##  Mean   :1.439   Mean   :1965   Mean   : 1.397   Mean   :3.205  
##  3rd Qu.:2.000   3rd Qu.:1973   3rd Qu.: 1.000   3rd Qu.:4.000  
##  Max.   :2.000   Max.   :1998   Max.   :10.000   Max.   :9.000  
##                                                                 
##     Party.x         Hukou.x        marital.x        health.x    
##  Min.   :1.000   Min.   :1.000   Min.   :2.000   Min.   :1.000  
##  1st Qu.:2.000   1st Qu.:1.000   1st Qu.:2.000   1st Qu.:2.000  
##  Median :2.000   Median :1.000   Median :2.000   Median :3.000  
##  Mean   :1.845   Mean   :1.475   Mean   :2.003   Mean   :2.756  
##  3rd Qu.:2.000   3rd Qu.:2.000   3rd Qu.:2.000   3rd Qu.:3.000  
##  Max.   :2.000   Max.   :4.000   Max.   :3.000   Max.   :5.000  
##                                                                 
##      work.x       work_type.x    medical_insurance.x
##  Min.   :1.000   Min.   :1.000   Min.   :0.000      
##  1st Qu.:1.000   1st Qu.:2.000   1st Qu.:1.000      
##  Median :1.000   Median :6.000   Median :3.000      
##  Mean   :1.299   Mean   :4.503   Mean   :2.387      
##  3rd Qu.:2.000   3rd Qu.:7.000   3rd Qu.:3.000      
##  Max.   :2.000   Max.   :7.000   Max.   :5.000      
##                  NA's   :6318                       
##  business_health_insurance.x    rural.x        weight_hh.x    
##  Min.   :0.00000             Min.   :0.0000   Min.   :   140  
##  1st Qu.:0.00000             1st Qu.:0.0000   1st Qu.:  5048  
##  Median :0.00000             Median :0.0000   Median :  9466  
##  Mean   :0.03212             Mean   :0.4245   Mean   : 13511  
##  3rd Qu.:0.00000             3rd Qu.:1.0000   3rd Qu.: 17560  
##  Max.   :1.00000             Max.   :1.0000   Max.   :122576  
##                                                               
##   weight_ind.x        region.x      city_level.x   total_income.x    
##  Min.   :  616.6   Min.   :1.000   Min.   :1.000   Min.   :-5493190  
##  1st Qu.: 3726.5   1st Qu.:1.000   1st Qu.:2.000   1st Qu.:   18256  
##  Median : 7855.4   Median :2.000   Median :3.000   Median :   50195  
##  Mean   :11408.1   Mean   :2.087   Mean   :2.385   Mean   :   80946  
##  3rd Qu.:15164.3   3rd Qu.:3.000   3rd Qu.:3.000   3rd Qu.:   98342  
##  Max.   :61153.5   Max.   :4.000   Max.   :3.000   Max.   :12122418  
##                                                                      
##  total_consump.x     total_asset.x        total_debt.x      house_nature.x 
##  Min.   :     2026   Min.   :0.000e+00   Min.   :       0   Min.   :1.000  
##  1st Qu.:    29878   1st Qu.:1.382e+05   1st Qu.:       0   1st Qu.:1.000  
##  Median :    53600   Median :4.020e+05   Median :       0   Median :1.000  
##  Mean   :    85748   Mean   :1.136e+06   Mean   :   71868   Mean   :1.158  
##  3rd Qu.:    91980   3rd Qu.:1.056e+06   3rd Qu.:   32000   3rd Qu.:1.000  
##  Max.   :169546352   Max.   :2.100e+09   Max.   :14045000   Max.   :3.000  
##                                                                            
##  medical_liabilities.x number_kids.x   number_kids_under18.x    dv_health.x   
##  Min.   :0.00000       Mode :logical   Min.   :0.0000        Healthy  : 7957  
##  1st Qu.:0.00000       FALSE:20985     1st Qu.:0.0000        Unhealthy:13028  
##  Median :0.00000                       Median :0.0000                         
##  Mean   :0.05418                       Mean   :0.4167                         
##  3rd Qu.:0.00000                       3rd Qu.:1.0000                         
##  Max.   :1.00000                       Max.   :4.0000                         
##                                                                               
##  education_level.x     id.y             pline.y          relationship.y
##  Min.   :1.000     Length:20985       Length:20985       Min.   :2     
##  1st Qu.:1.000     Class :character   Class :character   1st Qu.:2     
##  Median :2.000     Mode  :character   Mode  :character   Median :2     
##  Mean   :2.121                                           Mean   :2     
##  3rd Qu.:3.000                                           3rd Qu.:2     
##  Max.   :5.000                                           Max.   :2     
##                                                                        
##     gender.y      birth_year.y   ethnicity.y    education.y       Party.y     
##  Min.   :1.000   Min.   :1923   Min.   : 1.0   Min.   :1.000   Min.   :1.000  
##  1st Qu.:1.000   1st Qu.:1956   1st Qu.: 1.0   1st Qu.:2.000   1st Qu.:2.000  
##  Median :2.000   Median :1965   Median : 1.0   Median :3.000   Median :2.000  
##  Mean   :1.561   Mean   :1965   Mean   : 1.4   Mean   :3.032   Mean   :1.903  
##  3rd Qu.:2.000   3rd Qu.:1974   3rd Qu.: 1.0   3rd Qu.:4.000   3rd Qu.:2.000  
##  Max.   :2.000   Max.   :1998   Max.   :10.0   Max.   :9.000   Max.   :2.000  
##                                                                               
##     Hukou.y        marital.y        health.y         work.y     
##  Min.   :1.000   Min.   :2.000   Min.   :1.000   Min.   :1.000  
##  1st Qu.:1.000   1st Qu.:2.000   1st Qu.:2.000   1st Qu.:1.000  
##  Median :1.000   Median :2.000   Median :3.000   Median :1.000  
##  Mean   :1.468   Mean   :2.003   Mean   :2.812   Mean   :1.312  
##  3rd Qu.:2.000   3rd Qu.:2.000   3rd Qu.:3.000   3rd Qu.:2.000  
##  Max.   :4.000   Max.   :3.000   Max.   :5.000   Max.   :2.000  
##                                                                 
##   work_type.y    medical_insurance.y business_health_insurance.y
##  Min.   :1.000   Min.   :0.000       Min.   :0.0000             
##  1st Qu.:2.000   1st Qu.:1.000       1st Qu.:0.0000             
##  Median :4.000   Median :3.000       Median :0.0000             
##  Mean   :4.246   Mean   :2.381       Mean   :0.0284             
##  3rd Qu.:7.000   3rd Qu.:3.000       3rd Qu.:0.0000             
##  Max.   :7.000   Max.   :5.000       Max.   :1.0000             
##  NA's   :6583                                                   
##     rural.y        weight_hh.y      weight_ind.y        region.y    
##  Min.   :0.0000   Min.   :   140   Min.   :  616.6   Min.   :1.000  
##  1st Qu.:0.0000   1st Qu.:  5048   1st Qu.: 3791.8   1st Qu.:1.000  
##  Median :0.0000   Median :  9466   Median : 7948.4   Median :2.000  
##  Mean   :0.4245   Mean   : 13511   Mean   :11455.3   Mean   :2.087  
##  3rd Qu.:1.0000   3rd Qu.: 17560   3rd Qu.:15105.2   3rd Qu.:3.000  
##  Max.   :1.0000   Max.   :122576   Max.   :61153.5   Max.   :4.000  
##                                                                     
##   city_level.y   total_income.y     total_consump.y     total_asset.y      
##  Min.   :1.000   Min.   :-5493190   Min.   :     2026   Min.   :0.000e+00  
##  1st Qu.:2.000   1st Qu.:   18256   1st Qu.:    29878   1st Qu.:1.382e+05  
##  Median :3.000   Median :   50195   Median :    53600   Median :4.020e+05  
##  Mean   :2.385   Mean   :   80946   Mean   :    85748   Mean   :1.136e+06  
##  3rd Qu.:3.000   3rd Qu.:   98342   3rd Qu.:    91980   3rd Qu.:1.056e+06  
##  Max.   :3.000   Max.   :12122418   Max.   :169546352   Max.   :2.100e+09  
##                                                                            
##   total_debt.y      house_nature.y  medical_liabilities.y number_kids.y  
##  Min.   :       0   Min.   :1.000   Min.   :0.00000       Mode :logical  
##  1st Qu.:       0   1st Qu.:1.000   1st Qu.:0.00000       FALSE:20985    
##  Median :       0   Median :1.000   Median :0.00000                      
##  Mean   :   71868   Mean   :1.158   Mean   :0.05418                      
##  3rd Qu.:   32000   3rd Qu.:1.000   3rd Qu.:0.00000                      
##  Max.   :14045000   Max.   :3.000   Max.   :1.00000                      
##                                                                          
##  number_kids_under18.y    dv_health.y    education_level.y
##  Min.   :0.0000        Healthy  : 7838   Min.   :1.000    
##  1st Qu.:0.0000        Unhealthy:13147   1st Qu.:1.000    
##  Median :0.0000                          Median :2.000    
##  Mean   :0.4167                          Mean   :2.007    
##  3rd Qu.:1.0000                          3rd Qu.:3.000    
##  Max.   :4.0000                          Max.   :5.000    
##

## Educational Assortative Mating - A Series of Dummy Variables: Homogamy, Hypergamy, Hypogamy.

## 1. Educational Homogamy

library(dplyr)
library(tidyr)

merge_data <- merge_data %>%
  mutate(edu_homogamy = case_when( education_level.x == education_level.y ~ 1,
                                   TRUE ~ 0
                                   ))

## 2. Educational Hypergamy
merge_data <- merge_data %>%
  mutate(edu_hypergamy = case_when( (education_level.x > education_level.y) & gender.x == 1 ~ 1,
                                   TRUE ~ 0
                                   ))

## 3. Educational Hypogamy
merge_data <- merge_data %>%
  mutate(edu_hypogamy = case_when( (education_level.x > education_level.y) & gender.x == 2 ~ 1,
                                   TRUE ~ 0
                                   ))

merge <- merge_data %>% select(hhid, edu_homogamy, edu_hypergamy, edu_hypogamy) 

Data_total <- merge(Data_total, merge, by="hhid")


#Control variable:

#Gender

table(Data_total$gender)

## 
##     1     2 
## 20985 20985

Data_total$gender <- car::recode(Data_total$gender,recodes = "1='Male';2='Female'",as.factor=T)

class(Data_total$gender)

## [1] "factor"

Data_total$gender <- as.factor(Data_total$gender)

# Age

Data_total$age <- 2019- Data_total$birth_year
summary(Data_total$age)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   21.00   46.00   54.00   54.06   64.00   96.00

## Birth Cohort

Data_total <- Data_total %>%
  mutate(birth_cohort = cut(birth_year, breaks = c(1922, 1949, 1978, 1989, 1998), labels = c("Before 1949", "1950-1978", "1979-1989", "1990-1998")))

# Education Level

class(Data_total$education_level)

## [1] "numeric"

Data_total$education_level <- car::recode(Data_total$education_level,recodes = "1='Primary school or below';2='Junior high school'; 3='High school or technical school'; 4=' Junior college or vocational college'; 5='College or above'",as.factor=T)

Data_total$education_level <- as.factor(Data_total$education_level)

# Work

table(Data_total$work)

## 
##     1     2 
## 29130 12840

Data_total$work <- car::recode(Data_total$work, recodes = "1='Work' ; 2='No work'", as.factor=F)
Data_total$work <- as.factor(Data_total$work)
table(Data_total$work)

## 
## No work    Work 
##   12840   29130

# Number of kids 

table(Data_total$number_kids_under18)

## 
##     0     1     2     3     4 
## 27148 12456  2100   230    36

# Rural

table(Data_total$rural)

## 
##     0     1 
## 24154 17816

Data_total$rural <- car::recode(Data_total$rural,recodes = "0='Urban';1='Rural'",as.factor=T)
Data_total$rural <- as.factor(Data_total$rural)

# export the table of descriptive statistics

library(dplyr)
library(gtsummary)
library(haven)


Data_total <- Data_total %>%
  mutate(
    region = as_factor(region),
    city_level = as_factor(city_level)
  )

tb <- Data_total %>% 
  mutate(
    dv_gender = interaction(dv_health, gender, sep = "_"),
    number_kids_under18 = as.numeric(as.character(number_kids_under18))
  ) %>%
  select(dv_gender, edu_homogamy,edu_hypergamy,edu_hypogamy, age, birth_cohort, education_level, work, number_kids_under18, rural) %>% 
  tbl_summary(
    by = dv_gender, 
    type = list(
      number_kids_under18 ~ "continuous"
    ),
    statistic = list(all_continuous() ~"{mean} ({sd})",
                     all_categorical() ~ "{n} ({p})"),
    digits = all_continuous() ~ 2,
    label = list(
      age ~ "Age",
      birth_cohort ~ "Birth Cohort",
      education_level ~ "Education",
      work ~ "Work",
      number_kids_under18 ~ "Number of kids under 18",
      rural ~ "Rural"
    ) 
  ) 


tb

Characteristic	Healthy_Female, N = 7,472¹	Unhealthy_Female, N = 13,513¹	Healthy_Male, N = 8,323¹	Unhealthy_Male, N = 12,662¹
edu_homogamy	4,091 (55)	7,659 (57)	4,564 (55)	7,186 (57)
edu_hypergamy	1,410 (19)	2,710 (20)	1,615 (19)	2,505 (20)
edu_hypogamy	581 (7.8)	844 (6.2)	644 (7.7)	781 (6.2)
Age	48.98 (12.75)	55.29 (11.32)	51.66 (12.98)	57.31 (11.43)
Birth Cohort
Before 1949	426 (5.7)	1,352 (10)	802 (9.6)	1,891 (15)
1950-1978	4,906 (66)	10,708 (79)	5,679 (68)	9,762 (77)
1979-1989	1,721 (23)	1,273 (9.4)	1,570 (19)	921 (7.3)
1990-1998	419 (5.6)	180 (1.3)	272 (3.3)	88 (0.7)
Education
Junior college or vocational college	690 (9.2)	436 (3.2)	754 (9.1)	566 (4.5)
College or above	638 (8.5)	363 (2.7)	812 (9.8)	453 (3.6)
High school or technical school	1,375 (18)	1,573 (12)	1,844 (22)	2,147 (17)
Junior high school	2,466 (33)	3,933 (29)	3,066 (37)	4,901 (39)
Primary school or below	2,303 (31)	7,208 (53)	1,847 (22)	4,595 (36)
Work
No work	2,496 (33)	5,510 (41)	1,389 (17)	3,445 (27)
Work	4,976 (67)	8,003 (59)	6,934 (83)	9,217 (73)
Number of kids under 18	0.36 (0.57)	0.45 (0.64)	0.37 (0.58)	0.45 (0.65)
Rural
Rural	2,495 (33)	6,413 (47)	2,941 (35)	5,967 (47)
Urban	4,977 (67)	7,100 (53)	5,382 (65)	6,695 (53)
¹ n (%); Mean (SD)

library(gt)
gt_tbl <- as_gt(tb)
gtsave(gt_tbl, "table.html")

3. Results of Logistics Models

## 1. Install The library packages

library(dplyr)
library(purrr)

## 
## Attaching package: 'purrr'

## The following object is masked from 'package:car':
## 
##     some

library(broom)
library(stargazer)

## 
## Please cite as:

##  Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.

##  R package version 5.2.3. https://CRAN.R-project.org/package=stargazer

library(gtsummary)

## 2. Recode The Rural Variables

table(Data_total$rural)

## 
## Rural Urban 
## 17816 24154

Data_total$Rural <- car::recode(Data_total$rural,recodes = "0='Urban';1='Rural'",as.factor=T)

Data_total$education_level <- as.factor(Data_total$education_level)
Data_total$work <- as.factor(Data_total$work) # Change the variables from continuous to factor.

## 3. Log weight individual level (No used) 

summary(Data_total$weight_ind)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   616.6  3749.1  7904.3 11431.8 15129.0 61153.5

Data_total$log_weight_ind <- log(Data_total$weight_ind + 1)

summary(Data_total$log_weight_ind)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   6.426   8.230   8.975   8.897   9.624  11.021

## 4. Separate the gender sample

data_male <- Data_total %>%
  filter(gender=='Male')
data_female <- Data_total %>%
  filter(gender=='Female')


## 5. Logistics Model 1a, Model 2a, Model 1b, Model 2b.


## For Male Sample
## Model_1_A


summary(Data_total$dv_health)

##   Healthy Unhealthy 
##     15795     26175

m1a <- glm(dv_health ~ edu_homogamy + age + education_level + work + number_kids_under18 + Rural , family = binomial, data = data_male)
summary((m1a))

## 
## Call:
## glm(formula = dv_health ~ edu_homogamy + age + education_level + 
##     work + number_kids_under18 + Rural, family = binomial, data = data_male)
## 
## Coefficients:
##                                                 Estimate Std. Error z value
## (Intercept)                                    -0.869695   0.115730  -7.515
## edu_homogamy                                   -0.093812   0.032083  -2.924
## age                                             0.023606   0.001442  16.372
## education_levelCollege or above                -0.200442   0.083176  -2.410
## education_levelHigh school or technical school  0.191265   0.066480   2.877
## education_levelJunior high school               0.504081   0.063581   7.928
## education_levelPrimary school or below          0.792811   0.069988  11.328
## workWork                                       -0.389226   0.041527  -9.373
## number_kids_under18                             0.112648   0.024044   4.685
## RuralUrban                                     -0.237055   0.033264  -7.126
##                                                Pr(>|z|)    
## (Intercept)                                    5.70e-14 ***
## edu_homogamy                                    0.00345 ** 
## age                                             < 2e-16 ***
## education_levelCollege or above                 0.01596 *  
## education_levelHigh school or technical school  0.00401 ** 
## education_levelJunior high school              2.22e-15 ***
## education_levelPrimary school or below          < 2e-16 ***
## workWork                                        < 2e-16 ***
## number_kids_under18                            2.80e-06 ***
## RuralUrban                                     1.03e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 28188  on 20984  degrees of freedom
## Residual deviance: 26566  on 20975  degrees of freedom
## AIC: 26586
## 
## Number of Fisher Scoring iterations: 4

conf_int <- confint(m1a) # Calculate the confidence intervals

## Waiting for profiling to be done...

tidy_m1a <- tidy(m1a, conf.int = TRUE) # Convert to data frame, note that this requires using the 'broom' package

tidy_m1a$signif <- symnum(tidy_m1a$p.value, corr = FALSE, na = FALSE,
                             cutpoints = c(0, 0.001, 0.01, 0.05, 0.1, 1),
                             symbols = c("***", "**", "*", ".", " ")) # Create a new column for significance levels


## Model_2_A

m2a <- glm(dv_health ~ edu_homogamy + edu_hypogamy + age + education_level + work + number_kids_under18 + Rural, family = binomial, data = data_male)

summary((m2a))

## 
## Call:
## glm(formula = dv_health ~ edu_homogamy + edu_hypogamy + age + 
##     education_level + work + number_kids_under18 + Rural, family = binomial, 
##     data = data_male)
## 
## Coefficients:
##                                                 Estimate Std. Error z value
## (Intercept)                                    -0.795678   0.117114  -6.794
## edu_homogamy                                   -0.148739   0.034580  -4.301
## edu_hypogamy                                   -0.269996   0.063047  -4.282
## age                                             0.022624   0.001461  15.490
## education_levelCollege or above                -0.214763   0.083282  -2.579
## education_levelHigh school or technical school  0.196397   0.066577   2.950
## education_levelJunior high school               0.523464   0.063833   8.201
## education_levelPrimary school or below          0.844458   0.071110  11.875
## workWork                                       -0.394719   0.041577  -9.494
## number_kids_under18                             0.109129   0.024071   4.534
## RuralUrban                                     -0.216853   0.033619  -6.450
##                                                Pr(>|z|)    
## (Intercept)                                    1.09e-11 ***
## edu_homogamy                                   1.70e-05 ***
## edu_hypogamy                                   1.85e-05 ***
## age                                             < 2e-16 ***
## education_levelCollege or above                 0.00992 ** 
## education_levelHigh school or technical school  0.00318 ** 
## education_levelJunior high school              2.39e-16 ***
## education_levelPrimary school or below          < 2e-16 ***
## workWork                                        < 2e-16 ***
## number_kids_under18                            5.80e-06 ***
## RuralUrban                                     1.12e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 28188  on 20984  degrees of freedom
## Residual deviance: 26548  on 20974  degrees of freedom
## AIC: 26570
## 
## Number of Fisher Scoring iterations: 4

conf_int <- confint(m2a) # Calculate the confidence intervals

## Waiting for profiling to be done...

tidy_m2a <- tidy(m2a, conf.int = TRUE) # Convert to data frame, note that this requires using the 'broom' package

tidy_m2a$signif <- symnum(tidy_m2a$p.value, corr = FALSE, na = FALSE,
                             cutpoints = c(0, 0.001, 0.01, 0.05, 0.1, 1),
                             symbols = c("***", "**", "*", ".", " ")) # Create a new column for significance levels






## For Female Sample
## Model_1_B


m1b <- glm(dv_health ~ edu_homogamy + age + education_level + work + number_kids_under18 + Rural , family = binomial, data = data_female)
summary((m1b))

## 
## Call:
## glm(formula = dv_health ~ edu_homogamy + age + education_level + 
##     work + number_kids_under18 + Rural, family = binomial, data = data_female)
## 
## Coefficients:
##                                                 Estimate Std. Error z value
## (Intercept)                                    -1.385272   0.104317 -13.279
## edu_homogamy                                    0.061167   0.030754   1.989
## age                                             0.028645   0.001437  19.936
## education_levelCollege or above                -0.004673   0.091979  -0.051
## education_levelHigh school or technical school  0.281330   0.073878   3.808
## education_levelJunior high school               0.570971   0.069638   8.199
## education_levelPrimary school or below          1.017151   0.072216  14.085
## workWork                                       -0.193138   0.034060  -5.671
## number_kids_under18                             0.100248   0.025062   4.000
## RuralUrban                                     -0.209937   0.035052  -5.989
##                                                Pr(>|z|)    
## (Intercept)                                     < 2e-16 ***
## edu_homogamy                                    0.04671 *  
## age                                             < 2e-16 ***
## education_levelCollege or above                 0.95948    
## education_levelHigh school or technical school  0.00014 ***
## education_levelJunior high school              2.42e-16 ***
## education_levelPrimary school or below          < 2e-16 ***
## workWork                                       1.42e-08 ***
## number_kids_under18                            6.33e-05 ***
## RuralUrban                                     2.11e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 27327  on 20984  degrees of freedom
## Residual deviance: 25356  on 20975  degrees of freedom
## AIC: 25376
## 
## Number of Fisher Scoring iterations: 4

conf_int <- confint(m1b) # Calculate the confidence intervals

## Waiting for profiling to be done...

tidy_m1b <- tidy(m1b, conf.int = TRUE) # Convert to data frame, note that this requires using the 'broom' package

tidy_m1b$signif <- symnum(tidy_m1b$p.value, corr = FALSE, na = FALSE,
                             cutpoints = c(0, 0.001, 0.01, 0.05, 0.1, 1),
                             symbols = c("***", "**", "*", ".", " ")) # Create a new column for significance levels


# Model_2_B


m2b <- glm(dv_health ~ edu_homogamy + edu_hypogamy + age + education_level + work + number_kids_under18 + Rural, family = binomial, data = data_female)

conf_int <- confint(m2b) # Calculate the confidence intervals

## Waiting for profiling to be done...

tidy_m2b <- tidy(m2b, conf.int = TRUE) # Convert to data frame, note that this requires using the 'broom' package

tidy_m2b$signif <- symnum(tidy_m2b$p.value, corr = FALSE, na = FALSE,
                             cutpoints = c(0, 0.001, 0.01, 0.05, 0.1, 1),
                             symbols = c("***", "**", "*", ".", " ")) # Create a new column for significance levels



## 6. Logistics Model 1 to Model 4 

## get the table of outcome

library(officer)
library(flextable)

## 
## Attaching package: 'flextable'

## The following object is masked from 'package:purrr':
## 
##     compose

## The following objects are masked from 'package:gtsummary':
## 
##     as_flextable, continuous_summary

library(stargazer)


stargazer(m1a, m2a, m1b, m2b, type = "text",
          model.names = TRUE, out = "models_table.tex",
          star.cutoffs = c(0.05, 0.01, 0.001))

## 
## ==============================================================================================
##                                                              Dependent variable:              
##                                                -----------------------------------------------
##                                                                   dv_health                   
##                                                                   logistic                    
##                                                    (1)         (2)         (3)         (4)    
## ----------------------------------------------------------------------------------------------
## edu_homogamy                                    -0.094**    -0.149***    0.061*     0.135***  
##                                                  (0.032)     (0.035)     (0.031)     (0.032)  
##                                                                                               
## edu_hypogamy                                                -0.270***               0.453***  
##                                                              (0.063)                 (0.064)  
##                                                                                               
## age                                             0.024***    0.023***    0.029***    0.028***  
##                                                  (0.001)     (0.001)     (0.001)     (0.001)  
##                                                                                               
## education_levelCollege or above                  -0.200*    -0.215**     -0.005      -0.008   
##                                                  (0.083)     (0.083)     (0.092)     (0.092)  
##                                                                                               
## education_levelHigh school or technical school   0.191**     0.196**    0.281***    0.304***  
##                                                  (0.066)     (0.067)     (0.074)     (0.074)  
##                                                                                               
## education_levelJunior high school               0.504***    0.523***    0.571***    0.639***  
##                                                  (0.064)     (0.064)     (0.070)     (0.071)  
##                                                                                               
## education_levelPrimary school or below          0.793***    0.844***    1.017***    1.125***  
##                                                  (0.070)     (0.071)     (0.072)     (0.074)  
##                                                                                               
## workWork                                        -0.389***   -0.395***   -0.193***   -0.199*** 
##                                                  (0.042)     (0.042)     (0.034)     (0.034)  
##                                                                                               
## number_kids_under18                             0.113***    0.109***    0.100***    0.098***  
##                                                  (0.024)     (0.024)     (0.025)     (0.025)  
##                                                                                               
## RuralUrban                                      -0.237***   -0.217***   -0.210***   -0.199*** 
##                                                  (0.033)     (0.034)     (0.035)     (0.035)  
##                                                                                               
## Constant                                        -0.870***   -0.796***   -1.385***   -1.512*** 
##                                                  (0.116)     (0.117)     (0.104)     (0.106)  
##                                                                                               
## ----------------------------------------------------------------------------------------------
## Observations                                     20,985      20,985      20,985      20,985   
## Log Likelihood                                 -13,282.930 -13,273.810 -12,677.990 -12,652.900
## Akaike Inf. Crit.                              26,585.870  26,569.620  25,375.980  25,327.810 
## ==============================================================================================
## Note:                                                            *p<0.05; **p<0.01; ***p<0.001

library(broom)
library(dplyr)
library(ggplot2)

# get the coefficients
tidy_m1a <- tidy(m1a, conf.int = TRUE) %>% mutate(model = "Model_1A")
tidy_m2a <- tidy(m2a, conf.int = TRUE) %>% mutate(model = "Model_2A")
tidy_m1b <- tidy(m1b, conf.int = TRUE) %>% mutate(model = "Model_1B")
tidy_m2b <- tidy(m2b, conf.int = TRUE) %>% mutate(model = "Model_2B")

# merge
coefficients_combined <- bind_rows(tidy_m1a, tidy_m2a, tidy_m1b, tidy_m2b)


# create the plot
ggplot(coefficients_combined, aes(x = term, y = estimate, ymin = conf.low, ymax = conf.high, color = model)) +
  geom_pointrange() +  
  geom_hline(yintercept = 0, linetype = "dashed") +  
  facet_wrap(~model, scales = "free_x") +  
  coord_flip() + 
  labs(x = "Term", y = "Coefficient Estimate", title = "The results for Model 1 and 2, by gender") +
  theme_minimal() + 
  theme(legend.position = "bottom")

4. Birth Cohort

## 1. Install The library packages

library(dplyr)
library(purrr)
library(broom)
library(stargazer)
library(gtsummary)

## 2. Birth Cohort

Data_total <- Data_total %>%
  mutate(birth_cohort = cut(birth_year, breaks = c(1922, 1949, 1978, 1989, 1998), labels = c("Before 1949", "1950-1978", "1979-1989", "1990-1998")))

summary(Data_total$birth_cohort)

## Before 1949   1950-1978   1979-1989   1990-1998 
##        4471       31055        5485         959

## 3. Separate the gender sample

data_male <- Data_total %>%
  filter(gender=='Male')
data_female <- Data_total %>%
  filter(gender=='Female')


## 4. Logistics Model 1c, Model 2c, Model 1d, Model 2d.


## For Male Sample
## Model_1_c

m1c <- glm(dv_health ~ edu_homogamy + birth_cohort + education_level + work + number_kids_under18 + Rural  , family = binomial, data = data_male)
summary((m1c))

## 
## Call:
## glm(formula = dv_health ~ edu_homogamy + birth_cohort + education_level + 
##     work + number_kids_under18 + Rural, family = binomial, data = data_male)
## 
## Coefficients:
##                                                 Estimate Std. Error z value
## (Intercept)                                     0.703310   0.081791   8.599
## edu_homogamy                                   -0.108479   0.032145  -3.375
## birth_cohort1950-1978                           0.007245   0.049339   0.147
## birth_cohort1979-1989                          -0.692936   0.066233 -10.462
## birth_cohort1990-1998                          -1.215081   0.134876  -9.009
## education_levelCollege or above                -0.172979   0.083759  -2.065
## education_levelHigh school or technical school  0.194396   0.066775   2.911
## education_levelJunior high school               0.498913   0.063933   7.804
## education_levelPrimary school or below          0.865930   0.069796  12.407
## workWork                                       -0.573542   0.039693 -14.450
## number_kids_under18                             0.024689   0.024802   0.995
## RuralUrban                                     -0.268865   0.033095  -8.124
##                                                Pr(>|z|)    
## (Intercept)                                     < 2e-16 ***
## edu_homogamy                                   0.000739 ***
## birth_cohort1950-1978                          0.883264    
## birth_cohort1979-1989                           < 2e-16 ***
## birth_cohort1990-1998                           < 2e-16 ***
## education_levelCollege or above                0.038904 *  
## education_levelHigh school or technical school 0.003600 ** 
## education_levelJunior high school              6.01e-15 ***
## education_levelPrimary school or below          < 2e-16 ***
## workWork                                        < 2e-16 ***
## number_kids_under18                            0.319518    
## RuralUrban                                     4.50e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 28188  on 20984  degrees of freedom
## Residual deviance: 26555  on 20973  degrees of freedom
## AIC: 26579
## 
## Number of Fisher Scoring iterations: 4

conf_int <- confint(m1c) # Calculate the confidence intervals

## Waiting for profiling to be done...

tidy_m1c <- tidy(m1c, conf.int = TRUE) # Convert to data frame, note that this requires using the 'broom' package

tidy_m1c$signif <- symnum(tidy_m1c$p.value, corr = FALSE, na = FALSE,
                             cutpoints = c(0, 0.001, 0.01, 0.05, 0.1, 1),
                             symbols = c("***", "**", "*", ".", " ")) # Create a new column for significance levels


## Model_2_c

m2c <- glm(dv_health ~ edu_homogamy + edu_hypogamy + birth_cohort + education_level + work + number_kids_under18 + Rural , family = binomial, data = data_male)

summary((m2c))

## 
## Call:
## glm(formula = dv_health ~ edu_homogamy + edu_hypogamy + birth_cohort + 
##     education_level + work + number_kids_under18 + Rural, family = binomial, 
##     data = data_male)
## 
## Coefficients:
##                                                Estimate Std. Error z value
## (Intercept)                                     0.70365    0.08188   8.594
## edu_homogamy                                   -0.17167    0.03452  -4.974
## edu_hypogamy                                   -0.32012    0.06319  -5.066
## birth_cohort1950-1978                           0.02431    0.04948   0.491
## birth_cohort1979-1989                          -0.65365    0.06672  -9.797
## birth_cohort1990-1998                          -1.16365    0.13538  -8.596
## education_levelCollege or above                -0.18945    0.08387  -2.259
## education_levelHigh school or technical school  0.19812    0.06689   2.962
## education_levelJunior high school               0.52012    0.06418   8.104
## education_levelPrimary school or below          0.92242    0.07079  13.031
## workWork                                       -0.57332    0.03972 -14.434
## number_kids_under18                             0.02227    0.02483   0.897
## RuralUrban                                     -0.24338    0.03350  -7.265
##                                                Pr(>|z|)    
## (Intercept)                                     < 2e-16 ***
## edu_homogamy                                   6.57e-07 ***
## edu_hypogamy                                   4.06e-07 ***
## birth_cohort1950-1978                           0.62319    
## birth_cohort1979-1989                           < 2e-16 ***
## birth_cohort1990-1998                           < 2e-16 ***
## education_levelCollege or above                 0.02390 *  
## education_levelHigh school or technical school  0.00306 ** 
## education_levelJunior high school              5.33e-16 ***
## education_levelPrimary school or below          < 2e-16 ***
## workWork                                        < 2e-16 ***
## number_kids_under18                             0.36968    
## RuralUrban                                     3.73e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 28188  on 20984  degrees of freedom
## Residual deviance: 26529  on 20972  degrees of freedom
## AIC: 26555
## 
## Number of Fisher Scoring iterations: 4

conf_int <- confint(m2c) # Calculate the confidence intervals

## Waiting for profiling to be done...

tidy_m2c <- tidy(m2c, conf.int = TRUE) # Convert to data frame, note that this requires using the 'broom' package

tidy_m2c$signif <- symnum(tidy_m2c$p.value, corr = FALSE, na = FALSE,
                             cutpoints = c(0, 0.001, 0.01, 0.05, 0.1, 1),
                             symbols = c("***", "**", "*", ".", " ")) # Create a new column for significance levels






## For Female Sample
## Model_1_d


m1d <- glm(dv_health ~ edu_homogamy + birth_cohort + education_level + work + number_kids_under18 + Rural , family = binomial, data = data_female)
summary((m1d))

## 
## Call:
## glm(formula = dv_health ~ edu_homogamy + birth_cohort + education_level + 
##     work + number_kids_under18 + Rural, family = binomial, data = data_female)
## 
## Coefficients:
##                                                 Estimate Std. Error z value
## (Intercept)                                     0.451034   0.095844   4.706
## edu_homogamy                                    0.072787   0.030769   2.366
## birth_cohort1950-1978                          -0.117934   0.060746  -1.941
## birth_cohort1979-1989                          -0.829674   0.071727 -11.567
## birth_cohort1990-1998                          -1.296671   0.109752 -11.815
## education_levelCollege or above                 0.040308   0.092834   0.434
## education_levelHigh school or technical school  0.291481   0.074251   3.926
## education_levelJunior high school               0.567126   0.070090   8.091
## education_levelPrimary school or below          1.099658   0.071823  15.311
## workWork                                       -0.316270   0.033337  -9.487
## number_kids_under18                            -0.001476   0.025923  -0.057
## RuralUrban                                     -0.229226   0.035024  -6.545
##                                                Pr(>|z|)    
## (Intercept)                                    2.53e-06 ***
## edu_homogamy                                     0.0180 *  
## birth_cohort1950-1978                            0.0522 .  
## birth_cohort1979-1989                           < 2e-16 ***
## birth_cohort1990-1998                           < 2e-16 ***
## education_levelCollege or above                  0.6642    
## education_levelHigh school or technical school 8.65e-05 ***
## education_levelJunior high school              5.90e-16 ***
## education_levelPrimary school or below          < 2e-16 ***
## workWork                                        < 2e-16 ***
## number_kids_under18                              0.9546    
## RuralUrban                                     5.95e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 27327  on 20984  degrees of freedom
## Residual deviance: 25393  on 20973  degrees of freedom
## AIC: 25417
## 
## Number of Fisher Scoring iterations: 4

conf_int <- confint(m1d) # Calculate the confidence intervals

## Waiting for profiling to be done...

tidy_m1d <- tidy(m1d, conf.int = TRUE) # Convert to data frame, note that this requires using the 'broom' package

tidy_m1d$signif <- symnum(tidy_m1d$p.value, corr = FALSE, na = FALSE,
                             cutpoints = c(0, 0.001, 0.01, 0.05, 0.1, 1),
                             symbols = c("***", "**", "*", ".", " ")) # Create a new column for significance levels


# Model_2_d


m2d <- glm(dv_health ~ edu_homogamy + edu_hypogamy + birth_cohort + education_level + work + number_kids_under18 + Rural , family = binomial, data = data_female)



conf_int <- confint(m2d) # Calculate the confidence intervals

## Waiting for profiling to be done...

tidy_m2d <- tidy(m2d, conf.int = TRUE) # Convert to data frame, note that this requires using the 'broom' package

tidy_m2d$signif <- symnum(tidy_m2d$p.value, corr = FALSE, na = FALSE,
                             cutpoints = c(0, 0.001, 0.01, 0.05, 0.1, 1),
                             symbols = c("***", "**", "*", ".", " ")) # Create a new column for significance levels



## 5. Logistics Model 1 to Model 4 

## get the table of outcome

library(officer)
library(flextable)
library(stargazer)


stargazer(m1c, m2c, m1d, m2d, type = "text",
          model.names = TRUE, out = "models_table.tex",
          star.cutoffs = c(0.05, 0.01, 0.001))

## 
## ==============================================================================================
##                                                              Dependent variable:              
##                                                -----------------------------------------------
##                                                                   dv_health                   
##                                                                   logistic                    
##                                                    (1)         (2)         (3)         (4)    
## ----------------------------------------------------------------------------------------------
## edu_homogamy                                    -0.108***   -0.172***    0.073*     0.148***  
##                                                  (0.032)     (0.035)     (0.031)     (0.032)  
##                                                                                               
## edu_hypogamy                                                -0.320***               0.465***  
##                                                              (0.063)                 (0.065)  
##                                                                                               
## birth_cohort1950-1978                             0.007       0.024      -0.118      -0.115   
##                                                  (0.049)     (0.049)     (0.061)     (0.061)  
##                                                                                               
## birth_cohort1979-1989                           -0.693***   -0.654***   -0.830***   -0.817*** 
##                                                  (0.066)     (0.067)     (0.072)     (0.072)  
##                                                                                               
## birth_cohort1990-1998                           -1.215***   -1.164***   -1.297***   -1.291*** 
##                                                  (0.135)     (0.135)     (0.110)     (0.110)  
##                                                                                               
## education_levelCollege or above                  -0.173*     -0.189*      0.040       0.035   
##                                                  (0.084)     (0.084)     (0.093)     (0.093)  
##                                                                                               
## education_levelHigh school or technical school   0.194**     0.198**    0.291***    0.314***  
##                                                  (0.067)     (0.067)     (0.074)     (0.075)  
##                                                                                               
## education_levelJunior high school               0.499***    0.520***    0.567***    0.636***  
##                                                  (0.064)     (0.064)     (0.070)     (0.071)  
##                                                                                               
## education_levelPrimary school or below          0.866***    0.922***    1.100***    1.208***  
##                                                  (0.070)     (0.071)     (0.072)     (0.074)  
##                                                                                               
## workWork                                        -0.574***   -0.573***   -0.316***   -0.322*** 
##                                                  (0.040)     (0.040)     (0.033)     (0.033)  
##                                                                                               
## number_kids_under18                               0.025       0.022      -0.001      -0.003   
##                                                  (0.025)     (0.025)     (0.026)     (0.026)  
##                                                                                               
## RuralUrban                                      -0.269***   -0.243***   -0.229***   -0.218*** 
##                                                  (0.033)     (0.034)     (0.035)     (0.035)  
##                                                                                               
## Constant                                        0.703***    0.704***    0.451***     0.298**  
##                                                  (0.082)     (0.082)     (0.096)     (0.098)  
##                                                                                               
## ----------------------------------------------------------------------------------------------
## Observations                                     20,985      20,985      20,985      20,985   
## Log Likelihood                                 -13,277.300 -13,264.550 -12,696.590 -12,670.250
## Akaike Inf. Crit.                              26,578.590  26,555.100  25,417.180  25,366.500 
## ==============================================================================================
## Note:                                                            *p<0.05; **p<0.01; ***p<0.001

library(broom)
library(dplyr)
library(ggplot2)

# get the coefficients
tidy_m1c <- tidy(m1c, conf.int = TRUE) %>% mutate(model = "Model_1C")
tidy_m2c <- tidy(m2c, conf.int = TRUE) %>% mutate(model = "Model_2C")
tidy_m1d <- tidy(m1d, conf.int = TRUE) %>% mutate(model = "Model_1D")
tidy_m2d <- tidy(m2d, conf.int = TRUE) %>% mutate(model = "Model_2D")

# merge
coefficients_combined2<- bind_rows(tidy_m1c, tidy_m2c, tidy_m1d, tidy_m2d)

coefficients_combined2

# generate plot
ggplot(coefficients_combined2, aes(x = term, y = estimate, ymin = conf.low, ymax = conf.high, color = model)) +
  geom_pointrange() +  
  geom_hline(yintercept = 0, linetype = "dashed") +  
  facet_wrap(~model, scales = "free_x") +  
  coord_flip() +  
  labs(x = "Term", y = "Coefficient Estimate", title = "The results for Model 1 and 2, by gender and cohort") +
  theme_minimal() + 
  theme(legend.position = "bottom")

Calculate the odds of results

probability_change_ma1 <- (exp(0.08) - 1) * 100

probability_change_ma2_homogamy <- (exp(0.15) - 1) * 100

probability_change_ma2_hypogamy <- (exp(0.27) - 1) * 100



probability_change_mb1 <- (1- exp(-0.06)) * 100

probability_change_mb2_homogamy <- (1 - exp(-0.11)) * 100

probability_change_mb2_hypogamy <- (1 - exp(-0.45) ) * 100

Term paper of Stats

Jianfeng He

2024-05-10

1. Load Data and clean up – CHFS 2019.

2. Descriptive Statistics – Educational Assortative Mating and Health Outcomes

3. Results of Logistics Models

4. Birth Cohort

Calculate the odds of results