This document performs propensity score matching to balance the treatment and control groups within a sample of insurance data.
# install.packages("MatchIt")
# install.packages("Matching")
library(MatchIt)
library(Matching)
library(data.table)
require(MatchIt)
require(Matching)
# Load the insurance sample as a plain data.frame and print its first five
# rows. read.csv() is used on purpose: the script later selects columns with
# list-style indexing (dt[...]), which behaves differently on a data.table.
dt <- read.csv(file = "Aim1.csv")
dt[seq_len(5), ]
## ID birth_year chronic_meds poverty_ratio sample_type sample_weight_1
## 1 11 1982 0 198 1 218371
## 2 16 1982 0 120 1 99849
## 3 18 1982 0 0 1 96678
## 4 21 1982 0 0 1 99849
## 5 28 1983 0 82 1 110561
## mother_education daily_cigs daily_alcohol weekly_exercise
## 1 14 20 3 7
## 2 11 0 0 3
## 3 15 0 0 3
## 4 13 30 2 2
## 5 12 0 0 0
## sample_weight_15 insurance_11 routine_checkup years_insured insurance
## 1 354954 1 0 7 1
## 2 175920 0 1 7 1
## 3 0 0 0 5 0
## 4 175920 1 0 5 0
## 5 157334 1 1 8 1
## female disability NE_So_region great_health mother_college HS_grad
## 1 1 0 1 1 1 1
## 2 0 0 1 0 0 1
## 3 0 0 1 0 1 0
## 4 0 0 1 0 1 0
## 5 1 0 1 1 0 1
## low_income age_97 age_11 race_white daily_fruit daily_veg high_exercise
## 1 1 15 29 0 0 0 1
## 2 1 15 29 0 0 0 1
## 3 1 15 29 0 0 1 1
## 4 1 15 29 0 0 0 0
## 5 1 14 28 0 0 0 0
## smoker low_risk_alcohol BMI_97 BMI_11 overweight_97 obese_97
## 1 1 0 20.59570 30.20703 0 0
## 2 0 1 20.08571 21.83876 0 0
## 3 0 1 19.73223 20.52444 0 0
## 4 1 1 20.35865 25.10216 0 0
## 5 0 1 20.78205 29.53161 0 0
## overweight_11 obese_11
## 1 0 1
## 2 0 0
## 3 0 0
## 4 1 0
## 5 1 0
# List every column name of the loaded data with its position.
colnames(dt)
## [1] "ID" "birth_year" "chronic_meds"
## [4] "poverty_ratio" "sample_type" "sample_weight_1"
## [7] "mother_education" "daily_cigs" "daily_alcohol"
## [10] "weekly_exercise" "sample_weight_15" "insurance_11"
## [13] "routine_checkup" "years_insured" "insurance"
## [16] "female" "disability" "NE_So_region"
## [19] "great_health" "mother_college" "HS_grad"
## [22] "low_income" "age_97" "age_11"
## [25] "race_white" "daily_fruit" "daily_veg"
## [28] "high_exercise" "smoker" "low_risk_alcohol"
## [31] "BMI_97" "BMI_11" "overweight_97"
## [34] "obese_97" "overweight_11" "obese_11"
# NOTE(review): attach() places every column of dt on the search path. Nothing
# in this block needs it (matchit() is given data = dt2), but it is kept in
# case statements outside this view refer to bare column names -- confirm and
# then remove.
attach(dt)

# Treatment indicator (insurance) plus the covariates of the propensity model.
# Columns are selected by name rather than by position (they are columns 15:22
# and 25:30 of dt), so a change in the CSV column order raises an error instead
# of silently feeding the wrong variables into the model.
dt2 <- dt[c(
  "insurance", "female", "disability", "NE_So_region", "great_health",
  "mother_college", "HS_grad", "low_income", "race_white", "daily_fruit",
  "daily_veg", "high_exercise", "smoker", "low_risk_alcohol"
)]

# Subclassify the sample on the estimated distance measure for `insurance`.
m.out <- matchit(
  insurance ~ great_health + mother_college + HS_grad + low_income +
    disability + female + NE_So_region + high_exercise + smoker +
    race_white + daily_veg + daily_fruit + low_risk_alcohol,
  data = dt2,
  method = "subclass"
)
# Disabled extra matchit() argument kept from the original: , ratio=1

# Balance tables: all data, each subclass, and across subclasses.
summary(m.out)
##
## Call:
## matchit(formula = insurance ~ great_health + mother_college +
## HS_grad + low_income + disability + female + NE_So_region +
## high_exercise + smoker + race_white + daily_veg + daily_fruit +
## low_risk_alcohol, data = dt2, method = "subclass", sub.by = "treat")
## Summary of balance for all data:
## Means Treated Means Control Mean Diff eQQ Med eQQ Mean
## distance 0.5779 0.4370 0.1409 0.1578 0.1407
## great_health 0.7912 0.7333 0.0579 0.0000 0.0573
## mother_college 0.5474 0.3718 0.1757 0.0000 0.1753
## HS_grad 0.8347 0.6323 0.2023 0.0000 0.2019
## low_income 0.3715 0.6330 -0.2615 0.0000 0.2619
## disability 0.1719 0.1869 -0.0150 0.0000 0.0150
## female 0.5876 0.4604 0.1272 0.0000 0.1269
## NE_So_region 0.4848 0.5484 -0.0636 0.0000 0.0641
## high_exercise 0.4789 0.4379 0.0410 0.0000 0.0409
## smoker 0.2958 0.4393 -0.1435 0.0000 0.1439
## race_white 0.6765 0.5164 0.1602 0.0000 0.1596
## daily_veg 0.3630 0.3322 0.0308 0.0000 0.0307
## daily_fruit 0.2424 0.2087 0.0337 0.0000 0.0334
## low_risk_alcohol 0.5105 0.5791 -0.0686 0.0000 0.0689
## eQQ Max
## distance 0.1796
## great_health 1.0000
## mother_college 1.0000
## HS_grad 1.0000
## low_income 1.0000
## disability 1.0000
## female 1.0000
## NE_So_region 1.0000
## high_exercise 1.0000
## smoker 1.0000
## race_white 1.0000
## daily_veg 1.0000
## daily_fruit 1.0000
## low_risk_alcohol 1.0000
##
##
## Summary of balance by subclasses:
## , , Subclass 1
##
## Means Treated Means Control Mean Diff eQQ Med eQQ Mean
## distance 0.3009 0.2629 0.0380 0.0408 0.0384
## great_health 0.6640 0.6869 -0.0229 0.0000 0.0198
## mother_college 0.1858 0.1981 -0.0123 0.0000 0.0119
## HS_grad 0.4308 0.2875 0.1433 0.0000 0.1423
## low_income 0.9170 0.8962 0.0208 0.0000 0.0237
## disability 0.2411 0.1901 0.0510 0.0000 0.0514
## female 0.4229 0.2827 0.1402 0.0000 0.1423
## NE_So_region 0.5889 0.6166 -0.0277 0.0000 0.0277
## high_exercise 0.4071 0.4169 -0.0098 0.0000 0.0079
## smoker 0.5257 0.6278 -0.1021 0.0000 0.1028
## race_white 0.3478 0.3610 -0.0132 0.0000 0.0119
## daily_veg 0.3281 0.2891 0.0389 0.0000 0.0395
## daily_fruit 0.1858 0.1789 0.0069 0.0000 0.0079
## low_risk_alcohol 0.6403 0.6406 -0.0003 0.0000 0.0000
## eQQ Max
## distance 0.0663
## great_health 1.0000
## mother_college 1.0000
## HS_grad 1.0000
## low_income 1.0000
## disability 1.0000
## female 1.0000
## NE_So_region 1.0000
## high_exercise 1.0000
## smoker 1.0000
## race_white 1.0000
## daily_veg 1.0000
## daily_fruit 1.0000
## low_risk_alcohol 0.0000
##
## , , Subclass 2
##
## Means Treated Means Control Mean Diff eQQ Med eQQ Mean
## distance 0.4592 0.4510 0.0082 0.0085 0.0084
## great_health 0.7154 0.6959 0.0195 0.0000 0.0198
## mother_college 0.2411 0.2978 -0.0567 0.0000 0.0553
## HS_grad 0.8261 0.8150 0.0110 0.0000 0.0119
## low_income 0.7233 0.7398 -0.0165 0.0000 0.0158
## disability 0.1462 0.2069 -0.0607 0.0000 0.0593
## female 0.5810 0.5204 0.0607 0.0000 0.0632
## NE_So_region 0.5375 0.5611 -0.0236 0.0000 0.0237
## high_exercise 0.3597 0.4389 -0.0792 0.0000 0.0791
## smoker 0.3360 0.3417 -0.0057 0.0000 0.0040
## race_white 0.4704 0.4796 -0.0093 0.0000 0.0079
## daily_veg 0.3439 0.3354 0.0085 0.0000 0.0079
## daily_fruit 0.2134 0.2100 0.0034 0.0000 0.0040
## low_risk_alcohol 0.6126 0.6019 0.0108 0.0000 0.0119
## eQQ Max
## distance 0.0147
## great_health 1.0000
## mother_college 1.0000
## HS_grad 1.0000
## low_income 1.0000
## disability 1.0000
## female 1.0000
## NE_So_region 1.0000
## high_exercise 1.0000
## smoker 1.0000
## race_white 1.0000
## daily_veg 1.0000
## daily_fruit 1.0000
## low_risk_alcohol 1.0000
##
## , , Subclass 3
##
## Means Treated Means Control Mean Diff eQQ Med eQQ Mean
## distance 0.5537 0.5496 0.0042 0.0039 0.0045
## great_health 0.7945 0.7938 0.0007 0.0000 0.0000
## mother_college 0.6838 0.6134 0.0704 0.0000 0.0670
## HS_grad 0.8577 0.8660 -0.0083 0.0000 0.0103
## low_income 0.3913 0.4639 -0.0726 0.0000 0.0722
## disability 0.1739 0.1134 0.0605 0.0000 0.0567
## female 0.4308 0.5103 -0.0795 0.0000 0.0825
## NE_So_region 0.4783 0.4639 0.0143 0.0000 0.0103
## high_exercise 0.4506 0.4639 -0.0133 0.0000 0.0155
## smoker 0.4071 0.3608 0.0463 0.0000 0.0464
## race_white 0.6680 0.6031 0.0649 0.0000 0.0619
## daily_veg 0.3162 0.3608 -0.0446 0.0000 0.0464
## daily_fruit 0.2095 0.1907 0.0188 0.0000 0.0155
## low_risk_alcohol 0.5099 0.5361 -0.0262 0.0000 0.0258
## eQQ Max
## distance 0.0125
## great_health 0.0000
## mother_college 1.0000
## HS_grad 1.0000
## low_income 1.0000
## disability 1.0000
## female 1.0000
## NE_So_region 1.0000
## high_exercise 1.0000
## smoker 1.0000
## race_white 1.0000
## daily_veg 1.0000
## daily_fruit 1.0000
## low_risk_alcohol 1.0000
##
## , , Subclass 4
##
## Means Treated Means Control Mean Diff eQQ Med eQQ Mean
## distance 0.6435 0.6400 0.0036 0.0023 0.0035
## great_health 0.7960 0.7791 0.0169 0.0000 0.0123
## mother_college 0.5240 0.5153 0.0087 0.0000 0.0061
## HS_grad 0.9200 0.9509 -0.0309 0.0000 0.0368
## low_income 0.1600 0.2086 -0.0486 0.0000 0.0491
## disability 0.2200 0.2331 -0.0131 0.0000 0.0123
## female 0.6000 0.6810 -0.0810 0.0000 0.0859
## NE_So_region 0.5160 0.4969 0.0191 0.0000 0.0184
## high_exercise 0.5000 0.4110 0.0890 0.0000 0.0859
## smoker 0.2640 0.3067 -0.0427 0.0000 0.0429
## race_white 0.7520 0.7178 0.0342 0.0000 0.0307
## daily_veg 0.3680 0.3804 -0.0124 0.0000 0.0123
## daily_fruit 0.2280 0.2577 -0.0297 0.0000 0.0307
## low_risk_alcohol 0.5480 0.4908 0.0572 0.0000 0.0552
## eQQ Max
## distance 0.0112
## great_health 1.0000
## mother_college 1.0000
## HS_grad 1.0000
## low_income 1.0000
## disability 1.0000
## female 1.0000
## NE_So_region 1.0000
## high_exercise 1.0000
## smoker 1.0000
## race_white 1.0000
## daily_veg 1.0000
## daily_fruit 1.0000
## low_risk_alcohol 1.0000
##
## , , Subclass 5
##
## Means Treated Means Control Mean Diff eQQ Med eQQ Mean
## distance 0.7106 0.7087 0.0019 0.0014 0.0020
## great_health 0.8789 0.8627 0.0162 0.0000 0.0098
## mother_college 0.8281 0.7451 0.0830 0.0000 0.0784
## HS_grad 0.9727 1.0000 -0.0273 0.0000 0.0294
## low_income 0.0391 0.0686 -0.0296 0.0000 0.0294
## disability 0.1484 0.1961 -0.0476 0.0000 0.0490
## female 0.5781 0.6275 -0.0493 0.0000 0.0490
## NE_So_region 0.4375 0.4804 -0.0429 0.0000 0.0490
## high_exercise 0.5547 0.4902 0.0645 0.0000 0.0588
## smoker 0.2422 0.2157 0.0265 0.0000 0.0294
## race_white 0.8516 0.8529 -0.0014 0.0000 0.0098
## daily_veg 0.3750 0.3922 -0.0172 0.0000 0.0196
## daily_fruit 0.2734 0.2647 0.0087 0.0000 0.0098
## low_risk_alcohol 0.4297 0.4706 -0.0409 0.0000 0.0392
## eQQ Max
## distance 0.0093
## great_health 1.0000
## mother_college 1.0000
## HS_grad 1.0000
## low_income 1.0000
## disability 1.0000
## female 1.0000
## NE_So_region 1.0000
## high_exercise 1.0000
## smoker 1.0000
## race_white 1.0000
## daily_veg 1.0000
## daily_fruit 1.0000
## low_risk_alcohol 1.0000
##
## , , Subclass 6
##
## Means Treated Means Control Mean Diff eQQ Med eQQ Mean
## distance 0.7989 0.7915 0.0074 0.0075 0.0075
## great_health 0.8972 0.8710 0.0263 0.0000 0.0161
## mother_college 0.8182 0.7581 0.0601 0.0000 0.0484
## HS_grad 1.0000 1.0000 0.0000 0.0000 0.0000
## low_income 0.0000 0.0000 0.0000 0.0000 0.0000
## disability 0.1028 0.1452 -0.0424 0.0000 0.0323
## female 0.9130 0.9355 -0.0224 0.0000 0.0323
## NE_So_region 0.3518 0.3065 0.0453 0.0000 0.0484
## high_exercise 0.6008 0.5484 0.0524 0.0000 0.0484
## smoker 0.0000 0.0000 0.0000 0.0000 0.0000
## race_white 0.9684 0.9194 0.0490 0.0000 0.0484
## daily_veg 0.4466 0.4355 0.0112 0.0000 0.0161
## daily_fruit 0.3439 0.3387 0.0052 0.0000 0.0000
## low_risk_alcohol 0.3241 0.3871 -0.0630 0.0000 0.0645
## eQQ Max
## distance 0.0172
## great_health 1.0000
## mother_college 1.0000
## HS_grad 0.0000
## low_income 0.0000
## disability 1.0000
## female 1.0000
## NE_So_region 1.0000
## high_exercise 1.0000
## smoker 0.0000
## race_white 1.0000
## daily_veg 1.0000
## daily_fruit 0.0000
## low_risk_alcohol 1.0000
##
##
## Sample sizes by subclasses:
## Subclass 1 Subclass 2 Subclass 3 Subclass 4 Subclass 5 Subclass 6
## Treated 253 253 253 250 256 253
## Control 626 319 194 163 102 62
## Total 879 572 447 413 358 315
##
## Summary of balance across subclasses
## Means Treated Means Control Mean Diff eQQ Med eQQ Mean
## distance 0.5779 0.5674 0.0067 0.0108 0.0107
## great_health 0.7912 0.7817 0.0077 0.0000 0.0129
## mother_college 0.5474 0.5218 0.0230 0.0000 0.0447
## HS_grad 0.8347 0.8200 0.0250 0.0000 0.0384
## low_income 0.3715 0.3959 0.0160 0.0000 0.0317
## disability 0.1719 0.1807 0.0199 0.0000 0.0436
## female 0.5876 0.5928 0.0329 0.0000 0.0758
## NE_So_region 0.4848 0.4875 0.0127 0.0000 0.0296
## high_exercise 0.4789 0.4617 0.0243 0.0000 0.0492
## smoker 0.2958 0.3086 0.0205 0.0000 0.0376
## race_white 0.6765 0.6559 0.0149 0.0000 0.0284
## daily_veg 0.3630 0.3656 0.0107 0.0000 0.0237
## daily_fruit 0.2424 0.2401 0.0062 0.0000 0.0113
## low_risk_alcohol 0.5105 0.5211 0.0164 0.0000 0.0327
## eQQ Max
## distance 0.0219
## great_health 0.8333
## mother_college 1.0000
## HS_grad 0.8333
## low_income 0.8333
## disability 1.0000
## female 1.0000
## NE_So_region 1.0000
## high_exercise 1.0000
## smoker 0.8333
## race_white 1.0000
## daily_veg 1.0000
## daily_fruit 0.8333
## low_risk_alcohol 0.8333
##
## Percent Balance Improvement:
## Mean Diff. eQQ Med eQQ Mean eQQ Max
## distance 92.5223 93.1869 92.4010 87.8202
## great_health 83.7176 0.0000 77.3994 16.6667
## mother_college 85.3827 0.0000 74.5195 0.0000
## HS_grad 92.7641 0.0000 80.9657 16.6667
## low_income 90.6808 0.0000 87.9138 16.6667
## disability 41.3094 0.0000 -190.2639 0.0000
## female 95.9345 0.0000 40.2647 0.0000
## NE_So_region 95.7670 0.0000 53.7671 0.0000
## high_exercise 58.0292 0.0000 -20.2119 0.0000
## smoker 91.0615 0.0000 73.9099 16.6667
## race_white 87.1122 0.0000 82.2246 0.0000
## daily_veg 91.5163 0.0000 22.9446 0.0000
## daily_fruit 93.2177 0.0000 66.3155 16.6667
## low_risk_alcohol 84.5596 0.0000 52.4910 16.6667
# Pull the processed data set out of the matchit fit, save it next to the
# script (row names are written as the first, unnamed column), and draw the
# jitter plot of the distance measure.
matchedIns <- match.data(object = m.out)
write.csv(x = matchedIns, file = "MatchedData.csv", row.names = TRUE)
plot(x = m.out, type = "jitter")
## [1] "To identify the units, use first mouse button; to stop, use second."
## integer(0)
# Histogram view of the same matchit fit.
plot(x = m.out, type = "hist")
##
## Welch Two Sample t-test
##
## data: insurance by great_health
## t = -3.7301, df = 1183.2, p-value = 0.0002005
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.1219875 -0.0378926
## sample estimates:
## mean in group 0 mean in group 1
## 0.4477401 0.5276801
##
## Welch Two Sample t-test
##
## data: insurance by mother_college
## t = -9.7821, df = 2920.8, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.2121013 -0.1412699
## sample estimates:
## mean in group 0 mean in group 1
## 0.4272388 0.6039244
##
## Welch Two Sample t-test
##
## data: insurance by HS_grad
## t = -13.221, df = 1470.6, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.2983018 -0.2212233
## sample estimates:
## mean in group 0 mean in group 1
## 0.3177215 0.5774840
##
## Welch Two Sample t-test
##
## data: insurance by low_income
## t = 14.791, df = 2981.7, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 0.2267418 0.2960464
## sample estimates:
## mean in group 0 mean in group 1
## 0.6394102 0.3780161