Synopsis

This document uses propensity score matching (subclassification on the estimated propensity score) to balance the treatment and control groups, defined by the `insurance` indicator, within a sample of insurance data.

Required R language packages for Matched Sampling:

# install.packages("MatchIt")
# install.packages("Matching")
library(MatchIt)
library(Matching)
library(data.table)
require(MatchIt)
require(Matching)

Import of the insurance data, and a preview of its first five rows:

# Load the raw insurance sample from the working directory.
dt <- read.csv("Aim1.csv")
# Preview the first rows. head() returns at most five rows, whereas dt[1:5, ]
# pads the result with all-NA rows when the file has fewer than five records.
head(dt, n = 5L)
##   ID birth_year chronic_meds poverty_ratio sample_type sample_weight_1
## 1 11       1982            0           198           1          218371
## 2 16       1982            0           120           1           99849
## 3 18       1982            0             0           1           96678
## 4 21       1982            0             0           1           99849
## 5 28       1983            0            82           1          110561
##   mother_education daily_cigs daily_alcohol weekly_exercise
## 1               14         20             3               7
## 2               11          0             0               3
## 3               15          0             0               3
## 4               13         30             2               2
## 5               12          0             0               0
##   sample_weight_15 insurance_11 routine_checkup years_insured insurance
## 1           354954            1               0             7         1
## 2           175920            0               1             7         1
## 3                0            0               0             5         0
## 4           175920            1               0             5         0
## 5           157334            1               1             8         1
##   female disability NE_So_region great_health mother_college HS_grad
## 1      1          0            1            1              1       1
## 2      0          0            1            0              0       1
## 3      0          0            1            0              1       0
## 4      0          0            1            0              1       0
## 5      1          0            1            1              0       1
##   low_income age_97 age_11 race_white daily_fruit daily_veg high_exercise
## 1          1     15     29          0           0         0             1
## 2          1     15     29          0           0         0             1
## 3          1     15     29          0           0         1             1
## 4          1     15     29          0           0         0             0
## 5          1     14     28          0           0         0             0
##   smoker low_risk_alcohol   BMI_97   BMI_11 overweight_97 obese_97
## 1      1                0 20.59570 30.20703             0        0
## 2      0                1 20.08571 21.83876             0        0
## 3      0                1 19.73223 20.52444             0        0
## 4      1                1 20.35865 25.10216             0        0
## 5      0                1 20.78205 29.53161             0        0
##   overweight_11 obese_11
## 1             0        1
## 2             0        0
## 3             0        0
## 4             1        0
## 5             1        0

Insurance Data variables:

# List every column available in the imported data frame.
colnames(dt)
##  [1] "ID"               "birth_year"       "chronic_meds"    
##  [4] "poverty_ratio"    "sample_type"      "sample_weight_1" 
##  [7] "mother_education" "daily_cigs"       "daily_alcohol"   
## [10] "weekly_exercise"  "sample_weight_15" "insurance_11"    
## [13] "routine_checkup"  "years_insured"    "insurance"       
## [16] "female"           "disability"       "NE_So_region"    
## [19] "great_health"     "mother_college"   "HS_grad"         
## [22] "low_income"       "age_97"           "age_11"          
## [25] "race_white"       "daily_fruit"      "daily_veg"       
## [28] "high_exercise"    "smoker"           "low_risk_alcohol"
## [31] "BMI_97"           "BMI_11"           "overweight_97"   
## [34] "obese_97"         "overweight_11"    "obese_11"
# Put dt's columns on the search path so they can be referenced by bare name.
# NOTE(review): no code visible in this document relies on this (matchit() is
# given data = dt2 explicitly). Confirm whether chunks whose code is not shown,
# such as the t-tests, need it before removing. attach() can silently mask, or
# be masked by, other objects that share a column's name.
attach(dt)

Propensity Score Matching of Insurance Data (Treatment & Control Groups):

# Keep the treatment indicator plus the 13 covariates used in the propensity
# model. Columns are selected by name (same columns, same order as the former
# positional selection dt[15:22], dt[25:30]) so that a change in the CSV layout
# fails loudly instead of silently picking the wrong variables.
model_vars <- c(
  "insurance", "female", "disability", "NE_So_region", "great_health",
  "mother_college", "HS_grad", "low_income", "race_white", "daily_fruit",
  "daily_veg", "high_exercise", "smoker", "low_risk_alcohol"
)
missing_vars <- setdiff(model_vars, names(dt))
if (length(missing_vars) > 0) {
  stop(
    "Columns missing from dt: ", paste(missing_vars, collapse = ", "),
    call. = FALSE
  )
}
dt2 <- dt[model_vars]

# Model `insurance` on the covariates and group units with method = "subclass".
m.out <- matchit(
  insurance ~ great_health + mother_college + HS_grad + low_income +
    disability + female + NE_So_region + high_exercise + smoker +
    race_white + daily_veg + daily_fruit + low_risk_alcohol,
  data = dt2,
  method = "subclass"
)
# Disabled option kept from the original call: , ratio=1

# Balance tables: all data, each subclass, and aggregated across subclasses.
summary(m.out)
## 
## Call:
## matchit(formula = insurance ~ great_health + mother_college + 
##     HS_grad + low_income + disability + female + NE_So_region + 
##     high_exercise + smoker + race_white + daily_veg + daily_fruit + 
##     low_risk_alcohol, data = dt2, method = "subclass", sub.by = "treat")
## Summary of balance for all data:
##                  Means Treated Means Control Mean Diff eQQ Med eQQ Mean
## distance                0.5779        0.4370    0.1409  0.1578   0.1407
## great_health            0.7912        0.7333    0.0579  0.0000   0.0573
## mother_college          0.5474        0.3718    0.1757  0.0000   0.1753
## HS_grad                 0.8347        0.6323    0.2023  0.0000   0.2019
## low_income              0.3715        0.6330   -0.2615  0.0000   0.2619
## disability              0.1719        0.1869   -0.0150  0.0000   0.0150
## female                  0.5876        0.4604    0.1272  0.0000   0.1269
## NE_So_region            0.4848        0.5484   -0.0636  0.0000   0.0641
## high_exercise           0.4789        0.4379    0.0410  0.0000   0.0409
## smoker                  0.2958        0.4393   -0.1435  0.0000   0.1439
## race_white              0.6765        0.5164    0.1602  0.0000   0.1596
## daily_veg               0.3630        0.3322    0.0308  0.0000   0.0307
## daily_fruit             0.2424        0.2087    0.0337  0.0000   0.0334
## low_risk_alcohol        0.5105        0.5791   -0.0686  0.0000   0.0689
##                  eQQ Max
## distance          0.1796
## great_health      1.0000
## mother_college    1.0000
## HS_grad           1.0000
## low_income        1.0000
## disability        1.0000
## female            1.0000
## NE_So_region      1.0000
## high_exercise     1.0000
## smoker            1.0000
## race_white        1.0000
## daily_veg         1.0000
## daily_fruit       1.0000
## low_risk_alcohol  1.0000
## 
## 
## Summary of balance by subclasses:
## , , Subclass 1
## 
##                  Means Treated Means Control Mean Diff eQQ Med eQQ Mean
## distance                0.3009        0.2629    0.0380  0.0408   0.0384
## great_health            0.6640        0.6869   -0.0229  0.0000   0.0198
## mother_college          0.1858        0.1981   -0.0123  0.0000   0.0119
## HS_grad                 0.4308        0.2875    0.1433  0.0000   0.1423
## low_income              0.9170        0.8962    0.0208  0.0000   0.0237
## disability              0.2411        0.1901    0.0510  0.0000   0.0514
## female                  0.4229        0.2827    0.1402  0.0000   0.1423
## NE_So_region            0.5889        0.6166   -0.0277  0.0000   0.0277
## high_exercise           0.4071        0.4169   -0.0098  0.0000   0.0079
## smoker                  0.5257        0.6278   -0.1021  0.0000   0.1028
## race_white              0.3478        0.3610   -0.0132  0.0000   0.0119
## daily_veg               0.3281        0.2891    0.0389  0.0000   0.0395
## daily_fruit             0.1858        0.1789    0.0069  0.0000   0.0079
## low_risk_alcohol        0.6403        0.6406   -0.0003  0.0000   0.0000
##                  eQQ Max
## distance          0.0663
## great_health      1.0000
## mother_college    1.0000
## HS_grad           1.0000
## low_income        1.0000
## disability        1.0000
## female            1.0000
## NE_So_region      1.0000
## high_exercise     1.0000
## smoker            1.0000
## race_white        1.0000
## daily_veg         1.0000
## daily_fruit       1.0000
## low_risk_alcohol  0.0000
## 
## , , Subclass 2
## 
##                  Means Treated Means Control Mean Diff eQQ Med eQQ Mean
## distance                0.4592        0.4510    0.0082  0.0085   0.0084
## great_health            0.7154        0.6959    0.0195  0.0000   0.0198
## mother_college          0.2411        0.2978   -0.0567  0.0000   0.0553
## HS_grad                 0.8261        0.8150    0.0110  0.0000   0.0119
## low_income              0.7233        0.7398   -0.0165  0.0000   0.0158
## disability              0.1462        0.2069   -0.0607  0.0000   0.0593
## female                  0.5810        0.5204    0.0607  0.0000   0.0632
## NE_So_region            0.5375        0.5611   -0.0236  0.0000   0.0237
## high_exercise           0.3597        0.4389   -0.0792  0.0000   0.0791
## smoker                  0.3360        0.3417   -0.0057  0.0000   0.0040
## race_white              0.4704        0.4796   -0.0093  0.0000   0.0079
## daily_veg               0.3439        0.3354    0.0085  0.0000   0.0079
## daily_fruit             0.2134        0.2100    0.0034  0.0000   0.0040
## low_risk_alcohol        0.6126        0.6019    0.0108  0.0000   0.0119
##                  eQQ Max
## distance          0.0147
## great_health      1.0000
## mother_college    1.0000
## HS_grad           1.0000
## low_income        1.0000
## disability        1.0000
## female            1.0000
## NE_So_region      1.0000
## high_exercise     1.0000
## smoker            1.0000
## race_white        1.0000
## daily_veg         1.0000
## daily_fruit       1.0000
## low_risk_alcohol  1.0000
## 
## , , Subclass 3
## 
##                  Means Treated Means Control Mean Diff eQQ Med eQQ Mean
## distance                0.5537        0.5496    0.0042  0.0039   0.0045
## great_health            0.7945        0.7938    0.0007  0.0000   0.0000
## mother_college          0.6838        0.6134    0.0704  0.0000   0.0670
## HS_grad                 0.8577        0.8660   -0.0083  0.0000   0.0103
## low_income              0.3913        0.4639   -0.0726  0.0000   0.0722
## disability              0.1739        0.1134    0.0605  0.0000   0.0567
## female                  0.4308        0.5103   -0.0795  0.0000   0.0825
## NE_So_region            0.4783        0.4639    0.0143  0.0000   0.0103
## high_exercise           0.4506        0.4639   -0.0133  0.0000   0.0155
## smoker                  0.4071        0.3608    0.0463  0.0000   0.0464
## race_white              0.6680        0.6031    0.0649  0.0000   0.0619
## daily_veg               0.3162        0.3608   -0.0446  0.0000   0.0464
## daily_fruit             0.2095        0.1907    0.0188  0.0000   0.0155
## low_risk_alcohol        0.5099        0.5361   -0.0262  0.0000   0.0258
##                  eQQ Max
## distance          0.0125
## great_health      0.0000
## mother_college    1.0000
## HS_grad           1.0000
## low_income        1.0000
## disability        1.0000
## female            1.0000
## NE_So_region      1.0000
## high_exercise     1.0000
## smoker            1.0000
## race_white        1.0000
## daily_veg         1.0000
## daily_fruit       1.0000
## low_risk_alcohol  1.0000
## 
## , , Subclass 4
## 
##                  Means Treated Means Control Mean Diff eQQ Med eQQ Mean
## distance                0.6435        0.6400    0.0036  0.0023   0.0035
## great_health            0.7960        0.7791    0.0169  0.0000   0.0123
## mother_college          0.5240        0.5153    0.0087  0.0000   0.0061
## HS_grad                 0.9200        0.9509   -0.0309  0.0000   0.0368
## low_income              0.1600        0.2086   -0.0486  0.0000   0.0491
## disability              0.2200        0.2331   -0.0131  0.0000   0.0123
## female                  0.6000        0.6810   -0.0810  0.0000   0.0859
## NE_So_region            0.5160        0.4969    0.0191  0.0000   0.0184
## high_exercise           0.5000        0.4110    0.0890  0.0000   0.0859
## smoker                  0.2640        0.3067   -0.0427  0.0000   0.0429
## race_white              0.7520        0.7178    0.0342  0.0000   0.0307
## daily_veg               0.3680        0.3804   -0.0124  0.0000   0.0123
## daily_fruit             0.2280        0.2577   -0.0297  0.0000   0.0307
## low_risk_alcohol        0.5480        0.4908    0.0572  0.0000   0.0552
##                  eQQ Max
## distance          0.0112
## great_health      1.0000
## mother_college    1.0000
## HS_grad           1.0000
## low_income        1.0000
## disability        1.0000
## female            1.0000
## NE_So_region      1.0000
## high_exercise     1.0000
## smoker            1.0000
## race_white        1.0000
## daily_veg         1.0000
## daily_fruit       1.0000
## low_risk_alcohol  1.0000
## 
## , , Subclass 5
## 
##                  Means Treated Means Control Mean Diff eQQ Med eQQ Mean
## distance                0.7106        0.7087    0.0019  0.0014   0.0020
## great_health            0.8789        0.8627    0.0162  0.0000   0.0098
## mother_college          0.8281        0.7451    0.0830  0.0000   0.0784
## HS_grad                 0.9727        1.0000   -0.0273  0.0000   0.0294
## low_income              0.0391        0.0686   -0.0296  0.0000   0.0294
## disability              0.1484        0.1961   -0.0476  0.0000   0.0490
## female                  0.5781        0.6275   -0.0493  0.0000   0.0490
## NE_So_region            0.4375        0.4804   -0.0429  0.0000   0.0490
## high_exercise           0.5547        0.4902    0.0645  0.0000   0.0588
## smoker                  0.2422        0.2157    0.0265  0.0000   0.0294
## race_white              0.8516        0.8529   -0.0014  0.0000   0.0098
## daily_veg               0.3750        0.3922   -0.0172  0.0000   0.0196
## daily_fruit             0.2734        0.2647    0.0087  0.0000   0.0098
## low_risk_alcohol        0.4297        0.4706   -0.0409  0.0000   0.0392
##                  eQQ Max
## distance          0.0093
## great_health      1.0000
## mother_college    1.0000
## HS_grad           1.0000
## low_income        1.0000
## disability        1.0000
## female            1.0000
## NE_So_region      1.0000
## high_exercise     1.0000
## smoker            1.0000
## race_white        1.0000
## daily_veg         1.0000
## daily_fruit       1.0000
## low_risk_alcohol  1.0000
## 
## , , Subclass 6
## 
##                  Means Treated Means Control Mean Diff eQQ Med eQQ Mean
## distance                0.7989        0.7915    0.0074  0.0075   0.0075
## great_health            0.8972        0.8710    0.0263  0.0000   0.0161
## mother_college          0.8182        0.7581    0.0601  0.0000   0.0484
## HS_grad                 1.0000        1.0000    0.0000  0.0000   0.0000
## low_income              0.0000        0.0000    0.0000  0.0000   0.0000
## disability              0.1028        0.1452   -0.0424  0.0000   0.0323
## female                  0.9130        0.9355   -0.0224  0.0000   0.0323
## NE_So_region            0.3518        0.3065    0.0453  0.0000   0.0484
## high_exercise           0.6008        0.5484    0.0524  0.0000   0.0484
## smoker                  0.0000        0.0000    0.0000  0.0000   0.0000
## race_white              0.9684        0.9194    0.0490  0.0000   0.0484
## daily_veg               0.4466        0.4355    0.0112  0.0000   0.0161
## daily_fruit             0.3439        0.3387    0.0052  0.0000   0.0000
## low_risk_alcohol        0.3241        0.3871   -0.0630  0.0000   0.0645
##                  eQQ Max
## distance          0.0172
## great_health      1.0000
## mother_college    1.0000
## HS_grad           0.0000
## low_income        0.0000
## disability        1.0000
## female            1.0000
## NE_So_region      1.0000
## high_exercise     1.0000
## smoker            0.0000
## race_white        1.0000
## daily_veg         1.0000
## daily_fruit       0.0000
## low_risk_alcohol  1.0000
## 
## 
## Sample sizes by subclasses:
##         Subclass 1 Subclass 2 Subclass 3 Subclass 4 Subclass 5 Subclass 6
## Treated        253        253        253        250        256        253
## Control        626        319        194        163        102         62
## Total          879        572        447        413        358        315
## 
## Summary of balance across subclasses
##                  Means Treated Means Control Mean Diff eQQ Med eQQ Mean
## distance                0.5779        0.5674    0.0067  0.0108   0.0107
## great_health            0.7912        0.7817    0.0077  0.0000   0.0129
## mother_college          0.5474        0.5218    0.0230  0.0000   0.0447
## HS_grad                 0.8347        0.8200    0.0250  0.0000   0.0384
## low_income              0.3715        0.3959    0.0160  0.0000   0.0317
## disability              0.1719        0.1807    0.0199  0.0000   0.0436
## female                  0.5876        0.5928    0.0329  0.0000   0.0758
## NE_So_region            0.4848        0.4875    0.0127  0.0000   0.0296
## high_exercise           0.4789        0.4617    0.0243  0.0000   0.0492
## smoker                  0.2958        0.3086    0.0205  0.0000   0.0376
## race_white              0.6765        0.6559    0.0149  0.0000   0.0284
## daily_veg               0.3630        0.3656    0.0107  0.0000   0.0237
## daily_fruit             0.2424        0.2401    0.0062  0.0000   0.0113
## low_risk_alcohol        0.5105        0.5211    0.0164  0.0000   0.0327
##                  eQQ Max
## distance          0.0219
## great_health      0.8333
## mother_college    1.0000
## HS_grad           0.8333
## low_income        0.8333
## disability        1.0000
## female            1.0000
## NE_So_region      1.0000
## high_exercise     1.0000
## smoker            0.8333
## race_white        1.0000
## daily_veg         1.0000
## daily_fruit       0.8333
## low_risk_alcohol  0.8333
## 
## Percent Balance Improvement:
##                  Mean Diff. eQQ Med  eQQ Mean eQQ Max
## distance            92.5223 93.1869   92.4010 87.8202
## great_health        83.7176  0.0000   77.3994 16.6667
## mother_college      85.3827  0.0000   74.5195  0.0000
## HS_grad             92.7641  0.0000   80.9657 16.6667
## low_income          90.6808  0.0000   87.9138 16.6667
## disability          41.3094  0.0000 -190.2639  0.0000
## female              95.9345  0.0000   40.2647  0.0000
## NE_So_region        95.7670  0.0000   53.7671  0.0000
## high_exercise       58.0292  0.0000  -20.2119  0.0000
## smoker              91.0615  0.0000   73.9099 16.6667
## race_white          87.1122  0.0000   82.2246  0.0000
## daily_veg           91.5163  0.0000   22.9446  0.0000
## daily_fruit         93.2177  0.0000   66.3155 16.6667
## low_risk_alcohol    84.5596  0.0000   52.4910 16.6667

Create output data table:

# Pull the matched sample out of the matchit object and save it as CSV.
# Row names are written too (write.csv default).
matchedIns <- MatchIt::match.data(m.out)
utils::write.csv(x = matchedIns, file = "MatchedData.csv")

Jitter Plot of Propensity Score Matching:

# Jitter plot of the propensity scores. interactive = FALSE skips the
# click-to-identify step, which cannot be used in a rendered report and
# otherwise only emits a mouse-button prompt and an empty integer(0) result.
plot(m.out, type = "jitter", interactive = FALSE)

Histogram of Propensity Score Matching:

# Histogram view of the matchit fit.
plot(x = m.out, type = "hist")

T-tests of Predictor Group Balance (mean `insurance` compared between the two levels of each predictor):

## 
##  Welch Two Sample t-test
## 
## data:  insurance by great_health
## t = -3.7301, df = 1183.2, p-value = 0.0002005
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.1219875 -0.0378926
## sample estimates:
## mean in group 0 mean in group 1 
##       0.4477401       0.5276801
## 
##  Welch Two Sample t-test
## 
## data:  insurance by mother_college
## t = -9.7821, df = 2920.8, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.2121013 -0.1412699
## sample estimates:
## mean in group 0 mean in group 1 
##       0.4272388       0.6039244
## 
##  Welch Two Sample t-test
## 
## data:  insurance by HS_grad
## t = -13.221, df = 1470.6, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.2983018 -0.2212233
## sample estimates:
## mean in group 0 mean in group 1 
##       0.3177215       0.5774840
## 
##  Welch Two Sample t-test
## 
## data:  insurance by low_income
## t = 14.791, df = 2981.7, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  0.2267418 0.2960464
## sample estimates:
## mean in group 0 mean in group 1 
##       0.6394102       0.3780161