Predicting Medical Expenses

Exploring and preparing the data

insurance <- read.csv("insurance.csv", stringsAsFactors = TRUE)
insurance

Summarize the charges variable

summary(insurance$expenses)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
   1122    4740    9382   13270   16640   63770 

histogram of insurace charges

hist(insurance$expenses)

Table of region

table(insurance$region)

northeast northwest southeast southwest 
      324       325       364       325 

exploring relationships among features: correlation matrix

cor(insurance[c("age", "bmi", "children", "expenses")])
               age        bmi   children   expenses
age      1.0000000 0.10934101 0.04246900 0.29900819
bmi      0.1093410 1.00000000 0.01264471 0.19857626
children 0.0424690 0.01264471 1.00000000 0.06799823
expenses 0.2990082 0.19857626 0.06799823 1.00000000

visualizing the relationships

pairs(insurance[c("age", "bmi", "children", "expenses")])

Step 3: Training a model on the data

ins_model <- lm(expenses ~ age + children + bmi + sex + smoker + region, 
                data = insurance)
ins_model 

Call:
lm(formula = expenses ~ age + children + bmi + sex + smoker + 
    region, data = insurance)

Coefficients:
    (Intercept)              age         children              bmi  
       -11941.6            256.8            475.7            339.3  
        sexmale        smokeryes  regionnorthwest  regionsoutheast  
         -131.4          23847.5           -352.8          -1035.6  
regionsouthwest  
         -959.3  

Step 4:Evaluating model performance

summary(ins_model)

Call:
lm(formula = expenses ~ age + children + bmi + sex + smoker + 
    region, data = insurance)

Residuals:
     Min       1Q   Median       3Q      Max 
-11302.7  -2850.9   -979.6   1383.9  29981.7 

Coefficients:
                Estimate Std. Error t value Pr(>|t|)    
(Intercept)     -11941.6      987.8 -12.089  < 2e-16 ***
age                256.8       11.9  21.586  < 2e-16 ***
children           475.7      137.8   3.452 0.000574 ***
bmi                339.3       28.6  11.864  < 2e-16 ***
sexmale           -131.3      332.9  -0.395 0.693255    
smokeryes        23847.5      413.1  57.723  < 2e-16 ***
regionnorthwest   -352.8      476.3  -0.741 0.458976    
regionsoutheast  -1035.6      478.7  -2.163 0.030685 *  
regionsouthwest   -959.3      477.9  -2.007 0.044921 *  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 6062 on 1329 degrees of freedom
Multiple R-squared:  0.7509,    Adjusted R-squared:  0.7494 
F-statistic: 500.9 on 8 and 1329 DF,  p-value: < 2.2e-16

Step 5: improving model performance

adding a higher order age term

insurance$age2 <- insurance$age^2 

add an indicator for BMI >= 30

insurance$bmi30 <- ifelse(insurance$bmi >= 30, 1, 0)

creating final model

ins_model2 <- lm(expenses ~ age + age2 + children + bmi + sex +
                   bmi30*smoker + region, data = insurance)
summary(ins_model2)

Call:
lm(formula = expenses ~ age + age2 + children + bmi + sex + bmi30 * 
    smoker + region, data = insurance)

Residuals:
     Min       1Q   Median       3Q      Max 
-17297.1  -1656.0  -1262.7   -727.8  24161.6 

Coefficients:
                  Estimate Std. Error t value Pr(>|t|)    
(Intercept)       139.0053  1363.1359   0.102 0.918792    
age               -32.6181    59.8250  -0.545 0.585690    
age2                3.7307     0.7463   4.999 6.54e-07 ***
children          678.6017   105.8855   6.409 2.03e-10 ***
bmi               119.7715    34.2796   3.494 0.000492 ***
sexmale          -496.7690   244.3713  -2.033 0.042267 *  
bmi30            -997.9355   422.9607  -2.359 0.018449 *  
smokeryes       13404.5952   439.9591  30.468  < 2e-16 ***
regionnorthwest  -279.1661   349.2826  -0.799 0.424285    
regionsoutheast  -828.0345   351.6484  -2.355 0.018682 *  
regionsouthwest -1222.1619   350.5314  -3.487 0.000505 ***
bmi30:smokeryes 19810.1534   604.6769  32.762  < 2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 4445 on 1326 degrees of freedom
Multiple R-squared:  0.8664,    Adjusted R-squared:  0.8653 
F-statistic: 781.7 on 11 and 1326 DF,  p-value: < 2.2e-16

making prediction with the regression model

insurance$pred <- predict(ins_model2, insurance)
cor(insurance$pred, insurance$expenses)
[1] 0.9307999

plotting the prediction against expenses

plot(insurance$pred, insurance$expenses)
abline(a = 0, b = 1, col = "red", lwd = 3, lty = 2)

predict(ins_model2,
        data.frame(age = 22, age2 = 22^2, children = 3,
                   bmi = 24, sex = "female", bmi30 = 0,
                   smoker = "no", region = "northwest"))
       1 
5858.241 
predict(ins_model2,
        data.frame(age = 22, age2 = 22^2, children = 1,
                   bmi = 27, sex = "male", bmi30 = 0,
                   smoker = "yes", region = "southeast"))
       1 
17219.31 
LS0tCnRpdGxlOiAiWW91c3NlZi1NTFItQWN0aXZpdHktOCIKb3V0cHV0OiBodG1sX25vdGVib29rCi0tLQoKIyMgUHJlZGljdGluZyBNZWRpY2FsIEV4cGVuc2VzCgojIyMgRXhwbG9yaW5nIGFuZCBwcmVwYXJpbmcgdGhlIGRhdGEgCgpgYGB7cn0KaW5zdXJhbmNlIDwtIHJlYWQuY3N2KCJpbnN1cmFuY2UuY3N2Iiwgc3RyaW5nc0FzRmFjdG9ycyA9IFRSVUUpCmluc3VyYW5jZQpgYGAKCiMjIyBTdW1tYXJpemUgdGhlIGNoYXJnZXMgdmFyaWFibGUgCgpgYGB7cn0Kc3VtbWFyeShpbnN1cmFuY2UkZXhwZW5zZXMpCmBgYAoKIyMjIGhpc3RvZ3JhbSBvZiBpbnN1cmFjZSBjaGFyZ2VzIApgYGB7cn0KaGlzdChpbnN1cmFuY2UkZXhwZW5zZXMpCmBgYAoKIyMjIFRhYmxlIG9mIHJlZ2lvbiAKCmBgYHtyfQp0YWJsZShpbnN1cmFuY2UkcmVnaW9uKQpgYGAKCiMjIyBleHBsb3JpbmcgcmVsYXRpb25zaGlwcyBhbW9uZyBmZWF0dXJlczogY29ycmVsYXRpb24gbWF0cml4CmBgYHtyfQpjb3IoaW5zdXJhbmNlW2MoImFnZSIsICJibWkiLCAiY2hpbGRyZW4iLCAiZXhwZW5zZXMiKV0pCmBgYAoKIyMjIHZpc3VhbGl6aW5nIHRoZSByZWxhdGlvbnNoaXBzIAoKYGBge3J9CnBhaXJzKGluc3VyYW5jZVtjKCJhZ2UiLCAiYm1pIiwgImNoaWxkcmVuIiwgImV4cGVuc2VzIildKQpgYGAKCiMjIyBTdGVwIDM6IFRyYWluaW5nIGEgbW9kZWwgb24gdGhlIGRhdGEgCgpgYGB7cn0KaW5zX21vZGVsIDwtIGxtKGV4cGVuc2VzIH4gYWdlICsgY2hpbGRyZW4gKyBibWkgKyBzZXggKyBzbW9rZXIgKyByZWdpb24sIAogICAgICAgICAgICAgICAgZGF0YSA9IGluc3VyYW5jZSkKaW5zX21vZGVsIApgYGAKIyMgU3RlcCA0OkV2YWx1YXRpbmcgbW9kZWwgcGVyZm9ybWFuY2UgCgpgYGB7cn0Kc3VtbWFyeShpbnNfbW9kZWwpCmBgYAoKIyMgU3RlcCA1OiBpbXByb3ZpbmcgbW9kZWwgcGVyZm9ybWFuY2UgCgojIyMgYWRkaW5nIGEgaGlnaGVyIG9yZGVyIGFnZSB0ZXJtIApgYGB7cn0KaW5zdXJhbmNlJGFnZTIgPC0gaW5zdXJhbmNlJGFnZV4yIApgYGAKIyMjIGFkZCBhbiBpbmRpY2F0b3IgZm9yIEJNSSA+PSAzMApgYGB7cn0KaW5zdXJhbmNlJGJtaTMwIDwtIGlmZWxzZShpbnN1cmFuY2UkYm1pID49IDMwLCAxLCAwKQpgYGAKCiMjIyBjcmVhdGluZyBmaW5hbCBtb2RlbCAKYGBge3J9Cmluc19tb2RlbDIgPC0gbG0oZXhwZW5zZXMgfiBhZ2UgKyBhZ2UyICsgY2hpbGRyZW4gKyBibWkgKyBzZXggKwogICAgICAgICAgICAgICAgICAgYm1pMzAqc21va2VyICsgcmVnaW9uLCBkYXRhID0gaW5zdXJhbmNlKQpzdW1tYXJ5KGluc19tb2RlbDIpCmBgYAoKIyMjIG1ha2luZyBwcmVkaWN0aW9uIHdpdGggdGhlIHJlZ3Jlc3Npb24gbW9kZWwgCmBgYHtyfQppbnN1cmFuY2UkcHJlZCA8LSBwcmVkaWN0KGluc19tb2RlbDIsIGluc3VyYW5jZSkKY29yKGluc3VyYW5jZSRwcmVkLCBpbnN1cmFuY2UkZXhwZW5zZXMpCmBgYAoKIyMjIHBsb3R0aW5nIHRoZSBwcmVkaWN0aW9uIGFnYWluc3QgZXhwZW5zZXMgCgpgYGB7cn0KcGxvdChpbnN1cmFuY2UkcHJlZCwgaW5zdXJhbmNlJGV4cGVuc2VzKQphYmxpbmUoYSA9IDAsIGIgPSAxLCBjb2wgPSAicmVkIiwgbHdkID0gMywgbHR5ID0gMikKYGBgCgpgYGB7cn0KcHJlZGljdChpbnNfbW9kZWwyLAogICAgICAgIGRhdGEuZnJhbWUoYWdlID0gMjIsIGFnZTIgPSAyMl4yLCBjaGlsZHJlbiA9IDMsCiAgICAgICAgICAgICAgICAgICBibWkgPSAyNCwgc2V4ID0gImZlbWFsZSIsIGJtaTMwID0gMCwKICAgICAgICAgICAgICAgICAgIHNtb2tlciA9ICJubyIsIHJlZ2lvbiA9ICJub3J0aHdlc3QiKSkKYGBgCgoKCmBgYHtyfQpwcmVkaWN0KGluc19tb2RlbDIsCiAgICAgICAgZGF0YS5mcmFtZShhZ2UgPSAyMiwgYWdlMiA9IDIyXjIsIGNoaWxkcmVuID0gMSwKICAgICAgICAgICAgICAgICAgIGJtaSA9IDI3LCBzZXggPSAibWFsZSIsIGJtaTMwID0gMCwKICAgICAgICAgICAgICAgICAgIHNtb2tlciA9ICJ5ZXMiLCByZWdpb24gPSAic291dGhlYXN0IikpCgpgYGAK