Predicting Medical Expenses
Exploring and preparing the data
# Load the insurance data set from the working directory. With
# stringsAsFactors = TRUE, character columns are read in as factors.
insurance <- read.csv(
  file = "insurance.csv",
  stringsAsFactors = TRUE
)
# Display the loaded data frame (auto-printed so the notebook renders it).
insurance
Summarize the expenses variable
# Summary statistics (min, quartiles, median, mean, max) of the expenses
# column; `[[` extracts the column as a plain vector by its exact name.
summary(insurance[["expenses"]])
Min. 1st Qu. Median Mean 3rd Qu. Max.
1122 4740 9382 13270 16640 63770
Histogram of insurance expenses
# Plot the distribution of expenses. The default title and x-axis label
# are built from the argument expression, so it is kept as written.
hist(x = insurance$expenses)

Table of region
# Count how many rows fall into each level of the region column.
table(insurance[["region"]])
northeast northwest southeast southwest
324 325 364 325
exploring relationships among features: correlation matrix
# Pairwise correlation matrix of the four numeric columns.
cor(insurance[, c("age", "bmi", "children", "expenses")])
age bmi children expenses
age 1.0000000 0.10934101 0.04246900 0.29900819
bmi 0.1093410 1.00000000 0.01264471 0.19857626
children 0.0424690 0.01264471 1.00000000 0.06799823
expenses 0.2990082 0.19857626 0.06799823 1.00000000
visualizing the relationships
# Scatterplot matrix of the same four numeric columns; panel labels come
# from the column names.
pairs(insurance[, c("age", "bmi", "children", "expenses")])

Step 3: Training a model on the data
# Fit a linear regression of expenses on all six predictors. Term order
# is kept as-is because it determines the order of reported coefficients.
ins_model <- lm(
  formula = expenses ~ age + children + bmi + sex + smoker + region,
  data = insurance
)
# Show the fitted call and the estimated coefficients.
ins_model
Call:
lm(formula = expenses ~ age + children + bmi + sex + smoker +
region, data = insurance)
Coefficients:
(Intercept) age children bmi
-11941.6 256.8 475.7 339.3
sexmale smokeryes regionnorthwest regionsoutheast
-131.4 23847.5 -352.8 -1035.6
regionsouthwest
-959.3
LS0tCnRpdGxlOiAiWW91c3NlZi1NTFItQWN0aXZpdHktOCIKb3V0cHV0OiBodG1sX25vdGVib29rCi0tLQoKIyMgUHJlZGljdGluZyBNZWRpY2FsIEV4cGVuc2VzCgojIyMgRXhwbG9yaW5nIGFuZCBwcmVwYXJpbmcgdGhlIGRhdGEgCgpgYGB7cn0KaW5zdXJhbmNlIDwtIHJlYWQuY3N2KCJpbnN1cmFuY2UuY3N2Iiwgc3RyaW5nc0FzRmFjdG9ycyA9IFRSVUUpCmluc3VyYW5jZQpgYGAKCiMjIyBTdW1tYXJpemUgdGhlIGNoYXJnZXMgdmFyaWFibGUgCgpgYGB7cn0Kc3VtbWFyeShpbnN1cmFuY2UkZXhwZW5zZXMpCmBgYAoKIyMjIGhpc3RvZ3JhbSBvZiBpbnN1cmFjZSBjaGFyZ2VzIApgYGB7cn0KaGlzdChpbnN1cmFuY2UkZXhwZW5zZXMpCmBgYAoKIyMjIFRhYmxlIG9mIHJlZ2lvbiAKCmBgYHtyfQp0YWJsZShpbnN1cmFuY2UkcmVnaW9uKQpgYGAKCiMjIyBleHBsb3JpbmcgcmVsYXRpb25zaGlwcyBhbW9uZyBmZWF0dXJlczogY29ycmVsYXRpb24gbWF0cml4CmBgYHtyfQpjb3IoaW5zdXJhbmNlW2MoImFnZSIsICJibWkiLCAiY2hpbGRyZW4iLCAiZXhwZW5zZXMiKV0pCmBgYAoKIyMjIHZpc3VhbGl6aW5nIHRoZSByZWxhdGlvbnNoaXBzIAoKYGBge3J9CnBhaXJzKGluc3VyYW5jZVtjKCJhZ2UiLCAiYm1pIiwgImNoaWxkcmVuIiwgImV4cGVuc2VzIildKQpgYGAKCiMjIyBTdGVwIDM6IFRyYWluaW5nIGEgbW9kZWwgb24gdGhlIGRhdGEgCgpgYGB7cn0KaW5zX21vZGVsIDwtIGxtKGV4cGVuc2VzIH4gYWdlICsgY2hpbGRyZW4gKyBibWkgKyBzZXggKyBzbW9rZXIgKyByZWdpb24sIAogICAgICAgICAgICAgICAgZGF0YSA9IGluc3VyYW5jZSkKaW5zX21vZGVsIApgYGAKIyMgU3RlcCA0OkV2YWx1YXRpbmcgbW9kZWwgcGVyZm9ybWFuY2UgCgpgYGB7cn0Kc3VtbWFyeShpbnNfbW9kZWwpCmBgYAoKIyMgU3RlcCA1OiBpbXByb3ZpbmcgbW9kZWwgcGVyZm9ybWFuY2UgCgojIyMgYWRkaW5nIGEgaGlnaGVyIG9yZGVyIGFnZSB0ZXJtIApgYGB7cn0KaW5zdXJhbmNlJGFnZTIgPC0gaW5zdXJhbmNlJGFnZV4yIApgYGAKIyMjIGFkZCBhbiBpbmRpY2F0b3IgZm9yIEJNSSA+PSAzMApgYGB7cn0KaW5zdXJhbmNlJGJtaTMwIDwtIGlmZWxzZShpbnN1cmFuY2UkYm1pID49IDMwLCAxLCAwKQpgYGAKCiMjIyBjcmVhdGluZyBmaW5hbCBtb2RlbCAKYGBge3J9Cmluc19tb2RlbDIgPC0gbG0oZXhwZW5zZXMgfiBhZ2UgKyBhZ2UyICsgY2hpbGRyZW4gKyBibWkgKyBzZXggKwogICAgICAgICAgICAgICAgICAgYm1pMzAqc21va2VyICsgcmVnaW9uLCBkYXRhID0gaW5zdXJhbmNlKQpzdW1tYXJ5KGluc19tb2RlbDIpCmBgYAoKIyMjIG1ha2luZyBwcmVkaWN0aW9uIHdpdGggdGhlIHJlZ3Jlc3Npb24gbW9kZWwgCmBgYHtyfQppbnN1cmFuY2UkcHJlZCA8LSBwcmVkaWN0KGluc19tb2RlbDIsIGluc3VyYW5jZSkKY29yKGluc3VyYW5jZSRwcmVkLCBpbnN1cmFuY2UkZXhwZW5zZXMpCmBgYAoKIyMjIHBsb3R0aW5nIHRoZSBwcmVkaWN0aW9uIGFnYWluc3Qg
ZXhwZW5zZXMgCgpgYGB7cn0KcGxvdChpbnN1cmFuY2UkcHJlZCwgaW5zdXJhbmNlJGV4cGVuc2VzKQphYmxpbmUoYSA9IDAsIGIgPSAxLCBjb2wgPSAicmVkIiwgbHdkID0gMywgbHR5ID0gMikKYGBgCgpgYGB7cn0KcHJlZGljdChpbnNfbW9kZWwyLAogICAgICAgIGRhdGEuZnJhbWUoYWdlID0gMjIsIGFnZTIgPSAyMl4yLCBjaGlsZHJlbiA9IDMsCiAgICAgICAgICAgICAgICAgICBibWkgPSAyNCwgc2V4ID0gImZlbWFsZSIsIGJtaTMwID0gMCwKICAgICAgICAgICAgICAgICAgIHNtb2tlciA9ICJubyIsIHJlZ2lvbiA9ICJub3J0aHdlc3QiKSkKYGBgCgoKCmBgYHtyfQpwcmVkaWN0KGluc19tb2RlbDIsCiAgICAgICAgZGF0YS5mcmFtZShhZ2UgPSAyMiwgYWdlMiA9IDIyXjIsIGNoaWxkcmVuID0gMSwKICAgICAgICAgICAgICAgICAgIGJtaSA9IDI3LCBzZXggPSAibWFsZSIsIGJtaTMwID0gMCwKICAgICAgICAgICAgICAgICAgIHNtb2tlciA9ICJ5ZXMiLCByZWdpb24gPSAic291dGhlYXN0IikpCgpgYGAK