getwd()
[1] "/cloud/project"
launch <- read.csv("insurance.csv")
## Step 2: Exploring and preparing the data ----
insurance <- read.csv("insurance.csv", stringsAsFactors = TRUE)
str(insurance)
'data.frame': 1338 obs. of 7 variables:
$ age : int 19 18 28 33 32 31 46 37 37 60 ...
$ sex : Factor w/ 2 levels "female","male": 1 2 2 2 2 1 1 1 2 1 ...
$ bmi : num 27.9 33.8 33 22.7 28.9 25.7 33.4 27.7 29.8 25.8 ...
$ children: int 0 1 3 0 0 0 1 3 2 0 ...
$ smoker : Factor w/ 2 levels "no","yes": 2 1 1 1 1 1 1 1 1 1 ...
$ region : Factor w/ 4 levels "northeast","northwest",..: 4 3 3 2 2 3 3 2 1 2 ...
$ expenses: num 16885 1726 4449 21984 3867 ...
# summarize the charges variable
summary(insurance$expenses) #this create a summary of the expense
Min. 1st Qu. Median Mean 3rd Qu. Max.
1122 4740 9382 13270 16640 63770
# histogram of insurance charges
hist(insurance$expenses) #this make it visual for easy read by creating a histogram

# table of region
table(insurance$region) #this shows different region in the file
northeast northwest southeast southwest
324 325 364 325
# exploring relationships among features: correlation matrix
cor(insurance[c("age", "bmi", "children", "expenses")]) #this show how different variable are related.
age bmi children expenses
age 1.0000000 0.10934101 0.04246900 0.29900819
bmi 0.1093410 1.00000000 0.01264471 0.19857626
children 0.0424690 0.01264471 1.00000000 0.06799823
expenses 0.2990082 0.19857626 0.06799823 1.00000000
# visualing relationships among features: scatterplot matrix
pairs(insurance[c("age", "bmi", "children", "expenses")])

## Step 3: Training a model on the data ----
ins_model <- lm(expenses ~ age + children + bmi + sex + smoker + region,
data = insurance)
ins_model <- lm(expenses ~ ., data = insurance) # this is equivalent to above
# see the estimated beta coefficients
ins_model
Call:
lm(formula = expenses ~ ., data = insurance)
Coefficients:
(Intercept) age sexmale
-11941.6 256.8 -131.4
bmi children smokeryes
339.3 475.7 23847.5
regionnorthwest regionsoutheast regionsouthwest
-352.8 -1035.6 -959.3
# see more detail about the estimated beta coefficients
summary(ins_model)
Call:
lm(formula = expenses ~ ., data = insurance)
Residuals:
Min 1Q Median 3Q Max
-11302.7 -2850.9 -979.6 1383.9 29981.7
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -11941.6 987.8 -12.089 < 2e-16 ***
age 256.8 11.9 21.586 < 2e-16 ***
sexmale -131.3 332.9 -0.395 0.693255
bmi 339.3 28.6 11.864 < 2e-16 ***
children 475.7 137.8 3.452 0.000574 ***
smokeryes 23847.5 413.1 57.723 < 2e-16 ***
regionnorthwest -352.8 476.3 -0.741 0.458976
regionsoutheast -1035.6 478.7 -2.163 0.030685 *
regionsouthwest -959.3 477.9 -2.007 0.044921 *
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 6062 on 1329 degrees of freedom
Multiple R-squared: 0.7509, Adjusted R-squared: 0.7494
F-statistic: 500.9 on 8 and 1329 DF, p-value: < 2.2e-16
#this give a sumary
# add a higher-order "age" term
insurance$age2 <- insurance$age^2
# add an indicator for BMI >= 30
insurance$bmi30 <- ifelse(insurance$bmi >= 30, 1, 0)
# create final model
ins_model2 <- lm(expenses ~ age + age2 + children + bmi + sex +
bmi30*smoker + region, data = insurance)
summary(ins_model2)
Call:
lm(formula = expenses ~ age + age2 + children + bmi + sex + bmi30 *
smoker + region, data = insurance)
Residuals:
Min 1Q Median 3Q Max
-17297.1 -1656.0 -1262.7 -727.8 24161.6
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 139.0053 1363.1359 0.102 0.918792
age -32.6181 59.8250 -0.545 0.585690
age2 3.7307 0.7463 4.999 6.54e-07 ***
children 678.6017 105.8855 6.409 2.03e-10 ***
bmi 119.7715 34.2796 3.494 0.000492 ***
sexmale -496.7690 244.3713 -2.033 0.042267 *
bmi30 -997.9355 422.9607 -2.359 0.018449 *
smokeryes 13404.5952 439.9591 30.468 < 2e-16 ***
regionnorthwest -279.1661 349.2826 -0.799 0.424285
regionsoutheast -828.0345 351.6484 -2.355 0.018682 *
regionsouthwest -1222.1619 350.5314 -3.487 0.000505 ***
bmi30:smokeryes 19810.1534 604.6769 32.762 < 2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 4445 on 1326 degrees of freedom
Multiple R-squared: 0.8664, Adjusted R-squared: 0.8653
F-statistic: 781.7 on 11 and 1326 DF, p-value: < 2.2e-16
# making predictions with the regression model
insurance$pred <- predict(ins_model2, insurance)
cor(insurance$pred, insurance$expenses)
[1] 0.9307999
plot(insurance$pred, insurance$expenses)
abline(a = 0, b = 1, col = "red", lwd = 3, lty = 2) #this create a scartterplot with color for the data sets indicate above

predict(ins_model2,
data.frame(age = 30, age2 = 30^2, children = 2,
bmi = 30, sex = "male", bmi30 = 1,
smoker = "no", region = "northeast"))
1
5973.774
predict(ins_model2,
data.frame(age = 30, age2 = 30^2, children = 2,
bmi = 30, sex = "female", bmi30 = 1,
smoker = "no", region = "northeast"))
1
6470.543
predict(ins_model2,
data.frame(age = 30, age2 = 30^2, children = 0,
bmi = 30, sex = "female", bmi30 = 1,
smoker = "no", region = "northeast"))
1
5113.34
# set up the data
tee <- c(1, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 7, 7, 7)
at1 <- c(1, 1, 1, 2, 2, 3, 4, 5, 5)
at2 <- c(6, 6, 7, 7, 7, 7)
bt1 <- c(1, 1, 1, 2, 2, 3, 4)
bt2 <- c(5, 5, 6, 6, 7, 7, 7, 7)
# compute the SDR
sdr_a <- sd(tee) - (length(at1) / length(tee) * sd(at1) + length(at2) / length(tee) * sd(at2))
sdr_b <- sd(tee) - (length(bt1) / length(tee) * sd(bt1) + length(bt2) / length(tee) * sd(bt2))
# compare the SDR for each split
sdr_a
[1] 1.202815
sdr_b
[1] 1.392751
LS0tCnRpdGxlOiAiUiBJbnN1cmFuY2UgZmlsZSBBQ1QgNyAiCm91dHB1dDogaHRtbF9ub3RlYm9vawotLS0KCgpgYGB7cn0KZ2V0d2QoKSAjdGhpcyBoZWxwIHVwbG9hZCB0aGUgZmlsZSBpbiB0aGUgY2xvdWQgcHJvamVjdApgYGAKCgpgYGB7cn0KbGF1bmNoIDwtIHJlYWQuY3N2KCJpbnN1cmFuY2UuY3N2IikgI3RoaXMgaGVscCBsYXVuY2ggdGhlIGZpbGUgCmBgYAoKCmBgYHtyfQojIyBTdGVwIDI6IEV4cGxvcmluZyBhbmQgcHJlcGFyaW5nIHRoZSBkYXRhIC0tLS0KaW5zdXJhbmNlIDwtIHJlYWQuY3N2KCJpbnN1cmFuY2UuY3N2Iiwgc3RyaW5nc0FzRmFjdG9ycyA9IFRSVUUpCnN0cihpbnN1cmFuY2UpICN0aGlzIHNob3cgdGhlIGRhdGEgZnJhbWUKYGBgCgoKYGBge3J9CiMgc3VtbWFyaXplIHRoZSBjaGFyZ2VzIHZhcmlhYmxlCnN1bW1hcnkoaW5zdXJhbmNlJGV4cGVuc2VzKSAjdGhpcyBjcmVhdGUgYSBzdW1tYXJ5IG9mIHRoZSBleHBlbnNlIApgYGAKCmBgYHtyfQojIGhpc3RvZ3JhbSBvZiBpbnN1cmFuY2UgY2hhcmdlcwpoaXN0KGluc3VyYW5jZSRleHBlbnNlcykgI3RoaXMgbWFrZSBpdCB2aXN1YWwgZm9yIGVhc3kgcmVhZCBieSBjcmVhdGluZyBhIGhpc3RvZ3JhbSAgCmBgYAoKYGBge3J9CiMgdGFibGUgb2YgcmVnaW9uCnRhYmxlKGluc3VyYW5jZSRyZWdpb24pICN0aGlzIHNob3dzIGRpZmZlcmVudCByZWdpb24gaW4gdGhlIGZpbGUgCmBgYAoKCmBgYHtyfQojIGV4cGxvcmluZyByZWxhdGlvbnNoaXBzIGFtb25nIGZlYXR1cmVzOiBjb3JyZWxhdGlvbiBtYXRyaXgKY29yKGluc3VyYW5jZVtjKCJhZ2UiLCAiYm1pIiwgImNoaWxkcmVuIiwgImV4cGVuc2VzIildKSAjdGhpcyBzaG93IGhvdyBkaWZmZXJlbnQgdmFyaWFibGUgYXJlIHJlbGF0ZWQuCmBgYAoKCmBgYHtyfQojIHZpc3VhbGluZyByZWxhdGlvbnNoaXBzIGFtb25nIGZlYXR1cmVzOiBzY2F0dGVycGxvdCBtYXRyaXgKcGFpcnMoaW5zdXJhbmNlW2MoImFnZSIsICJibWkiLCAiY2hpbGRyZW4iLCAiZXhwZW5zZXMiKV0pICN0aGlzIGNyZWF0ZSBkaWZmZXJlbnQgc2NhdHRlcnBsb3QgZm9yIGVjYWggdmFyaWFibGUKYGBgCgoKYGBge3J9CiMjIFN0ZXAgMzogVHJhaW5pbmcgYSBtb2RlbCBvbiB0aGUgZGF0YSAtLS0tCmluc19tb2RlbCA8LSBsbShleHBlbnNlcyB+IGFnZSArIGNoaWxkcmVuICsgYm1pICsgc2V4ICsgc21va2VyICsgcmVnaW9uLAogICAgICAgICAgICAgICAgZGF0YSA9IGluc3VyYW5jZSkKaW5zX21vZGVsIDwtIGxtKGV4cGVuc2VzIH4gLiwgZGF0YSA9IGluc3VyYW5jZSkgIyB0aGlzIGlzIGVxdWl2YWxlbnQgdG8gYWJvdmUKCiMgc2VlIHRoZSBlc3RpbWF0ZWQgYmV0YSBjb2VmZmljaWVudHMKaW5zX21vZGVsCmBgYAoKYGBge3J9CiMgc2VlIG1vcmUgZGV0YWlsIGFib3V0IHRoZSBlc3RpbWF0ZWQgYmV0YSBjb2VmZmljaWVudHMKc3VtbWFyeShpbnNfbW9kZWwpCiN0aGlzIGdpdmUgYSBzdW1hcnkKCmBgYAoKCmBgYHtyfQojIGFkZCBhIGhpZ2hlci1vcmRlciAiYWdlIiB0ZXJtCmluc3VyYW5jZSRhZ2UyIDwtIGluc3VyYW5jZSRhZ2VeMgpgYGAKCgpgYGB7cn0KIyBhZGQgYW4gaW5kaWNhdG9yIGZvciBCTUkgPj0gMzAKaW5zdXJhbmNlJGJtaTMwIDwtIGlmZWxzZShpbnN1cmFuY2UkYm1pID49IDMwLCAxLCAwKQpgYGAKCgpgYGB7cn0KIyBjcmVhdGUgZmluYWwgbW9kZWwKaW5zX21vZGVsMiA8LSBsbShleHBlbnNlcyB+IGFnZSArIGFnZTIgKyBjaGlsZHJlbiArIGJtaSArIHNleCArCiAgICAgICAgICAgICAgICAgICBibWkzMCpzbW9rZXIgKyByZWdpb24sIGRhdGEgPSBpbnN1cmFuY2UpCmBgYAoKYGBge3J9CnN1bW1hcnkoaW5zX21vZGVsMikgI3RoaXMgZ2l2ZSBhIHN1bW1hcnkgb2YgYWxsIHZhcmlhYmxlIGluZGljYXRlZCBhYm92ZQpgYGAKCmBgYHtyfQojIG1ha2luZyBwcmVkaWN0aW9ucyB3aXRoIHRoZSByZWdyZXNzaW9uIG1vZGVsCmluc3VyYW5jZSRwcmVkIDwtIHByZWRpY3QoaW5zX21vZGVsMiwgaW5zdXJhbmNlKQpjb3IoaW5zdXJhbmNlJHByZWQsIGluc3VyYW5jZSRleHBlbnNlcykKYGBgCgpgYGB7cn0KcGxvdChpbnN1cmFuY2UkcHJlZCwgaW5zdXJhbmNlJGV4cGVuc2VzKQphYmxpbmUoYSA9IDAsIGIgPSAxLCBjb2wgPSAicmVkIiwgbHdkID0gMywgbHR5ID0gMikgI3RoaXMgY3JlYXRlIGEgc2NhcnR0ZXJwbG90IHdpdGggY29sb3IgZm9yIHRoZSBkYXRhIHNldHMgaW5kaWNhdGUgYWJvdmUKYGBgCgoKYGBge3J9CnByZWRpY3QoaW5zX21vZGVsMiwKICAgICAgICBkYXRhLmZyYW1lKGFnZSA9IDMwLCBhZ2UyID0gMzBeMiwgY2hpbGRyZW4gPSAyLAogICAgICAgICAgICAgICAgICAgYm1pID0gMzAsIHNleCA9ICJtYWxlIiwgYm1pMzAgPSAxLAogICAgICAgICAgICAgICAgICAgc21va2VyID0gIm5vIiwgcmVnaW9uID0gIm5vcnRoZWFzdCIpKQpgYGAKCgpgYGB7cn0KcHJlZGljdChpbnNfbW9kZWwyLAogICAgICAgIGRhdGEuZnJhbWUoYWdlID0gMzAsIGFnZTIgPSAzMF4yLCBjaGlsZHJlbiA9IDIsCiAgICAgICAgICAgICAgICAgICBibWkgPSAzMCwgc2V4ID0gImZlbWFsZSIsIGJtaTMwID0gMSwKICAgICAgICAgICAgICAgICAgIHNtb2tlciA9ICJubyIsIHJlZ2lvbiA9ICJub3J0aGVhc3QiKSkKYGBgCgoKYGBge3J9CnByZWRpY3QoaW5zX21vZGVsMiwKICAgICAgICBkYXRhLmZyYW1lKGFnZSA9IDMwLCBhZ2UyID0gMzBeMiwgY2hpbGRyZW4gPSAwLAogICAgICAgICAgICAgICAgICAgYm1pID0gMzAsIHNleCA9ICJmZW1hbGUiLCBibWkzMCA9IDEsCiAgICAgICAgICAgICAgICAgICBzbW9rZXIgPSAibm8iLCByZWdpb24gPSAibm9ydGhlYXN0IikpCmBgYAoKCmBgYHtyfQojIHNldCB1cCB0aGUgZGF0YQp0ZWUgPC0gYygxLCAxLCAxLCAyLCAyLCAzLCA0LCA1LCA1LCA2LCA2LCA3LCA3LCA3LCA3KQphdDEgPC0gYygxLCAxLCAxLCAyLCAyLCAzLCA0LCA1LCA1KQphdDIgPC0gYyg2LCA2LCA3LCA3LCA3LCA3KQpidDEgPC0gYygxLCAxLCAxLCAyLCAyLCAzLCA0KQpidDIgPC0gYyg1LCA1LCA2LCA2LCA3LCA3LCA3LCA3KQpgYGAKCgpgYGB7cn0KIyBjb21wdXRlIHRoZSBTRFIKc2RyX2EgPC0gc2QodGVlKSAtIChsZW5ndGgoYXQxKSAvIGxlbmd0aCh0ZWUpICogc2QoYXQxKSArIGxlbmd0aChhdDIpIC8gbGVuZ3RoKHRlZSkgKiBzZChhdDIpKQpzZHJfYiA8LSBzZCh0ZWUpIC0gKGxlbmd0aChidDEpIC8gbGVuZ3RoKHRlZSkgKiBzZChidDEpICsgbGVuZ3RoKGJ0MikgLyBsZW5ndGgodGVlKSAqIHNkKGJ0MikpCmBgYAoKCgpgYGB7cn0KIyBjb21wYXJlIHRoZSBTRFIgZm9yIGVhY2ggc3BsaXQKc2RyX2EKYGBgCgoKYGBge3J9CnNkcl9iCmBgYAoK