getwd()
[1] "/cloud/project"
launch <- read.csv("insurance.csv")
## Step 2: Exploring and preparing the data ----
insurance <- read.csv("insurance.csv", stringsAsFactors = TRUE)
str(insurance)
'data.frame':   1338 obs. of  7 variables:
 $ age     : int  19 18 28 33 32 31 46 37 37 60 ...
 $ sex     : Factor w/ 2 levels "female","male": 1 2 2 2 2 1 1 1 2 1 ...
 $ bmi     : num  27.9 33.8 33 22.7 28.9 25.7 33.4 27.7 29.8 25.8 ...
 $ children: int  0 1 3 0 0 0 1 3 2 0 ...
 $ smoker  : Factor w/ 2 levels "no","yes": 2 1 1 1 1 1 1 1 1 1 ...
 $ region  : Factor w/ 4 levels "northeast","northwest",..: 4 3 3 2 2 3 3 2 1 2 ...
 $ expenses: num  16885 1726 4449 21984 3867 ...
# summarize the charges variable
summary(insurance$expenses) #this create a summary of the expense 
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
   1122    4740    9382   13270   16640   63770 
# histogram of insurance charges
hist(insurance$expenses) #this make it visual for easy read by creating a histogram  

# table of region
table(insurance$region) #this shows different region in the file 

northeast northwest southeast southwest 
      324       325       364       325 
# exploring relationships among features: correlation matrix
cor(insurance[c("age", "bmi", "children", "expenses")]) #this show how different variable are related.
               age        bmi   children   expenses
age      1.0000000 0.10934101 0.04246900 0.29900819
bmi      0.1093410 1.00000000 0.01264471 0.19857626
children 0.0424690 0.01264471 1.00000000 0.06799823
expenses 0.2990082 0.19857626 0.06799823 1.00000000
# visualing relationships among features: scatterplot matrix
pairs(insurance[c("age", "bmi", "children", "expenses")])

## Step 3: Training a model on the data ----
ins_model <- lm(expenses ~ age + children + bmi + sex + smoker + region,
                data = insurance)
ins_model <- lm(expenses ~ ., data = insurance) # this is equivalent to above

# see the estimated beta coefficients
ins_model

Call:
lm(formula = expenses ~ ., data = insurance)

Coefficients:
    (Intercept)              age          sexmale  
       -11941.6            256.8           -131.4  
            bmi         children        smokeryes  
          339.3            475.7          23847.5  
regionnorthwest  regionsoutheast  regionsouthwest  
         -352.8          -1035.6           -959.3  
# see more detail about the estimated beta coefficients
summary(ins_model)

Call:
lm(formula = expenses ~ ., data = insurance)

Residuals:
     Min       1Q   Median       3Q      Max 
-11302.7  -2850.9   -979.6   1383.9  29981.7 

Coefficients:
                Estimate Std. Error t value Pr(>|t|)    
(Intercept)     -11941.6      987.8 -12.089  < 2e-16 ***
age                256.8       11.9  21.586  < 2e-16 ***
sexmale           -131.3      332.9  -0.395 0.693255    
bmi                339.3       28.6  11.864  < 2e-16 ***
children           475.7      137.8   3.452 0.000574 ***
smokeryes        23847.5      413.1  57.723  < 2e-16 ***
regionnorthwest   -352.8      476.3  -0.741 0.458976    
regionsoutheast  -1035.6      478.7  -2.163 0.030685 *  
regionsouthwest   -959.3      477.9  -2.007 0.044921 *  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 6062 on 1329 degrees of freedom
Multiple R-squared:  0.7509,    Adjusted R-squared:  0.7494 
F-statistic: 500.9 on 8 and 1329 DF,  p-value: < 2.2e-16
#this give a sumary
# add a higher-order "age" term
insurance$age2 <- insurance$age^2
# add an indicator for BMI >= 30
insurance$bmi30 <- ifelse(insurance$bmi >= 30, 1, 0)
# create final model
ins_model2 <- lm(expenses ~ age + age2 + children + bmi + sex +
                   bmi30*smoker + region, data = insurance)
summary(ins_model2)

Call:
lm(formula = expenses ~ age + age2 + children + bmi + sex + bmi30 * 
    smoker + region, data = insurance)

Residuals:
     Min       1Q   Median       3Q      Max 
-17297.1  -1656.0  -1262.7   -727.8  24161.6 

Coefficients:
                  Estimate Std. Error t value Pr(>|t|)    
(Intercept)       139.0053  1363.1359   0.102 0.918792    
age               -32.6181    59.8250  -0.545 0.585690    
age2                3.7307     0.7463   4.999 6.54e-07 ***
children          678.6017   105.8855   6.409 2.03e-10 ***
bmi               119.7715    34.2796   3.494 0.000492 ***
sexmale          -496.7690   244.3713  -2.033 0.042267 *  
bmi30            -997.9355   422.9607  -2.359 0.018449 *  
smokeryes       13404.5952   439.9591  30.468  < 2e-16 ***
regionnorthwest  -279.1661   349.2826  -0.799 0.424285    
regionsoutheast  -828.0345   351.6484  -2.355 0.018682 *  
regionsouthwest -1222.1619   350.5314  -3.487 0.000505 ***
bmi30:smokeryes 19810.1534   604.6769  32.762  < 2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 4445 on 1326 degrees of freedom
Multiple R-squared:  0.8664,    Adjusted R-squared:  0.8653 
F-statistic: 781.7 on 11 and 1326 DF,  p-value: < 2.2e-16
# making predictions with the regression model
insurance$pred <- predict(ins_model2, insurance)
cor(insurance$pred, insurance$expenses)
[1] 0.9307999
plot(insurance$pred, insurance$expenses)
abline(a = 0, b = 1, col = "red", lwd = 3, lty = 2) #this create a scartterplot with color for the data sets indicate above

predict(ins_model2,
        data.frame(age = 30, age2 = 30^2, children = 2,
                   bmi = 30, sex = "male", bmi30 = 1,
                   smoker = "no", region = "northeast"))
       1 
5973.774 
predict(ins_model2,
        data.frame(age = 30, age2 = 30^2, children = 2,
                   bmi = 30, sex = "female", bmi30 = 1,
                   smoker = "no", region = "northeast"))
       1 
6470.543 
predict(ins_model2,
        data.frame(age = 30, age2 = 30^2, children = 0,
                   bmi = 30, sex = "female", bmi30 = 1,
                   smoker = "no", region = "northeast"))
      1 
5113.34 
# set up the data
tee <- c(1, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 7, 7, 7)
at1 <- c(1, 1, 1, 2, 2, 3, 4, 5, 5)
at2 <- c(6, 6, 7, 7, 7, 7)
bt1 <- c(1, 1, 1, 2, 2, 3, 4)
bt2 <- c(5, 5, 6, 6, 7, 7, 7, 7)
# compute the SDR
sdr_a <- sd(tee) - (length(at1) / length(tee) * sd(at1) + length(at2) / length(tee) * sd(at2))
sdr_b <- sd(tee) - (length(bt1) / length(tee) * sd(bt1) + length(bt2) / length(tee) * sd(bt2))
# compare the SDR for each split
sdr_a
[1] 1.202815
sdr_b
[1] 1.392751
LS0tCnRpdGxlOiAiUiBJbnN1cmFuY2UgZmlsZSBBQ1QgNyAiCm91dHB1dDogaHRtbF9ub3RlYm9vawotLS0KCgpgYGB7cn0KZ2V0d2QoKSAjdGhpcyBoZWxwIHVwbG9hZCB0aGUgZmlsZSBpbiB0aGUgY2xvdWQgcHJvamVjdApgYGAKCgpgYGB7cn0KbGF1bmNoIDwtIHJlYWQuY3N2KCJpbnN1cmFuY2UuY3N2IikgI3RoaXMgaGVscCBsYXVuY2ggdGhlIGZpbGUgCmBgYAoKCmBgYHtyfQojIyBTdGVwIDI6IEV4cGxvcmluZyBhbmQgcHJlcGFyaW5nIHRoZSBkYXRhIC0tLS0KaW5zdXJhbmNlIDwtIHJlYWQuY3N2KCJpbnN1cmFuY2UuY3N2Iiwgc3RyaW5nc0FzRmFjdG9ycyA9IFRSVUUpCnN0cihpbnN1cmFuY2UpICN0aGlzIHNob3cgdGhlIGRhdGEgZnJhbWUKYGBgCgoKYGBge3J9CiMgc3VtbWFyaXplIHRoZSBjaGFyZ2VzIHZhcmlhYmxlCnN1bW1hcnkoaW5zdXJhbmNlJGV4cGVuc2VzKSAjdGhpcyBjcmVhdGUgYSBzdW1tYXJ5IG9mIHRoZSBleHBlbnNlIApgYGAKCmBgYHtyfQojIGhpc3RvZ3JhbSBvZiBpbnN1cmFuY2UgY2hhcmdlcwpoaXN0KGluc3VyYW5jZSRleHBlbnNlcykgI3RoaXMgbWFrZSBpdCB2aXN1YWwgZm9yIGVhc3kgcmVhZCBieSBjcmVhdGluZyBhIGhpc3RvZ3JhbSAgCmBgYAoKYGBge3J9CiMgdGFibGUgb2YgcmVnaW9uCnRhYmxlKGluc3VyYW5jZSRyZWdpb24pICN0aGlzIHNob3dzIGRpZmZlcmVudCByZWdpb24gaW4gdGhlIGZpbGUgCmBgYAoKCmBgYHtyfQojIGV4cGxvcmluZyByZWxhdGlvbnNoaXBzIGFtb25nIGZlYXR1cmVzOiBjb3JyZWxhdGlvbiBtYXRyaXgKY29yKGluc3VyYW5jZVtjKCJhZ2UiLCAiYm1pIiwgImNoaWxkcmVuIiwgImV4cGVuc2VzIildKSAjdGhpcyBzaG93IGhvdyBkaWZmZXJlbnQgdmFyaWFibGUgYXJlIHJlbGF0ZWQuCmBgYAoKCmBgYHtyfQojIHZpc3VhbGluZyByZWxhdGlvbnNoaXBzIGFtb25nIGZlYXR1cmVzOiBzY2F0dGVycGxvdCBtYXRyaXgKcGFpcnMoaW5zdXJhbmNlW2MoImFnZSIsICJibWkiLCAiY2hpbGRyZW4iLCAiZXhwZW5zZXMiKV0pICN0aGlzIGNyZWF0ZSBkaWZmZXJlbnQgc2NhdHRlcnBsb3QgZm9yIGVjYWggdmFyaWFibGUKYGBgCgoKYGBge3J9CiMjIFN0ZXAgMzogVHJhaW5pbmcgYSBtb2RlbCBvbiB0aGUgZGF0YSAtLS0tCmluc19tb2RlbCA8LSBsbShleHBlbnNlcyB+IGFnZSArIGNoaWxkcmVuICsgYm1pICsgc2V4ICsgc21va2VyICsgcmVnaW9uLAogICAgICAgICAgICAgICAgZGF0YSA9IGluc3VyYW5jZSkKaW5zX21vZGVsIDwtIGxtKGV4cGVuc2VzIH4gLiwgZGF0YSA9IGluc3VyYW5jZSkgIyB0aGlzIGlzIGVxdWl2YWxlbnQgdG8gYWJvdmUKCiMgc2VlIHRoZSBlc3RpbWF0ZWQgYmV0YSBjb2VmZmljaWVudHMKaW5zX21vZGVsCmBgYAoKYGBge3J9CiMgc2VlIG1vcmUgZGV0YWlsIGFib3V0IHRoZSBlc3RpbWF0ZWQgYmV0YSBjb2VmZmljaWVudHMKc3VtbWFyeShpbnNfbW9kZWwpCiN0aGlzIGdpdmUgYSBzdW1hcnkKCmBgYAoKCmBgYHtyfQojIGFkZCBhIGhpZ2hlci1vcmRlciAiYWdlIiB0ZXJtCmluc3VyYW5jZSRhZ2UyIDwtIGluc3VyYW5jZSRhZ2VeMgpgYGAKCgpgYGB7cn0KIyBhZGQgYW4gaW5kaWNhdG9yIGZvciBCTUkgPj0gMzAKaW5zdXJhbmNlJGJtaTMwIDwtIGlmZWxzZShpbnN1cmFuY2UkYm1pID49IDMwLCAxLCAwKQpgYGAKCgpgYGB7cn0KIyBjcmVhdGUgZmluYWwgbW9kZWwKaW5zX21vZGVsMiA8LSBsbShleHBlbnNlcyB+IGFnZSArIGFnZTIgKyBjaGlsZHJlbiArIGJtaSArIHNleCArCiAgICAgICAgICAgICAgICAgICBibWkzMCpzbW9rZXIgKyByZWdpb24sIGRhdGEgPSBpbnN1cmFuY2UpCmBgYAoKYGBge3J9CnN1bW1hcnkoaW5zX21vZGVsMikgI3RoaXMgZ2l2ZSBhIHN1bW1hcnkgb2YgYWxsIHZhcmlhYmxlIGluZGljYXRlZCBhYm92ZQpgYGAKCmBgYHtyfQojIG1ha2luZyBwcmVkaWN0aW9ucyB3aXRoIHRoZSByZWdyZXNzaW9uIG1vZGVsCmluc3VyYW5jZSRwcmVkIDwtIHByZWRpY3QoaW5zX21vZGVsMiwgaW5zdXJhbmNlKQpjb3IoaW5zdXJhbmNlJHByZWQsIGluc3VyYW5jZSRleHBlbnNlcykKYGBgCgpgYGB7cn0KcGxvdChpbnN1cmFuY2UkcHJlZCwgaW5zdXJhbmNlJGV4cGVuc2VzKQphYmxpbmUoYSA9IDAsIGIgPSAxLCBjb2wgPSAicmVkIiwgbHdkID0gMywgbHR5ID0gMikgI3RoaXMgY3JlYXRlIGEgc2NhcnR0ZXJwbG90IHdpdGggY29sb3IgZm9yIHRoZSBkYXRhIHNldHMgaW5kaWNhdGUgYWJvdmUKYGBgCgoKYGBge3J9CnByZWRpY3QoaW5zX21vZGVsMiwKICAgICAgICBkYXRhLmZyYW1lKGFnZSA9IDMwLCBhZ2UyID0gMzBeMiwgY2hpbGRyZW4gPSAyLAogICAgICAgICAgICAgICAgICAgYm1pID0gMzAsIHNleCA9ICJtYWxlIiwgYm1pMzAgPSAxLAogICAgICAgICAgICAgICAgICAgc21va2VyID0gIm5vIiwgcmVnaW9uID0gIm5vcnRoZWFzdCIpKQpgYGAKCgpgYGB7cn0KcHJlZGljdChpbnNfbW9kZWwyLAogICAgICAgIGRhdGEuZnJhbWUoYWdlID0gMzAsIGFnZTIgPSAzMF4yLCBjaGlsZHJlbiA9IDIsCiAgICAgICAgICAgICAgICAgICBibWkgPSAzMCwgc2V4ID0gImZlbWFsZSIsIGJtaTMwID0gMSwKICAgICAgICAgICAgICAgICAgIHNtb2tlciA9ICJubyIsIHJlZ2lvbiA9ICJub3J0aGVhc3QiKSkKYGBgCgoKYGBge3J9CnByZWRpY3QoaW5zX21vZGVsMiwKICAgICAgICBkYXRhLmZyYW1lKGFnZSA9IDMwLCBhZ2UyID0gMzBeMiwgY2hpbGRyZW4gPSAwLAogICAgICAgICAgICAgICAgICAgYm1pID0gMzAsIHNleCA9ICJmZW1hbGUiLCBibWkzMCA9IDEsCiAgICAgICAgICAgICAgICAgICBzbW9rZXIgPSAibm8iLCByZWdpb24gPSAibm9ydGhlYXN0IikpCmBgYAoKCmBgYHtyfQojIHNldCB1cCB0aGUgZGF0YQp0ZWUgPC0gYygxLCAxLCAxLCAyLCAyLCAzLCA0LCA1LCA1LCA2LCA2LCA3LCA3LCA3LCA3KQphdDEgPC0gYygxLCAxLCAxLCAyLCAyLCAzLCA0LCA1LCA1KQphdDIgPC0gYyg2LCA2LCA3LCA3LCA3LCA3KQpidDEgPC0gYygxLCAxLCAxLCAyLCAyLCAzLCA0KQpidDIgPC0gYyg1LCA1LCA2LCA2LCA3LCA3LCA3LCA3KQpgYGAKCgpgYGB7cn0KIyBjb21wdXRlIHRoZSBTRFIKc2RyX2EgPC0gc2QodGVlKSAtIChsZW5ndGgoYXQxKSAvIGxlbmd0aCh0ZWUpICogc2QoYXQxKSArIGxlbmd0aChhdDIpIC8gbGVuZ3RoKHRlZSkgKiBzZChhdDIpKQpzZHJfYiA8LSBzZCh0ZWUpIC0gKGxlbmd0aChidDEpIC8gbGVuZ3RoKHRlZSkgKiBzZChidDEpICsgbGVuZ3RoKGJ0MikgLyBsZW5ndGgodGVlKSAqIHNkKGJ0MikpCmBgYAoKCgpgYGB7cn0KIyBjb21wYXJlIHRoZSBTRFIgZm9yIGVhY2ggc3BsaXQKc2RyX2EKYGBgCgoKYGBge3J9CnNkcl9iCmBgYAoK