1. Import dataset

library(readxl)
# Read the insurance workbook into `df`; every later step works on this object.
df <- read_excel(
  path = "~/Desktop/R-dir/R studying/dataset/Insurance dataset.xlsx"
)

2. Explore dataset

# Size of the data: number of rows, then number of columns.
dim(df)
## [1] 1338    7
# First six rows.
head(df, n = 6L)
## # A tibble: 6 × 7
##     age sex      bmi children smoker region    charge
##   <dbl> <chr>  <dbl>    <dbl> <chr>  <chr>      <dbl>
## 1    19 female  27.9        0 yes    southwest 16885.
## 2    18 male    33.8        1 no     southeast  1726.
## 3    28 male    33          3 no     southeast  4449.
## 4    33 male    22.7        0 no     northwest 21984.
## 5    32 male    28.9        0 no     northwest  3867.
## 6    31 female  25.7        0 no     southeast  3757.
# Last six rows.
tail(df, n = 6L)
## # A tibble: 6 × 7
##     age sex      bmi children smoker region    charge
##   <dbl> <chr>  <dbl>    <dbl> <chr>  <chr>      <dbl>
## 1    52 female  44.7        3 no     southwest 11412.
## 2    50 male    31.0        3 no     northwest 10601.
## 3    18 female  31.9        0 no     northeast  2206.
## 4    18 female  36.8        0 no     southeast  1630.
## 5    21 female  25.8        0 no     southwest  2008.
## 6    61 female  29.1        0 yes    northwest 29141.
# Column names.
names(df)
## [1] "age"      "sex"      "bmi"      "children" "smoker"   "region"   "charge"
# Type and leading values of every column.
str(df)
## tibble [1,338 × 7] (S3: tbl_df/tbl/data.frame)
##  $ age     : num [1:1338] 19 18 28 33 32 31 46 37 37 60 ...
##  $ sex     : chr [1:1338] "female" "male" "male" "male" ...
##  $ bmi     : num [1:1338] 27.9 33.8 33 22.7 28.9 ...
##  $ children: num [1:1338] 0 1 3 0 0 0 1 3 2 0 ...
##  $ smoker  : chr [1:1338] "yes" "no" "no" "no" ...
##  $ region  : chr [1:1338] "southwest" "southeast" "southeast" "northwest" ...
##  $ charge  : num [1:1338] 16885 1726 4449 21984 3867 ...
# Distribution of the raw charges, before any transformation.
hist(df$charge)

# log() of a zero or negative value yields -Inf/NaN; stop here with a clear
# condition instead of passing such values on to the model fits. Missing
# values are ignored by this check.
stopifnot(all(df$charge > 0, na.rm = TRUE))
# Log-transformed charge; `y` is the response variable of the models fitted
# later in this script.
df$y <- log(df$charge)
hist(df$y)

3. Conventional method

# Ordinary least-squares fit of `y` on all six predictors, estimated on the
# complete data set (no rows held out).
con_fit <- lm(
  formula = y ~ age + sex + bmi + children + smoker + region,
  data = df
)
# Printing the fitted object shows the call and the coefficient estimates.
print(con_fit)
## 
## Call:
## lm(formula = y ~ age + sex + bmi + children + smoker + region, 
##     data = df)
## 
## Coefficients:
##     (Intercept)              age          sexmale              bmi  
##         7.03056          0.03458         -0.07542          0.01337  
##        children        smokeryes  regionnorthwest  regionsoutheast  
##         0.10186          1.55432         -0.06379         -0.15720  
## regionsouthwest  
##        -0.12895
# Full coefficient table plus residual and goodness-of-fit statistics.
summary(con_fit)
## 
## Call:
## lm(formula = y ~ age + sex + bmi + children + smoker + region, 
##     data = df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.07186 -0.19835 -0.04917  0.06598  2.16636 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      7.0305581  0.0723960  97.112  < 2e-16 ***
## age              0.0345816  0.0008721  39.655  < 2e-16 ***
## sexmale         -0.0754164  0.0244012  -3.091 0.002038 ** 
## bmi              0.0133748  0.0020960   6.381 2.42e-10 ***
## children         0.1018568  0.0100995  10.085  < 2e-16 ***
## smokeryes        1.5543228  0.0302795  51.333  < 2e-16 ***
## regionnorthwest -0.0637876  0.0349057  -1.827 0.067860 .  
## regionsoutheast -0.1571967  0.0350828  -4.481 8.08e-06 ***
## regionsouthwest -0.1289522  0.0350271  -3.681 0.000241 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4443 on 1329 degrees of freedom
## Multiple R-squared:  0.7679, Adjusted R-squared:  0.7666 
## F-statistic: 549.8 on 8 and 1329 DF,  p-value: < 2.2e-16

4. ML approach

library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
library(rsample)

4.1 Splitting data

# Fix the RNG state so the random train/test partition is repeatable.
set.seed(123)
# Row indices for the training set: about 70% of the rows, sampled with
# respect to the outcome `y`. list = FALSE returns the indices as a matrix
# instead of a list.
index <- createDataPartition(df$y, p = 0.7, list = FALSE)
training <- df[index, ]
testing <- df[-index, ]
# Training model: same formula as the full-data fit, estimated on the training
# rows only.
ml_fit <- lm(y ~ age + sex + bmi + children + smoker + region, data = training)
summary(ml_fit)
## 
## Call:
## lm(formula = y ~ age + sex + bmi + children + smoker + region, 
##     data = training)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.93553 -0.19240 -0.04688  0.07042  2.11652 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      7.055548   0.085161  82.850  < 2e-16 ***
## age              0.034188   0.001044  32.748  < 2e-16 ***
## sexmale         -0.097856   0.029250  -3.346 0.000854 ***
## bmi              0.013111   0.002491   5.264 1.75e-07 ***
## children         0.100087   0.011996   8.343 2.59e-16 ***
## smokeryes        1.558164   0.035935  43.361  < 2e-16 ***
## regionnorthwest -0.063310   0.041895  -1.511 0.131090    
## regionsoutheast -0.154463   0.041546  -3.718 0.000213 ***
## regionsouthwest -0.110911   0.041806  -2.653 0.008115 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.444 on 929 degrees of freedom
## Multiple R-squared:  0.7735, Adjusted R-squared:  0.7715 
## F-statistic: 396.6 on 8 and 929 DF,  p-value: < 2.2e-16
# Model testing: score the held-out rows with the model fitted on `training`.
testing$predicted <- predict(ml_fit, newdata = testing)
error <- testing$predicted - testing$y
# Root mean squared error of the predictions. `y` is log(charge), so this is
# on the log scale.
sqrt(mean(error^2))
## [1] 0.4458804
# Squared Pearson correlation between predicted and observed `y`. It is often
# reported as R-squared for held-out data, but it is not the same quantity as
# 1 - SSE/SST and should not be read as the classical coefficient of
# determination.
cor(testing$predicted, testing$y)^2
## [1] 0.7535671

4.2 K-fold cross-validation approach

# 10-fold cross-validation of the linear model via caret. trainControl()
# describes the resampling scheme; train() fits the model under that scheme.
# Fold assignment is random, so the resampling results depend on the RNG state
# at this point in the script.
cv_control <- trainControl(method = "cv", number = 10)
fit <- train(
  form = y ~ age + sex + bmi + children + smoker + region,
  data = df,
  method = "lm",
  trControl = cv_control
)
print(fit)
## Linear Regression 
## 
## 1338 samples
##    6 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 1204, 1204, 1205, 1203, 1203, 1204, ... 
## Resampling results:
## 
##   RMSE       Rsquared   MAE     
##   0.4441837  0.7659793  0.279278
## 
## Tuning parameter 'intercept' was held constant at a value of TRUE
# Summary of the model caret stored after resampling.
summary(fit)
## 
## Call:
## lm(formula = .outcome ~ ., data = dat)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.07186 -0.19835 -0.04917  0.06598  2.16636 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      7.0305581  0.0723960  97.112  < 2e-16 ***
## age              0.0345816  0.0008721  39.655  < 2e-16 ***
## sexmale         -0.0754164  0.0244012  -3.091 0.002038 ** 
## bmi              0.0133748  0.0020960   6.381 2.42e-10 ***
## children         0.1018568  0.0100995  10.085  < 2e-16 ***
## smokeryes        1.5543228  0.0302795  51.333  < 2e-16 ***
## regionnorthwest -0.0637876  0.0349057  -1.827 0.067860 .  
## regionsoutheast -0.1571967  0.0350828  -4.481 8.08e-06 ***
## regionsouthwest -0.1289522  0.0350271  -3.681 0.000241 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4443 on 1329 degrees of freedom
## Multiple R-squared:  0.7679, Adjusted R-squared:  0.7666 
## F-statistic: 549.8 on 8 and 1329 DF,  p-value: < 2.2e-16

4.3 Bootstrap validation

# Bootstrap validation of the same linear model: 100 bootstrap resamples
# instead of cross-validation folds. Note that this reassigns `fit`, replacing
# any object previously stored under that name.
boot_control <- trainControl(method = "boot", number = 100)
fit <- train(
  form = y ~ age + sex + bmi + children + smoker + region,
  data = df,
  method = "lm",
  trControl = boot_control
)
print(fit)
## Linear Regression 
## 
## 1338 samples
##    6 predictor
## 
## No pre-processing
## Resampling: Bootstrapped (100 reps) 
## Summary of sample sizes: 1338, 1338, 1338, 1338, 1338, 1338, ... 
## Resampling results:
## 
##   RMSE       Rsquared   MAE      
##   0.4505238  0.7608158  0.2835807
## 
## Tuning parameter 'intercept' was held constant at a value of TRUE

Reference: Prof. Tuan, "ML - Linear regression": https://www.youtube.com/watch?v=VVtXuX4knv4&list=PLbRKZL7ww3qhPFZAIhusefzLbQFg6nyT4&index=9