library(readxl)
# ---- Load the insurance workbook and take a first look ----
# read_excel() (readxl) returns the sheet as a tibble.
xlsx_path <- "~/Desktop/R-dir/R studying/dataset/Insurance dataset.xlsx"
df <- read_excel(path = xlsx_path)

# Rows x columns.
dim(df)
## [1] 1338 7

# First six records.
head(df, n = 6)
## # A tibble: 6 × 7
## age sex bmi children smoker region charge
## <dbl> <chr> <dbl> <dbl> <chr> <chr> <dbl>
## 1 19 female 27.9 0 yes southwest 16885.
## 2 18 male 33.8 1 no southeast 1726.
## 3 28 male 33 3 no southeast 4449.
## 4 33 male 22.7 0 no northwest 21984.
## 5 32 male 28.9 0 no northwest 3867.
## 6 31 female 25.7 0 no southeast 3757.

# Last six records.
tail(df, n = 6)
## # A tibble: 6 × 7
## age sex bmi children smoker region charge
## <dbl> <chr> <dbl> <dbl> <chr> <chr> <dbl>
## 1 52 female 44.7 3 no southwest 11412.
## 2 50 male 31.0 3 no northwest 10601.
## 3 18 female 31.9 0 no northeast 2206.
## 4 18 female 36.8 0 no southeast 1630.
## 5 21 female 25.8 0 no southwest 2008.
## 6 61 female 29.1 0 yes northwest 29141.

# Column names, then the type of every column (four numeric, three character).
names(df)
## [1] "age" "sex" "bmi" "children" "smoker" "region" "charge"
str(df)
## tibble [1,338 × 7] (S3: tbl_df/tbl/data.frame)
## $ age : num [1:1338] 19 18 28 33 32 31 46 37 37 60 ...
## $ sex : chr [1:1338] "female" "male" "male" "male" ...
## $ bmi : num [1:1338] 27.9 33.8 33 22.7 28.9 ...
## $ children: num [1:1338] 0 1 3 0 0 0 1 3 2 0 ...
## $ smoker : chr [1:1338] "yes" "no" "no" "no" ...
## $ region : chr [1:1338] "southwest" "southeast" "southeast" "northwest" ...
## $ charge : num [1:1338] 16885 1726 4449 21984 3867 ...
# ---- Response variable ----
# Histogram of the raw charge values.
hist(df$charge)
# Model response: natural log of charge (log() defaults to base e). Every
# model below is fitted to y, so their errors are on the log(charge) scale.
# NOTE(review): the transform is presumably meant to reduce skew in charge;
# confirm by comparing the two histograms.
df$y <- log(df$charge)
# Histogram of the transformed response.
hist(df$y)
# ---- Linear model on the full data set ----
# Regress y (log of charge) on all six predictors using every row of df
# (1329 residual df + 9 coefficients = 1338 observations).
con_fit <- lm(y ~ age + sex + bmi + children + smoker + region, data = df)
# Printing the fit shows the call and the estimated coefficients. The
# character predictors (sex, smoker, region) appear as dummy variables:
# sexmale, smokeryes and three region terms.
con_fit
##
## Call:
## lm(formula = y ~ age + sex + bmi + children + smoker + region,
## data = df)
##
## Coefficients:
## (Intercept) age sexmale bmi
## 7.03056 0.03458 -0.07542 0.01337
## children smokeryes regionnorthwest regionsoutheast
## 0.10186 1.55432 -0.06379 -0.15720
## regionsouthwest
## -0.12895
# summary() adds residual quantiles, standard errors, t tests, the residual
# standard error, R-squared and the overall F test.
summary(con_fit)
##
## Call:
## lm(formula = y ~ age + sex + bmi + children + smoker + region,
## data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.07186 -0.19835 -0.04917 0.06598 2.16636
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.0305581 0.0723960 97.112 < 2e-16 ***
## age 0.0345816 0.0008721 39.655 < 2e-16 ***
## sexmale -0.0754164 0.0244012 -3.091 0.002038 **
## bmi 0.0133748 0.0020960 6.381 2.42e-10 ***
## children 0.1018568 0.0100995 10.085 < 2e-16 ***
## smokeryes 1.5543228 0.0302795 51.333 < 2e-16 ***
## regionnorthwest -0.0637876 0.0349057 -1.827 0.067860 .
## regionsoutheast -0.1571967 0.0350828 -4.481 8.08e-06 ***
## regionsouthwest -0.1289522 0.0350271 -3.681 0.000241 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4443 on 1329 degrees of freedom
## Multiple R-squared: 0.7679, Adjusted R-squared: 0.7666
## F-statistic: 549.8 on 8 and 1329 DF, p-value: < 2.2e-16
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
library(rsample)
# ---- Hold-out validation: 70/30 train/test split ----
# Fix the RNG seed so the random partition below is reproducible.
set.seed(123)
# createDataPartition() samples row numbers based on the outcome y; p = 0.7
# keeps about 70% of the rows for training. With list = FALSE the result is a
# one-column matrix of row numbers rather than a list.
index <- createDataPartition(df$y, p = 0.7, list = FALSE)
# Flatten the matrix so rows are selected with a plain integer vector instead
# of relying on the data frame class accepting a matrix as a row subscript.
train_rows <- as.vector(index)
training <- df[train_rows, ]
testing <- df[-train_rows, ]
# Training model: same specification as the full-data fit, training rows only.
ml_fit <- lm(y ~ age + sex + bmi + children + smoker + region, data = training)
summary(ml_fit)
##
## Call:
## lm(formula = y ~ age + sex + bmi + children + smoker + region,
## data = training)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.93553 -0.19240 -0.04688 0.07042 2.11652
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.055548 0.085161 82.850 < 2e-16 ***
## age 0.034188 0.001044 32.748 < 2e-16 ***
## sexmale -0.097856 0.029250 -3.346 0.000854 ***
## bmi 0.013111 0.002491 5.264 1.75e-07 ***
## children 0.100087 0.011996 8.343 2.59e-16 ***
## smokeryes 1.558164 0.035935 43.361 < 2e-16 ***
## regionnorthwest -0.063310 0.041895 -1.511 0.131090
## regionsoutheast -0.154463 0.041546 -3.718 0.000213 ***
## regionsouthwest -0.110911 0.041806 -2.653 0.008115 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.444 on 929 degrees of freedom
## Multiple R-squared: 0.7735, Adjusted R-squared: 0.7715
## F-statistic: 396.6 on 8 and 929 DF, p-value: < 2.2e-16
# Model testing: score the held-out rows with the model fitted on training.
testing$predicted <- predict(ml_fit, newdata = testing)
# Prediction error on the log(charge) scale, since y = log(charge).
error <- testing$predicted - testing$y
sqrt(mean(error^2)) # --> RMSE (in log(charge) units)
## [1] 0.4458804
# Squared Pearson correlation between predicted and observed y, used here as
# the R-squared of the hold-out predictions. On held-out data this is not
# guaranteed to equal 1 - SSE/SST.
cor(testing$predicted, testing$y)^2 # --> R-squared (squared correlation)
## [1] 0.7535671
# ---- Resampling with caret: 10-fold cross-validation ----
# trainControl() describes the resampling scheme; train() refits the linear
# model on each resample and reports the averaged hold-out metrics.
cv_control <- trainControl(method = "cv", number = 10)
fit <- train(
  form = y ~ age + sex + bmi + children + smoker + region,
  data = df,
  method = "lm",
  trControl = cv_control
)
# Cross-validated RMSE, R-squared and MAE (on the log(charge) scale).
fit
## Linear Regression
##
## 1338 samples
## 6 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 1204, 1204, 1205, 1203, 1203, 1204, ...
## Resampling results:
##
## RMSE Rsquared MAE
## 0.4441837 0.7659793 0.279278
##
## Tuning parameter 'intercept' was held constant at a value of TRUE
# Coefficient table of the final model, which caret fits on all 1338 rows; the
# estimates match the full-data lm() fit.
summary(fit)
##
## Call:
## lm(formula = .outcome ~ ., data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.07186 -0.19835 -0.04917 0.06598 2.16636
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.0305581 0.0723960 97.112 < 2e-16 ***
## age 0.0345816 0.0008721 39.655 < 2e-16 ***
## sexmale -0.0754164 0.0244012 -3.091 0.002038 **
## bmi 0.0133748 0.0020960 6.381 2.42e-10 ***
## children 0.1018568 0.0100995 10.085 < 2e-16 ***
## smokeryes 1.5543228 0.0302795 51.333 < 2e-16 ***
## regionnorthwest -0.0637876 0.0349057 -1.827 0.067860 .
## regionsoutheast -0.1571967 0.0350828 -4.481 8.08e-06 ***
## regionsouthwest -0.1289522 0.0350271 -3.681 0.000241 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4443 on 1329 degrees of freedom
## Multiple R-squared: 0.7679, Adjusted R-squared: 0.7666
## F-statistic: 549.8 on 8 and 1329 DF, p-value: < 2.2e-16
# ---- Resampling with caret: bootstrap, 100 replicates ----
# Same model and data, evaluated with bootstrap resamples instead of folds.
# The result replaces the cross-validated object previously stored in fit.
boot_control <- trainControl(method = "boot", number = 100)
fit <- train(
  form = y ~ age + sex + bmi + children + smoker + region,
  data = df,
  method = "lm",
  trControl = boot_control
)
# Bootstrap estimates of RMSE, R-squared and MAE (on the log(charge) scale).
fit
## Linear Regression
##
## 1338 samples
## 6 predictor
##
## No pre-processing
## Resampling: Bootstrapped (100 reps)
## Summary of sample sizes: 1338, 1338, 1338, 1338, 1338, 1338, ...
## Resampling results:
##
## RMSE Rsquared MAE
## 0.4505238 0.7608158 0.2835807
##
## Tuning parameter 'intercept' was held constant at a value of TRUE
# Source: Prof. Tuan, "ML - Linear regression":
# https://www.youtube.com/watch?v=VVtXuX4knv4&list=PLbRKZL7ww3qhPFZAIhusefzLbQFg6nyT4&index=9