Idata <- read.csv("insurance.csv")
head(Idata)
sum(is.na(Idata))
## [1] 0
## Identify NAs in full data frame
##To compute the total missing values in each column is to use colSums()
#colSums(is.na(Tykodata))
colSums(is.na(Idata))
## age sex bmi children smoker region charges
## 0 0 0 0 0 0 0
##Simple ways to deal with missing value https://uc-r.github.io/missing_values
###No Missing Data. Now the data is ready for analysis.
Now we partition the data into train and test data
# partition data
set.seed(99) # set seed for reproducing the partition
train.index <- sample(c(1:1338), 1000)
train_dataset <- Idata[train.index, ]
test_dataset <- Idata[-train.index, ]
insurance.lm <- lm(charges~ ., data = train_dataset)
# use options() to ensure numbers are not displayed in scientific notation e.g. if possible avoids term like 1.3e+20
# options(scipen = 999)
summary(insurance.lm)
##
## Call:
## lm(formula = charges ~ ., data = train_dataset)
##
## Residuals:
## Min 1Q Median 3Q Max
## -11562.7 -2791.5 -974.4 1528.9 29513.6
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -12113.90 1159.02 -10.452 <2e-16 ***
## age 261.40 13.98 18.694 <2e-16 ***
## sexmale -156.99 386.98 -0.406 0.6851
## bmi 343.47 33.64 10.210 <2e-16 ***
## children 405.84 161.02 2.520 0.0119 *
## smokeryes 24218.66 474.47 51.044 <2e-16 ***
## regionnorthwest -457.94 549.22 -0.834 0.4046
## regionsoutheast -1059.07 558.02 -1.898 0.0580 .
## regionsouthwest -1126.42 548.77 -2.053 0.0404 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6059 on 991 degrees of freedom
## Multiple R-squared: 0.7611, Adjusted R-squared: 0.7591
## F-statistic: 394.6 on 8 and 991 DF, p-value: < 2.2e-16
library(forecast)
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
# use predict() to make predictions on a new set.
insurance.lm.pred <- predict(insurance.lm, test_dataset)
options(scipen=999, digits = 0)
some.residuals <- test_dataset$charges[1:20] - insurance.lm.pred[1:20]
data.frame("Predicted" = insurance.lm.pred[1:20], "Actual" = test_dataset$charges[1:20],
"Residual" = some.residuals)
options(scipen=999, digits = 3)
# use accuracy() to compute common accuracy measures.
accuracy(insurance.lm.pred, test_dataset$charges)
## ME RMSE MAE MPE MAPE
## Test set -189 6086 4174 -16.1 42.7
#MAPE is calculated as (actual - predicted) / abs(actual). May give error / NaN if denominator is near zero.
#Interpretation
library(forecast)
all.residuals <- test_dataset$charges - insurance.lm.pred
hist(all.residuals, breaks = 25, xlab = "Residuals", main = "")
#boxplot(all.residuals)
library(car)
## Loading required package: carData
Boxplot(all.residuals,id.method="all.residuals")
## [1] 61 236 196 85 240 265 200 222 250 38 57 123 248 227 246 96 279 212 80
## [20] 232
# use regsubsets() in package leaps to run an exhaustive search.
library(leaps) #It does not work with categorical variables. Covert them into factors before using this function
# create dummies for fuel type
search <- regsubsets(charges ~ ., data = train_dataset, nbest = 1, nvmax = dim(train_dataset)[2],
method = "exhaustive")
sum <- summary(search)
# show models
sum$which
## (Intercept) age sexmale bmi children smokeryes regionnorthwest
## 1 TRUE FALSE FALSE FALSE FALSE TRUE FALSE
## 2 TRUE TRUE FALSE FALSE FALSE TRUE FALSE
## 3 TRUE TRUE FALSE TRUE FALSE TRUE FALSE
## 4 TRUE TRUE FALSE TRUE TRUE TRUE FALSE
## 5 TRUE TRUE FALSE TRUE TRUE TRUE FALSE
## 6 TRUE TRUE FALSE TRUE TRUE TRUE FALSE
## 7 TRUE TRUE FALSE TRUE TRUE TRUE TRUE
## regionsoutheast regionsouthwest
## 1 FALSE FALSE
## 2 FALSE FALSE
## 3 FALSE FALSE
## 4 FALSE FALSE
## 5 FALSE TRUE
## 6 TRUE TRUE
## 7 TRUE TRUE
#show metrics
sum$rsq
## [1] 0.636 0.733 0.758 0.760 0.760 0.761 0.761
sum$adjr2
## [1] 0.636 0.732 0.757 0.759 0.759 0.759 0.759
sum$cp
## [1] 512.27 114.28 11.29 6.72 6.81 5.86 7.16
#Interpretation
##The optimal is reached when cp is less than no. of prectors+1. In this case this is reached when 5 predictors are selected and cp is 5.86.
Checking how the selected model performs ### 9. Using the Step-wise regression methos
# use step() to run stepwise regression.
# set directions = to either "backward", "forward", or "both".
insurance.lm.step <- step(insurance.lm, direction = "both")
## Start: AIC=17427
## charges ~ age + sex + bmi + children + smoker + region
##
## Df Sum of Sq RSS AIC
## - sex 1 6041404 36382937507 17426
## - region 3 202250806 36579146910 17427
## <none> 36376896103 17427
## - children 1 233184055 36610080158 17432
## - bmi 1 3826779343 40203675446 17525
## - age 1 12828433508 49205329612 17728
## - smoker 1 95638557125 132015453228 18714
##
## Step: AIC=17426
## charges ~ age + bmi + children + smoker + region
##
## Df Sum of Sq RSS AIC
## - region 3 204000119 36586937626 17425
## <none> 36382937507 17426
## + sex 1 6041404 36376896103 17427
## - children 1 232372186 36615309693 17430
## - bmi 1 3823612316 40206549823 17524
## - age 1 12838001261 49220938768 17726
## - smoker 1 96573629426 132956566934 18720
##
## Step: AIC=17425
## charges ~ age + bmi + children + smoker
##
## Df Sum of Sq RSS AIC
## <none> 36586937626 17425
## + region 3 204000119 36382937507 17426
## + sex 1 7790716 36579146910 17427
## - children 1 241172269 36828109894 17430
## - bmi 1 3835578877 40422516502 17523
## - age 1 13052634443 49639572069 17728
## - smoker 1 96646251325 133233188951 18716
summary(insurance.lm.step) # Which variables did it drop?
##
## Call:
## lm(formula = charges ~ age + bmi + children + smoker, data = train_dataset)
##
## Residuals:
## Min 1Q Median 3Q Max
## -12265 -2785 -912 1485 29065
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -12377.6 1107.9 -11.17 <0.0000000000000002 ***
## age 263.1 14.0 18.84 <0.0000000000000002 ***
## bmi 325.5 31.9 10.21 <0.0000000000000002 ***
## children 411.9 160.8 2.56 0.011 *
## smokeryes 24183.8 471.7 51.27 <0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6060 on 995 degrees of freedom
## Multiple R-squared: 0.76, Adjusted R-squared: 0.759
## F-statistic: 786 on 4 and 995 DF, p-value: <0.0000000000000002
insurance.lm.step.pred <- predict(insurance.lm.step, test_dataset)
accuracy(insurance.lm.step.pred, test_dataset$charges)
## ME RMSE MAE MPE MAPE
## Test set -203 6091 4174 -16.3 42.8
#Interretation
To reconfirm that best fit is achieved by retaining 5 predictors and not 4 we again use lm() for 5 predictors.
insurance.lm1 <- lm(charges~ age + bmi + smoker + children + region , data = train_dataset)
# use options() to ensure numbers are not displayed in scientific notation e.g. if possible avoids term like 1.3e+20
# options(scipen = 999)
summary(insurance.lm1)
##
## Call:
## lm(formula = charges ~ age + bmi + smoker + children + region,
## data = train_dataset)
##
## Residuals:
## Min 1Q Median 3Q Max
## -11632 -2795 -962 1560 29455
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -12162.8 1152.2 -10.56 <0.0000000000000002 ***
## age 261.5 14.0 18.71 <0.0000000000000002 ***
## bmi 342.6 33.5 10.21 <0.0000000000000002 ***
## smokeryes 24198.1 471.6 51.31 <0.0000000000000002 ***
## children 405.1 160.9 2.52 0.012 *
## regionnorthwest -457.6 549.0 -0.83 0.405
## regionsoutheast -1063.2 557.7 -1.91 0.057 .
## regionsouthwest -1130.2 548.5 -2.06 0.040 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6060 on 992 degrees of freedom
## Multiple R-squared: 0.761, Adjusted R-squared: 0.759
## F-statistic: 451 on 7 and 992 DF, p-value: <0.0000000000000002
y_test <- test_dataset$charges
yhat_test <- predict(insurance.lm1, newdata = test_dataset)
n_test <- length(test_dataset$charges)
# test RMSE
rmse <- sqrt(sum((y_test - yhat_test)^2) / n_test)
rmse
## [1] 6085