In regression, highly skewed data result in poor fitting. When all values are positive, a log transformation is often used to normalize highly skewed data.

insurance$lcharges <- log(insurance$charges)
summary(insurance$lcharges)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   7.023   8.464   9.147   9.099   9.720  11.063
par(mfrow = c(1,2))
hist(insurance$lcharges, main = "Histogram of log(charges)", col = "blue")
plot(density(insurance$lcharges), main = "Density plot of log(charges)")

polygon(density(insurance$lcharges), col = "orange")

The log transformation gives a more symmetric distribution.

#Other variables

Three factor variables: sex, smoker, region

par(mfrow = c(1,3))
barplot(table(insurance$sex), main = "Sex")
barplot(table(insurance$smoker), main = "Smoker")
barplot(table(insurance$region), main = "Region")

Three numeric variables: age, bmi, children

par(mfrow =c(1,3))
hist(insurance$age, main = "Histogram of age", col = "lightblue")
hist(insurance$bmi, main = "Histogram of bmi", col = "lightgreen")
hist(insurance$children, main = "Histogram of children", col = "red")

#check for outliers

par(mfrow = c(1,3))

boxplot(insurance$age, main = "Boxplot of age")
boxplot(insurance$bmi, main = "Boxplot of bmi")
boxplot(insurance$children, main = "Boxplot of children")

Only bmi shows outliers in the boxplots, so we remove them.

#function to remove rows containing outliers (1.5 * IQR rule) in any numeric column
outlier_remover <- function(a){
  df <- a
  aa <- c()          #row indices flagged as outliers
  count <- 1
  for(i in 1:ncol(df)){
    if(is.numeric(df[,i])){
      Q3 <- quantile(df[,i], 0.75, na.rm = TRUE)
      Q1 <- quantile(df[,i], 0.25, na.rm = TRUE)
      IQR <- Q3 - Q1
      upper <- Q3 + 1.5 * IQR
      lower <- Q1 - 1.5 * IQR
      for (j in 1:nrow(df)){
        if(is.na(df[j,i])) {
          next
        }
        else if(df[j,i] > upper | df[j,i] < lower){
          aa[count] <- j
          count <- count + 1
        }
      }
    }
  }
  if(length(aa) > 0) df <- df[-unique(aa), ]   #drop flagged rows only if any were found
  df
}
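A quick usage sketch of the function (the object name below is just illustrative):

insurance_no_outliers <- outlier_remover(insurance)
nrow(insurance) - nrow(insurance_no_outliers)   #number of rows flagged and dropped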

#check for duplicated rows and remove them
insure <- unique(insurance)
#check duplicates again
# insure[duplicated(insure),]

After removing the duplicated row, the data set is reduced from 1338 to 1337 observations.
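A quick check of the row counts (illustrative):

nrow(insurance)   #rows before
nrow(insure)      #rows after dropping the duplicated row, matching describe() below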

#Check correlation between the independent (predictor) variables

cor(insure[c("age", "bmi", "children")])
##                 age        bmi   children
## age      1.00000000 0.10934361 0.04153621
## bmi      0.10934361 1.00000000 0.01275466
## children 0.04153621 0.01275466 1.00000000

#create scatterplot matrix

library(psych)

pairs.panels(insure[c("age", "sex", "bmi", "children", "smoker", "region")], digits = 2, cor = TRUE, main = "Insurance scatter plot Matrix")

describe(insure)
##          vars    n     mean       sd  median  trimmed     mad     min      max
## age         1 1337    39.22    14.04   39.00    39.02   17.79   18.00    64.00
## sex*        2 1337     1.50     0.50    2.00     1.51    0.00    1.00     2.00
## bmi         3 1337    30.66     6.10   30.40    30.50    6.20   15.96    53.13
## children    4 1337     1.10     1.21    1.00     0.94    1.48    0.00     5.00
## smoker*     5 1337     1.20     0.40    1.00     1.13    0.00    1.00     2.00
## region*     6 1337     2.52     1.11    3.00     2.52    1.48    1.00     4.00
## charges     7 1337 13279.12 12110.36 9386.16 11084.18 7425.45 1121.87 63770.43
## lcharges    8 1337     9.10     0.92    9.15     9.11    0.96    7.02    11.06
##             range  skew kurtosis     se
## age         46.00  0.05    -1.25   0.38
## sex*         1.00 -0.02    -2.00   0.01
## bmi         37.17  0.28    -0.06   0.17
## children     5.00  0.94     0.19   0.03
## smoker*      1.00  1.46     0.13   0.01
## region*      3.00 -0.04    -1.33   0.03
## charges  62648.55  1.51     1.59 331.20
## lcharges     4.04 -0.09    -0.64   0.03

Based on the scatterplot matrix above, there is no multicollinearity: none of the pairwise correlations among age, sex, bmi, children, smoker, and region are particularly strong (r < 0.2 in each case).

[A pairwise correlation above 0.7 is usually taken as a sign of multicollinearity.] In the scatterplot matrix, the red dot marks the mean, the red line is a loess curve, and the correlation ellipse shows the strength of the relationship: the rounder the ellipse, the weaker the correlation; the more elongated, the stronger.
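As a quick numeric check of the 0.7 rule of thumb, we can look at the largest absolute pairwise correlation among the numeric predictors (illustrative sketch):

cors <- cor(insure[c("age", "bmi", "children")])
max(abs(cors[upper.tri(cors)]))   #well below 0.7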

#Model Building: Simple Linear Regression

s1_model <- lm(lcharges ~ age, data = insure)
summary(s1_model)
## 
## Call:
## lm(formula = lcharges ~ age, data = insure)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.3457 -0.4161 -0.3094  0.5015  2.1976 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 7.747999   0.063389  122.23   <2e-16 ***
## age         0.034469   0.001522   22.65   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7811 on 1335 degrees of freedom
## Multiple R-squared:  0.2777, Adjusted R-squared:  0.2771 
## F-statistic: 513.1 on 1 and 1335 DF,  p-value: < 2.2e-16

The prediction equation is lcharges = 7.75 + 0.034 * age. From the Pr(>|t|) column, we see that the regression coefficient (0.034) is significantly different from zero (p < 0.001), indicating an expected increase of about 0.034 in log(charges) for every one-year increase in age.

The multiple R-squared (0.2777) indicates that the model accounts for about 28% of the variance in log(charges). The multiple R-squared is also the squared correlation between the actual and predicted values.
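We can verify that claim directly (illustrative check):

cor(insure$lcharges, fitted(s1_model))^2   #matches the Multiple R-squared of about 0.28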

The residual standard error (0.7811) can be thought of as the average error in predicting log(charges) from age using this model.

The F-statistic tests whether the predictor variables, taken together, predict the response variable above chance levels.
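As a small illustration (not part of the original analysis), the fitted equation can be used to predict charges for a hypothetical 30-year-old, back-transforming from the log scale:

new_person <- data.frame(age = 30)
pred_log <- predict(s1_model, newdata = new_person)   #predicted log(charges)
exp(pred_log)                                         #predicted charges on the original dollar scale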

#Multiple Linear Regression

mul_model <- lm(lcharges ~ age + sex + bmi + children + smoker + region, data = insure)

summary(mul_model)
## 
## Call:
## lm(formula = lcharges ~ age + sex + bmi + children + smoker + 
##     region, data = insure)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.07416 -0.19619 -0.04969  0.06655  2.16494 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      7.0314797  0.0723842  97.141  < 2e-16 ***
## age              0.0345390  0.0008725  39.585  < 2e-16 ***
## sexmale         -0.0745638  0.0244054  -3.055  0.00229 ** 
## bmi              0.0134013  0.0020957   6.395 2.22e-10 ***
## children         0.1015405  0.0101005  10.053  < 2e-16 ***
## smokeryes        1.5537619  0.0302763  51.319  < 2e-16 ***
## regionnorthwest -0.0620490  0.0349257  -1.777  0.07586 .  
## regionsoutheast -0.1573100  0.0350753  -4.485 7.92e-06 ***
## regionsouthwest -0.1289664  0.0350196  -3.683  0.00024 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4442 on 1328 degrees of freedom
## Multiple R-squared:  0.7676, Adjusted R-squared:  0.7662 
## F-statistic: 548.4 on 8 and 1328 DF,  p-value: < 2.2e-16

The fitted equation is lcharges = 7.03 + 0.035 * age - 0.075 * male + 0.013 * bmi + 0.102 * children + 1.554 * smoker - 0.062 * northwest - 0.157 * southeast - 0.129 * southwest.

When there is more than one independent variable, the regression coefficients indicate the increase in the dependent variable for a unit change in a predictor variable, holding all other independent variables constant. For example, the regression coefficient for bmi is 0.013, suggesting that an increase of 1 in bmi is associated with a 0.013 increase in log(charges), controlling for age, sex, children, and the other predictors.

Similarly, each additional child is associated with a 0.10 increase in log(charges), being a smoker with a 1.55 increase, living in the northwest with a 0.06 decrease, in the southeast with a 0.16 decrease, and in the southwest with a 0.13 decrease.

We can also see that regionnorthwest is not significantly different from zero (p > 0.05), suggesting that, controlling for the other independent variables, the northwest region is not linearly related to log(charges). [The significance stars indicate how important each independent variable is.]
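Because the response is log(charges), the coefficients are additive on the log scale; exponentiating them gives multiplicative effects on the original dollar scale (illustrative check):

exp(coef(mul_model)["smokeryes"])   #smokers' charges are roughly 4.7 times those of non-smokers, other things equal
exp(coef(mul_model)["bmi"])         #each additional unit of bmi multiplies charges by about 1.013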

#Choose the important variables: Relative Importance of Variables

library(relaimpo)

#relative_importance <- calc.relimp(mul_model, type = "lmg", rela = TRUE)
#sort(relative_importance$lmg, decreasing = TRUE)

#Stepwise regression

Stepwise regression provides a number of stopping rules for selecting the best subset of variables for our model. In stepwise selection, variables are added to or deleted from the model one at a time, until some stopping criterion is reached.

Forward stepwise and backward stepwise selection follow the same principle: forward stepwise starts from an empty model and adds variables one at a time, while backward stepwise starts from the full model and removes them.

#backward stepwise
mul_model1 <- lm(lcharges ~ age + sex + bmi + children + smoker + region, data = insure)

mul_model2 <- lm(lcharges ~ age + sex + bmi + children, data = insure)

mul_model3 <- lm(lcharges ~ age + sex + bmi, data = insure)

mul_model4 <- lm(lcharges ~ age + sex, data = insure)

library(rcompanion)
com_mod <- compareLM(mul_model1, mul_model2, mul_model3, mul_model4)
com_mod
## $Models
##   Formula                                                  
## 1 "lcharges ~ age + sex + bmi + children + smoker + region"
## 2 "lcharges ~ age + sex + bmi + children"                  
## 3 "lcharges ~ age + sex + bmi"                             
## 4 "lcharges ~ age + sex"                                   
## 
## $Fit.criteria
##   Rank Df.res  AIC AICc  BIC R.squared Adj.R.sq    p.value Shapiro.W Shapiro.p
## 1    9   1328 1635 1635 1687    0.7676   0.7662  0.000e+00    0.8374 7.148e-35
## 2    5   1332 3097 3097 3128    0.3025   0.3004 1.240e-102    0.8378 7.709e-35
## 3    4   1333 3131 3131 3157    0.2836   0.2820  4.640e-96    0.8687 4.606e-32
## 4    3   1334 3139 3139 3160    0.2780   0.2769  4.588e-95    0.8572 3.744e-33
summary(mul_model3)
## 
## Call:
## lm(formula = lcharges ~ age + sex + bmi, data = insure)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.5881 -0.4469 -0.2879  0.5331  2.0945 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 7.407317   0.120579  61.431  < 2e-16 ***
## age         0.033947   0.001526  22.244  < 2e-16 ***
## sexmale     0.025332   0.042642   0.594  0.55257    
## bmi         0.011361   0.003517   3.231  0.00127 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7785 on 1333 degrees of freedom
## Multiple R-squared:  0.2836, Adjusted R-squared:  0.282 
## F-statistic: 175.9 on 3 and 1333 DF,  p-value: < 2.2e-16

We usually compare models using BIC, AIC, or AICc.

BIC (Bayesian Information Criterion) – smaller is better. AIC (Akaike’s Information Criterion) – smaller is better. AICc (AIC with a correction for small sample sizes) – smaller is better. These rules (minimum BIC/AIC/AICc) attempt to explain the relationship between the predictors and the response without building models that are overly complex in terms of the number of predictors. Since different criteria are used to decide when to stop adding terms to the model, these stopping rules may lead to different “best” models (see Burnham, 2002).
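The same criteria can also be computed directly with base R; the values should match the compareLM() table above (illustrative):

AIC(mul_model1, mul_model2, mul_model3, mul_model4)
BIC(mul_model1, mul_model2, mul_model3, mul_model4)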

#Check the AIC. 
#Sort the result or Create a line chart for AIC values with model numbers on the x axis, and AIC values on the y axis. 

com_model <- com_mod$Fit.criteria
com_model[order(com_model$AIC), ]
##   Rank Df.res  AIC AICc  BIC R.squared Adj.R.sq    p.value Shapiro.W Shapiro.p
## 1    9   1328 1635 1635 1687    0.7676   0.7662  0.000e+00    0.8374 7.148e-35
## 2    5   1332 3097 3097 3128    0.3025   0.3004 1.240e-102    0.8378 7.709e-35
## 3    4   1333 3131 3131 3157    0.2836   0.2820  4.640e-96    0.8687 4.606e-32
## 4    3   1334 3139 3139 3160    0.2780   0.2769  4.588e-95    0.8572 3.744e-33
plot(com_model$AIC, type = "b", xlab = "model number", ylab = "AIC value")

mul_model1 is the best.

#Using step()

step(object, scope, scale = 0, direction = c("both", "backward", "forward"), trace = 1, keep = NULL, steps = 1000, k = 2, ...)

base_mod <- lm(lcharges ~ 1, data = insure) #base intercept model only

all_mod <- lm(lcharges ~.-charges, data = insure)  #full model  with all predictors

stepMod <- step(base_mod, scope = list(lower = base_mod, upper = all_mod), direction = "both", trace = 0, steps = 1000) #perform stepwise algorithm

stepMod
## 
## Call:
## lm(formula = lcharges ~ smoker + age + children + bmi + region + 
##     sex, data = insure)
## 
## Coefficients:
##     (Intercept)        smokeryes              age         children  
##         7.03148          1.55376          0.03454          0.10154  
##             bmi  regionnorthwest  regionsoutheast  regionsouthwest  
##         0.01340         -0.06205         -0.15731         -0.12897  
##         sexmale  
##        -0.07456

Alternatively, use the stepAIC() function from the MASS package.

library(MASS)

mul_model <- lm(lcharges ~ age + sex + bmi + children + smoker + region, data = insure)

stepAIC(mul_model, direction = "backward")
## Start:  AIC=-2161
## lcharges ~ age + sex + bmi + children + smoker + region
## 
##            Df Sum of Sq    RSS      AIC
## <none>                  262.02 -2161.00
## - sex       1      1.84 263.86 -2153.64
## - region    3      4.71 266.73 -2143.17
## - bmi       1      8.07 270.09 -2122.45
## - children  1     19.94 281.96 -2064.94
## - age       1    309.17 571.19 -1121.08
## - smoker    1    519.63 781.65  -701.66
## 
## Call:
## lm(formula = lcharges ~ age + sex + bmi + children + smoker + 
##     region, data = insure)
## 
## Coefficients:
##     (Intercept)              age          sexmale              bmi  
##         7.03148          0.03454         -0.07456          0.01340  
##        children        smokeryes  regionnorthwest  regionsoutheast  
##         0.10154          1.55376         -0.06205         -0.15731  
## regionsouthwest  
##        -0.12897
summary(mul_model)
## 
## Call:
## lm(formula = lcharges ~ age + sex + bmi + children + smoker + 
##     region, data = insure)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.07416 -0.19619 -0.04969  0.06655  2.16494 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      7.0314797  0.0723842  97.141  < 2e-16 ***
## age              0.0345390  0.0008725  39.585  < 2e-16 ***
## sexmale         -0.0745638  0.0244054  -3.055  0.00229 ** 
## bmi              0.0134013  0.0020957   6.395 2.22e-10 ***
## children         0.1015405  0.0101005  10.053  < 2e-16 ***
## smokeryes        1.5537619  0.0302763  51.319  < 2e-16 ***
## regionnorthwest -0.0620490  0.0349257  -1.777  0.07586 .  
## regionsoutheast -0.1573100  0.0350753  -4.485 7.92e-06 ***
## regionsouthwest -0.1289664  0.0350196  -3.683  0.00024 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4442 on 1328 degrees of freedom
## Multiple R-squared:  0.7676, Adjusted R-squared:  0.7662 
## F-statistic: 548.4 on 8 and 1328 DF,  p-value: < 2.2e-16

So the model including all six independent variables is the best one.

#Multicollinearity and the Variance Inflation Factor (VIF)

Multicollinearity (or substantial correlation) generally occurs when there are high correlations between two or more predictor variables; in other words, one predictor variable can be used to predict another. For example: a person’s height and weight. Unlike the previous methods, the VIF is computed from the fitted model itself.

As a general rule of thumb, a VIF greater than 5 or 10 for a predictor indicates multicollinearity; equivalently, sqrt(VIF) > 2 signals a multicollinearity problem.

library(car)

vif(mul_model)
##              GVIF Df GVIF^(1/(2*Df))
## age      1.016794  1        1.008362
## sex      1.008944  1        1.004462
## bmi      1.106742  1        1.052018
## children 1.004017  1        1.002006
## smoker   1.012100  1        1.006032
## region   1.099037  3        1.015864
sqrt(vif(mul_model)) > 2
##           GVIF    Df GVIF^(1/(2*Df))
## age      FALSE FALSE           FALSE
## sex      FALSE FALSE           FALSE
## bmi      FALSE FALSE           FALSE
## children FALSE FALSE           FALSE
## smoker   FALSE FALSE           FALSE
## region   FALSE FALSE           FALSE

#Best Subsets

Best subsets regression searches all possible combinations of the predictors (an exhaustive search) rather than following a single path of additions or deletions. Compared with stepwise regression, it gives us more options: we can see which variables were included in the various shortlisted models and visually inspect each model’s performance with respect to adjusted R-squared.

library(leaps)
subsets <- regsubsets(lcharges ~. -charges, data = insure, nbest = 2)
subsets
## Subset selection object
## Call: regsubsets.formula(lcharges ~ . - charges, data = insure, nbest = 2)
## 8 Variables  (and intercept)
##                 Forced in Forced out
## age                 FALSE      FALSE
## sexmale             FALSE      FALSE
## bmi                 FALSE      FALSE
## children            FALSE      FALSE
## smokeryes           FALSE      FALSE
## regionnorthwest     FALSE      FALSE
## regionsoutheast     FALSE      FALSE
## regionsouthwest     FALSE      FALSE
## 2 subsets of each size up to 8
## Selection Algorithm: exhaustive
plot(subsets, scale = "r2") #regsubsets plot based on R sq
abline(h = 4, v = 0, col = "red")
abline(h = 10, v = 0, col = "green")
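The same plot can be drawn on the adjusted R-squared scale, which is the criterion referred to above (illustrative):

plot(subsets, scale = "adjr2")   #regsubsets plot based on adjusted R-squared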

* In most instances, all-subsets regression is preferable to stepwise regression, because more candidate models are considered.

#Check model assumptions

1. Check normality of errors: we can create a histogram of residuals from a linear model. The distribution of these residuals should be approximately normal. We can obtain the residuals of a model with the residuals() function.

mod_re <- residuals(mul_model)
hist(mod_re)

We can see the distribution of these residuals is not approximately normal.

2. Type plot(mul_model) to see the standard diagnostic plots for the remaining assumptions.

par(mfrow = c(2,2))
plot(mul_model)

In the first plot (top-left, Residuals vs Fitted), the red line should be approximately flat if the disturbances are homoscedastic and the relationship is linear. Clear evidence of a curved relationship suggests that we may want to add a quadratic term to the regression. The bottom-left (Scale-Location) plot also checks this, and is more convenient because the residuals on the Y axis are standardized.

The top-left and bottom-left plots show how the residuals vary as the fitted values increase; the points should be scattered randomly, with no systematic pattern.

The normal quantile plot (top-right) is a probability plot of the standardized residuals against the values that would be expected under normality. If we have met the normality assumption, the points should fall on the straight 45-degree line.

The bottom-right plot shows the standardized residuals against leverage. Leverage is a measure of how much each data point influences the regression. Here, part of the red smoothed line does not stay close to the midline, and some points have a large Cook’s distance (Cook’s D), which reflects how much the fitted values would change if that point were deleted.

Cook’s D is one of two methods for identifying influential observations. Cook’s D values greater than 4/(n-k-1), where n is the sample size and k is the number of predictor variables, indicate influential observations.
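A minimal sketch of that check, assuming k is taken as the number of estimated coefficients minus one (the object names are illustrative):

cooks_d <- cooks.distance(mul_model)
n <- nrow(insure)
k <- length(coef(mul_model)) - 1          #number of predictor terms in the model matrix
cutoff <- 4 / (n - k - 1)
which(cooks_d > cutoff)                   #row indices of potentially influential observations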

#Improving model performance

1. Adding non-linear relationships (polynomial regression): to account for a non-linear relationship, we can add a higher-order term to the regression model, treating it as a polynomial. In effect, we will be modeling a relationship of the form lcharges = β0 + β1·age + β2·age²:

insure$age2 <- insure$age ^ 2

2. Transformation – converting a numeric variable to a binary indicator:

insure$bmi30 <- ifelse(insure$bmi >= 30, 1, 0)

3. Model specification – adding interaction effects: to have the obesity indicator (bmi30) and the smoking indicator (smoker) interact, include their product in the model formula:

lcharges ~ bmi30 * smoker

#To summarize the improvements: we added a non-linear term for age, created an indicator for obesity, and specified an interaction between obesity and smoking

im_model <- lm(lcharges ~ age + age2 + children + bmi + sex + bmi30 * smoker + region, data = insure)

summary(im_model)
## 
## Call:
## lm(formula = lcharges ~ age + age2 + children + bmi + sex + bmi30 * 
##     smoker + region, data = insure)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.84451 -0.18821 -0.07086  0.05362  2.25813 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      6.945e+00  1.297e-01  53.525  < 2e-16 ***
## age              5.337e-02  5.694e-03   9.374  < 2e-16 ***
## age2            -2.348e-04  7.102e-05  -3.307 0.000969 ***
## children         9.308e-02  1.007e-02   9.240  < 2e-16 ***
## bmi              6.381e-03  3.261e-03   1.957 0.050548 .  
## sexmale         -8.591e-02  2.325e-02  -3.694 0.000230 ***
## bmi30           -2.829e-02  4.026e-02  -0.703 0.482287    
## smokeryes        1.214e+00  4.185e-02  29.016  < 2e-16 ***
## regionnorthwest -5.856e-02  3.325e-02  -1.761 0.078413 .  
## regionsoutheast -1.507e-01  3.345e-02  -4.505 7.21e-06 ***
## regionsouthwest -1.377e-01  3.334e-02  -4.129 3.87e-05 ***
## bmi30:smokeryes  6.421e-01  5.752e-02  11.162  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4228 on 1325 degrees of freedom
## Multiple R-squared:  0.7899, Adjusted R-squared:  0.7882 
## F-statistic: 452.9 on 11 and 1325 DF,  p-value: < 2.2e-16
par(mfrow = c(2,2))
plot(im_model)

insurance <- read.csv("insurance.csv")
insurance$charges <- log(insurance$charges)
insurance_new <- outlier_remover(insurance)

#create training and test data sets
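#(optional, not in the original code) fix the random seed so the split below and the correlations that follow are reproducible, e.g.:
#set.seed(123)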
traininRowIndex <- sample(1:nrow(insurance_new), 0.7 * nrow(insurance_new)) #row indices for training data

trainingData <- insurance_new[traininRowIndex, ] #training data
testData <- insurance_new[-traininRowIndex, ] #test data

#build the model on training data
model1 <- lm(charges ~ age + sex + bmi + children + smoker + region, data = trainingData)

#predict on the test data
pred1 <- predict(model1, testData)
actuals_preds1 <- data.frame(cbind(actuals = testData$charges, predicted = pred1))

cor(actuals_preds1)
##            actuals predicted
## actuals   1.000000  0.873365
## predicted 0.873365  1.000000
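The correlation above measures how well the predictions track the actual values; as an additional, purely illustrative check, we can also compute the mean absolute error of the predictions on the log scale:

mean(abs(actuals_preds1$actuals - actuals_preds1$predicted))   #average absolute prediction error in log(charges)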
insurance <- read.csv("insurance.csv")
insurance$charges <- log(insurance$charges)
insurance_new <- outlier_remover(insurance)

insurance_new$age2 <- insurance_new$age ^ 2
insurance_new$bmi30 <- ifelse(insurance_new$bmi >= 30, 1, 0)


#create training and test data sets for model2 (after the improvements)
traininRowIndex <- sample(1:nrow(insurance_new), 0.7 * nrow(insurance_new)) #row indices for training data


trainingData <- insurance_new[traininRowIndex,] #training data
testData <- insurance_new[-traininRowIndex,] #test data

#build the model on training data
model2 <- lm(charges ~ age + age2 + children + bmi + sex + bmi30 * smoker + region, data = trainingData)

#predict
pred2 <- predict(model2, testData)
actuals_preds2 <- data.frame(cbind(actuals = testData$charges, predicted = pred2))

cor(actuals_preds2)
##             actuals predicted
## actuals   1.0000000 0.8760506
## predicted 0.8760506 1.0000000