In regression, highly skewed data can result in a poor fit. When all values are positive, a log transformation is often used to normalize highly skewed data.
insurance$lcharges <- log(insurance$charges)
summary(insurance$lcharges)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 7.023 8.464 9.147 9.099 9.720 11.063
par(mfrow = c(1,2))
hist(insurance$lcharges, main = "Histogram of log(charges)", col = "blue")
plot(density(insurance$lcharges), main = "Density plot of log(charges)")
polygon(density(insurance$lcharges), col = "orange")
The log transformation gives a more symmetric distribution.
#Other variables: 3 factors (sex, smoker, region)
par(mfrow = c(1,3))
barplot(table(insurance$sex), main = "Sex")
barplot(table(insurance$smoker), main = "Smoker")
barplot(table(insurance$region), main = "Region")
3 numeric variables: age, bmi, children
par(mfrow =c(1,3))
hist(insurance$age, main = "Histogram of age", col = "lightblue")
hist(insurance$bmi, main = "Histogram of bmi", col = "lightgreen")
hist(insurance$children, main = "Histogram of children", col = "red")
#check for outliers
par(mfrow = c(1,3))
boxplot(insurance$age, main = "Boxplot of age")
boxplot(insurance$bmi, main = "Boxplot of bmi")
boxplot(insurance$children, main = "Boxplot of children")
Only bmi has outliers, so we remove them.
#function to remove rows containing outliers (1.5 * IQR rule) in any numeric column
outlier_remover <- function(a){
  df <- a
  out_rows <- c()
  for(i in 1:ncol(df)){
    if(is.numeric(df[, i])){
      Q3 <- quantile(df[, i], 0.75, na.rm = TRUE)
      Q1 <- quantile(df[, i], 0.25, na.rm = TRUE)
      IQR <- Q3 - Q1
      upper <- Q3 + 1.5 * IQR
      lower <- Q1 - 1.5 * IQR
      for(j in 1:nrow(df)){
        if(is.na(df[j, i])){
          next
        } else if(df[j, i] > upper | df[j, i] < lower){
          out_rows <- c(out_rows, j)  #record the row index of the outlier
        }
      }
    }
  }
  out_rows <- unique(out_rows)
  if(length(out_rows) > 0){
    df <- df[-out_rows, ]  #drop rows only when outliers were actually found
  }
  return(df)
}
#check for duplicated rows (alternative approaches, kept for reference)
# insurance_new <- outlier_remover(insurance)
# insurance_new[duplicated(insurance_new), ]
#remove duplicated rows
# insurance_new[!duplicated(insurance_new), ]
#count distinct charges
# unique(insurance_new$charges)
# insurance_new %>% select(charges) %>% unique() %>% summarise(n = n())
#check for duplicates and remove fully duplicated rows
#(note: unique() on a data frame has no "by" argument; that is data.table syntax)
insure <- unique(insurance)
#check duplicates again
# insure[duplicated(insure), ]
# insure[!duplicated(insure), ]
The data are reduced from 1338 rows to 1337.
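We can confirm the before/after row counts directly:
c(before = nrow(insurance), after = nrow(insure))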
#Check correlation between the numeric predictor variables
cor(insure[c("age", "bmi", "children")])
## age bmi children
## age 1.00000000 0.10934361 0.04153621
## bmi 0.10934361 1.00000000 0.01275466
## children 0.04153621 0.01275466 1.00000000
#create scatterplot matrix
library(psych)
pairs.panels(insure[c("age", "sex", "bmi", "children", "smoker", "region")], digits = 2, cor = TRUE, main = "Insurance scatter plot Matrix")
describe(insure)
## vars n mean sd median trimmed mad min max
## age 1 1337 39.22 14.04 39.00 39.02 17.79 18.00 64.00
## sex* 2 1337 1.50 0.50 2.00 1.51 0.00 1.00 2.00
## bmi 3 1337 30.66 6.10 30.40 30.50 6.20 15.96 53.13
## children 4 1337 1.10 1.21 1.00 0.94 1.48 0.00 5.00
## smoker* 5 1337 1.20 0.40 1.00 1.13 0.00 1.00 2.00
## region* 6 1337 2.52 1.11 3.00 2.52 1.48 1.00 4.00
## charges 7 1337 13279.12 12110.36 9386.16 11084.18 7425.45 1121.87 63770.43
## lcharges 8 1337 9.10 0.92 9.15 9.11 0.96 7.02 11.06
## range skew kurtosis se
## age 46.00 0.05 -1.25 0.38
## sex* 1.00 -0.02 -2.00 0.01
## bmi 37.17 0.28 -0.06 0.17
## children 5.00 0.94 0.19 0.03
## smoker* 1.00 1.46 0.13 0.01
## region* 3.00 -0.04 -1.33 0.03
## charges 62648.55 1.51 1.59 331.20
## lcharges 4.04 -0.09 -0.64 0.03
Based on the scatterplot matrix above, there is no sign of multicollinearity. None of the pairwise correlations among age, sex, bmi, children, smoker, and region are particularly strong (r < 0.2 in each case).
[A pairwise correlation above roughly 0.7 would suggest multicollinearity.] In the scatterplot matrix, the red dot marks the mean and the red line is a loess curve; the shape of each correlation ellipse reflects the strength of the relationship: the rounder the ellipse, the weaker the correlation, and the more elongated, the stronger.
#Model building: simple linear regression
s1_model <- lm(lcharges ~ age, data = insure)
summary(s1_model)
##
## Call:
## lm(formula = lcharges ~ age, data = insure)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.3457 -0.4161 -0.3094 0.5015 2.1976
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.747999 0.063389 122.23 <2e-16 ***
## age 0.034469 0.001522 22.65 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7811 on 1335 degrees of freedom
## Multiple R-squared: 0.2777, Adjusted R-squared: 0.2771
## F-statistic: 513.1 on 1 and 1335 DF, p-value: < 2.2e-16
The prediction equation is lcharges = 7.75 + 0.034 * age. From the Pr(>|t|) column, we see that the regression coefficient (0.034) is significantly different from zero (p < 0.001), indicating an expected increase of about 0.034 in log(charges) for every 1-year increase in age.
The multiple R-squared (0.2777) indicates that the model accounts for about 28% of the variance in log(charges). The multiple R-squared is also the squared correlation between the actual and predicted values.
The residual standard error (0.7811) can be thought of as the average error in predicting log(charges) from age using this model.
The F-statistic tests whether the predictor variables, taken together, predict the response variable above chance levels.
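Since the model is fit on log(charges), predictions can be back-transformed with exp() to read them on the dollar scale. A minimal sketch (the age of 40 is just an illustrative input, and the simple back-transformation slightly understates the mean of a log-normal response):
#predict log(charges) for a hypothetical 40-year-old, then back-transform to dollars
pred_log <- predict(s1_model, newdata = data.frame(age = 40))
exp(pred_log)  #approximate charges on the original scale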
#Multiple Linear Regression
mul_model <- lm(lcharges ~ age + sex + bmi + children + smoker + region, data = insure)
summary(mul_model)
##
## Call:
## lm(formula = lcharges ~ age + sex + bmi + children + smoker +
## region, data = insure)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.07416 -0.19619 -0.04969 0.06655 2.16494
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.0314797 0.0723842 97.141 < 2e-16 ***
## age 0.0345390 0.0008725 39.585 < 2e-16 ***
## sexmale -0.0745638 0.0244054 -3.055 0.00229 **
## bmi 0.0134013 0.0020957 6.395 2.22e-10 ***
## children 0.1015405 0.0101005 10.053 < 2e-16 ***
## smokeryes 1.5537619 0.0302763 51.319 < 2e-16 ***
## regionnorthwest -0.0620490 0.0349257 -1.777 0.07586 .
## regionsoutheast -0.1573100 0.0350753 -4.485 7.92e-06 ***
## regionsouthwest -0.1289664 0.0350196 -3.683 0.00024 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4442 on 1328 degrees of freedom
## Multiple R-squared: 0.7676, Adjusted R-squared: 0.7662
## F-statistic: 548.4 on 8 and 1328 DF, p-value: < 2.2e-16
The fitted equation is lcharges = 7.03 + 0.035 * age - 0.075 * male + 0.013 * bmi + 0.102 * children + 1.554 * smoker - 0.062 * northwest - 0.157 * southeast - 0.129 * southwest.
When there is more than one independent variable, each regression coefficient indicates the change in the dependent variable for a one-unit change in that predictor, holding all other predictors constant. For example, the regression coefficient for bmi is 0.013, suggesting that an increase of 1 in bmi is associated with a 0.013 increase in log(charges), controlling for age, sex, children, and the other predictors.
Likewise, one additional child is associated with a 0.102 increase in log(charges); being a smoker is associated with a 1.554 increase; and, relative to the northeast, the northwest, southeast, and southwest regions are associated with decreases of 0.062, 0.157, and 0.129, respectively.
We can also see that regionnorthwest is not significantly different from zero (p > 0.05), suggesting that, controlling for the other predictors, living in the northwest is not associated with a difference in log(charges) relative to the reference region. [The asterisks indicate how strongly each coefficient differs from zero.]
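Because the response is log(charges), exponentiating the coefficients gives approximate multiplicative effects on the original charges scale; a minimal sketch:
#multiplicative effect of a one-unit change in each predictor on charges
round(exp(coef(mul_model)), 3)
#e.g. exp(1.554) is roughly 4.7, so smokers' predicted charges are about 4.7 times
#those of non-smokers, holding the other predictors constant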
#Choose the important variables: relative importance of variables
library(relaimpo)
#relative_importance <- calc.relimp(mul_model, type = "lmg", rela = TRUE)
#sort(relative_importance$lmg, decreasing = TRUE)
#Stepwise regression
Stepwise regression provides a number of stopping rules for selecting the best subset of variables for our model. In stepwise selection, variables are added to or deleted from the model one at a time until some stopping criterion is reached. Forward stepwise and backward stepwise are the two basic variants of the procedure.
#backward stepwise
mul_model1 <- lm(lcharges ~ age + sex + bmi + children + smoker + region, data = insure)
mul_model2 <- lm(lcharges ~ age + sex + bmi + children, data = insure)
mul_model3 <- lm(lcharges ~ age + sex + bmi, data = insure)
mul_model4 <- lm(lcharges ~ age + sex, data = insure)
library(rcompanion)
com_mod <- compareLM(mul_model1, mul_model2, mul_model3, mul_model4)
com_mod
## $Models
## Formula
## 1 "lcharges ~ age + sex + bmi + children + smoker + region"
## 2 "lcharges ~ age + sex + bmi + children"
## 3 "lcharges ~ age + sex + bmi"
## 4 "lcharges ~ age + sex"
##
## $Fit.criteria
## Rank Df.res AIC AICc BIC R.squared Adj.R.sq p.value Shapiro.W Shapiro.p
## 1 9 1328 1635 1635 1687 0.7676 0.7662 0.000e+00 0.8374 7.148e-35
## 2 5 1332 3097 3097 3128 0.3025 0.3004 1.240e-102 0.8378 7.709e-35
## 3 4 1333 3131 3131 3157 0.2836 0.2820 4.640e-96 0.8687 4.606e-32
## 4 3 1334 3139 3139 3160 0.2780 0.2769 4.588e-95 0.8572 3.744e-33
summary(mul_model3)
##
## Call:
## lm(formula = lcharges ~ age + sex + bmi, data = insure)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.5881 -0.4469 -0.2879 0.5331 2.0945
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.407317 0.120579 61.431 < 2e-16 ***
## age 0.033947 0.001526 22.244 < 2e-16 ***
## sexmale 0.025332 0.042642 0.594 0.55257
## bmi 0.011361 0.003517 3.231 0.00127 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7785 on 1333 degrees of freedom
## Multiple R-squared: 0.2836, Adjusted R-squared: 0.282
## F-statistic: 175.9 on 3 and 1333 DF, p-value: < 2.2e-16
We usually compare models using AIC, AICc, or BIC.
BIC (Bayesian Information Criterion): the smallest BIC is best. AIC (Akaike's Information Criterion): the smallest AIC is best. AICc (AIC with a correction for small sample sizes): the smallest AICc is best. These rules (minimum BIC/AIC/AICc) attempt to explain the relationship between the predictors and the response without building models that are overly complex in terms of the number of predictors. Since different criteria are used to decide when to stop adding terms, these stopping rules may lead to different "best" models (see Burnham, 2002).
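Base R can compute these criteria directly for the fitted models; a minimal sketch using the candidates from above:
#compare the four candidate models by information criterion (smaller is better)
AIC(mul_model1, mul_model2, mul_model3, mul_model4)
BIC(mul_model1, mul_model2, mul_model3, mul_model4)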
#Check the AIC.
#Sort the result or create a line chart of AIC values, with model numbers on the x-axis and AIC values on the y-axis.
com_model <- com_mod$Fit.criteria
com_model[order(com_model$AIC), ]
## Rank Df.res AIC AICc BIC R.squared Adj.R.sq p.value Shapiro.W Shapiro.p
## 1 9 1328 1635 1635 1687 0.7676 0.7662 0.000e+00 0.8374 7.148e-35
## 2 5 1332 3097 3097 3128 0.3025 0.3004 1.240e-102 0.8378 7.709e-35
## 3 4 1333 3131 3131 3157 0.2836 0.2820 4.640e-96 0.8687 4.606e-32
## 4 3 1334 3139 3139 3160 0.2780 0.2769 4.588e-95 0.8572 3.744e-33
plot(com_model$AIC, type ="b", xlab = "model number", ylab = "AIC value")
mul_model1 is the best.
#Using the step() function: step(object, scope, scale = 0, direction = c("both", "backward", "forward"), trace = 1, keep = NULL, steps = 1000, k = 2, ...)
base_mod <- lm(lcharges ~ 1, data = insure) #base intercept model only
all_mod <- lm(lcharges ~.-charges, data = insure) #full model with all predictors
stepMod <- step(base_mod, scope = list(lower = base_mod, upper = all_mod), direction = "both", trace = 0, steps = 1000) #perform stepwise algorithm
stepMod
##
## Call:
## lm(formula = lcharges ~ smoker + age + children + bmi + region +
## sex, data = insure)
##
## Coefficients:
## (Intercept) smokeryes age children
## 7.03148 1.55376 0.03454 0.10154
## bmi regionnorthwest regionsoutheast regionsouthwest
## 0.01340 -0.06205 -0.15731 -0.12897
## sexmale
## -0.07456
Use the stepAIC() function from the MASS package.
library(MASS)
mul_model <- lm(lcharges ~ age + sex + bmi + children + smoker + region, data = insure)
stepAIC(mul_model, direction = "backward")
## Start: AIC=-2161
## lcharges ~ age + sex + bmi + children + smoker + region
##
## Df Sum of Sq RSS AIC
## <none> 262.02 -2161.00
## - sex 1 1.84 263.86 -2153.64
## - region 3 4.71 266.73 -2143.17
## - bmi 1 8.07 270.09 -2122.45
## - children 1 19.94 281.96 -2064.94
## - age 1 309.17 571.19 -1121.08
## - smoker 1 519.63 781.65 -701.66
##
## Call:
## lm(formula = lcharges ~ age + sex + bmi + children + smoker +
## region, data = insure)
##
## Coefficients:
## (Intercept) age sexmale bmi
## 7.03148 0.03454 -0.07456 0.01340
## children smokeryes regionnorthwest regionsoutheast
## 0.10154 1.55376 -0.06205 -0.15731
## regionsouthwest
## -0.12897
summary(mul_model)
##
## Call:
## lm(formula = lcharges ~ age + sex + bmi + children + smoker +
## region, data = insure)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.07416 -0.19619 -0.04969 0.06655 2.16494
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.0314797 0.0723842 97.141 < 2e-16 ***
## age 0.0345390 0.0008725 39.585 < 2e-16 ***
## sexmale -0.0745638 0.0244054 -3.055 0.00229 **
## bmi 0.0134013 0.0020957 6.395 2.22e-10 ***
## children 0.1015405 0.0101005 10.053 < 2e-16 ***
## smokeryes 1.5537619 0.0302763 51.319 < 2e-16 ***
## regionnorthwest -0.0620490 0.0349257 -1.777 0.07586 .
## regionsoutheast -0.1573100 0.0350753 -4.485 7.92e-06 ***
## regionsouthwest -0.1289664 0.0350196 -3.683 0.00024 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4442 on 1328 degrees of freedom
## Multiple R-squared: 0.7676, Adjusted R-squared: 0.7662
## F-statistic: 548.4 on 8 and 1328 DF, p-value: < 2.2e-16
So the model including all 6 predictor variables is the best one.
Multicollinearity (or substantial correlation) occurs when there are high correlations between two or more predictor variables; in other words, one predictor can be used to predict another. For example: a person's height and weight. Unlike the pairwise checks above, the VIF (variance inflation factor) assesses each predictor within the full model: VIF_j = 1 / (1 - R_j^2), where R_j^2 comes from regressing predictor j on all the other predictors.
As a general rule of thumb, a VIF greater than 5 or 10 indicates multicollinearity, or sqrt(VIF) > 2 indicates a multicollinearity problem.
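To see where the number comes from, the VIF for one numeric predictor can be computed by hand; a minimal sketch using bmi (it should match the value that vif() reports for bmi below, since a one-degree-of-freedom GVIF equals the ordinary VIF):
#VIF for bmi: regress bmi on the remaining predictors and apply 1 / (1 - R^2)
r2_bmi <- summary(lm(bmi ~ age + sex + children + smoker + region, data = insure))$r.squared
1 / (1 - r2_bmi)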
library(car)
vif(mul_model)
## GVIF Df GVIF^(1/(2*Df))
## age 1.016794 1 1.008362
## sex 1.008944 1 1.004462
## bmi 1.106742 1 1.052018
## children 1.004017 1 1.002006
## smoker 1.012100 1 1.006032
## region 1.099037 3 1.015864
sqrt(vif(mul_model)) > 2
## GVIF Df GVIF^(1/(2*Df))
## age FALSE FALSE FALSE
## sex FALSE FALSE FALSE
## bmi FALSE FALSE FALSE
## children FALSE FALSE FALSE
## smoker FALSE FALSE FALSE
## region FALSE FALSE FALSE
#Best Subsets
Best subsets (all-subsets) regression evaluates every possible combination of predictors rather than adding or dropping them one at a time, as stepwise regression does. It lets us see which variables were included in each of the shortlisted models, and we can visually inspect each model's performance with respect to (adjusted) R-squared.
library(leaps)
subsets <- regsubsets(lcharges ~. -charges, data = insure, nbest = 2)
subsets
## Subset selection object
## Call: regsubsets.formula(lcharges ~ . - charges, data = insure, nbest = 2)
## 8 Variables (and intercept)
## Forced in Forced out
## age FALSE FALSE
## sexmale FALSE FALSE
## bmi FALSE FALSE
## children FALSE FALSE
## smokeryes FALSE FALSE
## regionnorthwest FALSE FALSE
## regionsoutheast FALSE FALSE
## regionsouthwest FALSE FALSE
## 2 subsets of each size up to 8
## Selection Algorithm: exhaustive
plot(subsets, scale = "r2") #regsubsets plot based on R sq
abline(h = 4, v = 0, col = "red")
abline(h = 10, v = 0, col = "green")
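The numeric criteria behind this plot can also be pulled out of the regsubsets summary; a minimal sketch:
#adjusted R-squared and BIC for each shortlisted subset
sub_sum <- summary(subsets)
sub_sum$adjr2
sub_sum$bic
which.max(sub_sum$adjr2)  #position of the subset with the highest adjusted R-squared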
* In most instances, all subsets regression is preferable to stepwise regression.
#Check model assumptions
1. Check normality of errors: we can create a histogram of the residuals from a linear model. The distribution of these residuals should be approximately normal. We can obtain the residuals of a model with the residuals() function.
mod_re <- residuals(mul_model)
hist(mod_re)
We can see the distribution of these residuals is not approximately
normal.
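A formal check of the same assumption (a sketch; with more than a thousand observations the Shapiro-Wilk test flags even small departures, so read it alongside the histogram):
#Shapiro-Wilk test of normality on the model residuals
shapiro.test(residuals(mul_model))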
#Type plot(mul_model) to produce the standard diagnostic plots (linearity, homoscedasticity, normality, and influential points).
par(mfrow = c(2,2))
plot(mul_model)
In the first plot (top-left, residuals vs fitted), the red line should be approximately flat if the disturbances are homoscedastic and the relationship is linear. Clear evidence of a curved relationship suggests that we may want to add a quadratic term to the regression. The bottom-left plot (scale-location) checks the same thing and is more convenient because the residuals on the y-axis are standardized.
The top-left and bottom-left plots show how the residuals vary as the fitted values increase. The dots should be randomly scattered around the horizontal zero line.
The normal Q-Q plot (top-right) plots the standardized residuals against the values expected under normality. If we have met the normality assumption, the points should fall close to the straight 45-degree line.
The bottom-right plot shows the standardized residuals against leverage. Leverage measures how much each data point influences the regression. Here part of the red smoothed line does not stay close to the mid-line, and some points have a large Cook's distance (Cook's D), which reflects how much the fitted values would change if that point were deleted.
Cook’s D is one of two methods for identifying influential observations. Cook’s D values greater than 4/(n-k-1), where n is the sample size and k is the number of predictor variables, indicate influential observations.
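One way to apply this rule with base R; a minimal sketch (k is taken as the number of estimated slope terms in the fitted model):
#flag observations whose Cook's D exceeds 4 / (n - k - 1)
cd <- cooks.distance(mul_model)
n <- nrow(insure)
k <- length(coef(mul_model)) - 1  #number of slope terms (dummy-coded)
which(cd > 4 / (n - k - 1))       #row indices of influential observations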
#Improving model performance
1. Adding nonlinear relations (polynomial regression): to account for a non-linear relationship, we can add a higher-order term to the regression model, treating the model as a polynomial. In effect, we will be modeling a relationship like this: lcharges = b0 + b1 * age + b2 * age^2.
insure$age2 <- insure$age ^ 2
#Transformation – converting a numeric variable to a binary indicator
insure$bmi30 <- ifelse(insure$bmi >= 30, 1, 0)
#Model specification: adding interaction effects
To have the obesity indicator (bmi30) and the smoking indicator (smoker) interact:
lcharges ~ bmi30 * smoker
#To summarize the improvements: we added a non-linear term for age, created an indicator for obesity, and specified an interaction between obesity and smoking.
im_model <- lm(lcharges ~ age + age2 + children + bmi + sex + bmi30 * smoker + region, data = insure)
summary(im_model)
##
## Call:
## lm(formula = lcharges ~ age + age2 + children + bmi + sex + bmi30 *
## smoker + region, data = insure)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.84451 -0.18821 -0.07086 0.05362 2.25813
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.945e+00 1.297e-01 53.525 < 2e-16 ***
## age 5.337e-02 5.694e-03 9.374 < 2e-16 ***
## age2 -2.348e-04 7.102e-05 -3.307 0.000969 ***
## children 9.308e-02 1.007e-02 9.240 < 2e-16 ***
## bmi 6.381e-03 3.261e-03 1.957 0.050548 .
## sexmale -8.591e-02 2.325e-02 -3.694 0.000230 ***
## bmi30 -2.829e-02 4.026e-02 -0.703 0.482287
## smokeryes 1.214e+00 4.185e-02 29.016 < 2e-16 ***
## regionnorthwest -5.856e-02 3.325e-02 -1.761 0.078413 .
## regionsoutheast -1.507e-01 3.345e-02 -4.505 7.21e-06 ***
## regionsouthwest -1.377e-01 3.334e-02 -4.129 3.87e-05 ***
## bmi30:smokeryes 6.421e-01 5.752e-02 11.162 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4228 on 1325 degrees of freedom
## Multiple R-squared: 0.7899, Adjusted R-squared: 0.7882
## F-statistic: 452.9 on 11 and 1325 DF, p-value: < 2.2e-16
par(mfrow = c(2,2))
plot(im_model)
insurance <- read.csv("insurance.csv")
insurance$charges <- log(insurance$charges)
insurance_new <- outlier_remover(insurance)
#create training and test data sets
traininRowIndex <- sample(1:nrow(insurance_new), 0.7 * nrow(insurance_new)) #row indices for training data
trainingData <- insurance_new[traininRowIndex, ] #training data
testData <- insurance_new[-traininRowIndex, ] #test data
#build the model on the training data
model1 <- lm(charges ~ age + sex + bmi + children + smoker + region, data = trainingData)
#predict on the test data
pred1 <- predict(model1, testData)
actuals_preds1 <- data.frame(cbind(actuals = testData$charges, predicted = pred1))
cor(actuals_preds1)
## actuals predicted
## actuals 1.000000 0.873365
## predicted 0.873365 1.000000
insurance <- read.csv("insurance.csv")
insurance$charges <- log(insurance$charges)
insurance_new <- outlier_remover(insurance)
insurance_new$age2 <- insurance_new$age ^ 2
insurance_new$bmi30 <- ifelse(insurance_new$bmi >= 30, 1, 0)
#create training and test data for model2 (after the improvements)
traininRowIndex <- sample(1:nrow(insurance_new), 0.7 * nrow(insurance_new)) #row indices for training data
trainingData <- insurance_new[traininRowIndex, ] #training data
testData <- insurance_new[-traininRowIndex, ] #test data
#build the improved model on the training data
model2 <- lm(charges ~ age + age2 + bmi + children + bmi30 * smoker + region + sex, data = trainingData)
#predict on the test data
pred2 <- predict(model2, testData)
actuals_preds2 <- data.frame(cbind(actuals = testData$charges, predicted = pred2))
cor(actuals_preds2)
## actuals predicted
## actuals 1.0000000 0.8760506
## predicted 0.8760506 1.0000000
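Beyond the correlation, a test-set error metric makes the comparison more concrete; a minimal sketch (the response here is log(charges), so the errors are on the log scale):
#root mean squared error on the test set for both models (smaller is better)
rmse1 <- sqrt(mean((actuals_preds1$actuals - actuals_preds1$predicted)^2))
rmse2 <- sqrt(mean((actuals_preds2$actuals - actuals_preds2$predicted)^2))
c(model1 = rmse1, model2 = rmse2)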