Goal

Goal

Outline

Data: Source

Data: Preview

df <- read.csv('./data/insurance.csv', stringsAsFactors=FALSE) %>%
  mutate(sex1 = if_else(sex == 'male', 1, 0))

\(n=1338\) customers

Data: Other variables

Exploratory data analysis (EDA)

EDA: Age

p1 <- ggplot(df, aes(x=1, y=age)) +
  geom_boxplot() + 
  xlab(NULL) +
  theme(axis.text.y = element_blank()) +
  coord_flip()

p2 <- ggplot(df, aes(x=age)) +
  geom_histogram(binwidth=1, alpha=0.5)

grid.arrange(p1, p2, ncol=2)

EDA: Sex

table(df$sex) %>% kable()
Var1 Freq
female 662
male 676

EDA: Sex and age

p1 <- ggplot(df, aes(x=sex, y=age)) +
  geom_boxplot() + 
  coord_flip()

p2 <- ggplot(df, aes(x=age, fill=sex)) + geom_density(alpha=0.3) + theme(legend.position = "bottom")

grid.arrange(p1, p2, ncol=2)

EDA: Charges

p1 <- ggplot(df, aes(x=1, y=charges)) +
  geom_boxplot() + 
  xlab(NULL) +
  theme(axis.text.y = element_blank()) +
  coord_flip()

p2 <- ggplot(df, aes(x=charges)) + geom_density()

grid.arrange(p1, p2, ncol=2)

EDA: \(log(charges)\)

EDA: \(log(charges)\)

EDA: \(log(charges)\)

p1 <- ggplot(df, aes(x=1, y=log(charges))) +
  geom_boxplot() + 
  xlab(NULL) +
  theme(axis.text.y = element_blank()) +
  coord_flip()

p2 <- ggplot(df, aes(x=log(charges))) + geom_density()

grid.arrange(p1, p2, ncol=2)

EDA: Interrelationships

PerformanceAnalytics::chart.Correlation(df %>% select(age, sex1, charges),
                                        histogram=TRUE, method='spearman')

EDA: Three clusters?

ggplot(df, aes(x=age, y=charges/1000, colour=sex)) +
  geom_point() +
  ylab('Insurance charges (in thousands of USD)') +
  geom_smooth(se=FALSE)

Inference

Transformations and terms

\(M_1\): Age and sex

m1 <- lm(log(charges) ~ age + sex, df)
summary(m1)
## 
## Call:
## lm(formula = log(charges) ~ age + sex, data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.3580 -0.4277 -0.3045  0.5010  2.2159 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 7.727893   0.067338 114.762   <2e-16 ***
## age         0.034568   0.001521  22.721   <2e-16 ***
## sexmale     0.030606   0.042738   0.716    0.474    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7814 on 1335 degrees of freedom
## Multiple R-squared:  0.2789, Adjusted R-squared:  0.2778 
## F-statistic: 258.2 on 2 and 1335 DF,  p-value: < 2.2e-16

\(M_1\): Intrepretation of intercept

\[\hat{\beta_0} = 7.7278 ~~~~~ (***)\]

“The average female customer of age zero will have \(e^{7.7278} = \$2270.60\) in annual charges”

\(M_1\): Interpretation of age coefficient

\[\hat{\beta}_{age} = 0.0345 ~~~~~ (***)\]

“Each additional year in age is associated with a 3.45 percent increase in annual charges, on average”

\(M_1\): Interpretation of sex coefficient

\[\hat{\beta}_{sex} = 0.0306\]

Not significant—“there is no difference in charges by males or females of the same age, on average”

\(M_1\): Performance

performance::performance(m1)
## # Indices of model performance
## 
##     AIC |     BIC |   R2 | R2_adjusted | RMSE
## ---------------------------------------------
## 3142.11 | 3162.90 | 0.28 |        0.28 | 0.78

Residuals analysis: Q-Q plot

qqnorm(rstandard(m1))
qqline(rstandard(m1))

Residuals analysis: Fitted values against residuals

plot(predict(m1), resid(m1))
abline(h=0, col='darkgray', lty=3)

Residuals: Plot against IVs

df$resid1 <- resid(m1)

p1 <- ggplot(df, aes(x=age, y=resid1)) + geom_point() + geom_smooth()

p2 <- ggplot(df, aes(x=resid1, fill=sex)) + geom_density(alpha=0.3) + theme(legend.position = "bottom")

grid.arrange(p1, p2, ncol=2)

Residuals: Outliers

age sex bmi children smoker region charges sex1 resid1
162 18 female 36.85 0 yes southeast 36149.48 0 2.145304
804 18 female 42.24 0 yes southeast 38792.69 0 2.215873

Conclusion

Conclusion: Visualization of fitted coefficients

p1 <- sjPlot::plot_model(m1, type='pred', term='age')
p2 <- sjPlot::plot_model(m1, type='pred', term='sex')
grid.arrange(p1, p2, ncol=2)

Conclusion: Estimate table

sex age fit lwr upr
male 20 4,674 4,308 5,072
male 30 6,605 6,191 7,046
male 40 9,332 8,797 9,900
male 50 13,186 12,324 14,108
male 60 18,631 17,092 20,309
female 20 4,534 4,171 4,927
female 30 6,406 5,997 6,843
female 40 9,051 8,527 9,607
female 50 12,788 11,956 13,679
female 60 18,069 16,590 19,680

Conclusion

Appendix: Full model

Appendix: bmi, smoker, log(charges)

dff <- df %>%
  mutate(has_children = as.factor(if_else(children > 0, 1, 0)),
         children = as.factor(children))

ggplot(dff, aes(y=log(charges), x=bmi, colour=as.factor(smoker))) +
  geom_point() +
  geom_smooth()

Appendix: Identifying the three clusters

dff <- dff %>%
  mutate(sex = if_else(sex == 'male', 1, 0),
         smoker = if_else(smoker == 'yes', 1, 0),
         bmi30 = if_else(bmi > 30, 1, 0))

ggplot(dff, aes(x=age, y=log(charges), colour=as.factor(smoker), shape=as.factor(bmi30))) +
  geom_point() +
  geom_smooth()

Appendix: Identifying the three clusters

  1. The bottom, orange cluster is mostly non-smokers with low BMI
  2. The middle group is non-smokers with high BMI (>30) as well as smokers with low BMI
  3. The top group are almost all smokers with high BMI—the unhealthiest lot

Appendix: Terms

  1. Collapse children into the dichotomous variable has_children
  2. Do not bother with region
  3. Interaction term for age and smoker
  4. Collapse bmi into a variable representing if an individual’s BMI is greater or less than 30
  5. The indicator variable bmi30 should be modeled as interacting with smoker

Appendix: \(M_2\)

## 
## Call:
## lm(formula = log(charges) ~ age * smoker + smoker * bmi30 + sex + 
##     has_children, data = dff)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.69908 -0.16728 -0.07812  0.01945  2.29029 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     7.054112   0.039707 177.654  < 2e-16 ***
## age             0.041692   0.000850  49.051  < 2e-16 ***
## smoker1         2.478735   0.081179  30.534  < 2e-16 ***
## bmi301         -0.006974   0.023956  -0.291  0.77101    
## sex1           -0.082727   0.021333  -3.878  0.00011 ***
## has_children1   0.239078   0.021478  11.131  < 2e-16 ***
## age:smoker1    -0.033343   0.001891 -17.629  < 2e-16 ***
## smoker1:bmi301  0.690025   0.052798  13.069  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3882 on 1330 degrees of freedom
## Multiple R-squared:  0.8227, Adjusted R-squared:  0.8218 
## F-statistic: 881.8 on 7 and 1330 DF,  p-value: < 2.2e-16

Appendix: \(M_2\) performance

## # Indices of model performance
## 
##     AIC |     BIC |   R2 | R2_adjusted | RMSE
## ---------------------------------------------
## 1274.68 | 1321.47 | 0.82 |        0.82 | 0.39

Appendix: \(M_2\) residuals

dff$resid2 <- resid(m2)
ggplot(dff, aes(x=age, y=resid2, colour=smoker, shape=bmi30)) + geom_point()

Appendix: Conclusion

Appendix: Conclusion (male)

m2_pred_male <- expand.grid(age = seq(18, 64, 1),
                       smoker = unique(dff$smoker),
                       bmi30 = unique(dff$bmi30),
                       sex = as.factor(1),
                       has_children = unique(dff$has_children)) %>%
  mutate(smokerx = if_else(smoker == 1, 'smoker', 'nonsmoker'),
         has_childrenx = if_else(has_children == 1, 'children', 'childless'),
         bmi30x = if_else(bmi30 == 1, 'high BMI', 'low BMI'),
         customer = paste(smokerx, has_childrenx, bmi30x, sep='-'))
m2_pred_male$charges <- exp(predict(m2, m2_pred_male))

ggplot(m2_pred_male, aes(x=age, y=charges/10^3, colour=customer)) +
  geom_line(size=1) + 
  theme(legend.position='bottom',
        legend.text=element_text(size=6),
        legend.title = element_blank())