Insurance: Where an individual pays a company money on a regular basis, in exchange for the promise of covering some or all of the costs of unexpected future events.
Question: How much money does an insurance customer claim against their policy annually?
df <- read.csv('./data/insurance.csv', stringsAsFactors=FALSE) %>%
mutate(sex1 = if_else(sex == 'male', 1, 0))
\(n=1338\) customers
charges: Annual bills to insurance company (USD)agesexbmichildren: Number of childrensmoker: Whether a policyholder smokes tobacco or notregion: Northeast, Northwest, Southeast, and Southwestp1 <- ggplot(df, aes(x=1, y=age)) +
geom_boxplot() +
xlab(NULL) +
theme(axis.text.y = element_blank()) +
coord_flip()
p2 <- ggplot(df, aes(x=age)) +
geom_histogram(binwidth=1, alpha=0.5)
grid.arrange(p1, p2, ncol=2)
table(df$sex) %>% kable()
| Var1 | Freq |
|---|---|
| female | 662 |
| male | 676 |
p1 <- ggplot(df, aes(x=sex, y=age)) +
geom_boxplot() +
coord_flip()
p2 <- ggplot(df, aes(x=age, fill=sex)) + geom_density(alpha=0.3) + theme(legend.position = "bottom")
grid.arrange(p1, p2, ncol=2)
p1 <- ggplot(df, aes(x=1, y=charges)) +
geom_boxplot() +
xlab(NULL) +
theme(axis.text.y = element_blank()) +
coord_flip()
p2 <- ggplot(df, aes(x=charges)) + geom_density()
grid.arrange(p1, p2, ncol=2)
chargesp1 <- ggplot(df, aes(x=1, y=log(charges))) +
geom_boxplot() +
xlab(NULL) +
theme(axis.text.y = element_blank()) +
coord_flip()
p2 <- ggplot(df, aes(x=log(charges))) + geom_density()
grid.arrange(p1, p2, ncol=2)
PerformanceAnalytics::chart.Correlation(df %>% select(age, sex1, charges),
histogram=TRUE, method='spearman')
ggplot(df, aes(x=age, y=charges/1000, colour=sex)) +
geom_point() +
ylab('Insurance charges (in thousands of USD)') +
geom_smooth(se=FALSE)
log(charges)min(age) = 18 from age but I didn’t botherm1 <- lm(log(charges) ~ age + sex, df)
summary(m1)
##
## Call:
## lm(formula = log(charges) ~ age + sex, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.3580 -0.4277 -0.3045 0.5010 2.2159
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.727893 0.067338 114.762 <2e-16 ***
## age 0.034568 0.001521 22.721 <2e-16 ***
## sexmale 0.030606 0.042738 0.716 0.474
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7814 on 1335 degrees of freedom
## Multiple R-squared: 0.2789, Adjusted R-squared: 0.2778
## F-statistic: 258.2 on 2 and 1335 DF, p-value: < 2.2e-16
\[\hat{\beta_0} = 7.7278 ~~~~~ (***)\]
“The average female customer of age zero will have \(e^{7.7278} = \$2270.60\) in annual charges”
age coefficient\[\hat{\beta}_{age} = 0.0345 ~~~~~ (***)\]
“Each additional year in age is associated with a 3.45 percent increase in annual charges, on average”
sex coefficient\[\hat{\beta}_{sex} = 0.0306\]
Not significant—“there is no difference in charges by males or females of the same age, on average”
performance::performance(m1)
## # Indices of model performance
##
## AIC | BIC | R2 | R2_adjusted | RMSE
## ---------------------------------------------
## 3142.11 | 3162.90 | 0.28 | 0.28 | 0.78
qqnorm(rstandard(m1))
qqline(rstandard(m1))
plot(predict(m1), resid(m1))
abline(h=0, col='darkgray', lty=3)
df$resid1 <- resid(m1)
p1 <- ggplot(df, aes(x=age, y=resid1)) + geom_point() + geom_smooth()
p2 <- ggplot(df, aes(x=resid1, fill=sex)) + geom_density(alpha=0.3) + theme(legend.position = "bottom")
grid.arrange(p1, p2, ncol=2)
| age | sex | bmi | children | smoker | region | charges | sex1 | resid1 | |
|---|---|---|---|---|---|---|---|---|---|
| 162 | 18 | female | 36.85 | 0 | yes | southeast | 36149.48 | 0 | 2.145304 |
| 804 | 18 | female | 42.24 | 0 | yes | southeast | 38792.69 | 0 | 2.215873 |
p1 <- sjPlot::plot_model(m1, type='pred', term='age')
p2 <- sjPlot::plot_model(m1, type='pred', term='sex')
grid.arrange(p1, p2, ncol=2)
| sex | age | fit | lwr | upr |
|---|---|---|---|---|
| male | 20 | 4,674 | 4,308 | 5,072 |
| male | 30 | 6,605 | 6,191 | 7,046 |
| male | 40 | 9,332 | 8,797 | 9,900 |
| male | 50 | 13,186 | 12,324 | 14,108 |
| male | 60 | 18,631 | 17,092 | 20,309 |
| female | 20 | 4,534 | 4,171 | 4,927 |
| female | 30 | 6,406 | 5,997 | 6,843 |
| female | 40 | 9,051 | 8,527 | 9,607 |
| female | 50 | 12,788 | 11,956 | 13,679 |
| female | 60 | 18,069 | 16,590 | 19,680 |
bmi, smoker, log(charges)dff <- df %>%
mutate(has_children = as.factor(if_else(children > 0, 1, 0)),
children = as.factor(children))
ggplot(dff, aes(y=log(charges), x=bmi, colour=as.factor(smoker))) +
geom_point() +
geom_smooth()
dff <- dff %>%
mutate(sex = if_else(sex == 'male', 1, 0),
smoker = if_else(smoker == 'yes', 1, 0),
bmi30 = if_else(bmi > 30, 1, 0))
ggplot(dff, aes(x=age, y=log(charges), colour=as.factor(smoker), shape=as.factor(bmi30))) +
geom_point() +
geom_smooth()
children into the dichotomous variable has_childrenregionage and smokerbmi into a variable representing if an individual’s BMI is greater or less than 30bmi30 should be modeled as interacting with smoker##
## Call:
## lm(formula = log(charges) ~ age * smoker + smoker * bmi30 + sex +
## has_children, data = dff)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.69908 -0.16728 -0.07812 0.01945 2.29029
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.054112 0.039707 177.654 < 2e-16 ***
## age 0.041692 0.000850 49.051 < 2e-16 ***
## smoker1 2.478735 0.081179 30.534 < 2e-16 ***
## bmi301 -0.006974 0.023956 -0.291 0.77101
## sex1 -0.082727 0.021333 -3.878 0.00011 ***
## has_children1 0.239078 0.021478 11.131 < 2e-16 ***
## age:smoker1 -0.033343 0.001891 -17.629 < 2e-16 ***
## smoker1:bmi301 0.690025 0.052798 13.069 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3882 on 1330 degrees of freedom
## Multiple R-squared: 0.8227, Adjusted R-squared: 0.8218
## F-statistic: 881.8 on 7 and 1330 DF, p-value: < 2.2e-16
## # Indices of model performance
##
## AIC | BIC | R2 | R2_adjusted | RMSE
## ---------------------------------------------
## 1274.68 | 1321.47 | 0.82 | 0.82 | 0.39
dff$resid2 <- resid(m2)
ggplot(dff, aes(x=age, y=resid2, colour=smoker, shape=bmi30)) + geom_point()
m2_pred_male <- expand.grid(age = seq(18, 64, 1),
smoker = unique(dff$smoker),
bmi30 = unique(dff$bmi30),
sex = as.factor(1),
has_children = unique(dff$has_children)) %>%
mutate(smokerx = if_else(smoker == 1, 'smoker', 'nonsmoker'),
has_childrenx = if_else(has_children == 1, 'children', 'childless'),
bmi30x = if_else(bmi30 == 1, 'high BMI', 'low BMI'),
customer = paste(smokerx, has_childrenx, bmi30x, sep='-'))
m2_pred_male$charges <- exp(predict(m2, m2_pred_male))
ggplot(m2_pred_male, aes(x=age, y=charges/10^3, colour=customer)) +
geom_line(size=1) +
theme(legend.position='bottom',
legend.text=element_text(size=6),
legend.title = element_blank())