library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.2.1 ✓ purrr 0.3.3
## ✓ tibble 2.1.3 ✓ dplyr 0.8.4
## ✓ tidyr 1.0.2 ✓ stringr 1.4.0
## ✓ readr 1.3.1 ✓ forcats 0.4.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(ggthemes)
library(psych)
##
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
library(relaimpo)
## Loading required package: MASS
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
## Loading required package: boot
##
## Attaching package: 'boot'
## The following object is masked from 'package:psych':
##
## logit
## Loading required package: survey
## Loading required package: grid
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
## Loading required package: survival
##
## Attaching package: 'survival'
## The following object is masked from 'package:boot':
##
## aml
##
## Attaching package: 'survey'
## The following object is masked from 'package:graphics':
##
## dotchart
## Loading required package: mitools
## This is the global version of package relaimpo.
## If you are a non-US user, a version with the interesting additional metric pmvd is available
## from Ulrike Groempings web site at prof.beuth-hochschule.de/groemping.
library(ggthemr)
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
library(GGally)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
##
## Attaching package: 'GGally'
## The following object is masked from 'package:dplyr':
##
## nasa
ggthemr("flat", type = "outer")
## Warning: New theme missing the following elements: axis.ticks.length.x,
## axis.ticks.length.x.top, axis.ticks.length.x.bottom, axis.ticks.length.y,
## axis.ticks.length.y.left, axis.ticks.length.y.right
insurance <- read.csv("~/Documents/insurance.csv")
sum(is.na(insurance))
## [1] 0
str(insurance)
## 'data.frame': 1338 obs. of 7 variables:
## $ age : int 19 18 28 33 32 31 46 37 37 60 ...
## $ sex : Factor w/ 2 levels "female","male": 1 2 2 2 2 1 1 1 2 1 ...
## $ bmi : num 27.9 33.8 33 22.7 28.9 ...
## $ children: int 0 1 3 0 0 0 1 3 2 0 ...
## $ smoker : Factor w/ 2 levels "no","yes": 2 1 1 1 1 1 1 1 1 1 ...
## $ region : Factor w/ 4 levels "northeast","northwest",..: 4 3 3 2 2 3 3 2 1 2 ...
## $ charges : num 16885 1726 4449 21984 3867 ...
insurance2 <- insurance %>%
mutate_if(is.character, as.factor) %>%
mutate_if(is.numeric, as.integer)
insurance2$charges <- as.numeric(insurance2$charges)
str(insurance2)
## 'data.frame': 1338 obs. of 7 variables:
## $ age : int 19 18 28 33 32 31 46 37 37 60 ...
## $ sex : Factor w/ 2 levels "female","male": 1 2 2 2 2 1 1 1 2 1 ...
## $ bmi : int 27 33 33 22 28 25 33 27 29 25 ...
## $ children: int 0 1 3 0 0 0 1 3 2 0 ...
## $ smoker : Factor w/ 2 levels "no","yes": 2 1 1 1 1 1 1 1 1 1 ...
## $ region : Factor w/ 4 levels "northeast","northwest",..: 4 3 3 2 2 3 3 2 1 2 ...
## $ charges : num 16884 1725 4449 21984 3866 ...
head(insurance2, n = 10)
## age sex bmi children smoker region charges
## 1 19 female 27 0 yes southwest 16884
## 2 18 male 33 1 no southeast 1725
## 3 28 male 33 3 no southeast 4449
## 4 33 male 22 0 no northwest 21984
## 5 32 male 28 0 no northwest 3866
## 6 31 female 25 0 no southeast 3756
## 7 46 female 33 1 no southeast 8240
## 8 37 female 27 3 no northwest 7281
## 9 37 male 29 2 no northeast 6406
## 10 60 female 25 0 no northwest 28923
summary(insurance2)
## age sex bmi children smoker
## Min. :18.00 female:662 Min. :15.00 Min. :0.000 no :1064
## 1st Qu.:27.00 male :676 1st Qu.:26.00 1st Qu.:0.000 yes: 274
## Median :39.00 Median :30.00 Median :1.000
## Mean :39.21 Mean :30.17 Mean :1.095
## 3rd Qu.:51.00 3rd Qu.:34.00 3rd Qu.:2.000
## Max. :64.00 Max. :53.00 Max. :5.000
## region charges
## northeast:324 Min. : 1121
## northwest:325 1st Qu.: 4740
## southeast:364 Median : 9382
## southwest:325 Mean :13270
## 3rd Qu.:16639
## Max. :63770
ggplot(insurance2, aes(x = region, y = charges, fill = region)) +
geom_boxplot()

anova1 <- aov(charges~region, data = insurance2)
summary(anova1)
## Df Sum Sq Mean Sq F value Pr(>F)
## region 3 1.301e+09 433582488 2.97 0.0309 *
## Residuals 1334 1.948e+11 146007139
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
TukeyHSD(anova1)
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = charges ~ region, data = insurance2)
##
## $region
## diff lwr upr p adj
## northwest-northeast -988.83972 -3428.9653 1451.28586 0.7245052
## southeast-northeast 1328.99335 -1044.9756 3702.96232 0.4745273
## southwest-northeast -1059.45819 -3499.5838 1380.66740 0.6792015
## southeast-northwest 2317.83308 -54.2028 4689.86896 0.0582944
## southwest-northwest -70.61846 -2508.8634 2367.62648 0.9998517
## southwest-southeast -2388.45154 -4760.4874 -16.41566 0.0476927
ggplot(insurance2, aes(x = smoker, y = charges, fill = smoker)) +
geom_boxplot() +
labs(x = "Smoker", y = "Charges", title = "Difference charges between smoker status")

ttest <- t.test(charges~smoker, data = insurance2)
ttest
##
## Welch Two Sample t-test
##
## data: charges by smoker
## t = -32.752, df = 311.85, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -25034.70 -22197.19
## sample estimates:
## mean in group no mean in group yes
## 8433.778 32049.726
ggplot(insurance2, aes(x = as.factor(children), y = charges, fill = as.factor(children))) +
geom_boxplot() +
labs(x = "Total children",
y = "Charges",
title = "Total children vs charges",
fill = "children")

anovachild <- insurance2
anovachild$children <- as.factor(anovachild$children)
anova2 <- aov(charges~children, data = anovachild)
summary(anova1)
## Df Sum Sq Mean Sq F value Pr(>F)
## region 3 1.301e+09 433582488 2.97 0.0309 *
## Residuals 1334 1.948e+11 146007139
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
TukeyHSD(anova2)
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = charges ~ children, data = anovachild)
##
## $children
## diff lwr upr p adj
## 1-0 365.2451 -2026.03603 2756.526 0.9980109
## 2-0 2707.6050 62.32555 5352.884 0.0412915
## 3-0 2989.3855 -109.98749 6088.758 0.0660080
## 4-0 1484.7383 -5546.12058 8515.597 0.9908596
## 5-0 -3579.9617 -11817.35070 4657.427 0.8169306
## 2-1 2342.3599 -588.41425 5273.134 0.2025477
## 3-1 2624.1403 -722.20786 5970.489 0.2210609
## 4-1 1119.4932 -6023.67910 8262.666 0.9977502
## 5-1 -3945.2068 -12278.66415 4388.251 0.7561940
## 3-2 281.7805 -3250.54513 3814.106 0.9999166
## 4-2 -1222.8667 -8455.03014 6009.297 0.9967699
## 5-2 -6287.5667 -14697.42930 2122.296 0.2705069
## 4-3 -1504.6471 -8914.96412 5905.670 0.9923753
## 5-3 -6569.3471 -15132.89773 1994.203 0.2433497
## 5-4 -5064.7000 -15702.42829 5573.028 0.7517128
ggplot(insurance2, aes(x = age, y = charges, colour = charges)) +
geom_point(alpha = 0.5, size = 2.5)

ggplot(insurance2, aes(x = bmi, y = charges)) +
geom_point()

insurance2$sex <- as.numeric(insurance2$sex)
insurance2$smoker <- as.numeric(insurance2$smoker)
ggcorr(insurance2, label = TRUE)
## Warning in ggcorr(insurance2, label = TRUE): data in column(s) 'region' are not
## numeric and were ignored

insurance2$bmi30 <- as.factor(ifelse(insurance2$bmi >= 30, "yes", "no"))
g1 <- ggplot(insurance2, aes(x = bmi30,fill = bmi30)) +
geom_bar()
g2 <- ggplot(insurance2, aes(x = bmi30, y = charges, fill = bmi30)) +
geom_jitter(aes(colour = bmi30), alpha = 0.7)
grid.arrange(g1, g2, layout_matrix = rbind(c(1,2)))

model1 <- lm(charges ~., data = insurance2)
summary(model1)
##
## Call:
## lm(formula = charges ~ ., data = insurance2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -12013.6 -3441.7 -117.7 1549.9 28461.6
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -31223.43 1422.22 -21.954 < 2e-16 ***
## age 257.29 11.78 21.833 < 2e-16 ***
## sex -161.22 329.82 -0.489 0.625057
## bmi 147.17 46.39 3.172 0.001546 **
## children 478.98 136.49 3.509 0.000464 ***
## smoker 23851.91 409.23 58.285 < 2e-16 ***
## regionnorthwest -385.76 471.79 -0.818 0.413703
## regionsoutheast -884.52 475.28 -1.861 0.062956 .
## regionsouthwest -952.51 473.45 -2.012 0.044439 *
## bmi30yes 2863.81 552.30 5.185 2.49e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6004 on 1328 degrees of freedom
## Multiple R-squared: 0.7558, Adjusted R-squared: 0.7542
## F-statistic: 456.8 on 9 and 1328 DF, p-value: < 2.2e-16
model2 <- lm(charges~age+smoker+children, data = insurance2)
summary(model2)
##
## Call:
## lm(formula = charges ~ age + smoker + children, data = insurance2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15547.5 -1941.1 -1319.1 -425.6 29312.9
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -26695.09 753.68 -35.420 < 2e-16 ***
## age 273.09 12.42 21.990 < 2e-16 ***
## smoker 23842.59 431.84 55.211 < 2e-16 ***
## children 486.66 144.70 3.363 0.000792 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6372 on 1334 degrees of freedom
## Multiple R-squared: 0.7237, Adjusted R-squared: 0.7231
## F-statistic: 1165 on 3 and 1334 DF, p-value: < 2.2e-16
pre_charges <- function(m, a, b, c){
pre_new <- predict(m, data.frame(age=a, bmi=b, children=c))
msg <- paste("age: ",a,", bmi: ",b,", children: ",c," ==> Expect Charges: $",round(pre_new),sep="")
print(msg)
}
model <- lm(charges ~ age+bmi+children, data=insurance2)
pre_charges(model, 19, 30, 0)
## [1] "age: 19, bmi: 30, children: 0 ==> Expect Charges: $7762"
ins_model2_shapley<-calc.relimp(model1,type="lmg")
ins_model2_shapley
## Response variable: charges
## Total response variance: 146652409
## Analysis based on 1338 observations
##
## 9 Regressors:
## Some regressors combined in groups:
## Group region : regionnorthwest regionsoutheast regionsouthwest
##
## Relative importance of 7 (groups of) regressors assessed:
## region age sex bmi children smoker bmi30
##
## Proportion of variance explained by model: 75.58%
## Metrics are not normalized (rela=FALSE).
##
## Relative importance metrics:
##
## lmg
## region 0.003071332
## age 0.087382604
## sex 0.001513631
## bmi 0.018013359
## children 0.003402283
## smoker 0.622028904
## bmi30 0.020414206
##
## Average coefficients for different model sizes:
##
## 1group 2groups 3groups 4groups 5groups
## age 257.7231 255.8374 254.7976 254.4763 254.7951
## sex 1387.2042 1086.1731 805.9138 543.1028 295.2556
## bmi 388.2178 344.7524 302.0902 260.6158 220.7135
## children 683.0984 648.2967 614.0411 580.0646 546.2525
## smoker 23615.9481 23643.4783 23682.8824 23728.2164 23774.3010
## regionnorthwest -988.8397 -896.3165 -799.9287 -700.2098 -597.6584
## regionsoutheast 1328.9934 721.2865 217.3227 -189.4597 -505.1292
## regionsouthwest -1059.4582 -1132.0797 -1164.6940 -1160.3356 -1121.8360
## bmi30 4838.6913 4379.3436 3992.1244 3662.2875 3374.6943
## 6groups 7groups
## age 255.72604 257.2932
## sex 60.73655 -161.2197
## bmi 182.76829 147.1667
## children 512.56887 478.9784
## smoker 23816.73569 23851.9059
## regionnorthwest -492.72012 -385.7622
## regionsoutheast -735.19306 -884.5189
## regionsouthwest -1051.79022 -952.5129
## bmi30 3113.82664 2863.8090
ins_model2_shapley$lmg
## region age sex bmi children smoker
## 0.003071332 0.087382604 0.001513631 0.018013359 0.003402283 0.622028904
## bmi30
## 0.020414206
barplot(sort(ins_model2_shapley$lmg,decreasing = TRUE),col=c(1:7),main="Relative Importance of Predictors",xlab="Predictor Labels",ylab="Shapley Value Regression",font.lab=2)
