library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.2.1     ✓ purrr   0.3.3
## ✓ tibble  2.1.3     ✓ dplyr   0.8.4
## ✓ tidyr   1.0.2     ✓ stringr 1.4.0
## ✓ readr   1.3.1     ✓ forcats 0.4.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(ggthemes)
library(psych)
## 
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
library(relaimpo)
## Loading required package: MASS
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
## Loading required package: boot
## 
## Attaching package: 'boot'
## The following object is masked from 'package:psych':
## 
##     logit
## Loading required package: survey
## Loading required package: grid
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## Loading required package: survival
## 
## Attaching package: 'survival'
## The following object is masked from 'package:boot':
## 
##     aml
## 
## Attaching package: 'survey'
## The following object is masked from 'package:graphics':
## 
##     dotchart
## Loading required package: mitools
## This is the global version of package relaimpo.
## If you are a non-US user, a version with the interesting additional metric pmvd is available
## from Ulrike Groempings web site at prof.beuth-hochschule.de/groemping.
library(ggthemr)
library(gridExtra)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
library(GGally)
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
## 
## Attaching package: 'GGally'
## The following object is masked from 'package:dplyr':
## 
##     nasa
ggthemr("flat", type = "outer")
## Warning: New theme missing the following elements: axis.ticks.length.x,
## axis.ticks.length.x.top, axis.ticks.length.x.bottom, axis.ticks.length.y,
## axis.ticks.length.y.left, axis.ticks.length.y.right
insurance <- read.csv("~/Documents/insurance.csv")
sum(is.na(insurance))
## [1] 0
str(insurance)
## 'data.frame':    1338 obs. of  7 variables:
##  $ age     : int  19 18 28 33 32 31 46 37 37 60 ...
##  $ sex     : Factor w/ 2 levels "female","male": 1 2 2 2 2 1 1 1 2 1 ...
##  $ bmi     : num  27.9 33.8 33 22.7 28.9 ...
##  $ children: int  0 1 3 0 0 0 1 3 2 0 ...
##  $ smoker  : Factor w/ 2 levels "no","yes": 2 1 1 1 1 1 1 1 1 1 ...
##  $ region  : Factor w/ 4 levels "northeast","northwest",..: 4 3 3 2 2 3 3 2 1 2 ...
##  $ charges : num  16885 1726 4449 21984 3867 ...
insurance2 <- insurance %>%  
  mutate_if(is.character, as.factor) %>% 
  mutate_if(is.numeric, as.integer)

insurance2$charges <- as.numeric(insurance2$charges)
str(insurance2)
## 'data.frame':    1338 obs. of  7 variables:
##  $ age     : int  19 18 28 33 32 31 46 37 37 60 ...
##  $ sex     : Factor w/ 2 levels "female","male": 1 2 2 2 2 1 1 1 2 1 ...
##  $ bmi     : int  27 33 33 22 28 25 33 27 29 25 ...
##  $ children: int  0 1 3 0 0 0 1 3 2 0 ...
##  $ smoker  : Factor w/ 2 levels "no","yes": 2 1 1 1 1 1 1 1 1 1 ...
##  $ region  : Factor w/ 4 levels "northeast","northwest",..: 4 3 3 2 2 3 3 2 1 2 ...
##  $ charges : num  16884 1725 4449 21984 3866 ...
head(insurance2, n = 10)
##    age    sex bmi children smoker    region charges
## 1   19 female  27        0    yes southwest   16884
## 2   18   male  33        1     no southeast    1725
## 3   28   male  33        3     no southeast    4449
## 4   33   male  22        0     no northwest   21984
## 5   32   male  28        0     no northwest    3866
## 6   31 female  25        0     no southeast    3756
## 7   46 female  33        1     no southeast    8240
## 8   37 female  27        3     no northwest    7281
## 9   37   male  29        2     no northeast    6406
## 10  60 female  25        0     no northwest   28923
summary(insurance2)
##       age            sex           bmi           children     smoker    
##  Min.   :18.00   female:662   Min.   :15.00   Min.   :0.000   no :1064  
##  1st Qu.:27.00   male  :676   1st Qu.:26.00   1st Qu.:0.000   yes: 274  
##  Median :39.00                Median :30.00   Median :1.000             
##  Mean   :39.21                Mean   :30.17   Mean   :1.095             
##  3rd Qu.:51.00                3rd Qu.:34.00   3rd Qu.:2.000             
##  Max.   :64.00                Max.   :53.00   Max.   :5.000             
##        region       charges     
##  northeast:324   Min.   : 1121  
##  northwest:325   1st Qu.: 4740  
##  southeast:364   Median : 9382  
##  southwest:325   Mean   :13270  
##                  3rd Qu.:16639  
##                  Max.   :63770
ggplot(insurance2, aes(x = region, y = charges, fill = region)) +
  geom_boxplot()

anova1 <- aov(charges~region, data = insurance2)
summary(anova1)
##               Df    Sum Sq   Mean Sq F value Pr(>F)  
## region         3 1.301e+09 433582488    2.97 0.0309 *
## Residuals   1334 1.948e+11 146007139                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
TukeyHSD(anova1)
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = charges ~ region, data = insurance2)
## 
## $region
##                            diff        lwr        upr     p adj
## northwest-northeast  -988.83972 -3428.9653 1451.28586 0.7245052
## southeast-northeast  1328.99335 -1044.9756 3702.96232 0.4745273
## southwest-northeast -1059.45819 -3499.5838 1380.66740 0.6792015
## southeast-northwest  2317.83308   -54.2028 4689.86896 0.0582944
## southwest-northwest   -70.61846 -2508.8634 2367.62648 0.9998517
## southwest-southeast -2388.45154 -4760.4874  -16.41566 0.0476927
ggplot(insurance2, aes(x = smoker, y = charges, fill = smoker)) +
  geom_boxplot() +
  labs(x = "Smoker", y = "Charges", title = "Difference charges between smoker status")

ttest <- t.test(charges~smoker, data = insurance2)
ttest
## 
##  Welch Two Sample t-test
## 
## data:  charges by smoker
## t = -32.752, df = 311.85, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -25034.70 -22197.19
## sample estimates:
##  mean in group no mean in group yes 
##          8433.778         32049.726
ggplot(insurance2, aes(x = as.factor(children), y = charges, fill = as.factor(children))) +
  geom_boxplot() +
  labs(x = "Total children", 
       y = "Charges", 
       title = "Total children vs charges", 
       fill = "children")

anovachild <- insurance2
anovachild$children <- as.factor(anovachild$children)

anova2 <- aov(charges~children, data = anovachild)
summary(anova1)
##               Df    Sum Sq   Mean Sq F value Pr(>F)  
## region         3 1.301e+09 433582488    2.97 0.0309 *
## Residuals   1334 1.948e+11 146007139                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
TukeyHSD(anova2)
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = charges ~ children, data = anovachild)
## 
## $children
##           diff          lwr      upr     p adj
## 1-0   365.2451  -2026.03603 2756.526 0.9980109
## 2-0  2707.6050     62.32555 5352.884 0.0412915
## 3-0  2989.3855   -109.98749 6088.758 0.0660080
## 4-0  1484.7383  -5546.12058 8515.597 0.9908596
## 5-0 -3579.9617 -11817.35070 4657.427 0.8169306
## 2-1  2342.3599   -588.41425 5273.134 0.2025477
## 3-1  2624.1403   -722.20786 5970.489 0.2210609
## 4-1  1119.4932  -6023.67910 8262.666 0.9977502
## 5-1 -3945.2068 -12278.66415 4388.251 0.7561940
## 3-2   281.7805  -3250.54513 3814.106 0.9999166
## 4-2 -1222.8667  -8455.03014 6009.297 0.9967699
## 5-2 -6287.5667 -14697.42930 2122.296 0.2705069
## 4-3 -1504.6471  -8914.96412 5905.670 0.9923753
## 5-3 -6569.3471 -15132.89773 1994.203 0.2433497
## 5-4 -5064.7000 -15702.42829 5573.028 0.7517128
ggplot(insurance2, aes(x = age, y = charges, colour = charges)) +
  geom_point(alpha = 0.5, size = 2.5)

ggplot(insurance2, aes(x = bmi, y = charges)) +
  geom_point()

insurance2$sex <- as.numeric(insurance2$sex)
insurance2$smoker <- as.numeric(insurance2$smoker)
ggcorr(insurance2, label = TRUE)
## Warning in ggcorr(insurance2, label = TRUE): data in column(s) 'region' are not
## numeric and were ignored

insurance2$bmi30 <- as.factor(ifelse(insurance2$bmi >= 30, "yes", "no"))

g1 <- ggplot(insurance2, aes(x = bmi30,fill = bmi30)) +
  geom_bar()

g2 <- ggplot(insurance2, aes(x = bmi30, y = charges, fill = bmi30)) +
  geom_jitter(aes(colour = bmi30), alpha = 0.7)


grid.arrange(g1, g2, layout_matrix = rbind(c(1,2)))

model1 <- lm(charges ~., data = insurance2)
summary(model1)
## 
## Call:
## lm(formula = charges ~ ., data = insurance2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -12013.6  -3441.7   -117.7   1549.9  28461.6 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     -31223.43    1422.22 -21.954  < 2e-16 ***
## age                257.29      11.78  21.833  < 2e-16 ***
## sex               -161.22     329.82  -0.489 0.625057    
## bmi                147.17      46.39   3.172 0.001546 ** 
## children           478.98     136.49   3.509 0.000464 ***
## smoker           23851.91     409.23  58.285  < 2e-16 ***
## regionnorthwest   -385.76     471.79  -0.818 0.413703    
## regionsoutheast   -884.52     475.28  -1.861 0.062956 .  
## regionsouthwest   -952.51     473.45  -2.012 0.044439 *  
## bmi30yes          2863.81     552.30   5.185 2.49e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6004 on 1328 degrees of freedom
## Multiple R-squared:  0.7558, Adjusted R-squared:  0.7542 
## F-statistic: 456.8 on 9 and 1328 DF,  p-value: < 2.2e-16
model2 <- lm(charges~age+smoker+children, data = insurance2)
summary(model2)
## 
## Call:
## lm(formula = charges ~ age + smoker + children, data = insurance2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -15547.5  -1941.1  -1319.1   -425.6  29312.9 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -26695.09     753.68 -35.420  < 2e-16 ***
## age            273.09      12.42  21.990  < 2e-16 ***
## smoker       23842.59     431.84  55.211  < 2e-16 ***
## children       486.66     144.70   3.363 0.000792 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6372 on 1334 degrees of freedom
## Multiple R-squared:  0.7237, Adjusted R-squared:  0.7231 
## F-statistic:  1165 on 3 and 1334 DF,  p-value: < 2.2e-16
pre_charges <- function(m, a, b, c){

    pre_new <- predict(m, data.frame(age=a, bmi=b, children=c))

    msg <- paste("age: ",a,", bmi: ",b,", children: ",c,"   ==> Expect Charges: $",round(pre_new),sep="")

    print(msg)

}
model <- lm(charges ~ age+bmi+children, data=insurance2)

pre_charges(model, 19, 30, 0)
## [1] "age: 19, bmi: 30, children: 0   ==> Expect Charges: $7762"
ins_model2_shapley<-calc.relimp(model1,type="lmg")
ins_model2_shapley
## Response variable: charges 
## Total response variance: 146652409 
## Analysis based on 1338 observations 
## 
## 9 Regressors: 
## Some regressors combined in groups: 
##         Group  region : regionnorthwest regionsoutheast regionsouthwest 
## 
##  Relative importance of 7 (groups of) regressors assessed: 
##  region age sex bmi children smoker bmi30 
##  
## Proportion of variance explained by model: 75.58%
## Metrics are not normalized (rela=FALSE). 
## 
## Relative importance metrics: 
## 
##                  lmg
## region   0.003071332
## age      0.087382604
## sex      0.001513631
## bmi      0.018013359
## children 0.003402283
## smoker   0.622028904
## bmi30    0.020414206
## 
## Average coefficients for different model sizes: 
## 
##                     1group    2groups    3groups    4groups    5groups
## age               257.7231   255.8374   254.7976   254.4763   254.7951
## sex              1387.2042  1086.1731   805.9138   543.1028   295.2556
## bmi               388.2178   344.7524   302.0902   260.6158   220.7135
## children          683.0984   648.2967   614.0411   580.0646   546.2525
## smoker          23615.9481 23643.4783 23682.8824 23728.2164 23774.3010
## regionnorthwest  -988.8397  -896.3165  -799.9287  -700.2098  -597.6584
## regionsoutheast  1328.9934   721.2865   217.3227  -189.4597  -505.1292
## regionsouthwest -1059.4582 -1132.0797 -1164.6940 -1160.3356 -1121.8360
## bmi30            4838.6913  4379.3436  3992.1244  3662.2875  3374.6943
##                     6groups    7groups
## age               255.72604   257.2932
## sex                60.73655  -161.2197
## bmi               182.76829   147.1667
## children          512.56887   478.9784
## smoker          23816.73569 23851.9059
## regionnorthwest  -492.72012  -385.7622
## regionsoutheast  -735.19306  -884.5189
## regionsouthwest -1051.79022  -952.5129
## bmi30            3113.82664  2863.8090
ins_model2_shapley$lmg
##      region         age         sex         bmi    children      smoker 
## 0.003071332 0.087382604 0.001513631 0.018013359 0.003402283 0.622028904 
##       bmi30 
## 0.020414206
barplot(sort(ins_model2_shapley$lmg,decreasing = TRUE),col=c(1:7),main="Relative Importance of Predictors",xlab="Predictor Labels",ylab="Shapley Value Regression",font.lab=2)