Predicting Insurance costs with Multiple Linear Regression

Asel Mendis (s3669256) & Jenny Nguyen (3639386)

October 2, 2018

Introduction

Bayesian inference is becoming popular recently. It has been proven to be a productive method with an opposite approach compared to traditional statistics, its applications vary in a plenty of fields, especially in prediction medical cost. In this project, our aim is to analyse ** Personal Medical Cost** by using the Bayesian analysis methods. Some visualisations, model specifications, model implementation and diagnostic checking in JAGS will be mentioned. Three Simulated predictions will be given as an example of how the model being applied.

The dataset

The data set was downloaded from Kaggle. Below is a brief description of the dataset:

Applied Bayesian Method

Expert information - prior information

From doing research and running the regression on the actual dataset, we have

The prediction

we use the model to predict with these three queries:

The query predictions

The query predictions

graphics.off()
rm(list=ls())
setwd("C:/Users/mlcl.local/Downloads")
library(magrittr)
library(rjags)
library(coda)
library(runjags)
library(tufte)
data <- read.csv("insurance_data.csv",header = TRUE )
head(data)
##   age sex    bmi children smoker southwest southeast northeast northwest
## 1  19   1 27.900        0      1         1         0         0         0
## 2  18   0 33.770        1      0         0         1         0         0
## 3  28   0 33.000        3      0         0         1         0         0
## 4  33   0 22.705        0      0         0         0         0         1
## 5  32   0 28.880        0      0         0         0         0         1
## 6  31   1 25.740        0      0         0         1         0         0
##     charges
## 1 16884.924
## 2  1725.552
## 3  4449.462
## 4 21984.471
## 5  3866.855
## 6  3756.622

Charges distribution:

hist(data$charges, main="Histogram of Medical Charges",
xlab="Charges",
col="darkmagenta",
freq=FALSE, breaks = 20)

## Multiple predictors:

yName <- "charges"
xName <- c("age","sex","bmi","children","smoker","southwest","southeast",
            "northeast")
fileNameRoot1 <- "charges_reg"
numSavedSteps <- 3000
thinSteps <- 4

The model

Below is the JAGS model specification:

 modelString = "
  # Standardize the data:
  data {
    ym <- mean(y)
    ysd <- sd(y)
    for ( i in 1:Ntotal ) {
      zy[i] <- ( y[i] - ym ) / ysd
    }
    for ( j in 1:Nx ) {
      xm[j]  <- mean(x[,j])
      xsd[j] <- sd(x[,j])
      for ( i in 1:Ntotal ) {
        zx[i,j] <- ( x[i,j] - xm[j] ) / xsd[j]
      }
    }

    # Specify the priors for original beta parameters
    # Prior locations to reflect the expert information
    mu0 <-  ym # Set to overall mean a priori based on the interpretation of constant term in regression
    mu[1] <- 250 # age
    mu[2] <- -2000 # sex=female
    mu[3] <- 350 # bmi
    mu[4] <- 700 # number of children
    mu[5] <- 20000 # smoker=1
    mu[6] <- -1000 # southwest
    mu[7] <- 2000 # southeast
    mu[8] <- 200 # northeast
  

    # Prior variances to reflect the expert information    
    Var0 <- (ysd^2)*10
    Var[1] <- (xsd[1]^2)*10  # age
    Var[2] <- (xsd[2]^2)*10 # sex=female 
    Var[3] <- (xsd[3]^2)*10 # bmi
    Var[4] <- (xsd[4]^2)*10 # number of children
    Var[5] <- (xsd[5]^2)*10 # smoker=1
    Var[6] <- (xsd[6]^2)*10 # southwest
    Var[7] <- (xsd[7]^2)*10 # southeast
    Var[8] <- (xsd[8]^2)*10 # northeast
   

    # Compute corresponding prior means and variances for the standardised parameters
    muZ[1:Nx] <-  mu[1:Nx] * xsd[1:Nx] / ysd 

    muZ0 <- (mu0 + sum( mu[1:Nx] * xm[1:Nx] / xsd[1:Nx] )*ysd - ym) / ysd 

    # Compute corresponding prior variances and variances for the standardised parameters
    VarZ[1:Nx] <- Var[1:Nx] * ( xsd[1:Nx]/ ysd )^2
    VarZ0 <- Var0 / (ysd^2)

  }
  # Specify the model for standardized data:
  model {
    for ( i in 1:Ntotal ) {
      zy[i] ~ dt( zbeta0 + sum( zbeta[1:Nx] * zx[i,1:Nx] ) , 1/zsigma^2 , nu )
    }

    # Priors vague on standardized scale:
    zbeta0 ~ dnorm( sum(zbeta[1:Nx]*xm[1:Nx]/xsd[1:Nx] - ym/ysd), 1/VarZ0 )  
    for ( j in 1:Nx ) {
      zbeta[j] ~ dnorm( muZ[j] , 1/VarZ[j] )
    }
    zsigma ~ dunif( 1.0E-5 , 1.0E+1 )
    nu ~ dexp(1/30.0)

    # Transform to original scale:
    beta[1:Nx] <- ( zbeta[1:Nx] / xsd[1:Nx] )*ysd
    beta0 <- zbeta0*ysd  + ym - sum( zbeta[1:Nx] * xm[1:Nx] / xsd[1:Nx] )*ysd
    sigma <- zsigma*ysd

    # Compute predictions at every step of the MCMC

    pred1 <- beta0 + beta[1] * xPred1[1] + beta[2] * xPred1[2] + beta[3] * xPred1[3] + 
              beta[4] * xPred1[4] + beta[5] * xPred1[5] + beta[6]*xPred1[6] +
              beta[7]*xPred1[7] + beta[8]*xPred1[8] 

    pred2 <- beta0 + beta[1] * xPred2[1] + beta[2] * xPred2[2] + beta[3] * xPred2[3] + 
              beta[4] * xPred2[4] + beta[5] * xPred2[5] + beta[6]*xPred2[6] +
              beta[7]*xPred2[7] + beta[8]*xPred2[8] 

    pred3 <- beta0 + beta[1] * xPred3[1] + beta[2] * xPred3[2] + beta[3] * xPred3[3] + 
              beta[4] * xPred3[4] + beta[5] * xPred3[5] + beta[6]*xPred3[6] +
              beta[7]*xPred3[7] + beta[8]*xPred3[8] 


  }
  "

Chain Generation

names <- c("beta[1]","beta[2]","beta[3]","beta[4]","beta[5]","beta[6]",
            "beta[7]","beta[8]") 

xpred1 <- c(18,1,35,0,1,0,0,1)
names(xpred1) <- names

xpred2 <- c(40,0,25,2,1,1,0,0)
names(xpred2) <- names

xpred3 <- c(30,1,20,1,0,0,1,0)
names(xpred3) <- names
source("C:/Users/mlcl.local/Downloads/fpfunctions - Copy.R")
## 
## *********************************************************************
## Kruschke, J. K. (2015). Doing Bayesian Data Analysis, Second Edition:
## A Tutorial with R, JAGS, and Stan. Academic Press / Elsevier.
## *********************************************************************
source("C:/Users/mlcl.local/Downloads/DBDA2E-utilities.R")
## 
## *********************************************************************
## Kruschke, J. K. (2015). Doing Bayesian Data Analysis, Second Edition:
## A Tutorial with R, JAGS, and Stan. Academic Press / Elsevier.
## *********************************************************************
mcmccoda1 <- genMCMC(data=data, xName=xName, yName=yName, numSavedSteps=numSavedSteps,
                    thinSteps=thinSteps, saveName=fileNameRoot1, xPred1 = xpred1,
                    xPred2=xpred2, xPred3=xpred3)
## 
## CORRELATION MATRIX OF PREDICTORS:
##               age    sex    bmi children smoker southwest southeast
## age        1.000  0.021  0.109    0.042 -0.025     0.010    -0.012
## sex        0.021  1.000 -0.046   -0.017 -0.076     0.004    -0.017
## bmi        0.109 -0.046  1.000    0.013  0.004    -0.006     0.270
## children   0.042 -0.017  0.013    1.000  0.008     0.022    -0.023
## smoker    -0.025 -0.076  0.004    0.008  1.000    -0.037     0.068
## southwest  0.010  0.004 -0.006    0.022 -0.037     1.000    -0.346
## southeast -0.012 -0.017  0.270   -0.023  0.068    -0.346     1.000
## northeast  0.002  0.002 -0.138   -0.023  0.003    -0.320    -0.346
##           northeast
## age           0.002
## sex           0.002
## bmi          -0.138
## children     -0.023
## smoker        0.003
## southwest    -0.320
## southeast    -0.346
## northeast     1.000
## 
## Calling 3 simulations using the parallel method...
## Following the progress of chain 1 (the program will wait for all
## chains to finish before continuing):
## Welcome to JAGS 4.3.0 on Sat Oct 06 13:26:02 2018
## JAGS is free software and comes with ABSOLUTELY NO WARRANTY
## Loading module: basemod: ok
## Loading module: bugs: ok
## . . Reading data file data.txt
## . Compiling data graph
##    Resolving undeclared variables
##    Allocating nodes
##    Initializing
##    Reading data back into data table
## Compiling model graph
##    Resolving undeclared variables
##    Allocating nodes
## Graph information:
##    Observed stochastic nodes: 1338
##    Unobserved stochastic nodes: 11
##    Total graph size: 29581
## . Reading parameter file inits1.txt
## . Initializing model
## . Adapting 1000
## -------------------------------------------------| 1000
## ++++++++++++++++++++++++++++++++++++++++++++++++++ 100%
## Adaptation successful
## . Updating 1000
## -------------------------------------------------| 1000
## ************************************************** 100%
## . . . . . . . . . . . Updating 4000
## -------------------------------------------------| 4000
## ************************************************** 100%
## . . . . Updating 0
## . Deleting model
## . 
## All chains have finished
## Simulation complete.  Reading coda files...
## Coda files loaded successfully
## Finished running the simulation

Chain Diagnostics

paraName1 <- varnames(mcmccoda1)
for (pn1 in paraName1) {
    diagMCMC(codaObject=mcmccoda1, parName=pn1, saveName=fileNameRoot1, saveType="eps")
}
The first query predictions

The first query predictions

The second predictions

The second predictions

nu

nu

sigma

sigma

Beta0

Beta0

Beta1

Beta1

Beta2

Beta2

Beta3

Beta3

Beta4

Beta4

Beta5

Beta5

Beta6

Beta6

Beta7

Beta7

Beta8

Beta8

Summary Statitics of chain

summaryInfo1 = smryMCMC( mcmccoda1 , 
                        saveName=fileNameRoot1  )
show(summaryInfo1)
##                    Mean        Median          Mode    ESS HDImass
## CHAIN      2.000000e+00  2.000000e+00  2.997931e+00    1.5    0.95
## beta0     -8.404314e+03 -8.401870e+03 -8.399405e+03 2341.1    0.95
## beta[1]    2.640114e+02  2.641175e+02  2.649164e+02 3000.0    0.95
## beta[2]   -1.999806e+03 -1.999790e+03 -1.999538e+03 3000.0    0.95
## beta[3]    1.708239e+02  1.705800e+02  1.687798e+02 2115.9    0.95
## beta[4]    6.992959e+02  6.993410e+02  7.000364e+02 3000.0    0.95
## beta[5]    1.999998e+04  2.000000e+04  1.999985e+04 2722.5    0.95
## beta[6]   -9.999781e+02 -9.999760e+02 -1.000007e+03 3000.0    0.95
## beta[7]    1.999875e+03  1.999900e+03  2.000028e+03 3000.0    0.95
## beta[8]    1.999948e+02  1.999970e+02  2.000029e+02 3000.0    0.95
## sigma      2.785495e+03  2.782350e+03  2.791379e+03 1964.1    0.95
## nu         1.557408e+00  1.551405e+00  1.513528e+00 1815.8    0.95
## zbeta0    -1.539309e-01 -1.540360e-01 -1.541947e-01 2884.6    0.95
## zbeta[1]   3.063044e-01  3.064275e-01  3.073543e-01 3000.0    0.95
## zbeta[2]  -8.259466e-02 -8.259405e-02 -8.258352e-02 3000.0    0.95
## zbeta[3]   8.602106e-02  8.589815e-02  8.499186e-02 2115.9    0.95
## zbeta[4]   6.961151e-02  6.961600e-02  6.968517e-02 3000.0    0.95
## zbeta[5]   6.667106e-01  6.667100e-01  6.667064e-01 2723.5    0.95
## zbeta[6]  -3.542408e-02 -3.542400e-02 -3.542513e-02 3000.0    0.95
## zbeta[7]   7.351817e-02  7.351890e-02  7.352387e-02 3000.0    0.95
## zbeta[8]   7.077368e-03  7.077450e-03  7.077648e-03 3000.0    0.95
## zsigma     2.300159e-01  2.297560e-01  2.305017e-01 1964.1    0.95
## pred1      2.052690e+04  2.052915e+04  2.054154e+04 2688.3    0.95
## pred2      2.682534e+04  2.682590e+04  2.685085e+04 2816.0    0.95
## pred3      3.631870e+03  3.635070e+03  3.651573e+03 2494.5    0.95
## log10(nu)  1.912622e-01  1.907252e-01  1.912794e-01 1826.0    0.95
##                  HDIlow       HDIhigh CompVal PcntGtCompVal ROPElow
## CHAIN         1.0000000  3.000000e+00      NA            NA      NA
## beta0     -9455.6600000 -7.376300e+03      NA            NA      NA
## beta[1]     249.9170000  2.771760e+02      NA            NA      NA
## beta[2]   -2002.8300000 -1.996780e+03      NA            NA      NA
## beta[3]     137.7180000  2.014310e+02      NA            NA      NA
## beta[4]     692.2240000  7.068120e+02      NA            NA      NA
## beta[5]   19997.4000000  2.000250e+04      NA            NA      NA
## beta[6]   -1002.7000000 -9.972930e+02      NA            NA      NA
## beta[7]    1997.0100000  2.002600e+03      NA            NA      NA
## beta[8]     197.1160000  2.025550e+02      NA            NA      NA
## sigma      2506.1400000  3.088940e+03      NA            NA      NA
## nu            1.3311900  1.771660e+00      NA            NA      NA
## zbeta0       -0.1689830 -1.368990e-01      NA            NA      NA
## zbeta[1]      0.2899530  3.215780e-01      NA            NA      NA
## zbeta[2]     -0.0827195 -8.246950e-02      NA            NA      NA
## zbeta[3]      0.0693499  1.014340e-01      NA            NA      NA
## zbeta[4]      0.0689076  7.035970e-02      NA            NA      NA
## zbeta[5]      0.6666260  6.667960e-01      NA            NA      NA
## zbeta[6]     -0.0355205 -3.532890e-02      NA            NA      NA
## zbeta[7]      0.0734135  7.361890e-02      NA            NA      NA
## zbeta[8]      0.0069755  7.167980e-03      NA            NA      NA
## zsigma        0.2069480  2.550730e-01      NA            NA      NA
## pred1     20118.0000000  2.091660e+04      NA            NA      NA
## pred2     26584.7000000  2.707710e+04      NA            NA      NA
## pred3      3278.6700000  4.014430e+03      NA            NA      NA
## log10(nu)     0.1327334  2.560704e-01      NA            NA      NA
##           ROPEhigh PcntLtROPE PcntInROPE PcntGtROPE
## CHAIN           NA         NA         NA         NA
## beta0           NA         NA         NA         NA
## beta[1]         NA         NA         NA         NA
## beta[2]         NA         NA         NA         NA
## beta[3]         NA         NA         NA         NA
## beta[4]         NA         NA         NA         NA
## beta[5]         NA         NA         NA         NA
## beta[6]         NA         NA         NA         NA
## beta[7]         NA         NA         NA         NA
## beta[8]         NA         NA         NA         NA
## sigma           NA         NA         NA         NA
## nu              NA         NA         NA         NA
## zbeta0          NA         NA         NA         NA
## zbeta[1]        NA         NA         NA         NA
## zbeta[2]        NA         NA         NA         NA
## zbeta[3]        NA         NA         NA         NA
## zbeta[4]        NA         NA         NA         NA
## zbeta[5]        NA         NA         NA         NA
## zbeta[6]        NA         NA         NA         NA
## zbeta[7]        NA         NA         NA         NA
## zbeta[8]        NA         NA         NA         NA
## zsigma          NA         NA         NA         NA
## pred1           NA         NA         NA         NA
## pred2           NA         NA         NA         NA
## pred3           NA         NA         NA         NA
## log10(nu)       NA         NA         NA         NA
# Display posterior information:
plotMCMC( mcmccoda1 , data=data , xName=xName , yName=yName , 
          pairsPlot=TRUE , showCurve=FALSE ,
          saveName="posterior" , saveType="png" )
Post Pair

Post Pair

Posterior Coefficients

Posterior Coefficients

The plotPost function writes mode and \(95\%\) HDI over thehistogram of posterior distribution.

The mode values for beta0, beta1, beta2, beta3, beta4, beta5, beta6, beta7, beta8, sigma are -8330, 262, -2000, 171, 20000, -1000, 2000, 2000, 2780 respectively.

The charges predictions for each query are 20500, 26800, 3640 AUD respectively.