R Markdown

Read the file on Claim Levels

setwd("~/Dropbox/UDLAP/Cursos/2022 Primavera/Tema Selecto/Presentaciones")

ClaimLev <- read.csv("CLAIMLEVEL.csv")
head(ClaimLev)
##   PolicyNum ClaimNum Year ClaimStatus   Claim Deduct EntityType
## 1    120002 20100192 2010      Closed 6838.87   1000     County
## 2    120003 20080726 2007      Closed 2085.00   5000     County
## 3    120003 20081656 2007      Closed 7835.00   5000     County
## 4    120003 20081657 2007      Closed 3500.00   5000     County
## 5    120003 20081656 2007      Closed 1480.00   5000     County
## 6    120003 20081656 2007      Closed  600.00   5000     County
##                      Description CoverageGroup CoverageCode Fire5 CountyCode
## 1                lightningdamage            BC           VF     4        ASH
## 2   lightningdamageatComm.Center            BC           VF     0        BAR
## 3    lightningdamageatwatertower            BC           VF     0        BAR
## 4    lightningdamageatwatertower            BC           VF     0        BAR
## 5     lightningdamgetoradiotower            BC           VF     0        BAR
## 6 vandalismdamageatrecyclecenter            BC           VE     0        BAR
##    county
## 1 Ashland
## 2  Barron
## 3  Barron
## 4  Barron
## 5  Barron
## 6  Barron

We select 2010, we have 403 policyholders and 1377 claims.

ClaimData<-subset(ClaimLev,Year==2010); 
length(unique(ClaimData$PolicyNum))  
## [1] 403
NTot = nrow(ClaimData)
NTot
## [1] 1377

We can analyze claims for 2010, always remember that a descriptive analysis is necessary.

summary(ClaimData$Claim)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##        1      789     2250    26623     6171 12922218
sd(ClaimData$Claim)
## [1] 368029.7
summary(log(ClaimData$Claim))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   6.670   7.719   7.804   8.728  16.374
sd(log(ClaimData$Claim))
## [1] 1.683297

And plotting is always a good idea:

#histogram 
par(mfrow=c(1, 2))
hist(ClaimData$Claim, main="", xlab="Claims")
hist(log(ClaimData$Claim), main="", xlab="Logarithmic Claims")

ONE: We can start with the assumption that this is a LogNormal distribution, we transform our variable:

y <- log(ClaimData$Claim)
summary(y)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   6.670   7.719   7.804   8.728  16.374
sd(y)
## [1] 1.683297

To fit our model, we will use the VGAM library, we will use

library(VGAM)
## Loading required package: stats4
## Loading required package: splines
fit.LN <- vglm(Claim ~ 1, family = lognormal, data = ClaimData)
summary(fit.LN)
## 
## Call:
## vglm(formula = Claim ~ 1, family = lognormal, data = ClaimData)
## 
## Coefficients: 
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept):1  7.80422    0.04535  172.10   <2e-16 ***
## (Intercept):2  0.52039    0.01906   27.31   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Names of linear predictors: meanlog, loglink(sdlog)
## 
## Log-likelihood: -13416.87 on 2752 degrees of freedom
## 
## Number of Fisher scoring iterations: 3 
## 
## No Hauck-Donner effect found in any of the estimates

With the coefficients an confidence intervals:

coef(fit.LN)  
## (Intercept):1 (Intercept):2 
##     7.8042218     0.5203908
confint(fit.LN, level = 0.95)
##                   2.5 %    97.5 %
## (Intercept):1 7.7153457 7.8930978
## (Intercept):2 0.4830429 0.5577387

We can also estimate the loglikelihood from out model:

logLik(fit.LN)
## [1] -13416.87

Now, assuming a Gamma distribution:

fit.gamma <- vglm(Claim ~ 1, family = gamma2, data = ClaimData)
summary(fit.gamma)
## 
## Call:
## vglm(formula = Claim ~ 1, family = gamma2, data = ClaimData)
## 
## Coefficients: 
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept):1 10.18952    0.04999  203.82   <2e-16 ***
## (Intercept):2 -1.23582    0.03001  -41.17   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Names of linear predictors: loglink(mu), loglink(shape)
## 
## Log-likelihood: -14150.59 on 2752 degrees of freedom
## 
## Number of Fisher scoring iterations: 13 
## 
## No Hauck-Donner effect found in any of the estimates
coef(fit.gamma) 
## (Intercept):1 (Intercept):2 
##     10.189515     -1.235822

The models gives \(\mu\) and \(\alpha\). We can obtain the parameters: # theta = mu / alpha

theta <- exp(coef(fit.gamma)[1]) / exp(coef(fit.gamma)[2]) 
alpha <- exp(coef(fit.gamma)[2])
theta
## (Intercept):1 
##      91613.78
alpha
## (Intercept):2 
##     0.2905959

And now we can compare the fitted distribution to the observed values:

plot(density(log(ClaimData$Claim)), main = "", xlab = "Log Expenditures")
x <- seq(0, 15, by = 0.01)
fgamma_ex <- dgamma(exp(x), shape = alpha, scale = theta) * exp(x)
lines(x, fgamma_ex, col = "blue")

We can also use glm approach:

library(MASS)
fit.gamma_2 <- glm(Claim ~ 1, data = ClaimData, family = Gamma(link = log)) 
summary(fit.gamma_2, dispersion = gamma.dispersion(fit.gamma_2)) 
## 
## Call:
## glm(formula = Claim ~ 1, family = Gamma(link = log), data = ClaimData)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -4.287  -2.258  -1.764  -1.178  30.926  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) 10.18952    0.04999   203.8   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Gamma family taken to be 3.441204)
## 
##     Null deviance: 6569.1  on 1376  degrees of freedom
## Residual deviance: 6569.1  on 1376  degrees of freedom
## AIC: 28414
## 
## Number of Fisher Scoring iterations: 14

Again the parameters are given by:

theta <- exp(coef(fit.gamma_2)) * gamma.dispersion(fit.gamma_2)
alpha <- 1 / gamma.dispersion(fit.gamma_2)

theta
## (Intercept) 
##    91613.78
alpha
## [1] 0.2905959

Now, we use a Pareto distribution:

fit.pareto <- vglm(Claim ~ 1, paretoII, loc = 0, data = ClaimData)
summary(fit.pareto)
## 
## Call:
## vglm(formula = Claim ~ 1, family = paretoII, data = ClaimData, 
##     loc = 0)
## 
## Coefficients: 
##                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept):1  7.7329210  0.0933332  82.853   <2e-16 ***
## (Intercept):2 -0.0008753  0.0538642  -0.016    0.987    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Names of linear predictors: loglink(scale), loglink(shape)
## 
## Log-likelihood: -13404.64 on 2752 degrees of freedom
## 
## Number of Fisher scoring iterations: 5 
## 
## No Hauck-Donner effect found in any of the estimates

And an exponential distribution:

fit.exp <- vglm(Claim ~ 1, exponential, data = ClaimData)
## Warning in vglm.fitter(x = x, y = y, w = w, offset = offset, Xm2 = Xm2, :
## iterations terminated because half-step sizes are very small
## Warning in vglm.fitter(x = x, y = y, w = w, offset = offset, Xm2 = Xm2, : some
## quantities such as z, residuals, SEs may be inaccurate due to convergence at a
## half-step
summary(fit.exp)
## 
## Call:
## vglm(formula = Claim ~ 1, family = exponential, data = ClaimData)
## 
## Coefficients: 
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -10.18952    0.02695  -378.1   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Name of linear predictor: loglink(rate) 
## 
## Residual deviance: 6569.099 on 1376 degrees of freedom
## 
## Log-likelihood: -15407.96 on 1376 degrees of freedom
## 
## Number of Fisher scoring iterations: 6 
## 
## No Hauck-Donner effect found in any of the estimates

And now, we compare all the models:

# None of these distributions is doing a great job....
plot(density(log(ClaimData$Claim)), main = "", xlab = "Log Expenditures",
     ylim = c(0 ,0.37))
x <- seq(0, 15, by = 0.01)
fexp_ex <- dgamma(exp(x), scale = exp(-coef(fit.exp)), shape = 1) * exp(x)
lines(x, fexp_ex, col = "red")
fgamma_ex <- dgamma(exp(x), shape = alpha, scale = theta) * exp(x)
lines(x, fgamma_ex, col = "blue")
fpareto_ex <- dparetoII(exp(x), loc = 0, shape = exp(coef(fit.pareto)[2]), 
                        scale = exp(coef(fit.pareto)[1])) * exp(x)
lines(x, fpareto_ex, col = "purple")
flnorm_ex <- dlnorm(exp(x), mean = coef(fit.LN)[1],
                    sd = exp(coef(fit.LN)[2])) * exp(x)
lines(x, flnorm_ex, col = "lightblue")

legend("topleft", c("log(ClaimData$Claim)", "Exponential", "Gamma", "Pareto", 
                    "lognormal"), 
       lty = 1, col = c("black","red","blue","purple","lightblue"))