Read the file with the claim-level data:
setwd("~/Dropbox/UDLAP/Cursos/2022 Primavera/Tema Selecto/Presentaciones")
ClaimLev <- read.csv("CLAIMLEVEL.csv")
head(ClaimLev)
## PolicyNum ClaimNum Year ClaimStatus Claim Deduct EntityType
## 1 120002 20100192 2010 Closed 6838.87 1000 County
## 2 120003 20080726 2007 Closed 2085.00 5000 County
## 3 120003 20081656 2007 Closed 7835.00 5000 County
## 4 120003 20081657 2007 Closed 3500.00 5000 County
## 5 120003 20081656 2007 Closed 1480.00 5000 County
## 6 120003 20081656 2007 Closed 600.00 5000 County
## Description CoverageGroup CoverageCode Fire5 CountyCode
## 1 lightningdamage BC VF 4 ASH
## 2 lightningdamageatComm.Center BC VF 0 BAR
## 3 lightningdamageatwatertower BC VF 0 BAR
## 4 lightningdamageatwatertower BC VF 0 BAR
## 5 lightningdamgetoradiotower BC VF 0 BAR
## 6 vandalismdamageatrecyclecenter BC VE 0 BAR
## county
## 1 Ashland
## 2 Barron
## 3 Barron
## 4 Barron
## 5 Barron
## 6 Barron
We select the year 2010, which leaves us with 403 policyholders and 1377 claims.
ClaimData <- subset(ClaimLev, Year == 2010)
length(unique(ClaimData$PolicyNum))
## [1] 403
NTot = nrow(ClaimData)
NTot
## [1] 1377
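As a quick check of the panel structure (a small sketch, not part of the original code; ClaimsPerPolicy is just an illustrative name), we can look at how the 1377 claims are spread across the 403 policyholders:
# Number of claims per policyholder in 2010 (output omitted)
ClaimsPerPolicy <- table(ClaimData$PolicyNum)
summary(as.numeric(ClaimsPerPolicy))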
We can now analyze the claims for 2010; always remember that a descriptive analysis is a necessary first step.
summary(ClaimData$Claim)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1 789 2250 26623 6171 12922218
sd(ClaimData$Claim)
## [1] 368029.7
summary(log(ClaimData$Claim))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 6.670 7.719 7.804 8.728 16.374
sd(log(ClaimData$Claim))
## [1] 1.683297
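The mean lying far above the third quartile already signals heavy right skewness; a simple skewness measure makes this explicit (a hand-rolled sketch using one common definition, not part of the original analysis):
# Rough sample skewness: third central moment divided by sd cubed
skewness <- function(x) mean((x - mean(x))^3) / sd(x)^3
skewness(ClaimData$Claim)       # skewness on the original scale
skewness(log(ClaimData$Claim))  # skewness on the log scale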
And plotting is always a good idea:
# Histograms of claims on the original and log scales
par(mfrow=c(1, 2))
hist(ClaimData$Claim, main="", xlab="Claims")
hist(log(ClaimData$Claim), main="", xlab="Logarithmic Claims")
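If the lognormal model considered next is sensible, the log claims should look roughly normal; a normal Q-Q plot is a quick visual check (an extra step, not in the original code):
# Reset the layout and draw a normal Q-Q plot of the log claims
par(mfrow = c(1, 1))
qqnorm(log(ClaimData$Claim), main = "")
qqline(log(ClaimData$Claim), col = "red")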
First, we start from the assumption that the claims follow a lognormal distribution and transform our variable:
y <- log(ClaimData$Claim)
summary(y)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 6.670 7.719 7.804 8.728 16.374
sd(y)
## [1] 1.683297
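For an intercept-only lognormal model the maximum likelihood estimates have a closed form, so we can anticipate the coefficients that vglm() reports below: the first intercept is the sample mean of y, and the second is the log of the standard deviation of y computed with divisor n instead of n - 1 (a small sketch to make the link explicit):
# Closed-form MLEs for the lognormal parameters (intercept-only model)
meanlog_hat <- mean(y)
sdlog_hat <- sqrt(mean((y - mean(y))^2))  # divisor n, not n - 1
c(meanlog = meanlog_hat, log_sdlog = log(sdlog_hat))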
To fit our model, we will use the vglm() function from the VGAM library:
library(VGAM)
## Loading required package: stats4
## Loading required package: splines
fit.LN <- vglm(Claim ~ 1, family = lognormal, data = ClaimData)
summary(fit.LN)
##
## Call:
## vglm(formula = Claim ~ 1, family = lognormal, data = ClaimData)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept):1 7.80422 0.04535 172.10 <2e-16 ***
## (Intercept):2 0.52039 0.01906 27.31 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Names of linear predictors: meanlog, loglink(sdlog)
##
## Log-likelihood: -13416.87 on 2752 degrees of freedom
##
## Number of Fisher scoring iterations: 3
##
## No Hauck-Donner effect found in any of the estimates
The coefficients and confidence intervals are:
coef(fit.LN)
## (Intercept):1 (Intercept):2
## 7.8042218 0.5203908
confint(fit.LN, level = 0.95)
## 2.5 % 97.5 %
## (Intercept):1 7.7153457 7.8930978
## (Intercept):2 0.4830429 0.5577387
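Because the second linear predictor is loglink(sdlog), the fitted sdlog and its confidence limits are obtained by exponentiating; exp() is monotone, so the interval endpoints transform directly (a convenience step, not in the original code):
# Back-transform the second intercept and its confidence interval
exp(coef(fit.LN)[2])                     # fitted sdlog
exp(confint(fit.LN, level = 0.95)[2, ])  # 95% CI for sdlog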
We can also extract the log-likelihood of our model:
logLik(fit.LN)
## [1] -13416.87
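With two estimated parameters (meanlog and sdlog), the log-likelihood also gives us an information criterion for later comparison; the hand computation below is a sketch (VGAM also provides an AIC() method for vglm objects):
# AIC = -2 * log-likelihood + 2 * number of estimated parameters
-2 * logLik(fit.LN) + 2 * 2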
Now, assuming a Gamma distribution:
fit.gamma <- vglm(Claim ~ 1, family = gamma2, data = ClaimData)
summary(fit.gamma)
##
## Call:
## vglm(formula = Claim ~ 1, family = gamma2, data = ClaimData)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept):1 10.18952 0.04999 203.82 <2e-16 ***
## (Intercept):2 -1.23582 0.03001 -41.17 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Names of linear predictors: loglink(mu), loglink(shape)
##
## Log-likelihood: -14150.59 on 2752 degrees of freedom
##
## Number of Fisher scoring iterations: 13
##
## No Hauck-Donner effect found in any of the estimates
coef(fit.gamma)
## (Intercept):1 (Intercept):2
## 10.189515 -1.235822
The model reports both intercepts on the log scale, so \(\mu\) and \(\alpha\) are recovered by exponentiating. Since the gamma mean is \(\mu = \alpha\theta\), the scale parameter is \(\theta = \mu / \alpha\):
theta <- exp(coef(fit.gamma)[1]) / exp(coef(fit.gamma)[2])
alpha <- exp(coef(fit.gamma)[2])
theta
## (Intercept):1
## 91613.78
alpha
## (Intercept):2
## 0.2905959
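As a sanity check (not in the original code), the fitted gamma mean \(\alpha\theta\) equals the exponentiated first intercept and, for an intercept-only model with a log link, should essentially reproduce the sample mean of the claims:
# The implied gamma mean alpha * theta versus the sample mean
alpha * theta
exp(coef(fit.gamma)[1])
mean(ClaimData$Claim)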
And now we can compare the fitted distribution to the observed values:
plot(density(log(ClaimData$Claim)), main = "", xlab = "Log Claims")
x <- seq(0, 15, by = 0.01)
fgamma_ex <- dgamma(exp(x), shape = alpha, scale = theta) * exp(x)
lines(x, fgamma_ex, col = "blue")
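Comparing a few empirical quantiles with the fitted gamma quantiles gives a numerical complement to the plot (an extra check, not part of the original code):
# Empirical versus fitted gamma quantiles at a few probability levels
probs <- c(0.25, 0.50, 0.75, 0.90)
rbind(empirical = quantile(ClaimData$Claim, probs),
      gamma = qgamma(probs, shape = alpha, scale = theta))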
We can also use the glm approach, together with gamma.dispersion() from the MASS package to estimate the dispersion:
library(MASS)
fit.gamma_2 <- glm(Claim ~ 1, data = ClaimData, family = Gamma(link = log))
summary(fit.gamma_2, dispersion = gamma.dispersion(fit.gamma_2))
##
## Call:
## glm(formula = Claim ~ 1, family = Gamma(link = log), data = ClaimData)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -4.287 -2.258 -1.764 -1.178 30.926
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 10.18952 0.04999 203.8 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for Gamma family taken to be 3.441204)
##
## Null deviance: 6569.1 on 1376 degrees of freedom
## Residual deviance: 6569.1 on 1376 degrees of freedom
## AIC: 28414
##
## Number of Fisher Scoring iterations: 14
Again, the parameters can be recovered, now through the dispersion \(\phi\): \(\alpha = 1/\phi\) and \(\theta = \mu\,\phi\):
theta <- exp(coef(fit.gamma_2)) * gamma.dispersion(fit.gamma_2)
alpha <- 1 / gamma.dispersion(fit.gamma_2)
theta
## (Intercept)
## 91613.78
alpha
## [1] 0.2905959
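Both routes are maximum likelihood fits, so the shape parameter from the VGAM fit and the one implied by gamma.dispersion() coincide; the comparison below is a quick check, not part of the original code:
# The two estimates of the gamma shape parameter should match
c(vglm_shape = unname(exp(coef(fit.gamma)[2])),
  glm_shape = 1 / gamma.dispersion(fit.gamma_2))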
Now, we use a Pareto distribution:
fit.pareto <- vglm(Claim ~ 1, paretoII, loc = 0, data = ClaimData)
summary(fit.pareto)
##
## Call:
## vglm(formula = Claim ~ 1, family = paretoII, data = ClaimData,
## loc = 0)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept):1 7.7329210 0.0933332 82.853 <2e-16 ***
## (Intercept):2 -0.0008753 0.0538642 -0.016 0.987
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Names of linear predictors: loglink(scale), loglink(shape)
##
## Log-likelihood: -13404.64 on 2752 degrees of freedom
##
## Number of Fisher scoring iterations: 5
##
## No Hauck-Donner effect found in any of the estimates
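Both Pareto linear predictors use a log link, so the scale and shape on their natural scales are recovered by exponentiating, just as for the gamma fit (the object names below are only illustrative):
# Recover the Pareto (type II) scale and shape from the log-link intercepts
scale.pareto <- exp(coef(fit.pareto)[1])
shape.pareto <- exp(coef(fit.pareto)[2])
c(scale = unname(scale.pareto), shape = unname(shape.pareto))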
And an exponential distribution:
fit.exp <- vglm(Claim ~ 1, exponential, data = ClaimData)
## Warning in vglm.fitter(x = x, y = y, w = w, offset = offset, Xm2 = Xm2, :
## iterations terminated because half-step sizes are very small
## Warning in vglm.fitter(x = x, y = y, w = w, offset = offset, Xm2 = Xm2, : some
## quantities such as z, residuals, SEs may be inaccurate due to convergence at a
## half-step
summary(fit.exp)
##
## Call:
## vglm(formula = Claim ~ 1, family = exponential, data = ClaimData)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -10.18952 0.02695 -378.1 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Name of linear predictor: loglink(rate)
##
## Residual deviance: 6569.099 on 1376 degrees of freedom
##
## Log-likelihood: -15407.96 on 1376 degrees of freedom
##
## Number of Fisher scoring iterations: 6
##
## No Hauck-Donner effect found in any of the estimates
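The single linear predictor is loglink(rate), so the fitted rate is the exponentiated intercept; because the exponential MLE of the mean is the sample mean, 1/rate should essentially equal the average claim (a quick check, not in the original code):
# Implied exponential rate and mean versus the sample mean
rate.exp <- exp(coef(fit.exp))
c(rate = unname(rate.exp), mean = 1 / unname(rate.exp),
  sample_mean = mean(ClaimData$Claim))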
And now, we compare all the models:
# None of these distributions is doing a great job....
plot(density(log(ClaimData$Claim)), main = "", xlab = "Log Claims",
     ylim = c(0, 0.37))
x <- seq(0, 15, by = 0.01)
fexp_ex <- dgamma(exp(x), scale = exp(-coef(fit.exp)), shape = 1) * exp(x)
lines(x, fexp_ex, col = "red")
fgamma_ex <- dgamma(exp(x), shape = alpha, scale = theta) * exp(x)
lines(x, fgamma_ex, col = "blue")
fpareto_ex <- dparetoII(exp(x), loc = 0, shape = exp(coef(fit.pareto)[2]),
scale = exp(coef(fit.pareto)[1])) * exp(x)
lines(x, fpareto_ex, col = "purple")
flnorm_ex <- dlnorm(exp(x), meanlog = coef(fit.LN)[1],
                    sdlog = exp(coef(fit.LN)[2])) * exp(x)
lines(x, flnorm_ex, col = "lightblue")
legend("topleft", c("log(ClaimData$Claim)", "Exponential", "Gamma", "Pareto",
"lognormal"),
lty = 1, col = c("black","red","blue","purple","lightblue"))
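A compact way to wrap up the comparison is to line up the log-likelihoods and AIC values of the four fits, with two estimated parameters each for the lognormal, gamma and Pareto models and one for the exponential (a summary sketch, not part of the original code):
# Log-likelihoods and AIC for the four fitted severity models
loglik <- c(lognormal = logLik(fit.LN), gamma = logLik(fit.gamma),
            pareto = logLik(fit.pareto), exponential = logLik(fit.exp))
k <- c(2, 2, 2, 1)
data.frame(logLik = loglik, AIC = -2 * loglik + 2 * k)
Judging from the log-likelihoods reported above, the Pareto fit comes out best, with the lognormal close behind.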