Data Manipulation

Load in the data

data <- readRDS("C:/Users/aidan/Downloads/data.rds")
View(data)
library(fitdistrplus)
## Warning: package 'fitdistrplus' was built under R version 4.4.3
## Loading required package: MASS
## Warning: package 'MASS' was built under R version 4.4.2
## Loading required package: survival
## Warning: package 'survival' was built under R version 4.4.3

Manipulate data to represent Individual Claim Amount, X

# Estimated above reinsurance ($25,000)
EstM <- 0.23*nrow(data[data$clmcost=='0',])

# Total Claims Number
Total_claims <- sum(data$clmcost) + EstM*25000 ; Total_claims
## [1] 54798154
# Average Claim net reinsurance
Avg_claims <- (sum(data$clmcost)+EstM*25000)/(sum(data$numclms) + EstM) ; Avg_claims
## [1] 18261.91
# A is a list of all non-zero claims. X is a list of all claims UNDER retention. 

A<- subset(data[c("numclms","clmcosts")], numclms>0)
X <- c()

for (row in 1:nrow(A)) {
  if (A[row,1]==1) {
    X <- append(X, A[row,2])
  } else {
    for (claimno in 1:A[row,1]) {
      X <- append(X,A[row,2]/A[row,1])
    }
  }
}
hist(X, main = "Non-Zero claims without retention")

Section A: Question 1 Severity Model’s

Log-normal Model Fit:

(Log-normal model fitted under MLE, MME, MSE)

library(fitdistrplus)

min(X)
## [1] 10
max(X)
## [1] 7355
# Log normal Distribution: 

fit_ln  <- fitdist(X, "lnorm", method = "mme")  # Moment Matching Estimation 
fit_ln2  <- fitdist(X, "lnorm", method = "mle") # Method Maxium Likelihood
fit_ln3  <- fitdist(X, "lnorm", method = "mse") # Maxium goodness of fit 

gofstat(list(fit_ln, fit_ln2, fit_ln3), fitnames = c("MME","MLE", "MSE"))
## Goodness-of-fit statistics
##                                      MME        MLE        MSE
## Kolmogorov-Smirnov statistic   0.1933909 0.07359625  0.1434873
## Cramer-von Mises statistic     8.3596999 1.39874800  5.7000458
## Anderson-Darling statistic   132.7899165 9.43582444 61.0036246
## 
## Goodness-of-fit criteria
##                                     MME      MLE      MSE
## Akaike's Information Criterion 13345.70 12691.55 12873.75
## Bayesian Information Criterion 13355.15 12701.01 12883.20
par(mfrow = c(3,3))
plot(fit_ln)

plot(fit_ln2)

plot(fit_ln3)

# on a level of AIC, BIC and other, MLE dominates the other methods
gofstat(fit_ln2)
## Goodness-of-fit statistics
##                              1-mle-lnorm
## Kolmogorov-Smirnov statistic  0.07359625
## Cramer-von Mises statistic    1.39874800
## Anderson-Darling statistic    9.43582444
## 
## Goodness-of-fit criteria
##                                1-mle-lnorm
## Akaike's Information Criterion    12691.55
## Bayesian Information Criterion    12701.01
gof_ln <- gofstat(fit_ln2)

# Code written for quick change between models 
model <- fit_ln2
set.seed(1234)
sim1<- rlnorm(length(X), meanlog = model$estimate[1], sdlog = model$estimate[2])
par(mfrow=c(2,2))

# choose a common upper limit
xlim <- c(0, 8000)

par(mfrow = c(1,2), mar=c(4,4,2,1))
hist(X,    xlim = xlim, xaxs = "i", main="Histogram of X", breaks = 50)
hist(sim1, xlim = xlim, xaxs = "i", main="Histogram of sim1", breaks = 50)

par(mfrow=c(1,1))
xmax <- max(c(X, sim1))          
xlim <- c(0, xmax)
hist(sim1, xlim = xlim, xaxs = "i", main="Histogram of sim1 non scaled x axis")

# Modelled and Simulation quantiles compared to observed quantiles 
data.frame( Modelled = round(c(qlnorm(0, meanlog = model$estimate[1], sdlog = model$estimate[2]), qlnorm(0.25, meanlog = model$estimate[1], sdlog = model$estimate[2] ), qlnorm(0.5,meanlog = model$estimate[1], sdlog = model$estimate[2]), qlnorm(0.75, meanlog = model$estimate[1], sdlog = model$estimate[2]), qlnorm(1, meanlog = model$estimate[1], sdlog = model$estimate[2])),1), Simulation = round((quantile(sim1)),1), Observed = quantile(X))
summary(model)
## Fitting of the distribution ' lnorm ' by maximum likelihood 
## Parameters : 
##         estimate Std. Error
## meanlog 5.697840 0.05595784
## sdlog   1.616978 0.03956810
## Loglikelihood:  -6343.777   AIC:  12691.55   BIC:  12701.01 
## Correlation matrix:
##         meanlog sdlog
## meanlog       1     0
## sdlog         0     1
# fitted distribution mean
exp(model$estimate[1]+0.5*model$estimate[2]^2)
##  meanlog 
## 1102.294
# mean of simulation
data.frame(statistic = c("mean", "standard deviation"), lognormal_Sim = c(round(mean(sim1),1), round(sd(sim1),1)), Observed =  c(round(mean(X),1), round(sd(X),1)))

Exponential Fitting under only MLE:

# Fitting of exponential distribution

x <- as.numeric(X)
x <- x[is.finite(x) & x >= 0]
x_pos <- x[x > 0]

fit_exp <- fitdist(
  x_pos, "exp",
  start = list(rate = 1/mean(x_pos)),
  lower = 1e-12,                  # keep rate > 0
  method = "mle",
  optim.method = "Nelder-Mead"    # more stable for 1-param cases
)
summary(fit_exp)
## Fitting of the distribution ' exp ' by maximum likelihood 
## Parameters : 
##         estimate
## rate 0.001272567
## Loglikelihood:  -6401.71   AIC:  12805.42   BIC:  12810.15
gofstat(fit_exp)
## Goodness-of-fit statistics
##                               1-mle-exp
## Kolmogorov-Smirnov statistic  0.1489007
## Cramer-von Mises statistic    6.3393067
## Anderson-Darling statistic   46.8470937
## 
## Goodness-of-fit criteria
##                                1-mle-exp
## Akaike's Information Criterion  12805.42
## Bayesian Information Criterion  12810.15
gof_exp <- gofstat(fit_exp)
coef(fit_exp)
##        rate 
## 0.001272567
plot(fit_exp)

# Code written for quick change between models 
model <- fit_exp
set.seed(1234)

sim1<- rexp(length(X), rate = fit_exp$estimate)
par(mfrow=c(2,2))

# choose a common upper limit
xlim <- c(0, 8000)

par(mfrow = c(1,2), mar=c(4,4,2,1))
hist(X,    xlim = xlim, xaxs = "i", main="Histogram of X", breaks = 50)
hist(sim1, xlim = xlim, xaxs = "i", main="Histogram of sim1", breaks = 50)

par(mfrow=c(1,1))
xmax <- max(c(X, sim1))          
xlim <- c(0, xmax)
hist(sim1, xlim = xlim, xaxs = "i", main="Histogram of sim1 non scaled x axis")

# Simulation quantiles compared to observed quantiles 
data.frame( Modelled = round(c(qexp(0, rate = fit_exp$estimate), qexp(0.25, rate = fit_exp$estimate ), qexp(0.5, rate = fit_exp$estimate), qexp(0.75, rate = fit_exp$estimate), qexp(1, rate = fit_exp$estimate)),1), Simulation = round((quantile(sim1)),1), Observed = quantile(X))
summary(model)
## Fitting of the distribution ' exp ' by maximum likelihood 
## Parameters : 
##         estimate
## rate 0.001272567
## Loglikelihood:  -6401.71   AIC:  12805.42   BIC:  12810.15
# pure mean under fitted model
1/fit_exp$estimate
##     rate 
## 785.8132
# observed mean 
mean(X)
## [1] 785.8132
# mean of simulation
data.frame(statistic = c("mean", "standard deviation"), Exponetial_sim = c(round(mean(sim1),1), round(sd(sim1),1)), Observed =  c(round(mean(X),1), round(sd(X),1)))

Gamma Fitting through MME:

### Gamma ### 

fit_gam <- fitdist(X, "gamma", method = "mme")

gofstat(fit_gam)
## Goodness-of-fit statistics
##                              1-mme-gamma
## Kolmogorov-Smirnov statistic  0.07455777
## Cramer-von Mises statistic    0.21349555
## Anderson-Darling statistic    3.47019630
## 
## Goodness-of-fit criteria
##                                1-mme-gamma
## Akaike's Information Criterion    12679.84
## Bayesian Information Criterion    12689.29
gof_gamma <- gofstat(fit_gam)

summary(fit_gam)
## Fitting of the distribution ' gamma ' by matching moments 
## Parameters : 
##           estimate   Std. Error
## shape 0.5456635627 4.494607e-02
## rate  0.0006943935 6.580093e-05
## Loglikelihood:  -6337.918   AIC:  12679.84   BIC:  12689.29 
## Correlation matrix:
##           shape      rate
## shape 1.0000000 0.8692414
## rate  0.8692414 1.0000000
plot(fit_gam)

# Code written for quick change between models 
model <- fit_gam
set.seed(1234)
fit_gam$estimate
##        shape         rate 
## 0.5456635627 0.0006943935
sim1<- rgamma(length(X), shape = fit_gam$estimate[1], rate = fit_gam$estimate[2])

# Graph
par(mfrow=c(2,2))

# choose a common upper limit
xlim <- c(0, 8000)

par(mfrow = c(1,2), mar=c(4,4,2,1))
hist(X,    xlim = xlim, xaxs = "i", main="Histogram of X", breaks = 50)
hist(sim1, xlim = xlim, xaxs = "i", main="Histogram of sim1", breaks = 50)

par(mfrow=c(1,1))
xmax <- max(c(X, sim1))         
xlim <- c(0, xmax)
hist(sim1, xlim = xlim, xaxs = "i", main="Histogram of sim1 non scaled x axis")

# Simulation quantiles compared to observed quantiles 
data.frame(
          Modelled = round(c(qgamma(0, shape = fit_gam$estimate[1], rate = fit_gam$estimate[2]), qgamma(0.25, shape = fit_gam$estimate[1], rate = fit_gam$estimate[2]), qgamma(0.5, shape = fit_gam$estimate[1], rate = fit_gam$estimate[2]), qgamma(0.75,shape = fit_gam$estimate[1], rate = fit_gam$estimate[2]), qgamma(1, shape = fit_gam$estimate[1], rate = fit_gam$estimate[2])),1), 
           
           Simulation = round((quantile(sim1)),1), 
           
           Observed = quantile(X)
          )
summary(model)
## Fitting of the distribution ' gamma ' by matching moments 
## Parameters : 
##           estimate   Std. Error
## shape 0.5456635627 4.494607e-02
## rate  0.0006943935 6.580093e-05
## Loglikelihood:  -6337.918   AIC:  12679.84   BIC:  12689.29 
## Correlation matrix:
##           shape      rate
## shape 1.0000000 0.8692414
## rate  0.8692414 1.0000000
# fitted distribution mean
mean(sim1)
## [1] 784.6038
# Mean and variance of simulation compared to Observed
data.frame(statistic = c("mean", "standard deviation"), Gamma_Sim = c(round(mean(sim1),1), round(sd(sim1),1)), Observed =  c(round(mean(X),1), round(sd(X),1)))

Weibull Distribution through MLE:

fit_wei <- fitdist(X, "weibull", method = "mle")
gofstat(fit_wei)
## Goodness-of-fit statistics
##                              1-mle-weibull
## Kolmogorov-Smirnov statistic    0.04605632
## Cramer-von Mises statistic      0.23584832
## Anderson-Darling statistic      2.50396357
## 
## Goodness-of-fit criteria
##                                1-mle-weibull
## Akaike's Information Criterion      12652.32
## Bayesian Information Criterion      12661.77
gof_wei <- gofstat(fit_wei)
plot(fit_wei)

model <- fit_wei
set.seed(1234)

fit_wei$estimate
##       shape       scale 
##   0.7330834 644.9072807
sim1<- rweibull(length(X), shape = fit_wei$estimate[1], scale = fit_wei$estimate[2])

# choose a common upper limit
xlim <- c(0, 8000)
par(mfrow = c(1,2), mar=c(4,4,2,1))
hist(X,    xlim = xlim, xaxs = "i", main="Histogram of X", breaks = 50)
hist(sim1, xlim = xlim, xaxs = "i", main="Histogram of sim1", breaks = 50)

par(mfrow=c(1,1))
xmax <- max(c(X, sim1))         
xlim <- c(0, xmax)
hist(sim1, xlim = xlim, xaxs = "i", main="Histogram of sim1 non scaled x axis")

# Modelled, Simulation quantiles compared to observed quantiles 
data.frame(
  Modelled = round(c(qweibull(0, shape = fit_wei$estimate[1], scale = fit_wei$estimate[2]), qweibull(0.25, shape = fit_wei$estimate[1], scale = fit_wei$estimate[2] ), qweibull(0.5, shape = fit_wei$estimate[1], scale = fit_wei$estimate[2]), qweibull(0.75, shape = fit_wei$estimate[1], scale = fit_wei$estimate[2]), qweibull(1, shape = fit_wei$estimate[1], scale = fit_wei$estimate[2])),1), 
            
            Simulation = round((quantile(sim1)),1), 
            
            Observed = quantile(X)
            )
summary(model)
## Fitting of the distribution ' weibull ' by maximum likelihood 
## Parameters : 
##          estimate  Std. Error
## shape   0.7330834  0.01975975
## scale 644.9072807 32.13525400
## Loglikelihood:  -6324.159   AIC:  12652.32   BIC:  12661.77 
## Correlation matrix:
##           shape     scale
## shape 1.0000000 0.3205504
## scale 0.3205504 1.0000000
# fitted distribution mean
mean(sim1)
## [1] 771.2906
# Mean and variance of simulation compared to Observed
data.frame(statistic = c("mean", "standard deviation"), Weibull_sim = c(round(mean(sim1),1), round(sd(sim1),1)), Observed =  c(round(mean(X),1), round(sd(X),1)))

Goodness-of-fit Summary table for 4 models:

library(caret)
## Warning: package 'caret' was built under R version 4.4.2
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.4.3
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:survival':
## 
##     cluster
set.seed(1234)
data.frame(Goodness_of_fit = c("AIC", "log-likelihood", "RMSE", "AD - Test", "KS - Test") , 
           
           Lognormal = c(round(fit_ln2$aic,1), round(fit_ln2$loglik,1), round(RMSE(rlnorm(length(X), meanlog = fit_ln2$estimate[1], sdlog = fit_ln2$estimate[2]), X),1), gof_ln$adtest, gof_ln$kstest), 
           
           Exponetial = c(round(fit_exp$aic,1), round(fit_exp$loglik,1), round(RMSE(rexp(length(X), rate = fit_exp$estimate),X),1), gof_exp$adtest, gof_exp$kstest), 
           
           Gamma = c(round(fit_gam$aic,1), round(fit_gam$loglik,1), round(RMSE(rgamma(length(X), shape = fit_gam$estimate[1], rate = fit_gam$estimate[2]),X),1), gof_gamma$adtest, gof_gamma$kstest),
           
           Weibull = c(round(fit_wei$aic,1), round(fit_wei$loglik,1), round(RMSE(rweibull(length(X), shape = fit_wei$estimate[1], scale = fit_wei$estimate[2]),X),1), gof_wei$adtest, gof_wei$kstest)
           )

Section B: Question 1 - Frequency Model

# Assessing GLM fit for Poisson, try incoporate covariates in rate estimation
B<- subset(data, numclms>0)
B$status <- factor(B$status)
B$type <- factor(B$type)
B$licyrs <- as.integer(B$licyrs)
B$fuel <- factor(B$fuel)

# poisson family cuz >1 claims possible (can't be binomial)
B.glm <- glm(numclms ~ status + drvage + ncd + licyrs + vehage + type + fuel + exposure, data = B, family = "poisson")

summary(B.glm)
## 
## Call:
## glm(formula = numclms ~ status + drvage + ncd + licyrs + vehage + 
##     type + fuel + exposure, family = "poisson", data = B)
## 
## Coefficients:
##                 Estimate Std. Error z value Pr(>|z|)
## (Intercept)    0.0678769  0.2927988   0.232    0.817
## statusMarried -0.0636297  0.2341879  -0.272    0.786
## statusSingle  -0.1107747  0.2885774  -0.384    0.701
## statusWidow    0.0412625  0.3501313   0.118    0.906
## drvage        -0.0005002  0.0035515  -0.141    0.888
## ncd           -0.0126230  0.0249743  -0.505    0.613
## licyrs         0.0029012  0.0259188   0.112    0.911
## vehage        -0.0006595  0.0155972  -0.042    0.966
## typeB         -0.0647488  0.3602613  -0.180    0.857
## typeC          0.0618769  0.1593522   0.388    0.698
## typeD          0.0254617  0.0936180   0.272    0.786
## typeE          0.0028157  0.0930145   0.030    0.976
## typeF         -0.0346429  0.4538569  -0.076    0.939
## typeG          0.1572354  0.1833972   0.857    0.391
## typeH          0.1097014  0.3894203   0.282    0.778
## fuelLPG        0.6512332  0.7140966   0.912    0.362
## fuelPetrol     0.0706388  0.2662102   0.265    0.791
## exposure       0.1248079  0.1512130   0.825    0.409
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 34.344  on 789  degrees of freedom
## Residual deviance: 31.279  on 772  degrees of freedom
## AIC: 1674.4
## 
## Number of Fisher Scoring iterations: 4
# under MLE for Poisson
var(data$numclms)
## [1] 0.08433952
mean(data$numclms)
## [1] 0.08181462
# Similar mean and variance so we use Poisson MLE
RateEst <- sum(data$numclms)/sum(data$exposure)

Data Manipulation for Question 2 and 3:

Add 25,000 Retention:

# Following on from original code with the addition of the retention level cap: 
numRet <- round(EstM, digits=0)
X<-append(X,rep(25000,times=numRet))
hist(X,main = "Non-Zero claims with retention", breaks = 30)

# new lambda estimate: 
lambda <- length(X)/sum(data$exposure)
lambda 
## [1] 0.5732022

Section C: Question 2 - Industry Exposure Curves

Swiss Re and Lloyd’s

## ---------------------------------------------------------------
## Inputs
## ---------------------------------------------------------------
M <- 25000              # retention / cap
X <- as.numeric(X)      # severities 
lambda <- length(X)/sum(data$exposure) # Under Poisson MLE

## Normalized severities for exposure-curve fit
Z     <- pmin(X, M) / M
Z_obs <- Z[Z < 1]
ec    <- ecdf(Z_obs)
grid  <- seq(0, max(Z_obs), length.out = 200)
Fn_emp <- ec(grid)

## ---------------------------------------------------------------
## Exposure curves
## ---------------------------------------------------------------
F_swiss  <- function(z, alpha)  1 - (1 - pmin(pmax(z,0),1))^alpha   # alpha>0
F_lloyds <- function(z, beta)   pmin(pmax(z,0),1)^beta              # beta>=1

sse_swiss  <- function(par) { alpha <- exp(par);     mean((F_swiss(grid,  alpha) - Fn_emp)^2) }
sse_lloyds <- function(par) { beta  <- 1 + exp(par); mean((F_lloyds(grid, beta)  - Fn_emp)^2) }

set.seed(1)
fit_sw <- optim(par = log(0.3),     fn = sse_swiss,  method = "BFGS")
fit_ll <- optim(par = log(2 - 1.0), fn = sse_lloyds, method = "BFGS")

alpha_hat <- exp(fit_sw$par)
beta_hat  <- 1 + exp(fit_ll$par)

## Limited expected value from each curve: LEV(M) = M * ∫_0^1 [1 - F(z)] dz
area_sw  <- integrate(function(z) 1 - F_swiss(z,  alpha_hat), 0, 1)$value
area_ll  <- integrate(function(z) 1 - F_lloyds(z, beta_hat),  0, 1)$value

LEV_M_swiss  <- M * area_sw
LEV_M_lloyds <- M * area_ll

## ---------------------------------------------------------------
## Exceedance probability at M (observable)
## ---------------------------------------------------------------
S_M <-  length(X[X>=M])/length(X)  # proportion of capped observations among non-zero claims
S_M
## [1] 0.7217594
## ---------------------------------------------------------------
## Pareto tail continuation above M
##  E[X - M | X > M] = M / (alpha_tail - 1),  alpha_tail > 1
## With sensitivity testing to alpha levels
## ---------------------------------------------------------------

alpha_tail_vec <- c(1.1, 1.2, 1.5, 1.7, 2.0, 2.2, 2.5, 3)   # sensitivity set, smaller the value of alpha the heavier the tail, i.e. more prudent (Changes Pareto distribition, that is the distribution of claims above M)

EX_ceded_per_claim <- function(alpha_tail) {
  if (alpha_tail <= 1) return(Inf)
  S_M * (M / (alpha_tail - 1))
}

EX_ceded_list <- sapply(alpha_tail_vec, EX_ceded_per_claim)
names(EX_ceded_list) <- paste0("alpha_", alpha_tail_vec)

## Combine with retained LEV from exposure curves
LEV_ret <- c(Swiss = LEV_M_swiss, Lloyds = LEV_M_lloyds)

## Ground-up mean severity for each curve & tail alpha:
make_summary <- function(LEV_retained) {
  EX_ground <- LEV_retained + EX_ceded_list          # per claim
  cbind(
    LEV_retained = LEV_retained,                     
    EX_ceded     = EX_ceded_list,
    EX_ground    = EX_ground,
    PP_retained  = lambda * LEV_retained,
    PP_ceded     = lambda * EX_ceded_list,
    PP_ground    = lambda * EX_ground
  )
}

summary_swiss  <- make_summary(LEV_M_swiss)
summary_lloyds <- make_summary(LEV_M_lloyds)

summary_swiss
##           LEV_retained   EX_ceded  EX_ground PP_retained   PP_ceded  PP_ground
## alpha_1.1     110.0596 180439.853 180549.913    63.08642 103428.517 103491.603
## alpha_1.2     110.0596  90219.927  90329.986    63.08642  51714.258  51777.345
## alpha_1.5     110.0596  36087.971  36198.030    63.08642  20685.703  20748.790
## alpha_1.7     110.0596  25777.122  25887.182    63.08642  14775.502  14838.589
## alpha_2       110.0596  18043.985  18154.045    63.08642  10342.852  10405.938
## alpha_2.2     110.0596  15036.654  15146.714    63.08642   8619.043   8682.129
## alpha_2.5     110.0596  12029.324  12139.383    63.08642   6895.234   6958.321
## alpha_3       110.0596   9021.993   9132.052    63.08642   5171.426   5234.512
summary_lloyds
##           LEV_retained   EX_ceded EX_ground PP_retained   PP_ceded PP_ground
## alpha_1.1        12500 180439.853 192939.85    7165.027 103428.517 110593.54
## alpha_1.2        12500  90219.927 102719.93    7165.027  51714.258  58879.29
## alpha_1.5        12500  36087.971  48587.97    7165.027  20685.703  27850.73
## alpha_1.7        12500  25777.122  38277.12    7165.027  14775.502  21940.53
## alpha_2          12500  18043.985  30543.99    7165.027  10342.852  17507.88
## alpha_2.2        12500  15036.654  27536.65    7165.027   8619.043  15784.07
## alpha_2.5        12500  12029.324  24529.32    7165.027   6895.234  14060.26
## alpha_3          12500   9021.993  21521.99    7165.027   5171.426  12336.45
## ---------------------------------------------------------------
## Simple plots 
## ---------------------------------------------------------------
par(mfrow=c(1,2))
plot(ec, main = "ECDF of Z and fitted curves", xlab = "z = X/M", ylab = "F(z)")
curve(F_swiss(x,  alpha_hat), add = TRUE, col = 2, lwd = 2)
curve(F_lloyds(x, beta_hat),  add = TRUE, col = 4, lwd = 2)
legend("bottomright", c("Empirical", "Swiss (fit)", "Lloyd's (fit)"),
       lty = 1, col = c(1,2,4), bty = "n")

curve(1 - F_swiss(x,  alpha_hat), 0, 1, n = 200, col = 2, lwd = 2,
      main = "1 - F(z) (area × M = LEV(M))", xlab = "z", ylab = "1 - F(z)")
curve(1 - F_lloyds(x, beta_hat), add = TRUE, col = 4, lwd = 2)
legend("topright", c("Swiss", "Lloyd's"), col = c(2,4), lty = 1, bty = "n")

make_summary <- function(lev) {
  rn <- names(EX_ceded_list)                         
  df <- data.frame(
    alpha = alpha_tail_vec,
    LEV_retained = lev,
    EX_ceded     = EX_ceded_list,
    EX_ground    = lev + EX_ceded_list,
    PP_retained  = lambda * lev,
    PP_ceded     = lambda * EX_ceded_list,
    PP_ground    = lambda * (lev + EX_ceded_list)
  )
  rownames(df) <- rn
  df
}

summary_swiss  <- make_summary(LEV_M_swiss)
summary_lloyds <- make_summary(LEV_M_lloyds)

round(summary_swiss, 2)
round(summary_lloyds, 2)

Section D: Question 2 - Spliced Model

Poisson, Weibull, Pareto Model.

## 1) Tail weight among non-zero claims
S_M <- length(X[X >= M & X > 0]) / length(X[X > 0])
lambda <- length(X)/sum(data$exposure)
lambda
## [1] 0.5732022
## 2) Pareto EX above M
alpha_tail_vec <- c(1.1, 1.2, 1.5, 1.7, 2.0, 2.2, 2.5, 3)
EX_ceded_per_claim <- function(alpha_tail) {
  if (alpha_tail <= 1) return(Inf)
  S_M * (M / (alpha_tail - 1))
}
EX_ceded_list <- sapply(alpha_tail_vec, EX_ceded_per_claim)
names(EX_ceded_list) <- paste0("alpha_", alpha_tail_vec)

## 3) Summary builder 
make_summary <- function(ex_retained) {
  data.frame(
    alpha       = alpha_tail_vec,
    EX_retained = ex_retained,              
    EX_ceded    = EX_ceded_list,             
    EX_ground   = ex_retained + EX_ceded_list,   
    PP_retained = lambda * ex_retained,
    PP_ceded    = lambda * EX_ceded_list,
    PP_ground   = lambda * (ex_retained + EX_ceded_list)
  )
}

##  WEIBULL (truncated at M) for X<=M
k  <- unname(fit_wei$estimate[1])   # shape
th <- unname(fit_wei$estimate[2])   # scale

# FULL-distribution LEV up to u for Weibull Dist.  
ex_weibull_full <- function(u, shape, scale) {
  t <- (u/scale)^shape
  scale * gamma(1 + 1/shape) * pgamma(t, shape = 1 + 1/shape, rate = 1) +
    u * exp(-t)
}

# Body CDF at M
F_M_weib <- pweibull(M, shape = k, scale = th)

# Retained LEV at u = M under the splice 
# (1 - S_M) * [ FULL body LEV at M / F_b(M) ] + M * S_M
EX_M_weibull_retained <- (1 - S_M) * (ex_weibull_full(M, k, th) / F_M_weib) + M * S_M

summary_weibull <- make_summary(EX_M_weibull_retained)
round(summary_weibull, 1)
## GAMMA (truncated at M) for X<=M
shape <- unname(fit_gam$estimate[1])
rate  <- unname(fit_gam$estimate[2])  

# FULL-distribution LEV up to M for Gamma Dist.  
EX_gamma_full <- function(u, shape, rate) {
  term1 <- pgamma(rate * u, shape = shape + 1) / rate   
  term2 <- u * (1 - pgamma(rate * u, shape = shape))
  term1 + term2
}

F_M_gamma <- pgamma(M, shape = shape, rate = rate)

EX_M_gamma_retained <- (1 - S_M) * (EX_gamma_full(M, shape, rate) / F_M_gamma) + M * S_M

summary_gamma <- make_summary(EX_M_gamma_retained)
round(summary_gamma, 1)

Plot of Pareto at different alpha’s

# Elbow Graph 
plot(alpha_tail_vec, EX_ceded_list, type="b", pch=16, lwd=2,
     xlab="Tail alpha", ylab="E[(X-M)^+]", main="Ceded Expectation vs Tail α")
abline(v=1.5, lty=3, col=8); text(1.5, max(EX_ceded_list), "α=1.5", pos=4, col=8)

CDF and PDF Plot under alpha = 1.5 (from elbow plot)

S_M <- mean(X >= M & X > 0)                 # tail weight among non-zero claims

# WEIBULL body fit on X < M
k  <- unname(fit_wei$estimate[1])
th <- unname(fit_wei$estimate[2]) 


# Gamme distribution has being hashed out in all cases below
# a <- unname(fit_gam$estimate[1]); 
#b <- unname(fit_gam$estimate[2])

## ---- Body CDF/PDF helpers ----
F_b <- function(x) pweibull(x, shape = k, scale = th)
f_b <- function(x) dweibull(x, shape = k, scale = th)
# For Gamma instead:
# F_b <- function(x) pgamma(x, shape = a, rate = b)
# f_b <- function(x) dgamma(x, shape = a, rate = b)

F_M <- F_b(M); stopifnot(F_M > 0 && S_M < 1)
w   <- 1 - S_M

## ---- Choose a Pareto tail shape alpha ----
# Option B: set alpha by judgement/sensitivity
alpha <- 1.5  

stopifnot(alpha > 1)

# Truncated body parts:
F_M <- F_b(M); w <- 1 - S_M
f_b_trunc <- function(x) f_b(x) / F_M
F_b_trunc <- function(x) F_b(x) / F_M
S_b_trunc <- function(x) 1 - F_b_trunc(x)

# Spliced survival / pdf:
S_total <- function(x) ifelse(x < M, w * S_b_trunc(x) + S_M, S_M * (M/x)^alpha)
f_total <- function(x) ifelse(x < M, w * f_b_trunc(x), S_M * alpha * M^alpha / x^(alpha+1))


##  Spliced PDF/CDF/Survival
# Truncated body pieces on [0, M)
f_b_trunc <- function(x) f_b(x) / F_M
F_b_trunc <- function(x) F_b(x) / F_M
S_b_trunc <- function(x) 1 - F_b_trunc(x)

# Survival (global)
S_total <- function(x) ifelse(
  x < M,
  w * S_b_trunc(x) + S_M,                            # below M
  S_M * (M / x)^alpha                                # tail
)

# CDF (global)
F_total <- function(x) 1 - S_total(x)

# PDF (global)
f_total <- function(x) ifelse(
  x < M,
  w * f_b_trunc(x),                                  # below M
  S_M * alpha * M^alpha / (x^(alpha + 1))            # tail density
)

##  Plot grids 
xs <- sort(unique(c(
  seq(0, M, length.out = 800),
  seq(M, max(M*5, quantile(X[X>0], 0.999, na.rm=TRUE)), length.out = 800)
)))

# Base R plots 
par(mfrow = c(1, 2))
# 1) PDF
plot(xs, f_total(xs), type="l", lwd=2, xlab="x", ylab="pdf f(x)",
     main="Spliced PDF")
abline(v=M, lty=3)

# 2) CDF
plot(xs, F_total(xs), type="l", lwd=2, xlab="x", ylab="CDF F(x)",
     main="Spliced CDF")
abline(v=M, lty=3)

CDF and PDF plot for all possible alpha values (Tail relationships)

# Vector of Pareto tail parameters
alpha_tail_vec <- c(1.1, 1.2, 1.5, 1.7, 2.0, 2.2, 2.5, 3)
# Common x-grid
xs <- sort(unique(c(
  seq(0, M, length.out = 800),
  seq(M, max(M*5, quantile(X[X>0], 0.999, na.rm=TRUE)), length.out = 800)
)))

# Set up layout
par(mfrow = c(1, 2), mar = c(5, 5, 2, 1))

cols <- rainbow(length(alpha_tail_vec))

# Expand x-axis by 10 %
xlim_expanded <- range(xs) * c(0.9, 1.1)

#  PDF PLOT
plot(NA, NA,
     xlim = xlim_expanded,
     ylim = c(0, 0.0002),
     xlab = "x", ylab = "PDF f(x)",
     main = "Spliced PDF (stretched x-axis)")
abline(v = M, lty = 3)

for (i in seq_along(alpha_tail_vec)) {
  alpha <- alpha_tail_vec[i]
  f_total <- function(x) ifelse(
    x < M,
    (1 - S_M) * (dweibull(x, k, th) / pweibull(M, k, th)),
    S_M * alpha * M^alpha / (x^(alpha + 1))
  )
  lines(xs, f_total(xs), col = cols[i], lwd = 2)
}

legend("topright", legend = paste0("α = ", alpha_tail_vec),
       col = cols, lwd = 2, cex = 0.8, bty = "n")

#  CDF PLOT
plot(NA, NA,
     xlim = xlim_expanded,
     ylim = c(0, 1),
     xlab = "x", ylab = "CDF F(x)",
     main = "Spliced CDF (stretched x-axis)")
abline(v = M, lty = 3)

for (i in seq_along(alpha_tail_vec)) {
  alpha <- alpha_tail_vec[i]
  S_total <- function(x) ifelse(
    x < M,
    (1 - S_M) * (1 - pweibull(x, k, th) / pweibull(M, k, th)) + S_M,
    S_M * (M / x)^alpha
  )
  lines(xs, 1 - S_total(xs), col = cols[i], lwd = 2)
}

legend("bottomright", legend = paste0("α = ", alpha_tail_vec),
       col = cols, lwd = 2, cex = 0.8, bty = "n")

Plot of Simulated Weibull, Modelled Weibull and Observed data, and same with Pareto

## ----------------------------
## Spliced severity simulator
## ----------------------------
make_r_splice <- function(M, S_M, alpha, body = c("weibull","gamma"), pars) {
  stopifnot(S_M >= 0, S_M < 1, alpha > 1)
  body <- match.arg(body)

  if (body == "weibull") {
    k  <- pars$shape; th <- pars$scale
    F_b  <- function(x) pweibull(x, shape=k, scale=th)
    q_b  <- function(p) qweibull(p, shape=k, scale=th)
  } else {
    a <- pars$shape; b <- pars$rate
    F_b  <- function(x) pgamma(x, shape=a, rate=b)
    q_b  <- function(p) qgamma(p, shape=a, rate=b)
  }

  F_M <- F_b(M); stopifnot(F_M > 0)
  w   <- 1 - S_M

  function(n) {
    u_mix <- runif(n)
    x <- numeric(n)

    # Body draws (Weibull truncated at M)
    idx_b <- which(u_mix > S_M)
    if (length(idx_b) > 0) {
      u_body <- runif(length(idx_b), 0, F_M)
      x[idx_b] <- q_b(u_body)
    }

    # Tail draws (Pareto )
    idx_t <- which(u_mix <= S_M)
    if (length(idx_t) > 0) {
      u_tail <- runif(length(idx_t))
      x[idx_t] <- M / (u_tail)^(1/alpha)
    }

    x
  }
}

## ----------------------------
## Our context
## ----------------------------
set.seed(1)
M <- 25000
S_M <- mean(X >= M & X > 0)      
alpha <- 1.5                     # Chosen tail weight
k  <- unname(fit_wei$estimate[1])
th <- unname(fit_wei$estimate[2])

# Simulator
r_splice_weib <- make_r_splice(
  M=M, S_M=S_M, alpha=alpha,
  body="weibull", pars=list(shape=k, scale=th)
)

# Simulate
n <- 1e5
X_sim <- r_splice_weib(n)

## ----------------------------
## Histogram with fitted Weibull overlay
## ----------------------------
par(mfrow = c(1, 2), mar = c(5,5,3,1))

# Histogram of observed data (only body portion)
hist(X[X < M], breaks = 40, freq = FALSE,
     main = "Observed Data + Fitted Weibull", xlab = "Loss Amount")

# Overlay Weibull PDF (fitted to X < M)
xs <- seq(0, M, length.out = 400)
lines(xs, dweibull(xs, shape = k, scale = th), lwd = 2, col = "blue")

# Overlay simulated density for comparison
lines(density(X_sim[X_sim < M]), col = "red", lwd = 2, lty = 2)
legend("topright", c("Weibull fit", "Simulated (body)"), 
       col = c("blue","red"), lwd = 2, lty = c(1,2), cex = 0.8, bty = "n")

## ----------------------------
##  Log–log survival (Pareto tail)
## ----------------------------
xs_tail <- seq(M, quantile(X_sim, 0.999), length.out = 300)
S_emp <- sapply(xs_tail, function(t) mean(X_sim >= t))

plot(xs_tail, S_emp, log = "xy", type = "l", lwd = 2,
     xlab = "x (log scale)", ylab = "S(x) = P(X ≥ x) (log scale)",
     main = "Simulated Pareto Tail (log–log)")
abline(v = M, lty = 3)
lines(xs_tail, S_M * (M / xs_tail)^alpha, col = "blue", lwd = 2, lty = 2)
legend("topright", c("Empirical (simulated)", "Theoretical Pareto"),
       col = c("black", "blue"), lwd = 2, lty = c(1,2), cex = 0.8, bty = "n")

Variance Table and graph for different alpha and different Upper limit caps

## ===================================================================================
##  Variance of CAPPED Spliced Severity Model (Capped at 500,000, shown by blue line)
## ===================================================================================
set.seed(123)

#  Inputs 
M <- 25000                   # splice threshold
Umax <- 500000               # cap level (change to test sensitivity)
X_nonzero <- X[X > 0 & is.finite(X)]
S_M <- length(X_nonzero[X_nonzero >= M]) / length(X_nonzero)   # empirical tail weight

# Weibull body parameters 
k  <- unname(fit_wei$estimate[1])
th <- unname(fit_wei$estimate[2])

# alpha sensitivity vector
alpha_tail_vec <- c(1.1, 1.2, 1.5, 1.7, 2, 2.2, 2.5, 3)

#  Helper generators 
rweibull_trunc_upper <- function(n, shape, scale, M){
  Fu <- pweibull(M, shape=shape, scale=scale)
  u  <- runif(n, 0, Fu)
  qweibull(u, shape=shape, scale=scale)
}
rpareto_unlimited <- function(n, M, alpha){
  M * (runif(n))^(-1/alpha)
}

#  Simulation setup
n <- 1e6
u_mix <- runif(n)
is_tail <- (u_mix < S_M)
nb <- sum(!is_tail); nt <- sum(is_tail)

# Dist component (same for all α) (i.e. Weibull)
X_body <- if (nb > 0) rweibull_trunc_upper(nb, k, th, M) else numeric(0)

# Table initialisation 
results <- data.frame(
  alpha = alpha_tail_vec,
  E_Xcap = NA_real_,
  Var_Xcap = NA_real_,
  SD_Xcap = NA_real_
)

#  Main loop 
for (i in seq_along(alpha_tail_vec)) {
  alpha <- alpha_tail_vec[i]

  # Tail simulation for this α
  X_tail <- if (nt > 0) rpareto_unlimited(nt, M, alpha) else numeric(0)

  # Combine body & tail
  X_sim <- numeric(n)
  if (nb > 0) X_sim[!is_tail] <- X_body
  if (nt > 0) X_sim[is_tail]  <- X_tail

  # Apply cap
  X_cap <- pmin(X_sim, Umax)

  # Stats
  results$E_Xcap[i]  <- mean(X_cap)
  results$Var_Xcap[i] <- var(X_cap)
  results$SD_Xcap[i]  <- sqrt(results$Var_Xcap[i])
}

# Output neatly formatted
print(
  data.frame(
    alpha = results$alpha,
    Mean = format(round(results$E_Xcap, 2), big.mark=","),
    Variance = format(round(results$Var_Xcap, 2), big.mark=","),
    SD = format(round(results$SD_Xcap, 2), big.mark=",")
  ),
  row.names = FALSE
)
##  alpha      Mean       Variance         SD
##    1.1 64,908.53 10,069,449,370 100,346.65
##    1.2 58,951.82  8,238,067,165  90,763.80
##    1.5 46,301.13  4,569,953,885  67,601.43
##    1.7 40,901.97  3,161,783,565  56,229.74
##    2.0 35,480.31  1,924,919,865  43,873.91
##    2.2 32,868.65  1,393,530,662  37,330.02
##    2.5 30,161.10    939,174,869  30,645.96
##    3.0 27,286.24    567,178,599  23,815.51
#  Graphing (one plot per alpha) 
par(mfrow = c(2, 2)) 

for (i in seq_along(alpha_tail_vec)) {
  alpha <- alpha_tail_vec[i]
  X_tail <- if (nt > 0) rpareto_unlimited(nt, M, alpha) else numeric(0)
  X_sim <- numeric(n)
  if (nb > 0) X_sim[!is_tail] <- X_body
  if (nt > 0) X_sim[is_tail]  <- X_tail
  X_cap <- pmin(X_sim, Umax)

  hist(X_cap[X_cap < Umax], breaks=80, freq=FALSE,
       main=sprintf("α = %.1f", alpha),
       xlab="Claim Size", col="lightgrey", border="white")
  abline(v=c(M,Umax), col=c("red","blue"), lty=2, lwd=2)
  legend("topright", legend=c("Splice M","Cap Umax"), col=c("red","blue"), lwd=2, lty=2)
}