R Notebook

data <- readRDS("C:/Users/aidan/Downloads/data.rds")
View(data)
library(fitdistrplus)

## Warning: package 'fitdistrplus' was built under R version 4.4.3

## Loading required package: MASS

## Warning: package 'MASS' was built under R version 4.4.2

## Loading required package: survival

## Warning: package 'survival' was built under R version 4.4.3

#Code for sorting the data by individual Claim amount:

# Estimated above reinsurance ($25,000)
EstM <- 0.23*nrow(data[data$clmcost=='0',])


# Total Claims Number
Total_claims <- sum(data$clmcost) + EstM*25000 ; Total_claims

## [1] 54798154

# Average Claim net reinsurance
Avg_claims <- (sum(data$clmcost)+EstM*25000)/(sum(data$numclms) + EstM) ; Avg_claims

## [1] 18261.91

# A is a list of all non-zero claims. X is a list of all claims UNDER retention. 

A<- subset(data[c("numclms","clmcosts")], numclms>0)
X <- c()

for (row in 1:nrow(A)) {
  if (A[row,1]==1) {
    X <- append(X, A[row,2])
  } else {
    for (claimno in 1:A[row,1]) {
      X <- append(X,A[row,2]/A[row,1])
    }
  }
}
hist(X, main = "Non-Zero claims without retention")

#Question 1 pt. 2 lognormal

library(fitdistrplus)

min(X)

## [1] 10

max(X)

## [1] 7355

# Log normal Distribution: 

fit_ln  <- fitdist(X, "lnorm", method = "mme")  # Moment Matching Estimation 
fit_ln2  <- fitdist(X, "lnorm", method = "mle") # Method Maxium Likelihood
fit_ln3  <- fitdist(X, "lnorm", method = "mse") # Maxium goodness of fit 

gofstat(list(fit_ln, fit_ln2, fit_ln3), fitnames = c("MME","MLE", "MSE"))

## Goodness-of-fit statistics
##                                      MME        MLE        MSE
## Kolmogorov-Smirnov statistic   0.1933909 0.07359625  0.1434873
## Cramer-von Mises statistic     8.3596999 1.39874800  5.7000458
## Anderson-Darling statistic   132.7899165 9.43582444 61.0036246
## 
## Goodness-of-fit criteria
##                                     MME      MLE      MSE
## Akaike's Information Criterion 13345.70 12691.55 12873.75
## Bayesian Information Criterion 13355.15 12701.01 12883.20

par(mfrow = c(3,3))
plot(fit_ln)

plot(fit_ln2)

plot(fit_ln3)

# on a level of AIC, BIC and other, MLE dominates the other methods
gofstat(fit_ln2)

## Goodness-of-fit statistics
##                              1-mle-lnorm
## Kolmogorov-Smirnov statistic  0.07359625
## Cramer-von Mises statistic    1.39874800
## Anderson-Darling statistic    9.43582444
## 
## Goodness-of-fit criteria
##                                1-mle-lnorm
## Akaike's Information Criterion    12691.55
## Bayesian Information Criterion    12701.01

gof_ln <- gofstat(fit_ln2)

# Code written for quick change between models 
model <- fit_ln2
set.seed(1234)
sim1<- rlnorm(length(X), meanlog = model$estimate[1], sdlog = model$estimate[2])
par(mfrow=c(2,2))
hist(X)
hist(sim1)


# Modelled and Simulation quantiles compared to observed quantiles 
data.frame( Modelled = round(c(qlnorm(0, meanlog = model$estimate[1], sdlog = model$estimate[2]), qlnorm(0.25, meanlog = model$estimate[1], sdlog = model$estimate[2] ), qlnorm(0.5,meanlog = model$estimate[1], sdlog = model$estimate[2]), qlnorm(0.75, meanlog = model$estimate[1], sdlog = model$estimate[2]), qlnorm(1, meanlog = model$estimate[1], sdlog = model$estimate[2])),1), Simulation = round((quantile(sim1)),1), Observed = quantile(X))

summary(model)

## Fitting of the distribution ' lnorm ' by maximum likelihood 
## Parameters : 
##         estimate Std. Error
## meanlog 5.697840 0.05595784
## sdlog   1.616978 0.03956810
## Loglikelihood:  -6343.777   AIC:  12691.55   BIC:  12701.01 
## Correlation matrix:
##         meanlog sdlog
## meanlog       1     0
## sdlog         0     1

# fitted distribution mean
exp(model$estimate[1]+0.5*model$estimate[2]^2)

##  meanlog 
## 1102.294

# mean of simulation
data.frame(statistic = c("mean", "standard deviation"), lognormal_Sim = c(round(mean(sim1),1), round(sd(sim1),1)), Observed =  c(round(mean(X),1), round(sd(X),1)))

#Fitting Exponential

# Fitting of exponential distribution

x <- as.numeric(X)
x <- x[is.finite(x) & x >= 0]
x_pos <- x[x > 0]

fit_exp <- fitdist(
  x_pos, "exp",
  start = list(rate = 1/mean(x_pos)),
  lower = 1e-12,                  # keep rate > 0
  method = "mle",
  optim.method = "Nelder-Mead"    # more stable for 1-param cases
)
summary(fit_exp)

## Fitting of the distribution ' exp ' by maximum likelihood 
## Parameters : 
##         estimate
## rate 0.001272567
## Loglikelihood:  -6401.71   AIC:  12805.42   BIC:  12810.15

gofstat(fit_exp)

## Goodness-of-fit statistics
##                               1-mle-exp
## Kolmogorov-Smirnov statistic  0.1489007
## Cramer-von Mises statistic    6.3393067
## Anderson-Darling statistic   46.8470937
## 
## Goodness-of-fit criteria
##                                1-mle-exp
## Akaike's Information Criterion  12805.42
## Bayesian Information Criterion  12810.15

gof_exp <- gofstat(fit_exp)
coef(fit_exp)

##        rate 
## 0.001272567

plot(fit_exp)

# Code written for quick change between models 
model <- fit_exp
set.seed(1234)

sim1<- rexp(length(X), rate = fit_exp$estimate)
par(mfrow=c(2,2))
hist(X)
hist(sim1)

# Simulation quantiles compared to observed quantiles 
data.frame( Modelled = round(c(qexp(0, rate = fit_exp$estimate), qexp(0.25, rate = fit_exp$estimate ), qexp(0.5, rate = fit_exp$estimate), qexp(0.75, rate = fit_exp$estimate), qexp(1, rate = fit_exp$estimate)),1), Simulation = round((quantile(sim1)),1), Observed = quantile(X))

summary(model)

## Fitting of the distribution ' exp ' by maximum likelihood 
## Parameters : 
##         estimate
## rate 0.001272567
## Loglikelihood:  -6401.71   AIC:  12805.42   BIC:  12810.15

# pure mean under fitted model
1/fit_exp$estimate

##     rate 
## 785.8132

# observed mean 
mean(X)

## [1] 785.8132

# mean of simulation
data.frame(statistic = c("mean", "standard deviation"), Exponetial_sim = c(round(mean(sim1),1), round(sd(sim1),1)), Observed =  c(round(mean(X),1), round(sd(X),1)))

#Gamma model fitted through mme

### Gamma ### 

fit_gam <- fitdist(X, "gamma", method = "mme")

gofstat(fit_gam)

## Goodness-of-fit statistics
##                              1-mme-gamma
## Kolmogorov-Smirnov statistic  0.07455777
## Cramer-von Mises statistic    0.21349555
## Anderson-Darling statistic    3.47019630
## 
## Goodness-of-fit criteria
##                                1-mme-gamma
## Akaike's Information Criterion    12679.84
## Bayesian Information Criterion    12689.29

gof_gamma <- gofstat(fit_gam)

summary(fit_gam)

## Fitting of the distribution ' gamma ' by matching moments 
## Parameters : 
##           estimate   Std. Error
## shape 0.5456635627 4.494607e-02
## rate  0.0006943935 6.580093e-05
## Loglikelihood:  -6337.918   AIC:  12679.84   BIC:  12689.29 
## Correlation matrix:
##           shape      rate
## shape 1.0000000 0.8692414
## rate  0.8692414 1.0000000

plot(fit_gam)

# Code written for quick change between models 
model <- fit_gam
set.seed(1234)
fit_gam$estimate

##        shape         rate 
## 0.5456635627 0.0006943935

sim1<- rgamma(length(X), shape = fit_gam$estimate[1], rate = fit_gam$estimate[2])

# Graph
par(mfrow=c(2,2))
hist(X)
hist(sim1)


# Simulation quantiles compared to observed quantiles 
data.frame(
          Modelled = round(c(qgamma(0, shape = fit_gam$estimate[1], rate = fit_gam$estimate[2]), qgamma(0.25, shape = fit_gam$estimate[1], rate = fit_gam$estimate[2]), qgamma(0.5, shape = fit_gam$estimate[1], rate = fit_gam$estimate[2]), qgamma(0.75,shape = fit_gam$estimate[1], rate = fit_gam$estimate[2]), qgamma(1, shape = fit_gam$estimate[1], rate = fit_gam$estimate[2])),1), 
           
           Simulation = round((quantile(sim1)),1), 
           
           Observed = quantile(X)
          )

summary(model)

## Fitting of the distribution ' gamma ' by matching moments 
## Parameters : 
##           estimate   Std. Error
## shape 0.5456635627 4.494607e-02
## rate  0.0006943935 6.580093e-05
## Loglikelihood:  -6337.918   AIC:  12679.84   BIC:  12689.29 
## Correlation matrix:
##           shape      rate
## shape 1.0000000 0.8692414
## rate  0.8692414 1.0000000

# fitted distribution mean
mean(sim1)

## [1] 784.6038

# Mean and variance of simulation compared to Observed
data.frame(statistic = c("mean", "standard deviation"), Gamma_Sim = c(round(mean(sim1),1), round(sd(sim1),1)), Observed =  c(round(mean(X),1), round(sd(X),1)))

#Weibull distribution MLE

fit_wei <- fitdist(X, "weibull", method = "mle")
gofstat(fit_wei)

## Goodness-of-fit statistics
##                              1-mle-weibull
## Kolmogorov-Smirnov statistic    0.04605632
## Cramer-von Mises statistic      0.23584832
## Anderson-Darling statistic      2.50396357
## 
## Goodness-of-fit criteria
##                                1-mle-weibull
## Akaike's Information Criterion      12652.32
## Bayesian Information Criterion      12661.77

gof_wei <- gofstat(fit_wei)
plot(fit_wei)

model <- fit_wei
set.seed(1234)

fit_wei$estimate

##       shape       scale 
##   0.7330834 644.9072807

sim1<- rweibull(length(X), shape = fit_wei$estimate[1], scale = fit_wei$estimate[2])
par(mfrow=c(2,2))
hist(X)
hist(sim1)

# Modelled, Simulation quantiles compared to observed quantiles 
data.frame(
  Modelled = round(c(qweibull(0, shape = fit_wei$estimate[1], scale = fit_wei$estimate[2]), qweibull(0.25, shape = fit_wei$estimate[1], scale = fit_wei$estimate[2] ), qweibull(0.5, shape = fit_wei$estimate[1], scale = fit_wei$estimate[2]), qweibull(0.75, shape = fit_wei$estimate[1], scale = fit_wei$estimate[2]), qweibull(1, shape = fit_wei$estimate[1], scale = fit_wei$estimate[2])),1), 
            
            Simulation = round((quantile(sim1)),1), 
            
            Observed = quantile(X)
            )

summary(model)

## Fitting of the distribution ' weibull ' by maximum likelihood 
## Parameters : 
##          estimate  Std. Error
## shape   0.7330834  0.01975975
## scale 644.9072807 32.13525400
## Loglikelihood:  -6324.159   AIC:  12652.32   BIC:  12661.77 
## Correlation matrix:
##           shape     scale
## shape 1.0000000 0.3205504
## scale 0.3205504 1.0000000

# fitted distribution mean
mean(sim1)

## [1] 771.2906

# Mean and variance of simulation compared to Observed
data.frame(statistic = c("mean", "standard deviation"), Weibull_sim = c(round(mean(sim1),1), round(sd(sim1),1)), Observed =  c(round(mean(X),1), round(sd(X),1)))

#Goodness of fit Summary for all 4 models

library(caret)

## Warning: package 'caret' was built under R version 4.4.2

## Loading required package: ggplot2

## Warning: package 'ggplot2' was built under R version 4.4.3

## Loading required package: lattice

## 
## Attaching package: 'caret'

## The following object is masked from 'package:survival':
## 
##     cluster

set.seed(1234)
data.frame(Goodness_of_fit = c("AIC", "log-likelihood", "RMSE", "AD - Test", "KS - Test") , 
           
           Lognormal = c(round(fit_ln2$aic,1), round(fit_ln2$loglik,1), round(RMSE(rlnorm(length(X), meanlog = fit_ln2$estimate[1], sdlog = fit_ln2$estimate[2]), X),1), gof_ln$adtest, gof_ln$kstest), 
           
           Exponetial = c(round(fit_exp$aic,1), round(fit_exp$loglik,1), round(RMSE(rexp(length(X), rate = fit_exp$estimate),X),1), gof_exp$adtest, gof_exp$kstest), 
           
           Gamma = c(round(fit_gam$aic,1), round(fit_gam$loglik,1), round(RMSE(rgamma(length(X), shape = fit_gam$estimate[1], rate = fit_gam$estimate[2]),X),1), gof_gamma$adtest, gof_gamma$kstest),
           
           Weibull = c(round(fit_wei$aic,1), round(fit_wei$loglik,1), round(RMSE(rweibull(length(X), shape = fit_wei$estimate[1], scale = fit_wei$estimate[2]),X),1), gof_wei$adtest, gof_wei$kstest)
           )

#Resulting effect of assuming no distribution between 8,400 -> 25,000 under gamma, weibull distribution and rule of 3

# Using our gamma to estimate expected loss which would accure associated with P[8,400 < X < 25,000]
set.seed(1234)
sim1<- rgamma(length(X), shape = fit_gam$estimate[1], rate = fit_gam$estimate[2])
max <- max(sim1)

q <- pgamma(25000, shape = fit_gam$estimate[1], rate = fit_gam$estimate[2]) - pgamma(max, shape = fit_gam$estimate[1], rate = fit_gam$estimate[2])
upper_impact <- 25000 * q            # <= loss in LEV if you assume no mass in (8.4k,25k)
lambda <- sum(data$numclms)/sum(data$exposure)
upper_impact*lambda

## [1] 2.951265

# Using our weibul to estimate expected loss which would accure associated with P[8,400 < X < 25,000]
set.seed(1234)
sim1<- rweibull(length(X), shape = fit_wei$estimate[1], scale = fit_wei$estimate[2])
max <- max(sim1)

q <- pweibull(25000,  shape = fit_wei$estimate[1], scale = fit_wei$estimate[2]) - pweibull(max, shape = fit_wei$estimate[1], scale = fit_wei$estimate[2])
upper_impact <- 25000 * q            # <= loss in LEV if you assume no mass in (8.4k,25k)
lambda <- sum(data$numclms)/sum(data$exposure)
upper_impact*lambda

## [1] 1.361063

# Rule of 3: for distribution free upper limit estimation 
n_nonzero <- sum(X > 0) ; n_nonzero

## [1] 835

p_upper   <- 3 / n_nonzero              # 95% upper bound with 0 events
upper_impact_emp_per_claim <- 25000 * p_upper
lambda <- sum(data$numclms) / sum(data$exposure)

upper_impact_emp_per_exposure <- upper_impact_emp_per_claim * lambda
upper_impact_emp_total <- upper_impact_emp_per_claim * sum(X)

upper_impact_emp_per_exposure

## [1] 14.32528

upper_impact_emp_total

## [1] 58935988

upper_impact_emp_per_claim

## [1] 89.82036

Adding RAdding RAdding Retention level / Cap

# Following on from original code with the addition of the retention level cap: 
numRet <- round(EstM, digits=0)
X<-append(X,rep(25000,times=numRet))
hist(X,main = "Non-Zero claims with retention")

Threshold exceedance using Pareto and swiss Re and Lloyds curve for X<M

## ---------------------------------------------------------------
## Inputs
## ---------------------------------------------------------------


M <- 25000              # retention / cap
X <- as.numeric(X)      # severities (each capped at M in your data)
lambda <- sum(data$numclms, na.rm = TRUE) / sum(data$exposure, na.rm = TRUE)

## Normalized severities for exposure-curve fit
Z     <- pmin(X, M) / M
Z_obs <- Z[Z < 1]
ec    <- ecdf(Z_obs)
grid  <- seq(0, max(Z_obs), length.out = 200)
Fn_emp <- ec(grid)


## ---------------------------------------------------------------
## Exposure curves
## ---------------------------------------------------------------
F_swiss  <- function(z, alpha)  1 - (1 - pmin(pmax(z,0),1))^alpha   # alpha>0
F_lloyds <- function(z, beta)   pmin(pmax(z,0),1)^beta              # beta>=1

sse_swiss  <- function(par) { alpha <- exp(par);     mean((F_swiss(grid,  alpha) - Fn_emp)^2) }
sse_lloyds <- function(par) { beta  <- 1 + exp(par); mean((F_lloyds(grid, beta)  - Fn_emp)^2) }

set.seed(1)
fit_sw <- optim(par = log(0.3),     fn = sse_swiss,  method = "BFGS")
fit_ll <- optim(par = log(2 - 1.0), fn = sse_lloyds, method = "BFGS")

alpha_hat <- exp(fit_sw$par)
beta_hat  <- 1 + exp(fit_ll$par)

## Limited expected value from each curve: LEV(M) = M * ∫_0^1 [1 - F(z)] dz
area_sw  <- integrate(function(z) 1 - F_swiss(z,  alpha_hat), 0, 1)$value
area_ll  <- integrate(function(z) 1 - F_lloyds(z, beta_hat),  0, 1)$value

LEV_M_swiss  <- M * area_sw
LEV_M_lloyds <- M * area_ll


## ---------------------------------------------------------------
## Exceedance probability at M (observable)
## ---------------------------------------------------------------
S_M <-  length(X[X>=M])/length(X)  # proportion of capped observations among non-zero claims
S_M

## [1] 0.7217594

## ---------------------------------------------------------------
## Pareto tail continuation above M
##  E[X - M | X > M] = M / (alpha_tail - 1),  alpha_tail > 1
## 
## With sensitivity testing to alpha levels
## ---------------------------------------------------------------

alpha_tail_vec <- c(1.1, 1.2, 1.5, 1.7, 2.0, 2.2, 2.5, 3)   # sensitivity set, smaller the value of alpha the heavier the tail, i.e. more prudent (Changes Pareto distribition, that is the distribution of claims above M)

EX_ceded_per_claim <- function(alpha_tail) {
  if (alpha_tail <= 1) return(Inf)
  S_M * (M / (alpha_tail - 1))
}

EX_ceded_list <- sapply(alpha_tail_vec, EX_ceded_per_claim)
names(EX_ceded_list) <- paste0("alpha_", alpha_tail_vec)

## Combine with retained LEV from exposure curves (choose Swiss or Lloyd's)
LEV_ret <- c(Swiss = LEV_M_swiss, Lloyds = LEV_M_lloyds)

## Ground-up mean severity for each curve & tail alpha:
make_summary <- function(LEV_retained) {
  EX_ground <- LEV_retained + EX_ceded_list          # per claim
  cbind(
    LEV_retained = LEV_retained,                     # Come back and change this to value from question 1 distribution
    EX_ceded     = EX_ceded_list,
    EX_ground    = EX_ground,
    PP_retained  = lambda * LEV_retained,
    PP_ceded     = lambda * EX_ceded_list,
    PP_ground    = lambda * EX_ground
  )
}

summary_swiss  <- make_summary(LEV_M_swiss)
summary_lloyds <- make_summary(LEV_M_lloyds)

summary_swiss

##           LEV_retained   EX_ceded  EX_ground PP_retained  PP_ceded PP_ground
## alpha_1.1     110.0596 180439.853 180549.913     17.5532 28778.011 28795.564
## alpha_1.2     110.0596  90219.927  90329.986     17.5532 14389.006 14406.559
## alpha_1.5     110.0596  36087.971  36198.030     17.5532  5755.602  5773.155
## alpha_1.7     110.0596  25777.122  25887.182     17.5532  4111.144  4128.698
## alpha_2       110.0596  18043.985  18154.045     17.5532  2877.801  2895.354
## alpha_2.2     110.0596  15036.654  15146.714     17.5532  2398.168  2415.721
## alpha_2.5     110.0596  12029.324  12139.383     17.5532  1918.534  1936.087
## alpha_3       110.0596   9021.993   9132.052     17.5532  1438.901  1456.454

summary_lloyds

##           LEV_retained   EX_ceded EX_ground PP_retained  PP_ceded PP_ground
## alpha_1.1        12500 180439.853 192939.85    1993.601 28778.011 30771.613
## alpha_1.2        12500  90219.927 102719.93    1993.601 14389.006 16382.607
## alpha_1.5        12500  36087.971  48587.97    1993.601  5755.602  7749.204
## alpha_1.7        12500  25777.122  38277.12    1993.601  4111.144  6104.746
## alpha_2          12500  18043.985  30543.99    1993.601  2877.801  4871.402
## alpha_2.2        12500  15036.654  27536.65    1993.601  2398.168  4391.769
## alpha_2.5        12500  12029.324  24529.32    1993.601  1918.534  3912.135
## alpha_3          12500   9021.993  21521.99    1993.601  1438.901  3432.502

## ---------------------------------------------------------------
## Simple plots (optional)
## ---------------------------------------------------------------
par(mfrow=c(1,2))
plot(ec, main = "ECDF of Z and fitted curves", xlab = "z = X/M", ylab = "F(z)")
curve(F_swiss(x,  alpha_hat), add = TRUE, col = 2, lwd = 2)
curve(F_lloyds(x, beta_hat),  add = TRUE, col = 4, lwd = 2)
legend("bottomright", c("Empirical", "Swiss (fit)", "Lloyd's (fit)"),
       lty = 1, col = c(1,2,4), bty = "n")

curve(1 - F_swiss(x,  alpha_hat), 0, 1, n = 200, col = 2, lwd = 2,
      main = "1 - F(z) (area × M = LEV(M))", xlab = "z", ylab = "1 - F(z)")
curve(1 - F_lloyds(x, beta_hat), add = TRUE, col = 4, lwd = 2)
legend("topright", c("Swiss", "Lloyd's"), col = c(2,4), lty = 1, bty = "n")

make_summary <- function(lev) {
  rn <- names(EX_ceded_list)                         # e.g. "alpha_1.1", ...
  df <- data.frame(
    alpha = alpha_tail_vec,
    LEV_retained = lev,
    EX_ceded     = EX_ceded_list,
    EX_ground    = lev + EX_ceded_list,
    PP_retained  = lambda * lev,
    PP_ceded     = lambda * EX_ceded_list,
    PP_ground    = lambda * (lev + EX_ceded_list)
  )
  rownames(df) <- rn
  df
}

summary_swiss  <- make_summary(LEV_M_swiss)
summary_lloyds <- make_summary(LEV_M_lloyds)

round(summary_swiss, 2)

round(summary_lloyds, 2)

plot(alpha_tail_vec, EX_ceded_list, type="b", pch=16, lwd=2,
     xlab="Tail alpha", ylab="E[(X-M)^+]", main="Ceded Expectation vs Tail α")
abline(v=1.5, lty=3, col=8); text(1.5, max(EX_ceded_list), "α=1.5", pos=4, col=8)

Weibull and Gamma for X < M

S_M <-  length(X[X>=M])/length(X)  # proportion of capped observations among non-zero claims
S_M

## [1] 0.7217594

# Pareto 
alpha_tail_vec <- c(1.1, 1.2, 1.5, 1.7, 2.0, 2.2, 2.5, 3)   # sensitivity set, smaller the value of alpha the heavier the tail, i.e. more prudent (Changes Pareto distribition, that is the distribution of claims above M)

EX_ceded_per_claim <- function(alpha_tail) {
  if (alpha_tail <= 1) return(Inf)
  S_M * (M / (alpha_tail - 1))
}
EX_ceded_list <- sapply(alpha_tail_vec, EX_ceded_per_claim)
names(EX_ceded_list) <- paste0("alpha_", alpha_tail_vec)

# Summary function: 
make_summary <- function(lev) {
  rn <- names(EX_ceded_list)                         # e.g. "alpha_1.1", ...
  df <- data.frame(
    alpha = alpha_tail_vec,
    LEV_retained = lev,
    EX_ceded     = EX_ceded_list,
    EX_ground    = lev + EX_ceded_list,
    PP_retained  = lambda * lev,
    PP_ceded     = lambda * EX_ceded_list,
    PP_ground    = lambda * (lev + EX_ceded_list)
  )
  rownames(df) <- rn
  df
}


# Weibull
k  <- unname(fit_wei$estimate[1])
th <- unname(fit_wei$estimate[2])

# LEV for Weibull
lev_weibull <- function(u, shape, scale) {
  t <- (u/scale)^shape
  scale * gamma(1 + 1/shape) * pgamma(t, shape = 1 + 1/shape, rate = 1) +
    u * exp(-t)
}

LEV_M_weibull <- lev_weibull(M, shape = k, scale = th)
summary_weibull <- make_summary(LEV_M_weibull)
round(summary_weibull,1)

# Gamma
shape<-fit_gam$estimate[1]
rate<-fit_gam$estimate[2]

LEV_gamma <- function(M, shape, rate) {
  term1 <- pgamma(rate * M, shape = shape + 1) / rate
  term2 <- M * (1 - pgamma(rate * M, shape = shape))
  term1 + term2
}
LEV_M_gamma <- LEV_gamma(M, shape = shape, rate = rate)
LEV_M_gamma

##     rate 
## 1440.106

summary_gamma <- make_summary(LEV_M_gamma)

## Warning in data.frame(alpha = alpha_tail_vec, LEV_retained = lev, EX_ceded =
## EX_ceded_list, : row names were found from a short variable and have been
## discarded

round(summary_gamma,1)

Spliced distributions:

## 1) Tail weight among non-zero claims
S_M <- length(X[X >= M & X > 0]) / length(X[X > 0])
lambda <- sum(data$numclms, na.rm = TRUE) / sum(data$exposure, na.rm = TRUE)

## 2) Pareto EX above M (unchanged — correct)
alpha_tail_vec <- c(1.1, 1.2, 1.5, 1.7, 2.0, 2.2, 2.5, 3)

EX_ceded_per_claim <- function(alpha_tail) {
  if (alpha_tail <= 1) return(Inf)
  S_M * (M / (alpha_tail - 1))
}
EX_ceded_list <- sapply(alpha_tail_vec, EX_ceded_per_claim)
names(EX_ceded_list) <- paste0("alpha_", alpha_tail_vec)

## 3) Summary builder 
make_summary <- function(lev_retained) {
  data.frame(
    alpha       = alpha_tail_vec,
    LEV_retained= lev_retained,              # same across alphas at u = M
    EX_ceded    = EX_ceded_list,             # varies with alpha
    EX_ground   = lev_retained + EX_ceded_list,   # = E[X] under splice
    PP_retained = lambda * lev_retained,
    PP_ceded    = lambda * EX_ceded_list,
    PP_ground   = lambda * (lev_retained + EX_ceded_list)
  )
}

##  WEIBULL BODY (truncated at M) 
k  <- unname(fit_wei$estimate[1])   # shape
th <- unname(fit_wei$estimate[2])   # scale

# FULL-distribution LEV up to u for Weibull body (you already had this)
lev_weibull_full <- function(u, shape, scale) {
  t <- (u/scale)^shape
  scale * gamma(1 + 1/shape) * pgamma(t, shape = 1 + 1/shape, rate = 1) +
    u * exp(-t)
}

# Body CDF at M
F_M_weib <- pweibull(M, shape = k, scale = th)

# *** Retained LEV at u = M under the splice ***
# (1 - S_M) * [ FULL body LEV at M / F_b(M) ] + M * S_M
LEV_M_weibull_retained <- (1 - S_M) * (lev_weibull_full(M, k, th) / F_M_weib) + M * S_M

summary_weibull <- make_summary(LEV_M_weibull_retained)
round(summary_weibull, 1)

## ==== GAMMA BODY (truncated at M) ====
shape <- unname(fit_gam$estimate[1])
rate  <- unname(fit_gam$estimate[2])  

# FULL-distribution LEV up to M for Gamma body 
LEV_gamma_full <- function(u, shape, rate) {
  term1 <- pgamma(rate * u, shape = shape + 1) / rate   
  term2 <- u * (1 - pgamma(rate * u, shape = shape))
  term1 + term2
}

F_M_gamma <- pgamma(M, shape = shape, rate = rate)

LEV_M_gamma_retained <- (1 - S_M) * (LEV_gamma_full(M, shape, rate) / F_M_gamma) + M * S_M

summary_gamma <- make_summary(LEV_M_gamma_retained)
round(summary_gamma, 1)

## 1) Tail weight among non-zero claims
S_M <- length(X[X >= M & X > 0]) / length(X[X > 0])

## 2) Pareto EX above M (your function is correct)
alpha_tail_vec <- c(1.1, 1.2, 1.5, 1.7, 2.0, 2.2, 2.5, 3)

EX_ceded_per_claim <- function(alpha_tail) {
  if (alpha_tail <= 1) return(Inf)
  S_M * (M / (alpha_tail - 1))
}
EX_ceded_list <- sapply(alpha_tail_vec, EX_ceded_per_claim)
names(EX_ceded_list) <- paste0("alpha_", alpha_tail_vec)

## 3) Summary builder (lambda must be defined earlier)
make_summary <- function(lev_retained_M) {
  data.frame(
    alpha        = alpha_tail_vec,
    LEV_retained = lev_retained_M,                 # same across alphas at u = M
    EX_ceded     = EX_ceded_list,                  # varies with alpha
    EX_ground    = lev_retained_M + EX_ceded_list, # E[X] under splice
    PP_retained  = lambda * lev_retained_M,
    PP_ceded     = lambda * EX_ceded_list,
    PP_ground    = lambda * (lev_retained_M + EX_ceded_list)
  )
}

## ===== WEIBULL BODY (fit on X < M) =====
k  <- unname(fit_wei$estimate[1])   # shape
th <- unname(fit_wei$estimate[2])   # scale

# Full (one-piece) body LEV up to u
lev_weibull_full <- function(u, shape, scale) {
  t <- (u/scale)^shape
  scale * gamma(1 + 1/shape) * pgamma(t, shape = 1 + 1/shape, rate = 1) +
    u * exp(-t)
}

# Body CDF at M
F_M_weib <- pweibull(M, shape = k, scale = th)

# *** Spliced retained LEV at u = M ***
# (1 - S_M) * [ full-body LEV at M / F_b(M) ] + M * S_M
LEV_M_weibull_retained <- (1 - S_M) * (lev_weibull_full(M, k, th) / F_M_weib) + M * S_M

summary_weibull <- make_summary(LEV_M_weibull_retained)
round(summary_weibull, 1)

## ===== GAMMA BODY (fit on X < M) =====
shape <- unname(fit_gam$estimate[1])   # Gamma shape
rate  <- unname(fit_gam$estimate[2])   # Gamma rate (1/scale)

# Full (one-piece) body LEV up to u for Gamma
LEV_gamma_full <- function(u, shape, rate) {
  term1 <- pgamma(rate * u, shape = shape + 1) / rate
  term2 <- u * (1 - pgamma(rate * u, shape = shape))
  term1 + term2
}

F_M_gamma <- pgamma(M, shape = shape, rate = rate)

LEV_M_gamma_retained <- (1 - S_M) * (LEV_gamma_full(M, shape, rate) / F_M_gamma) + M * S_M

summary_gamma <- make_summary(LEV_M_gamma_retained)
round(summary_gamma, 1)

Plot of Spliced Distribution based on Weibull under alpha = 1.5

## ---- Inputs you already have ----
# X: your data (non-zero + zeros ok)
# M: cap / splice point (e.g. 25000)
S_M <- mean(X >= M & X > 0)                 # tail weight among non-zero claims

# WEIBULL body fit on X < M
k  <- unname(fit_wei$estimate[1])
th <- unname(fit_wei$estimate[2]) 


# Gamme distribution has being hashed out in all cases below
# a <- unname(fit_gam$estimate[1]); 
#b <- unname(fit_gam$estimate[2])

## ---- Body CDF/PDF helpers ----
F_b <- function(x) pweibull(x, shape = k, scale = th)
f_b <- function(x) dweibull(x, shape = k, scale = th)
# For Gamma instead:
# F_b <- function(x) pgamma(x, shape = a, rate = b)
# f_b <- function(x) dgamma(x, shape = a, rate = b)

F_M <- F_b(M); stopifnot(F_M > 0 && S_M < 1)
w   <- 1 - S_M

## ---- Choose a Pareto tail shape alpha ----
# Option B: set alpha by judgement/sensitivity
alpha <- 1.5  # e.g., pick from c(1.2, 1.5, 1.7, 2.0, 2.5)

stopifnot(alpha > 1)

# Truncated body parts:
F_M <- F_b(M); w <- 1 - S_M
f_b_trunc <- function(x) f_b(x) / F_M
F_b_trunc <- function(x) F_b(x) / F_M
S_b_trunc <- function(x) 1 - F_b_trunc(x)

# Spliced survival / pdf:
S_total <- function(x) ifelse(x < M, w * S_b_trunc(x) + S_M, S_M * (M/x)^alpha)
f_total <- function(x) ifelse(x < M, w * f_b_trunc(x), S_M * alpha * M^alpha / x^(alpha+1))


## ---- Spliced PDF/CDF/Survival ----
# Truncated body pieces on [0, M)
f_b_trunc <- function(x) f_b(x) / F_M
F_b_trunc <- function(x) F_b(x) / F_M
S_b_trunc <- function(x) 1 - F_b_trunc(x)

# Survival (global)
S_total <- function(x) ifelse(
  x < M,
  w * S_b_trunc(x) + S_M,                            # below M
  S_M * (M / x)^alpha                                # tail
)

# CDF (global)
F_total <- function(x) 1 - S_total(x)

# PDF (global)
f_total <- function(x) ifelse(
  x < M,
  w * f_b_trunc(x),                                  # below M
  S_M * alpha * M^alpha / (x^(alpha + 1))            # tail density
)

## ---- Plot grids ----
xs <- sort(unique(c(
  seq(0, M, length.out = 800),
  seq(M, max(M*5, quantile(X[X>0], 0.999, na.rm=TRUE)), length.out = 800)
)))

# Base R plots (simple & dependency-free)
par(mfrow = c(1,3), mar = c(4,4,2,1))

# 1) PDF
plot(xs, f_total(xs), type="l", lwd=2, xlab="x", ylab="pdf f(x)",
     main="Spliced PDF")
abline(v=M, lty=3)

# 2) CDF
plot(xs, F_total(xs), type="l", lwd=2, xlab="x", ylab="CDF F(x)",
     main="Spliced CDF")
abline(v=M, lty=3)

# 3) Survival (log–log tail view helps)
plot(xs, S_total(xs), type="l", lwd=2, xlab="x", ylab="Survival S(x)",
     main="Spliced Survival"); abline(v=M, lty=3)

## ----------------------------
## Spliced severity simulator
## ----------------------------
# Inputs:
#   M      : splice/cap threshold (e.g., 25000)
#   S_M    : tail weight among non-zero claims = P(X >= M | X > 0)
#   alpha  : Pareto tail shape (>1), pick by judgment/sensitivity
#   body   : "weibull" or "gamma"
#   pars   : list of body params
#            - Weibull: list(shape = k, scale = th)
#            - Gamma  : list(shape = a, rate = b)
# Returns a function r_splice(n) that simulates n severities.
make_r_splice <- function(M, S_M, alpha, body = c("weibull","gamma"), pars) {
  stopifnot(S_M >= 0, S_M < 1, alpha > 1)
  body <- match.arg(body)

  if (body == "weibull") {
    k  <- pars$shape; th <- pars$scale
    F_b  <- function(x) pweibull(x, shape=k, scale=th)
    q_b  <- function(p) qweibull(p, shape=k, scale=th)
  } else {
    a <- pars$shape; b <- pars$rate
    F_b  <- function(x) pgamma(x, shape=a, rate=b)
    q_b  <- function(p) qgamma(p, shape=a, rate=b)
  }

  F_M <- F_b(M); stopifnot(F_M > 0)
  w   <- 1 - S_M

  function(n) {
    u_mix <- runif(n)
    x <- numeric(n)

    # Body draws: truncated at M → inverse-CDF on [0, F_b(M)]
    idx_b <- which(u_mix > S_M)
    nb <- length(idx_b)
    if (nb > 0) {
      u_body <- runif(nb, min = 0, max = F_M) # uniform on [0, F_b(M)]
      x[idx_b] <- q_b(u_body)                 # ensures x < M almost surely
    }

    # Tail draws: Pareto on [M, ∞) with scale M, shape alpha
    idx_t <- which(u_mix <= S_M)
    nt <- length(idx_t)
    if (nt > 0) {
      u_tail <- runif(nt)                     # on (0,1)
      x[idx_t] <- M / (u_tail)^(1/alpha)     # qPareto(u)
    }

    x
  }
}

## ---------- Example usage ----------
set.seed(1)
M <- 25000

# Tail weight among non-zero claims:
S_M <- mean(X >= M & X > 0)  # use your data

# Choose a tail shape (sensitivity)
alpha <- 1.5  # try c(1.2, 1.5, 1.7, 2.0, 2.5)

# WEIBULL body (from your fit)
k  <- unname(fit_wei$estimate[1])
th <- unname(fit_wei$estimate[2])

r_splice_weib <- make_r_splice(
  M=M, S_M=S_M, alpha=alpha,
  body="weibull", pars=list(shape=k, scale=th)
)

# Or GAMMA body:
# a <- unname(fit_gam$estimate[1]); b <- unname(fit_gam$estimate[2])
# r_splice_gam <- make_r_splice(M, S_M, alpha, body="gamma", pars=list(shape=a, rate=b))

# Simulate
n <- 1e5
X_sim <- r_splice_weib(n)

# Retained and ceded pieces
Retained <- pmin(X_sim, M)
Ceded    <- pmax(X_sim - M, 0)

# SanCeded# Sanity checks
mean(Retained)                    # ≈ LEV(M) under splice

## [1] 18254.45

mean(Ceded)                       # ≈ S_M * M / (alpha - 1)

## [1] 36224.79

mean(Retained) + mean(Ceded)      # ≈ ground-up mean E[X] under splice

## [1] 54479.24

# Quick visuals
hist(X_sim[X_sim < M], breaks=50, freq=FALSE,
     main="Spliced PDF (below M) + Tail overlay", xlab="x")
# Overlay model pdf below M (scaled by w/F_M if you want exact heights);
# for quick check, just add a kernel density for simulated below-M:
lines(density(X_sim[X_sim < M]), lwd=2)

# Survival on log-log to inspect Pareto tail
xs <- seq(M, quantile(X_sim, 0.999), length.out=300)
S_emp <- sapply(xs, function(t) mean(X_sim >= t))
plot(xs, S_emp, log="xy", type="l", lwd=2,
     xlab="x (log)", ylab="S(x) (log)", main="Tail ~ Pareto (log-log)")
abline(v=M, lty=3)

## Assumes you already have:
## X_sim (simulated from the splice), M, S_M, alpha,
## and a body fit (Weibull or Gamma) with its params.

## --- Define body pdf f_b(x) and body CDF at M ---
# If WEIBULL body:
k  <- unname(fit_wei$estimate[1])
th <- unname(fit_wei$estimate[2])
f_b <- function(x) dweibull(x, shape = k, scale = th)
F_M <- pweibull(M, shape = k, scale = th)

# (If GAMMA body instead, swap the two lines above for:)
# a <- unname(fit_gam$estimate[1]); b <- unname(fit_gam$estimate[2])
# f_b <- function(x) dgamma(x, shape = a, rate = b)
# F_M <- pgamma(M, shape = a, rate = b)

stopifnot(F_M > 0)
w <- 1 - S_M

## --- Spliced pdf components ---
f_body_spliced <- function(x) ifelse(x < M, w * f_b(x) / F_M, 0)  # truncated body, weighted
f_pareto       <- function(x) ifelse(x >= M, S_M * alpha * M^alpha / x^(alpha + 1), 0)
f_total        <- function(x) f_body_spliced(x) + f_pareto(x)

## --- Plot: histogram + KDE for below M + overlays for body/pareto/total ---
# pick a wider max for the axis
xmax <- max(M * 3, quantile(X_sim, 0.9995, na.rm = TRUE))  # widen as needed

par(mar = c(4,4,2,1))  # optional: add a bit of right margin
hist(X_sim, breaks = 120, freq = FALSE,
     border = "gray80", col = "gray95",
     xlim = c(0, xmax), xaxs = "i",        # "i" = no extra padding
     main = "Spliced PDF with Pareto Tail Overlay", xlab = "Claim size")

abline(v = M, lty = 3)

# re-add your overlays after hist():
lines(density(X_sim[X_sim < M]), lwd = 2)

xs_left  <- seq(0, M, length.out = 800)
xs_right <- seq(M, xmax, length.out = 800)
lines(xs_left,  f_body_spliced(xs_left),  lwd = 2, lty = 2)
lines(xs_right, f_pareto(xs_right),       lwd = 2, lty = 3)
lines(c(xs_left, xs_right), f_total(c(xs_left, xs_right)), lwd = 2)


# Kernel density of simulated values (optional visual smooth)
lines(density(X_sim[X_sim < M]), lwd = 2)  # default color = black

# Overlays from the model
xs_left  <- seq(0, M, length.out = 800)
xs_right <- seq(M, xmax, length.out = 800)

# Body-only (spliced, below M)
lines(xs_left,  f_body_spliced(xs_left),  lwd = 2, lty = 2)      # dashed

# Pareto-only (tail, >= M)
lines(xs_right, f_pareto(xs_right),       lwd = 2, lty = 3)      # dotted

# Total spliced pdf
lines(c(xs_left, xs_right), f_total(c(xs_left, xs_right)), lwd = 2)

abline(v = M, lty = 3)

legend("topright",
       c("KDE (sim, <M)", "Body (spliced)", "Pareto tail", "Total spliced pdf"),
       lty = c(1, 2, 3, 1), lwd = c(2, 2, 2, 2),
       col = c("black", "black", "black", "black"),
       bty = "n")