The context

From our session, we learned that any \((a,b,0)\) distribution satisfies the following recursion: \[ p_k = \bigg(a+\frac{b}{k} \bigg) p_{k-1} \] which can also be written as: \[ k \frac{p_k}{p_{k-1}} = ak+b \quad \text{for} \quad k=1,2,\dots \] The right-hand side is a linear function of the integer \(k\), with slope \(a\) and intercept \(b\). We therefore need to pay attention to the slope of the ratio \(k\,p_k/p_{k-1}\) for each of the claim distributions we are analyzing.

Data

# Load the table of probabilities (columns k, Dist1, Dist2, Dist3).
# The folder is kept in a variable and joined with file.path() instead of
# calling setwd(), so reading the file does not change the session's working
# directory as a side effect.
data_dir <- "C:/Users/23043/Dropbox/UDLAP/Cursos/2022 Primavera/Tema Selecto/R"
data <- read.csv(file.path(data_dir, "ab0.csv"))
head(data)
##   k        Dist1        Dist2       Dist3
## 1 0 0.0000142725 0.0000453999 0.000470185
## 2 1 0.0001784060 0.0004539990 0.002821110
## 3 2 0.0010927370 0.0022699960 0.009027552
## 4 3 0.0043709460 0.0075666550 0.020462451
## 5 4 0.0128396540 0.0189166370 0.036832411
## 6 5 0.0295312040 0.0378332750 0.055985265

We have three different distributions, and we will try to find the most suitable distribution for each one.

Plot

Let's plot the three distributions we have:

library(ggplot2)
# Overlay the three probability columns against k, one colour per column:
# Dist1 in blue, Dist2 in red, Dist3 in purple.
ggplot(data, aes(x = k)) +
  geom_line(aes(y = Dist1), color = "blue") +
  geom_line(aes(y = Dist2), color = "red") +
  geom_line(aes(y = Dist3), color = "purple")

Now we have to make some estimates. Based on our formulas, we should compute \[ k \times \frac{p_k}{p_{k-1}} \]

# Compute k * p_k / p_{k-1} for one probability column.
#
# k: vector of counts, one entry per row of the table.
# p: vector of probabilities aligned with k.
# Returns a vector of the same length as p. The first entry is 0 because the
# first row has no predecessor to divide by.
#
# Working on whole vectors removes the hard-coded 2:30 row range, so the
# table may have any number of rows.
ratio_to_previous <- function(k, p) {
  n <- length(p)
  if (n < 2L) {
    return(rep(0, n))
  }
  c(0, k[-1] * (p[-1] / p[-n]))
}

# Append the ratio columns D1, D2, D3 for Dist1, Dist2, Dist3.
data$D1 <- ratio_to_previous(data$k, data$Dist1)
data$D2 <- ratio_to_previous(data$k, data$Dist2)
data$D3 <- ratio_to_previous(data$k, data$Dist3)
head(data)
##   k        Dist1        Dist2       Dist3       D1 D2  D3
## 1 0 0.0000142725 0.0000453999 0.000470185  0.00000  0 0.0
## 2 1 0.0001784060 0.0004539990 0.002821110 12.49998 10 6.0
## 3 2 0.0010927370 0.0022699960 0.009027552 12.25000 10 6.4
## 4 3 0.0043709460 0.0075666550 0.020462451 11.99999 10 6.8
## 5 4 0.0128396540 0.0189166370 0.036832411 11.75000 10 7.2
## 6 5 0.0295312040 0.0378332750 0.055985265 11.50000 10 7.6
# Keep only the rows with k > 0; the k = 0 row holds the placeholder 0
# rather than a real ratio.
data1 <- subset(data, k > 0)

# Plot the three ratio columns against k: D1 blue, D2 red, D3 purple.
ggplot(data1, aes(x = k)) +
  geom_line(aes(y = D1), color = "blue") +
  geom_line(aes(y = D2), color = "red") +
  geom_line(aes(y = D3), color = "purple")

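We can now estimate the parameters. To do so we need access to specific values of the table, so I will create a matrix representation of the data. The red line is flat (\(a = 0\)), which corresponds to a Poisson distribution; in that case \(p_1/p_0 = b = \lambda\):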

# Matrix copy of the table so that single cells can be read as [row, column].
datam <- as.matrix(data)

# Dist2 at k = 0 (row 1) and k = 1 (row 2).
p0 <- datam[1, "Dist2"]
p1 <- datam[2, "Dist2"]

# With a = 0 the recursion gives p_1 = b * p_0, so the ratio is the
# single parameter of the distribution.
lambda <- p1 / p0
lambda
## Dist2 
##    10

The blue line, with \(a < 0\) and \(b \neq 0\) is a binomial distribution:

# Dist1 at k = 1, 2, 3 (rows 2 to 4 of the matrix).
p1 <- datam[2, "Dist1"]
p2 <- datam[3, "Dist1"]
p3 <- datam[4, "Dist1"]

# From the recursion, p2/p1 = a + b/2 and p3/p2 = a + b/3.
# Their difference is b/6, which gives b; a follows from the first equation.
ratio_21 <- p2 / p1
ratio_32 <- p3 / p2
b <- 6 * (ratio_21 - ratio_32)
b
##    Dist1 
## 12.75002
a <- ratio_21 - b / 2
a
##      Dist1 
## -0.2500083

Now, we can define the distribution by remembering the original formulas:

# Binomial(m, q): a = -q / (1 - q), so q = a / (a - 1).
q <- a / (a - 1)
q
##     Dist1 
## 0.2000053
# Binomial(m, q): b = (m + 1) * q / (1 - q), so m = b * (1 - q) / q - 1.
m <- b * (1 - q) / q - 1
m
##    Dist1 
## 49.99839

The third line, the purple one, with \(a>0\) and \(b \neq0\) is a Negative Binomial:

# Dist3 at k = 1, 2, 3 (rows 2 to 4 of the matrix).
# These must be re-read here: otherwise p1, p2 and p3 still hold the Dist1
# values and the estimates below repeat the Dist1 fit (a < 0, q > 1, r < 0)
# instead of describing the purple line.
p1 <- datam[2, "Dist3"]
p2 <- datam[3, "Dist3"]
p3 <- datam[4, "Dist3"]

# From the recursion, p2/p1 = a + b/2 and p3/p2 = a + b/3.
# Their difference is b/6, which gives b; a follows from the first equation.
b <- 6 * (p2 / p1 - p3 / p2)
b
## expected: approximately 5.6 (re-run to refresh this output)
a <- p2 / p1 - b / 2
a
## expected: approximately 0.4 (re-run to refresh this output)
# Negative binomial with success probability q: a = 1 - q.
q <- 1 - a
q
## expected: approximately 0.6 (re-run to refresh this output)
# Negative binomial: b = (r - 1) * (1 - q), so r = b / (1 - q) + 1.
r <- (b / (1 - q)) + 1
r
## expected: approximately 15 (re-run to refresh this output)

Zero-modified distribution

# Binomial(size = 4, prob = 0.3) probabilities for k = 0, ..., 4.
binom_pmf <- dbinom(0:4, size = 4, prob = 0.3)
n0 <- binom_pmf[1]
n1 <- binom_pmf[2]
n2 <- binom_pmf[3]
n3 <- binom_pmf[4]
n4 <- binom_pmf[5]

# Zero-modified version: the probability at zero is set to 0.4 and the
# remaining probabilities are rescaled by cm so the total is still 1.
cm <- (1 - 0.4) / (1 - n0)
cm
## [1] 0.7895776
n0m <- 0.4
modified_pmf <- cm * binom_pmf[-1]
n1m <- modified_pmf[1]
n2m <- modified_pmf[2]
n3m <- modified_pmf[3]
n4m <- modified_pmf[4]

n1m
## [1] 0.3249901
n2m
## [1] 0.2089222
n3m
## [1] 0.05969206
n4m
## [1] 0.006395578
# Zero-truncated version: the probability at zero is 0 and the remaining
# probabilities are rescaled by ct = 1 / (1 - n0).
ct <- 1 / (1 - n0)
ct
## [1] 1.315963
n0t <- 0
truncated_pmf <- ct * c(n1, n2, n3, n4)
n1t <- truncated_pmf[1]
n2t <- truncated_pmf[2]
n3t <- truncated_pmf[3]
n4t <- truncated_pmf[4]

n1t
## [1] 0.5416502
n2t
## [1] 0.3482037
n3t
## [1] 0.09948677
n4t
## [1] 0.0106593