data used for this session are imported from: https://osf.io/aj7vr/
# read the data and save it as "cyber.data"
head(cyber.data,10)
Because the data are weekly observations we can have as a first graph a time series plot.
library(forecast)
# ts.plot() will create a time series plot
ts.plot(cyber.data[,2],main="Event count", col="red",xlab="week", ylab="number of attacs",lwd=2)
text(280,120,"Maximum",col="blue")# will add text to our graph
ts.plot(cyber.data[,3],main="Average length", col="blue",xlab="week", ylab="Average time of attacs",lwd=2)
text(110,1700,"Maximum",col="red")
summary(cyber.data[,c(2,3)])# descriptive statistics of our dataset
event.count avg.report.length
Min. : 0.00 Min. : 0.0
1st Qu.: 10.00 1st Qu.: 394.0
Median : 21.00 Median : 509.0
Mean : 25.42 Mean : 533.7
3rd Qu.: 35.00 3rd Qu.: 654.0
Max. :126.00 Max. :1700.0
Analyse the data by observing boxplot for any outliers.
boxplot(cyber.data[,2],col="green",main="BoxPlot attacs",xlab="Count")
boxplot(cyber.data[,3],col="orange",main="BoxPlot av.length",xlab="time")
#
par(mfrow=c(1,2))
hist(cyber.data[,2],col="green",main="Histogram attacs",xlab="number of attacs",ylab="Freq")
hist(cyber.data[,3],col="orange",main="Histogram av.length",xlab="time")
we observe some outliers in both variables, especially in the upper region. This was also observed in the time series plot. For now we are not dealing with this outliers . The aim of this material is just to show some basic functions from the library(fitdistr) and library(fitdistrplus) which are used for fitting probability distributions to a vector of numerical data.
Packages needed for this part are:
library(fitdistrplus)
library(stats4)
library(MASS)
# for other necessary test or graphical tools
library(survival)
library(actuar)
library(distrMod)
We can use the function plotdist(data) to obtain the histogram and the cummulative distribution graph of teh data.
plotdist(cyber.data$event.count, histo = TRUE, demp = TRUE)
plotdist(cyber.data$avg.report.length, histo = TRUE, demp = TRUE)
Exercise: Try to simulate 10^5 observations from the most known probability distributions you know and plot their Empirical density and Cummulative distribution. Are these graphs as you expected?
The function descdist() provides a skewness-kurtosis graph to help to choose the best candidate(s) to fit a given dataset. If we want to use it for discrete distributions we may use argument discrete=TRUE.
descdist(cyber.data[,3])
summary statistics
------
min: 0 max: 1700
median: 509
mean: 533.7213
estimated sd: 211.2284
estimated skewness: 1.014491
estimated kurtosis: 6.198751
descdist(cyber.data[,3], boot = 1000)# bootstrapped values 1000
summary statistics
------
min: 0 max: 1700
median: 509
mean: 533.7213
estimated sd: 211.2284
estimated skewness: 1.014491
estimated kurtosis: 6.198751
descdist(cyber.data[,3], boot = 10000)# bootstrapped values 10^4
summary statistics
------
min: 0 max: 1700
median: 509
mean: 533.7213
estimated sd: 211.2284
estimated skewness: 1.014491
estimated kurtosis: 6.198751
# observe the concentration of the simulations
descdist(cyber.data[,3], boot = 1000,discrete=TRUE)
summary statistics
------
min: 0 max: 1700
median: 509
mean: 533.7213
estimated sd: 211.2284
estimated skewness: 1.014491
estimated kurtosis: 6.198751
Trying to fit probability distributions to the number of cyber attacs.
# Negative Binomial
nbinom.f<- fitdist(cyber.data$event.count, "nbinom")
NaNs producedNaNs producedNaNs produced
pois.f<- fitdist(cyber.data$event.count, "pois")# Poisson
NaNs producedNaNs produced
norm.f <- fitdist(cyber.data$event.count, "norm") # Normal
NaNs producedNaNs produced
exp.f <- fitdist(cyber.data$event.count, "exp") # Exponential
NaNs producedNaNs produced
Information about the parameter estimation
summary(nbinom.f)
Fitting of the distribution ' nbinom ' by maximum likelihood
Parameters :
Loglikelihood: -1536.555 AIC: 3077.11 BIC: 3084.916
Correlation matrix:
size mu
size 1.000000e+00 7.175825e-06
mu 7.175825e-06 1.000000e+00
summary(pois.f)
Fitting of the distribution ' pois ' by maximum likelihood
Parameters :
Loglikelihood: -3522.192 AIC: 7046.384 BIC: 7050.287
summary(norm.f)
Fitting of the distribution ' norm ' by maximum likelihood
Parameters :
Loglikelihood: -1612.522 AIC: 3229.044 BIC: 3236.849
Correlation matrix:
mean sd
mean 1 0
sd 0 1
summary(exp.f)
Fitting of the distribution ' exp ' by maximum likelihood
Parameters :
Loglikelihood: -1550.139 AIC: 3102.277 BIC: 3106.18
we can use these graphical test to compare the fitted distribiutions with the real observations. We expect the distributions to follow the patterns of teh real observations in the histogram, QQ-plot, CDF plot and also PP-plot.
par(mfrow = c(2, 2))
plot.legend <- c("Binomial neg", "Poisson","Normal","Exponential")
denscomp(list(nbinom.f,pois.f,norm.f,exp.f), legendtext = plot.legend)
qqcomp(list(nbinom.f,pois.f,norm.f,exp.f), legendtext = plot.legend)
cdfcomp(list(nbinom.f,pois.f,norm.f,exp.f), legendtext = plot.legend)
ppcomp(list(nbinom.f,pois.f,norm.f,exp.f), legendtext = plot.legend)
Scatterplot view. we expect a high linear relationship.We use now the parameter estimated above to simulate random observations from these distributions. And we are using the scatterplot to understand the fitting. We are looking for scatterplots which show a linear pattern.
set.seed(512)
Ex=rexp(length(cyber.data$event.count),0.03934638)# estimated rate= 0.03934638
plot(cyber.data$event.count,Ex)# scatterplot observed vs fitted
No.r=rnorm(length(cyber.data$event.count),25.41530,19.82332) # estimated parameters
plot(cyber.data$event.count,No.r)# scatterplot observed vs fitted
Po=rpois(length(cyber.data$event.count),25.4153)# estimated parameters
plot(cyber.data$event.count,Po)# scatterplot observed vs fitted
neg.bin=rnbinom(length(cyber.data$event.count),size=1.658441, mu=25.415193)# estimated parameters
plot(cyber.data$event.count,neg.bin)# scatterplot observed vs fitted
To observe more clear the fitting between the two most accurate fitting we can construct graphics above only for these two distributions.
par(mfrow = c(2, 2))
plot.legend <- c("Normal","Exponencial")
denscomp(list(norm.f,exp.f), legendtext = plot.legend)
qqcomp(list(norm.f,exp.f), legendtext = plot.legend)
cdfcomp(list(norm.f,exp.f), legendtext = plot.legend)
ppcomp(list(norm.f,exp.f), legendtext = plot.legend)
Compare the Negative Binomial fit and the normal fit.
nbinom.f <- fitdist(cyber.data$event.count, "nbinom") #
NaNs producedNaNs producedNaNs produced
norm.f <- fitdist(cyber.data$event.count, "norm") #
NaNs producedNaNs produced
#
par(mfrow = c(2, 2))
plot.legend <- c("Binomial neg", "Normal")
denscomp(list(nbinom.f,norm.f), legendtext = plot.legend)
qqcomp(list(nbinom.f,norm.f), legendtext = plot.legend)
cdfcomp(list(nbinom.f,norm.f), legendtext = plot.legend)
ppcomp(list(nbinom.f,norm.f), legendtext = plot.legend)
First we substitute the 0 values with values very close to 0 (example 0.1) because for some probability distributions there are conditions of values > zero.
avg.report.length=ifelse(cyber.data$avg.report.length==0,0.01,cyber.data$avg.report.length)
expo.f <- fitdist(avg.report.length, "exp",method="mme")#
NaNs producedNaNs produced
lnorm.f<- fitdist(avg.report.length, "lnorm",method="mme")#
NaNs producedNaNs produced
norma.f<- fitdist(avg.report.length, "norm")#
NaNs producedNaNs produced
weib <- fitdist(avg.report.length, "weibull", start = list(shape = 1, scale = 500))#
NaNs producedNaNs producedNaNs producedNaNs producedNaNs producedNaNs producedNaNs produced
logis.f <- fitdist(avg.report.length, "llogis",start=NULL, fix.arg=NULL, discrete=FALSE, keepdata = TRUE)
NaNs producedNaNs producedNaNs producedNaNs producedNaNs producedNaNs producedNaNs producedNaNs producedNaNs producedNaNs producedNaNs producedNaNs producedNaNs producedNaNs producedNaNs producedNaNs producedNaNs producedNaNs producedNaNs producedNaNs producedNaNs producedNaNs producedNaNs producedNaNs producedNaNs producedNaNs producedNaNs producedNaNs producedNaNs producedNaNs producedNaNs producedNaNs produced
Cullen and Frey graph
descdist(avg.report.length)
summary statistics
------
min: 0.01 max: 1700
median: 509
mean: 533.7214
estimated sd: 211.2282
estimated skewness: 1.0145
estimated kurtosis: 6.198748
descdist(avg.report.length, boot = 1000)# bootstrapped values 1000
summary statistics
------
min: 0.01 max: 1700
median: 509
mean: 533.7214
estimated sd: 211.2282
estimated skewness: 1.0145
estimated kurtosis: 6.198748
descdist(avg.report.length, boot = 10000)# bootstrapped values 10^4
summary statistics
------
min: 0.01 max: 1700
median: 509
mean: 533.7214
estimated sd: 211.2282
estimated skewness: 1.0145
estimated kurtosis: 6.198748
# observe the concentration of the simulations
descdist(avg.report.length, boot = 1000,discrete=TRUE)
summary statistics
------
min: 0.01 max: 1700
median: 509
mean: 533.7214
estimated sd: 211.2282
estimated skewness: 1.0145
estimated kurtosis: 6.198748
Parameter estimations:
summary(expo.f)
Fitting of the distribution ' exp ' by matching moments
Parameters :
Loglikelihood: -2664.434 AIC: 5330.868 BIC: 5334.77
summary(lnorm.f)
Fitting of the distribution ' lnorm ' by matching moments
Parameters :
Loglikelihood: -3620.588 AIC: 7245.177 BIC: 7252.982
summary(norma.f)
Fitting of the distribution ' norm ' by maximum likelihood
Parameters :
Loglikelihood: -2478.007 AIC: 4960.013 BIC: 4967.818
Correlation matrix:
mean sd
mean 1 0
sd 0 1
summary(weib)
Fitting of the distribution ' weibull ' by maximum likelihood
Parameters :
Loglikelihood: -2510.2 AIC: 5024.4 BIC: 5032.205
Correlation matrix:
shape scale
shape 1.0000000 0.2850167
scale 0.2850167 1.0000000
summary(logis.f)
Fitting of the distribution ' llogis ' by maximum likelihood
Parameters :
Loglikelihood: -2538.638 AIC: 5081.277 BIC: 5089.082
Correlation matrix:
shape scale
shape 1.00000000 0.02488679
scale 0.02488679 1.00000000
Graphic test check:
par(mfrow = c(2, 2))
plot.legend <- c("Exponential","Normal", "log-Normal","Weibull","Logis")
denscomp(list(expo.f,norma.f,lnorm.f,weib,logis.f), legendtext = plot.legend)
qqcomp(list(expo.f,norma.f,lnorm.f,weib,logis.f), legendtext = plot.legend)
cdfcomp(list(expo.f,norma.f,lnorm.f,weib,logis.f), legendtext = plot.legend)
ppcomp(list(expo.f,norma.f,lnorm.f,weib,logis.f), legendtext = plot.legend)
#
gofstat(list(expo.f,norma.f,lnorm.f,weib,logis.f),fitnames = c("Exponential","Normal", "log-Normal","Weibull","Logis"))
Goodness-of-fit statistics
Exponential Normal log-Normal
Kolmogorov-Smirnov statistic 0.3433618 0.06189011 0.03403799
Cramer-von Mises statistic 13.4476900 0.42673770 0.14139236
Anderson-Darling statistic 66.2372441 2.74835073 11.17728575
Weibull Logis
Kolmogorov-Smirnov statistic 0.09875598 0.08651839
Cramer-von Mises statistic 0.76792533 0.51785182
Anderson-Darling statistic 5.80145260 4.89002332
Goodness-of-fit criteria
Exponential Normal log-Normal
Akaike's Information Criterion 5330.868 4960.013 7245.177
Bayesian Information Criterion 5334.770 4967.818 7252.982
Weibull Logis
Akaike's Information Criterion 5024.400 5081.277
Bayesian Information Criterion 5032.205 5089.082
We are looking for the lowest value of the test statistics and also the lowest value of the information criteria. The fitting which has the lowest value is supposed to be the most accurate among those compared. In this situation the “best” fitting of the data is from the Normal or Log-normal distributions. (observe the values of test statistics and AIC/BIC).
par(mfrow = c(2, 2))
plot.legend <- c("Normal","Log-normal")
denscomp(list(norma.f,logis.f), legendtext = plot.legend)
qqcomp(list(norma.f,logis.f), legendtext = plot.legend)
cdfcomp(list(norma.f,logis.f), legendtext = plot.legend)
ppcomp(list(norma.f,logis.f), legendtext = plot.legend)
Log.nor=rlnorm(length(cyber.data$avg.report.length),6.2073341,0.3809332)
plot(cyber.data$avg.report.length,Log.nor)#
Nor=rnorm(length(cyber.data$avg.report.length),533.7214 ,210.9395)
plot(cyber.data$avg.report.length,Nor)#
#
par(mfrow=c(1,3))
hist(cyber.data$avg.report.length)
hist(Log.nor)
hist(Nor)
Reference:
https://www.r-project.org/conferences/useR-2009/slides/Delignette-Muller+Pouillot+Denis.pdf
rdocumentation.org/packages/fitdistrplus/versions/1.1-3/topics/descdist