# Benford Law and Probability Simulations
# 
# Eralda Gjika (Dhamo)
# Department of Applied MAthematics
# Faculty of Natural Science
# University of Tirana
# Albania
#
# ATTENTION!: Some of the graphical output will appear in another graphic window in your editor when you apply these commands at your editor. For more statistical tests also execute the part of the commands that are added as comments at the end of this work.
# KUJDES!: Disa nga grafiket qe afishohen gjate punes me ketofunksione dhe komanda do te shfaqen ne nje dritare grafike ne editorin tuaj te R-RStudio. Per me teper teste ekzekutoni edhe komandat  e fundit qe jane ne formen e komenteve. 
# 
# These examples are a simulations of Benford law and a cyber attack dataset is used for controling the Benford law in real data.
# The explanations are in english and albanian language.
# Then some probability distributions are fitted to the data and used for estimation of probability an event will occure in the future.
# Keyboard example (Albanian: SHEMBULL TASTIERA)
# If we type randomly some numbers from the keyboard of a computer we will observe that the figures (0 to 9) will not follow the benford law.Our mind will unconsciously follow some patterns.
# (Albanian: Nese shtypim rastesisht disa numra nje shifror (nga 0 ne 9) nga tastiera e nje kompjuteri do te veme re se frekuenca e tyre nuk do te ndjeke ligjin e Benford. Kjo sepse mendja jone arrin ne menyre te pavetedijshme te krijoje modele.
A=c(5,7,8,4,5,6,5,4,8,4,5,1,2,1,5,8,9,6,5,4,8,7,9,5,2,3,6,9,5,4,1,8,5,9,6,5,2,3)
length(A)
## [1] 38
par(mfrow=c(1,4))
hist(A,main="Vellimi N=38")# volume N=38 randomly numbers
A=c(5,7,8,4,5,6,5,4,8,4,5,1,2,1,5,8,9,6,5,4,8,7,9,5,2,3,6,9,5,4,1,8,5,9,6,5,2,3,1,3,6,9,8,5,4,4,5,8,7,6,5,9,5,2,3,6,5,4,5,7,8,5,4,1,2,3,6,9,8,5,4,1,2,4,8,6,5,9,3,2,5,6,9,8,7,4,1,5,2,8,9,6,5,2,3,6,5)
hist(A,main="Vellimi N=97")# volume N=97 randomly numbers
A=c(5,7,8,4,5,6,5,4,8,4,5,1,2,1,5,8,9,6,5,4,8,7,9,5,2,3,6,9,5,4,1,8,5,9,6,5,2,3,1,3,6,9,8,5,4,4,5,8,7,6,5,9,5,2,3,6,5,4,5,7,8,5,4,1,2,3,6,9,8,5,4,1,2,4,8,6,5,9,3,2,5,6,9,8,7,4,1,5,2,8,9,6,5,2,3,6,5,5,4,6,5,8,9,5,6,2,3,6,5,4,8,7,1,5,2,3,6,9,8,5,4,1,2,3,6,9,8,5,4,7,1,5,8,5,4,2,3,6,9,8,5,5,6,5,4,1,2,3,6,9,8,7,4,1,5,6,2,3)
hist(A,main="Vellimi N=158")#volume N=158 randomly numbers
A=c(5,7,8,4,5,6,5,4,8,4,5,1,2,1,5,8,9,6,5,4,8,7,9,5,2,3,6,9,5,4,1,8,5,9,6,5,2,3,1,3,6,9,8,5,4,4,5,8,7,6,5,9,5,2,3,6,5,4,5,7,8,5,4,1,2,3,6,9,8,5,4,1,2,4,8,6,5,9,3,2,5,6,9,8,7,4,1,5,2,8,9,6,5,2,3,6,5,5,4,6,5,8,9,5,6,2,3,6,5,4,8,7,1,5,2,3,6,9,8,5,4,1,2,3,6,9,8,5,4,7,1,5,8,5,4,1,3,6,9,8,5,5,6,5,4,1,2,3,6,9,8,7,4,1,5,6,2,3,8,4,5,6,2,3,6,5,9,8,5,8,4,5,1,2,3,6,9,5,6,9,5,6,3,2,5,6,8,7,4,1,5,8,6,5,2,7,8,9,5,6,4,5,8,6,9,5,4,1,5,2,6,9)
hist(A,main="Vellimi N=212")# volume N=212 randomly numbers

#
# Now wi will simulate some numbers which follow Benford Law.
# Albanian: Ligji Benford me te dhena te simuluara
# Provojme te simulojme n-vlera me probabilitete teorike qe ndjekin ligjin Benford
# Shohim si ndryshon paraqitja grafike me rritjen e vellimit te zgjedhjes
# Do te perdorim funksionin sample(vektori i vlerave, vellimi, replace, vektori i probabiliteteve)
# Vektorii probabiliteteve te shfaqes se shifrave eshte 
p.benford=c(0.301,0.176,0.125,0.097,0.079,0.067,0.058,0.051,0.046)# probabilities of Benford law
par(mfrow=c(1,5))# this will help to compare the histograms of simulations
# Sample 1 (ZGJEDHJA 1 ,N=50)
S1=sample(c(1,2,3,4,5,6,7,8,9),50,replace=TRUE,prob=c(0.301,0.176,0.125,0.097,0.079,0.067,0.058,0.051,0.046))
Tab.1=table(S1)/length(S1)
# Histogram of simulated data (Albanian:Histogrami i te dhenave te simuluara)
hist(S1,xlab="Shifrat nga 1 deri 9",ylab="Frekuencat", main="Zgjedhja N=50")
# Sample 2 (ZGJEDHJA 2 , N=100)
S2=sample(c(1,2,3,4,5,6,7,8,9),100,replace=TRUE,prob=c(0.301,0.176,0.125,0.097,0.079,0.067,0.058,0.051,0.046))
Tab.2=table(S2)/length(S2)
hist(S2,xlab="Shifrat nga 1 deri 9",ylab="Frekuencat", main="Zgjedhja N=100")
# Sample 3 (ZGJEDHJA 3,  N=500)
S3=sample(c(1,2,3,4,5,6,7,8,9),500,replace=TRUE,prob=c(0.301,0.176,0.125,0.097,0.079,0.067,0.058,0.051,0.046))
Tab.3=table(S3)/length(S3)
# Histogram of simulated data (Albanian:Histogrami i te dhenave te simuluara)
hist(S3,xlab="Shifrat nga 1 deri 9",ylab="Frekuencat", main="Zgjedhja N=500")
# Sample 4, (ZGJEDHJA 4,N=1000)
S4=sample(c(1,2,3,4,5,6,7,8,9),1000,replace=TRUE,prob=c(0.301,0.176,0.125,0.097,0.079,0.067,0.058,0.051,0.046))
Tab.4=table(S4)/length(S4)
# Histogram of simulated data (Albanian: Histogrami i te dhenave te simuluara)
hist(S4,xlab="Shifrat nga 1 deri 9",ylab="Frekuencat", main="Zgjedhja N=1000")
# ZGJEDHJA 5      N=10000
S5=sample(c(1,2,3,4,5,6,7,8,9),10000,replace=TRUE,prob=c(0.301,0.176,0.125,0.097,0.079,0.067,0.058,0.051,0.046))
# Histogram of simulated data (Albanian: Histogrami i te dhenave te simuluara)
hist(S5,xlab="Shifrat nga 1 deri 9",ylab="Frekuencat", main="Zgjedhja N=10^4")
# 
#
###############################################################################
#                   BENFORD LAW - Application to real data (Albanian: Aplikim ne te dhena reale) 
#################################################################################
library(BeyondBenford)

# We will use as an example of following Benford law the data from the dataset "address_PierreBuffiere"
# Albanian: Nje shembull i te dhenave qe i binden ligjit Benford jane te dhenat e numrit te shtepive (346 shtepi)ne nje lagje ne France
data(address_PierreBuffiere)
View(address_PierreBuffiere)# observe teh dataset
#
# Lets investigate the dataset (Albanian: Investigojme databazen)
obs.numb.dig(address_PierreBuffiere, dig=1)# numeron sa "kode shtepie" shfaqin ne shifren e pare
## Warning in prep(dat): NAs introduced by coercion
## [1] 109  64  41  34  26  18  14  14  11
# shifrat nga 1 deri ne 9
obs.numb.dig(address_PierreBuffiere, dig=2)# numeron sa "kode shtepie" shfaqin ne shifren e dyte
## Warning in prep(dat): NAs introduced by coercion
##  [1] 31 24 27 21 25 17 21 16 19 16
# shifrat nga 0 deri ne 9
#
# Probabilities a figure from 1 to 9 will appear in the i=1,2 position of the data.
# Albanian:Sa eshte probabiliteti qe nje shifer e caktuar nga 1 deri ne 9 te shfaqet ne pozicionin e pare?
Benf.val(1, dig=1)# Sa eshte probabiliteti qe shifra 1 te shfaqet ne pozicionin e dyte
## [1] 0.30103
Benf.val(2, dig=1) # Sa eshte probabiliteti qe shifra 2 te shfaqet ne pozicionin e dyte
## [1] 0.1760913
Benf.val(3, dig=1)
## [1] 0.1249387
Benf.val(4, dig=1)
## [1] 0.09691001
# Albanian:vazhdohet me shifrat nga 1 deri ne 9 pasi nje numer NUK MUND TE FILLOJE me 0
i=seq(1,9)
sum(Benf.val(i,dig=1))
## Warning in if (fig == 0) return("First digit is all but 0") else return(log(1
## + : the condition has length > 1 and only the first element will be used
## [1] 1
#Albanian:Sa eshte probabiliteti qe nje shifer e caktuar nga 0 deri ne 9 te shfaqet ne pozicionin e dyte?
Benf.val(0, dig=2)# Sa eshte probabiliteti qe shifra 0 te shfaqet ne pozicionin e dyte
## [1] 0.1196793
Benf.val(1, dig=2)# Sa eshte probabiliteti qe shifra 1 te shfaqet ne pozicionin e dyte
## [1] 0.1138901
Benf.val(2, dig=2)# Sa eshte probabiliteti qe shifra 2 te shfaqet ne pozicionin e dyte
## [1] 0.1088215
Benf.val(3, dig=2)
## [1] 0.1043296
Benf.val(4, dig=2)
## [1] 0.1003082
# vazhdohet deri ne shifren 9
# Per tu siguruar qe funksioni performon sakte kontrollojme shumen  e probabilietteve
i=seq(0,9)
sum(Benf.val(i,dig=2)) # the sum is equal to 1 (Albanian:shuma del 1)
## [1] 1
#
dat.distr(address_PierreBuffiere, dig=1, nclass=65,legend=FALSE,xlab="Vrojtimet",ylab="frekuencat",main="Shperndarja e te dhenave per shifren e pare, 65 intervale")
## Warning in prep(dat): NAs introduced by coercion
legend(25,13,"Shperndarja probabilitare teorike (ideale)",fill="red",box.col = "white")
# we can change the number of classes (Albanian:mund te ndryshojme numrin e klasave, intervaleve) 
dat.distr(address_PierreBuffiere, dig=1, nclass=10,legend=FALSE,xlab="Vrojtimet",ylab="frekuencat",main="Shperndarja e te dhenave per shifren e pare, 10 intervale")
## Warning in prep(dat): NAs introduced by coercion
legend(20,80,"Shperndarja probabilitare teorike (ideale)",fill="red",box.col = "white")
# 65 classes (Albanian:jane konsideruar shifra e dyte e kodit te shtepive klasifikuara ne 65 klasa)
dat.distr(address_PierreBuffiere, dig=2, nclass=65,legend=FALSE,xlab="Vrojtimet",ylab="frekuencat",main="Shperndarja e te dhenave per shifren e dyte")
## Warning in prep(dat): NAs introduced by coercion
legend(25,13,"Shperndarja probabilitare teorike (ideale)",fill="red",box.col = "white")
# 
# Graphical comparisions of Real~ benford ~ Blondeau (Albanian:PARAQITJE GRAFIKE KRAHASUESE REALE VS BENFORD VS BLONDEAU DA SILVA (Reference:2020)
digit.distr(address_PierreBuffiere, dig=1, mod="ben&blo", No.sd=1, Sd.pr=1, main="Shperndarja e shifrave ne pozicionin e pare")
## Warning in prep(dat): NAs introduced by coercion
digit.distr(address_PierreBuffiere, dig=2, mod="ben&blo", No.sd=1, Sd.pr=1, main="Shperndarja e shifrave ne pozicionin e dyte")
## Warning in prep(dat): NAs introduced by coercion
#
#
# Chi -Square test for distrubution of the data(Albanian: Testi Hi-katror per shperndarjen e te dhenave, Hipoteza H0: Shperndarja e studiuar eshte  njejte me shperndarjen teorike Benford.
# 
dat.distr(address_PierreBuffiere, dig=1, nchi=4,legend=FALSE,xlab="Vrojtimet",ylab="frekuencat",main="Shperndarja e te dhenave per shifren e pare")
## Warning in prep(dat): NAs introduced by coercion
## [1] "Class freq.: " "211"           "76"            "31"           
## [5] "13"           
## [1] "Theor. freq.:"    "198.892398965381" "80.2127711819963" "39.3372028630348"
## [5] "12.5576269895883"
##               chi2               pval
## 1   Chi2 value is:    The p-value is:
## 2 10.9635717384506 0.0119244966659539
legend(25,13,"Shperndarja probabilitare teorike (ideale)",fill="red",box.col = "white")
dat.distr(address_PierreBuffiere, dig=2, nchi=4,legend=FALSE,xlab="Vrojtimet",ylab="frekuencat",main="Shperndarja e te dhenave per shifren e dyte")
## Warning in prep(dat): NAs introduced by coercion
## [1] "Class freq.: " "130"           "51"            "25"           
## [5] "11"           
## [1] "Theor. freq.:"    "127.050977504473" "54.6446927640259" "26.7009825219254"
## [5] "8.60334720957567"
##               chi2              pval
## 1   Chi2 value is:   The p-value is:
## 2 4.35018430321684 0.226049147217251
legend(25,13,"Shperndarja probabilitare teorike (ideale)",fill="red",box.col = "white")
#
#
# We obtain the Pearson Chi Square statistics and p-value (Albanian: Testi afishon vleren e statistikes Hi-Katror (Pearson’s chi-squared) dhe p-vleren  
# For p-value > 0.05 we accept H0 (Albanian: per p-value > 0.05 pranohet H0: Shperndarja e te dhenave eshte e njejte me ate teorike
chi2(address_PierreBuffiere, dig=1, pval=1)# sipas Benford
## Warning in prep(dat): NAs introduced by coercion
##               chi2             pval
## 1   Chi2 value is:  The p-value is:
## 2 5.38837806254704 0.79922400649165
chi2(address_PierreBuffiere, dig=1, pval=1, mod="BDS") # sipas Blondeau 
## Warning in prep(dat): NAs introduced by coercion
##               chi2              pval
## 1   Chi2 value is:   The p-value is:
## 2 5.63689806225787 0.687830206180007
chi2(address_PierreBuffiere, dig=2, pval=1)
## Warning in prep(dat): NAs introduced by coercion
##              chi2              pval
## 1  Chi2 value is:   The p-value is:
## 2 3.8479322103018 0.953946357013396
chi2(address_PierreBuffiere, dig=2, pval=1, mod="BDS") # sipas Blondeau 
## Warning in prep(dat): NAs introduced by coercion
##               chi2              pval
## 1   Chi2 value is:   The p-value is:
## 2 2.47996278328848 0.981417506807756
# Albanian:Nga rezultatet e afishura vihet re se ne te dy rastet (Benford dhe Blondeau)hipoteza H0 NUK mund te refuzohet, pra shperndarja e studiuar eshte  e qendrueshme me shperndarjen teorike. 
#
#
################################################################################
#       Example of keyboard numbers (Albanian: A I BINDEN ATO LIGJIT BENFORD?) 
################################################################################
hist(A,main="Numra 1 shifror simuluar nga tastiera,N=212", xlab="Shifra 1 deri 9", ylab="Frekuenca")
obs.numb.dig(A, dig=1)# numeron sa "kode shtepie" shfaqin ne shifren e pare
## [1] 16 19 16 24 49 31 10 26 21
obs.numb.dig(A, dig=2)# numeron sa "kode shtepie" shfaqin ne shifren e pare
##  [1] 0 0 0 0 0 0 0 0 0 0
dat.distr(A, dig=1,legend=FALSE,xlab="Vrojtimet",ylab="frekuencat",main="Shperndarja e te dhenave per shifren e pare")
digit.distr(A, dig=1, mod="ben&blo", No.sd=1, Sd.pr=1, main="Shperndarja e shifrave ne pozicionin e dyte")
chi2(A, dig=1, pval=1)# sipas Benford
##               chi2            pval
## 1   Chi2 value is: The p-value is:
## 2 166.057515555596               0
chi2(A, dig=1, pval=1, mod="BDS") # sipas Blondeau 
## [1] "Chi2 can not be applied: at least one insufficient theoretical frequency"
#
#
#####################################################################################
#                              DATASET CYBER DATA
########################################################################################
# Reference for dataset: https://osf.io/mv98y/ 
# Import the dataset (Albanian:Se pari importojme te dhenat e ruajtura ne formatin csv)
cyber_event_binned_data <- read.csv("~/ALDA-Universiteti/Benford law -Cyber academy 2020/cyber_event_binned_data.csv")
# 
head(cyber_event_binned_data) # mund te shohim si jane te dhenat duke pasqyruar disa rreshta nga databaza
##   week event.count avg.report.length
## 1    1           8               435
## 2    2          11               443
## 3    3           2              1076
## 4    4           2               598
## 5    5           0                 0
## 6    6           3               728
# Nese deshirojme te shohim te gjithe databazen shfrytezojme funksionin View()
par(mfrow=c(1,2))
View(cyber_event_binned_data)
cyber.data=ts(cyber_event_binned_data) # save as a time sereis object (Albanian: ruaj ne formatin e serise kohore per analize fillestare grafike)
# Plot the time series of Number of attacks and average length of attacks (Albanian: bej paraqitje grafike te dy vektoreve te te dhenave)
ts.plot(cyber.data[,2],main="Seria kohore e numrit te sulmeve", col="red",xlab="Java", ylab="Frekuenca",lwd=2)
text(280,120,"Maksimum",col="blue")
ts.plot(cyber.data[,3],main="Seria kohore e kohes mesatare te sulmeve", col="blue",xlab="Java", ylab="Frekuenca",lwd=2)
text(110,1700,"Maksimum",col="red")
# BoxPlot 
summary(cyber.data[,c(2,3)])# informacion paraprak i te dhenave
##   event.count     avg.report.length
##  Min.   :  0.00   Min.   :   0.0   
##  1st Qu.: 10.00   1st Qu.: 394.0   
##  Median : 21.00   Median : 509.0   
##  Mean   : 25.42   Mean   : 533.7   
##  3rd Qu.: 35.00   3rd Qu.: 654.0   
##  Max.   :126.00   Max.   :1700.0
boxplot(cyber.data[,2],col="green",main="BoxPlot per numrin e sulmeve",xlab="Numri i sulmeve")
boxplot(cyber.data[,3],col="orange",main="BoxPlot per kohen mesatare",xlab="Kohezgjatja mesatare e sulmeve")
# Histogram
hist(cyber.data[,2],col="green",main="Histogrami per numrin e sulmeve",xlab="Numri i sulmeve",ylab="Frekuenca")
hist(cyber.data[,3],col="orange",main="Histogrami per kohen mesatare",xlab="Kohezgjatja mesatare e sulmeve")
#
############################################################################################
# Benford law for cyber attack dataset (Albanian: Ligji Benford per te dhenat e sulmeve)
############################################################################################
#
obs.numb.dig(cyber.data[,2], dig=1)# numeron sa jave shfaqin ne shifren e pare
## [1] 83 83 67 39 28 21 18 18  6
# shifrat nga 1 deri ne 9
obs.numb.dig(cyber.data[,2], dig=2)# numeron sa jave shfaqin ne shifren e pare
##  [1] 30 33 35 32 27 26 35 19 23 18
# shifrat nga 0 deri ne 9
dat.distr(cyber.data[,2], dig=1, nclass=65,legend=FALSE,xlab="Numri i sulmeve",ylab="Frekuencat",main="Shperndarja e te dhenave per shifren e pare, 65 intervale")
legend(60,10,"Shperndarja probabilitare teorike (ideale)",fill="red",box.col = "white")
# mudn te ndryshojme numrin e klasave (intervaleve)
dat.distr(cyber.data[,2], dig=2, nclass=65,legend=FALSE,xlab="Numri i sulmeve",ylab="Frekuencat",main="Shperndarja e te dhenave per shifren e dyte, 65 intervale")
legend(60,10,"Shperndarja probabilitare teorike (ideale)",fill="red",box.col = "white")
# 
digit.distr(cyber.data[,2], dig=1, mod="ben&blo", No.sd=1, Sd.pr=1, main="Shperndarja e shifrave ne pozicionin e pare")
digit.distr(cyber.data[,2], dig=2, mod="ben&blo", No.sd=1, Sd.pr=1, main="Shperndarja e shifrave ne pozicionin e dyte")
# Chi-Square test
chi2(cyber.data[,2], dig=1, pval=1)# sipas Benford p-value< 0.05 --> te dhenat nuk ndjekin shperndarjen e propozuar sipas Benford
##               chi2                 pval
## 1   Chi2 value is:      The p-value is:
## 2 30.4641814433273 0.000365642770626784
chi2(cyber.data[,2], dig=1, pval=1, mod="BDS") # sipas Blondeau p-value< 0.05 --> te dhenat nuk ndjekin shperndarjen e propozuar sipas Benford
##               chi2                 pval
## 1   Chi2 value is:      The p-value is:
## 2 26.3935467466887 0.000899181952833783
#
#
########################################################################################
#                                     FITING DISTRIBUTIONS IN R
#######################################################################################
# packages we will need for fitting and simulations (Albanian:Paketa qe na nevojiten per paraqitje grafike dhe simulime probabilitare)
library(fitdistrplus)
## Loading required package: MASS
## Loading required package: survival
library(survival)
library(actuar)
## 
## Attaching package: 'actuar'
## The following object is masked from 'package:grDevices':
## 
##     cm
library(distrMod)
## Loading required package: distr
## Loading required package: startupmsg
## Utilities for Start-Up Messages (version 0.9.6)
## For more information see ?"startupmsg", NEWS("startupmsg")
## Loading required package: sfsmisc
## Object Oriented Implementation of Distributions (version 2.8.0)
## Attention: Arithmetics on distribution objects are understood as operations on corresponding random variables (r.v.s); see distrARITH().
## Some functions from package 'stats' are intentionally masked ---see distrMASK().
## Note that global options are controlled by distroptions() ---c.f. ?"distroptions".
## For more information see ?"distr", NEWS("distr"), as well as
##   http://distr.r-forge.r-project.org/
## Package "distrDoc" provides a vignette to this package as well as to several extension packages; try vignette("distr").
## 
## Attaching package: 'distr'
## The following objects are masked from 'package:stats':
## 
##     df, qqplot, sd
## Loading required package: distrEx
## Extensions of Package 'distr' (version 2.8.0)
## Note: Packages "e1071", "moments", "fBasics" should be attached /before/ package "distrEx". See distrExMASK().Note: Extreme value distribution functionality has been moved to
##       package "RobExtremes". See distrExMOVED().
## For more information see ?"distrEx", NEWS("distrEx"), as well as
##   http://distr.r-forge.r-project.org/
## Package "distrDoc" provides a vignette to this package as well as to several related packages; try vignette("distr").
## 
## Attaching package: 'distrEx'
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, median, var
## Loading required package: RandVar
## Implementation of Random Variables (version 1.2.1)
## For more information see ?"RandVar", NEWS("RandVar"), as well as
##   http://robast.r-forge.r-project.org/
## This package also includes a vignette; try vignette("RandVar").
## Loading required package: stats4
## Object Oriented Implementation of Probability Models (version 2.8.4)
## Some functions from pkg's 'base' and 'stats' are intentionally masked ---see distrModMASK().
## Note that global options are controlled by distrModoptions() ---c.f. ?"distrModoptions".
## For more information see ?"distrMod", NEWS("distrMod"), as well as
##   http://distr.r-forge.r-project.org/
## There is a vignette to this package; try vignette("distrMod").
## 
## Package "distrDoc" provides a vignette to the other distrXXX packages,
## as well as to several related packages; try vignette("distr").
## 
## Attaching package: 'distrMod'
## The following object is masked from 'package:stats4':
## 
##     confint
## The following object is masked from 'package:stats':
## 
##     confint
## The following object is masked from 'package:base':
## 
##     norm
library(stats4)
library(MASS)
# 
plotdist(cyber_event_binned_data$event.count, histo = TRUE, demp = TRUE)# Grafiku i densitetit empirik dhe densitetit te grumbulluar per numrin e sulmeve javore
plotdist(cyber_event_binned_data$avg.report.length, histo = TRUE, demp = TRUE)# Grafiku i densitetit empirik dhe densitetit te grumbulluar per kohezgjatjen mesatare te sulmeve javore
#
# Albanian: Nese dyshojme per shperndarjen e te dhenave shfrytezojme teste statistikore per tu orientuar
# Grafiku Cullen and Frey
descdist(cyber_event_binned_data[,3], boot = 1000)
## summary statistics
## ------
## min:  0   max:  1700 
## median:  509 
## mean:  533.7213 
## estimated sd:  211.2284 
## estimated skewness:  1.014491 
## estimated kurtosis:  6.198751
#
# Fiting probability distributions
nbinom.f <- fitdist(cyber_event_binned_data$event.count, "nbinom") # Negative Binomial 
pois.f<- fitdist(cyber_event_binned_data$event.count, "pois")# Poisson
norm.f <- fitdist(cyber_event_binned_data$event.count, "norm") # Normal
exp.f <- fitdist(cyber_event_binned_data$event.count, "exp") # Exponential
################################################################################
par(mfrow = c(2, 2))
plot.legend <- c("Binomial neg", "Puason","Normal","Eksponencial")
denscomp(list(nbinom.f,pois.f,norm.f,exp.f), legendtext = plot.legend)
qqcomp(list(nbinom.f,pois.f,norm.f,exp.f), legendtext = plot.legend)
cdfcomp(list(nbinom.f,pois.f,norm.f,exp.f), legendtext = plot.legend)
ppcomp(list(nbinom.f,pois.f,norm.f,exp.f), legendtext = plot.legend)
################################################################
# look in a new window the output
nbinom.f <- fitdist(cyber_event_binned_data$event.count, "nbinom") # 
pois.f<- fitdist(cyber_event_binned_data$event.count, "pois")# 
norm.f <- fitdist(cyber_event_binned_data$event.count, "norm") # 
exp.f <- fitdist(cyber_event_binned_data$event.count, "exp") # 
####################################################################################
#      comparing two distributions (Albanian: krahasojme dy shperndarje probabilitare)
#####################################################################################
par(mfrow = c(2, 2))
plot.legend <- c("Normal","Eksponencial")
denscomp(list(norm.f,exp.f), legendtext = plot.legend)
qqcomp(list(norm.f,exp.f), legendtext = plot.legend)
cdfcomp(list(norm.f,exp.f), legendtext = plot.legend)
ppcomp(list(norm.f,exp.f), legendtext = plot.legend)
################################################################
#
nbinom.f <- fitdist(cyber_event_binned_data$event.count, "nbinom") # 
norm.f <- fitdist(cyber_event_binned_data$event.count, "norm") # 
par(mfrow = c(2, 2))
plot.legend <- c("Binomial neg", "Normal")
denscomp(list(nbinom.f,norm.f), legendtext = plot.legend)
qqcomp(list(nbinom.f,norm.f), legendtext = plot.legend)
cdfcomp(list(nbinom.f,norm.f), legendtext = plot.legend)
ppcomp(list(nbinom.f,norm.f), legendtext = plot.legend)
summary(nbinom.f)
## Fitting of the distribution ' nbinom ' by maximum likelihood 
## Parameters : 
##       estimate Std. Error
## size  1.658441  0.1253352
## mu   25.415193  1.0647008
## Loglikelihood:  -1536.555   AIC:  3077.11   BIC:  3084.916 
## Correlation matrix:
##              size           mu
## size 1.000000e+00 7.175825e-06
## mu   7.175825e-06 1.000000e+00
summary(pois.f)
## Fitting of the distribution ' pois ' by maximum likelihood 
## Parameters : 
##        estimate Std. Error
## lambda  25.4153  0.2635161
## Loglikelihood:  -3522.192   AIC:  7046.384   BIC:  7050.287
summary(norm.f)
## Fitting of the distribution ' norm ' by maximum likelihood 
## Parameters : 
##      estimate Std. Error
## mean 25.41530  1.0361818
## sd   19.82332  0.7326911
## Loglikelihood:  -1612.522   AIC:  3229.044   BIC:  3236.849 
## Correlation matrix:
##      mean sd
## mean    1  0
## sd      0  1
summary(exp.f)
## Fitting of the distribution ' exp ' by maximum likelihood 
## Parameters : 
##        estimate  Std. Error
## rate 0.03934638 0.002055339
## Loglikelihood:  -1550.139   AIC:  3102.277   BIC:  3106.18
gofstat(list(nbinom.f,pois.f,norm.f,exp.f),fitnames = c("Binomial neg", "Puason","Normal","Eksponenciale"))
## Chi-squared statistic:  10.91395 1084850548 78.765 29.15183 
## Degree of freedom of the Chi-squared distribution:  13 14 13 14 
## Chi-squared p-value:  0.6180259 0 1.882466e-11 0.009966984 
##    the p-value may be wrong with some theoretical counts < 5  
## Chi-squared table:
##       obscounts theo Binomial neg  theo Puason theo Normal theo Eksponenciale
## <= 3         28          23.96809 1.035337e-05   47.243037           40.74997
## <= 5         20          17.87524 3.548527e-04    8.219591           24.61366
## <= 7         23          19.44427 5.816306e-03    9.118785           22.75100
## <= 10        25          30.18279 1.593786e-01   15.350150           30.93940
## <= 13        21          29.78295 1.743810e+00   17.263348           27.49465
## <= 16        28          28.30933 9.725868e+00   18.976224           24.43343
## <= 19        21          26.26338 3.124948e+01   20.387640           21.71304
## <= 22        27          23.95376 6.292882e+01   21.409006           19.29554
## <= 25        20          21.57330 8.448197e+01   21.973459           17.14720
## <= 28        20          19.24137 7.926980e+01   22.043104           15.23805
## <= 32        24          22.24129 6.567673e+01   28.639317           17.71155
## <= 35        21          14.33254 2.065898e+01   20.317585           11.56949
## <= 41        24          23.41260 9.518978e+00   36.046297           19.41800
## <= 47        20          17.60077 5.646825e-01   28.464492           15.33476
## <= 60        20          24.03725 1.531231e-02   35.716483           23.05950
## > 60         24          23.78106 5.714981e-07   14.831482           34.53077
## 
## Goodness-of-fit criteria
##                                Binomial neg   Puason   Normal Eksponenciale
## Akaike's Information Criterion     3077.110 7046.384 3229.044      3102.277
## Bayesian Information Criterion     3084.916 7050.287 3236.849      3106.180
###############################  
#
#
##########################################################################################
# Average length of attacks (Albanian: Databaza e kohezgjatjes se sulmeve)
######################################################################################
# Substitute the values 0 with positive values (0.05) to fit some probability distributions which are fitted only in positive values 
# Albanian:zevendesojme vlerat 0 me vlera shume afer 0 psh 0.1, sepse disa shperndarje kerkojne vlera >0, meqe ne databaze kemi vetem 3 vlera dhe duke konsideruar qe ato jane 0.1
# which(avg.report.length==0.5)
# avg.report.length=cyber_event_binned_data$avg.report.length[c(5,62,75)==0.05]# ndertojme nje vektor te ri duke zevendesuar vlerat 0 me vlera te peraferta pozitive, kjona lejon te pershtasim shperndarje probabilitare qe nuk pranojne vlera 0 
#
# expo.f <- fitdist(avg.report.length, "exp",method="mme")# 
# lnorm.f<- fitdist(avg.report.length, "lnorm",method="mme")# 
# norma.f<- fitdist(avg.report.length, "norm")#  
# weib <- fitdist(avg.report.length, "weibull", start = list(shape = 1, scale = 500))# 
# logis.f <- fitdist(avg.report.length, "llogis",start=NULL, fix.arg=NULL, discrete=FALSE, keepdata = TRUE)
#######################################################################################################################
#
#par(mfrow = c(2, 2))
#plot.legend <- c("Eksponencial","Normal", "log-Normal","Weibull","Logis")
#denscomp(list(expo.f,norma.f,lnorm.f,weib,logis.f), legendtext = plot.legend)
#qqcomp(list(expo.f,norma.f,lnorm.f,weib,logis.f), legendtext = plot.legend)
#cdfcomp(list(expo.f,norma.f,lnorm.f,weib,logis.f), legendtext = plot.legend)
#ppcomp(list(expo.f,norma.f,lnorm.f,weib,logis.f), legendtext = plot.legend)
#
#gofstat(list(expo.f,norma.f,lnorm.f,weib,logis.f),fitnames = c("Eksponencial","Normal", "log-Normal","Weibull","Logis"))
#
# Comparing log-normal and logis distributions (Albanian:Veme re se shperndarja Log-normal dhe logis jane me afer vrojtimeve
#par(mfrow = c(2, 2))
#plot.legend <- c("log-Normal","Logis")
#denscomp(list(lnorm.f,logis.f), legendtext = plot.legend)
#qqcomp(list(lnorm.f,logis.f), legendtext = plot.legend)
#cdfcomp(list(lnorm.f,logis.f), legendtext = plot.legend)
#ppcomp(list(lnorm.f,logis.f), legendtext = plot.legend)
#
#
# Thank you for reading and sharing!
# Follow my Linkedin profile for more on R-RStudio work
#