data("groundbeef", package = "fitdistrplus")
my_data <- groundbeef$serving
plot(my_data, pch=20)

hist(my_data)

Plotting the empirical distribution

Plots the empirical distribution. With histo = TRUE and demp = TRUE you get a histogram with an empirical density overlaid, together with the empirical CDF.

plotdist(my_data, histo = TRUE, demp = TRUE)

Fitting a distribution

You can use basically any distribution that provides the standard d/p/q/r functions in R.

For continuous data: normal, lognormal, exponential, gamma, uniform, beta, or logistic. For discrete data: binomial, geometric, hypergeometric, Poisson, or negative binomial (nbinom).
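
A quick sketch of a discrete fit on simulated data (the counts and lambda below are made up purely for illustration; fitdistrplus must be loaded):

library(fitdistrplus)
set.seed(123)
counts <- rpois(n = 200, lambda = 4)                 # hypothetical count data
fit_pois <- fitdist(counts, "pois", discrete = TRUE) # MLE of lambda
summary(fit_pois)                                    # estimate plus AIC/BIC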

?fitdist
fit_w  <- fitdist(my_data, "weibull")
fit_g  <- fitdist(my_data, "gamma")
fit_ln <- fitdist(my_data, "lnorm")
summary(fit_ln)
## Fitting of the distribution ' lnorm ' by maximum likelihood 
## Parameters : 
##          estimate Std. Error
## meanlog 4.1693701 0.03366988
## sdlog   0.5366095 0.02380783
## Loglikelihood:  -1261.319   AIC:  2526.639   BIC:  2533.713 
## Correlation matrix:
##         meanlog sdlog
## meanlog       1     0
## sdlog         0     1
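
Not shown in the original output, but worth knowing: the fitted parameters live in the returned object and can be reused with the ordinary d/p/q/r functions (the median call below is just an illustration).

fit_ln$estimate                                   # named vector: meanlog, sdlog
qlnorm(0.5, meanlog = fit_ln$estimate["meanlog"],
       sdlog = fit_ln$estimate["sdlog"])          # median of the fitted lognormal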

See which distribution fits best using the goodness-of-fit statistics (Kolmogorov-Smirnov, Cramér-von Mises, Anderson-Darling). Note that it uses the parameters estimated above. Snazzy!

gofstat(list(fit_w, fit_g, fit_ln), fitnames = c("Weilbull", "Gamma", "Lnorm"))
## Goodness-of-fit statistics
##                                Weibull     Gamma     Lnorm
## Kolmogorov-Smirnov statistic 0.1396646 0.1281246 0.1493090
## Cramer-von Mises statistic   0.6840994 0.6934112 0.8277358
## Anderson-Darling statistic   3.5736460 3.5660192 4.5436542
## 
## Goodness-of-fit criteria
##                                 Weibull    Gamma    Lnorm
## Akaike's Information Criterion 2514.449 2511.250 2526.639
## Bayesian Information Criterion 2521.524 2518.325 2533.713

Note that the Kolmogorov-Smirnov statistic is included here. For more information, see ks.test.
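
As a sketch, a ks.test call reusing the lognormal parameters estimated above might look like this (note the test then treats the parameters as known, which makes it somewhat optimistic since they were estimated from the same data):

ks.test(my_data, "plnorm",
        meanlog = fit_ln$estimate["meanlog"],
        sdlog = fit_ln$estimate["sdlog"])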

Plotting the results

par(mfrow=c(2,2))
plot.legend <- c("Weibull", "gamma", "lognormal")
denscomp(list(fit_w, fit_g, fit_ln), legendtext = plot.legend)
cdfcomp (list(fit_w, fit_g, fit_ln), legendtext = plot.legend)
qqcomp  (list(fit_w, fit_g, fit_ln), legendtext = plot.legend)
ppcomp  (list(fit_w, fit_g, fit_ln), legendtext = plot.legend)

Cullen and Frey graph for higher-order moments

descdist(my_data, discrete=FALSE, boot=500)

## summary statistics
## ------
## min:  10   max:  200 
## median:  79 
## mean:  73.64567 
## estimated sd:  35.88487 
## estimated skewness:  0.7352745 
## estimated kurtosis:  3.551384

Fitting Endosulfan

The fitting also works with non-base distributions. It only needs the corresponding d, p, and q functions to be implemented.
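
For example, here is a sketch with a hand-rolled Gumbel distribution, applied to the groundbeef servings loaded earlier (the density/CDF/quantile functions and starting values are illustrative, not part of the original post):

dgumbel <- function(x, a, b) 1/b * exp((a - x)/b) * exp(-exp((a - x)/b))  # density
pgumbel <- function(q, a, b) exp(-exp((a - q)/b))                         # CDF
qgumbel <- function(p, a, b) a - b * log(-log(p))                         # quantile
fit_gum <- fitdist(groundbeef$serving, "gumbel", start = list(a = 10, b = 10))
summary(fit_gum)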

data("endosulfan", package = "fitdistrplus")
my_data <- endosulfan$ATV

fit_ln <- fitdist(my_data, "lnorm")
cdfcomp(fit_ln, xlogscale = TRUE, ylogscale = TRUE)

Add other distributions from the library “actuar”

library(actuar)
## 
## Attaching package: 'actuar'
## The following object is masked from 'package:grDevices':
## 
##     cm
fit_ll <- fitdist(my_data, "llogis", start = list(shape = 1, scale = 500))
fit_P  <- fitdist(my_data, "pareto", start = list(shape = 1, scale = 500))
fit_B  <- fitdist(my_data, "burr",   start = list(shape1 = 0.3, shape2 = 1, rate = 1))
cdfcomp(list(fit_ln, fit_ll, fit_P, fit_B), xlogscale = TRUE, ylogscale = TRUE,
        legendtext = c("lognormal", "loglogistic", "Pareto", "Burr"), lwd=2)

Check for goodness of fit. Remember that lower goodness-of-fit statistics (and lower AIC/BIC) indicate a better fit.

gofstat(list(fit_ln, fit_ll, fit_P, fit_B), fitnames = c("lnorm", "llogis", "Pareto", "Burr"))
## Goodness-of-fit statistics
##                                  lnorm    llogis     Pareto       Burr
## Kolmogorov-Smirnov statistic 0.1672498 0.1195888 0.08488002 0.06154925
## Cramer-von Mises statistic   0.6373593 0.3827449 0.13926498 0.06803071
## Anderson-Darling statistic   3.4721179 2.8315975 0.89206283 0.52393018
## 
## Goodness-of-fit criteria
##                                   lnorm   llogis   Pareto     Burr
## Akaike's Information Criterion 1068.810 1069.246 1048.112 1045.731
## Bayesian Information Criterion 1074.099 1074.535 1053.400 1053.664

You can bootstrap to estimate the uncertainty in the parameters.

ests <- bootdist(fit_B, niter = 1e3)
summary(ests)
## Parametric bootstrap medians and 95% percentile CI 
##           Median       2.5%     97.5%
## shape1 0.2035533 0.09672861 0.3739276
## shape2 1.5786159 1.05231668 2.9645034
## rate   1.4508249 0.66844970 2.6631288
plot(ests)

quantile(ests, probs=.05) 
## (original) estimated quantiles for each specified probability (non-censored data)
##             p=0.05
## estimate 0.2939259
## Median of bootstrap estimates
##             p=0.05
## estimate 0.3004415
## 
## two-sided 95 % CI of each quantile
##           p=0.05
## 2.5 %  0.1717223
## 97.5 % 0.5109727
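
One way to visualize that uncertainty (a sketch; CIcdfplot comes with fitdistrplus and draws the fitted CDF with bootstrap confidence bands, here on the quantiles):

CIcdfplot(ests, CI.output = "quantile", CI.fill = "lightblue", xlogscale = TRUE)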

Normal distribution example

Here’s a good one for normal distributions.

xn <- rnorm(n=100, mean=10, sd=5)
mean(xn); sd(xn)
## [1] 9.528916
## [1] 4.768952
fit_n <- fitdist(xn, "norm")
summary(fit_n)
## Fitting of the distribution ' norm ' by maximum likelihood 
## Parameters : 
##      estimate Std. Error
## mean 9.528916  0.4745047
## sd   4.745047  0.3355255
## Loglikelihood:  -297.604   AIC:  599.208   BIC:  604.4183 
## Correlation matrix:
##               mean            sd
## mean  1.000000e+00 -1.131244e-09
## sd   -1.131244e-09  1.000000e+00
plotdist(xn, "norm", para=list(mean=mean(xn), sd=sd(xn)), demp = TRUE)

ests <- bootdist(fit_n, niter = 1e3)
summary(ests)
## Parametric bootstrap medians and 95% percentile CI 
##        Median     2.5%     97.5%
## mean 9.508337 8.574283 10.526574
## sd   4.680614 4.065518  5.345391
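
As a rough cross-check (a sketch, not part of the original post), the bootstrap interval for the mean should be close to the usual normal-approximation interval built from the standard error reported by summary(fit_n):

fit_n$estimate["mean"] + c(-1.96, 1.96) * fit_n$sd["mean"]  # approx. 95% CI for the mean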