Based on the PP-plots and chi-squared (chisq), Kolmogorov-Smirnov (ks), aic and bic statistics, it appears that the lognormal distribution best fits these data.
df1 = read.xlsx('Problem_Dataset_06_01.xls', 'Sheet1', header = FALSE)
# first plot histogram and CDF
par(mfrow=c(1,2))
hist(df1$X1)
plot(ecdf(df1$X1))
# based on the histogram, we see that all values are positive with large outliers
# therefore, we should try to fit exponential, lognormal, gamma and weibull distributions
# try exponential distribution
expon1 = fitdist(df1$X1, 'exp', discrete = FALSE)
plot(expon1)
# try log normal distribution
lnorm1 = fitdist(df1$X1, 'lnorm', discrete = FALSE)
plot(lnorm1)
# try gamma distribution
gamma1 = fitdist(df1$X1, 'gamma', discrete = FALSE)
plot(gamma1)
# try weibull distribution
weib1 = fitdist(df1$X1, 'weibull', discrete = FALSE)
plot(weib1)
# compare multiple fits
cbind('exp'=gofstat(expon1), 'lnorm'=gofstat(lnorm1), 'gamma'=gofstat(gamma1), 'weibul'=gofstat(weib1))
## exp lnorm gamma weibul
## chisq 4.60715 0.8254608 1.423542 1.965919
## chisqbreaks Numeric,7 Numeric,7 Numeric,7 Numeric,7
## chisqpvalue 0.595091 0.975381 0.9217208 0.8538404
## chisqdf 6 5 5 5
## chisqtable Numeric,16 Numeric,16 Numeric,16 Numeric,16
## cvm 0.2021652 0.01825121 0.05174037 0.05847634
## cvmtest "not rejected" "not computed" "not rejected" "not rejected"
## ad 1.423471 0.1414258 0.3523081 0.4453717
## adtest "rejected" "not computed" "not rejected" "not rejected"
## ks 0.1640203 0.05044292 0.07461236 0.07934941
## kstest "not rejected" "not rejected" "not rejected" "not rejected"
## aic 294.1736 284.9146 288.2076 290.3611
## bic 295.9113 288.39 291.683 293.8364
## discrete FALSE FALSE FALSE FALSE
## nbfit 1 1 1 1
# print summary of best fitting distribution
summary(lnorm1)
## Fitting of the distribution ' lnorm ' by maximum likelihood
## Parameters :
## estimate Std. Error
## meanlog 2.1850979 0.1189979
## sdlog 0.7711947 0.0841436
## Loglikelihood: -140.4573 AIC: 284.9146 BIC: 288.39
## Correlation matrix:
## meanlog sdlog
## meanlog 1 0
## sdlog 0 1
Again, based on the goodness-of-fit statistics and plots, the lognormal distribution seems to fit the data best.
df2 = read.xlsx('Problem_Dataset_06_02.xls', 'Sheet1', header = FALSE)
# first plot histogram and CDF
par(mfrow=c(1,2))
hist(df2$X1)
plot(ecdf(df2$X1))
# based on the histogram, we see that all values are positive with large outliers
# therefore, we should try to fit exponential, lognormal, gamma and weibull distributions
# try exponential distribution
expon1 = fitdist(df2$X1, 'exp', discrete = FALSE)
plot(expon1)
# try log normal distribution
lnorm1 = fitdist(df2$X1, 'lnorm', discrete = FALSE)
plot(lnorm1)
# try gamma distribution
gamma1 = fitdist(df2$X1, 'gamma', discrete = FALSE)
plot(gamma1)
# try weibull distribution
weib1 = fitdist(df2$X1, 'weibull', discrete = FALSE)
plot(weib1)
# compare multiple fits
cbind('exp'=gofstat(expon1), 'lnorm'=gofstat(lnorm1), 'gamma'=gofstat(gamma1), 'weibul'=gofstat(weib1))
## exp lnorm gamma weibul
## chisq 27.67302 0.7354401 2.57195 6.563886
## chisqbreaks Numeric,7 Numeric,7 Numeric,7 Numeric,7
## chisqpvalue 0.0001082634 0.9809672 0.7656225 0.2551463
## chisqdf 6 5 5 5
## chisqtable Numeric,16 Numeric,16 Numeric,16 Numeric,16
## cvm 1.147379 0.03973639 0.1144419 0.2008969
## cvmtest "rejected" "not computed" "not rejected" "rejected"
## ad 5.971173 0.2673944 0.7166774 1.323028
## adtest "rejected" "not computed" "not rejected" "rejected"
## ks 0.3144366 0.06764782 0.09941791 0.1140618
## kstest "rejected" "not rejected" "not rejected" "not rejected"
## aic 320.4776 282.6829 288.3562 296.2684
## bic 322.3278 286.3832 292.0565 299.9687
## discrete FALSE FALSE FALSE FALSE
## nbfit 1 1 1 1
# print summary of best fitting distribution
summary(lnorm1)
## Fitting of the distribution ' lnorm ' by maximum likelihood
## Parameters :
## estimate Std. Error
## meanlog 2.2579185 0.07155999
## sdlog 0.4905905 0.05059961
## Loglikelihood: -139.3414 AIC: 282.6829 BIC: 286.3832
## Correlation matrix:
## meanlog sdlog
## meanlog 1 0
## sdlog 0 1
In this case, the Poisson distribution seemed to fit the data best.
df3 = read.xlsx('Problem_Dataset_06_03.xls', 'Sheet1', header = FALSE)
# first plot histogram and CDF
par(mfrow=c(1,2))
hist(df2$X1)
plot(ecdf(df2$X1))
# based on the histogram, we see that all values are positive with large outliers
# also, these data are discrete, we should try discrete distributions
# therefore, we should try to fit poisson, negative binomial and normal distributions
# try poisson distribution
pois1 = fitdist(df3$X1, 'pois', discrete = TRUE)
plot(pois1)
# try negative binomial distribution
nbinom1 = fitdist(df3$X1, 'nbinom', discrete = TRUE)
plot(nbinom1)
# try normal distribution
norm1 = fitdist(df3$X1, 'norm', discrete = TRUE)
plot(norm1)
# compare multiple fits
cbind('pois'=gofstat(pois1), 'nbinom'=gofstat(nbinom1), 'norm'=gofstat(norm1))
## pois nbinom norm
## chisq 1.920083 1.919848 8.670335
## chisqbreaks Numeric,4 Numeric,4 Numeric,4
## chisqpvalue 0.5891583 0.3829221 0.01309968
## chisqdf 3 2 2
## chisqtable Numeric,10 Numeric,10 Numeric,10
## aic 164.2799 166.2799 169.3638
## bic 166.0866 169.8932 172.9771
## discrete TRUE TRUE TRUE
## nbfit 1 1 1
# print summary of best fitting distribution
summary(pois1)
## Fitting of the distribution ' pois ' by maximum likelihood
## Parameters :
## estimate Std. Error
## lambda 2.955556 0.2562791
## Loglikelihood: -81.13996 AIC: 164.2799 BIC: 166.0866
\[ F(x) = \begin{cases} 0 & x \lt a \\ \frac{x - a}{b - a} & a \le x \le b \\ 1 & x \gt b \end{cases} \]
\[ X = a + R (b - a) \]
\[ F(x; k, \lambda) = 1 - e^{-(x/\lambda)^k} \]
\[ X = -a[ln(1-R)]^{1/\lambda}I_{[0,1]}(R) \]
set.seed(123)
library(data.table)
library(dplyr)
## -------------------------------------------------------------------------
## data.table + dplyr code now lives in dtplyr.
## Please library(dtplyr)!
## -------------------------------------------------------------------------
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
##
## between, first, last
## The following object is masked from 'package:MASS':
##
## select
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(ggplot2)
# create a discrete sampling function
lbs_sample = function(item) {
R = runif(1)
if (item == 'oats') {
if (R <= 0.05) {lbs = 0.0}
else if (R <= 0.12) {lbs = 0.5}
else if (R <= 0.21) {lbs = 1.0}
else if (R <= 0.32) {lbs = 1.5}
else if (R <= 0.47) {lbs = 2.0}
else if (R <= 0.72) {lbs = 3.0}
else if (R <= 0.82) {lbs = 4.0}
else if (R <= 0.91) {lbs = 5.0}
else if (R <= 0.97) {lbs = 7.5}
else if (R <= 1.00) {lbs = 10.0}
}
if (item == 'peas') {
if (R <= 0.1) {lbs = 0.0}
else if (R <= 0.3) {lbs = 0.5}
else if (R <= 0.5) {lbs = 1.0}
else if (R <= 0.8) {lbs = 1.5}
else if (R <= 0.9) {lbs = 2.0}
else if (R <= 1.0) {lbs = 3.0}
}
if (item == 'beans') {
if (R <= 0.2) {lbs = 0.0}
else if (R <= 0.6) {lbs = 1.0}
else if (R <= 0.9) {lbs = 3.0}
else if (R <= 1.0) {lbs = 4.5}
}
if (item == 'barley') {
if (R <= 0.2) {lbs = 0.0}
else if (R <= 0.6) {lbs = 0.5}
else if (R <= 0.9) {lbs = 1.0}
else if (R <= 1.0) {lbs = 3.5}
}
return(lbs)
}
# create simulation function for one season
run_sim = function(days = 90) {
# create data table to house inputs
tbl_items = data.table(item = c('oats', 'peas', 'beans', 'barley'),
cost = c(1.05, 3.17, 1.99, 0.95),
price = c(1.29, 3.76, 2.23, 1.65))
tbl_items = tbl_items[, profit := price - cost]
# create data frame to house results
tbl_sim = data.frame(day = rep(0, days), total_revenue = rep(0, days),
total_cost = rep(0, days), total_profit = rep(0, days))
for (i in 1:days) {
qty = NULL
revenue = 0
cost = 0
profit = 0
for (r in 1:nrow(tbl_items)) {
qty = lbs_sample(tbl_items[r, item])
revenue = revenue + tbl_items[r, price] * qty
cost = cost + tbl_items[r, cost] * qty
profit = profit + tbl_items[r, profit] * qty
}
tbl_sim[i,] = c(i, revenue, cost, profit)
}
return(data.table(tbl_sim))
}
# create simulation function for x seasons
run_sim_season = function(seasons = 1000, days = 90) {
tbl_daily = data.frame(day = rep(0, seasons*days), total_revenue = rep(0, seasons*days),
total_cost = rep(0, seasons*days), total_profit = rep(0, seasons*days))
tbl_season = data.frame(season = rep(0, seasons), total_revenue = rep(0, seasons),
total_cost = rep(0, seasons), total_profit = rep(0, seasons))
for (i in 1:seasons) {
## REPLACE LOGIC BELOW WITH A SIMPLE ARRAY AND THEN RETURN A DATA.TABLE
sim = run_sim(days = days)
tbl_daily[(i*days-days+1):(i*days),] = sim
tbl_season[i,] = c(season = i, sim[, lapply(.SD, sum), .SDcols = 2:4])
}
return(list(tbl_daily = data.table(tbl_daily), tbl_season = data.table(tbl_season)))
}
# run simulation x times and aggregate results
sim_results = run_sim_season(seasons = 100, days = 90)
summary(sim_results$tbl_daily)
## day total_revenue total_cost total_profit
## Min. : 1.0 Min. : 0.00 Min. : 0.000 Min. :0.000
## 1st Qu.:23.0 1st Qu.: 9.63 1st Qu.: 7.805 1st Qu.:1.715
## Median :45.5 Median :13.27 Median :10.845 Median :2.250
## Mean :45.5 Mean :13.66 Mean :11.225 Mean :2.433
## 3rd Qu.:68.0 3rd Qu.:17.19 3rd Qu.:14.290 3rd Qu.:2.986
## Max. :90.0 Max. :36.65 Max. :29.915 Max. :7.340
summary(sim_results$tbl_season)
## season total_revenue total_cost total_profit
## Min. : 1.00 Min. :1092 Min. : 901.8 Min. :190.4
## 1st Qu.: 25.75 1st Qu.:1196 1st Qu.: 983.2 1st Qu.:212.3
## Median : 50.50 Median :1231 Median :1010.8 Median :219.2
## Mean : 50.50 Mean :1229 Mean :1010.2 Mean :218.9
## 3rd Qu.: 75.25 3rd Qu.:1259 3rd Qu.:1038.7 3rd Qu.:225.2
## Max. :100.00 Max. :1345 Max. :1110.7 Max. :237.8
sim_results$tbl_daily %>% select(-day) %>%
gather('metric', 'value') %>%
mutate('metric' = factor(metric, levels = c('total_revenue', 'total_cost', 'total_profit'))) %>%
ggplot(aes(x = value)) +
geom_histogram() +
facet_grid(. ~ metric, scales = 'free')
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
sim_results$tbl_season %>% select(-season) %>%
gather('metric', 'value') %>%
mutate('metric' = factor(metric, levels = c('total_revenue', 'total_cost', 'total_profit'))) %>%
ggplot(aes(x = value)) +
geom_histogram() +
facet_grid(. ~ metric, scales = 'free')
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.