BHao_Assign8

6.1

Based on the PP-plots and chi-squared (chisq), Kolmogorov-Smirnov (ks), aic and bic statistics, it appears that the lognormal distribution best fits these data.

df1 = read.xlsx('Problem_Dataset_06_01.xls', 'Sheet1', header = FALSE)

# first plot histogram and CDF 
par(mfrow=c(1,2))
hist(df1$X1)
plot(ecdf(df1$X1))

# based on the histogram, we see that all values are positive with large outliers 
# therefore, we should try to fit exponential, lognormal, gamma and weibull distributions 

# try exponential distribution 
expon1 = fitdist(df1$X1, 'exp', discrete = FALSE)
plot(expon1)

# try log normal distribution 
lnorm1 = fitdist(df1$X1, 'lnorm', discrete = FALSE)
plot(lnorm1)

# try gamma distribution 
gamma1 = fitdist(df1$X1, 'gamma', discrete = FALSE)
plot(gamma1)

# try weibull distribution
weib1 = fitdist(df1$X1, 'weibull', discrete = FALSE)
plot(weib1)

# compare multiple fits 
cbind('exp'=gofstat(expon1), 'lnorm'=gofstat(lnorm1), 'gamma'=gofstat(gamma1), 'weibul'=gofstat(weib1))

##             exp            lnorm          gamma          weibul        
## chisq       4.60715        0.8254608      1.423542       1.965919      
## chisqbreaks Numeric,7      Numeric,7      Numeric,7      Numeric,7     
## chisqpvalue 0.595091       0.975381       0.9217208      0.8538404     
## chisqdf     6              5              5              5             
## chisqtable  Numeric,16     Numeric,16     Numeric,16     Numeric,16    
## cvm         0.2021652      0.01825121     0.05174037     0.05847634    
## cvmtest     "not rejected" "not computed" "not rejected" "not rejected"
## ad          1.423471       0.1414258      0.3523081      0.4453717     
## adtest      "rejected"     "not computed" "not rejected" "not rejected"
## ks          0.1640203      0.05044292     0.07461236     0.07934941    
## kstest      "not rejected" "not rejected" "not rejected" "not rejected"
## aic         294.1736       284.9146       288.2076       290.3611      
## bic         295.9113       288.39         291.683        293.8364      
## discrete    FALSE          FALSE          FALSE          FALSE         
## nbfit       1              1              1              1

# print summary of best fitting distribution
summary(lnorm1)

## Fitting of the distribution ' lnorm ' by maximum likelihood 
## Parameters : 
##          estimate Std. Error
## meanlog 2.1850979  0.1189979
## sdlog   0.7711947  0.0841436
## Loglikelihood:  -140.4573   AIC:  284.9146   BIC:  288.39 
## Correlation matrix:
##         meanlog sdlog
## meanlog       1     0
## sdlog         0     1

6.2

Again, based on the goodness-of-fit statistics and plots, the lognormal distribution seems to fit the data best.

df2 = read.xlsx('Problem_Dataset_06_02.xls', 'Sheet1', header = FALSE)

# first plot histogram and CDF 
par(mfrow=c(1,2))
hist(df2$X1)
plot(ecdf(df2$X1))

# based on the histogram, we see that all values are positive with large outliers 
# therefore, we should try to fit exponential, lognormal, gamma and weibull distributions 

# try exponential distribution 
expon1 = fitdist(df2$X1, 'exp', discrete = FALSE)
plot(expon1)

# try log normal distribution 
lnorm1 = fitdist(df2$X1, 'lnorm', discrete = FALSE)
plot(lnorm1)

# try gamma distribution 
gamma1 = fitdist(df2$X1, 'gamma', discrete = FALSE)
plot(gamma1)

# try weibull distribution
weib1 = fitdist(df2$X1, 'weibull', discrete = FALSE)
plot(weib1)

# compare multiple fits 
cbind('exp'=gofstat(expon1), 'lnorm'=gofstat(lnorm1), 'gamma'=gofstat(gamma1), 'weibul'=gofstat(weib1))

##             exp          lnorm          gamma          weibul        
## chisq       27.67302     0.7354401      2.57195        6.563886      
## chisqbreaks Numeric,7    Numeric,7      Numeric,7      Numeric,7     
## chisqpvalue 0.0001082634 0.9809672      0.7656225      0.2551463     
## chisqdf     6            5              5              5             
## chisqtable  Numeric,16   Numeric,16     Numeric,16     Numeric,16    
## cvm         1.147379     0.03973639     0.1144419      0.2008969     
## cvmtest     "rejected"   "not computed" "not rejected" "rejected"    
## ad          5.971173     0.2673944      0.7166774      1.323028      
## adtest      "rejected"   "not computed" "not rejected" "rejected"    
## ks          0.3144366    0.06764782     0.09941791     0.1140618     
## kstest      "rejected"   "not rejected" "not rejected" "not rejected"
## aic         320.4776     282.6829       288.3562       296.2684      
## bic         322.3278     286.3832       292.0565       299.9687      
## discrete    FALSE        FALSE          FALSE          FALSE         
## nbfit       1            1              1              1

# print summary of best fitting distribution
summary(lnorm1)

## Fitting of the distribution ' lnorm ' by maximum likelihood 
## Parameters : 
##          estimate Std. Error
## meanlog 2.2579185 0.07155999
## sdlog   0.4905905 0.05059961
## Loglikelihood:  -139.3414   AIC:  282.6829   BIC:  286.3832 
## Correlation matrix:
##         meanlog sdlog
## meanlog       1     0
## sdlog         0     1

6.3

In this case, the Poisson distribution seemed to fit the data best.

df3 = read.xlsx('Problem_Dataset_06_03.xls', 'Sheet1', header = FALSE)

# first plot histogram and CDF 
par(mfrow=c(1,2))
hist(df2$X1)
plot(ecdf(df2$X1))

# based on the histogram, we see that all values are positive with large outliers 
# also, these data are discrete, we should try discrete distributions  
# therefore, we should try to fit poisson, negative binomial and normal distributions

# try poisson distribution 
pois1 = fitdist(df3$X1, 'pois', discrete = TRUE)
plot(pois1)

# try negative binomial distribution 
nbinom1 = fitdist(df3$X1, 'nbinom', discrete = TRUE)
plot(nbinom1)

# try normal distribution
norm1 = fitdist(df3$X1, 'norm', discrete = TRUE)
plot(norm1)

# compare multiple fits 
cbind('pois'=gofstat(pois1), 'nbinom'=gofstat(nbinom1), 'norm'=gofstat(norm1))

##             pois       nbinom     norm      
## chisq       1.920083   1.919848   8.670335  
## chisqbreaks Numeric,4  Numeric,4  Numeric,4 
## chisqpvalue 0.5891583  0.3829221  0.01309968
## chisqdf     3          2          2         
## chisqtable  Numeric,10 Numeric,10 Numeric,10
## aic         164.2799   166.2799   169.3638  
## bic         166.0866   169.8932   172.9771  
## discrete    TRUE       TRUE       TRUE      
## nbfit       1          1          1

# print summary of best fitting distribution
summary(pois1)

## Fitting of the distribution ' pois ' by maximum likelihood 
## Parameters : 
##        estimate Std. Error
## lambda 2.955556  0.2562791
## Loglikelihood:  -81.13996   AIC:  164.2799   BIC:  166.0866

6.4

\[ F(x) = \begin{cases} 0 & x \lt a \\ \frac{x - a}{b - a} & a \le x \le b \\ 1 & x \gt b \end{cases} \]

\[ X = a + R (b - a) \]

6.5

\[ F(x; k, \lambda) = 1 - e^{-(x/\lambda)^k} \]

\[ X = -a[ln(1-R)]^{1/\lambda}I_{[0,1]}(R) \]

6.8

set.seed(123)
library(data.table)
library(dplyr)

## -------------------------------------------------------------------------

## data.table + dplyr code now lives in dtplyr.
## Please library(dtplyr)!

## -------------------------------------------------------------------------

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:data.table':
## 
##     between, first, last

## The following object is masked from 'package:MASS':
## 
##     select

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(tidyr)
library(ggplot2)

# create a discrete sampling function 
lbs_sample = function(item) {
  R = runif(1)
  if (item == 'oats') {
    if (R <= 0.05) {lbs = 0.0}
    else if (R <= 0.12) {lbs = 0.5}
    else if (R <= 0.21) {lbs = 1.0}
    else if (R <= 0.32) {lbs = 1.5}
    else if (R <= 0.47) {lbs = 2.0}
    else if (R <= 0.72) {lbs = 3.0}
    else if (R <= 0.82) {lbs = 4.0}
    else if (R <= 0.91) {lbs = 5.0}
    else if (R <= 0.97) {lbs = 7.5}
    else if (R <= 1.00) {lbs = 10.0}
  }
  if (item == 'peas') {
    if (R <= 0.1) {lbs = 0.0}
    else if (R <= 0.3) {lbs = 0.5}
    else if (R <= 0.5) {lbs = 1.0}
    else if (R <= 0.8) {lbs = 1.5}
    else if (R <= 0.9) {lbs = 2.0}
    else if (R <= 1.0) {lbs = 3.0}
  }
  if (item == 'beans') {
    if (R <= 0.2) {lbs = 0.0}
    else if (R <= 0.6) {lbs = 1.0}
    else if (R <= 0.9) {lbs = 3.0}
    else if (R <= 1.0) {lbs = 4.5}
  }
  if (item == 'barley') {
    if (R <= 0.2) {lbs = 0.0}
    else if (R <= 0.6) {lbs = 0.5}
    else if (R <= 0.9) {lbs = 1.0}
    else if (R <= 1.0) {lbs = 3.5}
  }
  return(lbs)
}

# create simulation function for one season
run_sim = function(days = 90) {
  # create data table to house inputs
  tbl_items = data.table(item = c('oats', 'peas', 'beans', 'barley'),
                  cost = c(1.05, 3.17, 1.99, 0.95),
                  price = c(1.29, 3.76, 2.23, 1.65))
  tbl_items = tbl_items[, profit := price - cost]

  # create data frame to house results
  tbl_sim = data.frame(day = rep(0, days), total_revenue = rep(0, days), 
                       total_cost = rep(0, days), total_profit = rep(0, days))

  for (i in 1:days) {
    qty = NULL
    revenue = 0
    cost = 0
    profit = 0
    for (r in 1:nrow(tbl_items)) {
      qty = lbs_sample(tbl_items[r, item])
      revenue = revenue + tbl_items[r, price] * qty
      cost = cost + tbl_items[r, cost] * qty
      profit = profit + tbl_items[r, profit] * qty
    }
    tbl_sim[i,] = c(i, revenue, cost, profit)
  }
  return(data.table(tbl_sim))
}

# create simulation function for x seasons
run_sim_season = function(seasons = 1000, days = 90) {
  tbl_daily = data.frame(day = rep(0, seasons*days), total_revenue = rep(0, seasons*days), 
                         total_cost = rep(0, seasons*days), total_profit = rep(0, seasons*days))
  tbl_season = data.frame(season = rep(0, seasons), total_revenue = rep(0, seasons), 
                          total_cost = rep(0, seasons), total_profit = rep(0, seasons))
  for (i in 1:seasons) {
    ## REPLACE LOGIC BELOW WITH A SIMPLE ARRAY AND THEN RETURN A DATA.TABLE 
    sim = run_sim(days = days)
    tbl_daily[(i*days-days+1):(i*days),] = sim
    tbl_season[i,] = c(season = i, sim[, lapply(.SD, sum), .SDcols = 2:4])
  }
  return(list(tbl_daily = data.table(tbl_daily), tbl_season = data.table(tbl_season)))
}

# run simulation x times and aggregate results
sim_results = run_sim_season(seasons = 100, days = 90)

summary(sim_results$tbl_daily)

##       day       total_revenue     total_cost      total_profit  
##  Min.   : 1.0   Min.   : 0.00   Min.   : 0.000   Min.   :0.000  
##  1st Qu.:23.0   1st Qu.: 9.63   1st Qu.: 7.805   1st Qu.:1.715  
##  Median :45.5   Median :13.27   Median :10.845   Median :2.250  
##  Mean   :45.5   Mean   :13.66   Mean   :11.225   Mean   :2.433  
##  3rd Qu.:68.0   3rd Qu.:17.19   3rd Qu.:14.290   3rd Qu.:2.986  
##  Max.   :90.0   Max.   :36.65   Max.   :29.915   Max.   :7.340

summary(sim_results$tbl_season)

##      season       total_revenue    total_cost      total_profit  
##  Min.   :  1.00   Min.   :1092   Min.   : 901.8   Min.   :190.4  
##  1st Qu.: 25.75   1st Qu.:1196   1st Qu.: 983.2   1st Qu.:212.3  
##  Median : 50.50   Median :1231   Median :1010.8   Median :219.2  
##  Mean   : 50.50   Mean   :1229   Mean   :1010.2   Mean   :218.9  
##  3rd Qu.: 75.25   3rd Qu.:1259   3rd Qu.:1038.7   3rd Qu.:225.2  
##  Max.   :100.00   Max.   :1345   Max.   :1110.7   Max.   :237.8

sim_results$tbl_daily %>% select(-day) %>%
  gather('metric', 'value') %>%
  mutate('metric' = factor(metric, levels = c('total_revenue', 'total_cost', 'total_profit'))) %>%
  ggplot(aes(x = value)) +
  geom_histogram() +
  facet_grid(. ~ metric, scales = 'free')

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

sim_results$tbl_season %>% select(-season) %>%
  gather('metric', 'value') %>%
  mutate('metric' = factor(metric, levels = c('total_revenue', 'total_cost', 'total_profit'))) %>%
  ggplot(aes(x = value)) +
  geom_histogram() +
  facet_grid(. ~ metric, scales = 'free')

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.