Data 604 - HW#8

Chapter 6 of Kelton

Problem # 1

I found a great function on Stack Overflow to perform the Kolmogorov-Smirnov. I modified it to include a lognormal function Test

fitData <- function(data, fit="gamma", sample=0.5){
  distrib = list()
  numfit <- length(fit)
  results = matrix(0, ncol=5, nrow=numfit)
  
  for(i in 1:numfit){
    if((fit[i] == "gamma") | 
       (fit[i] == "poisson") | 
       (fit[i] == "weibull") | 
       (fit[i] == "exponential") |
       (fit[i] == "logistic") |
       (fit[i] == "normal") | 
       (fit[i] == "lognormal")
    ) 
      distrib[[i]] = fit[i]
    else stop("Provide a valid distribution to fit data" )
  }
  
  # take a sample of dataset
  n = round(length(data)*sample)
  data = sample(data, size=n, replace=F)
  
  for(i in 1:numfit) {
    if(distrib[[i]] == "gamma") {
      gf_shape = "gamma"
      fd_g <- fitdistr(data, "gamma")
      est_shape = fd_g$estimate[[1]]
      est_rate = fd_g$estimate[[2]]
      
      ks = ks.test(data, "pgamma", shape=est_shape, rate=est_rate)
      
      # add to results
      results[i,] = c(gf_shape, est_shape, est_rate, ks$statistic, ks$p.value)
    }
    
    else if(distrib[[i]] == "poisson"){
      gf_shape = "poisson"
      fd_p <- fitdistr(data, "poisson")
      est_lambda = fd_p$estimate[[1]]
      
      ks = ks.test(data, "ppois", lambda=est_lambda)
      # add to results
      results[i,] = c(gf_shape, est_lambda, "NA", ks$statistic, ks$p.value)
    }
    
    else if(distrib[[i]] == "weibull"){
      gf_shape = "weibull"
      fd_w <- fitdistr(data,densfun=dweibull,start=list(scale=1,shape=2))
      est_shape = fd_w$estimate[[1]]
      est_scale = fd_w$estimate[[2]]
      
      ks = ks.test(data, "pweibull", shape=est_shape, scale=est_scale)
      # add to results
      results[i,] = c(gf_shape, est_shape, est_scale, ks$statistic, ks$p.value) 
    }
    
    else if(distrib[[i]] == "normal"){
      gf_shape = "normal"
      fd_n <- fitdistr(data, "normal")
      est_mean = fd_n$estimate[[1]]
      est_sd = fd_n$estimate[[2]]
      
      ks = ks.test(data, "pnorm", mean=est_mean, sd=est_sd)
      # add to results
      results[i,] = c(gf_shape, est_mean, est_sd, ks$statistic, ks$p.value)
    }
    
    else if(distrib[[i]] == "exponential"){
      gf_shape = "exponential"
      fd_e <- fitdistr(data, "exponential")
      est_rate = fd_e$estimate[[1]]
      ks = ks.test(data, "pexp", rate=est_rate)
      # add to results
      results[i,] = c(gf_shape, est_rate, "NA", ks$statistic, ks$p.value)
    }
    
    else if(distrib[[i]] == "logistic"){
      gf_shape = "logistic"
      fd_l <- fitdistr(data, "logistic")
      est_location = fd_l$estimate[[1]]
      est_scale = fd_l$estimate[[2]]
      ks = ks.test(data, "plogis", est_location, est_scale)
      # add to results
      results[i,] = c(gf_shape, est_location, est_scale, ks$statistic,    ks$p.value) 
    }
    
    else if(distrib[[i]] == "lognormal"){
      gf_shape = "lognormal"
      fd_l <- fitdistr(data, "lognormal")
      est_location = fd_l$estimate[[1]]
      est_scale = fd_l$estimate[[2]]
      ks = ks.test(data, "plnorm", est_location, est_scale)
      # add to results
      results[i,] = c(gf_shape, est_location, est_scale, ks$statistic,    ks$p.value) 
      
    }
  }
  results = rbind(c("distribution", "param1", "param2", "ks stat", "pvalue"), results)
  
  #print(results)
  return(results)
}
df <- read.csv('https://raw.githubusercontent.com/bkreis84/Data-604---Model/master/Problem_Dataset_06_01.csv', header = FALSE)

hist(df$V1)

summary(df$V1)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   2.060   5.052   9.015  11.920  15.900  48.650

library(MASS)
set.seed(1234) 
res = fitData(df$V1, fit=c("lognormal", "logistic","normal","exponential","poisson", "gamma"), sample=1)
res

##      [,1]           [,2]                 [,3]               
## [1,] "distribution" "param1"             "param2"           
## [2,] "lognormal"    "2.18509786831851"   "0.771194673470374"
## [3,] "logistic"     "10.4259241223393"   "4.94933061641018" 
## [4,] "normal"       "11.9204761904762"   "9.71468646465628" 
## [5,] "exponential"  "0.0838892661686574" "NA"               
## [6,] "poisson"      "11.9204761904762"   "NA"               
## [7,] "gamma"        "1.85475826117211"   "0.15559474603739" 
##      [,4]                 [,5]                 
## [1,] "ks stat"            "pvalue"             
## [2,] "0.0504429203971719" "0.999724858607345"  
## [3,] "0.155734516241904"  "0.234552863561509"  
## [4,] "0.155051210381807"  "0.238888743687638"  
## [5,] "0.164020296677381"  "0.186577425194354"  
## [6,] "0.359344671880784"  "2.3109834520918e-05"
## [7,] "0.074599458667955"  "0.959854470639895"

It looks as though the exponential distribution most closely matches ours based on the ks test. This matched our intitial eyeball test and we can plot it.

fd_l <- fitdistr(df$V1, "lognormal")

lnorm_rnd <- rlnorm(10^5,fd_l$estimate[[1]],fd_l$estimate[[2]])

hist(df$V1, prob=TRUE)
lines(density(lnorm_rnd),col="red")

#Values for Simio
paste0('Random.Lognormal','(',fd_l$estimate[[1]],', ', fd_l$estimate[[2]],')')

## [1] "Random.Lognormal(2.18509786831851, 0.771194673470374)"

Problem # 2

df2 <- read.csv('https://raw.githubusercontent.com/bkreis84/Data-604---Model/master/Problem_Dataset_06_02.csv', header = FALSE)

res2 = fitData(df2$V1, fit=c("lognormal", "logistic","normal","exponential","poisson", "gamma", "weibull"), sample=1)
res2

##      [,1]           [,2]                 [,3]               
## [1,] "distribution" "param1"             "param2"           
## [2,] "lognormal"    "2.25791850042432"   "0.490590547852814"
## [3,] "logistic"     "9.90943154675149"   "3.1075031968157"  
## [4,] "normal"       "10.8923404255319"   "6.29688109651794" 
## [5,] "exponential"  "0.0918076337070751" "NA"               
## [6,] "poisson"      "10.8923404255319"   "NA"               
## [7,] "gamma"        "4.00079129716202"   "0.367303383214532"
## [8,] "weibull"      "12.3600989691339"   "1.87903064039097" 
##      [,4]                 [,5]                  
## [1,] "ks stat"            "pvalue"              
## [2,] "0.0676478188010034" "0.972591230660194"   
## [3,] "0.125102083576015"  "0.419484055521505"   
## [4,] "0.158906958340873"  "0.167023294764242"   
## [5,] "0.314436625487534"  "0.000125207234307867"
## [6,] "0.247676617064745"  "0.00502101892798101" 
## [7,] "0.099401755491389"  "0.704481750407175"   
## [8,] "1"                  "5.55111512312578e-16"

#Plot lognormal
lgn <- fitdistr(df2$V1, "lognormal")

lnorm_rnd <- rlnorm(10^5,lgn$estimate[[1]],lgn$estimate[[2]])

hist(df2$V1, prob=TRUE)
lines(density(lnorm_rnd),col="red")

#Plot gamma
gam <- fitdistr(df2$V1, "gamma")

gam_rnd <- rlnorm(10^5,fd_l$estimate[[1]],fd_l$estimate[[2]])

hist(df2$V1, prob=TRUE)
lines(density(gam_rnd),col="red")

paste0('Random.Lognormal','(',round(lgn$estimate[[1]],3),', ', round(lgn$estimate[[2]],3),')')

## [1] "Random.Lognormal(2.258, 0.491)"

Both the gamma and the lognormal look quite close in the plots, but because lognormal had the higher p value, I will go with that again

Problem # 3

df3 <- read.csv('https://raw.githubusercontent.com/bkreis84/Data-604---Model/master/Problem_Dataset_06_03.csv', header = FALSE)

res3 = fitData(df3$V1, fit=c("lognormal", "logistic","normal","exponential","poisson", "gamma", "weibull"), sample=1)
res3

##      [,1]           [,2]                [,3]               
## [1,] "distribution" "param1"            "param2"           
## [2,] "lognormal"    "0.948383947311686" "0.536203350216782"
## [3,] "logistic"     "2.82365832701205"  "0.839256255729805"
## [4,] "normal"       "2.95555555555556"  "1.519584088647"   
## [5,] "exponential"  "0.338345864661654" "NA"               
## [6,] "poisson"      "2.95555555555556"  "NA"               
## [7,] "gamma"        "3.85417434129486"  "1.3040440224953"  
## [8,] "weibull"      "3.34855422571793"  "2.06986939036735" 
##      [,4]                [,5]                  
## [1,] "ks stat"           "pvalue"              
## [2,] "0.165882192561865" "0.167970383731119"   
## [3,] "0.171833254489617" "0.140212260381077"   
## [4,] "0.179713294783737" "0.109291763479242"   
## [5,] "0.336148650504847" "7.6632814080102e-05" 
## [6,] "0.277665321375046" "0.00193882030158021" 
## [7,] "0.151310043253961" "0.254247346126618"   
## [8,] "0.524285314786614" "3.60681484679048e-11"

#It looks like the gamma distribution has the highest p value here

#Plot gamma
gam3 <- fitdistr(df3$V1, "gamma")

gam_rnd <- rgamma(10^5,gam3$estimate[[1]],gam3$estimate[[2]])

hist(df3$V1, prob=TRUE)
lines(density(gam_rnd),col="red")

paste0('Random.Gamma','(',round(gam3$estimate[[1]],3),', ', round(gam3$estimate[[2]],3),')')

## [1] "Random.Gamma(3.854, 1.304)"

In this case the gamma distribution had the highest p value and the plot looks as we would expect.

Problem # 4

\[ f(x) = \frac{1}{(b-a)}, a\leq x \leq b, 0-otherwise\\ F(x) = \frac{(x-a)}{(b-1)}, a\leq x \leq b\\ X = a + (b-a)R \]

Problem # 8

# Data In
oatSize = c(0, 0.5, 1, 1.5, 2.0, 3.0, 4.0, 5.0, 7.5, 10.0)
oatProb = c(0.05, 0.07, 0.09, 0.11, 0.15, 0.25, 0.10, 0.09, 0.06, 0.03)

peaSize = c(0, 0.5, 1, 1.5, 2.0, 3.0)
peaProb = c(0.1, 0.2, 0.2, 0.3, 0.1, 0.1)

beanSize = c(0, 1.0, 3.0, 4.5)
beanProb = c(0.2 , 0.4, 0.3, 0.1)

barSize = c(0, 0.5, 1.0, 3.5)
barProb = c(0.2, 0.4, 0.3, 0.1)

#take our samples

oatSamp <- sample(oatSize, 90, replace=T, prob = oatProb)
peaSamp <- sample(peaSize, 90, replace=T, prob = peaProb)
beanSamp <- sample(beanSize, 90, replace=T, prob = beanProb)
barSamp <- sample(barSize, 90, replace=T, prob = barProb)

df <- data.frame(oatSamp,peaSamp, beanSamp, barSamp)
    
df$cost <- 1.05*oatSamp + 3.17*peaSamp + 1.99 *beanSamp + 0.95*barSamp
df$revenue <- 1.29*oatSamp + 3.76*peaSamp + 2.23 *beanSamp + 1.65*barSamp
    
df$profit <- df$revenue - df$cost


#revenue, cost, profit
paste(sum(df$revenue), sum(df$cost), sum(df$profit))

## [1] "1253.98 1032.2 221.78"

https://stats.stackexchange.com/questions/77752/how-to-check-if-my-data-fits-log-normal-distribution