I found a great function on Stack Overflow to perform the Kolmogorov-Smirnov. I modified it to include a lognormal function Test
fitData <- function(data, fit="gamma", sample=0.5){
distrib = list()
numfit <- length(fit)
results = matrix(0, ncol=5, nrow=numfit)
for(i in 1:numfit){
if((fit[i] == "gamma") |
(fit[i] == "poisson") |
(fit[i] == "weibull") |
(fit[i] == "exponential") |
(fit[i] == "logistic") |
(fit[i] == "normal") |
(fit[i] == "lognormal")
)
distrib[[i]] = fit[i]
else stop("Provide a valid distribution to fit data" )
}
# take a sample of dataset
n = round(length(data)*sample)
data = sample(data, size=n, replace=F)
for(i in 1:numfit) {
if(distrib[[i]] == "gamma") {
gf_shape = "gamma"
fd_g <- fitdistr(data, "gamma")
est_shape = fd_g$estimate[[1]]
est_rate = fd_g$estimate[[2]]
ks = ks.test(data, "pgamma", shape=est_shape, rate=est_rate)
# add to results
results[i,] = c(gf_shape, est_shape, est_rate, ks$statistic, ks$p.value)
}
else if(distrib[[i]] == "poisson"){
gf_shape = "poisson"
fd_p <- fitdistr(data, "poisson")
est_lambda = fd_p$estimate[[1]]
ks = ks.test(data, "ppois", lambda=est_lambda)
# add to results
results[i,] = c(gf_shape, est_lambda, "NA", ks$statistic, ks$p.value)
}
else if(distrib[[i]] == "weibull"){
gf_shape = "weibull"
fd_w <- fitdistr(data,densfun=dweibull,start=list(scale=1,shape=2))
est_shape = fd_w$estimate[[1]]
est_scale = fd_w$estimate[[2]]
ks = ks.test(data, "pweibull", shape=est_shape, scale=est_scale)
# add to results
results[i,] = c(gf_shape, est_shape, est_scale, ks$statistic, ks$p.value)
}
else if(distrib[[i]] == "normal"){
gf_shape = "normal"
fd_n <- fitdistr(data, "normal")
est_mean = fd_n$estimate[[1]]
est_sd = fd_n$estimate[[2]]
ks = ks.test(data, "pnorm", mean=est_mean, sd=est_sd)
# add to results
results[i,] = c(gf_shape, est_mean, est_sd, ks$statistic, ks$p.value)
}
else if(distrib[[i]] == "exponential"){
gf_shape = "exponential"
fd_e <- fitdistr(data, "exponential")
est_rate = fd_e$estimate[[1]]
ks = ks.test(data, "pexp", rate=est_rate)
# add to results
results[i,] = c(gf_shape, est_rate, "NA", ks$statistic, ks$p.value)
}
else if(distrib[[i]] == "logistic"){
gf_shape = "logistic"
fd_l <- fitdistr(data, "logistic")
est_location = fd_l$estimate[[1]]
est_scale = fd_l$estimate[[2]]
ks = ks.test(data, "plogis", est_location, est_scale)
# add to results
results[i,] = c(gf_shape, est_location, est_scale, ks$statistic, ks$p.value)
}
else if(distrib[[i]] == "lognormal"){
gf_shape = "lognormal"
fd_l <- fitdistr(data, "lognormal")
est_location = fd_l$estimate[[1]]
est_scale = fd_l$estimate[[2]]
ks = ks.test(data, "plnorm", est_location, est_scale)
# add to results
results[i,] = c(gf_shape, est_location, est_scale, ks$statistic, ks$p.value)
}
}
results = rbind(c("distribution", "param1", "param2", "ks stat", "pvalue"), results)
#print(results)
return(results)
}
df <- read.csv('https://raw.githubusercontent.com/bkreis84/Data-604---Model/master/Problem_Dataset_06_01.csv', header = FALSE)
hist(df$V1)
summary(df$V1)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.060 5.052 9.015 11.920 15.900 48.650
library(MASS)
set.seed(1234)
res = fitData(df$V1, fit=c("lognormal", "logistic","normal","exponential","poisson", "gamma"), sample=1)
res
## [,1] [,2] [,3]
## [1,] "distribution" "param1" "param2"
## [2,] "lognormal" "2.18509786831851" "0.771194673470374"
## [3,] "logistic" "10.4259241223393" "4.94933061641018"
## [4,] "normal" "11.9204761904762" "9.71468646465628"
## [5,] "exponential" "0.0838892661686574" "NA"
## [6,] "poisson" "11.9204761904762" "NA"
## [7,] "gamma" "1.85475826117211" "0.15559474603739"
## [,4] [,5]
## [1,] "ks stat" "pvalue"
## [2,] "0.0504429203971719" "0.999724858607345"
## [3,] "0.155734516241904" "0.234552863561509"
## [4,] "0.155051210381807" "0.238888743687638"
## [5,] "0.164020296677381" "0.186577425194354"
## [6,] "0.359344671880784" "2.3109834520918e-05"
## [7,] "0.074599458667955" "0.959854470639895"
It looks as though the exponential distribution most closely matches ours based on the ks test. This matched our intitial eyeball test and we can plot it.
fd_l <- fitdistr(df$V1, "lognormal")
lnorm_rnd <- rlnorm(10^5,fd_l$estimate[[1]],fd_l$estimate[[2]])
hist(df$V1, prob=TRUE)
lines(density(lnorm_rnd),col="red")
#Values for Simio
paste0('Random.Lognormal','(',fd_l$estimate[[1]],', ', fd_l$estimate[[2]],')')
## [1] "Random.Lognormal(2.18509786831851, 0.771194673470374)"
df2 <- read.csv('https://raw.githubusercontent.com/bkreis84/Data-604---Model/master/Problem_Dataset_06_02.csv', header = FALSE)
res2 = fitData(df2$V1, fit=c("lognormal", "logistic","normal","exponential","poisson", "gamma", "weibull"), sample=1)
res2
## [,1] [,2] [,3]
## [1,] "distribution" "param1" "param2"
## [2,] "lognormal" "2.25791850042432" "0.490590547852814"
## [3,] "logistic" "9.90943154675149" "3.1075031968157"
## [4,] "normal" "10.8923404255319" "6.29688109651794"
## [5,] "exponential" "0.0918076337070751" "NA"
## [6,] "poisson" "10.8923404255319" "NA"
## [7,] "gamma" "4.00079129716202" "0.367303383214532"
## [8,] "weibull" "12.3600989691339" "1.87903064039097"
## [,4] [,5]
## [1,] "ks stat" "pvalue"
## [2,] "0.0676478188010034" "0.972591230660194"
## [3,] "0.125102083576015" "0.419484055521505"
## [4,] "0.158906958340873" "0.167023294764242"
## [5,] "0.314436625487534" "0.000125207234307867"
## [6,] "0.247676617064745" "0.00502101892798101"
## [7,] "0.099401755491389" "0.704481750407175"
## [8,] "1" "5.55111512312578e-16"
#Plot lognormal
lgn <- fitdistr(df2$V1, "lognormal")
lnorm_rnd <- rlnorm(10^5,lgn$estimate[[1]],lgn$estimate[[2]])
hist(df2$V1, prob=TRUE)
lines(density(lnorm_rnd),col="red")
#Plot gamma
gam <- fitdistr(df2$V1, "gamma")
gam_rnd <- rlnorm(10^5,fd_l$estimate[[1]],fd_l$estimate[[2]])
hist(df2$V1, prob=TRUE)
lines(density(gam_rnd),col="red")
paste0('Random.Lognormal','(',round(lgn$estimate[[1]],3),', ', round(lgn$estimate[[2]],3),')')
## [1] "Random.Lognormal(2.258, 0.491)"
Both the gamma and the lognormal look quite close in the plots, but because lognormal had the higher p value, I will go with that again
df3 <- read.csv('https://raw.githubusercontent.com/bkreis84/Data-604---Model/master/Problem_Dataset_06_03.csv', header = FALSE)
res3 = fitData(df3$V1, fit=c("lognormal", "logistic","normal","exponential","poisson", "gamma", "weibull"), sample=1)
res3
## [,1] [,2] [,3]
## [1,] "distribution" "param1" "param2"
## [2,] "lognormal" "0.948383947311686" "0.536203350216782"
## [3,] "logistic" "2.82365832701205" "0.839256255729805"
## [4,] "normal" "2.95555555555556" "1.519584088647"
## [5,] "exponential" "0.338345864661654" "NA"
## [6,] "poisson" "2.95555555555556" "NA"
## [7,] "gamma" "3.85417434129486" "1.3040440224953"
## [8,] "weibull" "3.34855422571793" "2.06986939036735"
## [,4] [,5]
## [1,] "ks stat" "pvalue"
## [2,] "0.165882192561865" "0.167970383731119"
## [3,] "0.171833254489617" "0.140212260381077"
## [4,] "0.179713294783737" "0.109291763479242"
## [5,] "0.336148650504847" "7.6632814080102e-05"
## [6,] "0.277665321375046" "0.00193882030158021"
## [7,] "0.151310043253961" "0.254247346126618"
## [8,] "0.524285314786614" "3.60681484679048e-11"
#It looks like the gamma distribution has the highest p value here
#Plot gamma
gam3 <- fitdistr(df3$V1, "gamma")
gam_rnd <- rgamma(10^5,gam3$estimate[[1]],gam3$estimate[[2]])
hist(df3$V1, prob=TRUE)
lines(density(gam_rnd),col="red")
paste0('Random.Gamma','(',round(gam3$estimate[[1]],3),', ', round(gam3$estimate[[2]],3),')')
## [1] "Random.Gamma(3.854, 1.304)"
In this case the gamma distribution had the highest p value and the plot looks as we would expect.
\[ f(x) = \frac{1}{(b-a)}, a\leq x \leq b, 0-otherwise\\ F(x) = \frac{(x-a)}{(b-1)}, a\leq x \leq b\\ X = a + (b-a)R \]
# Data In
oatSize = c(0, 0.5, 1, 1.5, 2.0, 3.0, 4.0, 5.0, 7.5, 10.0)
oatProb = c(0.05, 0.07, 0.09, 0.11, 0.15, 0.25, 0.10, 0.09, 0.06, 0.03)
peaSize = c(0, 0.5, 1, 1.5, 2.0, 3.0)
peaProb = c(0.1, 0.2, 0.2, 0.3, 0.1, 0.1)
beanSize = c(0, 1.0, 3.0, 4.5)
beanProb = c(0.2 , 0.4, 0.3, 0.1)
barSize = c(0, 0.5, 1.0, 3.5)
barProb = c(0.2, 0.4, 0.3, 0.1)
#take our samples
oatSamp <- sample(oatSize, 90, replace=T, prob = oatProb)
peaSamp <- sample(peaSize, 90, replace=T, prob = peaProb)
beanSamp <- sample(beanSize, 90, replace=T, prob = beanProb)
barSamp <- sample(barSize, 90, replace=T, prob = barProb)
df <- data.frame(oatSamp,peaSamp, beanSamp, barSamp)
df$cost <- 1.05*oatSamp + 3.17*peaSamp + 1.99 *beanSamp + 0.95*barSamp
df$revenue <- 1.29*oatSamp + 3.76*peaSamp + 2.23 *beanSamp + 1.65*barSamp
df$profit <- df$revenue - df$cost
#revenue, cost, profit
paste(sum(df$revenue), sum(df$cost), sum(df$profit))
## [1] "1253.98 1032.2 221.78"
https://stats.stackexchange.com/questions/77752/how-to-check-if-my-data-fits-log-normal-distribution