library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.6
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ ggplot2 4.0.1 ✔ tibble 3.3.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.2
## ✔ purrr 1.2.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(fitdistrplus)
## Loading required package: MASS
##
## Attaching package: 'MASS'
##
## The following object is masked from 'package:dplyr':
##
## select
##
## Loading required package: survival
view(trees)
view(diamonds)
ggplot(diamonds, aes(x=price)) +
geom_histogram(bins= 50)
mean(diamonds$price)
## [1] 3932.8
median(diamonds$price)
## [1] 2401
var(diamonds$price)
## [1] 15915629
The graph shows that this distribution is heavily right skewed with mean is larger than median and the larger the price the smaller the count => Gamma distribution
ggplot(diamonds, aes(x=carat)) +
geom_histogram(bins= 50)
mean(diamonds$carat)
## [1] 0.7979397
median(diamonds$carat)
## [1] 0.7
var(diamonds$carat)
## [1] 0.2246867
Same with the price carat also have a positive right-skewed graph which make it close to Gamma distribution
ggplot(diamonds, aes(x=depth)) +
geom_histogram(bins= 50)
mean(diamonds$depth)
## [1] 61.7494
median(diamonds$depth)
## [1] 61.8
var(diamonds$depth)
## [1] 2.052404
BAsed on the graph, we can observe it with a bell curve wwith mean and meadian almost the same which suggested our data distribution is Normal distribution.
ggplot(trees, aes(x=Girth)) +
geom_histogram(bins= 10)
mean(trees$Girth)
## [1] 13.24839
median(trees$Girth)
## [1] 12.9
var(trees$Girth)
## [1] 9.847914
ggplot(trees, aes(x = Girth)) +
geom_histogram(aes(y = after_stat(density)), bins= 10,
boundary = 0, colour = "black", fill = "white") +
geom_density()
This plot appear to be a reasonable Gamma distribution since it’s continuous variable, right skewed, gradually decrease and doesn’t seem symmetrical
ggplot(trees, aes(x=Height)) +
geom_histogram(bins= 10)
mean(trees$Height)
## [1] 76
median(trees$Height)
## [1] 76
var(trees$Height)
## [1] 40.6
ggplot(trees, aes(x = Height)) +
geom_histogram(aes(y = after_stat(density)), bins= 10,
boundary = 0, colour = "black", fill = "white") +
geom_density()
Since the mean and the median value of the data is quite similar, and the plot data doesn’t show a clear skewness, this data distribution is quite close to the normal distribution
ggplot(trees, aes(x=Volume)) +
geom_histogram(bins= 10)
mean(trees$Volume)
## [1] 30.17097
median(trees$Volume)
## [1] 24.2
var(trees$Volume)
## [1] 270.2028
ggplot(trees, aes(x = Volume)) +
geom_histogram(aes(y = after_stat(density)), bins= 10,
boundary = 0, colour = "black", fill = "white") +
geom_density()
This plot appear to be a reasonable Gamma distribution since it’s continuous variable, right skewed
fit_price <- fitdist(diamonds$price, "gamma", method = "mme")
print(fit_price)
## Fitting of the distribution ' gamma ' by matching moments
## Parameters:
## estimate Std. Error
## shape 0.9718246127 8.429224e-03
## rate 0.0002471076 2.399719e-06
hist(diamonds$price, prob = TRUE, breaks = 50,
main = "Fitted Gamma Distribution on Diamond Price",
xlab = "Price", col = "lightblue")
curve(dgamma(x, shape = fit_price$estimate[1], rate = fit_price$estimate[2]),
add = TRUE, col = "red", lwd = 2)
legend("topright", legend = c("Fitted Gamma PDF"), col = "red", lwd = 2)
fit_carat <- fitdist(diamonds$carat, "gamma", method = "mme")
print(fit_carat)
## Fitting of the distribution ' gamma ' by matching moments
## Parameters:
## estimate Std. Error
## shape 2.833812 0.02007060
## rate 3.551410 0.02674299
hist(diamonds$carat, prob = TRUE, breaks = 50,
main = "Fitted Gamma Distribution on Diamond Carat",
xlab = "Carat", col = "lightgreen")
curve(dgamma(x, shape = fit_carat$estimate[1], rate = fit_carat$estimate[2]),
add = TRUE, col = "red", lwd = 2)
legend("topright", legend = c("Fitted Gamma PDF"), col = "red", lwd = 2)
fit_depth <- fitdist(diamonds$depth, "norm")
print(fit_depth)
## Fitting of the distribution ' norm ' by maximum likelihood
## Parameters:
## estimate Std. Error
## mean 61.749405 0.006168391
## sd 1.432608 0.004361702
hist(diamonds$depth, prob = TRUE, breaks = 50,
main = "Fitted Normal Distribution on Diamond Depth",
xlab = "Depth", col = "lightyellow")
curve(dnorm(x, mean = fit_depth$estimate[1], sd = fit_depth$estimate[2]),
add = TRUE, col = "blue", lwd = 2)
legend("topright", legend = c("Fitted Normal PDF"), col = "blue", lwd = 2)
fit_girth <- fitdist(trees$Girth, "gamma")
print(fit_girth)
## Fitting of the distribution ' gamma ' by maximum likelihood
## Parameters:
## estimate Std. Error
## shape 19.005179 4.7855007
## rate 1.434505 0.3660107
hist(trees$Girth, prob = TRUE, breaks = 8,
main = "Fitted Gamma Distribution on Tree Girth",
xlab = "Girth", col = "lightcoral")
curve(dgamma(x, shape = fit_girth$estimate[1], rate = fit_girth$estimate[2]),
add = TRUE, col = "red", lwd = 2)
legend("topright", legend = c("Fitted Gamma PDF"), col = "red", lwd = 2)
fit_height <- fitdist(trees$Height, "norm")
print(fit_height)
## Fitting of the distribution ' norm ' by maximum likelihood
## Parameters:
## estimate Std. Error
## mean 76.000000 1.125802
## sd 6.268199 0.796062
hist(trees$Height, prob = TRUE, breaks = 8,
main = "Fitted Normal Distribution on Tree Height",
xlab = "Height", col = "lightpink")
curve(dnorm(x, mean = fit_height$estimate[1], sd = fit_height$estimate[2]),
add = TRUE, col = "blue", lwd = 2)
legend("topright", legend = c("Fitted Normal PDF"), col = "blue", lwd = 2)
fit_volume <- fitdist(trees$Volume, "gamma")
print(fit_volume)
## Fitting of the distribution ' gamma ' by maximum likelihood
## Parameters:
## estimate Std. Error
## shape 3.8858176 0.94740916
## rate 0.1288008 0.03352111
hist(trees$Volume, prob = TRUE, breaks = 8,
main = "Fitted Gamma Distribution on Tree Volume",
xlab = "Volume", col = "lavender")
curve(dgamma(x, shape = fit_volume$estimate[1], rate = fit_volume$estimate[2]),
add = TRUE, col = "red", lwd = 2)
legend("topright", legend = c("Fitted Gamma PDF"), col = "red", lwd = 2)
With a larger sample size, it’s easier to explore the distribution of the data with higher accuracy. With a relatively smaller sample siez, I have to make the bin smaller otherwise there would be a bunch of missing data or empty place in the histogram (that is not easy to estimate the distribution)