Prepare Dataset

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.6
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.1     ✔ tibble    3.3.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.2
## ✔ purrr     1.2.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(fitdistrplus)
## Loading required package: MASS
## 
## Attaching package: 'MASS'
## 
## The following object is masked from 'package:dplyr':
## 
##     select
## 
## Loading required package: survival

Q1: Find two data sets

view(trees)
view(diamonds)

Q2: Distributions of at least three numeric variables Picking 3 numeric variable: Diamonds: price, carat, depth Tree: girth, height, volume

Diamonds

1. Price

ggplot(diamonds, aes(x=price)) + 
    geom_histogram(bins= 50)

mean(diamonds$price)
## [1] 3932.8
median(diamonds$price)
## [1] 2401
var(diamonds$price)
## [1] 15915629

The graph shows that this distribution is heavily right skewed with mean is larger than median and the larger the price the smaller the count => Gamma distribution

2. Carat

ggplot(diamonds, aes(x=carat)) + 
    geom_histogram(bins= 50)

mean(diamonds$carat)
## [1] 0.7979397
median(diamonds$carat)
## [1] 0.7
var(diamonds$carat)
## [1] 0.2246867

Same with the price carat also have a positive right-skewed graph which make it close to Gamma distribution

3. Depth

ggplot(diamonds, aes(x=depth)) + 
    geom_histogram(bins= 50)

mean(diamonds$depth)
## [1] 61.7494
median(diamonds$depth)
## [1] 61.8
var(diamonds$depth)
## [1] 2.052404

BAsed on the graph, we can observe it with a bell curve wwith mean and meadian almost the same which suggested our data distribution is Normal distribution.

Trees

1.Girth

ggplot(trees, aes(x=Girth)) + 
    geom_histogram(bins= 10)

mean(trees$Girth)
## [1] 13.24839
median(trees$Girth)
## [1] 12.9
var(trees$Girth)
## [1] 9.847914
ggplot(trees, aes(x = Girth)) +
  geom_histogram(aes(y = after_stat(density)), bins= 10, 
                  boundary = 0, colour = "black", fill = "white") +
  geom_density()

This plot appear to be a reasonable Gamma distribution since it’s continuous variable, right skewed, gradually decrease and doesn’t seem symmetrical

2.Height

ggplot(trees, aes(x=Height)) + 
    geom_histogram(bins= 10)

mean(trees$Height)
## [1] 76
median(trees$Height)
## [1] 76
var(trees$Height)
## [1] 40.6
 ggplot(trees, aes(x = Height)) +
    geom_histogram(aes(y = after_stat(density)), bins= 10, 
                   boundary = 0, colour = "black", fill = "white") +
    geom_density()

Since the mean and the median value of the data is quite similar, and the plot data doesn’t show a clear skewness, this data distribution is quite close to the normal distribution

3.Volume

ggplot(trees, aes(x=Volume)) + 
    geom_histogram(bins= 10)

mean(trees$Volume)
## [1] 30.17097
median(trees$Volume)
## [1] 24.2
var(trees$Volume)
## [1] 270.2028
ggplot(trees, aes(x = Volume)) +
  geom_histogram(aes(y = after_stat(density)), bins= 10, 
                  boundary = 0, colour = "black", fill = "white") +
  geom_density()

This plot appear to be a reasonable Gamma distribution since it’s continuous variable, right skewed

Q3: Fit the Distributions

Diamonds

1. Price - Gamma Distribution (moment method)

fit_price <- fitdist(diamonds$price, "gamma", method = "mme")
print(fit_price)
## Fitting of the distribution ' gamma ' by matching moments 
## Parameters:
##           estimate   Std. Error
## shape 0.9718246127 8.429224e-03
## rate  0.0002471076 2.399719e-06
hist(diamonds$price, prob = TRUE, breaks = 50, 
     main = "Fitted Gamma Distribution on Diamond Price", 
     xlab = "Price", col = "lightblue")
curve(dgamma(x, shape = fit_price$estimate[1], rate = fit_price$estimate[2]), 
      add = TRUE, col = "red", lwd = 2)
legend("topright", legend = c("Fitted Gamma PDF"), col = "red", lwd = 2)

2. Carat - Gamma Distribution (using method of moments)

fit_carat <- fitdist(diamonds$carat, "gamma", method = "mme")
print(fit_carat)
## Fitting of the distribution ' gamma ' by matching moments 
## Parameters:
##       estimate Std. Error
## shape 2.833812 0.02007060
## rate  3.551410 0.02674299
hist(diamonds$carat, prob = TRUE, breaks = 50, 
     main = "Fitted Gamma Distribution on Diamond Carat", 
     xlab = "Carat", col = "lightgreen")
curve(dgamma(x, shape = fit_carat$estimate[1], rate = fit_carat$estimate[2]), 
      add = TRUE, col = "red", lwd = 2)
legend("topright", legend = c("Fitted Gamma PDF"), col = "red", lwd = 2)

3. Depth - Normal Distribution

fit_depth <- fitdist(diamonds$depth, "norm")
print(fit_depth)
## Fitting of the distribution ' norm ' by maximum likelihood 
## Parameters:
##       estimate  Std. Error
## mean 61.749405 0.006168391
## sd    1.432608 0.004361702
hist(diamonds$depth, prob = TRUE, breaks = 50, 
     main = "Fitted Normal Distribution on Diamond Depth", 
     xlab = "Depth", col = "lightyellow")
curve(dnorm(x, mean = fit_depth$estimate[1], sd = fit_depth$estimate[2]), 
      add = TRUE, col = "blue", lwd = 2)
legend("topright", legend = c("Fitted Normal PDF"), col = "blue", lwd = 2)

Tree

1. Girth - Gamma Distribution

fit_girth <- fitdist(trees$Girth, "gamma")
print(fit_girth)
## Fitting of the distribution ' gamma ' by maximum likelihood 
## Parameters:
##        estimate Std. Error
## shape 19.005179  4.7855007
## rate   1.434505  0.3660107
hist(trees$Girth, prob = TRUE, breaks = 8, 
     main = "Fitted Gamma Distribution on Tree Girth", 
     xlab = "Girth", col = "lightcoral")
curve(dgamma(x, shape = fit_girth$estimate[1], rate = fit_girth$estimate[2]), 
      add = TRUE, col = "red", lwd = 2)
legend("topright", legend = c("Fitted Gamma PDF"), col = "red", lwd = 2)

2. Height- Normal distribution

fit_height <- fitdist(trees$Height, "norm")
print(fit_height)
## Fitting of the distribution ' norm ' by maximum likelihood 
## Parameters:
##       estimate Std. Error
## mean 76.000000   1.125802
## sd    6.268199   0.796062
hist(trees$Height, prob = TRUE, breaks = 8, 
     main = "Fitted Normal Distribution on Tree Height", 
     xlab = "Height", col = "lightpink")
curve(dnorm(x, mean = fit_height$estimate[1], sd = fit_height$estimate[2]), 
      add = TRUE, col = "blue", lwd = 2)
legend("topright", legend = c("Fitted Normal PDF"), col = "blue", lwd = 2)

3. Volume - Gamma Distribution

fit_volume <- fitdist(trees$Volume, "gamma")
print(fit_volume)
## Fitting of the distribution ' gamma ' by maximum likelihood 
## Parameters:
##        estimate Std. Error
## shape 3.8858176 0.94740916
## rate  0.1288008 0.03352111
hist(trees$Volume, prob = TRUE, breaks = 8, 
     main = "Fitted Gamma Distribution on Tree Volume", 
     xlab = "Volume", col = "lavender")
curve(dgamma(x, shape = fit_volume$estimate[1], rate = fit_volume$estimate[2]), 
      add = TRUE, col = "red", lwd = 2)
legend("topright", legend = c("Fitted Gamma PDF"), col = "red", lwd = 2)

Q4: Comments on the effect of sample size with the modeling process. Give your own opinion.

With a larger sample size, it’s easier to explore the distribution of the data with higher accuracy. With a relatively smaller sample siez, I have to make the bin smaller otherwise there would be a bunch of missing data or empty place in the histogram (that is not easy to estimate the distribution)