# Show the structure of `trees`: its class, dimensions, and column types.
str(trees)
## 'data.frame': 31 obs. of 3 variables:
## $ Girth : num 8.3 8.6 8.8 10.5 10.7 10.8 11 11 11.1 11.2 ...
## $ Height: num 70 65 63 72 81 83 66 75 80 75 ...
## $ Volume: num 10.3 10.3 10.2 16.4 18.8 19.7 15.6 18.2 22.6 19.9 ...
# Show the structure of `diamonds`: its class, dimensions, and column types.
str(diamonds)
## tibble [53,940 × 10] (S3: tbl_df/tbl/data.frame)
## $ carat : num [1:53940] 0.23 0.21 0.23 0.29 0.31 0.24 0.24 0.26 0.22 0.23 ...
## $ cut : Ord.factor w/ 5 levels "Fair"<"Good"<..: 5 4 2 4 2 3 3 3 1 3 ...
## $ color : Ord.factor w/ 7 levels "D"<"E"<"F"<"G"<..: 2 2 2 6 7 7 6 5 2 5 ...
## $ clarity: Ord.factor w/ 8 levels "I1"<"SI2"<"SI1"<..: 2 3 5 4 2 6 7 3 4 5 ...
## $ depth : num [1:53940] 61.5 59.8 56.9 62.4 63.3 62.8 62.3 61.9 65.1 59.4 ...
## $ table : num [1:53940] 55 61 65 58 58 57 57 55 61 61 ...
## $ price : int [1:53940] 326 326 327 334 335 336 336 337 337 338 ...
## $ x : num [1:53940] 3.95 3.89 4.05 4.2 4.34 3.94 3.95 4.07 3.87 4 ...
## $ y : num [1:53940] 3.98 3.84 4.07 4.23 4.35 3.96 3.98 4.11 3.78 4.05 ...
## $ z : num [1:53940] 2.43 2.31 2.31 2.63 2.75 2.48 2.47 2.53 2.49 2.39 ...
# Histogram of tree girth on the density scale, overlaid with a kernel
# density estimate (red) so the overall shape of the distribution is visible.
ggplot(trees, aes(x = Girth)) +
  geom_histogram(
    aes(y = after_stat(density)),
    bins = 8,
    fill = "lightblue",
    color = "white"
  ) +
  geom_density(color = "red") +
  # The title previously misspelled "girth" as "grith".
  labs(title = "Distribution graph of girth of trees") +
  theme(plot.title = element_text(hjust = 0.5, size = 15))
# Mean, variance, and median of Girth.
mean(trees[["Girth"]])
## [1] 13.24839
var(trees[["Girth"]])
## [1] 9.847914
median(trees[["Girth"]])
## [1] 12.9
Speculation: The density curve in the distribution graph is close to a bell shape, and the mean and median are close to each other, so I think a normal distribution is appropriate to model this variable.
# Fit a normal distribution to Girth and print the parameter estimates.
fit1 <- fitdist(data = trees[["Girth"]], distr = "norm")
print(fit1)
## Fitting of the distribution ' norm ' by maximum likelihood
## Parameters:
## estimate Std. Error
## mean 13.248387 0.5544611
## sd 3.087109 0.3920630
# Density-scaled histogram of Girth with the fitted normal density from
# `fit1` drawn on top in red.
ggplot(data = trees, mapping = aes(x = Girth)) +
  geom_histogram(
    mapping = aes(y = after_stat(density)),
    bins = 8,
    colour = "white",
    fill = "lightblue"
  ) +
  stat_function(
    fun = dnorm,
    # The estimate vector is named (mean, sd), so it maps directly onto
    # dnorm()'s arguments.
    args = as.list(fit1$estimate),
    linewidth = 1.5,
    colour = "red"
  ) +
  labs(
    x = "Girth",
    y = "Density",
    title = "Fitted normal distribution"
  ) +
  theme(plot.title = element_text(size = 15, hjust = 0.5))
# Histogram of tree height on the density scale, with a kernel density
# estimate overlaid in red.
ggplot(data = trees, mapping = aes(x = Height)) +
  geom_histogram(
    mapping = aes(y = after_stat(density)),
    bins = 10,
    colour = "white",
    fill = "lightblue"
  ) +
  geom_density(colour = "red") +
  labs(title = "Distribution graph of height of trees") +
  theme(plot.title = element_text(size = 15, hjust = 0.5))
# Mean, variance, and median of Height.
mean(trees[["Height"]])
## [1] 76
var(trees[["Height"]])
## [1] 40.6
median(trees[["Height"]])
## [1] 76
Speculation: Because this data set has the same mean and median, and its distribution has a rough bell shape, I think a normal distribution is appropriate to model this variable too.
# Fit a normal distribution to Height and print the parameter estimates.
fit2 <- fitdist(data = trees[["Height"]], distr = "norm")
print(fit2)
## Fitting of the distribution ' norm ' by maximum likelihood
## Parameters:
## estimate Std. Error
## mean 76.000000 1.125802
## sd 6.268199 0.796062
# Density-scaled histogram of Height with the fitted normal density from
# `fit2` drawn on top in red.
ggplot(data = trees, mapping = aes(x = Height)) +
  geom_histogram(
    mapping = aes(y = after_stat(density)),
    bins = 10,
    colour = "white",
    fill = "lightblue"
  ) +
  stat_function(
    fun = dnorm,
    # The estimate vector is named (mean, sd), so it maps directly onto
    # dnorm()'s arguments.
    args = as.list(fit2$estimate),
    linewidth = 1.5,
    colour = "red"
  ) +
  labs(
    x = "Height",
    y = "Density",
    title = "Fitted normal distribution"
  ) +
  theme(plot.title = element_text(size = 15, hjust = 0.5))
# Histogram of tree volume on the density scale, with a kernel density
# estimate overlaid in red.
ggplot(data = trees, mapping = aes(x = Volume)) +
  geom_histogram(
    mapping = aes(y = after_stat(density)),
    bins = 10,
    colour = "white",
    fill = "lightblue"
  ) +
  geom_density(colour = "red") +
  labs(title = "Distribution graph of Volume of trees") +
  theme(plot.title = element_text(size = 15, hjust = 0.5))
# Mean, variance, and median of Volume.
mean(trees[["Volume"]])
## [1] 30.17097
var(trees[["Volume"]])
## [1] 270.2028
median(trees[["Volume"]])
## [1] 24.2
Speculation: The mean of this data set is bigger than its median, and its distribution has a rough bell shape with a long right tail. I might think a right-skewed distribution is appropriate to model this variable, but since there is a small peak of data around 55, it seems safe to say that the normal distribution is better.
# Fit two candidate distributions to Volume and print each set of estimates:
# first a normal, then a log-normal.
fit3 <- fitdist(data = trees[["Volume"]], distr = "norm")
print(fit3)
## Fitting of the distribution ' norm ' by maximum likelihood
## Parameters:
## estimate Std. Error
## mean 30.17097 2.904316
## sd 16.17055 2.053661
fit3_lnorm <- fitdist(data = trees[["Volume"]], distr = "lnorm")
print(fit3_lnorm)
## Fitting of the distribution ' lnorm ' by maximum likelihood
## Parameters:
## estimate Std. Error
## meanlog 3.2727317 0.09298322
## sdlog 0.5177086 0.06574796
# Compare both candidate fits for Volume on the density-scaled histogram:
# the normal fit (`fit3`, red) and the log-normal fit (`fit3_lnorm`, purple).
ggplot(trees, aes(x = Volume)) +
  geom_histogram(
    aes(y = after_stat(density)),
    bins = 10,
    fill = "lightblue",
    color = "white"
  ) +
  stat_function(
    fun = dnorm,
    args = list(
      mean = fit3$estimate[["mean"]],
      sd = fit3$estimate[["sd"]]
    ),
    color = "red",
    linewidth = 1.5
  ) +
  # dlnorm() takes `meanlog` and `sdlog`; name them in full rather than
  # relying on partial matching of `mean`/`sd`.
  stat_function(
    fun = dlnorm,
    args = list(
      meanlog = fit3_lnorm$estimate[["meanlog"]],
      sdlog = fit3_lnorm$estimate[["sdlog"]]
    ),
    color = "purple",
    linewidth = 1.5
  ) +
  # The title names both curves, since the plot has no colour legend.
  labs(
    title = "Fitted normal (red) and log-normal (purple) distributions",
    y = "Density",
    x = "Volume"
  ) +
  theme(plot.title = element_text(hjust = 0.5, size = 15))
# Histogram of diamond depth (bin width 1) on the density scale, with a
# kernel density estimate overlaid in red.
ggplot(data = diamonds, mapping = aes(x = depth)) +
  geom_histogram(
    mapping = aes(y = after_stat(density)),
    binwidth = 1,
    colour = "white",
    fill = "lightblue"
  ) +
  geom_density(colour = "red") +
  labs(title = "Distribution graph of depth of diamonds") +
  theme(plot.title = element_text(size = 15, hjust = 0.5))
# Mean, variance, and median of diamond depth.
mean(diamonds[["depth"]])
## [1] 61.7494
var(diamonds[["depth"]])
## [1] 2.052404
median(diamonds[["depth"]])
## [1] 61.8
Speculation: The mean of this data set is close to its median, and its distribution has a narrow bell shape. I think a normal distribution is appropriate to model this variable.
# Fit a normal distribution to diamond depth and print the estimates.
fit4 <- fitdist(data = diamonds[["depth"]], distr = "norm")
print(fit4)
## Fitting of the distribution ' norm ' by maximum likelihood
## Parameters:
## estimate Std. Error
## mean 61.749405 0.006168391
## sd 1.432608 0.004361702
# Density-scaled histogram of depth with the fitted normal density from
# `fit4` drawn on top in red.
ggplot(data = diamonds, mapping = aes(x = depth)) +
  geom_histogram(
    mapping = aes(y = after_stat(density)),
    binwidth = 1,
    colour = "white",
    fill = "lightblue"
  ) +
  stat_function(
    fun = dnorm,
    # The estimate vector is named (mean, sd), so it maps directly onto
    # dnorm()'s arguments.
    args = as.list(fit4$estimate),
    linewidth = 1.5,
    colour = "red"
  ) +
  labs(
    x = "depth",
    y = "Density",
    title = "Fitted normal distribution"
  ) +
  theme(plot.title = element_text(size = 15, hjust = 0.5))
# Histogram of diamond table (bin width 1) on the density scale. The kernel
# density overlay uses adjust = 3, i.e. three times the default bandwidth.
ggplot(data = diamonds, mapping = aes(x = table)) +
  geom_histogram(
    mapping = aes(y = after_stat(density)),
    binwidth = 1,
    colour = "white",
    fill = "lightblue"
  ) +
  geom_density(adjust = 3, colour = "red") +
  labs(title = "Distribution graph of table of diamonds") +
  theme(plot.title = element_text(size = 15, hjust = 0.5))
# Mean, variance, and median of diamond table.
mean(diamonds[["table"]])
## [1] 57.45718
var(diamonds[["table"]])
## [1] 4.992948
median(diamonds[["table"]])
## [1] 57
Speculation: As before, the mean of this data set is almost equal to its median, and its distribution is roughly bell-shaped. I think a normal distribution is appropriate to model this variable too.
# Fit a normal distribution to diamond table and print the estimates.
fit5 <- fitdist(data = diamonds[["table"]], distr = "norm")
print(fit5)
## Fitting of the distribution ' norm ' by maximum likelihood
## Parameters:
## estimate Std. Error
## mean 57.45718 0.009620974
## sd 2.23447 0.006803050
# Density-scaled histogram of diamond table with the fitted normal density
# from `fit5` drawn on top in red.
ggplot(diamonds, aes(x = table)) +
  geom_histogram(
    aes(y = after_stat(density)),
    binwidth = 1,
    fill = "lightblue",
    color = "white"
  ) +
  stat_function(
    fun = dnorm,
    args = list(
      mean = fit5$estimate[["mean"]],
      sd = fit5$estimate[["sd"]]
    ),
    color = "red",
    linewidth = 1.5
  ) +
  # The x-axis label was "Volume" (copied from the trees plot); the plotted
  # variable is `table`.
  labs(
    title = "Fitted normal distribution",
    y = "Density",
    x = "table"
  ) +
  theme(plot.title = element_text(hjust = 0.5, size = 15))
# Histogram of diamond price (50 bins) on the density scale, with a kernel
# density estimate overlaid in red.
ggplot(diamonds, aes(x = price)) +
  geom_histogram(
    aes(y = after_stat(density)),
    bins = 50,
    fill = "lightblue",
    color = "white"
  ) +
  geom_density(color = "red") +
  # The title was "Distribution graph of Volume of trees" (copied from the
  # trees plot); this plot shows diamond price.
  labs(title = "Distribution graph of price of diamonds") +
  theme(plot.title = element_text(hjust = 0.5, size = 15))
# Mean, variance, and median of diamond price.
mean(diamonds[["price"]])
## [1] 3932.8
var(diamonds[["price"]])
## [1] 15915629
median(diamonds[["price"]])
## [1] 2401
Speculation: The mean of this data set is a lot bigger than its median, and its distribution has a rough bell shape with a long right tail. I think a right-skewed distribution, such as the log-normal, is appropriate to model this variable.
# Fit a log-normal distribution to diamond price and print the estimates.
fit6 <- fitdist(data = diamonds[["price"]], distr = "lnorm")
print(fit6)
## Fitting of the distribution ' lnorm ' by maximum likelihood
## Parameters:
## estimate Std. Error
## meanlog 7.786768 0.004368743
## sdlog 1.014640 0.003089154
# Density-scaled histogram of diamond price with the fitted log-normal
# density from `fit6` drawn on top in red.
ggplot(diamonds, aes(x = price)) +
  geom_histogram(
    aes(y = after_stat(density)),
    bins = 50,
    fill = "lightblue",
    color = "white"
  ) +
  # dlnorm() takes `meanlog` and `sdlog`; name them in full rather than
  # relying on partial matching of `mean`/`sd`.
  stat_function(
    fun = dlnorm,
    args = list(
      meanlog = fit6$estimate[["meanlog"]],
      sdlog = fit6$estimate[["sdlog"]]
    ),
    color = "red",
    linewidth = 1.5
  ) +
  # Title and x label were copied from other plots ("normal", "Volume");
  # this curve is a log-normal fit to price.
  labs(
    title = "Fitted log-normal distribution",
    y = "Density",
    x = "price"
  ) +
  theme(plot.title = element_text(hjust = 0.5, size = 15))
Usually, the smaller the sample is, the harder it is to achieve an accurate fit. However, it is surprising to see that most of the fits to the small-sample data are quite good. It is still safe to say that more data is better, because a small sample can lead to problems like the one I met when I tried to analyze the trees’ volume data: it is quite hard to tell at first sight which distribution fits best.