# Composite sample built from three normal components that share sd = 5:
# 500 draws centred at 15, 200 centred at 30 and 100 centred at 45.
d1 <- rnorm(500, mean = 15, sd = 5)
d2 <- rnorm(200, mean = 30, sd = 5)
d3 <- rnorm(100, mean = 45, sd = 5)
d123 <- c(d1, d2, d3)

# Kernel density estimate of the pooled sample.
plot(
  density(d123),
  main = "Distribution 1",
  col = "blue",
  lwd = 2
)

# Reference lines: mean (solid) and median (dashed).
abline(v = mean(d123))
abline(v = median(d123), lty = "dashed")
# Same three normal components (sd = 5) with the group sizes reversed:
# 100 draws centred at 15, 200 centred at 30 and 500 centred at 45.
d1 <- rnorm(100, mean = 15, sd = 5)
d2 <- rnorm(200, mean = 30, sd = 5)
d3 <- rnorm(500, mean = 45, sd = 5)
d123 <- c(d1, d2, d3)

# Kernel density estimate of the pooled sample.
plot(
  density(d123),
  main = "Distribution 2",
  col = "blue",
  lwd = 2
)

# Reference lines: mean (solid) and median (dashed).
abline(v = mean(d123))
abline(v = median(d123), lty = "dashed")
# A single normal sample: 800 draws with mean 20 and sd 5.
d <- rnorm(n = 800, mean = 20, sd = 5)

# Kernel density estimate of the sample.
plot(density(d), col = "blue", lwd = 2, main = "Distribution 3")

# Reference lines: mean (solid) and median (dashed).
abline(v = mean(d))
abline(v = median(d), lty = "dashed")
In my opinion, the mean is more sensitive. As seen in the question's "example" and in question (a), the principal normal distribution is influenced by the components that have lower probability. In both examples, the mean deviates from the centre of the principal component more than the median does. As a result, the mean is more sensitive to outliers.
# Draw a standard-normal sample and plot its kernel density estimate.
rdata <- rnorm(n = 2000, mean = 0, sd = 1)
plot(density(rdata), col = "blue", lwd = 2)

# Solid line at the sample mean.
rdata_mean <- mean(rdata)
abline(v = rdata_mean)

# Dashed lines 1, 2 and 3 sample standard deviations on either side of the
# mean. The offsets are added to the sample mean rather than to zero, so the
# markers remain correct when the sample mean is not exactly 0.
rdata_std <- sd(rdata)
abline(v = rdata_mean + c(-3, -2, -1, 1, 2, 3) * rdata_std, lty = "dashed")
1st, 2nd and 3rd quartiles:
# Quartiles of rdata: the 25th, 50th and 75th percentiles.
q <- quantile(rdata, probs = c(0.25, 0.50, 0.75))
q
## 25% 50% 75%
## -0.67831011 0.02659151 0.66992570
Number of std from mean to 1st, 2nd and 3rd quartiles:
# Distance of each quartile from the sample mean, measured in units of the
# sample standard deviation.
q_dist <- (q - mean(rdata)) / sd(rdata)
q_dist
## 25% 50% 75%
## -0.706821928 0.006801067 0.658094884
(c-1) Number of std from mean to 1st and 3rd quartiles:
# Same calculation on a normal sample with a different location and scale
# (2000 draws, mean 35, sd 3.5): distance of the 1st and 3rd quartiles from
# the sample mean in units of the sample standard deviation.
rdata_new <- rnorm(2000, mean = 35, sd = 3.5)
q_new <- quantile(rdata_new, probs = c(0.25, 0.75))
q_dist_new <- (q_new - mean(rdata_new)) / sd(rdata_new)
q_dist_new
## 25% 75%
## -0.6941478 0.6938090
(c-2) Compare your answer to (b)
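ANS: The number of standard deviations from the mean to the 1st and 3rd quartiles is again roughly 0.67 in magnitude, the same as in (b). Standardizing removes the effect of the mean and standard deviation, so for any normal distribution the quartiles lie about 0.6745 standard deviations from the mean; the small differences between runs depend only on the "random" sample drawn.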
(d-1) Number of std from mean to 1st and 3rd quartiles of dataset “d123”:
# 1st and 3rd quartiles of the composite sample d123, expressed as their
# distance from the sample mean in units of the sample standard deviation.
# Note: this reuses (overwrites) the name q_dist from the earlier step.
q_123 <- quantile(d123, probs = c(0.25, 0.75))
q_dist <- (q_123 - mean(d123)) / sd(d123)
q_dist
## 25% 75%
## -0.6903450 0.7554794
(d-2) Compare your answer to (b)
Compared with (b), the numbers of standard deviations from the mean to both quartiles are larger.
(a-1)
Freedman-Diaconis rule
(a-2) It is less sensitive to outliers in the data than the standard deviation is.
# Sturges' formula: k = ceiling(log2(n)) + 1 bins, with the bin width h
# obtained by dividing the data range evenly among the k bins.
rand_data <- rnorm(800, mean = 20, sd = 5)
# n is taken from the data instead of repeating the literal 800.
k_i <- ceiling(log2(length(rand_data))) + 1
h_i <- (max(rand_data) - min(rand_data)) / k_i
h_i
## [1] 2.911575
k_i
## [1] 11
# Scott's normal reference rule: h = 3.49 * sd / n^(1/3). The bin count is
# the data range divided by h, rounded up.
# n is taken from the data instead of repeating the literal 800.
h_ii <- 3.49 * sd(rand_data) / length(rand_data)^(1/3)
k_ii <- ceiling((max(rand_data) - min(rand_data)) / h_ii)
h_ii
## [1] 1.830515
k_ii
## [1] 18
# Freedman-Diaconis' rule: h = 2 * IQR / n^(1/3). The bin count is the data
# range divided by h, rounded up.
# The variable is named IQR (same as the stats function); the results keep
# the "75%" name inherited from the quantile() subtraction.
IQR <- quantile(rand_data, 0.75) - quantile(rand_data, 0.25)
# n is taken from the data instead of repeating the literal 800.
h_iii <- 2 * IQR / length(rand_data)^(1/3)
k_iii <- ceiling((max(rand_data) - min(rand_data)) / h_iii)
h_iii
## 75%
## 1.323573
k_iii
## 75%
## 25
# Append 10 uniform outliers drawn from [40, 60] to rand_data and recompute
# the bin width under each of the three rules.
out_data <- c(rand_data, runif(10, min = 40, max = 60))
# The extended sample has 10 more points than rand_data, so n is taken from
# out_data rather than reusing the literal 800.
n_out <- length(out_data)

# Sturges' formula: range divided by ceiling(log2(n)) + 1 bins.
k_i_new <- ceiling(log2(n_out)) + 1
h_i_new <- (max(out_data) - min(out_data)) / k_i_new
h_i_new
## [1] 5.068391

# Scott's normal reference rule: h = 3.49 * sd / n^(1/3).
h_ii_new <- 3.49 * sd(out_data) / n_out^(1/3)
h_ii_new
## (run-dependent; the run recorded with n fixed at 800 printed 2.212078)

# Freedman-Diaconis' rule: h = 2 * IQR / n^(1/3).
# The width must use IQR_new (the IQR of the data WITH outliers); dividing
# the outlier-free IQR here would reproduce h_iii exactly and hide any
# effect of the outliers.
IQR_new <- quantile(out_data, 0.75) - quantile(out_data, 0.25)
h_iii_new <- 2 * IQR_new / n_out^(1/3)
h_iii_new
## (run-dependent; the recorded run printed 1.323573, identical to h_iii,
## because it used the outlier-free IQR instead of IQR_new)
Ans:
Freedman-Diaconis' choice is the least sensitive to outliers. It calculates the bin width from the IQR, which is itself less sensitive to outliers than the standard deviation and the mean.