Overview This is to report 1) a simulation of exponential distribtution to demonstrate the central limited theorem. 2) a basic inference on the ToothGrowth data
library(ggplot2)
library(stats)
library(dplyr)
40 exponentials and 1000 simulations
**The mean of sampling distribution
set.seed(1000)
simu <- replicate(rexp(40, rate = .2), n = 1000)
tsimu <- t(simu)
mns <- apply(tsimu, 1, mean)
mean(mns)
## [1] 4.986963
** The sample mean is very close to the theoretical mean: 1/0.2 = 5**
**The variance of sampling distribution
sds <- apply(tsimu, 1, sd)
mean(sds)
## [1] 4.89047
The sample standard deviation 4.8904701 is very close to the population sd 5
Visualization of the distribution
ggplot(data.frame(means = mns), aes(x = means)) +
geom_histogram(bins = 10) +
geom_vline(data = data.frame(type = "sample means", col = "sample means"), aes(xintercept = mean(mns), lty = type, colour = col)) +
geom_vline(data = data.frame(type = "pop mean", col = "pop mean"), aes(xintercept =5, lty = type, colour = col)) +
ggtitle("The sampling distribution of the sample means") +
scale_colour_manual(name = "legend", values = c("sample means" = "red", "pop mean" = "green")) +
scale_linetype_manual(name = "legend", values =c("sample means" = "solid", "pop mean" = "solid"))
ggplot(data.frame(std = sds), aes(x = sds)) +
geom_histogram(bins = 10) +
geom_vline(data = data.frame(type = "sample sd", col = "sample sd"), aes(linetype = type, xintercept = mean(sds),colour = col)) +
geom_vline(data = data.frame(type = "pop sd", col = "pop sd"), aes(linetype = type, xintercept = 5, colour = col)) +
scale_colour_manual(name = "standard deviation", values = c("sample sd" = "blue", "pop sd" = "yellow")) +
scale_linetype_manual(name = "standard deviation", values = c("sample sd" = "solid", "pop sd" = "solid")) +
ggtitle("The sampling distribution of the sample sd")
> Normal Distribution of the sampling
hist(simu, main = "distribution of 1000 simulations of 40 exponential variables")
hist(mns, main = "The sampling distribution of the sample means")
the sampling distribution of the sample means looks more Gaussian than the original distribution
load data
data("ToothGrowth")
tooth <- ToothGrowth
Exploratory Data analysis
tooth$dose <- as.factor(tooth$dose)
summary(tooth)
## len supp dose
## Min. : 4.20 OJ:30 0.5:20
## 1st Qu.:13.07 VC:30 1 :20
## Median :19.25 2 :20
## Mean :18.81
## 3rd Qu.:25.27
## Max. :33.90
2 levels of
supp, 3 levels ofdose, need to stratify the data
dose <- tooth %>% group_by(supp, dose) %>% summarise(avg = mean(len), std = sd(len))
plot len comparing oj and vc in supp under different doses
ggplot(tooth, aes(x = supp, y = len)) + geom_boxplot() + facet_grid(.~ dose) + labs(x = "Suppliment sources", y = "Average Length")
> looks like there is mean difference in 0.5 dose and 1 dose
Test hypothesis H0: diff = 0, Ha: != 0
dose_0 <- tooth %>%
filter(dose == .5)
t.test(len ~ supp, alternative = "two.sided", data = dose_0)
##
## Welch Two Sample t-test
##
## data: len by supp
## t = 3.1697, df = 14.969, p-value = 0.006359
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 1.719057 8.780943
## sample estimates:
## mean in group OJ mean in group VC
## 13.23 7.98
dose_1 <- tooth %>%
filter(dose == 1)
t.test(len ~ supp, alternative = "two.sided", data = dose_1)
##
## Welch Two Sample t-test
##
## data: len by supp
## t = 4.0328, df = 15.358, p-value = 0.001038
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 2.802148 9.057852
## sample estimates:
## mean in group OJ mean in group VC
## 22.70 16.77
dose_2 <- tooth %>%
filter(dose == 2)
t.test(len ~ supp, alternative = "two.sided", data = dose_2)
##
## Welch Two Sample t-test
##
## data: len by supp
## t = -0.046136, df = 14.04, p-value = 0.9639
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -3.79807 3.63807
## sample estimates:
## mean in group OJ mean in group VC
## 26.06 26.14
Test1, p-value = 0.006359, reject the null, there is significant evidence to support the alternative hypothesis: the means of the length growth with OJ and VC differ at dose 0.5 Test2, p-value = 0.001038, reject the null, there is significant evidence to support that the means of the length growth with OJ and VC differ at dose 1 Test3, p-value = 0.96, failed to reject the null, no mean differences between OJ and VC at dose 2
To compare dose under suppliment sources, we stratify data into OJ and VC
OJ <- tooth %>%
filter(supp == "OJ") %>%
group_by(dose) %>%
summarise( mns = mean(len), std = sd(len))
VC <- tooth %>%
filter(supp == "VC") %>%
group_by(dose) %>%
summarise(mns = mean(len), std = sd(len))
Plot
ggplot(tooth, aes(x = dose, y = len)) + geom_boxplot() + facet_grid(supp ~.) + labs( x = "Dose", y = "Average of Length Growth")
It seems there is differences among doses within each suppliment type.
sdr <- sqrt((as.numeric(OJ[1,3])*19 + as.numeric(OJ[2,3]))/380)
as.numeric(OJ[1,2]-OJ[2,2]) + c(-1,1)*qt(((1-.05)/3)/2, df = 38)*sdr
## [1] -8.979912 -9.960088
The mean growth between dose 0.5 and 1 is different at OJ
sdr <- sqrt((as.numeric(OJ[1,3])*19 + as.numeric(OJ[3,3]))/380)
as.numeric(OJ[1,2]-OJ[3,2]) + c(-1,1)*qt(((1-.05)/3)/2, df = 38)*sdr
## [1] -12.3434 -13.3166
The mean growth between dose 0.5 and 2 is different at OJ
sdr <- sqrt((as.numeric(OJ[2,3])*19 + as.numeric(OJ[3,3]))/380)
as.numeric(OJ[2,2]-OJ[3,2]) + c(-1,1)*qt(((1-.05)/3)/2, df = 38)*sdr
## [1] -2.903346 -3.816654
The mean growth between dose 1 and 2 is different at OJ
sdr <- sqrt((as.numeric(VC[1,3])*19 + as.numeric(VC[2,3]))/380)
as.numeric(VC[1,2]-OJ[2,2]) + c(-1,1)*qt(((1-.05)/3)/2, df = 38)*sdr
## [1] -14.33501 -15.10499
The mean growth between VC 0.5 and VC 1 differs
5.CI for VC 0.5 vs VC 2
sdr <- sqrt((as.numeric(VC[1,3])*19 + as.numeric(VC[3,3]))/380)
as.numeric(VC[1,2]-OJ[3,2]) + c(-1,1)*qt(((1-.05)/3)/2, df = 38)*sdr
## [1] -17.68706 -18.47294
The mean growth between VC 0.5 and VC 2 differs
sdr <- sqrt((as.numeric(VC[1,3])*19 + as.numeric(VC[3,3]))/380)
as.numeric(VC[1,2]-OJ[3,2]) + c(-1,1)*qt(((1-.05)/3)/2, df = 38)*sdr
## [1] -17.68706 -18.47294
The mean growth between VC 1 and VC 2 differs.