# This is the report of the second part of the Statistical Inference Peer Assignment: Basic inferential data analysis. It includes supporting material such as the code and figures.
library("ggplot2")
library("knitr")
library("datasets")
library("ggplot2")
library("ggpmisc")
library("gridExtra")
library("plyr")
# Fix the random seed (55) so that anyone re-running this script can
# reproduce and verify the calculations.
set.seed(55)

# 3. Load the data.
# This part of the assignment works with the ToothGrowth data set; `df` is a
# working copy of it.
data("ToothGrowth")
df <- ToothGrowth

# Explore the data: a per-column summary first, then the number of
# observations for every supplement (rows) and dose (columns) combination.
summary(df)
## len supp dose
## Min. : 4.20 OJ:30 Min. :0.500
## 1st Qu.:13.07 VC:30 1st Qu.:0.500
## Median :19.25 Median :1.000
## Mean :18.81 Mean :1.167
## 3rd Qu.:25.27 3rd Qu.:2.000
## Max. :33.90 Max. :2.000
table(df[["supp"]], df[["dose"]])
##
## 0.5 1 2
## OJ 10 10 10
## VC 10 10 10
# 95% confidence interval (ci) for the mean tooth length (len) of the whole
# sample, using the normal approximation: mean -/+ 1.96 * sd / sqrt(n).
mean_tooth <- with(ToothGrowth, mean(len))
ci <- with(
  ToothGrowth,
  mean_tooth + c(-1, 1) * 1.96 * sd(len) / sqrt(length(len))
)
paste(
  "mean=", round(mean_tooth, 3), ",",
  "ci_min=", round(ci[1], 3), ",",
  "ci_max=", round(ci[2], 3)
)
## [1] "mean= 18.813 , ci_min= 16.878 , ci_max= 20.749"
# Now, let's get the same values by supplement type and dose.
means_by_supp <- tapply(ToothGrowth$len, ToothGrowth$supp, mean)
means_by_dose <- tapply(ToothGrowth$len, ToothGrowth$dose, mean)

# Normal-approximation confidence interval for the mean of a sample.
#   x: numeric vector of observations.
#   z: standard-normal quantile; the default 1.96 gives a ~95% interval.
# Returns a numeric vector c(lower, upper).
# NOTE(review): the groups below hold only 20-30 observations; a t quantile
# (qt(0.975, length(x) - 1)) would give slightly wider intervals. 1.96 is
# kept so the results match the reported figures.
normal_ci <- function(x, z = 1.96) {
  mean(x) + c(-1, 1) * z * sd(x) / sqrt(length(x))
}

# Formats one result line: label, mean, interval bounds and interval width,
# all rounded to 3 decimals.
describe_ci <- function(label, group_mean, ci, dif) {
  paste(
    label, round(group_mean, 3), ",",
    "ci_min=", round(ci[1], 3), ",",
    "ci_max=", round(ci[2], 3), ",",
    "dif_ci=", round(dif, 3)
  )
}

# Tooth lengths split by group; groups are then picked by name rather than
# by position, so the result does not depend on the order of factor levels.
len_by_supp <- split(ToothGrowth$len, ToothGrowth$supp)
len_by_dose <- split(ToothGrowth$len, ToothGrowth$dose)

ci_supp_OJ <- normal_ci(len_by_supp[["OJ"]])
ci_supp_VC <- normal_ci(len_by_supp[["VC"]])
ci_dose_0.5 <- normal_ci(len_by_dose[["0.5"]])
ci_dose_1.0 <- normal_ci(len_by_dose[["1"]])
ci_dose_2.0 <- normal_ci(len_by_dose[["2"]])

# Let's calculate the range (width) of each interval.
dif_supp_OJ <- diff(ci_supp_OJ)
dif_supp_VC <- diff(ci_supp_VC)
dif_dose_0.5 <- diff(ci_dose_0.5)
dif_dose_1.0 <- diff(ci_dose_1.0)
dif_dose_2.0 <- diff(ci_dose_2.0)

# Report mean, interval and interval width for every group.
describe_ci("mean_by_OJ=", means_by_supp[["OJ"]], ci_supp_OJ, dif_supp_OJ)
describe_ci("mean_by_VC=", means_by_supp[["VC"]], ci_supp_VC, dif_supp_VC)
describe_ci("mean_by_dose_of_0.5=", means_by_dose[["0.5"]], ci_dose_0.5, dif_dose_0.5)
describe_ci("mean_by_dose_of_1.0=", means_by_dose[["1"]], ci_dose_1.0, dif_dose_1.0)
describe_ci("mean_by_dose_of_2.0=", means_by_dose[["2"]], ci_dose_2.0, dif_dose_2.0)
## [1] "mean_by_OJ= 20.663 , ci_min= 18.3 , ci_max= 23.027 , dif_ci= 4.728"
## [1] "mean_by_VC= 16.963 , ci_min= 14.005 , ci_max= 19.921 , dif_ci= 5.916"
## [1] "mean_by_dose_of_0.5= 10.605 , ci_min= 8.633 , ci_max= 12.577 , dif_ci= 3.944"
## [1] "mean_by_dose_of_1.0= 19.735 , ci_min= 17.8 , ci_max= 21.67 , dif_ci= 3.87"
## [1] "mean_by_dose_of_2.0= 26.1 , ci_min= 24.446 , ci_max= 27.754 , dif_ci= 3.308"
# Let's consider dose and supp at the same time. Let's suppose len (Tooth Length) is normally distributed. So, no matter the sample size, the sample mean is normally distributed as well. Let's also assume that the 60 observations are independent. Now, let's also
# suppose that the sample has been divided randomly into two groups of 30 individuals, each receiving one of the supplements (OJ or VC). We can consider the two groups to be samples from two different populations, and let's suppose these two populations have the same variance.
# Therefore, we have two samples of 30 observations each, drawn from two populations in which the len variable is normally distributed with the same variance. Now we want to evaluate whether the difference in mean len between the two groups is just due to chance or depends on the supplement and dose. So, the null hypothesis we want to test is that the difference between the two means is zero (0).
# Let's run this test first with all doses together and then by dose. In every case we use a 95% confidence level, so that we have only a 5% chance of making a Type I error.
# Two-sided, unpaired two-sample t-test of H0: mean len (OJ) - mean len (VC)
# is 0, with all doses pooled together. `var.equal = TRUE` uses the pooled
# variance estimate (equal-variance assumption); the confidence level is 95%.
t.test(
  ToothGrowth[ToothGrowth$supp == "OJ", ]$len,
  y = ToothGrowth[ToothGrowth$supp == "VC", ]$len,
  alternative = "two.sided",
  mu = 0,
  paired = FALSE,
  var.equal = TRUE,
  conf.level = 0.95
)
##
## Two Sample t-test
##
## data: ToothGrowth[ToothGrowth$supp == "OJ", ]$len and ToothGrowth[ToothGrowth$supp == "VC", ]$len
## t = 1.9153, df = 58, p-value = 0.06039
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.1670064 7.5670064
## sample estimates:
## mean of x mean of y
## 20.66333 16.96333
# Now, let's do the same by dose.
# P-value of the two-sided, unpaired, pooled-variance t-test comparing len
# between the OJ and VC groups, restricted to one dose level.
#   dose_level: dose to keep (0.5, 1 or 2 in ToothGrowth).
# Returns the p-value as a single number. The settings mirror the all-doses
# test (var.equal = TRUE, 95% confidence level).
supp_p_value_at_dose <- function(dose_level) {
  at_dose <- ToothGrowth[ToothGrowth$dose == dose_level, ]
  t.test(
    at_dose[at_dose$supp == "OJ", ]$len,
    y = at_dose[at_dose$supp == "VC", ]$len,
    alternative = "two.sided",
    mu = 0,
    paired = FALSE,
    var.equal = TRUE,
    conf.level = 0.95
  )$p.value
}

# These must exist before the paste() call below; previously they were never
# assigned, so the script stopped with "object 'pv_0.5' not found".
pv_0.5 <- supp_p_value_at_dose(0.5)
pv_1.0 <- supp_p_value_at_dose(1.0)
pv_2.0 <- supp_p_value_at_dose(2.0)

# P-values.
paste("pv_0.5=",round(as.numeric(pv_0.5),6),",","pv_1.0=",round(as.numeric(pv_1.0),6),",","pv_2.0=",round(as.numeric(pv_2.0),5))
## [1] "pv_0.5= 0.005304 , pv_1.0= 0.000781 , pv_2.0= 0.96371"