This document is the final report of the Peer Assessment project - Part 2 from Coursera’s course Statistical Inference, as part of the Specialization in Data Science. It was built up in RStudio, using its knitr functions, meant to be published in pdf format.
Here, we perform some exploratory data analysis and use statistical inference methods in the ToothGrowth dataset already present in R.
The response variable in this analysis is the length of odontoblasts (cells responsible for tooth growth) in 60 guinea pigs.
Each animal received one of three dose levels of vitamin C (0.5, 1, and 2 mg/day) by one of two delivery methods: orange juice (OJ) or ascorbic acid (a form of vitamin C) and coded as VC.
The complete dataset is a data frame with 60 observations on 4 variables:
[,1] gpig integer Guinea Pig
[,2] len numeric Tooth length
[,3] supp factor Supplement type (VC or OJ)
[,4] dose numeric Dose (in milligrams/day)
We first load the libraries necessary for the analysis.
rm(list=ls()) # free up memory for the download of the data sets
library(knitr)
library(ggplot2)
library(gridExtra)
library(dplyr)
library(datasets)
The next step is to load the dataset.
data(ToothGrowth)
str(ToothGrowth)
## 'data.frame': 60 obs. of 4 variables:
## $ gpig: int 1 2 3 4 5 6 7 8 9 10 ...
## $ len : num 4.2 11.5 7.3 5.8 6.4 10 11.2 11.2 5.2 7 ...
## $ supp: Factor w/ 2 levels "OJ","VC": 2 2 2 2 2 2 2 2 2 2 ...
## $ dose: num 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 ...
head(ToothGrowth, n=3)
## gpig len supp dose
## 1 1 4.2 VC 0.5
## 2 2 11.5 VC 0.5
## 3 3 7.3 VC 0.5
The next step is to load the dataset.
summary(ToothGrowth)
## gpig len supp dose
## Min. : 1.00 Min. : 4.20 OJ:30 Min. :0.500
## 1st Qu.:15.75 1st Qu.:13.07 VC:30 1st Qu.:0.500
## Median :30.50 Median :19.25 Median :1.000
## Mean :30.50 Mean :18.81 Mean :1.167
## 3rd Qu.:45.25 3rd Qu.:25.27 3rd Qu.:2.000
## Max. :60.00 Max. :33.90 Max. :2.000
Some graphical analysis can be found below:
gr01 <- ggplot(ToothGrowth, aes(x = supp, y = len, fill = supp)) +
geom_boxplot() +
ggtitle(expression(atop("Tooth Growth",
atop(italic("by Supplement Type"),"")))) +
xlab("Supplement Type") + ylab("Tooth Lenght")
gr02 <- ggplot(ToothGrowth, aes(x = factor(dose), y = len, fill = factor(dose))) +
geom_boxplot() +
ggtitle(expression(atop("Tooth Growth",
atop(italic("by Vitamin C Dose"),"")))) +
xlab("Vitamin C Dose (mg/day)") + ylab("Tooth Lenght")
grid.arrange(gr01, gr02, ncol = 2)
The graphs above show possible differences in the average of tooth growth lenght and drive us to further analyse the impact of Vitamin C on both Supplement Type and Vitamin C Dose.
subSuppOJ <- subset(ToothGrowth, supp == "OJ")
subSuppVC <- subset(ToothGrowth, supp == "VC")
subDose005 <- subset(ToothGrowth, dose == 0.5)
subDose010 <- subset(ToothGrowth, dose == 1.0)
subDose020 <- subset(ToothGrowth, dose == 2.0)
par(mfrow=c(1, 2))
qqnorm(subSuppOJ$len, main = "supp = OJ")
qqline(subSuppOJ$len, col = "red", lwd = 1)
qqnorm(subSuppVC$len, main = "supp = VC")
qqline(subSuppVC$len, col = "red", lwd = 1)
ks.test(subSuppOJ$len, "pnorm", mean = mean(subSuppOJ$len),
sd = sd(subSuppOJ$len))
##
## One-sample Kolmogorov-Smirnov test
##
## data: subSuppOJ$len
## D = 0.1382, p-value = 0.6152
## alternative hypothesis: two-sided
ks.test(subSuppVC$len, "pnorm", mean = mean(subSuppVC$len),
sd = sd(subSuppVC$len))
##
## One-sample Kolmogorov-Smirnov test
##
## data: subSuppVC$len
## D = 0.0838, p-value = 0.9845
## alternative hypothesis: two-sided
par(mfrow=c(1, 3))
qqnorm(subDose005$len, main = "dose = 0.5")
qqline(subDose005$len, col = "red", lwd = 1)
qqnorm(subDose010$len, main = "dose = 1.0")
qqline(subDose010$len, col = "red", lwd = 1)
qqnorm(subDose020$len, main = "dose = 2.0")
qqline(subDose020$len, col = "red", lwd = 1)
ks.test(subDose005$len, "pnorm", mean = mean(subDose005$len),
sd = sd(subDose005$len))
##
## One-sample Kolmogorov-Smirnov test
##
## data: subDose005$len
## D = 0.1712, p-value = 0.6011
## alternative hypothesis: two-sided
ks.test(subDose010$len, "pnorm", mean = mean(subDose010$len),
sd = sd(subDose010$len))
##
## One-sample Kolmogorov-Smirnov test
##
## data: subDose010$len
## D = 0.1593, p-value = 0.6901
## alternative hypothesis: two-sided
ks.test(subDose020$len, "pnorm", mean = mean(subDose020$len),
sd = sd(subDose020$len))
##
## One-sample Kolmogorov-Smirnov test
##
## data: subDose020$len
## D = 0.1368, p-value = 0.848
## alternative hypothesis: two-sided
For all cases, on both variables supp and dose, with such high values of p-values, there is no evidence that the data are not normal.
In that case, we can proceed with the t tests.
dataDose <- as.data.frame(rbind(
c(mean(subDose005$len), t.test(subDose005$len)$conf),
c(mean(subDose010$len), t.test(subDose010$len)$conf),
c(mean(subDose020$len), t.test(subDose020$len)$conf)))
dataDose <- cbind(as.factor(c(0.5,1.0,2.0)),dataDose)
names(dataDose) <- c("DoseCode","meanX","CImin","CImax")
dataSupp <- as.data.frame(rbind(
c(mean(subSuppOJ$len), t.test(subSuppOJ$len)$conf),
c(mean(subSuppVC$len), t.test(subSuppVC$len)$conf)))
dataSupp <- cbind(as.factor(c("OJ","VC")),dataSupp)
names(dataSupp) <- c("DoseCode","meanX","CImin","CImax")
gr03 <- ggplot(dataDose, aes(x=DoseCode, y=meanX,
colour=DoseCode,group=DoseCode)) +
geom_errorbar(aes(ymin=CImin, ymax=CImax),colour="black", width=.1) +
geom_point(size=4) +
ggtitle("CI for dose") +
xlab("Vitamin C Dose (mg/day)") + ylab("Tooth Lenght")
gr04 <- ggplot(dataSupp, aes(x=DoseCode, y=meanX,
colour=DoseCode,group=DoseCode)) +
geom_errorbar(aes(ymin=CImin, ymax=CImax),colour="black", width=.1) +
geom_point(size=4) +
ggtitle("CI for supp") +
xlab("Supplement Type") + ylab("Tooth Lenght")
grid.arrange(gr03,gr04,ncol=2)
The confidence intervals for Vitamin C Dose are visually distant from each other, which indicates higher probability of differences in the averages.
We cannot say the same about the Supplement Type, where CI’s are overlapping.
Checking for differences in variance:
var.test(subSuppOJ$len, subSuppVC$len)
##
## F test to compare two variances
##
## data: subSuppOJ$len and subSuppVC$len
## F = 0.6386, num df = 29, denom df = 29, p-value = 0.2331
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
## 0.3039488 1.3416857
## sample estimates:
## ratio of variances
## 0.6385951
The p-value is high, so we assume equal variances for the t test:
t.test(subSuppOJ$len, subSuppVC$len, paired = FALSE, var.equal = TRUE)
##
## Two Sample t-test
##
## data: subSuppOJ$len and subSuppVC$len
## t = 1.9153, df = 58, p-value = 0.06039
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.1670064 7.5670064
## sample estimates:
## mean of x mean of y
## 20.66333 16.96333
The p-value of the test was too close to the significance level \(\alpha = 0.05\) we assumed for the test.
In that case, I would not reject \(H_0\) and assume there is no effect of the Supplement Type (OJ or VC) on the tooth growth lengt of those samples.
Checking for differences in variance (run in pairs of sets per level):
var.test(subDose005$len, subDose010$len)
##
## F test to compare two variances
##
## data: subDose005$len and subDose010$len
## F = 1.0386, num df = 19, denom df = 19, p-value = 0.9351
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
## 0.4110751 2.6238736
## sample estimates:
## ratio of variances
## 1.038561
var.test(subDose005$len, subDose020$len)
##
## F test to compare two variances
##
## data: subDose005$len and subDose020$len
## F = 1.4215, num df = 19, denom df = 19, p-value = 0.4505
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
## 0.5626393 3.5913009
## sample estimates:
## ratio of variances
## 1.421481
var.test(subDose010$len, subDose020$len)
##
## F test to compare two variances
##
## data: subDose010$len and subDose020$len
## F = 1.3687, num df = 19, denom df = 19, p-value = 0.5005
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
## 0.5417489 3.4579584
## sample estimates:
## ratio of variances
## 1.368702
Once again, all p-values are high and, in that case, there is no evidence of differences in the variances. We will assume equal variances for the t tests.
The t tests will be run in pairs of sets per level:
t.test(subDose005$len, subDose010$len, paired = FALSE, var.equal = TRUE)
##
## Two Sample t-test
##
## data: subDose005$len and subDose010$len
## t = -6.4766, df = 38, p-value = 1.266e-07
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -11.983748 -6.276252
## sample estimates:
## mean of x mean of y
## 10.605 19.735
t.test(subDose005$len, subDose020$len, paired = FALSE, var.equal = TRUE)
##
## Two Sample t-test
##
## data: subDose005$len and subDose020$len
## t = -11.799, df = 38, p-value = 2.838e-14
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -18.15352 -12.83648
## sample estimates:
## mean of x mean of y
## 10.605 26.100
t.test(subDose010$len, subDose020$len, paired = FALSE, var.equal = TRUE)
##
## Two Sample t-test
##
## data: subDose010$len and subDose020$len
## t = -4.9005, df = 38, p-value = 1.811e-05
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -8.994387 -3.735613
## sample estimates:
## mean of x mean of y
## 19.735 26.100
Here, we can observe very low p-values for the t tests, and we can assume significant differences in the averages of the samples, meaning that we have high level of confidence to assume that the averages increase when the dose of Vitamin C also increase.
As general conclusions: