Load Libraries

# Load libraries
library(datasets)
library(ggplot2)
library(pastecs)
library(dplyr)

Overview:

Basic inferential data analysis of the ToothGrowth data in the R datasets package. ToothGrowth is a data set that records the effect of Vitamin C on tooth growth in guinea pigs.

1. Load the ToothGrowth data and perform some basic exploratory data analysis.

Now in the second portion of the project, we’re going to analyze the ToothGrowth data in the R datasets package.

# Load the ToothGrowth dataset from the datasets package and inspect its layout.
data("ToothGrowth")
str(object = ToothGrowth)
## 'data.frame':    60 obs. of  3 variables:
##  $ len : num  4.2 11.5 7.3 5.8 6.4 10 11.2 11.2 5.2 7 ...
##  $ supp: Factor w/ 2 levels "OJ","VC": 2 2 2 2 2 2 2 2 2 2 ...
##  $ dose: num  0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 ...
# Preview the first rows of the data frame.
head(x = ToothGrowth)
##    len supp dose
## 1  4.2   VC  0.5
## 2 11.5   VC  0.5
## 3  7.3   VC  0.5
## 4  5.8   VC  0.5
## 5  6.4   VC  0.5
## 6 10.0   VC  0.5
# Histogram of tooth length over all observations (both supplements, all doses).
hist(
  x = ToothGrowth$len,
  breaks = 20,
  col = "red",
  main = "Tooth Growth Length in Guinea Pigs",
  xlab = "Tooth Length",
  ylab = "Frequency"
)

2. Provide a basic summary of the data.

Below is a summary of data statistics for the ToothGrowth data.

The chart shown after the summary tables was provided as part of the documentation for the ToothGrowth dataset.

# Descriptive statistics from pastecs; the factor column supp is reported as NA.
stat.desc(ToothGrowth)
##                       len supp        dose
## nbr.val        60.0000000   NA 60.00000000
## nbr.null        0.0000000   NA  0.00000000
## nbr.na          0.0000000   NA  0.00000000
## min             4.2000000   NA  0.50000000
## max            33.9000000   NA  2.00000000
## range          29.7000000   NA  1.50000000
## sum          1128.8000000   NA 70.00000000
## median         19.2500000   NA  1.00000000
## mean           18.8133333   NA  1.16666667
## SE.mean         0.9875223   NA  0.08118705
## CI.mean.0.95    1.9760276   NA  0.16245491
## var            58.5120226   NA  0.39548023
## std.dev         7.6493152   NA  0.62887219
## coef.var        0.4065901   NA  0.53903330
# Base R five-number summary per column, with level counts for the factor.
summary(object = ToothGrowth)
##       len        supp         dose      
##  Min.   : 4.20   OJ:30   Min.   :0.500  
##  1st Qu.:13.07   VC:30   1st Qu.:0.500  
##  Median :19.25           Median :1.000  
##  Mean   :18.81           Mean   :1.167  
##  3rd Qu.:25.27           3rd Qu.:2.000  
##  Max.   :33.90           Max.   :2.000
###
# Conditioning plot taken from the ToothGrowth data summary document:
# tooth length against dose, one panel per supplement, with a smoothed trend.
coplot(
  formula = len ~ dose | supp,
  data = ToothGrowth,
  panel = panel.smooth,
  xlab = "ToothGrowth data: length vs dose, given type of supplement"
)

## toothgDW<-dcast(ToothGrowth,dose+len~supp,value.var = "len")

3. Use confidence intervals and/or hypothesis tests to compare tooth growth by supp and dose.

A t confidence interval was used to analyze the difference between the two groups of guinea pigs supplied with Vitamin C and orange juice, respectively.

library(ggplot2)
library(dplyr)
# Number the observations 1..n within each supplement group. ave() derives the
# index from the data itself instead of hard-coding 30 rows per group and
# relying on silent vector recycling.
ToothGrowth <- mutate(
  ToothGrowth,
  groupID = ave(seq_along(supp), supp, FUN = seq_along)
)
ToothGrowth$groupID <- as.factor(ToothGrowth$groupID)
# Treat dose as categorical so each dose level gets its own box and colour.
ToothGrowth$dose <- as.factor(ToothGrowth$dose)

# Tooth length by supplement. Lines join the observations that share the same
# within-group index; this is a visual aid only and does not imply that the
# measurements are paired.
# NOTE(review): ggplot2 >= 3.4.0 prefers `linewidth` over `size` for line
# geoms; `size` is kept here so the code still runs on older versions.
g <- ggplot(ToothGrowth, aes(x = supp, y = len, group = groupID)) +
  geom_line(size = 1, aes(colour = groupID)) +
  geom_point(size = 10, pch = 21, fill = "blue", alpha = 0.5) +
  ggtitle("Tooth Growth Length based on supplement")
g

# Tooth length by dose, one facet per supplement.
g <- ggplot(ToothGrowth, aes(x = dose, y = len, group = dose)) +
  geom_boxplot(size = 1, aes(colour = dose)) +
  facet_grid(. ~ supp) +
  ggtitle("Tooth Growth Length based on dosage")
g

Calculating the t confidence interval manually (using the pooled-variance formula) and with the R function t.test.

# Split tooth length into the two supplement groups.
g1 <- ToothGrowth[ToothGrowth$supp == "VC", ]$len
g2 <- ToothGrowth[ToothGrowth$supp == "OJ", ]$len

# 95% t confidence interval for the difference in means (OJ - VC) of two
# independent groups, assuming equal variances.
n1 <- length(g1)
n2 <- length(g2)

# The pooled variance estimate has n1 + n2 - 2 degrees of freedom; the same
# value must be used for the t quantile, otherwise the manual interval does
# not match the one reported by t.test().
df_pooled <- n1 + n2 - 2

# Pooled standard deviation.
sp <- sqrt(((n1 - 1) * var(g1) + (n2 - 1) * var(g2)) / df_pooled)

# Difference in means and its standard error.
md <- mean(g2) - mean(g1)
semd <- sp * sqrt(1 / n1 + 1 / n2)

ci_manual <- md + c(-1, 1) * qt(0.975, df_pooled) * semd
# Use the full element name; `$conf` only worked through partial matching.
ci_ttest <- t.test(g2, g1, paired = FALSE, var.equal = TRUE)$conf.int

# Row 1: manual calculation, row 2: t.test().
rbind(ci_manual, ci_ttest, deparse.level = 0)
##            [,1]     [,2]
## [1,] -0.1670064 7.567006
## [2,] -0.1670064 7.567006

Further analysis to identify variations based on a specific pair of doses.

# One subset per pair of dose levels.
TG01 <- subset(ToothGrowth, dose %in% c(0.5, 1))
TG02 <- subset(ToothGrowth, dose %in% c(0.5, 2))
TG12 <- subset(ToothGrowth, dose %in% c(1, 2))

# 95% equal-variance t intervals for the difference in mean tooth length
# between the two dose levels of each subset. With the formula interface the
# difference is (first level - second level), i.e. lower dose minus higher
# dose, so a negative interval means longer teeth at the higher dose.
# `$conf.int` is spelled out; `$conf` only worked through partial matching.
# Rows: 0.5 vs 1, 0.5 vs 2, 1 vs 2.
dose_ci <- rbind(
  t.test(len ~ dose, paired = FALSE, var.equal = TRUE, data = TG01)$conf.int,
  t.test(len ~ dose, paired = FALSE, var.equal = TRUE, data = TG02)$conf.int,
  t.test(len ~ dose, paired = FALSE, var.equal = TRUE, data = TG12)$conf.int
)
dose_ci
##            [,1]       [,2]
## [1,] -11.983748  -6.276252
## [2,] -18.153519 -12.836481
## [3,]  -8.994387  -3.735613

4. Assumptions and Conclusions on ToothGrowth Dataset

The sample population is considered to be independent and randomly selected. A total of 60 guinea pigs were independently selected and split into two main groups based on supplement type, OJ or VC. Each guinea pig was administered only one specific dose (0.5, 1, or 2 mg) of either OJ or VC.

The confidence interval computed manually from the sample is very similar to the one obtained using the R function t.test. In conclusion, the difference in supplement type does not seem to have much impact: the 95% confidence interval for the difference in mean tooth length (OJ minus VC) contains zero. In contrast, an increase in dosage seems to have a direct impact on tooth growth, since the confidence interval for every pair of doses lies entirely below zero. A new analysis with a larger number of observations would be recommended (bring more guinea pigs).