Data Processing

1.Loading the data and basic exploratory examination

library(ggplot2)
library(knitr)
library(kableExtra)
# Load the built-in ToothGrowth data set and show its structure
data("ToothGrowth")
str(ToothGrowth)

## 'data.frame':    60 obs. of  3 variables:
##  $ len : num  4.2 11.5 7.3 5.8 6.4 10 11.2 11.2 5.2 7 ...
##  $ supp: Factor w/ 2 levels "OJ","VC": 2 2 2 2 2 2 2 2 2 2 ...
##  $ dose: num  0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 ...

The dose column needs to be converted from numeric to a factor, so that each dose level is treated as a category.

# Convert dose (the third column) to a factor and re-check the structure
ToothGrowth[["dose"]] <- as.factor(ToothGrowth[["dose"]])
str(ToothGrowth)

## 'data.frame':    60 obs. of  3 variables:
##  $ len : num  4.2 11.5 7.3 5.8 6.4 10 11.2 11.2 5.2 7 ...
##  $ supp: Factor w/ 2 levels "OJ","VC": 2 2 2 2 2 2 2 2 2 2 ...
##  $ dose: Factor w/ 3 levels "0.5","1","2": 1 1 1 1 1 1 1 1 1 1 ...

Visualize the ToothGrowth dataset with a scatter plot and a boxplot.

# Mean tooth length for every dose level / delivery method combination
average <- aggregate(len ~ dose + supp, data = ToothGrowth, FUN = mean)

# Scatter plot of tooth length, plus one line per supplement that joins the
# mean tooth length at each dose level
scat.plot <- ggplot(data = ToothGrowth, mapping = aes(x = dose, y = len)) +
  geom_point(mapping = aes(color = supp, shape = supp), size = 2) +
  # as.numeric(dose) turns the dose values of `average` into numbers for the
  # x position of the mean lines
  geom_line(
    mapping = aes(x = as.numeric(dose), color = supp),
    data = average,
    size = 1
  ) +
  scale_color_manual(
    name = "Supplement Type",
    values = c("royalblue", "red3"),
    breaks = c("OJ", "VC"),
    labels = c("Orange Juice", "Ascorbic Acid")
  ) +
  scale_shape_manual(
    name = "Supplement Type",
    values = c(16, 17),
    breaks = c("OJ", "VC"),
    labels = c("Orange Juice", "Ascorbic Acid")
  )

# Boxplot of tooth length by supplement, one panel per dose level
box.plot <- ggplot(data = ToothGrowth, mapping = aes(x = supp, y = len)) +
  geom_boxplot(mapping = aes(fill = supp), color = "gray20", size = 0.5) +
  facet_wrap(~ dose) +
  scale_fill_manual(
    name = "Supplement Type",
    values = c("royalblue", "firebrick3"),
    breaks = c("OJ", "VC"),
    labels = c("Orange Juice", "Ascorbic Acid")
  )

# Render both figures
scat.plot

box.plot

Boxplot to visualize the response of tooth length to the supplement method only.

# Tooth length by supplement type, pooled over all dose levels
ggplot(data = ToothGrowth, mapping = aes(x = supp, y = len)) +
  geom_boxplot(mapping = aes(fill = supp), color = "gray20", size = 0.5) +
  scale_fill_manual(
    name = "Supplement Type",
    values = c("royalblue", "firebrick3"),
    breaks = c("OJ", "VC"),
    labels = c("Orange Juice", "Ascorbic Acid")
  )

Boxplot to visualize the response of tooth length to the dose level only.

# Tooth length by dose level, pooled over both supplement types
ggplot(data = ToothGrowth, mapping = aes(x = dose, y = len)) +
  geom_boxplot(mapping = aes(fill = dose), color = "gray20", size = 0.5) +
  scale_fill_manual(
    name = "Dose Levels",
    values = c("lemonchiffon", "yellow1", "goldenrod")
  )

From these overview plots, there appear to be differences between the delivery methods at the 0.5 mg and 1.0 mg dose levels, but no difference when the dose level reaches 2.0 mg. Moreover, tooth length increases as the dose level increases. Hence, we will examine two aspects: whether the supplement type and/or the dose level has any impact on tooth growth.

2.Data summary

Look at the layout of the experiment, in particular the number of pigs that were assigned to each delivery method and each dose level.

# Build the layout table: number of animals per supplement type and dose level.
# Counting with table() works whether or not `dose` is already a factor,
# whereas summary() only yields counts when applied to a factor.
overal.sum <- unclass(table(ToothGrowth$supp, ToothGrowth$dose))
# Totals per delivery method (extra column) and per dose level (extra row)
overal.sum <- cbind(overal.sum, rowSums(overal.sum))
overal.sum <- rbind(overal.sum, colSums(overal.sum))
# Remove the row names so that kable() does not add a row-name column;
# the row labels are supplied explicitly below
rownames(overal.sum) <- NULL
# Prepend the label columns; the rows follow the level order of supp (OJ, VC)
overal.sum <- cbind(
  c(rep("Supplement Type", 2), " "),
  c("Orange Juice", "Ascorbic Acid", "Total"),
  overal.sum
)

# Present the table
kable(
  overal.sum,
  format = "latex",
  align = "c",
  col.names = c(" ", " ", "0.5", "1", "2", "Total")
) %>%
  kable_styling(bootstrap_options = "bordered") %>%
  column_spec(1, bold = TRUE) %>%
  collapse_rows(columns = 1:2) %>%
  add_header_above(c(" " = 1, " " = 1, "Dose Levels" = 3, " " = 1))

Then, look at the statistics information for the tooth length:

In general:

# One-row table holding the summary statistics of tooth length
kable(t(as.matrix(summary(ToothGrowth$len))))

Min.	1st Qu.	Median	Mean	3rd Qu.	Max.
4.2	13.075	19.25	18.81333	25.275	33.9

By each combination

# Summary statistics of tooth length for each dose / supplement combination,
# converted to a character matrix so the supplement codes can be relabelled
detail.sum <- as.matrix(
  aggregate(len ~ dose + supp, data = ToothGrowth, FUN = summary)
)
# Replace the abbreviations in the supplement column with their full names
detail.sum[detail.sum[, "supp"] == "OJ", "supp"] <- "Orange Juice"
detail.sum[detail.sum[, "supp"] == "VC", "supp"] <- "Ascorbic Acid"

# Draw the table: "bordered" is passed as the bootstrap option and "striped"
# as the LaTeX option, exactly as positional matching assigned them before
table <- kable(
  detail.sum,
  format = "latex",
  align = "c",
  col.names = c(
    " ", " ", "Minimum", "1st Quantile",
    "Median", "Mean", "3rd Quantile", "Maximum"
  )
) %>%
  kable_styling(bootstrap_options = "bordered", latex_options = "striped") %>%
  collapse_rows(columns = 1:2) %>%
  add_header_above(
    c("Dose Levels" = 1, "Supplement Type" = 1, "Tooth Length" = 6)
  )

table

3.Hypothesis test

a. By supplement

H₀ : Tooth length is not affected by delivery method, i.e. the true difference in means is equal to 0

# Welch two-sample t-test of tooth length between the two supplement types.
# `paired` is not passed: an unpaired test is the default, and newer R
# releases (4.4.0+) reject `paired` in the formula interface.
t.test(len ~ supp, var.equal = FALSE, data = ToothGrowth)

## 
##  Welch Two Sample t-test
## 
## data:  len by supp
## t = 1.9153, df = 55.309, p-value = 0.06063
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.1710156  7.5710156
## sample estimates:
## mean in group OJ mean in group VC 
##         20.66333         16.96333

According to the t-test, since the 95% confidence interval for the difference in means contains 0 and the p-value (0.06063) is greater than 0.05, there is NOT sufficient evidence to support the alternative hypothesis. We fail to reject the null hypothesis at the 5% level of significance.

b. By dose

We will run a t-test for each pair of dose levels, with the null hypothesis:

H₀ : Tooth length is not affected by dose

# Prepare one subset per pair of dose levels to compare.
# The levels are matched by their labels, so the selection does not rely on
# implicit number-to-text coercion when dose is a factor.
level1 <- subset(ToothGrowth, dose %in% c("0.5", "1"))
level2 <- subset(ToothGrowth, dose %in% c("0.5", "2"))
level3 <- subset(ToothGrowth, dose %in% c("1", "2"))

# Welch two-sample t-test: dose 0.5 vs dose 1.
# `paired` is not passed: an unpaired test is the default, and newer R
# releases (4.4.0+) reject `paired` in the formula interface.
t.test(len ~ dose, var.equal = FALSE, data = level1)

## 
##  Welch Two Sample t-test
## 
## data:  len by dose
## t = -6.4766, df = 37.986, p-value = 1.268e-07
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -11.983781  -6.276219
## sample estimates:
## mean in group 0.5   mean in group 1 
##            10.605            19.735

# Welch two-sample t-test: dose 0.5 vs dose 2.
# `paired` is not passed: an unpaired test is the default, and newer R
# releases (4.4.0+) reject `paired` in the formula interface.
t.test(len ~ dose, var.equal = FALSE, data = level2)

## 
##  Welch Two Sample t-test
## 
## data:  len by dose
## t = -11.799, df = 36.883, p-value = 4.398e-14
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -18.15617 -12.83383
## sample estimates:
## mean in group 0.5   mean in group 2 
##            10.605            26.100

# Welch two-sample t-test: dose 1 vs dose 2.
# `paired` is not passed: an unpaired test is the default, and newer R
# releases (4.4.0+) reject `paired` in the formula interface.
t.test(len ~ dose, var.equal = FALSE, data = level3)

## 
##  Welch Two Sample t-test
## 
## data:  len by dose
## t = -4.9005, df = 37.101, p-value = 1.906e-05
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -8.996481 -3.733519
## sample estimates:
## mean in group 1 mean in group 2 
##          19.735          26.100

According to the t-test results, since the p-value of each test is very close to 0 and none of the 3 confidence intervals contains 0, there is sufficient evidence to reject the null hypothesis for every pair of dose levels.

4.Conclusion

Assumptions

These t-tests were conducted under the following assumptions:

The experiment was randomized, meaning that pigs were randomly assigned to the different dose levels and supplement types.
Since the sample size is relatively large (n = 60), we assume that the sample is representative of the population.
The variances are assumed to be different for the 2 groups being compared.

Conclusion

The delivery method has no statistically significant impact on tooth growth (at the 5% level of significance).
The dose has an effect on tooth growth. To be more specific, the mean tooth length increases as the dose level increases.

Statistics Inference: Basic inferential data analysis

Zoey Le

March 3, 2018

Overview

Data introduction