About the Project.

Part 1: Simulation Exercise Instructions

In this project you will investigate the exponential distribution in R and compare it with the Central Limit Theorem. The exponential distribution can be simulated in R with rexp(n, lambda) where lambda is the rate parameter. The mean of exponential distribution is 1/lambda and the standard deviation is also 1/lambda. Set lambda = 0.2 for all of the simulations. You will investigate the distribution of averages of 40 exponentials. Note that you will need to do a thousand simulations.

Part 2: Basic Inferential Data Analysis Instructions

Now in the second portion of the project, we’re going to analyze the ToothGrowth data in the R datasets package.

Part 1:

Loading the library

library(ggplot2)

Visualizing the data

nosim <- 1000;
        n <- 40;
        lambda <- 0.2;
        ## create a matrix with 1000 simulations each with 40 rexp generated 
        ## values
        set.seed(234)
        simdata <- matrix(rexp(nosim * n, rate=lambda), nosim);
        ## for each of the 1000 simulations calculate the average of across the 
        ## 40 values
        simdata_means <- apply(simdata, 1, mean);
        hist(simdata_means, col="gray",breaks=30,border="red",main="Histogram of simulation data (means)",xlab="Simulation data(means)");

1. Showing where the distribution is centered at and comparing it to the theoretical center of the distribution.

 theoretical_mean <- 1/lambda;
        print (paste("Theoretical center of the distribution = ", 
                     theoretical_mean));

## [1] "Theoretical center of the distribution =  5"

 print (paste("Actual center of the distribution based on the simulations 
                     = ", round(mean(simdata_means), 3)));

## [1] "Actual center of the distribution based on the simulations \n                     =  5.002"

2. Showing how variable it is and comparing it to the theoretical variance of the distribution.

theoretical_var <- (1/lambda)^2/n;
        theoretical_sd <- 1/lambda/sqrt(n);
        print (paste("Theoretical variance = ", theoretical_var));

## [1] "Theoretical variance =  0.625"

print (paste("Actual variance based on the simulations = ", 
                     round(var(simdata_means), 3)));

## [1] "Actual variance based on the simulations =  0.663"

print (paste("Theoretical standard deviation = ", 
                     round(theoretical_sd, 3)));

## [1] "Theoretical standard deviation =  0.791"

  print (paste("Actual standard deviation based on the simulations = ", 
                     round(sd(simdata_means), 3)));

## [1] "Actual standard deviation based on the simulations =  0.814"

3. Show that the distribution is approximately normal.

plotdata <- data.frame(simdata_means);
        m <- ggplot(plotdata, aes(x = simdata_means)); 
        m <- m + geom_histogram(aes(y=..density..), colour="red", 
                                fill = "magenta")
        m + geom_density(colour="black", size=2);

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

4. Evaluate the coverage of the confidence interval for 1/lambda: X????1.96S/???n.

theoretical_confidence_interval <- theoretical_mean + 
                                        c(-1,1)*1.96*theoretical_sd/n;
        actual_confidence_interval <- mean(simdata_means) + 
                                        c(-1,1)*1.96*sd(simdata_means)/sqrt(n);
        print(paste("Theoretical coverage of the confidence interval for 
                    +/- 1.96sd = ", round(theoretical_confidence_interval, 3)));

## [1] "Theoretical coverage of the confidence interval for \n                    +/- 1.96sd =  4.961"
## [2] "Theoretical coverage of the confidence interval for \n                    +/- 1.96sd =  5.039"

print(paste("Actual coverage of the confidence interval for 
                    +/- 1.96sd = ", round(actual_confidence_interval, 3)));

## [1] "Actual coverage of the confidence interval for \n                    +/- 1.96sd =  4.749"
## [2] "Actual coverage of the confidence interval for \n                    +/- 1.96sd =  5.254"

Part 2

Load the data

data(ToothGrowth)
str(ToothGrowth)

## 'data.frame':    60 obs. of  3 variables:
##  $ len : num  4.2 11.5 7.3 5.8 6.4 10 11.2 11.2 5.2 7 ...
##  $ supp: Factor w/ 2 levels "OJ","VC": 2 2 2 2 2 2 2 2 2 2 ...
##  $ dose: num  0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 ...

Looking at the data

head(ToothGrowth)

##    len supp dose
## 1  4.2   VC  0.5
## 2 11.5   VC  0.5
## 3  7.3   VC  0.5
## 4  5.8   VC  0.5
## 5  6.4   VC  0.5
## 6 10.0   VC  0.5

Summary of the data

summary(ToothGrowth)

##       len        supp         dose      
##  Min.   : 4.20   OJ:30   Min.   :0.500  
##  1st Qu.:13.07   VC:30   1st Qu.:0.500  
##  Median :19.25           Median :1.000  
##  Mean   :18.81           Mean   :1.167  
##  3rd Qu.:25.27           3rd Qu.:2.000  
##  Max.   :33.90           Max.   :2.000

Correcting the variables

ToothGrowth$dose<-as.factor(ToothGrowth$dose)

Exploratory data analysis

MeanSupp = split(ToothGrowth$len, ToothGrowth$supp)
sapply(MeanSupp, mean)

##       OJ       VC 
## 20.66333 16.96333

graph

ggplot(aes(x=supp, y=len), data=ToothGrowth) + geom_boxplot(aes(fill=supp),col="blue")+ 
        xlab("Supplement type") +ylab("Tooth length")

##Effect of Vitamin c on tooth length

MeanDose = split(ToothGrowth$len, ToothGrowth$dose)
sapply(MeanDose, mean)

##    0.5      1      2 
## 10.605 19.735 26.100

ggplot(aes(x=dose, y=len), data=ToothGrowth) + geom_boxplot(aes(fill=dose),col="blue") + 
        xlab("Dose in miligrams") +ylab("Tooth length")

##Inferences Do the tooth length of the guinea pigs depends on delivery methods? A t test for the difference will be made to test this claim

len<-ToothGrowth$len
supp<-ToothGrowth$supp
dose<-ToothGrowth$dose
sapply(MeanSupp, var)

##       OJ       VC 
## 43.63344 68.32723

t.test(len[supp=="OJ"], len[supp=="VC"], paired = FALSE, var.equal = FALSE)

## 
##  Welch Two Sample t-test
## 
## data:  len[supp == "OJ"] and len[supp == "VC"]
## t = 1.9153, df = 55.309, p-value = 0.06063
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.1710156  7.5710156
## sample estimates:
## mean of x mean of y 
##  20.66333  16.96333

The p-value of this test was 0.06, which is very close to the significance level of 5%. It could be interpreted as a lack of enough evidence to reject the null hypothesis, however it is paramount to account that the 0.05 value of significance is only a convenience value. Furthermore, the confidence interval of the test contains zero (0)

Now we will test the tooth length of the group with vitamin C dosage

t.test(len[dose==2], len[dose==1], paired = FALSE, var.equal = TRUE)

## 
##  Two Sample t-test
## 
## data:  len[dose == 2] and len[dose == 1]
## t = 4.9005, df = 38, p-value = 1.811e-05
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  3.735613 8.994387
## sample estimates:
## mean of x mean of y 
##    26.100    19.735

Now the p-value of this test is 0, a evidence that we can reject the null hypothesis. Therefore we can assume that the means of dosage change from 1mg to 2mg creates an positive effect on tooth length. Furthermore, the confidence interval does not contain zero (0).

After those analysis we can conclude that supplement type has no effect on tooth growth, and increasing the dose level leads to increased tooth growth.

Statistical Inference Project

Shashwat

December 10, 2018