Coursera Statistical Inference Project

The first part of this project consists of investigation of the exponential distribution in its comparison with the Central Limit Theorem.

#PART 1.1. CALCULATE EXPONENTIAL DISTRIBUTIION AND COMPARE IT TO THE CENTRAL LIMIT THEOREM
set.seed(423)  
lambda=0.2
n=40
sim<-replicate(1000, rexp(n,lambda)) #generate data.
sim.exp<-apply(sim,2,mean) 
obs.mean<- round(mean(replicate(1000, sim)), digits = 3) #calculation of the observed mean.
t.mean<-round(1/lambda,digits=3) #calculation of theoretical mean.```
means<-data.frame(obs.mean,t.mean)
colnames(means)<-c("Observed Mean", "Theoretical Mean")
print(means)

##   Observed Mean Theoretical Mean
## 1         4.975                5

According to the Central Limit Theorem, the distribution of aveages of iid variables becomes that of a standard normal as the sample size increases, therefore SD of the sample are close to the theoretical value.

#PART 1.2. VARIANCE
sample.var<-sd(sim.exp)
t.var<-(1/lambda)/ sqrt(n)
var<-data.frame(sample.var,t.var)
colnames(var)<-c("Sample Variance", "Theoretical Variance")
print(var)

##   Sample Variance Theoretical Variance
## 1       0.8209998            0.7905694

#PART 1.3. DISTRIBUTION
library(ggplot2)

## Warning: package 'ggplot2' was built under R version 3.5.2

ggplot(as.data.frame(sim.exp), aes(x=sim.exp)) + geom_histogram(binwidth = 0.25, colour="blue") +
        theme_classic() + xlab("Means") + ylab("Frequency") + 
        geom_vline(xintercept = t.mean)+
        annotate("text", label = "Theoretical Mean= 5", x = 6.5, y = 100, size = 4, colour = "black") +
        annotate("text", label = "Sample Mean= 4.975", x = 6.5, y = 90, size = 4, colour = "black") +
        ggtitle("Means Distribution")

The second part involves basic inferential statistics.

#PART 2 BASIC INFERENTIAL DATA ANALYSIS
library(dplyr)

## Warning: package 'dplyr' was built under R version 3.5.2

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

tg<-data(ToothGrowth) #load data
tg<-data.frame(ToothGrowth)
summary(tg) # provide basic summary

##       len        supp         dose      
##  Min.   : 4.20   OJ:30   Min.   :0.500  
##  1st Qu.:13.07   VC:30   1st Qu.:0.500  
##  Median :19.25           Median :1.000  
##  Mean   :18.81           Mean   :1.167  
##  3rd Qu.:25.27           3rd Qu.:2.000  
##  Max.   :33.90           Max.   :2.000

#show distribution
ggplot(ToothGrowth, aes(x=dose, y = len)) +
        geom_violin(aes(fill = factor(dose)), trim=FALSE) + facet_grid(.~supp) + 
        labs(x="Dosage", y = "Length", title="Figure 1. Dosage vs Length")

VC <- tg %>% filter(tg$supp=='VC')
OJ <- tg %>% filter(tg$supp=='OJ')
vars<-var.test(VC$len, OJ$len)


#Statistics
dose0.5OJ<- tg %>% filter(tg$supp=="OJ" & tg$dose==0.5)
dose0.5VC<- tg %>% filter(tg$supp=="VC" & tg$dose==0.5)
dose1OJ<-tg %>% filter(tg$supp=="OJ" & tg$dose==1.0)
dose1VC<- tg %>% filter(tg$supp=="VC" & tg$dose==1.0)
dose2OJ<-tg %>% filter(tg$supp=="OJ" & tg$dose==2.0)
dose2VC<- tg %>% filter(tg$supp=="VC" & tg$dose==2.0)


C.I_0.5mg<-t.test(dose0.5OJ$len, dose0.5VC$len, paired=FALSE, var.equal= FALSE)$conf
C.I_1mg<-t.test(dose1OJ$len, dose1VC$len, paired=FALSE, var.equal= FALSE)$conf
C.I_2mg<-t.test(dose2OJ$len, dose2VC$len, paired=FALSE, var.equal= FALSE)$conf
CI_all<- data.frame(C.I_0.5mg, C.I_1mg, C.I_2mg, row.names = c("Lower", "Higher"))
print(CI_all)

##        C.I_0.5mg  C.I_1mg  C.I_2mg
## Lower   1.719057 2.802148 -3.79807
## Higher  8.780943 9.057852  3.63807

anova<-aov(len~as.factor(supp)*dose, data=tg)
summary(anova)

##                      Df Sum Sq Mean Sq F value   Pr(>F)    
## as.factor(supp)       1  205.4   205.4  12.317 0.000894 ***
## dose                  1 2224.3  2224.3 133.415  < 2e-16 ***
## as.factor(supp):dose  1   88.9    88.9   5.333 0.024631 *  
## Residuals            56  933.6    16.7                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Assumptions are that: 1. Subgroups have different variances 2. Data is unpaired

We can see that both supplement type and dose have a significant influence in tooth growth, as seen from ANOVA analysis and CI calculation

Coursera Statistical Inference Project

Max Mendez L

2/17/2020