The first part of this project consists of investigation of the exponential distribution in its comparison with the Central Limit Theorem.
#PART 1.1. CALCULATE EXPONENTIAL DISTRIBUTIION AND COMPARE IT TO THE CENTRAL LIMIT THEOREM
set.seed(423)
lambda=0.2
n=40
sim<-replicate(1000, rexp(n,lambda)) #generate data.
sim.exp<-apply(sim,2,mean)
obs.mean<- round(mean(replicate(1000, sim)), digits = 3) #calculation of the observed mean.
t.mean<-round(1/lambda,digits=3) #calculation of theoretical mean.```
means<-data.frame(obs.mean,t.mean)
colnames(means)<-c("Observed Mean", "Theoretical Mean")
print(means)
## Observed Mean Theoretical Mean
## 1 4.975 5
According to the Central Limit Theorem, the distribution of aveages of iid variables becomes that of a standard normal as the sample size increases, therefore SD of the sample are close to the theoretical value.
#PART 1.2. VARIANCE
sample.var<-sd(sim.exp)
t.var<-(1/lambda)/ sqrt(n)
var<-data.frame(sample.var,t.var)
colnames(var)<-c("Sample Variance", "Theoretical Variance")
print(var)
## Sample Variance Theoretical Variance
## 1 0.8209998 0.7905694
#PART 1.3. DISTRIBUTION
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.5.2
ggplot(as.data.frame(sim.exp), aes(x=sim.exp)) + geom_histogram(binwidth = 0.25, colour="blue") +
theme_classic() + xlab("Means") + ylab("Frequency") +
geom_vline(xintercept = t.mean)+
annotate("text", label = "Theoretical Mean= 5", x = 6.5, y = 100, size = 4, colour = "black") +
annotate("text", label = "Sample Mean= 4.975", x = 6.5, y = 90, size = 4, colour = "black") +
ggtitle("Means Distribution")
The second part involves basic inferential statistics.
#PART 2 BASIC INFERENTIAL DATA ANALYSIS
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.5.2
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
tg<-data(ToothGrowth) #load data
tg<-data.frame(ToothGrowth)
summary(tg) # provide basic summary
## len supp dose
## Min. : 4.20 OJ:30 Min. :0.500
## 1st Qu.:13.07 VC:30 1st Qu.:0.500
## Median :19.25 Median :1.000
## Mean :18.81 Mean :1.167
## 3rd Qu.:25.27 3rd Qu.:2.000
## Max. :33.90 Max. :2.000
#show distribution
ggplot(ToothGrowth, aes(x=dose, y = len)) +
geom_violin(aes(fill = factor(dose)), trim=FALSE) + facet_grid(.~supp) +
labs(x="Dosage", y = "Length", title="Figure 1. Dosage vs Length")
VC <- tg %>% filter(tg$supp=='VC')
OJ <- tg %>% filter(tg$supp=='OJ')
vars<-var.test(VC$len, OJ$len)
#Statistics
dose0.5OJ<- tg %>% filter(tg$supp=="OJ" & tg$dose==0.5)
dose0.5VC<- tg %>% filter(tg$supp=="VC" & tg$dose==0.5)
dose1OJ<-tg %>% filter(tg$supp=="OJ" & tg$dose==1.0)
dose1VC<- tg %>% filter(tg$supp=="VC" & tg$dose==1.0)
dose2OJ<-tg %>% filter(tg$supp=="OJ" & tg$dose==2.0)
dose2VC<- tg %>% filter(tg$supp=="VC" & tg$dose==2.0)
C.I_0.5mg<-t.test(dose0.5OJ$len, dose0.5VC$len, paired=FALSE, var.equal= FALSE)$conf
C.I_1mg<-t.test(dose1OJ$len, dose1VC$len, paired=FALSE, var.equal= FALSE)$conf
C.I_2mg<-t.test(dose2OJ$len, dose2VC$len, paired=FALSE, var.equal= FALSE)$conf
CI_all<- data.frame(C.I_0.5mg, C.I_1mg, C.I_2mg, row.names = c("Lower", "Higher"))
print(CI_all)
## C.I_0.5mg C.I_1mg C.I_2mg
## Lower 1.719057 2.802148 -3.79807
## Higher 8.780943 9.057852 3.63807
anova<-aov(len~as.factor(supp)*dose, data=tg)
summary(anova)
## Df Sum Sq Mean Sq F value Pr(>F)
## as.factor(supp) 1 205.4 205.4 12.317 0.000894 ***
## dose 1 2224.3 2224.3 133.415 < 2e-16 ***
## as.factor(supp):dose 1 88.9 88.9 5.333 0.024631 *
## Residuals 56 933.6 16.7
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Assumptions are that: 1. Subgroups have different variances 2. Data is unpaired
We can see that both supplement type and dose have a significant influence in tooth growth, as seen from ANOVA analysis and CI calculation