During the past few weeks of class we have taken a tour of well-known distributions, such as the binomial, uniform, chi-squared, and normal distributions.
We can use R, a statistical programming language, to simulate draws from these distributions. R has many built-in functions!
# Simulate draws with R's built-in random number generators.
# Each generator takes the number of draws first, followed by the
# parameters of the distribution:
#   rbinom(n, size, prob)   Binomial
#   runif(n, min, max)      Uniform
#   rchisq(n, df)           Chi-squared
#   rnorm(n, mean, sd)      Normal
nsim <- 10000

binom5 <- rbinom(n = nsim, size = 10, prob = 0.5)
unif5 <- runif(n = nsim, min = 0, max = 10)
chisq5 <- rchisq(n = nsim, df = 5)
norm5 <- rnorm(n = nsim, mean = 5, sd = 2)

# Stack the four samples in long format: one row per draw, labelled with
# the distribution the draw came from.
dist_df <- data.frame(
  Data = c(binom5, unif5, chisq5, norm5),
  Distribution = rep(
    c("Binomial", "Uniform", "ChiSquared", "Normal"),
    each = nsim
  )
)
# We are using the tidyverse package for visualization with ggplot
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 3.6.2
## Warning: package 'ggplot2' was built under R version 3.6.2
## Warning: package 'tibble' was built under R version 3.6.2
## Warning: package 'tidyr' was built under R version 3.6.2
## Warning: package 'readr' was built under R version 3.6.2
## Warning: package 'purrr' was built under R version 3.6.2
## Warning: package 'dplyr' was built under R version 3.6.2
## Warning: package 'forcats' was built under R version 3.6.2
# Density-scaled histogram of the raw draws, one panel (row) per
# distribution. The y axis is freed per panel because the densities are
# on different scales.
ggplot(
  data = dist_df,
  mapping = aes(x = Data, y = after_stat(density), fill = Distribution)
) +
  geom_histogram() +
  facet_grid(rows = vars(Distribution), scales = "free_y")
\[\bar{X}\sim Normal(\mu , \sigma/\sqrt{n}), n\rightarrow \infty\]
In class we showed that, for a “sufficiently large” sample size, the distribution of sample means is approximately normal with mean \(\mu\) and standard deviation \(\sigma / \sqrt{n}\), regardless of the underlying distribution (provided it has a finite variance).
# Simulate the sampling distribution of the sample mean.
# For each sample size n = 1, 5, 10, and 30, draw `nsim` samples of size n
# from each distribution and record the mean of every sample.
#
# Results are stored in blocks of `nsim` values, one block per sample size,
# in the order given by `samp_sizes`. `index` records the sample size that
# produced each stored mean.
samp_sizes <- c(1, 5, 10, 30)
n_total <- nsim * length(samp_sizes)

# Preallocate the result vectors. Appending with c() inside the loop copies
# the whole vector on every iteration, which is quadratic in the number of
# simulated means.
binomCLT <- numeric(n_total)
unifCLT <- numeric(n_total)
chisqCLT <- numeric(n_total)
normCLT <- numeric(n_total)
index <- rep(samp_sizes, each = nsim)

k <- 0
for (i in samp_sizes) {
  # seq_len() yields an empty loop when nsim is 0, whereas 1:nsim would
  # iterate over c(1, 0).
  for (j in seq_len(nsim)) {
    k <- k + 1
    # The four generators are called in the same order on every iteration,
    # so a given RNG seed produces the same stream of draws.
    binomCLT[k] <- mean(rbinom(i, 10, .5))
    unifCLT[k] <- mean(runif(i, 0, 10))
    chisqCLT[k] <- mean(rchisq(i, 5))
    normCLT[k] <- mean(rnorm(i, 5, 2))
  }
}
# Build a long-format data frame of the simulated sample means:
#   Data          the sample mean
#   Distribution  the distribution the sample was drawn from
#   SampSize      the size of the sample that produced the mean
#
# Lengths are derived from `index` and the label vector rather than
# hard-coded, so the frame stays aligned if the set of sample sizes changes.
# Each of the four result vectors is expected to have length(index) entries.
dist_names <- c("Binomial", "Uniform", "ChiSquared", "Normal")
clt_df <- data.frame(
  Data = c(binomCLT, unifCLT, chisqCLT, normCLT),
  Distribution = rep(dist_names, each = length(index)),
  SampSize = rep(index, times = length(dist_names))
)
# Histograms of the simulated sample means: one row of panels per
# distribution and one column per sample size. The y axis is freed so each
# row uses its own density scale.
ggplot(
  data = clt_df,
  mapping = aes(x = Data, y = after_stat(density), fill = Distribution)
) +
  geom_histogram() +
  facet_grid(
    rows = vars(Distribution),
    cols = vars(SampSize),
    scales = "free_y"
  )
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Kernel density estimates of the simulated sample means, laid out like the
# histograms: distributions in rows, sample sizes in columns, with a free
# y axis per row.
ggplot(
  data = clt_df,
  mapping = aes(x = Data, y = after_stat(density), fill = Distribution)
) +
  geom_density() +
  facet_grid(
    rows = vars(Distribution),
    cols = vars(SampSize),
    scales = "free_y"
  )