library(tidyverse)
library(glue)
library(matrixStats)
simDate <- date()
glue("Simulation date is {simDate}")
## Simulation date is Wed May 20 13:27:45 2020
sessionInfo()
## R version 4.0.0 (2020-04-24)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 20.04 LTS
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.9.0
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.9.0
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] matrixStats_0.56.0 glue_1.4.1 forcats_0.5.0 stringr_1.4.0
## [5] dplyr_0.8.5 purrr_0.3.4 readr_1.3.1 tidyr_1.1.0
## [9] tibble_3.0.1 ggplot2_3.3.0 tidyverse_1.3.0
##
## loaded via a namespace (and not attached):
## [1] tidyselect_1.1.0 xfun_0.14 haven_2.2.0 lattice_0.20-41
## [5] colorspace_1.4-1 vctrs_0.3.0 generics_0.0.2 htmltools_0.4.0
## [9] yaml_2.2.1 rlang_0.4.6 pillar_1.4.4 withr_2.2.0
## [13] DBI_1.1.0 dbplyr_1.4.3 modelr_0.1.8 readxl_1.3.1
## [17] lifecycle_0.2.0 munsell_0.5.0 gtable_0.3.0 cellranger_1.1.0
## [21] rvest_0.3.5 evaluate_0.14 knitr_1.28 fansi_0.4.1
## [25] broom_0.5.6 Rcpp_1.0.4.6 scales_1.1.1 backports_1.1.7
## [29] jsonlite_1.6.1 fs_1.4.1 hms_0.5.3 digest_0.6.25
## [33] stringi_1.4.6 grid_4.0.0 cli_2.0.2 tools_4.0.0
## [37] magrittr_1.5 crayon_1.3.4 pkgconfig_2.0.3 ellipsis_0.3.1
## [41] xml2_1.3.2 reprex_0.3.0 lubridate_1.7.8 assertthat_0.2.1
## [45] rmarkdown_2.1 httr_1.4.1 rstudioapi_0.11 R6_2.4.1
## [49] nlme_3.1-147 compiler_4.0.0
In this project we will investigate the exponential distribution in R and compare it with the Central Limit Theorem. The exponential distribution can be simulated in R with rexp(n, lambda), where lambda is the rate parameter. The mean of the exponential distribution is 1/lambda and the standard deviation is also 1/lambda. We set lambda = 0.2 for all of the simulations and investigate the distribution of averages of 40 exponentials across one thousand simulations.
Illustrate via simulation and associated explanatory text the properties of the distribution of the mean of 40 exponentials. Specifically, we will:

1. Show the sample mean and compare it to the theoretical mean of the distribution.
2. Show how variable the sample is (via variance) and compare it to the theoretical variance of the distribution.
3. Show that the distribution is approximately normal.
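Before running the full simulation, here is a quick optional sketch of the theoretical quantities these goals refer to, plus a single draw of 40 exponentials:

lambda <- 0.2
1/lambda                      # theoretical mean and standard deviation of Exp(lambda): 5
(1/lambda)^2 / 40             # theoretical variance of the mean of 40 exponentials: 0.625
mean(rexp(40, rate = lambda)) # one simulated average of 40 exponentials (random)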
set.seed(420) # fix the random seed so the simulation is reproducible
lambda <- .2
sampCol <- 40   # number of exponentials averaged in each simulation
sampRow <- 1000 # number of simulations
expoDis <- matrix(rexp(n = sampRow*sampCol, rate = lambda), sampRow, sampCol)
expoDF <- as.data.frame(expoDis)
# Collect per-simulation summaries in a data frame to make them easier to visualize
e_df <- expoDF %>% mutate(
  "MEAN" = rowMeans(expoDis),
  "SD" = rowSds(expoDis),
  "SumCumu" = cumsum(MEAN)/(1:sampRow)) %>%
  select(MEAN, SD, SumCumu)
head(e_df)
## MEAN SD SumCumu
## 1 3.770483 4.659639 3.770483
## 2 4.490344 4.707261 4.130413
## 3 4.472378 3.462287 4.244402
## 4 4.150729 4.092566 4.220983
## 5 4.280284 4.394779 4.232843
## 6 4.571118 3.426936 4.289223
e_mean <- mean(e_df$MEAN)
t_mean <- 1/lambda
res_df <- tibble(
Result = c("Sample means", "Theoretical mean"),
"Mean"=c(e_mean, t_mean))
res_df
## # A tibble: 2 x 2
## Result Mean
## <chr> <dbl>
## 1 Sample means 4.96
## 2 Theoretical mean 5
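For reference, the Central Limit Theorem approximation being checked here can be written as

$$\bar{X}_{40} \;\approx\; N\!\left(\frac{1}{\lambda},\ \frac{(1/\lambda)^2}{40}\right),$$

so the sample means should be centered near 1/lambda = 5.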
Above, the sample mean of 4.9557223 is close to the theoretical mean of 5. The graph below shows the cumulative mean of the sample means approaching the horizontal line at 1/lambda, our theoretical mean.
ggplot(data = e_df, aes(x = 1:sampRow, y = SumCumu)) +
  geom_hline(yintercept = t_mean, col = "red") +
  geom_line(size = .5) +
  labs(x = "Number of observations",
       y = "Cumulative mean",
       title = "Sample Means of 1000 Observations and 40 Variables") +
  geom_text(aes(0, t_mean,
                label = "theoretical mean = 1/lambda",
                hjust = -1,
                vjust = -1)) +
  ylim(c(3.5, 5.5))
# Running estimate of the variance of the sample means about their overall mean
e_df <- e_df %>% mutate(
  VarCumu = cumsum((MEAN - e_mean)^2)/seq_along(MEAN))
head(e_df)
## MEAN SD SumCumu VarCumu
## 1 3.770483 4.659639 3.770483 1.4047925
## 2 4.490344 4.707261 4.130413 0.8106847
## 3 4.472378 3.462287 4.244402 0.6183304
## 4 4.150729 4.092566 4.220983 0.6257515
## 5 4.280284 4.394779 4.232843 0.5918447
## 6 4.571118 3.426936 4.289223 0.5178573
e_var <- var(e_df$MEAN)
t_var <- (1/(lambda*sqrt(sampCol)))^2
res_var_df <- tibble(
Result = c("Sample variance", "Theoretical variance"),
"Variance" = c(e_var, t_var))
res_var_df
## # A tibble: 2 x 2
## Result Variance
## <chr> <dbl>
## 1 Sample variance 0.610
## 2 Theoretical variance 0.625
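The theoretical value in the table comes from $\mathrm{Var}(\bar{X}) = \sigma^2/n = (1/\lambda)^2/40 = 25/40 = 0.625$, which is exactly what `(1/(lambda*sqrt(sampCol)))^2` computes, and the simulated variance of about 0.610 is close to it. The plot below shows the cumulative variance of the sample means settling near this theoretical value.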
ggplot(data = e_df,
       aes(x = 1:sampRow,
           y = VarCumu)) +
  geom_hline(yintercept = t_var,
             col = "red") +
  geom_line(size = .5) +
  labs(x = "Number of Observations",
       y = "Cumulative Variance",
       title = "Sample Variances of 1000 Observations and 40 Variables")
Note: the Freedman-Diaconis rule is used to select the width of the bins in the histogram below.
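As a minimal sketch of what the rule computes, the Freedman-Diaconis bin width is 2 * IQR(x) / n^(1/3); base R's nclass.FD() applies the same rule that breaks = "FD" uses below:

2 * IQR(e_df$MEAN) / length(e_df$MEAN)^(1/3) # Freedman-Diaconis bin width
nclass.FD(e_df$MEAN)                         # suggested number of bins under the same rule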
# Freedman-Diaconis breakpoints computed by base R's hist()
binhist <- hist(e_df$MEAN, plot = FALSE, breaks = "FD")
g2 <- ggplot(data = e_df, aes(x = MEAN))
g2 <- g2 + geom_histogram(aes(y = ..density..),
                          fill = "white",
                          col = "black",
                          breaks = binhist$breaks) +
  geom_density(aes(MEAN), color = "blue") +
  # vertical lines at the simulated mean (red) and the theoretical mean (yellow)
  geom_vline(xintercept = c(e_mean, t_mean),
             color = c("red", "yellow")) +
  labs(x = "Sample Means",
       y = "Density",
       title = "Histogram of Sample Means")
# overlay the Normal density implied by the CLT: mean = 1/lambda, sd = sqrt(t_var)
g2 + stat_function(fun = dnorm,
                   colour = "magenta",
                   args = list(mean = t_mean,
                               sd = sqrt(t_var)))
e_df %>% ggplot(aes(sample = MEAN)) +
  stat_qq() +
  stat_qq_line(color = "blue") +
  labs(x = "Theoretical Quantiles",
       y = "Sample Quantiles",
       title = "Quantile-Quantile Plot")
Explore the data with hierarchical clustering (hclust). Most of the code is adapted from a previous class I took here on Coursera, "Exploratory Data Analysis" by Dr. Roger Peng (follow the link to review his book).
# Use the first 100 values of the first two simulated columns as a toy 2-D data set
hcDf <- as.data.frame(expoDis)
x <- hcDf$V1[1:100]
y <- hcDf$V2[1:100]
plot(x, y, col = "blue")
text(x + .05, y + .05, as.character(1:100))
dataframe <- data.frame(x = x, y = y)
d <- dist(dataframe) # pairwise Euclidean distances between the 100 points
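As a quick sanity check (just a sketch), dist() defaults to Euclidean distance between rows, so the entry for points 1 and 2 should match the distance computed by hand:

sqrt((x[1] - x[2])^2 + (y[1] - y[2])^2) # Euclidean distance between points 1 and 2, by hand
as.matrix(d)[1, 2]                      # same value taken from dist()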
The pair of points with the smallest distance between them is found as follows:
rdistxy <- as.matrix(dist(dataframe))
# inflate the diagonal so a point is never matched with itself
diag(rdistxy) <- diag(rdistxy) + 100000
# find the index of the pair of points with the minimum distance
ind <- which(rdistxy == min(rdistxy), arr.ind = TRUE)
ind
## row col
## 71 71 48
## 48 48 71
Plot the two closest points and draw a tree that merges them:
par(mfrow = c(1, 2))
plot(x, y, col = "yellow", pch = 19, cex = 2, main = "Data")
text(x + .05, y + .05, labels = as.character(1:100))
points(x[ind[1, ]], y[ind[1, ]], col = "green", pch = 19, cex = 2)
hcluster <- dist(dataframe) %>% hclust
dendro <- as.dendrogram(hcluster)
cutDendro <- cut(dendro, h = hcluster$height[2])
plot(cutDendro$lower[[63]], yaxt = "n", main = "Begin building tree")
Continue building the tree. The code below finds the next smallest distance:
# each distance appears twice in the symmetric matrix, so the 3rd-smallest
# entry is the next distinct minimum distance (after the closest pair)
nextmin <- rdistxy[order(rdistxy)][3]
ind1 <- which(rdistxy == nextmin, arr.ind = TRUE)
ind1
## row col
## 79 79 78
## 78 78 79
par(mfrow = c(1, 3))
plot(x, y, col = "yellow", pch = 19, cex = 2, main = "Data")
text(x + .05, y + .05, labels = as.character(1:100))
points(x[ind[1, ]], y[ind[1, ]], col = "green", pch = 19, cex = 2)
points(x[ind1[1, ]], y[ind1[1, ]], col = "red", pch = 19, cex = 2)
hcluster <- dist(dataframe) %>% hclust
dendro <- as.dendrogram(hcluster)
cutDendro <- cut(dendro, h = hcluster$height[2])
plot(cutDendro$lower[[63]], yaxt = "n", main = "Begin building tree")
plot(cutDendro$lower[[78]], yaxt = "n", main = "Continue building tree")
Eventually, as you keep merging the closest points, you arrive at the full tree, but there is a simpler way to produce the dendrogram:
hclustering <- data.frame(x=x, y=y) %>% dist %>% hclust
plot(hclustering)
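If you also want concrete cluster assignments, cutree() can cut the fitted tree at a chosen number of clusters; the choice of 3 below is arbitrary and just a sketch:

clusters <- cutree(hclustering, k = 3) # cut the tree into 3 groups (arbitrary choice)
table(clusters)                        # number of points in each group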
#### Heatmap function
Still having fun with the data.
dataMatrix <- data.frame(x=x,y=y) %>% data.matrix
heatmap(dataMatrix)
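heatmap() clusters the rows and columns itself by default; if you prefer to reuse the row dendrogram fitted above, you can pass it in (a sketch):

heatmap(dataMatrix, Rowv = as.dendrogram(hclustering)) # reuse the fitted row dendrogram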