Environment


library(tidyverse)   # dplyr, ggplot2, tibble, and friends
library(glue)        # string interpolation
library(matrixStats) # rowSds()
runDate <- date()    # record when the simulation was run

glue("Simulation date is {runDate}")
## Simulation date is Wed May 20 13:27:45 2020
sessionInfo()
## R version 4.0.0 (2020-04-24)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 20.04 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.9.0
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.9.0
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] matrixStats_0.56.0 glue_1.4.1         forcats_0.5.0      stringr_1.4.0     
##  [5] dplyr_0.8.5        purrr_0.3.4        readr_1.3.1        tidyr_1.1.0       
##  [9] tibble_3.0.1       ggplot2_3.3.0      tidyverse_1.3.0   
## 
## loaded via a namespace (and not attached):
##  [1] tidyselect_1.1.0 xfun_0.14        haven_2.2.0      lattice_0.20-41 
##  [5] colorspace_1.4-1 vctrs_0.3.0      generics_0.0.2   htmltools_0.4.0 
##  [9] yaml_2.2.1       rlang_0.4.6      pillar_1.4.4     withr_2.2.0     
## [13] DBI_1.1.0        dbplyr_1.4.3     modelr_0.1.8     readxl_1.3.1    
## [17] lifecycle_0.2.0  munsell_0.5.0    gtable_0.3.0     cellranger_1.1.0
## [21] rvest_0.3.5      evaluate_0.14    knitr_1.28       fansi_0.4.1     
## [25] broom_0.5.6      Rcpp_1.0.4.6     scales_1.1.1     backports_1.1.7 
## [29] jsonlite_1.6.1   fs_1.4.1         hms_0.5.3        digest_0.6.25   
## [33] stringi_1.4.6    grid_4.0.0       cli_2.0.2        tools_4.0.0     
## [37] magrittr_1.5     crayon_1.3.4     pkgconfig_2.0.3  ellipsis_0.3.1  
## [41] xml2_1.3.2       reprex_0.3.0     lubridate_1.7.8  assertthat_0.2.1
## [45] rmarkdown_2.1    httr_1.4.1       rstudioapi_0.11  R6_2.4.1        
## [49] nlme_3.1-147     compiler_4.0.0

About

In this project we investigate the exponential distribution in R and compare it with the Central Limit Theorem (CLT). The exponential distribution can be simulated in R with rexp(n, lambda), where lambda is the rate parameter. The mean of the exponential distribution is 1/lambda, and the standard deviation is also 1/lambda (a quick sanity check of this follows the task list below). We set lambda = 0.2 for all of the simulations and investigate the distribution of averages of 40 exponentials over a thousand simulations.

Illustrate via simulation and associated explanatory text the properties of the distribution of the mean of 40 exponentials. You should:

  1. Show the sample mean and compare it to the theoretical mean of the distribution.

  2. Show how variable the sample is (via variance) and compare it to the theoretical variance of the distribution.

  3. Show that the distribution is approximately normal.
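
As a quick sanity check of the claim that both the mean and the standard deviation of the exponential distribution are 1/lambda (a minimal sketch, not part of the assignment itself):

# With lambda = 0.2, a large sample should have mean and sd near 1/0.2 = 5
x_check <- rexp(1e6, rate = 0.2)
mean(x_check) # approximately 5
sd(x_check)   # approximately 5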

Simulation

Recall from above: rexp(n, lambda) simulates the exponential distribution, its mean and standard deviation are both 1/lambda, and lambda = 0.2 throughout. We simulate 1,000 samples of 40 exponentials each and study the distribution of their means.

set.seed(420) # fix the random number generator for reproducibility
lambda <- .2
sampCol <- 40   # exponentials per sample (columns)
sampRow <- 1000 # number of simulated samples (rows)

expoDis <- matrix(rexp(n=sampRow*sampCol, rate = lambda), sampRow, sampCol)

# Convert the matrix to a data frame to make it easier to work with
expoDF <- as.data.frame(expoDis)

e_df <- expoDF %>% mutate(
        MEAN = rowMeans(expoDis),    # mean of each 40-exponential sample
        SD = rowSds(expoDis),        # standard deviation of each sample
        SumCumu = cumsum(MEAN)/(1:sampRow)) %>% # running mean of the sample means
        select(MEAN, SD, SumCumu)

head(e_df)
##       MEAN       SD  SumCumu
## 1 3.770483 4.659639 3.770483
## 2 4.490344 4.707261 4.130413
## 3 4.472378 3.462287 4.244402
## 4 4.150729 4.092566 4.220983
## 5 4.280284 4.394779 4.232843
## 6 4.571118 3.426936 4.289223

Sample mean vs. the theoretical mean of the distribution.

e_mean <- mean(e_df$MEAN)

t_mean <- 1/lambda

res_df <- tibble(
        Result = c("Sample means", "Theoretical mean"),
        "Mean"=c(e_mean, t_mean))

res_df
## # A tibble: 2 x 2
##   Result            Mean
##   <chr>            <dbl>
## 1 Sample means      4.96
## 2 Theoretical mean  5

The sample mean of 4.9557223 is close to the theoretical mean of 5. The graph below shows the cumulative mean of the sample means approaching the horizontal line at 1/lambda, our theoretical mean.

ggplot(data = e_df, aes(x = 1:sampRow, y = SumCumu)) + 
        geom_hline(yintercept = t_mean, col = "red") +
        geom_line(size = .5) + 
        labs(x = "Number of obs", 
             y = "Cumulative mean", 
             title = "Sample Means of 1000 Observations and 40 Variables") +
        annotate("text", x = 0, y = t_mean,
             label = "theoretical mean = 1/lambda",
             hjust = -0.1,
             vjust = -1) +
        ylim(c(3.5, 5.5))

Show how variable the sample is (via variance) and compare it to the theoretical variance of the distribution.
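
Where the theoretical value comes from: the variance of a single Exp(lambda) draw is (1/lambda)^2, so by the CLT the variance of the mean of n = 40 draws is (1/lambda)^2 / n. A quick check of the arithmetic:

# Var(X) = (1/lambda)^2 = 25 for one exponential draw, so the mean of
# n = 40 draws has variance 25 / 40 = 0.625
(1 / lambda)^2 / sampCol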

e_df <- e_df %>% mutate(
        # running mean squared deviation of the sample means from their overall mean
        VarCumu = cumsum((MEAN - e_mean)^2)/seq_along(MEAN))

head(e_df)
##       MEAN       SD  SumCumu   VarCumu
## 1 3.770483 4.659639 3.770483 1.4047925
## 2 4.490344 4.707261 4.130413 0.8106847
## 3 4.472378 3.462287 4.244402 0.6183304
## 4 4.150729 4.092566 4.220983 0.6257515
## 5 4.280284 4.394779 4.232843 0.5918447
## 6 4.571118 3.426936 4.289223 0.5178573
e_var <- var(e_df$MEAN)               # sample variance of the 1000 means
t_var <- (1/(lambda*sqrt(sampCol)))^2 # CLT variance: ((1/lambda)/sqrt(n))^2

res_var_df <- tibble(
        Result = c("Sample variance", "Theoretical variance"), 
        "Variance" = c(e_var, t_var))

res_var_df
## # A tibble: 2 x 2
##   Result               Variance
##   <chr>                   <dbl>
## 1 Sample variance         0.610
## 2 Theoretical variance    0.625
ggplot(data = e_df, 
          aes(x = 1:sampRow, 
              y = VarCumu)) + 
          geom_hline(yintercept = t_var, 
              col = "red") + 
          geom_line(size = .5) + 
          labs(x = "Number of Observations", 
               y = "Cumulative Variance", 
               title = "Sample Variances of 1000 Observations and 40 Variables")

Normally Distributed

Note: we use the Freedman-Diaconis rule to select the bin width for the histogram.
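
For reference, the Freedman-Diaconis bin width is 2 * IQR / n^(1/3); a minimal sketch of roughly what breaks = "FD" computes under the hood:

# Freedman-Diaconis bin width: h = 2 * IQR(x) / n^(1/3)
2 * IQR(e_df$MEAN) / length(e_df$MEAN)^(1/3)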

# Compute the Freedman-Diaconis breaks without plotting
binhist <- hist(e_df$MEAN, plot = FALSE, breaks = "FD")

g2 <- ggplot(data = e_df, 
             aes(x = MEAN)) 

g2 <- g2 + geom_histogram(aes(y = ..density..),
        fill = "white", 
        col = "black",
        breaks = binhist$breaks) +      # Freedman-Diaconis breaks from above
      geom_density(
        color = "blue") +               # kernel density estimate of the sample means
      geom_vline(
        xintercept = c(e_mean, t_mean), # sample mean (red), theoretical mean (yellow)
        color = c("red", "yellow")) + 
      labs(
        x = "Sample Means", 
        y = "Density", 
        title = "Histogram of Sample Means")

# Overlay the CLT-predicted normal density, N(1/lambda, sqrt(t_var))
g2 + stat_function(
      fun = dnorm, 
      colour = "magenta", 
      args = list(mean = t_mean, 
                  sd = sqrt(t_var)))

e_df %>% ggplot(aes(
            sample = MEAN)) +   
            stat_qq() + 
            stat_qq_line(color = "blue") + 
            labs(x = "Theoretical Quantiles", 
                 y = "Sample Quantiles", 
                 title = "Quantile-Quantile Plot")

Hierarchical Clustering

Explore the data with hclust(). Most of the code is adapted from a previous class I took here at Coursera, “Exploratory Data Analysis” by Dr. Roger Peng (follow the link to review his book).

hcDf <- as.data.frame(expoDis)
x <- hcDf$V1[1:100] # first 100 values of columns V1 and V2
y <- hcDf$V2[1:100]
plot(x, y, col = "blue")
text(x + .05, y + .05, as.character(1:100)) # label each point 1-100

dataframe <- data.frame(x = x, y = y)

d <- dist(dataframe) # pairwise Euclidean distances (the default metric)

The pair of points with the smallest distance between them is found as follows:

rdistxy <- as.matrix(dist(dataframe))

# Remove the diagonal (each point's zero distance to itself) from consideration
diag(rdistxy) <- diag(rdistxy) + 100000

# Find the indices of the pair of points with the minimum distance
ind <- which(rdistxy == min(rdistxy), arr.ind = TRUE)

ind
##    row col
## 71  71  48
## 48  48  71
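
As a cross-check (not in the original course code), the first merge reported by hclust() should occur at exactly this minimum distance, since hierarchical clustering always joins the two closest points first:

# The height of the first merge equals the minimum pairwise distance
hc_check <- hclust(d)
all.equal(min(d), hc_check$height[1])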

Plot the two closest points and draw a tree that merges them:

par(mfrow = c(1, 2))
plot(x, y, col = "yellow", pch = 19, cex = 2, main = "Data")
text(x + .05, y + .05, labels = as.character(1:100))
points(x[ind[1, ]], y[ind[1, ]], col = "green", pch = 19, cex = 2)

hcluster <- dist(dataframe) %>% hclust()
dendro <- as.dendrogram(hcluster)
cutDendro <- cut(dendro, h = hcluster$height[2]) # cut just above the first merge

plot(cutDendro$lower[[63]], yaxt = "n", main = "Begin building tree")

Continue building the tree. The code below finds the next-smallest distance.

# Each distance appears twice in the symmetric matrix, so the third-smallest
# entry is the next-smallest distinct distance
nextmin <- rdistxy[order(rdistxy)][3]
ind1 <- which(rdistxy == nextmin, arr.ind = TRUE)
ind1
##    row col
## 79  79  78
## 78  78  79
par(mfrow = c(1, 3))
plot(x, y, col = "yellow", pch = 19, cex = 2, main = "Data")
text(x + .05, y + .05, labels = as.character(1:100))
points(x[ind[1, ]], y[ind[1, ]], col = "green", pch = 19, cex = 2)
points(x[ind1[1, ]], y[ind1[1, ]], col = "red", pch = 19, cex = 2)

hcluster <- dist(dataframe) %>% hclust()
dendro <- as.dendrogram(hcluster)
cutDendro <- cut(dendro, h = hcluster$height[2])
plot(cutDendro$lower[[63]], yaxt = "n", main = "Begin building tree")
plot(cutDendro$lower[[78]], yaxt = "n", main = "Continue building tree")

Merging pair by pair like this eventually grows the full tree, but there is a simpler way to build the dendrogram:

hclustering <- data.frame(x = x, y = y) %>% dist() %>% hclust()
plot(hclustering)
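
To turn the tree into actual cluster assignments, cutree() can be used; a small example that cuts the tree into three clusters (three is an arbitrary choice for illustration):

# Cut the dendrogram into 3 clusters and count the members of each
clusters <- cutree(hclustering, k = 3)
table(clusters)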

Heatmap function

Still having fun with the data: heatmap() runs hierarchical clustering on both the rows and the columns of a matrix and displays the reordered values.

dataMatrix <- data.frame(x = x, y = y) %>% data.matrix()
heatmap(dataMatrix)