\(~\)

1 - Generate a dataset of 1000 observations that contains \(x_1 ∼ \mathbb{U}(0, 1)\) and \(u + 1 ∼ \chi^2(1)\). Plot both \(x_1\) and \(u+1\)

\(~\)

library(ggplot2)
library(RColorBrewer)
library(plotly)
library(reshape2)
library(kableExtra)
library(tidyverse)
library(stringr)
library(data.table)

# Generating the data set

x_1 <- runif(1000, min = 0, max = 1)

# Plotting the variable x_1

x_1_plot <- as.data.frame(x_1) %>% 
  ggplot(aes(x = x_1)) +
  geom_histogram(binwidth=0.01,fill = '#96E8BC', color = '#7DD181') +
  theme(panel.background = 
            element_rect(fill ="white"),
          axis.line = element_line(size = 1, 
                                   colour ="gray",
                                   linetype=2)) +
  xlab("Distribution of x_1") +
  ylab("Count") 

# Output
  
ggplotly(x_1_plot)
# Generating the data set 

u_plus_1 <- rchisq(1000,df=1)

# Plotting the variable u + 1 

u_plus_1_plot <- as.data.frame(u_plus_1) %>% 
  ggplot(aes(x = u_plus_1)) +
  geom_histogram(binwidth=0.1, fill = '#96E8BC', color = '#7DD181') +
  theme(panel.background = 
            element_rect(fill ="white"),
          axis.line = element_line(size = 1, 
                                   colour ="gray",
                                   linetype=2)) +
  xlab("Distribution of u + 1") +
  ylab("Count") 

# Output
  
ggplotly(u_plus_1_plot)

\(~\)

2 - Now that you know how to draw random variables, clear all data and follow these steps:

\(~\)

A - Define a program (in Stata) or a function (in other languages) that generates \(u + 1 ∼ \chi^2(1)\) over N = 5 observations and retrieves the mean and standard deviation of the generated sample

\(~\)

# Generating the distribution

x_1 <- runif(5, min = 0, max = 1)
u_plus_1 <- rchisq(5,df=1)


# Storing the data set and computing mean and sd 

dataset_1 <- as.data.frame(rbind(x_1, u_plus_1)) %>%
             summarise(Mean_x_1 = mean(x_1),
                       Mean_u_plus_1 = mean(u_plus_1),
                       sd_x_1 = sd(x_1),
                       sd_u_plus_1 = sd(u_plus_1))
# Output

head(melt(dataset_1)) %>% 
  kbl(booktabs = T) %>%
  kable_styling(latex_options = "striped", full_width = T) 
variable value
Mean_x_1 0.5302880
Mean_u_plus_1 2.6085303
sd_x_1 0.3222855
sd_u_plus_1 4.2763744

\(~\)

B - Simulate the program 10 000 times. you should obtain a dataset with a variable for the mean and one for the standard deviation

\(~\)

# Iterating the program : 

## For uniform

### Setting a blank matrix in which data will be stored 

m_uni = matrix(NA, nrow = 5, ncol = 10000)
m_uni <- as.data.frame(m_uni)

### Filling vectors of uniformly distributed generated data 

for(col in 1:ncol(m_uni)) {
  m_uni[col] <- round(runif(5, min = 0, max = 1), digits = 4)
}

dataset_mean_sd_uni <- m_uni %>%
  summarise(across(.cols = everything(),
                   list(mean = mean, sd = sd))) %>%
  rowid_to_column("index") %>%
  mutate(index = case_when(index == 1 ~ "uni"))


## For chi^2

### Setting a blank matrix in which data will be stored 

m_chi = matrix(NA, nrow = 5, ncol = 10000)
m_chi <- as.data.frame(m_chi)

### Filling vectors of chi^2 distributed generated data 

for(col in 1:ncol(m_chi)) {
  m_chi[col] <- round(rchisq(5,df=1), digits = 4)
}

dataset_mean_sd_chi <- m_chi %>%
  summarise(across(.cols = everything(),
                   list(mean = mean, sd = sd))) %>%
  rowid_to_column("index") %>%
  mutate(index = case_when(index == 1 ~ "chi^2"))

## Merging the two data sets 

data_short <- rbind(dataset_mean_sd_chi, dataset_mean_sd_uni)
data_long <- melt(data_short)

head(data_long) %>% 
  kbl(booktabs = T) %>%
  kable_styling(latex_options = "striped", full_width = T) 
index variable value
chi^2 V1_mean 0.7731400
uni V1_mean 0.2434600
chi^2 V1_sd 1.0484636
uni V1_sd 0.2702839
chi^2 V2_mean 1.6661400
uni V2_mean 0.5650200

\(~\)

C - Summarize and plot a histogram of the mean variable

\(~\)

mean_plot <- data_long %>% filter(str_ends(variable, "mean")) 

mean_plot <- mean_plot %>%
  ggplot(aes(x = value), group = index, color = index) +
  geom_histogram(binwidth=0.05,fill = '#96E8BC', color = '#7DD181') + 
  theme(panel.background = 
            element_rect(fill ="white"),
          axis.line = element_line(size = 1, 
                                   colour ="gray",
                                   linetype=2)) +
  xlab("Mean distribution")

# Output
  
ggplotly(mean_plot)

\(~\)

3 - Repeat the simulation in 2) but for different programs with N = 10,100 and 1000. Calculate the summary statistics of each of the mean variables obtained. What do you observe?

\(~\)

# Let's build the functions for our program 

mean_distrib_custom_chi <- function(mrow, mcol){
  ### Setting a blank matrix in which data will be stored 

  m_chi = matrix(NA, nrow = mrow, ncol = mcol)
  m_chi <- as.data.frame(m_chi)

### Filling vectors of uniformly distributed generated data 

  for(col in 1:ncol(m_chi)) {
    m_chi[col] <- round(rchisq(5,df=1), digits = 4)
  }

  dataset_mean_sd_chi <- m_chi %>%
    summarise(across(.cols = everything(),
                   list(mean = mean, sd = sd))) %>%
    rowid_to_column("index") %>%
    mutate(index = case_when(index == 1 ~ "chi^2"))
  return(dataset_mean_sd_chi)
  
}

mean_distrib_custom_uni <- function(mrow, mcol){
  
  ### Setting a blank matrix in which data will be stored 

  m_uni = matrix(NA, nrow = mrow, ncol = mcol)
  m_uni <- as.data.frame(m_uni)

### Filling vectors of uniformly distributed generated data 

  for(col in 1:ncol(m_uni)) {
    m_uni[col] <- round(runif(5, min = 0, max = 1), digits = 4)
  }

  dataset_mean_sd_uni <- m_uni %>%
    summarise(across(.cols = everything(),
                   list(mean = mean, sd = sd))) %>%
    rowid_to_column("index") %>%
    mutate(index = case_when(index == 1 ~ "uni"))
  return(dataset_mean_sd_uni)
}

\(~\)

These two functions return two distinct data sets that we can merge :

\(~\)

# For N = 10 

data <- rbind(mean_distrib_custom_chi(5, 10), mean_distrib_custom_uni(5, 10))
data_long_1 <- melt(data) %>% filter(str_ends(variable, "mean"))

data_long_1_summary <- data_long_1 %>%
               summarise(Min = min(value),
                         '1st Qu' = quantile(value, probs = 0.25),
                         Median = quantile(value, probs = 0.5),
                         Mean = mean(value),
                         '3rd Qu' = quantile(value, probs = 0.75),
                         Max = max(value),
                         sd = sd(value))

data_long_1_summary %>% 
  kbl(booktabs = T) %>%
  kable_styling(latex_options = "striped", full_width = T) 
Min 1st Qu Median Mean 3rd Qu Max sd
0.25456 0.53792 0.72984 0.787901 0.802135 2.4272 0.5300473
data_long_1_summary
# For N = 100

data_1 <- rbind(mean_distrib_custom_chi(5, 100), mean_distrib_custom_uni(5, 100))
data_long_2 <- melt(data_1) %>% filter(str_ends(variable, "mean")) 

data_long_2_summary <- data_long_2 %>%
               summarise(Min = min(value),
                         '1st Qu' = quantile(value, probs = 0.25),
                         Median = quantile(value, probs = 0.5),
                         Mean = mean(value),
                         '3rd Qu' = quantile(value, probs = 0.75),
                         Max = max(value),
                         sd = sd(value))

data_long_2_summary %>% 
  kbl(booktabs = T) %>%
  kable_styling(latex_options = "striped", full_width = T) 
Min 1st Qu Median Mean 3rd Qu Max sd
0.08766 0.44677 0.56668 0.7668332 0.93129 3.43982 0.5329929
# For N = 1000

data_2 <- rbind(mean_distrib_custom_chi(5, 1000), mean_distrib_custom_uni(5, 1000))
data_long_3 <- melt(data_2) %>% filter(str_ends(variable, "mean")) 

data_long_3_summary <- data_long_3 %>%
               summarise(Min = min(value),
                         '1st Qu' = quantile(value, probs = 0.25),
                         Median = quantile(value, probs = 0.5),
                         Mean = mean(value),
                         '3rd Qu' = quantile(value, probs = 0.75),
                         Max = max(value),
                         sd = sd(value))

data_long_3_summary %>% 
  kbl(booktabs = T) %>%
  kable_styling(latex_options = "striped", full_width = T) 
Min 1st Qu Median Mean 3rd Qu Max sd
0.05822 0.437845 0.5731 0.7495552 0.864245 4.05702 0.5323606

\(~\)

4 - Take the standard deviation of the mean variable in question 3 when \(N = 10\). Multiply with \(\sqrt{10}/\sqrt{100}\) and \(\sqrt{10}/\sqrt{1000}\) respectively and compare this to the results in question 3. Explain what you see. What sample size would you need to obtain a standard deviation of about 0.001?

\(~\)

N_100_1 <- sd(data_long_1$value)*((10)^(0.5)/(100)^(0.5))
N_1000_1 <- sd(data_long_1$value)*((10)^(0.5)/(1000)^(0.5))
N_10 <- sd(data_long_1$value)

set <- c(N_10, N_100_1, N_1000_1)

set
## [1] 0.53004726 0.16761566 0.05300473

\(~\)

5 - Run the same simulations as in 3), but now inspect the distribution of the variable using a histogram. What do you observe?

\(~\)

Plotting the distribution to check normality assumption around the mean when the sample is large enough.

\(~\)

# For N = 10 

mean_plot10 <- data_long_1 %>%
  ggplot(aes(x = value), group = index, color = index) +
  geom_histogram(binwidth=0.05,fill = '#96E8BC', color = '#7DD181') + 
  theme(panel.background = 
            element_rect(fill ="white"),
          axis.line = element_line(size = 1, 
                                   colour ="gray",
                                   linetype=2)) +
  xlab("Mean distribution : N = 10")

# Output
  
ggplotly(mean_plot10)
# For N = 100

mean_plot100 <- data_long_2 %>%
  ggplot(aes(x = value), group = index, color = index) +
  geom_histogram(binwidth=0.05,fill = '#96E8BC', color = '#7DD181') + 
  theme(panel.background = 
            element_rect(fill ="white"),
          axis.line = element_line(size = 1, 
                                   colour ="gray",
                                   linetype=2)) +
  xlab("Mean distribution : N = 100")

# Output
  
ggplotly(mean_plot100)
# For N = 1000

mean_plot1000 <- data_long_3 %>%
  ggplot(aes(x = value), group = index, color = index) +
  geom_histogram(binwidth=0.05,fill = '#96E8BC', color = '#7DD181') + 
  theme(panel.background = 
            element_rect(fill ="white"),
          axis.line = element_line(size = 1, 
                                   colour ="gray",
                                   linetype=2)) +
  xlab("Mean distribution : N = 1000")

# Output
  
ggplotly(mean_plot1000)

\(~\)

A - Set up a program or function that does the following for \(N = 10\) :

\(~\)

# Generating variable

x_1 <- runif(10, min = 0, max = 1)
x_2 <- rbinom(10, size = 1, prob = 0.3)
u_plus_1 <- rchisq(10,df=1)
y <- 1+2*x_1+10*x_2 + u_plus_1

# Building the dataset

dataset_2 <- rbind(y, x_1, x_2, u_plus_1)

# Output

head(t(dataset_2)) %>% 
  kbl(booktabs = T) %>%
  kable_styling(latex_options = "striped", full_width = T) 
y x_1 x_2 u_plus_1
12.438084 0.6238498 1 0.1903844
4.077286 0.8344314 0 1.4084228
13.183887 0.6073338 1 0.9692196
13.467538 0.9506272 1 0.5662838
3.781189 0.5298906 0 1.7214078
14.662762 0.8253926 1 2.0119766

\(~\)

B - Regress \(y\) on \(x_1\), \(x_2\)

\(~\)

model_1 <- lm(y ~ x_1 + x_2, data = as.data.frame(dataset_2))

summary(model_1)
## 
## Call:
## lm(formula = y ~ x_1 + x_2, data = as.data.frame(dataset_2))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.95950 -0.57547 -0.08894  0.18514  1.51260 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   1.9752     0.8310   2.377   0.0491 *  
## x_1           3.1395     1.2999   2.415   0.0464 *  
## x_2           9.1026     0.6083  14.963 1.43e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.8745 on 7 degrees of freedom
## Multiple R-squared:  0.9771, Adjusted R-squared:  0.9705 
## F-statistic: 149.1 on 2 and 7 DF,  p-value: 1.829e-06

\(~\)

C - Retrive coefficients \(\beta_0\) on \(\beta_1\), \(\beta_2\)

\(~\)

# Extracting coefficients

beta_0 <- model_1$coeff[1]
beta_1 <- model_1$coeff[2]
beta_2 <- model_1$coeff[3]

# Extracting the corresponding variances 

beta_0_sd <- sqrt(vcov(model_1)[1,1])
beta_1_sd <- sqrt(vcov(model_1)[2,2])
beta_2_sd <- sqrt(vcov(model_1)[3,3])

# Creating the corresponding data set

`true value` <- c(1, 2, 10)
coeff <- c(beta_0, beta_1, beta_2)
sd <- c(beta_0_sd, beta_1_sd, beta_2_sd)
  
data.frame(`true value`, coeff, sd) %>% 
  kbl(booktabs = T) %>%
  kable_styling(latex_options = "striped", full_width = T) 
true.value coeff sd
(Intercept) 1 1.975165 0.8310497
x_1 2 3.139547 1.2999271
x_2 10 9.102589 0.6083256
generatedata_1 <- function(size = size, N = N) {
  
  # Generating the data set 

data = as.data.frame(matrix(NA, N, size*4))

for(col in 1:size) {
  for(col_1 in 1:size) {
    for(col_2 in 1:size) {
      for(col_3 in 1:size) {
        data[col] <- runif(N, min = 0, max = 1)
        data[col_1+size] <- rbinom(N, size = 1, prob = 0.3)
        data[col_2+(2*size)] <- rchisq(N,df=1)
        data[col_3+(3*size)] <- 1+2*data[col] +
                                10*data[col_1+10] + 
                                data[col_2+20]
      }
    }
  }
}
  return(data)
}

generatedata_1(size = 10, N = 10000)