\(~\)
\(~\)
library(ggplot2)
library(RColorBrewer)
library(plotly)
library(reshape2)
library(kableExtra)
library(tidyverse)
library(stringr)
library(data.table)
# Generating the data set
x_1 <- runif(1000, min = 0, max = 1)
# Plotting the variable x_1
x_1_plot <- as.data.frame(x_1) %>%
ggplot(aes(x = x_1)) +
geom_histogram(binwidth=0.01,fill = '#96E8BC', color = '#7DD181') +
theme(panel.background =
element_rect(fill ="white"),
axis.line = element_line(size = 1,
colour ="gray",
linetype=2)) +
xlab("Distribution of x_1") +
ylab("Count")
# Output
ggplotly(x_1_plot)
# Generating the data set
u_plus_1 <- rchisq(1000,df=1)
# Plotting the variable u + 1
u_plus_1_plot <- as.data.frame(u_plus_1) %>%
ggplot(aes(x = u_plus_1)) +
geom_histogram(binwidth=0.1, fill = '#96E8BC', color = '#7DD181') +
theme(panel.background =
element_rect(fill ="white"),
axis.line = element_line(size = 1,
colour ="gray",
linetype=2)) +
xlab("Distribution of u + 1") +
ylab("Count")
# Output
ggplotly(u_plus_1_plot)
\(~\)
\(~\)
\(~\)
# Generating the distribution
x_1 <- runif(5, min = 0, max = 1)
u_plus_1 <- rchisq(5,df=1)
# Storing the data set and computing mean and sd
dataset_1 <- as.data.frame(rbind(x_1, u_plus_1)) %>%
summarise(Mean_x_1 = mean(x_1),
Mean_u_plus_1 = mean(u_plus_1),
sd_x_1 = sd(x_1),
sd_u_plus_1 = sd(u_plus_1))
# Output
head(melt(dataset_1)) %>%
kbl(booktabs = T) %>%
kable_styling(latex_options = "striped", full_width = T)
| variable | value |
|---|---|
| Mean_x_1 | 0.5302880 |
| Mean_u_plus_1 | 2.6085303 |
| sd_x_1 | 0.3222855 |
| sd_u_plus_1 | 4.2763744 |
\(~\)
\(~\)
# Iterating the program :
## For uniform
### Setting a blank matrix in which data will be stored
m_uni = matrix(NA, nrow = 5, ncol = 10000)
m_uni <- as.data.frame(m_uni)
### Filling vectors of uniformly distributed generated data
for(col in 1:ncol(m_uni)) {
m_uni[col] <- round(runif(5, min = 0, max = 1), digits = 4)
}
dataset_mean_sd_uni <- m_uni %>%
summarise(across(.cols = everything(),
list(mean = mean, sd = sd))) %>%
rowid_to_column("index") %>%
mutate(index = case_when(index == 1 ~ "uni"))
## For chi^2
### Setting a blank matrix in which data will be stored
m_chi = matrix(NA, nrow = 5, ncol = 10000)
m_chi <- as.data.frame(m_chi)
### Filling vectors of chi^2 distributed generated data
for(col in 1:ncol(m_chi)) {
m_chi[col] <- round(rchisq(5,df=1), digits = 4)
}
dataset_mean_sd_chi <- m_chi %>%
summarise(across(.cols = everything(),
list(mean = mean, sd = sd))) %>%
rowid_to_column("index") %>%
mutate(index = case_when(index == 1 ~ "chi^2"))
## Merging the two data sets
data_short <- rbind(dataset_mean_sd_chi, dataset_mean_sd_uni)
data_long <- melt(data_short)
head(data_long) %>%
kbl(booktabs = T) %>%
kable_styling(latex_options = "striped", full_width = T)
| index | variable | value |
|---|---|---|
| chi^2 | V1_mean | 0.7731400 |
| uni | V1_mean | 0.2434600 |
| chi^2 | V1_sd | 1.0484636 |
| uni | V1_sd | 0.2702839 |
| chi^2 | V2_mean | 1.6661400 |
| uni | V2_mean | 0.5650200 |
\(~\)
\(~\)
mean_plot <- data_long %>% filter(str_ends(variable, "mean"))
mean_plot <- mean_plot %>%
ggplot(aes(x = value), group = index, color = index) +
geom_histogram(binwidth=0.05,fill = '#96E8BC', color = '#7DD181') +
theme(panel.background =
element_rect(fill ="white"),
axis.line = element_line(size = 1,
colour ="gray",
linetype=2)) +
xlab("Mean distribution")
# Output
ggplotly(mean_plot)
\(~\)
\(~\)
# Let's build the functions for our program
mean_distrib_custom_chi <- function(mrow, mcol){
### Setting a blank matrix in which data will be stored
m_chi = matrix(NA, nrow = mrow, ncol = mcol)
m_chi <- as.data.frame(m_chi)
### Filling vectors of uniformly distributed generated data
for(col in 1:ncol(m_chi)) {
m_chi[col] <- round(rchisq(5,df=1), digits = 4)
}
dataset_mean_sd_chi <- m_chi %>%
summarise(across(.cols = everything(),
list(mean = mean, sd = sd))) %>%
rowid_to_column("index") %>%
mutate(index = case_when(index == 1 ~ "chi^2"))
return(dataset_mean_sd_chi)
}
mean_distrib_custom_uni <- function(mrow, mcol){
### Setting a blank matrix in which data will be stored
m_uni = matrix(NA, nrow = mrow, ncol = mcol)
m_uni <- as.data.frame(m_uni)
### Filling vectors of uniformly distributed generated data
for(col in 1:ncol(m_uni)) {
m_uni[col] <- round(runif(5, min = 0, max = 1), digits = 4)
}
dataset_mean_sd_uni <- m_uni %>%
summarise(across(.cols = everything(),
list(mean = mean, sd = sd))) %>%
rowid_to_column("index") %>%
mutate(index = case_when(index == 1 ~ "uni"))
return(dataset_mean_sd_uni)
}
\(~\)
These two functions return two distinct data sets that we can merge :
\(~\)
# For N = 10
data <- rbind(mean_distrib_custom_chi(5, 10), mean_distrib_custom_uni(5, 10))
data_long_1 <- melt(data) %>% filter(str_ends(variable, "mean"))
data_long_1_summary <- data_long_1 %>%
summarise(Min = min(value),
'1st Qu' = quantile(value, probs = 0.25),
Median = quantile(value, probs = 0.5),
Mean = mean(value),
'3rd Qu' = quantile(value, probs = 0.75),
Max = max(value),
sd = sd(value))
data_long_1_summary %>%
kbl(booktabs = T) %>%
kable_styling(latex_options = "striped", full_width = T)
| Min | 1st Qu | Median | Mean | 3rd Qu | Max | sd |
|---|---|---|---|---|---|---|
| 0.25456 | 0.53792 | 0.72984 | 0.787901 | 0.802135 | 2.4272 | 0.5300473 |
data_long_1_summary
# For N = 100
data_1 <- rbind(mean_distrib_custom_chi(5, 100), mean_distrib_custom_uni(5, 100))
data_long_2 <- melt(data_1) %>% filter(str_ends(variable, "mean"))
data_long_2_summary <- data_long_2 %>%
summarise(Min = min(value),
'1st Qu' = quantile(value, probs = 0.25),
Median = quantile(value, probs = 0.5),
Mean = mean(value),
'3rd Qu' = quantile(value, probs = 0.75),
Max = max(value),
sd = sd(value))
data_long_2_summary %>%
kbl(booktabs = T) %>%
kable_styling(latex_options = "striped", full_width = T)
| Min | 1st Qu | Median | Mean | 3rd Qu | Max | sd |
|---|---|---|---|---|---|---|
| 0.08766 | 0.44677 | 0.56668 | 0.7668332 | 0.93129 | 3.43982 | 0.5329929 |
# For N = 1000
data_2 <- rbind(mean_distrib_custom_chi(5, 1000), mean_distrib_custom_uni(5, 1000))
data_long_3 <- melt(data_2) %>% filter(str_ends(variable, "mean"))
data_long_3_summary <- data_long_3 %>%
summarise(Min = min(value),
'1st Qu' = quantile(value, probs = 0.25),
Median = quantile(value, probs = 0.5),
Mean = mean(value),
'3rd Qu' = quantile(value, probs = 0.75),
Max = max(value),
sd = sd(value))
data_long_3_summary %>%
kbl(booktabs = T) %>%
kable_styling(latex_options = "striped", full_width = T)
| Min | 1st Qu | Median | Mean | 3rd Qu | Max | sd |
|---|---|---|---|---|---|---|
| 0.05822 | 0.437845 | 0.5731 | 0.7495552 | 0.864245 | 4.05702 | 0.5323606 |
\(~\)
\(~\)
N_100_1 <- sd(data_long_1$value)*((10)^(0.5)/(100)^(0.5))
N_1000_1 <- sd(data_long_1$value)*((10)^(0.5)/(1000)^(0.5))
N_10 <- sd(data_long_1$value)
set <- c(N_10, N_100_1, N_1000_1)
set
## [1] 0.53004726 0.16761566 0.05300473
\(~\)
\(~\)
Plotting the distribution to check normality assumption around the mean when the sample is large enough.
\(~\)
# For N = 10
mean_plot10 <- data_long_1 %>%
ggplot(aes(x = value), group = index, color = index) +
geom_histogram(binwidth=0.05,fill = '#96E8BC', color = '#7DD181') +
theme(panel.background =
element_rect(fill ="white"),
axis.line = element_line(size = 1,
colour ="gray",
linetype=2)) +
xlab("Mean distribution : N = 10")
# Output
ggplotly(mean_plot10)
# For N = 100
mean_plot100 <- data_long_2 %>%
ggplot(aes(x = value), group = index, color = index) +
geom_histogram(binwidth=0.05,fill = '#96E8BC', color = '#7DD181') +
theme(panel.background =
element_rect(fill ="white"),
axis.line = element_line(size = 1,
colour ="gray",
linetype=2)) +
xlab("Mean distribution : N = 100")
# Output
ggplotly(mean_plot100)
# For N = 1000
mean_plot1000 <- data_long_3 %>%
ggplot(aes(x = value), group = index, color = index) +
geom_histogram(binwidth=0.05,fill = '#96E8BC', color = '#7DD181') +
theme(panel.background =
element_rect(fill ="white"),
axis.line = element_line(size = 1,
colour ="gray",
linetype=2)) +
xlab("Mean distribution : N = 1000")
# Output
ggplotly(mean_plot1000)
\(~\)
\(~\)
# Generating variable
x_1 <- runif(10, min = 0, max = 1)
x_2 <- rbinom(10, size = 1, prob = 0.3)
u_plus_1 <- rchisq(10,df=1)
y <- 1+2*x_1+10*x_2 + u_plus_1
# Building the dataset
dataset_2 <- rbind(y, x_1, x_2, u_plus_1)
# Output
head(t(dataset_2)) %>%
kbl(booktabs = T) %>%
kable_styling(latex_options = "striped", full_width = T)
| y | x_1 | x_2 | u_plus_1 |
|---|---|---|---|
| 12.438084 | 0.6238498 | 1 | 0.1903844 |
| 4.077286 | 0.8344314 | 0 | 1.4084228 |
| 13.183887 | 0.6073338 | 1 | 0.9692196 |
| 13.467538 | 0.9506272 | 1 | 0.5662838 |
| 3.781189 | 0.5298906 | 0 | 1.7214078 |
| 14.662762 | 0.8253926 | 1 | 2.0119766 |
\(~\)
\(~\)
model_1 <- lm(y ~ x_1 + x_2, data = as.data.frame(dataset_2))
summary(model_1)
##
## Call:
## lm(formula = y ~ x_1 + x_2, data = as.data.frame(dataset_2))
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.95950 -0.57547 -0.08894 0.18514 1.51260
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.9752 0.8310 2.377 0.0491 *
## x_1 3.1395 1.2999 2.415 0.0464 *
## x_2 9.1026 0.6083 14.963 1.43e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.8745 on 7 degrees of freedom
## Multiple R-squared: 0.9771, Adjusted R-squared: 0.9705
## F-statistic: 149.1 on 2 and 7 DF, p-value: 1.829e-06
\(~\)
\(~\)
# Extracting coefficients
beta_0 <- model_1$coeff[1]
beta_1 <- model_1$coeff[2]
beta_2 <- model_1$coeff[3]
# Extracting the corresponding variances
beta_0_sd <- sqrt(vcov(model_1)[1,1])
beta_1_sd <- sqrt(vcov(model_1)[2,2])
beta_2_sd <- sqrt(vcov(model_1)[3,3])
# Creating the corresponding data set
`true value` <- c(1, 2, 10)
coeff <- c(beta_0, beta_1, beta_2)
sd <- c(beta_0_sd, beta_1_sd, beta_2_sd)
data.frame(`true value`, coeff, sd) %>%
kbl(booktabs = T) %>%
kable_styling(latex_options = "striped", full_width = T)
| true.value | coeff | sd | |
|---|---|---|---|
| (Intercept) | 1 | 1.975165 | 0.8310497 |
| x_1 | 2 | 3.139547 | 1.2999271 |
| x_2 | 10 | 9.102589 | 0.6083256 |
generatedata_1 <- function(size = size, N = N) {
# Generating the data set
data = as.data.frame(matrix(NA, N, size*4))
for(col in 1:size) {
for(col_1 in 1:size) {
for(col_2 in 1:size) {
for(col_3 in 1:size) {
data[col] <- runif(N, min = 0, max = 1)
data[col_1+size] <- rbinom(N, size = 1, prob = 0.3)
data[col_2+(2*size)] <- rchisq(N,df=1)
data[col_3+(3*size)] <- 1+2*data[col] +
10*data[col_1+10] +
data[col_2+20]
}
}
}
}
return(data)
}
generatedata_1(size = 10, N = 10000)