library(MASS)
library(matlib)
## Warning: package 'matlib' was built under R version 4.4.2
library(rockchalk)
## Warning: package 'rockchalk' was built under R version 4.4.2
## 
## Attaching package: 'rockchalk'
## The following object is masked from 'package:MASS':
## 
##     mvrnorm
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.4.2
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.4.2
## Warning: package 'tibble' was built under R version 4.4.2
## Warning: package 'tidyr' was built under R version 4.4.2
## Warning: package 'readr' was built under R version 4.4.2
## Warning: package 'purrr' was built under R version 4.4.2
## Warning: package 'dplyr' was built under R version 4.4.2
## Warning: package 'forcats' was built under R version 4.4.2
## Warning: package 'lubridate' was built under R version 4.4.2
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ lubridate 1.9.4     ✔ tibble    3.2.1
## ✔ purrr     1.0.2     ✔ tidyr     1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter()    masks stats::filter()
## ✖ dplyr::lag()       masks stats::lag()
## ✖ dplyr::select()    masks MASS::select()
## ✖ dplyr::summarize() masks rockchalk::summarize()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

#Answer 1.1 u is an abstract term that measures the variation of y that x_1, x_2, and x_3 cannot account for. When we assume that u is a standard normal, we are assuming that the variation of y that is not explained by x_1, x_2, and x_3 is just random noise. If we were to include u (the random noise), this would in turn be adding all the unexplained variation in y and would simply lead to overfitting. \

#Answer 1.2

beta1s_100 = c()
for (i in 1:100) {
  X = mvrnorm(100, mu = c(0,0,0), Sigma = diag(3))
  x1 = X[,1]
  x2 = X[,2]
  x3 = X[,3]
  u = rnorm(100, 0, 1)
  y = 3 * x1 + 2 * x2 -2 * x3 + u
  Betas = inv(t(X) %*% X)%*%t(X)%*%y
  Beta1 = Betas[1]
  beta1s_100 = c(beta1s_100, Beta1)
}
beta1s_100 = data.frame(beta1s_100)
ggplot(data = beta1s_100, aes(x = beta1s_100)) + geom_histogram(fill = "red") + labs(title = "Distribution of Beta1 with 100 Samples", x = "Beta1", y = "Frequency") + theme_minimal()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

#Answer 1.3

beta1s_5 = c()
for (i in 1:100) {
  X = mvrnorm(5, mu = c(0,0,0), Sigma = diag(3))
  u = rnorm(5, 0, 1)
  x1 = X[,1]
  x2 = X[,2]
  x3 = X[,3]
  y = 3 * x1 + 2 * x2 -2 * x3 + u
  Betas = inv(t(X) %*% X)%*%t(X)%*%y
  Beta1 = Betas[1]
  beta1s_5 = c(beta1s_5, Beta1)
}

beta1s_5 = data.frame(beta1s_5)
ggplot(data = beta1s_5, aes(x = beta1s_5)) + geom_histogram(fill = "blue") + labs(title = "Distribution of Beta1 with 5 Samples", x = "Beta1", y = "Frequency") + theme_minimal()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

#Answer 1.4

beta1s_100_df <- data.frame(value = beta1s_100, group = "100 Samples")
beta1s_5_df <- data.frame(value = beta1s_5, group = "5 Samples")
names(beta1s_100_df) = names(beta1s_5_df)
combined_data <- rbind(beta1s_100_df, beta1s_5_df)

# Plot with a legend
ggplot(combined_data, aes(x = beta1s_5, fill = group)) +
  geom_histogram(alpha = 0.7, position = "identity", bins = 30, color = "black") +
  labs(
    title = "Distribution of Beta1",
    x = "Beta1",
    y = "Frequency",
    fill = "Sample Size"
  ) +
  theme_minimal() + theme(plot.title = element_text(hjust = 0.5))

The estimators of the 100 sample and 5 sample data are both unbiased. However, the variance of the 5 sample estimator is much higher than that of the 100 sample estimator.

#Answer 1.5

beta1matrix = matrix(0, nrow = 196, ncol = 100)
for (i in 5:200) {
  beta1s = c()
  for (j in 1:100) {
    X = mvrnorm(i, mu = c(0,0,0), Sigma = diag(3))
    x1 = X[,1]
    x2 = X[,2]
    x3 = X[,3]
    u = rnorm(i, 0,1)
    y = 3 * x1 + 2 * x2 -2 * x3 + u
    Betas = inv(t(X) %*% X)%*%t(X)%*%y
    Beta1 = Betas[1]
    beta1s = c(beta1s, Beta1)
  }
  beta1matrix[i-4,] = beta1s
}

#Answer 1.6

averagebeta1s = rowSums(beta1matrix)/100
mylist = seq(5, 200)
averagebeta1s_df = data.frame(cbind(averagebeta1s, mylist))
names(averagebeta1s_df) = c("average_beta1", "N")
ggplot(data=averagebeta1s_df, aes(x=N, y=average_beta1, group=1)) +
  geom_line()+
  geom_point() + labs(x = "Number of Samples", y = "Average Beta1", title = "Relationship between the Average value of Beta1 and Sample Size")

#Answer 1.7 As seen above, the value of Beta_1 converges to the population parameter 3 as you increase the Sample Size. This is a demonstration of consistency.

#Answer 1.8

beta1s_100 = c()
confidence_intervals = matrix(0, 100, 2)
for (i in 1:100) {
  X = mvrnorm(100, mu = c(0,0,0), Sigma = diag(3))
  x1 = X[,1]
  x2 = X[,2]
  x3 = X[,3]
  u = rnorm(100, 0, 1)
  y = 3 * x1 + 2 * x2 -2 * x3 + u
  model = lm(y ~ 0 + x1 + x2 + x3)
  confidence_intervals[i,] = confint(model)[1]
}
result = sum(ifelse((confidence_intervals[,1] >= 3.0 & confidence_intervals[,2] <=3.0), 1, 0))/100
result
## [1] 0
summary(model)
## 
## Call:
## lm(formula = y ~ 0 + x1 + x2 + x3)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.99998 -0.74500  0.07483  0.68322  2.66969 
## 
## Coefficients:
##    Estimate Std. Error t value Pr(>|t|)    
## x1   3.0590     0.0961   31.83   <2e-16 ***
## x2   2.0340     0.1186   17.16   <2e-16 ***
## x3  -2.1512     0.1016  -21.17   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.026 on 97 degrees of freedom
## Multiple R-squared:  0.9496, Adjusted R-squared:  0.9481 
## F-statistic: 609.5 on 3 and 97 DF,  p-value: < 2.2e-16

None of the Confidence intervals contain the true value of Beta1. This is surprising as we would expect that at least 95% of the confidence intervals contain the true value of Beta1.

#Answer 2.1 and Answer 2.2

smallbeta1_3 = matrix(0, nrow = 96, ncol = 100)
mediumbeta1_25 = matrix(0, nrow = 96, ncol = 100)
largebeta1_50 = matrix(0, nrow = 96, ncol = 100)
DesignMatrix = data.frame(mvrnorm(i, mu = matrix(0,1,50), Sigma = diag(50)))
suppressWarnings({

##Lets do the small dataset first
for (i in 5:100) {
  beta1s = c()
  for (j in 1:100) {
    u = rnorm(i, 0, 1)
    y =  3 * DesignMatrix$X1 + 2 * DesignMatrix$X2 - 2 * DesignMatrix$X3 + u
    short_dataset_model = lm(y ~ 0+ X1+X2+X3, data = DesignMatrix)
    Beta1 = coef(short_dataset_model)[1]
    beta1s = c(beta1s, Beta1)
  }
  smallbeta1_3[(i-4),] = beta1s
}

for (i in 5:100) {
  beta1s = c()
  for (j in 1:100) {
    u = rnorm(i, 0, 1)
    y =  3 * DesignMatrix$X1 + 2 * DesignMatrix$X2 - 2 * DesignMatrix$X3 + u
    formula <- as.formula(paste("y ~ 0 +", paste(names(DesignMatrix)[1:25], collapse = " + ")))
    short_dataset_model = lm(formula, data = DesignMatrix)
    Beta1 = coef(short_dataset_model)[1]
    beta1s = c(beta1s, Beta1)
  }
  mediumbeta1_25[(i-4),] = beta1s
}

for (i in 5:100) {
  beta1s = c()
  for (j in 1:100) {
    u = rnorm(i, 0, 1)
    y =  3 * DesignMatrix$X1 + 2 * DesignMatrix$X2 - 2 * DesignMatrix$X3 + u
    formula <- as.formula(paste("y ~ 0 +", paste(names(DesignMatrix)[1:50], collapse = " + ")))
    short_dataset_model = lm(formula, data = DesignMatrix)
    Beta1 = coef(short_dataset_model)[1]
    beta1s = c(beta1s, Beta1)
  }
  largebeta1_50[(i-4),] = beta1s
}

})

#Answer 2.3

averagebeta1s_3 = rowSums(smallbeta1_3)/100
averagebeta1s_3 = replace(averagebeta1s_3, averagebeta1s_3 == 0, NA)
averagebeta1s_25 = rowSums(mediumbeta1_25)/100
averagebeta1s_50 = rowSums(largebeta1_50)/100
mylist = seq(5, 100)
df = data.frame(cbind(averagebeta1s_3, averagebeta1s_25, averagebeta1s_50, mylist))
names(df) = c("3 Regressors", "25 Regressors", "50 Regressors", "Number of Samples")
df_stretched <- df %>%
  pivot_longer(cols = ends_with("Regressors"), names_to = "Series", values_to = "Value")
ggplot(data=df_stretched, aes(x=`Number of Samples`, y = Value, color = Series)) +
  geom_line() +
  geom_point() + labs(x = "Number of Samples", y = "Average Beta1", title = "Relationship between the Average value of Beta1 and Sample Size")

standard_deviation_3 = apply(smallbeta1_3, 1, sd)
standard_deviation_25 = apply(mediumbeta1_25, 1, sd)
standard_deviation_50 = apply(largebeta1_50, 1, sd)
mylist = seq(5, 100)
df_sd = data.frame(cbind(standard_deviation_3, standard_deviation_25, standard_deviation_50, mylist))
names(df_sd) = c("3 Regressors", "25 Regressors", "50 Regressors", "Number of Samples")
df_stretched_sd <- df_sd %>%
  pivot_longer(cols = ends_with("Regressors"), names_to = "Series", values_to = "Value")
ggplot(data=df_stretched_sd, aes(x=`Number of Samples`, y = Value, color = Series)) +
  geom_line() +
  geom_point() + labs(x = "Number of Samples", y = "Standard Deviation of Beta1", title = "Relationship between the Standard Deviation of Beta1 and Sample Size")

#Answer 2.4 In all three regressions there is no evidence that suggests Beta1 is biased. All three regressions estimate Beta1 to be very close to 3, which is the population parameter. However, it is quite clear that some estimators are more volatile in others. In the graph that shows the standard deviation against the Number of samples stratified on the regression type, we can see that the higher the regressions, the more consistently higher the variance/standard deviation is.

#Answer 2.5 No not necessarily. Adding too many regressors may create highly volatile estimators. As seen in the graph above, the standard deviation of Beta1 with the 50 regressors is consistently higher than that of 25 regressors and 3 regressors. This can also be seen in the graph that plots the average value of Beta1 over N stratified over the regressions. The blue line (50 regressors) can clearly be seen having harsher fluctuations than the rest.

Furthermore, adding too many predictors can cause overfitting, which means the model may not be generalizable, causing the out of sample error to be large or the prediction error to be high.

#Answer 2.6 One way to prevent this is to use ML techniques such as Lasso or Ridge Regression to remove excessive parameters that would end up hurting the model. One can also perform cross validation in order to tune the hyperparameters to create an optimized regression model. A more hands on approach could be to perform a statistical analysis on the data and see what only include the parameters that best explain the variance of the response.