Assignment 1 Results by Submission Date

Assignment 1

setwd("C:/Users/drobb/Desktop/Linear Regression")
library(ggplot2)
library(MASS)
library(quantreg)
## Loading required package: SparseM
## 
## Attaching package: 'SparseM'
## The following object is masked from 'package:base':
## 
##     backsolve
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ lubridate 1.9.2     ✔ tibble    3.2.1
## ✔ purrr     1.0.2     ✔ tidyr     1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ✖ dplyr::select() masks MASS::select()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(investr)
url <- "https://bgreenwell.github.io/uc-bana7052/data/alumni.csv"
alumni <- read.csv(url)
str(alumni)  # print structure of the alumni data frame
## 'data.frame':    48 obs. of  5 variables:
##  $ school                     : chr  "Boston College" "Brandeis University " "Brown University" "California Institute of Technology" ...
##  $ percent_of_classes_under_20: int  39 68 60 65 67 52 45 69 72 61 ...
##  $ student_faculty_ratio      : int  13 8 8 3 10 8 12 7 13 10 ...
##  $ alumni_giving_rate         : int  25 33 40 46 28 31 27 31 35 53 ...
##  $ private                    : int  1 1 1 1 1 1 1 1 1 1 ...
summary(alumni)
##     school          percent_of_classes_under_20 student_faculty_ratio
##  Length:48          Min.   :29.00               Min.   : 3.00        
##  Class :character   1st Qu.:44.75               1st Qu.: 8.00        
##  Mode  :character   Median :59.50               Median :10.50        
##                     Mean   :55.73               Mean   :11.54        
##                     3rd Qu.:66.25               3rd Qu.:13.50        
##                     Max.   :77.00               Max.   :23.00        
##  alumni_giving_rate    private      
##  Min.   : 7.00      Min.   :0.0000  
##  1st Qu.:18.75      1st Qu.:0.0000  
##  Median :29.00      Median :1.0000  
##  Mean   :29.27      Mean   :0.6875  
##  3rd Qu.:38.50      3rd Qu.:1.0000  
##  Max.   :67.00      Max.   :1.0000
#Note: response variable is Alumni Giving Rate and the predictor variable is % of classes under 20
summary(alumni$percent_of_classes_under_20)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   29.00   44.75   59.50   55.73   66.25   77.00
summary(alumni$alumni_giving_rate)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    7.00   18.75   29.00   29.27   38.50   67.00
ggplot(alumni, aes(x = percent_of_classes_under_20, y = alumni_giving_rate)) + 
  geom_point() +
  labs(title = "Alumni Giving Rate to Percent of Classes Under 20", 
       x = "% of Classes Under 20",
       y = "Alumni Giving Rate")

find_outliers <- function(variable) {
  # Flag values outside the conventional 1.5 * IQR fences
  q <- quantile(variable)
  iqr <- IQR(variable)
  lower_limit <- q[2] - 1.5 * iqr   # 25th percentile minus 1.5 * IQR
  upper_limit <- q[4] + 1.5 * iqr   # 75th percentile plus 1.5 * IQR
  return(variable[variable < lower_limit | variable > upper_limit])
}
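# Quick usage sketch (illustrative only): apply the helper above to both variables;
# boxplot.stats() below applies the conventional 1.5 * IQR rule directly via base R.
find_outliers(alumni$percent_of_classes_under_20)
find_outliers(alumni$alumni_giving_rate)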


outliers_under_20 <- boxplot.stats(alumni$percent_of_classes_under_20)$out
outliers_giving_rate <- boxplot.stats(alumni$alumni_giving_rate)$out
correlation_coefficient <- cor(alumni$percent_of_classes_under_20, alumni$alumni_giving_rate)
model <- lm(alumni_giving_rate ~ percent_of_classes_under_20, data=alumni)
coefficients(model)
##                 (Intercept) percent_of_classes_under_20 
##                  -7.3860676                   0.6577687
outliers_under_20
## integer(0)
# Find outliers: observations whose giving rate falls outside the band formed by the
# mean giving rate plus or minus 1.96 standard deviations of the model residuals
residuals <- residuals(model)
confidence_interval <- mean(alumni$alumni_giving_rate) + c(-1.96, 1.96) * sd(residuals)
outliers <- subset(alumni, alumni_giving_rate < confidence_interval[1] | alumni_giving_rate > confidence_interval[2])
print(outliers)
##                        school percent_of_classes_under_20 student_faculty_ratio
## 10          Dartmouth College                          61                    10
## 21       Princeton University                          68                     5
## 27     U. of California-Davis                          32                    19
## 28    U. of California-Irvine                          42                    20
## 30 U. of California-San Diego                          48                    19
## 48            Yale University                          77                     7
##    alumni_giving_rate private
## 10                 53       1
## 21                 67       1
## 27                  7       0
## 28                  9       0
## 30                  8       0
## 48                 50       1
#Fitted regression equation: Y = -7.3860676 + 0.6577687X
ggplot(alumni, aes(x = percent_of_classes_under_20, y = alumni_giving_rate)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE, color = "blue") +
  labs(title = paste("Alumni Giving Rate to Percent of Classes Under 20\n",
                     "\nCorrelation Coefficient:", round(correlation_coefficient, 2),
                     "\nRegression Equation: Y = -7.39 + 0.66X",
                     sep = ""),
       x = "% of Classes Under 20",
       y = "Alumni Giving Rate")
## `geom_smooth()` using formula = 'y ~ x'

# Question 2 ----------------------------------------------------------------------


set.seed(7052)
n <- 100
X <- rnorm(n, mean = 2, sd = 0.1)
error <- rnorm(n, mean = 0, sd = 0.5)
Y <- 10 + 5 * X + error
summary(X)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.725   1.923   2.001   2.004   2.070   2.243
summary(Y)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   18.09   19.67   20.11   20.17   20.70   21.80
boxplot(X, Y, names = c("X", "Y"), main = "Question 2 Boxplot")

correlation_coefficient_2 <- cor(X, Y)
plot(X, Y, main = "Question 2 Scatter Plot", xlab = "X", ylab="Y")

model_Q2 <- lm(Y ~ X)
coefficients(model_Q2)
## (Intercept)           X 
##    9.021796    5.565160
mse_Q2 <- mean(model_Q2$residuals^2)
mse_Q2
## [1] 0.1992276
mean_X_Q2 <- mean(X)
mean_Y_Q2 <- mean(Y)
plot(X, Y, main = "Question 2 Plot", xlab = "X", ylab = "Y")
abline(model_Q2, col = "red")  # fitted least-squares line
points(mean_X_Q2, mean_Y_Q2, col = "blue", pch = 19)
intercept <- coef(model_Q2)[1]
slope <- coef(model_Q2)[2]
equation <- paste("Y =", round(intercept, 2), "+", round(slope, 2), "X")
text(mean_X_Q2, mean_Y_Q2, equation, pos = 3, col = "black")
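# Quick numerical check (illustrative): the least-squares line passes through the
# point of sample means, so the prediction at mean(X) should equal mean(Y).
predict(model_Q2, newdata = data.frame(X = mean_X_Q2))
mean_Y_Q2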

# Question 3 ----------------------------------------------------------------------

library(MASS)
# Robust regression via M-estimation (Huber psi by default); compare with the LAD fit from rq() below
lad_model <- rlm(Y ~ X)
summary(lad_model)
## 
## Call: rlm(formula = Y ~ X)
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.21113 -0.30597  0.00906  0.30188  1.35462 
## 
## Coefficients:
##             Value   Std. Error t value
## (Intercept)  9.1089  0.8469    10.7552
## X            5.5220  0.4221    13.0833
## 
## Residual standard error: 0.4542 on 98 degrees of freedom
library(quantreg)
# Quantile regression at the default tau = 0.5 (the median), i.e. least absolute deviations
lad_quantile_model <- rq(Y ~ X)
summary(lad_quantile_model)
## Warning in rq.fit.br(x, y, tau = tau, ci = TRUE, ...): Solution may be
## nonunique
## 
## Call: rq(formula = Y ~ X)
## 
## tau: [1] 0.5
## 
## Coefficients:
##             coefficients lower bd upper bd
## (Intercept)  8.61233      6.15307 11.81847
## X            5.77192      4.12206  6.98627

Report

Question 1 Beginning with the relationship between class size and the likelihood of donating after graduation, the predictor variable is the percent of classes with fewer than 20 students and the response variable is the alumni giving rate provided in the data set. Please see Table 1.1 and Table 1.2 below for the summary statistics. The relationship in the data is roughly linear, with several notable outliers. Outliers are defined here as observations whose giving rate falls outside the band formed by the mean giving rate plus or minus 1.96 standard deviations of the model residuals (an approximate 95% band). Six schools fall outside this band: Dartmouth College (53%), Princeton University (67%), and Yale University (50%) give at a higher rate than the model predicts, while U. of California-Davis (7%), U. of California-Irvine (9%), and U. of California-San Diego (8%) give at a lower rate than predicted. The correlation coefficient of the data is 0.65, indicating a moderate, positive correlation between the percent of classes under 20 students and the annual giving rate of the alumni. Fitting a simple linear regression to the data gives the estimated regression equation Y = -7.39 + 0.66X, where X is the percent of classes under 20 students and Y is the alumni giving rate. In general, the more classes a group of alumni took with fewer than twenty students, the more likely those individuals are to donate back to the college. However, because the correlation is moderate rather than strong, other significant factors likely need to be taken into consideration before using this equation as a predictor of giving rates. The scatterplot outlining the information discussed in this question can be seen below in Plot 1.1.

Table 1.1: Summary Statistics of Percent of Classes Under 20 Students
  Min     1st Qu.  Median   Mean    3rd Qu.  Max
  29.00   44.75    59.50    55.73   66.25    77.00

Table 1.2: Summary Statistics of Alumni Giving Rate
  Min     1st Qu.  Median   Mean    3rd Qu.  Max
  7.00    18.75    29.00    29.27   38.50    67.00

Plot 1.1

Question 2 Running a simulation with mean response E(Y|X) = 10 + 5X and seed 7052, we obtain the summaries in Tables 2.1 and 2.2, where X is the predictor variable and Y is the response variable. There are no extreme outliers, as seen in Plot 2.1 and Plot 2.2, and the correlation coefficient is 0.8042. Fitting a simple linear regression gives the estimated model Y = 9.02 + 5.57X, with an MSE of 0.199. The sample mean of X is 2.003677 and the sample mean of Y is 20.17258. Plotting the fitted regression line together with the point given by the sample means of X and Y shows that the line of best fit passes directly through that point.

Table 2.1: Summary Statistics of X
  Min     1st Qu.  Median   Mean    3rd Qu.  Max
  1.725   1.923    2.001    2.004   2.070    2.243

Table 2.2: Summary Statistics of Y
  Min     1st Qu.  Median   Mean    3rd Qu.  Max
  18.09   19.67    20.11    20.17   20.70    21.80

Plot 2.1

Plot 2.2

Question 3 Least Absolute Deviations (LAD) regression minimizes the sum of absolute residuals rather than the residual sum of squares, so it is not affected as heavily by outliers as a criterion based on squared errors. Fitting a robust regression with rlm() gives a residual standard error of 0.4542 on 98 degrees of freedom, with coefficient estimates close to the least-squares fit. The quantile regression fit from rq() at tau = 0.5 minimizes the sum of absolute deviations directly, modeling the conditional median of Y rather than the conditional mean, which also buffers against outliers. Ordinary Least Squares remains a popular choice for estimating β_0 and β_1 because it is simple, it coincides with maximum likelihood when the errors are normally distributed, and its estimates become more precise as the sample size increases, provided the usual linear regression assumptions hold.
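To make the contrast concrete, the short sketch below (illustrative only, assuming the simulated X and Y from Question 2 are still in the workspace) minimizes the LAD objective directly with optim() and compares the result with the OLS coefficients from lm(); the rq() fit above should land close to the optim() solution.

# Illustrative sketch: LAD minimizes the sum of absolute residuals, while OLS
# minimizes the sum of squared residuals.
lad_loss <- function(beta, x, y) sum(abs(y - beta[1] - beta[2] * x))
lad_fit <- optim(par = coef(lm(Y ~ X)), fn = lad_loss, x = X, y = Y)
lad_fit$par      # LAD estimates of the intercept and slope
coef(lm(Y ~ X))  # OLS estimates, for comparison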

Question 4 In relationship a, the point (X̄, Ȳ) is the point of sample means of the data set. As shown in Plot 2.2, the fitted regression line must pass through this point. In relationship b, the sum of the residuals is always zero under least squares regression: each residual is the difference between an observed and a predicted value, and the positive and negative residuals cancel when summed. Relationship c follows the same logic as relationship b: the sum of the predicted values equals the sum of the observed values when ordinary least squares is used to fit the regression line. In relationship d, the sum of the residuals weighted by the predictor values equals zero; this is a direct consequence of the normal equations obtained by minimizing the sum of squared residuals. In relationship e, the same minimization property applies, so under ordinary least squares the sum of the residuals weighted by the fitted values is also zero. Relationship f states the OLS objective itself: the fitted coefficients are the values that minimize the residual sum of squares.
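These properties can be checked numerically. The sketch below is illustrative only and assumes the simple linear regression fit to the alumni data (the object model from Question 1) is still in the workspace.

# Numerical check of the least-squares properties discussed above
e <- residuals(model)
yhat <- fitted(model)
x <- alumni$percent_of_classes_under_20
sum(e)                                        # (b) residuals sum to zero
c(sum(yhat), sum(alumni$alumni_giving_rate))  # (c) fitted values sum to the observed values
sum(e * x)                                    # (d) residuals are orthogonal to the predictor
sum(e * yhat)                                 # (e) residuals are orthogonal to the fitted values
predict(model, newdata = data.frame(percent_of_classes_under_20 = mean(x)))  # (a) equals mean(alumni_giving_rate)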

Assignment 2

Module 2 Assignment

# Question 1 ----------------------------------------------------------------------
#Note: this is the same plot as used in the last assignment
ggplot(alumni, aes(x = percent_of_classes_under_20, y = alumni_giving_rate)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE, color = "blue") +
  labs(title = paste("Alumni Giving Rate to Percent of Classes Under 20\n",
                     "\nCorrelation Coefficient:", round(correlation_coefficient, 2),
                     "\nRegression Equation: Y = -7.39 + 0.66X",
                     sep = ""),
       x = "% of Classes Under 20",
       y = "Alumni Giving Rate")
## `geom_smooth()` using formula = 'y ~ x'

tstat <- summary(model)$coefficients["percent_of_classes_under_20", "t value"]
p_value <- summary(model)$coefficients["percent_of_classes_under_20", "Pr(>|t|)"]
cat("Estimated Slope:", coef(model)["percent_of_classes_under_20"], "\n")
## Estimated Slope: 0.6577687
cat("T-Statstic:", tstat, "\n")
## T-Statstic: 5.734448
cat("P-Value:", p_value, "\n")
## P-Value: 7.228121e-07
if (!is.na(p_value) && p_value < 0.05) {
  cat("The null hypothesis is rejected as the estimated slope is significant at the α=0.05 level.\n")
} else {
  cat("The estimated slope is not statistically significant at the α=0.05 level.\n")
}
## The null hypothesis is rejected as the estimated slope is significant at the α=0.05 level.
#null hypothesis is rejected

#Repeat part a. above using the equivalent F-test

model <- lm(alumni_giving_rate ~ percent_of_classes_under_20, data = alumni)
f_test <- anova(model)
f_statistic <- f_test$"F value"[1]
p_value_f <- f_test$"Pr(>F)"[1]
cat("Estimated Slope F-Test", coef(model)["percent_of_classes_under_20"], "\n")
## Estimated Slope F-Test 0.6577687
cat("F-Statistic:", f_statistic, "\n")
## F-Statistic: 32.88389
cat("P-Value (F-Test):", p_value_f, "\n")
## P-Value (F-Test): 7.228121e-07
if(p_value_f < 0.05){
  cat("The slope from the F-test is statistically significant at the α=0.05 level.\n")
  cat("The null hypothesis is rejected and there is a correlation between class size and alumni giving rate.\n")
} else {
  cat("The estimated slope is not statistically significant at the α=0.05 level.\n")
  cat("The null hypothesis is not rejected and there is no evidence of a correlation between class size and alumni giving rate. \n")
}
## The slope from the F-test is statistically significant at the α=0.05 level.
## The null hypothesis is rejected and there is a correlation between class size and alumni giving rate.
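# Sanity check (illustrative): in simple linear regression the F-statistic for the
# slope equals the square of its t-statistic, so the two tests are equivalent.
c(t_squared = tstat^2, F_statistic = f_statistic)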
#What is the value of R^2?  Please interpret
model <- lm(alumni_giving_rate ~ percent_of_classes_under_20, data = alumni)
r_squared <- summary(model)$r.squared
cat("R-squared:", r_squared, "\n")
## R-squared: 0.4168645
cat(r_squared*100, "percent of the variation in giving rate is explained by the percentage of classes attended with fewer than 20 students.", 
    (1-(r_squared))*100, "percent of the variation in giving rate is not explained by the percentage of classes under 20 students and may be affected by other factors. \n")
## 41.68645 percent of the variation in giving rate is explained by the percentage of classes attended with fewer than 20 students. 58.31355 percent of the variation in giving rate is not explained by the percentage of classes under 20 students and may be affected by other factors.
r_correlation_cofficient <- cor(alumni$percent_of_classes_under_20, alumni$alumni_giving_rate)
cat("Correlation Coefficient (r):", r_correlation_cofficient, "\n")
## Correlation Coefficient (r): 0.6456504
cat("The relationship between r^2 and R^2 are equal to one another in a linear regression. So r is the squareroot of R^2.  r, the correlation coefficient ranges from -1 to 1, so it can show a positive or negative form of linear relationship.  R^2 must be positive and if it is 0, there is no explained variance and if it is 1 all variance is explained.")
## The relationship between r^2 and R^2 are equal to one another in a linear regression. So r is the squareroot of R^2.  r, the correlation coefficient ranges from -1 to 1, so it can show a positive or negative form of linear relationship.  R^2 must be positive and if it is 0, there is no explained variance and if it is 1 all variance is explained.
cat("In this model, the relationship is positively correlated and class size represents 41% of donation percent")
## In this model, the relationship is positively correlated and class size represents 41% of donation percent
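# Quick check (illustrative): R^2 equals the squared correlation between the predictor and the response
all.equal(r_correlation_cofficient^2, r_squared)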
# Plot the fitted line with its 95% confidence band and mark the point of sample means
mean_data <- data.frame(percent_of_classes_under_20 = mean(alumni$percent_of_classes_under_20),
                        alumni_giving_rate = mean(alumni$alumni_giving_rate))
ggplot(alumni, aes(x = percent_of_classes_under_20, y = alumni_giving_rate)) +
  geom_point() +
  stat_smooth(method = "lm", se = TRUE, col = "blue", fill = "lightblue") +
  geom_point(data = mean_data, col = "red", pch = 19) +
  labs(title = "Fitted Regression Line with 95% Confidence Band",
       x = "% of Classes Under 20",
       y = "Alumni Giving Rate")
## `geom_smooth()` using formula = 'y ~ x'

cat("At the mean point, the confidence band is narrower compared to the rest of the model")
## At the mean point, the confidence band is narrower compared to the rest of the model
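# Illustrative check of the band width: the confidence interval for the mean response
# is narrowest at the mean of the predictor and widens toward the edges of the data.
new_x <- data.frame(percent_of_classes_under_20 = c(min(alumni$percent_of_classes_under_20),
                                                    mean(alumni$percent_of_classes_under_20),
                                                    max(alumni$percent_of_classes_under_20)))
ci <- predict(model, newdata = new_x, interval = "confidence")
cbind(new_x, width = ci[, "upr"] - ci[, "lwr"])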
# Question 2 ----------------------------------------------------------------------
set.seed(7052)

simulated_regression <- function(n, sigma_error) {
  X <- rnorm(n, mean=2, sd=0.1)
  error <- rnorm(n, mean=0, sd=sigma_error)
  Y <- 10 + 5 * X + error
  
  # b: Fit the simple linear regression and extract the slope's t-statistic, p-value, and MSE
  model <- lm(Y ~ X)
  coefficients_table <- broom::tidy(model)
  t_statistic <- coefficients_table[2, "statistic"]
  p_value <- coefficients_table[2, "p.value"]
  mse <- mean(model$residuals^2)
  
  # Return results
  return(data.frame(
    n=n,
    sigma_error = sigma_error,
    coefficients = coefficients_table,
    t_statistic = t_statistic,
    p_value = p_value,
    mse = mse
  ))
}

results <- bind_rows(
  simulated_regression(100, 0.5), 
  simulated_regression(100, 1),
  simulated_regression(400, 0.5), 
  simulated_regression(400, 1)
)

print(results)
##     n sigma_error coefficients.term coefficients.estimate
## 1 100         0.5       (Intercept)              9.021796
## 2 100         0.5                 X              5.565160
## 3 100         1.0       (Intercept)             13.077796
## 4 100         1.0                 X              3.460281
## 5 400         0.5       (Intercept)              9.697322
## 6 400         0.5                 X              5.160805
## 7 400         1.0       (Intercept)              9.963304
## 8 400         1.0                 X              5.033593
##   coefficients.std.error coefficients.statistic coefficients.p.value statistic
## 1              0.8336483              10.822065         2.007658e-18 13.395491
## 2              0.4154502              13.395491         7.118010e-24 13.395491
## 3              2.0874900               6.264842         9.978287e-09  3.343580
## 4              1.0349032               3.343580         1.172512e-03  3.343580
## 5              0.5207461              18.621975         4.072601e-56 19.811650
## 6              0.2604935              19.811650         2.797442e-61 19.811650
## 7              1.0317701               9.656516         5.832549e-20  9.749772
## 8              0.5162781               9.749772         2.784618e-20  9.749772
##        p.value       mse
## 1 7.118010e-24 0.1992276
## 2 7.118010e-24 0.1992276
## 3 1.172512e-03 0.9106786
## 4 1.172512e-03 0.9106786
## 5 2.797442e-61 0.2568544
## 6 2.797442e-61 0.2568544
## 7 2.784618e-20 1.0589855
## 8 2.784618e-20 1.0589855
# Question 3 ----------------------------------------------------------------------
#a
true_intercept <- 10
true_slope <- 5
bias_intercept <- mean(results$coefficients.estimate[results$coefficients.term == "(Intercept)"]) - true_intercept
bias_slope <- mean(results$coefficients.estimate[results$coefficients.term == "X"]) - true_slope
variance_intercept <- var(results$coefficients.estimate[results$coefficients.term == "(Intercept)"])
variance_slope <- var(results$coefficients.estimate[results$coefficients.term == "X"])
cat("Bias for Intercept:", bias_intercept, "\n")
## Bias for Intercept: 0.4400544
cat("Bias for Slope:", bias_slope, "\n")
## Bias for Slope: -0.1950401
cat("Variance for Intercept:", variance_intercept, "\n")
## Variance for Intercept: 3.249359
cat("Variance for Slope:", variance_slope, "\n")
## Variance for Slope: 0.8549879
#b
cat("As the sample size increases, we expect the variance of β_0 and β_1 to decrease.  This is because as the sample size increases, the result tends to normalize and therefore the percent of variances tends to decrease")
## As the sample size increases, we expect the variance of β_0 and β_1 to decrease.  This is because as the sample size increases, the result tends to normalize and therefore the percent of variances tends to decrease
cat("As the error variance increases, we expect the variance of β_0 and β_1 to increase.  This is because the points are not as close to the regression line as before.  Therefore, the end result is more dependent on the individual points used in the analysis, and the variability is higher")
## As the error variance increases, we expect the variance of β_0 and β_1 to increase.  This is because the points are not as close to the regression line as before.  Therefore, the end result is more dependent on the individual points used in the analysis, and the variability is higher
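# Illustrative check of the two claims above: the theoretical variance of the slope is
# sigma^2 / Sxx, where Sxx = sum((x - mean(x))^2), so it shrinks as n grows and grows with sigma.
theoretical_var_slope <- function(n, sigma, sd_x = 0.1) {
  x <- rnorm(n, mean = 2, sd = sd_x)   # predictor drawn as in the simulation
  sigma^2 / sum((x - mean(x))^2)
}
c(n100_sd0.5 = theoretical_var_slope(100, 0.5),
  n100_sd1   = theoretical_var_slope(100, 1),
  n400_sd0.5 = theoretical_var_slope(400, 0.5),
  n400_sd1   = theoretical_var_slope(400, 1))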
#c
#Using original sigma value of 0.1 
true_sigma_squared <- 0.1^2
bias_mse <- mean(results$mse) - true_sigma_squared
ml_estimate_sigma_squared <- var(model$residuals)
cat("Bias of MSE:", bias_mse, "\n")
## Bias of MSE: 0.5964365
cat("ML estimate of sigma squared:", ml_estimate_sigma_squared, "\n")
## ML estimate of sigma squared: 105.355
cat("The main difference between the bias of the model's MSE and the ML estimate of sigma squared is that the bias of the MSE is the difference between the average estimated MSE adn the variance of the error terms. The bias is influenced by the random variables used in the dataset. Whereas, the ML estimate of sigma squared is the residuals in the models' variances. It can show the estimate of the true variance in error terms, not just the underlying variance. This returns a better picture of true variability in the data as a whole.  This is why we use MSE, because it looks at how well the predictions match the actual data.  ")
## The main difference between the bias of the model's MSE and the ML estimate of sigma squared is that the bias of the MSE is the difference between the average estimated MSE adn the variance of the error terms. The bias is influenced by the random variables used in the dataset. Whereas, the ML estimate of sigma squared is the residuals in the models' variances. It can show the estimate of the true variance in error terms, not just the underlying variance. This returns a better picture of true variability in the data as a whole.  This is why we use MSE, because it looks at how well the predictions match the actual data.
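# Illustrative comparison of the two variance estimators for the Question 2 fit (assuming
# model_Q2 is still in the workspace): the unbiased MSE divides the SSE by n - 2, while the
# ML estimate divides by n (i.e. the mean of the squared residuals).
sse <- sum(residuals(model_Q2)^2)
n_obs <- length(residuals(model_Q2))
c(mse_unbiased = sse / (n_obs - 2), sigma2_ml = sse / n_obs)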

Danielle Robben BANA 7052 Module 2 Assignment

Key Information Please note that the code has been provided in the R Markdown file in Appendix A, found at the bottom of this report.

Question 1 As seen in Graph A, the regression equation relating the percent of classes a student took with fewer than 20 peers to the rate at which those students later donate back to the university is Y = -7.39 + 0.66X, with a correlation coefficient of 0.65. The estimated slope of the relationship is 0.6577687, with a t-statistic of 5.734448 and a p-value of 7.228121e-07. The null hypothesis states that there is no relationship between the giving rate of alumni and the percent of classes at each college with fewer than 20 students. Because the p-value is less than 0.05, the null hypothesis is rejected. The t-statistic is positive and significant, and the p-value is well below the 0.05 threshold, so there is a positive relationship between the dependent variable (giving rate) and the independent variable (percent of classes under 20). This information can be seen in Graph A. Using the equivalent F-test, the estimated slope of the equation is 0.6577687, the F-statistic is 32.88389, and the p-value is 7.228121e-07. Since the F-statistic is significant, the null hypothesis is rejected and there is a correlation between class size and alumni giving rate. This agrees with the t-test on the slope from the simple linear regression; as expected, both tests reject the null hypothesis.

The R-squared value is 0.4168645, meaning 41.68645 percent of the variation in giving rate is explained by the percentage of classes attended with fewer than 20 students; the remaining 58.31355 percent is not explained by this predictor and may be affected by other factors. The correlation coefficient r between the percent of classes under 20 and the alumni giving rate is 0.6456504. In simple linear regression, R^2 equals the square of r, so r is the square root of R^2 (with the sign of the slope). Lowercase r ranges from -1 to 1, so it can indicate a positive or negative linear relationship, while R^2 lies between 0 and 1: if it is 0, none of the variance is explained, and if it is 1, all of the variance is explained. In this model the relationship is positive, and class size explains about 41% of the variation in the donation rate. When plotting the data, we can see that the confidence band is narrowest at the mean point and widens away from it; see Graph B.

Graph A:

Graph B:

Question 2 Please see the attached Table A for the estimated coefficients, t-statistics, p-values, and MSEs for each setting. As the sample size increases, the t-statistics for the coefficients increase and the p-values decrease. When the error standard deviation decreases from 1 to 0.5, the standard errors of the coefficient estimates also decrease; this makes sense, since the two move together. The smaller the standard errors, the smaller the p-values become as the model becomes more precise, and the t-statistics become larger as the error variance decreases.
When the sample size increases, the standard errors of the coefficient estimates become smaller, the t-statistics increase, and the p-values decrease, resulting in greater statistical significance. The estimated intercept and slope also change as the standard errors and sample sizes change. The intercept is the expected response when the predictor equals zero, which is consistent with the earlier graphs, and the slope shifts across settings as the coefficient estimates change. The smaller the error standard deviation, the smaller the MSE; the smallest MSE occurs in the setting with n = 100 and an error standard deviation of 0.5. In conclusion, the smaller the error variance and the larger the sample size, the more precise the estimates and results. When possible, it is recommended to use as large a sample as possible and to keep the MSE small when drawing conclusions.

Table A:

   n  sigma_error  term          estimate   std.error   t value     p-value        slope t     slope p        mse
1  100    0.5      (Intercept)   9.021796   0.8336483   10.822065   2.007658e-18   13.395491   7.118010e-24   0.1992276
2  100    0.5      X             5.565160   0.4154502   13.395491   7.118010e-24   13.395491   7.118010e-24   0.1992276
3  100    1.0      (Intercept)  13.077796   2.0874900    6.264842   9.978287e-09    3.343580   1.172512e-03   0.9106786
4  100    1.0      X             3.460281   1.0349032    3.343580   1.172512e-03    3.343580   1.172512e-03   0.9106786
5  400    0.5      (Intercept)   9.697322   0.5207461   18.621975   4.072601e-56   19.811650   2.797442e-61   0.2568544
6  400    0.5      X             5.160805   0.2604935   19.811650   2.797442e-61   19.811650   2.797442e-61   0.2568544
7  400    1.0      (Intercept)   9.963304   1.0317701    9.656516   5.832549e-20    9.749772   2.784618e-20   1.0589855
8  400    1.0      X             5.033593   0.5162781    9.749772   2.784618e-20    9.749772   2.784618e-20   1.0589855

Question 3 Using the true intercept of 10 and the true slope of 5, the bias of the intercept estimate is 0.4400544 and the bias of the slope estimate is -0.1950401. The variance of the intercept estimates is 3.249359 and the variance of the slope estimates is 0.8549879. As the sample size increases, we expect the variance of β_0 and β_1 to decrease: with more observations, the sampling distribution of each estimate concentrates around the true value, so the estimates become less variable. As the error variance increases, we expect the variance of β_0 and β_1 to increase: the points scatter farther from the regression line, so the fitted line depends more on the particular points in the sample and the estimates are more variable. The bias of the model's MSE is 0.5964365 and the ML estimate of sigma squared is 105.355. The main difference between the bias of the model's MSE and the ML estimate of sigma squared is that the bias of the MSE is the difference between the average estimated MSE and the variance of the error terms, and it is influenced by the random draws used in the dataset. The ML estimate of sigma squared, on the other hand, is computed from the variance of the model's residuals; it estimates the error variance directly from how far the observations fall from the fitted line. This gives a picture of the true variability in the data as a whole, which is why we use the MSE: it measures how well the predictions match the actual data.

Appendix Appendix A: http://rpubs.com/drobb2019/LinearRegressionAssignment1-2