setwd("C:/Users/drobb/Desktop/Linear Regression")
library(ggplot2)
library(MASS)
library(quantreg)
## Loading required package: SparseM
##
## Attaching package: 'SparseM'
## The following object is masked from 'package:base':
##
## backsolve
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ lubridate 1.9.2 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ dplyr::select() masks MASS::select()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(investr)
url <- "C:/Users/drobb/Desktop/Linear Regression/alumni.csv"
alumni <- read.csv(url)
str(alumni) # print structure of the alumni data frame
## 'data.frame': 48 obs. of 5 variables:
## $ school : chr "Boston College" "Brandeis University " "Brown University" "California Institute of Technology" ...
## $ percent_of_classes_under_20: int 39 68 60 65 67 52 45 69 72 61 ...
## $ student_faculty_ratio : int 13 8 8 3 10 8 12 7 13 10 ...
## $ alumni_giving_rate : int 25 33 40 46 28 31 27 31 35 53 ...
## $ private : int 1 1 1 1 1 1 1 1 1 1 ...
summary(alumni)
## school percent_of_classes_under_20 student_faculty_ratio
## Length:48 Min. :29.00 Min. : 3.00
## Class :character 1st Qu.:44.75 1st Qu.: 8.00
## Mode :character Median :59.50 Median :10.50
## Mean :55.73 Mean :11.54
## 3rd Qu.:66.25 3rd Qu.:13.50
## Max. :77.00 Max. :23.00
## alumni_giving_rate private
## Min. : 7.00 Min. :0.0000
## 1st Qu.:18.75 1st Qu.:0.0000
## Median :29.00 Median :1.0000
## Mean :29.27 Mean :0.6875
## 3rd Qu.:38.50 3rd Qu.:1.0000
## Max. :67.00 Max. :1.0000
#Note: response variable is Alumni Giving Rate and the predictor variable is % of classes under 20
summary(alumni$percent_of_classes_under_20)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 29.00 44.75 59.50 55.73 66.25 77.00
summary(alumni$alumni_giving_rate)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 7.00 18.75 29.00 29.27 38.50 67.00
ggplot(alumni, aes(x= percent_of_classes_under_20, y= alumni_giving_rate)) +
geom_point() +
labs(title = "Alumni Giving Rate to Percent of classes under 20",
x = "% of Classes Under 20",
y = "Alumni Giving Rate")
find_outliers <- function(variable) {
  # Flag values falling outside the conventional 1.5 * IQR fences
  q <- quantile(variable)
  iqr <- IQR(variable)
  lower_limit <- q[2] - 1.5 * iqr
  upper_limit <- q[4] + 1.5 * iqr
  return(variable[variable < lower_limit | variable > upper_limit])
}
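# A quick usage sketch of the helper above (not part of the original output):
# apply it to each numeric column of the alumni data to list values outside the fences.
iqr_outliers_under_20 <- find_outliers(alumni$percent_of_classes_under_20)
iqr_outliers_giving_rate <- find_outliers(alumni$alumni_giving_rate)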
outliers_under_20 <- boxplot.stats(alumni$percent_of_classes_under_20)$out
outliers_giving_rate <- boxplot.stats(alumni$alumni_giving_rate)$out
correlation_coefficient <- cor(alumni$percent_of_classes_under_20, alumni$alumni_giving_rate)
model <- lm(alumni_giving_rate ~ percent_of_classes_under_20, data=alumni)
coefficients(model)
## (Intercept) percent_of_classes_under_20
## -7.3860676 0.6577687
outliers_under_20
## integer(0)
# Find outliers: flag schools whose giving rate falls outside an approximate
# mean +/- 1.96 SD band of the residuals (roughly the central 95%)
residuals <- residuals(model)
outlier_band <- mean(alumni$alumni_giving_rate) + c(-1.96, 1.96) * sd(residuals)
outliers <- subset(alumni, alumni_giving_rate < outlier_band[1] | alumni_giving_rate > outlier_band[2])
print(outliers)
## school percent_of_classes_under_20 student_faculty_ratio
## 10 Dartmouth College 61 10
## 21 Princeton University 68 5
## 27 U. of California-Davis 32 19
## 28 U. of California-Irvine 42 20
## 30 U. of California-San Diego 48 19
## 48 Yale University 77 7
## alumni_giving_rate private
## 10 53 1
## 21 67 1
## 27 7 0
## 28 9 0
## 30 8 0
## 48 50 1
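# A complementary check (a sketch, not part of the original analysis): flag schools
# whose externally studentized residuals from the fitted model exceed 2 in absolute value.
student_resid <- rstudent(model)
flagged_schools <- alumni[abs(student_resid) > 2, ]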
#Fitted regression equation: Y = -7.3860676 + 0.6577687X
ggplot(alumni, aes(x = percent_of_classes_under_20, y = alumni_giving_rate)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE, color = "blue") +
labs(title = paste("Alumni Giving Rate to Percent of Classes Under 20\n",
"\nCorrelation Coefficient:", round(correlation_coefficient, 2),
"\nRegression Equation: Y = -7.39 + 0.66X",
sep = ""),
x = "% of Classes Under 20",
y = "Alumni Giving Rate")
## `geom_smooth()` using formula = 'y ~ x'
# Question 2 ----------------------------------------------------------------------
set.seed(7052)
n <- 100
X <- rnorm(n, mean = 2, sd = 0.1)
error <- rnorm(n, mean = 0, sd = 0.5)
Y <- 10 + 5 * X + error
summary(X)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.725 1.923 2.001 2.004 2.070 2.243
summary(Y)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 18.09 19.67 20.11 20.17 20.70 21.80
boxplot(X, Y, names = c("X", "Y"), main = "Question 2 Boxplot")
correlation_coefficient_2 <- cor(X, Y)
plot(X, Y, main = "Question 2 Scatter Plot", xlab = "X", ylab="Y")
model_Q2 <- lm(Y ~ X)
coefficients(model_Q2)
## (Intercept) X
## 9.021796 5.565160
mse_Q2 <- mean(model_Q2$residuals^2)  # residuals from the Question 2 model (not the alumni model)
mse_Q2
## [1] 0.1992276
mean_X_Q2 <- mean(X)
mean_Y_Q2 <- mean(Y)
plot(X, Y, main = "Question 2 Plot", xlab = "X", ylab = "Y")
abline(model_Q2, col = "red")  # fitted regression line
points(mean_X_Q2, mean_Y_Q2, col = "blue", pch = 19)
intercept <- coef(model_Q2)[1]
slope <- coef(model_Q2)[2]
equation <- paste("Y =", round(intercept, 2), "+", round(slope, 2), "X")
text(mean_X_Q2, mean_Y_Q2, equation, pos = 3, col = "black")
# Question 3 ----------------------------------------------------------------------
library(MASS)
lad_model <- rlm(Y ~ X)  # note: rlm() fits a robust (Huber M-estimation) regression, not strictly LAD
summary(lad_model)
##
## Call: rlm(formula = Y ~ X)
## Residuals:
## Min 1Q Median 3Q Max
## -1.21113 -0.30597 0.00906 0.30188 1.35462
##
## Coefficients:
## Value Std. Error t value
## (Intercept) 9.1089 0.8469 10.7552
## X 5.5220 0.4221 13.0833
##
## Residual standard error: 0.4542 on 98 degrees of freedom
library(quantreg)
lad_quantile_model <- rq(Y ~ X)  # quantile regression at tau = 0.5, i.e. median (LAD) regression
summary(lad_quantile_model)
## Warning in rq.fit.br(x, y, tau = tau, ci = TRUE, ...): Solution may be
## nonunique
##
## Call: rq(formula = Y ~ X)
##
## tau: [1] 0.5
##
## Coefficients:
## coefficients lower bd upper bd
## (Intercept) 8.61233 6.15307 11.81847
## X 5.77192 4.12206 6.98627
##Report
Question 1 For the relationship between class size and the likelihood of donating after graduation, the predictor variable is the percent of classes with fewer than 20 students and the response variable is the alumni giving rate provided in the data set. Please see Table 1.1 and Table 1.2 below for a summary of the data. The relationship appears roughly linear, with several notable outliers. Outliers are defined here as schools whose giving rate falls outside an approximate 95% band, i.e. the mean giving rate plus or minus 1.96 standard deviations of the regression residuals. Six schools are flagged: U. of California-Davis, U. of California-Irvine, and U. of California-San Diego give at lower rates than expected, while Dartmouth College, Princeton University (a 67% giving rate with 68% of classes under 20), and Yale University give at higher rates than expected. The correlation coefficient is 0.65, indicating a moderate positive correlation between the percent of classes under 20 students and the alumni giving rate. Fitting a simple linear regression gives the estimated equation Y = -7.39 + 0.66X, where X is the percent of classes under 20 students and Y is the alumni giving rate. Generally, the more classes alumni took with fewer than twenty students, the more likely they are to donate back to the college. However, because the correlation is moderate rather than strong, other factors likely matter and should be considered before using this equation to predict giving rates. The scatterplot for this question is shown in Plot 1.1 below.
Table 1.1: Summary Statistics of Percent of Classes Under 20 Students
  Min 29.00 | 1st Qu. 44.75 | Median 59.50 | Mean 55.73 | 3rd Qu. 66.25 | Max 77.00
Table 1.2: Summary Statistics of Alumni Giving Rate
  Min 7.00 | 1st Qu. 18.75 | Median 29.00 | Mean 29.27 | 3rd Qu. 38.50 | Max 67.00
Plot 1.1
Question 2 Running a simulation with mean response E(Y|X) = 10 + 5X and seed 7052, we obtain the summaries in Tables 2.1 and 2.2, where X is the predictor variable and Y is the response variable. There are no extreme outliers, as seen in Plot 2.1 and Plot 2.2, and the correlation coefficient is 0.8042. Fitting a simple linear regression gives the estimated model Y = 9.02 + 5.57X, with an MSE of about 0.20. The sample mean of X is 2.003677 and the sample mean of Y is 20.17258. Plotting the fitted regression line together with the point formed by the sample means of X and Y shows that the line of best fit passes directly through that point.
Table 2.1: Summary Statistics of X
  Min 1.725 | 1st Qu. 1.923 | Median 2.001 | Mean 2.004 | 3rd Qu. 2.070 | Max 2.243
Table 2.2: Summary Statistics of Y
  Min 18.09 | 1st Qu. 19.67 | Median 20.11 | Mean 20.17 | 3rd Qu. 20.70 | Max 21.80
Plot 2.1
Plot 2.2
Question 3 Least Absolute Deviations (LAD) regression minimizes the sum of absolute residuals rather than the residual sum of squares, so it is not affected as heavily by outliers as least squares, which squares each residual. Fitting the simulated Y and X with rlm() gives a robust (Huber M-estimation) fit with a residual standard error of 0.4542 on 98 degrees of freedom. Fitting with rq() at the default tau of 0.5 gives median (LAD) regression, which also buffers against outliers by estimating the conditional median of Y rather than the conditional mean. Ordinary Least Squares remains a popular choice for estimating β_0 and β_1: when the errors are approximately normally distributed it is efficient, its estimates become more reliable as the sample size increases, and under the standard linear regression assumptions it is unbiased.
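As a brief sketch (assuming the Question 2 and 3 objects model_Q2, lad_model, and lad_quantile_model are still in the workspace), the three fits can be placed side by side to compare their estimated intercepts and slopes:
# Compare OLS, Huber M-estimation (rlm), and median/LAD (rq, tau = 0.5) coefficients
fit_comparison <- rbind(OLS = coef(model_Q2),
                        Huber = coef(lad_model),
                        LAD = coef(lad_quantile_model))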
Question 4 In relationship a, the point (X̄, Ȳ) is the point of sample means; as shown in Plot 2.2, the least squares regression line must pass through this point. In relationship b, the sum of the residuals is always zero for least squares regression with an intercept: each residual is the difference between an observed and a predicted value, and positive and negative residuals cancel when summed. Relationship c follows the same logic as relationship b: the sum of the predicted values equals the sum of the observed values when ordinary least squares is used to fit the regression line. In relationship d, the sum of the products of the residuals and the predictor values equals zero; this follows from minimizing the sum of squared residuals, whose first-order conditions force the residuals to be orthogonal to the predictor. In relationship e, the same minimization property applies, so the sum of the residuals weighted by the fitted values is also zero. Relationship f states the objective of OLS itself: the estimates are the values that minimize the sum of squared residuals.
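These properties can be checked numerically; the following is a small sketch using the Question 2 fit, assuming model_Q2, X, and Y are still in the workspace (all quantities should be zero up to machine precision):
# Numerical check of the OLS properties discussed above
res <- resid(model_Q2)
fit <- fitted(model_Q2)
ols_checks <- c(sum_residuals = sum(res),                  # (b) residuals sum to zero
                sum_resid_x = sum(res * X),                # (d) residuals orthogonal to X
                sum_resid_fitted = sum(res * fit),         # (e) residuals orthogonal to fitted values
                fitted_minus_observed = sum(fit) - sum(Y)) # (c) sums of fitted and observed values agree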
# Question 1 ----------------------------------------------------------------------
#Note: this is the same plot as used in the last assignment
ggplot(alumni, aes(x = percent_of_classes_under_20, y = alumni_giving_rate)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE, color = "blue") +
labs(title = paste("Alumni Giving Rate to Percent of Classes Under 20\n",
"\nCorrelation Coefficient:", round(correlation_coefficient, 2),
"\nRegression Equation: Y = -7.39 + 0.66X",
sep = ""),
x = "% of Classes Under 20",
y = "Alumni Giving Rate")
## `geom_smooth()` using formula = 'y ~ x'
tstat <- summary(model)$coefficients["percent_of_classes_under_20", "t value"]
p_value <- summary(model)$coefficients["percent_of_classes_under_20", "Pr(>|t|)"]
cat("Estimated Slope:", coef(model)["percent_of_classes_under_20"], "\n")
## Estimated Slope: 0.6577687
cat("T-Statstic:", tstat, "\n")
## T-Statstic: 5.734448
cat("P-Value:", p_value, "\n")
## P-Value: 7.228121e-07
if (!is.na(p_value) && p_value < 0.05) {
cat("The null hypothesis is rejected: the estimated slope is significant at the α=0.05 level.\n")
} else {
cat("The estimated slope is not statistically significant at the α=0.05 level.\n")
}
## The null hypothesis is rejected: the estimated slope is significant at the α=0.05 level.
#null hypothesis is rejected
#Repeat part a. above using the equivalent F-test
model <- lm(alumni_giving_rate ~ percent_of_classes_under_20, data = alumni)
f_test <- anova(model)
f_statistic <- f_test$"F value"[1]
p_value_f <- f_test$"Pr(>F)"[1]
cat("Estimated Slope F-Test", coef(model)["percent_of_classes_under_20"], "\n")
## Estimated Slope F-Test 0.6577687
cat("F-Statistic:", f_statistic, "\n")
## F-Statistic: 32.88389
cat("P-Value (F-Test):", p_value_f, "\n")
## P-Value (F-Test): 7.228121e-07
if(p_value_f < 0.05){
cat("The slope from the F-test is statistically significant at the α=0.05 level.\n")
cat("The null hypothesis is rejected: there is a relationship between class size and alumni giving rate.\n")
} else {
cat("The estimated slope is not statistically significant at the α=0.05 level.\n")
cat("We fail to reject the null hypothesis: there is no evidence of a relationship between class size and alumni giving rate.\n")
}
## The slope from the F-test is statistically significant at the α=0.05 level.
## The null hypothesis is rejected: there is a relationship between class size and alumni giving rate.
#What is the value of R^2? Please interpret
model <- lm(alumni_giving_rate ~ percent_of_classes_under_20, data = alumni)
r_squared <- summary(model)$r.squared
cat("R-squared:", r_squared, "\n")
## R-squared: 0.4168645
cat(r_squared*100, "percent of the variation in giving rate is explained by the percentage of classes under 20 students.",
    (1-(r_squared))*100, "percent of the variation is not explained by this predictor and may be affected by other factors. \n")
## 41.68645 percent of the variation in giving rate is explained by the percentage of classes under 20 students. 58.31355 percent of the variation is not explained by this predictor and may be affected by other factors.
r_correlation_cofficient <- cor(alumni$percent_of_classes_under_20, alumni$alumni_giving_rate)
cat("Correlation Coefficient (r):", r_correlation_cofficient, "\n")
## Correlation Coefficient (r): 0.6456504
cat("The relationship between r^2 and R^2 are equal to one another in a linear regression. So r is the squareroot of R^2. r, the correlation coefficient ranges from -1 to 1, so it can show a positive or negative form of linear relationship. R^2 must be positive and if it is 0, there is no explained variance and if it is 1 all variance is explained.")
## The relationship between r^2 and R^2 are equal to one another in a linear regression. So r is the squareroot of R^2. r, the correlation coefficient ranges from -1 to 1, so it can show a positive or negative form of linear relationship. R^2 must be positive and if it is 0, there is no explained variance and if it is 1 all variance is explained.
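# Quick check (a sketch): for a simple linear regression, r^2 should equal R^2.
r2_check <- all.equal(r_correlation_cofficient^2, summary(model)$r.squared)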
cat("In this model, the relationship is positively correlated and class size represents 41% of donation percent")
## In this model, the relationship is positively correlated and class size represents 41% of donation percent
mean_data <- data.frame(percent_of_classes_under_20 = mean(alumni$percent_of_classes_under_20),
                        alumni_giving_rate = mean(alumni$alumni_giving_rate))
ggplot(alumni, aes(x = percent_of_classes_under_20, y = alumni_giving_rate)) +
geom_point() +
stat_smooth(method = "lm", se = TRUE, col = "blue", fill = "lightblue") +
geom_point(data = mean_data, col = "red", pch = 19) +
labs(title = "Fitted Regression Line with 95% Confidence Band",
x = "% of Classes Under 20",
y = "Alumni Giving Rate")
## `geom_smooth()` using formula = 'y ~ x'
cat("At the mean point, the confidence band is narrower compared to the rest of the model")
## At the mean point, the confidence band is narrower compared to the rest of the model
# Question 2 ----------------------------------------------------------------------
set.seed(7052)
simulated_regression <- function(n, sigma_error) {
X <- rnorm(n, mean=2, sd=0.1)
error <- rnorm(n, mean=0, sd=sigma_error)
Y <- 10 + 5 * X + error
# b: Add linear regression
model <- lm(Y ~ X)
coefficients_table <- broom::tidy(model)
t_statistic <- coefficients_table[2, "statistic"]
p_value <- coefficients_table[2, "p.value"]
mse <- mean(model$residuals^2)
# Return results
return(data.frame(
n=n,
sigma_error = sigma_error,
coefficients = coefficients_table,
t_statistic = t_statistic,
p_value = p_value,
mse = mse
))
}
results <- bind_rows(
simulated_regression(100, 0.5),
simulated_regression(100, 1),
simulated_regression(400, 0.5),
simulated_regression(400, 1)
)
print(results)
## n sigma_error coefficients.term coefficients.estimate
## 1 100 0.5 (Intercept) 9.021796
## 2 100 0.5 X 5.565160
## 3 100 1.0 (Intercept) 13.077796
## 4 100 1.0 X 3.460281
## 5 400 0.5 (Intercept) 9.697322
## 6 400 0.5 X 5.160805
## 7 400 1.0 (Intercept) 9.963304
## 8 400 1.0 X 5.033593
## coefficients.std.error coefficients.statistic coefficients.p.value statistic
## 1 0.8336483 10.822065 2.007658e-18 13.395491
## 2 0.4154502 13.395491 7.118010e-24 13.395491
## 3 2.0874900 6.264842 9.978287e-09 3.343580
## 4 1.0349032 3.343580 1.172512e-03 3.343580
## 5 0.5207461 18.621975 4.072601e-56 19.811650
## 6 0.2604935 19.811650 2.797442e-61 19.811650
## 7 1.0317701 9.656516 5.832549e-20 9.749772
## 8 0.5162781 9.749772 2.784618e-20 9.749772
## p.value mse
## 1 7.118010e-24 0.1992276
## 2 7.118010e-24 0.1992276
## 3 1.172512e-03 0.9106786
## 4 1.172512e-03 0.9106786
## 5 2.797442e-61 0.2568544
## 6 2.797442e-61 0.2568544
## 7 2.784618e-20 1.0589855
## 8 2.784618e-20 1.0589855
# Question 3 ----------------------------------------------------------------------
#a
true_intercept <- 10
true_slope <- 5
bias_intercept <- mean(results$coefficients.estimate[results$coefficients.term == "(Intercept)"]) - true_intercept
bias_slope <- mean(results$coefficients.estimate[results$coefficients.term == "X"]) - true_slope
variance_intercept <- var(results$coefficients.estimate[results$coefficients.term == "(Intercept)"])
variance_slope <- var(results$coefficients.estimate[results$coefficients.term == "X"])
cat("Bias for Intercept:", bias_intercept, "\n")
## Bias for Intercept: 0.4400544
cat("Bias for Slope:", bias_slope, "\n")
## Bias for Slope: -0.1950401
cat("Variance for Intercept:", variance_intercept, "\n")
## Variance for Intercept: 3.249359
cat("Variance for Slope:", variance_slope, "\n")
## Variance for Slope: 0.8549879
#b
cat("As the sample size increases, we expect the variance of β_0 and β_1 to decrease. This is because as the sample size increases, the result tends to normalize and therefore the percent of variances tends to decrease")
## As the sample size increases, we expect the variance of β_0 and β_1 to decrease. This is because as the sample size increases, the result tends to normalize and therefore the percent of variances tends to decrease
cat("As the error variance increases, we expect the variance of β_0 and β_1 to increase. This is because the points are not as close to the regression line as before. Therefore, the end result is more dependent on the individual points used in the analysis, and the variability is higher")
## As the error variance increases, we expect the variance of β_0 and β_1 to increase. This is because the points are not as close to the regression line as before. Therefore, the end result is more dependent on the individual points used in the analysis, and the variability is higher
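# The pattern can be read off the simulation results directly (a sketch using the
# `results` data frame built above): the slope's standard error shrinks as n grows
# and grows with the error standard deviation.
slope_se <- subset(results, coefficients.term == "X",
                   select = c(n, sigma_error, coefficients.std.error))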
#c
#Using sigma = 0.1 as the reference value (note: 0.1 is the sd of X in the simulation; the simulated error sds were 0.5 and 1)
true_sigma_squared <- 0.1^2
bias_mse <- mean(results$mse) - true_sigma_squared
ml_estimate_sigma_squared <- var(model$residuals)  # note: 'model' here refers to the alumni simple regression fit earlier, not a simulated fit
cat("Bias of MSE:", bias_mse, "\n")
## Bias of MSE: 0.5964365
cat("ML estimate of sigma squared:", ml_estimate_sigma_squared, "\n")
## ML estimate of sigma squared: 105.355
cat("The main difference between the bias of the model's MSE and the ML estimate of sigma squared is that the bias of the MSE is the difference between the average estimated MSE adn the variance of the error terms. The bias is influenced by the random variables used in the dataset. Whereas, the ML estimate of sigma squared is the residuals in the models' variances. It can show the estimate of the true variance in error terms, not just the underlying variance. This returns a better picture of true variability in the data as a whole. This is why we use MSE, because it looks at how well the predictions match the actual data. ")
## The main difference between the bias of the model's MSE and the ML estimate of sigma squared is that the bias of the MSE is the difference between the average estimated MSE adn the variance of the error terms. The bias is influenced by the random variables used in the dataset. Whereas, the ML estimate of sigma squared is the residuals in the models' variances. It can show the estimate of the true variance in error terms, not just the underlying variance. This returns a better picture of true variability in the data as a whole. This is why we use MSE, because it looks at how well the predictions match the actual data.
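# For a single fitted model, the ML estimate of sigma^2 divides the residual sum of
# squares by n, while the unbiased estimate divides by the residual degrees of freedom.
# A sketch using the Question 2 fit (assuming model_Q2 is still in the workspace):
rss_Q2 <- sum(resid(model_Q2)^2)
n_Q2 <- length(resid(model_Q2))
sigma2_ml_Q2 <- rss_Q2 / n_Q2              # maximum likelihood estimate (biased downward)
sigma2_unbiased_Q2 <- rss_Q2 / (n_Q2 - 2)  # usual MSE with n - 2 degrees of freedom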
# Assignment 3 --------------------------------------------------------------------
# Question 1 ----------------------------------------------------------------------
#a.
model <- lm(alumni_giving_rate ~ percent_of_classes_under_20 + student_faculty_ratio, data = alumni)
model_summary <- summary(model)  # avoid masking the base summary() function
print(model_summary)
##
## Call:
## lm(formula = alumni_giving_rate ~ percent_of_classes_under_20 +
## student_faculty_ratio, data = alumni)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.00 -6.57 -1.95 4.42 24.56
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 39.6556 13.5076 2.936 0.005225 **
## percent_of_classes_under_20 0.1662 0.1626 1.022 0.312128
## student_faculty_ratio -1.7021 0.4421 -3.850 0.000371 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9.098 on 45 degrees of freedom
## Multiple R-squared: 0.5613, Adjusted R-squared: 0.5418
## F-statistic: 28.79 on 2 and 45 DF, p-value: 8.869e-09
get_eq <- function(model) {
  # Build a display equation "Y = b0 + b1 * X" from a single-predictor fit
  eq <- paste("Y =", round(coef(model)[1], 2), "+", round(coef(model)[2], 2), "* X")
  return(eq)
}
classesunder20togiving <- ggplot(alumni, aes(x = percent_of_classes_under_20, y = alumni_giving_rate)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE, col = "blue") +
labs(title = "Alumni giving rate to % of classes under 20",
x = "% of Classes Under 20",
y = "Alumni Giving Rate") +
geom_text(x = 40, y = 50, label = get_eq(lm(alumni_giving_rate ~ percent_of_classes_under_20, data = alumni)), color = "blue")
print(classesunder20togiving)
## `geom_smooth()` using formula = 'y ~ x'
studentfacultytogiving <- ggplot(alumni, aes(x = student_faculty_ratio, y = alumni_giving_rate)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE, col = "blue") +
labs(title = "Alumni giving rate to Student Faculty Ratio",
x = "Student Faculty Ratio",
y = "Alumni Giving Rate") +
geom_text(x = 10, y = 50, label = get_eq(lm(alumni_giving_rate ~ student_faculty_ratio, data = alumni)), color = "blue")
print(studentfacultytogiving)
## `geom_smooth()` using formula = 'y ~ x'
combined_data <- rbind(
transform(alumni, variable = "% of Classes Under 20"),
transform(alumni, variable = "Student Faculty Ratio", percent_of_classes_under_20 = student_faculty_ratio)
)
classesunder20andstudentfaculty <- ggplot(combined_data, aes(x = percent_of_classes_under_20, y = alumni_giving_rate)) +
geom_point(aes(color = variable)) +
geom_smooth(method = "lm", se = FALSE, aes(group = variable, color = variable), show.legend = FALSE) +
facet_wrap(~ variable, scales = "free") +
labs(title = "Alumni Giving Rate",
x = "% of Classes Under 20",
y = "Alumni Giving Rate")
print(classesunder20andstudentfaculty)
## `geom_smooth()` using formula = 'y ~ x'
#3D scatterplot with both predictors on the x and y axes and the giving rate on the z axis
library(rgl)
combined_model <- lm(alumni_giving_rate ~ percent_of_classes_under_20 + student_faculty_ratio, data = alumni)
plot3d(alumni$percent_of_classes_under_20, alumni$student_faculty_ratio, alumni$alumni_giving_rate,
col = "red", type = "s", size = 2, main = "Multiple Linear Regression",
xlab = "% of Classes Under 20", ylab = "Student Faculty Ratio", zlab = "Alumni Giving Rate")
# Grid of predictor values over which to evaluate the fitted regression surface
mesh <- expand.grid(
  percent_of_classes_under_20 = seq(min(alumni$percent_of_classes_under_20), max(alumni$percent_of_classes_under_20), length.out = 100),
  student_faculty_ratio = seq(min(alumni$student_faculty_ratio), max(alumni$student_faculty_ratio), length.out = 100)
)
mesh$predicted <- predict(model, newdata = mesh)
with(mesh, surface3d(x = matrix(percent_of_classes_under_20, nrow = 100, ncol = 100),
y = matrix(student_faculty_ratio, nrow = 100, ncol = 100),
z = matrix(predicted, nrow = 100, ncol = 100),
color = "blue", alpha = 0.5))
combined_model <- lm(alumni_giving_rate ~ percent_of_classes_under_20 + student_faculty_ratio, data = alumni)
summary_combined_model_Assignment3Q1 <- summary(combined_model)
print(summary_combined_model_Assignment3Q1)
##
## Call:
## lm(formula = alumni_giving_rate ~ percent_of_classes_under_20 +
## student_faculty_ratio, data = alumni)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.00 -6.57 -1.95 4.42 24.56
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 39.6556 13.5076 2.936 0.005225 **
## percent_of_classes_under_20 0.1662 0.1626 1.022 0.312128
## student_faculty_ratio -1.7021 0.4421 -3.850 0.000371 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9.098 on 45 degrees of freedom
## Multiple R-squared: 0.5613, Adjusted R-squared: 0.5418
## F-statistic: 28.79 on 2 and 45 DF, p-value: 8.869e-09
#My final regression equation for the relationship between the percent of classes under 20 students (X1), the student faculty ratio (X2), and the rate at which alumni donate (Y) is Y = 39.6556 + 0.1662*X1 - 1.7021*X2. While this is the best-fitting linear model, the residuals show some outliers, most visibly at their minimum and maximum values. Since the first quartile, median, and third quartile of the residuals are relatively close to zero (within about 5), the fit is acceptable. The significance codes show that the student faculty ratio is a significant predictor of the alumni donation rate, whereas the percent of classes under 20 students is not significant once the student faculty ratio is in the model. Even with this significant predictor, the R-squared value is 56.13%, so just over half of the variation in the donation rate is explained by the two predictors together. The very small p-value of the overall F-test confirms that the model explains a meaningful part of the donation rate.
#b.
predicted_alumni_giving <- data.frame(percent_of_classes_under_20 = 50, student_faculty_ratio = 10)
predicted_alumni_giving_rate <- predict(combined_model, newdata = predicted_alumni_giving)
cat("Predicted Alumni Giving Rate:", predicted_alumni_giving_rate, "\n")
## Predicted Alumni Giving Rate: 30.94291
#If an observation occurred where the percent of classes under 20 was 50% and the student faculty ratio was 10, we would expect 30.94291% of the alumni to donate back to the college
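# A 95% prediction interval for the same hypothetical school (a sketch): unlike the
# point prediction above, the interval reflects the uncertainty in an individual observation.
pred_interval <- predict(combined_model, newdata = predicted_alumni_giving,
                         interval = "prediction", level = 0.95)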
#c.
coefficient_table <- summary(model)$coefficients
print(coefficient_table)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 39.6555835 13.5075774 2.935803 0.0052247868
## percent_of_classes_under_20 0.1661686 0.1625520 1.022249 0.3121275033
## student_faculty_ratio -1.7021103 0.4421271 -3.849821 0.0003709425
# When testing the statistical significance of each regression coefficient with a t-test, the null hypothesis is that the coefficient equals zero, i.e. that the predictor has no effect on the percent of alumni who donate. The p-value for the intercept (0.005) only tests whether the intercept itself is zero. Since the p-value for the percent of classes under 20 is 0.31, which is greater than 0.05, we fail to reject the null hypothesis for that coefficient: it is not statistically significant once the student-faculty ratio is in the model. Since the p-value for the student-faculty ratio is 0.00037, which is less than 0.05, we reject the null hypothesis and conclude there is a statistically significant relationship between the donation rate and the student-faculty ratio.
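# A related sketch: 95% confidence intervals for the coefficients. An interval that
# excludes zero corresponds to rejecting the t-test null hypothesis at the 0.05 level.
coef_ci <- confint(model, level = 0.95)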
#d.
f_statistic3 <- summary(model)$fstatistic
p_value_f_Stat3 <- pf(f_statistic3[1], f_statistic3[2], f_statistic3[3], lower.tail = FALSE)
cat("F-statistic:", f_statistic3[1], "\n")
## F-statistic: 28.79264
cat("Degrees of Freedom:", f_statistic3[2],"/",f_statistic3[3],"\n")
## Degrees of Freedom: 2 / 45
cat("P-value3", p_value_f_Stat3, "\n")
## P-value3 8.868867e-09
# When testing with the F-statistic, the null hypothesis is that all slope coefficients in the model are zero, i.e. that neither predictor affects the percent of alumni who donate. The resulting F-statistic is 28.79 on 2 and 45 degrees of freedom (48 observations, 2 predictors plus an intercept, leaving 45 residual degrees of freedom). The F-statistic is a ratio of explained to unexplained variance, not a percentage explained, and a value this large would be very unlikely if the null hypothesis were true. The p-value is 8.868867e-09, essentially zero, so the model as a whole is statistically significant. Since the p-value is less than 0.05, we reject the null hypothesis: the predictors together are related to the giving rate.
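# The same overall F-test can be expressed as a model comparison (a sketch): the full
# model against an intercept-only model fit to the same data.
null_model <- lm(alumni_giving_rate ~ 1, data = alumni)
overall_f_test <- anova(null_model, model)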
#e.
r_squared3 <- summary(model)$r.squared
cat("R-Squared Value:", r_squared3, "\n")
## R-Squared Value: 0.5613406
#The coefficient of determination is 0.5613406, meaning 56.13% of the variation in the alumni giving rate is explained by the percent of classes under 20 students and the student faculty ratio together.
#f
cor_percent <- cor(alumni$percent_of_classes_under_20, alumni$alumni_giving_rate)
cor_ratio <- cor(alumni$student_faculty_ratio, alumni$alumni_giving_rate)
cat("Correlation coefficient between percent_of_classes_under_20 and alumni_giving_rate (r_1):", cor_percent, "\n")
## Correlation coefficient between percent_of_classes_under_20 and alumni_giving_rate (r_1): 0.6456504
cat("Correlation coefficient between student_faculty_ratio and alumni_giving_rate (r_2):", cor_ratio, "\n")
## Correlation coefficient between student_faculty_ratio and alumni_giving_rate (r_2): -0.7423975
#The correlation coefficient between percent of classes under 20 and alumni giving rate (r1) is 0.6456504; this moderately positive value indicates a positive linear relationship between percent of classes under 20 and donation rate. The correlation coefficient between student faculty ratio and alumni giving rate (r2) is -0.7423975, a fairly strong negative relationship. The coefficient of determination (R^2) is 0.5613 and, as stated before, measures how much of the variation in the giving rate the two predictors explain together. Note that R^2 is not simply the sum of the squared simple correlations (r1^2 + r2^2 would be about 0.97): the two predictors are correlated with each other, so the information they share about the giving rate is only counted once in the multiple regression.
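# Sketch supporting the note above: the two predictors are themselves correlated,
# which is why R^2 is not simply r1^2 + r2^2.
predictor_correlation <- cor(alumni$percent_of_classes_under_20, alumni$student_faculty_ratio)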
# Question 2 ----------------------------------------------------------------------
#a/b.
set.seed(123)
N<-1000
X1 <- rnorm(N, mean=2, sd=0.1)
X2 <- rnorm(N, mean=0, sd=0.4)
epsilon <- rnorm(N, mean=0, sd=0.5)
Y <- 10+5*X1 - 2*X2 + epsilon
model_a <- lm(Y ~ X1 + X2)
model_a_summary <- summary(model_a)  # keep model_a as the fitted lm object
print(model_a_summary)
##
## Call:
## lm(formula = Y ~ X1 + X2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.41801 -0.31386 -0.01849 0.32692 1.68933
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 10.20445 0.31401 32.50 <2e-16 ***
## X1 4.89254 0.15672 31.22 <2e-16 ***
## X2 -1.96562 0.03848 -51.08 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4894 on 997 degrees of freedom
## Multiple R-squared: 0.7697, Adjusted R-squared: 0.7693
## F-statistic: 1666 on 2 and 997 DF, p-value: < 2.2e-16
#The estimated prediction equation is Y = 10.20445 + 4.89254X1 - 1.96562X2. The intercept (10.20445) has a standard error of 0.31401, the X1 coefficient (4.89254) has a standard error of 0.15672, and the X2 coefficient (-1.96562) has a standard error of 0.03848. Standard errors are not compared against a 0.05 cutoff; significance is judged from the t-statistics and p-values. The null hypothesis for each test is that the corresponding coefficient is zero and so has no effect on the outcome. The intercept has a t-value of 32.50, X1 a t-value of 31.22, and X2 a t-value of -51.08, each with a p-value < 2e-16. Since the p-values are extremely small, every coefficient is highly significant, with the intercept and X1 positive and X2 negative. The residual standard error of the fitted model is 0.4894 (an MSE of roughly 0.24), so the predictions are quite accurate.
#Using a larger error standard deviation (part c), the estimated prediction equation becomes Y = 10.03929 + 4.97594X1 - 2.01676X2, where the intercept has a standard error of 0.63737, the X1 coefficient a standard error of 0.31810, and the X2 coefficient a standard error of 0.07811. The standard errors are roughly twice as large as before, reflecting the noisier data. The null hypothesis that each coefficient is zero is again rejected: the p-values remain extremely small, with X1 positively and X2 negatively related to Y. The residual standard error is 0.9933 (an MSE of roughly 0.99), so the predictions sit farther from the observed values than in the previous fit.
#c
epsilon_new <- rnorm(N, mean = 0, sd = 1)
Y_new <- 10 + 5 * X1 - 2 * X2 + epsilon_new
model_c <- lm(Y_new ~ X1+X2)
summary(model_c)
##
## Call:
## lm(formula = Y_new ~ X1 + X2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.11699 -0.62664 0.00437 0.66561 2.90740
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 10.03929 0.63737 15.75 <2e-16 ***
## X1 4.97594 0.31810 15.64 <2e-16 ***
## X2 -2.01676 0.07811 -25.82 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9933 on 997 degrees of freedom
## Multiple R-squared: 0.4596, Adjusted R-squared: 0.4585
## F-statistic: 423.9 on 2 and 997 DF, p-value: < 2.2e-16
#d
N_new <- 400
X1_new <- rnorm(N_new, mean = 2, sd = 0.1)
X2_new <- rnorm(N_new, mean = 0, sd = 0.4)
epsilon_new <- rnorm(N_new, mean = 0, sd = 0.5)
Y_new <- 10 + 5 * X1_new - 2 * X2_new + epsilon_new
model_d <- lm(Y_new ~ X1_new + X2_new)
summary(model_d)
##
## Call:
## lm(formula = Y_new ~ X1_new + X2_new)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.52153 -0.31078 0.02841 0.32596 1.08097
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.5572 0.4768 20.04 <2e-16 ***
## X1_new 5.2195 0.2386 21.87 <2e-16 ***
## X2_new -1.9189 0.0605 -31.72 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4772 on 397 degrees of freedom
## Multiple R-squared: 0.7931, Adjusted R-squared: 0.7921
## F-statistic: 761 on 2 and 397 DF, p-value: < 2.2e-16
#e
#Using the fits in parts c and d, we can conclude that the larger the error variance, the larger the standard errors of the coefficients and the larger the MSE; with more noise in the response, the estimated effects are measured less precisely. In addition, the smaller the sample size, the less precise the estimates. When the error variance stays at its original value, we still obtain highly significant coefficients and a low mean squared error even with the smaller sample.
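# The MSEs of the two follow-up fits can be computed directly from their residuals
# (a sketch, assuming model_c and model_d are still in the workspace).
mse_c <- mean(resid(model_c)^2)  # n = 1000 with the larger error sd (sigma = 1)
mse_d <- mean(resid(model_d)^2)  # n = 400 with the original error sd (sigma = 0.5)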
# Question 3 ----------------------------------------------------------------------
#a
X <- model.matrix(combined_model)
beta <- coef(combined_model)
cat("Multiple Linear Regression Model with Normal Errors:\n")
## Multiple Linear Regression Model with Normal Errors:
cat("Y =", paste0("(", paste0(beta, " * X", colnames(X), collapse = " + "), ")", sep = ""), "+ ε\n")
## Y = (39.6555834726146 * X(Intercept) + 0.166168629593652 * Xpercent_of_classes_under_20 + -1.70211027228975 * Xstudent_faculty_ratio) + ε
#b-
# The multiple linear regression model is Y = β0 + β1*X1 + β2*X2 + ε (in matrix form, Y = Xβ + ε), where Y is the alumni giving rate, X1 is the percent of classes under 20, X2 is the student faculty ratio, β0, β1, and β2 are the regression coefficients, and ε is a normally distributed error term. In R, the model matrix combines the intercept column with X1 and X2. The resulting fitted equation using the data provided is:
#Y = 39.6555834726146 * X(Intercept) + 0.166168629593652 * Xpercent_of_classes_under_20 + -1.70211027228975 * Xstudent_faculty_ratio + ε
#With this model, we assume the relationship between each predictor and Y is linear, and that the errors are independent and normally distributed with constant variance. We also assume there are no highly influential outliers in the data set.
#c
cat("Model Matrix (X):\n")
## Model Matrix (X):
print(X)
## (Intercept) percent_of_classes_under_20 student_faculty_ratio
## 1 1 39 13
## 2 1 68 8
## 3 1 60 8
## 4 1 65 3
## 5 1 67 10
## 6 1 52 8
## 7 1 45 12
## 8 1 69 7
## 9 1 72 13
## 10 1 61 10
## 11 1 68 8
## 12 1 65 7
## 13 1 54 10
## 14 1 73 8
## 15 1 64 9
## 16 1 55 11
## 17 1 65 6
## 18 1 63 13
## 19 1 66 8
## 20 1 32 19
## 21 1 68 5
## 22 1 62 8
## 23 1 69 7
## 24 1 67 9
## 25 1 56 12
## 26 1 58 17
## 27 1 32 19
## 28 1 42 20
## 29 1 41 18
## 30 1 48 19
## 31 1 45 20
## 32 1 65 4
## 33 1 31 23
## 34 1 29 15
## 35 1 51 15
## 36 1 40 16
## 37 1 53 13
## 38 1 65 7
## 39 1 63 10
## 40 1 53 13
## 41 1 39 21
## 42 1 44 13
## 43 1 37 12
## 44 1 37 13
## 45 1 68 9
## 46 1 59 11
## 47 1 73 7
## 48 1 77 7
## attr(,"assign")
## [1] 0 1 2
#You can see the model matrix X as printed above.
#d
beta<-coef(combined_model)
print(beta)
## (Intercept) percent_of_classes_under_20
## 39.6555835 0.1661686
## student_faculty_ratio
## -1.7021103
#The least squares estimate of beta is the vector (39.6555835, 0.1661686, -1.7021103).
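# The same least squares estimate computed from the normal equations,
# beta_hat = (X'X)^(-1) X'y (a sketch using the model matrix X built above).
beta_hat <- solve(crossprod(X), crossprod(X, alumni$alumni_giving_rate))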
#e
#An estimator is unbiased if its expected value equals the true parameter: over repeated samples from the population, the least squares estimates neither systematically overestimate nor underestimate β. Any single sample's estimate may differ from the truth, but on average the estimates are centered on the true values.
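# A small simulation sketch of unbiasedness (hypothetical values chosen to mirror the
# Question 2 setup, not part of the assignment data): over many simulated samples,
# the average of the least squares slope estimates is close to the true slope of 5.
set.seed(1)
slope_estimates <- replicate(1000, {
  x <- rnorm(100, mean = 2, sd = 0.1)
  y <- 10 + 5 * x + rnorm(100, mean = 0, sd = 0.5)
  coef(lm(y ~ x))[2]
})
mean_slope <- mean(slope_estimates)  # should be close to 5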
##Danielle Robben BANA 7052 Module 2 Assignment
Key Information: The code for this report is provided in the R Markdown file in Appendix A at the bottom of this report.
Question 1 My final regression equation for the relationship between the percent of classes under 20 students (X1), the student faculty ratio (X2), and the rate at which alumni donate (Y) is Y = 39.6556 + 0.1662*X1 - 1.7021*X2. While this is the best-fitting linear model, the residuals show some outliers, most visibly at their minimum and maximum values. Since the first quartile, median, and third quartile of the residuals are relatively close to zero (within about 5), the fit is acceptable. The significance codes show that the student faculty ratio is a significant predictor of the alumni donation rate, whereas the percent of classes under 20 students is not significant once the student faculty ratio is in the model. Even with this significant predictor, the R-squared value is 56.13%, so just over half of the variation in the donation rate is explained by the two predictors together, and the very small p-value of the overall F-test confirms that the model explains a meaningful part of the donation rate. If an observation occurred where the percent of classes under 20 was 50% and the student faculty ratio was 10, we would expect 30.94291% of the alumni to donate back to the college.
When testing the statistical significance of each regression coefficient with a t-test, the null hypothesis is that the coefficient equals zero, i.e. that the predictor has no effect on the percent of alumni who donate. The p-value for the intercept (0.005) only tests whether the intercept itself is zero. Since the p-value for the percent of classes under 20 is 0.31, which is greater than 0.05, we fail to reject the null hypothesis for that coefficient: it is not statistically significant once the student-faculty ratio is in the model. Since the p-value for the student-faculty ratio is 0.00037, which is less than 0.05, we reject the null hypothesis and conclude that there is a statistically significant relationship between the donation rate and the student-faculty ratio.
When testing with the F-statistic, the null hypothesis is that all slope coefficients in the model are zero, so that neither predictor affects the percent of alumni who donate. The resulting F-statistic is 28.79 on 2 and 45 degrees of freedom (48 observations, 2 predictors plus an intercept, leaving 45 residual degrees of freedom). The F-statistic is a ratio of explained to unexplained variance, not a percentage explained, and a value this large would be very unlikely if the null hypothesis were true. The p-value is 8.868867e-09, essentially zero, which indicates that the model as a whole is statistically significant. Since the p-value is less than 0.05, we reject the null hypothesis: the predictors together are related to the giving rate.
The coefficient of determination is 0.5613406, meaning 56.13% of the variation in the alumni giving rate is explained by the percent of classes under 20 students and the student faculty ratio together. The correlation coefficient between percent of classes under 20 and alumni giving rate (r1) is 0.6456504; this moderately positive value indicates a positive linear relationship between percent of classes under 20 and donation rate. The correlation coefficient between student faculty ratio and alumni giving rate (r2) is -0.7423975, a fairly strong negative relationship. The coefficient of determination (R^2) is 0.5613 and, as stated before, measures how much of the variation in the giving rate the two predictors explain together. Note that R^2 is not simply the sum of the squared simple correlations (r1^2 + r2^2 would be about 0.97): the two predictors are correlated with each other, so the information they share about the giving rate is only counted once in the multiple regression.
Question 2 The estimated prediction equation is Y = 10.20445 + 4.89254X1 - 1.96562X2. The intercept (10.20445) has a standard error of 0.31401, the X1 coefficient (4.89254) has a standard error of 0.15672, and the X2 coefficient (-1.96562) has a standard error of 0.03848. Standard errors are not compared against a 0.05 cutoff; significance is judged from the t-statistics and p-values. The null hypothesis for each test is that the corresponding coefficient is zero and so has no effect on the outcome. The intercept has a t-value of 32.50, X1 a t-value of 31.22, and X2 a t-value of -51.08, each with a p-value < 2e-16. Since the p-values are extremely small, every coefficient is highly significant, with the intercept and X1 positive and X2 negative. The residual standard error of the fitted model is 0.4894 (an MSE of roughly 0.24), so the predictions are quite accurate.
Using a larger error standard deviation, the estimated prediction equation becomes Y = 10.03929 + 4.97594X1 - 2.01676X2, where the intercept has a standard error of 0.63737, the X1 coefficient a standard error of 0.31810, and the X2 coefficient a standard error of 0.07811. The standard errors are roughly twice as large as before, reflecting the noisier data. The null hypothesis that each coefficient is zero is again rejected: the p-values remain extremely small, with X1 positively and X2 negatively related to Y. The residual standard error is 0.9933 (an MSE of roughly 0.99), so the predictions sit farther from the observed values than in the previous fit.
From the fits in parts c and d we can conclude that the larger the error variance, the larger the standard errors of the coefficients and the larger the MSE; with more noise in the response, the estimated effects are measured less precisely. In addition, the smaller the sample size, the less precise the estimates. When the error variance stays at its original value, we still obtain highly significant coefficients and a low mean squared error even with the smaller sample.
Question 3 The multiple linear regression model is Y = β0 + β1*X1 + β2*X2 + ε (in matrix form, Y = Xβ + ε), where Y is the alumni giving rate, X1 is the percent of classes under 20, X2 is the student faculty ratio, β0, β1, and β2 are the regression coefficients, and ε is a normally distributed error term. In R, the model matrix combines the intercept column with X1 and X2. The resulting fitted equation using the data provided is Y = 39.6555834726146 * X(Intercept) + 0.166168629593652 * Xpercent_of_classes_under_20 + -1.70211027228975 * Xstudent_faculty_ratio + ε. With this model, we assume the relationship between each predictor and Y is linear, and that the errors are independent and normally distributed with constant variance; we also assume there are no highly influential outliers in the data set. The model matrix X is printed above. The least squares estimate of beta is the vector (39.6555835, 0.1661686, -1.7021103). An estimator is unbiased if its expected value equals the true parameter: over repeated samples from the population, the least squares estimates neither systematically overestimate nor underestimate β, so on average the estimates are centered on the true values.
Appendix Appendix A: http://rpubs.com/drobb2019/LinearRegressionAssignment1-2