setwd("C:/Users/drobb/Desktop/Linear Regression")
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.3.2
library(MASS)
## Warning: package 'MASS' was built under R version 4.3.2
library(quantreg)
## Loading required package: SparseM
##
## Attaching package: 'SparseM'
## The following object is masked from 'package:base':
##
## backsolve
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.3.2
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ lubridate 1.9.2 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ dplyr::select() masks MASS::select()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(rgl)
## Warning: package 'rgl' was built under R version 4.3.2
library(scatterplot3d)
####Question 1
####Assignment1
# Assignment 1 ----------------------------------------------------------------------
# Question 1 ----------------------------------------------------------------------
url <- "C:/Users/drobb/Desktop/Linear Regression/alumni.csv"
alumni <- read.csv(url)
str(alumni) # print structure of the alumni data frame
## 'data.frame': 48 obs. of 5 variables:
## $ school : chr "Boston College" "Brandeis University " "Brown University" "California Institute of Technology" ...
## $ percent_of_classes_under_20: int 39 68 60 65 67 52 45 69 72 61 ...
## $ student_faculty_ratio : int 13 8 8 3 10 8 12 7 13 10 ...
## $ alumni_giving_rate : int 25 33 40 46 28 31 27 31 35 53 ...
## $ private : int 1 1 1 1 1 1 1 1 1 1 ...
summary(alumni)
## school percent_of_classes_under_20 student_faculty_ratio
## Length:48 Min. :29.00 Min. : 3.00
## Class :character 1st Qu.:44.75 1st Qu.: 8.00
## Mode :character Median :59.50 Median :10.50
## Mean :55.73 Mean :11.54
## 3rd Qu.:66.25 3rd Qu.:13.50
## Max. :77.00 Max. :23.00
## alumni_giving_rate private
## Min. : 7.00 Min. :0.0000
## 1st Qu.:18.75 1st Qu.:0.0000
## Median :29.00 Median :1.0000
## Mean :29.27 Mean :0.6875
## 3rd Qu.:38.50 3rd Qu.:1.0000
## Max. :67.00 Max. :1.0000
#Note: response variable is Alumni Giving Rate and the predictor variable is % of classes under 20
summary(alumni$percent_of_classes_under_20)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 29.00 44.75 59.50 55.73 66.25 77.00
summary(alumni$alumni_giving_rate)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 7.00 18.75 29.00 29.27 38.50 67.00
ggplot(alumni, aes(x= percent_of_classes_under_20, y= alumni_giving_rate)) +
geom_point() +
labs(title = "Alumni Giving Rate to Percent of classes under 20",
x = "% of Classes Under 20",
y = "Alumni Giving Rate")

find_outliers <- function(variable) {
q <- quantile(variable)
iqr <- IQR(variable)
lower_limit <- q[2] - 1.2 * iqr
upper_limit <- q[4] +1.2 * iqr
return(variable[variable < lower_limit | variable > upper_limit])
}
outliers_under_20 <- boxplot.stats(alumni$percent_of_classes_under_20)$out
outliers_giving_rate <- boxplot.stats(alumni$alumni_giving_rate)$out
correlation_coefficient <- cor(alumni$percent_of_classes_under_20, alumni$alumni_giving_rate)
model <- lm(alumni_giving_rate ~ percent_of_classes_under_20, data=alumni)
coefficients(model)
## (Intercept) percent_of_classes_under_20
## -7.3860676 0.6577687
outliers_under_20
## integer(0)
#find Outliers
residuals <- residuals(model)
confidence_interval <- mean(alumni$alumni_giving_rate) + c(-1.96, 1.96) * sd(residuals)
outliers <- subset(alumni, alumni_giving_rate < confidence_interval[1] | alumni_giving_rate > confidence_interval[2])
print(outliers)
## school percent_of_classes_under_20 student_faculty_ratio
## 10 Dartmouth College 61 10
## 21 Princeton University 68 5
## 27 U. of California-Davis 32 19
## 28 U. of California-Irvine 42 20
## 30 U. of California-San Diego 48 19
## 48 Yale University 77 7
## alumni_giving_rate private
## 10 53 1
## 21 67 1
## 27 7 0
## 28 9 0
## 30 8 0
## 48 50 1
#Linear Regression is -7.2860676 + 0.6577686x
ggplot(alumni, aes(x = percent_of_classes_under_20, y = alumni_giving_rate)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE, color = "blue") +
labs(title = paste("Alumni Giving Rate to Percent of Classes Under 20\n",
"\nCorrelation Coefficient:", round(correlation_coefficient, 2),
"\nRegression Equation: Y = -7.39 + 0.66X",
sep = ""),
x = "% of Classes Under 20",
y = "Alumni Giving Rate")
## `geom_smooth()` using formula = 'y ~ x'

#Assignment 2 --------------------------------------------------------------------
# Question 1 ----------------------------------------------------------------------
#Note: this is the same plot as used in the last assignment
ggplot(alumni, aes(x = percent_of_classes_under_20, y = alumni_giving_rate)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE, color = "blue") +
labs(title = paste("Alumni Giving Rate to Percent of Classes Under 20\n",
"\nCorrelation Coefficient:", round(correlation_coefficient, 2),
"\nRegression Equation: Y = -7.39 + 0.66X",
sep = ""),
x = "% of Classes Under 20",
y = "Alumni Giving Rate")
## `geom_smooth()` using formula = 'y ~ x'

tstat <- summary(model)$coefficients["percent_of_classes_under_20", "t value"]
p_value <- summary(model)$coefficients["percent_of_classes_under_20", "Pr(>|t|)"]
cat("Estimated Slope:", coef(model)["percent_of_classes_under_20"], "\n")
## Estimated Slope: 0.6577687
cat("T-Statstic:", tstat, "\n")
## T-Statstic: 5.734448
cat("P-Value:", p_value, "\n")
## P-Value: 7.228121e-07
if (!is.na(p_value) && p_value < 0.05) {
cat("The null hypothesis is rejected as estimated slope is significant at α=0.05 level.\n")
} else {
cat("The estimated slope is not statistically significant at the α=0.05 level.\n")
}
## The null hypothesis is rejected as estimated slope is significant at α=0.05 level.
#null hypothesis is rejected
#Repeat part a. above using th equivalent F-test
model <- lm(alumni_giving_rate ~ percent_of_classes_under_20, data = alumni)
f_test <- anova(model)
f_statistic <- f_test$"F value"[1]
p_value_f <- f_test$"Pr(>F)"[1]
cat("Estimated Slope F-Test", coef(model)["percent_of_classes_under_20"], "\n")
## Estimated Slope F-Test 0.6577687
cat("F-Statistic:", f_statistic, "\n")
## F-Statistic: 32.88389
cat("P-Value (F-Test):", p_value_f, "\n")
## P-Value (F-Test): 7.228121e-07
if(p_value_f < 0.05){
cat("The slope from the F-test is statistically significant at α=0.05 level.\n")
cat("The null hypothesis is rejected and there is a correlation between class size and alumni giving rate.\n")
} else {
cat("The estimated slope is not statistically significant at α=0.05 level.\n")
cat("The null hypothesis is suported and there is not a correlation between class size and alumni giving rate. \n")
}
## The slope from the F-test is statistically significant at α=0.05 level.
## The null hypothesis is rejected and there is a correlation between class size and alumni giving rate.
#What is the value of R^2? Please interpret
model <- lm(alumni_giving_rate ~ percent_of_classes_under_20, data = alumni)
r_squared <- summary(model)$r.squared
cat("R-squared:", r_squared, "\n")
## R-squared: 0.4168645
cat(r_squared*100, "percent of the change in giving rate is related to the percentage of classes they attended under 20 students.",
(1-(r_squared))*100, "percent of the change in giving rate is not explained by the percentage of classes under 20 students and can be effected by other factors. \n")
## 41.68645 percent of the change in giving rate is related to the percentage of classes they attended under 20 students. 58.31355 percent of the change in giving rate is not explained by the percentage of classes under 20 students and can be effected by other factors.
r_correlation_cofficient <- cor(alumni$percent_of_classes_under_20, alumni$alumni_giving_rate)
cat("Correlation Coefficient (r):", r_correlation_cofficient, "\n")
## Correlation Coefficient (r): 0.6456504
cat("The relationship between r^2 and R^2 are equal to one another in a linear regression. So r is the squareroot of R^2. r, the correlation coefficient ranges from -1 to 1, so it can show a positive or negative form of linear relationship. R^2 must be positive and if it is 0, there is no explained variance and if it is 1 all variance is explained.")
## The relationship between r^2 and R^2 are equal to one another in a linear regression. So r is the squareroot of R^2. r, the correlation coefficient ranges from -1 to 1, so it can show a positive or negative form of linear relationship. R^2 must be positive and if it is 0, there is no explained variance and if it is 1 all variance is explained.
cat("In this model, the relationship is positively correlated and class size represents 41% of donation percent")
## In this model, the relationship is positively correlated and class size represents 41% of donation percent
ggplot(alumni, aes(x = percent_of_classes_under_20, y = alumni_giving_rate)) +
geom_point() +
stat_smooth(method = "lm", se = TRUE, col = "blue", fill = "lightblue") +
labs(title = "Fitted Regression Line with 95% Confidence Band",
x = "% of Classes Under 20",
y = "Alumni Giving Rate")
## `geom_smooth()` using formula = 'y ~ x'

cat("At the mean point, the confidence band is narrower compared to the rest of the model")
## At the mean point, the confidence band is narrower compared to the rest of the model
#a.
model <- lm(alumni_giving_rate ~ percent_of_classes_under_20 + student_faculty_ratio, data = alumni)
summary <- summary(model)
print(summary)
##
## Call:
## lm(formula = alumni_giving_rate ~ percent_of_classes_under_20 +
## student_faculty_ratio, data = alumni)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.00 -6.57 -1.95 4.42 24.56
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 39.6556 13.5076 2.936 0.005225 **
## percent_of_classes_under_20 0.1662 0.1626 1.022 0.312128
## student_faculty_ratio -1.7021 0.4421 -3.850 0.000371 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9.098 on 45 degrees of freedom
## Multiple R-squared: 0.5613, Adjusted R-squared: 0.5418
## F-statistic: 28.79 on 2 and 45 DF, p-value: 8.869e-09
get_eq <- function(model) {
eq <- as.character(paste("Y =", round(coef(model)[1], 2), "+", round(coef(model)[2], 2), "* X"))
return(eq)
}
classesunder20togiving <- ggplot(alumni, aes(x = percent_of_classes_under_20, y = alumni_giving_rate)) +
geom_point() +
#geom_smooth(method = "lm", se = FALSE, col = "blue") +
labs(title = "Alumni giving rate to % of classes under 20",
x = "% of Classes Under 20",
y = "Alumni Giving Rate") +
geom_text(x = 40, y = 50, label = get_eq(model), color = "blue", parse = TRUE)
print(classesunder20togiving)

studentfacultytogiving <- ggplot(alumni, aes(x = student_faculty_ratio, y = alumni_giving_rate)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE, col = "blue") +
labs(title = "Alumni giving rate to Student Faculty Ratio",
x = "Student Faculty Ratio",
y = "Alumni Giving Rate") +
geom_text(x = 10, y = 50, label = get_eq(model), color = "blue", parse = TRUE)
print(studentfacultytogiving)
## `geom_smooth()` using formula = 'y ~ x'

combined_data <- rbind(
transform(alumni, variable = "% of Classes Under 20"),
transform(alumni, variable = "Student Faculty Ratio", percent_of_classes_under_20 = student_faculty_ratio)
)
classesunder20andstudentfaculty <- ggplot(combined_data, aes(x = percent_of_classes_under_20, y = alumni_giving_rate)) +
geom_point(aes(color = variable)) +
geom_smooth(method = "lm", se = FALSE, aes(group = variable, color = variable), show.legend = FALSE) +
facet_wrap(~ variable, scales = "free") +
labs(title = "Alumni Giving Rate",
x = "% of Classes Under 20",
y = "Alumni Giving Rate")
print(classesunder20andstudentfaculty)
## `geom_smooth()` using formula = 'y ~ x'

combined_model <- lm(alumni_giving_rate ~ percent_of_classes_under_20 + student_faculty_ratio, data = alumni)
summary_combined_model_Assignment3Q1 <- summary(combined_model)
print(summary_combined_model_Assignment3Q1)
##
## Call:
## lm(formula = alumni_giving_rate ~ percent_of_classes_under_20 +
## student_faculty_ratio, data = alumni)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.00 -6.57 -1.95 4.42 24.56
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 39.6556 13.5076 2.936 0.005225 **
## percent_of_classes_under_20 0.1662 0.1626 1.022 0.312128
## student_faculty_ratio -1.7021 0.4421 -3.850 0.000371 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9.098 on 45 degrees of freedom
## Multiple R-squared: 0.5613, Adjusted R-squared: 0.5418
## F-statistic: 28.79 on 2 and 45 DF, p-value: 8.869e-09
#My final regression equation for the relationship between the percent of classes under 20 students (X1) and the student faculty ratio (X2) and the rate at which alumni donate (Y) is Y= 39.6556 + 0.1662* X1 - 1.7021* X2 . While this is the best fit, there are outliers, specifically the Minimum value and the maximum value of the residuals. As the first quartile, median and 3rd quartile are relatively in-line with a deviance of less than 5, we can accept this as a best fit model. The significance codes returned from this model show that the student faculty ratio is significant in the alumni donation rate, however the percent of classes under 20 students is not a significant factor into the alumni donation rate when compared to the student faculty ratio. Even with the significance between student faculty ratio and donation rate, the R squared value is returning 56.13%, showing that just over half the donation rate can be explained by either of the two provided variables. The low p-value also verifies that a connection to explain the donation rate has been determined in the model.
#b.
predicted_alumni_giving <- data.frame(percent_of_classes_under_20 = 50, student_faculty_ratio = 10)
predicted_alumni_giving_rate <- predict(combined_model, newdata = predicted_alumni_giving)
cat("Predicted Alumni Giving Rate:", predicted_alumni_giving_rate, "\n")
## Predicted Alumni Giving Rate: 30.94291
#If an observation occurred where the percent of classes under 20 was 50% and the student faculty ratio was 10, we would expect 30.94291% of the alumni to donate back to the college
#c.
coefficient_table <- summary(model)$coefficients
print(coefficient_table)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 39.6555835 13.5075774 2.935803 0.0052247868
## percent_of_classes_under_20 0.1661686 0.1625520 1.022249 0.3121275033
## student_faculty_ratio -1.7021103 0.4421271 -3.849821 0.0003709425
# When testing the statistical significance of the regression coefficient using t-test, the null hypothesis is that the coefficients would be equal to zero, thus neither the percent of classes under twenty students nor the ratio of students to faculty would have an effect on the percent of alumni who donate to the colleges. Since the p-value of the intercept is 0.005 and less than the accepted threshold of 0.05, we would say there is an impact from at least one of the two variables on the donation rate. Since the p-value of the percent of classes under 20 is 0.31, and greater than 0.05, we confirm the null hypothesis in stating that the percent of classes under 20 does not have statistical significance on the donation rate. As the p-value of the student-faculty ratio is 0.00037 and less than 0.05, then we deny the null hypothesis and affirm that there is a statistically significant relationship between donation rate and student-faculty ratio.
#d.
f_statistic3 <- summary(model)$fstatistic
p_value_f_Stat3 <- pf(f_statistic3[1], f_statistic3[2], f_statistic3[3], lower.tail = FALSE)
cat("F-statistic:", f_statistic3[1], "\n")
## F-statistic: 28.79264
cat("Degrees of Freedom:", f_statistic3[2],"/",f_statistic3[3],"\n")
## Degrees of Freedom: 2 / 45
cat("P-value3", p_value_f_Stat3, "\n")
## P-value3 8.868867e-09
# When testing the F-statistic, we state the null hypothesis that all coefficients in the model are zero, thus they do not impact the percent of alumni who donate. The resulting F-statistic is 28.79 with 2/45 degrees of freedom. This means that put together, the percent of classes under 20 and the student to faculty ratio explains 28.79264 of the percent of alumni who donate and there are 47 data points taken into consideration with 2 predictors and 45 remaining observations. The p-value is 868867e^-09, which is very close to zero. This indicates that the model as a whole is statistically significant, and observing an F-statistic of 28.79 would be very unlikely. As the p-value is less than 0.05, we reject the null hypothesis and there is significance between the variables and the output.
#e.
r_squared3 <- summary(model)$r.squared
cat("R-Squared Value:", r_squared3, "\n")
## R-Squared Value: 0.5613406
#The coefficient determination value is 0.5613406, meaning 56.13% of the alumni giving rate can be attributed to either the percent of classes under 20 students or the student faculty ratio.
#f
cor_percent <- cor(alumni$percent_of_classes_under_20, alumni$alumni_giving_rate)
cor_ratio <- cor(alumni$student_faculty_ratio, alumni$alumni_giving_rate)
cat("Correlation coefficient between percent_of_classes_under_20 and alumni_giving_rate (r_1):", cor_percent, "\n")
## Correlation coefficient between percent_of_classes_under_20 and alumni_giving_rate (r_1): 0.6456504
cat("Correlation coefficient between student_faculty_ratio and alumni_giving_rate (r_2):", cor_ratio, "\n")
## Correlation coefficient between student_faculty_ratio and alumni_giving_rate (r_2): -0.7423975
#The correlation coefficient between percent of classes under 20 and alumni giving rate (r1) is 0.6456504. As it is a moderately positive value, we can expect a positive linear relationship between percent of classes under 20 and donation rate. The correlation coefficient between student faculty ratio and alumni giving rate (r2) is -0.7423975. As a strong negative value, we can expect a strong negative relationship between student faculty ratio and alumni giving rate. The coefficient of determination (R2) is 0.5613, and as stated before, shows how much the giving rate is effected by either of the variables. Using these r values, we receive an unexpected equation as R2 should equal the square root of (r2)^2 + (r2)^2. We would expect the value to be 0.983879. Because of this inconsistency, we must do additional analysis to confirm results.
# Question 1 ----------------------------------------------------------------------
model <- lm(alumni_giving_rate ~ percent_of_classes_under_20 + student_faculty_ratio, data = alumni)
summary(model)
##
## Call:
## lm(formula = alumni_giving_rate ~ percent_of_classes_under_20 +
## student_faculty_ratio, data = alumni)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.00 -6.57 -1.95 4.42 24.56
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 39.6556 13.5076 2.936 0.005225 **
## percent_of_classes_under_20 0.1662 0.1626 1.022 0.312128
## student_faculty_ratio -1.7021 0.4421 -3.850 0.000371 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9.098 on 45 degrees of freedom
## Multiple R-squared: 0.5613, Adjusted R-squared: 0.5418
## F-statistic: 28.79 on 2 and 45 DF, p-value: 8.869e-09
#a.
plot(model, which = 2)

qqnorm(resid(model))
qqline(resid(model))

#b.
plot(model, which = 3)

#c.
plot(model, which = 5)

plot(model, which = 4)

#d.
library(car)
## Warning: package 'car' was built under R version 4.3.2
## Loading required package: carData
## Warning: package 'carData' was built under R version 4.3.2
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
## The following object is masked from 'package:purrr':
##
## some
vif(model)
## percent_of_classes_under_20 student_faculty_ratio
## 2.611671 2.611671
#e.
new_data <- data.frame(percent_of_classes_under_20 = 40, student_faculty_ratio = 5)
A4_predicted_alumni_giving <- predict(model, newdata = new_data)
cat("Predicted Alumni Giving Rate X1=40, X2=5:", A4_predicted_alumni_giving)
## Predicted Alumni Giving Rate X1=40, X2=5: 37.79178