Final Linear Regression

setwd("C:/Users/drobb/Desktop/Linear Regression")
library(ggplot2)

## Warning: package 'ggplot2' was built under R version 4.3.2

library(MASS)

## Warning: package 'MASS' was built under R version 4.3.2

library(quantreg)

## Loading required package: SparseM

## 
## Attaching package: 'SparseM'

## The following object is masked from 'package:base':
## 
##     backsolve

library(tidyverse)

## Warning: package 'tidyverse' was built under R version 4.3.2

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ lubridate 1.9.2     ✔ tibble    3.2.1
## ✔ purrr     1.0.2     ✔ tidyr     1.3.0

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ✖ dplyr::select() masks MASS::select()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(rgl)

## Warning: package 'rgl' was built under R version 4.3.2

library(scatterplot3d)

####Question 1
####Assignment1
# Assignment 1 ----------------------------------------------------------------------
# Question 1 ----------------------------------------------------------------------

url <- "C:/Users/drobb/Desktop/Linear Regression/alumni.csv"
alumni <- read.csv(url)
str(alumni)  # print structure of the alumni data frame

## 'data.frame':    48 obs. of  5 variables:
##  $ school                     : chr  "Boston College" "Brandeis University " "Brown University" "California Institute of Technology" ...
##  $ percent_of_classes_under_20: int  39 68 60 65 67 52 45 69 72 61 ...
##  $ student_faculty_ratio      : int  13 8 8 3 10 8 12 7 13 10 ...
##  $ alumni_giving_rate         : int  25 33 40 46 28 31 27 31 35 53 ...
##  $ private                    : int  1 1 1 1 1 1 1 1 1 1 ...

summary(alumni)

##     school          percent_of_classes_under_20 student_faculty_ratio
##  Length:48          Min.   :29.00               Min.   : 3.00        
##  Class :character   1st Qu.:44.75               1st Qu.: 8.00        
##  Mode  :character   Median :59.50               Median :10.50        
##                     Mean   :55.73               Mean   :11.54        
##                     3rd Qu.:66.25               3rd Qu.:13.50        
##                     Max.   :77.00               Max.   :23.00        
##  alumni_giving_rate    private      
##  Min.   : 7.00      Min.   :0.0000  
##  1st Qu.:18.75      1st Qu.:0.0000  
##  Median :29.00      Median :1.0000  
##  Mean   :29.27      Mean   :0.6875  
##  3rd Qu.:38.50      3rd Qu.:1.0000  
##  Max.   :67.00      Max.   :1.0000

#Note: response variable is Alumni Giving Rate and the predictor variable is % of classes under 20
summary(alumni$percent_of_classes_under_20)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   29.00   44.75   59.50   55.73   66.25   77.00

summary(alumni$alumni_giving_rate)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    7.00   18.75   29.00   29.27   38.50   67.00

ggplot(alumni, aes(x= percent_of_classes_under_20, y=  alumni_giving_rate)) + 
  geom_point() +
  labs(title = "Alumni Giving Rate to Percent of classes under 20", 
       x = "% of Classes Under 20",
       y = "Alumni Giving Rate")

find_outliers <- function(variable) {
  q <- quantile(variable)
  iqr <- IQR(variable)
  lower_limit <- q[2] - 1.2 * iqr
  upper_limit <- q[4] +1.2 * iqr
  return(variable[variable < lower_limit | variable > upper_limit])
}


outliers_under_20 <- boxplot.stats(alumni$percent_of_classes_under_20)$out
outliers_giving_rate <- boxplot.stats(alumni$alumni_giving_rate)$out
correlation_coefficient <- cor(alumni$percent_of_classes_under_20, alumni$alumni_giving_rate)
model <- lm(alumni_giving_rate ~ percent_of_classes_under_20, data=alumni)
coefficients(model)

##                 (Intercept) percent_of_classes_under_20 
##                  -7.3860676                   0.6577687

outliers_under_20

## integer(0)

#find Outliers
residuals <- residuals(model)
confidence_interval <- mean(alumni$alumni_giving_rate) + c(-1.96, 1.96) * sd(residuals)
outliers <- subset(alumni, alumni_giving_rate < confidence_interval[1] | alumni_giving_rate > confidence_interval[2])
print(outliers)

##                        school percent_of_classes_under_20 student_faculty_ratio
## 10          Dartmouth College                          61                    10
## 21       Princeton University                          68                     5
## 27     U. of California-Davis                          32                    19
## 28    U. of California-Irvine                          42                    20
## 30 U. of California-San Diego                          48                    19
## 48            Yale University                          77                     7
##    alumni_giving_rate private
## 10                 53       1
## 21                 67       1
## 27                  7       0
## 28                  9       0
## 30                  8       0
## 48                 50       1

#Linear Regression is  -7.2860676 + 0.6577686x
ggplot(alumni, aes(x = percent_of_classes_under_20, y = alumni_giving_rate)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE, color = "blue") +
  labs(title = paste("Alumni Giving Rate to Percent of Classes Under 20\n",
                     "\nCorrelation Coefficient:", round(correlation_coefficient, 2),
                     "\nRegression Equation: Y = -7.39 + 0.66X",
                     sep = ""),
       x = "% of Classes Under 20",
       y = "Alumni Giving Rate")

## `geom_smooth()` using formula = 'y ~ x'

#Assignment 2 --------------------------------------------------------------------
# Question 1 ----------------------------------------------------------------------
#Note: this is the same plot as used in the last assignment
ggplot(alumni, aes(x = percent_of_classes_under_20, y = alumni_giving_rate)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE, color = "blue") +
  labs(title = paste("Alumni Giving Rate to Percent of Classes Under 20\n",
                     "\nCorrelation Coefficient:", round(correlation_coefficient, 2),
                     "\nRegression Equation: Y = -7.39 + 0.66X",
                     sep = ""),
       x = "% of Classes Under 20",
       y = "Alumni Giving Rate")

## `geom_smooth()` using formula = 'y ~ x'

tstat <- summary(model)$coefficients["percent_of_classes_under_20", "t value"]
p_value <- summary(model)$coefficients["percent_of_classes_under_20", "Pr(>|t|)"]
cat("Estimated Slope:", coef(model)["percent_of_classes_under_20"], "\n")

## Estimated Slope: 0.6577687

cat("T-Statstic:", tstat, "\n")

## T-Statstic: 5.734448

cat("P-Value:", p_value, "\n")

## P-Value: 7.228121e-07

if (!is.na(p_value) && p_value < 0.05) {
  cat("The null hypothesis is rejected as estimated slope is significant at α=0.05 level.\n")
} else {
  cat("The estimated slope is not statistically significant at the α=0.05 level.\n")
}

## The null hypothesis is rejected as estimated slope is significant at α=0.05 level.

#null hypothesis is rejected

#Repeat part a. above using th equivalent F-test

model <- lm(alumni_giving_rate ~ percent_of_classes_under_20, data = alumni)
f_test <- anova(model)
f_statistic <- f_test$"F value"[1]
p_value_f <- f_test$"Pr(>F)"[1]
cat("Estimated Slope F-Test", coef(model)["percent_of_classes_under_20"], "\n")

## Estimated Slope F-Test 0.6577687

cat("F-Statistic:", f_statistic, "\n")

## F-Statistic: 32.88389

cat("P-Value (F-Test):", p_value_f, "\n")

## P-Value (F-Test): 7.228121e-07

if(p_value_f < 0.05){
  cat("The slope from the F-test is statistically significant at α=0.05 level.\n")
  cat("The null hypothesis is rejected and there is a correlation between class size and alumni giving rate.\n")
} else {
  cat("The estimated slope is not statistically significant at α=0.05 level.\n")
  cat("The null hypothesis is suported and there is not a correlation between class size and alumni giving rate. \n")
}

## The slope from the F-test is statistically significant at α=0.05 level.
## The null hypothesis is rejected and there is a correlation between class size and alumni giving rate.

#What is the value of R^2?  Please interpret
model <- lm(alumni_giving_rate ~ percent_of_classes_under_20, data = alumni)
r_squared <- summary(model)$r.squared
cat("R-squared:", r_squared, "\n")

## R-squared: 0.4168645

cat(r_squared*100, "percent of the change in giving rate is related to the percentage of classes they attended under 20 students.", 
    (1-(r_squared))*100, "percent of the change in giving rate is not explained by the percentage of classes under 20 students and can be effected by other factors. \n")

## 41.68645 percent of the change in giving rate is related to the percentage of classes they attended under 20 students. 58.31355 percent of the change in giving rate is not explained by the percentage of classes under 20 students and can be effected by other factors.

r_correlation_cofficient <- cor(alumni$percent_of_classes_under_20, alumni$alumni_giving_rate)
cat("Correlation Coefficient (r):", r_correlation_cofficient, "\n")

## Correlation Coefficient (r): 0.6456504

cat("The relationship between r^2 and R^2 are equal to one another in a linear regression. So r is the squareroot of R^2.  r, the correlation coefficient ranges from -1 to 1, so it can show a positive or negative form of linear relationship.  R^2 must be positive and if it is 0, there is no explained variance and if it is 1 all variance is explained.")

## The relationship between r^2 and R^2 are equal to one another in a linear regression. So r is the squareroot of R^2.  r, the correlation coefficient ranges from -1 to 1, so it can show a positive or negative form of linear relationship.  R^2 must be positive and if it is 0, there is no explained variance and if it is 1 all variance is explained.

cat("In this model, the relationship is positively correlated and class size represents 41% of donation percent")

## In this model, the relationship is positively correlated and class size represents 41% of donation percent

ggplot(alumni, aes(x = percent_of_classes_under_20, y = alumni_giving_rate)) +
  geom_point() +
  stat_smooth(method = "lm", se = TRUE, col = "blue", fill = "lightblue") +
  labs(title = "Fitted Regression Line with 95% Confidence Band",
       x = "% of Classes Under 20",
       y = "Alumni Giving Rate")

## `geom_smooth()` using formula = 'y ~ x'

cat("At the mean point, the confidence band is narrower compared to the rest of the model")

## At the mean point, the confidence band is narrower compared to the rest of the model

#a. 

model <- lm(alumni_giving_rate ~ percent_of_classes_under_20 + student_faculty_ratio, data = alumni)
summary <- summary(model)
print(summary)

## 
## Call:
## lm(formula = alumni_giving_rate ~ percent_of_classes_under_20 + 
##     student_faculty_ratio, data = alumni)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -15.00  -6.57  -1.95   4.42  24.56 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                  39.6556    13.5076   2.936 0.005225 ** 
## percent_of_classes_under_20   0.1662     0.1626   1.022 0.312128    
## student_faculty_ratio        -1.7021     0.4421  -3.850 0.000371 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9.098 on 45 degrees of freedom
## Multiple R-squared:  0.5613, Adjusted R-squared:  0.5418 
## F-statistic: 28.79 on 2 and 45 DF,  p-value: 8.869e-09

get_eq <- function(model) {
  eq <- as.character(paste("Y =", round(coef(model)[1], 2), "+", round(coef(model)[2], 2), "* X"))
  return(eq)
}
classesunder20togiving <- ggplot(alumni, aes(x = percent_of_classes_under_20, y = alumni_giving_rate)) +
  geom_point() +
  #geom_smooth(method = "lm", se = FALSE, col = "blue") +
  labs(title = "Alumni giving rate to % of classes under 20",
       x = "% of Classes Under 20",
       y = "Alumni Giving Rate") +
  geom_text(x = 40, y = 50, label = get_eq(model), color = "blue", parse = TRUE)
print(classesunder20togiving)

studentfacultytogiving <- ggplot(alumni, aes(x = student_faculty_ratio, y = alumni_giving_rate)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE, col = "blue") +
  labs(title = "Alumni giving rate to Student Faculty Ratio",
       x = "Student Faculty Ratio",
       y = "Alumni Giving Rate") +
  geom_text(x = 10, y = 50, label = get_eq(model), color = "blue", parse = TRUE)
print(studentfacultytogiving)

## `geom_smooth()` using formula = 'y ~ x'

combined_data <- rbind(
  transform(alumni, variable = "% of Classes Under 20"),
  transform(alumni, variable = "Student Faculty Ratio", percent_of_classes_under_20 = student_faculty_ratio)
)

classesunder20andstudentfaculty <- ggplot(combined_data, aes(x = percent_of_classes_under_20, y = alumni_giving_rate)) +
  geom_point(aes(color = variable)) +
  geom_smooth(method = "lm", se = FALSE, aes(group = variable, color = variable), show.legend = FALSE) +
  facet_wrap(~ variable, scales = "free") +
  labs(title = "Alumni Giving Rate",
       x = "% of Classes Under 20",
       y = "Alumni Giving Rate")

print(classesunder20andstudentfaculty)

## `geom_smooth()` using formula = 'y ~ x'

combined_model <- lm(alumni_giving_rate ~ percent_of_classes_under_20 + student_faculty_ratio, data = alumni)
summary_combined_model_Assignment3Q1 <- summary(combined_model)
print(summary_combined_model_Assignment3Q1)

## 
## Call:
## lm(formula = alumni_giving_rate ~ percent_of_classes_under_20 + 
##     student_faculty_ratio, data = alumni)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -15.00  -6.57  -1.95   4.42  24.56 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                  39.6556    13.5076   2.936 0.005225 ** 
## percent_of_classes_under_20   0.1662     0.1626   1.022 0.312128    
## student_faculty_ratio        -1.7021     0.4421  -3.850 0.000371 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9.098 on 45 degrees of freedom
## Multiple R-squared:  0.5613, Adjusted R-squared:  0.5418 
## F-statistic: 28.79 on 2 and 45 DF,  p-value: 8.869e-09

#My final regression equation for the relationship between the percent of classes under 20 students (X1) and the student faculty ratio (X2) and the rate at which alumni donate (Y) is  Y= 39.6556 + 0.1662* X1  - 1.7021* X2 .  While this is the best fit, there are outliers, specifically the Minimum value and the maximum value of the residuals.  As the first quartile, median and 3rd quartile are relatively in-line with a deviance of less than 5, we can accept this as a best fit model.  The significance codes returned from this model show that the student faculty ratio is significant in the alumni donation rate, however the percent of classes under 20 students is not a significant factor into the alumni donation rate when compared to the student faculty ratio.  Even with the significance between student faculty ratio and donation rate, the R squared value is returning 56.13%, showing that just over half the donation rate can be explained by either of the two provided variables.  The low p-value also verifies that a connection to explain the donation rate has been determined in the model.  


#b. 
predicted_alumni_giving <- data.frame(percent_of_classes_under_20 = 50, student_faculty_ratio = 10)
predicted_alumni_giving_rate <- predict(combined_model, newdata = predicted_alumni_giving)
cat("Predicted Alumni Giving Rate:", predicted_alumni_giving_rate, "\n")

## Predicted Alumni Giving Rate: 30.94291

#If an observation occurred where the percent of classes under 20 was 50% and the student faculty ratio was 10, we would expect 30.94291% of the alumni to donate back to the college

#c. 

coefficient_table <- summary(model)$coefficients
print(coefficient_table)

##                               Estimate Std. Error   t value     Pr(>|t|)
## (Intercept)                 39.6555835 13.5075774  2.935803 0.0052247868
## percent_of_classes_under_20  0.1661686  0.1625520  1.022249 0.3121275033
## student_faculty_ratio       -1.7021103  0.4421271 -3.849821 0.0003709425

#   When testing the statistical significance of the regression coefficient using t-test, the null hypothesis is that the coefficients would be equal to zero, thus neither the percent of classes under twenty students nor the ratio of students to faculty would have an effect on the percent of alumni who donate to the colleges.  Since the p-value  of the intercept is 0.005 and less than the accepted threshold of 0.05, we would say there is an impact from at least one of the two variables on the donation rate.  Since the p-value of the percent of classes under 20 is 0.31, and greater than 0.05, we confirm the null hypothesis in stating that the percent of classes under 20 does not have statistical significance on the donation rate.  As the p-value of the student-faculty ratio is 0.00037 and less than 0.05, then we deny the null hypothesis and affirm that there is a statistically significant relationship between donation rate and student-faculty ratio.  

#d. 
f_statistic3 <- summary(model)$fstatistic
p_value_f_Stat3 <- pf(f_statistic3[1], f_statistic3[2], f_statistic3[3], lower.tail = FALSE)
cat("F-statistic:", f_statistic3[1], "\n")

## F-statistic: 28.79264

cat("Degrees of Freedom:", f_statistic3[2],"/",f_statistic3[3],"\n")

## Degrees of Freedom: 2 / 45

cat("P-value3", p_value_f_Stat3, "\n")

## P-value3 8.868867e-09

#   When testing the F-statistic, we state the null hypothesis that all coefficients in the model are zero, thus they do not impact the percent of alumni who donate.  The resulting F-statistic is 28.79 with 2/45 degrees of freedom.  This means that put together, the percent of classes under 20 and the student to faculty ratio explains 28.79264 of the percent of alumni who donate and there are 47 data points taken into consideration with 2 predictors and 45 remaining observations.  The p-value is 868867e^-09, which is very close to zero.  This indicates that the model as a whole is statistically significant, and observing an F-statistic of 28.79 would be very unlikely.   As the p-value is less than 0.05, we reject the null hypothesis and there is significance between the variables and the output.  

#e. 
r_squared3 <- summary(model)$r.squared
cat("R-Squared Value:", r_squared3, "\n")

## R-Squared Value: 0.5613406

#The coefficient determination value is 0.5613406, meaning 56.13% of the alumni giving rate can be attributed to either the percent of classes under 20 students or the student faculty ratio. 


#f
cor_percent <- cor(alumni$percent_of_classes_under_20, alumni$alumni_giving_rate)
cor_ratio <- cor(alumni$student_faculty_ratio, alumni$alumni_giving_rate)

cat("Correlation coefficient between percent_of_classes_under_20 and alumni_giving_rate (r_1):", cor_percent, "\n")

## Correlation coefficient between percent_of_classes_under_20 and alumni_giving_rate (r_1): 0.6456504

cat("Correlation coefficient between student_faculty_ratio and alumni_giving_rate (r_2):", cor_ratio, "\n")

## Correlation coefficient between student_faculty_ratio and alumni_giving_rate (r_2): -0.7423975

#The correlation coefficient between percent of classes under 20 and alumni giving rate (r1) is 0.6456504.  As it is a moderately positive value, we can expect a positive linear relationship between percent of classes under 20 and donation rate.  The correlation coefficient between student faculty ratio and alumni giving rate (r2) is -0.7423975.  As a strong negative value, we can expect a strong negative relationship between student faculty ratio and alumni giving rate.   The coefficient of determination (R2)  is 0.5613, and as stated before, shows how much the giving rate is effected by either of the variables.   Using these r values, we receive an unexpected equation as R2 should equal the square root of (r2)^2 + (r2)^2.  We would expect the value to be 0.983879.  Because of this inconsistency, we must do additional analysis to confirm results.

# Question 1 ----------------------------------------------------------------------
model <- lm(alumni_giving_rate ~ percent_of_classes_under_20 + student_faculty_ratio, data = alumni)
summary(model)

## 
## Call:
## lm(formula = alumni_giving_rate ~ percent_of_classes_under_20 + 
##     student_faculty_ratio, data = alumni)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -15.00  -6.57  -1.95   4.42  24.56 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                  39.6556    13.5076   2.936 0.005225 ** 
## percent_of_classes_under_20   0.1662     0.1626   1.022 0.312128    
## student_faculty_ratio        -1.7021     0.4421  -3.850 0.000371 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9.098 on 45 degrees of freedom
## Multiple R-squared:  0.5613, Adjusted R-squared:  0.5418 
## F-statistic: 28.79 on 2 and 45 DF,  p-value: 8.869e-09

#a. 
plot(model, which = 2)

qqnorm(resid(model))
qqline(resid(model))

#b. 
plot(model, which = 3)

#c. 
plot(model, which = 5)

plot(model, which = 4)

#d. 
library(car)

## Warning: package 'car' was built under R version 4.3.2

## Loading required package: carData

## Warning: package 'carData' was built under R version 4.3.2

## 
## Attaching package: 'car'

## The following object is masked from 'package:dplyr':
## 
##     recode

## The following object is masked from 'package:purrr':
## 
##     some

vif(model)

## percent_of_classes_under_20       student_faculty_ratio 
##                    2.611671                    2.611671

#e. 
new_data <- data.frame(percent_of_classes_under_20 = 40, student_faculty_ratio = 5)
A4_predicted_alumni_giving <- predict(model, newdata = new_data)
cat("Predicted Alumni Giving Rate X1=40, X2=5:", A4_predicted_alumni_giving)

## Predicted Alumni Giving Rate X1=40, X2=5: 37.79178

Final Linear Regression

Danielle Robben

2023-11-19