url <- "https://bgreenwell.github.io/uc-bana7052/data/alumni.csv"
alumni <- read.csv(url)
dim(alumni)
## [1] 48 5
alumni <- alumni[, 2:5]  # keep only the four modeling variables (drop the first, identifier-type column)
sapply(alumni, class)
## percent_of_classes_under_20 student_faculty_ratio
## "integer" "integer"
## alumni_giving_rate private
## "integer" "integer"
alumni$private <- as.factor(alumni$private)  # treat the 0/1 private indicator as a factor
summary(alumni)
## percent_of_classes_under_20 student_faculty_ratio alumni_giving_rate
## Min. :29.00 Min. : 3.00 Min. : 7.00
## 1st Qu.:44.75 1st Qu.: 8.00 1st Qu.:18.75
## Median :59.50 Median :10.50 Median :29.00
## Mean :55.73 Mean :11.54 Mean :29.27
## 3rd Qu.:66.25 3rd Qu.:13.50 3rd Qu.:38.50
## Max. :77.00 Max. :23.00 Max. :67.00
## private
## 0:15
## 1:33
boxplot(alumni[, 1:3])  # side-by-side boxplots of the three numeric variables (private is a factor)
pairs(alumni, cex = 1.2, pch = 19, col = adjustcolor("darkred", alpha.f = 0.5))
GGally::ggpairs(alumni)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
alumni %>%
  lm(alumni_giving_rate ~ student_faculty_ratio + percent_of_classes_under_20, data = .) %>%
  vif()
## student_faculty_ratio percent_of_classes_under_20
## 2.611671 2.611671
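Both VIFs are identical because the model has only two predictors: each equals 1 / (1 - r^2), where r is the pairwise correlation between the two predictors. A quick check of this relationship (a sketch added here for illustration; its output is not part of the original report):

r <- with(alumni, cor(student_faculty_ratio, percent_of_classes_under_20))
1 / (1 - r^2)  # should match the VIF of about 2.61 reported above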
For model selection, we consider all models ranging from the intercept-only model up to all three-way interactions, using both AIC and BIC with three search strategies: forward selection, backward elimination, and stepwise selection.
Code:
fit_min <- lm(alumni_giving_rate ~ 1, data = alumni)      # intercept-only model
fit_max_2 <- lm(alumni_giving_rate ~ .^3, data = alumni)  # all main effects plus 2- and 3-way interactions

# Forward selection and backward elimination with the BIC penalty (k = log(n))
fs_1 <- step(fit_min, direction = "forward",
             scope = list(lower = fit_min, upper = fit_max_2),
             trace = 0, k = log(nrow(alumni)))
be_1 <- step(fit_max_2, direction = "backward",
             trace = 0, k = log(nrow(alumni)))

# Backward elimination with the AIC penalty (k = 2)
be_2 <- step(fit_max_2, direction = "backward",
             trace = 0, k = 2)

# Stepwise selection (both directions) with the BIC penalty, starting from be_2
ss_1 <- step(be_2, direction = "both",
             scope = list(lower = fit_min, upper = fit_max_2),
             trace = 0, k = log(nrow(alumni)))

# Forward and stepwise selection with the AIC penalty (k = 2)
fs_2 <- step(fit_min, direction = "forward",
             scope = list(lower = fit_min, upper = fit_max_2),
             trace = 0, k = 2)
ss_2 <- step(be_2, direction = "both",
             scope = list(lower = fit_min, upper = fit_max_2),
             trace = 0, k = 2)
# PRESS statistic (sum of squared leave-one-out prediction errors) for one or more fitted models
PRESS <- function(object, ...) {
  if (!missing(...)) {
    res <- sapply(list(object, ...), FUN = function(x) {
      sum(rstandard(x, type = "predictive") ^ 2)
    })
    names(res) <- as.character(match.call()[-1L])
    res
  } else {
    sum(rstandard(object, type = "predictive") ^ 2)
  }
}
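The type = "predictive" residuals returned by rstandard() are the PRESS residuals e_i / (1 - h_ii), so PRESS() sums the squared leave-one-out prediction errors. The same value can be computed directly from the ordinary residuals and hat values (a sketch for illustration; fit_chk is a throwaway name not used elsewhere):

fit_chk <- lm(alumni_giving_rate ~ student_faculty_ratio, data = alumni)
sum((residuals(fit_chk) / (1 - hatvalues(fit_chk)))^2)  # equals PRESS(fit_chk)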
# Comparison metrics (AIC, BIC, adjusted R-squared, RMSE, PRESS, number of terms)
# for one or more fitted models
modelMetrics <- function(object, ...) {
  if (!missing(...)) {
    res <- sapply(list(object, ...), FUN = function(x) {
      c("AIC" = AIC(x), "BIC" = BIC(x),
        "adjR2" = summary(x)$adj.r.squared,
        "RMSE" = sigma(x), "PRESS" = PRESS(x),
        "nterms" = length(coef(x)))
    })
    colnames(res) <- as.character(match.call()[-1L])
    res
  } else {
    c("AIC" = AIC(object), "BIC" = BIC(object),
      "adjR2" = summary(object)$adj.r.squared,
      "RMSE" = sigma(object), "PRESS" = PRESS(object),
      "nterms" = length(coef(object)))
  }
}
res <- modelMetrics(be_1, fs_1, ss_1, be_2, fs_2, ss_2)
round(res, digits = 3)
## be_1 fs_1 ss_1 be_2 fs_2 ss_2
## AIC 352.196 352.196 352.196 351.365 351.137 351.365
## BIC 357.810 357.810 357.810 362.592 360.493 362.592
## adjR2 0.541 0.541 0.541 0.574 0.569 0.574
## RMSE 9.103 9.103 9.103 8.768 8.829 8.768
## PRESS 4138.880 4138.880 4138.880 4020.749 4124.727 4020.749
## nterms 2.000 2.000 2.000 5.000 4.000 5.000
Based on the above metrics, mainly PRESS and the adjusted R-squared value, we have two main candidates: the simpler model selected under BIC (be_1 / fs_1 / ss_1, "model 1") and the larger model selected under AIC (be_2 / ss_2, "model 2").
Considering the factors below, we choose model 1:
a. Model 1 is a very simple model with only one predictor variable.
b. There is not much improvement in adjusted R-squared as we include more variables and interactions between variables, as seen in model 2.
c. Model 2 does not capture the correct trend between the response and the predictor: the sign of the estimated coefficient of student_faculty_ratio is opposite to what we expect (see the check below).
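The sign reversal noted in (c) can be verified by comparing the student_faculty_ratio coefficient in the two candidate fits (a quick check added for illustration, assuming be_1 is the one-predictor model returned by the BIC searches; its output is not shown in this report):

coef(be_1)["student_faculty_ratio"]  # simple model: negative slope
coef(be_2)["student_faculty_ratio"]  # larger model: sign reverses, per the discussion above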
Therefore our final model is the simple linear regression of alumni_giving_rate on student_faculty_ratio.
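Refitting this model directly gives the estimated equation (a sketch; this refit is not shown in the original code, and fit_final is an illustrative name):

fit_final <- lm(alumni_giving_rate ~ student_faculty_ratio, data = alumni)
coef(fit_final)  # intercept and slope; the slope is the estimate interpreted in the conclusion below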
We look at several diagnostic plots to assess the adequacy of the fitted regression model.
yhat <- fitted(ss_2)
rstan <- rstandard(ss_2)
library(broom)
# Diagnostics data frame built with broom::augment() for the larger candidate model
alumni2 <- alumni %>%
  lm(alumni_giving_rate ~ percent_of_classes_under_20 + student_faculty_ratio + private +
       percent_of_classes_under_20:student_faculty_ratio, data = .) %>%
  augment() %>%
  mutate(row_num = 1:n())
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.6.3
# Fitted values vs. standardized residuals
ggplot(alumni2, aes(x = .fitted, y = .std.resid)) +
geom_point(alpha = 0.3) +
geom_hline(yintercept = 0, linetype = "dashed", color = "red2") +
geom_hline(yintercept = c(-2, 2), linetype = "dotted") +
# geom_smooth(color = "forestgreen", alpha = 0.1, se = FALSE) +
xlab("Fitted value") +
ylab("Standardized residual") +
theme_light()
#QQ plot
ggplot(alumni2, aes(sample = .std.resid)) +
geom_qq(alpha = 0.3) +
geom_qq_line(linetype = "dashed", color = "red2") +
xlab("Theoretical quantile") +
ylab("Sample quantile") +
theme_light()
ggplot(alumni2, aes(x = row_num, y = .std.resid)) +
geom_point(alpha = 0.3) +
geom_line() +
geom_hline(yintercept = 0, linetype = "dashed", color = "red2") +
xlab("Index") +
ylab("Standardized residual") +
theme_light()
ggplot(alumni2, aes(x = percent_of_classes_under_20, y = .std.resid)) +
geom_point(alpha = 0.3) +
geom_hline(yintercept = 0, linetype = "dashed", color = "red2") +
geom_hline(yintercept = c(-2, 2), linetype = "dotted") +
geom_smooth(color = "forestgreen", alpha = 0.1, se = FALSE) +
ylab("Standardized residual") +
theme_light()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
ggplot(alumni2, aes(x = student_faculty_ratio, y = .std.resid)) +
geom_point(alpha = 0.3) +
geom_hline(yintercept = 0, linetype = "dashed", color = "red2") +
geom_hline(yintercept = c(-2, 2), linetype = "dotted") +
geom_smooth(color = "forestgreen", alpha = 0.1, se = FALSE) +
ylab("Standardized residual") +
theme_light()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
ggplot(alumni2, aes(x = private, y = .std.resid)) +
geom_point(alpha = 0.3) +
geom_hline(yintercept = 0, linetype = "dashed", color = "red2") +
geom_hline(yintercept = c(-2, 2), linetype = "dotted") +
geom_smooth(color = "forestgreen", alpha = 0.1, se = FALSE) +
ylab("Standardized residual") +
theme_light()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
# High-leverage points: hat values compared against the 2p/n rule of thumb
h <- hatvalues(ss_2)
plot(h, type = "h", ylim = extendrange(h, f = 0.15))
abline(h = 2 * 5 / nrow(alumni), lty = "dotted")  # cutoff 2 * p / n, with p = 5 coefficients
text(h, labels = seq_len(nrow(alumni)), pos = 3, col = "red2")
Using linear regression, we developed a model to predict the alumni giving rate of universities. According to the final model, the alumni giving rate depends on the student-faculty ratio of the university: for every unit increase in the student-faculty ratio, the alumni giving rate decreases by 2.0572 percentage points on average. This suggests that universities should either admit fewer students or hire more faculty in order to maintain a lower student-faculty ratio, which in turn would increase the university's income from alumni giving.
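As a simple illustration (a sketch assuming the fit_final object from the refit above; the two ratios are made-up example values, not results from this report), the model can be used to compare predicted giving rates at different student-faculty ratios:

predict(fit_final, newdata = data.frame(student_faculty_ratio = c(10, 15)))  # predicted giving rates at ratios of 10 and 15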