# Switch to the author's local project folder only when it exists. An
# unconditional setwd() to a user-specific absolute path stops the script with
# an error on every other machine; the data used below are read from a URL.
project_dir <- "C:/Users/drobb/Desktop/Linear Regression"
if (dir.exists(project_dir)) {
  setwd(project_dir)
}
library(ggplot2)
# Question 1 ----------------------------------------------------------------------
# Download the alumni data set and inspect its structure and summary statistics.
url <- "https://bgreenwell.github.io/uc-bana7052/data/alumni.csv"
alumni <- read.csv(url)
str(alumni) # print structure of the alumni data frame
summary(alumni)
# Note: the response variable is the alumni giving rate and the predictor
# variable is the percentage of classes with fewer than 20 students.
summary(alumni$percent_of_classes_under_20)
summary(alumni$alumni_giving_rate)
# Scatter plot of the response against the predictor.
ggplot(
  alumni,
  aes(x = percent_of_classes_under_20, y = alumni_giving_rate)
) +
  geom_point() +
  labs(
    title = "Alumni Giving Rate to Percent of classes under 20",
    x = "% of Classes Under 20",
    y = "Alumni Giving Rate"
  )
# find_outliers <- function(variable) {
# q <- quantile(variable)
# iqr <- IQR(variable)
# lower_limit <- q[2] - 1.5 * iqr
# upper_limit <- q[4] +1.5 * iqr
# return(variable[variable < lower_limit | variable > upper_limit])
# }
#
#
# outliers_under_20 <- boxplot.stats(alumni$percent_of_classes_under_20)$out
# outliers_giving_rate <- boxplot.stats(alumni$alumni_giving_rate)$out
# Correlation between the predictor and the response.
correlation_coefficient <- cor(
  alumni$percent_of_classes_under_20,
  alumni$alumni_giving_rate
)
# Simple linear regression of giving rate on % of classes under 20.
model <- lm(alumni_giving_rate ~ percent_of_classes_under_20, data = alumni)
coefficients(model)
# find Outliers
# A row is flagged when its residual (observed minus fitted giving rate) lies
# more than 1.96 residual standard deviations from zero. Comparing the raw
# response with mean(response) +/- 1.96 * sd(residuals) would mix the marginal
# centre with the conditional spread and ignore the fitted line entirely.
residuals <- residuals(model)
# Lower and upper residual cut-offs (a cut-off band, not a true confidence
# interval; the name is kept for compatibility).
confidence_interval <- c(-1.96, 1.96) * sd(residuals)
is_outlier <- residuals < confidence_interval[1] |
  residuals > confidence_interval[2]
# Select by row name so the match stays correct even if lm() dropped rows
# containing missing values.
outliers <- alumni[names(residuals)[is_outlier], , drop = FALSE]
print(outliers)
# Linear Regression: an earlier run recorded the fit as
# -7.2860676 + 0.6577686x. The title is built from the fitted coefficients so
# the printed equation always matches `model` instead of a hand-typed value.
fit_coefs <- coefficients(model)
slope_sign <- if (fit_coefs[[2]] < 0) "-" else "+"
regression_label <- sprintf(
  "Y = %.2f %s %.2fX",
  fit_coefs[[1]], slope_sign, abs(fit_coefs[[2]])
)
# Scatter plot with the least-squares line and an annotated title.
ggplot(
  alumni,
  aes(x = percent_of_classes_under_20, y = alumni_giving_rate)
) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE, color = "blue") +
  labs(
    title = paste0(
      "Alumni Giving Rate to Percent of Classes Under 20\n",
      "\nCorrelation Coefficient: ", round(correlation_coefficient, 2),
      "\nRegression Equation: ", regression_label
    ),
    x = "% of Classes Under 20",
    y = "Alumni Giving Rate"
  )
# Question 2 ----------------------------------------------------------------------
# Simulate n observations from Y = 10 + 5 * X + error with a fixed seed so the
# results are reproducible. X is drawn before the error term; keep that order,
# because swapping the draws changes the simulated data.
set.seed(7052)
n <- 100
X <- rnorm(n, mean = 2, sd = 0.1)
error <- rnorm(n, mean = 0, sd = 0.5)
Y <- 10 + 5 * X + error
# Numerical and graphical summaries of the simulated variables.
summary(X)
summary(Y)
boxplot(X, Y, names = c("X", "Y"), main = "Question 2 Boxplot")
correlation_coefficient_2 <- cor(X, Y)
plot(X, Y, main = "Question 2 Scatter Plot", xlab = "X", ylab = "Y")
# Least-squares fit of the simulated response on the simulated predictor.
model_Q2 <- lm(Y ~ X)
coefficients(model_Q2)
# Mean squared residual of the Question 2 fit. This must use model_Q2; using
# `model` would report the error of the Question 1 alumni regression instead.
# Note: this divides by n; dividing the residual sum of squares by
# df.residual(model_Q2) gives the unbiased estimate of the error variance.
mse_Q2 <- mean(residuals(model_Q2)^2)
mse_Q2
mean_X_Q2 <- mean(X)
mean_Y_Q2 <- mean(Y)
# Scatter plot with the fitted line (red) and the point of means (blue).
plot(X, Y, main = "Question 2 Plot", xlab = "X", ylab = "Y")
# Draw the fitted values in increasing order of X so the line is traced once
# from left to right rather than zig-zagging between unsorted points.
fit_order <- order(X)
lines(X[fit_order], fitted(model_Q2)[fit_order], col = "red")
points(mean_X_Q2, mean_Y_Q2, col = "blue", pch = 19)
# Question 3 ----------------------------------------------------------------------
# Refit Y on X (the objects created in Question 2) with two alternatives to
# ordinary least squares and print a summary of each fit.
library(MASS)
# NOTE(review): with default arguments MASS::rlm presumably performs Huber
# M-estimation rather than least absolute deviations - confirm that the name
# `lad_model` matches the intended method.
lad_model <- rlm(Y ~ X)
summary(lad_model)
library(quantreg)
# NOTE(review): rq() is called without `tau`; its default is presumably 0.5
# (median regression, i.e. the LAD fit) - confirm against the quantreg docs.
lad_quantile_model <- rq(Y ~ X)
summary(lad_quantile_model)