# Loading the data
setwd("~/Desktop/R Materials/mih140/Lecture 9(10) - Linear Regression I")
cba = read.table("cba_admissions_1999.txt", sep = "\t", header = T, quote = "", allowEscapes = T)
# Cleaning out the NA's
cba = cba[!is.na(cba$SAT_math), ]
cba$rank = cba$HS_rank/cba$HS_class_size # make new feature called "rank"
cba = cba[!is.na(cba$rank),] # prune students w/out a rank
# Examine the data for correlations
library(gpairs)
gpairs(cba[,c("SAT_math", "SAT_verbal", "rank")])
# Lets start with a big model
# Features we want to predict/explain: rank
# Features we'll use to do that: SAT scores, gender, from PA?, scholarship status
full_model = lm(data = cba, rank ~ SAT_verbal + SAT_math + Max_Test_Score + Female + from_PA + scholarship_yes_no + Max_Test_Score:scholarship_yes_no)
summary(full_model) # R^2 = .2646
##
## Call:
## lm(formula = rank ~ SAT_verbal + SAT_math + Max_Test_Score +
## Female + from_PA + scholarship_yes_no + Max_Test_Score:scholarship_yes_no,
## data = cba)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.29283 -0.09366 -0.01340 0.08266 0.38522
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.3214534 0.0767938 4.186 3.21e-05 ***
## SAT_verbal 0.0001585 0.0002693 0.588 0.556450
## SAT_math 0.0001136 0.0002590 0.439 0.661017
## Max_Test_Score -0.0001186 0.0002622 -0.452 0.651291
## Female -0.0684513 0.0105166 -6.509 1.47e-10 ***
## from_PA -0.0498007 0.0133036 -3.743 0.000197 ***
## scholarship_yes_no 0.0925036 0.1456863 0.635 0.525673
## Max_Test_Score:scholarship_yes_no -0.0002102 0.0001174 -1.790 0.073825 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1313 on 683 degrees of freedom
## Multiple R-squared: 0.2646, Adjusted R-squared: 0.2571
## F-statistic: 35.11 on 7 and 683 DF, p-value: < 2.2e-16
# Size of the model? 7
# install.packages("leaps")
library(leaps)
candidate_models = regsubsets(data = cba, rank ~ SAT_verbal + SAT_math + Max_Test_Score + Female + from_PA + scholarship_yes_no + Max_Test_Score:scholarship_yes_no)
summary(candidate_models)
## Subset selection object
## Call: regsubsets.formula(data = cba, rank ~ SAT_verbal + SAT_math +
## Max_Test_Score + Female + from_PA + scholarship_yes_no +
## Max_Test_Score:scholarship_yes_no)
## 7 Variables (and intercept)
## Forced in Forced out
## SAT_verbal FALSE FALSE
## SAT_math FALSE FALSE
## Max_Test_Score FALSE FALSE
## Female FALSE FALSE
## from_PA FALSE FALSE
## scholarship_yes_no FALSE FALSE
## Max_Test_Score:scholarship_yes_no FALSE FALSE
## 1 subsets of each size up to 7
## Selection Algorithm: exhaustive
## SAT_verbal SAT_math Max_Test_Score Female from_PA scholarship_yes_no
## 1 ( 1 ) " " " " " " " " " " " "
## 2 ( 1 ) " " " " " " "*" " " " "
## 3 ( 1 ) " " " " " " "*" "*" " "
## 4 ( 1 ) " " " " " " "*" "*" "*"
## 5 ( 1 ) "*" " " " " "*" "*" "*"
## 6 ( 1 ) "*" " " "*" "*" "*" "*"
## 7 ( 1 ) "*" "*" "*" "*" "*" "*"
## Max_Test_Score:scholarship_yes_no
## 1 ( 1 ) "*"
## 2 ( 1 ) "*"
## 3 ( 1 ) "*"
## 4 ( 1 ) "*"
## 5 ( 1 ) "*"
## 6 ( 1 ) "*"
## 7 ( 1 ) "*"
plot(candidate_models)
subset_model = lm(data = cba, rank ~ Female + from_PA + scholarship_yes_no:Max_Test_Score) # Note this model only uses SAT scores if the student recieved a scholarship, otherwise it just makes a mean prediction from demographic data. Can you think of any explanation why this might be?
Final model: \(\text{rank} = .34 - .069*Female - .05*\text{from_PA} - .00014*\text{Max_Test_Score}*\text{scholarship_yes_no}\)
Seems like a sensible model. Intercept rank is .34, i.e. top third of class. Being from PA for which Pitt has strong draw sees an improvement in class rank by .05. Female applicants apparently also have stronger class performance improving the class rank by ~.07. Then, if a student won a scholarship, there is a linear improvement in class rank based on how high their best score was.