Introduction

The purpose of this data analysis project is to analyze and predict the chance of admission into a Master's graduate program, and to observe how important each of the application variables is to that chance.

The dataset contains several variables that are considered important when applying to Master's programs. The variables included are:

- GRE Scores (out of 340)
- TOEFL Scores (out of 120)
- University Rating (out of 5)
- Statement of Purpose Strength (out of 5)
- Letter of Recommendation Strength (out of 5)
- Undergraduate GPA (out of 10)
- Research Experience (either 0 or 1)
- Chance of Admit (ranging from 0 to 1)

Dataset: https://www.kaggle.com/mohansacharya/graduate-admissions/home

# Load the admissions dataset into R.

# NOTE: the working directory is a machine-specific absolute path; adjust it
# before running this script on another computer.
setwd('C:/Users/Christopher Fleming/Desktop/PowerBI')
Admissions <- read.csv('AdmissionsData.csv')

# View() opens a spreadsheet-style viewer, which is only meaningful in an
# interactive session, so skip it when the script is run non-interactively.
if (interactive()) {
  View(Admissions)
}

# attach() places a snapshot of the columns on the search path so they can be
# referenced by bare name (e.g. GRE.Score). The snapshot is not refreshed when
# Admissions is modified afterwards.
attach(Admissions)
library(GGally)

## Warning: package 'GGally' was built under R version 3.6.2

## Loading required package: ggplot2

## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2

library(ggplot2)
library(ggcorrplot)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following object is masked from 'package:GGally':
## 
##     nasa

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(tidyr)
library(tidyverse)

## Warning: package 'tidyverse' was built under R version 3.6.2

## -- Attaching packages ----------------------------------------------------- tidyverse 1.3.0 --

## v tibble  2.1.3     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.4.0
## v purrr   0.3.3

## -- Conflicts -------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(caret)

## Loading required package: lattice

## 
## Attaching package: 'caret'

## The following object is masked from 'package:purrr':
## 
##     lift

library(glmnet)

## Loading required package: Matrix

## 
## Attaching package: 'Matrix'

## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack

## Loaded glmnet 3.0-1

# Drop the first column of the data frame and keep every remaining column
# (assumes column 1 is a row identifier rather than a predictor -- TODO confirm)
Admissions <- Admissions[, seq_len(ncol(Admissions))[-1]]

Summary Statistics

This section provides the average, minimum, maximum, and standard deviation of each variable.

# Data summary
# Averages: the mean of every variable, collected into a one-row data frame.
# Columns are looked up in Admissions explicitly via with() instead of through
# the search path, so the result always reflects the current data frame.

Averages <- with(
  Admissions,
  data.frame(
    MeanGRE = mean(GRE.Score),
    MeanTOEFL = mean(TOEFL.Score),
    MeanRating = mean(University.Rating),
    MeanSOP = mean(SOP),
    MeanLOR = mean(LOR),
    MeanCGPA = mean(CGPA),
    MeanResearch = mean(Research),
    MeanChance = mean(Chance.of.Admit)
  )
)

Averages

##   MeanGRE MeanTOEFL MeanRating MeanSOP MeanLOR MeanCGPA MeanResearch MeanChance
## 1 316.472   107.192      3.114   3.374   3.484  8.57644         0.56    0.72174

# Min: the smallest observed value of every variable, as a one-row data frame.
# Columns are looked up in Admissions explicitly via with() instead of through
# the search path, so the result always reflects the current data frame.

Minimum <- with(
  Admissions,
  data.frame(
    MinGRE = min(GRE.Score),
    MinTOEFL = min(TOEFL.Score),
    MinRating = min(University.Rating),
    MinSOP = min(SOP),
    MinLOR = min(LOR),
    MinCGPA = min(CGPA),
    MinResearch = min(Research),
    MinChance = min(Chance.of.Admit)
  )
)

Minimum

##   MinGRE MinTOEFL MinRating MinSOP MinLOR MinCGPA MinResearch MinChance
## 1    290       92         1      1      1     6.8           0      0.34

# Max: the largest observed value of every variable, as a one-row data frame.
# Columns are looked up in Admissions explicitly via with() instead of through
# the search path, so the result always reflects the current data frame.

Maximum <- with(
  Admissions,
  data.frame(
    MaxGRE = max(GRE.Score),
    MaxTOEFL = max(TOEFL.Score),
    MaxRating = max(University.Rating),
    MaxSOP = max(SOP),
    MaxLOR = max(LOR),
    MaxCGPA = max(CGPA),
    MaxResearch = max(Research),
    MaxChance = max(Chance.of.Admit)
  )
)

Maximum

##   MaxGRE MaxTOEFL MaxRating MaxSOP MaxLOR MaxCGPA MaxResearch MaxChance
## 1    340      120         5      5      5    9.92           1      0.97

# Standard deviation of every variable, as a one-row data frame.
# Columns are looked up in Admissions explicitly via with() instead of through
# the search path, so the result always reflects the current data frame.
Std.Deviation <- with(
  Admissions,
  data.frame(
    SdGRE = sd(GRE.Score),
    SdTOEFL = sd(TOEFL.Score),
    SdRating = sd(University.Rating),
    SdSOP = sd(SOP),
    SdLOR = sd(LOR),
    SdCGPA = sd(CGPA),
    SdResearch = sd(Research),
    SdChance = sd(Chance.of.Admit)
  )
)

# Display the table, matching the other summary tables in this section
Std.Deviation

Distribution Analysis

This section examines the spread and distribution of each variable. I will be looking to see whether each variable follows an approximately normal distribution, to help identify whether the assumptions of linear regression are violated.

# Data distributions
# GRE scores: histogram with 5-point bins covering 290 through 340
ggplot(data = Admissions, mapping = aes(x = GRE.Score)) +
  geom_histogram(
    breaks = seq(from = 290, to = 340, by = 5),
    fill = "lightblue",
    color = "darkblue"
  )

# TOEFL scores: one bin per point instead of the arbitrary 30-bin default that
# stat_bin() otherwise falls back on
# (assumes scores are recorded as whole numbers -- TODO confirm)
ggplot(Admissions, aes(TOEFL.Score)) +
  geom_histogram(color = "darkblue",
                 fill = "lightblue",
                 binwidth = 1)

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# University rating: one bin per rating level instead of the arbitrary 30-bin
# default, which leaves most bins empty for a 1-to-5 scale
# (assumes ratings are whole numbers -- TODO confirm)
ggplot(Admissions, aes(University.Rating)) +
  geom_histogram(color = "darkblue",
                 fill = "lightblue",
                 binwidth = 1)

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Statement of purpose strength: explicit bin width instead of the arbitrary
# 30-bin default
# (0.5 assumes strengths are recorded in half-point steps -- TODO confirm)
ggplot(Admissions, aes(SOP)) +
  geom_histogram(color = "darkblue",
                 fill = "lightblue",
                 binwidth = 0.5)

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Letter of recommendation strength: explicit bin width instead of the
# arbitrary 30-bin default
# (0.5 assumes strengths are recorded in half-point steps -- TODO confirm)
ggplot(Admissions, aes(LOR)) +
  geom_histogram(color = "darkblue",
                 fill = "lightblue",
                 binwidth = 0.5)

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Undergraduate GPA: explicit 0.2-point bins instead of the arbitrary 30-bin
# default (observed values run from 6.8 to 9.92)
ggplot(Admissions, aes(CGPA)) +
  geom_histogram(color = "darkblue",
                 fill = "lightblue",
                 binwidth = 0.2)

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Chance of admit: explicit 0.05-wide bins instead of the arbitrary 30-bin
# default (observed values run from 0.34 to 0.97)
ggplot(Admissions, aes(Chance.of.Admit)) +
  geom_histogram(color = "darkblue",
                 fill = "lightblue",
                 binwidth = 0.05)

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Correlation Plots

# Pairwise correlation matrix of every column in Admissions
CORR <- cor(Admissions)

# Draw the full matrix as labelled squares on a black-and-white theme
ggcorrplot(
  corr = CORR,
  method = "square",
  type = "full",
  ggtheme = theme_bw,
  colors = c("red", "white", "darkgreen"),
  lab = TRUE,
  lab_size = 2.75
)

# Pairs-plot matrix across all columns of Admissions, with axis labels shown
# and the facet strips switched on both axes

ggpairs(
  data = Admissions,
  axisLabels = "show",
  switch = "both"
)

Scatterplots

This section plots the response variable (Chance of Admittance) against each of the predictor variables individually.

# Regression plots
# Reshape to long form (one row per observation/predictor pair) with
# pivot_longer(), the current replacement for the superseded gather(), then
# draw one panel per predictor with a fitted least-squares line.
Admissions %>%
  pivot_longer(
    cols = -Chance.of.Admit,
    names_to = "var",
    values_to = "value"
  ) %>%
  ggplot(aes(value, Chance.of.Admit)) +
  geom_point() +
  facet_wrap(~var, scales = "free") +
  geom_smooth(method = "lm")

Linear Regression

# Run regression
# Multiple linear regression of the chance of admission on all seven
# predictors. data = Admissions binds the formula's variables to the data frame
# explicitly, so the fit does not depend on what is on the search path.
Fit <- lm(
  Chance.of.Admit ~ GRE.Score + TOEFL.Score + SOP +
    University.Rating + LOR + CGPA + Research,
  data = Admissions
)
summary(Fit)

## 
## Call:
## lm(formula = Chance.of.Admit ~ GRE.Score + TOEFL.Score + SOP + 
##     University.Rating + LOR + CGPA + Research)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.266657 -0.023327  0.009191  0.033714  0.156818 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       -1.2757251  0.1042962 -12.232  < 2e-16 ***
## GRE.Score          0.0018585  0.0005023   3.700 0.000240 ***
## TOEFL.Score        0.0027780  0.0008724   3.184 0.001544 ** 
## SOP                0.0015861  0.0045627   0.348 0.728263    
## University.Rating  0.0059414  0.0038019   1.563 0.118753    
## LOR                0.0168587  0.0041379   4.074 5.38e-05 ***
## CGPA               0.1183851  0.0097051  12.198  < 2e-16 ***
## Research           0.0243075  0.0066057   3.680 0.000259 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.05999 on 492 degrees of freedom
## Multiple R-squared:  0.8219, Adjusted R-squared:  0.8194 
## F-statistic: 324.4 on 7 and 492 DF,  p-value: < 2.2e-16

Variance Inflation Factor

# Variance inflation factor of each predictor in Fit, to screen for multicollinearity
car::vif(mod = Fit)

##         GRE.Score       TOEFL.Score               SOP University.Rating 
##          4.464249          3.904213          2.835210          2.621036 
##               LOR              CGPA          Research 
##          2.033555          4.777992          1.494008

#Possible multicollinearity between TOEFL.Score, GRE.Score, and CGPA.
#Students who scored better on the GRE tend to have scored really well on the TOEFL
#and to have higher GPAs. It might be worthwhile to remove two of these variables
#to allow the regression model to perform better.

Regression Residual Plots

# Diagnostic plots for the fitted model; the Q-Q plot indicates left skew
plot(x = Fit)

Ridge Regression

# Prepare data: 75% / 25% train-test split
# Fix the RNG seed so the split, and the results that depend on it, can be
# reproduced from run to run.
set.seed(2019)

# Split by row index rather than by anti-joining on values: a value-based
# anti-join also removes from the test set any held-out row that happens to be
# identical to a training row.
train_rows <- sample(nrow(Admissions), size = round(0.75 * nrow(Admissions)))
trainsample <- Admissions[train_rows, ]
testsample <- Admissions[setdiff(seq_len(nrow(Admissions)), train_rows), ]

## Joining, by = c("GRE.Score", "TOEFL.Score", "University.Rating", "SOP", "LOR",
## "CGPA", "Research", "Chance.of.Admit")

# Inputs for glmnet: y is the training response; the predictor matrices come
# from model.matrix() with its first column removed
y <- trainsample[["Chance.of.Admit"]]
xridgetrain <- model.matrix(Chance.of.Admit ~ ., data = trainsample)[, -1]
xridgetest <- model.matrix(Chance.of.Admit ~ ., data = testsample)[, -1]

# Ridge regression (alpha = 0)
# Cross-validate over the penalty on the training data ...
cvalridge <- cv.glmnet(x = xridgetrain, y = y, alpha = 0)

# ... then refit a single model at the cross-validated lambda.1se value
modelridge <- glmnet(
  x = xridgetrain,
  y = y,
  lambda = cvalridge[["lambda.1se"]],
  alpha = 0
)

# Make predictions on the held-out rows

predictionsridge <- predict(modelridge, newx = xridgetest)

# The predictions come back as a one-column matrix whose column is named "s0".
# Passing that matrix to R2() makes data.frame() adopt "s0" as the column name
# and silently discard the name given in the call, which then forces a rename.
# Flattening to a plain vector keeps the declared column names, so the table is
# built correctly in one step.
dfridge <- data.frame(
  RMSE = RMSE(as.vector(predictionsridge), testsample$Chance.of.Admit),
  Rsquared = R2(as.vector(predictionsridge), testsample$Chance.of.Admit)
)

# Fitted ridge coefficients
coef(modelridge)

## 8 x 1 sparse Matrix of class "dgCMatrix"
##                             s0
## (Intercept)       -1.044126417
## GRE.Score          0.002220249
## TOEFL.Score        0.003375483
## University.Rating  0.010240850
## SOP                0.011813694
## LOR                0.017038986
## CGPA               0.064507984
## Research           0.027401285

# Show the ridge model's RMSE and R-squared computed against testsample
dfridge

##         RMSE  Rsquared
## 1 0.06309443 0.7879626

Lasso Regression

# Lasso regression (alpha = 1), reusing the same training matrix and response
# Cross-validate over the penalty on the training data ...
cvlasso <- cv.glmnet(x = xridgetrain, y = y, alpha = 1)

# ... then refit a single model at the cross-validated lambda.1se value
modellasso <- glmnet(
  x = xridgetrain,
  y = y,
  lambda = cvlasso[["lambda.1se"]],
  alpha = 1
)



# Predictions on the held-out rows
predictionslasso <- predict(modellasso, newx = xridgetest)

# The predictions come back as a one-column matrix whose column is named "s0".
# Passing that matrix to R2() makes data.frame() adopt "s0" as the column name
# and silently discard the name given in the call, which then forces a rename.
# Flattening to a plain vector keeps the declared column names, so the table is
# built correctly in one step.
dflasso <- data.frame(
  RMSE = RMSE(as.vector(predictionslasso), testsample$Chance.of.Admit),
  Rsquared = R2(as.vector(predictionslasso), testsample$Chance.of.Admit)
)

# Fitted lasso coefficients
coef(modellasso)

## 8 x 1 sparse Matrix of class "dgCMatrix"
##                              s0
## (Intercept)       -1.2168372457
## GRE.Score          0.0020934508
## TOEFL.Score        0.0009631816
## University.Rating  .           
## SOP                .           
## LOR                0.0079328697
## CGPA               0.1327655573
## Research           0.0094520469

# Show the lasso model's RMSE and R-squared computed against testsample
dflasso

##         RMSE  Rsquared
## 1 0.06668545 0.7629921

Conclusion

Citation: Mohan S Acharya, Asfia Armaan, Aneeta S Antony : A Comparison of Regression Models for Prediction of Graduate Admissions, IEEE International Conference on Computational Intelligence in Data Science 2019

Graduate Admissions Exploratory Data Analysis

Christopher Fleming

December 30, 2019