Predicting_University

Reading Input:-

Reading the input csv file

df <- read.csv('Input_data/binary.csv')
head(df)

##   admit gre  gpa rank
## 1     0 380 3.61    3
## 2     1 660 3.67    3
## 3     1 800 4.00    1
## 4     1 640 3.19    4
## 5     0 520 2.93    4
## 6     1 760 3.00    2

str(df)

## 'data.frame':    400 obs. of  4 variables:
##  $ admit: int  0 1 1 1 0 1 1 0 1 0 ...
##  $ gre  : int  380 660 800 640 520 760 560 400 540 700 ...
##  $ gpa  : num  3.61 3.67 4 3.19 2.93 3 2.98 3.08 3.39 3.92 ...
##  $ rank : int  3 3 1 4 4 2 1 2 3 2 ...

Data Preprocessing :-

df$admitf <- factor(df$admit , labels = c('No Admission' , 'Give Admission' ))
df$rankf <- factor(df$rank , labels = c('Rank-1' , 'Rank-2' , 'Rank-3' , 'Rank-4'))
head(df)

##   admit gre  gpa rank         admitf  rankf
## 1     0 380 3.61    3   No Admission Rank-3
## 2     1 660 3.67    3 Give Admission Rank-3
## 3     1 800 4.00    1 Give Admission Rank-1
## 4     1 640 3.19    4 Give Admission Rank-4
## 5     0 520 2.93    4   No Admission Rank-4
## 6     1 760 3.00    2 Give Admission Rank-2

str(df)

## 'data.frame':    400 obs. of  6 variables:
##  $ admit : int  0 1 1 1 0 1 1 0 1 0 ...
##  $ gre   : int  380 660 800 640 520 760 560 400 540 700 ...
##  $ gpa   : num  3.61 3.67 4 3.19 2.93 3 2.98 3.08 3.39 3.92 ...
##  $ rank  : int  3 3 1 4 4 2 1 2 3 2 ...
##  $ admitf: Factor w/ 2 levels "No Admission",..: 1 2 2 2 1 2 2 1 2 1 ...
##  $ rankf : Factor w/ 4 levels "Rank-1","Rank-2",..: 3 3 1 4 4 2 1 2 3 2 ...

Ovrall Summary :-

summary(df)

##      admit             gre             gpa             rank      
##  Min.   :0.0000   Min.   :220.0   Min.   :2.260   Min.   :1.000  
##  1st Qu.:0.0000   1st Qu.:520.0   1st Qu.:3.130   1st Qu.:2.000  
##  Median :0.0000   Median :580.0   Median :3.395   Median :2.000  
##  Mean   :0.3175   Mean   :587.7   Mean   :3.390   Mean   :2.485  
##  3rd Qu.:1.0000   3rd Qu.:660.0   3rd Qu.:3.670   3rd Qu.:3.000  
##  Max.   :1.0000   Max.   :800.0   Max.   :4.000   Max.   :4.000  
##             admitf       rankf    
##  No Admission  :273   Rank-1: 61  
##  Give Admission:127   Rank-2:151  
##                       Rank-3:121  
##                       Rank-4: 67  
##                                   
##

There are no Null Values, So it is good to go for Exploratory Data Analysis.

Exploratory Data Analysis :-

Contionus variables :-

plot_box <- function(attr){
  colname <- eval(parse(text=paste('df$' ,attr , sep = '' )))
  boxplot(colname , horizontal = TRUE)
  remove(colname)
}

plot_hist <- function(attr){
  colname <- eval(parse(text=paste('df$' ,attr , sep = '' )))
  sub_text <- paste('Mean :' , round (mean(colname) , 2) ,
                    '\nMedian:', round (median(colname) , 2) ,
                    '\nStd. Dev:', round (sd(colname) , 2)
                    )
  hist(colname , main = attr , labels = TRUE , xlab = '' , sub=sub_text )
}
par(mfrow=c(2,2))

plot_box('gre')
plot_box('gpa')

plot_hist('gre')
plot_hist('gpa')

remove(plot_box , plot_hist )

Categorical variables :-

count_cross_table <- function( x , y){
  colname_x <- eval(parse(text=paste('df$' ,x , sep = '' )))
  colname_y <- eval(parse(text=paste('df$' ,y , sep = '' )))
  
  data <- xtabs(~colname_x + colname_y)
  data_total <- cbind(data , 'Total' = rowSums(data))
  data_total <- rbind(data_total , 'Total' = colSums(data_total))
  print(mosaicplot(data , shade = TRUE))
  print(data_total)
  remove(data , data_total , colname_x , colname_y)
}

count_cross_table('admitf' , 'rankf')

## NULL
##                Rank-1 Rank-2 Rank-3 Rank-4 Total
## No Admission       28     97     93     55   273
## Give Admission     33     54     28     12   127
## Total              61    151    121     67   400

remove(count_cross_table)

Fitting Logistic Model (TO predict Admission):-

fit <- glm(admitf ~ gre + gpa + rankf , data = df , family = binomial)
summary(fit)

## 
## Call:
## glm(formula = admitf ~ gre + gpa + rankf, family = binomial, 
##     data = df)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.6268  -0.8662  -0.6388   1.1490   2.0790  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -3.989979   1.139951  -3.500 0.000465 ***
## gre          0.002264   0.001094   2.070 0.038465 *  
## gpa          0.804038   0.331819   2.423 0.015388 *  
## rankfRank-2 -0.675443   0.316490  -2.134 0.032829 *  
## rankfRank-3 -1.340204   0.345306  -3.881 0.000104 ***
## rankfRank-4 -1.551464   0.417832  -3.713 0.000205 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 499.98  on 399  degrees of freedom
## Residual deviance: 458.52  on 394  degrees of freedom
## AIC: 470.52
## 
## Number of Fisher Scoring iterations: 4

Estimates of the fitted Logistic Model :-

data.frame(cbind("Coefficients" = coef(fit) , 
                 "Odd Ratios" = exp(coef(fit)) , 
                 exp(confint(fit))))

## Waiting for profiling to be done...

##             Coefficients Odd.Ratios      X2.5..   X97.5..
## (Intercept) -3.989979073  0.0185001 0.001889165 0.1665354
## gre          0.002264426  1.0022670 1.000137602 1.0044457
## gpa          0.804037549  2.2345448 1.173858216 4.3238349
## rankfRank-2 -0.675442928  0.5089310 0.272289674 0.9448343
## rankfRank-3 -1.340203916  0.2617923 0.131641717 0.5115181
## rankfRank-4 -1.551463677  0.2119375 0.090715546 0.4706961

Evaluation :-

Confusion Matrix :-

Preparing the Predicted column :-

df$predicted_admission <- ifelse ( fitted(fit) <= 0.5 ,
         'No Admission' ,
         'Give Admission')

head(df)

##   admit gre  gpa rank         admitf  rankf predicted_admission
## 1     0 380 3.61    3   No Admission Rank-3        No Admission
## 2     1 660 3.67    3 Give Admission Rank-3        No Admission
## 3     1 800 4.00    1 Give Admission Rank-1      Give Admission
## 4     1 640 3.19    4 Give Admission Rank-4        No Admission
## 5     0 520 2.93    4   No Admission Rank-4        No Admission
## 6     1 760 3.00    2 Give Admission Rank-2        No Admission

Preparing confusion Matrix :-

conf_mat <- table(Observed = df$admitf , Predicted = df$predicted_admission)

conf_mat <- cbind(conf_mat , "Total" = rowSums(conf_mat))
conf_mat <- rbind(conf_mat , "Total" = colSums(conf_mat))
data.frame(conf_mat)

##                Give.Admission No.Admission Total
## No Admission               19          254   273
## Give Admission             30           97   127
## Total                      49          351   400

Correct Classification Rate:-

(conf_mat[2] + conf_mat[4]) / conf_mat[9]

## [1] 0.71

remove(conf_mat)

Pseudo - R2 Values :-

pseudo_r2 <- function(x , nrow){
  stopifnot(inherits(x, "glm"))
  x_null<- update(x , . ~ 1)
  llv <- logLik(x)
  llo <- logLik(x_null)
  McFadden_R2 <- ( 1 - (llv / llo) )
  
  lo <- exp(llo)
  lv <- exp(llv)
  pow <- (2/nrow)
  cox_snell_R2 <- lo/lv
  cox_snell_R2 <- cox_snell_R2 ^ pow
  cox_snell_R2 <- (1 - cox_snell_R2)
  
  r2max <- lo ^pow
  r2max <- (1 - r2max)
  nagelkerke_R2 <- (cox_snell_R2 / r2max )
  return(data.frame( "McFadden_R2" = McFadden_R2 ,
            "cox_snell_R2" = cox_snell_R2,
            "nagelkerke_R2" = nagelkerke_R2
            ))
  remove(llv,llo,McFadden_R2 , lo , lv , cox_snell_R2 , r2max , nagelkerke_R2 , pow , fit_null)
}


pseudo_r2(fit , nrow(df))

##   McFadden_R2 cox_snell_R2 nagelkerke_R2
## 1  0.08292194   0.09845702     0.1379958

Conclusion :-

Eventhought the confusion matrix is giving 71 % correct Classificaiton Rate, the pseudo R2 values are very very less (almost near to zero). So this model is not perfect.

Predicting_University_admission

Anil Kumar Kanasani

9 June 2020