Reading the input csv file
df <- read.csv('Input_data/binary.csv')
head(df)
## admit gre gpa rank
## 1 0 380 3.61 3
## 2 1 660 3.67 3
## 3 1 800 4.00 1
## 4 1 640 3.19 4
## 5 0 520 2.93 4
## 6 1 760 3.00 2
str(df)
## 'data.frame': 400 obs. of 4 variables:
## $ admit: int 0 1 1 1 0 1 1 0 1 0 ...
## $ gre : int 380 660 800 640 520 760 560 400 540 700 ...
## $ gpa : num 3.61 3.67 4 3.19 2.93 3 2.98 3.08 3.39 3.92 ...
## $ rank : int 3 3 1 4 4 2 1 2 3 2 ...
df$admitf <- factor(df$admit , labels = c('No Admission' , 'Give Admission' ))
df$rankf <- factor(df$rank , labels = c('Rank-1' , 'Rank-2' , 'Rank-3' , 'Rank-4'))
head(df)
## admit gre gpa rank admitf rankf
## 1 0 380 3.61 3 No Admission Rank-3
## 2 1 660 3.67 3 Give Admission Rank-3
## 3 1 800 4.00 1 Give Admission Rank-1
## 4 1 640 3.19 4 Give Admission Rank-4
## 5 0 520 2.93 4 No Admission Rank-4
## 6 1 760 3.00 2 Give Admission Rank-2
str(df)
## 'data.frame': 400 obs. of 6 variables:
## $ admit : int 0 1 1 1 0 1 1 0 1 0 ...
## $ gre : int 380 660 800 640 520 760 560 400 540 700 ...
## $ gpa : num 3.61 3.67 4 3.19 2.93 3 2.98 3.08 3.39 3.92 ...
## $ rank : int 3 3 1 4 4 2 1 2 3 2 ...
## $ admitf: Factor w/ 2 levels "No Admission",..: 1 2 2 2 1 2 2 1 2 1 ...
## $ rankf : Factor w/ 4 levels "Rank-1","Rank-2",..: 3 3 1 4 4 2 1 2 3 2 ...
summary(df)
## admit gre gpa rank
## Min. :0.0000 Min. :220.0 Min. :2.260 Min. :1.000
## 1st Qu.:0.0000 1st Qu.:520.0 1st Qu.:3.130 1st Qu.:2.000
## Median :0.0000 Median :580.0 Median :3.395 Median :2.000
## Mean :0.3175 Mean :587.7 Mean :3.390 Mean :2.485
## 3rd Qu.:1.0000 3rd Qu.:660.0 3rd Qu.:3.670 3rd Qu.:3.000
## Max. :1.0000 Max. :800.0 Max. :4.000 Max. :4.000
## admitf rankf
## No Admission :273 Rank-1: 61
## Give Admission:127 Rank-2:151
## Rank-3:121
## Rank-4: 67
##
##
There are no Null Values, So it is good to go for Exploratory Data Analysis.
plot_box <- function(attr){
colname <- eval(parse(text=paste('df$' ,attr , sep = '' )))
boxplot(colname , horizontal = TRUE)
remove(colname)
}
plot_hist <- function(attr){
colname <- eval(parse(text=paste('df$' ,attr , sep = '' )))
sub_text <- paste('Mean :' , round (mean(colname) , 2) ,
'\nMedian:', round (median(colname) , 2) ,
'\nStd. Dev:', round (sd(colname) , 2)
)
hist(colname , main = attr , labels = TRUE , xlab = '' , sub=sub_text )
}
par(mfrow=c(2,2))
plot_box('gre')
plot_box('gpa')
plot_hist('gre')
plot_hist('gpa')
remove(plot_box , plot_hist )
count_cross_table <- function( x , y){
colname_x <- eval(parse(text=paste('df$' ,x , sep = '' )))
colname_y <- eval(parse(text=paste('df$' ,y , sep = '' )))
data <- xtabs(~colname_x + colname_y)
data_total <- cbind(data , 'Total' = rowSums(data))
data_total <- rbind(data_total , 'Total' = colSums(data_total))
print(mosaicplot(data , shade = TRUE))
print(data_total)
remove(data , data_total , colname_x , colname_y)
}
count_cross_table('admitf' , 'rankf')
## NULL
## Rank-1 Rank-2 Rank-3 Rank-4 Total
## No Admission 28 97 93 55 273
## Give Admission 33 54 28 12 127
## Total 61 151 121 67 400
remove(count_cross_table)
fit <- glm(admitf ~ gre + gpa + rankf , data = df , family = binomial)
summary(fit)
##
## Call:
## glm(formula = admitf ~ gre + gpa + rankf, family = binomial,
## data = df)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.6268 -0.8662 -0.6388 1.1490 2.0790
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.989979 1.139951 -3.500 0.000465 ***
## gre 0.002264 0.001094 2.070 0.038465 *
## gpa 0.804038 0.331819 2.423 0.015388 *
## rankfRank-2 -0.675443 0.316490 -2.134 0.032829 *
## rankfRank-3 -1.340204 0.345306 -3.881 0.000104 ***
## rankfRank-4 -1.551464 0.417832 -3.713 0.000205 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 499.98 on 399 degrees of freedom
## Residual deviance: 458.52 on 394 degrees of freedom
## AIC: 470.52
##
## Number of Fisher Scoring iterations: 4
data.frame(cbind("Coefficients" = coef(fit) ,
"Odd Ratios" = exp(coef(fit)) ,
exp(confint(fit))))
## Waiting for profiling to be done...
## Coefficients Odd.Ratios X2.5.. X97.5..
## (Intercept) -3.989979073 0.0185001 0.001889165 0.1665354
## gre 0.002264426 1.0022670 1.000137602 1.0044457
## gpa 0.804037549 2.2345448 1.173858216 4.3238349
## rankfRank-2 -0.675442928 0.5089310 0.272289674 0.9448343
## rankfRank-3 -1.340203916 0.2617923 0.131641717 0.5115181
## rankfRank-4 -1.551463677 0.2119375 0.090715546 0.4706961
df$predicted_admission <- ifelse ( fitted(fit) <= 0.5 ,
'No Admission' ,
'Give Admission')
head(df)
## admit gre gpa rank admitf rankf predicted_admission
## 1 0 380 3.61 3 No Admission Rank-3 No Admission
## 2 1 660 3.67 3 Give Admission Rank-3 No Admission
## 3 1 800 4.00 1 Give Admission Rank-1 Give Admission
## 4 1 640 3.19 4 Give Admission Rank-4 No Admission
## 5 0 520 2.93 4 No Admission Rank-4 No Admission
## 6 1 760 3.00 2 Give Admission Rank-2 No Admission
conf_mat <- table(Observed = df$admitf , Predicted = df$predicted_admission)
conf_mat <- cbind(conf_mat , "Total" = rowSums(conf_mat))
conf_mat <- rbind(conf_mat , "Total" = colSums(conf_mat))
data.frame(conf_mat)
## Give.Admission No.Admission Total
## No Admission 19 254 273
## Give Admission 30 97 127
## Total 49 351 400
(conf_mat[2] + conf_mat[4]) / conf_mat[9]
## [1] 0.71
remove(conf_mat)
pseudo_r2 <- function(x , nrow){
stopifnot(inherits(x, "glm"))
x_null<- update(x , . ~ 1)
llv <- logLik(x)
llo <- logLik(x_null)
McFadden_R2 <- ( 1 - (llv / llo) )
lo <- exp(llo)
lv <- exp(llv)
pow <- (2/nrow)
cox_snell_R2 <- lo/lv
cox_snell_R2 <- cox_snell_R2 ^ pow
cox_snell_R2 <- (1 - cox_snell_R2)
r2max <- lo ^pow
r2max <- (1 - r2max)
nagelkerke_R2 <- (cox_snell_R2 / r2max )
return(data.frame( "McFadden_R2" = McFadden_R2 ,
"cox_snell_R2" = cox_snell_R2,
"nagelkerke_R2" = nagelkerke_R2
))
remove(llv,llo,McFadden_R2 , lo , lv , cox_snell_R2 , r2max , nagelkerke_R2 , pow , fit_null)
}
pseudo_r2(fit , nrow(df))
## McFadden_R2 cox_snell_R2 nagelkerke_R2
## 1 0.08292194 0.09845702 0.1379958
Eventhought the confusion matrix is giving 71 % correct Classificaiton Rate, the pseudo R2 values are very very less (almost near to zero). So this model is not perfect.