Loading and Reconnaissance of the Dataset

data <- read.csv("C:/Users/Mehedi Hassan Galib/Desktop/R/binary.csv")
str(data)
## 'data.frame':    400 obs. of  4 variables:
##  $ admit: int  0 1 1 1 0 1 1 0 1 0 ...
##  $ gre  : int  380 660 800 640 520 760 560 400 540 700 ...
##  $ gpa  : num  3.61 3.67 4 3.19 2.93 3 2.98 3.08 3.39 3.92 ...
##  $ rank : int  3 3 1 4 4 2 1 2 3 2 ...
head(data)
##   admit gre  gpa rank
## 1     0 380 3.61    3
## 2     1 660 3.67    3
## 3     1 800 4.00    1
## 4     1 640 3.19    4
## 5     0 520 2.93    4
## 6     1 760 3.00    2




Converting numeric variables to factors

data$admit <- as.factor(data$admit)
data$rank <- as.factor(data$rank)




Checking for missing values and zero-count cells

The two-way table of admit by rank below shows that no combination of the two factors is empty; zero-count cells would make the corresponding rank coefficients unstable.

xtabs(~admit+rank, data = data)
##      rank
## admit  1  2  3  4
##     0 28 97 93 55
##     1 33 54 28 12
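
The xtabs() call only covers the two factor variables. A quick check for actual missing values across all four columns could look like this (a minimal base-R sketch, not part of the original output):

# Number of NA values per column; all zeros means nothing needs to be imputed or dropped
colSums(is.na(data))

# Single TRUE/FALSE answer for the whole data frame
anyNA(data)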




Data Partitioning

set.seed(1234)
p_data <- sample(2, nrow(data), replace = TRUE, prob = c(0.8, 0.2))
train <- data[p_data==1,]
test <- data[p_data==2,]
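
Because sample() assigns each row to group 1 or 2 independently with probabilities 0.8 and 0.2, the split is only approximately 80/20; with this seed the later output implies 325 training rows and 75 test rows. A quick sanity check (a minimal sketch):

# Rows in each partition and the realised split proportions
nrow(train)
nrow(test)
prop.table(table(p_data))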




Model and Summary

model <- glm(admit~gre+gpa+rank, data = train, family = "binomial")
summary(model)
## 
## Call:
## glm(formula = admit ~ gre + gpa + rank, family = "binomial", 
##     data = train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.5873  -0.8679  -0.6181   1.1301   2.1178  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -5.009514   1.316514  -3.805 0.000142 ***
## gre          0.001631   0.001217   1.340 0.180180    
## gpa          1.166408   0.388899   2.999 0.002706 ** 
## rank2       -0.570976   0.358273  -1.594 0.111005    
## rank3       -1.125341   0.383372  -2.935 0.003331 ** 
## rank4       -1.532942   0.477377  -3.211 0.001322 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 404.39  on 324  degrees of freedom
## Residual deviance: 369.99  on 319  degrees of freedom
## AIC: 381.99
## 
## Number of Fisher Scoring iterations: 4
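
The coefficients are on the log-odds scale. Exponentiating them gives odds ratios, which are often easier to read (a minimal sketch, not part of the original analysis): each value is the multiplicative change in the odds of admission per unit increase in the predictor, or relative to rank 1 for the rank dummies.

# Odds ratios for the full model
exp(coef(model))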




Removing gre, as it is not statistically significant

In the full model gre has p ≈ 0.18, so it is dropped and the model is refitted with gpa and rank only.

model1 <- glm(admit~gpa+rank, data = train, family = "binomial")
summary(model1)
## 
## Call:
## glm(formula = admit ~ gpa + rank, family = "binomial", data = train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.5156  -0.8880  -0.6318   1.1091   2.1688  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -4.7270     1.2918  -3.659 0.000253 ***
## gpa           1.3735     0.3590   3.826 0.000130 ***
## rank2        -0.5712     0.3564  -1.603 0.108976    
## rank3        -1.1645     0.3804  -3.061 0.002203 ** 
## rank4        -1.5642     0.4756  -3.289 0.001005 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 404.39  on 324  degrees of freedom
## Residual deviance: 371.81  on 320  degrees of freedom
## AIC: 381.81
## 
## Number of Fisher Scoring iterations: 4
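
Dropping gre can also be checked formally with a likelihood-ratio test between the two nested models (a sketch, not in the original write-up). The residual deviances above differ by 371.81 - 369.99 ≈ 1.82 on 1 degree of freedom, so a non-significant p-value is expected, which supports the simpler model.

# Compare the reduced model (no gre) against the full model
anova(model1, model, test = "Chisq")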




Prediction (Train)

p1 <- predict(model1, train, type = "response")
head(p1)
##         1         2         3         4         6         7 
## 0.2822956 0.2992879 0.6828897 0.1290134 0.2354735 0.3466234




Confusion Matrix (Train)

ifelse(p1 > 0.5, 1, 0) assigns class 1 when the predicted probability exceeds 0.5 and class 0 otherwise.

pred1 <- ifelse(p1>0.5, 1, 0)
tab1 <- table(Predicted = pred1,  Actual = train$admit)
tab1
##          Actual
## Predicted   0   1
##         0 208  73
##         1  15  29




Misclassification Error (Train)

1- sum(diag(tab1))/sum(tab1)
## [1] 0.2707692
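
The overall error rate hides how differently the two classes are handled: in tab1 most mistakes are actual admits predicted as 0. Class-wise rates can be read off the table directly (a sketch, treating admit = 1 as the positive class):

# Accuracy: correct predictions on the diagonal
sum(diag(tab1)) / sum(tab1)

# Sensitivity: predicted admits among actual admits (29 / 102)
tab1["1", "1"] / sum(tab1[, "1"])

# Specificity: predicted rejections among actual rejections (208 / 223)
tab1["0", "0"] / sum(tab1[, "0"])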




Prediction (Test)

p2 <- predict(model1, test, type = "response")
head(p2)
##          5         14         16         26         28         29 
## 0.09390783 0.25582628 0.23746963 0.57446309 0.23971008 0.29411490




Confusion Matrix (Test)

The same 0.5 cutoff is applied to the test-set probabilities.

pred2 <- ifelse(p2>0.5, 1, 0)
tab2 <- table(Predicted = pred2,  Actual = test$admit)
tab2
##          Actual
## Predicted  0  1
##         0 48 20
##         1  2  5




Misclassification Error (Test)

1- sum(diag(tab2))/sum(tab2)
## [1] 0.2933333




Goodness of fit

The overall fit is assessed with a likelihood-ratio (chi-squared) test: the drop from the null deviance to the residual deviance is compared against a chi-squared distribution, and the value below is the resulting p-value.

with(model1, pchisq(null.deviance - deviance, df.null - df.residual, lower.tail = F))
## [1] 1.450537e-06
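
The same test can be written out with the numbers from the model1 summary: the statistic is the null deviance minus the residual deviance, 404.39 - 371.81 ≈ 32.58, on 324 - 320 = 4 degrees of freedom. The sketch below should reproduce the p-value above up to rounding.

# Likelihood-ratio statistic and its chi-squared p-value, spelled out
pchisq(404.39 - 371.81, df = 4, lower.tail = FALSE)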