Loading and Reconnaissance of the Dataset

data <- read.csv("C:/Users/Mehedi Hassan Galib/Desktop/R/binary.csv")
str(data)
## 'data.frame':    400 obs. of  4 variables:
##  $ admit: int  0 1 1 1 0 1 1 0 1 0 ...
##  $ gre  : int  380 660 800 640 520 760 560 400 540 700 ...
##  $ gpa  : num  3.61 3.67 4 3.19 2.93 3 2.98 3.08 3.39 3.92 ...
##  $ rank : int  3 3 1 4 4 2 1 2 3 2 ...
head(data)
##   admit gre  gpa rank
## 1     0 380 3.61    3
## 2     1 660 3.67    3
## 3     1 800 4.00    1
## 4     1 640 3.19    4
## 5     0 520 2.93    4
## 6     1 760 3.00    2




Converting numeric variables to factors

data$admit <- as.factor(data$admit)
data$rank <- as.factor(data$rank)




Checking for missing values and zero-count cells

The two-way table of admit by rank below shows that no combination of the two factors is empty; zero-count cells would make the corresponding rank coefficients unstable.

xtabs(~admit+rank, data = data)
##      rank
## admit  1  2  3  4
##     0 28 97 93 55
##     1 33 54 28 12
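
The xtabs() call only covers the two factor variables. A quick check for actual missing values across all four columns could look like this (a minimal base-R sketch, not part of the original output):

# Number of NA values per column; all zeros means nothing needs to be imputed or dropped
colSums(is.na(data))

# Single TRUE/FALSE answer for the whole data frame
anyNA(data)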




Data Partitioning

set.seed(1234)
p_data <- sample(2, nrow(data), replace = TRUE, prob = c(0.8, 0.2))
train <- data[p_data==1,]
test <- data[p_data==2,]
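
Because sample() assigns each row to group 1 or 2 independently with probabilities 0.8 and 0.2, the split is only approximately 80/20; with this seed the later output implies 325 training rows and 75 test rows. A quick sanity check (a minimal sketch):

# Rows in each partition and the realised split proportions
nrow(train)
nrow(test)
prop.table(table(p_data))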




Model and Summary

model <- glm(admit~gre+gpa+rank, data = train, family = "binomial")
summary(model)
## 
## Call:
## glm(formula = admit ~ gre + gpa + rank, family = "binomial", 
##     data = train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.5873  -0.8679  -0.6181   1.1301   2.1178  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -5.009514   1.316514  -3.805 0.000142 ***
## gre          0.001631   0.001217   1.340 0.180180    
## gpa          1.166408   0.388899   2.999 0.002706 ** 
## rank2       -0.570976   0.358273  -1.594 0.111005    
## rank3       -1.125341   0.383372  -2.935 0.003331 ** 
## rank4       -1.532942   0.477377  -3.211 0.001322 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 404.39  on 324  degrees of freedom
## Residual deviance: 369.99  on 319  degrees of freedom
## AIC: 381.99
## 
## Number of Fisher Scoring iterations: 4
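
The coefficients are on the log-odds scale. Exponentiating them gives odds ratios, which are often easier to read (a minimal sketch, not part of the original analysis): each value is the multiplicative change in the odds of admission per unit increase in the predictor, or relative to rank 1 for the rank dummies.

# Odds ratios for the full model
exp(coef(model))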




Removing gre, as it is not statistically significant

In the full model gre has p ≈ 0.18, so it is dropped and the model is refitted with gpa and rank only.

model1 <- glm(admit~gpa+rank, data = train, family = "binomial")
summary(model1)
## 
## Call:
## glm(formula = admit ~ gpa + rank, family = "binomial", data = train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.5156  -0.8880  -0.6318   1.1091   2.1688  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -4.7270     1.2918  -3.659 0.000253 ***
## gpa           1.3735     0.3590   3.826 0.000130 ***
## rank2        -0.5712     0.3564  -1.603 0.108976    
## rank3        -1.1645     0.3804  -3.061 0.002203 ** 
## rank4        -1.5642     0.4756  -3.289 0.001005 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 404.39  on 324  degrees of freedom
## Residual deviance: 371.81  on 320  degrees of freedom
## AIC: 381.81
## 
## Number of Fisher Scoring iterations: 4
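
Dropping gre can also be checked formally with a likelihood-ratio test between the two nested models (a sketch, not in the original write-up). The residual deviances above differ by 371.81 - 369.99 ≈ 1.82 on 1 degree of freedom, so a non-significant p-value is expected, which supports the simpler model.

# Compare the reduced model (no gre) against the full model
anova(model1, model, test = "Chisq")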




Prediction (Train)

p1 <- predict(model1, train, type = "response")
head(p1)
##         1         2         3         4         6         7 
## 0.2822956 0.2992879 0.6828897 0.1290134 0.2354735 0.3466234




Confusion Matrix (Train)

ifelse(p1 > 0.5, 1, 0) assigns class 1 when the predicted probability exceeds 0.5 and class 0 otherwise.

pred1 <- ifelse(p1>0.5, 1, 0)
tab1 <- table(Predicted = pred1,  Actual = train$admit)
tab1
##          Actual
## Predicted   0   1
##         0 208  73
##         1  15  29




Misclassification Error (Train)

1- sum(diag(tab1))/sum(tab1)
## [1] 0.2707692
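
The overall error rate hides how differently the two classes are handled: in tab1 most mistakes are actual admits predicted as 0. Class-wise rates can be read off the table directly (a sketch, treating admit = 1 as the positive class):

# Accuracy: correct predictions on the diagonal
sum(diag(tab1)) / sum(tab1)

# Sensitivity: predicted admits among actual admits (29 / 102)
tab1["1", "1"] / sum(tab1[, "1"])

# Specificity: predicted rejections among actual rejections (208 / 223)
tab1["0", "0"] / sum(tab1[, "0"])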




Prediction (Test)

p2 <- predict(model1, test, type = "response")
head(p2)
##          5         14         16         26         28         29 
## 0.09390783 0.25582628 0.23746963 0.57446309 0.23971008 0.29411490




Confusion Matrix (Test)

The same 0.5 cutoff is applied to the test-set probabilities.

pred2 <- ifelse(p2>0.5, 1, 0)
tab2 <- table(Predicted = pred2,  Actual = test$admit)
tab2
##          Actual
## Predicted  0  1
##         0 48 20
##         1  2  5




Misclassification Error (Test)

1- sum(diag(tab2))/sum(tab2)
## [1] 0.2933333




Goodness of fit

The overall fit is assessed with a likelihood-ratio (chi-squared) test: the drop from the null deviance to the residual deviance is compared against a chi-squared distribution, and the value below is the resulting p-value.

with(model1, pchisq(null.deviance - deviance, df.null - df.residual, lower.tail = F))
## [1] 1.450537e-06
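
The same test can be written out with the numbers from the model1 summary: the statistic is the null deviance minus the residual deviance, 404.39 - 371.81 ≈ 32.58, on 324 - 320 = 4 degrees of freedom. The sketch below should reproduce the p-value above up to rounding.

# Likelihood-ratio statistic and its chi-squared p-value, spelled out
pchisq(404.39 - 371.81, df = 4, lower.tail = FALSE)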