data <- read.csv("C:/Users/Mehedi Hassan Galib/Desktop/R/binary.csv")
str(data)
## 'data.frame': 400 obs. of 4 variables:
## $ admit: int 0 1 1 1 0 1 1 0 1 0 ...
## $ gre : int 380 660 800 640 520 760 560 400 540 700 ...
## $ gpa : num 3.61 3.67 4 3.19 2.93 3 2.98 3.08 3.39 3.92 ...
## $ rank : int 3 3 1 4 4 2 1 2 3 2 ...
head(data)
## admit gre gpa rank
## 1 0 380 3.61 3
## 2 1 660 3.67 3
## 3 1 800 4.00 1
## 4 1 640 3.19 4
## 5 0 520 2.93 4
## 6 1 760 3.00 2
data$admit <- as.factor(data$admit)
data$rank <- as.factor(data$rank)
xtabs(~admit+rank, data = data)
## rank
## admit 1 2 3 4
## 0 28 97 93 55
## 1 33 54 28 12
set.seed(1234)
p_data <- sample(2, nrow(data), replace = TRUE, prob = c(0.8, 0.2))
train <- data[p_data==1,]
test <- data[p_data==2,]
set.seed(1234)
p_data <- sample(2, nrow(data), replace = TRUE, prob = c(0.8, 0.2))
train <- data[p_data==1,]
test <- data[p_data==2,]
model <- glm(admit~gre+gpa+rank, data = train, family = "binomial")
summary(model)
##
## Call:
## glm(formula = admit ~ gre + gpa + rank, family = "binomial",
## data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.5873 -0.8679 -0.6181 1.1301 2.1178
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -5.009514 1.316514 -3.805 0.000142 ***
## gre 0.001631 0.001217 1.340 0.180180
## gpa 1.166408 0.388899 2.999 0.002706 **
## rank2 -0.570976 0.358273 -1.594 0.111005
## rank3 -1.125341 0.383372 -2.935 0.003331 **
## rank4 -1.532942 0.477377 -3.211 0.001322 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 404.39 on 324 degrees of freedom
## Residual deviance: 369.99 on 319 degrees of freedom
## AIC: 381.99
##
## Number of Fisher Scoring iterations: 4
model1 <- glm(admit~gpa+rank, data = train, family = "binomial")
summary(model1)
##
## Call:
## glm(formula = admit ~ gpa + rank, family = "binomial", data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.5156 -0.8880 -0.6318 1.1091 2.1688
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.7270 1.2918 -3.659 0.000253 ***
## gpa 1.3735 0.3590 3.826 0.000130 ***
## rank2 -0.5712 0.3564 -1.603 0.108976
## rank3 -1.1645 0.3804 -3.061 0.002203 **
## rank4 -1.5642 0.4756 -3.289 0.001005 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 404.39 on 324 degrees of freedom
## Residual deviance: 371.81 on 320 degrees of freedom
## AIC: 381.81
##
## Number of Fisher Scoring iterations: 4
p1 <- predict(model1, train, type = "response")
head(p1)
## 1 2 3 4 6 7
## 0.2822956 0.2992879 0.6828897 0.1290134 0.2354735 0.3466234
pred1 <- ifelse(p1>0.5, 1, 0)
tab1 <- table(Predicted = pred1, Actual = train$admit)
tab1
## Actual
## Predicted 0 1
## 0 208 73
## 1 15 29
1- sum(diag(tab1))/sum(tab1)
## [1] 0.2707692
p2 <- predict(model1, test, type = "response")
head(p2)
## 5 14 16 26 28 29
## 0.09390783 0.25582628 0.23746963 0.57446309 0.23971008 0.29411490
pred2 <- ifelse(p2>0.5, 1, 0)
tab2 <- table(Predicted = pred2, Actual = test$admit)
tab2
## Actual
## Predicted 0 1
## 0 48 20
## 1 2 5
1- sum(diag(tab2))/sum(tab2)
## [1] 0.2933333
with(model1, pchisq(null.deviance - deviance, df.null - df.residual, lower.tail = F))
## [1] 1.450537e-06