admission_prediction

Using independent variables as gre,sop and cgpa ,i am going to predict whether a student is admited or not to the college.data is from kaggle

library(tidyverse)

## Warning: package 'ggplot2' was built under R version 4.3.2

## Warning: package 'lubridate' was built under R version 4.3.2

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.4     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(readr)
library(nnet)
Admission_Predict <- read_csv("C:/Users/USER/Desktop/Admission_Predict.csv")

## Rows: 400 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (4): gre, sop, cgpa, admitted
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

attach(Admission_Predict)
Admission_Predict$admitted<-as.factor(Admission_Predict$admitted)
data<-Admission_Predict
str(data)

## spc_tbl_ [400 × 4] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ gre     : num [1:400] 337 324 316 322 314 330 321 308 302 323 ...
##  $ sop     : num [1:400] 4.5 4 3 3.5 2 4.5 3 3 2 3.5 ...
##  $ cgpa    : num [1:400] 9.65 8.87 8 8.67 8.21 9.34 8.2 7.9 8 8.6 ...
##  $ admitted: Factor w/ 2 levels "0","1": 2 2 2 2 1 2 2 1 1 1 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   gre = col_double(),
##   ..   sop = col_double(),
##   ..   cgpa = col_double(),
##   ..   admitted = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>

explore the data

summary(data)

##       gre             sop           cgpa       admitted
##  Min.   :290.0   Min.   :1.0   Min.   :6.800   0:181   
##  1st Qu.:308.0   1st Qu.:2.5   1st Qu.:8.170   1:219   
##  Median :317.0   Median :3.5   Median :8.610           
##  Mean   :316.8   Mean   :3.4   Mean   :8.599           
##  3rd Qu.:325.0   3rd Qu.:4.0   3rd Qu.:9.062           
##  Max.   :340.0   Max.   :5.0   Max.   :9.920

table(data$admitted)

## 
##   0   1 
## 181 219

data partitioning

set.seed(1234)
pd<-sample(2,nrow(data),replace = TRUE,prob = c(0.8,0.2))
train<-data[pd==1,]
test<-data[pd==2,]

MODEL

m1<-glm(admitted~.,data=train,family = "binomial")
m1

## 
## Call:  glm(formula = admitted ~ ., family = "binomial", data = train)
## 
## Coefficients:
## (Intercept)          gre          sop         cgpa  
##    -43.5752       0.1288       0.3013       0.2358  
## 
## Degrees of Freedom: 324 Total (i.e. Null);  321 Residual
## Null Deviance:       445.9 
## Residual Deviance: 306.3     AIC: 314.3

miclassification rate

p<-predict(m1,data=train)

table

ta<-table(p,train$admitted)

accuracy (correct classification)

sum(diag(ta))/sum(ta)

## [1] 0.003076923

how many admitted not admitted

table(train$admitted)

## 
##   0   1 
## 143 182

model performance evaluation

library(ROCR)
pred<-predict(m1,data,type="response")
head(pred)

##         1         2         3         4         5         6 
## 0.9694280 0.8096627 0.4777753 0.7295657 0.3547260 0.9228724

head(data)

## # A tibble: 6 × 4
##     gre   sop  cgpa admitted
##   <dbl> <dbl> <dbl> <fct>   
## 1   337   4.5  9.65 1       
## 2   324   4    8.87 1       
## 3   316   3    8    1       
## 4   322   3.5  8.67 1       
## 5   314   2    8.21 0       
## 6   330   4.5  9.34 1

histogram

hist(pred)

roc_curve

pred<-prediction(pred,data$admitted)
eval<-performance(pred,"acc")
plot(eval)
abline(h=0.79,v=0.60)

identify best cut off and accuracy

max<-which.max(slot(eval,"y.values")[[1]])
acc<-slot(eval,"y.values")[[1]][max]
max

## [1] 189

acc

## [1] 0.79

cutoff<-slot(eval,"x.values")[[1]][max]
cutoff

##       334 
## 0.6046182

print(c(accuracy=acc,cutoffp=cutoff))

##    accuracy cutoffp.334 
##   0.7900000   0.6046182

performance(pred,"tpr")

## A performance instance
##   'Cutoff' vs. 'True positive rate' (alpha: 'none')
##   with 398 data points

roc<-performance(pred,"tpr","fpr")
plot(roc)
abline(a=0,b=1)

plot(roc,colorize=T,main="Roc Curve",ylab="sensitivity",xlab="1-specificity")
abline(a=0,b=1)

are under the curve

auc<-performance(pred,"auc")
auc<-unlist(slot(auc,"y.values"))
auc

## [1] 0.8516486

auc<-round(auc,2)
auc

## [1] 0.85

admission_prediction

mugo

12 November 2023