Using independent variables as gre,sop and cgpa ,i am going to predict whether a student is admited or not to the college.data is from kaggle
library(tidyverse)
## Warning: package 'ggplot2' was built under R version 4.3.2
## Warning: package 'lubridate' was built under R version 4.3.2
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.4 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(readr)
library(nnet)
Admission_Predict <- read_csv("C:/Users/USER/Desktop/Admission_Predict.csv")
## Rows: 400 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (4): gre, sop, cgpa, admitted
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
attach(Admission_Predict)
Admission_Predict$admitted<-as.factor(Admission_Predict$admitted)
data<-Admission_Predict
str(data)
## spc_tbl_ [400 × 4] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ gre : num [1:400] 337 324 316 322 314 330 321 308 302 323 ...
## $ sop : num [1:400] 4.5 4 3 3.5 2 4.5 3 3 2 3.5 ...
## $ cgpa : num [1:400] 9.65 8.87 8 8.67 8.21 9.34 8.2 7.9 8 8.6 ...
## $ admitted: Factor w/ 2 levels "0","1": 2 2 2 2 1 2 2 1 1 1 ...
## - attr(*, "spec")=
## .. cols(
## .. gre = col_double(),
## .. sop = col_double(),
## .. cgpa = col_double(),
## .. admitted = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
explore the data
summary(data)
## gre sop cgpa admitted
## Min. :290.0 Min. :1.0 Min. :6.800 0:181
## 1st Qu.:308.0 1st Qu.:2.5 1st Qu.:8.170 1:219
## Median :317.0 Median :3.5 Median :8.610
## Mean :316.8 Mean :3.4 Mean :8.599
## 3rd Qu.:325.0 3rd Qu.:4.0 3rd Qu.:9.062
## Max. :340.0 Max. :5.0 Max. :9.920
table(data$admitted)
##
## 0 1
## 181 219
data partitioning
set.seed(1234)
pd<-sample(2,nrow(data),replace = TRUE,prob = c(0.8,0.2))
train<-data[pd==1,]
test<-data[pd==2,]
MODEL
m1<-glm(admitted~.,data=train,family = "binomial")
m1
##
## Call: glm(formula = admitted ~ ., family = "binomial", data = train)
##
## Coefficients:
## (Intercept) gre sop cgpa
## -43.5752 0.1288 0.3013 0.2358
##
## Degrees of Freedom: 324 Total (i.e. Null); 321 Residual
## Null Deviance: 445.9
## Residual Deviance: 306.3 AIC: 314.3
miclassification rate
p<-predict(m1,data=train)
table
ta<-table(p,train$admitted)
accuracy (correct classification)
sum(diag(ta))/sum(ta)
## [1] 0.003076923
how many admitted not admitted
table(train$admitted)
##
## 0 1
## 143 182
model performance evaluation
library(ROCR)
pred<-predict(m1,data,type="response")
head(pred)
## 1 2 3 4 5 6
## 0.9694280 0.8096627 0.4777753 0.7295657 0.3547260 0.9228724
head(data)
## # A tibble: 6 × 4
## gre sop cgpa admitted
## <dbl> <dbl> <dbl> <fct>
## 1 337 4.5 9.65 1
## 2 324 4 8.87 1
## 3 316 3 8 1
## 4 322 3.5 8.67 1
## 5 314 2 8.21 0
## 6 330 4.5 9.34 1
histogram
hist(pred)
roc_curve
pred<-prediction(pred,data$admitted)
eval<-performance(pred,"acc")
plot(eval)
abline(h=0.79,v=0.60)
identify best cut off and accuracy
max<-which.max(slot(eval,"y.values")[[1]])
acc<-slot(eval,"y.values")[[1]][max]
max
## [1] 189
acc
## [1] 0.79
cutoff<-slot(eval,"x.values")[[1]][max]
cutoff
## 334
## 0.6046182
print(c(accuracy=acc,cutoffp=cutoff))
## accuracy cutoffp.334
## 0.7900000 0.6046182
performance(pred,"tpr")
## A performance instance
## 'Cutoff' vs. 'True positive rate' (alpha: 'none')
## with 398 data points
roc<-performance(pred,"tpr","fpr")
plot(roc)
abline(a=0,b=1)
plot(roc,colorize=T,main="Roc Curve",ylab="sensitivity",xlab="1-specificity")
abline(a=0,b=1)
are under the curve
auc<-performance(pred,"auc")
auc<-unlist(slot(auc,"y.values"))
auc
## [1] 0.8516486
auc<-round(auc,2)
auc
## [1] 0.85