Example of logistic regression in R: predict whether a college is private using the ISLR `College` data, with a train/test split, a binomial GLM, deviance-based model comparison, and evaluation by accuracy and ROC/AUC. Code is complete, explanation coming soon...
options(warn=-1)
#install.packages('ISLR')
library(ISLR)
print(head(College,2))
## Private Apps Accept Enroll Top10perc
## Abilene Christian University Yes 1660 1232 721 23
## Adelphi University Yes 2186 1924 512 16
## Top25perc F.Undergrad P.Undergrad Outstate
## Abilene Christian University 52 2885 537 7440
## Adelphi University 29 2683 1227 12280
## Room.Board Books Personal PhD Terminal
## Abilene Christian University 3300 450 2200 70 78
## Adelphi University 6450 750 1500 29 30
## S.F.Ratio perc.alumni Expend Grad.Rate
## Abilene Christian University 18.1 12 7041 60
## Adelphi University 12.2 16 10527 56
# Convert Private column from Yes/No to 1/0
Private = as.numeric(College$Private)-1
# Combine the numeric 'Private' with the remaining columns as a new df called 'newdata'
newdata = cbind(Private,College[,2:18])
head(newdata)
## Private Apps Accept Enroll Top10perc
## Abilene Christian University 1 1660 1232 721 23
## Adelphi University 1 2186 1924 512 16
## Adrian College 1 1428 1097 336 22
## Agnes Scott College 1 417 349 137 60
## Alaska Pacific University 1 193 146 55 16
## Albertson College 1 587 479 158 38
## Top25perc F.Undergrad P.Undergrad Outstate
## Abilene Christian University 52 2885 537 7440
## Adelphi University 29 2683 1227 12280
## Adrian College 50 1036 99 11250
## Agnes Scott College 89 510 63 12960
## Alaska Pacific University 44 249 869 7560
## Albertson College 62 678 41 13500
## Room.Board Books Personal PhD Terminal
## Abilene Christian University 3300 450 2200 70 78
## Adelphi University 6450 750 1500 29 30
## Adrian College 3750 400 1165 53 66
## Agnes Scott College 5450 450 875 92 97
## Alaska Pacific University 4120 800 1500 76 72
## Albertson College 3335 500 675 67 73
## S.F.Ratio perc.alumni Expend Grad.Rate
## Abilene Christian University 18.1 12 7041 60
## Adelphi University 12.2 16 10527 56
## Adrian College 12.9 30 8735 54
## Agnes Scott College 7.7 37 19016 59
## Alaska Pacific University 11.9 2 10922 15
## Albertson College 9.4 11 9727 55
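As a quick sanity check (an addition, not in the original code): `as.numeric()` on a factor returns the level index, and the levels here are `c("No","Yes")`, so subtracting 1 maps No to 0 and Yes to 1.
# Illustrative check of the Yes/No -> 1/0 mapping (not in the original)
levels(College$Private)
table(College$Private, newdata$Private)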
library(caTools)
set.seed(101)
# Create split on the response (sample.split keeps the class ratio in both sets)
split = sample.split(newdata$Private, SplitRatio = 0.70)
# Split based off of split Boolean Vector
train = subset(newdata, split == TRUE)
test = subset(newdata, split == FALSE)
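Because the split stratifies on `Private`, both sets should show a similar class balance; a minimal check (an addition, not in the original):
# Proportion of private vs. public colleges in each split (illustrative)
prop.table(table(train$Private))
prop.table(table(test$Private))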
Xtrain <- train[,2:18]
Ytrain <- train[,1]
Xtest <- test[,2:18]
# install.packages("Amelia")
library(Amelia)
## Loading required package: Rcpp
## ##
## ## Amelia II: Multiple Imputation
## ## (Version 1.7.4, built: 2015-12-05)
## ## Copyright (C) 2005-2017 James Honaker, Gary King and Matthew Blackwell
## ## Refer to http://gking.harvard.edu/amelia/ for more information
## ##
# Plot a missingness map to find any missing values
missmap(train, main = "Missing values vs observed")
# sapply(train,function(x) sum(is.na(x)))
# replace if missing like this:
# data$Feature[is.na(data$Feature)] <- mean(data$Feature,na.rm=T)
No missing values, good.
x <- cbind(Xtrain,Ytrain)
# Train the model using the training sets and check score
logistic <- glm(Ytrain ~ ., data = x,family='binomial')
summary(logistic)
##
## Call:
## glm(formula = Ytrain ~ ., family = "binomial", data = x)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.9865 -0.0015 0.0404 0.1559 2.5445
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.3481075 2.6722045 -0.504 0.61392
## Apps -0.0008996 0.0003675 -2.448 0.01438 *
## Accept 0.0015976 0.0006470 2.469 0.01355 *
## Enroll 0.0039692 0.0012507 3.173 0.00151 **
## Top10perc -0.0022502 0.0362778 -0.062 0.95054
## Top25perc 0.0349642 0.0257936 1.356 0.17525
## F.Undergrad -0.0018325 0.0004071 -4.501 6.77e-06 ***
## P.Undergrad 0.0004170 0.0001883 2.215 0.02679 *
## Outstate 0.0007796 0.0001579 4.937 7.92e-07 ***
## Room.Board -0.0002548 0.0003460 -0.736 0.46153
## Books 0.0027164 0.0018968 1.432 0.15213
## Personal -0.0005158 0.0003994 -1.291 0.19658
## PhD -0.0659950 0.0326687 -2.020 0.04337 *
## Terminal -0.0340818 0.0310905 -1.096 0.27298
## S.F.Ratio 0.0032316 0.0866527 0.037 0.97025
## perc.alumni 0.0089838 0.0255895 0.351 0.72553
## Expend 0.0003319 0.0001521 2.183 0.02907 *
## Grad.Rate 0.0193495 0.0157666 1.227 0.21973
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 636.80 on 543 degrees of freedom
## Residual deviance: 144.15 on 526 degrees of freedom
## AIC: 180.15
##
## Number of Fisher Scoring iterations: 8
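The coefficients are on the log-odds scale. Two quick interpretation aids (an illustrative addition, not in the original): exponentiating a coefficient gives an odds ratio, and the fitted probability is the inverse-logit of the linear predictor.
# Odds ratios: multiplicative change in the odds of 'private' per unit increase
exp(coef(logistic))
# Inverse-logit of the linear predictor equals predict(..., type='response')
head(plogis(predict(logistic)))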
# Predict output (default scale is log-odds; use type='response' for probabilities)
predicted <- predict(logistic, Xtest)
anova(logistic, test="Chisq")
## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: Ytrain
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL 543 636.80
## Apps 1 99.263 542 537.54 < 2.2e-16 ***
## Accept 1 28.470 541 509.07 9.514e-08 ***
## Enroll 1 64.528 540 444.54 9.515e-16 ***
## Top10perc 1 76.565 539 367.98 < 2.2e-16 ***
## Top25perc 1 5.825 538 362.15 0.01580 *
## F.Undergrad 1 73.044 537 289.11 < 2.2e-16 ***
## P.Undergrad 1 0.595 536 288.51 0.44055
## Outstate 1 110.646 535 177.87 < 2.2e-16 ***
## Room.Board 1 0.975 534 176.89 0.32336
## Books 1 1.242 533 175.65 0.26508
## Personal 1 0.562 532 175.09 0.45332
## PhD 1 22.297 531 152.79 2.336e-06 ***
## Terminal 1 0.603 530 152.19 0.43758
## S.F.Ratio 1 1.122 529 151.07 0.28954
## perc.alumni 1 0.392 528 150.67 0.53109
## Expend 1 4.981 527 145.69 0.02563 *
## Grad.Rate 1 1.541 526 144.15 0.21444
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Compare different models: refit with only the stronger features and test whether the fit gets significantly worse.
# drop features
x2 <- cbind(Xtrain[,c(1:6,8,12,16)],Ytrain)
logistic2 <- glm(Ytrain ~ ., data = x2,family='binomial')
anova(logistic, logistic2, test="Chisq")
## Analysis of Deviance Table
##
## Model 1: Ytrain ~ Apps + Accept + Enroll + Top10perc + Top25perc + F.Undergrad +
## P.Undergrad + Outstate + Room.Board + Books + Personal +
## PhD + Terminal + S.F.Ratio + perc.alumni + Expend + Grad.Rate
## Model 2: Ytrain ~ Apps + Accept + Enroll + Top10perc + Top25perc + F.Undergrad +
## Outstate + PhD + Expend
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 526 144.15
## 2 534 152.50 -8 -8.3436 0.4006
The deviance difference is not significant (p = 0.40), so dropping those eight features does not meaningfully change the fit.
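An AIC comparison (an added cross-check, not in the original) tells the same story; the reduced model spends fewer degrees of freedom for a similar deviance:
# Lower AIC is better (illustrative comparison)
AIC(logistic, logistic2)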
fitted.results <- predict(logistic,test,type='response')
fitted.results <- ifelse(fitted.results > 0.5,1,0)
misClassificationError <- mean(fitted.results != test$Private)
print(paste('Accuracy',1-misClassificationError))
## [1] "Accuracy 0.957081545064378"
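A confusion matrix breaks that accuracy down by class; a minimal sketch using the same 0.5 cutoff (an addition, not in the original):
# Rows = actual class, columns = predicted class (illustrative)
table(Actual = test$Private, Predicted = fitted.results)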
# remember, the binary response variable is `Private`
library(ROCR)
## Loading required package: gplots
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
options(warn=-1)
p <- predict(logistic,test,type='response')
pr <- prediction(p, test$Private)
prf <- performance(pr, measure = "tpr", x.measure = "fpr")
plot(prf)
auc <- performance(pr, measure = "auc")
auc <- auc@y.values[[1]]
title(paste('AUC =', round(auc, 3)))
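The performance object also stores the cutoffs, so we can recover the threshold that maximizes TPR minus FPR (Youden's J); this sketch is an illustrative addition built on ROCR's standard slots:
# prf@x.values = FPR, prf@y.values = TPR, prf@alpha.values = cutoffs
youden <- prf@y.values[[1]] - prf@x.values[[1]]
prf@alpha.values[[1]][which.max(youden)]   # cutoff with the best TPR-FPR tradeoff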