Machine learning intro in R: Logistic Regression

Intro

A worked example of logistic regression in R, predicting whether a college is private using the ISLR `College` data. The code is complete; a fuller written explanation is coming soon.
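Before diving into the code: logistic regression models the probability of the positive class (here, whether a college is private) by passing a linear combination of the predictors through the logistic (sigmoid) link. As a quick illustration, not part of the analysis below, the curve looks like this:

# Sigmoid maps any log-odds value to a probability between 0 and 1
sigmoid <- function(z) 1 / (1 + exp(-z))
curve(sigmoid, from = -6, to = 6,
      xlab = "linear predictor (log-odds)", ylab = "P(class = 1)")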

Data and packages

options(warn=-1)  # suppress warnings in the rendered output
#install.packages('ISLR')
library(ISLR)

print(head(College,2))
##                              Private Apps Accept Enroll Top10perc
## Abilene Christian University     Yes 1660   1232    721        23
## Adelphi University               Yes 2186   1924    512        16
##                              Top25perc F.Undergrad P.Undergrad Outstate
## Abilene Christian University        52        2885         537     7440
## Adelphi University                  29        2683        1227    12280
##                              Room.Board Books Personal PhD Terminal
## Abilene Christian University       3300   450     2200  70       78
## Adelphi University                 6450   750     1500  29       30
##                              S.F.Ratio perc.alumni Expend Grad.Rate
## Abilene Christian University      18.1          12   7041        60
## Adelphi University                12.2          16  10527        56
# Convert Private column from Yes/No to 1/0
Private = as.numeric(College$Private)-1
# Combine the 0/1 Private column with the 17 numeric predictors into a new df called 'newdata'
newdata = cbind(Private,College[,2:18])
head(newdata)
##                              Private Apps Accept Enroll Top10perc
## Abilene Christian University       1 1660   1232    721        23
## Adelphi University                 1 2186   1924    512        16
## Adrian College                     1 1428   1097    336        22
## Agnes Scott College                1  417    349    137        60
## Alaska Pacific University          1  193    146     55        16
## Albertson College                  1  587    479    158        38
##                              Top25perc F.Undergrad P.Undergrad Outstate
## Abilene Christian University        52        2885         537     7440
## Adelphi University                  29        2683        1227    12280
## Adrian College                      50        1036          99    11250
## Agnes Scott College                 89         510          63    12960
## Alaska Pacific University           44         249         869     7560
## Albertson College                   62         678          41    13500
##                              Room.Board Books Personal PhD Terminal
## Abilene Christian University       3300   450     2200  70       78
## Adelphi University                 6450   750     1500  29       30
## Adrian College                     3750   400     1165  53       66
## Agnes Scott College                5450   450      875  92       97
## Alaska Pacific University          4120   800     1500  76       72
## Albertson College                  3335   500      675  67       73
##                              S.F.Ratio perc.alumni Expend Grad.Rate
## Abilene Christian University      18.1          12   7041        60
## Adelphi University                12.2          16  10527        56
## Adrian College                    12.9          30   8735        54
## Agnes Scott College                7.7          37  19016        59
## Alaska Pacific University         11.9           2  10922        15
## Albertson College                  9.4          11   9727        55
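`as.numeric()` on a factor returns the level index, so with the levels ordered 'No', 'Yes' the subtraction above maps No to 0 and Yes to 1. A quick sanity check of that assumption (not in the original code):

# Cross-tabulate the new 0/1 column against the original factor;
# all 'No' rows should land in 0 and all 'Yes' rows in 1
table(newdata$Private, College$Private)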

Split data

library(caTools)
set.seed(101)

# Create split; sample.split stratifies on the vector it is given, here the response Private
split = sample.split(newdata$Private, SplitRatio = 0.70)

# Split based off of split Boolean Vector
train = subset(newdata, split == TRUE)
test = subset(newdata, split == FALSE)

Xtrain <- train[,2:18]
Ytrain <- train[,1]

Xtest <- test[,2:18]
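Because sample.split preserves the relative ratio of labels in the vector you pass it, the share of private colleges should be roughly the same in the training and test sets. A quick check (a small addition, not in the original code):

# Class balance should be preserved across the split
prop.table(table(Ytrain))
prop.table(table(test$Private))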

Missing values

# install.packages("Amelia")
library(Amelia)
## Loading required package: Rcpp

## ## 
## ## Amelia II: Multiple Imputation
## ## (Version 1.7.4, built: 2015-12-05)
## ## Copyright (C) 2005-2017 James Honaker, Gary King and Matthew Blackwell
## ## Refer to http://gking.harvard.edu/amelia/ for more information
## ##
missmap(train, main = "Missing values vs observed")

# find any missing values
# sapply(train,function(x) sum(is.na(x)))
# replace missing values like this, e.g. with the column mean:
# data$Feature[is.na(data$Feature)] <- mean(data$Feature, na.rm=T)

No missing values, good.
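If a column did have gaps, one simple option is to fill them with the column mean, as in the commented line above. A sketch of doing that across every numeric column at once (illustrative only, since nothing needs imputing here):

# Replace NAs in each numeric column with that column's mean
impute_mean <- function(df) {
  for (col in names(df)) {
    if (is.numeric(df[[col]])) {
      df[[col]][is.na(df[[col]])] <- mean(df[[col]], na.rm = TRUE)
    }
  }
  df
}
# train <- impute_mean(train)  # not needed for this dataset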

Logistic regression

x <- cbind(Xtrain,Ytrain)
# Fit the logistic regression on the training set (family = 'binomial' gives the logit link)
logistic <- glm(Ytrain ~ ., data = x,family='binomial')
summary(logistic)
## 
## Call:
## glm(formula = Ytrain ~ ., family = "binomial", data = x)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.9865  -0.0015   0.0404   0.1559   2.5445  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.3481075  2.6722045  -0.504  0.61392    
## Apps        -0.0008996  0.0003675  -2.448  0.01438 *  
## Accept       0.0015976  0.0006470   2.469  0.01355 *  
## Enroll       0.0039692  0.0012507   3.173  0.00151 ** 
## Top10perc   -0.0022502  0.0362778  -0.062  0.95054    
## Top25perc    0.0349642  0.0257936   1.356  0.17525    
## F.Undergrad -0.0018325  0.0004071  -4.501 6.77e-06 ***
## P.Undergrad  0.0004170  0.0001883   2.215  0.02679 *  
## Outstate     0.0007796  0.0001579   4.937 7.92e-07 ***
## Room.Board  -0.0002548  0.0003460  -0.736  0.46153    
## Books        0.0027164  0.0018968   1.432  0.15213    
## Personal    -0.0005158  0.0003994  -1.291  0.19658    
## PhD         -0.0659950  0.0326687  -2.020  0.04337 *  
## Terminal    -0.0340818  0.0310905  -1.096  0.27298    
## S.F.Ratio    0.0032316  0.0866527   0.037  0.97025    
## perc.alumni  0.0089838  0.0255895   0.351  0.72553    
## Expend       0.0003319  0.0001521   2.183  0.02907 *  
## Grad.Rate    0.0193495  0.0157666   1.227  0.21973    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 636.80  on 543  degrees of freedom
## Residual deviance: 144.15  on 526  degrees of freedom
## AIC: 180.15
## 
## Number of Fisher Scoring iterations: 8
# Predict output: predict() on a glm returns log-odds by default; use type='response' for probabilities
predicted <- predict(logistic, Xtest)
anova(logistic, test="Chisq")
## Analysis of Deviance Table
## 
## Model: binomial, link: logit
## 
## Response: Ytrain
## 
## Terms added sequentially (first to last)
## 
## 
##             Df Deviance Resid. Df Resid. Dev  Pr(>Chi)    
## NULL                          543     636.80              
## Apps         1   99.263       542     537.54 < 2.2e-16 ***
## Accept       1   28.470       541     509.07 9.514e-08 ***
## Enroll       1   64.528       540     444.54 9.515e-16 ***
## Top10perc    1   76.565       539     367.98 < 2.2e-16 ***
## Top25perc    1    5.825       538     362.15   0.01580 *  
## F.Undergrad  1   73.044       537     289.11 < 2.2e-16 ***
## P.Undergrad  1    0.595       536     288.51   0.44055    
## Outstate     1  110.646       535     177.87 < 2.2e-16 ***
## Room.Board   1    0.975       534     176.89   0.32336    
## Books        1    1.242       533     175.65   0.26508    
## Personal     1    0.562       532     175.09   0.45332    
## PhD          1   22.297       531     152.79 2.336e-06 ***
## Terminal     1    0.603       530     152.19   0.43758    
## S.F.Ratio    1    1.122       529     151.07   0.28954    
## perc.alumni  1    0.392       528     150.67   0.53109    
## Expend       1    4.981       527     145.69   0.02563 *  
## Grad.Rate    1    1.541       526     144.15   0.21444    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
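The coefficients above are on the log-odds scale, which is hard to read directly. Exponentiating turns them into odds ratios: values above 1 increase the odds of a college being private, values below 1 decrease them. A small addition to help interpretation (not in the original write-up):

# Odds ratios: multiplicative change in the odds of Private = 1 per unit of each predictor
round(exp(coef(logistic)), 4)
# Wald-style 95% confidence intervals on the same scale
round(exp(confint.default(logistic)), 4)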

Compare different models

# Keep only the predictors that stood out in the anova table; drop the rest
x2 <- cbind(Xtrain[,c(1:6,8,12,16)],Ytrain)
logistic2 <- glm(Ytrain ~ ., data = x2,family='binomial')
anova(logistic, logistic2, test="Chisq")
## Analysis of Deviance Table
## 
## Model 1: Ytrain ~ Apps + Accept + Enroll + Top10perc + Top25perc + F.Undergrad + 
##     P.Undergrad + Outstate + Room.Board + Books + Personal + 
##     PhD + Terminal + S.F.Ratio + perc.alumni + Expend + Grad.Rate
## Model 2: Ytrain ~ Apps + Accept + Enroll + Top10perc + Top25perc + F.Undergrad + 
##     Outstate + PhD + Expend
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1       526     144.15                     
## 2       534     152.50 -8  -8.3436   0.4006

Dropping those eight predictors barely changes the fit (residual deviance 152.50 vs 144.15, p = 0.40), so the smaller model does essentially as well as the full one.
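The reduced model above was picked by eye from the anova table. An automated alternative, shown here only as a sketch, is backward selection by AIC with base R's step():

# Backward selection by AIC; trace = 0 suppresses the per-step output
logistic_step <- step(logistic, direction = "backward", trace = 0)
# summary(logistic_step)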

fitted.results <- predict(logistic, test, type='response')
fitted.results <- ifelse(fitted.results > 0.5, 1, 0)
misClassError  <- mean(fitted.results != test$Private)
print(paste('Accuracy', 1 - misClassError))
## [1] "Accuracy 0.957081545064378"
# remember, the binary response variable is `Private`
library(ROCR)
## Loading required package: gplots

## 
## Attaching package: 'gplots'

## The following object is masked from 'package:stats':
## 
##     lowess
options(warn=-1)

p   <- predict(logistic,test,type='response') 
pr  <- prediction(p, test$Private)
prf <- performance(pr, measure = "tpr", x.measure = "fpr")
plot(prf)

auc <- performance(pr, measure = "auc")
auc <- auc@y.values[[1]]
title(paste('AUC =', round(auc, 3)))
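As a cross-check, the same ROC curve and AUC can be computed with the pROC package instead of ROCR (an alternative, not used above):

# install.packages("pROC")
library(pROC)
roc_obj <- roc(test$Private, p)
auc(roc_obj)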