A marathon is a gruelling race, and not every participant who starts one finishes it. A sports scientist conducted a study to discover what might predict whether a first-time marathon runner quits the race, randomly interviewing a large sample of first-time participants that included both finishers and non-finishers.

Loading the dataset

marathon <- read.csv("D:/PGP BA-BI Course Materials/PREDICTIVE MODELING/LOGISTIC REGRESSION/Logistic Regression/Logistic/marathon_data.csv", header = TRUE)
head(marathon)
##   months_trained charity age finished_race
## 1              3      No  35           Yes
## 2              3      No  23           Yes
## 3              4      No  27           Yes
## 4              4      No  18           Yes
## 5              5      No  36           Yes
## 6              2      No  30           Yes
str(marathon)
## 'data.frame':    200 obs. of  4 variables:
##  $ months_trained: int  3 3 4 4 5 2 3 4 2 1 ...
##  $ charity       : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 2 2 2 ...
##  $ age           : int  35 23 27 18 36 30 34 30 27 25 ...
##  $ finished_race : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 2 2 2 2 2 ...
summary(marathon)
##  months_trained   charity        age        finished_race
##  Min.   : 0.000   No :195   Min.   :18.00   No : 72      
##  1st Qu.: 2.000   Yes:  5   1st Qu.:27.00   Yes:128      
##  Median : 3.500             Median :33.00                
##  Mean   : 4.295             Mean   :32.48                
##  3rd Qu.: 7.000             3rd Qu.:39.00                
##  Max.   :10.000             Max.   :46.00
marathon$result[marathon$finished_race == "Yes"] <- 1 
marathon$result[marathon$finished_race == "No"] <- 0
marathon$result <- as.factor(marathon$result)
head(marathon)
##   months_trained charity age finished_race result
## 1              3      No  35           Yes      1
## 2              3      No  23           Yes      1
## 3              4      No  27           Yes      1
## 4              4      No  18           Yes      1
## 5              5      No  36           Yes      1
## 6              2      No  30           Yes      1

Splitting the dataset into training and testing sets

library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
set.seed(1407)
# Stratified 70/30 partition on the outcome
split <- createDataPartition(marathon$result, p = 0.7, list = FALSE, times = 1)
train.data <- marathon[split, ]
test.data <- marathon[-split, ]
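
Because createDataPartition() samples within each level of the outcome, the 70/30 split should preserve the roughly 64/36 finisher ratio seen in summary(marathon). A quick check:

# Class proportions should be nearly identical across the full data and both splits
prop.table(table(marathon$result))
prop.table(table(train.data$result))
prop.table(table(test.data$result))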

Building a logit model

Step 1 : Likelihood ratio test

logit <- glm(result ~ months_trained + charity + age, data = train.data, family = binomial)
library(lmtest)
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
lrtest(logit)
## Likelihood ratio test
## 
## Model 1: result ~ months_trained + charity + age
## Model 2: result ~ 1
##   #Df  LogLik Df Chisq Pr(>Chisq)    
## 1   4 -43.074                        
## 2   1 -92.269 -3 98.39  < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
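
As a cross-check, the same likelihood ratio test can be reproduced with base R by fitting the intercept-only model explicitly and comparing deviances (null_logit is a name introduced here for illustration):

# Fit the null model and compare it to the full model; the chi-square
# statistic and p-value should match lrtest() above
null_logit <- glm(result ~ 1, data = train.data, family = binomial)
anova(null_logit, logit, test = "Chisq")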

Step 2 : McFadden's pseudo R^2

library(pscl)
## Warning: package 'pscl' was built under R version 3.4.2
## Classes and Methods for R developed in the
## Political Science Computational Laboratory
## Department of Political Science
## Stanford University
## Simon Jackman
## hurdle and zeroinfl functions by Achim Zeileis
pR2(logit)
##         llh     llhNull          G2    McFadden        r2ML        r2CU 
## -43.0743319 -92.2691669  98.3896700   0.5331666   0.5023205   0.6882506

A rule of thumb for McFadden's R^2: below 0.10 is not satisfactory, 0.10 to 0.30 is satisfactory, and 0.30 to 0.50 is very good. The value here, about 0.53, indicates a very good fit.
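
For intuition, this statistic can be reproduced by hand from the two log-likelihoods reported above, reusing the null_logit model from the earlier cross-check:

# McFadden's R^2 = 1 - llh(fitted model) / llh(intercept-only model)
1 - as.numeric(logLik(logit)) / as.numeric(logLik(null_logit))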

Step 3 : Interpreting individual coefficients and testing whether they are significant

summary(logit)
## 
## Call:
## glm(formula = result ~ months_trained + charity + age, family = binomial, 
##     data = train.data)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -1.81521  -0.43126   0.05328   0.41419   2.36526  
## 
## Coefficients:
##                  Estimate Std. Error z value Pr(>|z|)    
## (Intercept)       3.59494    1.49011   2.413   0.0158 *  
## months_trained    0.94348    0.17994   5.243 1.58e-07 ***
## charityYes       18.11338 1754.85354   0.010   0.9918    
## age              -0.19107    0.04803  -3.978 6.95e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 184.538  on 140  degrees of freedom
## Residual deviance:  86.149  on 137  degrees of freedom
## AIC: 94.149
## 
## Number of Fisher Scoring iterations: 16
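
One caveat before moving on: the charityYes estimate of about 18.1 with a standard error of about 1755, together with the 16 Fisher scoring iterations, is a classic symptom of quasi-complete separation. With only 5 charity runners in the whole dataset, this coefficient and its p-value should not be over-interpreted. A quick cross-tab makes the sparsity visible:

# How many charity runners does the training data contain, and did they finish?
table(Charity = train.data$charity, Finished = train.data$result)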

Step 4 : Explanatory power of the odds ratios

exp(coef(logit))
##    (Intercept) months_trained     charityYes            age 
##   3.641369e+01   2.568894e+00   7.354322e+07   8.260709e-01
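
To get a sense of the uncertainty around these odds ratios, Wald confidence intervals can be added with confint.default() (a minimal sketch; expect the charityYes interval to be enormous for the separation reason noted above):

# Odds ratios with Wald 95% confidence intervals
exp(cbind(OR = coef(logit), confint.default(logit)))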

Step 5 : Classification (confusion) matrix for the training data

cutoff <- floor(fitted(logit) + 0.6)   # equals 1 when fitted probability >= 0.4
table(Actual = train.data$result, Predicted = cutoff)
##       Predicted
## Actual  0  1
##      0 35 16
##      1 10 80
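
Since caret is already loaded, the same 0.4-cutoff classification can be cross-checked with confusionMatrix(), which also reports accuracy, sensitivity, and specificity (pred.train is a name introduced here for illustration):

# Same 0.4 cutoff, with summary statistics from caret
pred.train <- factor(as.integer(fitted(logit) >= 0.4), levels = c(0, 1))
confusionMatrix(pred.train, train.data$result, positive = "1")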

Step 6 : ROC Curve

library(Deducer)
## Warning: package 'Deducer' was built under R version 3.4.4
## Loading required package: JGR
## Warning: package 'JGR' was built under R version 3.4.2
## Loading required package: rJava
## Warning: package 'rJava' was built under R version 3.4.3
## Loading required package: JavaGD
## Warning: package 'JavaGD' was built under R version 3.4.2
## 
## Please type JGR() to launch console. Platform specific launchers (.exe and .app) can also be obtained at http://www.rforge.net/JGR/files/.
## Loading required package: car
## Warning: package 'car' was built under R version 3.4.2
## Loading required package: MASS
## 
## 
## Note Non-JGR console detected:
##  Deducer is best used from within JGR (http://jgr.markushelbig.org/).
##  To Bring up GUI dialogs, type deducer().
library(rJava)
library(pROC)
## Warning: package 'pROC' was built under R version 3.4.3
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
rocplot(logit)
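
Since pROC is already attached, an equivalent ROC curve and its AUC can be obtained without Deducer's Java dependencies (a minimal sketch; roc.train is introduced here for illustration):

# ROC curve and AUC from the fitted training probabilities
roc.train <- roc(response = train.data$result, predictor = fitted(logit))
plot(roc.train, legacy.axes = TRUE)
auc(roc.train)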

Evaluating model performance on the testing data

# predict() takes the test data through newdata, so attach() is not needed
Floor <- predict(logit, newdata = test.data, type = "response")
cutoff1 <- floor(Floor + 0.6)   # equals 1 when predicted probability >= 0.4
table(Actual = test.data$result, Predicted = cutoff1)
##       Predicted
## Actual  0  1
##      0 12  9
##      1  9 29
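
As on the training side, confusionMatrix() summarises this table along with accuracy, sensitivity, and specificity (pred.test is introduced for illustration):

# Test-set performance at the same 0.4 cutoff
pred.test <- factor(as.integer(Floor >= 0.4), levels = c(0, 1))
confusionMatrix(pred.test, test.data$result, positive = "1")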