REGRESSION METHODS - LOGISTIC REGRESSION.

CHALLENGER: Space Shuttle Launch Data.

STEP 1: DATA COLLECTION

STEP 2: DATA EXPLORATION AND PREPARATION.

launch <- read.csv("http://www.sci.csueastbay.edu/~esuess/classes/Statistics_6620/Presentations/ml10/challenger.csv")
  • Examine the launch data.
str(launch)
## 'data.frame':    23 obs. of  4 variables:
##  $ distress_ct         : int  0 1 0 0 0 0 0 0 1 1 ...
##  $ temperature         : int  66 70 69 68 67 72 73 70 57 63 ...
##  $ field_check_pressure: int  50 50 50 50 50 50 100 100 200 200 ...
##  $ flight_num          : int  1 2 3 4 5 6 7 8 9 10 ...
  • Logisitic regression.
  • First recode the distress_ct variable into 0 and 1, making 1 to represent at least one failure during a launch.
launch$distress_ct = ifelse(launch$distress_ct<1,0,1)
launch$distress_ct
##  [1] 0 1 0 0 0 0 0 0 1 1 1 0 0 1 0 0 0 0 0 0 1 0 1

STEP 3: TRAINING MODEL ON THE DATA.

  • Set up training and test data sets.
indx = sample(1:nrow(launch), as.integer(0.9*nrow(launch)))
indx
##  [1] 21 10  2  9  5 17 19  6  4  3  8 20 15 13 22 14  7 12 18  1
launch_train = launch[indx,]
launch_test = launch[-indx,]
launch_train_labels = launch[indx,1]
launch_test_labels = launch[-indx,1]  

STEP 4: EVALUATING MODEL PERFORMANCE.

  • Checking for any missing values.
library(Amelia)
## Warning: package 'Amelia' was built under R version 3.3.3
## Loading required package: Rcpp
## Warning: package 'Rcpp' was built under R version 3.3.3
## ## 
## ## Amelia II: Multiple Imputation
## ## (Version 1.7.4, built: 2015-12-05)
## ## Copyright (C) 2005-2017 James Honaker, Gary King and Matthew Blackwell
## ## Refer to http://gking.harvard.edu/amelia/ for more information
## ##
missmap(launch, main = "Missing vs Observed Values")

  • Finding the number of missing values in each column.
sapply(launch,function(x) sum(is.na(x)))
##          distress_ct          temperature field_check_pressure 
##                    0                    0                    0 
##           flight_num 
##                    0
  • Finding the number of unique values in each column.
sapply(launch, function(x) length(unique(x)))
##          distress_ct          temperature field_check_pressure 
##                    2                   16                    3 
##           flight_num 
##                   23

-Fitting the logistic regression model, with all predictor variables.

model <- glm(distress_ct ~.,family=binomial(link='logit'),data=launch_train)
model
## 
## Call:  glm(formula = distress_ct ~ ., family = binomial(link = "logit"), 
##     data = launch_train)
## 
## Coefficients:
##          (Intercept)           temperature  field_check_pressure  
##            12.671747             -0.217126              0.006483  
##           flight_num  
##             0.014191  
## 
## Degrees of Freedom: 19 Total (i.e. Null);  16 Residual
## Null Deviance:       22.49 
## Residual Deviance: 15.85     AIC: 23.85
summary(model)
## 
## Call:
## glm(formula = distress_ct ~ ., family = binomial(link = "logit"), 
##     data = launch_train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.0263  -0.5922  -0.4277  -0.1037   2.1364  
## 
## Coefficients:
##                       Estimate Std. Error z value Pr(>|z|)
## (Intercept)          12.671747  11.954237   1.060    0.289
## temperature          -0.217126   0.171088  -1.269    0.204
## field_check_pressure  0.006483   0.024292   0.267    0.790
## flight_num            0.014191   0.289269   0.049    0.961
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 22.493  on 19  degrees of freedom
## Residual deviance: 15.847  on 16  degrees of freedom
## AIC: 23.847
## 
## Number of Fisher Scoring iterations: 5
anova(model, test="Chisq")
## Analysis of Deviance Table
## 
## Model: binomial, link: logit
## 
## Response: distress_ct
## 
## Terms added sequentially (first to last)
## 
## 
##                      Df Deviance Resid. Df Resid. Dev Pr(>Chi)  
## NULL                                    19     22.493           
## temperature           1   5.9429        18     16.550  0.01478 *
## field_check_pressure  1   0.7009        17     15.850  0.40248  
## flight_num            1   0.0024        16     15.847  0.96072  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

STEP 5: IMPROVING MODEL PERFORMANCE.

  • Dropping the insignificant predictors, alpha = 0.10.
model <- glm(distress_ct ~temperature,family=binomial(link='logit'),data=launch_train)
model
## 
## Call:  glm(formula = distress_ct ~ temperature, family = binomial(link = "logit"), 
##     data = launch_train)
## 
## Coefficients:
## (Intercept)  temperature  
##     14.0894      -0.2227  
## 
## Degrees of Freedom: 19 Total (i.e. Null);  18 Residual
## Null Deviance:       22.49 
## Residual Deviance: 16.55     AIC: 20.55
summary(model)
## 
## Call:
## glm(formula = distress_ct ~ temperature, family = binomial(link = "logit"), 
##     data = launch_train)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -0.93185  -0.71920  -0.40147  -0.03645   2.31709  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)  
## (Intercept)  14.0894     8.0268   1.755   0.0792 .
## temperature  -0.2227     0.1185  -1.880   0.0602 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 22.493  on 19  degrees of freedom
## Residual deviance: 16.551  on 18  degrees of freedom
## AIC: 20.551
## 
## Number of Fisher Scoring iterations: 5
anova(model, test="Chisq")
## Analysis of Deviance Table
## 
## Model: binomial, link: logit
## 
## Response: distress_ct
## 
## Terms added sequentially (first to last)
## 
## 
##             Df Deviance Resid. Df Resid. Dev Pr(>Chi)  
## NULL                           19     22.493           
## temperature  1   5.9429        18     16.550  0.01478 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
  • Checking the accuracy.
fitted.results <- predict(model,newdata=launch_test,type='response')
fitted.results <- ifelse(fitted.results > 0.5,1,0)
misClasificError <- mean(fitted.results != launch_test$distress_ct)
print(paste('Accuracy',1-misClasificError))
## [1] "Accuracy 0.666666666666667"
library(ROCR)
## Warning: package 'ROCR' was built under R version 3.3.3
## Loading required package: gplots
## Warning: package 'gplots' was built under R version 3.3.3
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
## 
##     lowess
p <- predict(model, newdata=launch_test, type="response")
pr <- prediction(p, launch_test$distress_ct)
prf <- performance(pr, measure = "tpr", x.measure = "fpr")
plot(prf)

auc <- performance(pr, measure = "auc")
auc <- auc@y.values[[1]]
auc
## [1] 1