launch <- read.csv("http://www.sci.csueastbay.edu/~esuess/classes/Statistics_6620/Presentations/ml10/challenger.csv")
- Examine the launch data.
str(launch)
## 'data.frame': 23 obs. of 4 variables:
## $ distress_ct : int 0 1 0 0 0 0 0 0 1 1 ...
## $ temperature : int 66 70 69 68 67 72 73 70 57 63 ...
## $ field_check_pressure: int 50 50 50 50 50 50 100 100 200 200 ...
## $ flight_num : int 1 2 3 4 5 6 7 8 9 10 ...
- Logisitic regression.
- First recode the distress_ct variable into 0 and 1, making 1 to represent at least one failure during a launch.
launch$distress_ct = ifelse(launch$distress_ct<1,0,1)
launch$distress_ct
## [1] 0 1 0 0 0 0 0 0 1 1 1 0 0 1 0 0 0 0 0 0 1 0 1
- Set up training and test data sets.
indx = sample(1:nrow(launch), as.integer(0.9*nrow(launch)))
indx
## [1] 21 10 2 9 5 17 19 6 4 3 8 20 15 13 22 14 7 12 18 1
launch_train = launch[indx,]
launch_test = launch[-indx,]
launch_train_labels = launch[indx,1]
launch_test_labels = launch[-indx,1]
- Checking for any missing values.
library(Amelia)
## Warning: package 'Amelia' was built under R version 3.3.3
## Loading required package: Rcpp
## Warning: package 'Rcpp' was built under R version 3.3.3
## ##
## ## Amelia II: Multiple Imputation
## ## (Version 1.7.4, built: 2015-12-05)
## ## Copyright (C) 2005-2017 James Honaker, Gary King and Matthew Blackwell
## ## Refer to http://gking.harvard.edu/amelia/ for more information
## ##
missmap(launch, main = "Missing vs Observed Values")
- Finding the number of missing values in each column.
sapply(launch,function(x) sum(is.na(x)))
## distress_ct temperature field_check_pressure
## 0 0 0
## flight_num
## 0
- Finding the number of unique values in each column.
sapply(launch, function(x) length(unique(x)))
## distress_ct temperature field_check_pressure
## 2 16 3
## flight_num
## 23
-Fitting the logistic regression model, with all predictor variables.
model <- glm(distress_ct ~.,family=binomial(link='logit'),data=launch_train)
model
##
## Call: glm(formula = distress_ct ~ ., family = binomial(link = "logit"),
## data = launch_train)
##
## Coefficients:
## (Intercept) temperature field_check_pressure
## 12.671747 -0.217126 0.006483
## flight_num
## 0.014191
##
## Degrees of Freedom: 19 Total (i.e. Null); 16 Residual
## Null Deviance: 22.49
## Residual Deviance: 15.85 AIC: 23.85
summary(model)
##
## Call:
## glm(formula = distress_ct ~ ., family = binomial(link = "logit"),
## data = launch_train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.0263 -0.5922 -0.4277 -0.1037 2.1364
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 12.671747 11.954237 1.060 0.289
## temperature -0.217126 0.171088 -1.269 0.204
## field_check_pressure 0.006483 0.024292 0.267 0.790
## flight_num 0.014191 0.289269 0.049 0.961
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 22.493 on 19 degrees of freedom
## Residual deviance: 15.847 on 16 degrees of freedom
## AIC: 23.847
##
## Number of Fisher Scoring iterations: 5
anova(model, test="Chisq")
## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: distress_ct
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL 19 22.493
## temperature 1 5.9429 18 16.550 0.01478 *
## field_check_pressure 1 0.7009 17 15.850 0.40248
## flight_num 1 0.0024 16 15.847 0.96072
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
- Dropping the insignificant predictors, alpha = 0.10.
model <- glm(distress_ct ~temperature,family=binomial(link='logit'),data=launch_train)
model
##
## Call: glm(formula = distress_ct ~ temperature, family = binomial(link = "logit"),
## data = launch_train)
##
## Coefficients:
## (Intercept) temperature
## 14.0894 -0.2227
##
## Degrees of Freedom: 19 Total (i.e. Null); 18 Residual
## Null Deviance: 22.49
## Residual Deviance: 16.55 AIC: 20.55
summary(model)
##
## Call:
## glm(formula = distress_ct ~ temperature, family = binomial(link = "logit"),
## data = launch_train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.93185 -0.71920 -0.40147 -0.03645 2.31709
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 14.0894 8.0268 1.755 0.0792 .
## temperature -0.2227 0.1185 -1.880 0.0602 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 22.493 on 19 degrees of freedom
## Residual deviance: 16.551 on 18 degrees of freedom
## AIC: 20.551
##
## Number of Fisher Scoring iterations: 5
anova(model, test="Chisq")
## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: distress_ct
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL 19 22.493
## temperature 1 5.9429 18 16.550 0.01478 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
- Checking the accuracy.
fitted.results <- predict(model,newdata=launch_test,type='response')
fitted.results <- ifelse(fitted.results > 0.5,1,0)
misClasificError <- mean(fitted.results != launch_test$distress_ct)
print(paste('Accuracy',1-misClasificError))
## [1] "Accuracy 0.666666666666667"
library(ROCR)
## Warning: package 'ROCR' was built under R version 3.3.3
## Loading required package: gplots
## Warning: package 'gplots' was built under R version 3.3.3
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
p <- predict(model, newdata=launch_test, type="response")
pr <- prediction(p, launch_test$distress_ct)
prf <- performance(pr, measure = "tpr", x.measure = "fpr")
plot(prf)
auc <- performance(pr, measure = "auc")
auc <- auc@y.values[[1]]
auc
## [1] 1