Load the data

data.joined <- readRDS(file="/home/saqib/ml_at_berkeley/CSX460/04-logistic-regression/04-exercise-nycflights-logistic/flightdata.Rda")

Add a categorical variable for arr_delay >= 22 minutes. It is called arrival_delayed

data.joined$arrival_delayed <- factor(ifelse(data.joined$arr_delay >= 22, 1,0))

Filter out rows with NAs for arrival_delayed

data.joined <- data.joined  %>% filter(!is.na(arrival_delayed))   

#data.joined <- data.joined[, speed:=NULL]

Split the Dataset in Training and Test datasets

data.joined.training <- sample_frac(data.joined, .75)
data.joined.testing <- sample_frac(data.joined, .5)

Exercise 1: caret/logistic regression (5 points)

Rebuild your logistic regression model from the previous week, this time using the caret package.

Show all work.

Train the Linear Regression Model using CARET

# Your Work Here

#data.joined <- data.joined[, speed:=NULL]


#lapply(data.joined, levels)
#(l <- sapply(data.joined, function(x) is.factor(x)))

fitControl <- trainControl(method = "cv", number = 2)

glmFit <- train(arrival_delayed ~ dep_delay + dest + origin + year + month + day + hour + sched_dep_time + sched_arr_time + carrier + distance + year.pl + type  + engines + seats + engine + temp + dewp + humid + wind_dir + wind_speed + wind_gust +precip + pressure + visib + lat + lon + alt + tz  + lat.dest + lon.dest + alt.dest + tz.dest, data=data.joined, method = "glm", trControl = fitControl)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

Model Performance

pred <- as.vector(ifelse(predict(glmFit, newdata=data.joined, type="prob")[,"1"]<.5, 0, 1))
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
probsTest <- predict(glmFit, data.joined.testing, type = "prob")
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
pred      <- factor( ifelse(probsTest[, "1"] > 0.5, "1", "0") )
confusionMatrix(pred, data.joined.testing$arrival_delayed)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     0     1
##          0 93679  6544
##          1  1896 14703
##                                           
##                Accuracy : 0.9278          
##                  95% CI : (0.9263, 0.9292)
##     No Information Rate : 0.8181          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.7347          
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.9802          
##             Specificity : 0.6920          
##          Pos Pred Value : 0.9347          
##          Neg Pred Value : 0.8858          
##              Prevalence : 0.8181          
##          Detection Rate : 0.8019          
##    Detection Prevalence : 0.8579          
##       Balanced Accuracy : 0.8361          
##                                           
##        'Positive' Class : 0               
## 

Plot the ROC Curve

library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
probsTrain <- predict(glmFit, data.joined.training, type = "prob")
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
rocCurve   <- roc(response = data.joined.training$arrival_delayed,
                      predictor = probsTrain[, "1"],
                      levels = rev(levels(data.joined.training$arrival_delayed)))
plot(rocCurve, print.thres = "best")

Exercise 2: caret/rpart (5 points)

Using the caret and rpart packages, create a classification model for flight delays using your NYC FLight data. Your solution should include:

Show and describe all work

# Your Work Here

fitControl <- trainControl(method = "cv", number = 2)

rpartFit <- train(arrival_delayed ~ dep_delay + dest + origin + year + month + day + hour + sched_dep_time + sched_arr_time + carrier + distance + year.pl + type  + engines + seats + engine + temp + dewp + humid + wind_dir + wind_speed + wind_gust +precip + pressure + visib + lat + lon + alt + tz  + lat.dest + lon.dest + alt.dest + tz.dest, data=data.joined, method = "rpart", trControl = fitControl)

Model Performance

pred <- as.vector(ifelse(predict(rpartFit, newdata=data.joined, type="prob")[,"1"]<.5, 0, 1))
probsTest <- predict(rpartFit, data.joined.testing, type = "prob")
pred      <- factor( ifelse(probsTest[, "1"] > 0.5, "1", "0") )
confusionMatrix(pred, data.joined.testing$arrival_delayed)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     0     1
##          0 94087  7040
##          1  1488 14207
##                                           
##                Accuracy : 0.927           
##                  95% CI : (0.9255, 0.9285)
##     No Information Rate : 0.8181          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.727           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.9844          
##             Specificity : 0.6687          
##          Pos Pred Value : 0.9304          
##          Neg Pred Value : 0.9052          
##              Prevalence : 0.8181          
##          Detection Rate : 0.8054          
##    Detection Prevalence : 0.8657          
##       Balanced Accuracy : 0.8265          
##                                           
##        'Positive' Class : 0               
## 

Decision Tree Plot

library(rpart.plot)


rpart.plot(rpartFit$finalModel)

Questions:

  • Discuss the difference between the models and why you would use one model over the other?

Logistic Regression performed better than Decision Trees.