data.joined <- readRDS(file="/home/saqib/ml_at_berkeley/CSX460/04-logistic-regression/04-exercise-nycflights-logistic/flightdata.Rda")
data.joined$arrival_delayed <- factor(ifelse(data.joined$arr_delay >= 22, 1,0))
data.joined <- data.joined %>% filter(!is.na(arrival_delayed))
#data.joined <- data.joined[, speed:=NULL]
data.joined.training <- sample_frac(data.joined, .75)
data.joined.testing <- sample_frac(data.joined, .5)
Rebuild your logistic regression model from the previous week, this time using the caret package.
Show all work.
# Your Work Here
#data.joined <- data.joined[, speed:=NULL]
#lapply(data.joined, levels)
#(l <- sapply(data.joined, function(x) is.factor(x)))
fitControl <- trainControl(method = "cv", number = 2)
glmFit <- train(arrival_delayed ~ dep_delay + dest + origin + year + month + day + hour + sched_dep_time + sched_arr_time + carrier + distance + year.pl + type + engines + seats + engine + temp + dewp + humid + wind_dir + wind_speed + wind_gust +precip + pressure + visib + lat + lon + alt + tz + lat.dest + lon.dest + alt.dest + tz.dest, data=data.joined, method = "glm", trControl = fitControl)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
pred <- as.vector(ifelse(predict(glmFit, newdata=data.joined, type="prob")[,"1"]<.5, 0, 1))
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
probsTest <- predict(glmFit, data.joined.testing, type = "prob")
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
pred <- factor( ifelse(probsTest[, "1"] > 0.5, "1", "0") )
confusionMatrix(pred, data.joined.testing$arrival_delayed)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 93679 6544
## 1 1896 14703
##
## Accuracy : 0.9278
## 95% CI : (0.9263, 0.9292)
## No Information Rate : 0.8181
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.7347
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.9802
## Specificity : 0.6920
## Pos Pred Value : 0.9347
## Neg Pred Value : 0.8858
## Prevalence : 0.8181
## Detection Rate : 0.8019
## Detection Prevalence : 0.8579
## Balanced Accuracy : 0.8361
##
## 'Positive' Class : 0
##
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
probsTrain <- predict(glmFit, data.joined.training, type = "prob")
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
rocCurve <- roc(response = data.joined.training$arrival_delayed,
predictor = probsTrain[, "1"],
levels = rev(levels(data.joined.training$arrival_delayed)))
plot(rocCurve, print.thres = "best")
Using the caret and rpart packages, create a classification model for flight delays using your NYC FLight data. Your solution should include:
caret and rpart to train a model.Show and describe all work
# Your Work Here
fitControl <- trainControl(method = "cv", number = 2)
rpartFit <- train(arrival_delayed ~ dep_delay + dest + origin + year + month + day + hour + sched_dep_time + sched_arr_time + carrier + distance + year.pl + type + engines + seats + engine + temp + dewp + humid + wind_dir + wind_speed + wind_gust +precip + pressure + visib + lat + lon + alt + tz + lat.dest + lon.dest + alt.dest + tz.dest, data=data.joined, method = "rpart", trControl = fitControl)
pred <- as.vector(ifelse(predict(rpartFit, newdata=data.joined, type="prob")[,"1"]<.5, 0, 1))
probsTest <- predict(rpartFit, data.joined.testing, type = "prob")
pred <- factor( ifelse(probsTest[, "1"] > 0.5, "1", "0") )
confusionMatrix(pred, data.joined.testing$arrival_delayed)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 94087 7040
## 1 1488 14207
##
## Accuracy : 0.927
## 95% CI : (0.9255, 0.9285)
## No Information Rate : 0.8181
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.727
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.9844
## Specificity : 0.6687
## Pos Pred Value : 0.9304
## Neg Pred Value : 0.9052
## Prevalence : 0.8181
## Detection Rate : 0.8054
## Detection Prevalence : 0.8657
## Balanced Accuracy : 0.8265
##
## 'Positive' Class : 0
##
library(rpart.plot)
rpart.plot(rpartFit$finalModel)
Logistic Regression performed better than Decision Trees.