Scotty has turned out to be a very popular service in Turkey! This is a good thing, but it also brings a new problem: “There are no drivers!” Demand began to overload the service in some regions at some hours, and there were not enough drivers at those times and places. Fortunately, we can use a classification model to predict which region-hour combinations are at risk of this “no drivers” problem.
Based on the data in “scotty-cl-cov/data/data-train.csv” (up to Saturday, December 2nd 2017), we will build a prediction model whose report is evaluated on the next 7 days (Sunday, December 3rd 2017 to Saturday, December 9th 2017). The prediction will give the predicted coverage status for each hour and each area: “sufficient” or “insufficient”.
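The wrangling and modeling below rely on several packages. The original setup chunk is not shown, so here is a minimal setup sketch, with the package list inferred from the function calls used throughout this report:
# Setup sketch: packages inferred from the calls below, not an exact copy of
# the original setup chunk.
library(tidyverse)   # dplyr/ggplot2 pipelines and plots
library(lubridate)   # ymd_hms(), day(), wday(), hour()
library(padr)        # thicken(), pad(), fill_by_value()
library(rsample)     # initial_split(), training(), testing()
library(recipes)     # recipe(), step_*(), prep(), juice(), bake()
library(keras)       # neural network layers and training
library(caret)       # confusionMatrix()
# Read the training data (path as stated above)
data <- read.csv("scotty-cl-cov/data/data-train.csv")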
# Preprocessing the data frame
sxk3 <- data %>%
  filter(src_area == "sxk3") %>%
  mutate(id = as.character(id),
         trip_id = as.character(trip_id),
         start_time = ymd_hms(start_time),
         status = ifelse(status == "nodrivers", 1, 0)) %>%
  select(src_area, start_time, status) %>%
  thicken(interval = "hour") %>%
  group_by(src_area, start_time_hour) %>%
  summarise(status = sum(status)) %>%
  ungroup() %>%
  pad() %>%
  fill_by_value(src_area, value = "sxk3") %>%
  fill_by_value(status, value = 0)
## pad applied on the interval: hour
sxk8 <- data %>%
  filter(src_area == "sxk8") %>%
  mutate(id = as.character(id),
         trip_id = as.character(trip_id),
         start_time = ymd_hms(start_time),
         status = ifelse(status == "nodrivers", 1, 0)) %>%
  select(src_area, start_time, status) %>%
  thicken(interval = "hour") %>%
  group_by(src_area, start_time_hour) %>%
  summarise(status = sum(status)) %>%
  ungroup() %>%
  pad() %>%
  fill_by_value(src_area, value = "sxk8") %>%
  fill_by_value(status, value = 0)
## pad applied on the interval: hour
sxk9 <- data %>%
  filter(src_area == "sxk9") %>%
  mutate(id = as.character(id),
         trip_id = as.character(trip_id),
         start_time = ymd_hms(start_time),
         status = ifelse(status == "nodrivers", 1, 0)) %>%
  select(src_area, start_time, status) %>%
  thicken(interval = "hour") %>%
  group_by(src_area, start_time_hour) %>%
  summarise(status = sum(status)) %>%
  ungroup() %>%
  pad() %>%
  fill_by_value(src_area, value = "sxk9") %>%
  fill_by_value(status, value = 0)
## pad applied on the interval: hour
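The three pipelines above differ only in the area code. A hypothetical helper like preprocess_area() below (the function name is ours, not from the original report) would remove the duplication:
# Sketch: one function covering the per-area preprocessing above.
preprocess_area <- function(data, area) {
  data %>%
    filter(src_area == area) %>%
    mutate(start_time = ymd_hms(start_time),
           status = ifelse(status == "nodrivers", 1, 0)) %>%
    select(src_area, start_time, status) %>%
    thicken(interval = "hour") %>%
    group_by(src_area, start_time_hour) %>%
    summarise(status = sum(status)) %>%
    ungroup() %>%
    pad() %>%
    fill_by_value(src_area, value = area) %>%
    fill_by_value(status, value = 0)
}
# e.g. sxk3 <- preprocess_area(data, "sxk3")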
# Drop the first 168 hourly rows (one week) of each area, then stack the areas
df1 <- rbind(tail(sxk3, -168),
             tail(sxk8, -168),
             tail(sxk9, -168))
# Extract calendar features and binarize the hourly status
df2 <- df1 %>%
  mutate(start_time = start_time_hour) %>%
  mutate(date = as.factor(day(start_time)),
         day = as.factor(wday(start_time, label = TRUE)),
         hour = as.factor(hour(start_time))) %>%
  select(src_area, date, day, hour, status)
df2$status <- as.factor(ifelse(df2$status > 0, "nodrivers", "confirmed"))
df <- df2
# Evaluation data: the last 7 days (24 * 7 hourly rows) of each area
sxk3.eval <- df %>%
  filter(src_area == "sxk3") %>%
  group_by(day, hour) %>%
  tail(24 * 7)
sxk8.eval <- df %>%
  filter(src_area == "sxk8") %>%
  group_by(day, hour) %>%
  tail(24 * 7)
sxk9.eval <- df %>%
  filter(src_area == "sxk9") %>%
  group_by(day, hour) %>%
  tail(24 * 7)
df.eval <- rbind(sxk3.eval, sxk8.eval, sxk9.eval)
# Plot each source area against its total number of "nodrivers" orders
area.nodriver <- df %>%
  group_by(src_area) %>%
  count(status) %>%
  ungroup() %>%
  filter(status == "nodrivers")
area_plot <- ggplot(area.nodriver, aes(x = src_area, y = n)) +
  geom_col(aes(color = src_area, fill = src_area)) +
  geom_point() +
  theme_minimal()
area_plot
# Plot each hour of the day against the total number of "nodrivers" orders
hour.nodriver <- df %>%
  group_by(hour) %>%
  count(status) %>%
  ungroup() %>%
  filter(status == "nodrivers")
hour_plot <- ggplot(hour.nodriver, aes(x = hour, y = n)) +
  geom_col(aes(fill = hour)) +
  geom_point() +
  theme_minimal()
hour_plot
# Plot each day of the week against the total number of "nodrivers" orders
day.nodriver <- df %>%
  group_by(day) %>%
  count(status) %>%
  ungroup() %>%
  filter(status == "nodrivers")
day_plot <- ggplot(day.nodriver, aes(x = day, y = n)) +
  geom_col(aes(fill = day)) +
  geom_point() +
  theme_minimal()
day_plot
# Plot each day of the month against the total number of "nodrivers" orders
date.nodriver <- df %>%
  group_by(date) %>%
  count(status) %>%
  ungroup() %>%
  filter(status == "nodrivers")
date_plot <- ggplot(date.nodriver, aes(x = date, y = n)) +
  geom_col(aes(fill = date)) +
  theme_minimal()
date_plot
# create initial split
set.seed(182)
test.split <- initial_split(df, prop = 0.95, strata = "status")
# create validation split
set.seed(182)
val.split <- initial_split(training(test.split), prop = 0.95, strata = "status")
# quick check
test.split
## <3831/201/4032>
# define preprocess recipe from train dataset
rec <- recipe(status ~ ., data = training(val.split)) %>%
  # step_rm() %>%
  # step_nzv(all_predictors()) %>% # (skipped: it would remove src_area, which we want to keep)
  step_string2factor(all_nominal(), -status) %>%
  step_string2factor(status, levels = c("confirmed", "nodrivers")) %>%
  step_upsample(status, ratio = 1/1, seed = 182) %>%
  step_dummy(all_nominal(), -status, one_hot = FALSE) %>%
  prep(strings_as_factors = FALSE)
# get train-val-test dataset
data_train <- juice(rec)
data_val <- bake(rec, testing(val.split))
data_test <- bake(rec, testing(test.split))
# prepare train arrays
data_train_y <- to_categorical(as.numeric(data_train$status) - 1)
data_train_x <- data_train %>%
select(-status) %>%
data.matrix()
# prepare validation arrays
data_val_y <- to_categorical(as.numeric(data_val$status) - 1)
data_val_x <- data_val %>%
select(-status) %>%
data.matrix()
# prepare test arrays
data_test_y <- to_categorical(as.numeric(data_test$status) - 1)
data_test_x <- data_test %>%
select(-status) %>%
data.matrix()
# quick check
dim(data_train_x)
## [1] 3840 61
dim(data_train_y)
## [1] 3840 2
# define input
input <- layer_input(name = "input", shape = ncol(data_train_x))
# define hidden layers
hiddens <- input %>%
  layer_dense(name = "dense_1", units = 64) %>%
  layer_activation_leaky_relu(name = "dense_1_act") %>%
  layer_batch_normalization(name = "dense_1_bn") %>%
  layer_dropout(name = "dense_1_dp", rate = 0.15) %>%
  layer_dense(name = "dense_2", units = 32) %>%
  layer_activation_leaky_relu(name = "dense_2_act") %>%
  layer_batch_normalization(name = "dense_2_bn") %>%
  layer_dropout(name = "dense_2_dp", rate = 0.15)
# define output
output <- hiddens %>%
  layer_dense(name = "output", units = ncol(data_train_y)) %>%
  layer_batch_normalization(name = "output_bn") %>%
  layer_activation(name = "output_act", activation = "sigmoid")
# define full model
model <- keras_model(inputs = input, outputs = output)
# compile the model
model %>% compile(
  optimizer = optimizer_adam(lr = 0.001),
  metrics = "accuracy",
  loss = "binary_crossentropy"
)
# model summary
summary(model)
## Model: "model"
## ___________________________________________________________________________
## Layer (type) Output Shape Param #
## ===========================================================================
## input (InputLayer) [(None, 61)] 0
## ___________________________________________________________________________
## dense_1 (Dense) (None, 64) 3968
## ___________________________________________________________________________
## dense_1_act (LeakyReLU) (None, 64) 0
## ___________________________________________________________________________
## dense_1_bn (BatchNormalization) (None, 64) 256
## ___________________________________________________________________________
## dense_1_dp (Dropout) (None, 64) 0
## ___________________________________________________________________________
## dense_2 (Dense) (None, 32) 2080
## ___________________________________________________________________________
## dense_2_act (LeakyReLU) (None, 32) 0
## ___________________________________________________________________________
## dense_2_bn (BatchNormalization) (None, 32) 128
## ___________________________________________________________________________
## dense_2_dp (Dropout) (None, 32) 0
## ___________________________________________________________________________
## output (Dense) (None, 2) 66
## ___________________________________________________________________________
## output_bn (BatchNormalization) (None, 2) 8
## ___________________________________________________________________________
## output_act (Activation) (None, 2) 0
## ===========================================================================
## Total params: 6,506
## Trainable params: 6,310
## Non-trainable params: 196
## ___________________________________________________________________________
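Note the output head: two sigmoid units trained with binary cross-entropy against one-hot targets. A more conventional head for a one-hot two-class target is a softmax with categorical cross-entropy; a sketch of that alternative (not the model trained in this report) would be:
# Alternative head (sketch only; the trained model above uses sigmoid)
output_softmax <- hiddens %>%
  layer_dense(name = "output_softmax", units = ncol(data_train_y),
              activation = "softmax")
model_softmax <- keras_model(inputs = input, outputs = output_softmax)
model_softmax %>% compile(
  optimizer = optimizer_adam(lr = 0.001),
  metrics = "accuracy",
  loss = "categorical_crossentropy"
)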
# callbacks
callbacks <- callback_tensorboard("logs/run_c")
# fit the model
history <- model %>% fit(
  x = data_train_x,
  y = data_train_y,
  batch_size = 64,
  epochs = 100,
  validation_data = list(
    data_val_x,
    data_val_y
  ),
  callbacks = callbacks
)
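Training runs for a fixed 100 epochs here. If overfitting were a concern, early stopping on validation loss could be added to the callback list; a sketch with assumed, untuned settings:
# Sketch: early stopping on validation loss (patience is an assumed value,
# not tuned in this report; callbacks_es is a hypothetical name).
callbacks_es <- list(
  callback_tensorboard("logs/run_c"),
  callback_early_stopping(monitor = "val_loss", patience = 10,
                          restore_best_weights = TRUE)
)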
# plot history
plot(history)
# predict on test
pred_test <- as_tibble(predict(model, data_test_x)) %>%
  set_names(levels(data_train$status)) %>%
  mutate(class = ifelse(nodrivers > 0.5, "nodrivers", "confirmed")) %>%
  mutate(class = factor(class, levels = levels(data_train$status))) %>%
  set_names(paste0(".pred_", colnames(.)))
# combine with test dataset
pred_test <- data_test %>%
  select(status) %>%
  bind_cols(pred_test)
prop.table(table(pred_test$.pred_class))
##
## confirmed nodrivers
## 0.4278607 0.5721393
cmatrix <- confusionMatrix(pred_test$.pred_class, pred_test$status, positive = "confirmed")
cmatrix
## Confusion Matrix and Statistics
##
## Reference
## Prediction confirmed nodrivers
## confirmed 63 23
## nodrivers 32 83
##
## Accuracy : 0.7264
## 95% CI : (0.6592, 0.7867)
## No Information Rate : 0.5274
## P-Value [Acc > NIR] : 6.066e-09
##
## Kappa : 0.4484
##
## Mcnemar's Test P-Value : 0.2807
##
## Sensitivity : 0.6632
## Specificity : 0.7830
## Pos Pred Value : 0.7326
## Neg Pred Value : 0.7217
## Prevalence : 0.4726
## Detection Rate : 0.3134
## Detection Prevalence : 0.4279
## Balanced Accuracy : 0.7231
##
## 'Positive' Class : confirmed
##
We also try a naive Bayes model, since all variables are factors and we can work under the assumption that every variable is important and conditionally independent given the class.

### Modeling
library(e1071)
# Train naive Bayes with (laplace = 1) and without (laplace = 0) smoothing
model_naive <- naiveBayes(data_train_x, y = data_train$status, laplace = 1)
model_naive2 <- naiveBayes(data_train_x, y = data_train$status, laplace = 0)
pred.naive <- predict(model_naive, newdata = data_test_x, type = "class")
pred.naive2 <- predict(model_naive2, newdata = data_test_x, type = "class")
cmatrix.nb <- confusionMatrix(pred.naive, data_test$status, positive = "confirmed")
cmatrix.nb2 <- confusionMatrix(pred.naive2, data_test$status, positive = "confirmed")
cmatrix.nb
## Confusion Matrix and Statistics
##
## Reference
## Prediction confirmed nodrivers
## confirmed 63 28
## nodrivers 32 78
##
## Accuracy : 0.7015
## 95% CI : (0.6331, 0.7638)
## No Information Rate : 0.5274
## P-Value [Acc > NIR] : 3.632e-07
##
## Kappa : 0.3999
##
## Mcnemar's Test P-Value : 0.6985
##
## Sensitivity : 0.6632
## Specificity : 0.7358
## Pos Pred Value : 0.6923
## Neg Pred Value : 0.7091
## Prevalence : 0.4726
## Detection Rate : 0.3134
## Detection Prevalence : 0.4527
## Balanced Accuracy : 0.6995
##
## 'Positive' Class : confirmed
##
cmatrix.nb2
## Confusion Matrix and Statistics
##
## Reference
## Prediction confirmed nodrivers
## confirmed 63 28
## nodrivers 32 78
##
## Accuracy : 0.7015
## 95% CI : (0.6331, 0.7638)
## No Information Rate : 0.5274
## P-Value [Acc > NIR] : 3.632e-07
##
## Kappa : 0.3999
##
## Mcnemar's Test P-Value : 0.6985
##
## Sensitivity : 0.6632
## Specificity : 0.7358
## Pos Pred Value : 0.6923
## Neg Pred Value : 0.7091
## Prevalence : 0.4726
## Detection Rate : 0.3134
## Detection Prevalence : 0.4527
## Balanced Accuracy : 0.6995
##
## 'Positive' Class : confirmed
##
cmatrix$overall["Accuracy"]
## Accuracy
## 0.7263682
cmatrix.nb$overall["Accuracy"]
## Accuracy
## 0.7014925
cmatrix.nb2$overall["Accuracy"]
## Accuracy
## 0.7014925
Comparing the metrics above (accuracy, sensitivity, specificity, and positive predictive value), we choose the neural network as the best model.
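For a side-by-side view, the headline metrics can be gathered from the confusion-matrix objects computed above; a sketch (the tibble layout and model_compare name are ours, not from the original report):
# Sketch: collect headline metrics from the three confusion matrices above.
model_compare <- tibble(
  model       = c("neural network",
                  "naive bayes (laplace = 1)",
                  "naive bayes (laplace = 0)"),
  accuracy    = c(cmatrix$overall["Accuracy"],
                  cmatrix.nb$overall["Accuracy"],
                  cmatrix.nb2$overall["Accuracy"]),
  sensitivity = c(cmatrix$byClass["Sensitivity"],
                  cmatrix.nb$byClass["Sensitivity"],
                  cmatrix.nb2$byClass["Sensitivity"]),
  specificity = c(cmatrix$byClass["Specificity"],
                  cmatrix.nb$byClass["Specificity"],
                  cmatrix.nb2$byClass["Specificity"]),
  pos_pred    = c(cmatrix$byClass["Pos Pred Value"],
                  cmatrix.nb$byClass["Pos Pred Value"],
                  cmatrix.nb2$byClass["Pos Pred Value"])
)
model_compare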
# Rebuild the evaluation features and score them with the neural network
eval.test <- df.eval %>%
  ungroup() %>%
  select(src_area, date, day, hour) %>%
  mutate(hour = as.factor(hour),
         status = as.factor(0)) # dummy placeholder so bake() finds the column
eval.test <- bake(rec, eval.test) %>%
  select(-status) %>%
  data.matrix()
pred.eval <- predict(model, eval.test, type = "raw")
pred.eval.r <- as.factor(ifelse(pred.eval[, 2] > 0.5, "nodrivers", "confirmed"))
cmatrix.eval <- confusionMatrix(pred.eval.r, df.eval$status, positive = "confirmed")
cmatrix.eval
## Confusion Matrix and Statistics
##
## Reference
## Prediction confirmed nodrivers
## confirmed 226 32
## nodrivers 53 193
##
## Accuracy : 0.8313
## 95% CI : (0.7957, 0.863)
## No Information Rate : 0.5536
## P-Value [Acc > NIR] : < 2e-16
##
## Kappa : 0.6618
##
## Mcnemar's Test P-Value : 0.03006
##
## Sensitivity : 0.8100
## Specificity : 0.8578
## Pos Pred Value : 0.8760
## Neg Pred Value : 0.7846
## Prevalence : 0.5536
## Detection Rate : 0.4484
## Detection Prevalence : 0.5119
## Balanced Accuracy : 0.8339
##
## 'Positive' Class : confirmed
##
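Finally, the evaluation-week predictions can be rewritten in the deliverable's terms, mapping "confirmed" to sufficient coverage and "nodrivers" to insufficient. A sketch (the coverage column and coverage_report name are assumptions, not from the original report):
# Sketch: final hourly coverage report for the evaluation week.
coverage_report <- df.eval %>%
  ungroup() %>%
  select(src_area, date, day, hour) %>%
  mutate(coverage = ifelse(pred.eval.r == "nodrivers",
                           "insufficient", "sufficient"))
head(coverage_report)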