Scotty has turned out to be a very popular service in Turkey! This is a good thing, but it also brings a new problem: “There are no drivers!” Demand began to overload the service in some regions at some hours, and there were not enough drivers at those times and places. Fortunately, we can use a classification model to predict which region-hour combinations are at risk of this “no drivers” problem.
Based on the data in “scotty-cl-cov/data/data-train.csv” (up to Saturday, December 2nd 2017), we will build a prediction model whose report is evaluated on the next 7 days (Sunday, December 3rd 2017 to Saturday, December 9th 2017). The prediction will give the predicted coverage status for each hour and each area: “sufficient” or “insufficient”.
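The wrangling and modeling below rely on several packages. The original setup chunk is not shown, so here is a minimal setup sketch, with the package list inferred from the function calls used throughout this report:
# Setup sketch: packages inferred from the calls below, not an exact copy of
# the original setup chunk.
library(tidyverse)   # dplyr/ggplot2 pipelines and plots
library(lubridate)   # ymd_hms(), day(), wday(), hour()
library(padr)        # thicken(), pad(), fill_by_value()
library(rsample)     # initial_split(), training(), testing()
library(recipes)     # recipe(), step_*(), prep(), juice(), bake()
library(keras)       # neural network layers and training
library(caret)       # confusionMatrix()
# Read the training data (path as stated above)
data <- read.csv("scotty-cl-cov/data/data-train.csv")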
# Preprocessing the data frame
sxk3 <- data %>%
  filter(src_area == "sxk3") %>%
  mutate(id = as.character(id),
         trip_id = as.character(trip_id),
         start_time = ymd_hms(start_time),
         status = ifelse(status == "nodrivers", 1, 0)) %>%
  select(src_area, start_time, status) %>%
  thicken(interval = "hour") %>%
  group_by(src_area, start_time_hour) %>%
  summarise(status = sum(status)) %>%
  ungroup() %>%
  pad() %>%
  fill_by_value(src_area, value = "sxk3") %>%
  fill_by_value(status, value = 0)
## pad applied on the interval: hour
sxk8 <- data %>%
  filter(src_area == "sxk8") %>%
  mutate(id = as.character(id),
         trip_id = as.character(trip_id),
         start_time = ymd_hms(start_time),
         status = ifelse(status == "nodrivers", 1, 0)) %>%
  select(src_area, start_time, status) %>%
  thicken(interval = "hour") %>%
  group_by(src_area, start_time_hour) %>%
  summarise(status = sum(status)) %>%
  ungroup() %>%
  pad() %>%
  fill_by_value(src_area, value = "sxk8") %>%
  fill_by_value(status, value = 0)
## pad applied on the interval: hour
sxk9 <- data %>%
  filter(src_area == "sxk9") %>%
  mutate(id = as.character(id),
         trip_id = as.character(trip_id),
         start_time = ymd_hms(start_time),
         status = ifelse(status == "nodrivers", 1, 0)) %>%
  select(src_area, start_time, status) %>%
  thicken(interval = "hour") %>%
  group_by(src_area, start_time_hour) %>%
  summarise(status = sum(status)) %>%
  ungroup() %>%
  pad() %>%
  fill_by_value(src_area, value = "sxk9") %>%
  fill_by_value(status, value = 0)
## pad applied on the interval: hour
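The three pipelines above differ only in the area code. A hypothetical helper like preprocess_area() below (the function name is ours, not from the original report) would remove the duplication:
# Sketch: one function covering the per-area preprocessing above.
preprocess_area <- function(data, area) {
  data %>%
    filter(src_area == area) %>%
    mutate(start_time = ymd_hms(start_time),
           status = ifelse(status == "nodrivers", 1, 0)) %>%
    select(src_area, start_time, status) %>%
    thicken(interval = "hour") %>%
    group_by(src_area, start_time_hour) %>%
    summarise(status = sum(status)) %>%
    ungroup() %>%
    pad() %>%
    fill_by_value(src_area, value = area) %>%
    fill_by_value(status, value = 0)
}
# e.g. sxk3 <- preprocess_area(data, "sxk3")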
# Drop the first 168 hourly rows (one week) of each area, then stack the areas
df1 <- rbind(tail(sxk3, -168),
             tail(sxk8, -168),
             tail(sxk9, -168))
# Extract calendar features and binarize the hourly status
df2 <- df1 %>%
  mutate(start_time = start_time_hour) %>%
  mutate(date = as.factor(day(start_time)),
         day = as.factor(wday(start_time, label = TRUE)),
         hour = as.factor(hour(start_time))) %>%
  select(src_area, date, day, hour, status)
df2$status <- as.factor(ifelse(df2$status > 0, "nodrivers", "confirmed"))
df <- df2
# Evaluation data: the last 7 days (24 * 7 hourly rows) of each area
sxk3.eval <- df %>%
  filter(src_area == "sxk3") %>%
  group_by(day, hour) %>%
  tail(24 * 7)
sxk8.eval <- df %>%
  filter(src_area == "sxk8") %>%
  group_by(day, hour) %>%
  tail(24 * 7)
sxk9.eval <- df %>%
  filter(src_area == "sxk9") %>%
  group_by(day, hour) %>%
  tail(24 * 7)
df.eval <- rbind(sxk3.eval, sxk8.eval, sxk9.eval)
# Plot each source area against its total number of "nodrivers" orders
area.nodriver <- df %>%
  group_by(src_area) %>%
  count(status) %>%
  ungroup() %>%
  filter(status == "nodrivers")
area_plot <- ggplot(area.nodriver, aes(x = src_area, y = n)) +
  geom_col(aes(color = src_area, fill = src_area)) +
  geom_point() +
  theme_minimal()
area_plot
# Plot each hour of the day against the total number of "nodrivers" orders
hour.nodriver <- df %>%
  group_by(hour) %>%
  count(status) %>%
  ungroup() %>%
  filter(status == "nodrivers")
hour_plot <- ggplot(hour.nodriver, aes(x = hour, y = n)) +
  geom_col(aes(fill = hour)) +
  geom_point() +
  theme_minimal()
hour_plot
# Plot each day of the week against the total number of "nodrivers" orders
day.nodriver <- df %>%
  group_by(day) %>%
  count(status) %>%
  ungroup() %>%
  filter(status == "nodrivers")
day_plot <- ggplot(day.nodriver, aes(x = day, y = n)) +
  geom_col(aes(fill = day)) +
  geom_point() +
  theme_minimal()
day_plot
# Plot each day of the month against the total number of "nodrivers" orders
date.nodriver <- df %>%
  group_by(date) %>%
  count(status) %>%
  ungroup() %>%
  filter(status == "nodrivers")
date_plot <- ggplot(date.nodriver, aes(x = date, y = n)) +
  geom_col(aes(fill = date)) +
  theme_minimal()
date_plot
# create initial split
set.seed(182)
test.split <- initial_split(df, prop = 0.95, strata = "status")
# create validation split
set.seed(182)
val.split <- initial_split(training(test.split), prop = 0.95, strata = "status")
# quick check
test.split
## <3831/201/4032>
# define preprocess recipe from train dataset
rec <- recipe(status ~ ., data = training(val.split)) %>%
  # step_rm() %>%
  # step_nzv(all_predictors()) %>% # (skipped: it would remove src_area, which we want to keep)
  step_string2factor(all_nominal(), -status) %>%
  step_string2factor(status, levels = c("confirmed", "nodrivers")) %>%
  step_upsample(status, ratio = 1/1, seed = 182) %>%
  step_dummy(all_nominal(), -status, one_hot = FALSE) %>%
  prep(strings_as_factors = FALSE)
# get train-val-test dataset
data_train <- juice(rec)
data_val <- bake(rec, testing(val.split))
data_test <- bake(rec, testing(test.split))
# prepare train arrays
data_train_y <- to_categorical(as.numeric(data_train$status) - 1)
data_train_x <- data_train %>%
select(-status) %>%
data.matrix()
# prepare validation arrays
data_val_y <- to_categorical(as.numeric(data_val$status) - 1)
data_val_x <- data_val %>%
select(-status) %>%
data.matrix()
# prepare test arrays
data_test_y <- to_categorical(as.numeric(data_test$status) - 1)
data_test_x <- data_test %>%
select(-status) %>%
data.matrix()
# quick check
dim(data_train_x)
## [1] 3840 61
dim(data_train_y)
## [1] 3840 2
# define input
input <- layer_input(name = "input", shape = ncol(data_train_x))
# define hidden layers
hiddens <- input %>%
  layer_dense(name = "dense_1", units = 64) %>%
  layer_activation_leaky_relu(name = "dense_1_act") %>%
  layer_batch_normalization(name = "dense_1_bn") %>%
  layer_dropout(name = "dense_1_dp", rate = 0.15) %>%
  layer_dense(name = "dense_2", units = 32) %>%
  layer_activation_leaky_relu(name = "dense_2_act") %>%
  layer_batch_normalization(name = "dense_2_bn") %>%
  layer_dropout(name = "dense_2_dp", rate = 0.15)
# define output
output <- hiddens %>%
  layer_dense(name = "output", units = ncol(data_train_y)) %>%
  layer_batch_normalization(name = "output_bn") %>%
  layer_activation(name = "output_act", activation = "sigmoid")
# define full model
model <- keras_model(inputs = input, outputs = output)
# compile the model
model %>% compile(
  optimizer = optimizer_adam(lr = 0.001),
  metrics = "accuracy",
  loss = "binary_crossentropy"
)
# model summary
summary(model)
## Model: "model"
## ___________________________________________________________________________
## Layer (type) Output Shape Param #
## ===========================================================================
## input (InputLayer) [(None, 61)] 0
## ___________________________________________________________________________
## dense_1 (Dense) (None, 64) 3968
## ___________________________________________________________________________
## dense_1_act (LeakyReLU) (None, 64) 0
## ___________________________________________________________________________
## dense_1_bn (BatchNormalization) (None, 64) 256
## ___________________________________________________________________________
## dense_1_dp (Dropout) (None, 64) 0
## ___________________________________________________________________________
## dense_2 (Dense) (None, 32) 2080
## ___________________________________________________________________________
## dense_2_act (LeakyReLU) (None, 32) 0
## ___________________________________________________________________________
## dense_2_bn (BatchNormalization) (None, 32) 128
## ___________________________________________________________________________
## dense_2_dp (Dropout) (None, 32) 0
## ___________________________________________________________________________
## output (Dense) (None, 2) 66
## ___________________________________________________________________________
## output_bn (BatchNormalization) (None, 2) 8
## ___________________________________________________________________________
## output_act (Activation) (None, 2) 0
## ===========================================================================
## Total params: 6,506
## Trainable params: 6,310
## Non-trainable params: 196
## ___________________________________________________________________________
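Note the output head: two sigmoid units trained with binary cross-entropy against one-hot targets. A more conventional head for a one-hot two-class target is a softmax with categorical cross-entropy; a sketch of that alternative (not the model trained in this report) would be:
# Alternative head (sketch only; the trained model above uses sigmoid)
output_softmax <- hiddens %>%
  layer_dense(name = "output_softmax", units = ncol(data_train_y),
              activation = "softmax")
model_softmax <- keras_model(inputs = input, outputs = output_softmax)
model_softmax %>% compile(
  optimizer = optimizer_adam(lr = 0.001),
  metrics = "accuracy",
  loss = "categorical_crossentropy"
)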
# callbacks
callbacks <- callback_tensorboard("logs/run_c")
# fit the model
history <- model %>% fit(
  x = data_train_x,
  y = data_train_y,
  batch_size = 64,
  epochs = 100,
  validation_data = list(
    data_val_x,
    data_val_y
  ),
  callbacks = callbacks
)
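Training runs for a fixed 100 epochs here. If overfitting were a concern, early stopping on validation loss could be added to the callback list; a sketch with assumed, untuned settings:
# Sketch: early stopping on validation loss (patience is an assumed value,
# not tuned in this report; callbacks_es is a hypothetical name).
callbacks_es <- list(
  callback_tensorboard("logs/run_c"),
  callback_early_stopping(monitor = "val_loss", patience = 10,
                          restore_best_weights = TRUE)
)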
# plot history
plot(history)
# predict on test
pred_test <- as_tibble(predict(model, data_test_x)) %>%
  set_names(levels(data_train$status)) %>%
  mutate(class = ifelse(nodrivers > 0.5, "nodrivers", "confirmed")) %>%
  mutate(class = factor(class, levels = levels(data_train$status))) %>%
  set_names(paste0(".pred_", colnames(.)))
# combine with test dataset
pred_test <- data_test %>%
  select(status) %>%
  bind_cols(pred_test)
prop.table(table(pred_test$.pred_class))
##
## confirmed nodrivers
## 0.4278607 0.5721393
cmatrix <- confusionMatrix(pred_test$.pred_class, pred_test$status, positive = "confirmed")
cmatrix
## Confusion Matrix and Statistics
##
## Reference
## Prediction confirmed nodrivers
## confirmed 63 23
## nodrivers 32 83
##
## Accuracy : 0.7264
## 95% CI : (0.6592, 0.7867)
## No Information Rate : 0.5274
## P-Value [Acc > NIR] : 6.066e-09
##
## Kappa : 0.4484
##
## Mcnemar's Test P-Value : 0.2807
##
## Sensitivity : 0.6632
## Specificity : 0.7830
## Pos Pred Value : 0.7326
## Neg Pred Value : 0.7217
## Prevalence : 0.4726
## Detection Rate : 0.3134
## Detection Prevalence : 0.4279
## Balanced Accuracy : 0.7231
##
## 'Positive' Class : confirmed
##
We also try a naive Bayes model, since all variables are factors and we can work under the assumption that every variable is important and conditionally independent given the class.

### Modeling
library(e1071)
# Train naive Bayes with (laplace = 1) and without (laplace = 0) smoothing
model_naive <- naiveBayes(data_train_x, y = data_train$status, laplace = 1)
model_naive2 <- naiveBayes(data_train_x, y = data_train$status, laplace = 0)
pred.naive <- predict(model_naive, newdata = data_test_x, type = "class")
pred.naive2 <- predict(model_naive2, newdata = data_test_x, type = "class")
cmatrix.nb <- confusionMatrix(pred.naive, data_test$status, positive = "confirmed")
cmatrix.nb2 <- confusionMatrix(pred.naive2, data_test$status, positive = "confirmed")
cmatrix.nb
## Confusion Matrix and Statistics
##
## Reference
## Prediction confirmed nodrivers
## confirmed 63 28
## nodrivers 32 78
##
## Accuracy : 0.7015
## 95% CI : (0.6331, 0.7638)
## No Information Rate : 0.5274
## P-Value [Acc > NIR] : 3.632e-07
##
## Kappa : 0.3999
##
## Mcnemar's Test P-Value : 0.6985
##
## Sensitivity : 0.6632
## Specificity : 0.7358
## Pos Pred Value : 0.6923
## Neg Pred Value : 0.7091
## Prevalence : 0.4726
## Detection Rate : 0.3134
## Detection Prevalence : 0.4527
## Balanced Accuracy : 0.6995
##
## 'Positive' Class : confirmed
##
cmatrix.nb2
## Confusion Matrix and Statistics
##
## Reference
## Prediction confirmed nodrivers
## confirmed 63 28
## nodrivers 32 78
##
## Accuracy : 0.7015
## 95% CI : (0.6331, 0.7638)
## No Information Rate : 0.5274
## P-Value [Acc > NIR] : 3.632e-07
##
## Kappa : 0.3999
##
## Mcnemar's Test P-Value : 0.6985
##
## Sensitivity : 0.6632
## Specificity : 0.7358
## Pos Pred Value : 0.6923
## Neg Pred Value : 0.7091
## Prevalence : 0.4726
## Detection Rate : 0.3134
## Detection Prevalence : 0.4527
## Balanced Accuracy : 0.6995
##
## 'Positive' Class : confirmed
##
cmatrix$overall["Accuracy"]
## Accuracy
## 0.7263682
cmatrix.nb$overall["Accuracy"]
## Accuracy
## 0.7014925
cmatrix.nb2$overall["Accuracy"]
## Accuracy
## 0.7014925
Comparing the metrics above (accuracy, sensitivity, specificity, and positive predictive value), we choose the neural network as the best model.
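For a side-by-side view, the headline metrics can be gathered from the confusion-matrix objects computed above; a sketch (the tibble layout and model_compare name are ours, not from the original report):
# Sketch: collect headline metrics from the three confusion matrices above.
model_compare <- tibble(
  model       = c("neural network",
                  "naive bayes (laplace = 1)",
                  "naive bayes (laplace = 0)"),
  accuracy    = c(cmatrix$overall["Accuracy"],
                  cmatrix.nb$overall["Accuracy"],
                  cmatrix.nb2$overall["Accuracy"]),
  sensitivity = c(cmatrix$byClass["Sensitivity"],
                  cmatrix.nb$byClass["Sensitivity"],
                  cmatrix.nb2$byClass["Sensitivity"]),
  specificity = c(cmatrix$byClass["Specificity"],
                  cmatrix.nb$byClass["Specificity"],
                  cmatrix.nb2$byClass["Specificity"]),
  pos_pred    = c(cmatrix$byClass["Pos Pred Value"],
                  cmatrix.nb$byClass["Pos Pred Value"],
                  cmatrix.nb2$byClass["Pos Pred Value"])
)
model_compare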
# Rebuild the evaluation features and score them with the neural network
eval.test <- df.eval %>%
  ungroup() %>%
  select(src_area, date, day, hour) %>%
  mutate(hour = as.factor(hour),
         status = as.factor(0)) # dummy placeholder so bake() finds the column
eval.test <- bake(rec, eval.test) %>%
  select(-status) %>%
  data.matrix()
pred.eval <- predict(model, eval.test, type = "raw")
pred.eval.r <- as.factor(ifelse(pred.eval[, 2] > 0.5, "nodrivers", "confirmed"))
cmatrix.eval <- confusionMatrix(pred.eval.r, df.eval$status, positive = "confirmed")
cmatrix.eval
## Confusion Matrix and Statistics
##
## Reference
## Prediction confirmed nodrivers
## confirmed 226 32
## nodrivers 53 193
##
## Accuracy : 0.8313
## 95% CI : (0.7957, 0.863)
## No Information Rate : 0.5536
## P-Value [Acc > NIR] : < 2e-16
##
## Kappa : 0.6618
##
## Mcnemar's Test P-Value : 0.03006
##
## Sensitivity : 0.8100
## Specificity : 0.8578
## Pos Pred Value : 0.8760
## Neg Pred Value : 0.7846
## Prevalence : 0.5536
## Detection Rate : 0.4484
## Detection Prevalence : 0.5119
## Balanced Accuracy : 0.8339
##
## 'Positive' Class : confirmed
##
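Finally, the evaluation-week predictions can be rewritten in the deliverable's terms, mapping "confirmed" to sufficient coverage and "nodrivers" to insufficient. A sketch (the coverage column and coverage_report name are assumptions, not from the original report):
# Sketch: final hourly coverage report for the evaluation week.
coverage_report <- df.eval %>%
  ungroup() %>%
  select(src_area, date, day, hour) %>%
  mutate(coverage = ifelse(pred.eval.r == "nodrivers",
                           "insufficient", "sufficient"))
head(coverage_report)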