final_df <- read.csv("analyze_me.csv", stringsAsFactors=FALSE)

# Let's check what we have inside 
str(final_df)
## 'data.frame':    5983 obs. of  18 variables:
##  $ delivery_id                        : int  1457973 1377056 1476547 1485494 1327707 1423142 1334106 1311619 1487674 1417206 ...
##  $ customer_id                        : int  327168 64452 83095 271149 122609 75169 101347 59161 55375 153816 ...
##  $ jumpman_id                         : int  162381 104533 132725 157175 118095 91932 124897 79847 181543 157415 ...
##  $ vehicle_type                       : chr  "van" "bicycle" "bicycle" "bicycle" ...
##  $ pickup_place                       : chr  "Melt Shop" "Prince Street Pizza" "Bareburger" "Juice Press" ...
##  $ place_category                     : chr  "American" "Pizza" "Burger" "Juice Bar" ...
##  $ item_name                          : chr  "Lemonade" "Neapolitan Rice Balls" "Bare Sodas" "OMG! My Favorite Juice!" ...
##  $ item_quantity                      : int  1 3 1 1 2 1 1 2 NA 1 ...
##  $ item_category_name                 : chr  "Beverages" "Munchables" "Drinks" "Cold Pressed Juices" ...
##  $ how_long_it_took_to_order          : chr  "00:19:58.582052" "00:25:09.107093" "00:06:44.541717" "" ...
##  $ pickup_lat                         : num  40.7 40.7 40.7 40.7 40.7 ...
##  $ pickup_lon                         : num  -74 -74 -74 -74 -74 ...
##  $ dropoff_lat                        : num  40.8 40.7 40.7 40.8 40.7 ...
##  $ dropoff_lon                        : num  -74 -74 -74 -74 -74 ...
##  $ when_the_delivery_started          : chr  "2014-10-26 13:51:59.898924" "2014-10-16 21:58:58.65491" "2014-10-28 21:39:52.654394" "2014-10-30 10:54:11.531894" ...
##  $ when_the_Jumpman_arrived_at_pickup : chr  "" "2014-10-16 22:26:02.120931" "2014-10-28 21:37:18.793405" "2014-10-30 11:04:17.759577" ...
##  $ when_the_Jumpman_left_pickup       : chr  "" "2014-10-16 22:48:23.091253" "2014-10-28 21:59:09.98481" "2014-10-30 11:16:37.895816" ...
##  $ when_the_Jumpman_arrived_at_dropoff: chr  "2014-10-26 14:52:06.313088" "2014-10-16 22:59:22.948873" "2014-10-28 22:04:40.634962" "2014-10-30 11:32:38.090061" ...
# 1st problem - incorrect classes
final_df$when_the_delivery_started <- anytime::anytime(final_df$when_the_delivery_started)
## Warning in system("timedatectl", intern = TRUE): running command 'timedatectl'
## had status 1
final_df$when_the_Jumpman_arrived_at_pickup <- anytime::anytime(final_df$when_the_Jumpman_arrived_at_pickup)
final_df$when_the_Jumpman_left_pickup <- anytime::anytime(final_df$when_the_Jumpman_left_pickup)
final_df$when_the_Jumpman_arrived_at_dropoff <- anytime::anytime(final_df$when_the_Jumpman_arrived_at_dropoff)
final_df$how_long_it_took_to_order <- lubridate::period_to_seconds(lubridate::hms(final_df$how_long_it_took_to_order, quiet = T, roll = T))/60

# 2nd problem - missing values. 
# I'd use MEAN to fill the gaps where it's possible and leave N/A in "chr" strings
final_df$item_quantity <- imputeTS::na_mean(final_df$item_quantity)
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
final_df$how_long_it_took_to_order <- imputeTS::na_mean(final_df$how_long_it_took_to_order)

# 3rd problem - duplicates.
final_df <- dplyr::distinct(final_df)

# Data is near-clean now. Let's add a few variables we'll need later
final_df$distance <- geosphere::distHaversine(cbind(final_df$dropoff_lat,final_df$dropoff_lon),
                                              cbind(final_df$pickup_lat,final_df$pickup_lon))
final_df$distance <- udunits2::ud.convert(final_df$distance, "m","miles")
final_df$preperation_time <- as.numeric(difftime(final_df$when_the_Jumpman_left_pickup,
                                      final_df$when_the_Jumpman_arrived_at_pickup,
                                      units="mins"))
final_df$time_in_transit <- difftime(final_df$when_the_Jumpman_arrived_at_dropoff, 
                                     final_df$when_the_Jumpman_left_pickup, 
                                     units = "mins")
final_df$total_time <- difftime(final_df$when_the_Jumpman_arrived_at_dropoff,
                                final_df$when_the_delivery_started,
                                units = "mins")
final_df$day_delivery_started <- (lubridate::day(final_df$when_the_delivery_started))
final_df$wday_delivery_started <- lubridate::wday(final_df$when_the_delivery_started)
final_df$weekend_delivery_started <- ifelse(final_df$wday_delivery_started %in% c(1,7),1,0)
final_df$period_of_day <- ifelse(lubridate::hour(final_df$when_the_delivery_started) %in% c(5:11),"Morning","Afternoon")

# All the prep work is done now
# Let's analyze users

# Number of customers
paste(length(unique(final_df$customer_id))," Number of customers")
## [1] "3192  Number of customers"
# Number of orders
ggplot2::ggplot(final_df, ggplot2::aes(x=day_delivery_started, 1, group=1)) +
  ggplot2::stat_summary(fun = sum,geom = "line")+
  ggplot2::ggtitle("Orders in Oct")+
  ggplot2::ylab("# of Deliveries")+
  ggplot2::xlab("Day of Month")

# Orders distribution
ggplot2::ggplot(data.frame(as.vector(table(final_df$customer_id))))+
  ggplot2::geom_histogram(bins=30,ggplot2::aes(x=as.vector.table.final_df.customer_id..))+
  ggplot2::ggtitle("Orders per customer distribution - Histogram")+
  ggplot2::xlab("Orders/customer")+
  ggplot2::ylab("# of Customers")

# Favorite pickup_place by period_of_day
Top_morning_places <- sqldf::sqldf(
"SELECT `pickup_place`,
COUNT(`pickup_place`) AS `popular_morning` 
FROM `final_df`
WHERE period_of_day = 'Morning'
GROUP BY `pickup_place`
ORDER BY `popular_morning` DESC
LIMIT    5;")
## Warning in fun(libname, pkgname): couldn't connect to display ":0"
ggplot2::ggplot(data=Top_morning_places, ggplot2::aes(x=reorder(pickup_place, popular_morning), y=popular_morning)) +
  ggplot2::geom_bar(stat="identity") +  
  ggplot2::xlab("Place") +
  ggplot2::ylab("Orders") +
  ggplot2::ggtitle("Top Morning Places") + 
  ggplot2::coord_flip()

Top_afternoon_places <- sqldf::sqldf(
"SELECT `pickup_place`,
COUNT(`pickup_place`) AS `popular_afternoon` 
FROM `final_df`
WHERE period_of_day = 'Afternoon'
GROUP BY `pickup_place`
ORDER BY `popular_afternoon` DESC
LIMIT    5;")

ggplot2::ggplot(data=Top_afternoon_places, ggplot2::aes(x=reorder(pickup_place, popular_afternoon), y=popular_afternoon)) +
  ggplot2::geom_bar(stat="identity") +  
  ggplot2::xlab("Place") +
  ggplot2::ylab("Orders") +
  ggplot2::ggtitle("Top Afternoon Places") + 
  ggplot2::coord_flip()

# Favorite pickup_place by workday/weekday
Top_weekend_places <- sqldf::sqldf(
  "SELECT `pickup_place`,
COUNT(`pickup_place`) AS `popular_weekend` 
FROM `final_df`
WHERE weekend_delivery_started = 1
GROUP BY `pickup_place`
ORDER BY `popular_weekend` DESC
LIMIT    5;")

ggplot2::ggplot(data=Top_weekend_places, ggplot2::aes(x=reorder(pickup_place, popular_weekend), y=popular_weekend)) +
  ggplot2::geom_bar(stat="identity") +  
  ggplot2::xlab("Place") +
  ggplot2::ylab("Orders") +
  ggplot2::ggtitle("Top Weekend Places") + 
  ggplot2::coord_flip()

Top_weekday_places <- sqldf::sqldf(
  "SELECT `pickup_place`,
COUNT(`pickup_place`) AS `popular_weekday` 
FROM `final_df`
WHERE weekend_delivery_started = 0
GROUP BY `pickup_place`
ORDER BY `popular_weekday` DESC
LIMIT    5;")

ggplot2::ggplot(data=Top_weekday_places, ggplot2::aes(x=reorder(pickup_place, popular_weekday), y=popular_weekday)) +
  ggplot2::geom_bar(stat="identity") +  
  ggplot2::xlab("Place") +
  ggplot2::ylab("Orders") +
  ggplot2::ggtitle("Top Weekday Places") + 
  ggplot2::coord_flip()

# Favorite item_category_name by period_of_day
Top_item_category_morning <- sqldf::sqldf(
"SELECT `item_category_name`,
COUNT(`item_category_name`) AS `popular_morning` 
FROM `final_df`
WHERE period_of_day = 'Morning'
GROUP BY `item_category_name`
ORDER BY `popular_morning` DESC
LIMIT 5;")

ggplot2::ggplot(data=Top_item_category_morning, ggplot2::aes(x=reorder(item_category_name, popular_morning), y=popular_morning)) +
  ggplot2::geom_bar(stat="identity") +  
  ggplot2::xlab("Item Category") +
  ggplot2::ylab("Orders") +
  ggplot2::ggtitle("Top Item Categories (Morning)") + 
  ggplot2::coord_flip()

Top_item_category_afternoon <- sqldf::sqldf(
  "SELECT `item_category_name`,
COUNT(`item_category_name`) AS `popular_afternoon` 
FROM `final_df`
WHERE period_of_day = 'Afternoon'
GROUP BY `item_category_name`
ORDER BY `popular_afternoon` DESC
LIMIT 5;")

ggplot2::ggplot(data=Top_item_category_afternoon, ggplot2::aes(x=reorder(item_category_name, popular_afternoon), y=popular_afternoon)) +
  ggplot2::geom_bar(stat="identity") +  
  ggplot2::xlab("Item Category") +
  ggplot2::ylab("Orders") +
  ggplot2::ggtitle("Top Item Categories (Afternoon)") +
  ggplot2::coord_flip()

# Favorite item_category_name by weekday/weekend
Top_item_category_weekend <- sqldf::sqldf(
  "SELECT `item_category_name`,
COUNT(`item_category_name`) AS `popular_weekend` 
FROM `final_df`
WHERE weekend_delivery_started = 1
GROUP BY `item_category_name`
ORDER BY `popular_weekend` DESC
LIMIT 5;")

ggplot2::ggplot(data=Top_item_category_weekend, ggplot2::aes(x=reorder(item_category_name, popular_weekend), y=popular_weekend)) +
  ggplot2::geom_bar(stat="identity") +  
  ggplot2::xlab("Item Category") +
  ggplot2::ylab("Orders") +
  ggplot2::ggtitle("Top Item Categories (Weekend)") + 
  ggplot2::coord_flip()

Top_item_category_weekday <- sqldf::sqldf(
  "SELECT `item_category_name`,
COUNT(`item_category_name`) AS `popular_weekday` 
FROM `final_df`
WHERE weekend_delivery_started = 0
GROUP BY `item_category_name`
ORDER BY `popular_weekday` DESC
LIMIT    5;")

ggplot2::ggplot(data=Top_item_category_weekday, ggplot2::aes(x=reorder(item_category_name, popular_weekday), y=popular_weekday)) +
  ggplot2::geom_bar(stat="identity") +  
  ggplot2::xlab("Item Category") +
  ggplot2::ylab("Orders") +
  ggplot2::ggtitle("Top Item Categories (Weekday)") + 
  ggplot2::coord_flip()

# Delivers by day and hour
ggplot2::ggplot(final_df,ggplot2::aes(x=lubridate::wday(when_the_delivery_started,label=T), 1,group=1)) +
  ggplot2::stat_summary(fun = sum,geom = "line")+ 
  ggplot2::ggtitle("Deliveries by day")+
  ggplot2::ylab("# of Deliveries")+
  ggplot2::xlab("Day")

ggplot2::ggplot(final_df,ggplot2::aes(x=lubridate::hour(when_the_delivery_started), 1,group=1)) +
  ggplot2::stat_summary(fun = sum,geom = "line")+
  ggplot2::ggtitle("Deliveries by hour")+
  ggplot2::ylab("# of Deliveries")+
  ggplot2::xlab("Hour")

# Our customers on a map
library(magrittr)
library(leaflet)
leaflet() %>% 
  addTiles() %>%
  addProviderTiles(providers$CartoDB.Positron) %>%
  addCircleMarkers(data=subset(final_df,weekend_delivery_started==0),
                   lat=~dropoff_lat,lng=~dropoff_lon,weight=2,radius=3,opacity=2,color="#46de9b") %>%
  addCircleMarkers(data=subset(final_df,weekend_delivery_started==1),
                   lat=~dropoff_lat,lng=~dropoff_lon,weight=2,radius=3,opacity=2,color="#de4689") %>%
  addLegend("bottomright",colors =c("#de4689", "#46de9b"),labels= c("Weekend","Weekday"),opacity = 1)
# Now let's analyze our pick_up places
# of pick_up places
paste(length(unique(final_df$pickup_place))," Number of places")
## [1] "898  Number of places"
# Preperation time distribution
preperationTime <- sqldf::sqldf(
"SELECT pickup_place, avg(preperation_time)
FROM final_df
GROUP BY 1;")
hist(preperationTime$`avg(preperation_time)`, 
main = "Preparation Time Distributions", 
ylab="Number of Pickup places",
xlab="Preparation Time")

# Popular places
Top_25_places <- sqldf::sqldf("SELECT `pickup_place`,
COUNT(`delivery_id`) AS `Orders` 
FROM `final_df`
GROUP BY `pickup_place`
ORDER BY `Orders` DESC
LIMIT 25;")

ggplot2::ggplot(Top_25_places, ggplot2::aes(x=reorder(pickup_place, Orders), y=Orders)) +
  ggplot2::geom_bar(stat="identity") +  
  ggplot2::xlab("Places") +
  ggplot2::ylab("Orders") +
  ggplot2::ggtitle("Top 25 places (overall)") + 
  ggplot2::coord_flip()

# Our places on a map
#weekend vs weekday by pickup
leaflet() %>%
  addTiles() %>%
  addProviderTiles(providers$CartoDB.Positron) %>%
  addCircleMarkers(data=subset(final_df,weekend_delivery_started==0),
                   lat=~pickup_lat,lng=~pickup_lon,weight=2,radius=3,opacity=2,color="#ffa7b6") %>%
  addCircleMarkers(data=subset(final_df,weekend_delivery_started==1),
                   lat=~pickup_lat,lng=~pickup_lon,weight=2,radius=2,opacity=2,color="#a7b6ff")%>%
  addLegend("bottomright",colors =c("#a7b6ff", "#ffa7b6"),labels= c("Weekend","Weekday"),opacity = 1) 
# Now let's move to our jumpmen

# #of deliveries
paste(length(final_df$delivery_id)," # of deliveries")
## [1] "5967  # of deliveries"
# #of jumpmen
paste(length(unique(final_df$jumpman_id))," # of jumpmen")
## [1] "578  # of jumpmen"
# best jumpmen
Top_jumpmen <- sqldf::sqldf(
  "SELECT `jumpman_id`,
COUNT(`jumpman_id`) AS `jumpmen` 
FROM `final_df`
GROUP BY `jumpman_id`
ORDER BY `jumpmen` DESC
LIMIT    25;")

ggplot2::ggplot(data=Top_jumpmen, ggplot2::aes(x=reorder(jumpman_id, jumpmen), y=jumpmen)) +
  ggplot2::geom_bar(stat="identity") +  
  ggplot2::xlab("Jumpman_id") +
  ggplot2::ylab("Orders") +
  ggplot2::ggtitle("Top Jumpmen") + 
  ggplot2::coord_flip()

# Deliveries distribution by jumpmen
library(tidyverse)
## ── Attaching packages ────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.0     ✓ purrr   0.3.4
## ✓ tibble  3.0.1     ✓ dplyr   0.8.5
## ✓ tidyr   1.0.2     ✓ stringr 1.4.0
## ✓ readr   1.3.1     ✓ forcats 0.5.0
## ── Conflicts ───────────────────────────────────────────── tidyverse_conflicts() ──
## x tidyr::extract()   masks magrittr::extract()
## x dplyr::filter()    masks stats::filter()
## x dplyr::lag()       masks stats::lag()
## x purrr::set_names() masks magrittr::set_names()
Orders_by_jumpman <- final_df %>% 
  as.tibble() %>% 
  count(jumpman_id)
## Warning: `as.tibble()` is deprecated as of tibble 2.0.0.
## Please use `as_tibble()` instead.
## The signature and semantics have changed, see `?as_tibble`.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
ggplot2::ggplot(Orders_by_jumpman, aes(x=n)) + 
  ggplot2::geom_histogram() +
  ggplot2::geom_vline(aes(xintercept=mean(n)),
             color="blue", linetype="dashed", size=1) +
  ggplot2::ggtitle("Deliveries by jumpman")+
  ggplot2::ylab("# of deliveries")+
  ggplot2::xlab("# of Jumpmen") +
  ggplot2::scale_x_continuous(breaks=seq(0,80,5))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# type of transport they use
ggplot2::ggplot(final_df, ggplot2::aes(x=vehicle_type, 1,group=1)) +
  ggplot2::stat_summary(fun = sum,geom = "bar")+
  ggplot2::ggtitle("Deliveries by vehicle")+
  ggplot2::ylab("# of deliveries")+
  ggplot2::xlab("vehicle")

# distance distribution
hist(final_df$distance, 
     main = "Distance distributions", 
     ylab="Number of Pickup plcases",
     xlab="Distance")

# distance by vehicle
ggplot2::ggplot(final_df,ggplot2::aes(x=vehicle_type,y=distance))+
  ggplot2::geom_boxplot()+
  ggplot2::ggtitle("Distance by Vehicle Type")+
  ggplot2::xlab("Vehicle")+
  ggplot2::ylab("Distance")

# time_in_transit distribution
hist(as.numeric(final_df$time_in_transit), 
     main = "time_in_transit distribution", 
     ylab="Number of Pickup plcases",
     xlab="Distance")

# time_in_transit time vs distance (we see that 550 rows were removed. That's because of N/As)
ggplot2::ggplot(final_df, ggplot2::aes(x=distance, y=as.numeric(time_in_transit), group=1)) +
  ggplot2::stat_summary(fun = sum,geom = "point")+
  ggplot2::ggtitle("Transit time vs distance")+
  ggplot2::ylab("Time in transit")+
  ggplot2::xlab("Distance")
## Warning: Removed 550 rows containing non-finite values (stat_summary).

install.packages("devtools")
## Installing package into '/home/rstudio-user/R/x86_64-pc-linux-gnu-library/3.6'
## (as 'lib' is unspecified)
devtools::install_github("laresbernardo/lares")
## Skipping install of 'lares' from a github remote, the SHA1 (c3b8ee12) has not changed since last install.
##   Use `force = TRUE` to force installation
lares::mplot_lineal(tag = final_df$distance, 
                  score = as.numeric(final_df$time_in_transit),
                  subtitle = "Distance&Time Regression Model")