final_df <- read.csv("analyze_me.csv", stringsAsFactors=FALSE)
# Let's check what we have inside
str(final_df)
## 'data.frame': 5983 obs. of 18 variables:
## $ delivery_id : int 1457973 1377056 1476547 1485494 1327707 1423142 1334106 1311619 1487674 1417206 ...
## $ customer_id : int 327168 64452 83095 271149 122609 75169 101347 59161 55375 153816 ...
## $ jumpman_id : int 162381 104533 132725 157175 118095 91932 124897 79847 181543 157415 ...
## $ vehicle_type : chr "van" "bicycle" "bicycle" "bicycle" ...
## $ pickup_place : chr "Melt Shop" "Prince Street Pizza" "Bareburger" "Juice Press" ...
## $ place_category : chr "American" "Pizza" "Burger" "Juice Bar" ...
## $ item_name : chr "Lemonade" "Neapolitan Rice Balls" "Bare Sodas" "OMG! My Favorite Juice!" ...
## $ item_quantity : int 1 3 1 1 2 1 1 2 NA 1 ...
## $ item_category_name : chr "Beverages" "Munchables" "Drinks" "Cold Pressed Juices" ...
## $ how_long_it_took_to_order : chr "00:19:58.582052" "00:25:09.107093" "00:06:44.541717" "" ...
## $ pickup_lat : num 40.7 40.7 40.7 40.7 40.7 ...
## $ pickup_lon : num -74 -74 -74 -74 -74 ...
## $ dropoff_lat : num 40.8 40.7 40.7 40.8 40.7 ...
## $ dropoff_lon : num -74 -74 -74 -74 -74 ...
## $ when_the_delivery_started : chr "2014-10-26 13:51:59.898924" "2014-10-16 21:58:58.65491" "2014-10-28 21:39:52.654394" "2014-10-30 10:54:11.531894" ...
## $ when_the_Jumpman_arrived_at_pickup : chr "" "2014-10-16 22:26:02.120931" "2014-10-28 21:37:18.793405" "2014-10-30 11:04:17.759577" ...
## $ when_the_Jumpman_left_pickup : chr "" "2014-10-16 22:48:23.091253" "2014-10-28 21:59:09.98481" "2014-10-30 11:16:37.895816" ...
## $ when_the_Jumpman_arrived_at_dropoff: chr "2014-10-26 14:52:06.313088" "2014-10-16 22:59:22.948873" "2014-10-28 22:04:40.634962" "2014-10-30 11:32:38.090061" ...
# 1st problem - incorrect classes
final_df$when_the_delivery_started <- anytime::anytime(final_df$when_the_delivery_started)
## Warning in system("timedatectl", intern = TRUE): running command 'timedatectl'
## had status 1
final_df$when_the_Jumpman_arrived_at_pickup <- anytime::anytime(final_df$when_the_Jumpman_arrived_at_pickup)
final_df$when_the_Jumpman_left_pickup <- anytime::anytime(final_df$when_the_Jumpman_left_pickup)
final_df$when_the_Jumpman_arrived_at_dropoff <- anytime::anytime(final_df$when_the_Jumpman_arrived_at_dropoff)
final_df$how_long_it_took_to_order <- lubridate::period_to_seconds(lubridate::hms(final_df$how_long_it_took_to_order, quiet = T, roll = T))/60
# 2nd problem - missing values.
# I'd use MEAN to fill the gaps where it's possible and leave N/A in "chr" strings
final_df$item_quantity <- imputeTS::na_mean(final_df$item_quantity)
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
final_df$how_long_it_took_to_order <- imputeTS::na_mean(final_df$how_long_it_took_to_order)
# 3rd problem - duplicates.
final_df <- dplyr::distinct(final_df)
# Data is near-clean now. Let's add a few variables we'll need later
final_df$distance <- geosphere::distHaversine(cbind(final_df$dropoff_lat,final_df$dropoff_lon),
cbind(final_df$pickup_lat,final_df$pickup_lon))
final_df$distance <- udunits2::ud.convert(final_df$distance, "m","miles")
final_df$preperation_time <- as.numeric(difftime(final_df$when_the_Jumpman_left_pickup,
final_df$when_the_Jumpman_arrived_at_pickup,
units="mins"))
final_df$time_in_transit <- difftime(final_df$when_the_Jumpman_arrived_at_dropoff,
final_df$when_the_Jumpman_left_pickup,
units = "mins")
final_df$total_time <- difftime(final_df$when_the_Jumpman_arrived_at_dropoff,
final_df$when_the_delivery_started,
units = "mins")
final_df$day_delivery_started <- (lubridate::day(final_df$when_the_delivery_started))
final_df$wday_delivery_started <- lubridate::wday(final_df$when_the_delivery_started)
final_df$weekend_delivery_started <- ifelse(final_df$wday_delivery_started %in% c(1,7),1,0)
final_df$period_of_day <- ifelse(lubridate::hour(final_df$when_the_delivery_started) %in% c(5:11),"Morning","Afternoon")
# All the prep work is done now
# Let's analyze users
# Number of customers
paste(length(unique(final_df$customer_id))," Number of customers")
## [1] "3192 Number of customers"
# Number of orders
ggplot2::ggplot(final_df, ggplot2::aes(x=day_delivery_started, 1, group=1)) +
ggplot2::stat_summary(fun = sum,geom = "line")+
ggplot2::ggtitle("Orders in Oct")+
ggplot2::ylab("# of Deliveries")+
ggplot2::xlab("Day of Month")

# Orders distribution
ggplot2::ggplot(data.frame(as.vector(table(final_df$customer_id))))+
ggplot2::geom_histogram(bins=30,ggplot2::aes(x=as.vector.table.final_df.customer_id..))+
ggplot2::ggtitle("Orders per customer distribution - Histogram")+
ggplot2::xlab("Orders/customer")+
ggplot2::ylab("# of Customers")

# Favorite pickup_place by period_of_day
Top_morning_places <- sqldf::sqldf(
"SELECT `pickup_place`,
COUNT(`pickup_place`) AS `popular_morning`
FROM `final_df`
WHERE period_of_day = 'Morning'
GROUP BY `pickup_place`
ORDER BY `popular_morning` DESC
LIMIT 5;")
## Warning in fun(libname, pkgname): couldn't connect to display ":0"
ggplot2::ggplot(data=Top_morning_places, ggplot2::aes(x=reorder(pickup_place, popular_morning), y=popular_morning)) +
ggplot2::geom_bar(stat="identity") +
ggplot2::xlab("Place") +
ggplot2::ylab("Orders") +
ggplot2::ggtitle("Top Morning Places") +
ggplot2::coord_flip()

Top_afternoon_places <- sqldf::sqldf(
"SELECT `pickup_place`,
COUNT(`pickup_place`) AS `popular_afternoon`
FROM `final_df`
WHERE period_of_day = 'Afternoon'
GROUP BY `pickup_place`
ORDER BY `popular_afternoon` DESC
LIMIT 5;")
ggplot2::ggplot(data=Top_afternoon_places, ggplot2::aes(x=reorder(pickup_place, popular_afternoon), y=popular_afternoon)) +
ggplot2::geom_bar(stat="identity") +
ggplot2::xlab("Place") +
ggplot2::ylab("Orders") +
ggplot2::ggtitle("Top Afternoon Places") +
ggplot2::coord_flip()

# Favorite pickup_place by workday/weekday
Top_weekend_places <- sqldf::sqldf(
"SELECT `pickup_place`,
COUNT(`pickup_place`) AS `popular_weekend`
FROM `final_df`
WHERE weekend_delivery_started = 1
GROUP BY `pickup_place`
ORDER BY `popular_weekend` DESC
LIMIT 5;")
ggplot2::ggplot(data=Top_weekend_places, ggplot2::aes(x=reorder(pickup_place, popular_weekend), y=popular_weekend)) +
ggplot2::geom_bar(stat="identity") +
ggplot2::xlab("Place") +
ggplot2::ylab("Orders") +
ggplot2::ggtitle("Top Weekend Places") +
ggplot2::coord_flip()

Top_weekday_places <- sqldf::sqldf(
"SELECT `pickup_place`,
COUNT(`pickup_place`) AS `popular_weekday`
FROM `final_df`
WHERE weekend_delivery_started = 0
GROUP BY `pickup_place`
ORDER BY `popular_weekday` DESC
LIMIT 5;")
ggplot2::ggplot(data=Top_weekday_places, ggplot2::aes(x=reorder(pickup_place, popular_weekday), y=popular_weekday)) +
ggplot2::geom_bar(stat="identity") +
ggplot2::xlab("Place") +
ggplot2::ylab("Orders") +
ggplot2::ggtitle("Top Weekday Places") +
ggplot2::coord_flip()

# Favorite item_category_name by period_of_day
Top_item_category_morning <- sqldf::sqldf(
"SELECT `item_category_name`,
COUNT(`item_category_name`) AS `popular_morning`
FROM `final_df`
WHERE period_of_day = 'Morning'
GROUP BY `item_category_name`
ORDER BY `popular_morning` DESC
LIMIT 5;")
ggplot2::ggplot(data=Top_item_category_morning, ggplot2::aes(x=reorder(item_category_name, popular_morning), y=popular_morning)) +
ggplot2::geom_bar(stat="identity") +
ggplot2::xlab("Item Category") +
ggplot2::ylab("Orders") +
ggplot2::ggtitle("Top Item Categories (Morning)") +
ggplot2::coord_flip()

Top_item_category_afternoon <- sqldf::sqldf(
"SELECT `item_category_name`,
COUNT(`item_category_name`) AS `popular_afternoon`
FROM `final_df`
WHERE period_of_day = 'Afternoon'
GROUP BY `item_category_name`
ORDER BY `popular_afternoon` DESC
LIMIT 5;")
ggplot2::ggplot(data=Top_item_category_afternoon, ggplot2::aes(x=reorder(item_category_name, popular_afternoon), y=popular_afternoon)) +
ggplot2::geom_bar(stat="identity") +
ggplot2::xlab("Item Category") +
ggplot2::ylab("Orders") +
ggplot2::ggtitle("Top Item Categories (Afternoon)") +
ggplot2::coord_flip()

# Favorite item_category_name by weekday/weekend
Top_item_category_weekend <- sqldf::sqldf(
"SELECT `item_category_name`,
COUNT(`item_category_name`) AS `popular_weekend`
FROM `final_df`
WHERE weekend_delivery_started = 1
GROUP BY `item_category_name`
ORDER BY `popular_weekend` DESC
LIMIT 5;")
ggplot2::ggplot(data=Top_item_category_weekend, ggplot2::aes(x=reorder(item_category_name, popular_weekend), y=popular_weekend)) +
ggplot2::geom_bar(stat="identity") +
ggplot2::xlab("Item Category") +
ggplot2::ylab("Orders") +
ggplot2::ggtitle("Top Item Categories (Weekend)") +
ggplot2::coord_flip()

Top_item_category_weekday <- sqldf::sqldf(
"SELECT `item_category_name`,
COUNT(`item_category_name`) AS `popular_weekday`
FROM `final_df`
WHERE weekend_delivery_started = 0
GROUP BY `item_category_name`
ORDER BY `popular_weekday` DESC
LIMIT 5;")
ggplot2::ggplot(data=Top_item_category_weekday, ggplot2::aes(x=reorder(item_category_name, popular_weekday), y=popular_weekday)) +
ggplot2::geom_bar(stat="identity") +
ggplot2::xlab("Item Category") +
ggplot2::ylab("Orders") +
ggplot2::ggtitle("Top Item Categories (Weekday)") +
ggplot2::coord_flip()

# Delivers by day and hour
ggplot2::ggplot(final_df,ggplot2::aes(x=lubridate::wday(when_the_delivery_started,label=T), 1,group=1)) +
ggplot2::stat_summary(fun = sum,geom = "line")+
ggplot2::ggtitle("Deliveries by day")+
ggplot2::ylab("# of Deliveries")+
ggplot2::xlab("Day")

ggplot2::ggplot(final_df,ggplot2::aes(x=lubridate::hour(when_the_delivery_started), 1,group=1)) +
ggplot2::stat_summary(fun = sum,geom = "line")+
ggplot2::ggtitle("Deliveries by hour")+
ggplot2::ylab("# of Deliveries")+
ggplot2::xlab("Hour")

# Our customers on a map
library(magrittr)
library(leaflet)
leaflet() %>%
addTiles() %>%
addProviderTiles(providers$CartoDB.Positron) %>%
addCircleMarkers(data=subset(final_df,weekend_delivery_started==0),
lat=~dropoff_lat,lng=~dropoff_lon,weight=2,radius=3,opacity=2,color="#46de9b") %>%
addCircleMarkers(data=subset(final_df,weekend_delivery_started==1),
lat=~dropoff_lat,lng=~dropoff_lon,weight=2,radius=3,opacity=2,color="#de4689") %>%
addLegend("bottomright",colors =c("#de4689", "#46de9b"),labels= c("Weekend","Weekday"),opacity = 1)
# Now let's analyze our pick_up places
# of pick_up places
paste(length(unique(final_df$pickup_place))," Number of places")
## [1] "898 Number of places"
# Preperation time distribution
preperationTime <- sqldf::sqldf(
"SELECT pickup_place, avg(preperation_time)
FROM final_df
GROUP BY 1;")
hist(preperationTime$`avg(preperation_time)`,
main = "Preparation Time Distributions",
ylab="Number of Pickup places",
xlab="Preparation Time")

# Popular places
Top_25_places <- sqldf::sqldf("SELECT `pickup_place`,
COUNT(`delivery_id`) AS `Orders`
FROM `final_df`
GROUP BY `pickup_place`
ORDER BY `Orders` DESC
LIMIT 25;")
ggplot2::ggplot(Top_25_places, ggplot2::aes(x=reorder(pickup_place, Orders), y=Orders)) +
ggplot2::geom_bar(stat="identity") +
ggplot2::xlab("Places") +
ggplot2::ylab("Orders") +
ggplot2::ggtitle("Top 25 places (overall)") +
ggplot2::coord_flip()

# Our places on a map
#weekend vs weekday by pickup
leaflet() %>%
addTiles() %>%
addProviderTiles(providers$CartoDB.Positron) %>%
addCircleMarkers(data=subset(final_df,weekend_delivery_started==0),
lat=~pickup_lat,lng=~pickup_lon,weight=2,radius=3,opacity=2,color="#ffa7b6") %>%
addCircleMarkers(data=subset(final_df,weekend_delivery_started==1),
lat=~pickup_lat,lng=~pickup_lon,weight=2,radius=2,opacity=2,color="#a7b6ff")%>%
addLegend("bottomright",colors =c("#a7b6ff", "#ffa7b6"),labels= c("Weekend","Weekday"),opacity = 1)
# Now let's move to our jumpmen
# #of deliveries
paste(length(final_df$delivery_id)," # of deliveries")
## [1] "5967 # of deliveries"
# #of jumpmen
paste(length(unique(final_df$jumpman_id))," # of jumpmen")
## [1] "578 # of jumpmen"
# best jumpmen
Top_jumpmen <- sqldf::sqldf(
"SELECT `jumpman_id`,
COUNT(`jumpman_id`) AS `jumpmen`
FROM `final_df`
GROUP BY `jumpman_id`
ORDER BY `jumpmen` DESC
LIMIT 25;")
ggplot2::ggplot(data=Top_jumpmen, ggplot2::aes(x=reorder(jumpman_id, jumpmen), y=jumpmen)) +
ggplot2::geom_bar(stat="identity") +
ggplot2::xlab("Jumpman_id") +
ggplot2::ylab("Orders") +
ggplot2::ggtitle("Top Jumpmen") +
ggplot2::coord_flip()

# Deliveries distribution by jumpmen
library(tidyverse)
## ── Attaching packages ────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.0 ✓ purrr 0.3.4
## ✓ tibble 3.0.1 ✓ dplyr 0.8.5
## ✓ tidyr 1.0.2 ✓ stringr 1.4.0
## ✓ readr 1.3.1 ✓ forcats 0.5.0
## ── Conflicts ───────────────────────────────────────────── tidyverse_conflicts() ──
## x tidyr::extract() masks magrittr::extract()
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
## x purrr::set_names() masks magrittr::set_names()
Orders_by_jumpman <- final_df %>%
as.tibble() %>%
count(jumpman_id)
## Warning: `as.tibble()` is deprecated as of tibble 2.0.0.
## Please use `as_tibble()` instead.
## The signature and semantics have changed, see `?as_tibble`.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
ggplot2::ggplot(Orders_by_jumpman, aes(x=n)) +
ggplot2::geom_histogram() +
ggplot2::geom_vline(aes(xintercept=mean(n)),
color="blue", linetype="dashed", size=1) +
ggplot2::ggtitle("Deliveries by jumpman")+
ggplot2::ylab("# of deliveries")+
ggplot2::xlab("# of Jumpmen") +
ggplot2::scale_x_continuous(breaks=seq(0,80,5))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# type of transport they use
ggplot2::ggplot(final_df, ggplot2::aes(x=vehicle_type, 1,group=1)) +
ggplot2::stat_summary(fun = sum,geom = "bar")+
ggplot2::ggtitle("Deliveries by vehicle")+
ggplot2::ylab("# of deliveries")+
ggplot2::xlab("vehicle")

# distance distribution
hist(final_df$distance,
main = "Distance distributions",
ylab="Number of Pickup plcases",
xlab="Distance")

# distance by vehicle
ggplot2::ggplot(final_df,ggplot2::aes(x=vehicle_type,y=distance))+
ggplot2::geom_boxplot()+
ggplot2::ggtitle("Distance by Vehicle Type")+
ggplot2::xlab("Vehicle")+
ggplot2::ylab("Distance")

# time_in_transit distribution
hist(as.numeric(final_df$time_in_transit),
main = "time_in_transit distribution",
ylab="Number of Pickup plcases",
xlab="Distance")

# time_in_transit time vs distance (we see that 550 rows were removed. That's because of N/As)
ggplot2::ggplot(final_df, ggplot2::aes(x=distance, y=as.numeric(time_in_transit), group=1)) +
ggplot2::stat_summary(fun = sum,geom = "point")+
ggplot2::ggtitle("Transit time vs distance")+
ggplot2::ylab("Time in transit")+
ggplot2::xlab("Distance")
## Warning: Removed 550 rows containing non-finite values (stat_summary).

install.packages("devtools")
## Installing package into '/home/rstudio-user/R/x86_64-pc-linux-gnu-library/3.6'
## (as 'lib' is unspecified)
devtools::install_github("laresbernardo/lares")
## Skipping install of 'lares' from a github remote, the SHA1 (c3b8ee12) has not changed since last install.
## Use `force = TRUE` to force installation
lares::mplot_lineal(tag = final_df$distance,
score = as.numeric(final_df$time_in_transit),
subtitle = "Distance&Time Regression Model")
