Libraries

# Project Files Manager
library(here)

# The best package for data mining ever!
library(tidyverse)

# Pretty Tables :D
library(flextable)

#Sankey Network Graphic
library(networkD3)

# Interactive Maps
library(leaflet)
library(leaflet.minicharts)

# Palette Colors
library(RColorBrewer)

#Summary Statistic Table
library(rstatix)

Introduction

This is a short analysis of a Kaggle dataset about electric scooter trips in Chicago.

This kind of transport was really common in Rio before the SARS-CoV-2 pandemic, but unfortunately that is over now.

Let’s see what we can discover about it in the 2020 data.

First Steps

# Load the 2020 e-scooter trips export from the project's data folder.
data <- read.csv(here("data/E-Scooter_Trips_-_2020.csv"))

# Both measures are stripped of "," (presumably thousands separators) before
# being converted to plain numbers. The previous numeric -> duration -> numeric
# round trip returned the same values, so it is dropped.
# NOTE(review): units are not visible here (looks like seconds for duration);
# confirm against the dataset description.
data$Trip.Duration <- as.numeric(gsub(",", "", data$Trip.Duration, fixed = TRUE))

data$Trip.Distance <- as.numeric(gsub(",", "", data$Trip.Distance, fixed = TRUE))

Static Data Visualizations

# Bar chart with one bar per vendor; bar height is the number of trips
# recorded for that vendor.
total_for_vendor <- data |>
  count(Vendor, name = "total") |>
  ggplot(aes(x = Vendor)) +
  geom_col(aes(y = total, fill = Vendor)) +
  labs(
    title = "Total of Trips",
    subtitle = "For Each E-Scooter's Vendor"
  )

# Apply the qualitative "Dark2" palette when rendering.
total_for_vendor + scale_fill_brewer(palette = "Dark2")

# Boxplot of trip duration per vendor. Note that the centre line of a boxplot
# is the median, not the mean, despite the object name and the title.
# Outlier points are hidden and the y axis is zoomed to the 10th-92nd
# percentile range of all durations; coord_cartesian() zooms without
# discarding data, so the box statistics still use every trip.
mean_duration <- data |>
  ggplot(aes(x = Vendor, y = Trip.Duration, fill = Vendor)) +
  geom_boxplot(outlier.shape = NA) +
  labs(
    title = "Average trip duration for scooter each vendor",
    subtitle = "Without Outliers"
  ) +
  coord_cartesian(
    # na.rm = TRUE: quantile() stops with an error if any duration is NA.
    ylim = quantile(data$Trip.Duration, c(0.1, 0.92), na.rm = TRUE)
  )

mean_duration + scale_fill_brewer(palette = "Dark2")

# Table of the ten most frequent (start area, end area) pairs, with the trip
# count and the mean duration and distance of those trips.
# na.omit() drops every row that has an NA in any column before counting.
common_travel_trajectory <- data |>
  na.omit() |>
  group_by(Start.Community.Area.Name, End.Community.Area.Name) |>
  summarise(
    total = n(),
    meanDuration = mean(Trip.Duration),
    meanDistance = mean(Trip.Distance),
    # Return an ungrouped result so the ranking below is global and no
    # grouping message is emitted.
    .groups = "drop"
  ) |>
  arrange(desc(total)) |>
  head(10) |>
  flextable()

common_travel_trajectory

Start.Community.Area.Name

End.Community.Area.Name

total

meanDuration

meanDistance

LAKE VIEW

LAKE VIEW

73,717

598.3106

2,004.828

LINCOLN PARK

LINCOLN PARK

55,233

711.6077

2,135.439

WEST TOWN

WEST TOWN

44,929

706.2897

2,087.713

NEAR WEST SIDE

NEAR WEST SIDE

25,090

817.9240

2,328.143

LOGAN SQUARE

LOGAN SQUARE

17,867

675.9727

2,159.870

UPTOWN

UPTOWN

17,378

683.1982

2,312.415

NEAR NORTH SIDE

NEAR NORTH SIDE

17,089

824.2961

2,151.306

HYDE PARK

HYDE PARK

15,477

907.0672

2,230.932

LINCOLN PARK

LAKE VIEW

13,005

811.2798

2,987.432

LAKE VIEW

LINCOLN PARK

12,538

828.6661

2,913.175

Interactive Data Visualizations

Top 10 Trajectory

# Ten most frequent (start area, end area) pairs; `total` is used as the
# link weight of the Sankey diagram below.
data_trajectory_10 <- data |>
  na.omit() |>
  group_by(Start.Community.Area.Name, End.Community.Area.Name) |>
  summarise(
    total = n(),
    meanDuration = mean(Trip.Duration),
    meanDistance = mean(Trip.Distance),
    # Ungroup so later column assignments work on a plain table.
    .groups = "drop"
  ) |>
  arrange(desc(total)) |>
  head(10)

# One node per distinct name. End-area names are lower-cased so that an area
# that is both an origin and a destination appears as two separate nodes
# (this relies on the start names not already being lower case).
nodes <- data.frame(
  name = unique(c(
    as.character(data_trajectory_10$Start.Community.Area.Name),
    tolower(as.character(data_trajectory_10$End.Community.Area.Name))
  ))
)

# match() returns one-based positions; subtract 1 to get the zero-based node
# indices stored in the link table.
data_trajectory_10$IDsource <-
  match(data_trajectory_10$Start.Community.Area.Name, nodes$name) - 1
data_trajectory_10$IDtarget <-
  match(tolower(data_trajectory_10$End.Community.Area.Name), nodes$name) - 1

# Build the Sankey diagram (TRUE spelled out; stray trailing comma removed).
p <- sankeyNetwork(
  Links = data_trajectory_10, Nodes = nodes,
  Source = "IDsource", Target = "IDtarget",
  Value = "total", NodeID = "name",
  sinksRight = TRUE, nodeWidth = 10, fontSize = 13, nodePadding = 20
)

p

Vendor by Start Area

The area of each pie chart is proportional to the number of trips (its width scales with the square root of the trip count).

# Share of `a` in `b`, rounded to two decimal places.
# The result is a proportion (e.g. 0.25), not a value scaled to 0-100.
# Works element-wise on vectors.
percentage <- function(a, b) {
  share <- a / b
  round(share, digits = 2)
}


# One row per start area (name plus centroid coordinates) with the number of
# trips per vendor in separate columns, the overall total, and each vendor's
# share of that total.
data_mod <- data |>
  group_by(
    Start.Community.Area.Name,
    Start.Centroid.Latitude,
    Start.Centroid.Longitude,
    Vendor
  ) |>
  summarise(n = n(), .groups = "drop") |>
  # Drop areas with a missing name or centroid.
  na.omit() |>
  # A vendor with no trips in an area would otherwise become NA and turn the
  # area's total (and max(total), used for chart sizing) into NA.
  pivot_wider(names_from = "Vendor", values_from = n, values_fill = 0L) |>
  mutate(total = bird + lime + spin) |>
  mutate(
    p.bird = percentage(bird, total),
    p.lime = percentage(lime, total),
    p.spin = percentage(spin, total)
  )
   

# Base map: default tiles plus the OpenStreetMap provider layer.
basemap <- leaflet() |>
  addTiles() |>
  addProviderTiles(providers$OpenStreetMap)

# One pie chart per start area, placed at the area centroid and split by
# vendor share. Chart width is scaled by sqrt(total) relative to the busiest
# area, capped at 40.
addMinicharts(
  basemap,
  lng = data_mod$Start.Centroid.Longitude,
  lat = data_mod$Start.Centroid.Latitude,
  chartdata = data_mod[, c('p.bird', 'p.lime', 'p.spin')],
  type = "pie",
  colorPalette = brewer.pal(3, "Dark2"),
  layerId = data_mod$Start.Community.Area.Name,
  width = 40 * sqrt(data_mod$total) / sqrt(max(data_mod$total))
)

Quick Statistical Analysis

Is there a difference in trip duration among the three e-scooter vendors?

# Median and interquartile range of trip duration for each vendor, rendered
# as a formatted table with readable column headers.
data |>
  group_by(Vendor) |>
  get_summary_stats(Trip.Duration, type = "median_iqr") |>
  flextable() |>
  set_header_labels(
    values = list(
      Vendor = "Vendor",
      variable = "Variable",
      n = "N",
      median = "Median",
      iqr = "IQR"
    )
  )

Vendor

Variable

N

Median

IQR

bird

Trip.Duration

181,155

534

776

lime

Trip.Duration

280,092

615

895

spin

Trip.Duration

169,569

544

750

# Kruskal-Wallis rank sum test of trip duration across the vendors.
kruskal.test(
  Trip.Duration ~ Vendor,
  data = data
)
## 
##  Kruskal-Wallis rank sum test
## 
## data:  Trip.Duration by Vendor
## Kruskal-Wallis chi-squared = 2761.1, df = 2, p-value < 2.2e-16

There is a significant difference in median trip duration among the vendors.

But which pairs?

# Pairwise Wilcoxon rank sum tests of trip duration between vendors, with
# p-values adjusted by the "BH" method.
pairwise.wilcox.test(
  data$Trip.Duration,
  data$Vendor,
  p.adjust.method = "BH"
)
## 
##  Pairwise comparisons using Wilcoxon rank sum test with continuity correction 
## 
## data:  data$Trip.Duration and data$Vendor 
## 
##      bird   lime  
## lime <2e-16 -     
## spin <2e-16 <2e-16
## 
## P value adjustment method: BH

Every pair of vendors differs significantly from each other.

Conclusion

Lime's trips last longer than those of the other vendors, perhaps because its scooters are used more.