Libraries

# Project Files Manager
library(here)

# The best package for data mining ever!
library(tidyverse)

# Pretty Tables :D
library(flextable)

#Sankey Network Graphic
library(networkD3)

# Interactive Maps
library(leaflet)
library(leaflet.minicharts)

# Palette Colors
library(RColorBrewer)

#Summary Statistic Table
library(rstatix)

Introduction

This is a short analysis of a Kaggle dataset about electric scooter trips in Chicago.

This kind of transportation was really common in Rio before the SARS-CoV-2 pandemic, but unfortunately that is over now.

Let’s see what we can discover about it in the 2020 data.

First Steps

# Load the 2020 e-scooter trips export from the project's data folder.
data <- read.csv(here("data/E-Scooter_Trips_-_2020.csv"))

# Strip the thousands separators (",") and convert both measurements to plain
# numbers. The earlier as.duration() |> as.numeric() round trip handed back the
# same numeric values, and as.duration() comes from lubridate, which this file
# never loads explicitly, so it has been dropped.
data$Trip.Duration <- as.numeric(gsub(",", "", data$Trip.Duration))

data$Trip.Distance <- as.numeric(gsub(",", "", data$Trip.Distance))

Static Data Visualizations

# Bar chart with one column per vendor whose height is that vendor's number
# of trips (rows in `data`).
total_for_vendor <- data |>
  group_by(Vendor) |>
  summarise(total = n()) |>
  ggplot(aes(x = Vendor, y = total, fill = Vendor)) +
  geom_col() +
  labs(
    title = "Total of Trips",
    subtitle = "For Each E-Scooter's Vendor"
  )

# Print the chart with the "Dark2" fill palette.
total_for_vendor + scale_fill_brewer(palette = "Dark2")

# Box plot of trip duration per vendor. Outlier points are not drawn and the
# y axis is zoomed to the 10%-92% quantile range of all trip durations.
mean_duration <- data |>
  ggplot(aes(x = Vendor, y = Trip.Duration, fill = Vendor)) +
  geom_boxplot(outlier.shape = NA) +
  labs(
    title = "Average trip duration for scooter each vendor",
    subtitle = "Without Outliers"
  ) +
  # na.rm = TRUE: quantile() stops with an error when the column holds any NA,
  # which would otherwise abort the whole plot.
  coord_cartesian(
    ylim = quantile(data$Trip.Duration, c(0.1, 0.92), na.rm = TRUE)
  )

# Print the chart with the "Dark2" fill palette.
mean_duration + scale_fill_brewer(palette = "Dark2")

# Table of the ten most frequent (start area, end area) pairs with their trip
# count, mean duration and mean distance. na.omit() first drops every row that
# has an NA in any column.
common_travel_trajectory <- data |>
  na.omit() |>
  group_by(Start.Community.Area.Name, End.Community.Area.Name) |>
  summarise(
    total = n(),
    meanDuration = mean(Trip.Duration),
    meanDistance = mean(Trip.Distance),
    # Return an ungrouped result: without this the output stays grouped by the
    # start area and summarise() prints a regrouping message.
    .groups = "drop"
  ) |>
  arrange(desc(total)) |>
  head(10) |>
  flextable()

common_travel_trajectory

Start.Community.Area.Name	End.Community.Area.Name	total	meanDuration	meanDistance
LAKE VIEW	LAKE VIEW	73,717	598.3106	2,004.828
LINCOLN PARK	LINCOLN PARK	55,233	711.6077	2,135.439
WEST TOWN	WEST TOWN	44,929	706.2897	2,087.713
NEAR WEST SIDE	NEAR WEST SIDE	25,090	817.9240	2,328.143
LOGAN SQUARE	LOGAN SQUARE	17,867	675.9727	2,159.870
UPTOWN	UPTOWN	17,378	683.1982	2,312.415
NEAR NORTH SIDE	NEAR NORTH SIDE	17,089	824.2961	2,151.306
HYDE PARK	HYDE PARK	15,477	907.0672	2,230.932
LINCOLN PARK	LAKE VIEW	13,005	811.2798	2,987.432
LAKE VIEW	LINCOLN PARK	12,538	828.6661	2,913.175

Interactive Data Visualizations

Top 10 Trajectory

# Ten most frequent (start area, end area) pairs with trip count, mean duration
# and mean distance. na.omit() first drops every row with an NA in any column.
data_trajectory_10 <- data |>
  na.omit() |>
  group_by(Start.Community.Area.Name, End.Community.Area.Name) |>
  summarise(
    total = n(),
    meanDuration = mean(Trip.Duration),
    meanDistance = mean(Trip.Distance),
    # Return an ungrouped result so the later column assignments and the
    # Sankey call work on a plain table (and no regrouping message is printed).
    .groups = "drop"
  ) |>
  arrange(desc(total)) |>
  head(10)

# Node table for the Sankey diagram. End-area names are lower-cased so that an
# area used both as origin and as destination becomes two distinct nodes
# (upper-case = start, lower-case = end).
nodes <- data.frame(
  name = unique(c(
    as.character(data_trajectory_10$Start.Community.Area.Name),
    tolower(as.character(data_trajectory_10$End.Community.Area.Name))
  ))
)

# match() yields 1-based positions in `nodes`; subtracting 1 gives the
# zero-based node indices passed to sankeyNetwork() as Source / Target.
data_trajectory_10$IDsource <-
  match(data_trajectory_10$Start.Community.Area.Name, nodes$name) - 1
data_trajectory_10$IDtarget <-
  match(tolower(data_trajectory_10$End.Community.Area.Name), nodes$name) - 1

# Sankey diagram: link width is the number of trips between the two areas.
# TRUE replaces the reassignable shorthand T, and the stray trailing comma
# (which passed an empty extra argument) is removed.
p <- sankeyNetwork(
  Links = data_trajectory_10, Nodes = nodes,
  Source = "IDsource", Target = "IDtarget",
  Value = "total", NodeID = "name",
  sinksRight = TRUE, nodeWidth = 10, fontSize = 13, nodePadding = 20
)

p

Vendor by Start Area

The area of each pie is proportional to the number of trips (its width scales with the square root of the trip count).

# Share of `a` in `b`, rounded to two decimal places.
# The result is a proportion (a / b); it is not multiplied by 100.
percentage <- function(a, b) {
  ratio <- a / b
  round(ratio, digits = 2)
}


# One row per start community area (name + centroid coordinates) with the
# number of trips of each vendor in its own column, the total across the three
# vendors, and each vendor's share of that total.
data_mod <- data |>
  group_by(
    Start.Community.Area.Name,
    Start.Centroid.Latitude,
    Start.Centroid.Longitude,
    Vendor
  ) |>
  # .groups = "drop": keep the result ungrouped for the steps below.
  summarise(n = n(), .groups = "drop") |>
  # Drop the combinations whose area name or centroid is missing.
  na.omit() |>
  # values_fill = 0L: an area with no trips for some vendor gets a count of 0
  # instead of NA, which would otherwise turn `total`, the shares and
  # max(data_mod$total) into NA.
  pivot_wider(names_from = "Vendor", values_from = n, values_fill = 0L) |>
  mutate(total = bird + lime + spin) |>
  mutate(
    p.bird = percentage(bird, total),
    p.lime = percentage(lime, total),
    p.spin = percentage(spin, total)
  )
   

# Base map: the default tile layer plus the OpenStreetMap provider tiles.
basemap <- leaflet() |>
  addTiles() |>
  addProviderTiles(providers$OpenStreetMap)

# One pie chart per start area, placed at the area's centroid and split by the
# vendors' shares. The chart width is 40 times the square root of the area's
# trip count relative to the busiest area.
addMinicharts(
  basemap,
  lng = data_mod$Start.Centroid.Longitude,
  lat = data_mod$Start.Centroid.Latitude,
  chartdata = data_mod[, c("p.bird", "p.lime", "p.spin")],
  type = "pie",
  colorPalette = brewer.pal(3, "Dark2"),
  width = 40 * sqrt(data_mod$total) / sqrt(max(data_mod$total)),
  layerId = data_mod$Start.Community.Area.Name
)

Quick Statistical Analysis

Is there a difference in trip duration among the three e-scooter vendors?

# Per-vendor summary of Trip.Duration (count, median, inter-quartile range),
# rendered as a flextable with readable column headers.
flextable(
  get_summary_stats(
    group_by(data, Vendor),
    Trip.Duration,
    type = "median_iqr"
  )
) |>
  set_header_labels(
    Vendor = "Vendor",
    variable = "Variable",
    n = "N",
    median = "Median",
    iqr = "IQR"
  )

Vendor	Variable	N	Median	IQR
bird	Trip.Duration	181,155	534	776
lime	Trip.Duration	280,092	615	895
spin	Trip.Duration	169,569	544	750

# Kruskal-Wallis rank sum test of Trip.Duration across the Vendor groups.
kruskal.test(Trip.Duration~ Vendor, data = data)

## 
##  Kruskal-Wallis rank sum test
## 
## data:  Trip.Duration by Vendor
## Kruskal-Wallis chi-squared = 2761.1, df = 2, p-value < 2.2e-16

There is a significant difference in median trip duration between vendors.

But which pairs?

# Pairwise Wilcoxon rank sum tests of trip duration between every pair of
# vendors, with p-values adjusted by the "BH" method.
pairwise.wilcox.test(
  x = data$Trip.Duration,
  g = data$Vendor,
  p.adjust.method = "BH"
)

## 
##  Pairwise comparisons using Wilcoxon rank sum test with continuity correction 
## 
## data:  data$Trip.Duration and data$Vendor 
## 
##      bird   lime  
## lime <2e-16 -     
## spin <2e-16 <2e-16
## 
## P value adjustment method: BH

Every pair of vendors differs significantly from each other.

Conclusion

Lime’s trips tend to last longer than those of the other vendors, perhaps because its scooters are used more.

Kaggle_E-Scooter_Trips_-_2020

Romario Gomes

2023-04-10