# Project Files Manager
library(here)
# The best package for data mining ever!
library(tidyverse)
# Pretty Tables :D
library(flextable)
#Sankey Network Graphic
library(networkD3)
# Interactive Maps
library(leaflet)
library(leaflet.minicharts)
# Palette colors
library(RColorBrewer)
#Summary Statistic Table
library(rstatix)
This is a short analysis of a Kaggle dataset about electric scooter trips in Chicago.
This kind of transportation was really common in Rio before the SARS-CoV-2 pandemic, but unfortunately that is over.
Let’s see what we can discover about it in 2020.
# Load the 2020 e-scooter trips CSV from the project's data/ folder.
data <- read.csv(here("data/E-Scooter_Trips_-_2020.csv"))

# Trip.Duration and Trip.Distance are stripped of "," (thousands separators in
# the raw text) and converted to plain numeric vectors.
# The former as.duration() |> as.numeric() round trip was removed: converting
# seconds to a lubridate Duration and straight back yields the same numbers,
# and it required lubridate to be attached even though the script never loads
# it explicitly.
data$Trip.Duration <- as.numeric(gsub(",", "", data$Trip.Duration))
data$Trip.Distance <- as.numeric(gsub(",", "", data$Trip.Distance))
# Bar chart: number of trips recorded for each vendor.
total_for_vendor <- data |>
  group_by(Vendor) |>
  summarise(total = n()) |>
  ggplot(aes(x = Vendor)) +
  geom_col(aes(y = total, fill = Vendor)) +
  labs(
    title = "Total of Trips",
    subtitle = "For Each E-Scooter's Vendor"
  )

# Render the chart with the "Dark2" ColorBrewer fill palette.
total_for_vendor + scale_fill_brewer(palette = "Dark2")
# Box plot of trip duration per vendor. Outlier points are not drawn and the
# visible y-range is zoomed to the 10%-92% quantiles of all trip durations
# (coord_cartesian only zooms; the box statistics still use every row).
mean_duration <- ggplot(
  data,
  aes(x = Vendor, y = Trip.Duration, fill = Vendor)
) +
  geom_boxplot(outlier.shape = NA) +
  coord_cartesian(ylim = quantile(data$Trip.Duration, c(0.1, 0.92))) +
  labs(
    title = "Average trip duration for scooter each vendor",
    subtitle = "Without Outliers"
  )

# Render the chart with the "Dark2" ColorBrewer fill palette.
mean_duration + scale_fill_brewer(palette = "Dark2")
# Table of the ten most frequent start -> end community-area pairs, with the
# trip count and the mean duration and distance of each pair.
# Rows containing any NA are dropped before aggregating.
common_travel_trajectory <- na.omit(data) |>
  group_by(Start.Community.Area.Name, End.Community.Area.Name) |>
  summarise(
    total = n(),
    meanDuration = mean(Trip.Duration),
    meanDistance = mean(Trip.Distance)
  ) |>
  arrange(desc(total)) |>
  head(10) |>
  flextable()

common_travel_trajectory
Start.Community.Area.Name | End.Community.Area.Name | total | meanDuration | meanDistance |
|---|---|---|---|---|
LAKE VIEW | LAKE VIEW | 73,717 | 598.3106 | 2,004.828 |
LINCOLN PARK | LINCOLN PARK | 55,233 | 711.6077 | 2,135.439 |
WEST TOWN | WEST TOWN | 44,929 | 706.2897 | 2,087.713 |
NEAR WEST SIDE | NEAR WEST SIDE | 25,090 | 817.9240 | 2,328.143 |
LOGAN SQUARE | LOGAN SQUARE | 17,867 | 675.9727 | 2,159.870 |
UPTOWN | UPTOWN | 17,378 | 683.1982 | 2,312.415 |
NEAR NORTH SIDE | NEAR NORTH SIDE | 17,089 | 824.2961 | 2,151.306 |
HYDE PARK | HYDE PARK | 15,477 | 907.0672 | 2,230.932 |
LINCOLN PARK | LAKE VIEW | 13,005 | 811.2798 | 2,987.432 |
LAKE VIEW | LINCOLN PARK | 12,538 | 828.6661 | 2,913.175 |
# Ten most frequent start -> end community-area pairs (rows with NA dropped),
# used as the links of the Sankey diagram.
data_trajectory_10 <- data |>
  na.omit() |>
  group_by(Start.Community.Area.Name, End.Community.Area.Name) |>
  summarise(
    total = n(),
    meanDuration = mean(Trip.Duration),
    meanDistance = mean(Trip.Distance)
  ) |>
  arrange(desc(total)) |>
  head(10)

# Node table. Start areas keep their original spelling while end areas are
# lower-cased, presumably so that an area appearing as both origin and
# destination becomes two distinct nodes (one per side of the diagram).
nodes <- data.frame(
  name = unique(c(
    as.character(data_trajectory_10$Start.Community.Area.Name),
    tolower(as.character(data_trajectory_10$End.Community.Area.Name))
  ))
)

# Position of each link's endpoints in `nodes`, shifted to zero-based indices.
# The target lookup must use the same lower-casing as the node table above.
data_trajectory_10$IDsource <-
  match(data_trajectory_10$Start.Community.Area.Name, nodes$name) - 1
data_trajectory_10$IDtarget <-
  match(tolower(data_trajectory_10$End.Community.Area.Name), nodes$name) - 1

# Sankey diagram of the trip flows. `TRUE` replaces the reassignable `T`, and
# the stray trailing comma (an empty extra argument) has been removed.
p <- sankeyNetwork(
  Links = data_trajectory_10, Nodes = nodes,
  Source = "IDsource", Target = "IDtarget",
  Value = "total", NodeID = "name",
  sinksRight = TRUE, nodeWidth = 10, fontSize = 13, nodePadding = 20
)
p
# Share of `a` in `b`, rounded to two decimal places.
# Note: the result is a proportion (e.g. 0.25), not a value scaled to 0-100.
# Vectorised over both arguments.
percentage <- function(a, b) {
  share <- a / b
  round(share, digits = 2)
}
# Per start community area (name plus centroid coordinates): number of trips
# of each vendor, the total across vendors and each vendor's share of it.
data_mod <- data |>
  group_by(
    Start.Community.Area.Name,
    Start.Centroid.Latitude,
    Start.Centroid.Longitude,
    Vendor
  ) |>
  # Drop the grouping right away; the later steps work row by row.
  summarise(n = n(), .groups = "drop") |>
  na.omit() |>
  # One column per vendor. An area a vendor did not serve gets 0 instead of
  # NA; otherwise `total`, every share and max(data_mod$total) would be NA.
  pivot_wider(names_from = "Vendor", values_from = n, values_fill = 0L) |>
  mutate(
    total = bird + lime + spin,
    p.bird = percentage(bird, total),
    p.lime = percentage(lime, total),
    p.spin = percentage(spin, total)
  )
# Base map: a leaflet widget with the default tiles plus the OpenStreetMap
# provider tiles.
basemap <- leaflet() |>
  addTiles() |>
  addProviderTiles(providers$OpenStreetMap)

# One pie chart per start community area, placed at the area's centroid.
# Slices are the vendor shares; the pie width scales with the square root of
# the area's total trips, relative to the busiest area (max width 40).
addMinicharts(
  basemap,
  lat = data_mod$Start.Centroid.Latitude,
  lng = data_mod$Start.Centroid.Longitude,
  type = "pie",
  chartdata = data_mod[, c('p.bird', 'p.lime', 'p.spin')],
  width = 40 * sqrt(data_mod$total) / sqrt(max(data_mod$total)),
  layerId = data_mod$Start.Community.Area.Name,
  colorPalette = brewer.pal(3, "Dark2")
)
Is there a difference in trip duration among the three e-scooter vendors?
# Median and interquartile range of trip duration for each vendor,
# rendered as a flextable with readable column headers.
get_summary_stats(
  group_by(data, Vendor),
  Trip.Duration,
  type = "median_iqr"
) |>
  flextable() |>
  set_header_labels(
    Vendor = "Vendor",
    variable = "Variable",
    n = "N",
    median = "Median",
    iqr = "IQR"
  )
Vendor | Variable | N | Median | IQR |
|---|---|---|---|---|
bird | Trip.Duration | 181,155 | 534 | 776 |
lime | Trip.Duration | 280,092 | 615 | 895 |
spin | Trip.Duration | 169,569 | 544 | 750 |
# Kruskal-Wallis rank sum test of Trip.Duration across the Vendor groups.
kruskal.test(Trip.Duration~ Vendor, data = data)
##
## Kruskal-Wallis rank sum test
##
## data: Trip.Duration by Vendor
## Kruskal-Wallis chi-squared = 2761.1, df = 2, p-value < 2.2e-16
There is a significant difference in median trip duration between vendors.
But between which pairs?
# Pairwise Wilcoxon rank sum tests of Trip.Duration between every pair of
# vendors, with p-values adjusted by the "BH" (Benjamini-Hochberg) method.
pairwise.wilcox.test(data$Trip.Duration, data$Vendor,
p.adjust.method = "BH")
##
## Pairwise comparisons using Wilcoxon rank sum test with continuity correction
##
## data: data$Trip.Duration and data$Vendor
##
## bird lime
## lime <2e-16 -
## spin <2e-16 <2e-16
##
## P value adjustment method: BH
Every pair of vendors differs significantly from each other.
Lime has longer trips (a higher median trip duration) than the other vendors, perhaps because its scooters are used more.