library(lattice)
library(rlang)
library(ggplot2)
library(data.table)
library(dplyr)
library(tidyr)
library(plotly)
library(caret)
library(lubridate)
library(httpuv)
Data Preprocessing
rawdata = read.csv("F:/Mtero Bike Competition/metro-bike-share-trip-data.csv")
#Summary Raw Data
glimpse(rawdata)
## Observations: 132,427
## Variables: 16
## $ Trip.ID <int> 1912818, 1919661, 1933383, 1944197,...
## $ Duration <int> 180, 1980, 300, 10860, 420, 780, 60...
## $ Start.Time <fct> 2016-07-07T04:17:00, 2016-07-07T06:...
## $ End.Time <fct> 2016-07-07T04:20:00, 2016-07-07T06:...
## $ Starting.Station.ID <int> 3014, 3014, 3016, 3016, 3032, 3021,...
## $ Starting.Station.Latitude <dbl> 34.05661, 34.05661, 34.05290, 34.05...
## $ Starting.Station.Longitude <dbl> -118.2372, -118.2372, -118.2416, -1...
## $ Ending.Station.ID <int> 3014, 3014, 3016, 3016, 3032, 3054,...
## $ Ending.Station.Latitude <dbl> 34.05661, 34.05661, 34.05290, 34.05...
## $ Ending.Station.Longitude <dbl> -118.2372, -118.2372, -118.2416, -1...
## $ Bike.ID <int> 6281, 6281, 5861, 5861, 6674, 6717,...
## $ Plan.Duration <int> 30, 30, 365, 365, 0, 30, 30, 365, 3...
## $ Trip.Route.Category <fct> Round Trip, Round Trip, Round Trip,...
## $ Passholder.Type <fct> Monthly Pass, Monthly Pass, Flex Pa...
## $ Starting.Lat.Long <fct> "{'longitude': '-118.23721', 'latit...
## $ Ending.Lat.Long <fct> "{'longitude': '-118.23721', 'latit...
#Missing Values
sapply(rawdata, function(x) sum(is.na(x)))
## Trip.ID Duration
## 0 0
## Start.Time End.Time
## 0 0
## Starting.Station.ID Starting.Station.Latitude
## 19 48
## Starting.Station.Longitude Ending.Station.ID
## 48 96
## Ending.Station.Latitude Ending.Station.Longitude
## 1051 1051
## Bike.ID Plan.Duration
## 10 766
## Trip.Route.Category Passholder.Type
## 0 0
## Starting.Lat.Long Ending.Lat.Long
## 0 0
#New dataset without missing values
nonadataset = rawdata[complete.cases(rawdata),]
# Calculate distance in kilometers between two points
library(geodist)
library(geosphere)
p1 = nonadataset[,c('Starting.Station.Longitude','Starting.Station.Latitude')]
p2 = nonadataset[,c('Ending.Station.Longitude','Ending.Station.Latitude')]
# Default unit = meters
distance.between.stations=distVincentyEllipsoid(p1,p2)
nonadataset = cbind(nonadataset,distance.between.stations)
temdata = nonadataset
Data Analysis for Travel Distance by different Ticket’s Type
#Data Analysis
#Time.spend: Travel Time == Duration
temdata = temdata %>%
separate(Start.Time,c("Start.date","Start.time"),"T") %>%
separate(End.Time,c("End.date","End.time"),"T") %>%
mutate(Start.Time = as.POSIXct(paste(Start.date,Start.time, sep=" ")),
End.Time = as.POSIXct(paste(End.date,End.time, sep=" ")),
Time.spend = difftime(End.Time,Start.Time,units="secs"))
#Labeling Plan.Duration
temdata = temdata %>%
mutate(Plan.Duration = ordered(Plan.Duration,
levels = c(0,30,365),
labels = c("Single Ride", "Monthly", "Yearly")))
#Analysis for Customer's Behaviors on Passholder.Type
bike_data_group <- temdata %>%
group_by(Trip.Route.Category,Passholder.Type,Plan.Duration) %>%
summarize(average_duration = round(mean(Duration)))
bike_data_group %>%
subset(Trip.Route.Category == 'One Way') %>%
plot_ly(x=~Passholder.Type, y=~average_duration,type = 'bar', name='One Way') %>%
add_trace(data=subset(bike_data_group,Trip.Route.Category == 'Round Trip'),x=~Passholder.Type,y=~average_duration,name='Round Trip')
bike_data_group
## # A tibble: 6 x 4
## # Groups: Trip.Route.Category, Passholder.Type [6]
## Trip.Route.Category Passholder.Type Plan.Duration average_duration
## <fct> <fct> <ord> <dbl>
## 1 One Way Flex Pass Yearly 987
## 2 One Way Monthly Pass Monthly 740
## 3 One Way Walk-up Single Ride 2162
## 4 Round Trip Flex Pass Yearly 2550
## 5 Round Trip Monthly Pass Monthly 1547
## 6 Round Trip Walk-up Single Ride 4112
Based on the map, we could quickly understand each station’s situation. Moreover, we could through this map to find out pain points of areas which have less usage. For Monthly and Walk-Up, They show the normal proportion. The One-Way average duration is almost half of Round-Trip duration. However, For Flex Pass, it’s One-Way average duration is much less than half of the Round-Trip duration. Therefore, we could conclude that customers who buy the Flex Pass are more prefer the round trip as their transportation behavior. As a result, we could make a promotion on Round-Trip for customers who buy Flex Pass.
Customer’s Usage Area and Record Count
library(leaflet)
#Record Count by Map
#Find out Data is not clean
#Longitude has 0 value in the column.
head(levels(as.factor(temdata$Starting.Station.Latitude)))
## [1] "0" "33.987381" "34.024479" "34.02589" "34.028511"
## [6] "34.0310516"
head(levels(as.factor(temdata$Starting.Station.Longitude)))
## [1] "-118.472832" "-118.393867" "-118.270813" "-118.27081" "-118.268082"
## [6] "-118.26808"
#Record Count Cluster Mapping
temdata %>%
filter(Starting.Station.Longitude != 0) %>%
select(lat=Starting.Station.Latitude,lng=Starting.Station.Longitude) %>%
leaflet() %>%
addTiles() %>%
addMarkers(clusterOptions = markerClusterOptions())
Based on the map, we could quickly understand each station’s situation. Moreover, we could through this map to pain point those areas which have less usage. Those stations close to the airport or parking lots have less usage, so we could assume that customers would prefer to use other tools as instead.