library(lattice)
library(rlang)
library(ggplot2)
library(data.table)
library(dplyr)
library(tidyr)
library(plotly)
library(caret)
library(lubridate)
library(httpuv)

Data Preprocessing

rawdata = read.csv("F:/Mtero Bike Competition/metro-bike-share-trip-data.csv")
#Summary Raw Data
glimpse(rawdata)

## Observations: 132,427
## Variables: 16
## $ Trip.ID                    <int> 1912818, 1919661, 1933383, 1944197,...
## $ Duration                   <int> 180, 1980, 300, 10860, 420, 780, 60...
## $ Start.Time                 <fct> 2016-07-07T04:17:00, 2016-07-07T06:...
## $ End.Time                   <fct> 2016-07-07T04:20:00, 2016-07-07T06:...
## $ Starting.Station.ID        <int> 3014, 3014, 3016, 3016, 3032, 3021,...
## $ Starting.Station.Latitude  <dbl> 34.05661, 34.05661, 34.05290, 34.05...
## $ Starting.Station.Longitude <dbl> -118.2372, -118.2372, -118.2416, -1...
## $ Ending.Station.ID          <int> 3014, 3014, 3016, 3016, 3032, 3054,...
## $ Ending.Station.Latitude    <dbl> 34.05661, 34.05661, 34.05290, 34.05...
## $ Ending.Station.Longitude   <dbl> -118.2372, -118.2372, -118.2416, -1...
## $ Bike.ID                    <int> 6281, 6281, 5861, 5861, 6674, 6717,...
## $ Plan.Duration              <int> 30, 30, 365, 365, 0, 30, 30, 365, 3...
## $ Trip.Route.Category        <fct> Round Trip, Round Trip, Round Trip,...
## $ Passholder.Type            <fct> Monthly Pass, Monthly Pass, Flex Pa...
## $ Starting.Lat.Long          <fct> "{'longitude': '-118.23721', 'latit...
## $ Ending.Lat.Long            <fct> "{'longitude': '-118.23721', 'latit...

#Missing Values 
sapply(rawdata, function(x) sum(is.na(x)))

##                    Trip.ID                   Duration 
##                          0                          0 
##                 Start.Time                   End.Time 
##                          0                          0 
##        Starting.Station.ID  Starting.Station.Latitude 
##                         19                         48 
## Starting.Station.Longitude          Ending.Station.ID 
##                         48                         96 
##    Ending.Station.Latitude   Ending.Station.Longitude 
##                       1051                       1051 
##                    Bike.ID              Plan.Duration 
##                         10                        766 
##        Trip.Route.Category            Passholder.Type 
##                          0                          0 
##          Starting.Lat.Long            Ending.Lat.Long 
##                          0                          0

#New dataset without missing values
nonadataset = rawdata[complete.cases(rawdata),]

# Calculate distance in kilometers between two points
library(geodist)
library(geosphere)
p1 = nonadataset[,c('Starting.Station.Longitude','Starting.Station.Latitude')]
p2 = nonadataset[,c('Ending.Station.Longitude','Ending.Station.Latitude')]
# Default unit = meters
distance.between.stations=distVincentyEllipsoid(p1,p2)
nonadataset = cbind(nonadataset,distance.between.stations)
temdata = nonadataset

Data Analysis for Travel Distance by different Ticket’s Type

#Data Analysis
#Time.spend: Travel Time == Duration
temdata = temdata %>%
  separate(Start.Time,c("Start.date","Start.time"),"T") %>%
  separate(End.Time,c("End.date","End.time"),"T") %>%
  mutate(Start.Time = as.POSIXct(paste(Start.date,Start.time, sep=" ")),
         End.Time = as.POSIXct(paste(End.date,End.time, sep=" ")),
         Time.spend = difftime(End.Time,Start.Time,units="secs"))

#Labeling Plan.Duration
temdata = temdata %>%
  mutate(Plan.Duration = ordered(Plan.Duration,
                                 levels = c(0,30,365),
                                 labels = c("Single Ride", "Monthly", "Yearly")))

#Analysis for Customer's Behaviors on Passholder.Type
bike_data_group <- temdata %>%
  group_by(Trip.Route.Category,Passholder.Type,Plan.Duration) %>%
  summarize(average_duration = round(mean(Duration)))
bike_data_group %>%
  subset(Trip.Route.Category == 'One Way') %>%
  plot_ly(x=~Passholder.Type, y=~average_duration,type = 'bar', name='One Way') %>%
  add_trace(data=subset(bike_data_group,Trip.Route.Category == 'Round Trip'),x=~Passholder.Type,y=~average_duration,name='Round Trip')

bike_data_group

## # A tibble: 6 x 4
## # Groups:   Trip.Route.Category, Passholder.Type [6]
##   Trip.Route.Category Passholder.Type Plan.Duration average_duration
##   <fct>               <fct>           <ord>                    <dbl>
## 1 One Way             Flex Pass       Yearly                     987
## 2 One Way             Monthly Pass    Monthly                    740
## 3 One Way             Walk-up         Single Ride               2162
## 4 Round Trip          Flex Pass       Yearly                    2550
## 5 Round Trip          Monthly Pass    Monthly                   1547
## 6 Round Trip          Walk-up         Single Ride               4112

Based on the map, we could quickly understand each station’s situation. Moreover, we could through this map to find out pain points of areas which have less usage. For Monthly and Walk-Up, They show the normal proportion. The One-Way average duration is almost half of Round-Trip duration. However, For Flex Pass, it’s One-Way average duration is much less than half of the Round-Trip duration. Therefore, we could conclude that customers who buy the Flex Pass are more prefer the round trip as their transportation behavior. As a result, we could make a promotion on Round-Trip for customers who buy Flex Pass.

Customer’s Usage Area and Record Count

library(leaflet)
#Record Count by Map
#Find out Data is not clean
#Longitude has 0 value in the column.
head(levels(as.factor(temdata$Starting.Station.Latitude)))

## [1] "0"          "33.987381"  "34.024479"  "34.02589"   "34.028511" 
## [6] "34.0310516"

head(levels(as.factor(temdata$Starting.Station.Longitude)))

## [1] "-118.472832" "-118.393867" "-118.270813" "-118.27081"  "-118.268082"
## [6] "-118.26808"

#Record Count Cluster Mapping 
temdata %>%
  filter(Starting.Station.Longitude != 0) %>%
  select(lat=Starting.Station.Latitude,lng=Starting.Station.Longitude) %>%
  leaflet() %>%
  addTiles() %>%
  addMarkers(clusterOptions = markerClusterOptions())

Based on the map, we could quickly understand each station’s situation. Moreover, we could through this map to pain point those areas which have less usage. Those stations close to the airport or parking lots have less usage, so we could assume that customers would prefer to use other tools as instead.

Extra Revenue Analysis

#Extra Revenue Analysis
temdata = temdata %>%
  mutate(overtimerevenue = ifelse(Duration > (30*60), (Duration-(30*60))/(30*60)*1.75, 0))
temdata = as_tibble(as.data.frame(temdata))
YoM_df = temdata %>%
  mutate(Start.date = as.Date(Start.date)) %>%
  arrange(Start.date) %>%
  group_by(Year_Month=floor_date(Start.date, "month"),Passholder.Type) %>%
  summarise(total.extra.revenue = sum(overtimerevenue))
YoM_df = as_tibble(as.data.frame(YoM_df))
YoM_df %>%
  mutate(total.extra.revenue = as.numeric(total.extra.revenue)) %>%
  subset(Passholder.Type == 'Flex Pass') %>% plot_ly() %>%
  add_trace(x=~Year_Month, y=~total.extra.revenue,name='Flex Pass', type='scatter',mode='lines') %>%
  add_trace(data=subset(YoM_df,Passholder.Type == 'Monthly Pass'),x=~Year_Month, y=~total.extra.revenue,name='Monthly Pass', type='scatter',mode='lines') %>%
  add_trace(data=subset(YoM_df,Passholder.Type == 'Walk-up'),x=~Year_Month, y=~total.extra.revenue,name='Walk-up', type='scatter',mode='lines')

The walk-up ticket has much higher extra revenue for the company. Therefore, this will be the company’s potential profit model. We should make a strategy and push other types of customers to this type.

Metro_Bike_Data_Analysis

Chien-Hua Wang

2019/04/25

Data Preprocessing

Data Analysis for Travel Distance by different Ticket’s Type

Customer’s Usage Area and Record Count

Extra Revenue Analysis

The walk-up ticket has much higher extra revenue for the company. Therefore, this will be the company’s potential profit model. We should make a strategy and push other types of customers to this type.