1 Read Data

rm(list = ls())

library(tidyverse)
library(lubridate)
library(skimr)
library(tseries)
library(forecast)

##########################################################
################   Bike Share Analysis   #################
##########################################################

bike <- read_csv("austin_bikeshare_trips.csv")

2 Summary of the Data

glimpse(bike)
## Rows: 649,231
## Columns: 12
## $ bikeid             <dbl> 8, 141, 578, 555, 86, 861, 382, 435, 555, 668, 981,~
## $ checkout_time      <time> 19:12:00, 02:06:04, 16:28:27, 15:12:00, 15:39:13, ~
## $ duration_minutes   <dbl> 41, 6, 13, 80, 25, 29, 17, 49, 19, 20, 44, 12, 21, ~
## $ end_station_id     <dbl> 2565, 2570, 2498, 2712, 3377, 2537, 2575, 2575, 257~
## $ end_station_name   <chr> "Trinity & 6th Street", "South Congress & Academy",~
## $ month              <dbl> 3, 10, 3, 11, 4, 5, 7, 1, 5, NA, 3, 9, 10, 3, 4, 3,~
## $ start_station_id   <dbl> 2536, 2494, 2538, 2497, 2707, 2540, 2567, 2575, 250~
## $ start_station_name <chr> "Waller & 6th St.", "2nd & Congress", "Bullock Muse~
## $ start_time         <dttm> 2015-03-19 19:12:00, 2016-10-30 02:06:04, 2016-03-~
## $ subscriber_type    <chr> "Walk Up", "Local365", "Local365", "24-Hour Kiosk (~
## $ trip_id            <dbl> 9900082882, 12617682, 9075366, 9900319298, 14468597~
## $ year               <dbl> 2015, 2016, 2016, 2014, 2017, 2015, 2016, 2015, 201~
summary(bike)
##      bikeid       checkout_time     duration_minutes   end_station_id 
##  Min.   :   3.0   Length:649231     Min.   :    0.00   Min.   :1001   
##  1st Qu.: 208.0   Class1:hms        1st Qu.:    8.00   1st Qu.:2499   
##  Median : 417.0   Class2:difftime   Median :   15.00   Median :2548   
##  Mean   : 471.6   Mode  :numeric    Mean   :   29.13   Mean   :2582   
##  3rd Qu.: 745.0                     3rd Qu.:   28.00   3rd Qu.:2571   
##  Max.   :5089.0                     Max.   :21296.00   Max.   :3687   
##  NA's   :723                                           NA's   :19842  
##  end_station_name       month        start_station_id start_station_name
##  Length:649231      Min.   : 1.000   Min.   :1001     Length:649231     
##  Class :character   1st Qu.: 3.000   1st Qu.:2501     Class :character  
##  Mode  :character   Median : 5.000   Median :2549     Mode  :character  
##                     Mean   : 5.887   Mean   :2584                       
##                     3rd Qu.: 9.000   3rd Qu.:2571                       
##                     Max.   :12.000   Max.   :3687                       
##                     NA's   :30752    NA's   :19041                      
##    start_time                  subscriber_type       trip_id         
##  Min.   :2013-12-21 09:12:00   Length:649231      Min.   :8.270e+06  
##  1st Qu.:2015-01-24 12:12:00   Class :character   1st Qu.:1.275e+07  
##  Median :2015-11-10 13:12:40   Mode  :character   Median :9.900e+09  
##  Mean   :2015-11-13 20:41:09                      Mean   :5.385e+09  
##  3rd Qu.:2016-09-30 17:01:11                      3rd Qu.:9.900e+09  
##  Max.   :2017-07-31 23:44:27                      Max.   :9.900e+09  
##                                                                      
##       year      
##  Min.   :2013   
##  1st Qu.:2014   
##  Median :2015   
##  Mean   :2015   
##  3rd Qu.:2016   
##  Max.   :2017   
##  NA's   :30752
skim(bike)
Data summary
Name bike
Number of rows 649231
Number of columns 12
_______________________
Column type frequency:
character 3
difftime 1
numeric 7
POSIXct 1
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
end_station_name 0 1 4 64 0 92 0
start_station_name 0 1 4 64 0 90 0
subscriber_type 2077 1 5 45 0 52 0

Variable type: difftime

skim_variable n_missing complete_rate min max median n_unique
checkout_time 0 1 121 secs 86279 secs 15:09:29 65663

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
bikeid 723 1.00 4.716200e+02 3.235900e+02 3 208 417 745 5089 ▇▁▁▁▁
duration_minutes 0 1.00 2.913000e+01 8.728000e+01 0 8 15 28 21296 ▇▁▁▁▁
end_station_id 19842 0.97 2.582470e+03 3.199000e+02 1001 2499 2548 2571 3687 ▁▁▇▁▁
month 30752 0.95 5.890000e+00 3.210000e+00 1 3 5 9 12 ▇▅▃▃▅
start_station_id 19041 0.97 2.584240e+03 3.208400e+02 1001 2501 2549 2571 3687 ▁▁▇▁▁
trip_id 0 1.00 5.384945e+09 4.925349e+09 8269930 12747093 9900028150 9900190458 9900352765 ▇▁▁▁▇
year 30752 0.95 2.015340e+03 1.020000e+00 2013 2014 2015 2016 2017 ▁▇▇▇▃

Variable type: POSIXct

skim_variable n_missing complete_rate min max median n_unique
start_time 0 1 2013-12-21 09:12:00 2017-07-31 23:44:27 2015-11-10 13:12:40 357010

3 EDA

3.1 Most Common Pickup Points, Top 20

#########################  1  ###########################

bike %>% 
  count(start_station_name, sort = T) %>% 
  head(20)
## # A tibble: 20 x 2
##    start_station_name                          n
##    <chr>                                   <int>
##  1 Riverside @ S. Lamar                    28695
##  2 City Hall / Lavaca & 2nd                28535
##  3 5th & Bowie                             26669
##  4 2nd & Congress                          26612
##  5 4th & Congress                          24972
##  6 Convention Center / 4th St. @ MetroRail 24357
##  7 Rainey St @ Cummings                    23468
##  8 Davis at Rainey Street                  22273
##  9 Capitol Station / Congress & 11th       20056
## 10 Pfluger Bridge @ W 2nd Street           19434
## 11 Long Center @ South 1st & Riverside     16285
## 12 Barton Springs & Riverside              16130
## 13 Barton Springs @ Kinney Ave             15816
## 14 3rd & West                              15576
## 15 Palmer Auditorium                       15044
## 16 South Congress & James                  14802
## 17 South Congress & Academy                14607
## 18 Zilker Park                             13390
## 19 8th & Congress                          13034
## 20 Convention Center / 3rd & Trinity       12848
fillColor = "#FFA533"
fillColor2 = "#EECA16"
fillColor3 = "#16D2EE"

bike %>% 
  count(start_station_name, sort = T) %>% 
  head(20) %>% 
  ggplot(aes(fct_reorder(start_station_name, n) , n)) + 
  geom_bar(stat="identity",colour="white", fill = fillColor) +
  geom_text(aes(label = format(n, big.mark = ",")), hjust =  -0.1, size = 2.5, colour = 'black', fontface = 'bold') +
  coord_flip() +
  labs(title = "Most Common Pickup Points", y = "Count", x = NULL) +
  theme_light() +
  scale_y_continuous(labels = scales::comma)

3.2 Most Common End Points, Top 20

bike %>% 
  count(end_station_name, sort = T) %>% 
  head(20) 
## # A tibble: 20 x 2
##    end_station_name                            n
##    <chr>                                   <int>
##  1 City Hall / Lavaca & 2nd                33125
##  2 2nd & Congress                          29516
##  3 Riverside @ S. Lamar                    28023
##  4 4th & Congress                          27902
##  5 Convention Center / 4th St. @ MetroRail 26862
##  6 5th & Bowie                             25070
##  7 Rainey St @ Cummings                    22696
##  8 Davis at Rainey Street                  21916
##  9 Pfluger Bridge @ W 2nd Street           18632
## 10 Capitol Station / Congress & 11th       17192
## 11 Long Center @ South 1st & Riverside     17103
## 12 Barton Springs @ Kinney Ave             16597
## 13 Barton Springs & Riverside              16541
## 14 Zilker Park                             15302
## 15 South Congress & James                  14975
## 16 Convention Center / 3rd & Trinity       14565
## 17 8th & Congress                          14122
## 18 South Congress & Academy                14120
## 19 West & 6th St.                          13926
## 20 Palmer Auditorium                       13783
bike %>% 
  count(end_station_name, sort = T) %>% 
  head(20) %>% 
  ggplot(aes(fct_reorder(end_station_name, n) , n)) + 
  geom_bar(stat="identity",colour="white", fill = fillColor) +
  geom_text(aes(label = format(n, big.mark = ",")), hjust =  -0.1, size = 2.5, colour = 'black', fontface = 'bold') +
  coord_flip() +
  labs(title = "Most Common End Points", y = "Count", x = NULL)+
  theme_light() +
  scale_y_continuous(labels = scales::comma)

3.3 Most Common Route, Top 10

We use both starting and ending station and count number of rides in the route. dplyr package’s group_by can be used to do this very easily.

bike %>% 
  group_by(start_station_name, end_station_name) %>% 
  count(start_station_name, sort = T) %>% 
  head(10)
## # A tibble: 10 x 3
## # Groups:   start_station_name, end_station_name [10]
##    start_station_name                  end_station_name                        n
##    <chr>                               <chr>                               <int>
##  1 Riverside @ S. Lamar                Riverside @ S. Lamar                 8418
##  2 Rainey St @ Cummings                Rainey St @ Cummings                 5691
##  3 2nd & Congress                      2nd & Congress                       4336
##  4 City Hall / Lavaca & 2nd            City Hall / Lavaca & 2nd             4335
##  5 Capitol Station / Congress & 11th   Capitol Station / Congress & 11th    3644
##  6 Zilker Park                         Zilker Park                          3631
##  7 Pfluger Bridge @ W 2nd Street       Pfluger Bridge @ W 2nd Street        3483
##  8 Zilker Park at Barton Springs & Wi~ Zilker Park at Barton Springs & Wi~  3362
##  9 Barton Springs & Riverside          Barton Springs & Riverside           3009
## 10 Palmer Auditorium                   Palmer Auditorium                    2941

3.4 Date Time Manipulation using lubridate package

lubridate package eneables us to manipulate Date-Time data very easily. We can extract month, day, weekday, date, Hour, Minutes, Seconds, time intervals etc easily.

3.4.1 Busiest Month

m <- bike %>% count( m = month(start_time, label = TRUE), sort = TRUE) 
m
## # A tibble: 12 x 2
##    m          n
##    <ord>  <int>
##  1 Mar   112002
##  2 Oct    69925
##  3 May    69572
##  4 Jul    61577
##  5 Jun    60192
##  6 Apr    53419
##  7 Jan    43761
##  8 Sep    42746
##  9 Feb    42032
## 10 Aug    38036
## 11 Nov    37182
## 12 Dec    18787
m %>% 
  ggplot(aes(fct_reorder(m, n), n)) + 
  geom_bar(stat="identity",colour="white", fill = fillColor) + 
  geom_text(aes(label = format(n, big.mark = ",")), hjust =  -0.1, size = 3, colour = 'black', fontface = 'bold') +
  coord_flip() +
  labs(title = "Busiest Month", y = "Count", x = NULL) +
  theme_light() +
  scale_y_continuous(labels = scales::comma)

3.4.2 Busiest Date of the month

d <- bike %>% count(day = mday(start_time), sort = TRUE)
d
## # A tibble: 31 x 2
##      day     n
##    <int> <int>
##  1    15 26695
##  2    14 26363
##  3    18 25419
##  4    13 24312
##  5    19 24198
##  6    12 24185
##  7    17 23184
##  8    16 23030
##  9    11 23018
## 10     4 22084
## # ... with 21 more rows
d %>% 
  ggplot(aes(reorder(day, n), n)) + 
  geom_bar(stat="identity",colour="white", fill = fillColor) + 
  geom_text(aes(label = format(n, big.mark = ",")), hjust = -0.05, size = 2.5, colour = 'black', fontface = 'bold') +
  coord_flip() + 
  labs(title = "Busiest Date of the Month", y = "Count", x = NULL) +
  theme_light() +
  scale_y_continuous(labels = scales::comma)

11th to 19th dates seem to be most busy during a months.

3.4.3 Busiest dates by Month

d <- bike %>% 
  count(day = mday(start_time), sort = TRUE)
d
## # A tibble: 31 x 2
##      day     n
##    <int> <int>
##  1    15 26695
##  2    14 26363
##  3    18 25419
##  4    13 24312
##  5    19 24198
##  6    12 24185
##  7    17 23184
##  8    16 23030
##  9    11 23018
## 10     4 22084
## # ... with 21 more rows
bike %>% 
  group_by(day = mday(start_time),
           month = month(start_time, label = T, abbr = F)) %>% 
  summarise(n = n()) %>% 
  
  ggplot(aes(day, n)) + 
  geom_bar(stat="identity",colour="white", fill = fillColor) + 
  geom_text(aes(label = format(n, big.mark = ",")), hjust = -0.08, size = 2, colour = 'black', fontface = 'bold') +
  coord_flip() + 
  labs(title = "Busiest Date of the Month", y = "Count", x = NULL) +
  theme_light() +
  scale_y_continuous(labels = scales::comma) +
  facet_wrap(~month)

3.4.4 Busiest WeekDay

w <- bike %>% count(w = wday(start_time, label = TRUE), sort = TRUE)
w
## # A tibble: 7 x 2
##   w          n
##   <ord>  <int>
## 1 Sat   140846
## 2 Sun   114337
## 3 Fri   102187
## 4 Mon    77707
## 5 Thu    75704
## 6 Wed    70007
## 7 Tue    68443
w %>% 
  ggplot(aes(reorder(w,n), n)) + 
  geom_bar(stat="identity",colour="white", fill = fillColor) + 
  geom_text(aes(label = format(n, big.mark = ",")), hjust = -0.05, size = 2.5, colour = 'black', fontface = 'bold') +
  coord_flip() + 
  labs(title = "Busiest Weekdays", y = "Count", x = NULL) +
  theme_light() + 
  scale_y_continuous(labels = scales::comma)

3.4.5 Busiest Hour

### Busiest Hour
h <- bike %>% mutate(h = format(start_time, "%H"))
h <- h %>% count(h, sort = TRUE)
h
## # A tibble: 24 x 2
##    h         n
##    <chr> <int>
##  1 17    56819
##  2 16    56643
##  3 13    56564
##  4 15    56346
##  5 14    55631
##  6 12    51667
##  7 18    47532
##  8 11    42441
##  9 19    36778
## 10 10    31716
## # ... with 14 more rows
h %>% 
  ggplot(aes(h, n)) + 
  geom_bar(stat="identity", colour="white", fill = fillColor2) + 
  geom_text(aes(label = format(n, big.mark = ",")), vjust = -0.2, size = 2.5 , colour = 'black', fontface = 'bold') +
  labs(title = "Busiest Hours of the Day", y = "Count", x = "Hour") +
  theme_light()

h %>% 
  ggplot(aes(reorder(h, n), n)) + 
  geom_bar(stat="identity", colour="white", fill = fillColor2) + 
  geom_text(aes(label = format(n, big.mark = ",")), hjust = -0.05, size = 2.5 , colour = 'black', fontface = 'bold') +
  labs(title = "Busiest Hours of the Day", y = "Count", x = NULL) +
  theme_light() +
  coord_flip() + 
  scale_y_continuous(labels = scales::comma)

4 Time Series Prediction

4.1 Convert to Monthly Data

We extract the date and time separately, then use the date variable to further transform our data.

# Convert to Monthly Data
# Again using lubridate package

bike <- tidyr::separate(data = bike, col = start_time, into = c("start_date", "time"), sep = " ", remove = FALSE)

4.2 Count monthly number of rides

using floor_date from lubridate, we convert all the dates of a month to the first day of the month, this way it makes the counting ride easier.

bike$c <- 1
bikemonth <- bike %>% 
  select(start_date, trip_id, c) %>% 
  group_by(month = lubridate::floor_date(ymd(start_date), "month")) %>%
  summarize(Rides = sum(c))

4.3 Input Missing Months by using previous year values

unique(bikemonth$month)
##  [1] "2013-12-01" "2014-01-01" "2014-02-01" "2014-03-01" "2014-04-01"
##  [6] "2014-05-01" "2014-06-01" "2014-07-01" "2014-08-01" "2014-09-01"
## [11] "2014-10-01" "2014-11-01" "2014-12-01" "2015-01-01" "2015-02-01"
## [16] "2015-03-01" "2015-04-01" "2015-05-01" "2015-06-01" "2015-07-01"
## [21] "2015-08-01" "2015-09-01" "2015-10-01" "2015-11-01" "2015-12-01"
## [26] "2016-01-01" "2016-02-01" "2016-03-01" "2016-05-01" "2016-06-01"
## [31] "2016-07-01" "2016-08-01" "2016-09-01" "2016-10-01" "2016-11-01"
## [36] "2017-01-01" "2017-02-01" "2017-03-01" "2017-04-01" "2017-05-01"
## [41] "2017-06-01" "2017-07-01"

Looks like April and December are missing from our data. We will use previous year’s values for this month. This is important for our time series data to have uniform dates and values to predict seasonality.

bikemonth %>% 
  filter(month %in% ymd(c("2015-12-01", "2015-04-01")))
## # A tibble: 2 x 2
##   month      Rides
##   <date>     <dbl>
## 1 2015-04-01 16554
## 2 2015-12-01 10057
missingmon <- data.frame(month = ymd(c("2016-12-01", "2016-04-01")), Rides = c(10057, 16554))
missingmon
##        month Rides
## 1 2016-12-01 10057
## 2 2016-04-01 16554
bikemonth <- rbind(bikemonth, missingmon) %>% arrange(month)
unique(bikemonth$month)
##  [1] "2013-12-01" "2014-01-01" "2014-02-01" "2014-03-01" "2014-04-01"
##  [6] "2014-05-01" "2014-06-01" "2014-07-01" "2014-08-01" "2014-09-01"
## [11] "2014-10-01" "2014-11-01" "2014-12-01" "2015-01-01" "2015-02-01"
## [16] "2015-03-01" "2015-04-01" "2015-05-01" "2015-06-01" "2015-07-01"
## [21] "2015-08-01" "2015-09-01" "2015-10-01" "2015-11-01" "2015-12-01"
## [26] "2016-01-01" "2016-02-01" "2016-03-01" "2016-04-01" "2016-05-01"
## [31] "2016-06-01" "2016-07-01" "2016-08-01" "2016-09-01" "2016-10-01"
## [36] "2016-11-01" "2016-12-01" "2017-01-01" "2017-02-01" "2017-03-01"
## [41] "2017-04-01" "2017-05-01" "2017-06-01" "2017-07-01"

4.4 Visualize the finalized time series

bikemonth %>% 
  ggplot(aes(month, Rides)) + 
  geom_line() + 
  labs( x= "Month", title="Monthly Bike Ride Counts")

4.5 Seasonal Time Series Model Fitting, SARIMA Model

ts <- (bikemonth$Rides)

acf(ts)

pacf(ts)

plot(diff(ts, 1), type='l')

adf.test(ts)
## 
##  Augmented Dickey-Fuller Test
## 
## data:  ts
## Dickey-Fuller = -4.8697, Lag order = 3, p-value = 0.01
## alternative hypothesis: stationary
plot(diff(ts, 2), type='l')

plot(diff(ts, 3), type='l')

acf(diff(ts, 1))

pacf(diff(ts, 1))

Observing the ACF and PACF plots, we decide to fit a SARIAMA(0,1,1)(1,1,1)(12) model to our data.

n = 36

mod2 <- arima(ts, order = c(0,1,1), seasonal = list(order = c(1,1,1), period = 12))

### Model and Predictions
mod2
## 
## Call:
## arima(x = ts, order = c(0, 1, 1), seasonal = list(order = c(1, 1, 1), period = 12))
## 
## Coefficients:
##           ma1    sar1     sma1
##       -0.7369  0.0592  -0.9998
## s.e.   0.1572  0.2692   0.8856
## 
## sigma^2 estimated as 7978084:  log likelihood = -297.82,  aic = 603.64
f <- as.data.frame(predict(mod2, n.ahead = n))

### Time Series plot of Forecasted Values

### Paste original and forecasted data together
bikemonth$type <- "Data"
d <- data.frame(max(bikemonth$month) + months(1:n), f$pred, rep("Forecast", n))
names(d) <- names(bikemonth)
bikeforcast <- rbind(bikemonth,d)
tail(bikeforcast)
## # A tibble: 6 x 3
##   month       Rides type    
##   <date>      <dbl> <chr>   
## 1 2020-02-01 17224. Forecast
## 2 2020-03-01 34689. Forecast
## 3 2020-04-01 24267. Forecast
## 4 2020-05-01 24128. Forecast
## 5 2020-06-01 21788. Forecast
## 6 2020-07-01 22111. Forecast
bikeCI <- bikeforcast %>% filter(type == "Forecast")


p <- bikeforcast %>% 
  ggplot(aes(month, Rides, color=type)) + 
  geom_line() +
  geom_point() +
  scale_x_date(date_labels = "%b %y", breaks = "3 month") +
  theme(axis.text.x = element_text(size=8, angle = 90))

plotly::ggplotly(p)