1 Read Data

rm(list = ls())

library(tidyverse)
library(lubridate)
library(skimr)
library(tseries)
library(forecast)

##########################################################
################   Bike Share Analysis   #################
##########################################################

bike <- read_csv("austin_bikeshare_trips.csv")

2 Summary of the Data

glimpse(bike)

## Rows: 649,231
## Columns: 12
## $ bikeid             <dbl> 8, 141, 578, 555, 86, 861, 382, 435, 555, 668, 981,~
## $ checkout_time      <time> 19:12:00, 02:06:04, 16:28:27, 15:12:00, 15:39:13, ~
## $ duration_minutes   <dbl> 41, 6, 13, 80, 25, 29, 17, 49, 19, 20, 44, 12, 21, ~
## $ end_station_id     <dbl> 2565, 2570, 2498, 2712, 3377, 2537, 2575, 2575, 257~
## $ end_station_name   <chr> "Trinity & 6th Street", "South Congress & Academy",~
## $ month              <dbl> 3, 10, 3, 11, 4, 5, 7, 1, 5, NA, 3, 9, 10, 3, 4, 3,~
## $ start_station_id   <dbl> 2536, 2494, 2538, 2497, 2707, 2540, 2567, 2575, 250~
## $ start_station_name <chr> "Waller & 6th St.", "2nd & Congress", "Bullock Muse~
## $ start_time         <dttm> 2015-03-19 19:12:00, 2016-10-30 02:06:04, 2016-03-~
## $ subscriber_type    <chr> "Walk Up", "Local365", "Local365", "24-Hour Kiosk (~
## $ trip_id            <dbl> 9900082882, 12617682, 9075366, 9900319298, 14468597~
## $ year               <dbl> 2015, 2016, 2016, 2014, 2017, 2015, 2016, 2015, 201~

summary(bike)

##      bikeid       checkout_time     duration_minutes   end_station_id 
##  Min.   :   3.0   Length:649231     Min.   :    0.00   Min.   :1001   
##  1st Qu.: 208.0   Class1:hms        1st Qu.:    8.00   1st Qu.:2499   
##  Median : 417.0   Class2:difftime   Median :   15.00   Median :2548   
##  Mean   : 471.6   Mode  :numeric    Mean   :   29.13   Mean   :2582   
##  3rd Qu.: 745.0                     3rd Qu.:   28.00   3rd Qu.:2571   
##  Max.   :5089.0                     Max.   :21296.00   Max.   :3687   
##  NA's   :723                                           NA's   :19842  
##  end_station_name       month        start_station_id start_station_name
##  Length:649231      Min.   : 1.000   Min.   :1001     Length:649231     
##  Class :character   1st Qu.: 3.000   1st Qu.:2501     Class :character  
##  Mode  :character   Median : 5.000   Median :2549     Mode  :character  
##                     Mean   : 5.887   Mean   :2584                       
##                     3rd Qu.: 9.000   3rd Qu.:2571                       
##                     Max.   :12.000   Max.   :3687                       
##                     NA's   :30752    NA's   :19041                      
##    start_time                  subscriber_type       trip_id         
##  Min.   :2013-12-21 09:12:00   Length:649231      Min.   :8.270e+06  
##  1st Qu.:2015-01-24 12:12:00   Class :character   1st Qu.:1.275e+07  
##  Median :2015-11-10 13:12:40   Mode  :character   Median :9.900e+09  
##  Mean   :2015-11-13 20:41:09                      Mean   :5.385e+09  
##  3rd Qu.:2016-09-30 17:01:11                      3rd Qu.:9.900e+09  
##  Max.   :2017-07-31 23:44:27                      Max.   :9.900e+09  
##                                                                      
##       year      
##  Min.   :2013   
##  1st Qu.:2014   
##  Median :2015   
##  Mean   :2015   
##  3rd Qu.:2016   
##  Max.   :2017   
##  NA's   :30752

skim(bike)

Data summary
Name	bike
Number of rows	649231
Number of columns	12
_______________________
Column type frequency:
character	3
difftime	1
numeric	7
POSIXct	1
________________________
Group variables	None

Variable type: character

skim_variable	n_missing	complete_rate	min	max	n_unique
end_station_name	0	1	4	64	92
start_station_name	0	1	4	64	90
subscriber_type	2077	1	5	45	52

Variable type: difftime

skim_variable	n_missing	complete_rate	min	max	median	n_unique
checkout_time	0	1	121 secs	86279 secs	15:09:29	65663

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
bikeid	723	1.00	4.716200e+02	3.235900e+02	3	208	417	745	5089	▇▁▁▁▁
duration_minutes	0	1.00	2.913000e+01	8.728000e+01	0	8	15	28	21296	▇▁▁▁▁
end_station_id	19842	0.97	2.582470e+03	3.199000e+02	1001	2499	2548	2571	3687	▁▁▇▁▁
month	30752	0.95	5.890000e+00	3.210000e+00	1	3	5	9	12	▇▅▃▃▅
start_station_id	19041	0.97	2.584240e+03	3.208400e+02	1001	2501	2549	2571	3687	▁▁▇▁▁
trip_id	0	1.00	5.384945e+09	4.925349e+09	8269930	12747093	9900028150	9900190458	9900352765	▇▁▁▁▇
year	30752	0.95	2.015340e+03	1.020000e+00	2013	2014	2015	2016	2017	▁▇▇▇▃

Variable type: POSIXct

skim_variable	n_missing	complete_rate	min	max	median	n_unique
start_time	0	1	2013-12-21 09:12:00	2017-07-31 23:44:27	2015-11-10 13:12:40	357010

3 EDA

3.1 Most Common Pickup Points, Top 20

#########################  1  ###########################

bike %>% 
  count(start_station_name, sort = T) %>% 
  head(20)

## # A tibble: 20 x 2
##    start_station_name                          n
##    <chr>                                   <int>
##  1 Riverside @ S. Lamar                    28695
##  2 City Hall / Lavaca & 2nd                28535
##  3 5th & Bowie                             26669
##  4 2nd & Congress                          26612
##  5 4th & Congress                          24972
##  6 Convention Center / 4th St. @ MetroRail 24357
##  7 Rainey St @ Cummings                    23468
##  8 Davis at Rainey Street                  22273
##  9 Capitol Station / Congress & 11th       20056
## 10 Pfluger Bridge @ W 2nd Street           19434
## 11 Long Center @ South 1st & Riverside     16285
## 12 Barton Springs & Riverside              16130
## 13 Barton Springs @ Kinney Ave             15816
## 14 3rd & West                              15576
## 15 Palmer Auditorium                       15044
## 16 South Congress & James                  14802
## 17 South Congress & Academy                14607
## 18 Zilker Park                             13390
## 19 8th & Congress                          13034
## 20 Convention Center / 3rd & Trinity       12848

fillColor = "#FFA533"
fillColor2 = "#EECA16"
fillColor3 = "#16D2EE"

bike %>% 
  count(start_station_name, sort = T) %>% 
  head(20) %>% 
  ggplot(aes(fct_reorder(start_station_name, n) , n)) + 
  geom_bar(stat="identity",colour="white", fill = fillColor) +
  geom_text(aes(label = format(n, big.mark = ",")), hjust =  -0.1, size = 2.5, colour = 'black', fontface = 'bold') +
  coord_flip() +
  labs(title = "Most Common Pickup Points", y = "Count", x = NULL) +
  theme_light() +
  scale_y_continuous(labels = scales::comma)

3.2 Most Common End Points, Top 20

bike %>% 
  count(end_station_name, sort = T) %>% 
  head(20)

## # A tibble: 20 x 2
##    end_station_name                            n
##    <chr>                                   <int>
##  1 City Hall / Lavaca & 2nd                33125
##  2 2nd & Congress                          29516
##  3 Riverside @ S. Lamar                    28023
##  4 4th & Congress                          27902
##  5 Convention Center / 4th St. @ MetroRail 26862
##  6 5th & Bowie                             25070
##  7 Rainey St @ Cummings                    22696
##  8 Davis at Rainey Street                  21916
##  9 Pfluger Bridge @ W 2nd Street           18632
## 10 Capitol Station / Congress & 11th       17192
## 11 Long Center @ South 1st & Riverside     17103
## 12 Barton Springs @ Kinney Ave             16597
## 13 Barton Springs & Riverside              16541
## 14 Zilker Park                             15302
## 15 South Congress & James                  14975
## 16 Convention Center / 3rd & Trinity       14565
## 17 8th & Congress                          14122
## 18 South Congress & Academy                14120
## 19 West & 6th St.                          13926
## 20 Palmer Auditorium                       13783

bike %>% 
  count(end_station_name, sort = T) %>% 
  head(20) %>% 
  ggplot(aes(fct_reorder(end_station_name, n) , n)) + 
  geom_bar(stat="identity",colour="white", fill = fillColor) +
  geom_text(aes(label = format(n, big.mark = ",")), hjust =  -0.1, size = 2.5, colour = 'black', fontface = 'bold') +
  coord_flip() +
  labs(title = "Most Common End Points", y = "Count", x = NULL)+
  theme_light() +
  scale_y_continuous(labels = scales::comma)

3.3 Most Common Route, Top 10

We use both starting and ending station and count number of rides in the route. dplyr package’s group_by can be used to do this very easily.

bike %>% 
  group_by(start_station_name, end_station_name) %>% 
  count(start_station_name, sort = T) %>% 
  head(10)

## # A tibble: 10 x 3
## # Groups:   start_station_name, end_station_name [10]
##    start_station_name                  end_station_name                        n
##    <chr>                               <chr>                               <int>
##  1 Riverside @ S. Lamar                Riverside @ S. Lamar                 8418
##  2 Rainey St @ Cummings                Rainey St @ Cummings                 5691
##  3 2nd & Congress                      2nd & Congress                       4336
##  4 City Hall / Lavaca & 2nd            City Hall / Lavaca & 2nd             4335
##  5 Capitol Station / Congress & 11th   Capitol Station / Congress & 11th    3644
##  6 Zilker Park                         Zilker Park                          3631
##  7 Pfluger Bridge @ W 2nd Street       Pfluger Bridge @ W 2nd Street        3483
##  8 Zilker Park at Barton Springs & Wi~ Zilker Park at Barton Springs & Wi~  3362
##  9 Barton Springs & Riverside          Barton Springs & Riverside           3009
## 10 Palmer Auditorium                   Palmer Auditorium                    2941

3.4 Date Time Manipulation using lubridate package

lubridate package eneables us to manipulate Date-Time data very easily. We can extract month, day, weekday, date, Hour, Minutes, Seconds, time intervals etc easily.

3.4.1 Busiest Month

m <- bike %>% count( m = month(start_time, label = TRUE), sort = TRUE) 
m

## # A tibble: 12 x 2
##    m          n
##    <ord>  <int>
##  1 Mar   112002
##  2 Oct    69925
##  3 May    69572
##  4 Jul    61577
##  5 Jun    60192
##  6 Apr    53419
##  7 Jan    43761
##  8 Sep    42746
##  9 Feb    42032
## 10 Aug    38036
## 11 Nov    37182
## 12 Dec    18787

m %>% 
  ggplot(aes(fct_reorder(m, n), n)) + 
  geom_bar(stat="identity",colour="white", fill = fillColor) + 
  geom_text(aes(label = format(n, big.mark = ",")), hjust =  -0.1, size = 3, colour = 'black', fontface = 'bold') +
  coord_flip() +
  labs(title = "Busiest Month", y = "Count", x = NULL) +
  theme_light() +
  scale_y_continuous(labels = scales::comma)

3.4.2 Busiest Date of the month

d <- bike %>% count(day = mday(start_time), sort = TRUE)
d

## # A tibble: 31 x 2
##      day     n
##    <int> <int>
##  1    15 26695
##  2    14 26363
##  3    18 25419
##  4    13 24312
##  5    19 24198
##  6    12 24185
##  7    17 23184
##  8    16 23030
##  9    11 23018
## 10     4 22084
## # ... with 21 more rows

d %>% 
  ggplot(aes(reorder(day, n), n)) + 
  geom_bar(stat="identity",colour="white", fill = fillColor) + 
  geom_text(aes(label = format(n, big.mark = ",")), hjust = -0.05, size = 2.5, colour = 'black', fontface = 'bold') +
  coord_flip() + 
  labs(title = "Busiest Date of the Month", y = "Count", x = NULL) +
  theme_light() +
  scale_y_continuous(labels = scales::comma)

11th to 19th dates seem to be most busy during a months.

3.4.3 Busiest dates by Month

d <- bike %>% 
  count(day = mday(start_time), sort = TRUE)
d

## # A tibble: 31 x 2
##      day     n
##    <int> <int>
##  1    15 26695
##  2    14 26363
##  3    18 25419
##  4    13 24312
##  5    19 24198
##  6    12 24185
##  7    17 23184
##  8    16 23030
##  9    11 23018
## 10     4 22084
## # ... with 21 more rows

bike %>% 
  group_by(day = mday(start_time),
           month = month(start_time, label = T, abbr = F)) %>% 
  summarise(n = n()) %>% 
  
  ggplot(aes(day, n)) + 
  geom_bar(stat="identity",colour="white", fill = fillColor) + 
  geom_text(aes(label = format(n, big.mark = ",")), hjust = -0.08, size = 2, colour = 'black', fontface = 'bold') +
  coord_flip() + 
  labs(title = "Busiest Date of the Month", y = "Count", x = NULL) +
  theme_light() +
  scale_y_continuous(labels = scales::comma) +
  facet_wrap(~month)

3.4.4 Busiest WeekDay

w <- bike %>% count(w = wday(start_time, label = TRUE), sort = TRUE)
w

## # A tibble: 7 x 2
##   w          n
##   <ord>  <int>
## 1 Sat   140846
## 2 Sun   114337
## 3 Fri   102187
## 4 Mon    77707
## 5 Thu    75704
## 6 Wed    70007
## 7 Tue    68443

w %>% 
  ggplot(aes(reorder(w,n), n)) + 
  geom_bar(stat="identity",colour="white", fill = fillColor) + 
  geom_text(aes(label = format(n, big.mark = ",")), hjust = -0.05, size = 2.5, colour = 'black', fontface = 'bold') +
  coord_flip() + 
  labs(title = "Busiest Weekdays", y = "Count", x = NULL) +
  theme_light() + 
  scale_y_continuous(labels = scales::comma)

3.4.5 Busiest Hour

### Busiest Hour
h <- bike %>% mutate(h = format(start_time, "%H"))
h <- h %>% count(h, sort = TRUE)
h

## # A tibble: 24 x 2
##    h         n
##    <chr> <int>
##  1 17    56819
##  2 16    56643
##  3 13    56564
##  4 15    56346
##  5 14    55631
##  6 12    51667
##  7 18    47532
##  8 11    42441
##  9 19    36778
## 10 10    31716
## # ... with 14 more rows

h %>% 
  ggplot(aes(h, n)) + 
  geom_bar(stat="identity", colour="white", fill = fillColor2) + 
  geom_text(aes(label = format(n, big.mark = ",")), vjust = -0.2, size = 2.5 , colour = 'black', fontface = 'bold') +
  labs(title = "Busiest Hours of the Day", y = "Count", x = "Hour") +
  theme_light()

h %>% 
  ggplot(aes(reorder(h, n), n)) + 
  geom_bar(stat="identity", colour="white", fill = fillColor2) + 
  geom_text(aes(label = format(n, big.mark = ",")), hjust = -0.05, size = 2.5 , colour = 'black', fontface = 'bold') +
  labs(title = "Busiest Hours of the Day", y = "Count", x = NULL) +
  theme_light() +
  coord_flip() + 
  scale_y_continuous(labels = scales::comma)

4 Time Series Prediction

4.1 Convert to Monthly Data

We extract the date and time separately, then use the date variable to further transform our data.

# Convert to Monthly Data
# Again using lubridate package

bike <- tidyr::separate(data = bike, col = start_time, into = c("start_date", "time"), sep = " ", remove = FALSE)

4.2 Count monthly number of rides

using floor_date from lubridate, we convert all the dates of a month to the first day of the month, this way it makes the counting ride easier.

bike$c <- 1
bikemonth <- bike %>% 
  select(start_date, trip_id, c) %>% 
  group_by(month = lubridate::floor_date(ymd(start_date), "month")) %>%
  summarize(Rides = sum(c))

4.3 Input Missing Months by using previous year values

unique(bikemonth$month)

##  [1] "2013-12-01" "2014-01-01" "2014-02-01" "2014-03-01" "2014-04-01"
##  [6] "2014-05-01" "2014-06-01" "2014-07-01" "2014-08-01" "2014-09-01"
## [11] "2014-10-01" "2014-11-01" "2014-12-01" "2015-01-01" "2015-02-01"
## [16] "2015-03-01" "2015-04-01" "2015-05-01" "2015-06-01" "2015-07-01"
## [21] "2015-08-01" "2015-09-01" "2015-10-01" "2015-11-01" "2015-12-01"
## [26] "2016-01-01" "2016-02-01" "2016-03-01" "2016-05-01" "2016-06-01"
## [31] "2016-07-01" "2016-08-01" "2016-09-01" "2016-10-01" "2016-11-01"
## [36] "2017-01-01" "2017-02-01" "2017-03-01" "2017-04-01" "2017-05-01"
## [41] "2017-06-01" "2017-07-01"

Looks like April and December are missing from our data. We will use previous year’s values for this month. This is important for our time series data to have uniform dates and values to predict seasonality.

bikemonth %>% 
  filter(month %in% ymd(c("2015-12-01", "2015-04-01")))

## # A tibble: 2 x 2
##   month      Rides
##   <date>     <dbl>
## 1 2015-04-01 16554
## 2 2015-12-01 10057

missingmon <- data.frame(month = ymd(c("2016-12-01", "2016-04-01")), Rides = c(10057, 16554))
missingmon

##        month Rides
## 1 2016-12-01 10057
## 2 2016-04-01 16554

bikemonth <- rbind(bikemonth, missingmon) %>% arrange(month)
unique(bikemonth$month)

##  [1] "2013-12-01" "2014-01-01" "2014-02-01" "2014-03-01" "2014-04-01"
##  [6] "2014-05-01" "2014-06-01" "2014-07-01" "2014-08-01" "2014-09-01"
## [11] "2014-10-01" "2014-11-01" "2014-12-01" "2015-01-01" "2015-02-01"
## [16] "2015-03-01" "2015-04-01" "2015-05-01" "2015-06-01" "2015-07-01"
## [21] "2015-08-01" "2015-09-01" "2015-10-01" "2015-11-01" "2015-12-01"
## [26] "2016-01-01" "2016-02-01" "2016-03-01" "2016-04-01" "2016-05-01"
## [31] "2016-06-01" "2016-07-01" "2016-08-01" "2016-09-01" "2016-10-01"
## [36] "2016-11-01" "2016-12-01" "2017-01-01" "2017-02-01" "2017-03-01"
## [41] "2017-04-01" "2017-05-01" "2017-06-01" "2017-07-01"

4.4 Visualize the finalized time series

bikemonth %>% 
  ggplot(aes(month, Rides)) + 
  geom_line() + 
  labs( x= "Month", title="Monthly Bike Ride Counts")

4.5 Seasonal Time Series Model Fitting, SARIMA Model

ts <- (bikemonth$Rides)

acf(ts)

pacf(ts)

plot(diff(ts, 1), type='l')

adf.test(ts)

## 
##  Augmented Dickey-Fuller Test
## 
## data:  ts
## Dickey-Fuller = -4.8697, Lag order = 3, p-value = 0.01
## alternative hypothesis: stationary

plot(diff(ts, 2), type='l')

plot(diff(ts, 3), type='l')

acf(diff(ts, 1))

pacf(diff(ts, 1))

Observing the ACF and PACF plots, we decide to fit a SARIAMA(0,1,1)(1,1,1)(12) model to our data.

n = 36

mod2 <- arima(ts, order = c(0,1,1), seasonal = list(order = c(1,1,1), period = 12))

### Model and Predictions
mod2

## 
## Call:
## arima(x = ts, order = c(0, 1, 1), seasonal = list(order = c(1, 1, 1), period = 12))
## 
## Coefficients:
##           ma1    sar1     sma1
##       -0.7369  0.0592  -0.9998
## s.e.   0.1572  0.2692   0.8856
## 
## sigma^2 estimated as 7978084:  log likelihood = -297.82,  aic = 603.64

f <- as.data.frame(predict(mod2, n.ahead = n))

### Time Series plot of Forecasted Values

### Paste original and forecasted data together
bikemonth$type <- "Data"
d <- data.frame(max(bikemonth$month) + months(1:n), f$pred, rep("Forecast", n))
names(d) <- names(bikemonth)
bikeforcast <- rbind(bikemonth,d)
tail(bikeforcast)

## # A tibble: 6 x 3
##   month       Rides type    
##   <date>      <dbl> <chr>   
## 1 2020-02-01 17224. Forecast
## 2 2020-03-01 34689. Forecast
## 3 2020-04-01 24267. Forecast
## 4 2020-05-01 24128. Forecast
## 5 2020-06-01 21788. Forecast
## 6 2020-07-01 22111. Forecast

bikeCI <- bikeforcast %>% filter(type == "Forecast")


p <- bikeforcast %>% 
  ggplot(aes(month, Rides, color=type)) + 
  geom_line() +
  geom_point() +
  scale_x_date(date_labels = "%b %y", breaks = "3 month") +
  theme(axis.text.x = element_text(size=8, angle = 90))

plotly::ggplotly(p)

Austin Bike Share Trips EDA

Using lubridate package in R for Date-Time Manipulation

FRK

July 22, 2021