rm(list = ls())
library(tidyverse)
library(lubridate)
library(skimr)
library(tseries)
library(forecast)
##########################################################
################ Bike Share Analysis #################
##########################################################
<- read_csv("austin_bikeshare_trips.csv") bike
glimpse(bike)
## Rows: 649,231
## Columns: 12
## $ bikeid <dbl> 8, 141, 578, 555, 86, 861, 382, 435, 555, 668, 981,~
## $ checkout_time <time> 19:12:00, 02:06:04, 16:28:27, 15:12:00, 15:39:13, ~
## $ duration_minutes <dbl> 41, 6, 13, 80, 25, 29, 17, 49, 19, 20, 44, 12, 21, ~
## $ end_station_id <dbl> 2565, 2570, 2498, 2712, 3377, 2537, 2575, 2575, 257~
## $ end_station_name <chr> "Trinity & 6th Street", "South Congress & Academy",~
## $ month <dbl> 3, 10, 3, 11, 4, 5, 7, 1, 5, NA, 3, 9, 10, 3, 4, 3,~
## $ start_station_id <dbl> 2536, 2494, 2538, 2497, 2707, 2540, 2567, 2575, 250~
## $ start_station_name <chr> "Waller & 6th St.", "2nd & Congress", "Bullock Muse~
## $ start_time <dttm> 2015-03-19 19:12:00, 2016-10-30 02:06:04, 2016-03-~
## $ subscriber_type <chr> "Walk Up", "Local365", "Local365", "24-Hour Kiosk (~
## $ trip_id <dbl> 9900082882, 12617682, 9075366, 9900319298, 14468597~
## $ year <dbl> 2015, 2016, 2016, 2014, 2017, 2015, 2016, 2015, 201~
summary(bike)
## bikeid checkout_time duration_minutes end_station_id
## Min. : 3.0 Length:649231 Min. : 0.00 Min. :1001
## 1st Qu.: 208.0 Class1:hms 1st Qu.: 8.00 1st Qu.:2499
## Median : 417.0 Class2:difftime Median : 15.00 Median :2548
## Mean : 471.6 Mode :numeric Mean : 29.13 Mean :2582
## 3rd Qu.: 745.0 3rd Qu.: 28.00 3rd Qu.:2571
## Max. :5089.0 Max. :21296.00 Max. :3687
## NA's :723 NA's :19842
## end_station_name month start_station_id start_station_name
## Length:649231 Min. : 1.000 Min. :1001 Length:649231
## Class :character 1st Qu.: 3.000 1st Qu.:2501 Class :character
## Mode :character Median : 5.000 Median :2549 Mode :character
## Mean : 5.887 Mean :2584
## 3rd Qu.: 9.000 3rd Qu.:2571
## Max. :12.000 Max. :3687
## NA's :30752 NA's :19041
## start_time subscriber_type trip_id
## Min. :2013-12-21 09:12:00 Length:649231 Min. :8.270e+06
## 1st Qu.:2015-01-24 12:12:00 Class :character 1st Qu.:1.275e+07
## Median :2015-11-10 13:12:40 Mode :character Median :9.900e+09
## Mean :2015-11-13 20:41:09 Mean :5.385e+09
## 3rd Qu.:2016-09-30 17:01:11 3rd Qu.:9.900e+09
## Max. :2017-07-31 23:44:27 Max. :9.900e+09
##
## year
## Min. :2013
## 1st Qu.:2014
## Median :2015
## Mean :2015
## 3rd Qu.:2016
## Max. :2017
## NA's :30752
skim(bike)
Name | bike |
Number of rows | 649231 |
Number of columns | 12 |
_______________________ | |
Column type frequency: | |
character | 3 |
difftime | 1 |
numeric | 7 |
POSIXct | 1 |
________________________ | |
Group variables | None |
Variable type: character
skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
---|---|---|---|---|---|---|---|
end_station_name | 0 | 1 | 4 | 64 | 0 | 92 | 0 |
start_station_name | 0 | 1 | 4 | 64 | 0 | 90 | 0 |
subscriber_type | 2077 | 1 | 5 | 45 | 0 | 52 | 0 |
Variable type: difftime
skim_variable | n_missing | complete_rate | min | max | median | n_unique |
---|---|---|---|---|---|---|
checkout_time | 0 | 1 | 121 secs | 86279 secs | 15:09:29 | 65663 |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
---|---|---|---|---|---|---|---|---|---|---|
bikeid | 723 | 1.00 | 4.716200e+02 | 3.235900e+02 | 3 | 208 | 417 | 745 | 5089 | ▇▁▁▁▁ |
duration_minutes | 0 | 1.00 | 2.913000e+01 | 8.728000e+01 | 0 | 8 | 15 | 28 | 21296 | ▇▁▁▁▁ |
end_station_id | 19842 | 0.97 | 2.582470e+03 | 3.199000e+02 | 1001 | 2499 | 2548 | 2571 | 3687 | ▁▁▇▁▁ |
month | 30752 | 0.95 | 5.890000e+00 | 3.210000e+00 | 1 | 3 | 5 | 9 | 12 | ▇▅▃▃▅ |
start_station_id | 19041 | 0.97 | 2.584240e+03 | 3.208400e+02 | 1001 | 2501 | 2549 | 2571 | 3687 | ▁▁▇▁▁ |
trip_id | 0 | 1.00 | 5.384945e+09 | 4.925349e+09 | 8269930 | 12747093 | 9900028150 | 9900190458 | 9900352765 | ▇▁▁▁▇ |
year | 30752 | 0.95 | 2.015340e+03 | 1.020000e+00 | 2013 | 2014 | 2015 | 2016 | 2017 | ▁▇▇▇▃ |
Variable type: POSIXct
skim_variable | n_missing | complete_rate | min | max | median | n_unique |
---|---|---|---|---|---|---|
start_time | 0 | 1 | 2013-12-21 09:12:00 | 2017-07-31 23:44:27 | 2015-11-10 13:12:40 | 357010 |
######################### 1 ###########################
%>%
bike count(start_station_name, sort = T) %>%
head(20)
## # A tibble: 20 x 2
## start_station_name n
## <chr> <int>
## 1 Riverside @ S. Lamar 28695
## 2 City Hall / Lavaca & 2nd 28535
## 3 5th & Bowie 26669
## 4 2nd & Congress 26612
## 5 4th & Congress 24972
## 6 Convention Center / 4th St. @ MetroRail 24357
## 7 Rainey St @ Cummings 23468
## 8 Davis at Rainey Street 22273
## 9 Capitol Station / Congress & 11th 20056
## 10 Pfluger Bridge @ W 2nd Street 19434
## 11 Long Center @ South 1st & Riverside 16285
## 12 Barton Springs & Riverside 16130
## 13 Barton Springs @ Kinney Ave 15816
## 14 3rd & West 15576
## 15 Palmer Auditorium 15044
## 16 South Congress & James 14802
## 17 South Congress & Academy 14607
## 18 Zilker Park 13390
## 19 8th & Congress 13034
## 20 Convention Center / 3rd & Trinity 12848
= "#FFA533"
fillColor = "#EECA16"
fillColor2 = "#16D2EE"
fillColor3
%>%
bike count(start_station_name, sort = T) %>%
head(20) %>%
ggplot(aes(fct_reorder(start_station_name, n) , n)) +
geom_bar(stat="identity",colour="white", fill = fillColor) +
geom_text(aes(label = format(n, big.mark = ",")), hjust = -0.1, size = 2.5, colour = 'black', fontface = 'bold') +
coord_flip() +
labs(title = "Most Common Pickup Points", y = "Count", x = NULL) +
theme_light() +
scale_y_continuous(labels = scales::comma)
%>%
bike count(end_station_name, sort = T) %>%
head(20)
## # A tibble: 20 x 2
## end_station_name n
## <chr> <int>
## 1 City Hall / Lavaca & 2nd 33125
## 2 2nd & Congress 29516
## 3 Riverside @ S. Lamar 28023
## 4 4th & Congress 27902
## 5 Convention Center / 4th St. @ MetroRail 26862
## 6 5th & Bowie 25070
## 7 Rainey St @ Cummings 22696
## 8 Davis at Rainey Street 21916
## 9 Pfluger Bridge @ W 2nd Street 18632
## 10 Capitol Station / Congress & 11th 17192
## 11 Long Center @ South 1st & Riverside 17103
## 12 Barton Springs @ Kinney Ave 16597
## 13 Barton Springs & Riverside 16541
## 14 Zilker Park 15302
## 15 South Congress & James 14975
## 16 Convention Center / 3rd & Trinity 14565
## 17 8th & Congress 14122
## 18 South Congress & Academy 14120
## 19 West & 6th St. 13926
## 20 Palmer Auditorium 13783
%>%
bike count(end_station_name, sort = T) %>%
head(20) %>%
ggplot(aes(fct_reorder(end_station_name, n) , n)) +
geom_bar(stat="identity",colour="white", fill = fillColor) +
geom_text(aes(label = format(n, big.mark = ",")), hjust = -0.1, size = 2.5, colour = 'black', fontface = 'bold') +
coord_flip() +
labs(title = "Most Common End Points", y = "Count", x = NULL)+
theme_light() +
scale_y_continuous(labels = scales::comma)
We use both starting and ending station and count number of rides in the route. dplyr package’s group_by can be used to do this very easily.
%>%
bike group_by(start_station_name, end_station_name) %>%
count(start_station_name, sort = T) %>%
head(10)
## # A tibble: 10 x 3
## # Groups: start_station_name, end_station_name [10]
## start_station_name end_station_name n
## <chr> <chr> <int>
## 1 Riverside @ S. Lamar Riverside @ S. Lamar 8418
## 2 Rainey St @ Cummings Rainey St @ Cummings 5691
## 3 2nd & Congress 2nd & Congress 4336
## 4 City Hall / Lavaca & 2nd City Hall / Lavaca & 2nd 4335
## 5 Capitol Station / Congress & 11th Capitol Station / Congress & 11th 3644
## 6 Zilker Park Zilker Park 3631
## 7 Pfluger Bridge @ W 2nd Street Pfluger Bridge @ W 2nd Street 3483
## 8 Zilker Park at Barton Springs & Wi~ Zilker Park at Barton Springs & Wi~ 3362
## 9 Barton Springs & Riverside Barton Springs & Riverside 3009
## 10 Palmer Auditorium Palmer Auditorium 2941
lubridate package eneables us to manipulate Date-Time data very easily. We can extract month, day, weekday, date, Hour, Minutes, Seconds, time intervals etc easily.
<- bike %>% count( m = month(start_time, label = TRUE), sort = TRUE)
m m
## # A tibble: 12 x 2
## m n
## <ord> <int>
## 1 Mar 112002
## 2 Oct 69925
## 3 May 69572
## 4 Jul 61577
## 5 Jun 60192
## 6 Apr 53419
## 7 Jan 43761
## 8 Sep 42746
## 9 Feb 42032
## 10 Aug 38036
## 11 Nov 37182
## 12 Dec 18787
%>%
m ggplot(aes(fct_reorder(m, n), n)) +
geom_bar(stat="identity",colour="white", fill = fillColor) +
geom_text(aes(label = format(n, big.mark = ",")), hjust = -0.1, size = 3, colour = 'black', fontface = 'bold') +
coord_flip() +
labs(title = "Busiest Month", y = "Count", x = NULL) +
theme_light() +
scale_y_continuous(labels = scales::comma)
<- bike %>% count(day = mday(start_time), sort = TRUE)
d d
## # A tibble: 31 x 2
## day n
## <int> <int>
## 1 15 26695
## 2 14 26363
## 3 18 25419
## 4 13 24312
## 5 19 24198
## 6 12 24185
## 7 17 23184
## 8 16 23030
## 9 11 23018
## 10 4 22084
## # ... with 21 more rows
%>%
d ggplot(aes(reorder(day, n), n)) +
geom_bar(stat="identity",colour="white", fill = fillColor) +
geom_text(aes(label = format(n, big.mark = ",")), hjust = -0.05, size = 2.5, colour = 'black', fontface = 'bold') +
coord_flip() +
labs(title = "Busiest Date of the Month", y = "Count", x = NULL) +
theme_light() +
scale_y_continuous(labels = scales::comma)
11th to 19th dates seem to be most busy during a months.
<- bike %>%
d count(day = mday(start_time), sort = TRUE)
d
## # A tibble: 31 x 2
## day n
## <int> <int>
## 1 15 26695
## 2 14 26363
## 3 18 25419
## 4 13 24312
## 5 19 24198
## 6 12 24185
## 7 17 23184
## 8 16 23030
## 9 11 23018
## 10 4 22084
## # ... with 21 more rows
%>%
bike group_by(day = mday(start_time),
month = month(start_time, label = T, abbr = F)) %>%
summarise(n = n()) %>%
ggplot(aes(day, n)) +
geom_bar(stat="identity",colour="white", fill = fillColor) +
geom_text(aes(label = format(n, big.mark = ",")), hjust = -0.08, size = 2, colour = 'black', fontface = 'bold') +
coord_flip() +
labs(title = "Busiest Date of the Month", y = "Count", x = NULL) +
theme_light() +
scale_y_continuous(labels = scales::comma) +
facet_wrap(~month)
<- bike %>% count(w = wday(start_time, label = TRUE), sort = TRUE)
w w
## # A tibble: 7 x 2
## w n
## <ord> <int>
## 1 Sat 140846
## 2 Sun 114337
## 3 Fri 102187
## 4 Mon 77707
## 5 Thu 75704
## 6 Wed 70007
## 7 Tue 68443
%>%
w ggplot(aes(reorder(w,n), n)) +
geom_bar(stat="identity",colour="white", fill = fillColor) +
geom_text(aes(label = format(n, big.mark = ",")), hjust = -0.05, size = 2.5, colour = 'black', fontface = 'bold') +
coord_flip() +
labs(title = "Busiest Weekdays", y = "Count", x = NULL) +
theme_light() +
scale_y_continuous(labels = scales::comma)
### Busiest Hour
<- bike %>% mutate(h = format(start_time, "%H"))
h <- h %>% count(h, sort = TRUE)
h h
## # A tibble: 24 x 2
## h n
## <chr> <int>
## 1 17 56819
## 2 16 56643
## 3 13 56564
## 4 15 56346
## 5 14 55631
## 6 12 51667
## 7 18 47532
## 8 11 42441
## 9 19 36778
## 10 10 31716
## # ... with 14 more rows
%>%
h ggplot(aes(h, n)) +
geom_bar(stat="identity", colour="white", fill = fillColor2) +
geom_text(aes(label = format(n, big.mark = ",")), vjust = -0.2, size = 2.5 , colour = 'black', fontface = 'bold') +
labs(title = "Busiest Hours of the Day", y = "Count", x = "Hour") +
theme_light()
%>%
h ggplot(aes(reorder(h, n), n)) +
geom_bar(stat="identity", colour="white", fill = fillColor2) +
geom_text(aes(label = format(n, big.mark = ",")), hjust = -0.05, size = 2.5 , colour = 'black', fontface = 'bold') +
labs(title = "Busiest Hours of the Day", y = "Count", x = NULL) +
theme_light() +
coord_flip() +
scale_y_continuous(labels = scales::comma)
We extract the date and time separately, then use the date variable to further transform our data.
# Convert to Monthly Data
# Again using lubridate package
<- tidyr::separate(data = bike, col = start_time, into = c("start_date", "time"), sep = " ", remove = FALSE) bike
using floor_date from lubridate, we convert all the dates of a month to the first day of the month, this way it makes the counting ride easier.
$c <- 1
bike<- bike %>%
bikemonth select(start_date, trip_id, c) %>%
group_by(month = lubridate::floor_date(ymd(start_date), "month")) %>%
summarize(Rides = sum(c))
unique(bikemonth$month)
## [1] "2013-12-01" "2014-01-01" "2014-02-01" "2014-03-01" "2014-04-01"
## [6] "2014-05-01" "2014-06-01" "2014-07-01" "2014-08-01" "2014-09-01"
## [11] "2014-10-01" "2014-11-01" "2014-12-01" "2015-01-01" "2015-02-01"
## [16] "2015-03-01" "2015-04-01" "2015-05-01" "2015-06-01" "2015-07-01"
## [21] "2015-08-01" "2015-09-01" "2015-10-01" "2015-11-01" "2015-12-01"
## [26] "2016-01-01" "2016-02-01" "2016-03-01" "2016-05-01" "2016-06-01"
## [31] "2016-07-01" "2016-08-01" "2016-09-01" "2016-10-01" "2016-11-01"
## [36] "2017-01-01" "2017-02-01" "2017-03-01" "2017-04-01" "2017-05-01"
## [41] "2017-06-01" "2017-07-01"
Looks like April and December are missing from our data. We will use previous year’s values for this month. This is important for our time series data to have uniform dates and values to predict seasonality.
%>%
bikemonth filter(month %in% ymd(c("2015-12-01", "2015-04-01")))
## # A tibble: 2 x 2
## month Rides
## <date> <dbl>
## 1 2015-04-01 16554
## 2 2015-12-01 10057
<- data.frame(month = ymd(c("2016-12-01", "2016-04-01")), Rides = c(10057, 16554))
missingmon missingmon
## month Rides
## 1 2016-12-01 10057
## 2 2016-04-01 16554
<- rbind(bikemonth, missingmon) %>% arrange(month)
bikemonth unique(bikemonth$month)
## [1] "2013-12-01" "2014-01-01" "2014-02-01" "2014-03-01" "2014-04-01"
## [6] "2014-05-01" "2014-06-01" "2014-07-01" "2014-08-01" "2014-09-01"
## [11] "2014-10-01" "2014-11-01" "2014-12-01" "2015-01-01" "2015-02-01"
## [16] "2015-03-01" "2015-04-01" "2015-05-01" "2015-06-01" "2015-07-01"
## [21] "2015-08-01" "2015-09-01" "2015-10-01" "2015-11-01" "2015-12-01"
## [26] "2016-01-01" "2016-02-01" "2016-03-01" "2016-04-01" "2016-05-01"
## [31] "2016-06-01" "2016-07-01" "2016-08-01" "2016-09-01" "2016-10-01"
## [36] "2016-11-01" "2016-12-01" "2017-01-01" "2017-02-01" "2017-03-01"
## [41] "2017-04-01" "2017-05-01" "2017-06-01" "2017-07-01"
%>%
bikemonth ggplot(aes(month, Rides)) +
geom_line() +
labs( x= "Month", title="Monthly Bike Ride Counts")
<- (bikemonth$Rides)
ts
acf(ts)
pacf(ts)
plot(diff(ts, 1), type='l')
adf.test(ts)
##
## Augmented Dickey-Fuller Test
##
## data: ts
## Dickey-Fuller = -4.8697, Lag order = 3, p-value = 0.01
## alternative hypothesis: stationary
plot(diff(ts, 2), type='l')
plot(diff(ts, 3), type='l')
acf(diff(ts, 1))
pacf(diff(ts, 1))
Observing the ACF and PACF plots, we decide to fit a SARIAMA(0,1,1)(1,1,1)(12) model to our data.
= 36
n
<- arima(ts, order = c(0,1,1), seasonal = list(order = c(1,1,1), period = 12))
mod2
### Model and Predictions
mod2
##
## Call:
## arima(x = ts, order = c(0, 1, 1), seasonal = list(order = c(1, 1, 1), period = 12))
##
## Coefficients:
## ma1 sar1 sma1
## -0.7369 0.0592 -0.9998
## s.e. 0.1572 0.2692 0.8856
##
## sigma^2 estimated as 7978084: log likelihood = -297.82, aic = 603.64
<- as.data.frame(predict(mod2, n.ahead = n))
f
### Time Series plot of Forecasted Values
### Paste original and forecasted data together
$type <- "Data"
bikemonth<- data.frame(max(bikemonth$month) + months(1:n), f$pred, rep("Forecast", n))
d names(d) <- names(bikemonth)
<- rbind(bikemonth,d)
bikeforcast tail(bikeforcast)
## # A tibble: 6 x 3
## month Rides type
## <date> <dbl> <chr>
## 1 2020-02-01 17224. Forecast
## 2 2020-03-01 34689. Forecast
## 3 2020-04-01 24267. Forecast
## 4 2020-05-01 24128. Forecast
## 5 2020-06-01 21788. Forecast
## 6 2020-07-01 22111. Forecast
<- bikeforcast %>% filter(type == "Forecast")
bikeCI
<- bikeforcast %>%
p ggplot(aes(month, Rides, color=type)) +
geom_line() +
geom_point() +
scale_x_date(date_labels = "%b %y", breaks = "3 month") +
theme(axis.text.x = element_text(size=8, angle = 90))
::ggplotly(p) plotly