Environment creation

Notes: Critical packages loaded below. ‘readxl’ was very important for importing soon to be dataframes.

library(tidyverse) # Data wrangling
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(conflicted) # Resolve any conflicts
library(readxl) # Enables import of spreadsheets

# Set dplyr::filter and dplyr::lag as the default choices
conflict_prefer("filter","dplyr")
## [conflicted] Will prefer dplyr::filter over any other package.
conflict_prefer("lag","dplyr")
## [conflicted] Will prefer dplyr::lag over any other package.

Read in Excel spreadsheets. Due to file size, using MS Excel was superior to Google Sheets

#======================
# PHASE 1: COLLECT DATA
#======================

## Spreadsheet upload

Q1_2019 <- read_excel("C:\\Users\\corey\\OneDrive\\Desktop\\Capstone\\Bike_Trips_2019\\Excel_Data\\Cleaned_Excel_Files\\Q1.xlsx")
Q2_2019 <- read_excel("C:\\Users\\corey\\OneDrive\\Desktop\\Capstone\\Bike_Trips_2019\\Excel_Data\\Cleaned_Excel_Files\\Q2.xlsx")
Q3_2019 <- read_excel("C:\\Users\\corey\\OneDrive\\Desktop\\Capstone\\Bike_Trips_2019\\Excel_Data\\Cleaned_Excel_Files\\Q3.xlsx")
Q4_2019 <- read_excel("C:\\Users\\corey\\OneDrive\\Desktop\\Capstone\\Bike_Trips_2019\\Excel_Data\\Cleaned_Excel_Files\\Q4.xlsx")

Initial review for data discrepancies and then merging all four quarterly subsets into one file

#==================================
# PHASE 2: DATA WRANGLING & MERGING
#==================================

## Compare column names to check for discrepancies

colnames(Q1_2019)
##  [1] "trip_id"            "start_time"         "end_time"          
##  [4] "bike_id"            "start_station_id"   "start_station_name"
##  [7] "end_station_id"     "end_station_name"   "user_type"         
## [10] "gender"             "birth_year"         "day_of_week"       
## [13] "ride_length"
colnames(Q2_2019)
##  [1] "trip_id"            "start_time"         "end_time"          
##  [4] "bike_id"            "start_station_id"   "start_station_name"
##  [7] "end_station_id"     "end_station_name"   "user_type"         
## [10] "gender"             "birth_year"         "day_of_week"       
## [13] "ride_length"
colnames(Q3_2019)
##  [1] "trip_id"            "start_time"         "end_time"          
##  [4] "bike_id"            "start_station_id"   "start_station_name"
##  [7] "end_station_id"     "end_station_name"   "user_type"         
## [10] "gender"             "birth_year"         "day_of_week"       
## [13] "ride_length"
colnames(Q4_2019)
##  [1] "trip_id"            "start_time"         "end_time"          
##  [4] "bike_id"            "start_station_id"   "start_station_name"
##  [7] "end_station_id"     "end_station_name"   "user_type"         
## [10] "gender"             "birth_year"         "day_of_week"       
## [13] "ride_length"
## Inspect the data frames and look for incongruities

str(Q1_2019)
## tibble [345,222 × 13] (S3: tbl_df/tbl/data.frame)
##  $ trip_id           : num [1:345222] 21742443 21742444 21742445 21742446 21742447 ...
##  $ start_time        : POSIXct[1:345222], format: "2019-01-01 00:04:00" "2019-01-01 00:08:00" ...
##  $ end_time          : POSIXct[1:345222], format: "2019-01-01 00:11:00" "2019-01-01 00:15:00" ...
##  $ bike_id           : num [1:345222] 2167 4386 1524 252 1170 ...
##  $ start_station_id  : num [1:345222] 199 44 15 123 173 98 98 211 150 268 ...
##  $ start_station_name: chr [1:345222] "Wabash Ave & Grand Ave" "State St & Randolph St" "Racine Ave & 18th St" "California Ave & Milwaukee Ave" ...
##  $ end_station_id    : num [1:345222] 84 624 644 176 35 49 49 142 148 141 ...
##  $ end_station_name  : chr [1:345222] "Milwaukee Ave & Grand Ave" "Dearborn St & Van Buren St (*)" "Western Ave & Fillmore St (*)" "Clark St & Elm St" ...
##  $ user_type         : chr [1:345222] "Subscriber" "Subscriber" "Subscriber" "Subscriber" ...
##  $ gender            : chr [1:345222] "Male" "Female" "Female" "Male" ...
##  $ birth_year        : num [1:345222] 1989 1990 1994 1993 1994 ...
##  $ day_of_week       : num [1:345222] 3 3 3 3 3 3 3 3 3 3 ...
##  $ ride_length       : POSIXct[1:345222], format: "1899-12-31 00:07:00" "1899-12-31 00:07:00" ...
str(Q2_2019)
## tibble [594,807 × 13] (S3: tbl_df/tbl/data.frame)
##  $ trip_id           : num [1:594807] 22178529 22178530 22178531 22178532 22178533 ...
##  $ start_time        : POSIXct[1:594807], format: "2019-04-01 00:02:00" "2019-04-01 00:03:00" ...
##  $ end_time          : POSIXct[1:594807], format: "2019-04-01 00:09:00" "2019-04-01 00:20:00" ...
##  $ bike_id           : num [1:594807] 6251 6226 5649 4151 3270 ...
##  $ start_station_id  : num [1:594807] 81 317 283 26 202 420 503 260 304 37 ...
##  $ start_station_name: chr [1:594807] "Daley Center Plaza" "Wood St & Taylor St" "LaSalle St & Jackson Blvd" "McClurg Ct & Illinois St" ...
##  $ end_station_id    : num [1:594807] 56 59 174 133 129 426 500 499 232 337 ...
##  $ end_station_name  : chr [1:594807] "Desplaines St & Kinzie St" "Wabash Ave & Roosevelt Rd" "Canal St & Madison St" "Kingsbury St & Kinzie St" ...
##  $ user_type         : chr [1:594807] "Subscriber" "Subscriber" "Subscriber" "Subscriber" ...
##  $ gender            : chr [1:594807] "Male" "Female" "Male" "Male" ...
##  $ birth_year        : num [1:594807] 1975 1984 1990 1993 1992 ...
##  $ day_of_week       : num [1:594807] 2 2 2 2 2 2 2 2 2 2 ...
##  $ ride_length       : POSIXct[1:594807], format: "1899-12-31 00:07:00" "1899-12-31 00:17:00" ...
str(Q3_2019)
## tibble [559,507 × 13] (S3: tbl_df/tbl/data.frame)
##  $ trip_id           : num [1:559507] 23479388 23479393 23479406 23479408 23479409 ...
##  $ start_time        : POSIXct[1:559507], format: "2019-07-01 00:00:00" "2019-07-01 00:02:00" ...
##  $ end_time          : POSIXct[1:559507], format: "2019-07-01 00:20:00" "2019-07-01 00:07:00" ...
##  $ bike_id           : num [1:559507] 3591 4941 2758 2447 6336 ...
##  $ start_station_id  : num [1:559507] 117 300 624 239 257 240 6 6 6 203 ...
##  $ start_station_name: chr [1:559507] "Wilton Ave & Belmont Ave" "Broadway & Barry Ave" "Dearborn St & Van Buren St" "Western Ave & Leland Ave" ...
##  $ end_station_id    : num [1:559507] 497 232 237 455 295 326 321 321 321 203 ...
##  $ end_station_name  : chr [1:559507] "Kimball Ave & Belmont Ave" "Pine Grove Ave & Waveland Ave" "MLK Jr Dr & 29th St" "Maplewood Ave & Peterson Ave" ...
##  $ user_type         : chr [1:559507] "Subscriber" "Subscriber" "Subscriber" "Subscriber" ...
##  $ gender            : chr [1:559507] "Male" "Male" "Male" "Male" ...
##  $ birth_year        : num [1:559507] 1992 1990 1995 1970 1993 ...
##  $ day_of_week       : num [1:559507] 2 2 2 2 2 2 2 2 2 2 ...
##  $ ride_length       : POSIXct[1:559507], format: "1899-12-31 00:20:00" "1899-12-31 00:05:00" ...
str(Q4_2019)
## tibble [637,279 × 13] (S3: tbl_df/tbl/data.frame)
##  $ trip_id           : num [1:637279] 25223640 25223641 25223642 25223643 25223644 ...
##  $ start_time        : POSIXct[1:637279], format: "2019-10-01 00:01:00" "2019-10-01 00:02:00" ...
##  $ end_time          : POSIXct[1:637279], format: "2019-10-01 00:17:00" "2019-10-01 00:06:00" ...
##  $ bike_id           : num [1:637279] 2215 6328 3003 3275 5294 ...
##  $ start_station_id  : num [1:637279] 20 19 84 313 210 156 84 156 156 77 ...
##  $ start_station_name: chr [1:637279] "Sheffield Ave & Kingsbury St" "Throop (Loomis) St & Taylor St" "Milwaukee Ave & Grand Ave" "Lakeview Ave & Fullerton Pkwy" ...
##  $ end_station_id    : num [1:637279] 309 241 199 290 382 226 142 463 463 506 ...
##  $ end_station_name  : chr [1:637279] "Leavitt St & Armitage Ave" "Morgan St & Polk St" "Wabash Ave & Grand Ave" "Kedzie Ave & Palmer Ct" ...
##  $ user_type         : chr [1:637279] "Subscriber" "Subscriber" "Subscriber" "Subscriber" ...
##  $ gender            : chr [1:637279] "Male" "Male" "Female" "Male" ...
##  $ birth_year        : num [1:637279] 1987 1998 1991 1990 1987 ...
##  $ day_of_week       : num [1:637279] 3 3 3 3 3 3 3 3 3 3 ...
##  $ ride_length       : POSIXct[1:637279], format: "1899-12-31 00:16:00" "1899-12-31 00:04:00" ...
# Stack individual quarter's data frames into one big data frame

all_trips <- bind_rows(Q1_2019,Q2_2019,Q3_2019,Q4_2019)

Preparing for data analysis

#===================================
# PHASE 3: PREPARE DATA FOR ANALYSIS
#===================================

colnames(all_trips)
##  [1] "trip_id"            "start_time"         "end_time"          
##  [4] "bike_id"            "start_station_id"   "start_station_name"
##  [7] "end_station_id"     "end_station_name"   "user_type"         
## [10] "gender"             "birth_year"         "day_of_week"       
## [13] "ride_length"
nrow(all_trips)
## [1] 2136815
dim(all_trips)
## [1] 2136815      13
head(all_trips)
## # A tibble: 6 × 13
##    trip_id start_time          end_time            bike_id start_station_id
##      <dbl> <dttm>              <dttm>                <dbl>            <dbl>
## 1 21742443 2019-01-01 00:04:00 2019-01-01 00:11:00    2167              199
## 2 21742444 2019-01-01 00:08:00 2019-01-01 00:15:00    4386               44
## 3 21742445 2019-01-01 00:13:00 2019-01-01 00:27:00    1524               15
## 4 21742446 2019-01-01 00:13:00 2019-01-01 00:43:00     252              123
## 5 21742447 2019-01-01 00:14:00 2019-01-01 00:20:00    1170              173
## 6 21742448 2019-01-01 00:15:00 2019-01-01 00:19:00    2437               98
## # ℹ 8 more variables: start_station_name <chr>, end_station_id <dbl>,
## #   end_station_name <chr>, user_type <chr>, gender <chr>, birth_year <dbl>,
## #   day_of_week <dbl>, ride_length <dttm>
tail(all_trips)
## # A tibble: 6 × 13
##    trip_id start_time          end_time            bike_id start_station_id
##      <dbl> <dttm>              <dttm>                <dbl>            <dbl>
## 1 25962899 2019-12-31 23:54:00 2020-01-01 00:22:00    5996              145
## 2 25962900 2019-12-31 23:56:00 2020-01-01 00:15:00    2196              112
## 3 25962901 2019-12-31 23:56:00 2020-01-01 00:22:00    4877               90
## 4 25962902 2019-12-31 23:57:00 2020-01-01 00:05:00     863              623
## 5 25962903 2019-12-31 23:57:00 2020-01-01 00:05:00    2637              623
## 6 25962904 2019-12-31 23:57:00 2019-12-31 23:59:00    5930              256
## # ℹ 8 more variables: start_station_name <chr>, end_station_id <dbl>,
## #   end_station_name <chr>, user_type <chr>, gender <chr>, birth_year <dbl>,
## #   day_of_week <dbl>, ride_length <dttm>
str(all_trips)
## tibble [2,136,815 × 13] (S3: tbl_df/tbl/data.frame)
##  $ trip_id           : num [1:2136815] 21742443 21742444 21742445 21742446 21742447 ...
##  $ start_time        : POSIXct[1:2136815], format: "2019-01-01 00:04:00" "2019-01-01 00:08:00" ...
##  $ end_time          : POSIXct[1:2136815], format: "2019-01-01 00:11:00" "2019-01-01 00:15:00" ...
##  $ bike_id           : num [1:2136815] 2167 4386 1524 252 1170 ...
##  $ start_station_id  : num [1:2136815] 199 44 15 123 173 98 98 211 150 268 ...
##  $ start_station_name: chr [1:2136815] "Wabash Ave & Grand Ave" "State St & Randolph St" "Racine Ave & 18th St" "California Ave & Milwaukee Ave" ...
##  $ end_station_id    : num [1:2136815] 84 624 644 176 35 49 49 142 148 141 ...
##  $ end_station_name  : chr [1:2136815] "Milwaukee Ave & Grand Ave" "Dearborn St & Van Buren St (*)" "Western Ave & Fillmore St (*)" "Clark St & Elm St" ...
##  $ user_type         : chr [1:2136815] "Subscriber" "Subscriber" "Subscriber" "Subscriber" ...
##  $ gender            : chr [1:2136815] "Male" "Female" "Female" "Male" ...
##  $ birth_year        : num [1:2136815] 1989 1990 1994 1993 1994 ...
##  $ day_of_week       : num [1:2136815] 3 3 3 3 3 3 3 3 3 3 ...
##  $ ride_length       : POSIXct[1:2136815], format: "1899-12-31 00:07:00" "1899-12-31 00:07:00" ...
summary(all_trips)
##     trip_id           start_time                    
##  Min.   :21742443   Min.   :2019-01-01 00:04:00.00  
##  1st Qu.:22434746   1st Qu.:2019-04-24 18:22:00.00  
##  Median :23670383   Median :2019-07-10 18:28:00.00  
##  Mean   :23717363   Mean   :2019-07-07 13:47:48.33  
##  3rd Qu.:25346030   3rd Qu.:2019-10-08 23:19:30.00  
##  Max.   :25962904   Max.   :2019-12-31 23:57:00.00  
##     end_time                         bike_id     start_station_id
##  Min.   :2019-01-01 00:11:00.00   Min.   :   1   Min.   :  2.0   
##  1st Qu.:2019-04-24 18:35:00.00   1st Qu.:1738   1st Qu.: 80.0   
##  Median :2019-07-10 18:45:00.00   Median :3475   Median :175.0   
##  Mean   :2019-07-07 14:02:26.65   Mean   :3401   Mean   :204.3   
##  3rd Qu.:2019-10-08 23:31:30.00   3rd Qu.:5086   3rd Qu.:291.0   
##  Max.   :2020-01-01 17:25:00.00   Max.   :6946   Max.   :673.0   
##  start_station_name end_station_id  end_station_name    user_type        
##  Length:2136815     Min.   :  2.0   Length:2136815     Length:2136815    
##  Class :character   1st Qu.: 80.0   Class :character   Class :character  
##  Mode  :character   Median :175.0   Mode  :character   Mode  :character  
##                     Mean   :204.7                                        
##                     3rd Qu.:291.0                                        
##                     Max.   :673.0                                        
##     gender            birth_year    day_of_week   
##  Length:2136815     Min.   :1759   Min.   :1.000  
##  Class :character   1st Qu.:1978   1st Qu.:2.000  
##  Mode  :character   Median :1987   Median :4.000  
##                     Mean   :1984   Mean   :3.973  
##                     3rd Qu.:1992   3rd Qu.:6.000  
##                     Max.   :2014   Max.   :7.000  
##   ride_length                    
##  Min.   :1899-12-31 00:01:00.00  
##  1st Qu.:1899-12-31 00:06:00.00  
##  Median :1899-12-31 00:10:00.00  
##  Mean   :1899-12-31 00:14:38.30  
##  3rd Qu.:1899-12-31 00:17:00.00  
##  Max.   :1899-12-31 23:57:00.00
# Add columns that list the date, month, day, and year of each ride
# This will allow us to aggregate ride data for each month, day, or year

all_trips$date <- as.Date(all_trips$start_time)
all_trips$month <- format(as.Date(all_trips$date),"%m")
all_trips$day <- format(as.Date(all_trips$date),"%d")
all_trips$year <- format(as.Date(all_trips$date),"%Y")
all_trips$day_of_week <- format(as.Date(all_trips$date),"%A")

# Inspect the structure of the columns

str(all_trips)
## tibble [2,136,815 × 17] (S3: tbl_df/tbl/data.frame)
##  $ trip_id           : num [1:2136815] 21742443 21742444 21742445 21742446 21742447 ...
##  $ start_time        : POSIXct[1:2136815], format: "2019-01-01 00:04:00" "2019-01-01 00:08:00" ...
##  $ end_time          : POSIXct[1:2136815], format: "2019-01-01 00:11:00" "2019-01-01 00:15:00" ...
##  $ bike_id           : num [1:2136815] 2167 4386 1524 252 1170 ...
##  $ start_station_id  : num [1:2136815] 199 44 15 123 173 98 98 211 150 268 ...
##  $ start_station_name: chr [1:2136815] "Wabash Ave & Grand Ave" "State St & Randolph St" "Racine Ave & 18th St" "California Ave & Milwaukee Ave" ...
##  $ end_station_id    : num [1:2136815] 84 624 644 176 35 49 49 142 148 141 ...
##  $ end_station_name  : chr [1:2136815] "Milwaukee Ave & Grand Ave" "Dearborn St & Van Buren St (*)" "Western Ave & Fillmore St (*)" "Clark St & Elm St" ...
##  $ user_type         : chr [1:2136815] "Subscriber" "Subscriber" "Subscriber" "Subscriber" ...
##  $ gender            : chr [1:2136815] "Male" "Female" "Female" "Male" ...
##  $ birth_year        : num [1:2136815] 1989 1990 1994 1993 1994 ...
##  $ day_of_week       : chr [1:2136815] "Tuesday" "Tuesday" "Tuesday" "Tuesday" ...
##  $ ride_length       : POSIXct[1:2136815], format: "1899-12-31 00:07:00" "1899-12-31 00:07:00" ...
##  $ date              : Date[1:2136815], format: "2019-01-01" "2019-01-01" ...
##  $ month             : chr [1:2136815] "01" "01" "01" "01" ...
##  $ day               : chr [1:2136815] "01" "01" "01" "01" ...
##  $ year              : chr [1:2136815] "2019" "2019" "2019" "2019" ...
# Add a "ride_length" calculation to all_trips (in seconds)

all_trips$ride_length <- difftime(all_trips$end_time,all_trips$start_time)

Perform insightful analyses

#=============================
# PERFORM DESCRIPTIVE ANALYSES
#=============================

summary(all_trips$ride_length)
##   Length    Class     Mode 
##  2136815 difftime  numeric
mean(all_trips$ride_length) #straight average (total ride length / rides)
## Time difference of 14.63847 mins
median(all_trips$ride_length) #midpoint number in the ascending array of ride lengths
## Time difference of 10 mins
max(all_trips$ride_length) #longest ride
## Time difference of 1437 mins
min(all_trips$ride_length) #shortest ride
## Time difference of 1 mins
# Compare subscribers (annual members) and customers (non-members)

aggregate(all_trips$ride_length ~ all_trips$user_type, FUN = mean)
##   all_trips$user_type all_trips$ride_length
## 1            Customer         36.07204 mins
## 2          Subscriber         12.55834 mins
aggregate(all_trips$ride_length ~ all_trips$user_type, FUN = median)
##   all_trips$user_type all_trips$ride_length
## 1            Customer               23 mins
## 2          Subscriber                9 mins
aggregate(all_trips$ride_length ~ all_trips$user_type, FUN = max)
##   all_trips$user_type all_trips$ride_length
## 1            Customer             1437 mins
## 2          Subscriber             1433 mins
aggregate(all_trips$ride_length ~ all_trips$user_type, FUN = min)
##   all_trips$user_type all_trips$ride_length
## 1            Customer                1 mins
## 2          Subscriber                1 mins
# See the average ride time by each day for subscribers vs customers

aggregate(all_trips$ride_length ~ all_trips$user_type + all_trips$day_of_week, FUN = mean)
##    all_trips$user_type all_trips$day_of_week all_trips$ride_length
## 1             Customer                Friday         35.25310 mins
## 2           Subscriber                Friday         12.09061 mins
## 3             Customer                Monday         35.31018 mins
## 4           Subscriber                Monday         12.30750 mins
## 5             Customer              Saturday         38.79385 mins
## 6           Subscriber              Saturday         13.94203 mins
## 7             Customer                Sunday         38.63173 mins
## 8           Subscriber                Sunday         14.04006 mins
## 9             Customer              Thursday         34.08715 mins
## 10          Subscriber              Thursday         12.26093 mins
## 11            Customer               Tuesday         33.20637 mins
## 12          Subscriber               Tuesday         12.21851 mins
## 13            Customer             Wednesday         32.83618 mins
## 14          Subscriber             Wednesday         12.34527 mins
# Fixes out of order days from command above

all_trips$day_of_week <- ordered(all_trips$day_of_week, levels = c("Sunday","Monday","Tuesday","Wednesday",
"Thursday","Friday","Saturday"))

# Rerun average ride time per day by user type

aggregate(all_trips$ride_length ~ all_trips$user_type + all_trips$day_of_week, FUN = mean)
##    all_trips$user_type all_trips$day_of_week all_trips$ride_length
## 1             Customer                Sunday         38.63173 mins
## 2           Subscriber                Sunday         14.04006 mins
## 3             Customer                Monday         35.31018 mins
## 4           Subscriber                Monday         12.30750 mins
## 5             Customer               Tuesday         33.20637 mins
## 6           Subscriber               Tuesday         12.21851 mins
## 7             Customer             Wednesday         32.83618 mins
## 8           Subscriber             Wednesday         12.34527 mins
## 9             Customer              Thursday         34.08715 mins
## 10          Subscriber              Thursday         12.26093 mins
## 11            Customer                Friday         35.25310 mins
## 12          Subscriber                Friday         12.09061 mins
## 13            Customer              Saturday         38.79385 mins
## 14          Subscriber              Saturday         13.94203 mins
# Chart ridership data by type and weekday
all_trips %>%
  mutate(weekday = wday(start_time, label = TRUE)) %>%
  group_by(user_type,weekday) %>%
  summarise(number_of_rides = n(),
            average_duration = mean(ride_length)) %>%
  arrange(user_type, weekday)
## `summarise()` has grouped output by 'user_type'. You can override using the
## `.groups` argument.
## # A tibble: 14 × 4
## # Groups:   user_type [2]
##    user_type  weekday number_of_rides average_duration
##    <chr>      <ord>             <int> <drtn>          
##  1 Customer   Sun               38146 38.63173 mins   
##  2 Customer   Mon               23493 35.31018 mins   
##  3 Customer   Tue               21408 33.20637 mins   
##  4 Customer   Wed               20327 32.83618 mins   
##  5 Customer   Thu               21160 34.08715 mins   
##  6 Customer   Fri               24338 35.25310 mins   
##  7 Customer   Sat               40161 38.79385 mins   
##  8 Subscriber Sun              170752 14.04006 mins   
##  9 Subscriber Mon              316572 12.30750 mins   
## 10 Subscriber Tue              349273 12.21851 mins   
## 11 Subscriber Wed              330483 12.34527 mins   
## 12 Subscriber Thu              308413 12.26093 mins   
## 13 Subscriber Fri              295055 12.09061 mins   
## 14 Subscriber Sat              177234 13.94203 mins
# Chart number of rides by user type
all_trips %>%
  mutate(weekday = wday(start_time, label = TRUE)) %>%
  group_by(user_type, weekday) %>%
  summarise(number_of_rides = n(),
            average_duration = mean(ride_length)) %>%
  arrange(user_type, weekday) %>%
  ggplot(aes(x = weekday, y = number_of_rides, fill = user_type)) +
  geom_col(position = "dodge") +
  labs(y = "number of rides") #+
## `summarise()` has grouped output by 'user_type'. You can override using the
## `.groups` argument.

    #scale_y_continuous(labels = label_number(big.mark = ","))
  
# Chart average ride duration
all_trips %>%
  mutate(weekday = wday(start_time, label = TRUE)) %>%
  group_by(user_type, weekday) %>%
  summarise(number_of_rides = n(),
            average_duration = mean(ride_length)) %>%
  arrange(user_type, weekday) %>%
  ggplot(aes(x = weekday, y = average_duration, fill = user_type)) +
  geom_col(position = "dodge") +
  labs(y = "average duration (minutes)")
## `summarise()` has grouped output by 'user_type'. You can override using the
## `.groups` argument.
## Don't know how to automatically pick scale for object of type <difftime>.
## Defaulting to continuous.

Create and export summary file

#====================
# EXPORT SUMMARY FILE
#====================

counts <- aggregate(all_trips$ride_length ~ all_trips$user_type +
                      all_trips$day_of_week, FUN = mean)
write.csv(counts, file = 'avg_ride_length.csv')