Notes: Critical packages loaded below. ‘readxl’ was very important
for importing soon to be dataframes.
library(tidyverse) # Data wrangling
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(conflicted) # Resolve any conflicts
library(readxl) # Enables import of spreadsheets
# Set dplyr::filter and dplyr::lag as the default choices
conflict_prefer("filter","dplyr")
## [conflicted] Will prefer dplyr::filter over any other package.
conflict_prefer("lag","dplyr")
## [conflicted] Will prefer dplyr::lag over any other package.
Read in Excel spreadsheets. Due to file size, using MS Excel was
superior to Google Sheets
#======================
# PHASE 1: COLLECT DATA
#======================
## Spreadsheet upload
Q1_2019 <- read_excel("C:\\Users\\corey\\OneDrive\\Desktop\\Capstone\\Bike_Trips_2019\\Excel_Data\\Cleaned_Excel_Files\\Q1.xlsx")
Q2_2019 <- read_excel("C:\\Users\\corey\\OneDrive\\Desktop\\Capstone\\Bike_Trips_2019\\Excel_Data\\Cleaned_Excel_Files\\Q2.xlsx")
Q3_2019 <- read_excel("C:\\Users\\corey\\OneDrive\\Desktop\\Capstone\\Bike_Trips_2019\\Excel_Data\\Cleaned_Excel_Files\\Q3.xlsx")
Q4_2019 <- read_excel("C:\\Users\\corey\\OneDrive\\Desktop\\Capstone\\Bike_Trips_2019\\Excel_Data\\Cleaned_Excel_Files\\Q4.xlsx")
Initial review for data discrepancies and then merging all four
quarterly subsets into one file
#==================================
# PHASE 2: DATA WRANGLING & MERGING
#==================================
## Compare column names to check for discrepancies
colnames(Q1_2019)
## [1] "trip_id" "start_time" "end_time"
## [4] "bike_id" "start_station_id" "start_station_name"
## [7] "end_station_id" "end_station_name" "user_type"
## [10] "gender" "birth_year" "day_of_week"
## [13] "ride_length"
colnames(Q2_2019)
## [1] "trip_id" "start_time" "end_time"
## [4] "bike_id" "start_station_id" "start_station_name"
## [7] "end_station_id" "end_station_name" "user_type"
## [10] "gender" "birth_year" "day_of_week"
## [13] "ride_length"
colnames(Q3_2019)
## [1] "trip_id" "start_time" "end_time"
## [4] "bike_id" "start_station_id" "start_station_name"
## [7] "end_station_id" "end_station_name" "user_type"
## [10] "gender" "birth_year" "day_of_week"
## [13] "ride_length"
colnames(Q4_2019)
## [1] "trip_id" "start_time" "end_time"
## [4] "bike_id" "start_station_id" "start_station_name"
## [7] "end_station_id" "end_station_name" "user_type"
## [10] "gender" "birth_year" "day_of_week"
## [13] "ride_length"
## Inspect the data frames and look for incongruities
str(Q1_2019)
## tibble [345,222 × 13] (S3: tbl_df/tbl/data.frame)
## $ trip_id : num [1:345222] 21742443 21742444 21742445 21742446 21742447 ...
## $ start_time : POSIXct[1:345222], format: "2019-01-01 00:04:00" "2019-01-01 00:08:00" ...
## $ end_time : POSIXct[1:345222], format: "2019-01-01 00:11:00" "2019-01-01 00:15:00" ...
## $ bike_id : num [1:345222] 2167 4386 1524 252 1170 ...
## $ start_station_id : num [1:345222] 199 44 15 123 173 98 98 211 150 268 ...
## $ start_station_name: chr [1:345222] "Wabash Ave & Grand Ave" "State St & Randolph St" "Racine Ave & 18th St" "California Ave & Milwaukee Ave" ...
## $ end_station_id : num [1:345222] 84 624 644 176 35 49 49 142 148 141 ...
## $ end_station_name : chr [1:345222] "Milwaukee Ave & Grand Ave" "Dearborn St & Van Buren St (*)" "Western Ave & Fillmore St (*)" "Clark St & Elm St" ...
## $ user_type : chr [1:345222] "Subscriber" "Subscriber" "Subscriber" "Subscriber" ...
## $ gender : chr [1:345222] "Male" "Female" "Female" "Male" ...
## $ birth_year : num [1:345222] 1989 1990 1994 1993 1994 ...
## $ day_of_week : num [1:345222] 3 3 3 3 3 3 3 3 3 3 ...
## $ ride_length : POSIXct[1:345222], format: "1899-12-31 00:07:00" "1899-12-31 00:07:00" ...
str(Q2_2019)
## tibble [594,807 × 13] (S3: tbl_df/tbl/data.frame)
## $ trip_id : num [1:594807] 22178529 22178530 22178531 22178532 22178533 ...
## $ start_time : POSIXct[1:594807], format: "2019-04-01 00:02:00" "2019-04-01 00:03:00" ...
## $ end_time : POSIXct[1:594807], format: "2019-04-01 00:09:00" "2019-04-01 00:20:00" ...
## $ bike_id : num [1:594807] 6251 6226 5649 4151 3270 ...
## $ start_station_id : num [1:594807] 81 317 283 26 202 420 503 260 304 37 ...
## $ start_station_name: chr [1:594807] "Daley Center Plaza" "Wood St & Taylor St" "LaSalle St & Jackson Blvd" "McClurg Ct & Illinois St" ...
## $ end_station_id : num [1:594807] 56 59 174 133 129 426 500 499 232 337 ...
## $ end_station_name : chr [1:594807] "Desplaines St & Kinzie St" "Wabash Ave & Roosevelt Rd" "Canal St & Madison St" "Kingsbury St & Kinzie St" ...
## $ user_type : chr [1:594807] "Subscriber" "Subscriber" "Subscriber" "Subscriber" ...
## $ gender : chr [1:594807] "Male" "Female" "Male" "Male" ...
## $ birth_year : num [1:594807] 1975 1984 1990 1993 1992 ...
## $ day_of_week : num [1:594807] 2 2 2 2 2 2 2 2 2 2 ...
## $ ride_length : POSIXct[1:594807], format: "1899-12-31 00:07:00" "1899-12-31 00:17:00" ...
str(Q3_2019)
## tibble [559,507 × 13] (S3: tbl_df/tbl/data.frame)
## $ trip_id : num [1:559507] 23479388 23479393 23479406 23479408 23479409 ...
## $ start_time : POSIXct[1:559507], format: "2019-07-01 00:00:00" "2019-07-01 00:02:00" ...
## $ end_time : POSIXct[1:559507], format: "2019-07-01 00:20:00" "2019-07-01 00:07:00" ...
## $ bike_id : num [1:559507] 3591 4941 2758 2447 6336 ...
## $ start_station_id : num [1:559507] 117 300 624 239 257 240 6 6 6 203 ...
## $ start_station_name: chr [1:559507] "Wilton Ave & Belmont Ave" "Broadway & Barry Ave" "Dearborn St & Van Buren St" "Western Ave & Leland Ave" ...
## $ end_station_id : num [1:559507] 497 232 237 455 295 326 321 321 321 203 ...
## $ end_station_name : chr [1:559507] "Kimball Ave & Belmont Ave" "Pine Grove Ave & Waveland Ave" "MLK Jr Dr & 29th St" "Maplewood Ave & Peterson Ave" ...
## $ user_type : chr [1:559507] "Subscriber" "Subscriber" "Subscriber" "Subscriber" ...
## $ gender : chr [1:559507] "Male" "Male" "Male" "Male" ...
## $ birth_year : num [1:559507] 1992 1990 1995 1970 1993 ...
## $ day_of_week : num [1:559507] 2 2 2 2 2 2 2 2 2 2 ...
## $ ride_length : POSIXct[1:559507], format: "1899-12-31 00:20:00" "1899-12-31 00:05:00" ...
str(Q4_2019)
## tibble [637,279 × 13] (S3: tbl_df/tbl/data.frame)
## $ trip_id : num [1:637279] 25223640 25223641 25223642 25223643 25223644 ...
## $ start_time : POSIXct[1:637279], format: "2019-10-01 00:01:00" "2019-10-01 00:02:00" ...
## $ end_time : POSIXct[1:637279], format: "2019-10-01 00:17:00" "2019-10-01 00:06:00" ...
## $ bike_id : num [1:637279] 2215 6328 3003 3275 5294 ...
## $ start_station_id : num [1:637279] 20 19 84 313 210 156 84 156 156 77 ...
## $ start_station_name: chr [1:637279] "Sheffield Ave & Kingsbury St" "Throop (Loomis) St & Taylor St" "Milwaukee Ave & Grand Ave" "Lakeview Ave & Fullerton Pkwy" ...
## $ end_station_id : num [1:637279] 309 241 199 290 382 226 142 463 463 506 ...
## $ end_station_name : chr [1:637279] "Leavitt St & Armitage Ave" "Morgan St & Polk St" "Wabash Ave & Grand Ave" "Kedzie Ave & Palmer Ct" ...
## $ user_type : chr [1:637279] "Subscriber" "Subscriber" "Subscriber" "Subscriber" ...
## $ gender : chr [1:637279] "Male" "Male" "Female" "Male" ...
## $ birth_year : num [1:637279] 1987 1998 1991 1990 1987 ...
## $ day_of_week : num [1:637279] 3 3 3 3 3 3 3 3 3 3 ...
## $ ride_length : POSIXct[1:637279], format: "1899-12-31 00:16:00" "1899-12-31 00:04:00" ...
# Stack individual quarter's data frames into one big data frame
all_trips <- bind_rows(Q1_2019,Q2_2019,Q3_2019,Q4_2019)
Preparing for data analysis
#===================================
# PHASE 3: PREPARE DATA FOR ANALYSIS
#===================================
colnames(all_trips)
## [1] "trip_id" "start_time" "end_time"
## [4] "bike_id" "start_station_id" "start_station_name"
## [7] "end_station_id" "end_station_name" "user_type"
## [10] "gender" "birth_year" "day_of_week"
## [13] "ride_length"
nrow(all_trips)
## [1] 2136815
dim(all_trips)
## [1] 2136815 13
head(all_trips)
## # A tibble: 6 × 13
## trip_id start_time end_time bike_id start_station_id
## <dbl> <dttm> <dttm> <dbl> <dbl>
## 1 21742443 2019-01-01 00:04:00 2019-01-01 00:11:00 2167 199
## 2 21742444 2019-01-01 00:08:00 2019-01-01 00:15:00 4386 44
## 3 21742445 2019-01-01 00:13:00 2019-01-01 00:27:00 1524 15
## 4 21742446 2019-01-01 00:13:00 2019-01-01 00:43:00 252 123
## 5 21742447 2019-01-01 00:14:00 2019-01-01 00:20:00 1170 173
## 6 21742448 2019-01-01 00:15:00 2019-01-01 00:19:00 2437 98
## # ℹ 8 more variables: start_station_name <chr>, end_station_id <dbl>,
## # end_station_name <chr>, user_type <chr>, gender <chr>, birth_year <dbl>,
## # day_of_week <dbl>, ride_length <dttm>
tail(all_trips)
## # A tibble: 6 × 13
## trip_id start_time end_time bike_id start_station_id
## <dbl> <dttm> <dttm> <dbl> <dbl>
## 1 25962899 2019-12-31 23:54:00 2020-01-01 00:22:00 5996 145
## 2 25962900 2019-12-31 23:56:00 2020-01-01 00:15:00 2196 112
## 3 25962901 2019-12-31 23:56:00 2020-01-01 00:22:00 4877 90
## 4 25962902 2019-12-31 23:57:00 2020-01-01 00:05:00 863 623
## 5 25962903 2019-12-31 23:57:00 2020-01-01 00:05:00 2637 623
## 6 25962904 2019-12-31 23:57:00 2019-12-31 23:59:00 5930 256
## # ℹ 8 more variables: start_station_name <chr>, end_station_id <dbl>,
## # end_station_name <chr>, user_type <chr>, gender <chr>, birth_year <dbl>,
## # day_of_week <dbl>, ride_length <dttm>
str(all_trips)
## tibble [2,136,815 × 13] (S3: tbl_df/tbl/data.frame)
## $ trip_id : num [1:2136815] 21742443 21742444 21742445 21742446 21742447 ...
## $ start_time : POSIXct[1:2136815], format: "2019-01-01 00:04:00" "2019-01-01 00:08:00" ...
## $ end_time : POSIXct[1:2136815], format: "2019-01-01 00:11:00" "2019-01-01 00:15:00" ...
## $ bike_id : num [1:2136815] 2167 4386 1524 252 1170 ...
## $ start_station_id : num [1:2136815] 199 44 15 123 173 98 98 211 150 268 ...
## $ start_station_name: chr [1:2136815] "Wabash Ave & Grand Ave" "State St & Randolph St" "Racine Ave & 18th St" "California Ave & Milwaukee Ave" ...
## $ end_station_id : num [1:2136815] 84 624 644 176 35 49 49 142 148 141 ...
## $ end_station_name : chr [1:2136815] "Milwaukee Ave & Grand Ave" "Dearborn St & Van Buren St (*)" "Western Ave & Fillmore St (*)" "Clark St & Elm St" ...
## $ user_type : chr [1:2136815] "Subscriber" "Subscriber" "Subscriber" "Subscriber" ...
## $ gender : chr [1:2136815] "Male" "Female" "Female" "Male" ...
## $ birth_year : num [1:2136815] 1989 1990 1994 1993 1994 ...
## $ day_of_week : num [1:2136815] 3 3 3 3 3 3 3 3 3 3 ...
## $ ride_length : POSIXct[1:2136815], format: "1899-12-31 00:07:00" "1899-12-31 00:07:00" ...
summary(all_trips)
## trip_id start_time
## Min. :21742443 Min. :2019-01-01 00:04:00.00
## 1st Qu.:22434746 1st Qu.:2019-04-24 18:22:00.00
## Median :23670383 Median :2019-07-10 18:28:00.00
## Mean :23717363 Mean :2019-07-07 13:47:48.33
## 3rd Qu.:25346030 3rd Qu.:2019-10-08 23:19:30.00
## Max. :25962904 Max. :2019-12-31 23:57:00.00
## end_time bike_id start_station_id
## Min. :2019-01-01 00:11:00.00 Min. : 1 Min. : 2.0
## 1st Qu.:2019-04-24 18:35:00.00 1st Qu.:1738 1st Qu.: 80.0
## Median :2019-07-10 18:45:00.00 Median :3475 Median :175.0
## Mean :2019-07-07 14:02:26.65 Mean :3401 Mean :204.3
## 3rd Qu.:2019-10-08 23:31:30.00 3rd Qu.:5086 3rd Qu.:291.0
## Max. :2020-01-01 17:25:00.00 Max. :6946 Max. :673.0
## start_station_name end_station_id end_station_name user_type
## Length:2136815 Min. : 2.0 Length:2136815 Length:2136815
## Class :character 1st Qu.: 80.0 Class :character Class :character
## Mode :character Median :175.0 Mode :character Mode :character
## Mean :204.7
## 3rd Qu.:291.0
## Max. :673.0
## gender birth_year day_of_week
## Length:2136815 Min. :1759 Min. :1.000
## Class :character 1st Qu.:1978 1st Qu.:2.000
## Mode :character Median :1987 Median :4.000
## Mean :1984 Mean :3.973
## 3rd Qu.:1992 3rd Qu.:6.000
## Max. :2014 Max. :7.000
## ride_length
## Min. :1899-12-31 00:01:00.00
## 1st Qu.:1899-12-31 00:06:00.00
## Median :1899-12-31 00:10:00.00
## Mean :1899-12-31 00:14:38.30
## 3rd Qu.:1899-12-31 00:17:00.00
## Max. :1899-12-31 23:57:00.00
# Add columns that list the date, month, day, and year of each ride
# This will allow us to aggregate ride data for each month, day, or year
all_trips$date <- as.Date(all_trips$start_time)
all_trips$month <- format(as.Date(all_trips$date),"%m")
all_trips$day <- format(as.Date(all_trips$date),"%d")
all_trips$year <- format(as.Date(all_trips$date),"%Y")
all_trips$day_of_week <- format(as.Date(all_trips$date),"%A")
# Inspect the structure of the columns
str(all_trips)
## tibble [2,136,815 × 17] (S3: tbl_df/tbl/data.frame)
## $ trip_id : num [1:2136815] 21742443 21742444 21742445 21742446 21742447 ...
## $ start_time : POSIXct[1:2136815], format: "2019-01-01 00:04:00" "2019-01-01 00:08:00" ...
## $ end_time : POSIXct[1:2136815], format: "2019-01-01 00:11:00" "2019-01-01 00:15:00" ...
## $ bike_id : num [1:2136815] 2167 4386 1524 252 1170 ...
## $ start_station_id : num [1:2136815] 199 44 15 123 173 98 98 211 150 268 ...
## $ start_station_name: chr [1:2136815] "Wabash Ave & Grand Ave" "State St & Randolph St" "Racine Ave & 18th St" "California Ave & Milwaukee Ave" ...
## $ end_station_id : num [1:2136815] 84 624 644 176 35 49 49 142 148 141 ...
## $ end_station_name : chr [1:2136815] "Milwaukee Ave & Grand Ave" "Dearborn St & Van Buren St (*)" "Western Ave & Fillmore St (*)" "Clark St & Elm St" ...
## $ user_type : chr [1:2136815] "Subscriber" "Subscriber" "Subscriber" "Subscriber" ...
## $ gender : chr [1:2136815] "Male" "Female" "Female" "Male" ...
## $ birth_year : num [1:2136815] 1989 1990 1994 1993 1994 ...
## $ day_of_week : chr [1:2136815] "Tuesday" "Tuesday" "Tuesday" "Tuesday" ...
## $ ride_length : POSIXct[1:2136815], format: "1899-12-31 00:07:00" "1899-12-31 00:07:00" ...
## $ date : Date[1:2136815], format: "2019-01-01" "2019-01-01" ...
## $ month : chr [1:2136815] "01" "01" "01" "01" ...
## $ day : chr [1:2136815] "01" "01" "01" "01" ...
## $ year : chr [1:2136815] "2019" "2019" "2019" "2019" ...
# Add a "ride_length" calculation to all_trips (in seconds)
all_trips$ride_length <- difftime(all_trips$end_time,all_trips$start_time)
Perform insightful analyses
#=============================
# PERFORM DESCRIPTIVE ANALYSES
#=============================
summary(all_trips$ride_length)
## Length Class Mode
## 2136815 difftime numeric
mean(all_trips$ride_length) #straight average (total ride length / rides)
## Time difference of 14.63847 mins
median(all_trips$ride_length) #midpoint number in the ascending array of ride lengths
## Time difference of 10 mins
max(all_trips$ride_length) #longest ride
## Time difference of 1437 mins
min(all_trips$ride_length) #shortest ride
## Time difference of 1 mins
# Compare subscribers (annual members) and customers (non-members)
aggregate(all_trips$ride_length ~ all_trips$user_type, FUN = mean)
## all_trips$user_type all_trips$ride_length
## 1 Customer 36.07204 mins
## 2 Subscriber 12.55834 mins
aggregate(all_trips$ride_length ~ all_trips$user_type, FUN = median)
## all_trips$user_type all_trips$ride_length
## 1 Customer 23 mins
## 2 Subscriber 9 mins
aggregate(all_trips$ride_length ~ all_trips$user_type, FUN = max)
## all_trips$user_type all_trips$ride_length
## 1 Customer 1437 mins
## 2 Subscriber 1433 mins
aggregate(all_trips$ride_length ~ all_trips$user_type, FUN = min)
## all_trips$user_type all_trips$ride_length
## 1 Customer 1 mins
## 2 Subscriber 1 mins
# See the average ride time by each day for subscribers vs customers
aggregate(all_trips$ride_length ~ all_trips$user_type + all_trips$day_of_week, FUN = mean)
## all_trips$user_type all_trips$day_of_week all_trips$ride_length
## 1 Customer Friday 35.25310 mins
## 2 Subscriber Friday 12.09061 mins
## 3 Customer Monday 35.31018 mins
## 4 Subscriber Monday 12.30750 mins
## 5 Customer Saturday 38.79385 mins
## 6 Subscriber Saturday 13.94203 mins
## 7 Customer Sunday 38.63173 mins
## 8 Subscriber Sunday 14.04006 mins
## 9 Customer Thursday 34.08715 mins
## 10 Subscriber Thursday 12.26093 mins
## 11 Customer Tuesday 33.20637 mins
## 12 Subscriber Tuesday 12.21851 mins
## 13 Customer Wednesday 32.83618 mins
## 14 Subscriber Wednesday 12.34527 mins
# Fixes out of order days from command above
all_trips$day_of_week <- ordered(all_trips$day_of_week, levels = c("Sunday","Monday","Tuesday","Wednesday",
"Thursday","Friday","Saturday"))
# Rerun average ride time per day by user type
aggregate(all_trips$ride_length ~ all_trips$user_type + all_trips$day_of_week, FUN = mean)
## all_trips$user_type all_trips$day_of_week all_trips$ride_length
## 1 Customer Sunday 38.63173 mins
## 2 Subscriber Sunday 14.04006 mins
## 3 Customer Monday 35.31018 mins
## 4 Subscriber Monday 12.30750 mins
## 5 Customer Tuesday 33.20637 mins
## 6 Subscriber Tuesday 12.21851 mins
## 7 Customer Wednesday 32.83618 mins
## 8 Subscriber Wednesday 12.34527 mins
## 9 Customer Thursday 34.08715 mins
## 10 Subscriber Thursday 12.26093 mins
## 11 Customer Friday 35.25310 mins
## 12 Subscriber Friday 12.09061 mins
## 13 Customer Saturday 38.79385 mins
## 14 Subscriber Saturday 13.94203 mins
# Chart ridership data by type and weekday
all_trips %>%
mutate(weekday = wday(start_time, label = TRUE)) %>%
group_by(user_type,weekday) %>%
summarise(number_of_rides = n(),
average_duration = mean(ride_length)) %>%
arrange(user_type, weekday)
## `summarise()` has grouped output by 'user_type'. You can override using the
## `.groups` argument.
## # A tibble: 14 × 4
## # Groups: user_type [2]
## user_type weekday number_of_rides average_duration
## <chr> <ord> <int> <drtn>
## 1 Customer Sun 38146 38.63173 mins
## 2 Customer Mon 23493 35.31018 mins
## 3 Customer Tue 21408 33.20637 mins
## 4 Customer Wed 20327 32.83618 mins
## 5 Customer Thu 21160 34.08715 mins
## 6 Customer Fri 24338 35.25310 mins
## 7 Customer Sat 40161 38.79385 mins
## 8 Subscriber Sun 170752 14.04006 mins
## 9 Subscriber Mon 316572 12.30750 mins
## 10 Subscriber Tue 349273 12.21851 mins
## 11 Subscriber Wed 330483 12.34527 mins
## 12 Subscriber Thu 308413 12.26093 mins
## 13 Subscriber Fri 295055 12.09061 mins
## 14 Subscriber Sat 177234 13.94203 mins
# Chart number of rides by user type
all_trips %>%
mutate(weekday = wday(start_time, label = TRUE)) %>%
group_by(user_type, weekday) %>%
summarise(number_of_rides = n(),
average_duration = mean(ride_length)) %>%
arrange(user_type, weekday) %>%
ggplot(aes(x = weekday, y = number_of_rides, fill = user_type)) +
geom_col(position = "dodge") +
labs(y = "number of rides") #+
## `summarise()` has grouped output by 'user_type'. You can override using the
## `.groups` argument.

#scale_y_continuous(labels = label_number(big.mark = ","))
# Chart average ride duration
all_trips %>%
mutate(weekday = wday(start_time, label = TRUE)) %>%
group_by(user_type, weekday) %>%
summarise(number_of_rides = n(),
average_duration = mean(ride_length)) %>%
arrange(user_type, weekday) %>%
ggplot(aes(x = weekday, y = average_duration, fill = user_type)) +
geom_col(position = "dodge") +
labs(y = "average duration (minutes)")
## `summarise()` has grouped output by 'user_type'. You can override using the
## `.groups` argument.
## Don't know how to automatically pick scale for object of type <difftime>.
## Defaulting to continuous.

Create and export summary file
#====================
# EXPORT SUMMARY FILE
#====================
counts <- aggregate(all_trips$ride_length ~ all_trips$user_type +
all_trips$day_of_week, FUN = mean)
write.csv(counts, file = 'avg_ride_length.csv')