Notes: The critical packages are loaded below. ‘readxl’ is essential for importing the Excel spreadsheets that become the data frames.

library(tidyverse) # Data wrangling

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(conflicted) # Resolve any conflicts
library(readxl) # Enables import of spreadsheets

# Resolve the two masking conflicts reported when tidyverse was attached:
# filter() and lag() should always come from dplyr rather than stats.
conflict_prefer(name = "filter", winner = "dplyr")

## [conflicted] Will prefer dplyr::filter over any other package.

conflict_prefer(name = "lag", winner = "dplyr")

## [conflicted] Will prefer dplyr::lag over any other package.

Read in the Excel spreadsheets. Because of the size of the files, MS Excel was a better choice than Google Sheets.

#======================
# PHASE 1: COLLECT DATA
#======================

## Spreadsheet upload

# Folder holding the four cleaned quarterly workbooks. The default is the
# original author's location; set the BIKE_TRIPS_DIR environment variable to
# run the script on another machine without editing the code.
excel_dir <- Sys.getenv(
  "BIKE_TRIPS_DIR",
  unset = file.path(
    "C:", "Users", "corey", "OneDrive", "Desktop", "Capstone",
    "Bike_Trips_2019", "Excel_Data", "Cleaned_Excel_Files"
  )
)

# One tibble per quarter; the object names are used by the later steps.
Q1_2019 <- read_excel(file.path(excel_dir, "Q1.xlsx"))
Q2_2019 <- read_excel(file.path(excel_dir, "Q2.xlsx"))
Q3_2019 <- read_excel(file.path(excel_dir, "Q3.xlsx"))
Q4_2019 <- read_excel(file.path(excel_dir, "Q4.xlsx"))

Perform an initial review for data discrepancies, then merge all four quarterly subsets into one data frame.

#==================================
# PHASE 2: DATA WRANGLING & MERGING
#==================================

# List the column names of each quarter; they must match exactly (same names,
# same order) before the quarters can be stacked into one data frame.

names(Q1_2019)

##  [1] "trip_id"            "start_time"         "end_time"          
##  [4] "bike_id"            "start_station_id"   "start_station_name"
##  [7] "end_station_id"     "end_station_name"   "user_type"         
## [10] "gender"             "birth_year"         "day_of_week"       
## [13] "ride_length"

names(Q2_2019)

##  [1] "trip_id"            "start_time"         "end_time"          
##  [4] "bike_id"            "start_station_id"   "start_station_name"
##  [7] "end_station_id"     "end_station_name"   "user_type"         
## [10] "gender"             "birth_year"         "day_of_week"       
## [13] "ride_length"

names(Q3_2019)

##  [1] "trip_id"            "start_time"         "end_time"          
##  [4] "bike_id"            "start_station_id"   "start_station_name"
##  [7] "end_station_id"     "end_station_name"   "user_type"         
## [10] "gender"             "birth_year"         "day_of_week"       
## [13] "ride_length"

names(Q4_2019)

##  [1] "trip_id"            "start_time"         "end_time"          
##  [4] "bike_id"            "start_station_id"   "start_station_name"
##  [7] "end_station_id"     "end_station_name"   "user_type"         
## [10] "gender"             "birth_year"         "day_of_week"       
## [13] "ride_length"

## Inspect the data frames and look for incongruities
# str() lists every column with its type so the four quarters can be compared.
# NOTE(review): ride_length is imported as a POSIXct anchored at 1899-12-31
# (see output below) rather than as a duration.

str(Q1_2019)

## tibble [345,222 × 13] (S3: tbl_df/tbl/data.frame)
##  $ trip_id           : num [1:345222] 21742443 21742444 21742445 21742446 21742447 ...
##  $ start_time        : POSIXct[1:345222], format: "2019-01-01 00:04:00" "2019-01-01 00:08:00" ...
##  $ end_time          : POSIXct[1:345222], format: "2019-01-01 00:11:00" "2019-01-01 00:15:00" ...
##  $ bike_id           : num [1:345222] 2167 4386 1524 252 1170 ...
##  $ start_station_id  : num [1:345222] 199 44 15 123 173 98 98 211 150 268 ...
##  $ start_station_name: chr [1:345222] "Wabash Ave & Grand Ave" "State St & Randolph St" "Racine Ave & 18th St" "California Ave & Milwaukee Ave" ...
##  $ end_station_id    : num [1:345222] 84 624 644 176 35 49 49 142 148 141 ...
##  $ end_station_name  : chr [1:345222] "Milwaukee Ave & Grand Ave" "Dearborn St & Van Buren St (*)" "Western Ave & Fillmore St (*)" "Clark St & Elm St" ...
##  $ user_type         : chr [1:345222] "Subscriber" "Subscriber" "Subscriber" "Subscriber" ...
##  $ gender            : chr [1:345222] "Male" "Female" "Female" "Male" ...
##  $ birth_year        : num [1:345222] 1989 1990 1994 1993 1994 ...
##  $ day_of_week       : num [1:345222] 3 3 3 3 3 3 3 3 3 3 ...
##  $ ride_length       : POSIXct[1:345222], format: "1899-12-31 00:07:00" "1899-12-31 00:07:00" ...

str(Q2_2019) # Q2: check column types against Q1

## tibble [594,807 × 13] (S3: tbl_df/tbl/data.frame)
##  $ trip_id           : num [1:594807] 22178529 22178530 22178531 22178532 22178533 ...
##  $ start_time        : POSIXct[1:594807], format: "2019-04-01 00:02:00" "2019-04-01 00:03:00" ...
##  $ end_time          : POSIXct[1:594807], format: "2019-04-01 00:09:00" "2019-04-01 00:20:00" ...
##  $ bike_id           : num [1:594807] 6251 6226 5649 4151 3270 ...
##  $ start_station_id  : num [1:594807] 81 317 283 26 202 420 503 260 304 37 ...
##  $ start_station_name: chr [1:594807] "Daley Center Plaza" "Wood St & Taylor St" "LaSalle St & Jackson Blvd" "McClurg Ct & Illinois St" ...
##  $ end_station_id    : num [1:594807] 56 59 174 133 129 426 500 499 232 337 ...
##  $ end_station_name  : chr [1:594807] "Desplaines St & Kinzie St" "Wabash Ave & Roosevelt Rd" "Canal St & Madison St" "Kingsbury St & Kinzie St" ...
##  $ user_type         : chr [1:594807] "Subscriber" "Subscriber" "Subscriber" "Subscriber" ...
##  $ gender            : chr [1:594807] "Male" "Female" "Male" "Male" ...
##  $ birth_year        : num [1:594807] 1975 1984 1990 1993 1992 ...
##  $ day_of_week       : num [1:594807] 2 2 2 2 2 2 2 2 2 2 ...
##  $ ride_length       : POSIXct[1:594807], format: "1899-12-31 00:07:00" "1899-12-31 00:17:00" ...

str(Q3_2019) # Q3: check column types against Q1

## tibble [559,507 × 13] (S3: tbl_df/tbl/data.frame)
##  $ trip_id           : num [1:559507] 23479388 23479393 23479406 23479408 23479409 ...
##  $ start_time        : POSIXct[1:559507], format: "2019-07-01 00:00:00" "2019-07-01 00:02:00" ...
##  $ end_time          : POSIXct[1:559507], format: "2019-07-01 00:20:00" "2019-07-01 00:07:00" ...
##  $ bike_id           : num [1:559507] 3591 4941 2758 2447 6336 ...
##  $ start_station_id  : num [1:559507] 117 300 624 239 257 240 6 6 6 203 ...
##  $ start_station_name: chr [1:559507] "Wilton Ave & Belmont Ave" "Broadway & Barry Ave" "Dearborn St & Van Buren St" "Western Ave & Leland Ave" ...
##  $ end_station_id    : num [1:559507] 497 232 237 455 295 326 321 321 321 203 ...
##  $ end_station_name  : chr [1:559507] "Kimball Ave & Belmont Ave" "Pine Grove Ave & Waveland Ave" "MLK Jr Dr & 29th St" "Maplewood Ave & Peterson Ave" ...
##  $ user_type         : chr [1:559507] "Subscriber" "Subscriber" "Subscriber" "Subscriber" ...
##  $ gender            : chr [1:559507] "Male" "Male" "Male" "Male" ...
##  $ birth_year        : num [1:559507] 1992 1990 1995 1970 1993 ...
##  $ day_of_week       : num [1:559507] 2 2 2 2 2 2 2 2 2 2 ...
##  $ ride_length       : POSIXct[1:559507], format: "1899-12-31 00:20:00" "1899-12-31 00:05:00" ...

str(Q4_2019) # Q4: check column types against Q1

## tibble [637,279 × 13] (S3: tbl_df/tbl/data.frame)
##  $ trip_id           : num [1:637279] 25223640 25223641 25223642 25223643 25223644 ...
##  $ start_time        : POSIXct[1:637279], format: "2019-10-01 00:01:00" "2019-10-01 00:02:00" ...
##  $ end_time          : POSIXct[1:637279], format: "2019-10-01 00:17:00" "2019-10-01 00:06:00" ...
##  $ bike_id           : num [1:637279] 2215 6328 3003 3275 5294 ...
##  $ start_station_id  : num [1:637279] 20 19 84 313 210 156 84 156 156 77 ...
##  $ start_station_name: chr [1:637279] "Sheffield Ave & Kingsbury St" "Throop (Loomis) St & Taylor St" "Milwaukee Ave & Grand Ave" "Lakeview Ave & Fullerton Pkwy" ...
##  $ end_station_id    : num [1:637279] 309 241 199 290 382 226 142 463 463 506 ...
##  $ end_station_name  : chr [1:637279] "Leavitt St & Armitage Ave" "Morgan St & Polk St" "Wabash Ave & Grand Ave" "Kedzie Ave & Palmer Ct" ...
##  $ user_type         : chr [1:637279] "Subscriber" "Subscriber" "Subscriber" "Subscriber" ...
##  $ gender            : chr [1:637279] "Male" "Male" "Female" "Male" ...
##  $ birth_year        : num [1:637279] 1987 1998 1991 1990 1987 ...
##  $ day_of_week       : num [1:637279] 3 3 3 3 3 3 3 3 3 3 ...
##  $ ride_length       : POSIXct[1:637279], format: "1899-12-31 00:16:00" "1899-12-31 00:04:00" ...

# Combine the four quarterly tibbles row-wise into a single data frame
# covering every trip in the four files

all_trips <- bind_rows(list(Q1_2019, Q2_2019, Q3_2019, Q4_2019))

Preparing for data analysis

#===================================
# PHASE 3: PREPARE DATA FOR ANALYSIS
#===================================

names(all_trips) # Column names of the combined data frame

##  [1] "trip_id"            "start_time"         "end_time"          
##  [4] "bike_id"            "start_station_id"   "start_station_name"
##  [7] "end_station_id"     "end_station_name"   "user_type"         
## [10] "gender"             "birth_year"         "day_of_week"       
## [13] "ride_length"

nrow(all_trips) # Number of rows (one per trip) after stacking the quarters

## [1] 2136815

dim(all_trips) # Rows and columns of the combined data frame

## [1] 2136815      13

head(all_trips) # First six rows

## # A tibble: 6 × 13
##    trip_id start_time          end_time            bike_id start_station_id
##      <dbl> <dttm>              <dttm>                <dbl>            <dbl>
## 1 21742443 2019-01-01 00:04:00 2019-01-01 00:11:00    2167              199
## 2 21742444 2019-01-01 00:08:00 2019-01-01 00:15:00    4386               44
## 3 21742445 2019-01-01 00:13:00 2019-01-01 00:27:00    1524               15
## 4 21742446 2019-01-01 00:13:00 2019-01-01 00:43:00     252              123
## 5 21742447 2019-01-01 00:14:00 2019-01-01 00:20:00    1170              173
## 6 21742448 2019-01-01 00:15:00 2019-01-01 00:19:00    2437               98
## # ℹ 8 more variables: start_station_name <chr>, end_station_id <dbl>,
## #   end_station_name <chr>, user_type <chr>, gender <chr>, birth_year <dbl>,
## #   day_of_week <dbl>, ride_length <dttm>

tail(all_trips) # Last six rows

## # A tibble: 6 × 13
##    trip_id start_time          end_time            bike_id start_station_id
##      <dbl> <dttm>              <dttm>                <dbl>            <dbl>
## 1 25962899 2019-12-31 23:54:00 2020-01-01 00:22:00    5996              145
## 2 25962900 2019-12-31 23:56:00 2020-01-01 00:15:00    2196              112
## 3 25962901 2019-12-31 23:56:00 2020-01-01 00:22:00    4877               90
## 4 25962902 2019-12-31 23:57:00 2020-01-01 00:05:00     863              623
## 5 25962903 2019-12-31 23:57:00 2020-01-01 00:05:00    2637              623
## 6 25962904 2019-12-31 23:57:00 2019-12-31 23:59:00    5930              256
## # ℹ 8 more variables: start_station_name <chr>, end_station_id <dbl>,
## #   end_station_name <chr>, user_type <chr>, gender <chr>, birth_year <dbl>,
## #   day_of_week <dbl>, ride_length <dttm>

str(all_trips) # Columns and their data types in the combined data frame

## tibble [2,136,815 × 13] (S3: tbl_df/tbl/data.frame)
##  $ trip_id           : num [1:2136815] 21742443 21742444 21742445 21742446 21742447 ...
##  $ start_time        : POSIXct[1:2136815], format: "2019-01-01 00:04:00" "2019-01-01 00:08:00" ...
##  $ end_time          : POSIXct[1:2136815], format: "2019-01-01 00:11:00" "2019-01-01 00:15:00" ...
##  $ bike_id           : num [1:2136815] 2167 4386 1524 252 1170 ...
##  $ start_station_id  : num [1:2136815] 199 44 15 123 173 98 98 211 150 268 ...
##  $ start_station_name: chr [1:2136815] "Wabash Ave & Grand Ave" "State St & Randolph St" "Racine Ave & 18th St" "California Ave & Milwaukee Ave" ...
##  $ end_station_id    : num [1:2136815] 84 624 644 176 35 49 49 142 148 141 ...
##  $ end_station_name  : chr [1:2136815] "Milwaukee Ave & Grand Ave" "Dearborn St & Van Buren St (*)" "Western Ave & Fillmore St (*)" "Clark St & Elm St" ...
##  $ user_type         : chr [1:2136815] "Subscriber" "Subscriber" "Subscriber" "Subscriber" ...
##  $ gender            : chr [1:2136815] "Male" "Female" "Female" "Male" ...
##  $ birth_year        : num [1:2136815] 1989 1990 1994 1993 1994 ...
##  $ day_of_week       : num [1:2136815] 3 3 3 3 3 3 3 3 3 3 ...
##  $ ride_length       : POSIXct[1:2136815], format: "1899-12-31 00:07:00" "1899-12-31 00:07:00" ...

# Per-column summary statistics.
# NOTE(review): the output below shows a minimum birth_year of 1759, which is
# implausible; the column likely needs cleaning before any age-based analysis.
summary(all_trips)

##     trip_id           start_time                    
##  Min.   :21742443   Min.   :2019-01-01 00:04:00.00  
##  1st Qu.:22434746   1st Qu.:2019-04-24 18:22:00.00  
##  Median :23670383   Median :2019-07-10 18:28:00.00  
##  Mean   :23717363   Mean   :2019-07-07 13:47:48.33  
##  3rd Qu.:25346030   3rd Qu.:2019-10-08 23:19:30.00  
##  Max.   :25962904   Max.   :2019-12-31 23:57:00.00  
##     end_time                         bike_id     start_station_id
##  Min.   :2019-01-01 00:11:00.00   Min.   :   1   Min.   :  2.0   
##  1st Qu.:2019-04-24 18:35:00.00   1st Qu.:1738   1st Qu.: 80.0   
##  Median :2019-07-10 18:45:00.00   Median :3475   Median :175.0   
##  Mean   :2019-07-07 14:02:26.65   Mean   :3401   Mean   :204.3   
##  3rd Qu.:2019-10-08 23:31:30.00   3rd Qu.:5086   3rd Qu.:291.0   
##  Max.   :2020-01-01 17:25:00.00   Max.   :6946   Max.   :673.0   
##  start_station_name end_station_id  end_station_name    user_type        
##  Length:2136815     Min.   :  2.0   Length:2136815     Length:2136815    
##  Class :character   1st Qu.: 80.0   Class :character   Class :character  
##  Mode  :character   Median :175.0   Mode  :character   Mode  :character  
##                     Mean   :204.7                                        
##                     3rd Qu.:291.0                                        
##                     Max.   :673.0                                        
##     gender            birth_year    day_of_week   
##  Length:2136815     Min.   :1759   Min.   :1.000  
##  Class :character   1st Qu.:1978   1st Qu.:2.000  
##  Mode  :character   Median :1987   Median :4.000  
##                     Mean   :1984   Mean   :3.973  
##                     3rd Qu.:1992   3rd Qu.:6.000  
##                     Max.   :2014   Max.   :7.000  
##   ride_length                    
##  Min.   :1899-12-31 00:01:00.00  
##  1st Qu.:1899-12-31 00:06:00.00  
##  Median :1899-12-31 00:10:00.00  
##  Mean   :1899-12-31 00:14:38.30  
##  3rd Qu.:1899-12-31 00:17:00.00  
##  Max.   :1899-12-31 23:57:00.00

# Derive calendar fields from each ride's start time so rides can be
# aggregated by date, month, day, year, or weekday name.
# day_of_week replaces the numeric code imported from Excel with the weekday
# name (e.g. "Tuesday").

all_trips <- all_trips %>%
  mutate(
    date = as.Date(start_time),
    month = format(date, "%m"),
    day = format(date, "%d"),
    year = format(date, "%Y"),
    day_of_week = format(date, "%A")
  )

# Inspect the structure of the columns: confirms that date, month, day and
# year were added and that day_of_week is now a character weekday name

str(all_trips)

## tibble [2,136,815 × 17] (S3: tbl_df/tbl/data.frame)
##  $ trip_id           : num [1:2136815] 21742443 21742444 21742445 21742446 21742447 ...
##  $ start_time        : POSIXct[1:2136815], format: "2019-01-01 00:04:00" "2019-01-01 00:08:00" ...
##  $ end_time          : POSIXct[1:2136815], format: "2019-01-01 00:11:00" "2019-01-01 00:15:00" ...
##  $ bike_id           : num [1:2136815] 2167 4386 1524 252 1170 ...
##  $ start_station_id  : num [1:2136815] 199 44 15 123 173 98 98 211 150 268 ...
##  $ start_station_name: chr [1:2136815] "Wabash Ave & Grand Ave" "State St & Randolph St" "Racine Ave & 18th St" "California Ave & Milwaukee Ave" ...
##  $ end_station_id    : num [1:2136815] 84 624 644 176 35 49 49 142 148 141 ...
##  $ end_station_name  : chr [1:2136815] "Milwaukee Ave & Grand Ave" "Dearborn St & Van Buren St (*)" "Western Ave & Fillmore St (*)" "Clark St & Elm St" ...
##  $ user_type         : chr [1:2136815] "Subscriber" "Subscriber" "Subscriber" "Subscriber" ...
##  $ gender            : chr [1:2136815] "Male" "Female" "Female" "Male" ...
##  $ birth_year        : num [1:2136815] 1989 1990 1994 1993 1994 ...
##  $ day_of_week       : chr [1:2136815] "Tuesday" "Tuesday" "Tuesday" "Tuesday" ...
##  $ ride_length       : POSIXct[1:2136815], format: "1899-12-31 00:07:00" "1899-12-31 00:07:00" ...
##  $ date              : Date[1:2136815], format: "2019-01-01" "2019-01-01" ...
##  $ month             : chr [1:2136815] "01" "01" "01" "01" ...
##  $ day               : chr [1:2136815] "01" "01" "01" "01" ...
##  $ year              : chr [1:2136815] "2019" "2019" "2019" "2019" ...

# Recalculate "ride_length" as the elapsed time between the start and end of
# each ride, in minutes. This overwrites the ride_length column imported from
# Excel. The units are set explicitly: difftime()'s default (units = "auto")
# chooses the unit from the data, which is why the results below print in
# minutes rather than seconds, and it could change silently with other data.

all_trips$ride_length <- difftime(
  all_trips$end_time,
  all_trips$start_time,
  units = "mins"
)

Perform insightful analyses

#=============================
# PERFORM DESCRIPTIVE ANALYSES
#=============================

# NOTE(review): summary() on this difftime vector only reports
# Length/Class/Mode (see output below), so it gives no distribution
# information; the individual statistics follow.
summary(all_trips$ride_length)

##   Length    Class     Mode 
##  2136815 difftime  numeric

mean(all_trips$ride_length) # straight average (total ride length / rides)

## Time difference of 14.63847 mins

median(all_trips$ride_length) # midpoint of the ride lengths sorted ascending

## Time difference of 10 mins

max(all_trips$ride_length) # longest ride

## Time difference of 1437 mins

min(all_trips$ride_length) # shortest ride

## Time difference of 1 mins

# Compare subscribers (annual members) and customers (non-members):
# mean, median, longest and shortest ride length for each user type.
# NOTE(review): because the formula spells out all_trips$..., the result
# columns are labelled "all_trips$user_type" and "all_trips$ride_length".

aggregate(all_trips$ride_length ~ all_trips$user_type, FUN = mean)

##   all_trips$user_type all_trips$ride_length
## 1            Customer         36.07204 mins
## 2          Subscriber         12.55834 mins

aggregate(all_trips$ride_length ~ all_trips$user_type, FUN = median)

##   all_trips$user_type all_trips$ride_length
## 1            Customer               23 mins
## 2          Subscriber                9 mins

aggregate(all_trips$ride_length ~ all_trips$user_type, FUN = max)

##   all_trips$user_type all_trips$ride_length
## 1            Customer             1437 mins
## 2          Subscriber             1433 mins

aggregate(all_trips$ride_length ~ all_trips$user_type, FUN = min)

##   all_trips$user_type all_trips$ride_length
## 1            Customer                1 mins
## 2          Subscriber                1 mins

# See the average ride time by each day for subscribers vs customers.
# day_of_week is still a character column here, so the days come out in
# alphabetical order (see output below).

aggregate(all_trips$ride_length ~ all_trips$user_type + all_trips$day_of_week, FUN = mean)

##    all_trips$user_type all_trips$day_of_week all_trips$ride_length
## 1             Customer                Friday         35.25310 mins
## 2           Subscriber                Friday         12.09061 mins
## 3             Customer                Monday         35.31018 mins
## 4           Subscriber                Monday         12.30750 mins
## 5             Customer              Saturday         38.79385 mins
## 6           Subscriber              Saturday         13.94203 mins
## 7             Customer                Sunday         38.63173 mins
## 8           Subscriber                Sunday         14.04006 mins
## 9             Customer              Thursday         34.08715 mins
## 10          Subscriber              Thursday         12.26093 mins
## 11            Customer               Tuesday         33.20637 mins
## 12          Subscriber               Tuesday         12.21851 mins
## 13            Customer             Wednesday         32.83618 mins
## 14          Subscriber             Wednesday         12.34527 mins

# Turn day_of_week into an ordered factor running Sunday to Saturday so that
# grouped output follows calendar order instead of alphabetical order

all_trips$day_of_week <- factor(
  all_trips$day_of_week,
  levels = c(
    "Sunday", "Monday", "Tuesday", "Wednesday",
    "Thursday", "Friday", "Saturday"
  ),
  ordered = TRUE
)

# Rerun average ride time per day by user type; with day_of_week now an
# ordered factor the rows print Sunday through Saturday

aggregate(all_trips$ride_length ~ all_trips$user_type + all_trips$day_of_week, FUN = mean)

##    all_trips$user_type all_trips$day_of_week all_trips$ride_length
## 1             Customer                Sunday         38.63173 mins
## 2           Subscriber                Sunday         14.04006 mins
## 3             Customer                Monday         35.31018 mins
## 4           Subscriber                Monday         12.30750 mins
## 5             Customer               Tuesday         33.20637 mins
## 6           Subscriber               Tuesday         12.21851 mins
## 7             Customer             Wednesday         32.83618 mins
## 8           Subscriber             Wednesday         12.34527 mins
## 9             Customer              Thursday         34.08715 mins
## 10          Subscriber              Thursday         12.26093 mins
## 11            Customer                Friday         35.25310 mins
## 12          Subscriber                Friday         12.09061 mins
## 13            Customer              Saturday         38.79385 mins
## 14          Subscriber              Saturday         13.94203 mins

# Tabulate ridership by user type and weekday: number of rides and mean ride
# length for every user type / weekday combination
all_trips %>%
  group_by(user_type, weekday = wday(start_time, label = TRUE)) %>%
  summarise(
    number_of_rides = n(),
    average_duration = mean(ride_length)
  ) %>%
  arrange(user_type, weekday)

## `summarise()` has grouped output by 'user_type'. You can override using the
## `.groups` argument.

## # A tibble: 14 × 4
## # Groups:   user_type [2]
##    user_type  weekday number_of_rides average_duration
##    <chr>      <ord>             <int> <drtn>          
##  1 Customer   Sun               38146 38.63173 mins   
##  2 Customer   Mon               23493 35.31018 mins   
##  3 Customer   Tue               21408 33.20637 mins   
##  4 Customer   Wed               20327 32.83618 mins   
##  5 Customer   Thu               21160 34.08715 mins   
##  6 Customer   Fri               24338 35.25310 mins   
##  7 Customer   Sat               40161 38.79385 mins   
##  8 Subscriber Sun              170752 14.04006 mins   
##  9 Subscriber Mon              316572 12.30750 mins   
## 10 Subscriber Tue              349273 12.21851 mins   
## 11 Subscriber Wed              330483 12.34527 mins   
## 12 Subscriber Thu              308413 12.26093 mins   
## 13 Subscriber Fri              295055 12.09061 mins   
## 14 Subscriber Sat              177234 13.94203 mins

# Chart number of rides by user type and weekday
all_trips %>%
  # count() returns an ungrouped tibble, so only the ride count needed for
  # this chart is computed and no summarise() grouping message is emitted
  count(
    user_type,
    weekday = wday(start_time, label = TRUE),
    name = "number_of_rides"
  ) %>%
  ggplot(aes(x = weekday, y = number_of_rides, fill = user_type)) +
  geom_col(position = "dodge") +
  # Print the axis counts in full with thousands separators (e.g. 300,000);
  # done with base format() so no extra package has to be attached
  scale_y_continuous(
    labels = function(x) {
      format(x, big.mark = ",", scientific = FALSE, trim = TRUE)
    }
  ) +
  labs(y = "number of rides")
  
# Chart average ride duration (in minutes) by user type and weekday
all_trips %>%
  mutate(weekday = wday(start_time, label = TRUE)) %>%
  group_by(user_type, weekday) %>%
  summarise(
    # Convert the difftime to plain numeric minutes: ggplot2 has no default
    # scale for difftime, and explicit units keep the axis label accurate
    average_duration = mean(as.numeric(ride_length, units = "mins")),
    # Return an ungrouped result; this also avoids the grouping message
    .groups = "drop"
  ) %>%
  ggplot(aes(x = weekday, y = average_duration, fill = user_type)) +
  geom_col(position = "dodge") +
  labs(y = "average duration (minutes)")

Create and export summary file

#====================
# EXPORT SUMMARY FILE
#====================

# Mean ride length for each user type / weekday combination.
# (The object keeps its original name "counts" although it holds averages.)
# Supplying data = all_trips gives the result clean column names
# (user_type, day_of_week, ride_length) instead of "all_trips$..." headers.
counts <- aggregate(
  ride_length ~ user_type + day_of_week,
  data = all_trips,
  FUN = mean
)

# row.names = FALSE keeps the meaningless row-number column out of the CSV
write.csv(counts, file = "avg_ride_length.csv", row.names = FALSE)

Bike_Share_2019

Corey Saari

2025-01-17

Environment creation

Notes: The critical packages are loaded below. ‘readxl’ is essential for importing the Excel spreadsheets that become the data frames.

Read in the Excel spreadsheets. Because of the size of the files, MS Excel was a better choice than Google Sheets.

Perform an initial review for data discrepancies, then merge all four quarterly subsets into one data frame.

Preparing for data analysis

Perform insightful analyses

Create and export summary file