My purpose of this script was to consolidate downloaded Cyclistic data into a single dataframe and then conduct simple analysis to help answer the key question:

“In what ways do members and casual riders use Cyclistic bikes differently?”

First I installed and load the required packages:

# install.packages('tidyverse')
# install.packages('lubridate')
library(tidyverse) # for data import and wrangling
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.1     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the ]8;;http://conflicted.r-lib.org/conflicted package]8;; to force all conflicts to become errors
library(lubridate) # for date functions
library(ggplot2) # for visualization

STEP 1: COLLECT DATA

I uploaded datasets here:

q2_2019 <- read.csv(file = "2019_Q2.csv")
q3_2019 <- read.csv(file = "2019_Q3.csv")
q4_2019 <- read.csv(file = "2019_Q4.csv")
q1_2020 <- read.csv(file = "2020_Q1.csv")

STEP 2: WRANGLE DATA AND COMBINE INTO A SINGLE FILE

I compared column names of each of the files.

While the names didn’t have to be in the same order, they needed to match perfectly before I could use a command to join them into one file.

colnames(x = q2_2019)
##  [1] "X01...Rental.Details.Rental.ID"                   
##  [2] "X01...Rental.Details.Local.Start.Time"            
##  [3] "X01...Rental.Details.Local.End.Time"              
##  [4] "X01...Rental.Details.Bike.ID"                     
##  [5] "X01...Rental.Details.Duration.In.Seconds.Uncapped"
##  [6] "X03...Rental.Start.Station.ID"                    
##  [7] "X03...Rental.Start.Station.Name"                  
##  [8] "X02...Rental.End.Station.ID"                      
##  [9] "X02...Rental.End.Station.Name"                    
## [10] "User.Type"                                        
## [11] "Member.Gender"                                    
## [12] "X05...Member.Details.Member.Birthday.Year"
colnames(x = q3_2019)
##  [1] "trip_id"           "start_time"        "end_time"         
##  [4] "bikeid"            "tripduration"      "from_station_id"  
##  [7] "from_station_name" "to_station_id"     "to_station_name"  
## [10] "usertype"          "gender"            "birthyear"
colnames(x = q4_2019)
##  [1] "trip_id"           "start_time"        "end_time"         
##  [4] "bikeid"            "tripduration"      "from_station_id"  
##  [7] "from_station_name" "to_station_id"     "to_station_name"  
## [10] "usertype"          "gender"            "birthyear"
colnames(x = q1_2020)
##  [1] "ride_id"            "rideable_type"      "started_at"        
##  [4] "ended_at"           "start_station_name" "start_station_id"  
##  [7] "end_station_name"   "end_station_id"     "start_lat"         
## [10] "start_lng"          "end_lat"            "end_lng"           
## [13] "member_casual"

I renamed columns to make them consistent with q1_2020 (as this will be the supposed going-forward table design for Cyclistic)

q2_2019 <- rename(
  .data = q2_2019,
  ride_id = "X01...Rental.Details.Rental.ID",
  rideable_type = "X01...Rental.Details.Bike.ID",
  started_at = "X01...Rental.Details.Local.Start.Time",
  ended_at = "X01...Rental.Details.Local.End.Time",
  start_station_name = "X03...Rental.Start.Station.Name",
  start_station_id = "X03...Rental.Start.Station.ID",
  end_station_name = "X02...Rental.End.Station.Name",
  end_station_id = "X02...Rental.End.Station.ID",
  member_casual = "User.Type"
)

q3_2019 <- rename(
  .data = q3_2019,
  ride_id = trip_id,
  rideable_type = bikeid,
  started_at = start_time,
  ended_at = end_time,
  start_station_name = from_station_name,
  start_station_id = from_station_id,
  end_station_name = to_station_name,
  end_station_id = to_station_id,
  member_casual = usertype
)

q4_2019 <- rename(
  .data = q4_2019,
  ride_id = trip_id,
  rideable_type = bikeid,
  started_at = start_time,
  ended_at = end_time,
  start_station_name = from_station_name,
  start_station_id = from_station_id,
  end_station_name = to_station_name,
  end_station_id = to_station_id,
  member_casual = usertype
)

q2_2019 <- rename(
  .data = q2_2019,
  tripduration = "X01...Rental.Details.Duration.In.Seconds.Uncapped",
  birthyear = "X05...Member.Details.Member.Birthday.Year",
  gender = "Member.Gender"
)

I inspected the dataframes and looked for incongruencies:

str(object = q2_2019)
## 'data.frame':    1108163 obs. of  12 variables:
##  $ ride_id           : int  22178529 22178530 22178531 22178532 22178533 22178534 22178535 22178536 22178537 22178538 ...
##  $ started_at        : chr  "2019-04-01 00:02:22" "2019-04-01 00:03:02" "2019-04-01 00:11:07" "2019-04-01 00:13:01" ...
##  $ ended_at          : chr  "2019-04-01 00:09:48" "2019-04-01 00:20:30" "2019-04-01 00:15:19" "2019-04-01 00:18:58" ...
##  $ rideable_type     : int  6251 6226 5649 4151 3270 3123 6418 4513 3280 5534 ...
##  $ tripduration      : chr  "446.0" "1,048.0" "252.0" "357.0" ...
##  $ start_station_id  : int  81 317 283 26 202 420 503 260 211 211 ...
##  $ start_station_name: chr  "Daley Center Plaza" "Wood St & Taylor St" "LaSalle St & Jackson Blvd" "McClurg Ct & Illinois St" ...
##  $ end_station_id    : int  56 59 174 133 129 426 500 499 211 211 ...
##  $ end_station_name  : chr  "Desplaines St & Kinzie St" "Wabash Ave & Roosevelt Rd" "Canal St & Madison St" "Kingsbury St & Kinzie St" ...
##  $ member_casual     : chr  "Subscriber" "Subscriber" "Subscriber" "Subscriber" ...
##  $ gender            : chr  "Male" "Female" "Male" "Male" ...
##  $ birthyear         : int  1975 1984 1990 1993 1992 1999 1969 1991 NA NA ...
str(object = q3_2019)
## 'data.frame':    1640718 obs. of  12 variables:
##  $ ride_id           : int  23479388 23479389 23479390 23479391 23479392 23479393 23479394 23479395 23479396 23479397 ...
##  $ started_at        : chr  "2019-07-01 00:00:27" "2019-07-01 00:01:16" "2019-07-01 00:01:48" "2019-07-01 00:02:07" ...
##  $ ended_at          : chr  "2019-07-01 00:20:41" "2019-07-01 00:18:44" "2019-07-01 00:27:42" "2019-07-01 00:27:10" ...
##  $ rideable_type     : int  3591 5353 6180 5540 6014 4941 3770 5442 2957 6091 ...
##  $ tripduration      : chr  "1,214.0" "1,048.0" "1,554.0" "1,503.0" ...
##  $ start_station_id  : int  117 381 313 313 168 300 168 313 43 43 ...
##  $ start_station_name: chr  "Wilton Ave & Belmont Ave" "Western Ave & Monroe St" "Lakeview Ave & Fullerton Pkwy" "Lakeview Ave & Fullerton Pkwy" ...
##  $ end_station_id    : int  497 203 144 144 62 232 62 144 195 195 ...
##  $ end_station_name  : chr  "Kimball Ave & Belmont Ave" "Western Ave & 21st St" "Larrabee St & Webster Ave" "Larrabee St & Webster Ave" ...
##  $ member_casual     : chr  "Subscriber" "Customer" "Customer" "Customer" ...
##  $ gender            : chr  "Male" "" "" "" ...
##  $ birthyear         : int  1992 NA NA NA NA 1990 NA NA NA NA ...
str(object = q4_2019)
## 'data.frame':    704054 obs. of  12 variables:
##  $ ride_id           : int  25223640 25223641 25223642 25223643 25223644 25223645 25223646 25223647 25223648 25223649 ...
##  $ started_at        : chr  "2019-10-01 00:01:39" "2019-10-01 00:02:16" "2019-10-01 00:04:32" "2019-10-01 00:04:32" ...
##  $ ended_at          : chr  "2019-10-01 00:17:20" "2019-10-01 00:06:34" "2019-10-01 00:18:43" "2019-10-01 00:43:43" ...
##  $ rideable_type     : int  2215 6328 3003 3275 5294 1891 1061 1274 6011 2957 ...
##  $ tripduration      : chr  "940.0" "258.0" "850.0" "2,350.0" ...
##  $ start_station_id  : int  20 19 84 313 210 156 84 156 156 336 ...
##  $ start_station_name: chr  "Sheffield Ave & Kingsbury St" "Throop (Loomis) St & Taylor St" "Milwaukee Ave & Grand Ave" "Lakeview Ave & Fullerton Pkwy" ...
##  $ end_station_id    : int  309 241 199 290 382 226 142 463 463 336 ...
##  $ end_station_name  : chr  "Leavitt St & Armitage Ave" "Morgan St & Polk St" "Wabash Ave & Grand Ave" "Kedzie Ave & Palmer Ct" ...
##  $ member_casual     : chr  "Subscriber" "Subscriber" "Subscriber" "Subscriber" ...
##  $ gender            : chr  "Male" "Male" "Female" "Male" ...
##  $ birthyear         : int  1987 1998 1991 1990 1987 1994 1991 1995 1993 NA ...
str(object = q1_2020)
## 'data.frame':    426887 obs. of  13 variables:
##  $ ride_id           : chr  "EACB19130B0CDA4A" "8FED874C809DC021" "789F3C21E472CA96" "C9A388DAC6ABF313" ...
##  $ rideable_type     : chr  "docked_bike" "docked_bike" "docked_bike" "docked_bike" ...
##  $ started_at        : chr  "2020-01-21 20:06:59" "2020-01-30 14:22:39" "2020-01-09 19:29:26" "2020-01-06 16:17:07" ...
##  $ ended_at          : chr  "2020-01-21 20:14:30" "2020-01-30 14:26:22" "2020-01-09 19:32:17" "2020-01-06 16:25:56" ...
##  $ start_station_name: chr  "Western Ave & Leland Ave" "Clark St & Montrose Ave" "Broadway & Belmont Ave" "Clark St & Randolph St" ...
##  $ start_station_id  : int  239 234 296 51 66 212 96 96 212 38 ...
##  $ end_station_name  : chr  "Clark St & Leland Ave" "Southport Ave & Irving Park Rd" "Wilton Ave & Belmont Ave" "Fairbanks Ct & Grand Ave" ...
##  $ end_station_id    : int  326 318 117 24 212 96 212 212 96 100 ...
##  $ start_lat         : num  42 42 41.9 41.9 41.9 ...
##  $ start_lng         : num  -87.7 -87.7 -87.6 -87.6 -87.6 ...
##  $ end_lat           : num  42 42 41.9 41.9 41.9 ...
##  $ end_lng           : num  -87.7 -87.7 -87.7 -87.6 -87.6 ...
##  $ member_casual     : chr  "member" "member" "member" "member" ...

I converted ride_id and rideable_type to character so that they can stack correctly:

q2_2019 <- mutate(
  .data = q2_2019, ride_id = as.character(x = ride_id),
  rideable_type = as.character(rideable_type)
)
q3_2019 <- mutate(
  .data = q3_2019, ride_id = as.character(x = ride_id),
  rideable_type = as.character(rideable_type)
)
q4_2019 <- mutate(
  .data = q4_2019, ride_id = as.character(x = ride_id),
  rideable_type = as.character(rideable_type)
)

I stacked individual quarter’s data frames into one big data frame:

all_trips <- bind_rows(q2_2019, q3_2019, q4_2019, q1_2020)

I removed lat, long, birthyear, gender and tripduration fields as this data had been dropped beginning in 2020:

all_trips <- all_trips %>%
  select(-c(
    start_lat,
    start_lng,
    end_lat,
    end_lng,
    birthyear,
    gender,
    tripduration
  ))

STEP 3: CLEAN UP AND ADD DATA TO PREPARE FOR ANALYSIS

I inspected the new table that had been created:

colnames(all_trips) # List of column names
## [1] "ride_id"            "started_at"         "ended_at"          
## [4] "rideable_type"      "start_station_id"   "start_station_name"
## [7] "end_station_id"     "end_station_name"   "member_casual"
nrow(all_trips) # How many rows are in data frame?
## [1] 3879822
dim(all_trips) # Dimensions of the data frame?
## [1] 3879822       9
head(all_trips)
##    ride_id          started_at            ended_at rideable_type
## 1 22178529 2019-04-01 00:02:22 2019-04-01 00:09:48          6251
## 2 22178530 2019-04-01 00:03:02 2019-04-01 00:20:30          6226
## 3 22178531 2019-04-01 00:11:07 2019-04-01 00:15:19          5649
## 4 22178532 2019-04-01 00:13:01 2019-04-01 00:18:58          4151
## 5 22178533 2019-04-01 00:19:26 2019-04-01 00:36:13          3270
## 6 22178534 2019-04-01 00:19:39 2019-04-01 00:23:56          3123
##   start_station_id        start_station_name end_station_id
## 1               81        Daley Center Plaza             56
## 2              317       Wood St & Taylor St             59
## 3              283 LaSalle St & Jackson Blvd            174
## 4               26  McClurg Ct & Illinois St            133
## 5              202      Halsted St & 18th St            129
## 6              420       Ellis Ave & 55th St            426
##            end_station_name member_casual
## 1 Desplaines St & Kinzie St    Subscriber
## 2 Wabash Ave & Roosevelt Rd    Subscriber
## 3     Canal St & Madison St    Subscriber
## 4  Kingsbury St & Kinzie St    Subscriber
## 5 Blue Island Ave & 18th St    Subscriber
## 6       Ellis Ave & 60th St    Subscriber
tail(all_trips)
##                  ride_id          started_at            ended_at rideable_type
## 3879817 6F4D221BDDFD943F 2020-03-10 10:40:27 2020-03-10 10:40:29   docked_bike
## 3879818 ADDAA33CEBCAE733 2020-03-10 10:40:06 2020-03-10 10:40:07   docked_bike
## 3879819 82B10FA3994BC66A 2020-03-07 15:25:55 2020-03-07 16:14:03   docked_bike
## 3879820 AA0D5AAA0B59C8AA 2020-03-01 13:12:38 2020-03-01 13:38:29   docked_bike
## 3879821 3296360A7BC20FB8 2020-03-07 18:02:45 2020-03-07 18:13:18   docked_bike
## 3879822 064EC7698E4FF9B3 2020-03-08 13:03:57 2020-03-08 13:32:27   docked_bike
##         start_station_id        start_station_name end_station_id
## 3879817              675                     HQ QR            675
## 3879818              675                     HQ QR            675
## 3879819              161     Rush St & Superior St            240
## 3879820              141    Clark St & Lincoln Ave            210
## 3879821              672 Franklin St & Illinois St            264
## 3879822              110     Dearborn St & Erie St             85
##                     end_station_name member_casual
## 3879817                        HQ QR        casual
## 3879818                        HQ QR        casual
## 3879819 Sheridan Rd & Irving Park Rd        member
## 3879820    Ashland Ave & Division St        casual
## 3879821 Stetson Ave & South Water St        member
## 3879822        Michigan Ave & Oak St        casual
str(all_trips) # See list of columns and data types
## 'data.frame':    3879822 obs. of  9 variables:
##  $ ride_id           : chr  "22178529" "22178530" "22178531" "22178532" ...
##  $ started_at        : chr  "2019-04-01 00:02:22" "2019-04-01 00:03:02" "2019-04-01 00:11:07" "2019-04-01 00:13:01" ...
##  $ ended_at          : chr  "2019-04-01 00:09:48" "2019-04-01 00:20:30" "2019-04-01 00:15:19" "2019-04-01 00:18:58" ...
##  $ rideable_type     : chr  "6251" "6226" "5649" "4151" ...
##  $ start_station_id  : int  81 317 283 26 202 420 503 260 211 211 ...
##  $ start_station_name: chr  "Daley Center Plaza" "Wood St & Taylor St" "LaSalle St & Jackson Blvd" "McClurg Ct & Illinois St" ...
##  $ end_station_id    : int  56 59 174 133 129 426 500 499 211 211 ...
##  $ end_station_name  : chr  "Desplaines St & Kinzie St" "Wabash Ave & Roosevelt Rd" "Canal St & Madison St" "Kingsbury St & Kinzie St" ...
##  $ member_casual     : chr  "Subscriber" "Subscriber" "Subscriber" "Subscriber" ...
summary(all_trips) # Statistical summary of data. Mainly for numerics
##    ride_id           started_at          ended_at         rideable_type     
##  Length:3879822     Length:3879822     Length:3879822     Length:3879822    
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##  start_station_id start_station_name end_station_id  end_station_name  
##  Min.   :  1.0    Length:3879822     Min.   :  1.0   Length:3879822    
##  1st Qu.: 77.0    Class :character   1st Qu.: 77.0   Class :character  
##  Median :174.0    Mode  :character   Median :174.0   Mode  :character  
##  Mean   :202.9                       Mean   :203.8                     
##  3rd Qu.:291.0                       3rd Qu.:291.0                     
##  Max.   :675.0                       Max.   :675.0                     
##                                      NA's   :1                         
##  member_casual     
##  Length:3879822    
##  Class :character  
##  Mode  :character  
##                    
##                    
##                    
## 

There were a few problems I needed to fix:

  1. In the “member_casual” column, there were two names for members (“member” and “Subscriber”) and two names for casual riders (“Customer” and “casual”). I needed to consolidate that from four to two labels.

  2. The data could only be aggregated at the ride-level, which was too granular. I decided to add some additional columns of data – such as day, month, year – that provided additional opportunities to aggregate the data.

  3. I also decided to add a calculated field for length of ride since the 2020Q1 data did not have the ‘tripduration’ column. I added “ride_length” to the entire dataframe for consistency.

  4. There were some rides where ‘tripduration’ shows up as negative, including several hundred rides where Cyclistic took bikes out of circulation for Quality Control reasons. I deleted those rides.

Tackling the first Problem:

In the “member_casual” column, I replaced “Subscriber” with “member” and “Customer” with “casual”. Before 2020, Cyclistic used different labels for these two types of riders. I wanted to make my dataframe consistent with their current nomenclature.

I began by seeing how many observations fall under each usertype.

all_trips %>%
  count(member_casual) %>%
  arrange(-n)
##   member_casual       n
## 1    Subscriber 2595461
## 2      Customer  857474
## 3        member  378407
## 4        casual   48480

I reassigned to the desired values:

all_trips <- all_trips %>%
  mutate(member_casual = case_match(
    .x = member_casual,
    "Subscriber" ~ "member",
    "Customer" ~ "casual")
    )

I checked to make sure the proper number of observations were reassigned:

all_trips %>%
  count(member_casual) %>%
  arrange(-n)
##   member_casual       n
## 1        member 2595461
## 2        casual  857474
## 3          <NA>  426887

Tackling the second problem:

I added columns that list the date, month, day, and year of each ride. This allowed me to aggregate ride data for each month, day, or year. Before completing these operations I could only aggregate at the ride level.

all_trips$date <- as.Date(x = all_trips$started_at)
all_trips$month <- format(as.Date(all_trips$date), "%m")
all_trips$day <- format(as.Date(all_trips$date), "%d")
all_trips$year <- format(as.Date(all_trips$date), "%Y")
all_trips$day_of_week <- format(as.Date(all_trips$date), "%A")

Tackling the third problem:

First I converted data type of ‘started_at column’ and ‘ended_at column’ from character to POSIXct:

all_trips$started_at <- as.POSIXct(all_trips$started_at, format = "%Y-%m-%d %H:%M:%S")
all_trips$ended_at <- as.POSIXct(all_trips$ended_at, format = "%Y-%m-%d %H:%M:%S")

Then I added a “ride_length” calculation to all_trips (in seconds):

all_trips$ride_length <- difftime(
  all_trips$ended_at,
  all_trips$started_at
)

I inspected the structure of the columns:

str(all_trips)
## 'data.frame':    3879822 obs. of  15 variables:
##  $ ride_id           : chr  "22178529" "22178530" "22178531" "22178532" ...
##  $ started_at        : POSIXct, format: "2019-04-01 00:02:22" "2019-04-01 00:03:02" ...
##  $ ended_at          : POSIXct, format: "2019-04-01 00:09:48" "2019-04-01 00:20:30" ...
##  $ rideable_type     : chr  "6251" "6226" "5649" "4151" ...
##  $ start_station_id  : int  81 317 283 26 202 420 503 260 211 211 ...
##  $ start_station_name: chr  "Daley Center Plaza" "Wood St & Taylor St" "LaSalle St & Jackson Blvd" "McClurg Ct & Illinois St" ...
##  $ end_station_id    : int  56 59 174 133 129 426 500 499 211 211 ...
##  $ end_station_name  : chr  "Desplaines St & Kinzie St" "Wabash Ave & Roosevelt Rd" "Canal St & Madison St" "Kingsbury St & Kinzie St" ...
##  $ member_casual     : chr  "member" "member" "member" "member" ...
##  $ date              : Date, format: "2019-04-01" "2019-04-01" ...
##  $ month             : chr  "04" "04" "04" "04" ...
##  $ day               : chr  "01" "01" "01" "01" ...
##  $ year              : chr  "2019" "2019" "2019" "2019" ...
##  $ day_of_week       : chr  "Monday" "Monday" "Monday" "Monday" ...
##  $ ride_length       : 'difftime' num  446 1048 252 357 ...
##   ..- attr(*, "units")= chr "secs"

Tackle the fourth problem:

I removed ‘bad’ data. The dataframe included a few hundred entries when bikes were taken out of docks and checked for quality by Cyclistic or ride_length was negative.

I created a new version of the dataframe (v2) since data was being removed.

all_trips_v2 <- all_trips[!(all_trips$start_station_name == "HQ QR" | all_trips$ride_length < 0), ]

STEP 4: CONDUCT DESCRIPTIVE ANALYSIS

Here I provided some descriptive analysis on ride_length (all figures in seconds)

mean(all_trips_v2$ride_length, na.rm = TRUE) # straight average (total ride length / rides)
## Time difference of 1479.266 secs
median(all_trips_v2$ride_length, na.rm = TRUE) # midpoint number in the ascending array of ride lengths
## Time difference of 712 secs
max(all_trips_v2$ride_length, na.rm = TRUE) # longest ride
## Time difference of 9383424 secs
min(all_trips_v2$ride_length, na.rm = TRUE) # shortest ride
## Time difference of 1 secs

Here I compared members and casual users:

all_trips_v2 %>%
  group_by(member_casual) %>%
  summarise(mean_ride_length = mean(ride_length))
## # A tibble: 3 × 2
##   member_casual mean_ride_length
##   <chr>         <drtn>          
## 1 casual        3413.6001 secs  
## 2 member         863.1709 secs  
## 3 <NA>                 NA secs
all_trips_v2 %>%
  group_by(member_casual) %>%
  summarise(median_ride_length = median(ride_length))
## # A tibble: 3 × 2
##   member_casual median_ride_length
##   <chr>         <drtn>            
## 1 casual        1554 secs         
## 2 member         601 secs         
## 3 <NA>            NA secs
all_trips_v2 %>%
  group_by(member_casual) %>%
  summarise(max_ride_length = max(ride_length))
## # A tibble: 3 × 2
##   member_casual max_ride_length
##   <chr>         <drtn>         
## 1 casual        8582302 secs   
## 2 member        9060234 secs   
## 3 <NA>               NA secs
all_trips_v2 %>%
  group_by(member_casual) %>%
  summarise(min_ride_length = min(ride_length))
## # A tibble: 3 × 2
##   member_casual min_ride_length
##   <chr>         <drtn>         
## 1 casual        61 secs        
## 2 member        61 secs        
## 3 <NA>          NA secs

I checked the average ride time by each day for members vs casual users

all_trips_v2 %>% 
  group_by(member_casual, day_of_week) %>% 
  summarise(mean = mean(ride_length))
## `summarise()` has grouped output by 'member_casual'. You can override using the
## `.groups` argument.
## # A tibble: 22 × 3
## # Groups:   member_casual [3]
##    member_casual day_of_week mean          
##    <chr>         <chr>       <drtn>        
##  1 casual        Friday      3611.5732 secs
##  2 casual        Monday      3281.7303 secs
##  3 casual        Saturday    3233.4910 secs
##  4 casual        Sunday      3391.1785 secs
##  5 casual        Thursday    3466.1390 secs
##  6 casual        Tuesday     3477.3102 secs
##  7 casual        Wednesday   3634.7524 secs
##  8 member        Friday       833.9273 secs
##  9 member        Monday       852.2588 secs
## 10 member        Saturday     973.9411 secs
## # … with 12 more rows

I noticed that the days of the week were out of order. So I fixed that.

all_trips_v2$day_of_week <-
  ordered(all_trips_v2$day_of_week, levels = c("Sunday", "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday"))

Again I run the average ride time by each day for members vs casual users to check if the problem had been addressed.

all_trips_v2 %>% 
  group_by(member_casual, day_of_week) %>% 
  summarise(mean = mean(ride_length))
## `summarise()` has grouped output by 'member_casual'. You can override using the
## `.groups` argument.
## # A tibble: 22 × 3
## # Groups:   member_casual [3]
##    member_casual day_of_week mean          
##    <chr>         <ord>       <drtn>        
##  1 casual        Sunday      3391.1785 secs
##  2 casual        Monday      3281.7303 secs
##  3 casual        Tuesday     3477.3102 secs
##  4 casual        Wednesday   3634.7524 secs
##  5 casual        Thursday    3466.1390 secs
##  6 casual        Friday      3611.5732 secs
##  7 casual        Saturday    3233.4910 secs
##  8 member        Sunday       915.4380 secs
##  9 member        Monday       852.2588 secs
## 10 member        Tuesday      847.4632 secs
## # … with 12 more rows

I analyzed ridership data by type and weekday:

all_trips_v2 %>%
  mutate(weekday = wday(started_at, label = TRUE)) %>% # creates weekday field using wday()
  group_by(member_casual, weekday) %>% # groups by usertype and weekday
  summarise(
    number_of_rides = n(),
    average_duration = mean(ride_length)
  ) %>% # calculates the number of rides and average duration
  arrange(member_casual, weekday) # sorts
## `summarise()` has grouped output by 'member_casual'. You can override using the
## `.groups` argument.
## # A tibble: 22 × 4
## # Groups:   member_casual [3]
##    member_casual weekday number_of_rides average_duration
##    <chr>         <ord>             <int> <drtn>          
##  1 casual        Sun              166407 3391.1785 secs  
##  2 casual        Mon               99597 3281.7303 secs  
##  3 casual        Tue               85927 3477.3102 secs  
##  4 casual        Wed               87256 3634.7524 secs  
##  5 casual        Thu               98452 3466.1390 secs  
##  6 casual        Fri              117766 3611.5732 secs  
##  7 casual        Sat              202063 3233.4910 secs  
##  8 member        Sun              232001  915.4380 secs  
##  9 member        Mon              410273  852.2588 secs  
## 10 member        Tue              438748  847.4632 secs  
## # … with 12 more rows

I visualized the number of rides by rider type:

all_trips_v2 %>%
  filter(!is.na(member_casual)) %>% 
  mutate(weekday = wday(started_at, label = TRUE)) %>%
  group_by(member_casual, weekday) %>%
  summarise(
    number_of_rides = n(),
    average_duration = mean(ride_length)
  ) %>%
  arrange(member_casual, weekday) %>%
  ggplot(mapping = aes(x = weekday, y = number_of_rides, fill = member_casual)) +
  geom_col(position = "dodge") + 
  labs(title = 'Number of riders by weekday for member and casual riders', fill = '') +
  xlab('Weekday') +
  ylab('Number of rides') +
  theme_bw()

all_trips_v2 %>%
  filter(!is.na(member_casual)) %>% 
  mutate(weekday = wday(started_at, label = TRUE)) %>%
  group_by(member_casual, weekday) %>%
  summarise(
    number_of_rides = n(),
    average_duration = mean(ride_length)
  ) %>%
  arrange(member_casual, weekday) %>%
  ggplot(mapping = aes(x = weekday, y = number_of_rides, fill = member_casual)) +
  geom_col(position = "fill") + 
  labs(title = 'Number of riders by weekday for member and casual riders', fill = '') +
  xlab('Weekday') +
  ylab('Number of rides') +
  theme_bw()

I also created a visualization for average duration:

all_trips_v2 %>%
  filter(!is.na(member_casual)) %>% 
  mutate(weekday = wday(started_at, label = TRUE)) %>%
  group_by(member_casual, weekday) %>%
  summarise(
    number_of_rides = n(),
    average_duration = mean(ride_length)
  ) %>%
  arrange(member_casual, weekday) %>%
  ggplot(aes(x = weekday, y = average_duration, fill = member_casual)) +
  geom_col(position = "dodge") +
  labs(title = 'Average ride duration by weekday for member and casual riders', fill = '') +
  xlab('Weekday') +
  ylab('Avergage duration') +
   theme_bw()

all_trips_v2 %>% 
  filter(member_casual== 'member') %>% 
  ggplot(aes(ride_length)) +
  geom_histogram(binwidth = 300) +
  labs(title = 'Distribution of ride length for memebr riders') +
  xlim(0, 5000) +
  xlab('Ride length in seconds') +
  ylab('Count') +
  theme_bw()

all_trips_v2 %>% 
  filter(member_casual== 'casual') %>% 
  ggplot(aes(ride_length)) +
  geom_histogram(binwidth = 300) +
  labs(title = 'Distribution of ride length for casual riders') +
  xlim(0, 5000) +
  xlab('Ride length in seconds') +
  ylab('Count') +
  theme_bw()

all_trips_v2 %>% 
  filter(member_casual %in% c('member', 'casual')) %>%
  ggplot(aes(ride_length, color = member_casual)) +
  geom_freqpoly(binwidth = 300, linewidth = 1) +
  labs(title = 'Distribution of ride length for member and casual riders') +
  xlim(0, 5000) +
  xlab('Ride length in seconds') +
  ylab('Count') +
  theme_bw()

STEP 5: EXPORT SUMMARY FILE FOR FURTHER ANALYSIS

Finally I create a csv file that I can visualize in Excel or Tableau for further analysis.

counts<- all_trips_v2 %>%
  group_by(member_casual, day_of_week) %>%
  summarise(mean_ride_length = mean(ride_length))
## `summarise()` has grouped output by 'member_casual'. You can override using the
## `.groups` argument.
counts
## # A tibble: 22 × 3
## # Groups:   member_casual [3]
##    member_casual day_of_week mean_ride_length
##    <chr>         <ord>       <drtn>          
##  1 casual        Sunday      3391.1785 secs  
##  2 casual        Monday      3281.7303 secs  
##  3 casual        Tuesday     3477.3102 secs  
##  4 casual        Wednesday   3634.7524 secs  
##  5 casual        Thursday    3466.1390 secs  
##  6 casual        Friday      3611.5732 secs  
##  7 casual        Saturday    3233.4910 secs  
##  8 member        Sunday       915.4380 secs  
##  9 member        Monday       852.2588 secs  
## 10 member        Tuesday      847.4632 secs  
## # … with 12 more rows
write.csv(x = counts, file = 'avg_ride_length.csv')