My purpose of this script was to consolidate downloaded Cyclistic data into a single dataframe and then conduct simple analysis to help answer the key question:

“In what ways do members and casual riders use Cyclistic bikes differently?”

First I installed and load the required packages:

# install.packages('tidyverse')
# install.packages('lubridate')
library(tidyverse) # for data import and wrangling

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.1     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the ]8;;http://conflicted.r-lib.org/conflicted package]8;; to force all conflicts to become errors

library(lubridate) # for date functions
library(ggplot2) # for visualization

STEP 1: COLLECT DATA

I uploaded datasets here:

q2_2019 <- read.csv(file = "2019_Q2.csv")
q3_2019 <- read.csv(file = "2019_Q3.csv")
q4_2019 <- read.csv(file = "2019_Q4.csv")
q1_2020 <- read.csv(file = "2020_Q1.csv")

STEP 2: WRANGLE DATA AND COMBINE INTO A SINGLE FILE

I compared column names of each of the files.

While the names didn’t have to be in the same order, they needed to match perfectly before I could use a command to join them into one file.

colnames(x = q2_2019)

##  [1] "X01...Rental.Details.Rental.ID"                   
##  [2] "X01...Rental.Details.Local.Start.Time"            
##  [3] "X01...Rental.Details.Local.End.Time"              
##  [4] "X01...Rental.Details.Bike.ID"                     
##  [5] "X01...Rental.Details.Duration.In.Seconds.Uncapped"
##  [6] "X03...Rental.Start.Station.ID"                    
##  [7] "X03...Rental.Start.Station.Name"                  
##  [8] "X02...Rental.End.Station.ID"                      
##  [9] "X02...Rental.End.Station.Name"                    
## [10] "User.Type"                                        
## [11] "Member.Gender"                                    
## [12] "X05...Member.Details.Member.Birthday.Year"

colnames(x = q3_2019)

##  [1] "trip_id"           "start_time"        "end_time"         
##  [4] "bikeid"            "tripduration"      "from_station_id"  
##  [7] "from_station_name" "to_station_id"     "to_station_name"  
## [10] "usertype"          "gender"            "birthyear"

colnames(x = q4_2019)

##  [1] "trip_id"           "start_time"        "end_time"         
##  [4] "bikeid"            "tripduration"      "from_station_id"  
##  [7] "from_station_name" "to_station_id"     "to_station_name"  
## [10] "usertype"          "gender"            "birthyear"

colnames(x = q1_2020)

##  [1] "ride_id"            "rideable_type"      "started_at"        
##  [4] "ended_at"           "start_station_name" "start_station_id"  
##  [7] "end_station_name"   "end_station_id"     "start_lat"         
## [10] "start_lng"          "end_lat"            "end_lng"           
## [13] "member_casual"

I renamed columns to make them consistent with q1_2020 (as this will be the supposed going-forward table design for Cyclistic)

q2_2019 <- rename(
  .data = q2_2019,
  ride_id = "X01...Rental.Details.Rental.ID",
  rideable_type = "X01...Rental.Details.Bike.ID",
  started_at = "X01...Rental.Details.Local.Start.Time",
  ended_at = "X01...Rental.Details.Local.End.Time",
  start_station_name = "X03...Rental.Start.Station.Name",
  start_station_id = "X03...Rental.Start.Station.ID",
  end_station_name = "X02...Rental.End.Station.Name",
  end_station_id = "X02...Rental.End.Station.ID",
  member_casual = "User.Type"
)

q3_2019 <- rename(
  .data = q3_2019,
  ride_id = trip_id,
  rideable_type = bikeid,
  started_at = start_time,
  ended_at = end_time,
  start_station_name = from_station_name,
  start_station_id = from_station_id,
  end_station_name = to_station_name,
  end_station_id = to_station_id,
  member_casual = usertype
)

q4_2019 <- rename(
  .data = q4_2019,
  ride_id = trip_id,
  rideable_type = bikeid,
  started_at = start_time,
  ended_at = end_time,
  start_station_name = from_station_name,
  start_station_id = from_station_id,
  end_station_name = to_station_name,
  end_station_id = to_station_id,
  member_casual = usertype
)

q2_2019 <- rename(
  .data = q2_2019,
  tripduration = "X01...Rental.Details.Duration.In.Seconds.Uncapped",
  birthyear = "X05...Member.Details.Member.Birthday.Year",
  gender = "Member.Gender"
)

I inspected the dataframes and looked for incongruencies:

str(object = q2_2019)

## 'data.frame':    1108163 obs. of  12 variables:
##  $ ride_id           : int  22178529 22178530 22178531 22178532 22178533 22178534 22178535 22178536 22178537 22178538 ...
##  $ started_at        : chr  "2019-04-01 00:02:22" "2019-04-01 00:03:02" "2019-04-01 00:11:07" "2019-04-01 00:13:01" ...
##  $ ended_at          : chr  "2019-04-01 00:09:48" "2019-04-01 00:20:30" "2019-04-01 00:15:19" "2019-04-01 00:18:58" ...
##  $ rideable_type     : int  6251 6226 5649 4151 3270 3123 6418 4513 3280 5534 ...
##  $ tripduration      : chr  "446.0" "1,048.0" "252.0" "357.0" ...
##  $ start_station_id  : int  81 317 283 26 202 420 503 260 211 211 ...
##  $ start_station_name: chr  "Daley Center Plaza" "Wood St & Taylor St" "LaSalle St & Jackson Blvd" "McClurg Ct & Illinois St" ...
##  $ end_station_id    : int  56 59 174 133 129 426 500 499 211 211 ...
##  $ end_station_name  : chr  "Desplaines St & Kinzie St" "Wabash Ave & Roosevelt Rd" "Canal St & Madison St" "Kingsbury St & Kinzie St" ...
##  $ member_casual     : chr  "Subscriber" "Subscriber" "Subscriber" "Subscriber" ...
##  $ gender            : chr  "Male" "Female" "Male" "Male" ...
##  $ birthyear         : int  1975 1984 1990 1993 1992 1999 1969 1991 NA NA ...

str(object = q3_2019)

## 'data.frame':    1640718 obs. of  12 variables:
##  $ ride_id           : int  23479388 23479389 23479390 23479391 23479392 23479393 23479394 23479395 23479396 23479397 ...
##  $ started_at        : chr  "2019-07-01 00:00:27" "2019-07-01 00:01:16" "2019-07-01 00:01:48" "2019-07-01 00:02:07" ...
##  $ ended_at          : chr  "2019-07-01 00:20:41" "2019-07-01 00:18:44" "2019-07-01 00:27:42" "2019-07-01 00:27:10" ...
##  $ rideable_type     : int  3591 5353 6180 5540 6014 4941 3770 5442 2957 6091 ...
##  $ tripduration      : chr  "1,214.0" "1,048.0" "1,554.0" "1,503.0" ...
##  $ start_station_id  : int  117 381 313 313 168 300 168 313 43 43 ...
##  $ start_station_name: chr  "Wilton Ave & Belmont Ave" "Western Ave & Monroe St" "Lakeview Ave & Fullerton Pkwy" "Lakeview Ave & Fullerton Pkwy" ...
##  $ end_station_id    : int  497 203 144 144 62 232 62 144 195 195 ...
##  $ end_station_name  : chr  "Kimball Ave & Belmont Ave" "Western Ave & 21st St" "Larrabee St & Webster Ave" "Larrabee St & Webster Ave" ...
##  $ member_casual     : chr  "Subscriber" "Customer" "Customer" "Customer" ...
##  $ gender            : chr  "Male" "" "" "" ...
##  $ birthyear         : int  1992 NA NA NA NA 1990 NA NA NA NA ...

str(object = q4_2019)

## 'data.frame':    704054 obs. of  12 variables:
##  $ ride_id           : int  25223640 25223641 25223642 25223643 25223644 25223645 25223646 25223647 25223648 25223649 ...
##  $ started_at        : chr  "2019-10-01 00:01:39" "2019-10-01 00:02:16" "2019-10-01 00:04:32" "2019-10-01 00:04:32" ...
##  $ ended_at          : chr  "2019-10-01 00:17:20" "2019-10-01 00:06:34" "2019-10-01 00:18:43" "2019-10-01 00:43:43" ...
##  $ rideable_type     : int  2215 6328 3003 3275 5294 1891 1061 1274 6011 2957 ...
##  $ tripduration      : chr  "940.0" "258.0" "850.0" "2,350.0" ...
##  $ start_station_id  : int  20 19 84 313 210 156 84 156 156 336 ...
##  $ start_station_name: chr  "Sheffield Ave & Kingsbury St" "Throop (Loomis) St & Taylor St" "Milwaukee Ave & Grand Ave" "Lakeview Ave & Fullerton Pkwy" ...
##  $ end_station_id    : int  309 241 199 290 382 226 142 463 463 336 ...
##  $ end_station_name  : chr  "Leavitt St & Armitage Ave" "Morgan St & Polk St" "Wabash Ave & Grand Ave" "Kedzie Ave & Palmer Ct" ...
##  $ member_casual     : chr  "Subscriber" "Subscriber" "Subscriber" "Subscriber" ...
##  $ gender            : chr  "Male" "Male" "Female" "Male" ...
##  $ birthyear         : int  1987 1998 1991 1990 1987 1994 1991 1995 1993 NA ...

str(object = q1_2020)

## 'data.frame':    426887 obs. of  13 variables:
##  $ ride_id           : chr  "EACB19130B0CDA4A" "8FED874C809DC021" "789F3C21E472CA96" "C9A388DAC6ABF313" ...
##  $ rideable_type     : chr  "docked_bike" "docked_bike" "docked_bike" "docked_bike" ...
##  $ started_at        : chr  "2020-01-21 20:06:59" "2020-01-30 14:22:39" "2020-01-09 19:29:26" "2020-01-06 16:17:07" ...
##  $ ended_at          : chr  "2020-01-21 20:14:30" "2020-01-30 14:26:22" "2020-01-09 19:32:17" "2020-01-06 16:25:56" ...
##  $ start_station_name: chr  "Western Ave & Leland Ave" "Clark St & Montrose Ave" "Broadway & Belmont Ave" "Clark St & Randolph St" ...
##  $ start_station_id  : int  239 234 296 51 66 212 96 96 212 38 ...
##  $ end_station_name  : chr  "Clark St & Leland Ave" "Southport Ave & Irving Park Rd" "Wilton Ave & Belmont Ave" "Fairbanks Ct & Grand Ave" ...
##  $ end_station_id    : int  326 318 117 24 212 96 212 212 96 100 ...
##  $ start_lat         : num  42 42 41.9 41.9 41.9 ...
##  $ start_lng         : num  -87.7 -87.7 -87.6 -87.6 -87.6 ...
##  $ end_lat           : num  42 42 41.9 41.9 41.9 ...
##  $ end_lng           : num  -87.7 -87.7 -87.7 -87.6 -87.6 ...
##  $ member_casual     : chr  "member" "member" "member" "member" ...

I converted ride_id and rideable_type to character so that they can stack correctly:

q2_2019 <- mutate(
  .data = q2_2019, ride_id = as.character(x = ride_id),
  rideable_type = as.character(rideable_type)
)
q3_2019 <- mutate(
  .data = q3_2019, ride_id = as.character(x = ride_id),
  rideable_type = as.character(rideable_type)
)
q4_2019 <- mutate(
  .data = q4_2019, ride_id = as.character(x = ride_id),
  rideable_type = as.character(rideable_type)
)

I stacked individual quarter’s data frames into one big data frame:

all_trips <- bind_rows(q2_2019, q3_2019, q4_2019, q1_2020)

I removed lat, long, birthyear, gender and tripduration fields as this data had been dropped beginning in 2020:

all_trips <- all_trips %>%
  select(-c(
    start_lat,
    start_lng,
    end_lat,
    end_lng,
    birthyear,
    gender,
    tripduration
  ))

STEP 3: CLEAN UP AND ADD DATA TO PREPARE FOR ANALYSIS

I inspected the new table that had been created:

colnames(all_trips) # List of column names

## [1] "ride_id"            "started_at"         "ended_at"          
## [4] "rideable_type"      "start_station_id"   "start_station_name"
## [7] "end_station_id"     "end_station_name"   "member_casual"

nrow(all_trips) # How many rows are in data frame?

## [1] 3879822

dim(all_trips) # Dimensions of the data frame?

## [1] 3879822       9

head(all_trips)

##    ride_id          started_at            ended_at rideable_type
## 1 22178529 2019-04-01 00:02:22 2019-04-01 00:09:48          6251
## 2 22178530 2019-04-01 00:03:02 2019-04-01 00:20:30          6226
## 3 22178531 2019-04-01 00:11:07 2019-04-01 00:15:19          5649
## 4 22178532 2019-04-01 00:13:01 2019-04-01 00:18:58          4151
## 5 22178533 2019-04-01 00:19:26 2019-04-01 00:36:13          3270
## 6 22178534 2019-04-01 00:19:39 2019-04-01 00:23:56          3123
##   start_station_id        start_station_name end_station_id
## 1               81        Daley Center Plaza             56
## 2              317       Wood St & Taylor St             59
## 3              283 LaSalle St & Jackson Blvd            174
## 4               26  McClurg Ct & Illinois St            133
## 5              202      Halsted St & 18th St            129
## 6              420       Ellis Ave & 55th St            426
##            end_station_name member_casual
## 1 Desplaines St & Kinzie St    Subscriber
## 2 Wabash Ave & Roosevelt Rd    Subscriber
## 3     Canal St & Madison St    Subscriber
## 4  Kingsbury St & Kinzie St    Subscriber
## 5 Blue Island Ave & 18th St    Subscriber
## 6       Ellis Ave & 60th St    Subscriber

tail(all_trips)

##                  ride_id          started_at            ended_at rideable_type
## 3879817 6F4D221BDDFD943F 2020-03-10 10:40:27 2020-03-10 10:40:29   docked_bike
## 3879818 ADDAA33CEBCAE733 2020-03-10 10:40:06 2020-03-10 10:40:07   docked_bike
## 3879819 82B10FA3994BC66A 2020-03-07 15:25:55 2020-03-07 16:14:03   docked_bike
## 3879820 AA0D5AAA0B59C8AA 2020-03-01 13:12:38 2020-03-01 13:38:29   docked_bike
## 3879821 3296360A7BC20FB8 2020-03-07 18:02:45 2020-03-07 18:13:18   docked_bike
## 3879822 064EC7698E4FF9B3 2020-03-08 13:03:57 2020-03-08 13:32:27   docked_bike
##         start_station_id        start_station_name end_station_id
## 3879817              675                     HQ QR            675
## 3879818              675                     HQ QR            675
## 3879819              161     Rush St & Superior St            240
## 3879820              141    Clark St & Lincoln Ave            210
## 3879821              672 Franklin St & Illinois St            264
## 3879822              110     Dearborn St & Erie St             85
##                     end_station_name member_casual
## 3879817                        HQ QR        casual
## 3879818                        HQ QR        casual
## 3879819 Sheridan Rd & Irving Park Rd        member
## 3879820    Ashland Ave & Division St        casual
## 3879821 Stetson Ave & South Water St        member
## 3879822        Michigan Ave & Oak St        casual

str(all_trips) # See list of columns and data types

## 'data.frame':    3879822 obs. of  9 variables:
##  $ ride_id           : chr  "22178529" "22178530" "22178531" "22178532" ...
##  $ started_at        : chr  "2019-04-01 00:02:22" "2019-04-01 00:03:02" "2019-04-01 00:11:07" "2019-04-01 00:13:01" ...
##  $ ended_at          : chr  "2019-04-01 00:09:48" "2019-04-01 00:20:30" "2019-04-01 00:15:19" "2019-04-01 00:18:58" ...
##  $ rideable_type     : chr  "6251" "6226" "5649" "4151" ...
##  $ start_station_id  : int  81 317 283 26 202 420 503 260 211 211 ...
##  $ start_station_name: chr  "Daley Center Plaza" "Wood St & Taylor St" "LaSalle St & Jackson Blvd" "McClurg Ct & Illinois St" ...
##  $ end_station_id    : int  56 59 174 133 129 426 500 499 211 211 ...
##  $ end_station_name  : chr  "Desplaines St & Kinzie St" "Wabash Ave & Roosevelt Rd" "Canal St & Madison St" "Kingsbury St & Kinzie St" ...
##  $ member_casual     : chr  "Subscriber" "Subscriber" "Subscriber" "Subscriber" ...

summary(all_trips) # Statistical summary of data. Mainly for numerics

##    ride_id           started_at          ended_at         rideable_type     
##  Length:3879822     Length:3879822     Length:3879822     Length:3879822    
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##  start_station_id start_station_name end_station_id  end_station_name  
##  Min.   :  1.0    Length:3879822     Min.   :  1.0   Length:3879822    
##  1st Qu.: 77.0    Class :character   1st Qu.: 77.0   Class :character  
##  Median :174.0    Mode  :character   Median :174.0   Mode  :character  
##  Mean   :202.9                       Mean   :203.8                     
##  3rd Qu.:291.0                       3rd Qu.:291.0                     
##  Max.   :675.0                       Max.   :675.0                     
##                                      NA's   :1                         
##  member_casual     
##  Length:3879822    
##  Class :character  
##  Mode  :character  
##                    
##                    
##                    
##

There were a few problems I needed to fix:

In the “member_casual” column, there were two names for members (“member” and “Subscriber”) and two names for casual riders (“Customer” and “casual”). I needed to consolidate that from four to two labels.
The data could only be aggregated at the ride-level, which was too granular. I decided to add some additional columns of data – such as day, month, year – that provided additional opportunities to aggregate the data.
I also decided to add a calculated field for length of ride since the 2020Q1 data did not have the ‘tripduration’ column. I added “ride_length” to the entire dataframe for consistency.
There were some rides where ‘tripduration’ shows up as negative, including several hundred rides where Cyclistic took bikes out of circulation for Quality Control reasons. I deleted those rides.

Tackling the first Problem:

In the “member_casual” column, I replaced “Subscriber” with “member” and “Customer” with “casual”. Before 2020, Cyclistic used different labels for these two types of riders. I wanted to make my dataframe consistent with their current nomenclature.

I began by seeing how many observations fall under each usertype.

all_trips %>%
  count(member_casual) %>%
  arrange(-n)

##   member_casual       n
## 1    Subscriber 2595461
## 2      Customer  857474
## 3        member  378407
## 4        casual   48480

I reassigned to the desired values:

all_trips <- all_trips %>%
  mutate(member_casual = case_match(
    .x = member_casual,
    "Subscriber" ~ "member",
    "Customer" ~ "casual")
    )

I checked to make sure the proper number of observations were reassigned:

all_trips %>%
  count(member_casual) %>%
  arrange(-n)

##   member_casual       n
## 1        member 2595461
## 2        casual  857474
## 3          <NA>  426887

Tackling the second problem:

I added columns that list the date, month, day, and year of each ride. This allowed me to aggregate ride data for each month, day, or year. Before completing these operations I could only aggregate at the ride level.

all_trips$date <- as.Date(x = all_trips$started_at)
all_trips$month <- format(as.Date(all_trips$date), "%m")
all_trips$day <- format(as.Date(all_trips$date), "%d")
all_trips$year <- format(as.Date(all_trips$date), "%Y")
all_trips$day_of_week <- format(as.Date(all_trips$date), "%A")

Tackling the third problem:

First I converted data type of ‘started_at column’ and ‘ended_at column’ from character to POSIXct:

all_trips$started_at <- as.POSIXct(all_trips$started_at, format = "%Y-%m-%d %H:%M:%S")
all_trips$ended_at <- as.POSIXct(all_trips$ended_at, format = "%Y-%m-%d %H:%M:%S")

Then I added a “ride_length” calculation to all_trips (in seconds):

all_trips$ride_length <- difftime(
  all_trips$ended_at,
  all_trips$started_at
)

I inspected the structure of the columns:

str(all_trips)

## 'data.frame':    3879822 obs. of  15 variables:
##  $ ride_id           : chr  "22178529" "22178530" "22178531" "22178532" ...
##  $ started_at        : POSIXct, format: "2019-04-01 00:02:22" "2019-04-01 00:03:02" ...
##  $ ended_at          : POSIXct, format: "2019-04-01 00:09:48" "2019-04-01 00:20:30" ...
##  $ rideable_type     : chr  "6251" "6226" "5649" "4151" ...
##  $ start_station_id  : int  81 317 283 26 202 420 503 260 211 211 ...
##  $ start_station_name: chr  "Daley Center Plaza" "Wood St & Taylor St" "LaSalle St & Jackson Blvd" "McClurg Ct & Illinois St" ...
##  $ end_station_id    : int  56 59 174 133 129 426 500 499 211 211 ...
##  $ end_station_name  : chr  "Desplaines St & Kinzie St" "Wabash Ave & Roosevelt Rd" "Canal St & Madison St" "Kingsbury St & Kinzie St" ...
##  $ member_casual     : chr  "member" "member" "member" "member" ...
##  $ date              : Date, format: "2019-04-01" "2019-04-01" ...
##  $ month             : chr  "04" "04" "04" "04" ...
##  $ day               : chr  "01" "01" "01" "01" ...
##  $ year              : chr  "2019" "2019" "2019" "2019" ...
##  $ day_of_week       : chr  "Monday" "Monday" "Monday" "Monday" ...
##  $ ride_length       : 'difftime' num  446 1048 252 357 ...
##   ..- attr(*, "units")= chr "secs"

Tackle the fourth problem:

I removed ‘bad’ data. The dataframe included a few hundred entries when bikes were taken out of docks and checked for quality by Cyclistic or ride_length was negative.

I created a new version of the dataframe (v2) since data was being removed.

all_trips_v2 <- all_trips[!(all_trips$start_station_name == "HQ QR" | all_trips$ride_length < 0), ]

STEP 4: CONDUCT DESCRIPTIVE ANALYSIS

Here I provided some descriptive analysis on ride_length (all figures in seconds)

mean(all_trips_v2$ride_length, na.rm = TRUE) # straight average (total ride length / rides)

## Time difference of 1479.266 secs

median(all_trips_v2$ride_length, na.rm = TRUE) # midpoint number in the ascending array of ride lengths

## Time difference of 712 secs

max(all_trips_v2$ride_length, na.rm = TRUE) # longest ride

## Time difference of 9383424 secs

min(all_trips_v2$ride_length, na.rm = TRUE) # shortest ride

## Time difference of 1 secs

Here I compared members and casual users:

all_trips_v2 %>%
  group_by(member_casual) %>%
  summarise(mean_ride_length = mean(ride_length))

## # A tibble: 3 × 2
##   member_casual mean_ride_length
##   <chr>         <drtn>          
## 1 casual        3413.6001 secs  
## 2 member         863.1709 secs  
## 3 <NA>                 NA secs

all_trips_v2 %>%
  group_by(member_casual) %>%
  summarise(median_ride_length = median(ride_length))

## # A tibble: 3 × 2
##   member_casual median_ride_length
##   <chr>         <drtn>            
## 1 casual        1554 secs         
## 2 member         601 secs         
## 3 <NA>            NA secs

all_trips_v2 %>%
  group_by(member_casual) %>%
  summarise(max_ride_length = max(ride_length))

## # A tibble: 3 × 2
##   member_casual max_ride_length
##   <chr>         <drtn>         
## 1 casual        8582302 secs   
## 2 member        9060234 secs   
## 3 <NA>               NA secs

all_trips_v2 %>%
  group_by(member_casual) %>%
  summarise(min_ride_length = min(ride_length))

## # A tibble: 3 × 2
##   member_casual min_ride_length
##   <chr>         <drtn>         
## 1 casual        61 secs        
## 2 member        61 secs        
## 3 <NA>          NA secs

I checked the average ride time by each day for members vs casual users

all_trips_v2 %>% 
  group_by(member_casual, day_of_week) %>% 
  summarise(mean = mean(ride_length))

## `summarise()` has grouped output by 'member_casual'. You can override using the
## `.groups` argument.

## # A tibble: 22 × 3
## # Groups:   member_casual [3]
##    member_casual day_of_week mean          
##    <chr>         <chr>       <drtn>        
##  1 casual        Friday      3611.5732 secs
##  2 casual        Monday      3281.7303 secs
##  3 casual        Saturday    3233.4910 secs
##  4 casual        Sunday      3391.1785 secs
##  5 casual        Thursday    3466.1390 secs
##  6 casual        Tuesday     3477.3102 secs
##  7 casual        Wednesday   3634.7524 secs
##  8 member        Friday       833.9273 secs
##  9 member        Monday       852.2588 secs
## 10 member        Saturday     973.9411 secs
## # … with 12 more rows

I noticed that the days of the week were out of order. So I fixed that.

all_trips_v2$day_of_week <-
  ordered(all_trips_v2$day_of_week, levels = c("Sunday", "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday"))

Again I run the average ride time by each day for members vs casual users to check if the problem had been addressed.

all_trips_v2 %>% 
  group_by(member_casual, day_of_week) %>% 
  summarise(mean = mean(ride_length))

## `summarise()` has grouped output by 'member_casual'. You can override using the
## `.groups` argument.

## # A tibble: 22 × 3
## # Groups:   member_casual [3]
##    member_casual day_of_week mean          
##    <chr>         <ord>       <drtn>        
##  1 casual        Sunday      3391.1785 secs
##  2 casual        Monday      3281.7303 secs
##  3 casual        Tuesday     3477.3102 secs
##  4 casual        Wednesday   3634.7524 secs
##  5 casual        Thursday    3466.1390 secs
##  6 casual        Friday      3611.5732 secs
##  7 casual        Saturday    3233.4910 secs
##  8 member        Sunday       915.4380 secs
##  9 member        Monday       852.2588 secs
## 10 member        Tuesday      847.4632 secs
## # … with 12 more rows

I analyzed ridership data by type and weekday:

all_trips_v2 %>%
  mutate(weekday = wday(started_at, label = TRUE)) %>% # creates weekday field using wday()
  group_by(member_casual, weekday) %>% # groups by usertype and weekday
  summarise(
    number_of_rides = n(),
    average_duration = mean(ride_length)
  ) %>% # calculates the number of rides and average duration
  arrange(member_casual, weekday) # sorts

## `summarise()` has grouped output by 'member_casual'. You can override using the
## `.groups` argument.

## # A tibble: 22 × 4
## # Groups:   member_casual [3]
##    member_casual weekday number_of_rides average_duration
##    <chr>         <ord>             <int> <drtn>          
##  1 casual        Sun              166407 3391.1785 secs  
##  2 casual        Mon               99597 3281.7303 secs  
##  3 casual        Tue               85927 3477.3102 secs  
##  4 casual        Wed               87256 3634.7524 secs  
##  5 casual        Thu               98452 3466.1390 secs  
##  6 casual        Fri              117766 3611.5732 secs  
##  7 casual        Sat              202063 3233.4910 secs  
##  8 member        Sun              232001  915.4380 secs  
##  9 member        Mon              410273  852.2588 secs  
## 10 member        Tue              438748  847.4632 secs  
## # … with 12 more rows

I visualized the number of rides by rider type:

all_trips_v2 %>%
  filter(!is.na(member_casual)) %>% 
  mutate(weekday = wday(started_at, label = TRUE)) %>%
  group_by(member_casual, weekday) %>%
  summarise(
    number_of_rides = n(),
    average_duration = mean(ride_length)
  ) %>%
  arrange(member_casual, weekday) %>%
  ggplot(mapping = aes(x = weekday, y = number_of_rides, fill = member_casual)) +
  geom_col(position = "dodge") + 
  labs(title = 'Number of riders by weekday for member and casual riders', fill = '') +
  xlab('Weekday') +
  ylab('Number of rides') +
  theme_bw()

all_trips_v2 %>%
  filter(!is.na(member_casual)) %>% 
  mutate(weekday = wday(started_at, label = TRUE)) %>%
  group_by(member_casual, weekday) %>%
  summarise(
    number_of_rides = n(),
    average_duration = mean(ride_length)
  ) %>%
  arrange(member_casual, weekday) %>%
  ggplot(mapping = aes(x = weekday, y = number_of_rides, fill = member_casual)) +
  geom_col(position = "fill") + 
  labs(title = 'Number of riders by weekday for member and casual riders', fill = '') +
  xlab('Weekday') +
  ylab('Number of rides') +
  theme_bw()

I also created a visualization for average duration:

all_trips_v2 %>%
  filter(!is.na(member_casual)) %>% 
  mutate(weekday = wday(started_at, label = TRUE)) %>%
  group_by(member_casual, weekday) %>%
  summarise(
    number_of_rides = n(),
    average_duration = mean(ride_length)
  ) %>%
  arrange(member_casual, weekday) %>%
  ggplot(aes(x = weekday, y = average_duration, fill = member_casual)) +
  geom_col(position = "dodge") +
  labs(title = 'Average ride duration by weekday for member and casual riders', fill = '') +
  xlab('Weekday') +
  ylab('Avergage duration') +
   theme_bw()

all_trips_v2 %>% 
  filter(member_casual== 'member') %>% 
  ggplot(aes(ride_length)) +
  geom_histogram(binwidth = 300) +
  labs(title = 'Distribution of ride length for memebr riders') +
  xlim(0, 5000) +
  xlab('Ride length in seconds') +
  ylab('Count') +
  theme_bw()

all_trips_v2 %>% 
  filter(member_casual== 'casual') %>% 
  ggplot(aes(ride_length)) +
  geom_histogram(binwidth = 300) +
  labs(title = 'Distribution of ride length for casual riders') +
  xlim(0, 5000) +
  xlab('Ride length in seconds') +
  ylab('Count') +
  theme_bw()

all_trips_v2 %>% 
  filter(member_casual %in% c('member', 'casual')) %>%
  ggplot(aes(ride_length, color = member_casual)) +
  geom_freqpoly(binwidth = 300, linewidth = 1) +
  labs(title = 'Distribution of ride length for member and casual riders') +
  xlim(0, 5000) +
  xlab('Ride length in seconds') +
  ylab('Count') +
  theme_bw()

STEP 5: EXPORT SUMMARY FILE FOR FURTHER ANALYSIS

Finally I create a csv file that I can visualize in Excel or Tableau for further analysis.

counts<- all_trips_v2 %>%
  group_by(member_casual, day_of_week) %>%
  summarise(mean_ride_length = mean(ride_length))

## `summarise()` has grouped output by 'member_casual'. You can override using the
## `.groups` argument.

counts

## # A tibble: 22 × 3
## # Groups:   member_casual [3]
##    member_casual day_of_week mean_ride_length
##    <chr>         <ord>       <drtn>          
##  1 casual        Sunday      3391.1785 secs  
##  2 casual        Monday      3281.7303 secs  
##  3 casual        Tuesday     3477.3102 secs  
##  4 casual        Wednesday   3634.7524 secs  
##  5 casual        Thursday    3466.1390 secs  
##  6 casual        Friday      3611.5732 secs  
##  7 casual        Saturday    3233.4910 secs  
##  8 member        Sunday       915.4380 secs  
##  9 member        Monday       852.2588 secs  
## 10 member        Tuesday      847.4632 secs  
## # … with 12 more rows

write.csv(x = counts, file = 'avg_ride_length.csv')

Cyclistic Case Study

Amir Reza Khedmati

2023-04-21

STEP 1: COLLECT DATA

STEP 2: WRANGLE DATA AND COMBINE INTO A SINGLE FILE

STEP 3: CLEAN UP AND ADD DATA TO PREPARE FOR ANALYSIS

STEP 4: CONDUCT DESCRIPTIVE ANALYSIS

STEP 5: EXPORT SUMMARY FILE FOR FURTHER ANALYSIS