My purpose of this script was to consolidate downloaded Cyclistic data into a single dataframe and then conduct simple analysis to help answer the key question:
“In what ways do members and casual riders use Cyclistic bikes differently?”
First I installed and load the required packages:
# install.packages('tidyverse')
# install.packages('lubridate')
library(tidyverse) # for data import and wrangling
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.1 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.2 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the ]8;;http://conflicted.r-lib.org/conflicted package]8;; to force all conflicts to become errors
library(lubridate) # for date functions
library(ggplot2) # for visualization
I uploaded datasets here:
q2_2019 <- read.csv(file = "2019_Q2.csv")
q3_2019 <- read.csv(file = "2019_Q3.csv")
q4_2019 <- read.csv(file = "2019_Q4.csv")
q1_2020 <- read.csv(file = "2020_Q1.csv")
I compared column names of each of the files.
While the names didn’t have to be in the same order, they needed to match perfectly before I could use a command to join them into one file.
colnames(x = q2_2019)
## [1] "X01...Rental.Details.Rental.ID"
## [2] "X01...Rental.Details.Local.Start.Time"
## [3] "X01...Rental.Details.Local.End.Time"
## [4] "X01...Rental.Details.Bike.ID"
## [5] "X01...Rental.Details.Duration.In.Seconds.Uncapped"
## [6] "X03...Rental.Start.Station.ID"
## [7] "X03...Rental.Start.Station.Name"
## [8] "X02...Rental.End.Station.ID"
## [9] "X02...Rental.End.Station.Name"
## [10] "User.Type"
## [11] "Member.Gender"
## [12] "X05...Member.Details.Member.Birthday.Year"
colnames(x = q3_2019)
## [1] "trip_id" "start_time" "end_time"
## [4] "bikeid" "tripduration" "from_station_id"
## [7] "from_station_name" "to_station_id" "to_station_name"
## [10] "usertype" "gender" "birthyear"
colnames(x = q4_2019)
## [1] "trip_id" "start_time" "end_time"
## [4] "bikeid" "tripduration" "from_station_id"
## [7] "from_station_name" "to_station_id" "to_station_name"
## [10] "usertype" "gender" "birthyear"
colnames(x = q1_2020)
## [1] "ride_id" "rideable_type" "started_at"
## [4] "ended_at" "start_station_name" "start_station_id"
## [7] "end_station_name" "end_station_id" "start_lat"
## [10] "start_lng" "end_lat" "end_lng"
## [13] "member_casual"
I renamed columns to make them consistent with q1_2020 (as this will be the supposed going-forward table design for Cyclistic)
q2_2019 <- rename(
.data = q2_2019,
ride_id = "X01...Rental.Details.Rental.ID",
rideable_type = "X01...Rental.Details.Bike.ID",
started_at = "X01...Rental.Details.Local.Start.Time",
ended_at = "X01...Rental.Details.Local.End.Time",
start_station_name = "X03...Rental.Start.Station.Name",
start_station_id = "X03...Rental.Start.Station.ID",
end_station_name = "X02...Rental.End.Station.Name",
end_station_id = "X02...Rental.End.Station.ID",
member_casual = "User.Type"
)
q3_2019 <- rename(
.data = q3_2019,
ride_id = trip_id,
rideable_type = bikeid,
started_at = start_time,
ended_at = end_time,
start_station_name = from_station_name,
start_station_id = from_station_id,
end_station_name = to_station_name,
end_station_id = to_station_id,
member_casual = usertype
)
q4_2019 <- rename(
.data = q4_2019,
ride_id = trip_id,
rideable_type = bikeid,
started_at = start_time,
ended_at = end_time,
start_station_name = from_station_name,
start_station_id = from_station_id,
end_station_name = to_station_name,
end_station_id = to_station_id,
member_casual = usertype
)
q2_2019 <- rename(
.data = q2_2019,
tripduration = "X01...Rental.Details.Duration.In.Seconds.Uncapped",
birthyear = "X05...Member.Details.Member.Birthday.Year",
gender = "Member.Gender"
)
I inspected the dataframes and looked for incongruencies:
str(object = q2_2019)
## 'data.frame': 1108163 obs. of 12 variables:
## $ ride_id : int 22178529 22178530 22178531 22178532 22178533 22178534 22178535 22178536 22178537 22178538 ...
## $ started_at : chr "2019-04-01 00:02:22" "2019-04-01 00:03:02" "2019-04-01 00:11:07" "2019-04-01 00:13:01" ...
## $ ended_at : chr "2019-04-01 00:09:48" "2019-04-01 00:20:30" "2019-04-01 00:15:19" "2019-04-01 00:18:58" ...
## $ rideable_type : int 6251 6226 5649 4151 3270 3123 6418 4513 3280 5534 ...
## $ tripduration : chr "446.0" "1,048.0" "252.0" "357.0" ...
## $ start_station_id : int 81 317 283 26 202 420 503 260 211 211 ...
## $ start_station_name: chr "Daley Center Plaza" "Wood St & Taylor St" "LaSalle St & Jackson Blvd" "McClurg Ct & Illinois St" ...
## $ end_station_id : int 56 59 174 133 129 426 500 499 211 211 ...
## $ end_station_name : chr "Desplaines St & Kinzie St" "Wabash Ave & Roosevelt Rd" "Canal St & Madison St" "Kingsbury St & Kinzie St" ...
## $ member_casual : chr "Subscriber" "Subscriber" "Subscriber" "Subscriber" ...
## $ gender : chr "Male" "Female" "Male" "Male" ...
## $ birthyear : int 1975 1984 1990 1993 1992 1999 1969 1991 NA NA ...
str(object = q3_2019)
## 'data.frame': 1640718 obs. of 12 variables:
## $ ride_id : int 23479388 23479389 23479390 23479391 23479392 23479393 23479394 23479395 23479396 23479397 ...
## $ started_at : chr "2019-07-01 00:00:27" "2019-07-01 00:01:16" "2019-07-01 00:01:48" "2019-07-01 00:02:07" ...
## $ ended_at : chr "2019-07-01 00:20:41" "2019-07-01 00:18:44" "2019-07-01 00:27:42" "2019-07-01 00:27:10" ...
## $ rideable_type : int 3591 5353 6180 5540 6014 4941 3770 5442 2957 6091 ...
## $ tripduration : chr "1,214.0" "1,048.0" "1,554.0" "1,503.0" ...
## $ start_station_id : int 117 381 313 313 168 300 168 313 43 43 ...
## $ start_station_name: chr "Wilton Ave & Belmont Ave" "Western Ave & Monroe St" "Lakeview Ave & Fullerton Pkwy" "Lakeview Ave & Fullerton Pkwy" ...
## $ end_station_id : int 497 203 144 144 62 232 62 144 195 195 ...
## $ end_station_name : chr "Kimball Ave & Belmont Ave" "Western Ave & 21st St" "Larrabee St & Webster Ave" "Larrabee St & Webster Ave" ...
## $ member_casual : chr "Subscriber" "Customer" "Customer" "Customer" ...
## $ gender : chr "Male" "" "" "" ...
## $ birthyear : int 1992 NA NA NA NA 1990 NA NA NA NA ...
str(object = q4_2019)
## 'data.frame': 704054 obs. of 12 variables:
## $ ride_id : int 25223640 25223641 25223642 25223643 25223644 25223645 25223646 25223647 25223648 25223649 ...
## $ started_at : chr "2019-10-01 00:01:39" "2019-10-01 00:02:16" "2019-10-01 00:04:32" "2019-10-01 00:04:32" ...
## $ ended_at : chr "2019-10-01 00:17:20" "2019-10-01 00:06:34" "2019-10-01 00:18:43" "2019-10-01 00:43:43" ...
## $ rideable_type : int 2215 6328 3003 3275 5294 1891 1061 1274 6011 2957 ...
## $ tripduration : chr "940.0" "258.0" "850.0" "2,350.0" ...
## $ start_station_id : int 20 19 84 313 210 156 84 156 156 336 ...
## $ start_station_name: chr "Sheffield Ave & Kingsbury St" "Throop (Loomis) St & Taylor St" "Milwaukee Ave & Grand Ave" "Lakeview Ave & Fullerton Pkwy" ...
## $ end_station_id : int 309 241 199 290 382 226 142 463 463 336 ...
## $ end_station_name : chr "Leavitt St & Armitage Ave" "Morgan St & Polk St" "Wabash Ave & Grand Ave" "Kedzie Ave & Palmer Ct" ...
## $ member_casual : chr "Subscriber" "Subscriber" "Subscriber" "Subscriber" ...
## $ gender : chr "Male" "Male" "Female" "Male" ...
## $ birthyear : int 1987 1998 1991 1990 1987 1994 1991 1995 1993 NA ...
str(object = q1_2020)
## 'data.frame': 426887 obs. of 13 variables:
## $ ride_id : chr "EACB19130B0CDA4A" "8FED874C809DC021" "789F3C21E472CA96" "C9A388DAC6ABF313" ...
## $ rideable_type : chr "docked_bike" "docked_bike" "docked_bike" "docked_bike" ...
## $ started_at : chr "2020-01-21 20:06:59" "2020-01-30 14:22:39" "2020-01-09 19:29:26" "2020-01-06 16:17:07" ...
## $ ended_at : chr "2020-01-21 20:14:30" "2020-01-30 14:26:22" "2020-01-09 19:32:17" "2020-01-06 16:25:56" ...
## $ start_station_name: chr "Western Ave & Leland Ave" "Clark St & Montrose Ave" "Broadway & Belmont Ave" "Clark St & Randolph St" ...
## $ start_station_id : int 239 234 296 51 66 212 96 96 212 38 ...
## $ end_station_name : chr "Clark St & Leland Ave" "Southport Ave & Irving Park Rd" "Wilton Ave & Belmont Ave" "Fairbanks Ct & Grand Ave" ...
## $ end_station_id : int 326 318 117 24 212 96 212 212 96 100 ...
## $ start_lat : num 42 42 41.9 41.9 41.9 ...
## $ start_lng : num -87.7 -87.7 -87.6 -87.6 -87.6 ...
## $ end_lat : num 42 42 41.9 41.9 41.9 ...
## $ end_lng : num -87.7 -87.7 -87.7 -87.6 -87.6 ...
## $ member_casual : chr "member" "member" "member" "member" ...
I converted ride_id and rideable_type to character so that they can stack correctly:
q2_2019 <- mutate(
.data = q2_2019, ride_id = as.character(x = ride_id),
rideable_type = as.character(rideable_type)
)
q3_2019 <- mutate(
.data = q3_2019, ride_id = as.character(x = ride_id),
rideable_type = as.character(rideable_type)
)
q4_2019 <- mutate(
.data = q4_2019, ride_id = as.character(x = ride_id),
rideable_type = as.character(rideable_type)
)
I stacked individual quarter’s data frames into one big data frame:
all_trips <- bind_rows(q2_2019, q3_2019, q4_2019, q1_2020)
I removed lat, long, birthyear, gender and tripduration fields as this data had been dropped beginning in 2020:
all_trips <- all_trips %>%
select(-c(
start_lat,
start_lng,
end_lat,
end_lng,
birthyear,
gender,
tripduration
))
I inspected the new table that had been created:
colnames(all_trips) # List of column names
## [1] "ride_id" "started_at" "ended_at"
## [4] "rideable_type" "start_station_id" "start_station_name"
## [7] "end_station_id" "end_station_name" "member_casual"
nrow(all_trips) # How many rows are in data frame?
## [1] 3879822
dim(all_trips) # Dimensions of the data frame?
## [1] 3879822 9
head(all_trips)
## ride_id started_at ended_at rideable_type
## 1 22178529 2019-04-01 00:02:22 2019-04-01 00:09:48 6251
## 2 22178530 2019-04-01 00:03:02 2019-04-01 00:20:30 6226
## 3 22178531 2019-04-01 00:11:07 2019-04-01 00:15:19 5649
## 4 22178532 2019-04-01 00:13:01 2019-04-01 00:18:58 4151
## 5 22178533 2019-04-01 00:19:26 2019-04-01 00:36:13 3270
## 6 22178534 2019-04-01 00:19:39 2019-04-01 00:23:56 3123
## start_station_id start_station_name end_station_id
## 1 81 Daley Center Plaza 56
## 2 317 Wood St & Taylor St 59
## 3 283 LaSalle St & Jackson Blvd 174
## 4 26 McClurg Ct & Illinois St 133
## 5 202 Halsted St & 18th St 129
## 6 420 Ellis Ave & 55th St 426
## end_station_name member_casual
## 1 Desplaines St & Kinzie St Subscriber
## 2 Wabash Ave & Roosevelt Rd Subscriber
## 3 Canal St & Madison St Subscriber
## 4 Kingsbury St & Kinzie St Subscriber
## 5 Blue Island Ave & 18th St Subscriber
## 6 Ellis Ave & 60th St Subscriber
tail(all_trips)
## ride_id started_at ended_at rideable_type
## 3879817 6F4D221BDDFD943F 2020-03-10 10:40:27 2020-03-10 10:40:29 docked_bike
## 3879818 ADDAA33CEBCAE733 2020-03-10 10:40:06 2020-03-10 10:40:07 docked_bike
## 3879819 82B10FA3994BC66A 2020-03-07 15:25:55 2020-03-07 16:14:03 docked_bike
## 3879820 AA0D5AAA0B59C8AA 2020-03-01 13:12:38 2020-03-01 13:38:29 docked_bike
## 3879821 3296360A7BC20FB8 2020-03-07 18:02:45 2020-03-07 18:13:18 docked_bike
## 3879822 064EC7698E4FF9B3 2020-03-08 13:03:57 2020-03-08 13:32:27 docked_bike
## start_station_id start_station_name end_station_id
## 3879817 675 HQ QR 675
## 3879818 675 HQ QR 675
## 3879819 161 Rush St & Superior St 240
## 3879820 141 Clark St & Lincoln Ave 210
## 3879821 672 Franklin St & Illinois St 264
## 3879822 110 Dearborn St & Erie St 85
## end_station_name member_casual
## 3879817 HQ QR casual
## 3879818 HQ QR casual
## 3879819 Sheridan Rd & Irving Park Rd member
## 3879820 Ashland Ave & Division St casual
## 3879821 Stetson Ave & South Water St member
## 3879822 Michigan Ave & Oak St casual
str(all_trips) # See list of columns and data types
## 'data.frame': 3879822 obs. of 9 variables:
## $ ride_id : chr "22178529" "22178530" "22178531" "22178532" ...
## $ started_at : chr "2019-04-01 00:02:22" "2019-04-01 00:03:02" "2019-04-01 00:11:07" "2019-04-01 00:13:01" ...
## $ ended_at : chr "2019-04-01 00:09:48" "2019-04-01 00:20:30" "2019-04-01 00:15:19" "2019-04-01 00:18:58" ...
## $ rideable_type : chr "6251" "6226" "5649" "4151" ...
## $ start_station_id : int 81 317 283 26 202 420 503 260 211 211 ...
## $ start_station_name: chr "Daley Center Plaza" "Wood St & Taylor St" "LaSalle St & Jackson Blvd" "McClurg Ct & Illinois St" ...
## $ end_station_id : int 56 59 174 133 129 426 500 499 211 211 ...
## $ end_station_name : chr "Desplaines St & Kinzie St" "Wabash Ave & Roosevelt Rd" "Canal St & Madison St" "Kingsbury St & Kinzie St" ...
## $ member_casual : chr "Subscriber" "Subscriber" "Subscriber" "Subscriber" ...
summary(all_trips) # Statistical summary of data. Mainly for numerics
## ride_id started_at ended_at rideable_type
## Length:3879822 Length:3879822 Length:3879822 Length:3879822
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## start_station_id start_station_name end_station_id end_station_name
## Min. : 1.0 Length:3879822 Min. : 1.0 Length:3879822
## 1st Qu.: 77.0 Class :character 1st Qu.: 77.0 Class :character
## Median :174.0 Mode :character Median :174.0 Mode :character
## Mean :202.9 Mean :203.8
## 3rd Qu.:291.0 3rd Qu.:291.0
## Max. :675.0 Max. :675.0
## NA's :1
## member_casual
## Length:3879822
## Class :character
## Mode :character
##
##
##
##
There were a few problems I needed to fix:
In the “member_casual” column, there were two names for members (“member” and “Subscriber”) and two names for casual riders (“Customer” and “casual”). I needed to consolidate that from four to two labels.
The data could only be aggregated at the ride-level, which was too granular. I decided to add some additional columns of data – such as day, month, year – that provided additional opportunities to aggregate the data.
I also decided to add a calculated field for length of ride since the 2020Q1 data did not have the ‘tripduration’ column. I added “ride_length” to the entire dataframe for consistency.
There were some rides where ‘tripduration’ shows up as negative, including several hundred rides where Cyclistic took bikes out of circulation for Quality Control reasons. I deleted those rides.
Tackling the first Problem:
In the “member_casual” column, I replaced “Subscriber” with “member” and “Customer” with “casual”. Before 2020, Cyclistic used different labels for these two types of riders. I wanted to make my dataframe consistent with their current nomenclature.
I began by seeing how many observations fall under each usertype.
all_trips %>%
count(member_casual) %>%
arrange(-n)
## member_casual n
## 1 Subscriber 2595461
## 2 Customer 857474
## 3 member 378407
## 4 casual 48480
I reassigned to the desired values:
all_trips <- all_trips %>%
mutate(member_casual = case_match(
.x = member_casual,
"Subscriber" ~ "member",
"Customer" ~ "casual")
)
I checked to make sure the proper number of observations were reassigned:
all_trips %>%
count(member_casual) %>%
arrange(-n)
## member_casual n
## 1 member 2595461
## 2 casual 857474
## 3 <NA> 426887
Tackling the second problem:
I added columns that list the date, month, day, and year of each ride. This allowed me to aggregate ride data for each month, day, or year. Before completing these operations I could only aggregate at the ride level.
all_trips$date <- as.Date(x = all_trips$started_at)
all_trips$month <- format(as.Date(all_trips$date), "%m")
all_trips$day <- format(as.Date(all_trips$date), "%d")
all_trips$year <- format(as.Date(all_trips$date), "%Y")
all_trips$day_of_week <- format(as.Date(all_trips$date), "%A")
Tackling the third problem:
First I converted data type of ‘started_at column’ and ‘ended_at column’ from character to POSIXct:
all_trips$started_at <- as.POSIXct(all_trips$started_at, format = "%Y-%m-%d %H:%M:%S")
all_trips$ended_at <- as.POSIXct(all_trips$ended_at, format = "%Y-%m-%d %H:%M:%S")
Then I added a “ride_length” calculation to all_trips (in seconds):
all_trips$ride_length <- difftime(
all_trips$ended_at,
all_trips$started_at
)
I inspected the structure of the columns:
str(all_trips)
## 'data.frame': 3879822 obs. of 15 variables:
## $ ride_id : chr "22178529" "22178530" "22178531" "22178532" ...
## $ started_at : POSIXct, format: "2019-04-01 00:02:22" "2019-04-01 00:03:02" ...
## $ ended_at : POSIXct, format: "2019-04-01 00:09:48" "2019-04-01 00:20:30" ...
## $ rideable_type : chr "6251" "6226" "5649" "4151" ...
## $ start_station_id : int 81 317 283 26 202 420 503 260 211 211 ...
## $ start_station_name: chr "Daley Center Plaza" "Wood St & Taylor St" "LaSalle St & Jackson Blvd" "McClurg Ct & Illinois St" ...
## $ end_station_id : int 56 59 174 133 129 426 500 499 211 211 ...
## $ end_station_name : chr "Desplaines St & Kinzie St" "Wabash Ave & Roosevelt Rd" "Canal St & Madison St" "Kingsbury St & Kinzie St" ...
## $ member_casual : chr "member" "member" "member" "member" ...
## $ date : Date, format: "2019-04-01" "2019-04-01" ...
## $ month : chr "04" "04" "04" "04" ...
## $ day : chr "01" "01" "01" "01" ...
## $ year : chr "2019" "2019" "2019" "2019" ...
## $ day_of_week : chr "Monday" "Monday" "Monday" "Monday" ...
## $ ride_length : 'difftime' num 446 1048 252 357 ...
## ..- attr(*, "units")= chr "secs"
Tackle the fourth problem:
I removed ‘bad’ data. The dataframe included a few hundred entries when bikes were taken out of docks and checked for quality by Cyclistic or ride_length was negative.
I created a new version of the dataframe (v2) since data was being removed.
all_trips_v2 <- all_trips[!(all_trips$start_station_name == "HQ QR" | all_trips$ride_length < 0), ]
Here I provided some descriptive analysis on ride_length (all figures in seconds)
mean(all_trips_v2$ride_length, na.rm = TRUE) # straight average (total ride length / rides)
## Time difference of 1479.266 secs
median(all_trips_v2$ride_length, na.rm = TRUE) # midpoint number in the ascending array of ride lengths
## Time difference of 712 secs
max(all_trips_v2$ride_length, na.rm = TRUE) # longest ride
## Time difference of 9383424 secs
min(all_trips_v2$ride_length, na.rm = TRUE) # shortest ride
## Time difference of 1 secs
Here I compared members and casual users:
all_trips_v2 %>%
group_by(member_casual) %>%
summarise(mean_ride_length = mean(ride_length))
## # A tibble: 3 × 2
## member_casual mean_ride_length
## <chr> <drtn>
## 1 casual 3413.6001 secs
## 2 member 863.1709 secs
## 3 <NA> NA secs
all_trips_v2 %>%
group_by(member_casual) %>%
summarise(median_ride_length = median(ride_length))
## # A tibble: 3 × 2
## member_casual median_ride_length
## <chr> <drtn>
## 1 casual 1554 secs
## 2 member 601 secs
## 3 <NA> NA secs
all_trips_v2 %>%
group_by(member_casual) %>%
summarise(max_ride_length = max(ride_length))
## # A tibble: 3 × 2
## member_casual max_ride_length
## <chr> <drtn>
## 1 casual 8582302 secs
## 2 member 9060234 secs
## 3 <NA> NA secs
all_trips_v2 %>%
group_by(member_casual) %>%
summarise(min_ride_length = min(ride_length))
## # A tibble: 3 × 2
## member_casual min_ride_length
## <chr> <drtn>
## 1 casual 61 secs
## 2 member 61 secs
## 3 <NA> NA secs
I checked the average ride time by each day for members vs casual users
all_trips_v2 %>%
group_by(member_casual, day_of_week) %>%
summarise(mean = mean(ride_length))
## `summarise()` has grouped output by 'member_casual'. You can override using the
## `.groups` argument.
## # A tibble: 22 × 3
## # Groups: member_casual [3]
## member_casual day_of_week mean
## <chr> <chr> <drtn>
## 1 casual Friday 3611.5732 secs
## 2 casual Monday 3281.7303 secs
## 3 casual Saturday 3233.4910 secs
## 4 casual Sunday 3391.1785 secs
## 5 casual Thursday 3466.1390 secs
## 6 casual Tuesday 3477.3102 secs
## 7 casual Wednesday 3634.7524 secs
## 8 member Friday 833.9273 secs
## 9 member Monday 852.2588 secs
## 10 member Saturday 973.9411 secs
## # … with 12 more rows
I noticed that the days of the week were out of order. So I fixed that.
all_trips_v2$day_of_week <-
ordered(all_trips_v2$day_of_week, levels = c("Sunday", "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday"))
Again I run the average ride time by each day for members vs casual users to check if the problem had been addressed.
all_trips_v2 %>%
group_by(member_casual, day_of_week) %>%
summarise(mean = mean(ride_length))
## `summarise()` has grouped output by 'member_casual'. You can override using the
## `.groups` argument.
## # A tibble: 22 × 3
## # Groups: member_casual [3]
## member_casual day_of_week mean
## <chr> <ord> <drtn>
## 1 casual Sunday 3391.1785 secs
## 2 casual Monday 3281.7303 secs
## 3 casual Tuesday 3477.3102 secs
## 4 casual Wednesday 3634.7524 secs
## 5 casual Thursday 3466.1390 secs
## 6 casual Friday 3611.5732 secs
## 7 casual Saturday 3233.4910 secs
## 8 member Sunday 915.4380 secs
## 9 member Monday 852.2588 secs
## 10 member Tuesday 847.4632 secs
## # … with 12 more rows
I analyzed ridership data by type and weekday:
all_trips_v2 %>%
mutate(weekday = wday(started_at, label = TRUE)) %>% # creates weekday field using wday()
group_by(member_casual, weekday) %>% # groups by usertype and weekday
summarise(
number_of_rides = n(),
average_duration = mean(ride_length)
) %>% # calculates the number of rides and average duration
arrange(member_casual, weekday) # sorts
## `summarise()` has grouped output by 'member_casual'. You can override using the
## `.groups` argument.
## # A tibble: 22 × 4
## # Groups: member_casual [3]
## member_casual weekday number_of_rides average_duration
## <chr> <ord> <int> <drtn>
## 1 casual Sun 166407 3391.1785 secs
## 2 casual Mon 99597 3281.7303 secs
## 3 casual Tue 85927 3477.3102 secs
## 4 casual Wed 87256 3634.7524 secs
## 5 casual Thu 98452 3466.1390 secs
## 6 casual Fri 117766 3611.5732 secs
## 7 casual Sat 202063 3233.4910 secs
## 8 member Sun 232001 915.4380 secs
## 9 member Mon 410273 852.2588 secs
## 10 member Tue 438748 847.4632 secs
## # … with 12 more rows
I visualized the number of rides by rider type:
all_trips_v2 %>%
filter(!is.na(member_casual)) %>%
mutate(weekday = wday(started_at, label = TRUE)) %>%
group_by(member_casual, weekday) %>%
summarise(
number_of_rides = n(),
average_duration = mean(ride_length)
) %>%
arrange(member_casual, weekday) %>%
ggplot(mapping = aes(x = weekday, y = number_of_rides, fill = member_casual)) +
geom_col(position = "dodge") +
labs(title = 'Number of riders by weekday for member and casual riders', fill = '') +
xlab('Weekday') +
ylab('Number of rides') +
theme_bw()
all_trips_v2 %>%
filter(!is.na(member_casual)) %>%
mutate(weekday = wday(started_at, label = TRUE)) %>%
group_by(member_casual, weekday) %>%
summarise(
number_of_rides = n(),
average_duration = mean(ride_length)
) %>%
arrange(member_casual, weekday) %>%
ggplot(mapping = aes(x = weekday, y = number_of_rides, fill = member_casual)) +
geom_col(position = "fill") +
labs(title = 'Number of riders by weekday for member and casual riders', fill = '') +
xlab('Weekday') +
ylab('Number of rides') +
theme_bw()
I also created a visualization for average duration:
all_trips_v2 %>%
filter(!is.na(member_casual)) %>%
mutate(weekday = wday(started_at, label = TRUE)) %>%
group_by(member_casual, weekday) %>%
summarise(
number_of_rides = n(),
average_duration = mean(ride_length)
) %>%
arrange(member_casual, weekday) %>%
ggplot(aes(x = weekday, y = average_duration, fill = member_casual)) +
geom_col(position = "dodge") +
labs(title = 'Average ride duration by weekday for member and casual riders', fill = '') +
xlab('Weekday') +
ylab('Avergage duration') +
theme_bw()
all_trips_v2 %>%
filter(member_casual== 'member') %>%
ggplot(aes(ride_length)) +
geom_histogram(binwidth = 300) +
labs(title = 'Distribution of ride length for memebr riders') +
xlim(0, 5000) +
xlab('Ride length in seconds') +
ylab('Count') +
theme_bw()
all_trips_v2 %>%
filter(member_casual== 'casual') %>%
ggplot(aes(ride_length)) +
geom_histogram(binwidth = 300) +
labs(title = 'Distribution of ride length for casual riders') +
xlim(0, 5000) +
xlab('Ride length in seconds') +
ylab('Count') +
theme_bw()
all_trips_v2 %>%
filter(member_casual %in% c('member', 'casual')) %>%
ggplot(aes(ride_length, color = member_casual)) +
geom_freqpoly(binwidth = 300, linewidth = 1) +
labs(title = 'Distribution of ride length for member and casual riders') +
xlim(0, 5000) +
xlab('Ride length in seconds') +
ylab('Count') +
theme_bw()
Finally I create a csv file that I can visualize in Excel or Tableau for further analysis.
counts<- all_trips_v2 %>%
group_by(member_casual, day_of_week) %>%
summarise(mean_ride_length = mean(ride_length))
## `summarise()` has grouped output by 'member_casual'. You can override using the
## `.groups` argument.
counts
## # A tibble: 22 × 3
## # Groups: member_casual [3]
## member_casual day_of_week mean_ride_length
## <chr> <ord> <drtn>
## 1 casual Sunday 3391.1785 secs
## 2 casual Monday 3281.7303 secs
## 3 casual Tuesday 3477.3102 secs
## 4 casual Wednesday 3634.7524 secs
## 5 casual Thursday 3466.1390 secs
## 6 casual Friday 3611.5732 secs
## 7 casual Saturday 3233.4910 secs
## 8 member Sunday 915.4380 secs
## 9 member Monday 852.2588 secs
## 10 member Tuesday 847.4632 secs
## # … with 12 more rows
write.csv(x = counts, file = 'avg_ride_length.csv')