This analysis is based on the Divvy case study “‘Sophisticated, Clear, and Polished’: Divvy and Data Visualization” written by Kevin Hartman (found here: https://artscience.blog/home/divvy-dataviz-case-study). The purpose of this script is to consolidate downloaded Divvy data into a single dataframe and then conduct simple analysis to help answer the key question: “In what ways do members and casual riders use Divvy bikes differently?”
install.packages("tidyverse")
install.packages("lubridate")
install.packages("ggplot2")
library('tidyverse')
library('lubridate')
library(ggplot2)
getwd()
[1] "C:/Users/nicol/OneDrive/Documentos/Analisis de Datos de Google/Caso estidio1/Divvy_Exercise_Full_Year_Analysis/divvy_exercise_full_year_analysis"
setwd("C:/Users/nicol/OneDrive/Documentos/Analisis de Datos de Google/Caso estidio1/Divvy_Exercise_Full_Year_Analysis/divvy_exercise_full_year_analysis")
Upload Divvy datasets (csv files) here
q2_2019 <- read.csv("Data/Divvy_Trips_2019_Q2.csv")
q3_2019 <- read.csv("Data/Divvy_Trips_2019_Q3.csv")
q4_2019 <- read.csv("Data/Divvy_Trips_2019_Q4.csv")
q1_2020 <- read.csv("Data/Divvy_Trips_2020_Q1.csv")
While the names don’t have to be in the same order, they DO need to match perfectly before we can use a command to join them into one file
colnames(q3_2019)
[1] "trip_id" "start_time" "end_time" "bikeid"
[5] "tripduration" "from_station_id" "from_station_name" "to_station_id"
[9] "to_station_name" "usertype" "gender" "birthyear"
colnames(q4_2019)
[1] "trip_id" "start_time" "end_time" "bikeid"
[5] "tripduration" "from_station_id" "from_station_name" "to_station_id"
[9] "to_station_name" "usertype" "gender" "birthyear"
colnames(q2_2019)
[1] "X01...Rental.Details.Rental.ID"
[2] "X01...Rental.Details.Local.Start.Time"
[3] "X01...Rental.Details.Local.End.Time"
[4] "X01...Rental.Details.Bike.ID"
[5] "X01...Rental.Details.Duration.In.Seconds.Uncapped"
[6] "X03...Rental.Start.Station.ID"
[7] "X03...Rental.Start.Station.Name"
[8] "X02...Rental.End.Station.ID"
[9] "X02...Rental.End.Station.Name"
[10] "User.Type"
[11] "Member.Gender"
[12] "X05...Member.Details.Member.Birthday.Year"
colnames(q1_2020)
[1] "ride_id" "rideable_type" "started_at" "ended_at"
[5] "start_station_name" "start_station_id" "end_station_name" "end_station_id"
[9] "start_lat" "start_lng" "end_lat" "end_lng"
[13] "member_casual"
q4_2019 <- rename(q4_2019,ride_id = trip_id, rideable_type = bikeid, started_at = start_time, ended_at = end_time, start_station_name = from_station_name, start_station_id = from_station_id, end_station_name = to_station_name, end_station_id = to_station_id, menber_casual = usertype)
q3_2019 <- rename(q3_2019, ride_id = trip_id, rideable_type = bikeid, started_at = start_time, ended_at = end_time, start_station_name = from_station_name, start_station_id = from_station_id, end_station_name = to_station_name, end_station_id = to_station_id, member_casual = usertype)
q2_2019 <- rename(q2_2019, ride_id = "X01...Rental.Details.Rental.ID", rideable_type = "X01...Rental.Details.Bike.ID", started_at = "X01...Rental.Details.Local.Start.Time", ended_at = "X01...Rental.Details.Local.End.Time", start_station_name = "X03...Rental.Start.Station.Name", start_station_id = "X03...Rental.Start.Station.ID", end_station_name = "X02...Rental.End.Station.Name", end_station_id = "X02...Rental.End.Station.ID", member_casual = "User.Type")
str(q1_2020)
'data.frame': 426887 obs. of 13 variables:
$ ride_id : chr "EACB19130B0CDA4A" "8FED874C809DC021" "789F3C21E472CA96" "C9A388DAC6ABF313" ...
$ rideable_type : chr "docked_bike" "docked_bike" "docked_bike" "docked_bike" ...
$ started_at : chr "2020-01-21 20:06:59" "2020-01-30 14:22:39" "2020-01-09 19:29:26" "2020-01-06 16:17:07" ...
$ ended_at : chr "2020-01-21 20:14:30" "2020-01-30 14:26:22" "2020-01-09 19:32:17" "2020-01-06 16:25:56" ...
$ start_station_name: chr "Western Ave & Leland Ave" "Clark St & Montrose Ave" "Broadway & Belmont Ave" "Clark St & Randolph St" ...
$ start_station_id : int 239 234 296 51 66 212 96 96 212 38 ...
$ end_station_name : chr "Clark St & Leland Ave" "Southport Ave & Irving Park Rd" "Wilton Ave & Belmont Ave" "Fairbanks Ct & Grand Ave" ...
$ end_station_id : int 326 318 117 24 212 96 212 212 96 100 ...
$ start_lat : num 42 42 41.9 41.9 41.9 ...
$ start_lng : num -87.7 -87.7 -87.6 -87.6 -87.6 ...
$ end_lat : num 42 42 41.9 41.9 41.9 ...
$ end_lng : num -87.7 -87.7 -87.7 -87.6 -87.6 ...
$ member_casual : chr "member" "member" "member" "member" ...
str(q4_2019)
'data.frame': 704054 obs. of 12 variables:
$ ride_id : int 25223640 25223641 25223642 25223643 25223644 25223645 25223646 25223647 25223648 25223649 ...
$ started_at : chr "2019-10-01 00:01:39" "2019-10-01 00:02:16" "2019-10-01 00:04:32" "2019-10-01 00:04:32" ...
$ ended_at : chr "2019-10-01 00:17:20" "2019-10-01 00:06:34" "2019-10-01 00:18:43" "2019-10-01 00:43:43" ...
$ rideable_type : int 2215 6328 3003 3275 5294 1891 1061 1274 6011 2957 ...
$ tripduration : chr "940.0" "258.0" "850.0" "2,350.0" ...
$ start_station_id : int 20 19 84 313 210 156 84 156 156 336 ...
$ start_station_name: chr "Sheffield Ave & Kingsbury St" "Throop (Loomis) St & Taylor St" "Milwaukee Ave & Grand Ave" "Lakeview Ave & Fullerton Pkwy" ...
$ end_station_id : int 309 241 199 290 382 226 142 463 463 336 ...
$ end_station_name : chr "Leavitt St & Armitage Ave" "Morgan St & Polk St" "Wabash Ave & Grand Ave" "Kedzie Ave & Palmer Ct" ...
$ menber_casual : chr "Subscriber" "Subscriber" "Subscriber" "Subscriber" ...
$ gender : chr "Male" "Male" "Female" "Male" ...
$ birthyear : int 1987 1998 1991 1990 1987 1994 1991 1995 1993 NA ...
str(q3_2019)
'data.frame': 1640718 obs. of 12 variables:
$ ride_id : int 23479388 23479389 23479390 23479391 23479392 23479393 23479394 23479395 23479396 23479397 ...
$ started_at : chr "2019-07-01 00:00:27" "2019-07-01 00:01:16" "2019-07-01 00:01:48" "2019-07-01 00:02:07" ...
$ ended_at : chr "2019-07-01 00:20:41" "2019-07-01 00:18:44" "2019-07-01 00:27:42" "2019-07-01 00:27:10" ...
$ rideable_type : int 3591 5353 6180 5540 6014 4941 3770 5442 2957 6091 ...
$ tripduration : chr "1,214.0" "1,048.0" "1,554.0" "1,503.0" ...
$ start_station_id : int 117 381 313 313 168 300 168 313 43 43 ...
$ start_station_name: chr "Wilton Ave & Belmont Ave" "Western Ave & Monroe St" "Lakeview Ave & Fullerton Pkwy" "Lakeview Ave & Fullerton Pkwy" ...
$ end_station_id : int 497 203 144 144 62 232 62 144 195 195 ...
$ end_station_name : chr "Kimball Ave & Belmont Ave" "Western Ave & 21st St" "Larrabee St & Webster Ave" "Larrabee St & Webster Ave" ...
$ member_casual : chr "Subscriber" "Customer" "Customer" "Customer" ...
$ gender : chr "Male" "" "" "" ...
$ birthyear : int 1992 NA NA NA NA 1990 NA NA NA NA ...
str(q2_2019)
'data.frame': 1108163 obs. of 12 variables:
$ ride_id : int 22178529 22178530 22178531 22178532 22178533 22178534 22178535 22178536 22178537 22178538 ...
$ started_at : chr "2019-04-01 00:02:22" "2019-04-01 00:03:02" "2019-04-01 00:11:07" "2019-04-01 00:13:01" ...
$ ended_at : chr "2019-04-01 00:09:48" "2019-04-01 00:20:30" "2019-04-01 00:15:19" "2019-04-01 00:18:58" ...
$ rideable_type : int 6251 6226 5649 4151 3270 3123 6418 4513 3280 5534 ...
$ X01...Rental.Details.Duration.In.Seconds.Uncapped: chr "446.0" "1,048.0" "252.0" "357.0" ...
$ start_station_id : int 81 317 283 26 202 420 503 260 211 211 ...
$ start_station_name : chr "Daley Center Plaza" "Wood St & Taylor St" "LaSalle St & Jackson Blvd" "McClurg Ct & Illinois St" ...
$ end_station_id : int 56 59 174 133 129 426 500 499 211 211 ...
$ end_station_name : chr "Desplaines St & Kinzie St" "Wabash Ave & Roosevelt Rd" "Canal St & Madison St" "Kingsbury St & Kinzie St" ...
$ member_casual : chr "Subscriber" "Subscriber" "Subscriber" "Subscriber" ...
$ Member.Gender : chr "Male" "Female" "Male" "Male" ...
$ X05...Member.Details.Member.Birthday.Year : int 1975 1984 1990 1993 1992 1999 1969 1991 NA NA ...
q4_2019 <- mutate(q4_2019, ride_id = as.character(ride_id), rideable_type = as.character(rideable_type))
q3_2019 <- mutate(q3_2019, ride_id = as.character(ride_id), rideable_type = as.character(rideable_type))
q2_2019 <- mutate(q2_2019, ride_id = as.character(ride_id), rideable_type = as.character(rideable_type))
all_trips <- bind_rows(q2_2019, q3_2019, q4_2019, q1_2020)
all_trips <- all_trips %>%
select(-c(start_lat, start_lng, end_lat, end_lng, birthyear, gender, "X01...Rental.Details.Duration.In.Seconds.Uncapped", "X05...Member.Details.Member.Birthday.Year", "Member.Gender", "tripduration"))
colnames(all_trips)
[1] "ride_id" "started_at" "ended_at" "rideable_type"
[5] "start_station_id" "start_station_name" "end_station_id" "end_station_name"
[9] "member_casual" "menber_casual" "date" "month"
[13] "day" "year" "day_of_week" "ride_length"
nrow(all_trips)
[1] 3879822
dim(all_trips)
[1] 3879822 16
head(all_trips)
str(all_trips)
'data.frame': 3879822 obs. of 16 variables:
$ ride_id : chr "22178529" "22178530" "22178531" "22178532" ...
$ started_at : chr "2019-04-01 00:02:22" "2019-04-01 00:03:02" "2019-04-01 00:11:07" "2019-04-01 00:13:01" ...
$ ended_at : chr "2019-04-01 00:09:48" "2019-04-01 00:20:30" "2019-04-01 00:15:19" "2019-04-01 00:18:58" ...
$ rideable_type : chr "6251" "6226" "5649" "4151" ...
$ start_station_id : int 81 317 283 26 202 420 503 260 211 211 ...
$ start_station_name: chr "Daley Center Plaza" "Wood St & Taylor St" "LaSalle St & Jackson Blvd" "McClurg Ct & Illinois St" ...
$ end_station_id : int 56 59 174 133 129 426 500 499 211 211 ...
$ end_station_name : chr "Desplaines St & Kinzie St" "Wabash Ave & Roosevelt Rd" "Canal St & Madison St" "Kingsbury St & Kinzie St" ...
$ member_casual : chr "member" "member" "member" "member" ...
$ menber_casual : chr NA NA NA NA ...
$ date : Date, format: "2019-04-01" "2019-04-01" "2019-04-01" ...
$ month : chr "04" "04" "04" "04" ...
$ day : chr "01" "01" "01" "01" ...
$ year : chr "2019" "2019" "2019" "2019" ...
$ day_of_week : chr "Monday" "Monday" "Monday" "Monday" ...
$ ride_length : num 446 1048 252 357 1007 ...
summary(all_trips)
_ (3) We will want to add a calculated field for length of ride since the 2020Q1 data did not have the “tripduration” column. We will add “ride_length” to the entire dataframe for consistency.
Before 2020, Divvy used different labels for these two types of riders … we will want to make our dataframe consistent with their current nomenclature
N.B.: “Level” is a special property of a column that is retained even if a subset does not contain any values from a specific level
Begin by seeing how many observations fall under each usertype
summary(all_trips)
ride_id started_at ended_at rideable_type start_station_id
Length:3879822 Length:3879822 Length:3879822 Length:3879822 Min. : 1.0
Class :character Class :character Class :character Class :character 1st Qu.: 77.0
Mode :character Mode :character Mode :character Mode :character Median :174.0
Mean :202.9
3rd Qu.:291.0
Max. :675.0
start_station_name end_station_id end_station_name member_casual menber_casual
Length:3879822 Min. : 1.0 Length:3879822 Length:3879822 Length:3879822
Class :character 1st Qu.: 77.0 Class :character Class :character Class :character
Mode :character Median :174.0 Mode :character Mode :character Mode :character
Mean :203.8
3rd Qu.:291.0
Max. :675.0
NA's :1
date month day year
Min. :2019-04-01 Length:3879822 Length:3879822 Length:3879822
1st Qu.:2019-06-23 Class :character Class :character Class :character
Median :2019-08-14 Mode :character Mode :character Mode :character
Mean :2019-08-25
3rd Qu.:2019-10-12
Max. :2020-03-31
day_of_week ride_length
Length:3879822 Min. : -6982
Class :character 1st Qu.: 411
Mode :character Median : 711
Mean : 1478
3rd Qu.: 1288
Max. :9383424
table(all_trips$member_casual)
casual member
799760 2376008
all_trips <- all_trips %>%
mutate(member_casual = recode(member_casual, "Subscriber" = "member", "Customer" = "casual"))
This will allow us to aggregate ride data for each month, day, or year … before completing these operations we could only aggregate at the ride level
all_trips$date <- as.Date(all_trips$started_at)
table(all_trips$member_casual)
casual member
799760 2376008
all_trips$month <- format(as.Date(all_trips$date),"%m")
all_trips$day <- format(as.Date(all_trips$date), "%d")
all_trips$year <- format(as.Date(all_trips$date), "%Y")
all_trips$day_of_week <- format(as.Date(all_trips$date), "%A")
all_trips$ride_length <- difftime(all_trips$ended_at, all_trips$started_at)
str(all_trips)
'data.frame': 3879822 obs. of 16 variables:
$ ride_id : chr "22178529" "22178530" "22178531" "22178532" ...
$ started_at : chr "2019-04-01 00:02:22" "2019-04-01 00:03:02" "2019-04-01 00:11:07" "2019-04-01 00:13:01" ...
$ ended_at : chr "2019-04-01 00:09:48" "2019-04-01 00:20:30" "2019-04-01 00:15:19" "2019-04-01 00:18:58" ...
$ rideable_type : chr "6251" "6226" "5649" "4151" ...
$ start_station_id : int 81 317 283 26 202 420 503 260 211 211 ...
$ start_station_name: chr "Daley Center Plaza" "Wood St & Taylor St" "LaSalle St & Jackson Blvd" "McClurg Ct & Illinois St" ...
$ end_station_id : int 56 59 174 133 129 426 500 499 211 211 ...
$ end_station_name : chr "Desplaines St & Kinzie St" "Wabash Ave & Roosevelt Rd" "Canal St & Madison St" "Kingsbury St & Kinzie St" ...
$ member_casual : chr "member" "member" "member" "member" ...
$ menber_casual : chr NA NA NA NA ...
$ date : Date, format: "2019-04-01" "2019-04-01" "2019-04-01" ...
$ month : chr "04" "04" "04" "04" ...
$ day : chr "01" "01" "01" "01" ...
$ year : chr "2019" "2019" "2019" "2019" ...
$ day_of_week : chr "Monday" "Monday" "Monday" "Monday" ...
$ ride_length : 'difftime' num 446 1048 252 357 ...
..- attr(*, "units")= chr "secs"
is.factor(all_trips$ride_length)
[1] FALSE
all_trips$ride_length <- as.numeric(as.character(all_trips$ride_length))
The dataframe includes a few hundred entries when bikes were taken out of docks and checked for quality by Divvy or ride_length was negative ## We will create a new version of the dataframe (v2) since data is being removed
is.numeric(all_trips$ride_length)
[1] TRUE
mean(all_trips_v2$ride_length)
[1] 1479.159
median(all_trips_v2$ride_length)
[1] 712
max(all_trips_v2$ride_length)
[1] 9383424
min(all_trips_v2$ride_length)
[1] 1
all_trips_v2 <- all_trips[!(all_trips$start_station_name == "HQ QR" | all_trips$ride_length<0),]
summary(all_trips_v2$ride_length)
Min. 1st Qu. Median Mean 3rd Qu. Max.
1 412 712 1479 1289 9383424
aggregate(all_trips_v2$ride_length ~ all_trips_v2$member_casual, FUN = mean)
aggregate(all_trips_v2$ride_length ~ all_trips_v2$member_casual, FUN = median)
aggregate(all_trips_v2$ride_length ~ all_trips_v2$member_casual, FUN = max)
aggregate(all_trips_v2$ride_length ~ all_trips_v2$member_casual, FUN = min)
aggregate(all_trips_v2$ride_length ~ all_trips_v2$member_casual + all_trips_v2$day_of_week, FUN = mean)
all_trips_v2$day_of_week <- ordered(all_trips_v2$day_of_week, levels=c("Sunday", "Monday", "Tuesday", "Thursday", "Friday", "Saturday"))
all_trips_v2 %>%
mutate(weekday = wday(started_at, label = TRUE)) %>%
group_by(member_casual, weekday) %>%
summarise(number_of_rides = n(), average_duration = mean(ride_length))%>%
arrange(member_casual, weekday)
`summarise()` has grouped output by 'member_casual'. You can override using the `.groups` argument.
aggregate(all_trips_v2$ride_length ~ all_trips_v2$member_casual + all_trips_v2$day_of_week, FUN = mean)
all_trips_v2 %>%
mutate( weekday = wday(started_at, label = TRUE)) %>%
group_by(member_casual, weekday) %>%
summarise(number_of_rides = n(), average_duration = mean(ride_length)) %>%
arrange(member_casual, weekday) %>%
ggplot(aes(x = weekday, y = number_of_rides, fill = member_casual)) +
geom_col(position = "dodge")
`summarise()` has grouped output by 'member_casual'. You can override using the `.groups` argument.
all_trips_v2 %>%
mutate( weekday = wday(started_at, label = TRUE)) %>%
group_by(member_casual, weekday) %>%
summarise(number_of_rides = n(), average_duration = mean(ride_length)) %>%
arrange(member_casual, weekday) %>%
ggplot(aes(x = weekday, y =average_duration, fill = member_casual)) +
geom_col(position = "dodge")
`summarise()` has grouped output by 'member_casual'. You can override using the `.groups` argument.
counts <- aggregate(all_trips_v2$ride_length ~ all_trips_v2$member_casual + all_trips_v2$day_of_week, FUN = mean)