# Case Study 1
#setting the working directory
setwd("~/Desktop/GA Capstone Project/Project 2_ Aug2024/RAW")
#Load Packages
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library(lubridate)
library(tibble)
#Import CSV
Jul_2024 <- read.csv("202407-divvy-tripdata.csv")
Jun_2024 <- read.csv("202406-divvy-tripdata.csv")
May_2024 <- read.csv("202405-divvy-tripdata.csv")
Apr_2024 <- read.csv("202404-divvy-tripdata.csv")
Mar_2024 <- read.csv("202403-divvy-tripdata.csv")
Feb_2024 <- read.csv("202402-divvy-tripdata.csv")
Jan_2024 <- read.csv("202401-divvy-tripdata.csv")
Dec_2023 <- read.csv("202312-divvy-tripdata.csv")
Nov_2023 <- read.csv("202311-divvy-tripdata.csv")
Oct_2023 <- read.csv("202310-divvy-tripdata.csv")
Sep_2023 <- read.csv("202309-divvy-tripdata.csv")
Aug_2023 <- read.csv("202308-divvy-tripdata.csv")
# Compare column names each of the files
colnames(Jul_2024)
## [1] "ride_id" "rideable_type" "started_at"
## [4] "ended_at" "start_station_name" "start_station_id"
## [7] "end_station_name" "end_station_id" "start_lat"
## [10] "start_lng" "end_lat" "end_lng"
## [13] "member_casual"
colnames(Jun_2024)
## [1] "ride_id" "rideable_type" "started_at"
## [4] "ended_at" "start_station_name" "start_station_id"
## [7] "end_station_name" "end_station_id" "start_lat"
## [10] "start_lng" "end_lat" "end_lng"
## [13] "member_casual"
colnames(May_2024)
## [1] "ride_id" "rideable_type" "started_at"
## [4] "ended_at" "start_station_name" "start_station_id"
## [7] "end_station_name" "end_station_id" "start_lat"
## [10] "start_lng" "end_lat" "end_lng"
## [13] "member_casual"
colnames(Apr_2024)
## [1] "ride_id" "rideable_type" "started_at"
## [4] "ended_at" "start_station_name" "start_station_id"
## [7] "end_station_name" "end_station_id" "start_lat"
## [10] "start_lng" "end_lat" "end_lng"
## [13] "member_casual"
colnames(Mar_2024)
## [1] "ride_id" "rideable_type" "started_at"
## [4] "ended_at" "start_station_name" "start_station_id"
## [7] "end_station_name" "end_station_id" "start_lat"
## [10] "start_lng" "end_lat" "end_lng"
## [13] "member_casual"
colnames(Feb_2024)
## [1] "ride_id" "rideable_type" "started_at"
## [4] "ended_at" "start_station_name" "start_station_id"
## [7] "end_station_name" "end_station_id" "start_lat"
## [10] "start_lng" "end_lat" "end_lng"
## [13] "member_casual"
colnames(Jan_2024)
## [1] "ride_id" "rideable_type" "started_at"
## [4] "ended_at" "start_station_name" "start_station_id"
## [7] "end_station_name" "end_station_id" "start_lat"
## [10] "start_lng" "end_lat" "end_lng"
## [13] "member_casual"
colnames(Dec_2023)
## [1] "ride_id" "rideable_type" "started_at"
## [4] "ended_at" "start_station_name" "start_station_id"
## [7] "end_station_name" "end_station_id" "start_lat"
## [10] "start_lng" "end_lat" "end_lng"
## [13] "member_casual"
colnames(Nov_2023)
## [1] "ride_id" "rideable_type" "started_at"
## [4] "ended_at" "start_station_name" "start_station_id"
## [7] "end_station_name" "end_station_id" "start_lat"
## [10] "start_lng" "end_lat" "end_lng"
## [13] "member_casual"
colnames(Oct_2023)
## [1] "ride_id" "rideable_type" "started_at"
## [4] "ended_at" "start_station_name" "start_station_id"
## [7] "end_station_name" "end_station_id" "start_lat"
## [10] "start_lng" "end_lat" "end_lng"
## [13] "member_casual"
colnames(Sep_2023)
## [1] "ride_id" "rideable_type" "started_at"
## [4] "ended_at" "start_station_name" "start_station_id"
## [7] "end_station_name" "end_station_id" "start_lat"
## [10] "start_lng" "end_lat" "end_lng"
## [13] "member_casual"
colnames(Aug_2023)
## [1] "ride_id" "rideable_type" "started_at"
## [4] "ended_at" "start_station_name" "start_station_id"
## [7] "end_station_name" "end_station_id" "start_lat"
## [10] "start_lng" "end_lat" "end_lng"
## [13] "member_casual"
str(Jul_2024)
## 'data.frame': 748962 obs. of 13 variables:
## $ ride_id : chr "2658E319B13141F9" "B2176315168A47CE" "C2A9D33DF7EBB422" "8BFEA406DF01D8AD" ...
## $ rideable_type : chr "electric_bike" "electric_bike" "electric_bike" "electric_bike" ...
## $ started_at : chr "2024-07-11 08:15:14.784" "2024-07-11 15:45:07.851" "2024-07-11 08:24:48.192" "2024-07-11 08:46:06.864" ...
## $ ended_at : chr "2024-07-11 08:17:56.335" "2024-07-11 16:06:04.243" "2024-07-11 08:28:05.237" "2024-07-11 09:14:11.664" ...
## $ start_station_name: chr "" "" "" "" ...
## $ start_station_id : chr "" "" "" "" ...
## $ end_station_name : chr "" "" "" "" ...
## $ end_station_id : chr "" "" "" "" ...
## $ start_lat : num 41.8 41.8 41.8 41.9 42 ...
## $ start_lng : num -87.6 -87.6 -87.6 -87.6 -87.6 ...
## $ end_lat : num 41.8 41.8 41.8 41.9 41.9 ...
## $ end_lng : num -87.6 -87.6 -87.6 -87.7 -87.6 ...
## $ member_casual : chr "casual" "casual" "casual" "casual" ...
# Merge
all_trips <- bind_rows(Jul_2024, Jun_2024, May_2024, Apr_2024, Mar_2024, Feb_2024, Jan_2024,
Dec_2023, Nov_2023, Oct_2023, Sep_2023, Aug_2023)
# Inspect the new table that has been created
colnames(all_trips) #List of column names
## [1] "ride_id" "rideable_type" "started_at"
## [4] "ended_at" "start_station_name" "start_station_id"
## [7] "end_station_name" "end_station_id" "start_lat"
## [10] "start_lng" "end_lat" "end_lng"
## [13] "member_casual"
nrow(all_trips) #How many rows are in data frame?
## [1] 5715693
dim(all_trips) #Dimensions of the data frame?
## [1] 5715693 13
head(all_trips) #See the first 6 rows of data frame. Also tail(all_trips)
## ride_id rideable_type started_at
## 1 2658E319B13141F9 electric_bike 2024-07-11 08:15:14.784
## 2 B2176315168A47CE electric_bike 2024-07-11 15:45:07.851
## 3 C2A9D33DF7EBB422 electric_bike 2024-07-11 08:24:48.192
## 4 8BFEA406DF01D8AD electric_bike 2024-07-11 08:46:06.864
## 5 ECD3EF02E5EB73B6 electric_bike 2024-07-11 18:18:16.588
## 6 A3C62391BBBAC107 electric_bike 2024-07-11 16:03:59.708
## ended_at start_station_name start_station_id end_station_name
## 1 2024-07-11 08:17:56.335
## 2 2024-07-11 16:06:04.243
## 3 2024-07-11 08:28:05.237
## 4 2024-07-11 09:14:11.664
## 5 2024-07-11 18:30:20.288
## 6 2024-07-11 16:32:38.635
## end_station_id start_lat start_lng end_lat end_lng member_casual
## 1 41.80 -87.59 41.79 -87.59 casual
## 2 41.79 -87.60 41.80 -87.59 casual
## 3 41.79 -87.59 41.79 -87.60 casual
## 4 41.88 -87.64 41.90 -87.67 casual
## 5 41.95 -87.64 41.91 -87.62 casual
## 6 41.70 -87.61 41.70 -87.61 casual
str(all_trips) #See list of columns and data types (numeric, character, etc)
## 'data.frame': 5715693 obs. of 13 variables:
## $ ride_id : chr "2658E319B13141F9" "B2176315168A47CE" "C2A9D33DF7EBB422" "8BFEA406DF01D8AD" ...
## $ rideable_type : chr "electric_bike" "electric_bike" "electric_bike" "electric_bike" ...
## $ started_at : chr "2024-07-11 08:15:14.784" "2024-07-11 15:45:07.851" "2024-07-11 08:24:48.192" "2024-07-11 08:46:06.864" ...
## $ ended_at : chr "2024-07-11 08:17:56.335" "2024-07-11 16:06:04.243" "2024-07-11 08:28:05.237" "2024-07-11 09:14:11.664" ...
## $ start_station_name: chr "" "" "" "" ...
## $ start_station_id : chr "" "" "" "" ...
## $ end_station_name : chr "" "" "" "" ...
## $ end_station_id : chr "" "" "" "" ...
## $ start_lat : num 41.8 41.8 41.8 41.9 42 ...
## $ start_lng : num -87.6 -87.6 -87.6 -87.6 -87.6 ...
## $ end_lat : num 41.8 41.8 41.8 41.9 41.9 ...
## $ end_lng : num -87.6 -87.6 -87.6 -87.7 -87.6 ...
## $ member_casual : chr "casual" "casual" "casual" "casual" ...
summary(all_trips) #Statistical summary of data. Mainly for numerics
## ride_id rideable_type started_at ended_at
## Length:5715693 Length:5715693 Length:5715693 Length:5715693
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## start_station_name start_station_id end_station_name end_station_id
## Length:5715693 Length:5715693 Length:5715693 Length:5715693
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## start_lat start_lng end_lat end_lng
## Min. :41.63 Min. :-87.94 Min. : 0.00 Min. :-144.05
## 1st Qu.:41.88 1st Qu.:-87.66 1st Qu.:41.88 1st Qu.: -87.66
## Median :41.90 Median :-87.64 Median :41.90 Median : -87.64
## Mean :41.90 Mean :-87.65 Mean :41.90 Mean : -87.65
## 3rd Qu.:41.93 3rd Qu.:-87.63 3rd Qu.:41.93 3rd Qu.: -87.63
## Max. :42.07 Max. :-87.46 Max. :87.96 Max. : 0.00
## NA's :7756 NA's :7756
## member_casual
## Length:5715693
## Class :character
## Mode :character
##
##
##
##
all_trips$date <- as.Date(all_trips$started_at) #The default format is yyyy-mm-dd
all_trips$month <- format(as.Date(all_trips$date), "%m")
all_trips$day <- format(as.Date(all_trips$date), "%d")
all_trips$year <- format(as.Date(all_trips$date), "%Y")
all_trips$day_of_week <- format(as.Date(all_trips$date), "%A")
# Add a "ride_length" calculation to all_trips (in seconds)
all_trips$ride_length <- difftime(all_trips$ended_at,all_trips$started_at)
str(all_trips)
## 'data.frame': 5715693 obs. of 19 variables:
## $ ride_id : chr "2658E319B13141F9" "B2176315168A47CE" "C2A9D33DF7EBB422" "8BFEA406DF01D8AD" ...
## $ rideable_type : chr "electric_bike" "electric_bike" "electric_bike" "electric_bike" ...
## $ started_at : chr "2024-07-11 08:15:14.784" "2024-07-11 15:45:07.851" "2024-07-11 08:24:48.192" "2024-07-11 08:46:06.864" ...
## $ ended_at : chr "2024-07-11 08:17:56.335" "2024-07-11 16:06:04.243" "2024-07-11 08:28:05.237" "2024-07-11 09:14:11.664" ...
## $ start_station_name: chr "" "" "" "" ...
## $ start_station_id : chr "" "" "" "" ...
## $ end_station_name : chr "" "" "" "" ...
## $ end_station_id : chr "" "" "" "" ...
## $ start_lat : num 41.8 41.8 41.8 41.9 42 ...
## $ start_lng : num -87.6 -87.6 -87.6 -87.6 -87.6 ...
## $ end_lat : num 41.8 41.8 41.8 41.9 41.9 ...
## $ end_lng : num -87.6 -87.6 -87.6 -87.7 -87.6 ...
## $ member_casual : chr "casual" "casual" "casual" "casual" ...
## $ date : Date, format: "2024-07-11" "2024-07-11" ...
## $ month : chr "07" "07" "07" "07" ...
## $ day : chr "11" "11" "11" "11" ...
## $ year : chr "2024" "2024" "2024" "2024" ...
## $ day_of_week : chr "Thursday" "Thursday" "Thursday" "Thursday" ...
## $ ride_length : 'difftime' num 161.551000118256 1256.39199995995 197.045000076294 1684.79999995232 ...
## ..- attr(*, "units")= chr "secs"
#quality check with results in excel
check <- mean(all_trips$ride_length[all_trips$month == "07"])
print(check/60)
## Time difference of 19.54278 secs
# New version without un usable data (ride_length<10sec, faulty bike put back, inspection, problem with application, etc)
all_trips_v2 <- all_trips[!(all_trips$ride_length<10),]
summary(all_trips_v2$ride_length)
## Length Class Mode
## 5686857 difftime numeric
# Compare members and casual users
aggregate(all_trips_v2$ride_length ~ all_trips_v2$member_casual, FUN = mean)
## all_trips_v2$member_casual all_trips_v2$ride_length
## 1 casual 1637.7863 secs
## 2 member 783.8447 secs
aggregate(all_trips_v2$ride_length ~ all_trips_v2$member_casual, FUN = median)
## all_trips_v2$member_casual all_trips_v2$ride_length
## 1 casual 735.66 secs
## 2 member 526.00 secs
aggregate(all_trips_v2$ride_length ~ all_trips_v2$member_casual, FUN = max)
## all_trips_v2$member_casual all_trips_v2$ride_length
## 1 casual 5909344 secs
## 2 member 93588 secs
aggregate(all_trips_v2$ride_length ~ all_trips_v2$member_casual, FUN = min)
## all_trips_v2$member_casual all_trips_v2$ride_length
## 1 casual 10 secs
## 2 member 10 secs
# The average ride time by each day for members vs casual users
aggregate(all_trips_v2$ride_length ~ all_trips_v2$member_casual + all_trips_v2$day_of_week, FUN = mean)
## all_trips_v2$member_casual all_trips_v2$day_of_week all_trips_v2$ride_length
## 1 casual Friday 1627.6873 secs
## 2 member Friday 770.2110 secs
## 3 casual Monday 1583.2426 secs
## 4 member Monday 746.1417 secs
## 5 casual Saturday 1830.5872 secs
## 6 member Saturday 864.6809 secs
## 7 casual Sunday 1924.1781 secs
## 8 member Sunday 875.2154 secs
## 9 casual Thursday 1453.2443 secs
## 10 member Thursday 751.3384 secs
## 11 casual Tuesday 1413.1502 secs
## 12 member Tuesday 752.9224 secs
## 13 casual Wednesday 1403.4551 secs
## 14 member Wednesday 763.4795 secs
# The days of the week are out of order. Let's fix that.
all_trips_v2$day_of_week <- ordered(all_trips_v2$day_of_week, levels=c("Sunday", "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday"))
# Run the average ride time by each day for members vs casual users
aggregate(all_trips_v2$ride_length ~ all_trips_v2$member_casual + all_trips_v2$day_of_week, FUN = mean)
## all_trips_v2$member_casual all_trips_v2$day_of_week all_trips_v2$ride_length
## 1 casual Sunday 1924.1781 secs
## 2 member Sunday 875.2154 secs
## 3 casual Monday 1583.2426 secs
## 4 member Monday 746.1417 secs
## 5 casual Tuesday 1413.1502 secs
## 6 member Tuesday 752.9224 secs
## 7 casual Wednesday 1403.4551 secs
## 8 member Wednesday 763.4795 secs
## 9 casual Thursday 1453.2443 secs
## 10 member Thursday 751.3384 secs
## 11 casual Friday 1627.6873 secs
## 12 member Friday 770.2110 secs
## 13 casual Saturday 1830.5872 secs
## 14 member Saturday 864.6809 secs
# analyze ridership data by type and weekday
all_trips_v2 %>%
mutate(weekday = wday(started_at, label = TRUE)) %>% #creates weekday field using wday()
group_by(member_casual, weekday) %>% #groups by usertype and weekday
summarise(number_of_rides = n() #calculates the number of rides and average duration
,average_duration = mean(ride_length)) %>% # calculates the average duration
arrange(member_casual, weekday) # sorts
## `summarise()` has grouped output by 'member_casual'. You can override using the
## `.groups` argument.
## # A tibble: 14 × 4
## # Groups: member_casual [2]
## member_casual weekday number_of_rides average_duration
## <chr> <ord> <int> <drtn>
## 1 casual Sun 339167 1924.1781 secs
## 2 casual Mon 224878 1583.2426 secs
## 3 casual Tue 233743 1413.1502 secs
## 4 casual Wed 264908 1403.4551 secs
## 5 casual Thu 254357 1453.2443 secs
## 6 casual Fri 298764 1627.6873 secs
## 7 casual Sat 413230 1830.5872 secs
## 8 member Sun 406512 875.2154 secs
## 9 member Mon 501426 746.1417 secs
## 10 member Tue 580200 752.9224 secs
## 11 member Wed 612827 763.4795 secs
## 12 member Thu 573679 751.3384 secs
## 13 member Fri 514913 770.2110 secs
## 14 member Sat 468253 864.6809 secs
# Visualization of the number of rides by rider type
all_trips_v2 %>%
mutate(weekday = wday(started_at, label = TRUE)) %>%
group_by(member_casual, weekday) %>%
summarise(number_of_rides = n()
,average_duration = mean(ride_length)) %>%
arrange(member_casual, weekday) %>%
ggplot(aes(x = weekday, y = number_of_rides, fill = member_casual)) +
geom_col(position = "dodge")
## `summarise()` has grouped output by 'member_casual'. You can override using the
## `.groups` argument.

# Visualization for average duration
all_trips_v2 %>%
mutate(weekday = wday(started_at, label = TRUE)) %>%
group_by(member_casual, weekday) %>%
summarise(number_of_rides = n()
,average_duration = mean(ride_length)) %>%
arrange(member_casual, weekday) %>%
ggplot(aes(x = weekday, y = average_duration, fill = member_casual)) +
geom_col(position = "dodge")
## `summarise()` has grouped output by 'member_casual'. You can override using the
## `.groups` argument.
## Don't know how to automatically pick scale for object of type <difftime>.
## Defaulting to continuous.

# Analyze ridership data by type of membership and month
all_trips_v2 %>%
group_by(member_casual, month) %>%
summarise(number_of_rides = n(),average_duration = mean(ride_length)) %>%
arrange(member_casual, month) %>%
ggplot(aes(x = month, y = number_of_rides, fill = member_casual)) +
geom_col(position = "dodge") + labs(title="Total Number of Ride by Month", x = "Month", y = "Number of Rides") + theme(axis.text.x = element_text(angle = 60, hjust = 1))
## `summarise()` has grouped output by 'member_casual'. You can override using the
## `.groups` argument.

# Created column to organize by seasons
all_trips_v2 <- all_trips_v2 %>%
mutate(seasons =
lapply(month, function(x) {
switch(x,
"06" = "Summer",
"07" = "Summer",
"08" = "Summer",
"09" = "Fall",
"10" = "Fall",
"11" = "Fall",
"12" = "Winter",
"01" = "Winter",
"02" = "Winter",
"Spring")
}))
#Evaluate what season casual members spend time riding, and how it is different from members
member_casual_by_season <- all_trips_v2 %>%
group_by(member_casual, seasons = factor(seasons, levels = c("Summer", "Fall", "Winter", "Spring"))) %>%
summarise(number_of_rides = n(), .groups = 'drop') %>%
group_by(member_casual) %>%
mutate(percentage = number_of_rides / sum(number_of_rides) * 100)
print(member_casual_by_season)
## # A tibble: 8 × 4
## # Groups: member_casual [2]
## member_casual seasons number_of_rides percentage
## <chr> <fct> <int> <dbl>
## 1 casual Summer 928438 45.8
## 2 casual Fall 534813 26.4
## 3 casual Winter 122736 6.05
## 4 casual Spring 443060 21.8
## 5 member Summer 1291645 35.3
## 6 member Fall 1023988 28.0
## 7 member Winter 466589 12.8
## 8 member Spring 875588 23.9