# Install tidyverse only when it is not already available, so re-running the
# script does not download and reinstall the package every time.
if (!requireNamespace("tidyverse", quietly = TRUE)) {
  install.packages("tidyverse")
}
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.4'
## (as 'lib' is unspecified)
Note: Setting up my R environment by installing ‘tidyverse’ and then loading the ‘tidyverse’, ‘dplyr’ and ‘ggplot2’ packages.
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library(ggplot2)
# Read the 2019 Q1 trip file; column types are guessed by readr.
df_1 <- read_csv(file = "Divvy_Trips_2019_Q1.csv")
## Rows: 365069 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): start_station_name, end_station_name, member_casual
## dbl (4): ride_id, start_station_id, end_station_id, day_of_week
## time (3): started_at, ended_at, ride_length
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Read the 2020 Q1 trip file; column types are guessed by readr.
# readr reports parsing issues for this file; inspect them with problems(df_2).
df_2 <- read_csv(file = "Divvy_Trips_2020_Q1.csv")
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
## dat <- vroom(...)
## problems(dat)
## Rows: 426887 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): ride_id, start_station_name, end_station_name, member_casual
## dbl (3): start_station_id, end_station_id, day_of_week
## time (3): started_at, ended_at, ride_length
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Stack the 2019 and 2020 rows into one table. ride_id was read as double from
# the 2019 file and as character from the 2020 file; in the combined table the
# column is character.
new_df <- rbind(df_1, df_2)
To get a sense of the kind of data in the combined data set, I inspect it here with summary(), colnames(), str() and head().
# Per-column summary of the combined data.
summary(new_df)
## ride_id started_at ended_at start_station_name
## Length:791956 Length:791956 Length:791956 Length:791956
## Class :character Class1:hms Class1:hms Class :character
## Mode :character Class2:difftime Class2:difftime Mode :character
## Mode :numeric Mode :numeric
##
##
##
## start_station_id end_station_name end_station_id member_casual
## Min. : 2.0 Length:791956 Min. : 2.0 Length:791956
## 1st Qu.: 77.0 Class :character 1st Qu.: 77.0 Class :character
## Median :174.0 Mode :character Median :174.0 Mode :character
## Mean :204.4 Mean :204.4
## 3rd Qu.:291.0 3rd Qu.:291.0
## Max. :675.0 Max. :675.0
## NA's :1
## ride_length day_of_week
## Length:791956 Min. :1.000
## Class1:hms 1st Qu.:2.000
## Class2:difftime Median :4.000
## Mode :numeric Mean :3.923
## 3rd Qu.:6.000
## Max. :7.000
##
# List the column names of the combined data.
colnames(new_df)
## [1] "ride_id" "started_at" "ended_at"
## [4] "start_station_name" "start_station_id" "end_station_name"
## [7] "end_station_id" "member_casual" "ride_length"
## [10] "day_of_week"
# Show the structure of the combined data: each column's type and first values.
str(new_df)
## spc_tbl_ [791,956 × 10] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ ride_id : chr [1:791956] "21742443" "21742444" "21742445" "21742446" ...
## $ started_at : 'hms' num [1:791956] 00:04:37 01:04:37 02:04:37 03:04:37 ...
## ..- attr(*, "units")= chr "secs"
## $ ended_at : 'hms' num [1:791956] 00:11:07 01:11:07 02:11:07 03:11:07 ...
## ..- attr(*, "units")= chr "secs"
## $ start_station_name: chr [1:791956] "Wabash Ave & Grand Ave" "State St & Randolph St" "Racine Ave & 18th St" "California Ave & Milwaukee Ave" ...
## $ start_station_id : num [1:791956] 199 44 15 123 173 98 98 211 150 268 ...
## $ end_station_name : chr [1:791956] "Milwaukee Ave & Grand Ave" "Dearborn St & Van Buren St (*)" "Western Ave & Fillmore St (*)" "Clark St & Elm St" ...
## $ end_station_id : num [1:791956] 84 624 644 176 35 49 49 142 148 141 ...
## $ member_casual : chr [1:791956] "member" "member" "member" "member" ...
## $ ride_length : 'hms' num [1:791956] 00:06:30 00:06:30 00:06:30 00:06:30 ...
## ..- attr(*, "units")= chr "secs"
## $ day_of_week : num [1:791956] 3 3 3 3 3 3 3 3 3 3 ...
## - attr(*, "spec")=
## .. cols(
## .. ride_id = col_double(),
## .. started_at = col_time(format = ""),
## .. ended_at = col_time(format = ""),
## .. start_station_name = col_character(),
## .. start_station_id = col_double(),
## .. end_station_name = col_character(),
## .. end_station_id = col_double(),
## .. member_casual = col_character(),
## .. ride_length = col_time(format = ""),
## .. day_of_week = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
# Preview the first rows of the combined data.
head(new_df)
## # A tibble: 6 × 10
## ride_id started_at ended_at start_station_name start_station_id
## <chr> <time> <time> <chr> <dbl>
## 1 21742443 00:04:37 00:11:07 Wabash Ave & Grand Ave 199
## 2 21742444 01:04:37 01:11:07 State St & Randolph St 44
## 3 21742445 02:04:37 02:11:07 Racine Ave & 18th St 15
## 4 21742446 03:04:37 03:11:07 California Ave & Milwaukee Ave 123
## 5 21742447 04:04:37 04:11:07 Mies van der Rohe Way & Chicago… 173
## 6 21742448 05:04:37 05:11:07 LaSalle St & Washington St 98
## # ℹ 5 more variables: end_station_name <chr>, end_station_id <dbl>,
## # member_casual <chr>, ride_length <time>, day_of_week <dbl>
Here, I am calculating the mean and maximum ride length of the merged data set, counting only rides whose length is greater than zero.
# Mean and maximum ride length over all riders. Rides with a zero, negative or
# missing ride_length are excluded. Columns are referenced by bare name so that
# summarise() works on the rows flowing through the pipe rather than reaching
# back into the global new_df with `$`.
new_df_summary <-
  new_df %>%
  filter(ride_length > 0) %>%
  summarise(
    mean_ride_len = mean(ride_length),
    max_ride_len = max(ride_length)
  )
head(new_df_summary)
## # A tibble: 1 × 2
## mean_ride_len max_ride_len
## <drtn> <drtn>
## 1 687.6315 secs 357387 secs
In this part, I am analyzing the annual members of Cyclistic by calculating the average, min, and max ride duration.
# All rides taken by annual members.
members_ride_len <- filter(new_df, member_casual == "member")

# Average, minimum and maximum ride length for members. Rides with a zero,
# negative or missing ride_length are excluded. Bare column names are used
# instead of `members_ride_len$...` so the verbs operate on the piped data.
members_summary <-
  members_ride_len %>%
  filter(ride_length > 0) %>%
  summarise(
    avg_ride_length = mean(ride_length),
    min_ride_length = min(ride_length),
    max_ride_length = max(ride_length)
  )
head(members_summary)
## # A tibble: 1 × 3
## avg_ride_length min_ride_length max_ride_length
## <drtn> <drtn> <drtn>
## 1 559.137 secs 1 secs 347120 secs
Here, I am analyzing the casual riders of Cyclistic by calculating the average, min and max ride duration.
# All rides taken by casual riders.
casuals_ride_len <- filter(new_df, member_casual == "casual")

# Average, minimum and maximum ride length for casual riders. Rides with a
# zero, negative or missing ride_length are excluded. Bare column names are
# used instead of `casuals_ride_len$...` so the verbs operate on the piped data.
casuals_summary <-
  casuals_ride_len %>%
  filter(ride_length > 0) %>%
  summarise(
    avg_ride_length = mean(ride_length),
    min_ride_length = min(ride_length),
    max_ride_length = max(ride_length)
  )
head(casuals_summary)
## # A tibble: 1 × 3
## avg_ride_length min_ride_length max_ride_length
## <drtn> <drtn> <drtn>
## 1 1984.923 secs 1 secs 357387 secs
# Number of rides per customer type, one row per member_casual value.
ride_count_df <- count(new_df, member_casual, name = "ride_count")
head(ride_count_df)
## # A tibble: 2 × 2
## member_casual ride_count
## <chr> <int>
## 1 casual 71643
## 2 member 720313
# Bar chart of the number of rides, one bar per customer type.
new_df %>%
  ggplot(aes(x = member_casual, fill = member_casual)) +
  geom_bar() +
  labs(
    title = "Ride Count Vs Customer Type",
    x = "Customer Type",
    y = "Ride Count"
  )
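Here, I am replacing the numeric codes in the day_of_week column with weekday names (1 = Sunday through 7 = Saturday) to give the column more context, and storing the result as an ordered factor so that the days appear in calendar order.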
# Replace the numeric day codes (1 = Sunday ... 7 = Saturday) with weekday
# names, stored as an ordered factor so the days keep calendar order.
# A single factor() call replaces the superseded dplyr::recode() followed by a
# separate ordered() conversion. Codes outside 1-7 become NA.
# This must run while day_of_week still holds the numeric codes.
new_df$day_of_week <- factor(
  new_df$day_of_week,
  levels = 1:7,
  labels = c(
    "Sunday", "Monday", "Tuesday", "Wednesday",
    "Thursday", "Friday", "Saturday"
  ),
  ordered = TRUE
)
# Dodged bar chart of the number of rides per day of the week, split by
# customer type. Rows without a customer type are left out.
new_df %>%
  filter(!is.na(member_casual)) %>%
  count(member_casual, day_of_week, name = "number_of_rides") %>%
  ggplot(aes(x = day_of_week, y = number_of_rides, fill = member_casual)) +
  geom_col(position = position_dodge(width = 0.5), width = 0.5) +
  labs(
    title = "Ride count by Customer Type for week days",
    x = "Week days",
    y = "Ride Count"
  ) +
  theme(axis.text.x = element_text(angle = 45))
# Dodged bar chart of the average ride length per day of the week, split by
# customer type. Rows without a customer type or ride length are left out.
# ride_length is converted to plain seconds before averaging: ggplot2 has no
# default scale for difftime values and otherwise prints "Don't know how to
# automatically pick scale for object of type <difftime>".
# NOTE(review): unlike the ride-length summaries earlier in this file, zero and
# negative ride_length values are not filtered out here; confirm this is
# intended.
new_df %>%
  drop_na(member_casual, ride_length) %>%
  group_by(member_casual, day_of_week) %>%
  summarise(
    average_trip_duration = mean(as.numeric(ride_length, units = "secs")),
    .groups = "drop"
  ) %>%
  ggplot(aes(x = day_of_week, y = average_trip_duration, fill = member_casual)) +
  geom_col(width = 0.5, position = position_dodge(width = 0.5)) +
  theme(axis.text.x = element_text(angle = 45)) +
  labs(
    title = "Average trip duration by Customer Type for week days",
    x = "Week days",
    y = "Average trip duration (seconds)"
  )
## Don't know how to automatically pick scale for object of type <difftime>.
## Defaulting to continuous.