Here we will import our data. First, we uploaded the two datasets into RCloud. Now we will read each dataset into its own data frame using the read_csv function from the tidyverse package.
div19_df <- read_csv("Divvy_Trips_2019_Q1 - Divvy_Trips_2019_Q1.csv")
## Rows: 365069 Columns: 12
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (6): start_time, end_time, from_station_name, to_station_name, usertype,...
## dbl (5): trip_id, bikeid, from_station_id, to_station_id, birthyear
## num (1): tripduration
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
div20_df <- read_csv("Divvy_Trips_2020_Q1 - Divvy_Trips_2020_Q1.csv")
## Rows: 426887 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): ride_id, rideable_type, started_at, ended_at, start_station_name, e...
## dbl (6): start_station_id, end_station_id, start_lat, start_lng, end_lat, en...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
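As the import messages note, the column specification can be controlled at read time. Below is a minimal sketch of a quieter, more explicit import; show_col_types and col_types are standard read_csv arguments, and the specific types listed are assumptions based on the output above, not part of the original workflow.

# Quiet import: suppress the column-specification message
div19_df <- read_csv("Divvy_Trips_2019_Q1 - Divvy_Trips_2019_Q1.csv",
                     show_col_types = FALSE)

# Explicit import: declare the expected types up front (sketch)
div20_df <- read_csv("Divvy_Trips_2020_Q1 - Divvy_Trips_2020_Q1.csv",
                     col_types = cols(
                       ride_id = col_character(),
                       started_at = col_character(),
                       ended_at = col_character(),
                       start_station_id = col_double(),
                       .default = col_guess()
                     ))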
head(div19_df)
## # A tibble: 6 × 12
## trip_id start_time end_time bikeid tripduration from_station_id
## <dbl> <chr> <chr> <dbl> <dbl> <dbl>
## 1 21742443 2019-01-01 0:04:37 2019-01-01 0:… 2167 390 199
## 2 21742444 2019-01-01 0:08:13 2019-01-01 0:… 4386 441 44
## 3 21742445 2019-01-01 0:13:23 2019-01-01 0:… 1524 829 15
## 4 21742446 2019-01-01 0:13:45 2019-01-01 0:… 252 1783 123
## 5 21742447 2019-01-01 0:14:52 2019-01-01 0:… 1170 364 173
## 6 21742448 2019-01-01 0:15:33 2019-01-01 0:… 2437 216 98
## # ℹ 6 more variables: from_station_name <chr>, to_station_id <dbl>,
## # to_station_name <chr>, usertype <chr>, gender <chr>, birthyear <dbl>
colnames(div19_df)
## [1] "trip_id" "start_time" "end_time"
## [4] "bikeid" "tripduration" "from_station_id"
## [7] "from_station_name" "to_station_id" "to_station_name"
## [10] "usertype" "gender" "birthyear"
head(div20_df)
## # A tibble: 6 × 13
## ride_id rideable_type started_at ended_at start_station_name start_station_id
## <chr> <chr> <chr> <chr> <chr> <dbl>
## 1 EACB191… docked_bike 2020-01-2… 2020-01… Western Ave & Lel… 239
## 2 8FED874… docked_bike 2020-01-3… 2020-01… Clark St & Montro… 234
## 3 789F3C2… docked_bike 2020-01-0… 2020-01… Broadway & Belmon… 296
## 4 C9A388D… docked_bike 2020-01-0… 2020-01… Clark St & Randol… 51
## 5 943BC3C… docked_bike 2020-01-3… 2020-01… Clinton St & Lake… 66
## 6 6D9C8A6… docked_bike 2020-01-1… 2020-01… Wells St & Hubbar… 212
## # ℹ 7 more variables: end_station_name <chr>, end_station_id <dbl>,
## # start_lat <dbl>, start_lng <dbl>, end_lat <dbl>, end_lng <dbl>,
## # member_casual <chr>
colnames(div20_df)
## [1] "ride_id" "rideable_type" "started_at"
## [4] "ended_at" "start_station_name" "start_station_id"
## [7] "end_station_name" "end_station_id" "start_lat"
## [10] "start_lng" "end_lat" "end_lng"
## [13] "member_casual"
Before comparing or combining the data, we will first standardize the column names, because the 2019 and 2020 datasets name their columns differently.
# rename(dataset_name, new_column_name = old_col_name)
# Standardize 2019 dataframe
div19_df <- rename(div19_df,
                   ride_id = trip_id,
                   start_station_name = from_station_name,
                   end_station_name = to_station_name,
                   start_station_id = from_station_id,
                   end_station_id = to_station_id)
# Standardize 2020 dataframe.
div20_df <- rename(div20_df, start_time=started_at, end_time=ended_at)
# Standardize the usertype column in the 2020 dataset so that its values are either "Subscriber" or "Customer", matching the 2019 dataset.
div20_df <- div20_df %>%
rename(usertype = member_casual) %>%
mutate(usertype = case_when(
usertype == "member" ~ "Subscriber",
usertype == "casual" ~ "Customer"
))
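Since case_when returns NA for any value it does not match, a quick count is a useful sanity check that every row was recoded (this check is not part of the original pipeline):

# sanity check: only "Subscriber" and "Customer" should appear, with no NAs
count(div20_df, usertype)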
The 2019 dataset already includes a tripduration column, but the 2020 dataset does not, so we will calculate it ourselves and name it tripduration. The start_time and end_time columns are character strings, so we must first convert them to datetimes with lubridate, then compute the difference with dplyr. Because tripduration in the 2019 data is measured in seconds, the 2020 calculation must also be in seconds.
div20_df <- div20_df %>%
  mutate(
    start_time = ymd_hms(start_time),
    end_time = ymd_hms(end_time),
    # difftime with units = "secs" guarantees the duration is in seconds
    tripduration = as.numeric(difftime(end_time, start_time, units = "secs"))
  )
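To confirm the new 2020 tripduration is on the same scale as the 2019 column (seconds), the two distributions can be compared with a quick summary; this is only a sanity check:

summary(div19_df$tripduration)
summary(div20_df$tripduration)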
First, I want to see how many data points are invalid. A ride is invalid if its duration is zero or negative, or longer than 24 hours (86400 seconds). I store these rows in new variables, div19_invalid and div20_invalid, simply to count how many data points are invalid.
div19_invalid <- div19_df %>% filter(tripduration <= 0 | tripduration >= 86400)
nrow(div19_invalid)
div20_invalid <- div20_df %>% filter(tripduration <= 0 | tripduration >= 86400)
nrow(div20_invalid)
I now have an idea of how many invalid data points there are. Next, I keep only the valid rows and store them back into the original variables, so the invalid rides are dropped and cannot re-enter the analysis. I do this for both datasets.
div19_df <- div19_df %>%
  filter(tripduration > 0 & tripduration < 86400) # keep rides longer than 0 seconds and shorter than 24 hours
div20_df <- div20_df %>%
  filter(tripduration > 0 & tripduration < 86400) # keep rides longer than 0 seconds and shorter than 24 hours
Next, remove rides that are missing a start or end station name.
div19_df <- div19_df %>%
  filter(!is.na(start_station_name) & !is.na(end_station_name))
div20_df <- div20_df %>%
  filter(!is.na(start_station_name) & !is.na(end_station_name))
# Check if any usertypes are missing, and exclude them
div19_df <- div19_df %>%
filter(!is.na(usertype))
div20_df <- div20_df %>%
filter(!is.na(usertype))
Now I will begin to analyze the data. I will create a separate data frame for each usertype, for both 2019 and 2020.
subs20 <- div20_df %>% filter(usertype == "Subscriber")
subs19 <- div19_df %>% filter(usertype == "Subscriber")
cust20 <- div20_df %>% filter(usertype == "Customer")
cust19 <- div19_df %>% filter(usertype == "Customer")
Calculate the mean and median trip duration for 2020:
meansubs20 <- mean(subs20$tripduration)
mediansubs20 <- median(subs20$tripduration)
meancust20 <- mean(cust20$tripduration)
mediancust20 <- median(cust20$tripduration)
Calculate the mean and median trip duration for 2019:
meansubs19 <- mean(subs19$tripduration)
mediansubs19 <- median(subs19$tripduration)
meancust19 <- mean(cust19$tripduration)
mediancust19 <- median(cust19$tripduration)
SummaryStats <- data.frame(
usertype = c("Subscriber", "Subscriber", "Customer", "Customer","Subscriber", "Subscriber", "Customer", "Customer"),
statistic = c("Mean", "Median","Mean", "Median","Mean", "Median","Mean", "Median"),
Year = c("2019", "2019", "2019", "2019", "2020","2020","2020","2020"),
value = c(meansubs19, mediansubs19, meancust19, mediancust19,meansubs20, mediansubs20, meancust20, mediancust20)
)
SummaryStats
## usertype statistic Year value
## 1 Subscriber Mean 2019 678.4683
## 2 Subscriber Median 2019 501.0000
## 3 Customer Mean 2019 2116.8862
## 4 Customer Median 2019 1396.0000
## 5 Subscriber Mean 2020 690.0363
## 6 Subscriber Median 2020 514.0000
## 7 Customer Mean 2020 2221.6528
## 8 Customer Median 2020 1269.0000
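Building SummaryStats by hand works, but the same numbers can also be produced in one pass with group_by() and summarise() after tagging each data frame with its year. This is an alternative sketch, not the table used above:

SummaryStats_alt <- bind_rows(
  div19_df %>% select(usertype, tripduration) %>% mutate(Year = "2019"),
  div20_df %>% select(usertype, tripduration) %>% mutate(Year = "2020")
) %>%
  group_by(usertype, Year) %>%
  summarise(Mean = mean(tripduration),
            Median = median(tripduration),
            .groups = "drop")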
Now we visualize the ride duration distributions for both years.
G_RideDistr2020_sec <- ggplot(div20_df, aes(x = tripduration, fill = usertype)) +
geom_histogram(binwidth=5,position="identity",alpha=0.6) +
scale_x_continuous(limits=c(0,3600))+
labs(title="Ride Duration Distribution (2020)",
x = "Duration (seconds)",
y = "Count") +
theme_minimal()
G_RideDistr2020_sec
## Warning: Removed 7186 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_bar()`).
G_RideDistr2020<-ggplot(div20_df,aes(x=tripduration/60,fill=usertype))+
geom_histogram(binwidth=5,position="identity",alpha=0.6) +
scale_x_continuous(limits=c(0,120))+
labs(title="Ride Duration Distribution (2020)",
x = "Duration (minutes)",
y = "Count") +
theme_minimal()
G_RideDistr2020
## Warning: Removed 2162 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_bar()`).
G_RideDistr2019<- ggplot(div19_df,aes(x=tripduration/60,fill=usertype))+
geom_histogram(binwidth=5,position="identity",alpha=0.6) +
scale_x_continuous(limits=c(0,120))+
labs(title="Ride Duration Distribution (2019)",
x = "Duration (minutes)",
y = "Count") +
theme_minimal()
G_RideDistr2019
## Warning: Removed 1259 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_bar()`).
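The warnings above come from scale_x_continuous(limits = ...), which drops rows outside the limits before the bins are computed. If the intent is only to zoom in on the first two hours, coord_cartesian() restricts the view without removing any rows. A sketch for the 2019 plot follows (G_RideDistr2019_zoom is a new name used only for this example):

G_RideDistr2019_zoom <- ggplot(div19_df, aes(x = tripduration / 60, fill = usertype)) +
  geom_histogram(binwidth = 5, position = "identity", alpha = 0.6) +
  coord_cartesian(xlim = c(0, 120)) + # zoom without dropping rows
  labs(title = "Ride Duration Distribution (2019)",
       x = "Duration (minutes)",
       y = "Count") +
  theme_minimal()
G_RideDistr2019_zoom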
Add new columns to the 2019 and 2020 data frames for the day of the week and the hour of the day.
div20_df <- div20_df %>%
mutate(
day_of_week = wday(start_time, label = TRUE, abbr = FALSE),
hour_of_day = hour(start_time)
)
div19_df <- div19_df %>%
mutate(
day_of_week = wday(start_time, label = TRUE, abbr = FALSE),
hour_of_day = hour(start_time)
)
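wday() with label = TRUE returns an ordered factor running Sunday through Saturday by default, and hour() returns an integer from 0 to 23; a quick look confirms the new columns behave as expected (sanity check only):

count(div20_df, day_of_week)
range(div20_df$hour_of_day)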
G_RidesbyWeek20 <- ggplot(div20_df,aes(x=day_of_week,fill=usertype))+
geom_bar() +
labs(title="Number of Rides by Day of Week 2020",
x = "Day",
y = "Count") +
theme_minimal()
G_RidesbyWeek20
G_RidesbyWeek19 <- ggplot(div19_df,aes(x=day_of_week,fill=usertype))+
geom_bar() +
labs(title="Number of Rides by Day of Week 2019",
x = "Day",
y = "Count") +
theme_minimal()
G_RidesbyWeek19
I want to combine the data from the two years to make a comprehensive
bar graph.
# select the needed columns (day_of_week, hour_of_day, usertype) from both 2019 and 2020, and tag each with its year
df19_sel <- div19_df %>%
select(day_of_week, hour_of_day, usertype) %>%
mutate(year="2019")
df20_sel <- div20_df %>%
select(day_of_week, hour_of_day, usertype) %>%
mutate(year = "2020")
#combine the data
combined_df <- bind_rows(df20_sel,df19_sel)
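A quick check that both years and both usertypes made it into the combined data frame (sanity check only):

count(combined_df, year, usertype)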
G_RidesbyWeek <- ggplot(combined_df, aes(x = day_of_week, fill = usertype)) +
geom_bar(position = "dodge") +
labs(
title = "Number of Rides by Day of Week and User Type",
x = "Day of Week",
y = "Number of Rides",
fill = "User Type"
) +
theme_minimal()
G_RidesbyWeek
G_RidesbyHour <-ggplot(combined_df, aes(x = hour_of_day, fill = usertype)) +
geom_bar(position = "dodge") +
labs(
title = "Number of Rides by Day of Week and User Type",
x = "Hour of Day (Military Time)",
y = "Number of Rides",
fill = "User Type"
) +
theme_minimal()
G_RidesbyHour
popular_starts_20<- div20_df %>%
group_by(usertype, start_station_name) %>%
summarise(ride_count = n()) %>%
arrange(usertype, desc(ride_count)) %>%
slice_head(n = 5) # Top 5 per usertype
## `summarise()` has grouped output by 'usertype'. You can override using the
## `.groups` argument.
popular_starts_19<- div19_df %>%
group_by(usertype, start_station_name) %>%
summarise(ride_count = n()) %>%
arrange(usertype, desc(ride_count)) %>%
slice_head(n = 5) # Top 5 per usertype
## `summarise()` has grouped output by 'usertype'. You can override using the
## `.groups` argument.
#combining both
popular_starts_combined <- bind_rows(popular_starts_19, popular_starts_20)
ggplot(popular_starts_19, aes(x = reorder(start_station_name, -ride_count), y = ride_count, fill = usertype)) +
geom_bar(stat = "identity", position = "dodge") +
labs(
title = "Top 10 Start Stations by Usertype (2019)",
x = "Start Station",
y = "Number of Rides"
) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
G_PopStrtSt20 <- ggplot(popular_starts_20, aes(x = reorder(start_station_name, -ride_count), y = ride_count, fill = usertype)) +
geom_bar(stat = "identity", position = "dodge") +
labs(
title = "Top 10 Start Stations by Usertype (2020)",
subtitle = "Most used start stations for both customers and subscribers",
x = "Start Station",
y = "Number of Rides"
) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
G_PopStrtSt20
G_PopStartSt19_20<- ggplot(popular_starts_combined, aes(x = reorder(start_station_name, -ride_count), y = ride_count, fill = usertype)) +
geom_bar(stat = "identity", position = "dodge") +
labs(
title = "Top 12 Start Stations by Usertype (2019 and 2020)",
x = "Start Station",
y = "Number of Rides"
) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
G_PopStartSt19_20
## A different approach to calculating popular start and end stations
combined_starts <- bind_rows(
div19_df %>% select(usertype,start_station_name),
div20_df %>% select(usertype,start_station_name)
)
combined_ends <- bind_rows(
div19_df %>% select(usertype,end_station_name),
div20_df %>% select(usertype,end_station_name)
)
Count the total rides per station and usertype, then keep the top 10 per usertype.
pop_combined_starts <- combined_starts %>%
group_by(usertype,start_station_name) %>%
summarise(ride_count=n()) %>%
arrange(usertype,desc(ride_count)) %>%
slice_head(n=10)
## `summarise()` has grouped output by 'usertype'. You can override using the
## `.groups` argument.
pop_combined_ends <- combined_ends %>%
group_by(usertype,end_station_name) %>%
summarise(ride_count=n()) %>%
arrange(usertype,desc(ride_count)) %>%
slice_head(n=10)
## `summarise()` has grouped output by 'usertype'. You can override using the
## `.groups` argument.
G_StartStations <- ggplot(pop_combined_starts,aes(x=reorder(start_station_name,-ride_count),y=ride_count,fill=usertype))+
geom_bar(stat="identity",position="dodge")+
labs(
title="Top 10 Most Popular Start stations by Usertype (2019 and 2020)",
x = "Start Station",
y = "Total Ride Count") +
theme_minimal()+
theme(axis.text.x = element_text(angle=45,hjust=1))
G_StartStations
G_EndStations <- ggplot(pop_combined_ends,aes(x=reorder(end_station_name,-ride_count),y=ride_count,fill=usertype))+
geom_bar(stat="identity",position="dodge")+
labs(
title="Top 10 Most Popular End stations by Usertype (2019 and 2020)",
x = "End Station",
y = "Total Ride Count") +
theme_minimal()+
theme(axis.text.x = element_text(angle=45,hjust=1))
G_EndStations
Group by usertype and count rides to see how often each user type uses the service.
total_rides20 <- div20_df %>%
group_by(usertype) %>%
summarise(total_rides = n())
total_rides19<- div19_df %>%
group_by(usertype) %>%
summarise(total_rides = n())
total_rides_comb <- bind_rows(total_rides19,total_rides20)
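Because total_rides_comb holds one row per usertype per year, the stacked bars in the plot below show the two years combined. If a per-year comparison is preferred instead, the year can be tagged before binding; a sketch (not used for the plot below):

total_rides_by_year <- bind_rows(
  total_rides19 %>% mutate(year = "2019"),
  total_rides20 %>% mutate(year = "2020")
)
# e.g. plot with fill = year and geom_col(position = "dodge") to compare the years side by side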
library(scales)
##
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
##
## discard
## The following object is masked from 'package:readr':
##
## col_factor
G_TotalRides<- ggplot(total_rides_comb,aes(x=usertype,y=total_rides,fill=usertype))+
geom_bar(stat="identity")+
scale_y_continuous(labels=label_comma()) +
labs(title="Total Rides by Usertype",
x = "User Type",
y = "Total Rides")+
theme_minimal()
G_TotalRides