Here we will import our data. First, we uploaded the two datasets into RCloud. Now we will read each dataset into its own data frame using the read_csv function from the tidyverse package.
div19_df <- read_csv("Divvy_Trips_2019_Q1 - Divvy_Trips_2019_Q1.csv")
## Rows: 365069 Columns: 12
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (6): start_time, end_time, from_station_name, to_station_name, usertype,...
## dbl (5): trip_id, bikeid, from_station_id, to_station_id, birthyear
## num (1): tripduration
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
div20_df <- read_csv("Divvy_Trips_2020_Q1 - Divvy_Trips_2020_Q1.csv")
## Rows: 426887 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): ride_id, rideable_type, started_at, ended_at, start_station_name, e...
## dbl (6): start_station_id, end_station_id, start_lat, start_lng, end_lat, en...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
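As the import messages note, the column specification can be controlled at read time. Below is a minimal sketch of a quieter, more explicit import; show_col_types and col_types are standard read_csv arguments, and the specific types listed are assumptions based on the output above, not part of the original workflow.

# Quiet import: suppress the column-specification message
div19_df <- read_csv("Divvy_Trips_2019_Q1 - Divvy_Trips_2019_Q1.csv",
                     show_col_types = FALSE)

# Explicit import: declare the expected types up front (sketch)
div20_df <- read_csv("Divvy_Trips_2020_Q1 - Divvy_Trips_2020_Q1.csv",
                     col_types = cols(
                       ride_id = col_character(),
                       started_at = col_character(),
                       ended_at = col_character(),
                       start_station_id = col_double(),
                       .default = col_guess()
                     ))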
head(div19_df)
## # A tibble: 6 × 12
## trip_id start_time end_time bikeid tripduration from_station_id
## <dbl> <chr> <chr> <dbl> <dbl> <dbl>
## 1 21742443 2019-01-01 0:04:37 2019-01-01 0:… 2167 390 199
## 2 21742444 2019-01-01 0:08:13 2019-01-01 0:… 4386 441 44
## 3 21742445 2019-01-01 0:13:23 2019-01-01 0:… 1524 829 15
## 4 21742446 2019-01-01 0:13:45 2019-01-01 0:… 252 1783 123
## 5 21742447 2019-01-01 0:14:52 2019-01-01 0:… 1170 364 173
## 6 21742448 2019-01-01 0:15:33 2019-01-01 0:… 2437 216 98
## # ℹ 6 more variables: from_station_name <chr>, to_station_id <dbl>,
## # to_station_name <chr>, usertype <chr>, gender <chr>, birthyear <dbl>
colnames(div19_df)
## [1] "trip_id" "start_time" "end_time"
## [4] "bikeid" "tripduration" "from_station_id"
## [7] "from_station_name" "to_station_id" "to_station_name"
## [10] "usertype" "gender" "birthyear"
head(div20_df)
## # A tibble: 6 × 13
## ride_id rideable_type started_at ended_at start_station_name start_station_id
## <chr> <chr> <chr> <chr> <chr> <dbl>
## 1 EACB191… docked_bike 2020-01-2… 2020-01… Western Ave & Lel… 239
## 2 8FED874… docked_bike 2020-01-3… 2020-01… Clark St & Montro… 234
## 3 789F3C2… docked_bike 2020-01-0… 2020-01… Broadway & Belmon… 296
## 4 C9A388D… docked_bike 2020-01-0… 2020-01… Clark St & Randol… 51
## 5 943BC3C… docked_bike 2020-01-3… 2020-01… Clinton St & Lake… 66
## 6 6D9C8A6… docked_bike 2020-01-1… 2020-01… Wells St & Hubbar… 212
## # ℹ 7 more variables: end_station_name <chr>, end_station_id <dbl>,
## # start_lat <dbl>, start_lng <dbl>, end_lat <dbl>, end_lng <dbl>,
## # member_casual <chr>
colnames(div20_df)
## [1] "ride_id" "rideable_type" "started_at"
## [4] "ended_at" "start_station_name" "start_station_id"
## [7] "end_station_name" "end_station_id" "start_lat"
## [10] "start_lng" "end_lat" "end_lng"
## [13] "member_casual"
Before comparing or combining the data, we will first standardize the column names, because the 2019 and 2020 datasets name their columns differently.
# rename(dataset_name, new_column_name = old_col_name)
# Standardize 2019 dataframe
div19_df <- rename(div19_df,
                   ride_id = trip_id,
                   start_station_name = from_station_name,
                   end_station_name = to_station_name,
                   start_station_id = from_station_id,
                   end_station_id = to_station_id)
# Standardize 2020 dataframe.
div20_df <- rename(div20_df, start_time=started_at, end_time=ended_at)
# Standardize the usertype column in the 2020 dataset so that its values are either "Subscriber" or "Customer", matching the 2019 dataset.
div20_df <- div20_df %>%
rename(usertype = member_casual) %>%
mutate(usertype = case_when(
usertype == "member" ~ "Subscriber",
usertype == "casual" ~ "Customer"
))
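Since case_when returns NA for any value it does not match, a quick count is a useful sanity check that every row was recoded (this check is not part of the original pipeline):

# sanity check: only "Subscriber" and "Customer" should appear, with no NAs
count(div20_df, usertype)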
The 2019 dataset already includes a tripduration column, but the 2020 dataset does not, so we will calculate it ourselves and name it tripduration. The start_time and end_time columns are character strings, so we must first convert them to datetimes with lubridate, then compute the difference with dplyr. Because tripduration in the 2019 data is measured in seconds, the 2020 calculation must also be in seconds.
div20_df <- div20_df %>%
  mutate(
    start_time = ymd_hms(start_time),
    end_time = ymd_hms(end_time),
    # difftime with units = "secs" guarantees the duration is in seconds
    tripduration = as.numeric(difftime(end_time, start_time, units = "secs"))
  )
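To confirm the new 2020 tripduration is on the same scale as the 2019 column (seconds), the two distributions can be compared with a quick summary; this is only a sanity check:

summary(div19_df$tripduration)
summary(div20_df$tripduration)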
First, I want to see how many data points are invalid. A ride is invalid if its duration is zero or negative, or longer than 24 hours (86400 seconds). I store these rows in new variables, div19_invalid and div20_invalid, simply to count how many data points are invalid.
div19_invalid <- div19_df %>% filter(tripduration <= 0 | tripduration >= 86400)
nrow(div19_invalid)
div20_invalid <- div20_df %>% filter(tripduration <= 0 | tripduration >= 86400)
nrow(div20_invalid)
I now have an idea of how many invalid data points there are. Next, I keep only the valid rows and store them back into the original variables, so the invalid rides are dropped and cannot re-enter the analysis. I do this for both datasets.
div19_df <- div19_df %>%
  filter(tripduration > 0 & tripduration < 86400) # keep rides longer than 0 seconds and shorter than 24 hours
div20_df <- div20_df %>%
  filter(tripduration > 0 & tripduration < 86400) # keep rides longer than 0 seconds and shorter than 24 hours
Next, remove rides that are missing a start or end station name.
div19_df <- div19_df %>%
  filter(!is.na(start_station_name) & !is.na(end_station_name))
div20_df <- div20_df %>%
  filter(!is.na(start_station_name) & !is.na(end_station_name))
# Check if any usertypes are missing, and exclude them
div19_df <- div19_df %>%
filter(!is.na(usertype))
div20_df <- div20_df %>%
filter(!is.na(usertype))
Now I will begin to analyze the data. I will create a separate data frame for each usertype, for both 2019 and 2020.
subs20 <- div20_df %>% filter(usertype == "Subscriber")
subs19 <- div19_df %>% filter(usertype == "Subscriber")
cust20 <- div20_df %>% filter(usertype == "Customer")
cust19 <- div19_df %>% filter(usertype == "Customer")
Calculate the mean and median trip duration for 2020:
meansubs20 <- mean(subs20$tripduration)
mediansubs20 <- median(subs20$tripduration)
meancust20 <- mean(cust20$tripduration)
mediancust20 <- median(cust20$tripduration)
Calculate the mean and median trip duration for 2019:
meansubs19 <- mean(subs19$tripduration)
mediansubs19 <- median(subs19$tripduration)
meancust19 <- mean(cust19$tripduration)
mediancust19 <- median(cust19$tripduration)
SummaryStats <- data.frame(
usertype = c("Subscriber", "Subscriber", "Customer", "Customer","Subscriber", "Subscriber", "Customer", "Customer"),
statistic = c("Mean", "Median","Mean", "Median","Mean", "Median","Mean", "Median"),
Year = c("2019", "2019", "2019", "2019", "2020","2020","2020","2020"),
value = c(meansubs19, mediansubs19, meancust19, mediancust19,meansubs20, mediansubs20, meancust20, mediancust20)
)
SummaryStats
## usertype statistic Year value
## 1 Subscriber Mean 2019 678.4683
## 2 Subscriber Median 2019 501.0000
## 3 Customer Mean 2019 2116.8862
## 4 Customer Median 2019 1396.0000
## 5 Subscriber Mean 2020 690.0363
## 6 Subscriber Median 2020 514.0000
## 7 Customer Mean 2020 2221.6528
## 8 Customer Median 2020 1269.0000
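Building SummaryStats by hand works, but the same numbers can also be produced in one pass with group_by() and summarise() after tagging each data frame with its year. This is an alternative sketch, not the table used above:

SummaryStats_alt <- bind_rows(
  div19_df %>% select(usertype, tripduration) %>% mutate(Year = "2019"),
  div20_df %>% select(usertype, tripduration) %>% mutate(Year = "2020")
) %>%
  group_by(usertype, Year) %>%
  summarise(Mean = mean(tripduration),
            Median = median(tripduration),
            .groups = "drop")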
Now we visualize the ride duration distributions for both years.
G_RideDistr2020_sec <- ggplot(div20_df, aes(x = tripduration, fill = usertype)) +
geom_histogram(binwidth=5,position="identity",alpha=0.6) +
scale_x_continuous(limits=c(0,3600))+
labs(title="Ride Duration Distribution (2020)",
x = "Duration (seconds)",
y = "Count") +
theme_minimal()
G_RideDistr2020_sec
## Warning: Removed 7186 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_bar()`).
G_RideDistr2020<-ggplot(div20_df,aes(x=tripduration/60,fill=usertype))+
geom_histogram(binwidth=5,position="identity",alpha=0.6) +
scale_x_continuous(limits=c(0,120))+
labs(title="Ride Duration Distribution (2020)",
x = "Duration (minutes)",
y = "Count") +
theme_minimal()
G_RideDistr2020
## Warning: Removed 2162 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_bar()`).
G_RideDistr2019<- ggplot(div19_df,aes(x=tripduration/60,fill=usertype))+
geom_histogram(binwidth=5,position="identity",alpha=0.6) +
scale_x_continuous(limits=c(0,120))+
labs(title="Ride Duration Distribution (2019)",
x = "Duration (minutes)",
y = "Count") +
theme_minimal()
G_RideDistr2019
## Warning: Removed 1259 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_bar()`).
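The warnings above come from scale_x_continuous(limits = ...), which drops rows outside the limits before the bins are computed. If the intent is only to zoom in on the first two hours, coord_cartesian() restricts the view without removing any rows. A sketch for the 2019 plot follows (G_RideDistr2019_zoom is a new name used only for this example):

G_RideDistr2019_zoom <- ggplot(div19_df, aes(x = tripduration / 60, fill = usertype)) +
  geom_histogram(binwidth = 5, position = "identity", alpha = 0.6) +
  coord_cartesian(xlim = c(0, 120)) + # zoom without dropping rows
  labs(title = "Ride Duration Distribution (2019)",
       x = "Duration (minutes)",
       y = "Count") +
  theme_minimal()
G_RideDistr2019_zoom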
Add new columns to the 2019 and 2020 data frames for the day of the week and the hour of the day.
div20_df <- div20_df %>%
mutate(
day_of_week = wday(start_time, label = TRUE, abbr = FALSE),
hour_of_day = hour(start_time)
)
div19_df <- div19_df %>%
mutate(
day_of_week = wday(start_time, label = TRUE, abbr = FALSE),
hour_of_day = hour(start_time)
)
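wday() with label = TRUE returns an ordered factor running Sunday through Saturday by default, and hour() returns an integer from 0 to 23; a quick look confirms the new columns behave as expected (sanity check only):

count(div20_df, day_of_week)
range(div20_df$hour_of_day)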
G_RidesbyWeek20 <- ggplot(div20_df,aes(x=day_of_week,fill=usertype))+
geom_bar() +
labs(title="Number of Rides by Day of Week 2020",
x = "Day",
y = "Count") +
theme_minimal()
G_RidesbyWeek20
G_RidesbyWeek19 <- ggplot(div19_df,aes(x=day_of_week,fill=usertype))+
geom_bar() +
labs(title="Number of Rides by Day of Week 2019",
x = "Day",
y = "Count") +
theme_minimal()
G_RidesbyWeek19
I want to combine the data from the two years to make a comprehensive
bar graph.
# select the needed columns (day_of_week, hour_of_day, usertype) from both 2019 and 2020, and tag each with its year
df19_sel <- div19_df %>%
select(day_of_week, hour_of_day, usertype) %>%
mutate(year="2019")
df20_sel <- div20_df %>%
select(day_of_week, hour_of_day, usertype) %>%
mutate(year = "2020")
#combine the data
combined_df <- bind_rows(df20_sel,df19_sel)
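A quick check that both years and both usertypes made it into the combined data frame (sanity check only):

count(combined_df, year, usertype)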
G_RidesbyWeek <- ggplot(combined_df, aes(x = day_of_week, fill = usertype)) +
geom_bar(position = "dodge") +
labs(
title = "Number of Rides by Day of Week and User Type",
x = "Day of Week",
y = "Number of Rides",
fill = "User Type"
) +
theme_minimal()
G_RidesbyWeek
G_RidesbyHour <-ggplot(combined_df, aes(x = hour_of_day, fill = usertype)) +
geom_bar(position = "dodge") +
labs(
title = "Number of Rides by Day of Week and User Type",
x = "Hour of Day (Military Time)",
y = "Number of Rides",
fill = "User Type"
) +
theme_minimal()
G_RidesbyHour
popular_starts_20<- div20_df %>%
group_by(usertype, start_station_name) %>%
summarise(ride_count = n()) %>%
arrange(usertype, desc(ride_count)) %>%
slice_head(n = 5) # Top 5 per usertype
## `summarise()` has grouped output by 'usertype'. You can override using the
## `.groups` argument.
popular_starts_19<- div19_df %>%
group_by(usertype, start_station_name) %>%
summarise(ride_count = n()) %>%
arrange(usertype, desc(ride_count)) %>%
slice_head(n = 5) # Top 5 per usertype
## `summarise()` has grouped output by 'usertype'. You can override using the
## `.groups` argument.
#combining both
popular_starts_combined <- bind_rows(popular_starts_19, popular_starts_20)
ggplot(popular_starts_19, aes(x = reorder(start_station_name, -ride_count), y = ride_count, fill = usertype)) +
geom_bar(stat = "identity", position = "dodge") +
labs(
title = "Top 10 Start Stations by Usertype (2019)",
x = "Start Station",
y = "Number of Rides"
) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
G_PopStrtSt20 <- ggplot(popular_starts_20, aes(x = reorder(start_station_name, -ride_count), y = ride_count, fill = usertype)) +
geom_bar(stat = "identity", position = "dodge") +
labs(
title = "Top 10 Start Stations by Usertype (2020)",
subtitle = "Most used start stations for both customers and subscribers",
x = "Start Station",
y = "Number of Rides"
) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
G_PopStrtSt20
G_PopStartSt19_20<- ggplot(popular_starts_combined, aes(x = reorder(start_station_name, -ride_count), y = ride_count, fill = usertype)) +
geom_bar(stat = "identity", position = "dodge") +
labs(
title = "Top 12 Start Stations by Usertype (2019 and 2020)",
x = "Start Station",
y = "Number of Rides"
) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
G_PopStartSt19_20
## A different approach to calculating popular start and end stations
combined_starts <- bind_rows(
div19_df %>% select(usertype,start_station_name),
div20_df %>% select(usertype,start_station_name)
)
combined_ends <- bind_rows(
div19_df %>% select(usertype,end_station_name),
div20_df %>% select(usertype,end_station_name)
)
Count the total rides per station and usertype, then keep the top 10 per usertype.
pop_combined_starts <- combined_starts %>%
group_by(usertype,start_station_name) %>%
summarise(ride_count=n()) %>%
arrange(usertype,desc(ride_count)) %>%
slice_head(n=10)
## `summarise()` has grouped output by 'usertype'. You can override using the
## `.groups` argument.
pop_combined_ends <- combined_ends %>%
group_by(usertype,end_station_name) %>%
summarise(ride_count=n()) %>%
arrange(usertype,desc(ride_count)) %>%
slice_head(n=10)
## `summarise()` has grouped output by 'usertype'. You can override using the
## `.groups` argument.
G_StartStations <- ggplot(pop_combined_starts,aes(x=reorder(start_station_name,-ride_count),y=ride_count,fill=usertype))+
geom_bar(stat="identity",position="dodge")+
labs(
title="Top 10 Most Popular Start stations by Usertype (2019 and 2020)",
x = "Start Station",
y = "Total Ride Count") +
theme_minimal()+
theme(axis.text.x = element_text(angle=45,hjust=1))
G_StartStations
G_EndStations <- ggplot(pop_combined_ends,aes(x=reorder(end_station_name,-ride_count),y=ride_count,fill=usertype))+
geom_bar(stat="identity",position="dodge")+
labs(
title="Top 10 Most Popular End stations by Usertype (2019 and 2020)",
x = "End Station",
y = "Total Ride Count") +
theme_minimal()+
theme(axis.text.x = element_text(angle=45,hjust=1))
G_EndStations
Group by usertype and count rides to see how often each user type uses the service.
total_rides20 <- div20_df %>%
group_by(usertype) %>%
summarise(total_rides = n())
total_rides19<- div19_df %>%
group_by(usertype) %>%
summarise(total_rides = n())
total_rides_comb <- bind_rows(total_rides19,total_rides20)
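Because total_rides_comb holds one row per usertype per year, the stacked bars in the plot below show the two years combined. If a per-year comparison is preferred instead, the year can be tagged before binding; a sketch (not used for the plot below):

total_rides_by_year <- bind_rows(
  total_rides19 %>% mutate(year = "2019"),
  total_rides20 %>% mutate(year = "2020")
)
# e.g. plot with fill = year and geom_col(position = "dodge") to compare the years side by side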
library(scales)
##
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
##
## discard
## The following object is masked from 'package:readr':
##
## col_factor
G_TotalRides<- ggplot(total_rides_comb,aes(x=usertype,y=total_rides,fill=usertype))+
geom_bar(stat="identity")+
scale_y_continuous(labels=label_comma()) +
labs(title="Total Rides by Usertype",
x = "User Type",
y = "Total Rides")+
theme_minimal()
G_TotalRides