library(tidyverse)
library(lubridate)
library(ggplot2)
# Show the current working directory, then point it at the project folder so
# the relative CSV paths used by read_csv() resolve.
# NOTE(review): this absolute path is machine-specific; adjust it (or open the
# folder as an RStudio project) before running on another machine.
getwd()
setwd("C:/Users/Satinder/Documents/Rstudio/Bike sharing Analysis")

STEP 1: COLLECT DATA

# Load the three quarterly Divvy trip files. Paths are relative to the current
# working directory.
q3_2019 <- read_csv("Divvy_Trips_2019_Q3.csv")
q4_2019 <- read_csv("Divvy_Trips_2019_Q4.csv")
q1_2020 <- read_csv("Divvy_Trips_2020_Q1.csv")

STEP 2: WRANGLE DATA AND COMBINE INTO A SINGLE FILE

Compare column names of each of the files

# Print each quarter's column names so they can be compared side by side.
colnames(q3_2019)
colnames(q4_2019)
colnames(q1_2020)

Rename columns to make them consistent with q1_2020

# Rename the Q4 2019 columns to the names used by q1_2020, then print the
# renamed table.
q4_2019 <- q4_2019 %>%
  rename(
    ride_id = trip_id,
    rideable_type = bikeid,
    started_at = start_time,
    ended_at = end_time,
    start_station_name = from_station_name,
    start_station_id = from_station_id,
    end_station_name = to_station_name,
    end_station_id = to_station_id,
    member_casual = usertype
  )
q4_2019

# Rename the Q3 2019 columns to the names used by q1_2020, then print the
# renamed table.
q3_2019 <- q3_2019 %>%
  rename(
    ride_id = trip_id,
    rideable_type = bikeid,
    started_at = start_time,
    ended_at = end_time,
    start_station_name = from_station_name,
    start_station_id = from_station_id,
    end_station_name = to_station_name,
    end_station_id = to_station_id,
    member_casual = usertype
  )
q3_2019

Inspect the dataframes and look for incongruencies

# Show the structure (column names and types) of each quarterly table.
str(q1_2020)
str(q4_2019)
str(q3_2019)

Convert ride_id and rideable_type to character so that they can stack correctly

# Cast ride_id and rideable_type to character in both 2019 tables so their
# types agree when the quarters are stacked.
q4_2019 <- mutate(
  q4_2019,
  ride_id = as.character(ride_id),
  rideable_type = as.character(rideable_type)
)
q3_2019 <- mutate(
  q3_2019,
  ride_id = as.character(ride_id),
  rideable_type = as.character(rideable_type)
)

Stack individual quarter’s data frames into one big data frame

# Stack the three quarterly tables row-wise into a single data frame.
all_trips <- list(q3_2019, q4_2019, q1_2020) %>%
  bind_rows()

Remove lat, long, birthyear, and gender fields as this data was dropped beginning in 2020

# Drop the coordinate, birth-year, and gender columns from the combined table.
all_trips <- select(
  all_trips,
  -c(start_lat, start_lng, end_lat, end_lng, birthyear, gender)
)

STEP 3: CLEAN UP AND ADD DATA TO PREPARE FOR ANALYSIS

Inspect the new table that has been created

# Quick inspection of the combined table.
colnames(all_trips) # column names
nrow(all_trips)     # number of rows
dim(all_trips)      # rows x columns
head(all_trips)     # first six rows
str(all_trips)      # columns and their types
summary(all_trips)  # per-column summary

There are a few problems we will need to fix:

(1) In the “member_casual” column, there are two names for members (“member” and “Subscriber”) and two names for casual riders (“Customer” and “casual”). We will need to consolidate that from four to two labels.

(2) The data can only be aggregated at the ride-level, which is too granular. We will want to add some additional columns of data – such as day, month, year – that provide additional opportunities to aggregate the data.

(3) We will want to add a calculated field for length of ride since the 2020Q1 data did not have the “tripduration” column. We will add “ride_length” to the entire dataframe for consistency.

(4) There are some rides where tripduration shows up as negative, including several hundred rides where Divvy took bikes out of circulation for Quality Control reasons. We will want to delete these rides.

In the “member_casual” column, replace “Subscriber” with “member” and “Customer” with “casual”

Before 2020, Divvy used different labels for these two types of riders … we will want to make our dataframe consistent with their current nomenclature

N.B.: “Level” is a special property of a column that is retained even if a subset does not contain any values from a specific level

Begin by seeing how many observations fall under each usertype

# Count the observations under each rider-type label.
table(all_trips[["member_casual"]])

Reassign to the desired values (we will go with the current 2020 labels)

# Map the pre-2020 rider labels onto the 2020 ones; labels that are already
# "member" or "casual" pass through unchanged.
all_trips <- all_trips %>%
  mutate(
    member_casual = recode(
      member_casual,
      "Subscriber" = "member",
      "Customer" = "casual"
    )
  )

# Check to make sure the proper number of observations were reassigned
table(all_trips$member_casual)

Add columns that list the date, month, day, and year of each ride

# Derive calendar fields from the ride start time so rides can be aggregated
# by date, month, day, year, and weekday.
all_trips$date <- as.Date(all_trips$started_at)
all_trips$month <- format(all_trips$date, "%m")
all_trips$day <- format(all_trips$date, "%d")
all_trips$year <- format(all_trips$date, "%Y")
# "%A" yields the full weekday name in the current locale.
# NOTE(review): the later ordered() call lists English day names, so this
# presumably expects an English locale - confirm.
all_trips$day_of_week <- format(all_trips$date, "%A")

Add a “ride_length” calculation to all_trips (in seconds)

# Ride duration in seconds. The units are fixed explicitly: with the default
# units = "auto", difftime() picks the unit from the data, so the result is
# not guaranteed to be in seconds.
all_trips$ride_length <- difftime(
  all_trips$ended_at,
  all_trips$started_at,
  units = "secs"
)

Inspect the structure of the columns

# Show the structure of the combined table, including the newly added columns.
all_trips %>%
  str()

# Convert "ride_length" to plain numeric seconds so we can run calculations on
# the data. difftime() returns a "difftime" object rather than a factor, so the
# first check prints FALSE.
is.factor(all_trips$ride_length)
# Passing units = "secs" converts the difftime to seconds whatever unit it is
# currently stored in.
all_trips$ride_length <- as.numeric(all_trips$ride_length, units = "secs")
is.numeric(all_trips$ride_length)

Remove “bad” data

The dataframe includes a few hundred entries when bikes were taken out of docks and checked for quality by Divvy or ride_length was negative

# Build a cleaned copy without quality-control rides (start station "HQ QR")
# and rides with a negative length.
# The masks are NA-safe: an NA in a logical row index would otherwise add a
# phantom all-NA row to the result. Rows whose station name or ride length is
# missing are kept unchanged.
is_qc_ride <- all_trips$start_station_name %in% "HQ QR"
is_negative_ride <- !is.na(all_trips$ride_length) & all_trips$ride_length < 0
all_trips_v2 <- all_trips[!(is_qc_ride | is_negative_ride), ]

STEP 4: CONDUCT DESCRIPTIVE ANALYSIS

Descriptive analysis on ride_length

# Summary statistics for ride_length across all cleaned rides.
mean(all_trips_v2$ride_length)   # average ride length
median(all_trips_v2$ride_length) # midpoint of the sorted ride lengths
max(all_trips_v2$ride_length)    # longest ride
min(all_trips_v2$ride_length)    # shortest ride

Compare members and casual users

# ride_length statistics split by rider type (member vs casual).
aggregate(all_trips_v2$ride_length ~ all_trips_v2$member_casual, FUN = mean)
aggregate(all_trips_v2$ride_length ~ all_trips_v2$member_casual, FUN = median)
aggregate(all_trips_v2$ride_length ~ all_trips_v2$member_casual, FUN = max)
aggregate(all_trips_v2$ride_length ~ all_trips_v2$member_casual, FUN = min)

See the average ride time by each day for members vs casual users

# Mean ride_length for every rider type / weekday combination.
aggregate(
  all_trips_v2$ride_length ~
    all_trips_v2$member_casual + all_trips_v2$day_of_week,
  FUN = mean
)

Notice that the days of the week are out of order.

# Turn day_of_week into an ordered factor running Sunday through Saturday so
# that summaries list the days in calendar order.
all_trips_v2$day_of_week <- ordered(
  all_trips_v2$day_of_week,
  levels = c(
    "Sunday", "Monday", "Tuesday", "Wednesday",
    "Thursday", "Friday", "Saturday"
  )
)

Now, let’s run the average ride time by each day for members vs casual users

# Mean ride_length for every rider type / weekday combination.
aggregate(
  all_trips_v2$ride_length ~
    all_trips_v2$member_casual + all_trips_v2$day_of_week,
  FUN = mean
)

analyze ridership data by type and weekday

# Ride count and mean ride length per rider type and weekday.
# NOTE(review): the original statement was cut off after `n()`; the
# average_duration column and the sort mirror the plotting pipelines further
# down - confirm this matches the intended table.
all_trips_v2 %>%
  # creates weekday field using wday()
  mutate(weekday = wday(started_at, label = TRUE)) %>%
  # groups by usertype and weekday
  group_by(member_casual, weekday) %>%
  summarise(
    number_of_rides = n(),
    average_duration = mean(ride_length),
    .groups = "drop"
  ) %>%
  arrange(member_casual, weekday)

Let’s visualize the number of rides by rider type

# Bar chart of ride counts per weekday, with one bar per rider type placed
# side by side.
all_trips_v2 %>%
  mutate(weekday = wday(started_at, label = TRUE)) %>%
  group_by(member_casual, weekday) %>%
  summarise(
    number_of_rides = n(),
    average_duration = mean(ride_length),
    .groups = "drop" # return an ungrouped summary table
  ) %>%
  arrange(member_casual, weekday) %>%
  ggplot(aes(x = weekday, y = number_of_rides, fill = member_casual)) +
  geom_col(position = "dodge")

Let’s create a visualization for average duration

# Bar chart of mean ride length per weekday, with one bar per rider type
# placed side by side.
all_trips_v2 %>%
  mutate(weekday = wday(started_at, label = TRUE)) %>%
  group_by(member_casual, weekday) %>%
  summarise(
    number_of_rides = n(),
    average_duration = mean(ride_length),
    .groups = "drop" # return an ungrouped summary table
  ) %>%
  arrange(member_casual, weekday) %>%
  ggplot(aes(x = weekday, y = average_duration, fill = member_casual)) +
  geom_col(position = "dodge")

Bike sharing Analysis

Satinder Singh

8/4/2021

STEP 1: COLLECT DATA

STEP 2: WRANGLE DATA AND COMBINE INTO A SINGLE FILE

Compare column names of each of the files

Rename columns to make them consistent with q1_2020

Inspect the dataframes and look for incongruencies

Convert ride_id and rideable_type to character so that they can stack correctly

Stack individual quarter’s data frames into one big data frame

Remove lat, long, birthyear, and gender fields as this data was dropped beginning in 2020

STEP 3: CLEAN UP AND ADD DATA TO PREPARE FOR ANALYSIS

Inspect the new table that has been created

There are a few problems we will need to fix:

(1) In the “member_casual” column, there are two names for members (“member” and “Subscriber”) and two names for casual riders (“Customer” and “casual”). We will need to consolidate that from four to two labels.

(2) The data can only be aggregated at the ride-level, which is too granular. We will want to add some additional columns of data – such as day, month, year – that provide additional opportunities to aggregate the data.

(3) We will want to add a calculated field for length of ride since the 2020Q1 data did not have the “tripduration” column. We will add “ride_length” to the entire dataframe for consistency.

(4) There are some rides where tripduration shows up as negative, including several hundred rides where Divvy took bikes out of circulation for Quality Control reasons. We will want to delete these rides.

In the “member_casual” column, replace “Subscriber” with “member” and “Customer” with “casual”

Before 2020, Divvy used different labels for these two types of riders … we will want to make our dataframe consistent with their current nomenclature

N.B.: “Level” is a special property of a column that is retained even if a subset does not contain any values from a specific level

Begin by seeing how many observations fall under each usertype

Reassign to the desired values (we will go with the current 2020 labels)

Add columns that list the date, month, day, and year of each ride

Add a “ride_length” calculation to all_trips (in seconds)

Inspect the structure of the columns

Convert “ride_length” from Factor to numeric so we can run calculations on the data is.factor(all_trips$ride_length) all_trips$ride_length <- as.numeric(as.character(all_trips$ride_length)) is.numeric(all_trips$ride_length)

Remove “bad” data

The dataframe includes a few hundred entries when bikes were taken out of docks and checked for quality by Divvy or ride_length was negative

STEP 4: CONDUCT DESCRIPTIVE ANALYSIS

Descriptive analysis on ride_length

Compare members and casual users

See the average ride time by each day for members vs casual users

Notice that the days of the week are out of order.

Now, let’s run the average ride time by each day for members vs casual users

analyze ridership data by type and weekday

Let’s visualize the number of rides by rider type

Let’s create a visualization for average duration