Author : Muhamad Ilyas
Step 1: Install & Load the packages.
install.packages("tidyverse")
install.packages("skimr")
install.packages("ggplot")
install.packages("readr")
install.packages("dplyr")
install.packages("janitor")
library(tidyverse)
library(skimr)
library(lubridate)
library(ggplot2)
library(readr)
library(dplyr)
library(janitor)
Step 2: Prepare the data and combine them in one data frame.
Step 3: Examine the data frame.
head(all_trips_2022)
dim(all_trips_2022)
colnames(all_trips_2022)
summary(all_trips_2022)
Step 4: Check for NA values.
sum(is.na(all_trips_2022))
Step 5 : Identify unnecessary data and remove columns we don’t need :
start_lat, start_lng, end_lat, end_lng
all_trips_2022 <- all_trips_2022 %>% select(-c(start_lat, start_lng, end_lat, end_lng))
colnames(all_trips_2022)
Step 6 : Add columns that list the date, month, day and year of each
ride
# This will allow us to aggregate ride data for each month and day. Before completing these operations we could only aggregate at the ride level
all_trips_2022$date <- as.Date(all_trips_2022$started_at) # Default format is yyyy-mm-dd
all_trips_2022$month <- format(as.Date(all_trips_2022$date), "%m")
all_trips_2022$day <- format(as.Date(all_trips_2022$date), "%d")
all_trips_2022$year <- format(as.Date(all_trips_2022$date), "%Y")
all_trips_2022$day_of_week <- format(as.Date(all_trips_2022$date), "%A")
Step 7 : Add column “ride_length”, which is the length of each ride
from ended_at minus started_at, and format as HH:MM:SS.
#Add column ride_length
all_trips_2022 <- all_trips_2022 %>% mutate(ride_length = ended_at - started_at)
#Convert from difftime to numeric values
all_trips_2022$ride_length <- as.numeric(as.character(all_trips_2022$ride_length))
is.numeric(all_trips_2022$ride_length)
#Convert seconds to minutes
all_trips_2022$ride_length <- as.numeric(all_trips_2022$ride_length/60)
Step 8 : Removing the bad data and do analysis on the ride
length.
The dataframe includes a few hundred entries when bikes were taken
out of docks and checked for quality by Divvy or ride_length was
negative
#we will create a new version of the dataframe (v2) since data is being removed
all_trips_2022_v2 <- all_trips_2022[!(all_trips_2022$start_station_name == "HQ QR" | all_trips_2022$ride_length<0),]
#check if there is any null values in data
sum(is.na(all_trips_2022_v2))
Remove NA values for clean data
#erase the null values
all_trips_2022_v2 <- all_trips_2022_v2 %>%
drop_na()
Analysis on ride length
#check for data with ride length more than 1 day (86400 seconds or 1440 mins). There is 3484 rides that the rider use the bike for more than 1 day
sum(all_trips_2022_v2$ride_length > 1440)
#Using summary to check min, max, median and mean
summary(all_trips_2022_v2$ride_length)
Step 9 : Aggregate to analyze the data based on user type: member vs
casual
#Compare members and casual users
aggregate(all_trips_2022_v2$ride_length ~ all_trips_2022_v2$member_casual, FUN=mean)
aggregate(all_trips_2022_v2$ride_length ~ all_trips_2022_v2$member_casual, FUN=median)
aggregate(all_trips_2022_v2$ride_length ~ all_trips_2022_v2$member_casual, FUN=max)
aggregate(all_trips_2022_v2$ride_length ~ all_trips_2022_v2$member_casual, FUN=min)
#See the average ride time by each day for members vs casual users
aggregate(all_trips_2022_v2$ride_length ~ all_trips_2022_v2$member_casual +
all_trips_2022_v2$day_of_week, FUN=mean)
Notice that the days of the week are out of order.
# Let's fix order
all_trips_2022_v2$day_of_week <- ordered(all_trips_2022_v2$day_of_week,
levels=c("Sunday", "Monday",
"Tuesday", "Wednesday",
"Thursday", "Friday",
"Saturday"))
#Let's check again the average ride by time by each day for members vs casual users
aggregate(all_trips_2022_v2$ride_length ~ all_trips_2022_v2$member_casual + all_trips_2022_v2$day_of_week, FUN = mean)
Step 10 : Further analysis into the stations
head(count(all_trips_2022_v2, start_station_name, member_casual, rideable_type, sort= TRUE))
head(count(all_trips_2022_v2, end_station_name, member_casual, rideable_type, sort= TRUE))
Step 11: Analyze ridership data by user types and day of the week
all_trips_2022_v2 %>%
mutate(weekday = wday(started_at, label = TRUE)) %>% #creates weekday field using wday()
group_by(member_casual, weekday) %>% #groups by usertype and weekday
summarise(number_of_rides = n() #calculates the number of rides and average duration
,average_duration = mean(ride_length)) %>% # calculates the average duration
arrange(member_casual, weekday) # sorts by member_casual and weekday
Step 12 : Visualize the number of rides by rider types and average
duration
par(mfrow=c(2,2))
# Number of rides by rider types
all_trips_2022_v2 %>%
mutate(weekday = wday(started_at, label=TRUE)) %>%
group_by(member_casual, weekday) %>%
summarise(number_of_rides = n(), average_duration=mean(ride_length)) %>%
arrange(member_casual, weekday) %>%
ggplot(aes(x=weekday, y=number_of_rides, fill=member_casual))+
geom_col(position="dodge")
# Number of rides by average duration
all_trips_2022_v2 %>%
mutate(weekday = wday(started_at, label = TRUE)) %>%
group_by(member_casual, weekday) %>%
summarise(number_of_rides = n(),average_duration = mean(ride_length)) %>%
arrange(member_casual, weekday) %>%
ggplot(aes(x = weekday, y = average_duration, fill = member_casual)) +
geom_col(position = "dodge")
Step 13: Visualize Member vs Casual on short ride (less and one
hour)
one_hour_ride <- all_trips_2022_v2 %>% filter(ride_length < 60)
one_hour_ride$ride_length_by_mins <- cut(one_hour_ride$ride_length,breaks = 20)
ggplot(data = one_hour_ride) +
geom_bar(mapping = aes(x = ride_length_by_mins, fill = member_casual)) +
labs(title = "One hour ride length") +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) +
scale_x_discrete(labels=c("3", "6", "9", "12", "15", "18", "21", "24", "27", "30", "33", "36", "39", "42", "45", "48", "51", "54", "57", "60"))
Step 14 : Visualize day of the week riding choices between member vs
causal
ggplot(data = all_trips_2022_v2) +
geom_bar(mapping = aes(x = factor(day_of_week), fill = rideable_type))+
facet_wrap(~member_casual) +
labs(title='riding choice during day of the week', x= 'day of the week' )+
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
Step 15 : Check for peak time for bike usage between member vs
casual
hour_data <- all_trips_2022_v2
hour_data$start_hour <- as.numeric(format(strptime(all_trips_2022_v2$started_at,"%Y-%m-%d %H:%M:%OS"),'%H'))
ggplot(data = hour_data) +
geom_bar(mapping = aes(x = start_hour, fill = member_casual), stat = 'count') +
facet_wrap(~factor(day_of_week)) +
labs(title = "bike usage by starting hour", x = "starting hour") +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
Step 16 : Save as csv for further analysis and visualization in
Tableau
#the data frame
write_csv(all_trips_2022_v2, "all_trips.csv")
write_csv(hour_data, "hour_data_ride.csv")
write_csv(one_hour_ride, "one_hour_ride.csv")
#total and average weekly rides by rider type
summary_ride_weekly <- all_trips_2022_v2 %>%
mutate(weekday = wday(started_at, label = TRUE)) %>%
group_by(member_casual, weekday) %>%
summarise(number_of_rides = n()
,average_duration = mean(ride_length)) %>%
arrange(member_casual, weekday)
write_csv(summary_ride_weekly, "summary_ride_weekly.csv")
#total and average weekly rides by rider type
summary_ride_weekly_type <- all_trips_2022_v2 %>%
mutate(weekday = wday(started_at, label = TRUE)) %>%
group_by(member_casual, weekday, rideable_type) %>%
summarise(number_of_rides = n()
,average_duration = mean(ride_length)) %>%
arrange(member_casual, weekday)
write_csv(summary_ride_weekly_type, "summary_ride_weekly_type.csv")
#total and avg monthly rides by rider type
summary_month <- all_trips_2022_v2 %>%
mutate(month = month(started_at, label = TRUE)) %>%
group_by(month,member_casual) %>%
summarise(number_of_rides = n()
,average_duration = mean(ride_length)) %>%
arrange(month, member_casual)
write_csv(summary_month, "summary_ride_monthly.csv")
#most popular stations
popular_stations <- all_trips_2022_v2 %>%
mutate(station = start_station_name) %>%
drop_na(start_station_name) %>%
group_by(start_station_name, member_casual) %>%
summarise(number_of_rides=n())
write_csv(popular_stations, "popular_stations.csv")
#total membership types and rideable types
total_riders <- data.frame(table(all_trips_2022_v2$member_casual))
total_types <- data.frame(table(all_trips_2022_v2$rideable_type))
write_csv(total_riders, "total_riders.csv")
write_csv(total_types, "total_types.csv")
---
title: "Google Data Analytics : Cyclistic Bike Share"
output: html_notebook
---

Author : Muhamad Ilyas

Step 1: Install & Load the packages.
```{r}
install.packages("tidyverse")
install.packages("skimr")
install.packages("ggplot")
install.packages("readr")
install.packages("dplyr")
install.packages("janitor")
library(tidyverse)
library(skimr)
library(lubridate)
library(ggplot2)
library(readr)
library(dplyr)
library(janitor)
```

Step 2: Prepare the data and combine them in one data frame.
```{r echo=FALSE, message=FALSE, paged.print=FALSE}
#Combine the data from Jan 2022 to Dec 2022 into one data frame "all_trips_2022.

all_trips_2022 <- rbind(
 read_csv("C:/Users/Shaquille/Documents/Cyclistic/Bike_data/202201-divvy-tripdata.csv"),
 read_csv("C:/Users/Shaquille/Documents/Cyclistic/Bike_data/202202-divvy-tripdata.csv"),
 read_csv("C:/Users/Shaquille/Documents/Cyclistic/Bike_data/202203-divvy-tripdata.csv"),
 read_csv("C:/Users/Shaquille/Documents/Cyclistic/Bike_data/202204-divvy-tripdata.csv"),
 read_csv("C:/Users/Shaquille/Documents/Cyclistic/Bike_data/202205-divvy-tripdata.csv"),
 read_csv("C:/Users/Shaquille/Documents/Cyclistic/Bike_data/202206-divvy-tripdata.csv"),
 read_csv("C:/Users/Shaquille/Documents/Cyclistic/Bike_data/202207-divvy-tripdata.csv"),
 read_csv("C:/Users/Shaquille/Documents/Cyclistic/Bike_data/202208-divvy-tripdata.csv"),
 read_csv("C:/Users/Shaquille/Documents/Cyclistic/Bike_data/202209-divvy-tripdata.csv"),
 read_csv("C:/Users/Shaquille/Documents/Cyclistic/Bike_data/202210-divvy-tripdata.csv"),
 read_csv("C:/Users/Shaquille/Documents/Cyclistic/Bike_data/202211-divvy-tripdata.csv"),
 read_csv("C:/Users/Shaquille/Documents/Cyclistic/Bike_data/202212-divvy-tripdata.csv"))
```

Step 3: Examine the data frame.
```{r echo=TRUE}
head(all_trips_2022)
dim(all_trips_2022)
colnames(all_trips_2022)
summary(all_trips_2022)
```

Step 4: Check for NA values.
```{r}
sum(is.na(all_trips_2022)) 
```

Step 5 : Identify unnecessary data and remove columns we don't need : start_lat, start_lng, end_lat, end_lng
```{r}
all_trips_2022 <- all_trips_2022 %>% select(-c(start_lat, start_lng, end_lat, end_lng))
colnames(all_trips_2022)
```


Step 6 : Add columns that list the date, month, day and year of each ride
```{r}
# This will allow us to aggregate ride data for each month and day. Before completing these operations we could only aggregate at the ride level
all_trips_2022$date <- as.Date(all_trips_2022$started_at) # Default format is yyyy-mm-dd
all_trips_2022$month <- format(as.Date(all_trips_2022$date), "%m")
all_trips_2022$day <- format(as.Date(all_trips_2022$date), "%d")
all_trips_2022$year <- format(as.Date(all_trips_2022$date), "%Y")
all_trips_2022$day_of_week <- format(as.Date(all_trips_2022$date), "%A")
```

Step 7 : Add column "ride_length", which is the length of each ride from ended_at minus started_at, and format as HH:MM:SS.
```{r}
#Add column ride_length
all_trips_2022 <- all_trips_2022 %>% mutate(ride_length = ended_at - started_at)

#Convert from difftime to numeric values
all_trips_2022$ride_length <- as.numeric(as.character(all_trips_2022$ride_length))
is.numeric(all_trips_2022$ride_length)

#Convert seconds to minutes
all_trips_2022$ride_length <- as.numeric(all_trips_2022$ride_length/60)
```

Step 8 : Removing the bad data and do analysis on the ride length.

The dataframe includes a few hundred entries when bikes were taken out of docks and checked for quality by Divvy or ride_length was negative
```{r}
#we will create a new version of the dataframe (v2) since data is being removed 
all_trips_2022_v2 <- all_trips_2022[!(all_trips_2022$start_station_name == "HQ QR" | all_trips_2022$ride_length<0),]
```

```{r}
#check if there is any null values in data
sum(is.na(all_trips_2022_v2))
```

Remove NA values for clean data
```{r}
#erase the null values 
all_trips_2022_v2 <- all_trips_2022_v2 %>% 
  drop_na()
```

Analysis on ride length
```{r}
#check for data with ride length  more than 1 day (86400 seconds or 1440 mins). There is 3484 rides that the rider use the bike for more than 1 day
sum(all_trips_2022_v2$ride_length > 1440)

#Using summary to check min, max, median and mean
summary(all_trips_2022_v2$ride_length)
```

Step 9 : Aggregate to analyze the data based on user type: member vs casual
```{r}
#Compare members and casual users
aggregate(all_trips_2022_v2$ride_length ~ all_trips_2022_v2$member_casual, FUN=mean)
aggregate(all_trips_2022_v2$ride_length ~ all_trips_2022_v2$member_casual, FUN=median)
aggregate(all_trips_2022_v2$ride_length ~ all_trips_2022_v2$member_casual, FUN=max)
aggregate(all_trips_2022_v2$ride_length ~ all_trips_2022_v2$member_casual, FUN=min)

#See the average ride time by each day for members vs casual users
aggregate(all_trips_2022_v2$ride_length ~ all_trips_2022_v2$member_casual + 
            all_trips_2022_v2$day_of_week, FUN=mean)
```

Notice that the days of the week are out of order.

```{r}
# Let's fix order 
all_trips_2022_v2$day_of_week <- ordered(all_trips_2022_v2$day_of_week, 
                                         levels=c("Sunday", "Monday",
                                                  "Tuesday", "Wednesday",
                                                  "Thursday", "Friday", 
                                                  "Saturday"))
```

```{r}
#Let's check again the average ride by time by each day for members vs casual users
aggregate(all_trips_2022_v2$ride_length ~ all_trips_2022_v2$member_casual + all_trips_2022_v2$day_of_week, FUN = mean)
```

Step 10 : Further analysis into the stations
```{r}
head(count(all_trips_2022_v2, start_station_name, member_casual,  rideable_type, sort= TRUE))
head(count(all_trips_2022_v2, end_station_name, member_casual,  rideable_type, sort= TRUE))
```

Step 11: Analyze ridership data by user types and day of the week
```{r}
all_trips_2022_v2 %>% 
  mutate(weekday = wday(started_at, label = TRUE)) %>%  #creates weekday field using wday()
  group_by(member_casual, weekday) %>%                  #groups by usertype and weekday
  summarise(number_of_rides = n()				#calculates the number of rides and average duration 
  ,average_duration = mean(ride_length)) %>% 	         	# calculates the average duration
  arrange(member_casual, weekday)								        # sorts by member_casual and weekday
```

Step 12 : Visualize the number of rides by rider types and average duration
```{r}
par(mfrow=c(2,2))

# Number of rides by rider types
all_trips_2022_v2 %>%
  mutate(weekday = wday(started_at, label=TRUE)) %>% 
  group_by(member_casual, weekday) %>%
  summarise(number_of_rides = n(), average_duration=mean(ride_length)) %>%
  arrange(member_casual, weekday) %>%
  ggplot(aes(x=weekday, y=number_of_rides, fill=member_casual))+
  geom_col(position="dodge")
```

```{r}
# Number of rides by average duration
all_trips_2022_v2 %>% 
  mutate(weekday = wday(started_at, label = TRUE)) %>% 
  group_by(member_casual, weekday) %>% 
  summarise(number_of_rides = n(),average_duration = mean(ride_length)) %>% 
  arrange(member_casual, weekday)  %>% 
  ggplot(aes(x = weekday, y = average_duration, fill = member_casual)) +
  geom_col(position = "dodge")
```

Step 13: Visualize Member vs Casual on short ride (less and one hour)
```{r}
one_hour_ride <- all_trips_2022_v2 %>% filter(ride_length < 60)
one_hour_ride$ride_length_by_mins <- cut(one_hour_ride$ride_length,breaks = 20)

ggplot(data = one_hour_ride) + 
  geom_bar(mapping = aes(x = ride_length_by_mins, fill = member_casual)) +
  labs(title = "One hour ride length") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) +
  scale_x_discrete(labels=c("3", "6", "9", "12", "15", "18", "21", "24", "27", "30", "33", "36", "39", "42", "45", "48", "51", "54", "57", "60"))
```

Step 14 : Visualize day of the week riding choices between member vs causal
```{r}
ggplot(data = all_trips_2022_v2) +
  geom_bar(mapping = aes(x = factor(day_of_week), fill = rideable_type))+
  facet_wrap(~member_casual) +
  labs(title='riding choice during day of the week', x= 'day of the week' )+
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
```

Step 15 : Check for peak time for bike usage between member vs casual
```{r}
hour_data <- all_trips_2022_v2
hour_data$start_hour <- as.numeric(format(strptime(all_trips_2022_v2$started_at,"%Y-%m-%d %H:%M:%OS"),'%H'))

ggplot(data = hour_data) + 
  geom_bar(mapping = aes(x = start_hour, fill = member_casual), stat = 'count') + 
  facet_wrap(~factor(day_of_week)) +
  labs(title = "bike usage by starting hour", x = "starting hour") + 
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) 
```

Step 16 : Save as csv for further analysis and visualization in Tableau
```{r}
#the data frame
write_csv(all_trips_2022_v2, "all_trips.csv")
write_csv(hour_data, "hour_data_ride.csv")
write_csv(one_hour_ride, "one_hour_ride.csv")

#total and average weekly rides by rider type
summary_ride_weekly <- all_trips_2022_v2 %>% 
  mutate(weekday = wday(started_at, label = TRUE)) %>% 
  group_by(member_casual, weekday) %>% 
  summarise(number_of_rides = n()
            ,average_duration = mean(ride_length)) %>% 
  arrange(member_casual, weekday)  
  
write_csv(summary_ride_weekly, "summary_ride_weekly.csv")

#total and average weekly rides by rider type
summary_ride_weekly_type <- all_trips_2022_v2 %>% 
  mutate(weekday = wday(started_at, label = TRUE)) %>% 
  group_by(member_casual, weekday, rideable_type) %>% 
  summarise(number_of_rides = n()
            ,average_duration = mean(ride_length)) %>% 
  arrange(member_casual, weekday)  
  
write_csv(summary_ride_weekly_type, "summary_ride_weekly_type.csv")
  
#total and avg monthly rides by rider type
summary_month <- all_trips_2022_v2 %>% 
  mutate(month = month(started_at, label = TRUE)) %>%  
  group_by(month,member_casual) %>%  
  summarise(number_of_rides = n()
            ,average_duration = mean(ride_length)) %>%    
  arrange(month, member_casual)
write_csv(summary_month, "summary_ride_monthly.csv")

#most popular stations
popular_stations <- all_trips_2022_v2 %>%
  mutate(station = start_station_name) %>%
  drop_na(start_station_name) %>%
  group_by(start_station_name, member_casual) %>%
  summarise(number_of_rides=n())

write_csv(popular_stations, "popular_stations.csv")

#total membership types and rideable types
total_riders <- data.frame(table(all_trips_2022_v2$member_casual))
total_types <- data.frame(table(all_trips_2022_v2$rideable_type))

write_csv(total_riders, "total_riders.csv")
write_csv(total_types, "total_types.csv")
```

