R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button, a document will be generated that includes both the content and the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

# Print summary statistics for every column of the `cars` data set.
summary(object = cars)
##      speed           dist       
##  Min.   : 4.0   Min.   :  2.00  
##  1st Qu.:12.0   1st Qu.: 26.00  
##  Median :15.0   Median : 36.00  
##  Mean   :15.4   Mean   : 42.98  
##  3rd Qu.:19.0   3rd Qu.: 56.00  
##  Max.   :25.0   Max.   :120.00

Including Plots

You can also embed plots, for example:

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.

# ---- Setup: packages and raw input files ----
# Install only the packages that are missing; re-installing on every run is
# slow and needs network access.
required_packages <- c(
  "readxl", "readr", "tidyr", "tidyverse", "conflicted", "dplyr"
)
is_installed <- vapply(
  required_packages, requireNamespace, logical(1),
  quietly = TRUE
)
if (!all(is_installed)) {
  install.packages(required_packages[!is_installed])
}

library(readr)
library(readxl)
library(tidyr)
library(tidyverse) # helps wrangle data
# Use the conflicted package to manage conflicts
library(conflicted)
library(dplyr)

# Set dplyr::filter and dplyr::lag as the default choices
conflict_prefer("filter", "dplyr")
conflict_prefer("lag", "dplyr")

# Work from the folder that holds the original downloads. Relative paths used
# after this point resolve against this folder.
setwd("/Users/urvidesai/Desktop/Data Analyst Case Study/OG Files")

# list.files() takes a regular expression, not a shell glob, so match the
# ".xls"/".xlsx" extension explicitly.
my_files <- list.files(pattern = "\\.xlsx?$")
my_files
# View() needs an interactive session; skip it when the script is knitted.
if (interactive()) {
  View(my_files)
}

# Upload Divvy datasets (csv files) here

# Install readr and dplyr only if they are not already installed, then load
# them. library() stops with an error if loading fails, whereas require()
# would only return FALSE.
for (pkg in c("readr", "dplyr")) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
}
library(readr) # Load the readr package
library(dplyr) # Load the dplyr package

# Read the two quarterly trip files into separate data frames.
csv_dir <- "~/Desktop/Data Analyst Case Study/OG Files/csv"
q1_2019 <- read_csv(file.path(csv_dir, "Divvy_Trips_2019_Q1.csv"))
q1_2020 <- read_csv(file.path(csv_dir, "Divvy_Trips_2020_Q1.csv"))

# Compare the column names of the two files before combining them.
colnames(q1_2019)
colnames(q1_2020)

# Install dplyr if it's not already installed, then load it.
if (!requireNamespace("dplyr", quietly = TRUE)) {
  install.packages("dplyr")
}
library(dplyr)

# Rename the q1_2019 columns in a single call. Renaming in two passes fails:
# the second pass would look for columns (trip_id, bikeid, ...) that the first
# pass has already renamed. The outer parentheses print the result.
(q1_2019 <- rename(
  q1_2019,
  ride_id = trip_id,
  rideable_type = bikeid,
  started_at = start_time,
  ended_at = end_time,
  start_station_name = from_station_name,
  start_station_id = from_station_id,
  end_station_name = to_station_name,
  end_station_id = to_station_id,
  member_casual = usertype
))

# Inspect the structure of both data frames. Pass the objects themselves;
# quoting the names would only describe a one-element character vector.
str(q1_2019)
str(q1_2020)

# Convert ride_id and rideable_type to character.
# NOTE(review): presumably so they stack with q1_2020 - confirm its types.
q1_2019 <- mutate(
  q1_2019,
  ride_id = as.character(ride_id),
  rideable_type = as.character(rideable_type)
)

Stack individual quarter’s data frames into one big data frame

# Load the readr package (if you're reading a CSV) and the dplyr package (if
# you're using dplyr functions), installing them first only when missing.
for (pkg in c("readr", "dplyr")) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
}
library(readr)
library(dplyr)

# Create all_trips by combining the quarterly data frames. Only q1_2019 and
# q1_2020 are created in this document, so only those two are stacked.
# all_trips is built from these data frames rather than from the raw CSV
# files, so the q1_2019 column renames carry over into it.
all_trips <- bind_rows(q1_2019, q1_2020)

# Now you can use the data frame:
# Remove lat, long, birthyear, and gender fields as this data was dropped
# beginning in 2020. any_of() ignores names that are not present, so this
# step can be re-run without failing on columns that are already gone.
all_trips <- all_trips %>%
  select(-any_of(c(
    "start_lat", "start_lng", "end_lat", "end_lng", "birthyear", "gender"
  )))

# ---- CLEAN UP AND ADD DATA TO PREPARE FOR ANALYSIS ----
# Each call sits on its own line so the trailing comments no longer comment
# out the calls that follow them.
colnames(all_trips) # List of column names
nrow(all_trips) # How many rows are in data frame?
dim(all_trips) # Dimensions of the data frame?
head(all_trips) # See the first 6 rows of data frame. Also tail(all_trips)
str(all_trips) # See list of columns and data types (numeric, character, etc)
summary(all_trips) # Statistical summary of data. Mainly for numerics

# Count how many rows carry each member_casual label.
table(all_trips$member_casual)

# Make sure dplyr is loaded (recode is part of dplyr); install it only if it
# is missing.
if (!requireNamespace("dplyr", quietly = TRUE)) {
  install.packages("dplyr")
}
library(dplyr)

Correct mutate() and recode() usage

# Relabel the rider types so a single pair of labels is used throughout:
# "Subscriber" becomes "member" and "Customer" becomes "casual".
all_trips <- all_trips %>%
  mutate(
    member_casual = recode(
      member_casual,
      "Subscriber" = "member",
      "Customer" = "casual"
    )
  )

# Add date, month, day, year and day-of-week columns derived from started_at.
all_trips$date <- as.Date(all_trips$started_at) # The default format is yyyy-mm-dd
all_trips$month <- format(as.Date(all_trips$date), "%m")
all_trips$day <- format(as.Date(all_trips$date), "%d")
all_trips$year <- format(as.Date(all_trips$date), "%Y")
all_trips$day_of_week <- format(as.Date(all_trips$date), "%A")

# Ride length as a time difference. The units are fixed to seconds so the
# numeric values below do not depend on difftime()'s automatic unit choice.
all_trips$ride_length <- difftime(
  all_trips$ended_at, all_trips$started_at,
  units = "secs"
)
str(all_trips)

# Convert ride_length to a plain numeric vector (seconds) so that summary
# calculations can be run on it.
is.factor(all_trips$ride_length)
all_trips$ride_length <- as.numeric(all_trips$ride_length, units = "secs")
is.numeric(all_trips$ride_length)

# Drop rows whose start station is "HQ QR" or whose ride_length is negative.
# %in% and the is.na() guard keep rows with missing values as they are,
# instead of turning them into all-NA rows as `==` and `<` would.
is_hq_row <- all_trips$start_station_name %in% "HQ QR"
is_negative_ride <- !is.na(all_trips$ride_length) & all_trips$ride_length < 0
all_trips_v2 <- all_trips[!(is_hq_row | is_negative_ride), ]

# ---- CONDUCT DESCRIPTIVE ANALYSIS ----
# Summary statistics of ride_length over all rows of all_trips_v2.
mean(all_trips_v2$ride_length) # straight average (total ride length / rides)
median(all_trips_v2$ride_length) # midpoint number in the ascending array of ride lengths
max(all_trips_v2$ride_length) # longest ride
min(all_trips_v2$ride_length) # shortest ride
summary(all_trips_v2$ride_length)

Compare members and casual users

# Mean, median, maximum and minimum ride length for each rider type. The
# formula + `data =` form labels the result columns with the plain column
# names.
aggregate(ride_length ~ member_casual, data = all_trips_v2, FUN = mean)
aggregate(ride_length ~ member_casual, data = all_trips_v2, FUN = median)
aggregate(ride_length ~ member_casual, data = all_trips_v2, FUN = max)
aggregate(ride_length ~ member_casual, data = all_trips_v2, FUN = min)

# Average ride length for each rider type on each day of the week.
aggregate(
  ride_length ~ member_casual + day_of_week,
  data = all_trips_v2,
  FUN = mean
)

# Put the days of the week into calendar order (Sunday first).
all_trips_v2$day_of_week <- ordered(
  all_trips_v2$day_of_week,
  levels = c(
    "Sunday", "Monday", "Tuesday", "Wednesday",
    "Thursday", "Friday", "Saturday"
  )
)

# Same aggregate again, now reported in calendar order.
aggregate(
  ride_length ~ member_casual + day_of_week,
  data = all_trips_v2,
  FUN = mean
)

# Install lubridate only if it is not already installed, then load it.
if (!requireNamespace("lubridate", quietly = TRUE)) {
  install.packages("lubridate")
}
library(lubridate) # Load the package

# Number of rides and average ride duration per rider type and weekday.
all_trips_v2 %>%
  mutate(weekday = wday(started_at, label = TRUE)) %>% # creates weekday field using wday()
  group_by(member_casual, weekday) %>% # groups by usertype and weekday
  summarise(
    number_of_rides = n(), # calculates the number of rides
    average_duration = mean(ride_length), # calculates the average duration
    .groups = "drop" # return an ungrouped result
  ) %>%
  arrange(member_casual, weekday) # sorts

# Install ggplot2 only if it is not already installed, then load it.
if (!requireNamespace("ggplot2", quietly = TRUE)) {
  install.packages("ggplot2")
}
library(ggplot2) # Load ggplot2

Let’s visualize the number of rides by rider type

# Bar chart: number of rides per weekday, one bar per rider type.
all_trips_v2 %>%
  mutate(weekday = wday(started_at, label = TRUE)) %>%
  group_by(member_casual, weekday) %>%
  summarise(
    number_of_rides = n(),
    average_duration = mean(ride_length),
    .groups = "drop"
  ) %>%
  arrange(member_casual, weekday) %>%
  ggplot(aes(x = weekday, y = number_of_rides, fill = member_casual)) +
  geom_col(position = "dodge")

# Let's create a visualization for average duration
all_trips_v2 %>%
  mutate(weekday = wday(started_at, label = TRUE)) %>%
  group_by(member_casual, weekday) %>%
  summarise(
    number_of_rides = n(),
    average_duration = mean(ride_length),
    .groups = "drop"
  ) %>%
  arrange(member_casual, weekday) %>%
  ggplot(aes(x = weekday, y = average_duration, fill = member_casual)) +
  geom_col(position = "dodge")

# Line chart: number of rides across the days of the week, one line per
# rider type.
all_trips_v2 %>%
  mutate(weekday = wday(started_at, label = TRUE)) %>%
  group_by(member_casual, weekday) %>%
  summarise(
    number_of_rides = n(),
    average_duration = mean(ride_length, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  arrange(member_casual, weekday) %>%
  ggplot(aes(
    x = weekday, y = number_of_rides,
    color = member_casual, group = member_casual
  )) +
  geom_line(size = 1) + # Add lines for each group
  geom_point(size = 2) + # Add points for better visibility
  labs(
    title = "Usage Patterns Across Days of the Week",
    x = "Day of Week",
    y = "Number of Rides",
    color = "User Type"
  ) +
  theme_minimal()

# Heatmap: number of rides by hour of day and weekday, one panel per rider
# type.
all_trips_v2 %>%
  mutate(
    hour = hour(started_at), # Extract the hour from the timestamp
    weekday = wday(started_at, label = TRUE)
  ) %>%
  group_by(member_casual, weekday, hour) %>%
  summarise(
    number_of_rides = n(),
    average_duration = mean(ride_length, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  ggplot(aes(x = hour, y = weekday, fill = number_of_rides)) +
  geom_tile() + # Create a heatmap
  scale_fill_gradient(low = "Pink", high = "darkblue") + # Adjust color gradient
  facet_wrap(~member_casual) + # Create separate plots for member vs casual
  labs(
    title = "Ride Frequency by Time of Day and User Type",
    x = "Hour of Day",
    y = "Day of Week",
    fill = "Number of Rides"
  ) +
  theme_minimal()

library(dplyr)

Count occurrences

Use a better name for your data frame

library(dplyr)

# Use the cleaned data frame under a descriptive name. The aggregate below
# needs the ride_length, member_casual and day_of_week columns of
# all_trips_v2.
trip_data <- all_trips_v2

# Count the rows for each rider type. count() takes a bare column name; a
# quoted string would be counted as a single constant value.
# NOTE(review): member_casual replaces the placeholder column name "category"
# - confirm this is the intended grouping.
counts <- trip_data %>% count(member_casual)
print(counts)

# Example: Handling missing values (if needed)
# trip_data_cleaned <- trip_data %>% drop_na(ride_length, member_casual, day_of_week)

# Mean ride length per rider type and day of week, exported to CSV.
counts_agg <- aggregate(
  ride_length ~ member_casual + day_of_week,
  data = trip_data,
  FUN = mean
)
write.csv(counts_agg, file = "avg_ride_length.csv")

# Mean ride length per rider type and day of week, written to a CSV file in
# the current working directory. The formula + `data =` form gives the
# columns their plain names in the exported file.
counts <- aggregate(
  ride_length ~ member_casual + day_of_week,
  data = all_trips_v2,
  FUN = mean
)
write.csv(counts, file = "avg_ride_length.csv")

Aggregate to calculate the mean ride length by member type and day of week

# One row per (member_casual, day_of_week) combination in all_trips_v2, with
# ride_length averaged within each combination.
counts <- aggregate(
  ride_length ~ member_casual + day_of_week,
  data = all_trips_v2,
  FUN = mean
)

Export the aggregated data to a CSV file

# Save `counts` as avg_ride_length.csv in the working directory, leaving out
# the row-name column.
write.csv(
  counts,
  file = "avg_ride_length.csv",
  row.names = FALSE
)

Message to indicate success

# Report that the export has finished and show where relative paths resolve.
print("CSV file 'avg_ride_length.csv' has been successfully created!")
getwd() # This shows the current working directory

Guiding Questions and Key Insights

Were you able to answer how annual members and casual riders use Cyclistic bikes differently?
- Key Insight: Annual members may use bikes for commuting or daily errands, reflected in shorter, consistent ride lengths and weekday usage. Casual riders likely use bikes for leisure, with longer ride lengths and peak usage during weekends.

What story does your data tell?
- Cyclistic bikes serve two distinct user groups with varying needs. Understanding these patterns can help tailor marketing, promotions, and bike placement to maximize usage.

How do your findings relate to your original question?
- By identifying behavioral differences, the findings guide targeted strategies for increasing annual memberships and improving user satisfaction.

Who is your audience?
- The executive team and stakeholders who need actionable insights for decision-making.

Can data visualization help you share your findings?
- Yes, clear visualizations like bar charts, line graphs, and heatmaps can effectively communicate user patterns.