This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button, a document will be generated that includes both the content and the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
summary(cars)
##      speed           dist
##  Min.   : 4.0   Min.   :  2.00
##  1st Qu.:12.0   1st Qu.: 26.00
##  Median :15.0   Median : 36.00
##  Mean   :15.4   Mean   : 42.98
##  3rd Qu.:19.0   3rd Qu.: 56.00
##  Max.   :25.0   Max.   :120.00
You can also embed plots, for example:
Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.
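For reference, a minimal sketch of such a chunk (mirroring the default R Markdown template, which plots the built-in pressure dataset) looks like the lines below; because echo = FALSE is set in the chunk header, the knitted document shows the figure but hides the code:

# Chunk header in the .Rmd source: {r pressure, echo=FALSE}
plot(pressure)  # the figure appears in the knitted output; this line of code does not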
install.packages("readxl")
install.packages("readr")
install.packages("dplyr")
library(readr)
rm(list = ls())
library(readxl)
library(tidyr)
setwd("/Users/urvidesai/Desktop/Data Analyst Case Study/OG Files")
my_files <- list.files(pattern = "*.xls")
my_files
View(my_files)
library(tidyverse)  # helps wrangle data
# Use the conflicted package to manage conflicts
library(conflicted)
# Set dplyr::filter and dplyr::lag as the default choices
conflict_prefer("filter", "dplyr")
conflict_prefer("lag", "dplyr")
library(dplyr)
# Upload Divvy datasets (csv files) here
if (!require(readr)) { install.packages("readr") }  # Install readr if it's not already installed
library(readr)  # Load the readr package
q1_2019 <- read_csv("~/Desktop/Data Analyst Case Study/OG Files/csv/Divvy_Trips_2019_Q1.csv")
q1_2020 <- read_csv("~/Desktop/Data Analyst Case Study/OG Files/csv/Divvy_Trips_2020_Q1.csv")
colnames(q1_2019)
colnames(q1_2020)
if (!require(dplyr)) { install.packages("dplyr") }  # Install dplyr if it's not already installed
library(dplyr)  # Load the dplyr package
# Rename the q1_2019 columns to match the q1_2020 naming convention
(q1_2019 <- rename(q1_2019,
                   ride_id = trip_id,
                   rideable_type = bikeid,
                   started_at = start_time,
                   ended_at = end_time,
                   start_station_name = from_station_name,
                   start_station_id = from_station_id,
                   end_station_name = to_station_name,
                   end_station_id = to_station_id,
                   member_casual = usertype))
str(q1_2019)
str(q1_2020)
q1_2019 <- mutate(q1_2019,
                  ride_id = as.character(ride_id),
                  rideable_type = as.character(rideable_type))
all_trips <- bind_rows(q1_2019, q1_2020)  #, q3_2019, q4_2019
if (!require(readr)) { install.packages("readr") }
library(readr)
if (!require(dplyr)) { install.packages("dplyr") }
library(dplyr)
# OR: load a single quarter directly instead of binding both
# (each read_csv() call below would overwrite all_trips, so they are left commented out)
# all_trips <- read_csv("~/Desktop/Data Analyst Case Study/OG Files/csv/Divvy_Trips_2019_Q1.csv")
# all_trips <- read_csv("~/Desktop/Data Analyst Case Study/OG Files/csv/Divvy_Trips_2020_Q1.csv")
all_trips <- all_trips %>%
  select(-c(start_lat, start_lng, end_lat, end_lng))
# CLEAN UP AND ADD DATA TO PREPARE FOR ANALYSIS
colnames(all_trips)  # List of column names
nrow(all_trips)      # How many rows are in the data frame?
dim(all_trips)       # Dimensions of the data frame
head(all_trips)      # See the first 6 rows of the data frame; also tail(all_trips)
str(all_trips)       # List of columns and data types (numeric, character, etc.)
summary(all_trips)   # Statistical summary of the data, mainly for numeric columns
table(all_trips$member_casual)
# Make sure dplyr is loaded (recode() is part of dplyr)
if (!require(dplyr)) { install.packages("dplyr") }
library(dplyr)
all_trips <- all_trips %>%
  mutate(member_casual = recode(member_casual,
                                "Subscriber" = "member",
                                "Customer" = "casual"))
all_trips$date <- as.Date(all_trips$started_at)  # The default format is yyyy-mm-dd
all_trips$month <- format(as.Date(all_trips$date), "%m")
all_trips$day <- format(as.Date(all_trips$date), "%d")
all_trips$year <- format(as.Date(all_trips$date), "%Y")
all_trips$day_of_week <- format(as.Date(all_trips$date), "%A")
all_trips$ride_length <- difftime(all_trips$ended_at, all_trips$started_at)
str(all_trips)
is.factor(all_trips$ride_length)
all_trips$ride_length <- as.numeric(as.character(all_trips$ride_length))
is.numeric(all_trips$ride_length)
all_trips_v2 <- all_trips[!(all_trips$start_station_name == "HQ QR" | all_trips$ride_length < 0), ]
# CONDUCT DESCRIPTIVE ANALYSIS
mean(all_trips_v2$ride_length)    # straight average (total ride length / rides)
median(all_trips_v2$ride_length)  # midpoint number in the ascending array of ride lengths
max(all_trips_v2$ride_length)     # longest ride
min(all_trips_v2$ride_length)     # shortest ride
summary(all_trips_v2$ride_length)
aggregate(all_trips_v2$ride_length ~ all_trips_v2$member_casual, FUN = mean)
aggregate(all_trips_v2$ride_length ~ all_trips_v2$member_casual, FUN = median)
aggregate(all_trips_v2$ride_length ~ all_trips_v2$member_casual, FUN = max)
aggregate(all_trips_v2$ride_length ~ all_trips_v2$member_casual, FUN = min)
aggregate(all_trips_v2$ride_length ~ all_trips_v2$member_casual + all_trips_v2$day_of_week, FUN = mean)
all_trips_v2$day_of_week <- ordered(all_trips_v2$day_of_week,
                                    levels = c("Sunday", "Monday", "Tuesday", "Wednesday",
                                               "Thursday", "Friday", "Saturday"))
aggregate(all_trips_v2$ride_length ~ all_trips_v2$member_casual + all_trips_v2$day_of_week, FUN = mean)
install.packages("lubridate")  # Install lubridate if not already installed
library(lubridate)  # Load the package
all_trips_v2 %>%
  mutate(weekday = wday(started_at, label = TRUE)) %>%  # creates a weekday field using wday()
  group_by(member_casual, weekday) %>%                  # groups by usertype and weekday
  summarise(number_of_rides = n(),                      # calculates the number of rides
            average_duration = mean(ride_length)) %>%   # calculates the average duration
  arrange(member_casual, weekday)
install.packages("ggplot2")  # Install ggplot2 if not already installed
library(ggplot2)  # Load ggplot2
all_trips_v2 %>%
  mutate(weekday = wday(started_at, label = TRUE)) %>%
  group_by(member_casual, weekday) %>%
  summarise(number_of_rides = n(),
            average_duration = mean(ride_length)) %>%
  arrange(member_casual, weekday) %>%
  ggplot(aes(x = weekday, y = number_of_rides, fill = member_casual)) +
  geom_col(position = "dodge")

# Let's create a visualization for average duration
all_trips_v2 %>%
  mutate(weekday = wday(started_at, label = TRUE)) %>%
  group_by(member_casual, weekday) %>%
  summarise(number_of_rides = n(),
            average_duration = mean(ride_length)) %>%
  arrange(member_casual, weekday) %>%
  ggplot(aes(x = weekday, y = average_duration, fill = member_casual)) +
  geom_col(position = "dodge")
all_trips_v2 %>%
  mutate(weekday = wday(started_at, label = TRUE)) %>%
  group_by(member_casual, weekday) %>%
  summarise(number_of_rides = n(),
            average_duration = mean(ride_length, na.rm = TRUE)) %>%
  arrange(member_casual, weekday) %>%
  ggplot(aes(x = weekday, y = number_of_rides, color = member_casual, group = member_casual)) +
  geom_line(size = 1) +   # Add lines for each group
  geom_point(size = 2) +  # Add points for better visibility
  labs(title = "Usage Patterns Across Days of the Week",
       x = "Day of Week",
       y = "Number of Rides",
       color = "User Type") +
  theme_minimal()
all_trips_v2 %>%
  mutate(hour = hour(started_at),                 # Extract the hour from the timestamp
         weekday = wday(started_at, label = TRUE)) %>%
  group_by(member_casual, weekday, hour) %>%
  summarise(number_of_rides = n(),
            average_duration = mean(ride_length, na.rm = TRUE)) %>%
  ggplot(aes(x = hour, y = weekday, fill = number_of_rides)) +
  geom_tile() +                                            # Create a heatmap
  scale_fill_gradient(low = "pink", high = "darkblue") +   # Adjust the color gradient
  facet_wrap(~ member_casual) +                            # Separate plots for member vs casual
  labs(title = "Ride Frequency by Time of Day and User Type",
       x = "Hour of Day",
       y = "Day of Week",
       fill = "Number of Rides") +
  theme_minimal()
library(dplyr)
trip_data <- read.csv("~/Desktop/Data Analyst Case Study/OG Files/csv/Divvy_Trips_2019_Q1.csv")  # Load your data (replace with your file)
trip_data <- read.csv("~/Desktop/Data Analyst Case Study/OG Files/csv/Divvy_Trips_2020_Q1.csv")  # Note: this second read overwrites the first, so trip_data holds only Q1 2020
library(dplyr)
counts <- trip_data %>% count(category)  # If you're using dplyr's count(); `category` is a placeholder column name
print(counts)
# Example: handling missing values (if needed); drop_na() comes from tidyr
# trip_data_cleaned <- trip_data %>% drop_na(ride_length, member_casual, day_of_week)
counts_agg <- aggregate(trip_data$ride_length ~ trip_data$member_casual + trip_data$day_of_week, FUN = mean)
write.csv(counts_agg, file = "avg_ride_length.csv")
counts <- aggregate(all_trips_v2$ride_length ~ all_trips_v2$member_casual + all_trips_v2$day_of_week, FUN = mean)
write.csv(counts, file = "avg_ride_length.csv")
counts <- aggregate(ride_length ~ member_casual + day_of_week, data = all_trips_v2, FUN = mean)
write.csv(counts, file = “avg_ride_length.csv”, row.names = FALSE)
print("CSV file 'avg_ride_length.csv' has been successfully created!")
getwd()  # This shows the current working directory
Were you able to answer how annual members and casual riders use Cyclistic bikes differently?
- Key Insight: Annual members may use bikes for commuting or daily errands, reflected in shorter, consistent ride lengths and weekday usage. Casual riders likely use bikes for leisure, with longer ride lengths and peak usage during weekends (see the sketch below for a quick check).
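One way to sanity-check this insight is a quick weekend-versus-weekday comparison. A minimal sketch, assuming all_trips_v2 with its ride_length and day_of_week columns is still in memory:

library(dplyr)

all_trips_v2 %>%
  mutate(is_weekend = day_of_week %in% c("Saturday", "Sunday")) %>%
  group_by(member_casual, is_weekend) %>%
  summarise(number_of_rides = n(),
            average_duration = mean(ride_length, na.rm = TRUE),
            .groups = "drop")

Longer average durations and a weekend skew for casual riders would support the pattern described above.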
What story does your data tell?
- Cyclistic bikes serve two distinct user groups with varying needs.
Understanding these patterns can help tailor marketing, promotions, and
bike placement to maximize usage.
How do your findings relate to your original question?
- By identifying behavioral differences, the findings guide targeted
strategies for increasing annual memberships and improving user
satisfaction.
Who is your audience?
- The executive team and stakeholders who need actionable insights for
decision-making.
Can data visualization help you share your findings?
- Yes, clear visualizations like bar charts, line graphs, and heatmaps can effectively communicate user patterns; a sketch for exporting them is shown below.
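To circulate these charts outside RStudio, one option is to export them with ggsave() from ggplot2. This is a minimal sketch, assuming one of the charts built above has been stored in a hypothetical object named rides_by_weekday_plot:

library(ggplot2)

# rides_by_weekday_plot is a hypothetical object holding one of the ggplot charts built above
ggsave("rides_by_weekday.png", plot = rides_by_weekday_plot,
       width = 8, height = 5, dpi = 300)  # writes a PNG to the current working directory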