#TRACK 1:Divv Casestudy #Coauthered By:Daizy Hearnsberger & Wendy Stremmel #google Daya Analytics Professional Certification Capstone Project #Merit America Data Analytics Cohort jan’2022 #Date Completed: 4/14/2022
#deliverables #=============
#Description of all data sources used #Documentiontation of any cleaning or manipulation of data #Summary of your analysis #Supporting Visualizations and key findings #Top three recommendationsbased on our analysis
#Buisness Task #==============
#Divvy’s Director of marketing, Lily Monroe, believes that maximizing Annual memeber numbers are key to future sucess for the company. Our marketing analysis team has been tasked with analyzing dta to answer the question below and give three recommendations based on the findings.
#how do annual members and casual riders use Divvy’bikes differently
#data description #=================
#Documentation-R Script #======================
#Collaberated work by Wendy Stremmel, Daizy Hearnsberger #The purpose of this scripe is to consolidate downloaded Divvy data into a single dataframe #Then conduct simple analysis to find the answer for our key question #In what ways do members and casual riders use Divvy bikes differently?
#___________________ #For sake of consistency, Cyclistics pseudo company name used in capstone description was changed to Divvy, company with public datasets actually belong
library(tidyverse) #helps wrangle data library(lubridate) #helps wrangle date attributes library(ggplot2) #helps visualize data getwd() #displays your working directory setwd(“/Users/Kayer/Desktop/Divvy_Exercise/csv”) #sets your working directory to simplify calls to data …
#===================== # Upload Divvy datasets (csv files) here q2_2019 <- read_csv(“Divvy_Trips_2019_Q2.csv”) q3_2019 <- read_csv(“Divvy_Trips_2019_Q3.csv”) q4_2019 <- read_csv(“Divvy_Trips_2019_Q4.csv”) q1_2020 <- read_csv(“Divvy_Trips_2020_Q1.csv”)
colnames(q3_2019) colnames(q4_2019) colnames(q2_2019) colnames(q1_2020)
(q4_2019 <- rename(q4_2019 ,ride_id = trip_id ,rideable_type = bikeid ,started_at = start_time ,ended_at = end_time ,start_station_name = from_station_name ,start_station_id = from_station_id ,end_station_name = to_station_name ,end_station_id = to_station_id ,member_casual = usertype))
(q3_2019 <- rename(q3_2019 ,ride_id = trip_id ,rideable_type = bikeid ,started_at = start_time ,ended_at = end_time ,start_station_name = from_station_name ,start_station_id = from_station_id ,end_station_name = to_station_name ,end_station_id = to_station_id ,member_casual = usertype))
(q2_2019 <- rename(q2_2019 ,ride_id = “01 - Rental Details Rental ID” ,rideable_type = “01 - Rental Details Bike ID” ,started_at = “01 - Rental Details Local Start Time” ,ended_at = “01 - Rental Details Local End Time” ,start_station_name = “03 - Rental Start Station Name” ,start_station_id = “03 - Rental Start Station ID” ,end_station_name = “02 - Rental End Station Name” ,end_station_id = “02 - Rental End Station ID” ,member_casual = “User Type”))
str(q1_2020) str(q4_2019) str(q3_2019) str(q2_2019)
q4_2019 <- mutate(q4_2019, ride_id = as.character(ride_id) ,rideable_type = as.character(rideable_type)) q3_2019 <- mutate(q3_2019, ride_id = as.character(ride_id) ,rideable_type = as.character(rideable_type)) q2_2019 <- mutate(q2_2019, ride_id = as.character(ride_id) ,rideable_type = as.character(rideable_type))
all_trips <- bind_rows(q2_2019, q3_2019, q4_2019, q1_2020)
all_trips <- all_trips %>% select(-c(start_lat, start_lng, end_lat, end_lng, birthyear, gender, “01 - Rental Details Duration In Seconds Uncapped”, “05 - Member Details Member Birthday Year”, “Member Gender”, “tripduration”))
#====================================================== # STEP 3: CLEAN UP AND ADD DATA TO PREPARE FOR ANALYSIS #====================================================== # Inspect the new table that has been created colnames(all_trips) #List of column names nrow(all_trips) #How many rows are in data frame? dim(all_trips) #Dimensions of the data frame? head(all_trips) #See the first 6 rows of data frame. Also tail(all_trips) str(all_trips) #See list of columns and data types (numeric, character, etc) summary(all_trips) #Statistical summary of data. Mainly for numerics
table(all_trips$member_casual)
all_trips <- all_trips %>% mutate(member_casual = recode(member_casual ,“Subscriber” = “member” ,“Customer” = “casual”))
table(all_trips$member_casual)
all_trips\(date <- as.Date(all_trips\)started_at) #The default format is yyyy-mm-dd all_trips\(month <- format(as.Date(all_trips\)date), “%m”) all_trips\(day <- format(as.Date(all_trips\)date), “%d”) all_trips\(year <- format(as.Date(all_trips\)date), “%Y”) all_trips\(day_of_week <- format(as.Date(all_trips\)date), “%A”)
all_trips\(ride_length <- difftime(all_trips\)ended_at,all_trips$started_at)
str(all_trips)
is.factor(all_trips\(ride_length) all_trips\)ride_length <- as.numeric(as.character(all_trips\(ride_length)) is.numeric(all_trips\)ride_length)
all_trips_v2 <- all_trips[!(all_trips\(start_station_name == "HQ QR" | all_trips\)ride_length<0),]
#===================================== # STEP 4: CONDUCT DESCRIPTIVE ANALYSIS #===================================== # Descriptive analysis on ride_length (all figures in seconds) mean(all_trips_v2\(ride_length) #straight average (total ride length / rides) median(all_trips_v2\)ride_length) #midpoint number in the ascending array of ride lengths max(all_trips_v2\(ride_length) #longest ride min(all_trips_v2\)ride_length) #shortest ride
summary(all_trips_v2$ride_length)
aggregate(all_trips_v2\(ride_length ~ all_trips_v2\)member_casual, FUN = mean) aggregate(all_trips_v2\(ride_length ~ all_trips_v2\)member_casual, FUN = median) aggregate(all_trips_v2\(ride_length ~ all_trips_v2\)member_casual, FUN = max) aggregate(all_trips_v2\(ride_length ~ all_trips_v2\)member_casual, FUN = min)
aggregate(all_trips_v2\(ride_length ~ all_trips_v2\)member_casual + all_trips_v2$day_of_week, FUN = mean)
all_trips_v2\(day_of_week <- ordered(all_trips_v2\)day_of_week, levels=c(“Sunday”, “Monday”, “Tuesday”, “Wednesday”, “Thursday”, “Friday”, “Saturday”))
aggregate(all_trips_v2\(ride_length ~ all_trips_v2\)member_casual + all_trips_v2$day_of_week, FUN = mean)
all_trips_v2 %>% mutate(weekday = wday(started_at, label = TRUE)) %>% #creates weekday field using wday() group_by(member_casual, weekday) %>% #groups by usertype and weekday summarise(number_of_rides = n() #calculates the number of rides and average duration ,average_duration = mean(ride_length)) %>% # calculates the average duration arrange(member_casual, weekday) # sorts
all_trips_v2 %>% mutate(weekday = wday(started_at, label = TRUE)) %>% group_by(member_casual, weekday) %>% summarise(number_of_rides = n() ,average_duration = mean(ride_length)) %>% arrange(member_casual, weekday) %>% ggplot(aes(x = weekday, y = number_of_rides, fill = member_casual)) + geom_col(position = “dodge”)
all_trips_v2 %>% mutate(weekday = wday(started_at, label = TRUE)) %>% group_by(member_casual, weekday) %>% summarise(number_of_rides = n() ,average_duration = mean(ride_length)) %>% arrange(member_casual, weekday) %>% ggplot(aes(x = weekday, y = average_duration, fill = member_casual)) + geom_col(position = “dodge”)
#================================================= # STEP 5: EXPORT SUMMARY FILE FOR FURTHER ANALYSIS #================================================= # Create a csv file that we will visualize in Excel, Tableau, or my presentation software #setwd (“/Users/Kayer/Desktop/Divvy_Exercise/csv”) counts <- aggregate(all_trips_v2\(ride_length ~ all_trips_v2\)member_casual + all_trips_v2$day_of_week, FUN = mean) write.csv(counts, file = ‘~/Desktop/Divvy_Exercise/avg_ride_length.csv’)
#You’re done! Congratulations!
#Recommendations : #The membership is not bring in enough revenue,The casual rider is bring in 50% more revenue than members
#It will be necessary to revamp the membership
#Marketing campaign: offer casual rider no fees to start, To bring in up annual numbers, With attracting more casual riders as well