You are a junior data analyst working in the marketing analyst team at Cyclistic, a bike-share company in Chicago. The director of marketing believes the company’s future success depends on maximizing the number of annual memberships. (Therefore, your team wants to understand how casual riders and annual members use Cyclistic bikes differently). From these insights, your team will design a new marketing strategy to convert casual riders into annual members. But first, Cyclistic executives must approve your recommendations, so they must be backed up with compelling data insights and professional data visualizations.
This course included six steps to the data analysis process, which will be outlined in this project.
I used excel to preview the data initially. I did some preliminary cleaning of the data in excel and added the ride_length column. I used R to organize, clean, and process the data. I then used R to create data visualizations. I couldn’t use Tableau public since my dataset (~6.1million entries) was too large.
install.packages("readxl", repos = "http://cran.us.r-project.org")
install.packages("dplyr", repos = "http://cran.us.r-project.org")
install.packages("readr", repos = "http://cran.us.r-project.org")
install.packages("janitor", repos = "http://cran.us.r-project.org")
install.packages("lubridate", repos = "http://cran.us.r-project.org")
install.packages("tidyverse", repos = "http://cran.us.r-project.org")
install.packages("leaflet", repos = "http://cran.us.r-project.org")
library(readxl)
library(dplyr)
library(readr)
library(janitor)
library(lubridate)
library(tidyverse)
library(leaflet)
april_22_tripdata <- read_csv("/Users/bethanyleach/Desktop/12_months_cycle_data/april_2022_tripdata_revised.csv")
may_22_tripdata <- read_csv("/Users/bethanyleach/Desktop/12_months_cycle_data/may_2022_tripdata_revised.csv")
june_22_tripdata <- read_csv("/Users/bethanyleach/Desktop/12_months_cycle_data/june_2022_tripdata_revised.csv")
july_22_tripdata <- read_csv("/Users/bethanyleach/Desktop/12_months_cycle_data/july_2022_tripdata_revised.csv")
august_22_tripdata <- read_csv("/Users/bethanyleach/Desktop/12_months_cycle_data/august_2022_tripdata_revised.csv")
september_22_tripdata <- read_csv("/Users/bethanyleach/Desktop/12_months_cycle_data/september_2022_tripdata_revised.csv")
october_22_tripdata <- read_csv("/Users/bethanyleach/Desktop/12_months_cycle_data/october_2022_tripdata_revised.csv")
november_22_tripdata <- read_csv("/Users/bethanyleach/Desktop/12_months_cycle_data/november_2022_tripdata_revised.csv")
december_22_tripdata <- read_csv("/Users/bethanyleach/Desktop/12_months_cycle_data/december_2022_tripdata_revised.csv")
january_23_tripdata <- read_csv("/Users/bethanyleach/Desktop/12_months_cycle_data/january_2023_tripdata_revised.csv")
february_23_tripdata <- read_csv("/Users/bethanyleach/Desktop/12_months_cycle_data/february_2023_tripdata_revised.csv")
march_23_tripdata <- read_csv("/Users/bethanyleach/Desktop/12_months_cycle_data/march_2023_tripdata_revised.csv")
cyclistic_one_yr_data <- bind_rows(april_22_tripdata, may_22_tripdata,
june_22_tripdata, july_22_tripdata,
august_22_tripdata, september_22_tripdata, october_22_tripdata,
november_22_tripdata, december_22_tripdata, january_23_tripdata,
february_23_tripdata, march_23_tripdata)
glimpse(cyclistic_one_yr_data)
## Rows: 5,803,720
## Columns: 31
## $ ride_id <chr> "3564070EEFD12711", "0B820C7FCF22F489", "89EEEE3229…
## $ rideable_type <chr> "electric_bike", "classic_bike", "classic_bike", "c…
## $ started_at <chr> "4/6/22 17:42:48", "4/24/22 19:23:07", "4/20/22 19:…
## $ start_date <chr> "4/6/22", "4/24/22", "4/20/22", "4/22/22", "4/16/22…
## $ start_time <time> 17:42:48, 19:23:07, 19:29:08, 21:14:06, 15:56:30, …
## $ ended_at <chr> "4/6/22 17:54:36", "4/24/22 19:43:17", "4/20/22 19:…
## $ end_date <chr> "4/6/22", "4/24/22", "4/20/22", "4/22/22", "4/16/22…
## $ end_time <time> 17:54:36, 19:43:17, 19:35:16, 21:23:29, 16:02:11, …
## $ start_station_name <chr> "Paulina St & Howard St", "Wentworth Ave & Cermak R…
## $ start_station_id <chr> "515", "13075", "TA1307000121", "13075", "TA1307000…
## $ end_station_name <chr> "University Library (NU)", "Green St & Madison St",…
## $ end_station_id <chr> "605", "TA1307000120", "TA1307000120", "KA170600500…
## $ start_lat <dbl> 42.01913, 41.85308, 41.87184, 41.85308, 41.87181, 4…
## $ start_lng <dbl> -87.67353, -87.63193, -87.64664, -87.63193, -87.646…
## $ end_lat <dbl> 42.05294, 41.88189, 41.88189, 41.86749, 41.88224, 4…
## $ end_lng <dbl> -87.67345, -87.64879, -87.64879, -87.63219, -87.641…
## $ member_casual <chr> "member", "member", "member", "casual", "member", "…
## $ ride_length <time> 00:11:48, 00:20:10, 00:06:08, 00:09:23, 00:05:41, …
## $ day_of_week <dbl> 4, 1, 4, 6, 7, 5, 2, 3, 6, 6, 7, 4, 4, 7, 4, 2, 2, …
## $ ...20 <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ ...21 <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ ...22 <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ ...23 <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ ...24 <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ ...25 <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ ...26 <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ ...27 <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ ...28 <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ ...29 <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ ...30 <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ ...31 <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
summary(cyclistic_one_yr_data)
## ride_id rideable_type started_at start_date
## Length:5803720 Length:5803720 Length:5803720 Length:5803720
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## start_time ended_at end_date end_time
## Length:5803720 Length:5803720 Length:5803720 Length:5803720
## Class1:hms Class :character Class :character Class1:hms
## Class2:difftime Mode :character Mode :character Class2:difftime
## Mode :numeric Mode :numeric
##
##
##
## start_station_name start_station_id end_station_name end_station_id
## Length:5803720 Length:5803720 Length:5803720 Length:5803720
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## start_lat start_lng end_lat end_lng
## Min. :41.64 Min. :-87.84 Min. : 0.00 Min. :-88.14
## 1st Qu.:41.88 1st Qu.:-87.66 1st Qu.:41.88 1st Qu.:-87.66
## Median :41.90 Median :-87.64 Median :41.90 Median :-87.64
## Mean :41.90 Mean :-87.65 Mean :41.90 Mean :-87.65
## 3rd Qu.:41.93 3rd Qu.:-87.63 3rd Qu.:41.93 3rd Qu.:-87.63
## Max. :42.07 Max. :-87.52 Max. :42.37 Max. : 0.00
## NA's :5855 NA's :5855
## member_casual ride_length day_of_week ...20
## Length:5803720 Length:5803720 Min. :1.000 Mode:logical
## Class :character Class1:hms 1st Qu.:2.000 NA's:5803720
## Mode :character Class2:difftime Median :4.000
## Mode :numeric Mean :4.103
## 3rd Qu.:6.000
## Max. :7.000
##
## ...21 ...22 ...23 ...24 ...25
## Mode:logical Mode:logical Mode:logical Mode:logical Mode:logical
## NA's:5803720 TRUE:1 TRUE:11 NA's:5803720 NA's:5803720
## NA's:5803719 NA's:5803709
##
##
##
##
## ...26 ...27 ...28 ...29 ...30
## Mode:logical Mode:logical Mode:logical Mode:logical Mode:logical
## NA's:5803720 NA's:5803720 NA's:5803720 NA's:5803720 NA's:5803720
##
##
##
##
##
## ...31
## Mode:logical
## NA's:5803720
##
##
##
##
##
cyclistic_one_yr_data_2 <- cyclistic_one_yr_data
count(cyclistic_one_yr_data_2, rideable_type)
## # A tibble: 3 × 2
## rideable_type n
## <chr> <int>
## 1 classic_bike 2639559
## 2 docked_bike 173747
## 3 electric_bike 2990414
count(cyclistic_one_yr_data_2, member_casual)
## # A tibble: 2 × 2
## member_casual n
## <chr> <int>
## 1 casual 2337439
## 2 member 3466281
cyclistic_one_yr_data_3 <- cyclistic_one_yr_data_2
cyclistic_one_yr_data_3 <- cyclistic_one_yr_data_3 %>%
select(-c(...20,...21,...22,...23,...24,...25,...26,...27,...28,...29,...30,
...31))
colnames(cyclistic_one_yr_data_3)
## [1] "ride_id" "rideable_type" "started_at"
## [4] "start_date" "start_time" "ended_at"
## [7] "end_date" "end_time" "start_station_name"
## [10] "start_station_id" "end_station_name" "end_station_id"
## [13] "start_lat" "start_lng" "end_lat"
## [16] "end_lng" "member_casual" "ride_length"
## [19] "day_of_week"
nrow(cyclistic_one_yr_data_3)
## [1] 5803720
dim(cyclistic_one_yr_data_3)
## [1] 5803720 19
head(cyclistic_one_yr_data_3)
## # A tibble: 6 × 19
## ride_id rideable_type started_at start_date start_time ended_at end_date
## <chr> <chr> <chr> <chr> <time> <chr> <chr>
## 1 3564070EEFD1… electric_bike 4/6/22 17… 4/6/22 17:42:48 4/6/22 … 4/6/22
## 2 0B820C7FCF22… classic_bike 4/24/22 1… 4/24/22 19:23:07 4/24/22… 4/24/22
## 3 89EEEE32293F… classic_bike 4/20/22 1… 4/20/22 19:29:08 4/20/22… 4/20/22
## 4 84D4751AEB31… classic_bike 4/22/22 2… 4/22/22 21:14:06 4/22/22… 4/22/22
## 5 5664BCF0D1DE… electric_bike 4/16/22 1… 4/16/22 15:56:30 4/16/22… 4/16/22
## 6 AA9EB7BD2E1F… classic_bike 4/21/22 1… 4/21/22 16:52:33 4/21/22… 4/21/22
## # ℹ 12 more variables: end_time <time>, start_station_name <chr>,
## # start_station_id <chr>, end_station_name <chr>, end_station_id <chr>,
## # start_lat <dbl>, start_lng <dbl>, end_lat <dbl>, end_lng <dbl>,
## # member_casual <chr>, ride_length <time>, day_of_week <dbl>
summary(cyclistic_one_yr_data_3)
## ride_id rideable_type started_at start_date
## Length:5803720 Length:5803720 Length:5803720 Length:5803720
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## start_time ended_at end_date end_time
## Length:5803720 Length:5803720 Length:5803720 Length:5803720
## Class1:hms Class :character Class :character Class1:hms
## Class2:difftime Mode :character Mode :character Class2:difftime
## Mode :numeric Mode :numeric
##
##
##
## start_station_name start_station_id end_station_name end_station_id
## Length:5803720 Length:5803720 Length:5803720 Length:5803720
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## start_lat start_lng end_lat end_lng
## Min. :41.64 Min. :-87.84 Min. : 0.00 Min. :-88.14
## 1st Qu.:41.88 1st Qu.:-87.66 1st Qu.:41.88 1st Qu.:-87.66
## Median :41.90 Median :-87.64 Median :41.90 Median :-87.64
## Mean :41.90 Mean :-87.65 Mean :41.90 Mean :-87.65
## 3rd Qu.:41.93 3rd Qu.:-87.63 3rd Qu.:41.93 3rd Qu.:-87.63
## Max. :42.07 Max. :-87.52 Max. :42.37 Max. : 0.00
## NA's :5855 NA's :5855
## member_casual ride_length day_of_week
## Length:5803720 Length:5803720 Min. :1.000
## Class :character Class1:hms 1st Qu.:2.000
## Mode :character Class2:difftime Median :4.000
## Mode :numeric Mean :4.103
## 3rd Qu.:6.000
## Max. :7.000
##
cyclistic_one_yr_data_3$month <- month(mdy(cyclistic_one_yr_data_2$start_date))
cyclistic_one_yr_data_3 <- cyclistic_one_yr_data_3 %>%
mutate('month_name' = month.abb[month])
cyclistic_one_yr_data_3$start_date_dow <- mdy(cyclistic_one_yr_data_3$start_date)
cyclistic_one_yr_data_3$start_date_dow <- weekdays(cyclistic_one_yr_data_3$start_date_dow)
#Sys.setlocale("LC_TIME", "")
day_of_week_totalrides <- cyclistic_one_yr_data_3 %>%
group_by(start_date_dow) %>%
dplyr::summarize(total = n()) %>%
arrange(ordered(start_date_dow, levels = c("Sunday","Monday","Tuesday","Wednesday",
"Thursday","Friday","Saturday")))
cyclistic_one_yr_data_3$date <- as.Date(cyclistic_one_yr_data_3$start_date, "%m/%d/%y")
cyclistic_one_yr_data_3$Month <- format(as.Date(cyclistic_one_yr_data_3$date), "%m")
cyclistic_one_yr_data_3$day <- format(as.Date(cyclistic_one_yr_data_3$date), "%d")
cyclistic_one_yr_data_3$year <- format(as.Date(cyclistic_one_yr_data_3$date), "%y")
cyclistic_one_yr_data_3$Day_Of_Week <- format(as.Date(cyclistic_one_yr_data_3$date), "%A")
started_at <- cyclistic_one_yr_data_3$started_at
cyclistic_one_yr_data_3$started_at <- as.POSIXct(paste0(started_at, ':00'), format="%m/%d/%y %H:%M:%S")
ended_at <- cyclistic_one_yr_data_3$ended_at
cyclistic_one_yr_data_3$ended_at <- as.POSIXct(paste0(ended_at, ':00'), format="%m/%d/%y %H:%M:%S")
cyclistic_one_yr_data_3$ride_duration <- difftime(cyclistic_one_yr_data_3$ended_at,
cyclistic_one_yr_data_3$started_at)
bike_rider_type <- cyclistic_one_yr_data_3 %>%
dplyr::group_by(rideable_type, member_casual)
dplyr::summarize(bike_rider_type)
## `summarise()` has grouped output by 'rideable_type'. You can override using the
## `.groups` argument.
## # A tibble: 5 × 2
## # Groups: rideable_type [3]
## rideable_type member_casual
## <chr> <chr>
## 1 classic_bike casual
## 2 classic_bike member
## 3 docked_bike casual
## 4 electric_bike casual
## 5 electric_bike member
is.factor(cyclistic_one_yr_data_3$ride_duration)
## [1] FALSE
cyclistic_one_yr_data_3$ride_duration <- as.numeric(as.character(cyclistic_one_yr_data_3$ride_duration))
is.numeric(cyclistic_one_yr_data_3$ride_duration)
## [1] TRUE
cyclistic_one_yr_data_4 <- cyclistic_one_yr_data_3 %>%
filter(rideable_type != "docked_bike")
sum(is.na(cyclistic_one_yr_data_4$start_station_name))
## [1] 839082
sum(is.na(cyclistic_one_yr_data_4$end_station_name))
## [1] 893698
cyclistic_one_yr_data_5 <- cyclistic_one_yr_data_4
sum(is.na(cyclistic_one_yr_data_5$start_station_id))
## [1] 839214
sum(is.na(cyclistic_one_yr_data_5$end_station_id))
## [1] 893839
cyclistic_one_yr_data_6 <- cyclistic_one_yr_data_5 %>%
select(-c(start_station_id, end_station_id))
cyclistic_one_yr_data_6b <- cyclistic_one_yr_data_6 %>%
mutate(ride_duration_hms = hms::hms(seconds_to_period(ride_duration)))
cyclistic_one_yr_data_6b$ride_duration_hms <- seconds(hms(cyclistic_one_yr_data_6b$ride_duration_hms))
## Warning in .parse_hms(..., order = "HMS", quiet = quiet): Some strings failed to
## parse, or all strings are NAs
cyclistic_one_yr_data_6b$ride_duration_hms <- period_to_seconds(cyclistic_one_yr_data_6b$ride_duration_hms)
sum(is.na(cyclistic_one_yr_data_6b$ride_duration_hms))
## [1] 88
cyclistic_one_yr_data_6b_no_na <- cyclistic_one_yr_data_6b[!is.na(cyclistic_one_yr_data_6b$ride_duration_hms),]
cyclistic_one_yr_data_6b_no_na <- cyclistic_one_yr_data_6b_no_na[cyclistic_one_yr_data_6b_no_na$ride_duration_hms >= 0, ]
min(cyclistic_one_yr_data_6b_no_na$ride_duration_hms)
## [1] 0
max(cyclistic_one_yr_data_6b_no_na$ride_duration_hms)
## [1] 90060
median(cyclistic_one_yr_data_6b_no_na$ride_duration_hms)
## [1] 598
mean(cyclistic_one_yr_data_6b_no_na$ride_duration_hms)
## [1] 944.7706
descriptive_analysis_summary <- summary(na.omit(cyclistic_one_yr_data_6b_no_na$ride_duration_hms))
descriptive_analysis_summary
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0 338.0 598.0 944.8 1034.0 90060.0
descriptive_analysis_summary_whole <- lapply(descriptive_analysis_summary, as.integer)
ride_duration_hms_min <- aggregate(cyclistic_one_yr_data_6b_no_na$ride_duration_hms ~ cyclistic_one_yr_data_6b_no_na$member_casual, FUN = "min")
ride_duration_hms_max <- aggregate(cyclistic_one_yr_data_6b_no_na$ride_duration_hms ~ cyclistic_one_yr_data_6b_no_na$member_casual, FUN = "max")
ride_duration_hms_median <- aggregate(cyclistic_one_yr_data_6b_no_na$ride_duration_hms ~ cyclistic_one_yr_data_6b_no_na$member_casual, FUN = "median")
ride_duration_hms_mean <- aggregate(cyclistic_one_yr_data_6b_no_na$ride_duration_hms ~ cyclistic_one_yr_data_6b_no_na$member_casual, FUN = "mean")
avg_ride_time_day_user <- aggregate(cyclistic_one_yr_data_6b_no_na$ride_duration_hms ~ cyclistic_one_yr_data_6b_no_na$member_casual + cyclistic_one_yr_data_6b_no_na$Day_Of_Week, FUN = "mean")
aggregated_data <- data.frame(ride_duration_hms_min, ride_duration_hms_max, ride_duration_hms_median,
ride_duration_hms_mean, avg_ride_time_day_user)
cyclistic_one_yr_data_6b_no_na$Day_Of_Week <- ordered(cyclistic_one_yr_data_6b_no_na$Day_Of_Week,
levels = c("Sunday", "Monday", "Tuesday",
"Wednesday", "Thursday",
"Friday", "Saturday"))
avg_ride_time_day_user_ordered_Day_Of_Week <-aggregate(cyclistic_one_yr_data_6b_no_na$ride_duration ~ cyclistic_one_yr_data_6b_no_na$member_casual + cyclistic_one_yr_data_6b_no_na$Day_Of_Week, FUN = "mean")
cyclistic_one_yr_data_organized <- cyclistic_one_yr_data_6b_no_na %>%
group_by(member_casual,Day_Of_Week) %>%
dplyr::summarize(number_of_rides = n(), average_duration = mean(ride_duration_hms)) %>%
arrange(member_casual, Day_Of_Week)
## `summarise()` has grouped output by 'member_casual'. You can override using the
## `.groups` argument.
cyclistic_one_yr_data_organized_2 <- cyclistic_one_yr_data_organized %>%
mutate(average_duration_min = average_duration/60)
cyclistic_one_yr_data_6b_no_na_fin <- cyclistic_one_yr_data_6b_no_na %>%
mutate(average_duration_min = ride_duration/60)
plot_total_rides <- ggplot(cyclistic_one_yr_data_organized_2, aes(x = Day_Of_Week, y = number_of_rides, fill = member_casual)) + geom_col(position = "dodge") + labs(title = "Total Number of Rides by Type of Rider", x = "Day of the Week", y = "Total Number of Rides", fill = "Type of Rider") + theme(plot.title = element_text(hjust = 0.5))
plot_total_rides
* + Created another plot showing the average ride duration by type of
rider and arranged by day of the week.
plot_avg_ride_duration<- ggplot(cyclistic_one_yr_data_organized_2, aes(x = Day_Of_Week, y = average_duration_min, fill = member_casual)) + geom_col(position = "dodge") + labs(title = "Average Ride Duration by Type of Rider", x = "Day of the Week", y = "Average Ride Duration (min)", fill = "Type of Rider") + theme(plot.title = element_text(hjust = 0.5))
plot_avg_ride_duration
#```{r}
write.csv(cyclistic_one_yr_data_6b_no_na_fin, “/Users/bethanyleach/Desktop/12_months_cycle_data_organized_final_2.csv”, row.names=FALSE)
#{r setup, include=FALSE} knitr::opts_chunk$set(echo=TRUE) library(knitr) #
Conclusion
Recommendation for Cyclistic’s marketing team