As part of Google’s Data Analytics Professional Certification, an optional capstone project can be completed to round off the course. Google provided two recommended case studies, each based off of open-sourced data sets.
I selected first case study option - a bike-sharing service provided in the city of Chicago. Data is licensed by “Bikeshare”, an LCC operated by Lyft Bikes and Scooters, under City of Chicago’s (“City”) Divvy bicycle sharing service. For the purposes of this case study, a fictional company title of ‘Cyclistic’ was used.
While this R Markdown file comprises of the key code chunks for the case study analysis, the original R file will provide the most clear and extensive view of the entire code base.
Notes: setting up my R environment by loading multiple packages:
library("ggplot2")
library("plyr")
library("dplyr")
library("readr")
library("tidyr")
library("data.table")
library("stringr")
library("lubridate")
library("viridis")
library("scales")
Notes: Consolidated 12 .csv files (close to 1gb), into a single file. This file is shown in here just for display purposes and is not loaded.
data_unclean <- list.files(path = "C://Users/Public/r_studio/google_capstone/source_data/02_05_2022_data_bar_all.csv",
pattern = "*.csv", full.names = TRUE) %>%
lapply(read_csv) %>%
bind_rows
Notes: checking data types, formats, and looking for errors such as duplication
glimpse(data_unclean) # check colnames, typeof
dim(data_unclean[duplicated(data_all$ride_id),])[1] # check if any duplicate ride_id (as they should be unique)
unique(data_unclean$rideable_type) # confirming unique values ("electric_bike" "classic_bike" "docked_bike")
unique(data_unclean$member_casual) # confirming unique values ("member" "casual")
mutate(data_unclean,
end_station_id = as.character(end_station_id),
start_station_id = as.character(start_station_id)) # ensure all station_id's are characters
Notes: checking data types, formats, and looking for errors such as duplication
data_clean <- data_unclean %>%
drop_na(start_lat) %>%
data_all[!duplicated(data_all$ride_id), ] %>%
filter(!(ride_length < 0)) # remove NA from end_lat
data_clean <- data_clean %>% # remove trips with a ride length of less than 0
filter(!(ride_length < 0))
Notes: create columns with data needed to produce the various visualizations
data_clean$date <-as.Date(data_clean$started_at)
data_clean$year <- format(as.Date(data_clean$date), "%Y") # create column for year
data_clean$month <- format(as.Date(data_clean$date), "%m") # create column for month
data_clean$week <- format(as.Date(data_clean$date), "%W") # create column for week
data_clean$day_of_week <- format(as.Date(data_clean$date), "%A") # create column for day of week
data_clean$yyyy_mm_dd <- format(as.Date(data_clean$date), "%Y-%m-%d") # create column for date
data_clean$day_of_week <- as.factor(data_clean$day_of_week) # convert to factor
data_clean$day_of_week <- ordered(data_clean$day_of_week,
levels = c("Sunday", "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday")) # set order for days of week
data_clean$ride_length_seconds <- difftime(data_clean$ended_at, data_clean$started_at) # create column for ride length calculated in seconds
data_clean$started_at_time <- format(as.POSIXct(
data_clean$started_at), format = "%H:%M:%S") # create column with ride start times
data_clean$ended_at_time <- format(as.POSIXct(
data_clean$ended_at), format = "%H:%M:%S") # create column with ride end times
data_clean$ride_length_seconds <- as.numeric(str_extract(data_clean$ride_length_seconds, "[0-9]+")) # remove 'secs' from ride_length_seconds
data_clean$ride_length_minutes <- (data_clean$ride_length / 60) # create column with ride length in minutes
Notes: These data sets have been cleaned and manipulated, and are ready to be visualized by ggplot
data_heat_mem_only <- read.csv("C://Users/Public/r_studio/google_capstone/02_05_2022_data_heat_mem_only.csv")
data_heat_cas_only <- read.csv("C://Users/Public/r_studio/google_capstone/02_05_2022_data_heat_cas_only.csv")
data_bar_all <- read.csv("C://Users/Public/r_studio/google_capstone/02_05_2022_data_bar_all.csv")
data_scatter_all <- read.csv("C://Users/Public/r_studio/google_capstone/02_05_2022_data_scatter_all.csv")
Comparing the difference in total member and casual riders trips per day over the year
Comparing the average number of daily rides starting at each of hour the day
Analyzing the daily ride length over a weekly period for each month
Saving each of the data sets so they can be accessed for further analysis and visualization in Excel and Tableau
## 5.1) Entire clean dataset
fwrite(data_clean,"C://Users/Public/r_studio/google_capstone/02_05_2022_data_clean.csv", col.names = TRUE, row.names = FALSE)
## 5.2) Map data (for Excel analysis & Tableau viz)
fwrite(data_map_all,"C://Users/Public/r_studio/google_capstone/02_05_2022_data_map_all.csv", col.names = TRUE, row.names = FALSE)
## 5.3) Heat map data (for Excel analysis)
fwrite(data_heat_all,"C://Users/Public/r_studio/google_capstone/02_05_2022_data_heat_all.csv", col.names = TRUE, row.names = FALSE)
fwrite(data_heat_mem_only,"C://Users/Public/r_studio/google_capstone/data_heat_mem_only.csv", col.names = TRUE, row.names = FALSE)
fwrite(data_heat_cas_only,"C://Users/Public/r_studio/google_capstone/data_heat_cas_only.csv", col.names = TRUE, row.names = FALSE)
## 5.4) Bar chart data (for Excel analysis)
fwrite(data_bar_all,"C://Users/Public/r_studio/google_capstone/02_05_2022_data_bar_all.csv", col.names = TRUE, row.names = FALSE)
## 5.5) Scatter plot data (for Excel analysis)
fwrite(data_scatter_all,"C://Users/Public/r_studio/google_capstone/02_05_2022_data_scatter_all.csv", col.names = TRUE, row.names = FALSE)