Google Data Analytics Capstone Project


Case Study 1

As part of Google’s Data Analytics Professional Certificate, an optional capstone project can be completed to round off the course. Google provided two recommended case studies, each based on open-source data sets.

I selected the first case study option: a bike-sharing service operating in the city of Chicago. The data is made available by Lyft Bikes and Scooters, LLC (“Bikeshare”), which operates the City of Chicago’s (“City”) Divvy bicycle sharing service. For the purposes of this case study, the fictional company name ‘Cyclistic’ was used.

While this R Markdown file contains the key code chunks for the case study analysis, the original R script provides the clearest and most complete view of the entire code base.

Setting up my environment

Notes: setting up my R environment by loading the required packages:

library("ggplot2")
library("plyr")
library("dplyr")
library("readr")
library("tidyr")
library("data.table")
library("stringr") 
library("lubridate")
library("viridis")
library("scales")

Loading the raw (unclean) data set

Notes: consolidating 12 .csv files (close to 1 GB) into a single data frame. This chunk is shown here for display purposes only and is not evaluated.

data_unclean <- list.files(path = "C://Users/Public/r_studio/google_capstone/source_data", 
                           pattern = "\\.csv$", full.names = TRUE) %>% # list the 12 monthly .csv files
  lapply(read_csv) %>%   # read each file
  bind_rows()            # combine into a single data frame

Data cleaning

Notes: checking data types, formats, and looking for errors such as duplication

glimpse(data_unclean) # check colnames, typeof

dim(data_unclean[duplicated(data_unclean$ride_id), ])[1] # check for any duplicate ride_id values (they should be unique)

unique(data_unclean$rideable_type) # confirming unique values ("electric_bike" "classic_bike"  "docked_bike")
unique(data_unclean$member_casual) # confirming unique values ("member" "casual")

data_unclean <- data_unclean %>% 
  mutate(end_station_id = as.character(end_station_id), 
         start_station_id = as.character(start_station_id)) # ensure all station_id's are stored as characters

Notes: removing rows with missing coordinates, duplicate ride_id values, and trips with a negative ride length

data_clean <- data_unclean %>% 
  drop_na(start_lat, end_lat) %>%           # remove rows with missing coordinates
  distinct(ride_id, .keep_all = TRUE) %>%   # remove duplicate ride_id rows
  filter(ended_at >= started_at)            # remove trips with a ride length of less than 0

Notes: creating the columns needed to produce the various visualizations

data_clean$date <- as.Date(data_clean$started_at) # create column for date
data_clean$year <- format(as.Date(data_clean$date), "%Y") # create column for year
data_clean$month <- format(as.Date(data_clean$date), "%m") # create column for month
data_clean$week <- format(as.Date(data_clean$date), "%W") # create column for week
data_clean$day_of_week <- format(as.Date(data_clean$date), "%A") # create column for day of week
data_clean$yyyy_mm_dd <- format(as.Date(data_clean$date), "%Y-%m-%d") # create column for date

data_clean$day_of_week <- as.factor(data_clean$day_of_week) # convert to factor
data_clean$day_of_week <- ordered(data_clean$day_of_week, 
                                  levels = c("Sunday", "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday")) # set order for days of week

data_clean$ride_length_seconds <- difftime(data_clean$ended_at, data_clean$started_at, units = "secs") # create column for ride length calculated in seconds

data_clean$started_at_time <- format(as.POSIXct(
  data_clean$started_at), format = "%H:%M:%S") # create column with ride start times

data_clean$ended_at_time <- format(as.POSIXct(
  data_clean$ended_at), format = "%H:%M:%S") # create column with ride end times

data_clean$ride_length_seconds <- as.numeric(data_clean$ride_length_seconds) # convert the difftime to a plain numeric value (drops the 'secs' unit)

data_clean$ride_length_minutes <- (data_clean$ride_length_seconds / 60) # create column with ride length in minutes

Loading the cleaned data sets

Notes: these data sets have already been cleaned and manipulated, and are ready to be visualized with ggplot2

data_heat_mem_only <- read.csv("C://Users/Public/r_studio/google_capstone/02_05_2022_data_heat_mem_only.csv")
data_heat_cas_only <- read.csv("C://Users/Public/r_studio/google_capstone/02_05_2022_data_heat_cas_only.csv")
data_bar_all <- read.csv("C://Users/Public/r_studio/google_capstone/02_05_2022_data_bar_all.csv")
data_scatter_all <- read.csv("C://Users/Public/r_studio/google_capstone/02_05_2022_data_scatter_all.csv")

Data visualizations - Heat maps

Comparing total trips per day for member and casual riders across the year
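
Notes: the heat map code is kept in the original R file; the chunk below is a minimal sketch of the member-rider version, assuming data_heat_mem_only holds one row per day with week, day_of_week and total_rides columns (these column names are assumptions, not confirmed by the source). The casual-rider version would be identical, using data_heat_cas_only.

# Sketch only - column names week, day_of_week and total_rides are assumed
ggplot(data_heat_mem_only, aes(x = week, y = day_of_week, fill = total_rides)) +
  geom_tile(colour = "white") +                             # one tile per day of the year
  scale_fill_viridis(name = "Total rides", labels = comma) +
  labs(title = "Member trips per day across the year",
       x = "Week of year", y = "Day of week")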

Data visualization - Bar chart

Comparing the average number of daily rides starting at each hour of the day
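
Notes: the bar chart code is kept in the original R file; the chunk below is a minimal sketch, assuming data_bar_all holds one row per start hour and rider type with start_hour, member_casual and avg_rides columns (these column names are assumptions, not confirmed by the source).

# Sketch only - column names start_hour, member_casual and avg_rides are assumed
ggplot(data_bar_all, aes(x = start_hour, y = avg_rides, fill = member_casual)) +
  geom_col(position = "dodge") +                            # side-by-side bars for member vs casual
  scale_fill_viridis(discrete = TRUE, name = "Rider type") +
  labs(title = "Average number of daily rides by start hour",
       x = "Hour of day", y = "Average rides")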

Data visualization - Scatter plot

Analyzing the daily ride length over a weekly period for each month
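
Notes: the scatter plot code is kept in the original R file; the chunk below is a minimal sketch, assuming data_scatter_all holds one row per day of week, month and rider type with day_of_week, month, member_casual and avg_ride_length_minutes columns (these column names are assumptions, not confirmed by the source).

# Sketch only - column names day_of_week, month, member_casual and avg_ride_length_minutes are assumed
ggplot(data_scatter_all, aes(x = day_of_week, y = avg_ride_length_minutes, colour = member_casual)) +
  geom_point() +                                            # one point per day of week and rider type
  facet_wrap(~ month) +                                     # one panel per month
  scale_colour_viridis(discrete = TRUE, name = "Rider type") +
  labs(title = "Average daily ride length per day of week, by month",
       x = "Day of week", y = "Average ride length (minutes)")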

Saving data

Saving each of the data sets so they can be accessed for further analysis and visualization in Excel and Tableau

## 5.1) Entire clean dataset

fwrite(data_clean,"C://Users/Public/r_studio/google_capstone/02_05_2022_data_clean.csv", col.names = TRUE, row.names = FALSE)

## 5.2) Map data (for Excel analysis & Tableau viz)

fwrite(data_map_all,"C://Users/Public/r_studio/google_capstone/02_05_2022_data_map_all.csv", col.names = TRUE, row.names = FALSE)

## 5.3) Heat map data (for Excel analysis)

fwrite(data_heat_all,"C://Users/Public/r_studio/google_capstone/02_05_2022_data_heat_all.csv", col.names = TRUE, row.names = FALSE)

fwrite(data_heat_mem_only,"C://Users/Public/r_studio/google_capstone/data_heat_mem_only.csv", col.names = TRUE, row.names = FALSE)

fwrite(data_heat_cas_only,"C://Users/Public/r_studio/google_capstone/data_heat_cas_only.csv", col.names = TRUE, row.names = FALSE)

## 5.4) Bar chart data (for Excel analysis)

fwrite(data_bar_all,"C://Users/Public/r_studio/google_capstone/02_05_2022_data_bar_all.csv", col.names = TRUE, row.names = FALSE)

## 5.5) Scatter plot data (for Excel analysis)

fwrite(data_scatter_all,"C://Users/Public/r_studio/google_capstone/02_05_2022_data_scatter_all.csv", col.names = TRUE, row.names = FALSE)