In the following code chunk, import your data. The data come from https://s3.amazonaws.com/tripdata/index.html; I downloaded the monthly files for Feb. 2020 through Apr. 2022 and unzipped them locally.
#### Use read_csv() or another function
#### Make sure your data is converted into a tibble. <<< Data cleaning happens in the next chunk. This chunk only shows the import code: the files are too large to re-read every time the document is knit, so the imported tibbles are stored in an .RData file instead.
# Read every monthly trip file (Feb. 2020 through Apr. 2022) into its own
# tibble named dat<yy><mm> (dat2002, dat2003, ..., dat2204), which is what the
# later chunks refer to. The file stems are generated from a month sequence
# instead of being typed out 27 times, so no month can be skipped or mistyped.
month_ids <- format(
  seq(as.Date("2020-02-01"), as.Date("2022-04-01"), by = "month"),
  "%Y%m"
)
for (month_id in month_ids) {
  # e.g. "202002" -> file "202002-citibike-tripdata.csv" -> object "dat2002"
  assign(
    paste0("dat", substr(month_id, 3, 6)),
    as_tibble(read_csv(paste0(month_id, "-citibike-tripdata.csv"))),
    envir = environment()
  )
}
#### persist every raw monthly tibble (dat2002 ... dat2204) in one .RData file
save(
  list = paste0(
    "dat",
    format(
      seq(as.Date("2020-02-01"), as.Date("2022-04-01"), by = "month"),
      "%y%m"
    )
  ),
  file = "cs_bike.RData"
)
#### restore the raw tibbles from that file to process them further
load(file = "cs_bike.RData")
#### The datasets from Feb. 2020 to Jan. 2021 have 15 variables; from Feb. 2021 onward the company changed the file layout, leaving 13 variables (two fewer) under different names. Compare the variable names of the two kinds of datasets:
# column names of a file in the older, 15-variable layout
colnames(dat2002)
## [1] "tripduration" "starttime"
## [3] "stoptime" "start station id"
## [5] "start station name" "start station latitude"
## [7] "start station longitude" "end station id"
## [9] "end station name" "end station latitude"
## [11] "end station longitude" "bikeid"
## [13] "usertype" "birth year"
## [15] "gender"
# column names of a file in the newer, 13-variable layout
colnames(dat2102)
## [1] "ride_id" "rideable_type" "started_at"
## [4] "ended_at" "start_station_name" "start_station_id"
## [7] "end_station_name" "end_station_id" "start_lat"
## [10] "start_lng" "end_lat" "end_lng"
## [13] "member_casual"
## I am going to prepare both types of datasets (the first 12 months and the following 15 months) with the same variables of interest, so that the tibbles are identical in structure for further comparison over time.
## The variables used in this project are "start_sta", "end_sta", "start_time", "end_time", "duration", "start_lat", "start_lng", "end_lat", "end_lng", and "type", a total of 10 in this order. The last variable, "type", is a factor with two levels: Casual Rider (originally Customer or casual) and Annual Member (originally Subscriber or member).
#### the first 12 months script (Feb. 2020 - Jan. 2021, older 15-variable layout)

# Reduce one older-layout monthly tibble to the 10 shared analysis columns.
#   trips: tibble with the older column names (`start station name`,
#          starttime, stoptime, tripduration, usertype, ...).
# Returns a tibble with start_sta, end_sta, start_time, end_time, duration,
# start_lat, start_lng, end_lat, end_lng and type, in that order. `type` is
# usertype recoded as "Annual Member" (Subscriber) / "Casual Rider" (Customer);
# `duration` is tripduration carried over unchanged.
clean_legacy_trips <- function(trips) {
  trips %>%
    mutate(
      type = fct_recode(
        usertype,
        `Annual Member` = "Subscriber",
        `Casual Rider` = "Customer"
      )
    ) %>%
    select(
      start_sta = `start station name`,
      end_sta = `end station name`,
      start_time = starttime,
      end_time = stoptime,
      duration = tripduration,
      start_lat = `start station latitude`,
      start_lng = `start station longitude`,
      end_lat = `end station latitude`,
      end_lng = `end station longitude`,
      type
    )
}

# One cleaned tibble per month; the names are kept because later chunks use them.
dat2002c <- clean_legacy_trips(dat2002)
dat2003c <- clean_legacy_trips(dat2003)
dat2004c <- clean_legacy_trips(dat2004)
dat2005c <- clean_legacy_trips(dat2005)
dat2006c <- clean_legacy_trips(dat2006)
dat2007c <- clean_legacy_trips(dat2007)
dat2008c <- clean_legacy_trips(dat2008)
dat2009c <- clean_legacy_trips(dat2009)
dat2010c <- clean_legacy_trips(dat2010)
dat2011c <- clean_legacy_trips(dat2011)
dat2012c <- clean_legacy_trips(dat2012)
dat2101c <- clean_legacy_trips(dat2101)
#### the following 15 months script (Feb. 2021 - Apr. 2022, newer 13-variable layout)

# Reduce one newer-layout monthly tibble to the 10 shared analysis columns.
#   trips: tibble with the newer column names (start_station_name,
#          started_at, ended_at, member_casual, ...).
# Returns a tibble with start_sta, end_sta, start_time, end_time, duration,
# start_lat, start_lng, end_lat, end_lng and type, in that order. `type` is
# member_casual recoded as "Annual Member" (member) / "Casual Rider" (casual).
clean_current_trips <- function(trips) {
  trips %>%
    mutate(
      # `ended_at - started_at` yields a difftime whose unit (secs, mins,
      # hours or days) is picked automatically from the data, so the plain
      # number could mean different units in different months. Fix the unit.
      # NOTE(review): seconds chosen on the assumption that tripduration in
      # the older files is in seconds - confirm.
      duration = as.numeric(difftime(ended_at, started_at, units = "secs")),
      type = fct_recode(
        member_casual,
        `Annual Member` = "member",
        `Casual Rider` = "casual"
      )
    ) %>%
    select(
      start_sta = start_station_name,
      end_sta = end_station_name,
      start_time = started_at,
      end_time = ended_at,
      duration,
      start_lat,
      start_lng,
      end_lat,
      end_lng,
      type
    )
}

# One cleaned tibble per month; the names are kept because later chunks use them.
dat2102c <- clean_current_trips(dat2102)
dat2103c <- clean_current_trips(dat2103)
dat2104c <- clean_current_trips(dat2104)
dat2105c <- clean_current_trips(dat2105)
dat2106c <- clean_current_trips(dat2106)
dat2107c <- clean_current_trips(dat2107)
dat2108c <- clean_current_trips(dat2108)
dat2109c <- clean_current_trips(dat2109)
dat2110c <- clean_current_trips(dat2110)
dat2111c <- clean_current_trips(dat2111)
dat2112c <- clean_current_trips(dat2112)
dat2201c <- clean_current_trips(dat2201)
dat2202c <- clean_current_trips(dat2202)
dat2203c <- clean_current_trips(dat2203)
dat2204c <- clean_current_trips(dat2204)
#####################
#### persist every cleaned monthly tibble (dat2002c ... dat2204c) in one .RData file
save(
  list = paste0(
    "dat",
    format(
      seq(as.Date("2020-02-01"), as.Date("2022-04-01"), by = "month"),
      "%y%m"
    ),
    "c"
  ),
  file = "cs_bike_clean.RData"
)
Using words, describe the visualization you are going to make using which variables/characteristics in your data:
For my first figure, I am going to plot how the monthly bike-share ride totals in NYC have changed since the Covid-19 pandemic started.
Given this purpose and the size of the dataset, I did not combine and filter all the monthly tibbles; instead I built a separate small data frame of monthly totals and plotted that. March 2020 was when the “New York State on Pause” Executive Order (a.k.a. the stay-at-home order) was issued.
# if starting a new session, uncomment below
# load("cs_bike_clean.RData")

# x-axis labels: one "YYYY/MM" string per month, Feb. 2020 through Apr. 2022
months <- format(
  seq(as.Date("2020-02-01"), as.Date("2022-04-01"), by = "month"),
  "%Y/%m"
)
# rides per month = number of rows of each cleaned monthly tibble
total_rides <- vapply(
  list(
    dat2002c, dat2003c, dat2004c, dat2005c, dat2006c, dat2007c, dat2008c,
    dat2009c, dat2010c, dat2011c, dat2012c, dat2101c, dat2102c, dat2103c,
    dat2104c, dat2105c, dat2106c, dat2107c, dat2108c, dat2109c, dat2110c,
    dat2111c, dat2112c, dat2201c, dat2202c, dat2203c, dat2204c
  ),
  nrow,
  integer(1)
)
month_total <- tibble(months = months, total_rides = total_rides)
# Line chart of the monthly ride totals in month_total, with a vertical marker
# between the 2nd and 3rd month on the axis and a two-line red label.
ggplot(month_total, aes(x = months, y = total_rides)) +
  # months is a character (discrete) axis, so all points go in one group
  geom_line(group = 1, color = "deeppink3") +
  labs(
    title = "Total Usage of Bikeshare in NYC during Covid Era",
    x = "",
    y = "",
    caption = "source: https://s3.amazonaws.com/tripdata/index.html"
  ) +
  theme(
    text = element_text(color = "white"),
    axis.text.x = element_text(angle = 45, hjust = 1, color = "white"),
    axis.text.y = element_text(angle = 45, hjust = 1, color = "white"),
    plot.background = element_rect(fill = "#003870"),
    panel.background = element_rect(fill = "aliceblue"),
    strip.background.x = element_rect(fill = "white"),
    plot.title = element_text(size = 18, face = "bold", color = "white")
  ) +
  # A constant position is a parameter, not an aesthetic: inside aes() it is
  # evaluated against month_total and the same line is drawn once per row.
  geom_vline(xintercept = 2.5, color = "red") +
  annotate(geom = "text", label = "NYS Stay-at-Home", x = 5.3, y = 2750000, color = "red") +
  annotate(geom = "text", label = "Executive Order", x = 5, y = 2600000, color = "red")
Using words, describe the second visualization you are going to make using which variables/characteristics in your data:
For the second figure, I will display a bar chart of the total rides that occurred in Apr. 2022 (the most recent month), categorized by membership status (Casual Rider vs. Annual Member). Later in this project, I am planning to build on this chart to create an animation showing how the membership proportions change over time.
# Bar chart of the number of rides in dat2204c for each level of `type`.
ggplot(dat2204c, aes(type, fill = type)) +
  # the bars are already labelled on the x axis, so the fill legend is hidden;
  # FALSE is spelled out because F is an ordinary, reassignable variable
  geom_bar(show.legend = FALSE) +
  labs(x = "", y = "", title = "Total Usages by Membership Status for Apr. 2022") +
  theme(
    text = element_text(color = "white"),
    axis.text.x = element_text(color = "white"),
    axis.text.y = element_text(angle = 45, color = "white"),
    plot.background = element_rect(fill = "#003870"),
    panel.background = element_rect(fill = "aliceblue"),
    strip.background.x = element_rect(fill = "white"),
    plot.title = element_text(size = 18, face = "bold", color = "white")
  )
For my third figure, I am going to see where the most popular stations are for pick-ups and returns in the last 3 months.
# create another tibble for the recent 3 months
most_recent_q <- rbind(dat2202c, dat2203c, dat2204c)

# Station with the most ride starts in those 3 months (one row: start_sta, n).
# count(sort = TRUE) tallies and orders in one step and returns an ungrouped
# tibble, instead of leaving the result grouped by station.
most_popular_start <- most_recent_q %>%
  count(start_sta, sort = TRUE) %>%
  head(1)

# Station with the most ride ends in those 3 months (one row: end_sta, n).
most_popular_end <- most_recent_q %>%
  count(end_sta, sort = TRUE) %>%
  head(1)
So apparently the most popular bikeshare station is the one at W 21 St & 6 Ave, for both pick-up and return. I am placing a marker on the map with the code below.
# Install leaflet only when it is missing, then attach it unconditionally.
# With `if (!require(...)) install.packages(...)` a fresh install is never
# attached, so the leaflet() call further down would fail on that first run.
if (!requireNamespace("leaflet", quietly = TRUE)) {
  install.packages("leaflet")
}
library(leaflet)
## Loading required package: leaflet
# All rides in the last 3 months that started at "W 21 St & 6 Ave".
# NOTE(review): `lng` is not used anywhere in the visible code; it is kept
# unchanged in case a later chunk relies on it - confirm and remove if not.
lng <- most_recent_q %>%
  group_by(start_sta) %>%
  filter(start_sta == "W 21 St & 6 Ave")

# Most frequent (station, longitude, latitude) combination among ride starts:
# one row with start_sta, start_lng, start_lat and n. count(sort = TRUE)
# replaces the select/group_by/count/arrange chain and returns an ungrouped
# tibble.
lng_lat <- most_recent_q %>%
  count(start_sta, start_lng, start_lat, sort = TRUE) %>%
  head(1)

# Map centred on that station, with a circle marker and a popup on it.
most_pop <- leaflet(lng_lat) %>%
  setView(lat = lng_lat$start_lat, lng = lng_lat$start_lng, zoom = 13) %>%
  addTiles() %>%
  addCircleMarkers(
    lng = lng_lat$start_lng,
    lat = lng_lat$start_lat,
    fillColor = "#003870",
    popup = "The most popular bikeshare station for both pick-up and return"
  )
most_pop