Introduction

I used the MBTA dataset for this cleaning and tidying project.

Code

I started by loading in the libraries and dataset.

# load required libraries
library(tidyverse)
## -- Attaching packages -------------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.2.1     v purrr   0.3.3
## v tibble  2.1.3     v dplyr   0.8.3
## v tidyr   1.0.0     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.4.0
## -- Conflicts ----------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
## 
##     date
library(ggplot2)

# read the raw MBTA csv; header = FALSE means the columns get the default
# names V1, V2, ... that the cleaning steps below refer to
mbta <- read.csv(file = "mbta.csv", header = FALSE)

My first step in cleaning the data was to extract the dates used. These were initially stored as variables (columns), but I wanted them to be instances (rows).

# pull the date labels out of the "mode" row and parse them as dates
# - the labels sit in columns V3 through V60 of that row
# - "-01" is appended so each label can be parsed by ymd()
dates <- mbta %>%
  filter(V2 == "mode") %>%
  select(V3:V60) %>%
  pivot_longer(everything(), names_to = "V", values_to = "dates") %>%
  pull(dates) %>%
  str_c("-01") %>%
  ymd()

After I completed this step, I cleaned the data and made a couple of plots.

# reshape so that every (mode, date) pair is its own row
# - cbind(dates) relies on the dates vector being recycled down the
#   lengthened table, one full cycle per original row
# - the label row and the two summary rows are dropped
# - rides is converted to numeric via character, and V2 is renamed to mode
mbta <- mbta %>%
  pivot_longer(V3:V60, names_to = "day", values_to = "rides") %>%
  cbind(dates) %>%
  select(-day, -V1) %>%
  filter(
    V2 != "mode",
    V2 != "All Modes by Qtr",
    V2 != "Pct Chg / Yr"
  ) %>%
  mutate(rides = as.numeric(as.character(rides))) %>%
  rename(mode = V2)

# first look at the data: rides over time, one color per mode of transportation
mbta %>%
  ggplot(mapping = aes(x = dates, y = rides, color = mode)) +
  geom_point() +
  ggtitle("MBTA", subtitle = "Boston Transit") +
  xlab("Date") +
  ylab("Rides") +
  labs(color = "Mode")

# drop the TOTAL series so the individual modes are easier to compare
without_total <- filter(mbta, mode != "TOTAL")

without_total %>%
  ggplot(mapping = aes(x = dates, y = rides, color = mode)) +
  geom_point() +
  ggtitle("MBTA", subtitle = "Boston Transit") +
  xlab("Date") +
  ylab("Rides") +
  labs(color = "Mode")

# horizontal bar chart of rides per mode, with modes ordered by reorder()
# geom_col() stacks the raw ride values, so each bar is that mode's sum
without_total %>%
  ggplot(mapping = aes(x = reorder(mode, rides), y = rides)) +
  geom_col() +
  ggtitle("MBTA", subtitle = "Boston Transit") +
  xlab("Mode") +
  ylab("Total Rides 2007-2010") +
  coord_flip()

Finally, I noticed a periodic increase in MBTA usage, so I created a dataset holding the values by month for each year and plotted this to see when the increase was occurring.

# build a year-by-year view of total ridership for the seasonal plot
# - keeps only the "TOTAL" rows dated from 2007-01-01 up to (but not
#   including) 2012-01-01, the same window the original loop covered
# - year is stored as a character string so it plots as a discrete color
# - months is the calendar month (1-12) read from the date itself, so it does
#   not depend on row order or on every year starting in January
# done in one vectorized pass instead of growing a data frame with rbind()
# inside a loop
zoom <- mbta %>%
  filter(mode == "TOTAL") %>%
  filter(dates >= ymd("2007-01-01"), dates < ymd("2012-01-01")) %>%
  mutate(year = as.character(year(dates)),
         months = month(dates)) %>%
  select(-mode, -dates)

# total rides by month, colored by year, with a black smoothed trend line
zoom %>%
  ggplot(mapping = aes(x = months, y = rides, color = year)) +
  geom_point() +
  geom_smooth(color = "black", se = FALSE) +
  scale_color_brewer(palette = "Set1") +
  ggtitle("MBTA", subtitle = "Boston Transit") +
  xlab("Month") +
  ylab("Rides") +
  labs(color = "Year")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

Conclusion

I can’t figure out why there is an increase in September, but there is a strong decrease in usage during December and January, which I imagine is due to the cold Boston temperatures. My hypothesis is that fewer people are using transit outside of going to and from work during these months.