library(tidyr)
## Warning: package 'tidyr' was built under R version 3.6.3
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.6.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(plyr)
## Warning: package 'plyr' was built under R version 3.6.3
## ------------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## ------------------------------------------------------------------------------
##
## Attaching package: 'plyr'
## The following objects are masked from 'package:dplyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
library(funModeling)
## Warning: package 'funModeling' was built under R version 3.6.3
## Loading required package: Hmisc
## Warning: package 'Hmisc' was built under R version 3.6.3
## Loading required package: lattice
## Warning: package 'lattice' was built under R version 3.6.3
## Loading required package: survival
## Warning: package 'survival' was built under R version 3.6.3
## Loading required package: Formula
## Warning: package 'Formula' was built under R version 3.6.3
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.6.3
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:plyr':
##
## is.discrete, summarize
## The following objects are masked from 'package:dplyr':
##
## src, summarize
## The following objects are masked from 'package:base':
##
## format.pval, units
## funModeling v.1.9.4 :)
## Examples and tutorials at livebook.datascienceheroes.com
## / Now in Spanish: librovivodecienciadedatos.ai
NYC Subway is widely used transportation method in the New York City. Many have mixed opinions about NYC Subway performances. This project will analyze its performance and give us an idea why commuters have mixed opinion about Subway performance. These data are available to download at http://web.mta.info/developers/developer-data-terms.html#data as a zip file. I am only using the Performance_NYCT.csv data set that comes with that zip file that is uploaded into my Github repository for this project.I will be focusing on the On Time Performances (OTP) subway data for this project analysis.
# Load the full dataset
data<- read.csv("https://raw.githubusercontent.com/gpadmaperuma/DATA608FinalProject/main/Performance_NYCT.csv", header=TRUE, check.names = FALSE)
# subsetting the data to only pull out the OTP for individual subway lines
otp <-data[ which(data$PARENT_SEQ=='391690'), ]
head(otp)
## INDICATOR_SEQ PARENT_SEQ AGENCY_NAME INDICATOR_NAME
## 1075 391691 391690 NYC Transit OTP (Terminal) - 1 Line
## 1076 391691 391690 NYC Transit OTP (Terminal) - 1 Line
## 1077 391691 391690 NYC Transit OTP (Terminal) - 1 Line
## 1078 391691 391690 NYC Transit OTP (Terminal) - 1 Line
## 1079 391691 391690 NYC Transit OTP (Terminal) - 1 Line
## 1080 391691 391690 NYC Transit OTP (Terminal) - 1 Line
## DESCRIPTION
## 1075 Subways weekday Terminal OTP evaluates performance based on schedule/service plan in effect, includes all delays.
## 1076 Subways weekday Terminal OTP evaluates performance based on schedule/service plan in effect, includes all delays.
## 1077 Subways weekday Terminal OTP evaluates performance based on schedule/service plan in effect, includes all delays.
## 1078 Subways weekday Terminal OTP evaluates performance based on schedule/service plan in effect, includes all delays.
## 1079 Subways weekday Terminal OTP evaluates performance based on schedule/service plan in effect, includes all delays.
## 1080 Subways weekday Terminal OTP evaluates performance based on schedule/service plan in effect, includes all delays.
## CATEGORY FREQUENCY DESIRED_CHANGE INDICATOR_UNIT DECIMAL_PLACES
## 1075 Service Indicators M U % 1
## 1076 Service Indicators M U % 1
## 1077 Service Indicators M U % 1
## 1078 Service Indicators M U % 1
## 1079 Service Indicators M U % 1
## 1080 Service Indicators M U % 1
## PERIOD_YEAR PERIOD_MONTH YTD_TARGET YTD_ACTUAL MONTHLY_TARGET
## 1075 2009 6 0 86.8 0
## 1076 2009 7 0 90.0 0
## 1077 2009 8 0 88.3 0
## 1078 2009 9 0 89.6 0
## 1079 2009 10 0 90.2 0
## 1080 2009 11 0 91.1 0
## MONTHLY_ACTUAL
## 1075 86.8
## 1076 93.2
## 1077 84.7
## 1078 93.9
## 1079 92.6
## 1080 95.6
# remove columns that are not needed for this analysis
otp<- otp[c(-1:-3, -5:-10)]
# change int months to string months
otp$PERIOD_MONTH<-with(otp, month.abb[PERIOD_MONTH])
head(otp)
## INDICATOR_NAME PERIOD_YEAR PERIOD_MONTH YTD_TARGET YTD_ACTUAL
## 1075 OTP (Terminal) - 1 Line 2009 Jun 0 86.8
## 1076 OTP (Terminal) - 1 Line 2009 Jul 0 90.0
## 1077 OTP (Terminal) - 1 Line 2009 Aug 0 88.3
## 1078 OTP (Terminal) - 1 Line 2009 Sep 0 89.6
## 1079 OTP (Terminal) - 1 Line 2009 Oct 0 90.2
## 1080 OTP (Terminal) - 1 Line 2009 Nov 0 91.1
## MONTHLY_TARGET MONTHLY_ACTUAL
## 1075 0 86.8
## 1076 0 93.2
## 1077 0 84.7
## 1078 0 93.9
## 1079 0 92.6
## 1080 0 95.6
For this analysis, MTA subway data was used from 2011-2017 during the weekdays. The goal of the final project on MTA Individual subway line data is to analyze the on-time performance of individual subway lines. My objective of this analysis is to find out in what years/months did subway perform worst/best.
# check for data completeness to see how many subways were analyzed
unique(otp$INDICATOR_NAME)
## [1] OTP (Terminal) - 1 Line OTP (Terminal) - 2 Line
## [3] OTP (Terminal) - 3 Line OTP (Terminal) - 4 Line
## [5] OTP (Terminal) - 5 Line OTP (Terminal) - 6 Line
## [7] OTP (Terminal) - 7 Line OTP (Terminal) - S Line 42 St.
## [9] OTP (Terminal) - A Line OTP (Terminal) - B Line
## [11] OTP (Terminal) - C Line OTP (Terminal) - D Line
## [13] OTP (Terminal) - E Line OTP (Terminal) - F Line
## [15] OTP (Terminal) - S Fkln Line OTP (Terminal) - G Line
## [17] OTP (Terminal) - J Z Line OTP (Terminal) - L Line
## [19] OTP (Terminal) - M Line OTP (Terminal) - N Line
## [21] OTP (Terminal) - Q Line OTP (Terminal) - R Line
## [23] OTP (Terminal) - S Line Rock OTP (Terminal) - W Line
## 86 Levels: Bus Passenger Wheelchair Lift Usage - NYCT Bus ...
# 24 subways were analyzed over 12 months so each year should have 288 values
table(otp$PERIOD_YEAR)
##
## 2009 2010 2011 2012 2013 2014 2015 2016 2017
## 168 236 276 267 276 276 276 278 288
# See if OTP was measured for all subway lines
table(otp$INDICATOR_NAME)
##
## Bus Passenger Wheelchair Lift Usage - NYCT Bus
## 0
## % of Completed Trips - NYCT Bus
## 0
## 100th Street Depot - % of Completed Trips
## 0
## 126th Street Depot - % of Completed Trips
## 0
## Casey Stengel Depot - % of Completed Trips
## 0
## Castleton Depot - % of Completed Trips
## 0
## Charleston Depot - % of Completed Trips
## 0
## Collisions with Injury Rate - NYCT Bus
## 0
## Customer Accident Injury Rate - NYCT Bus
## 0
## Customer Injury Rate - Subways
## 0
## East New York Depot - % of Completed Trips
## 0
## Elevator Availability - Subways
## 0
## Employee Lost Time and Restricted Duty Rate
## 0
## Escalator Availability - Subways
## 0
## Flatbush Depot - % of Completed Trips
## 0
## Fresh Pond Depot - % of Completed Trips
## 0
## Grand Avenue Depot - % of Completed Trips
## 0
## Gun Hill Depot - % of Completed Trips
## 0
## Jackie Gleason Depot - % of Completed Trips
## 0
## Jamaica Depot - % of Completed Trips
## 0
## Kingsbridge Depot - % of Completed Trips
## 0
## Manhattanville Depot - % of Completed Trips
## 0
## Mean Distance Between Failures - NYCT Bus
## 0
## Mean Distance Between Failures - Staten Island Railway
## 0
## Mean Distance Between Failures - Subways
## 0
## Meredith Avenue Depot - % of Completed Trips
## 0
## Michael J. Quill Depot - % of Completed Trips
## 0
## On-Time Performance - Staten Island Railway
## 0
## On-Time Performance (Terminal)
## 0
## OTP (Terminal) - 1 Line
## 101
## OTP (Terminal) - 2 Line
## 101
## OTP (Terminal) - 3 Line
## 101
## OTP (Terminal) - 4 Line
## 101
## OTP (Terminal) - 5 Line
## 101
## OTP (Terminal) - 6 Line
## 101
## OTP (Terminal) - 7 Line
## 101
## OTP (Terminal) - A Line
## 101
## OTP (Terminal) - B Line
## 101
## OTP (Terminal) - C Line
## 101
## OTP (Terminal) - D Line
## 101
## OTP (Terminal) - E Line
## 101
## OTP (Terminal) - F Line
## 101
## OTP (Terminal) - G Line
## 101
## OTP (Terminal) - J Z Line
## 101
## OTP (Terminal) - L Line
## 101
## OTP (Terminal) - M Line
## 101
## OTP (Terminal) - N Line
## 101
## OTP (Terminal) - Q Line
## 101
## OTP (Terminal) - R Line
## 101
## OTP (Terminal) - S Fkln Line
## 101
## OTP (Terminal) - S Line 42 St.
## 92
## OTP (Terminal) - S Line Rock
## 101
## OTP (Terminal) - W Line
## 27
## Queens Village Depot - % of Completed Trips
## 0
## Subway Wait Assessment
## 0
## Subway Wait Assessment - 1 Line
## 0
## Subway Wait Assessment - 2 Line
## 0
## Subway Wait Assessment - 3 Line
## 0
## Subway Wait Assessment - 4 Line
## 0
## Subway Wait Assessment - 5 Line
## 0
## Subway Wait Assessment - 6 Line
## 0
## Subway Wait Assessment - 7 Line
## 0
## Subway Wait Assessment - A Line
## 0
## Subway Wait Assessment - B Line
## 0
## Subway Wait Assessment - C Line
## 0
## Subway Wait Assessment - D Line
## 0
## Subway Wait Assessment - E Line
## 0
## Subway Wait Assessment - F Line
## 0
## Subway Wait Assessment - G Line
## 0
## Subway Wait Assessment - J Z Line
## 0
## Subway Wait Assessment - L Line
## 0
## Subway Wait Assessment - M Line
## 0
## Subway Wait Assessment - N Line
## 0
## Subway Wait Assessment - Q Line
## 0
## Subway Wait Assessment - R Line
## 0
## Subway Wait Assessment - S 42 St
## 0
## Subway Wait Assessment - S Fkln
## 0
## Subway Wait Assessment - S Rock
## 0
## Subway Wait Assessment - W Line
## 0
## Total Paratransit Ridership - NYCT Bus
## 0
## Total Ridership - NYCT Bus
## 0
## Total Ridership - Subways
## 0
## Ulmer Park Depot - % of Completed Trips
## 0
## Wait Assessment - Subways (Inactive, Historic Calculations)
## 0
## West Farms Depot - % of Completed Trips
## 0
## Yukon Depot - % of Completed Trips
## 0
remove the years 2009 and 2010 and standardize the remainder of the years be removing incomplete subway lines: S line 42 & the W line
otp.updated<-otp %>%
filter(PERIOD_YEAR >= "2011", INDICATOR_NAME!= "OTP (Terminal) - W Line", INDICATOR_NAME!= "OTP (Terminal) - S Line 42 St.")
# check on the data once more
table(otp.updated$PERIOD_YEAR)
##
## 2011 2012 2013 2014 2015 2016 2017
## 264 264 264 264 264 264 264
table(otp.updated$PERIOD_MONTH)
##
## Apr Aug Dec Feb Jan Jul Jun Mar May Nov Oct Sep
## 154 154 154 154 154 154 154 154 154 154 154 154
otp.updated$INDICATOR_NAME<-revalue(otp.updated$INDICATOR_NAME, c("OTP (Terminal) - 1 Line" = "one" , "OTP (Terminal) - 2 Line"="two", "OTP (Terminal) - 3 Line"="three","OTP (Terminal) - 4 Line"="four","OTP (Terminal) - 5 Line"="five","OTP (Terminal) - 6 Line"="six","OTP (Terminal) - 7 Line"="seven","OTP (Terminal) - A Line"="A","OTP (Terminal) - B Line"="B","OTP (Terminal) - C Line"="C","OTP (Terminal) - E Line"="E","OTP (Terminal) - F Line"="F","OTP (Terminal) - D Line"="D","OTP (Terminal) - G Line"="G", "OTP (Terminal) - J Z Line" = "J Z","OTP (Terminal) - L Line"="L","OTP (Terminal) - M Line"="M","OTP (Terminal) - N Line"="N","OTP (Terminal) - Q Line"="Q","OTP (Terminal) - R Line"="R","OTP (Terminal) - S Fkln Line" = "S Fkln","OTP (Terminal) - S Line 42 St." = "S 42 St.","OTP (Terminal) - S Line Rock" = "S Rock", "OTP (Terminal) - W Line" = "W"))
We will change the structure of some variables to factors, then compare the monthly actual OTP to the monthly target OTP to find out if the actual is greater or equal and then set the value as true. Continue cleaning with changing percentages to decimals and checking for missing values.
# change the structure of certain variables to factors
otp.updated$PERIOD_YEAR<-as.factor(otp.updated$PERIOD_YEAR)
otp.updated$PERIOD_MONTH<-as.factor(otp.updated$PERIOD_MONTH)
# compare the monthly actual OTP to the monthly target OTP and if the actual is >= then the value is true
otp.updated$ON_TIME <- as.numeric(otp.updated$MONTHLY_ACTUAL >= otp.updated$MONTHLY_TARGET)
# change percentages to decimals
otp.updated$MONTHLY_TARGET <-otp.updated$MONTHLY_TARGET/100
otp.updated$MONTHLY_ACTUAL <-otp.updated$MONTHLY_ACTUAL/100
otp.updated$YTD_TARGET <-otp.updated$YTD_TARGET/100
otp.updated$YTD_ACTUAL <-otp.updated$YTD_ACTUAL/100
# approximately 20 weekdays in a month as the OTP is only measured for weekdays
otp.updated$DAYS_ON_TIME<-otp.updated$MONTHLY_ACTUAL*20
#check for any missing values
df_status(otp.updated, print_results = TRUE)
## variable q_zeros p_zeros q_na p_na q_inf p_inf type unique
## 1 INDICATOR_NAME 0 0.00 0 0 0 0 factor 22
## 2 PERIOD_YEAR 0 0.00 0 0 0 0 factor 7
## 3 PERIOD_MONTH 0 0.00 0 0 0 0 factor 12
## 4 YTD_TARGET 0 0.00 0 0 0 0 numeric 25
## 5 YTD_ACTUAL 0 0.00 0 0 0 0 numeric 785
## 6 MONTHLY_TARGET 0 0.00 0 0 0 0 numeric 25
## 7 MONTHLY_ACTUAL 0 0.00 0 0 0 0 numeric 802
## 8 ON_TIME 1587 85.88 0 0 0 0 numeric 2
## 9 DAYS_ON_TIME 0 0.00 0 0 0 0 numeric 802
Export the OPT updated to .CSV to use in the app.
write.csv(otp.updated,'MTA_CLEANED.csv', row.names = FALSE)
#facet wrap by year
qplot(PERIOD_MONTH, DAYS_ON_TIME, data = otp.updated, facets = . ~ PERIOD_YEAR) + theme(axis.text.x = element_text(size = 5, angle = 90))
#facet wrap by year
qplot(PERIOD_MONTH, DAYS_ON_TIME, data = otp.updated, facets = . ~ INDICATOR_NAME) + theme(axis.text.x = element_text(size = 3, angle = 90))
Using the shiny app function in R, I have taken the cleaned .CSV file to visualize the subway line performance information for the year 2017.
https://don-padmaperuma.shinyapps.io/DATA608FinalProject/
I chose the year 2017 because as I analyse data on the dataset it was clear that 2017 was the poorest performing year.
From my visualizations I would say that over the years the on-time performance for subways became poorer, specifically in the year 2017. As I imagined the winter months, like December and even at times November, showed poor on-time performance. Unfortunately the data was up until 2017, buy If I had more current data for 2020 this would show a huge difference as there was a travel limitations due to the COVID pendamic.