Load libraries.
library(tidyr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(stringr)
Read in data.
Downloading the file is commented out here because we already ran it, but should un-comment if running this for the first time.
#download.file("https://raw.githubusercontent.com/JeremyOBrien16/NYC-MTA-Weekly-MetroCard-Swipes/master/Fare_Card_History_for_Metropolitan_Transportation_Authority__MTA___Beginning_2010.csv",destfile="mta_by_station.csv")
mta <- read.csv("mta_by_station.csv",header=TRUE,check.names=FALSE,stringsAsFactors=FALSE)
Check how many rows and columns.
dim(mta)
## [1] 180608 26
Look at first few lines.
head(mta,n=3)
## From Date To Date Remote Station ID Station Full Fare Senior Citizen / Disabled
## 1 2/13/2016 2/19/2016 R001 WHITEHALL STREET 46710 1811
## 2 2/13/2016 2/19/2016 R003 CYPRESS HILLS 2793 138
## 3 2/13/2016 2/19/2016 R004 75TH STREET & ELDERTS LANE 7229 356
## 7 Day ADA Farecard Access System Unlimited 30 Day ADA Farecard Access System Unlimited
## 1 264 756
## 2 14 40
## 3 28 102
## Joint Rail Road Ticket
## 1 231
## 2 0
## 3 0
## 7 Day Unlimited 30 Day Unlimited 14 Day Reduced Fare Media Unlimited 1 Day Unlimited 14 Day Unlimited
## 1 21775 28241 0 0 0
## 2 2171 1168 0 0 0
## 3 5144 3822 0 0 0
## 7 Day Express Bus Pass Transit Check Metrocard LIB Special Senior Rail Road Unlimited No Trade
## 1 0 773 430 432
## 2 0 22 61 1
## 3 0 82 193 0
## Transit Check Metrocard Annual Metrocard Mail and Ride EZPass Express Mail and Ride Unlimited
## 1 3440 1202 470
## 2 97 6 9
## 3 254 32 52
## Path 2 Trip Airtran Full Fare Airtran 30 Day Airtran 10 Trip Airtran Monthly
## 1 0 843 0 0 0
## 2 0 7 0 0 0
## 3 0 75 0 0 0
From some initial exploration already run (not shown here), we already know some cleaning and transformations we need to do.
Sometimes the same station will occur twice, with and without trailing whitespace.
Let’s run the station names through the trimws function to fix this. Then, filter out duplicate lines.
Also use the toupper function to make sure all station names are all caps (it seems they are, but just to be safe).
mta$Station <- toupper(trimws(mta$Station))
mta <- mta[!duplicated(mta),]
There was a simple error where Prospect Ave was listed as either “PROSPECT AVE-4TH AVE” or “PROSPECT AVENUE-4TH AVENUE”. Fix this.
Also, Fulton Street was listed under two names before the renovation. Fix this as well.
mta$Station <- plyr::mapvalues(mta$Station,
from = c("PROSPECT AVE-4TH AVE", "FULTON ST & BROADWAY NASSAU"),
to = c("PROSPECT AVENUE-4TH AVENUE","FULTON STREET"))
Several stations had the same remote station ID, but two different station names. Fix this.
station_names_to_consolidate <- c("FULTON STREET","BROADWAY NASSAU STREET","BARCLAYS CENTER","ATLANTIC AVENUE","ROCKAWAY AVENUE","ROCKWAY AVENUE","241ST ST-WHITE PLAINS RD","241TH ST-WHITE PLAINS RD","NEWKIRK PLAZA","NEWKIRK AVE-EAST 16TH ST","COURT SQ","45TH ROAD-COURT HOUSE SQ","COURT SQ-23 ST","23RD STREET-ELY AVENUE","KOSCIUSZKO STREET-BROADWAY","KOSCIUSKO STREET-BROADWAY","GRANT AVENUE-PITKIN AVENUE","GRANT AVNUE-PITKIN AVENUE","SHEPHERD AVE-PITKIN AVE","SHEPERD AVE-PITKIN AVE")
mta$Station <- plyr::mapvalues(mta$Station,
from = station_names_to_consolidate,
to = rep(station_names_to_consolidate[seq(from=1,to=19,by=2)],each=2))
Next, take only the first occurence for each combination of start date, remote station ID, and station name.
We found through previous exploration that on one start date (December 25, 2010), we would often see the same station repeated, with very similar but not exactly the same values per column.
However, the values were close enough that just taking the first occurence should be fine.
date_plus_id_and_name_of_stations_for_dedup <- paste0(mta[,"From Date"],"-",
mta[,"Remote Station ID"],"-",
mta[,"Station"])
mta <- mta[!duplicated(date_plus_id_and_name_of_stations_for_dedup),]
Some lines had binary values instead of actual numbers for the various columns.
Remove these by requiring that “Full Fare” be > 1.
mta <- mta[mta[,"Full Fare"] > 1,]
Remove column “To Date” and change all spaces and slashes in column names with “.” and “or” to make easier to work with.
colnames(mta) <- str_replace_all(colnames(mta),pattern="/",replace="or")
colnames(mta) <- str_replace_all(colnames(mta),pattern=" ",replace=".")
mta <- mta[,setdiff(colnames(mta),"To.Date")]
colnames(mta)
## [1] "From.Date"
## [2] "Remote.Station.ID"
## [3] "Station"
## [4] "Full.Fare"
## [5] "Senior.Citizen.or.Disabled"
## [6] "7.Day.ADA.Farecard.Access.System.Unlimited"
## [7] "30.Day.ADA.Farecard.Access.System.Unlimited"
## [8] "Joint.Rail.Road.Ticket"
## [9] "7.Day.Unlimited"
## [10] "30.Day.Unlimited"
## [11] "14.Day.Reduced.Fare.Media.Unlimited"
## [12] "1.Day.Unlimited"
## [13] "14.Day.Unlimited"
## [14] "7.Day.Express.Bus.Pass"
## [15] "Transit.Check.Metrocard"
## [16] "LIB.Special.Senior"
## [17] "Rail.Road.Unlimited.No.Trade"
## [18] "Transit.Check.Metrocard.Annual.Metrocard"
## [19] "Mail.and.Ride.EZPass.Express"
## [20] "Mail.and.Ride.Unlimited"
## [21] "Path.2.Trip"
## [22] "Airtran.Full.Fare"
## [23] "Airtran.30.Day"
## [24] "Airtran.10.Trip"
## [25] "Airtran.Monthly"
Convert dates to date format. Next order by date, then station.
mta$From.Date <- as.Date(mta$From.Date,format="%m/%d/%Y")
mta <- mta[order(mta$From.Date,mta$Station),]
Several stations had more than one remote station ID for the same station name. Let’s look at this in more detail.
dates_plus_stations <- mta[,c("From.Date","Station")]
repeated_station_names <- unique(dates_plus_stations$Station[duplicated(dates_plus_stations)])
dates_plus_stations_repeated_stations <- mta[mta$Station %in% repeated_station_names,
c("From.Date","Remote.Station.ID","Station")]
repeated_stations_ids_pasted_to_names <- paste0(dates_plus_stations_repeated_stations$Remote.Station.ID,"-",
dates_plus_stations_repeated_stations$Station)
freq_per_repeated_station <- data.frame(table(repeated_stations_ids_pasted_to_names),stringsAsFactors=FALSE)
freq_per_repeated_station$Var1 <- as.vector(freq_per_repeated_station$Var1)
length(unique(dates_plus_stations$From.Date))
## [1] 384
table(freq_per_repeated_station$Freq)
##
## 110 348 365 384
## 1 1 1 24
freq_per_repeated_station[freq_per_repeated_station$Freq != length(unique(dates_plus_stations$From.Date)),]
## repeated_stations_ids_pasted_to_names Freq
## 1 R002-FULTON STREET 110
## 23 R246-PROSPECT AVENUE-4TH AVENUE 365
## 26 R454-PROSPECT AVENUE-4TH AVENUE 348
Most stations except for a few, have all dates for each remote station ID in cases where the station name is repeated.
I already know from previous exploration that for Fulton Street, there is one remote station ID that was only there in earlier part of data, vs. one that was there the whole time.
We can just sum for cases where there are both remote station IDs.
What about Prospect Avenue?
dates_in_R246 <- dates_plus_stations_repeated_stations$From.Date[dates_plus_stations_repeated_stations$Remote.Station.ID == "R246"]
dates_in_R454 <- dates_plus_stations_repeated_stations$From.Date[dates_plus_stations_repeated_stations$Remote.Station.ID == "R454"]
as.Date(setdiff(dates_in_R246,dates_in_R454),origin = "1970-01-01")
## [1] "2017-06-03" "2017-10-28" "2017-11-04" "2017-11-11" "2017-11-18"
## [6] "2017-11-25" "2017-12-02" "2017-12-09" "2017-12-16" "2017-12-23"
## [11] "2017-12-30" "2018-01-06" "2018-01-13" "2018-01-20" "2018-01-27"
## [16] "2018-02-03" "2018-02-10"
A quick Google search reveals that the Prospect Avenue station was closed for maintenance from early June to early November 2017 (https://en.wikipedia.org/wiki/Prospect_Avenue_(BMT_Fourth_Avenue_Line).
Based on this, the dates we are seeing for one remote station ID but not the other probably make sense.
Anyway, let’s now sum all numeric columns by date + station for these stations.
repeated_station_name_indices <- mta$Station %in% repeated_station_names
repeated_stations_full_info <- mta[repeated_station_name_indices,]
recurrent_stations_sum_by_date_and_station <- repeated_stations_full_info[,setdiff(colnames(mta),"Remote.Station.ID")] %>%
group_by(From.Date,Station) %>%
summarize_all(funs(sum))
recurrent_stations_sum_by_date_and_station <- data.frame(recurrent_stations_sum_by_date_and_station,check.names=FALSE,stringsAsFactors=FALSE)
Get collapsed remote station IDs as well, and merge this information with recurrent_stations_sum_by_date_and_station.
repeated_stations_ids_plus_names <- repeated_stations_full_info[,c("Remote.Station.ID","Station")]
repeated_stations_ids_plus_names <- repeated_stations_ids_plus_names[!duplicated(repeated_stations_ids_plus_names),]
repeated_stations_ids_plus_names <- repeated_stations_ids_plus_names %>% group_by(Station) %>% summarize(Remote.Station.ID = paste0(Remote.Station.ID,collapse="/"))
repeated_stations_ids_plus_names <- data.frame(repeated_stations_ids_plus_names,stringsAsFactors=FALSE)
repeated_stations_aggregated <- merge(repeated_stations_ids_plus_names,
recurrent_stations_sum_by_date_and_station,
by="Station")
Remove the repeated stations from mta, then merge mta with repeated_stations_aggregated.
mta <- mta[!repeated_station_name_indices,]
repeated_stations_aggregated <- repeated_stations_aggregated[,colnames(mta)]
mta <- rbind(mta,repeated_stations_aggregated)
mta <- mta[order(mta$From.Date,mta$Station),]
Now, there were some stations that we want to exclude, as they are not actual MTA subway stations.
possible_stations <- unique(mta$Station)
stations_to_exclude <- grep('^PA-PATH',possible_stations,value=TRUE) #PATH stations are not a part of the subway system and should be excluded.
stations_to_exclude <- c(stations_to_exclude,
c("METROCARD VAN 1",
"MTABC - EASTCHESTER 2",
"HEMPSTEAD-LIB CUST SERVICE",
"LGA AIRPORT CTB",
"ORCHARD BEACH")) #Removing some specific names that I realize are not actual subway stations (eg Orchard Beach actually refers to a bus service).
stations_to_exclude
## [1] "PA-PATH 14TH STREET" "PA-PATH 9TH STREET"
## [3] "PA-PATH CHRISTOPHER ST" "PA-PATH EXCHANGE PLACE"
## [5] "PA-PATH GROVE ST" "PA-PATH HARRISON"
## [7] "PA-PATH HOBOKEN" "PA-PATH JOURNAL SQUARE"
## [9] "PA-PATH NEWARK" "PA-PATH PAVONIA/NEWPORT"
## [11] "PA-PATH THIRTY THIRD ST" "PA-PATH TWENTY THIRD ST"
## [13] "PA-PATH WORLD TRADE CNTR" "METROCARD VAN 1"
## [15] "MTABC - EASTCHESTER 2" "HEMPSTEAD-LIB CUST SERVICE"
## [17] "LGA AIRPORT CTB" "ORCHARD BEACH"
stations_to_exclude_indices <- mta$Station %in% stations_to_exclude
mta <- mta[!stations_to_exclude_indices,]
Finally, let’s explore if there are any dates missing for all stations.
Then, see which stations have some missing dates, and if so which dates they are missing.
possible_dates <- unique(mta$From.Date)
possible_dates[1]
## [1] "2010-05-29"
possible_dates[length(possible_dates)]
## [1] "2018-02-10"
all_weeks_in_range_of_possible_dates <- seq.Date(as.Date(possible_dates[1]),as.Date(possible_dates[length(possible_dates)]),by="7 days")
as.Date(setdiff(all_weeks_in_range_of_possible_dates,possible_dates),origin="1970-01-01")
## [1] "2011-05-07" "2013-04-20" "2014-02-22" "2014-03-01" "2014-12-06"
## [6] "2014-12-20" "2015-02-28" "2015-03-14" "2015-07-04" "2015-08-01"
## [11] "2015-08-08" "2015-08-29" "2015-09-19" "2015-10-03" "2016-05-28"
## [16] "2016-06-04" "2016-06-25" "2017-05-27" "2017-08-26"
There are definitely some dates that are missing for all stations. We will want to impute for these.
Now, check dates per station.
num_dates_per_station <- data.frame(table(mta$Station),stringsAsFactors=FALSE)
num_dates_per_station$Var1 <- as.vector(num_dates_per_station$Var1)
num_dates_per_station <- num_dates_per_station[num_dates_per_station$Freq < length(possible_dates),]
num_dates_per_station <- num_dates_per_station[order(num_dates_per_station$Freq),]
#Let's display only stations that have at least 4 weeks less than other stations.
num_dates_per_station[num_dates_per_station$Freq < (length(possible_dates) - 4),]
## Var1 Freq
## 141 72ND STREET - 2 AVENUE 58
## 156 86TH STREET - 2 AVENUE 58
## 166 96TH STREET - 2 AVENUE 59
## 97 34TH STREET - HUDSON YARDS 121
## 174 AQUEDUCT RACE TRACK 254
## 406 SMITH STREET-9TH STREET 289
## 330 KNICKERBOCKER AVE-MYRTLE AV 328
## 232 CENTRAL AVE-MYRTLE AVE 331
## 393 ROCKAWAY AVENUE 342
## 426 VAN SICLEN AVENUE 342
## 221 BUHRE AVE-WESTCHESTER AVE 346
## 378 PENNSYLVANIA AVENUE 348
## 443 ZEREGA AVE-WESTCHESTER AVE 348
## 322 JUNIUS STREET 349
## 418 SUTTER AVENUE 350
## 266 ELDER AVE-WESTCHESTER AVE 354
## 410 ST LAWRENCE AVE-WEST AVE 354
## 212 BROAD CHANNEL-NOEL ROAD 356
## 291 GASTON AVENUE-BEACH 67TH ST 356
## 306 HOLLAND-BEACH 90TH STREET 356
## 379 PLAYLAND-BEACH 98TH STREET 356
## 395 ROCKAWAY PARK-BEACH 116TH 356
## 400 SEASIDE BEACH-105TH STREET 356
## 345 MIDDLETOWN ROAD-WESTCHESTER 357
## 281 FRANK AVENUE-BEACH 44TH ST 358
## 195 BEACH 25TH STREET-WAVECREST 359
## 231 CASTLE HILL AVENUE 359
## 414 STRAITON AVE-BEACH 60TH ST 359
## 265 EDGEMERE-BEACH 36TH STREET 360
## 351 MOTT AVENUE-BEACH 22ND ST 360
## 390 RI TRAMWAY (MANHATTAN) 361
## 391 RI TRAMWAY (ROOSEVELT) 361
## 193 BAY RIDGE AVE-4TH AVENUE 363
## 124 53RD STREET-4TH AVENUE 364
## 382 PROSPECT AVENUE-4TH AVENUE 365
## 399 SARATOGA AVENUE 366
## 102 36TH AVENUE & 31ST STREET 368
## 94 30TH AVENUE & 31ST STREET 369
## 177 ATLANTIC AV-VAN SINDEREN AV 369
## 435 WHITLOCK AVE-WESTCHESTER AV 371
## 246 COURT SQ 375
## 278 FOREST AVENUE-PUTNAM AVENUE 376
## 286 FRESH POND ROAD-68TH AVENUE 376
## 344 METROPOLITAN AVENUE 376
## 401 SENECA AVENUE-PALMETTO ST 376
## 209 BOWLING GREEN & BROADWAY 377
## 73 21ST STREET-JACKSON AVENUE 379
## 349 MORRISON AVE & WESTCHESTER 379
## 437 WILSON AVENUE-MOFFAT STREET 379
The three stations with the fewest results are the 2nd Avenue subway, which has not been open very long.
Hudson Yards is also definitely a new station.
Let’s check AQUEDUCT RACE TRACK and SMITH STREET-9TH STREET manually.
After this, we start getting into the range of dates missed that is most likely just due to temporary station closures.
aqueduct_dates_not_found <- as.Date(setdiff(possible_dates,mta$From.Date[which(mta$Station == "AQUEDUCT RACE TRACK")]),origin="1970-01-01")
aqueduct_dates_not_found[1]
## [1] "2010-05-29"
aqueduct_dates_not_found[length(aqueduct_dates_not_found)]
## [1] "2013-06-22"
as.Date(setdiff(possible_dates[which(possible_dates >= min(aqueduct_dates_not_found) & possible_dates <= max(aqueduct_dates_not_found))],aqueduct_dates_not_found),origin="1970-01-01")
## [1] "2010-10-30" "2010-11-06" "2010-11-13" "2010-11-20" "2010-11-27"
## [6] "2010-12-04" "2010-12-11" "2010-12-18" "2010-12-25" "2011-01-01"
## [11] "2011-01-08" "2011-01-15" "2011-01-22" "2011-01-29" "2011-02-05"
## [16] "2011-02-12" "2011-02-19" "2011-02-26" "2011-03-05" "2011-03-12"
## [21] "2011-03-19" "2011-03-26" "2011-04-02" "2011-04-09" "2011-04-16"
## [26] "2011-04-23" "2012-09-15" "2013-06-01" "2013-06-08"
Looks like closures were mostly during a contiguous period, excepting from late 2010-early 2011, one week in 2012, and two weeks in 2013 near when it eventually opened for good again.
This matches pretty well with the history of the station (https://en.wikipedia.org/wiki/Aqueduct_Racetrack_(IND_Rockaway_Line)).
smith_9th_dates_not_found <- as.Date(setdiff(possible_dates,mta$From.Date[which(mta$Station == "SMITH STREET-9TH STREET")]),origin="1970-01-01")
smith_9th_dates_not_found[1]
## [1] "2011-06-25"
smith_9th_dates_not_found[length(smith_9th_dates_not_found)]
## [1] "2013-04-13"
length(smith_9th_dates_not_found)
## [1] 95
length(possible_dates[which(possible_dates >= min(smith_9th_dates_not_found) & possible_dates <= max(smith_9th_dates_not_found))])
## [1] 95
Closures at Smith-9th occur completely within a contiguous period, which matches perfectly with the time it was under construction (https://en.wikipedia.org/wiki/Smith–Ninth_Streets_(IND_Culver_Line)).
Now just need to impute missing dates.
First, let’s put NAs for all numeric columns in cases where a station was closed on a certain date or set of dates.
i = 0
rows_to_add_to_mta <- c()
for(station in num_dates_per_station$Var1)
{
i = i + 1
dates_not_found_this_station <- as.Date(setdiff(possible_dates,
mta$From.Date[which(mta$Station == station)]),
origin="1970-01-01")
remote_station_id <- as.vector(mta$Remote.Station.ID)[which(mta$Station == station)[1]]
dates_not_found_this_station_dat <- data.frame(From.Date = dates_not_found_this_station,
Remote.Station.ID = remote_station_id,
Station = station,
stringsAsFactors=FALSE)
dates_not_found_this_station_dat <- cbind(dates_not_found_this_station_dat,
data.frame(matrix(NA,ncol=(ncol(mta) - 3),nrow=nrow(dates_not_found_this_station_dat)),
stringsAsFactors=FALSE))
if(i == 1){rows_to_add_to_mta <- dates_not_found_this_station_dat}
if(i > 1){rows_to_add_to_mta <- rbind(rows_to_add_to_mta,
dates_not_found_this_station_dat)}
}
colnames(rows_to_add_to_mta) <- colnames(mta)
Check our work.
sum(length(possible_dates) - num_dates_per_station$Freq)
## [1] 2524
dim(rows_to_add_to_mta)
## [1] 2524 25
head(rows_to_add_to_mta[,1:5])
## From.Date Remote.Station.ID Station Full.Fare
## 1 2010-05-29 R570 72ND STREET - 2 AVENUE NA
## 2 2010-06-05 R570 72ND STREET - 2 AVENUE NA
## 3 2010-06-12 R570 72ND STREET - 2 AVENUE NA
## 4 2010-06-19 R570 72ND STREET - 2 AVENUE NA
## 5 2010-06-26 R570 72ND STREET - 2 AVENUE NA
## 6 2010-07-03 R570 72ND STREET - 2 AVENUE NA
## Senior.Citizen.or.Disabled
## 1 NA
## 2 NA
## 3 NA
## 4 NA
## 5 NA
## 6 NA
tail(rows_to_add_to_mta[,1:5])
## From.Date Remote.Station.ID Station Full.Fare
## 2519 2011-09-24 R388 EAST 180TH ST-MORRIS PARK NA
## 2520 2012-11-10 R257 EAST BROADWAY-MADISON ST NA
## 2521 2012-11-03 R015 FIFTH AVENUE NA
## 2522 2018-01-13 R422 MCDONALD AVE-22ND AVE NA
## 2523 2013-12-14 R430 PELHAM PKWY-ESPLANADE NA
## 2524 2012-11-03 R259 ROOSEVELT ISLAND NA
## Senior.Citizen.or.Disabled
## 2519 NA
## 2520 NA
## 2521 NA
## 2522 NA
## 2523 NA
## 2524 NA
mta <- rbind(mta,rows_to_add_to_mta)
mta <- mta[order(mta$From.Date,mta$Station),]
Then for the missing dates, impute based on the nearest non-missing date.
Let’s look and see exactly which dates are missing again.
dates_missing_for_all <- as.Date(setdiff(all_weeks_in_range_of_possible_dates,possible_dates),origin="1970-01-01")
dates_missing_for_all
## [1] "2011-05-07" "2013-04-20" "2014-02-22" "2014-03-01" "2014-12-06"
## [6] "2014-12-20" "2015-02-28" "2015-03-14" "2015-07-04" "2015-08-01"
## [11] "2015-08-08" "2015-08-29" "2015-09-19" "2015-10-03" "2016-05-28"
## [16] "2016-06-04" "2016-06-25" "2017-05-27" "2017-08-26"
There are three pairs of concurrent missing dates.
Let’s impute based on the two nearest dates.
Unless concurrent, in which cases take nearest valid date (either before or after).
i = 0
for(date in dates_missing_for_all)
{
i = i + 1
date_before <- as.Date(date - 7,origin="1970-01-01")
date_after <- as.Date(date + 7,origin="1970-01-01")
dates_before_and_after <- as.Date(intersect(c(date_before,date_after),
possible_dates),
origin="1970-01-01")
mta_on_dates_before_and_after <- mta[mta$From.Date %in% dates_before_and_after,]
mta_on_dates_before_and_after <- mta_on_dates_before_and_after[,2:ncol(mta_on_dates_before_and_after)] %>%
group_by(Remote.Station.ID,Station) %>%
summarize_all(funs(mean))
mta_on_dates_before_and_after <- data.frame(mta_on_dates_before_and_after,check.names=FALSE,stringsAsFactors=FALSE)
mta_on_dates_before_and_after <- data.frame(From.Date = as.Date(date,origin="1970-01-01"),
mta_on_dates_before_and_after,
check.names=FALSE,stringsAsFactors=FALSE)
if(i == 1){imputed_data <- mta_on_dates_before_and_after}
if(i > 1){imputed_data <- rbind(imputed_data,
mta_on_dates_before_and_after)}
}
Check our work.
dim(imputed_data)
## [1] 8417 25
length(unique(imputed_data$Station))
## [1] 443
length(unique(imputed_data$Station)) * length(unique(imputed_data$From.Date))
## [1] 8417
length(unique(mta$Station))
## [1] 443
head(imputed_data[,1:5])
## From.Date Remote.Station.ID Station Full.Fare
## 1 2011-05-07 R001 WHITEHALL STREET 66311.0
## 2 2011-05-07 R002/R014 FULTON STREET 103932.5
## 3 2011-05-07 R003 CYPRESS HILLS 3340.0
## 4 2011-05-07 R004 75TH STREET & ELDERTS LANE 8139.0
## 5 2011-05-07 R005 85TH STREET & FOREST PKWAY 9109.0
## 6 2011-05-07 R006 WOODHAVEN BOULEVARD 9141.0
## Senior.Citizen.or.Disabled
## 1 2039.0
## 2 3030.5
## 3 106.5
## 4 266.5
## 5 393.0
## 6 392.5
tail(imputed_data[,1:5])
## From.Date Remote.Station.ID Station Full.Fare
## 8412 2017-08-26 R469 RI TRAMWAY (ROOSEVELT) 7283.0
## 8413 2017-08-26 R535 AIRTRAIN @ HOWARD BEACH 21808.0
## 8414 2017-08-26 R536 AIRTRAIN JAMAICA CENTER 1 37901.5
## 8415 2017-08-26 R570 72ND STREET - 2 AVENUE 66654.0
## 8416 2017-08-26 R571 86TH STREET - 2 AVENUE 52100.5
## 8417 2017-08-26 R572 96TH STREET - 2 AVENUE 34945.5
## Senior.Citizen.or.Disabled
## 8412 686.0
## 8413 286.0
## 8414 344.0
## 8415 5503.0
## 8416 4752.5
## 8417 2725.5
Just need to round to nearest integer for numeric fields. Then, let’s combine with other data.
imputed_data[,4:ncol(mta)] <- round(imputed_data[,4:ncol(mta)])
head(imputed_data[,1:7])
## From.Date Remote.Station.ID Station Full.Fare
## 1 2011-05-07 R001 WHITEHALL STREET 66311
## 2 2011-05-07 R002/R014 FULTON STREET 103932
## 3 2011-05-07 R003 CYPRESS HILLS 3340
## 4 2011-05-07 R004 75TH STREET & ELDERTS LANE 8139
## 5 2011-05-07 R005 85TH STREET & FOREST PKWAY 9109
## 6 2011-05-07 R006 WOODHAVEN BOULEVARD 9141
## Senior.Citizen.or.Disabled 7.Day.ADA.Farecard.Access.System.Unlimited
## 1 2039 308
## 2 3030 228
## 3 106 16
## 4 266 20
## 5 393 27
## 6 392 34
## 30.Day.ADA.Farecard.Access.System.Unlimited
## 1 860
## 2 780
## 3 38
## 4 106
## 5 144
## 6 128
mta <- rbind(mta,imputed_data)
mta <- mta[order(mta$From.Date,mta$Station),]
nrow(mta)
## [1] 178529
length(unique(mta$Station))
## [1] 443
length(unique(mta$From.Date)) * length(unique(mta$Station))
## [1] 178529
length(unique(mta$From.Date))
## [1] 403
length(all_weeks_in_range_of_possible_dates)
## [1] 403
Everything looks good!
Let’s pick a station and plot so we make sure imputed data looks OK.
num_non_NA_per_station <- table(mta[which(is.na(mta$Full.Fare) == FALSE),"Station"])
num_non_NA_per_station <- data.frame(num_non_NA_per_station)
num_non_NA_per_station$Var1 <- as.vector(num_non_NA_per_station$Var1)
test_station <- num_non_NA_per_station$Var1[num_non_NA_per_station$Freq == length(all_weeks_in_range_of_possible_dates)]
test_station <- mta[mta$Station == test_station[1],]
plot(1:nrow(test_station),
test_station$Full.Fare,
type="l",
xlab="Week",ylab="Number of full fares",
main=test_station$Station[1])
test_station$From.Date[which(test_station$Full.Fare < 5000)]
## [1] "2012-10-27" "2017-01-14"
test_station$From.Date[which(test_station$Full.Fare > 15000)]
## [1] "2015-03-28" "2015-04-11" "2015-04-18" "2015-04-25" "2015-05-02"
## [6] "2015-05-09" "2015-05-16" "2015-05-30" "2015-06-06" "2015-06-13"
One of the very low dates was for Hurricane Sandy, and another is in winter so may have been a winter weather event of some kind.
The high events are all around a similar time. Not sure why this is, but don’t think this shows any imputation errors.
Now, as the final step, let’s convert from wide to long format.
nrow(mta) * (ncol(mta) - 3)
## [1] 3927638
mta <- gather(mta,Fare.Type,Num.fares,-From.Date,-Remote.Station.ID,-Station)
nrow(mta)
## [1] 3927638
head(mta)
## From.Date Remote.Station.ID Station Fare.Type Num.fares
## 1 2010-05-29 R314 103RD ST-CENTRAL PARK WEST Full.Fare 9593
## 2 2010-05-29 R208 103RD ST-ROOSEVELT AVE Full.Fare 45188
## 3 2010-05-29 R191 103RD STREET-BROADWAY Full.Fare 26502
## 4 2010-05-29 R180 103RD STREET-LEXINGTON AVE Full.Fare 32881
## 5 2010-05-29 R007 104TH STREET Full.Fare 6012
## 6 2010-05-29 R354 104TH STREET-LIBERTY AVENUE Full.Fare 4721
tail(mta)
## From.Date Remote.Station.ID Station Fare.Type
## 3927633 2018-02-10 R451 WINTHROP ST-NOSTRAND AVE Airtran.Monthly
## 3927634 2018-02-10 R201 WOODHAVEN BLVD-QUEENS BLVD Airtran.Monthly
## 3927635 2018-02-10 R006 WOODHAVEN BOULEVARD Airtran.Monthly
## 3927636 2018-02-10 R052 WOODLAWN ROAD Airtran.Monthly
## 3927637 2018-02-10 R301 YORK STREET-JAY STREET Airtran.Monthly
## 3927638 2018-02-10 R326 ZEREGA AVE-WESTCHESTER AVE Airtran.Monthly
## Num.fares
## 3927633 0
## 3927634 0
## 3927635 0
## 3927636 0
## 3927637 0
## 3927638 0
Looks great!
Now we are in a good place to do whatever kind of analysis we would like.
However, I think we’ve done enough for now.
I am going to save mta as an R object, so we can load it again when we are ready to analyze.
save(mta,file="mta_by_station_tidied.Rdata")